]> git.saurik.com Git - redis.git/blob - redis.c
6bfbb84debfd9eae4ab85b95e28c62812d80c201
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Error codes */
#define REDIS_OK                0
#define REDIS_ERR               -1

/* Static server configuration */
#define REDIS_SERVERPORT        6379    /* TCP port */
#define REDIS_MAXIDLETIME       (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN         1024
#define REDIS_LOADBUF_LEN       1024
#define REDIS_STATIC_ARGS       8
#define REDIS_DEFAULT_DBNUM     16
#define REDIS_CONFIGLINE_MAX    1024
#define REDIS_OBJFREELIST_MAX   1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME     60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256
103
/* Hash table parameters */
#define REDIS_HT_MINFILL        10      /* Minimal hash table fill 10% */

/* Command flags. These are bit values that are OR-ed together in the
 * 'flags' field of struct redisCommand (see the cmdTable entries). */
#define REDIS_CMD_BULK          1       /* Bulk write command */
#define REDIS_CMD_INLINE        2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM       4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
/* Object types: values stored in the 'type' field of a redisObject. */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4
123
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as an hash table */

/* Printable name of every encoding, indexed by the REDIS_ENCODING_* value. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
135
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * accordingly to the following defines: */
#define REDIS_RDB_ENC_INT8 0        /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1       /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2       /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3         /* string compressed with LZF */
167
/* Virtual memory: values for the 'storage' field of a redisObject. */
#define REDIS_VM_MEMORY 0       /* The object is on memory */
#define REDIS_VM_SWAPPED 1      /* The object is on disk */
#define REDIS_VM_SWAPPING 2     /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3      /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
/* Client flags: bit values OR-ed together in the 'flags' field of
 * redisClient. */
#define REDIS_SLAVE 1       /* This client is a slave server */
#define REDIS_MASTER 2      /* This client is a master server */
#define REDIS_MONITOR 4     /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8       /* This client is in a MULTI context */
#define REDIS_BLOCKED 16    /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32    /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS 64  /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0       /* No active replication */
#define REDIS_REPL_CONNECT 1    /* Must connect to master */
#define REDIS_REPL_CONNECTED 2  /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it.
 * The values continue the numbering of the slave side states above, so the
 * two sets never overlap. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels, in increasing order of severity */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused parameter or variable does not trigger a warning. */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): if the expression _e is false, call _redisAssert() with
 * the stringified expression, file and line, then terminate via _exit(1).
 * redisPanic(_e): unconditionally call _redisPanic() with the stringified
 * argument, file and line, then terminate via _exit(1).
 *
 * Both macros expand to a single fully parenthesized expression, so they
 * keep their meaning when used as an operand (for instance as one arm of a
 * conditional expression) and not only as a stand-alone statement. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
/*================================= Data types ============================== */

/* A redis object, that is a type able to hold a string / list / set */

/* The VM object structure: swap file location and last access time of an
 * object. It is embedded as the trailing 'vm' member of struct redisObject.
 *
 * NOTE(review): the trailing declarator below also defines a file-scope
 * variable named 'vm' of this type. Nothing in the visible part of the file
 * uses it, so it looks unintentional -- confirm against the rest of the file
 * before removing it. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
256
/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;                  /* Payload; its meaning depends on type/encoding */
    unsigned char type;         /* One of REDIS_STRING, REDIS_LIST, ... */
    unsigned char encoding;     /* One of the REDIS_ENCODING_* values */
    unsigned char storage;      /* If this object is a key, where is the value?
                                 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype; /* If this object is a key, and value is swapped out,
                          * this is the type of the swapped out object. */
    int refcount;               /* Reference count */
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead.
     * For this reason 'vm' must remain the last member of the struct. */
    struct redisObjectVM vm;
} robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj lvalue to initialize, _ptr the value stored in its 'ptr'
 * field. The object becomes a raw encoded string with refcount 1. The
 * 'storage' field is written only when the global server.vm_enabled is
 * non zero; no other field is touched.
 *
 * The expansion is a do { ... } while(0) WITHOUT a trailing semicolon: the
 * caller supplies it, so that
 *     if (cond) initStaticStringObject(o,p); else ...
 * parses as intended. Arguments are parenthesized so that expressions such
 * as *objp can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
/* A single Redis database: the keyspace plus the auxiliary dictionaries
 * that track per-key state (expires, blocked clients, watched keys). */
typedef struct redisDb {
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blocking_keys;        /* Keys with clients waiting for data (BLPOP) */
    dict *io_keys;              /* Keys with clients waiting for VM I/O */
    dict *watched_keys;         /* WATCHED keys for MULTI/EXEC CAS */
    int id;                     /* Database number */
} redisDb;
294
/* Client MULTI/EXEC state */

/* One command queued inside a MULTI block: its argument vector and the
 * command table entry it refers to. */
typedef struct multiCmd {
    robj **argv;
    int argc;
    struct redisCommand *cmd;
} multiCmd;

/* The whole queue of commands of a MULTI block. */
typedef struct multiState {
    multiCmd *commands;     /* Array of MULTI commands */
    int count;              /* Total number of MULTI commands */
} multiState;
306
/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
typedef struct redisClient {
    int fd;                 /* Client socket descriptor */
    redisDb *db;            /* Currently selected database */
    int dictid;
    sds querybuf;           /* Buffer of data read from the client */
    robj **argv, **mbargv;
    int argc, mbargc;
    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
    int multibulk;          /* multi bulk command format active */
    list *reply;            /* List of reply objects */
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
    int slaveseldb;         /* slave selected db, if this client is a slave */
    int authenticated;      /* when requirepass is non-NULL */
    int replstate;          /* replication state if this is a slave */
    int repldbfd;           /* replication DB file descriptor */
    long repldboff;         /* replication DB file offset */
    off_t repldbsize;       /* replication DB file size */
    multiState mstate;      /* MULTI/EXEC state */
    robj **blocking_keys;   /* The keys we are waiting for in order to
                             * terminate a blocking operation such as BLPOP.
                             * Otherwise NULL. */
    int blocking_keys_num;  /* Number of blocking keys */
    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
                             * is >= blockingto then the operation timed out. */
    list *io_keys;          /* Keys this client is waiting to be loaded from the
                             * swap file in order to continue. */
    list *watched_keys;     /* Keys WATCHED for MULTI/EXEC CAS */
    dict *pubsub_channels;  /* channels a client is interested in (SUBSCRIBE) */
    list *pubsub_patterns;  /* patterns a client is interested in (PSUBSCRIBE) */
} redisClient;
340
/* One save point: a (seconds, changes) pair. The server keeps an array of
 * these in redisServer.saveparams.
 * NOTE(review): presumably "save after 'seconds' seconds if at least
 * 'changes' changes happened" -- confirm against the code that reads it. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
/* Global server state structure */
struct redisServer {
    int port;
    int fd;
    redisDb *db;
    long long dirty;            /* changes to DB from the last save */
    list *clients;
    list *slaves, *monitors;
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops;              /* number of times the cron function run */
    list *objfreelist;          /* A list of freed objects to avoid malloc() */
    time_t lastsave;            /* Unix time of last successful save */
    /* Fields used only for stats */
    time_t stat_starttime;         /* server start time */
    long long stat_numcommands;    /* number of processed commands */
    long long stat_numconnections; /* number of connections received */
    long long stat_expiredkeys;    /* number of expired keys */
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
    int appendonly;
    int appendfsync;
    int shutdown_asap;
    time_t lastfsync;
    int appendfd;
    int appendseldb;
    char *pidfile;
    pid_t bgsavechildpid;
    pid_t bgrewritechildpid;
    sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
    sds aofbuf;       /* AOF buffer, written before entering the event loop */
    struct saveparam *saveparams;
    int saveparamslen;          /* Number of entries in saveparams */
    char *logfile;
    char *bindaddr;
    char *dbfilename;
    char *appendfilename;
    char *requirepass;
    int rdbcompression;
    int activerehashing;
    /* Replication related */
    int isslave;
    char *masterauth;
    char *masterhost;
    int masterport;
    redisClient *master;    /* client that is master for this slave */
    int replstate;
    unsigned int maxclients;
    unsigned long long maxmemory;
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to take this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
    /* Virtual memory configuration */
    int vm_enabled;
    char *vm_swap_file;
    off_t vm_page_size;
    off_t vm_pages;
    unsigned long long vm_max_memory;
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page; /* Next probably empty page */
    off_t vm_near_pages; /* Number of pages allocated sequentially */
    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
    time_t unixtime;    /* Unix time sampled every second. */
    /* Virtual memory I/O threads stuff */
    /* An I/O thread processes an element taken from the io_newjobs queue and
     * puts the result of the operation in the io_processed list. While the
     * job is being processed, it's put on the io_processing queue. */
    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
    list *io_processing; /* List of VM I/O jobs being processed */
    list *io_processed; /* List of VM I/O jobs already processed */
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
    pthread_mutex_t io_mutex; /* lock to access the io_* job lists above */
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
    pthread_attr_t io_threads_attr; /* attributes for threads creation */
    int io_active_threads; /* Number of running I/O threads */
    int vm_max_threads; /* Max number of I/O threads running at the same time */
    /* Our main thread is blocked on the event loop, looking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * awake the main thread. The followings are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
    /* Pubsub */
    dict *pubsub_channels; /* Map channels to list of subscribed clients */
    list *pubsub_patterns; /* A list of pubsub_patterns */
    /* Misc */
    FILE *devnull;
};
453
/* A pattern subscription: pairs a client with one pattern it subscribed to.
 * Lists named pubsub_patterns exist both in redisClient and in
 * redisServer. */
typedef struct pubsubPattern {
    redisClient *client;
    robj *pattern;
} pubsubPattern;
458
/* Signature of a command implementation. */
typedef void redisCommandProc(redisClient *c);
/* Signature of a custom VM key preloading function (see vm_preload_proc). */
typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* One entry of the command table (cmdTable). */
struct redisCommand {
    char *name;                 /* Command name, e.g. "get" */
    redisCommandProc *proc;     /* Function implementing the command */
    int arity;                  /* Number of arguments. Negative values are
                                 * used in cmdTable (e.g. -2 for "del");
                                 * NOTE(review): presumably "at least N" --
                                 * confirm in the command dispatch code. */
    int flags;                  /* OR of REDIS_CMD_* flags */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisVmPreloadProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
    int vm_lastkey;  /* The last argument that's a key */
    int vm_keystep;  /* The step between first and last key */
};
475
/* Associates a function name with an address stored as an integer.
 * NOTE(review): presumably used to resolve addresses to names when printing
 * a stack trace -- confirm against the code that fills and reads it. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
480
/* One element being sorted: the object itself plus the value it is compared
 * by, which is either a numeric score or another object. Which union member
 * is valid is decided by the sorting code, not recorded here. */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;
        robj *cmpobj;
    } u;
} redisSortObject;

/* One sort operation: 'type' is one of the REDIS_SORT_* values and
 * 'pattern' the pattern it applies to. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;
493
/* ZSETs use a specialized version of Skiplists */

/* A skiplist node.
 * NOTE(review): 'forward' and 'span' are presumably per-level arrays
 * allocated together with the node -- confirm in the node creation code. */
typedef struct zskiplistNode {
    struct zskiplistNode **forward;
    struct zskiplistNode *backward;
    unsigned int *span;
    double score;
    robj *obj;
} zskiplistNode;

/* The skiplist itself: header and tail nodes, element count and level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;

/* A sorted set is made of a dictionary and a skiplist. */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;
514
/* Our shared "common" objects */

/* Number of entries in the shared.integers[] array. */
#define REDIS_SHARED_INTEGERS 10000
/* Collection of robj pointers kept in the single global 'shared'.
 * NOTE(review): judging by the names these are preallocated protocol
 * replies, "select N" commands and small integers -- confirm against the
 * code that fills the structure. Note that 'shared' is not declared
 * static. */
struct sharedObjectsStruct {
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
    *colon, *nullbulk, *nullmultibulk, *queued,
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
    *select0, *select1, *select2, *select3, *select4,
    *select5, *select6, *select7, *select8, *select9,
    *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
    *mbulk4, *psubscribebulk, *punsubscribebulk,
    *integers[REDIS_SHARED_INTEGERS];
} shared;
529
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
535
/* VM threaded I/O request message */
#define REDIS_IOJOB_LOAD 0          /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1  /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2       /* Swap from memory to disk */
/* A single job handed to the VM I/O threads. */
typedef struct iojob {
    int type;   /* Request type, REDIS_IOJOB_* */
    redisDb *db;/* Redis database */
    robj *key;  /* This I/O request is about swapping this key */
    robj *val;  /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
                 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
    off_t page; /* Swap page where to read/write the object */
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
551
/*================================ Prototypes =============================== */

/* Forward declarations of file-scope (static) helper functions, so that
 * they can be referenced before the point where they are defined. */
static void freeStringObject(robj *o);
static void freeListObject(robj *o);
static void freeSetObject(robj *o);
static void decrRefCount(void *o);
static robj *createObject(int type, void *ptr);
static void freeClient(redisClient *c);
static int rdbLoad(char *filename);
static void addReply(redisClient *c, robj *obj);
static void addReplySds(redisClient *c, sds s);
static void incrRefCount(robj *o);
static int rdbSaveBackground(char *filename);
static robj *createStringObject(char *ptr, size_t len);
static robj *dupStringObject(robj *o);
static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
static void flushAppendOnlyFile(void);
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
static int syncWithMaster(void);
static robj *tryObjectEncoding(robj *o);
static robj *getDecodedObject(robj *o);
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
static int deleteIfSwapped(redisDb *db, robj *key);
static int deleteKey(redisDb *db, robj *key);
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
static void updateSlavesWaitingBgsave(int bgsaveerr);
static void freeMemoryIfNeeded(void);
static int processCommand(redisClient *c);
static void setupSigSegvAction(void);
static void rdbRemoveTempFile(pid_t childpid);
static void aofRemoveTempFile(pid_t childpid);
static size_t stringObjectLen(robj *o);
static void processInputBuffer(redisClient *c);
static zskiplist *zslCreate(void);
static void zslFree(zskiplist *zsl);
static void zslInsert(zskiplist *zsl, double score, robj *obj);
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
static void initClientMultiState(redisClient *c);
static void freeClientMultiState(redisClient *c);
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
static void unblockClientWaitingData(redisClient *c);
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
static void vmInit(void);
static void vmMarkPagesFree(off_t page, off_t count);
static robj *vmLoadObject(robj *key);
static robj *vmPreviewObject(robj *key);
static int vmSwapOneObjectBlocking(void);
static int vmSwapOneObjectThreaded(void);
static int vmCanSwapOut(void);
static int tryFreeOneObjectFromFreelist(void);
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
static void lockThreadedIO(void);
static void unlockThreadedIO(void);
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
static void freeIOJob(iojob *j);
static void queueIOJob(iojob *j);
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
static void waitEmptyIOJobsQueue(void);
static void vmReopenSwapFile(void);
static int vmFreePage(off_t page);
/* The next two match the redisVmPreloadProc signature. */
static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
static int dontWaitForSwappedKey(redisClient *c, robj *key);
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
static struct redisCommand *lookupCommand(char *name);
static void call(redisClient *c, struct redisCommand *cmd);
static void resetClient(redisClient *c);
static void convertToRealHash(robj *o);
static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
static void freePubsubPattern(void *p);
static int listMatchPubsubPattern(void *a, void *b);
static int compareStringObjects(robj *a, robj *b);
static int equalStringObjects(robj *a, robj *b);
635 static void usage();
636 static int rewriteAppendOnlyFileBackground(void);
637 static int vmSwapObjectBlocking(robj *key, robj *val);
638 static int prepareForShutdown();
639 static void touchWatchedKey(redisDb *db, robj *key);
640 static void touchWatchedKeysOnFlush(int dbid);
641 static void unwatchAllKeys(redisClient *c);
642
/* Command implementations. Every function has the redisCommandProc
 * signature, void fn(redisClient *c), so that it can be stored in the
 * 'proc' field of a struct redisCommand entry of the command table. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void setexCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
/* NOTE(review): lower case "command" suffix, unlike every other entry. The
 * command table refers to it with this exact spelling, so a rename has to
 * touch the declaration, the definition and the table together. */
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void zrevrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hsetnxCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void hmsetCommand(redisClient *c);
static void hmgetCommand(redisClient *c);
static void hdelCommand(redisClient *c);
static void hlenCommand(redisClient *c);
static void zremrangebyrankCommand(redisClient *c);
static void zunionstoreCommand(redisClient *c);
static void zinterstoreCommand(redisClient *c);
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
static void hexistsCommand(redisClient *c);
static void configCommand(redisClient *c);
static void hincrbyCommand(redisClient *c);
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
static void psubscribeCommand(redisClient *c);
static void punsubscribeCommand(redisClient *c);
static void publishCommand(redisClient *c);
static void watchCommand(redisClient *c);
static void unwatchCommand(redisClient *c);
750
751 /*================================= Globals ================================= */
752
753 /* Global vars */
754 static struct redisServer server; /* server global state */
/* Command table: one entry per command, terminated by an all-NULL/zero
 * sentinel entry.
 *
 * Each entry lists, in order: the command name, the handler function, an
 * argument count, a set of REDIS_CMD_* flags, an optional callback (only
 * zunionstore, zinterstore and exec set one) and three integers.
 *
 * NOTE(review): the argument count presumably includes the command name,
 * with negative values meaning "at least N" (variadic commands such as del
 * and mget use negative values) -- confirm against the command dispatcher.
 * NOTE(review): the callback and the three trailing integers look like the
 * VM key-preload hook and the first-key / last-key / key-step argument
 * positions (mset uses 1,-1,2) -- confirm against struct redisCommand. */
755 static struct redisCommand cmdTable[] = {
756 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
758 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
759 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
760 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
763 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
765 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
767 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
779 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
780 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
783 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
788 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
789 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
800 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
801 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
809 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
810 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
811 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
812 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
814 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
815 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
821 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
825 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
826 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
838 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
844 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
846 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
851 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
854 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
857 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
862 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
/* Sentinel entry marking the end of the table. */
864 {NULL,NULL,0,0,NULL,0,0,0}
865 };
866
867 /*============================ Utility functions ============================ */
868
/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern' (both lengths must be >= 0). Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; "[^abc]" negates the set, "[a-z]" is an
 *           inclusive range (given in either order), "\x" inside the set is
 *           the literal byte x (always compared case sensitively)
 *   \x      the byte x taken literally
 *
 * A class that is not closed by ']' extends up to the end of the pattern.
 * When 'nocase' is non zero, literals and ranges are compared after
 * tolower(). Range bounds are compared as unsigned bytes so the result does
 * not depend on the signedness of plain char.
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer needs to be nul terminated: no byte at or after
 * pattern+patternLen or string+stringLen is ever read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        /* Everything but '*' consumes exactly one byte of the string, so it
         * can't match once the string is exhausted. This is only possible
         * on entry: the bottom of the loop breaks out as soon as the string
         * gets empty. */
        if (stringLen == 0 && pattern[0] != '*')
            return 0; /* no match */

        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing '*' swallows the rest */
            /* Try the remaining pattern at every suffix of the string. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the increment
                     * at the bottom of the outer loop lands exactly on the
                     * end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars may still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
991
/* Glob-style match of two nul terminated strings: measures both with
 * strlen() and defers to stringmatchlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
995
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-', a run of decimal digits and an
 * optional, case insensitive unit:
 *
 *   (none) or "b"  -> 1
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * An unknown unit is an error, but the numeric part is still returned as a
 * plain number of bytes. A numeric part of 128 characters or more is an
 * error and yields LLONG_MAX. A result that does not fit in a long long is
 * an error too and saturates at LLONG_MAX (LLONG_MIN for negative values). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy the sign and digits into a nul terminated buffer for strtoll() */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value for us. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check that val*mul can't overflow. */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1042
/* Convert a long long into a nul terminated decimal string stored in 's',
 * a buffer of 'len' bytes.
 *
 * Returns the number of characters stored, not counting the nul term.
 * If the buffer is too small only the first len-1 characters of the number
 * are stored (so the return value is smaller than the length of the full
 * representation). When len is 0 nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in the unsigned domain: -value would overflow
     * when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left, least significant first. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++; /* p now points to the first character of the number */
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1066
1067 static void redisLog(int level, const char *fmt, ...) {
1068 va_list ap;
1069 FILE *fp;
1070
1071 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1072 if (!fp) return;
1073
1074 va_start(ap, fmt);
1075 if (level >= server.verbosity) {
1076 char *c = ".-*#";
1077 char buf[64];
1078 time_t now;
1079
1080 now = time(NULL);
1081 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1082 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1083 vfprintf(fp, fmt, ap);
1084 fprintf(fp,"\n");
1085 fflush(fp);
1086 }
1087 va_end(ap);
1088
1089 if (server.logfile) fclose(fp);
1090 }
1091
1092 /*====================== Hash table type implementation ==================== */
1093
1094 /* This is a hash table type that uses the SDS dynamic strings library as
1095 * keys and redis objects as values (objects can hold SDS strings,
1096 * lists, sets). */
1097
/* Dict destructor callback that simply hands the pointer to zfree().
 * The dict private data is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    zfree(val);
    DICT_NOTUSED(privdata);
}
1103
1104 static void dictListDestructor(void *privdata, void *val)
1105 {
1106 DICT_NOTUSED(privdata);
1107 listRelease((list*)val);
1108 }
1109
1110 static int sdsDictKeyCompare(void *privdata, const void *key1,
1111 const void *key2)
1112 {
1113 int l1,l2;
1114 DICT_NOTUSED(privdata);
1115
1116 l1 = sdslen((sds)key1);
1117 l2 = sdslen((sds)key2);
1118 if (l1 != l2) return 0;
1119 return memcmp(key1, key2, l1) == 0;
1120 }
1121
1122 static void dictRedisObjectDestructor(void *privdata, void *val)
1123 {
1124 DICT_NOTUSED(privdata);
1125
1126 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
1127 decrRefCount(val);
1128 }
1129
1130 static int dictObjKeyCompare(void *privdata, const void *key1,
1131 const void *key2)
1132 {
1133 const robj *o1 = key1, *o2 = key2;
1134 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1135 }
1136
1137 static unsigned int dictObjHash(const void *key) {
1138 const robj *o = key;
1139 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1140 }
1141
1142 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1143 const void *key2)
1144 {
1145 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1146 int cmp;
1147
1148 if (o1->encoding == REDIS_ENCODING_INT &&
1149 o2->encoding == REDIS_ENCODING_INT)
1150 return o1->ptr == o2->ptr;
1151
1152 o1 = getDecodedObject(o1);
1153 o2 = getDecodedObject(o2);
1154 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1155 decrRefCount(o1);
1156 decrRefCount(o2);
1157 return cmp;
1158 }
1159
1160 static unsigned int dictEncObjHash(const void *key) {
1161 robj *o = (robj*) key;
1162
1163 if (o->encoding == REDIS_ENCODING_RAW) {
1164 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1165 } else {
1166 if (o->encoding == REDIS_ENCODING_INT) {
1167 char buf[32];
1168 int len;
1169
1170 len = ll2string(buf,32,(long)o->ptr);
1171 return dictGenHashFunction((unsigned char*)buf, len);
1172 } else {
1173 unsigned int hash;
1174
1175 o = getDecodedObject(o);
1176 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177 decrRefCount(o);
1178 return hash;
1179 }
1180 }
1181 }
1182
1183 /* Sets type and expires */
/* Keys are hashed and compared with the encoding-aware callbacks and are
 * released through dictRedisObjectDestructor; entries have no value
 * destructor.
 * NOTE(review): the "and expires" part of the comment above may be stale,
 * a separate type further down is documented as the one for Db->expires --
 * confirm. */
1184 static dictType setDictType = {
1185 dictEncObjHash, /* hash function */
1186 NULL, /* key dup */
1187 NULL, /* val dup */
1188 dictEncObjKeyCompare, /* key compare */
1189 dictRedisObjectDestructor, /* key destructor */
1190 NULL /* val destructor */
1191 };
1192
1193 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
/* Same key handling as the set type; values are released with zfree()
 * through dictVanillaFree. */
1194 static dictType zsetDictType = {
1195 dictEncObjHash, /* hash function */
1196 NULL, /* key dup */
1197 NULL, /* val dup */
1198 dictEncObjKeyCompare, /* key compare */
1199 dictRedisObjectDestructor, /* key destructor */
1200 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1201 };
1202
1203 /* Db->dict */
/* Keys are hashed and compared as plain sds strings (no encoding handling).
 * Both keys and values are released through dictRedisObjectDestructor,
 * which tolerates NULL values. */
1204 static dictType dbDictType = {
1205 dictObjHash, /* hash function */
1206 NULL, /* key dup */
1207 NULL, /* val dup */
1208 dictObjKeyCompare, /* key compare */
1209 dictRedisObjectDestructor, /* key destructor */
1210 dictRedisObjectDestructor /* val destructor */
1211 };
1212
1213 /* Db->expires */
/* Same key handling as Db->dict, but values are never destroyed by the
 * dict: there is no value destructor. */
1214 static dictType keyptrDictType = {
1215 dictObjHash, /* hash function */
1216 NULL, /* key dup */
1217 NULL, /* val dup */
1218 dictObjKeyCompare, /* key compare */
1219 dictRedisObjectDestructor, /* key destructor */
1220 NULL /* val destructor */
1221 };
1222
1223 /* Hash type hash table (note that small hashes are represented with zipmaps) */
/* Keys use the encoding-aware hash and compare callbacks; both keys and
 * values are released through dictRedisObjectDestructor. */
1224 static dictType hashDictType = {
1225 dictEncObjHash, /* hash function */
1226 NULL, /* key dup */
1227 NULL, /* val dup */
1228 dictEncObjKeyCompare, /* key compare */
1229 dictRedisObjectDestructor, /* key destructor */
1230 dictRedisObjectDestructor /* val destructor */
1231 };
1232
1233 /* Keylist hash table type has unencoded redis objects as keys and
1234 * lists as values. It's used for blocking operations (BLPOP) and to
1235 * map swapped keys to a list of clients waiting for these keys to be loaded. */
/* Values are released with listRelease() through dictListDestructor. */
1236 static dictType keylistDictType = {
1237 dictObjHash, /* hash function */
1238 NULL, /* key dup */
1239 NULL, /* val dup */
1240 dictObjKeyCompare, /* key compare */
1241 dictRedisObjectDestructor, /* key destructor */
1242 dictListDestructor /* val destructor */
1243 };
1244
/* Forward declaration. "(void)" makes this a real prototype: an empty
 * parameter list would leave the arguments unchecked. */
static void version(void);
1246
1247 /* ========================= Random utility functions ======================= */
1248
1249 /* Redis generally does not try to recover from out of memory conditions
1250 * when allocating objects or strings, it is not clear if it will be possible
1251 * to report this condition to the client since the networking layer itself
1252 * is based on heap allocation for send buffers, so we simply abort.
1253 * At least the code will be simpler to read... */
1254 static void oom(const char *msg) {
1255 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1256 sleep(1);
1257 abort();
1258 }
1259
1260 /* ====================== Redis server networking stuff ===================== */
1261 static void closeTimedoutClients(void) {
1262 redisClient *c;
1263 listNode *ln;
1264 time_t now = time(NULL);
1265 listIter li;
1266
1267 listRewind(server.clients,&li);
1268 while ((ln = listNext(&li)) != NULL) {
1269 c = listNodeValue(ln);
1270 if (server.maxidletime &&
1271 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1272 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1273 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1274 listLength(c->pubsub_patterns) == 0 &&
1275 (now - c->lastinteraction > server.maxidletime))
1276 {
1277 redisLog(REDIS_VERBOSE,"Closing idle client");
1278 freeClient(c);
1279 } else if (c->flags & REDIS_BLOCKED) {
1280 if (c->blockingto != 0 && c->blockingto < now) {
1281 addReply(c,shared.nullmultibulk);
1282 unblockClientWaitingData(c);
1283 }
1284 }
1285 }
1286 }
1287
1288 static int htNeedsResize(dict *dict) {
1289 long long size, used;
1290
1291 size = dictSlots(dict);
1292 used = dictSize(dict);
1293 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1294 (used*100/size < REDIS_HT_MINFILL));
1295 }
1296
1297 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1298 * we resize the hash table to save memory */
1299 static void tryResizeHashTables(void) {
1300 int j;
1301
1302 for (j = 0; j < server.dbnum; j++) {
1303 if (htNeedsResize(server.db[j].dict))
1304 dictResize(server.db[j].dict);
1305 if (htNeedsResize(server.db[j].expires))
1306 dictResize(server.db[j].expires);
1307 }
1308 }
1309
1310 /* Our hash table implementation performs rehashing incrementally while
1311 * we write/read from the hash table. Still if the server is idle, the hash
1312 * table will use two tables for a long time. So we try to use 1 millisecond
1313 * of CPU time at every serverCron() loop in order to rehash some key. */
1314 static void incrementallyRehash(void) {
1315 int j;
1316
1317 for (j = 0; j < server.dbnum; j++) {
1318 if (dictIsRehashing(server.db[j].dict)) {
1319 dictRehashMilliseconds(server.db[j].dict,1);
1320 break; /* already used our millisecond for this loop... */
1321 }
1322 }
1323 }
1324
1325 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1326 void backgroundSaveDoneHandler(int statloc) {
1327 int exitcode = WEXITSTATUS(statloc);
1328 int bysignal = WIFSIGNALED(statloc);
1329
1330 if (!bysignal && exitcode == 0) {
1331 redisLog(REDIS_NOTICE,
1332 "Background saving terminated with success");
1333 server.dirty = 0;
1334 server.lastsave = time(NULL);
1335 } else if (!bysignal && exitcode != 0) {
1336 redisLog(REDIS_WARNING, "Background saving error");
1337 } else {
1338 redisLog(REDIS_WARNING,
1339 "Background saving terminated by signal %d", WTERMSIG(statloc));
1340 rdbRemoveTempFile(server.bgsavechildpid);
1341 }
1342 server.bgsavechildpid = -1;
1343 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1344 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1345 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1346 }
1347
1348 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1349 * Handle this. */
1350 void backgroundRewriteDoneHandler(int statloc) {
1351 int exitcode = WEXITSTATUS(statloc);
1352 int bysignal = WIFSIGNALED(statloc);
1353
1354 if (!bysignal && exitcode == 0) {
1355 int fd;
1356 char tmpfile[256];
1357
1358 redisLog(REDIS_NOTICE,
1359 "Background append only file rewriting terminated with success");
1360 /* Now it's time to flush the differences accumulated by the parent */
1361 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1362 fd = open(tmpfile,O_WRONLY|O_APPEND);
1363 if (fd == -1) {
1364 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1365 goto cleanup;
1366 }
1367 /* Flush our data... */
1368 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1369 (signed) sdslen(server.bgrewritebuf)) {
1370 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1371 close(fd);
1372 goto cleanup;
1373 }
1374 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1375 /* Now our work is to rename the temp file into the stable file. And
1376 * switch the file descriptor used by the server for append only. */
1377 if (rename(tmpfile,server.appendfilename) == -1) {
1378 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1379 close(fd);
1380 goto cleanup;
1381 }
1382 /* Mission completed... almost */
1383 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1384 if (server.appendfd != -1) {
1385 /* If append only is actually enabled... */
1386 close(server.appendfd);
1387 server.appendfd = fd;
1388 fsync(fd);
1389 server.appendseldb = -1; /* Make sure it will issue SELECT */
1390 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1391 } else {
1392 /* If append only is disabled we just generate a dump in this
1393 * format. Why not? */
1394 close(fd);
1395 }
1396 } else if (!bysignal && exitcode != 0) {
1397 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1398 } else {
1399 redisLog(REDIS_WARNING,
1400 "Background append only file rewriting terminated by signal %d",
1401 WTERMSIG(statloc));
1402 }
1403 cleanup:
1404 sdsfree(server.bgrewritebuf);
1405 server.bgrewritebuf = sdsempty();
1406 aofRemoveTempFile(server.bgrewritechildpid);
1407 server.bgrewritechildpid = -1;
1408 }
1409
1410 /* This function is called once a background process of some kind terminates,
1411 * as we want to avoid resizing the hash tables when there is a child in order
1412 * to play well with copy-on-write (otherwise when a resize happens lots of
1413 * memory pages are copied). The goal of this function is to update the ability
1414 * for dict.c to resize the hash tables accordingly to the fact we have o not
1415 * running childs. */
1416 static void updateDictResizePolicy(void) {
1417 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1418 dictEnableResize();
1419 else
1420 dictDisableResize();
1421 }
1422
1423 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1424 int j, loops = server.cronloops++;
1425 REDIS_NOTUSED(eventLoop);
1426 REDIS_NOTUSED(id);
1427 REDIS_NOTUSED(clientData);
1428
1429 /* We take a cached value of the unix time in the global state because
1430 * with virtual memory and aging there is to store the current time
1431 * in objects at every object access, and accuracy is not needed.
1432 * To access a global var is faster than calling time(NULL) */
1433 server.unixtime = time(NULL);
1434
1435 /* We received a SIGTERM, shutting down here in a safe way, as it is
1436 * not ok doing so inside the signal handler. */
1437 if (server.shutdown_asap) {
1438 if (prepareForShutdown() == REDIS_OK) exit(0);
1439 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1440 }
1441
1442 /* Show some info about non-empty databases */
1443 for (j = 0; j < server.dbnum; j++) {
1444 long long size, used, vkeys;
1445
1446 size = dictSlots(server.db[j].dict);
1447 used = dictSize(server.db[j].dict);
1448 vkeys = dictSize(server.db[j].expires);
1449 if (!(loops % 50) && (used || vkeys)) {
1450 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1451 /* dictPrintStats(server.dict); */
1452 }
1453 }
1454
1455 /* We don't want to resize the hash tables while a bacground saving
1456 * is in progress: the saving child is created using fork() that is
1457 * implemented with a copy-on-write semantic in most modern systems, so
1458 * if we resize the HT while there is the saving child at work actually
1459 * a lot of memory movements in the parent will cause a lot of pages
1460 * copied. */
1461 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1462 if (!(loops % 10)) tryResizeHashTables();
1463 if (server.activerehashing) incrementallyRehash();
1464 }
1465
1466 /* Show information about connected clients */
1467 if (!(loops % 50)) {
1468 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1469 listLength(server.clients)-listLength(server.slaves),
1470 listLength(server.slaves),
1471 zmalloc_used_memory());
1472 }
1473
1474 /* Close connections of timedout clients */
1475 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1476 closeTimedoutClients();
1477
1478 /* Check if a background saving or AOF rewrite in progress terminated */
1479 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1480 int statloc;
1481 pid_t pid;
1482
1483 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1484 if (pid == server.bgsavechildpid) {
1485 backgroundSaveDoneHandler(statloc);
1486 } else {
1487 backgroundRewriteDoneHandler(statloc);
1488 }
1489 updateDictResizePolicy();
1490 }
1491 } else {
1492 /* If there is not a background saving in progress check if
1493 * we have to save now */
1494 time_t now = time(NULL);
1495 for (j = 0; j < server.saveparamslen; j++) {
1496 struct saveparam *sp = server.saveparams+j;
1497
1498 if (server.dirty >= sp->changes &&
1499 now-server.lastsave > sp->seconds) {
1500 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1501 sp->changes, sp->seconds);
1502 rdbSaveBackground(server.dbfilename);
1503 break;
1504 }
1505 }
1506 }
1507
1508 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1509 * will use few CPU cycles if there are few expiring keys, otherwise
1510 * it will get more aggressive to avoid that too much memory is used by
1511 * keys that can be removed from the keyspace. */
1512 for (j = 0; j < server.dbnum; j++) {
1513 int expired;
1514 redisDb *db = server.db+j;
1515
1516 /* Continue to expire if at the end of the cycle more than 25%
1517 * of the keys were expired. */
1518 do {
1519 long num = dictSize(db->expires);
1520 time_t now = time(NULL);
1521
1522 expired = 0;
1523 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1524 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1525 while (num--) {
1526 dictEntry *de;
1527 time_t t;
1528
1529 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1530 t = (time_t) dictGetEntryVal(de);
1531 if (now > t) {
1532 deleteKey(db,dictGetEntryKey(de));
1533 expired++;
1534 server.stat_expiredkeys++;
1535 }
1536 }
1537 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1538 }
1539
1540 /* Swap a few keys on disk if we are over the memory limit and VM
1541 * is enbled. Try to free objects from the free list first. */
1542 if (vmCanSwapOut()) {
1543 while (server.vm_enabled && zmalloc_used_memory() >
1544 server.vm_max_memory)
1545 {
1546 int retval;
1547
1548 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1549 retval = (server.vm_max_threads == 0) ?
1550 vmSwapOneObjectBlocking() :
1551 vmSwapOneObjectThreaded();
1552 if (retval == REDIS_ERR && !(loops % 300) &&
1553 zmalloc_used_memory() >
1554 (server.vm_max_memory+server.vm_max_memory/10))
1555 {
1556 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1557 }
1558 /* Note that when using threade I/O we free just one object,
1559 * because anyway when the I/O thread in charge to swap this
1560 * object out will finish, the handler of completed jobs
1561 * will try to swap more objects if we are still out of memory. */
1562 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1563 }
1564 }
1565
1566 /* Check if we should connect to a MASTER */
1567 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1568 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1569 if (syncWithMaster() == REDIS_OK) {
1570 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1571 if (server.appendonly) rewriteAppendOnlyFileBackground();
1572 }
1573 }
1574 return 100;
1575 }
1576
1577 /* This function gets called every time Redis is entering the
1578 * main loop of the event driven library, that is, before to sleep
1579 * for ready file descriptors. */
1580 static void beforeSleep(struct aeEventLoop *eventLoop) {
1581 REDIS_NOTUSED(eventLoop);
1582
1583 /* Awake clients that got all the swapped keys they requested */
1584 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1585 listIter li;
1586 listNode *ln;
1587
1588 listRewind(server.io_ready_clients,&li);
1589 while((ln = listNext(&li))) {
1590 redisClient *c = ln->value;
1591 struct redisCommand *cmd;
1592
1593 /* Resume the client. */
1594 listDelNode(server.io_ready_clients,ln);
1595 c->flags &= (~REDIS_IO_WAIT);
1596 server.vm_blocked_clients--;
1597 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1598 readQueryFromClient, c);
1599 cmd = lookupCommand(c->argv[0]->ptr);
1600 assert(cmd != NULL);
1601 call(c,cmd);
1602 resetClient(c);
1603 /* There may be more data to process in the input buffer. */
1604 if (c->querybuf && sdslen(c->querybuf) > 0)
1605 processInputBuffer(c);
1606 }
1607 }
1608 /* Write the AOF buffer on disk */
1609 flushAppendOnlyFile();
1610 }
1611
1612 static void createSharedObjects(void) {
1613 int j;
1614
1615 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1616 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1617 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1618 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1619 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1620 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1621 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1622 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1623 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1624 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1625 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1626 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1627 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1628 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1629 "-ERR no such key\r\n"));
1630 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1631 "-ERR syntax error\r\n"));
1632 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1633 "-ERR source and destination objects are the same\r\n"));
1634 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1635 "-ERR index out of range\r\n"));
1636 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1637 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1638 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1639 shared.select0 = createStringObject("select 0\r\n",10);
1640 shared.select1 = createStringObject("select 1\r\n",10);
1641 shared.select2 = createStringObject("select 2\r\n",10);
1642 shared.select3 = createStringObject("select 3\r\n",10);
1643 shared.select4 = createStringObject("select 4\r\n",10);
1644 shared.select5 = createStringObject("select 5\r\n",10);
1645 shared.select6 = createStringObject("select 6\r\n",10);
1646 shared.select7 = createStringObject("select 7\r\n",10);
1647 shared.select8 = createStringObject("select 8\r\n",10);
1648 shared.select9 = createStringObject("select 9\r\n",10);
1649 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1650 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1651 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1652 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1653 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1654 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1655 shared.mbulk3 = createStringObject("*3\r\n",4);
1656 shared.mbulk4 = createStringObject("*4\r\n",4);
1657 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1658 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1659 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1660 }
1661 }
1662
1663 static void appendServerSaveParams(time_t seconds, int changes) {
1664 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1665 server.saveparams[server.saveparamslen].seconds = seconds;
1666 server.saveparams[server.saveparamslen].changes = changes;
1667 server.saveparamslen++;
1668 }
1669
1670 static void resetServerSaveParams() {
1671 zfree(server.saveparams);
1672 server.saveparams = NULL;
1673 server.saveparamslen = 0;
1674 }
1675
1676 static void initServerConfig() {
1677 server.dbnum = REDIS_DEFAULT_DBNUM;
1678 server.port = REDIS_SERVERPORT;
1679 server.verbosity = REDIS_VERBOSE;
1680 server.maxidletime = REDIS_MAXIDLETIME;
1681 server.saveparams = NULL;
1682 server.logfile = NULL; /* NULL = log on standard output */
1683 server.bindaddr = NULL;
1684 server.glueoutputbuf = 1;
1685 server.daemonize = 0;
1686 server.appendonly = 0;
1687 server.appendfsync = APPENDFSYNC_EVERYSEC;
1688 server.lastfsync = time(NULL);
1689 server.appendfd = -1;
1690 server.appendseldb = -1; /* Make sure the first time will not match */
1691 server.pidfile = zstrdup("/var/run/redis.pid");
1692 server.dbfilename = zstrdup("dump.rdb");
1693 server.appendfilename = zstrdup("appendonly.aof");
1694 server.requirepass = NULL;
1695 server.rdbcompression = 1;
1696 server.activerehashing = 1;
1697 server.maxclients = 0;
1698 server.blpop_blocked_clients = 0;
1699 server.maxmemory = 0;
1700 server.vm_enabled = 0;
1701 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1702 server.vm_page_size = 256; /* 256 bytes per page */
1703 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1704 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1705 server.vm_max_threads = 4;
1706 server.vm_blocked_clients = 0;
1707 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1708 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1709 server.shutdown_asap = 0;
1710
1711 resetServerSaveParams();
1712
1713 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1714 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1715 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1716 /* Replication related */
1717 server.isslave = 0;
1718 server.masterauth = NULL;
1719 server.masterhost = NULL;
1720 server.masterport = 6379;
1721 server.master = NULL;
1722 server.replstate = REDIS_REPL_NONE;
1723
1724 /* Double constants initialization */
1725 R_Zero = 0.0;
1726 R_PosInf = 1.0/R_Zero;
1727 R_NegInf = -1.0/R_Zero;
1728 R_Nan = R_Zero/R_Zero;
1729 }
1730
1731 static void initServer() {
1732 int j;
1733
1734 signal(SIGHUP, SIG_IGN);
1735 signal(SIGPIPE, SIG_IGN);
1736 setupSigSegvAction();
1737
1738 server.devnull = fopen("/dev/null","w");
1739 if (server.devnull == NULL) {
1740 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1741 exit(1);
1742 }
1743 server.clients = listCreate();
1744 server.slaves = listCreate();
1745 server.monitors = listCreate();
1746 server.objfreelist = listCreate();
1747 createSharedObjects();
1748 server.el = aeCreateEventLoop();
1749 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1750 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1751 if (server.fd == -1) {
1752 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1753 exit(1);
1754 }
1755 for (j = 0; j < server.dbnum; j++) {
1756 server.db[j].dict = dictCreate(&dbDictType,NULL);
1757 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1758 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1759 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1760 if (server.vm_enabled)
1761 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1762 server.db[j].id = j;
1763 }
1764 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1765 server.pubsub_patterns = listCreate();
1766 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1767 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1768 server.cronloops = 0;
1769 server.bgsavechildpid = -1;
1770 server.bgrewritechildpid = -1;
1771 server.bgrewritebuf = sdsempty();
1772 server.aofbuf = sdsempty();
1773 server.lastsave = time(NULL);
1774 server.dirty = 0;
1775 server.stat_numcommands = 0;
1776 server.stat_numconnections = 0;
1777 server.stat_expiredkeys = 0;
1778 server.stat_starttime = time(NULL);
1779 server.unixtime = time(NULL);
1780 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1781 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1782 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1783
1784 if (server.appendonly) {
1785 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1786 if (server.appendfd == -1) {
1787 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1788 strerror(errno));
1789 exit(1);
1790 }
1791 }
1792
1793 if (server.vm_enabled) vmInit();
1794 }
1795
1796 /* Empty the whole database */
1797 static long long emptyDb() {
1798 int j;
1799 long long removed = 0;
1800
1801 for (j = 0; j < server.dbnum; j++) {
1802 removed += dictSize(server.db[j].dict);
1803 dictEmpty(server.db[j].dict);
1804 dictEmpty(server.db[j].expires);
1805 }
1806 return removed;
1807 }
1808
/* Convert a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1814
1815 /* I agree, this is a very rudimental way to load a configuration...
1816 will improve later if the config gets more complex */
1817 static void loadServerConfig(char *filename) {
1818 FILE *fp;
1819 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1820 int linenum = 0;
1821 sds line = NULL;
1822
1823 if (filename[0] == '-' && filename[1] == '\0')
1824 fp = stdin;
1825 else {
1826 if ((fp = fopen(filename,"r")) == NULL) {
1827 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1828 exit(1);
1829 }
1830 }
1831
1832 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1833 sds *argv;
1834 int argc, j;
1835
1836 linenum++;
1837 line = sdsnew(buf);
1838 line = sdstrim(line," \t\r\n");
1839
1840 /* Skip comments and blank lines*/
1841 if (line[0] == '#' || line[0] == '\0') {
1842 sdsfree(line);
1843 continue;
1844 }
1845
1846 /* Split into arguments */
1847 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1848 sdstolower(argv[0]);
1849
1850 /* Execute config directives */
1851 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1852 server.maxidletime = atoi(argv[1]);
1853 if (server.maxidletime < 0) {
1854 err = "Invalid timeout value"; goto loaderr;
1855 }
1856 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1857 server.port = atoi(argv[1]);
1858 if (server.port < 1 || server.port > 65535) {
1859 err = "Invalid port"; goto loaderr;
1860 }
1861 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1862 server.bindaddr = zstrdup(argv[1]);
1863 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1864 int seconds = atoi(argv[1]);
1865 int changes = atoi(argv[2]);
1866 if (seconds < 1 || changes < 0) {
1867 err = "Invalid save parameters"; goto loaderr;
1868 }
1869 appendServerSaveParams(seconds,changes);
1870 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1871 if (chdir(argv[1]) == -1) {
1872 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1873 argv[1], strerror(errno));
1874 exit(1);
1875 }
1876 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1877 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1878 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1879 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1880 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1881 else {
1882 err = "Invalid log level. Must be one of debug, notice, warning";
1883 goto loaderr;
1884 }
1885 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1886 FILE *logfp;
1887
1888 server.logfile = zstrdup(argv[1]);
1889 if (!strcasecmp(server.logfile,"stdout")) {
1890 zfree(server.logfile);
1891 server.logfile = NULL;
1892 }
1893 if (server.logfile) {
1894 /* Test if we are able to open the file. The server will not
1895 * be able to abort just for this problem later... */
1896 logfp = fopen(server.logfile,"a");
1897 if (logfp == NULL) {
1898 err = sdscatprintf(sdsempty(),
1899 "Can't open the log file: %s", strerror(errno));
1900 goto loaderr;
1901 }
1902 fclose(logfp);
1903 }
1904 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1905 server.dbnum = atoi(argv[1]);
1906 if (server.dbnum < 1) {
1907 err = "Invalid number of databases"; goto loaderr;
1908 }
1909 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1910 loadServerConfig(argv[1]);
1911 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1912 server.maxclients = atoi(argv[1]);
1913 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1914 server.maxmemory = memtoll(argv[1],NULL);
1915 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1916 server.masterhost = sdsnew(argv[1]);
1917 server.masterport = atoi(argv[2]);
1918 server.replstate = REDIS_REPL_CONNECT;
1919 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1920 server.masterauth = zstrdup(argv[1]);
1921 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1922 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1923 err = "argument must be 'yes' or 'no'"; goto loaderr;
1924 }
1925 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1926 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1927 err = "argument must be 'yes' or 'no'"; goto loaderr;
1928 }
1929 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1930 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1931 err = "argument must be 'yes' or 'no'"; goto loaderr;
1932 }
1933 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1934 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1935 err = "argument must be 'yes' or 'no'"; goto loaderr;
1936 }
1937 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1938 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1939 err = "argument must be 'yes' or 'no'"; goto loaderr;
1940 }
1941 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1942 zfree(server.appendfilename);
1943 server.appendfilename = zstrdup(argv[1]);
1944 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1945 if (!strcasecmp(argv[1],"no")) {
1946 server.appendfsync = APPENDFSYNC_NO;
1947 } else if (!strcasecmp(argv[1],"always")) {
1948 server.appendfsync = APPENDFSYNC_ALWAYS;
1949 } else if (!strcasecmp(argv[1],"everysec")) {
1950 server.appendfsync = APPENDFSYNC_EVERYSEC;
1951 } else {
1952 err = "argument must be 'no', 'always' or 'everysec'";
1953 goto loaderr;
1954 }
1955 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1956 server.requirepass = zstrdup(argv[1]);
1957 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1958 zfree(server.pidfile);
1959 server.pidfile = zstrdup(argv[1]);
1960 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1961 zfree(server.dbfilename);
1962 server.dbfilename = zstrdup(argv[1]);
1963 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1964 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1965 err = "argument must be 'yes' or 'no'"; goto loaderr;
1966 }
1967 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1968 zfree(server.vm_swap_file);
1969 server.vm_swap_file = zstrdup(argv[1]);
1970 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1971 server.vm_max_memory = memtoll(argv[1],NULL);
1972 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1973 server.vm_page_size = memtoll(argv[1], NULL);
1974 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1975 server.vm_pages = memtoll(argv[1], NULL);
1976 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1977 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1978 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1979 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1980 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1981 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1982 } else {
1983 err = "Bad directive or wrong number of arguments"; goto loaderr;
1984 }
1985 for (j = 0; j < argc; j++)
1986 sdsfree(argv[j]);
1987 zfree(argv);
1988 sdsfree(line);
1989 }
1990 if (fp != stdin) fclose(fp);
1991 return;
1992
1993 loaderr:
1994 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1995 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1996 fprintf(stderr, ">>> '%s'\n", line);
1997 fprintf(stderr, "%s\n", err);
1998 exit(1);
1999 }
2000
2001 static void freeClientArgv(redisClient *c) {
2002 int j;
2003
2004 for (j = 0; j < c->argc; j++)
2005 decrRefCount(c->argv[j]);
2006 for (j = 0; j < c->mbargc; j++)
2007 decrRefCount(c->mbargv[j]);
2008 c->argc = 0;
2009 c->mbargc = 0;
2010 }
2011
/* Destroy the client 'c': detach it from every server-side structure that
 * references it (blocking state, watched keys, Pub/Sub, event loop, client
 * lists, VM wait lists, replication lists), close its socket and release
 * its memory. After this call the pointer must not be used anymore. */
2012 static void freeClient(redisClient *c) {
2013 listNode *ln;
2014
2015 /* Note that if the client we are freeing is blocked into a blocking
2016 * call, we have to set querybuf to NULL *before* to call
2017 * unblockClientWaitingData() to avoid processInputBuffer() will get
2018 * called. Also it is important to remove the file events after
2019 * this, because this call adds the READABLE event. */
2020 sdsfree(c->querybuf);
2021 c->querybuf = NULL;
2022 if (c->flags & REDIS_BLOCKED)
2023 unblockClientWaitingData(c);
2024
2025 /* UNWATCH all the keys */
2026 unwatchAllKeys(c);
2027 listRelease(c->watched_keys);
2028 /* Unsubscribe from all the pubsub channels */
2029 pubsubUnsubscribeAllChannels(c,0);
2030 pubsubUnsubscribeAllPatterns(c,0);
2031 dictRelease(c->pubsub_channels);
2032 listRelease(c->pubsub_patterns);
2033 /* Obvious cleanup */
2034 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2035 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2036 listRelease(c->reply);
2037 freeClientArgv(c);
2038 close(c->fd);
2039 /* Remove from the list of clients */
2040 ln = listSearchKey(server.clients,c);
2041 redisAssert(ln != NULL);
2042 listDelNode(server.clients,ln);
2043 /* Remove from the list of clients that are now ready to be restarted
2044 * after waiting for swapped keys */
2045 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2046 ln = listSearchKey(server.io_ready_clients,c);
2047 if (ln) {
2048 listDelNode(server.io_ready_clients,ln);
2049 server.vm_blocked_clients--;
2050 }
2051 }
2052 /* Remove from the list of clients waiting for swapped keys */
/* NOTE(review): the loop terminates only if dontWaitForSwappedKey() removes
 * the key from c->io_keys -- presumably it does; confirm in its definition. */
2053 while (server.vm_enabled && listLength(c->io_keys)) {
2054 ln = listFirst(c->io_keys);
2055 dontWaitForSwappedKey(c,ln->value);
2056 }
2057 listRelease(c->io_keys);
2058 /* Master/slave cleanup */
2059 if (c->flags & REDIS_SLAVE) {
/* Close the file descriptor used to send the bulk data, if still open. */
2060 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2061 close(c->repldbfd);
/* Clients flagged REDIS_MONITOR are searched in the monitors list, all the
 * other REDIS_SLAVE clients in the slaves list. */
2062 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2063 ln = listSearchKey(l,c);
2064 redisAssert(ln != NULL);
2065 listDelNode(l,ln);
2066 }
/* If this client was our master, forget it and set the replication state
 * back to REDIS_REPL_CONNECT. */
2067 if (c->flags & REDIS_MASTER) {
2068 server.master = NULL;
2069 server.replstate = REDIS_REPL_CONNECT;
2070 }
2071 /* Release memory */
2072 zfree(c->argv);
2073 zfree(c->mbargv);
2074 freeClientMultiState(c);
2075 zfree(c);
2076 }
2077
2078 #define GLUEREPLY_UP_TO (1024)
2079 static void glueReplyBuffersIfNeeded(redisClient *c) {
2080 int copylen = 0;
2081 char buf[GLUEREPLY_UP_TO];
2082 listNode *ln;
2083 listIter li;
2084 robj *o;
2085
2086 listRewind(c->reply,&li);
2087 while((ln = listNext(&li))) {
2088 int objlen;
2089
2090 o = ln->value;
2091 objlen = sdslen(o->ptr);
2092 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2093 memcpy(buf+copylen,o->ptr,objlen);
2094 copylen += objlen;
2095 listDelNode(c->reply,ln);
2096 } else {
2097 if (copylen == 0) return;
2098 break;
2099 }
2100 }
2101 /* Now the output buffer is empty, add the new single element */
2102 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2103 listAddNodeHead(c->reply,o);
2104 }
2105
/* Event handler called when the client socket 'fd' is writable: sends the
 * objects queued in c->reply, starting from the head of the list.
 * c->sentlen is the number of bytes of the head object already written.
 * 'privdata' is the redisClient. When the reply list becomes empty the
 * writable event is removed. On a write error other than EAGAIN the client
 * is freed. */
2106 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2107 redisClient *c = privdata;
2108 int nwritten = 0, totwritten = 0, objlen;
2109 robj *o;
2110 REDIS_NOTUSED(el);
2111 REDIS_NOTUSED(mask);
2112
2113 /* Use writev() if we have enough buffers to send */
2114 if (!server.glueoutputbuf &&
2115 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2116 !(c->flags & REDIS_MASTER))
2117 {
2118 sendReplyToClientWritev(el, fd, privdata, mask);
2119 return;
2120 }
2121
2122 while(listLength(c->reply)) {
2123 if (server.glueoutputbuf && listLength(c->reply) > 1)
2124 glueReplyBuffersIfNeeded(c);
2125
2126 o = listNodeValue(listFirst(c->reply));
2127 objlen = sdslen(o->ptr);
2128
/* Empty objects are just dropped from the list. */
2129 if (objlen == 0) {
2130 listDelNode(c->reply,listFirst(c->reply));
2131 continue;
2132 }
2133
2134 if (c->flags & REDIS_MASTER) {
2135 /* Don't reply to a master */
/* The object is accounted as fully written without calling write(). */
2136 nwritten = objlen - c->sentlen;
2137 } else {
2138 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
/* Leave the loop on error (-1) or when nothing was written; the -1 case is
 * examined after the loop using errno. */
2139 if (nwritten <= 0) break;
2140 }
2141 c->sentlen += nwritten;
2142 totwritten += nwritten;
2143 /* If we fully sent the object on head go to the next one */
2144 if (c->sentlen == objlen) {
2145 listDelNode(c->reply,listFirst(c->reply));
2146 c->sentlen = 0;
2147 }
2148 /* Note that we avoid to send more than REDIS_MAX_WRITE_PER_EVENT
2149 * bytes, in a single threaded server it's a good idea to serve
2150 * other clients as well, even if a very large request comes from
2151 * super fast link that is always able to accept data (in real world
2152 * scenario think about 'KEYS *' against the loopback interface) */
2153 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2154 }
/* EAGAIN is not treated as an error; any other write error closes the
 * client. */
2155 if (nwritten == -1) {
2156 if (errno == EAGAIN) {
2157 nwritten = 0;
2158 } else {
2159 redisLog(REDIS_VERBOSE,
2160 "Error writing to client: %s", strerror(errno));
2161 freeClient(c);
2162 return;
2163 }
2164 }
2165 if (totwritten > 0) c->lastinteraction = time(NULL);
/* Nothing left to send: stop listening for writable events. */
2166 if (listLength(c->reply) == 0) {
2167 c->sentlen = 0;
2168 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2169 }
2170 }
2171
/* writev() based variant of sendReplyToClient(): gathers up to
 * REDIS_WRITEV_IOVEC_COUNT reply objects per system call, without planning
 * more than REDIS_MAX_WRITE_PER_EVENT bytes in total for this event.
 * c->sentlen is the number of bytes of the head object already written.
 * 'privdata' is the redisClient. On a write error other than EAGAIN the
 * client is freed. */
2172 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2173 {
2174 redisClient *c = privdata;
2175 int nwritten = 0, totwritten = 0, objlen, willwrite;
2176 robj *o;
2177 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2178 int offset, ion = 0;
2179 REDIS_NOTUSED(el);
2180 REDIS_NOTUSED(mask);
2181
2182 listNode *node;
2183 while (listLength(c->reply)) {
/* Only the first object can be partially sent already, so the offset is
 * applied to the first iovec entry and reset to zero for the others. */
2184 offset = c->sentlen;
2185 ion = 0;
2186 willwrite = 0;
2187
2188 /* fill-in the iov[] array */
2189 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2190 o = listNodeValue(node);
2191 objlen = sdslen(o->ptr);
2192
/* Stop collecting when adding this object would exceed the per-event
 * budget. */
2193 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2194 break;
2195
2196 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2197 break; /* no more iovecs */
2198
2199 iov[ion].iov_base = ((char*)o->ptr) + offset;
2200 iov[ion].iov_len = objlen - offset;
2201 willwrite += objlen - offset;
2202 offset = 0; /* just for the first item */
2203 ion++;
2204 }
2205
/* Nothing could be collected (budget exhausted or only empty objects):
 * stop for this event. */
2206 if(willwrite == 0)
2207 break;
2208
2209 /* write all collected blocks at once */
2210 if((nwritten = writev(fd, iov, ion)) < 0) {
2211 if (errno != EAGAIN) {
2212 redisLog(REDIS_VERBOSE,
2213 "Error writing to client: %s", strerror(errno));
2214 freeClient(c);
2215 return;
2216 }
2217 break;
2218 }
2219
2220 totwritten += nwritten;
2221 offset = c->sentlen;
2222
2223 /* remove written robjs from c->reply */
/* nwritten is consumed object by object: fully written objects are deleted,
 * a partially written one just advances c->sentlen. */
2224 while (nwritten && listLength(c->reply)) {
2225 o = listNodeValue(listFirst(c->reply));
2226 objlen = sdslen(o->ptr);
2227
2228 if(nwritten >= objlen - offset) {
2229 listDelNode(c->reply, listFirst(c->reply));
2230 nwritten -= objlen - offset;
2231 c->sentlen = 0;
2232 } else {
2233 /* partial write */
2234 c->sentlen += nwritten;
2235 break;
2236 }
2237 offset = 0;
2238 }
2239 }
2240
2241 if (totwritten > 0)
2242 c->lastinteraction = time(NULL);
2243
/* Nothing left to send: stop listening for writable events. */
2244 if (listLength(c->reply) == 0) {
2245 c->sentlen = 0;
2246 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2247 }
2248 }
2249
2250 static struct redisCommand *lookupCommand(char *name) {
2251 int j = 0;
2252 while(cmdTable[j].name != NULL) {
2253 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2254 j++;
2255 }
2256 return NULL;
2257 }
2258
2259 /* resetClient prepare the client to process the next command */
2260 static void resetClient(redisClient *c) {
2261 freeClientArgv(c);
2262 c->bulklen = -1;
2263 c->multibulk = 0;
2264 }
2265
2266 /* Call() is the core of Redis execution of a command */
2267 static void call(redisClient *c, struct redisCommand *cmd) {
2268 long long dirty;
2269
2270 dirty = server.dirty;
2271 cmd->proc(c);
2272 dirty = server.dirty-dirty;
2273
2274 if (server.appendonly && dirty)
2275 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2276 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2277 listLength(server.slaves))
2278 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2279 if (listLength(server.monitors))
2280 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2281 server.stat_numcommands++;
2282 }
2283
2284 /* If this function gets called we already read a whole
2285 * command, arguments are in the client argv/argc fields.
2286 * processCommand() executes the command or prepares the
2287 * server for a bulk read from the client.
2288 *
2289 * If 1 is returned the client is still alive and valid and
2290 * other operations can be performed by the caller. Otherwise
2291 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2292 static int processCommand(redisClient *c) {
2293 struct redisCommand *cmd;
2294
2295 /* Free some memory if needed (maxmemory setting) */
2296 if (server.maxmemory) freeMemoryIfNeeded();
2297
2298 /* Handle the multi bulk command type. This is an alternative protocol
2299 * supported by Redis in order to receive commands that are composed of
2300 * multiple binary-safe "bulk" arguments. The latency of processing is
2301 * a bit higher but this allows things like multi-sets, so if this
2302 * protocol is used only for MSET and similar commands this is a big win. */
/* First step: a single argument starting with '*' carries the number of
 * bulk arguments that will follow. */
2303 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2304 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2305 if (c->multibulk <= 0) {
2306 resetClient(c);
2307 return 1;
2308 } else {
2309 decrRefCount(c->argv[c->argc-1]);
2310 c->argc--;
2311 return 1;
2312 }
2313 } else if (c->multibulk) {
/* bulklen == -1 means we are waiting for the "$<len>" line announcing the
 * length of the next argument, otherwise argv[0] is the argument itself. */
2314 if (c->bulklen == -1) {
2315 if (((char*)c->argv[0]->ptr)[0] != '$') {
2316 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2317 resetClient(c);
2318 return 1;
2319 } else {
2320 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2321 decrRefCount(c->argv[0]);
2322 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2323 c->argc--;
2324 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2325 resetClient(c);
2326 return 1;
2327 }
2328 c->argc--;
2329 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2330 return 1;
2331 }
2332 } else {
/* Move the argument just read at the end of the multi bulk vector. */
2333 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2334 c->mbargv[c->mbargc] = c->argv[0];
2335 c->mbargc++;
2336 c->argc--;
2337 c->multibulk--;
2338 if (c->multibulk == 0) {
2339 robj **auxargv;
2340 int auxargc;
2341
2342 /* Here we need to swap the multi-bulk argc/argv with the
2343 * normal argc/argv of the client structure. */
2344 auxargv = c->argv;
2345 c->argv = c->mbargv;
2346 c->mbargv = auxargv;
2347
2348 auxargc = c->argc;
2349 c->argc = c->mbargc;
2350 c->mbargc = auxargc;
2351
2352 /* We need to set bulklen to something different than -1
2353 * in order for the code below to process the command without
2354 * to try to read the last argument of a bulk command as
2355 * a special argument. */
2356 c->bulklen = 0;
2357 /* continue below and process the command */
2358 } else {
2359 c->bulklen = -1;
2360 return 1;
2361 }
2362 }
2363 }
2364 /* -- end of multi bulk commands processing -- */
2365
2366 /* The QUIT command is handled as a special case. Normal command
2367 * procs are unable to close the client connection safely */
2368 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2369 freeClient(c);
2370 return 0;
2371 }
2372
2373 /* Now lookup the command and check ASAP about trivial error conditions
2374 * such wrong arity, bad command name and so forth. */
2375 cmd = lookupCommand(c->argv[0]->ptr);
2376 if (!cmd) {
2377 addReplySds(c,
2378 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2379 (char*)c->argv[0]->ptr));
2380 resetClient(c);
2381 return 1;
/* A positive arity is the exact number of arguments required, a negative
 * one means that at least -arity arguments are required. */
2382 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2383 (c->argc < -cmd->arity)) {
2384 addReplySds(c,
2385 sdscatprintf(sdsempty(),
2386 "-ERR wrong number of arguments for '%s' command\r\n",
2387 cmd->name));
2388 resetClient(c);
2389 return 1;
2390 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2391 /* This is a bulk command, we have to read the last argument yet. */
/* The last argument received so far is the length of the bulk payload. */
2392 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2393
2394 decrRefCount(c->argv[c->argc-1]);
2395 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2396 c->argc--;
2397 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2398 resetClient(c);
2399 return 1;
2400 }
2401 c->argc--;
2402 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2403 /* It is possible that the bulk read is already in the
2404 * buffer. Check this condition and handle it accordingly.
2405 * This is just a fast path, alternative to call processInputBuffer().
2406 * It's a good idea since the code is small and this condition
2407 * happens most of the times. */
2408 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2409 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2410 c->argc++;
2411 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2412 } else {
2413 /* Otherwise return... there is to read the last argument
2414 * from the socket. */
2415 return 1;
2416 }
2417 }
2418 /* Let's try to encode the bulk object to save space. */
2419 if (cmd->flags & REDIS_CMD_BULK)
2420 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2421
2422 /* Check if the user is authenticated */
2423 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2424 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2425 resetClient(c);
2426 return 1;
2427 }
2428
2429 /* Handle the maxmemory directive */
2430 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2431 zmalloc_used_memory() > server.maxmemory)
2432 {
2433 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2434 resetClient(c);
2435 return 1;
2436 }
2437
2438 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2439 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2440 &&
2441 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2442 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2443 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2444 resetClient(c);
2445 return 1;
2446 }
2447
2448 /* Exec the command */
/* Inside a MULTI context every command except EXEC, DISCARD, MULTI and
 * WATCH is queued instead of being executed. */
2449 if (c->flags & REDIS_MULTI &&
2450 cmd->proc != execCommand && cmd->proc != discardCommand &&
2451 cmd->proc != multiCommand && cmd->proc != watchCommand)
2452 {
2453 queueMultiCommand(c,cmd);
2454 addReply(c,shared.queued);
2455 } else {
/* With threaded VM, if the client gets blocked on swapped keys we return
 * without calling resetClient(), so the arguments are left in place. */
2456 if (server.vm_enabled && server.vm_max_threads > 0 &&
2457 blockClientOnSwappedKeys(c,cmd)) return 1;
2458 call(c,cmd);
2459 }
2460
2461 /* Prepare the client for the next command */
2462 resetClient(c);
2463 return 1;
2464 }
2465
/* Queue the command argv/argc, executed against the database 'dictid', in
 * the output list of every client in 'slaves', encoded using the multi
 * bulk protocol. A SELECT command is queued first for the slaves whose
 * currently selected database (slaveseldb) differs from 'dictid'. */
2466 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2467 listNode *ln;
2468 listIter li;
2469 int outc = 0, j;
2470 robj **outv;
2471 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2472 * and we need 1 object for the first "*<count>\r\n" multibulk count, then
2473 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2474 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2475 robj *lenobj;
2476
/* Use the array on the stack when it is big enough, otherwise allocate. */
2477 if (argc <= REDIS_STATIC_ARGS) {
2478 outv = static_outv;
2479 } else {
2480 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2481 }
2482
/* The objects created here get their refcount forced to 0, so that the
 * increment / decrement pair below is what decides whether they survive. */
2483 lenobj = createObject(REDIS_STRING,
2484 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2485 lenobj->refcount = 0;
2486 outv[outc++] = lenobj;
2487 for (j = 0; j < argc; j++) {
2488 lenobj = createObject(REDIS_STRING,
2489 sdscatprintf(sdsempty(),"$%lu\r\n",
2490 (unsigned long) stringObjectLen(argv[j])));
2491 lenobj->refcount = 0;
2492 outv[outc++] = lenobj;
2493 outv[outc++] = argv[j];
2494 outv[outc++] = shared.crlf;
2495 }
2496
2497 /* Increment all the refcounts at start and decrement at end in order to
2498 * be sure to free objects if there is no slave in a replication state
2499 * able to be feed with commands */
2500 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2501 listRewind(slaves,&li);
2502 while((ln = listNext(&li))) {
2503 redisClient *slave = ln->value;
2504
2505 /* Don't feed slaves that are still waiting for BGSAVE to start */
2506 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2507
2508 /* Feed all the other slaves, MONITORs and so on */
2509 if (slave->slaveseldb != dictid) {
2510 robj *selectcmd;
2511
/* Pre-built shared objects exist for databases 0-9, for the other ones
 * the SELECT command is created on the fly. */
2512 switch(dictid) {
2513 case 0: selectcmd = shared.select0; break;
2514 case 1: selectcmd = shared.select1; break;
2515 case 2: selectcmd = shared.select2; break;
2516 case 3: selectcmd = shared.select3; break;
2517 case 4: selectcmd = shared.select4; break;
2518 case 5: selectcmd = shared.select5; break;
2519 case 6: selectcmd = shared.select6; break;
2520 case 7: selectcmd = shared.select7; break;
2521 case 8: selectcmd = shared.select8; break;
2522 case 9: selectcmd = shared.select9; break;
2523 default:
2524 selectcmd = createObject(REDIS_STRING,
2525 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
/* NOTE(review): refcount 0 presumably relies on addReply() taking its own
 * reference to the object -- confirm in addReply(). */
2526 selectcmd->refcount = 0;
2527 break;
2528 }
2529 addReply(slave,selectcmd);
2530 slave->slaveseldb = dictid;
2531 }
2532 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2533 }
2534 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2535 if (outv != static_outv) zfree(outv);
2536 }
2537
2538 static sds sdscatrepr(sds s, char *p, size_t len) {
2539 s = sdscatlen(s,"\"",1);
2540 while(len--) {
2541 switch(*p) {
2542 case '\\':
2543 case '"':
2544 s = sdscatprintf(s,"\\%c",*p);
2545 break;
2546 case '\n': s = sdscatlen(s,"\\n",1); break;
2547 case '\r': s = sdscatlen(s,"\\r",1); break;
2548 case '\t': s = sdscatlen(s,"\\t",1); break;
2549 case '\a': s = sdscatlen(s,"\\a",1); break;
2550 case '\b': s = sdscatlen(s,"\\b",1); break;
2551 default:
2552 if (isprint(*p))
2553 s = sdscatprintf(s,"%c",*p);
2554 else
2555 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2556 break;
2557 }
2558 p++;
2559 }
2560 return sdscatlen(s,"\"",1);
2561 }
2562
2563 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2564 listNode *ln;
2565 listIter li;
2566 int j;
2567 sds cmdrepr = sdsnew("+");
2568 robj *cmdobj;
2569 struct timeval tv;
2570
2571 gettimeofday(&tv,NULL);
2572 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2573 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2574
2575 for (j = 0; j < argc; j++) {
2576 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2577 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2578 } else {
2579 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2580 sdslen(argv[j]->ptr));
2581 }
2582 if (j != argc-1)
2583 cmdrepr = sdscatlen(cmdrepr," ",1);
2584 }
2585 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2586 cmdobj = createObject(REDIS_STRING,cmdrepr);
2587
2588 listRewind(monitors,&li);
2589 while((ln = listNext(&li))) {
2590 redisClient *monitor = ln->value;
2591 addReply(monitor,cmdobj);
2592 }
2593 decrRefCount(cmdobj);
2594 }
2595
2596 static void processInputBuffer(redisClient *c) {
2597 again:
2598 /* Before to process the input buffer, make sure the client is not
2599 * waitig for a blocking operation such as BLPOP. Note that the first
2600 * iteration the client is never blocked, otherwise the processInputBuffer
2601 * would not be called at all, but after the execution of the first commands
2602 * in the input buffer the client may be blocked, and the "goto again"
2603 * will try to reiterate. The following line will make it return asap. */
2604 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2605 if (c->bulklen == -1) {
2606 /* Read the first line of the query */
2607 char *p = strchr(c->querybuf,'\n');
2608 size_t querylen;
2609
2610 if (p) {
2611 sds query, *argv;
2612 int argc, j;
2613
2614 query = c->querybuf;
2615 c->querybuf = sdsempty();
2616 querylen = 1+(p-(query));
2617 if (sdslen(query) > querylen) {
2618 /* leave data after the first line of the query in the buffer */
2619 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2620 }
2621 *p = '\0'; /* remove "\n" */
2622 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2623 sdsupdatelen(query);
2624
2625 /* Now we can split the query in arguments */
2626 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2627 sdsfree(query);
2628
2629 if (c->argv) zfree(c->argv);
2630 c->argv = zmalloc(sizeof(robj*)*argc);
2631
2632 for (j = 0; j < argc; j++) {
2633 if (sdslen(argv[j])) {
2634 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2635 c->argc++;
2636 } else {
2637 sdsfree(argv[j]);
2638 }
2639 }
2640 zfree(argv);
2641 if (c->argc) {
2642 /* Execute the command. If the client is still valid
2643 * after processCommand() return and there is something
2644 * on the query buffer try to process the next command. */
2645 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2646 } else {
2647 /* Nothing to process, argc == 0. Just process the query
2648 * buffer if it's not empty or return to the caller */
2649 if (sdslen(c->querybuf)) goto again;
2650 }
2651 return;
2652 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2653 redisLog(REDIS_VERBOSE, "Client protocol error");
2654 freeClient(c);
2655 return;
2656 }
2657 } else {
2658 /* Bulk read handling. Note that if we are at this point
2659 the client already sent a command terminated with a newline,
2660 we are reading the bulk data that is actually the last
2661 argument of the command. */
2662 int qbl = sdslen(c->querybuf);
2663
2664 if (c->bulklen <= qbl) {
2665 /* Copy everything but the final CRLF as final argument */
2666 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2667 c->argc++;
2668 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2669 /* Process the command. If the client is still valid after
2670 * the processing and there is more data in the buffer
2671 * try to parse it. */
2672 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2673 return;
2674 }
2675 }
2676 }
2677
2678 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2679 redisClient *c = (redisClient*) privdata;
2680 char buf[REDIS_IOBUF_LEN];
2681 int nread;
2682 REDIS_NOTUSED(el);
2683 REDIS_NOTUSED(mask);
2684
2685 nread = read(fd, buf, REDIS_IOBUF_LEN);
2686 if (nread == -1) {
2687 if (errno == EAGAIN) {
2688 nread = 0;
2689 } else {
2690 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2691 freeClient(c);
2692 return;
2693 }
2694 } else if (nread == 0) {
2695 redisLog(REDIS_VERBOSE, "Client closed connection");
2696 freeClient(c);
2697 return;
2698 }
2699 if (nread) {
2700 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2701 c->lastinteraction = time(NULL);
2702 } else {
2703 return;
2704 }
2705 processInputBuffer(c);
2706 }
2707
2708 static int selectDb(redisClient *c, int id) {
2709 if (id < 0 || id >= server.dbnum)
2710 return REDIS_ERR;
2711 c->db = &server.db[id];
2712 return REDIS_OK;
2713 }
2714
2715 static void *dupClientReplyValue(void *o) {
2716 incrRefCount((robj*)o);
2717 return o;
2718 }
2719
2720 static int listMatchObjects(void *a, void *b) {
2721 return equalStringObjects(a,b);
2722 }
2723
2724 static redisClient *createClient(int fd) {
2725 redisClient *c = zmalloc(sizeof(*c));
2726
2727 anetNonBlock(NULL,fd);
2728 anetTcpNoDelay(NULL,fd);
2729 if (!c) return NULL;
2730 selectDb(c,0);
2731 c->fd = fd;
2732 c->querybuf = sdsempty();
2733 c->argc = 0;
2734 c->argv = NULL;
2735 c->bulklen = -1;
2736 c->multibulk = 0;
2737 c->mbargc = 0;
2738 c->mbargv = NULL;
2739 c->sentlen = 0;
2740 c->flags = 0;
2741 c->lastinteraction = time(NULL);
2742 c->authenticated = 0;
2743 c->replstate = REDIS_REPL_NONE;
2744 c->reply = listCreate();
2745 listSetFreeMethod(c->reply,decrRefCount);
2746 listSetDupMethod(c->reply,dupClientReplyValue);
2747 c->blocking_keys = NULL;
2748 c->blocking_keys_num = 0;
2749 c->io_keys = listCreate();
2750 c->watched_keys = listCreate();
2751 listSetFreeMethod(c->io_keys,decrRefCount);
2752 c->pubsub_channels = dictCreate(&setDictType,NULL);
2753 c->pubsub_patterns = listCreate();
2754 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2755 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2756 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2757 readQueryFromClient, c) == AE_ERR) {
2758 freeClient(c);
2759 return NULL;
2760 }
2761 listAddNodeTail(server.clients,c);
2762 initClientMultiState(c);
2763 return c;
2764 }
2765
2766 static void addReply(redisClient *c, robj *obj) {
2767 if (listLength(c->reply) == 0 &&
2768 (c->replstate == REDIS_REPL_NONE ||
2769 c->replstate == REDIS_REPL_ONLINE) &&
2770 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2771 sendReplyToClient, c) == AE_ERR) return;
2772
2773 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2774 obj = dupStringObject(obj);
2775 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2776 }
2777 listAddNodeTail(c->reply,getDecodedObject(obj));
2778 }
2779
2780 static void addReplySds(redisClient *c, sds s) {
2781 robj *o = createObject(REDIS_STRING,s);
2782 addReply(c,o);
2783 decrRefCount(o);
2784 }
2785
2786 static void addReplyDouble(redisClient *c, double d) {
2787 char buf[128];
2788
2789 snprintf(buf,sizeof(buf),"%.17g",d);
2790 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2791 (unsigned long) strlen(buf),buf));
2792 }
2793
2794 static void addReplyLongLong(redisClient *c, long long ll) {
2795 char buf[128];
2796 size_t len;
2797
2798 if (ll == 0) {
2799 addReply(c,shared.czero);
2800 return;
2801 } else if (ll == 1) {
2802 addReply(c,shared.cone);
2803 return;
2804 }
2805 buf[0] = ':';
2806 len = ll2string(buf+1,sizeof(buf)-1,ll);
2807 buf[len+1] = '\r';
2808 buf[len+2] = '\n';
2809 addReplySds(c,sdsnewlen(buf,len+3));
2810 }
2811
2812 static void addReplyUlong(redisClient *c, unsigned long ul) {
2813 char buf[128];
2814 size_t len;
2815
2816 if (ul == 0) {
2817 addReply(c,shared.czero);
2818 return;
2819 } else if (ul == 1) {
2820 addReply(c,shared.cone);
2821 return;
2822 }
2823 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2824 addReplySds(c,sdsnewlen(buf,len));
2825 }
2826
2827 static void addReplyBulkLen(redisClient *c, robj *obj) {
2828 size_t len, intlen;
2829 char buf[128];
2830
2831 if (obj->encoding == REDIS_ENCODING_RAW) {
2832 len = sdslen(obj->ptr);
2833 } else {
2834 long n = (long)obj->ptr;
2835
2836 /* Compute how many bytes will take this integer as a radix 10 string */
2837 len = 1;
2838 if (n < 0) {
2839 len++;
2840 n = -n;
2841 }
2842 while((n = n/10) != 0) {
2843 len++;
2844 }
2845 }
2846 buf[0] = '$';
2847 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2848 buf[intlen+1] = '\r';
2849 buf[intlen+2] = '\n';
2850 addReplySds(c,sdsnewlen(buf,intlen+3));
2851 }
2852
2853 static void addReplyBulk(redisClient *c, robj *obj) {
2854 addReplyBulkLen(c,obj);
2855 addReply(c,obj);
2856 addReply(c,shared.crlf);
2857 }
2858
2859 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2860 static void addReplyBulkCString(redisClient *c, char *s) {
2861 if (s == NULL) {
2862 addReply(c,shared.nullbulk);
2863 } else {
2864 robj *o = createStringObject(s,strlen(s));
2865 addReplyBulk(c,o);
2866 decrRefCount(o);
2867 }
2868 }
2869
2870 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2871 int cport, cfd;
2872 char cip[128];
2873 redisClient *c;
2874 REDIS_NOTUSED(el);
2875 REDIS_NOTUSED(mask);
2876 REDIS_NOTUSED(privdata);
2877
2878 cfd = anetAccept(server.neterr, fd, cip, &cport);
2879 if (cfd == AE_ERR) {
2880 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2881 return;
2882 }
2883 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2884 if ((c = createClient(cfd)) == NULL) {
2885 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2886 close(cfd); /* May be already closed, just ingore errors */
2887 return;
2888 }
2889 /* If maxclient directive is set and this is one client more... close the
2890 * connection. Note that we create the client instead to check before
2891 * for this condition, since now the socket is already set in nonblocking
2892 * mode and we can send an error for free using the Kernel I/O */
2893 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2894 char *err = "-ERR max number of clients reached\r\n";
2895
2896 /* That's a best effort error message, don't check write errors */
2897 if (write(c->fd,err,strlen(err)) == -1) {
2898 /* Nothing to do, Just to avoid the warning... */
2899 }
2900 freeClient(c);
2901 return;
2902 }
2903 server.stat_numconnections++;
2904 }
2905
2906 /* ======================= Redis objects implementation ===================== */
2907
2908 static robj *createObject(int type, void *ptr) {
2909 robj *o;
2910
2911 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2912 if (listLength(server.objfreelist)) {
2913 listNode *head = listFirst(server.objfreelist);
2914 o = listNodeValue(head);
2915 listDelNode(server.objfreelist,head);
2916 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2917 } else {
2918 if (server.vm_enabled) {
2919 pthread_mutex_unlock(&server.obj_freelist_mutex);
2920 o = zmalloc(sizeof(*o));
2921 } else {
2922 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2923 }
2924 }
2925 o->type = type;
2926 o->encoding = REDIS_ENCODING_RAW;
2927 o->ptr = ptr;
2928 o->refcount = 1;
2929 if (server.vm_enabled) {
2930 /* Note that this code may run in the context of an I/O thread
2931 * and accessing to server.unixtime in theory is an error
2932 * (no locks). But in practice this is safe, and even if we read
2933 * garbage Redis will not fail, as it's just a statistical info */
2934 o->vm.atime = server.unixtime;
2935 o->storage = REDIS_VM_MEMORY;
2936 }
2937 return o;
2938 }
2939
2940 static robj *createStringObject(char *ptr, size_t len) {
2941 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2942 }
2943
2944 static robj *createStringObjectFromLongLong(long long value) {
2945 robj *o;
2946 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2947 incrRefCount(shared.integers[value]);
2948 o = shared.integers[value];
2949 } else {
2950 if (value >= LONG_MIN && value <= LONG_MAX) {
2951 o = createObject(REDIS_STRING, NULL);
2952 o->encoding = REDIS_ENCODING_INT;
2953 o->ptr = (void*)((long)value);
2954 } else {
2955 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2956 }
2957 }
2958 return o;
2959 }
2960
2961 static robj *dupStringObject(robj *o) {
2962 assert(o->encoding == REDIS_ENCODING_RAW);
2963 return createStringObject(o->ptr,sdslen(o->ptr));
2964 }
2965
2966 static robj *createListObject(void) {
2967 list *l = listCreate();
2968
2969 listSetFreeMethod(l,decrRefCount);
2970 return createObject(REDIS_LIST,l);
2971 }
2972
2973 static robj *createSetObject(void) {
2974 dict *d = dictCreate(&setDictType,NULL);
2975 return createObject(REDIS_SET,d);
2976 }
2977
2978 static robj *createHashObject(void) {
2979 /* All the Hashes start as zipmaps. Will be automatically converted
2980 * into hash tables if there are enough elements or big elements
2981 * inside. */
2982 unsigned char *zm = zipmapNew();
2983 robj *o = createObject(REDIS_HASH,zm);
2984 o->encoding = REDIS_ENCODING_ZIPMAP;
2985 return o;
2986 }
2987
2988 static robj *createZsetObject(void) {
2989 zset *zs = zmalloc(sizeof(*zs));
2990
2991 zs->dict = dictCreate(&zsetDictType,NULL);
2992 zs->zsl = zslCreate();
2993 return createObject(REDIS_ZSET,zs);
2994 }
2995
2996 static void freeStringObject(robj *o) {
2997 if (o->encoding == REDIS_ENCODING_RAW) {
2998 sdsfree(o->ptr);
2999 }
3000 }
3001
3002 static void freeListObject(robj *o) {
3003 listRelease((list*) o->ptr);
3004 }
3005
3006 static void freeSetObject(robj *o) {
3007 dictRelease((dict*) o->ptr);
3008 }
3009
3010 static void freeZsetObject(robj *o) {
3011 zset *zs = o->ptr;
3012
3013 dictRelease(zs->dict);
3014 zslFree(zs->zsl);
3015 zfree(zs);
3016 }
3017
3018 static void freeHashObject(robj *o) {
3019 switch (o->encoding) {
3020 case REDIS_ENCODING_HT:
3021 dictRelease((dict*) o->ptr);
3022 break;
3023 case REDIS_ENCODING_ZIPMAP:
3024 zfree(o->ptr);
3025 break;
3026 default:
3027 redisPanic("Unknown hash encoding type");
3028 break;
3029 }
3030 }
3031
3032 static void incrRefCount(robj *o) {
3033 o->refcount++;
3034 }
3035
3036 static void decrRefCount(void *obj) {
3037 robj *o = obj;
3038
3039 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3040 /* Object is a key of a swapped out value, or in the process of being
3041 * loaded. */
3042 if (server.vm_enabled &&
3043 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3044 {
3045 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3046 redisAssert(o->type == REDIS_STRING);
3047 freeStringObject(o);
3048 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3049 pthread_mutex_lock(&server.obj_freelist_mutex);
3050 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3051 !listAddNodeHead(server.objfreelist,o))
3052 zfree(o);
3053 pthread_mutex_unlock(&server.obj_freelist_mutex);
3054 server.vm_stats_swapped_objects--;
3055 return;
3056 }
3057 /* Object is in memory, or in the process of being swapped out. */
3058 if (--(o->refcount) == 0) {
3059 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3060 vmCancelThreadedIOJob(obj);
3061 switch(o->type) {
3062 case REDIS_STRING: freeStringObject(o); break;
3063 case REDIS_LIST: freeListObject(o); break;
3064 case REDIS_SET: freeSetObject(o); break;
3065 case REDIS_ZSET: freeZsetObject(o); break;
3066 case REDIS_HASH: freeHashObject(o); break;
3067 default: redisPanic("Unknown object type"); break;
3068 }
3069 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3070 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3071 !listAddNodeHead(server.objfreelist,o))
3072 zfree(o);
3073 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3074 }
3075 }
3076
3077 static robj *lookupKey(redisDb *db, robj *key) {
3078 dictEntry *de = dictFind(db->dict,key);
3079 if (de) {
3080 robj *key = dictGetEntryKey(de);
3081 robj *val = dictGetEntryVal(de);
3082
3083 if (server.vm_enabled) {
3084 if (key->storage == REDIS_VM_MEMORY ||
3085 key->storage == REDIS_VM_SWAPPING)
3086 {
3087 /* If we were swapping the object out, stop it, this key
3088 * was requested. */
3089 if (key->storage == REDIS_VM_SWAPPING)
3090 vmCancelThreadedIOJob(key);
3091 /* Update the access time of the key for the aging algorithm. */
3092 key->vm.atime = server.unixtime;
3093 } else {
3094 int notify = (key->storage == REDIS_VM_LOADING);
3095
3096 /* Our value was swapped on disk. Bring it at home. */
3097 redisAssert(val == NULL);
3098 val = vmLoadObject(key);
3099 dictGetEntryVal(de) = val;
3100
3101 /* Clients blocked by the VM subsystem may be waiting for
3102 * this key... */
3103 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3104 }
3105 }
3106 return val;
3107 } else {
3108 return NULL;
3109 }
3110 }
3111
3112 static robj *lookupKeyRead(redisDb *db, robj *key) {
3113 expireIfNeeded(db,key);
3114 return lookupKey(db,key);
3115 }
3116
3117 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3118 deleteIfVolatile(db,key);
3119 touchWatchedKey(db,key);
3120 return lookupKey(db,key);
3121 }
3122
3123 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3124 robj *o = lookupKeyRead(c->db, key);
3125 if (!o) addReply(c,reply);
3126 return o;
3127 }
3128
3129 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3130 robj *o = lookupKeyWrite(c->db, key);
3131 if (!o) addReply(c,reply);
3132 return o;
3133 }
3134
3135 static int checkType(redisClient *c, robj *o, int type) {
3136 if (o->type != type) {
3137 addReply(c,shared.wrongtypeerr);
3138 return 1;
3139 }
3140 return 0;
3141 }
3142
3143 static int deleteKey(redisDb *db, robj *key) {
3144 int retval;
3145
3146 /* We need to protect key from destruction: after the first dictDelete()
3147 * it may happen that 'key' is no longer valid if we don't increment
3148 * it's count. This may happen when we get the object reference directly
3149 * from the hash table with dictRandomKey() or dict iterators */
3150 incrRefCount(key);
3151 if (dictSize(db->expires)) dictDelete(db->expires,key);
3152 retval = dictDelete(db->dict,key);
3153 decrRefCount(key);
3154
3155 return retval == DICT_OK;
3156 }
3157
3158 /* Check if the nul-terminated string 's' can be represented by a long
3159 * (that is, is a number that fits into long without any other space or
3160 * character before or after the digits).
3161 *
3162 * If so, the function returns REDIS_OK and *longval is set to the value
3163 * of the number. Otherwise REDIS_ERR is returned */
3164 static int isStringRepresentableAsLong(sds s, long *longval) {
3165 char buf[32], *endptr;
3166 long value;
3167 int slen;
3168
3169 value = strtol(s, &endptr, 10);
3170 if (endptr[0] != '\0') return REDIS_ERR;
3171 slen = ll2string(buf,32,value);
3172
3173 /* If the number converted back into a string is not identical
3174 * then it's not possible to encode the string as integer */
3175 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3176 if (longval) *longval = value;
3177 return REDIS_OK;
3178 }
3179
3180 /* Try to encode a string object in order to save space */
3181 static robj *tryObjectEncoding(robj *o) {
3182 long value;
3183 sds s = o->ptr;
3184
3185 if (o->encoding != REDIS_ENCODING_RAW)
3186 return o; /* Already encoded */
3187
3188 /* It's not safe to encode shared objects: shared objects can be shared
3189 * everywhere in the "object space" of Redis. Encoded objects can only
3190 * appear as "values" (and not, for instance, as keys) */
3191 if (o->refcount > 1) return o;
3192
3193 /* Currently we try to encode only strings */
3194 redisAssert(o->type == REDIS_STRING);
3195
3196 /* Check if we can represent this string as a long integer */
3197 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3198
3199 /* Ok, this object can be encoded */
3200 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3201 decrRefCount(o);
3202 incrRefCount(shared.integers[value]);
3203 return shared.integers[value];
3204 } else {
3205 o->encoding = REDIS_ENCODING_INT;
3206 sdsfree(o->ptr);
3207 o->ptr = (void*) value;
3208 return o;
3209 }
3210 }
3211
3212 /* Get a decoded version of an encoded object (returned as a new object).
3213 * If the object is already raw-encoded just increment the ref count. */
3214 static robj *getDecodedObject(robj *o) {
3215 robj *dec;
3216
3217 if (o->encoding == REDIS_ENCODING_RAW) {
3218 incrRefCount(o);
3219 return o;
3220 }
3221 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3222 char buf[32];
3223
3224 ll2string(buf,32,(long)o->ptr);
3225 dec = createStringObject(buf,strlen(buf));
3226 return dec;
3227 } else {
3228 redisPanic("Unknown encoding type");
3229 }
3230 }
3231
3232 /* Compare two string objects via strcmp() or alike.
3233 * Note that the objects may be integer-encoded. In such a case we
3234 * use ll2string() to get a string representation of the numbers on the stack
3235 * and compare the strings, it's much faster than calling getDecodedObject().
3236 *
3237 * Important note: if objects are not integer encoded, but binary-safe strings,
3238 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3239 * binary safe. */
3240 static int compareStringObjects(robj *a, robj *b) {
3241 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3242 char bufa[128], bufb[128], *astr, *bstr;
3243 int bothsds = 1;
3244
3245 if (a == b) return 0;
3246 if (a->encoding != REDIS_ENCODING_RAW) {
3247 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3248 astr = bufa;
3249 bothsds = 0;
3250 } else {
3251 astr = a->ptr;
3252 }
3253 if (b->encoding != REDIS_ENCODING_RAW) {
3254 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3255 bstr = bufb;
3256 bothsds = 0;
3257 } else {
3258 bstr = b->ptr;
3259 }
3260 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3261 }
3262
3263 /* Equal string objects return 1 if the two objects are the same from the
3264 * point of view of a string comparison, otherwise 0 is returned. Note that
3265 * this function is faster then checking for (compareStringObject(a,b) == 0)
3266 * because it can perform some more optimization. */
3267 static int equalStringObjects(robj *a, robj *b) {
3268 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3269 return a->ptr == b->ptr;
3270 } else {
3271 return compareStringObjects(a,b) == 0;
3272 }
3273 }
3274
3275 static size_t stringObjectLen(robj *o) {
3276 redisAssert(o->type == REDIS_STRING);
3277 if (o->encoding == REDIS_ENCODING_RAW) {
3278 return sdslen(o->ptr);
3279 } else {
3280 char buf[32];
3281
3282 return ll2string(buf,32,(long)o->ptr);
3283 }
3284 }
3285
3286 static int getDoubleFromObject(robj *o, double *target) {
3287 double value;
3288 char *eptr;
3289
3290 if (o == NULL) {
3291 value = 0;
3292 } else {
3293 redisAssert(o->type == REDIS_STRING);
3294 if (o->encoding == REDIS_ENCODING_RAW) {
3295 value = strtod(o->ptr, &eptr);
3296 if (eptr[0] != '\0') return REDIS_ERR;
3297 } else if (o->encoding == REDIS_ENCODING_INT) {
3298 value = (long)o->ptr;
3299 } else {
3300 redisPanic("Unknown string encoding");
3301 }
3302 }
3303
3304 *target = value;
3305 return REDIS_OK;
3306 }
3307
3308 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3309 double value;
3310 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3311 if (msg != NULL) {
3312 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3313 } else {
3314 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3315 }
3316 return REDIS_ERR;
3317 }
3318
3319 *target = value;
3320 return REDIS_OK;
3321 }
3322
3323 static int getLongLongFromObject(robj *o, long long *target) {
3324 long long value;
3325 char *eptr;
3326
3327 if (o == NULL) {
3328 value = 0;
3329 } else {
3330 redisAssert(o->type == REDIS_STRING);
3331 if (o->encoding == REDIS_ENCODING_RAW) {
3332 value = strtoll(o->ptr, &eptr, 10);
3333 if (eptr[0] != '\0') return REDIS_ERR;
3334 } else if (o->encoding == REDIS_ENCODING_INT) {
3335 value = (long)o->ptr;
3336 } else {
3337 redisPanic("Unknown string encoding");
3338 }
3339 }
3340
3341 *target = value;
3342 return REDIS_OK;
3343 }
3344
3345 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3346 long long value;
3347 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3348 if (msg != NULL) {
3349 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3350 } else {
3351 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3352 }
3353 return REDIS_ERR;
3354 }
3355
3356 *target = value;
3357 return REDIS_OK;
3358 }
3359
3360 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3361 long long value;
3362
3363 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3364 if (value < LONG_MIN || value > LONG_MAX) {
3365 if (msg != NULL) {
3366 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3367 } else {
3368 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3369 }
3370 return REDIS_ERR;
3371 }
3372
3373 *target = value;
3374 return REDIS_OK;
3375 }
3376
3377 /*============================ RDB saving/loading =========================== */
3378
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if nothing
 * was written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3383
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 if nothing was written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3389
3390 /* check rdbLoadLen() comments for more info */
3391 static int rdbSaveLen(FILE *fp, uint32_t len) {
3392 unsigned char buf[2];
3393
3394 if (len < (1<<6)) {
3395 /* Save a 6 bit len */
3396 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3397 if (fwrite(buf,1,1,fp) == 0) return -1;
3398 } else if (len < (1<<14)) {
3399 /* Save a 14 bit len */
3400 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3401 buf[1] = len&0xFF;
3402 if (fwrite(buf,2,1,fp) == 0) return -1;
3403 } else {
3404 /* Save a 32 bit len */
3405 buf[0] = (REDIS_RDB_32BITLEN<<6);
3406 if (fwrite(buf,1,1,fp) == 0) return -1;
3407 len = htonl(len);
3408 if (fwrite(&len,4,1,fp) == 0) return -1;
3409 }
3410 return 0;
3411 }
3412
3413 /* Encode 'value' as an integer if possible (if integer will fit the
3414 * supported range). If the function sucessful encoded the integer
3415 * then the (up to 5 bytes) encoded representation is written in the
3416 * string pointed by 'enc' and the length is returned. Otherwise
3417 * 0 is returned. */
3418 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3419 /* Finally check if it fits in our ranges */
3420 if (value >= -(1<<7) && value <= (1<<7)-1) {
3421 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3422 enc[1] = value&0xFF;
3423 return 2;
3424 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3425 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3426 enc[1] = value&0xFF;
3427 enc[2] = (value>>8)&0xFF;
3428 return 3;
3429 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3430 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3431 enc[1] = value&0xFF;
3432 enc[2] = (value>>8)&0xFF;
3433 enc[3] = (value>>16)&0xFF;
3434 enc[4] = (value>>24)&0xFF;
3435 return 5;
3436 } else {
3437 return 0;
3438 }
3439 }
3440
/* String objects in the form "2391" "-100", without any space and with a
 * value that fits in an 8, 16 or 32 bit signed integer, can be encoded as
 * integers to save space. The nul-terminated string 's', 'len' bytes long,
 * is checked and, when possible, its encoding is written to 'enc' and the
 * encoded length is returned. Otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char digits[32], *end;
    long long parsed;

    /* The whole string must parse as a number... */
    parsed = strtoll(s, &end, 10);
    if (*end != '\0') return 0;

    /* ...and the number converted back into a string must be identical to
     * the input, otherwise the string can't be encoded as integer. */
    ll2string(digits,32,parsed);
    if (strlen(digits) != len) return 0;
    if (memcmp(digits,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3459
3460 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3461 size_t comprlen, outlen;
3462 unsigned char byte;
3463 void *out;
3464
3465 /* We require at least four bytes compression for this to be worth it */
3466 if (len <= 4) return 0;
3467 outlen = len-4;
3468 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3469 comprlen = lzf_compress(s, len, out, outlen);
3470 if (comprlen == 0) {
3471 zfree(out);
3472 return 0;
3473 }
3474 /* Data compressed! Let's save it on disk */
3475 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3476 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3477 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3478 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3479 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3480 zfree(out);
3481 return comprlen;
3482
3483 writeerr:
3484 zfree(out);
3485 return -1;
3486 }
3487
3488 /* Save a string objet as [len][data] on disk. If the object is a string
3489 * representation of an integer value we try to safe it in a special form */
3490 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3491 int enclen;
3492
3493 /* Try integer encoding */
3494 if (len <= 11) {
3495 unsigned char buf[5];
3496 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3497 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3498 return 0;
3499 }
3500 }
3501
3502 /* Try LZF compression - under 20 bytes it's unable to compress even
3503 * aaaaaaaaaaaaaaaaaa so skip it */
3504 if (server.rdbcompression && len > 20) {
3505 int retval;
3506
3507 retval = rdbSaveLzfStringObject(fp,s,len);
3508 if (retval == -1) return -1;
3509 if (retval > 0) return 0;
3510 /* retval == 0 means data can't be compressed, save the old way */
3511 }
3512
3513 /* Store verbatim */
3514 if (rdbSaveLen(fp,len) == -1) return -1;
3515 if (len && fwrite(s,len,1,fp) == 0) return -1;
3516 return 0;
3517 }
3518
3519 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3520 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3521 int retval;
3522
3523 /* Avoid to decode the object, then encode it again, if the
3524 * object is alrady integer encoded. */
3525 if (obj->encoding == REDIS_ENCODING_INT) {
3526 long val = (long) obj->ptr;
3527 unsigned char buf[5];
3528 int enclen;
3529
3530 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3531 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3532 return 0;
3533 }
3534 /* otherwise... fall throught and continue with the usual
3535 * code path. */
3536 }
3537
3538 /* Avoid incr/decr ref count business when possible.
3539 * This plays well with copy-on-write given that we are probably
3540 * in a child process (BGSAVE). Also this makes sure key objects
3541 * of swapped objects are not incRefCount-ed (an assert does not allow
3542 * this in order to avoid bugs) */
3543 if (obj->encoding != REDIS_ENCODING_RAW) {
3544 obj = getDecodedObject(obj);
3545 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3546 decrRefCount(obj);
3547 } else {
3548 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3549 }
3550 return retval;
3551 }
3552
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function, which is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The output starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3595
3596 /* Save a Redis object. */
3597 static int rdbSaveObject(FILE *fp, robj *o) {
3598 if (o->type == REDIS_STRING) {
3599 /* Save a string value */
3600 if (rdbSaveStringObject(fp,o) == -1) return -1;
3601 } else if (o->type == REDIS_LIST) {
3602 /* Save a list value */
3603 list *list = o->ptr;
3604 listIter li;
3605 listNode *ln;
3606
3607 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3608 listRewind(list,&li);
3609 while((ln = listNext(&li))) {
3610 robj *eleobj = listNodeValue(ln);
3611
3612 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3613 }
3614 } else if (o->type == REDIS_SET) {
3615 /* Save a set value */
3616 dict *set = o->ptr;
3617 dictIterator *di = dictGetIterator(set);
3618 dictEntry *de;
3619
3620 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3621 while((de = dictNext(di)) != NULL) {
3622 robj *eleobj = dictGetEntryKey(de);
3623
3624 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3625 }
3626 dictReleaseIterator(di);
3627 } else if (o->type == REDIS_ZSET) {
3628 /* Save a set value */
3629 zset *zs = o->ptr;
3630 dictIterator *di = dictGetIterator(zs->dict);
3631 dictEntry *de;
3632
3633 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3634 while((de = dictNext(di)) != NULL) {
3635 robj *eleobj = dictGetEntryKey(de);
3636 double *score = dictGetEntryVal(de);
3637
3638 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3639 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3640 }
3641 dictReleaseIterator(di);
3642 } else if (o->type == REDIS_HASH) {
3643 /* Save a hash value */
3644 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3645 unsigned char *p = zipmapRewind(o->ptr);
3646 unsigned int count = zipmapLen(o->ptr);
3647 unsigned char *key, *val;
3648 unsigned int klen, vlen;
3649
3650 if (rdbSaveLen(fp,count) == -1) return -1;
3651 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3652 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3653 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3654 }
3655 } else {
3656 dictIterator *di = dictGetIterator(o->ptr);
3657 dictEntry *de;
3658
3659 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3660 while((de = dictNext(di)) != NULL) {
3661 robj *key = dictGetEntryKey(de);
3662 robj *val = dictGetEntryVal(de);
3663
3664 if (rdbSaveStringObject(fp,key) == -1) return -1;
3665 if (rdbSaveStringObject(fp,val) == -1) return -1;
3666 }
3667 dictReleaseIterator(di);
3668 }
3669 } else {
3670 redisPanic("Unknown object type");
3671 }
3672 return 0;
3673 }
3674
3675 /* Return the length the object will have on disk if saved with
3676 * the rdbSaveObject() function. Currently we use a trick to get
3677 * this length with very little changes to the code. In the future
3678 * we could switch to a faster solution. */
3679 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3680 if (fp == NULL) fp = server.devnull;
3681 rewind(fp);
3682 assert(rdbSaveObject(fp,o) != 1);
3683 return ftello(fp);
3684 }
3685
3686 /* Return the number of pages required to save this object in the swap file */
3687 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3688 off_t bytes = rdbSavedObjectLen(o,fp);
3689
3690 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3691 }
3692
3693 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3694 static int rdbSave(char *filename) {
3695 dictIterator *di = NULL;
3696 dictEntry *de;
3697 FILE *fp;
3698 char tmpfile[256];
3699 int j;
3700 time_t now = time(NULL);
3701
3702 /* Wait for I/O therads to terminate, just in case this is a
3703 * foreground-saving, to avoid seeking the swap file descriptor at the
3704 * same time. */
3705 if (server.vm_enabled)
3706 waitEmptyIOJobsQueue();
3707
3708 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3709 fp = fopen(tmpfile,"w");
3710 if (!fp) {
3711 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3712 return REDIS_ERR;
3713 }
3714 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3715 for (j = 0; j < server.dbnum; j++) {
3716 redisDb *db = server.db+j;
3717 dict *d = db->dict;
3718 if (dictSize(d) == 0) continue;
3719 di = dictGetIterator(d);
3720 if (!di) {
3721 fclose(fp);
3722 return REDIS_ERR;
3723 }
3724
3725 /* Write the SELECT DB opcode */
3726 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3727 if (rdbSaveLen(fp,j) == -1) goto werr;
3728
3729 /* Iterate this DB writing every entry */
3730 while((de = dictNext(di)) != NULL) {
3731 robj *key = dictGetEntryKey(de);
3732 robj *o = dictGetEntryVal(de);
3733 time_t expiretime = getExpire(db,key);
3734
3735 /* Save the expire time */
3736 if (expiretime != -1) {
3737 /* If this key is already expired skip it */
3738 if (expiretime < now) continue;
3739 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3740 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3741 }
3742 /* Save the key and associated value. This requires special
3743 * handling if the value is swapped out. */
3744 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3745 key->storage == REDIS_VM_SWAPPING) {
3746 /* Save type, key, value */
3747 if (rdbSaveType(fp,o->type) == -1) goto werr;
3748 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3749 if (rdbSaveObject(fp,o) == -1) goto werr;
3750 } else {
3751 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3752 robj *po;
3753 /* Get a preview of the object in memory */
3754 po = vmPreviewObject(key);
3755 /* Save type, key, value */
3756 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3757 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3758 if (rdbSaveObject(fp,po) == -1) goto werr;
3759 /* Remove the loaded object from memory */
3760 decrRefCount(po);
3761 }
3762 }
3763 dictReleaseIterator(di);
3764 }
3765 /* EOF opcode */
3766 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3767
3768 /* Make sure data will not remain on the OS's output buffers */
3769 fflush(fp);
3770 fsync(fileno(fp));
3771 fclose(fp);
3772
3773 /* Use RENAME to make sure the DB file is changed atomically only
3774 * if the generate DB file is ok. */
3775 if (rename(tmpfile,filename) == -1) {
3776 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3777 unlink(tmpfile);
3778 return REDIS_ERR;
3779 }
3780 redisLog(REDIS_NOTICE,"DB saved on disk");
3781 server.dirty = 0;
3782 server.lastsave = time(NULL);
3783 return REDIS_OK;
3784
3785 werr:
3786 fclose(fp);
3787 unlink(tmpfile);
3788 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3789 if (di) dictReleaseIterator(di);
3790 return REDIS_ERR;
3791 }
3792
3793 static int rdbSaveBackground(char *filename) {
3794 pid_t childpid;
3795
3796 if (server.bgsavechildpid != -1) return REDIS_ERR;
3797 if (server.vm_enabled) waitEmptyIOJobsQueue();
3798 if ((childpid = fork()) == 0) {
3799 /* Child */
3800 if (server.vm_enabled) vmReopenSwapFile();
3801 close(server.fd);
3802 if (rdbSave(filename) == REDIS_OK) {
3803 _exit(0);
3804 } else {
3805 _exit(1);
3806 }
3807 } else {
3808 /* Parent */
3809 if (childpid == -1) {
3810 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3811 strerror(errno));
3812 return REDIS_ERR;
3813 }
3814 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3815 server.bgsavechildpid = childpid;
3816 updateDictResizePolicy();
3817 return REDIS_OK;
3818 }
3819 return REDIS_OK; /* unreached */
3820 }
3821
/* Remove the temporary RDB file "temp-<pid>.rdb" of the process with the
 * given pid. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3828
/* Read a one byte type from 'fp'. Returns the byte as a non negative
 * integer, or -1 if it was not possible to read it. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3834
/* Read a time stored as a 32 bit signed integer in the byte order of the
 * host. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3840
3841 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3842 * of this file for a description of how this are stored on disk.
3843 *
3844 * isencoded is set to 1 if the readed length is not actually a length but
3845 * an "encoding type", check the above comments for more info */
3846 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3847 unsigned char buf[2];
3848 uint32_t len;
3849 int type;
3850
3851 if (isencoded) *isencoded = 0;
3852 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3853 type = (buf[0]&0xC0)>>6;
3854 if (type == REDIS_RDB_6BITLEN) {
3855 /* Read a 6 bit len */
3856 return buf[0]&0x3F;
3857 } else if (type == REDIS_RDB_ENCVAL) {
3858 /* Read a 6 bit len encoding type */
3859 if (isencoded) *isencoded = 1;
3860 return buf[0]&0x3F;
3861 } else if (type == REDIS_RDB_14BITLEN) {
3862 /* Read a 14 bit len */
3863 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3864 return ((buf[0]&0x3F)<<8)|buf[1];
3865 } else {
3866 /* Read a 32 bit len */
3867 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3868 return ntohl(len);
3869 }
3870 }
3871
3872 /* Load an integer-encoded object from file 'fp', with the specified
3873 * encoding type 'enctype'. If encode is true the function may return
3874 * an integer-encoded object as reply, otherwise the returned object
3875 * will always be encoded as a raw string. */
3876 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3877 unsigned char enc[4];
3878 long long val;
3879
3880 if (enctype == REDIS_RDB_ENC_INT8) {
3881 if (fread(enc,1,1,fp) == 0) return NULL;
3882 val = (signed char)enc[0];
3883 } else if (enctype == REDIS_RDB_ENC_INT16) {
3884 uint16_t v;
3885 if (fread(enc,2,1,fp) == 0) return NULL;
3886 v = enc[0]|(enc[1]<<8);
3887 val = (int16_t)v;
3888 } else if (enctype == REDIS_RDB_ENC_INT32) {
3889 uint32_t v;
3890 if (fread(enc,4,1,fp) == 0) return NULL;
3891 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3892 val = (int32_t)v;
3893 } else {
3894 val = 0; /* anti-warning */
3895 redisPanic("Unknown RDB integer encoding type");
3896 }
3897 if (encode)
3898 return createStringObjectFromLongLong(val);
3899 else
3900 return createObject(REDIS_STRING,sdsfromlonglong(val));
3901 }
3902
3903 static robj *rdbLoadLzfStringObject(FILE*fp) {
3904 unsigned int len, clen;
3905 unsigned char *c = NULL;
3906 sds val = NULL;
3907
3908 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3909 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3910 if ((c = zmalloc(clen)) == NULL) goto err;
3911 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3912 if (fread(c,clen,1,fp) == 0) goto err;
3913 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3914 zfree(c);
3915 return createObject(REDIS_STRING,val);
3916 err:
3917 zfree(c);
3918 sdsfree(val);
3919 return NULL;
3920 }
3921
3922 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3923 int isencoded;
3924 uint32_t len;
3925 sds val;
3926
3927 len = rdbLoadLen(fp,&isencoded);
3928 if (isencoded) {
3929 switch(len) {
3930 case REDIS_RDB_ENC_INT8:
3931 case REDIS_RDB_ENC_INT16:
3932 case REDIS_RDB_ENC_INT32:
3933 return rdbLoadIntegerObject(fp,len,encode);
3934 case REDIS_RDB_ENC_LZF:
3935 return rdbLoadLzfStringObject(fp);
3936 default:
3937 redisPanic("Unknown RDB encoding type");
3938 }
3939 }
3940
3941 if (len == REDIS_RDB_LENERR) return NULL;
3942 val = sdsnewlen(NULL,len);
3943 if (len && fread(val,len,1,fp) == 0) {
3944 sdsfree(val);
3945 return NULL;
3946 }
3947 return createObject(REDIS_STRING,val);
3948 }
3949
3950 static robj *rdbLoadStringObject(FILE *fp) {
3951 return rdbGenericLoadStringObject(fp,0);
3952 }
3953
3954 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3955 return rdbGenericLoadStringObject(fp,1);
3956 }
3957
3958 /* For information about double serialization check rdbSaveDoubleValue() */
3959 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3960 char buf[128];
3961 unsigned char len;
3962
3963 if (fread(&len,1,1,fp) == 0) return -1;
3964 switch(len) {
3965 case 255: *val = R_NegInf; return 0;
3966 case 254: *val = R_PosInf; return 0;
3967 case 253: *val = R_Nan; return 0;
3968 default:
3969 if (fread(buf,len,1,fp) == 0) return -1;
3970 buf[len] = '\0';
3971 sscanf(buf, "%lg", val);
3972 return 0;
3973 }
3974 }
3975
3976 /* Load a Redis object of the specified type from the specified file.
3977 * On success a newly allocated object is returned, otherwise NULL. */
3978 static robj *rdbLoadObject(int type, FILE *fp) {
3979 robj *o;
3980
3981 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3982 if (type == REDIS_STRING) {
3983 /* Read string value */
3984 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3985 o = tryObjectEncoding(o);
3986 } else if (type == REDIS_LIST || type == REDIS_SET) {
3987 /* Read list/set value */
3988 uint32_t listlen;
3989
3990 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3991 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3992 /* It's faster to expand the dict to the right size asap in order
3993 * to avoid rehashing */
3994 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3995 dictExpand(o->ptr,listlen);
3996 /* Load every single element of the list/set */
3997 while(listlen--) {
3998 robj *ele;
3999
4000 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4001 ele = tryObjectEncoding(ele);
4002 if (type == REDIS_LIST) {
4003 listAddNodeTail((list*)o->ptr,ele);
4004 } else {
4005 dictAdd((dict*)o->ptr,ele,NULL);
4006 }
4007 }
4008 } else if (type == REDIS_ZSET) {
4009 /* Read list/set value */
4010 size_t zsetlen;
4011 zset *zs;
4012
4013 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4014 o = createZsetObject();
4015 zs = o->ptr;
4016 /* Load every single element of the list/set */
4017 while(zsetlen--) {
4018 robj *ele;
4019 double *score = zmalloc(sizeof(double));
4020
4021 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4022 ele = tryObjectEncoding(ele);
4023 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4024 dictAdd(zs->dict,ele,score);
4025 zslInsert(zs->zsl,*score,ele);
4026 incrRefCount(ele); /* added to skiplist */
4027 }
4028 } else if (type == REDIS_HASH) {
4029 size_t hashlen;
4030
4031 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4032 o = createHashObject();
4033 /* Too many entries? Use an hash table. */
4034 if (hashlen > server.hash_max_zipmap_entries)
4035 convertToRealHash(o);
4036 /* Load every key/value, then set it into the zipmap or hash
4037 * table, as needed. */
4038 while(hashlen--) {
4039 robj *key, *val;
4040
4041 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4042 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4043 /* If we are using a zipmap and there are too big values
4044 * the object is converted to real hash table encoding. */
4045 if (o->encoding != REDIS_ENCODING_HT &&
4046 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4047 sdslen(val->ptr) > server.hash_max_zipmap_value))
4048 {
4049 convertToRealHash(o);
4050 }
4051
4052 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4053 unsigned char *zm = o->ptr;
4054
4055 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4056 val->ptr,sdslen(val->ptr),NULL);
4057 o->ptr = zm;
4058 decrRefCount(key);
4059 decrRefCount(val);
4060 } else {
4061 key = tryObjectEncoding(key);
4062 val = tryObjectEncoding(val);
4063 dictAdd((dict*)o->ptr,key,val);
4064 }
4065 }
4066 } else {
4067 redisPanic("Unknown object type");
4068 }
4069 return o;
4070 }
4071
4072 static int rdbLoad(char *filename) {
4073 FILE *fp;
4074 uint32_t dbid;
4075 int type, retval, rdbver;
4076 int swap_all_values = 0;
4077 dict *d = server.db[0].dict;
4078 redisDb *db = server.db+0;
4079 char buf[1024];
4080 time_t expiretime, now = time(NULL);
4081 long long loadedkeys = 0;
4082
4083 fp = fopen(filename,"r");
4084 if (!fp) return REDIS_ERR;
4085 if (fread(buf,9,1,fp) == 0) goto eoferr;
4086 buf[9] = '\0';
4087 if (memcmp(buf,"REDIS",5) != 0) {
4088 fclose(fp);
4089 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4090 return REDIS_ERR;
4091 }
4092 rdbver = atoi(buf+5);
4093 if (rdbver != 1) {
4094 fclose(fp);
4095 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4096 return REDIS_ERR;
4097 }
4098 while(1) {
4099 robj *key, *val;
4100
4101 expiretime = -1;
4102 /* Read type. */
4103 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4104 if (type == REDIS_EXPIRETIME) {
4105 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4106 /* We read the time so we need to read the object type again */
4107 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4108 }
4109 if (type == REDIS_EOF) break;
4110 /* Handle SELECT DB opcode as a special case */
4111 if (type == REDIS_SELECTDB) {
4112 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4113 goto eoferr;
4114 if (dbid >= (unsigned)server.dbnum) {
4115 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4116 exit(1);
4117 }
4118 db = server.db+dbid;
4119 d = db->dict;
4120 continue;
4121 }
4122 /* Read key */
4123 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4124 /* Read value */
4125 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4126 /* Check if the key already expired */
4127 if (expiretime != -1 && expiretime < now) {
4128 decrRefCount(key);
4129 decrRefCount(val);
4130 continue;
4131 }
4132 /* Add the new object in the hash table */
4133 retval = dictAdd(d,key,val);
4134 if (retval == DICT_ERR) {
4135 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4136 exit(1);
4137 }
4138 loadedkeys++;
4139 /* Set the expire time if needed */
4140 if (expiretime != -1) setExpire(db,key,expiretime);
4141
4142 /* Handle swapping while loading big datasets when VM is on */
4143
4144 /* If we detecter we are hopeless about fitting something in memory
4145 * we just swap every new key on disk. Directly...
4146 * Note that's important to check for this condition before resorting
4147 * to random sampling, otherwise we may try to swap already
4148 * swapped keys. */
4149 if (swap_all_values) {
4150 dictEntry *de = dictFind(d,key);
4151
4152 /* de may be NULL since the key already expired */
4153 if (de) {
4154 key = dictGetEntryKey(de);
4155 val = dictGetEntryVal(de);
4156
4157 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4158 dictGetEntryVal(de) = NULL;
4159 }
4160 }
4161 continue;
4162 }
4163
4164 /* If we have still some hope of having some value fitting memory
4165 * then we try random sampling. */
4166 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4167 while (zmalloc_used_memory() > server.vm_max_memory) {
4168 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4169 }
4170 if (zmalloc_used_memory() > server.vm_max_memory)
4171 swap_all_values = 1; /* We are already using too much mem */
4172 }
4173 }
4174 fclose(fp);
4175 return REDIS_OK;
4176
4177 eoferr: /* unexpected end of file is handled here with a fatal exit */
4178 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4179 exit(1);
4180 return REDIS_ERR; /* Just to avoid warning */
4181 }
4182
4183 /*================================== Shutdown =============================== */
4184 static int prepareForShutdown() {
4185 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4186 /* Kill the saving child if there is a background saving in progress.
4187 We want to avoid race conditions, for instance our saving child may
4188 overwrite the synchronous saving did by SHUTDOWN. */
4189 if (server.bgsavechildpid != -1) {
4190 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4191 kill(server.bgsavechildpid,SIGKILL);
4192 rdbRemoveTempFile(server.bgsavechildpid);
4193 }
4194 if (server.appendonly) {
4195 /* Append only file: fsync() the AOF and exit */
4196 fsync(server.appendfd);
4197 if (server.vm_enabled) unlink(server.vm_swap_file);
4198 } else {
4199 /* Snapshotting. Perform a SYNC SAVE and exit */
4200 if (rdbSave(server.dbfilename) == REDIS_OK) {
4201 if (server.daemonize)
4202 unlink(server.pidfile);
4203 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4204 } else {
4205 /* Ooops.. error saving! The best we can do is to continue
4206 * operating. Note that if there was a background saving process,
4207 * in the next cron() Redis will be notified that the background
4208 * saving aborted, handling special stuff like slaves pending for
4209 * synchronization... */
4210 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4211 return REDIS_ERR;
4212 }
4213 }
4214 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4215 return REDIS_OK;
4216 }
4217
4218 /*================================== Commands =============================== */
4219
4220 static void authCommand(redisClient *c) {
4221 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4222 c->authenticated = 1;
4223 addReply(c,shared.ok);
4224 } else {
4225 c->authenticated = 0;
4226 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4227 }
4228 }
4229
4230 static void pingCommand(redisClient *c) {
4231 addReply(c,shared.pong);
4232 }
4233
4234 static void echoCommand(redisClient *c) {
4235 addReplyBulk(c,c->argv[1]);
4236 }
4237
4238 /*=================================== Strings =============================== */
4239
4240 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4241 int retval;
4242 long seconds = 0; /* initialized to avoid an harmness warning */
4243
4244 if (expire) {
4245 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4246 return;
4247 if (seconds <= 0) {
4248 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4249 return;
4250 }
4251 }
4252
4253 touchWatchedKey(c->db,key);
4254 if (nx) deleteIfVolatile(c->db,key);
4255 retval = dictAdd(c->db->dict,key,val);
4256 if (retval == DICT_ERR) {
4257 if (!nx) {
4258 /* If the key is about a swapped value, we want a new key object
4259 * to overwrite the old. So we delete the old key in the database.
4260 * This will also make sure that swap pages about the old object
4261 * will be marked as free. */
4262 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4263 incrRefCount(key);
4264 dictReplace(c->db->dict,key,val);
4265 incrRefCount(val);
4266 } else {
4267 addReply(c,shared.czero);
4268 return;
4269 }
4270 } else {
4271 incrRefCount(key);
4272 incrRefCount(val);
4273 }
4274 server.dirty++;
4275 removeExpire(c->db,key);
4276 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4277 addReply(c, nx ? shared.cone : shared.ok);
4278 }
4279
4280 static void setCommand(redisClient *c) {
4281 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4282 }
4283
4284 static void setnxCommand(redisClient *c) {
4285 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4286 }
4287
4288 static void setexCommand(redisClient *c) {
4289 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4290 }
4291
4292 static int getGenericCommand(redisClient *c) {
4293 robj *o;
4294
4295 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4296 return REDIS_OK;
4297
4298 if (o->type != REDIS_STRING) {
4299 addReply(c,shared.wrongtypeerr);
4300 return REDIS_ERR;
4301 } else {
4302 addReplyBulk(c,o);
4303 return REDIS_OK;
4304 }
4305 }
4306
4307 static void getCommand(redisClient *c) {
4308 getGenericCommand(c);
4309 }
4310
4311 static void getsetCommand(redisClient *c) {
4312 if (getGenericCommand(c) == REDIS_ERR) return;
4313 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4314 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4315 } else {
4316 incrRefCount(c->argv[1]);
4317 }
4318 incrRefCount(c->argv[2]);
4319 server.dirty++;
4320 removeExpire(c->db,c->argv[1]);
4321 }
4322
4323 static void mgetCommand(redisClient *c) {
4324 int j;
4325
4326 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4327 for (j = 1; j < c->argc; j++) {
4328 robj *o = lookupKeyRead(c->db,c->argv[j]);
4329 if (o == NULL) {
4330 addReply(c,shared.nullbulk);
4331 } else {
4332 if (o->type != REDIS_STRING) {
4333 addReply(c,shared.nullbulk);
4334 } else {
4335 addReplyBulk(c,o);
4336 }
4337 }
4338 }
4339 }
4340
4341 static void msetGenericCommand(redisClient *c, int nx) {
4342 int j, busykeys = 0;
4343
4344 if ((c->argc % 2) == 0) {
4345 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4346 return;
4347 }
4348 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4349 * set nothing at all if at least one already key exists. */
4350 if (nx) {
4351 for (j = 1; j < c->argc; j += 2) {
4352 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4353 busykeys++;
4354 }
4355 }
4356 }
4357 if (busykeys) {
4358 addReply(c, shared.czero);
4359 return;
4360 }
4361
4362 for (j = 1; j < c->argc; j += 2) {
4363 int retval;
4364
4365 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4366 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4367 if (retval == DICT_ERR) {
4368 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4369 incrRefCount(c->argv[j+1]);
4370 } else {
4371 incrRefCount(c->argv[j]);
4372 incrRefCount(c->argv[j+1]);
4373 }
4374 removeExpire(c->db,c->argv[j]);
4375 }
4376 server.dirty += (c->argc-1)/2;
4377 addReply(c, nx ? shared.cone : shared.ok);
4378 }
4379
4380 static void msetCommand(redisClient *c) {
4381 msetGenericCommand(c,0);
4382 }
4383
4384 static void msetnxCommand(redisClient *c) {
4385 msetGenericCommand(c,1);
4386 }
4387
4388 static void incrDecrCommand(redisClient *c, long long incr) {
4389 long long value;
4390 int retval;
4391 robj *o;
4392
4393 o = lookupKeyWrite(c->db,c->argv[1]);
4394 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4395 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4396
4397 value += incr;
4398 o = createStringObjectFromLongLong(value);
4399 retval = dictAdd(c->db->dict,c->argv[1],o);
4400 if (retval == DICT_ERR) {
4401 dictReplace(c->db->dict,c->argv[1],o);
4402 removeExpire(c->db,c->argv[1]);
4403 } else {
4404 incrRefCount(c->argv[1]);
4405 }
4406 server.dirty++;
4407 addReply(c,shared.colon);
4408 addReply(c,o);
4409 addReply(c,shared.crlf);
4410 }
4411
4412 static void incrCommand(redisClient *c) {
4413 incrDecrCommand(c,1);
4414 }
4415
4416 static void decrCommand(redisClient *c) {
4417 incrDecrCommand(c,-1);
4418 }
4419
4420 static void incrbyCommand(redisClient *c) {
4421 long long incr;
4422
4423 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4424 incrDecrCommand(c,incr);
4425 }
4426
4427 static void decrbyCommand(redisClient *c) {
4428 long long incr;
4429
4430 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4431 incrDecrCommand(c,-incr);
4432 }
4433
4434 static void appendCommand(redisClient *c) {
4435 int retval;
4436 size_t totlen;
4437 robj *o;
4438
4439 o = lookupKeyWrite(c->db,c->argv[1]);
4440 if (o == NULL) {
4441 /* Create the key */
4442 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4443 incrRefCount(c->argv[1]);
4444 incrRefCount(c->argv[2]);
4445 totlen = stringObjectLen(c->argv[2]);
4446 } else {
4447 dictEntry *de;
4448
4449 de = dictFind(c->db->dict,c->argv[1]);
4450 assert(de != NULL);
4451
4452 o = dictGetEntryVal(de);
4453 if (o->type != REDIS_STRING) {
4454 addReply(c,shared.wrongtypeerr);
4455 return;
4456 }
4457 /* If the object is specially encoded or shared we have to make
4458 * a copy */
4459 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4460 robj *decoded = getDecodedObject(o);
4461
4462 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4463 decrRefCount(decoded);
4464 dictReplace(c->db->dict,c->argv[1],o);
4465 }
4466 /* APPEND! */
4467 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4468 o->ptr = sdscatlen(o->ptr,
4469 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4470 } else {
4471 o->ptr = sdscatprintf(o->ptr, "%ld",
4472 (unsigned long) c->argv[2]->ptr);
4473 }
4474 totlen = sdslen(o->ptr);
4475 }
4476 server.dirty++;
4477 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4478 }
4479
4480 static void substrCommand(redisClient *c) {
4481 robj *o;
4482 long start = atoi(c->argv[2]->ptr);
4483 long end = atoi(c->argv[3]->ptr);
4484 size_t rangelen, strlen;
4485 sds range;
4486
4487 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4488 checkType(c,o,REDIS_STRING)) return;
4489
4490 o = getDecodedObject(o);
4491 strlen = sdslen(o->ptr);
4492
4493 /* convert negative indexes */
4494 if (start < 0) start = strlen+start;
4495 if (end < 0) end = strlen+end;
4496 if (start < 0) start = 0;
4497 if (end < 0) end = 0;
4498
4499 /* indexes sanity checks */
4500 if (start > end || (size_t)start >= strlen) {
4501 /* Out of range start or start > end result in null reply */
4502 addReply(c,shared.nullbulk);
4503 decrRefCount(o);
4504 return;
4505 }
4506 if ((size_t)end >= strlen) end = strlen-1;
4507 rangelen = (end-start)+1;
4508
4509 /* Return the result */
4510 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4511 range = sdsnewlen((char*)o->ptr+start,rangelen);
4512 addReplySds(c,range);
4513 addReply(c,shared.crlf);
4514 decrRefCount(o);
4515 }
4516
4517 /* ========================= Type agnostic commands ========================= */
4518
4519 static void delCommand(redisClient *c) {
4520 int deleted = 0, j;
4521
4522 for (j = 1; j < c->argc; j++) {
4523 if (deleteKey(c->db,c->argv[j])) {
4524 touchWatchedKey(c->db,c->argv[j]);
4525 server.dirty++;
4526 deleted++;
4527 }
4528 }
4529 addReplyLongLong(c,deleted);
4530 }
4531
4532 static void existsCommand(redisClient *c) {
4533 expireIfNeeded(c->db,c->argv[1]);
4534 if (dictFind(c->db->dict,c->argv[1])) {
4535 addReply(c, shared.cone);
4536 } else {
4537 addReply(c, shared.czero);
4538 }
4539 }
4540
4541 static void selectCommand(redisClient *c) {
4542 int id = atoi(c->argv[1]->ptr);
4543
4544 if (selectDb(c,id) == REDIS_ERR) {
4545 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4546 } else {
4547 addReply(c,shared.ok);
4548 }
4549 }
4550
4551 static void randomkeyCommand(redisClient *c) {
4552 dictEntry *de;
4553 robj *key;
4554
4555 while(1) {
4556 de = dictGetRandomKey(c->db->dict);
4557 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4558 }
4559
4560 if (de == NULL) {
4561 addReply(c,shared.nullbulk);
4562 return;
4563 }
4564
4565 key = dictGetEntryKey(de);
4566 if (server.vm_enabled) {
4567 key = dupStringObject(key);
4568 addReplyBulk(c,key);
4569 decrRefCount(key);
4570 } else {
4571 addReplyBulk(c,key);
4572 }
4573 }
4574
4575 static void keysCommand(redisClient *c) {
4576 dictIterator *di;
4577 dictEntry *de;
4578 sds pattern = c->argv[1]->ptr;
4579 int plen = sdslen(pattern);
4580 unsigned long numkeys = 0;
4581 robj *lenobj = createObject(REDIS_STRING,NULL);
4582
4583 di = dictGetIterator(c->db->dict);
4584 addReply(c,lenobj);
4585 decrRefCount(lenobj);
4586 while((de = dictNext(di)) != NULL) {
4587 robj *keyobj = dictGetEntryKey(de);
4588
4589 sds key = keyobj->ptr;
4590 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4591 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4592 if (expireIfNeeded(c->db,keyobj) == 0) {
4593 addReplyBulk(c,keyobj);
4594 numkeys++;
4595 }
4596 }
4597 }
4598 dictReleaseIterator(di);
4599 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4600 }
4601
4602 static void dbsizeCommand(redisClient *c) {
4603 addReplySds(c,
4604 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4605 }
4606
4607 static void lastsaveCommand(redisClient *c) {
4608 addReplySds(c,
4609 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4610 }
4611
4612 static void typeCommand(redisClient *c) {
4613 robj *o;
4614 char *type;
4615
4616 o = lookupKeyRead(c->db,c->argv[1]);
4617 if (o == NULL) {
4618 type = "+none";
4619 } else {
4620 switch(o->type) {
4621 case REDIS_STRING: type = "+string"; break;
4622 case REDIS_LIST: type = "+list"; break;
4623 case REDIS_SET: type = "+set"; break;
4624 case REDIS_ZSET: type = "+zset"; break;
4625 case REDIS_HASH: type = "+hash"; break;
4626 default: type = "+unknown"; break;
4627 }
4628 }
4629 addReplySds(c,sdsnew(type));
4630 addReply(c,shared.crlf);
4631 }
4632
4633 static void saveCommand(redisClient *c) {
4634 if (server.bgsavechildpid != -1) {
4635 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4636 return;
4637 }
4638 if (rdbSave(server.dbfilename) == REDIS_OK) {
4639 addReply(c,shared.ok);
4640 } else {
4641 addReply(c,shared.err);
4642 }
4643 }
4644
4645 static void bgsaveCommand(redisClient *c) {
4646 if (server.bgsavechildpid != -1) {
4647 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4648 return;
4649 }
4650 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4651 char *status = "+Background saving started\r\n";
4652 addReplySds(c,sdsnew(status));
4653 } else {
4654 addReply(c,shared.err);
4655 }
4656 }
4657
4658 static void shutdownCommand(redisClient *c) {
4659 if (prepareForShutdown() == REDIS_OK)
4660 exit(0);
4661 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4662 }
4663
4664 static void renameGenericCommand(redisClient *c, int nx) {
4665 robj *o;
4666
4667 /* To use the same key as src and dst is probably an error */
4668 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4669 addReply(c,shared.sameobjecterr);
4670 return;
4671 }
4672
4673 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4674 return;
4675
4676 incrRefCount(o);
4677 deleteIfVolatile(c->db,c->argv[2]);
4678 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4679 if (nx) {
4680 decrRefCount(o);
4681 addReply(c,shared.czero);
4682 return;
4683 }
4684 dictReplace(c->db->dict,c->argv[2],o);
4685 } else {
4686 incrRefCount(c->argv[2]);
4687 }
4688 deleteKey(c->db,c->argv[1]);
4689 touchWatchedKey(c->db,c->argv[2]);
4690 server.dirty++;
4691 addReply(c,nx ? shared.cone : shared.ok);
4692 }
4693
4694 static void renameCommand(redisClient *c) {
4695 renameGenericCommand(c,0);
4696 }
4697
4698 static void renamenxCommand(redisClient *c) {
4699 renameGenericCommand(c,1);
4700 }
4701
4702 static void moveCommand(redisClient *c) {
4703 robj *o;
4704 redisDb *src, *dst;
4705 int srcid;
4706
4707 /* Obtain source and target DB pointers */
4708 src = c->db;
4709 srcid = c->db->id;
4710 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4711 addReply(c,shared.outofrangeerr);
4712 return;
4713 }
4714 dst = c->db;
4715 selectDb(c,srcid); /* Back to the source DB */
4716
4717 /* If the user is moving using as target the same
4718 * DB as the source DB it is probably an error. */
4719 if (src == dst) {
4720 addReply(c,shared.sameobjecterr);
4721 return;
4722 }
4723
4724 /* Check if the element exists and get a reference */
4725 o = lookupKeyWrite(c->db,c->argv[1]);
4726 if (!o) {
4727 addReply(c,shared.czero);
4728 return;
4729 }
4730
4731 /* Try to add the element to the target DB */
4732 deleteIfVolatile(dst,c->argv[1]);
4733 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4734 addReply(c,shared.czero);
4735 return;
4736 }
4737 incrRefCount(c->argv[1]);
4738 incrRefCount(o);
4739
4740 /* OK! key moved, free the entry in the source DB */
4741 deleteKey(src,c->argv[1]);
4742 server.dirty++;
4743 addReply(c,shared.cone);
4744 }
4745
4746 /* =================================== Lists ================================ */
4747 static void pushGenericCommand(redisClient *c, int where) {
4748 robj *lobj;
4749 list *list;
4750
4751 lobj = lookupKeyWrite(c->db,c->argv[1]);
4752 if (lobj == NULL) {
4753 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4754 addReply(c,shared.cone);
4755 return;
4756 }
4757 lobj = createListObject();
4758 list = lobj->ptr;
4759 if (where == REDIS_HEAD) {
4760 listAddNodeHead(list,c->argv[2]);
4761 } else {
4762 listAddNodeTail(list,c->argv[2]);
4763 }
4764 dictAdd(c->db->dict,c->argv[1],lobj);
4765 incrRefCount(c->argv[1]);
4766 incrRefCount(c->argv[2]);
4767 } else {
4768 if (lobj->type != REDIS_LIST) {
4769 addReply(c,shared.wrongtypeerr);
4770 return;
4771 }
4772 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4773 addReply(c,shared.cone);
4774 return;
4775 }
4776 list = lobj->ptr;
4777 if (where == REDIS_HEAD) {
4778 listAddNodeHead(list,c->argv[2]);
4779 } else {
4780 listAddNodeTail(list,c->argv[2]);
4781 }
4782 incrRefCount(c->argv[2]);
4783 }
4784 server.dirty++;
4785 addReplyLongLong(c,listLength(list));
4786 }
4787
4788 static void lpushCommand(redisClient *c) {
4789 pushGenericCommand(c,REDIS_HEAD);
4790 }
4791
4792 static void rpushCommand(redisClient *c) {
4793 pushGenericCommand(c,REDIS_TAIL);
4794 }
4795
4796 static void llenCommand(redisClient *c) {
4797 robj *o;
4798 list *l;
4799
4800 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4801 checkType(c,o,REDIS_LIST)) return;
4802
4803 l = o->ptr;
4804 addReplyUlong(c,listLength(l));
4805 }
4806
4807 static void lindexCommand(redisClient *c) {
4808 robj *o;
4809 int index = atoi(c->argv[2]->ptr);
4810 list *list;
4811 listNode *ln;
4812
4813 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4814 checkType(c,o,REDIS_LIST)) return;
4815 list = o->ptr;
4816
4817 ln = listIndex(list, index);
4818 if (ln == NULL) {
4819 addReply(c,shared.nullbulk);
4820 } else {
4821 robj *ele = listNodeValue(ln);
4822 addReplyBulk(c,ele);
4823 }
4824 }
4825
4826 static void lsetCommand(redisClient *c) {
4827 robj *o;
4828 int index = atoi(c->argv[2]->ptr);
4829 list *list;
4830 listNode *ln;
4831
4832 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4833 checkType(c,o,REDIS_LIST)) return;
4834 list = o->ptr;
4835
4836 ln = listIndex(list, index);
4837 if (ln == NULL) {
4838 addReply(c,shared.outofrangeerr);
4839 } else {
4840 robj *ele = listNodeValue(ln);
4841
4842 decrRefCount(ele);
4843 listNodeValue(ln) = c->argv[3];
4844 incrRefCount(c->argv[3]);
4845 addReply(c,shared.ok);
4846 server.dirty++;
4847 }
4848 }
4849
4850 static void popGenericCommand(redisClient *c, int where) {
4851 robj *o;
4852 list *list;
4853 listNode *ln;
4854
4855 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4856 checkType(c,o,REDIS_LIST)) return;
4857 list = o->ptr;
4858
4859 if (where == REDIS_HEAD)
4860 ln = listFirst(list);
4861 else
4862 ln = listLast(list);
4863
4864 if (ln == NULL) {
4865 addReply(c,shared.nullbulk);
4866 } else {
4867 robj *ele = listNodeValue(ln);
4868 addReplyBulk(c,ele);
4869 listDelNode(list,ln);
4870 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4871 server.dirty++;
4872 }
4873 }
4874
4875 static void lpopCommand(redisClient *c) {
4876 popGenericCommand(c,REDIS_HEAD);
4877 }
4878
4879 static void rpopCommand(redisClient *c) {
4880 popGenericCommand(c,REDIS_TAIL);
4881 }
4882
4883 static void lrangeCommand(redisClient *c) {
4884 robj *o;
4885 int start = atoi(c->argv[2]->ptr);
4886 int end = atoi(c->argv[3]->ptr);
4887 int llen;
4888 int rangelen, j;
4889 list *list;
4890 listNode *ln;
4891 robj *ele;
4892
4893 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4894 || checkType(c,o,REDIS_LIST)) return;
4895 list = o->ptr;
4896 llen = listLength(list);
4897
4898 /* convert negative indexes */
4899 if (start < 0) start = llen+start;
4900 if (end < 0) end = llen+end;
4901 if (start < 0) start = 0;
4902 if (end < 0) end = 0;
4903
4904 /* indexes sanity checks */
4905 if (start > end || start >= llen) {
4906 /* Out of range start or start > end result in empty list */
4907 addReply(c,shared.emptymultibulk);
4908 return;
4909 }
4910 if (end >= llen) end = llen-1;
4911 rangelen = (end-start)+1;
4912
4913 /* Return the result in form of a multi-bulk reply */
4914 ln = listIndex(list, start);
4915 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4916 for (j = 0; j < rangelen; j++) {
4917 ele = listNodeValue(ln);
4918 addReplyBulk(c,ele);
4919 ln = ln->next;
4920 }
4921 }
4922
4923 static void ltrimCommand(redisClient *c) {
4924 robj *o;
4925 int start = atoi(c->argv[2]->ptr);
4926 int end = atoi(c->argv[3]->ptr);
4927 int llen;
4928 int j, ltrim, rtrim;
4929 list *list;
4930 listNode *ln;
4931
4932 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4933 checkType(c,o,REDIS_LIST)) return;
4934 list = o->ptr;
4935 llen = listLength(list);
4936
4937 /* convert negative indexes */
4938 if (start < 0) start = llen+start;
4939 if (end < 0) end = llen+end;
4940 if (start < 0) start = 0;
4941 if (end < 0) end = 0;
4942
4943 /* indexes sanity checks */
4944 if (start > end || start >= llen) {
4945 /* Out of range start or start > end result in empty list */
4946 ltrim = llen;
4947 rtrim = 0;
4948 } else {
4949 if (end >= llen) end = llen-1;
4950 ltrim = start;
4951 rtrim = llen-end-1;
4952 }
4953
4954 /* Remove list elements to perform the trim */
4955 for (j = 0; j < ltrim; j++) {
4956 ln = listFirst(list);
4957 listDelNode(list,ln);
4958 }
4959 for (j = 0; j < rtrim; j++) {
4960 ln = listLast(list);
4961 listDelNode(list,ln);
4962 }
4963 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4964 server.dirty++;
4965 addReply(c,shared.ok);
4966 }
4967
4968 static void lremCommand(redisClient *c) {
4969 robj *o;
4970 list *list;
4971 listNode *ln, *next;
4972 int toremove = atoi(c->argv[2]->ptr);
4973 int removed = 0;
4974 int fromtail = 0;
4975
4976 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4977 checkType(c,o,REDIS_LIST)) return;
4978 list = o->ptr;
4979
4980 if (toremove < 0) {
4981 toremove = -toremove;
4982 fromtail = 1;
4983 }
4984 ln = fromtail ? list->tail : list->head;
4985 while (ln) {
4986 robj *ele = listNodeValue(ln);
4987
4988 next = fromtail ? ln->prev : ln->next;
4989 if (equalStringObjects(ele,c->argv[3])) {
4990 listDelNode(list,ln);
4991 server.dirty++;
4992 removed++;
4993 if (toremove && removed == toremove) break;
4994 }
4995 ln = next;
4996 }
4997 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4998 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4999 }
5000
5001 /* This is the semantic of this command:
5002 * RPOPLPUSH srclist dstlist:
5003 * IF LLEN(srclist) > 0
5004 * element = RPOP srclist
5005 * LPUSH dstlist element
5006 * RETURN element
5007 * ELSE
5008 * RETURN nil
5009 * END
5010 * END
5011 *
5012 * The idea is to be able to get an element from a list in a reliable way
5013 * since the element is not just returned but pushed against another list
5014 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5015 */
5016 static void rpoplpushcommand(redisClient *c) {
5017 robj *sobj;
5018 list *srclist;
5019 listNode *ln;
5020
5021 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5022 checkType(c,sobj,REDIS_LIST)) return;
5023 srclist = sobj->ptr;
5024 ln = listLast(srclist);
5025
5026 if (ln == NULL) {
5027 addReply(c,shared.nullbulk);
5028 } else {
5029 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5030 robj *ele = listNodeValue(ln);
5031 list *dstlist;
5032
5033 if (dobj && dobj->type != REDIS_LIST) {
5034 addReply(c,shared.wrongtypeerr);
5035 return;
5036 }
5037
5038 /* Add the element to the target list (unless it's directly
5039 * passed to some BLPOP-ing client */
5040 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5041 if (dobj == NULL) {
5042 /* Create the list if the key does not exist */
5043 dobj = createListObject();
5044 dictAdd(c->db->dict,c->argv[2],dobj);
5045 incrRefCount(c->argv[2]);
5046 }
5047 dstlist = dobj->ptr;
5048 listAddNodeHead(dstlist,ele);
5049 incrRefCount(ele);
5050 }
5051
5052 /* Send the element to the client as reply as well */
5053 addReplyBulk(c,ele);
5054
5055 /* Finally remove the element from the source list */
5056 listDelNode(srclist,ln);
5057 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5058 server.dirty++;
5059 }
5060 }
5061
5062 /* ==================================== Sets ================================ */
5063
5064 static void saddCommand(redisClient *c) {
5065 robj *set;
5066
5067 set = lookupKeyWrite(c->db,c->argv[1]);
5068 if (set == NULL) {
5069 set = createSetObject();
5070 dictAdd(c->db->dict,c->argv[1],set);
5071 incrRefCount(c->argv[1]);
5072 } else {
5073 if (set->type != REDIS_SET) {
5074 addReply(c,shared.wrongtypeerr);
5075 return;
5076 }
5077 }
5078 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5079 incrRefCount(c->argv[2]);
5080 server.dirty++;
5081 addReply(c,shared.cone);
5082 } else {
5083 addReply(c,shared.czero);
5084 }
5085 }
5086
5087 static void sremCommand(redisClient *c) {
5088 robj *set;
5089
5090 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5091 checkType(c,set,REDIS_SET)) return;
5092
5093 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5094 server.dirty++;
5095 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5096 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5097 addReply(c,shared.cone);
5098 } else {
5099 addReply(c,shared.czero);
5100 }
5101 }
5102
5103 static void smoveCommand(redisClient *c) {
5104 robj *srcset, *dstset;
5105
5106 srcset = lookupKeyWrite(c->db,c->argv[1]);
5107 dstset = lookupKeyWrite(c->db,c->argv[2]);
5108
5109 /* If the source key does not exist return 0, if it's of the wrong type
5110 * raise an error */
5111 if (srcset == NULL || srcset->type != REDIS_SET) {
5112 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5113 return;
5114 }
5115 /* Error if the destination key is not a set as well */
5116 if (dstset && dstset->type != REDIS_SET) {
5117 addReply(c,shared.wrongtypeerr);
5118 return;
5119 }
5120 /* Remove the element from the source set */
5121 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5122 /* Key not found in the src set! return zero */
5123 addReply(c,shared.czero);
5124 return;
5125 }
5126 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5127 deleteKey(c->db,c->argv[1]);
5128 server.dirty++;
5129 /* Add the element to the destination set */
5130 if (!dstset) {
5131 dstset = createSetObject();
5132 dictAdd(c->db->dict,c->argv[2],dstset);
5133 incrRefCount(c->argv[2]);
5134 }
5135 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5136 incrRefCount(c->argv[3]);
5137 addReply(c,shared.cone);
5138 }
5139
5140 static void sismemberCommand(redisClient *c) {
5141 robj *set;
5142
5143 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5144 checkType(c,set,REDIS_SET)) return;
5145
5146 if (dictFind(set->ptr,c->argv[2]))
5147 addReply(c,shared.cone);
5148 else
5149 addReply(c,shared.czero);
5150 }
5151
5152 static void scardCommand(redisClient *c) {
5153 robj *o;
5154 dict *s;
5155
5156 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5157 checkType(c,o,REDIS_SET)) return;
5158
5159 s = o->ptr;
5160 addReplyUlong(c,dictSize(s));
5161 }
5162
5163 static void spopCommand(redisClient *c) {
5164 robj *set;
5165 dictEntry *de;
5166
5167 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5168 checkType(c,set,REDIS_SET)) return;
5169
5170 de = dictGetRandomKey(set->ptr);
5171 if (de == NULL) {
5172 addReply(c,shared.nullbulk);
5173 } else {
5174 robj *ele = dictGetEntryKey(de);
5175
5176 addReplyBulk(c,ele);
5177 dictDelete(set->ptr,ele);
5178 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5179 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5180 server.dirty++;
5181 }
5182 }
5183
5184 static void srandmemberCommand(redisClient *c) {
5185 robj *set;
5186 dictEntry *de;
5187
5188 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5189 checkType(c,set,REDIS_SET)) return;
5190
5191 de = dictGetRandomKey(set->ptr);
5192 if (de == NULL) {
5193 addReply(c,shared.nullbulk);
5194 } else {
5195 robj *ele = dictGetEntryKey(de);
5196
5197 addReplyBulk(c,ele);
5198 }
5199 }
5200
5201 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5202 dict **d1 = (void*) s1, **d2 = (void*) s2;
5203
5204 return dictSize(*d1)-dictSize(*d2);
5205 }
5206
5207 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5208 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5209 dictIterator *di;
5210 dictEntry *de;
5211 robj *lenobj = NULL, *dstset = NULL;
5212 unsigned long j, cardinality = 0;
5213
5214 for (j = 0; j < setsnum; j++) {
5215 robj *setobj;
5216
5217 setobj = dstkey ?
5218 lookupKeyWrite(c->db,setskeys[j]) :
5219 lookupKeyRead(c->db,setskeys[j]);
5220 if (!setobj) {
5221 zfree(dv);
5222 if (dstkey) {
5223 if (deleteKey(c->db,dstkey))
5224 server.dirty++;
5225 addReply(c,shared.czero);
5226 } else {
5227 addReply(c,shared.emptymultibulk);
5228 }
5229 return;
5230 }
5231 if (setobj->type != REDIS_SET) {
5232 zfree(dv);
5233 addReply(c,shared.wrongtypeerr);
5234 return;
5235 }
5236 dv[j] = setobj->ptr;
5237 }
5238 /* Sort sets from the smallest to largest, this will improve our
5239 * algorithm's performace */
5240 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5241
5242 /* The first thing we should output is the total number of elements...
5243 * since this is a multi-bulk write, but at this stage we don't know
5244 * the intersection set size, so we use a trick, append an empty object
5245 * to the output list and save the pointer to later modify it with the
5246 * right length */
5247 if (!dstkey) {
5248 lenobj = createObject(REDIS_STRING,NULL);
5249 addReply(c,lenobj);
5250 decrRefCount(lenobj);
5251 } else {
5252 /* If we have a target key where to store the resulting set
5253 * create this key with an empty set inside */
5254 dstset = createSetObject();
5255 }
5256
5257 /* Iterate all the elements of the first (smallest) set, and test
5258 * the element against all the other sets, if at least one set does
5259 * not include the element it is discarded */
5260 di = dictGetIterator(dv[0]);
5261
5262 while((de = dictNext(di)) != NULL) {
5263 robj *ele;
5264
5265 for (j = 1; j < setsnum; j++)
5266 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5267 if (j != setsnum)
5268 continue; /* at least one set does not contain the member */
5269 ele = dictGetEntryKey(de);
5270 if (!dstkey) {
5271 addReplyBulk(c,ele);
5272 cardinality++;
5273 } else {
5274 dictAdd(dstset->ptr,ele,NULL);
5275 incrRefCount(ele);
5276 }
5277 }
5278 dictReleaseIterator(di);
5279
5280 if (dstkey) {
5281 /* Store the resulting set into the target, if the intersection
5282 * is not an empty set. */
5283 deleteKey(c->db,dstkey);
5284 if (dictSize((dict*)dstset->ptr) > 0) {
5285 dictAdd(c->db->dict,dstkey,dstset);
5286 incrRefCount(dstkey);
5287 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5288 } else {
5289 decrRefCount(dstset);
5290 addReply(c,shared.czero);
5291 }
5292 server.dirty++;
5293 } else {
5294 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5295 }
5296 zfree(dv);
5297 }
5298
5299 static void sinterCommand(redisClient *c) {
5300 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5301 }
5302
5303 static void sinterstoreCommand(redisClient *c) {
5304 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5305 }
5306
/* Set operation selectors, passed as the 'op' argument of
 * sunionDiffGenericCommand(). */
#define REDIS_OP_UNION  0
#define REDIS_OP_DIFF   1
#define REDIS_OP_INTER  2
5310
5311 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5312 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5313 dictIterator *di;
5314 dictEntry *de;
5315 robj *dstset = NULL;
5316 int j, cardinality = 0;
5317
5318 for (j = 0; j < setsnum; j++) {
5319 robj *setobj;
5320
5321 setobj = dstkey ?
5322 lookupKeyWrite(c->db,setskeys[j]) :
5323 lookupKeyRead(c->db,setskeys[j]);
5324 if (!setobj) {
5325 dv[j] = NULL;
5326 continue;
5327 }
5328 if (setobj->type != REDIS_SET) {
5329 zfree(dv);
5330 addReply(c,shared.wrongtypeerr);
5331 return;
5332 }
5333 dv[j] = setobj->ptr;
5334 }
5335
5336 /* We need a temp set object to store our union. If the dstkey
5337 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5338 * this set object will be the resulting object to set into the target key*/
5339 dstset = createSetObject();
5340
5341 /* Iterate all the elements of all the sets, add every element a single
5342 * time to the result set */
5343 for (j = 0; j < setsnum; j++) {
5344 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5345 if (!dv[j]) continue; /* non existing keys are like empty sets */
5346
5347 di = dictGetIterator(dv[j]);
5348
5349 while((de = dictNext(di)) != NULL) {
5350 robj *ele;
5351
5352 /* dictAdd will not add the same element multiple times */
5353 ele = dictGetEntryKey(de);
5354 if (op == REDIS_OP_UNION || j == 0) {
5355 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5356 incrRefCount(ele);
5357 cardinality++;
5358 }
5359 } else if (op == REDIS_OP_DIFF) {
5360 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5361 cardinality--;
5362 }
5363 }
5364 }
5365 dictReleaseIterator(di);
5366
5367 /* result set is empty? Exit asap. */
5368 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5369 }
5370
5371 /* Output the content of the resulting set, if not in STORE mode */
5372 if (!dstkey) {
5373 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5374 di = dictGetIterator(dstset->ptr);
5375 while((de = dictNext(di)) != NULL) {
5376 robj *ele;
5377
5378 ele = dictGetEntryKey(de);
5379 addReplyBulk(c,ele);
5380 }
5381 dictReleaseIterator(di);
5382 decrRefCount(dstset);
5383 } else {
5384 /* If we have a target key where to store the resulting set
5385 * create this key with the result set inside */
5386 deleteKey(c->db,dstkey);
5387 if (dictSize((dict*)dstset->ptr) > 0) {
5388 dictAdd(c->db->dict,dstkey,dstset);
5389 incrRefCount(dstkey);
5390 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5391 } else {
5392 decrRefCount(dstset);
5393 addReply(c,shared.czero);
5394 }
5395 server.dirty++;
5396 }
5397 zfree(dv);
5398 }
5399
5400 static void sunionCommand(redisClient *c) {
5401 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5402 }
5403
5404 static void sunionstoreCommand(redisClient *c) {
5405 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5406 }
5407
5408 static void sdiffCommand(redisClient *c) {
5409 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5410 }
5411
5412 static void sdiffstoreCommand(redisClient *c) {
5413 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5414 }
5415
5416 /* ==================================== ZSets =============================== */
5417
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5425
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows traversing the list
 * from tail to head, which is useful for ZREVRANGE. */
5434
5435 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5436 zskiplistNode *zn = zmalloc(sizeof(*zn));
5437
5438 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5439 if (level > 1)
5440 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5441 else
5442 zn->span = NULL;
5443 zn->score = score;
5444 zn->obj = obj;
5445 return zn;
5446 }
5447
5448 static zskiplist *zslCreate(void) {
5449 int j;
5450 zskiplist *zsl;
5451
5452 zsl = zmalloc(sizeof(*zsl));
5453 zsl->level = 1;
5454 zsl->length = 0;
5455 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5456 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5457 zsl->header->forward[j] = NULL;
5458
5459 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5460 if (j < ZSKIPLIST_MAXLEVEL-1)
5461 zsl->header->span[j] = 0;
5462 }
5463 zsl->header->backward = NULL;
5464 zsl->tail = NULL;
5465 return zsl;
5466 }
5467
5468 static void zslFreeNode(zskiplistNode *node) {
5469 decrRefCount(node->obj);
5470 zfree(node->forward);
5471 zfree(node->span);
5472 zfree(node);
5473 }
5474
5475 static void zslFree(zskiplist *zsl) {
5476 zskiplistNode *node = zsl->header->forward[0], *next;
5477
5478 zfree(zsl->header->forward);
5479 zfree(zsl->header->span);
5480 zfree(zsl->header);
5481 while(node) {
5482 next = node->forward[0];
5483 zslFreeNode(node);
5484 node = next;
5485 }
5486 zfree(zsl);
5487 }
5488
5489 static int zslRandomLevel(void) {
5490 int level = 1;
5491 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5492 level += 1;
5493 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5494 }
5495
5496 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5497 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5498 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5499 int i, level;
5500
5501 x = zsl->header;
5502 for (i = zsl->level-1; i >= 0; i--) {
5503 /* store rank that is crossed to reach the insert position */
5504 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5505
5506 while (x->forward[i] &&
5507 (x->forward[i]->score < score ||
5508 (x->forward[i]->score == score &&
5509 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5510 rank[i] += i > 0 ? x->span[i-1] : 1;
5511 x = x->forward[i];
5512 }
5513 update[i] = x;
5514 }
5515 /* we assume the key is not already inside, since we allow duplicated
5516 * scores, and the re-insertion of score and redis object should never
5517 * happpen since the caller of zslInsert() should test in the hash table
5518 * if the element is already inside or not. */
5519 level = zslRandomLevel();
5520 if (level > zsl->level) {
5521 for (i = zsl->level; i < level; i++) {
5522 rank[i] = 0;
5523 update[i] = zsl->header;
5524 update[i]->span[i-1] = zsl->length;
5525 }
5526 zsl->level = level;
5527 }
5528 x = zslCreateNode(level,score,obj);
5529 for (i = 0; i < level; i++) {
5530 x->forward[i] = update[i]->forward[i];
5531 update[i]->forward[i] = x;
5532
5533 /* update span covered by update[i] as x is inserted here */
5534 if (i > 0) {
5535 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5536 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5537 }
5538 }
5539
5540 /* increment span for untouched levels */
5541 for (i = level; i < zsl->level; i++) {
5542 update[i]->span[i-1]++;
5543 }
5544
5545 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5546 if (x->forward[0])
5547 x->forward[0]->backward = x;
5548 else
5549 zsl->tail = x;
5550 zsl->length++;
5551 }
5552
5553 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5554 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5555 int i;
5556 for (i = 0; i < zsl->level; i++) {
5557 if (update[i]->forward[i] == x) {
5558 if (i > 0) {
5559 update[i]->span[i-1] += x->span[i-1] - 1;
5560 }
5561 update[i]->forward[i] = x->forward[i];
5562 } else {
5563 /* invariant: i > 0, because update[0]->forward[0]
5564 * is always equal to x */
5565 update[i]->span[i-1] -= 1;
5566 }
5567 }
5568 if (x->forward[0]) {
5569 x->forward[0]->backward = x->backward;
5570 } else {
5571 zsl->tail = x->backward;
5572 }
5573 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5574 zsl->level--;
5575 zsl->length--;
5576 }
5577
5578 /* Delete an element with matching score/object from the skiplist. */
5579 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5580 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5581 int i;
5582
5583 x = zsl->header;
5584 for (i = zsl->level-1; i >= 0; i--) {
5585 while (x->forward[i] &&
5586 (x->forward[i]->score < score ||
5587 (x->forward[i]->score == score &&
5588 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5589 x = x->forward[i];
5590 update[i] = x;
5591 }
5592 /* We may have multiple elements with the same score, what we need
5593 * is to find the element with both the right score and object. */
5594 x = x->forward[0];
5595 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5596 zslDeleteNode(zsl, x, update);
5597 zslFreeNode(x);
5598 return 1;
5599 } else {
5600 return 0; /* not found */
5601 }
5602 return 0; /* not found */
5603 }
5604
5605 /* Delete all the elements with score between min and max from the skiplist.
5606 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5607 * Note that this function takes the reference to the hash table view of the
5608 * sorted set, in order to remove the elements from the hash table too. */
5609 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5610 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5611 unsigned long removed = 0;
5612 int i;
5613
5614 x = zsl->header;
5615 for (i = zsl->level-1; i >= 0; i--) {
5616 while (x->forward[i] && x->forward[i]->score < min)
5617 x = x->forward[i];
5618 update[i] = x;
5619 }
5620 /* We may have multiple elements with the same score, what we need
5621 * is to find the element with both the right score and object. */
5622 x = x->forward[0];
5623 while (x && x->score <= max) {
5624 zskiplistNode *next = x->forward[0];
5625 zslDeleteNode(zsl, x, update);
5626 dictDelete(dict,x->obj);
5627 zslFreeNode(x);
5628 removed++;
5629 x = next;
5630 }
5631 return removed; /* not found */
5632 }
5633
5634 /* Delete all the elements with rank between start and end from the skiplist.
5635 * Start and end are inclusive. Note that start and end need to be 1-based */
5636 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5637 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5638 unsigned long traversed = 0, removed = 0;
5639 int i;
5640
5641 x = zsl->header;
5642 for (i = zsl->level-1; i >= 0; i--) {
5643 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5644 traversed += i > 0 ? x->span[i-1] : 1;
5645 x = x->forward[i];
5646 }
5647 update[i] = x;
5648 }
5649
5650 traversed++;
5651 x = x->forward[0];
5652 while (x && traversed <= end) {
5653 zskiplistNode *next = x->forward[0];
5654 zslDeleteNode(zsl, x, update);
5655 dictDelete(dict,x->obj);
5656 zslFreeNode(x);
5657 removed++;
5658 traversed++;
5659 x = next;
5660 }
5661 return removed;
5662 }
5663
5664 /* Find the first node having a score equal or greater than the specified one.
5665 * Returns NULL if there is no match. */
5666 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5667 zskiplistNode *x;
5668 int i;
5669
5670 x = zsl->header;
5671 for (i = zsl->level-1; i >= 0; i--) {
5672 while (x->forward[i] && x->forward[i]->score < score)
5673 x = x->forward[i];
5674 }
5675 /* We may have multiple elements with the same score, what we need
5676 * is to find the element with both the right score and object. */
5677 return x->forward[0];
5678 }
5679
5680 /* Find the rank for an element by both score and key.
5681 * Returns 0 when the element cannot be found, rank otherwise.
5682 * Note that the rank is 1-based due to the span of zsl->header to the
5683 * first element. */
5684 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5685 zskiplistNode *x;
5686 unsigned long rank = 0;
5687 int i;
5688
5689 x = zsl->header;
5690 for (i = zsl->level-1; i >= 0; i--) {
5691 while (x->forward[i] &&
5692 (x->forward[i]->score < score ||
5693 (x->forward[i]->score == score &&
5694 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5695 rank += i > 0 ? x->span[i-1] : 1;
5696 x = x->forward[i];
5697 }
5698
5699 /* x might be equal to zsl->header, so test if obj is non-NULL */
5700 if (x->obj && equalStringObjects(x->obj,o)) {
5701 return rank;
5702 }
5703 }
5704 return 0;
5705 }
5706
5707 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5708 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5709 zskiplistNode *x;
5710 unsigned long traversed = 0;
5711 int i;
5712
5713 x = zsl->header;
5714 for (i = zsl->level-1; i >= 0; i--) {
5715 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5716 {
5717 traversed += i > 0 ? x->span[i-1] : 1;
5718 x = x->forward[i];
5719 }
5720 if (traversed == rank) {
5721 return x;
5722 }
5723 }
5724 return NULL;
5725 }
5726
5727 /* The actual Z-commands implementations */
5728
5729 /* This generic command implements both ZADD and ZINCRBY.
5730 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5731 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5732 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5733 robj *zsetobj;
5734 zset *zs;
5735 double *score;
5736
5737 if (isnan(scoreval)) {
5738 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5739 return;
5740 }
5741
5742 zsetobj = lookupKeyWrite(c->db,key);
5743 if (zsetobj == NULL) {
5744 zsetobj = createZsetObject();
5745 dictAdd(c->db->dict,key,zsetobj);
5746 incrRefCount(key);
5747 } else {
5748 if (zsetobj->type != REDIS_ZSET) {
5749 addReply(c,shared.wrongtypeerr);
5750 return;
5751 }
5752 }
5753 zs = zsetobj->ptr;
5754
5755 /* Ok now since we implement both ZADD and ZINCRBY here the code
5756 * needs to handle the two different conditions. It's all about setting
5757 * '*score', that is, the new score to set, to the right value. */
5758 score = zmalloc(sizeof(double));
5759 if (doincrement) {
5760 dictEntry *de;
5761
5762 /* Read the old score. If the element was not present starts from 0 */
5763 de = dictFind(zs->dict,ele);
5764 if (de) {
5765 double *oldscore = dictGetEntryVal(de);
5766 *score = *oldscore + scoreval;
5767 } else {
5768 *score = scoreval;
5769 }
5770 if (isnan(*score)) {
5771 addReplySds(c,
5772 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5773 zfree(score);
5774 /* Note that we don't need to check if the zset may be empty and
5775 * should be removed here, as we can only obtain Nan as score if
5776 * there was already an element in the sorted set. */
5777 return;
5778 }
5779 } else {
5780 *score = scoreval;
5781 }
5782
5783 /* What follows is a simple remove and re-insert operation that is common
5784 * to both ZADD and ZINCRBY... */
5785 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5786 /* case 1: New element */
5787 incrRefCount(ele); /* added to hash */
5788 zslInsert(zs->zsl,*score,ele);
5789 incrRefCount(ele); /* added to skiplist */
5790 server.dirty++;
5791 if (doincrement)
5792 addReplyDouble(c,*score);
5793 else
5794 addReply(c,shared.cone);
5795 } else {
5796 dictEntry *de;
5797 double *oldscore;
5798
5799 /* case 2: Score update operation */
5800 de = dictFind(zs->dict,ele);
5801 redisAssert(de != NULL);
5802 oldscore = dictGetEntryVal(de);
5803 if (*score != *oldscore) {
5804 int deleted;
5805
5806 /* Remove and insert the element in the skip list with new score */
5807 deleted = zslDelete(zs->zsl,*oldscore,ele);
5808 redisAssert(deleted != 0);
5809 zslInsert(zs->zsl,*score,ele);
5810 incrRefCount(ele);
5811 /* Update the score in the hash table */
5812 dictReplace(zs->dict,ele,score);
5813 server.dirty++;
5814 } else {
5815 zfree(score);
5816 }
5817 if (doincrement)
5818 addReplyDouble(c,*score);
5819 else
5820 addReply(c,shared.czero);
5821 }
5822 }
5823
5824 static void zaddCommand(redisClient *c) {
5825 double scoreval;
5826
5827 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5828 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5829 }
5830
5831 static void zincrbyCommand(redisClient *c) {
5832 double scoreval;
5833
5834 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5835 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5836 }
5837
5838 static void zremCommand(redisClient *c) {
5839 robj *zsetobj;
5840 zset *zs;
5841 dictEntry *de;
5842 double *oldscore;
5843 int deleted;
5844
5845 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5846 checkType(c,zsetobj,REDIS_ZSET)) return;
5847
5848 zs = zsetobj->ptr;
5849 de = dictFind(zs->dict,c->argv[2]);
5850 if (de == NULL) {
5851 addReply(c,shared.czero);
5852 return;
5853 }
5854 /* Delete from the skiplist */
5855 oldscore = dictGetEntryVal(de);
5856 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5857 redisAssert(deleted != 0);
5858
5859 /* Delete from the hash table */
5860 dictDelete(zs->dict,c->argv[2]);
5861 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5862 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5863 server.dirty++;
5864 addReply(c,shared.cone);
5865 }
5866
5867 static void zremrangebyscoreCommand(redisClient *c) {
5868 double min;
5869 double max;
5870 long deleted;
5871 robj *zsetobj;
5872 zset *zs;
5873
5874 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5875 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5876
5877 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5878 checkType(c,zsetobj,REDIS_ZSET)) return;
5879
5880 zs = zsetobj->ptr;
5881 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5882 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5883 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5884 server.dirty += deleted;
5885 addReplyLongLong(c,deleted);
5886 }
5887
5888 static void zremrangebyrankCommand(redisClient *c) {
5889 long start;
5890 long end;
5891 int llen;
5892 long deleted;
5893 robj *zsetobj;
5894 zset *zs;
5895
5896 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5897 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5898
5899 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5900 checkType(c,zsetobj,REDIS_ZSET)) return;
5901 zs = zsetobj->ptr;
5902 llen = zs->zsl->length;
5903
5904 /* convert negative indexes */
5905 if (start < 0) start = llen+start;
5906 if (end < 0) end = llen+end;
5907 if (start < 0) start = 0;
5908 if (end < 0) end = 0;
5909
5910 /* indexes sanity checks */
5911 if (start > end || start >= llen) {
5912 addReply(c,shared.czero);
5913 return;
5914 }
5915 if (end >= llen) end = llen-1;
5916
5917 /* increment start and end because zsl*Rank functions
5918 * use 1-based rank */
5919 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5920 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5921 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5922 server.dirty += deleted;
5923 addReplyLongLong(c, deleted);
5924 }
5925
5926 typedef struct {
5927 dict *dict;
5928 double weight;
5929 } zsetopsrc;
5930
5931 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5932 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5933 unsigned long size1, size2;
5934 size1 = d1->dict ? dictSize(d1->dict) : 0;
5935 size2 = d2->dict ? dictSize(d2->dict) : 0;
5936 return size1 - size2;
5937 }
5938
/* Aggregation modes accepted by zunionInterAggregate() */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in dict entry '_e': an entry whose value is NULL is treated
 * as having score 1.0. Note that '_e' is evaluated more than once. */
#define zunionInterDictValue(_e) \
    (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
5943
5944 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5945 if (aggregate == REDIS_AGGR_SUM) {
5946 *target = *target + val;
5947 } else if (aggregate == REDIS_AGGR_MIN) {
5948 *target = val < *target ? val : *target;
5949 } else if (aggregate == REDIS_AGGR_MAX) {
5950 *target = val > *target ? val : *target;
5951 } else {
5952 /* safety net */
5953 redisPanic("Unknown ZUNION/INTER aggregate type");
5954 }
5955 }
5956
5957 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5958 int i, j, setnum;
5959 int aggregate = REDIS_AGGR_SUM;
5960 zsetopsrc *src;
5961 robj *dstobj;
5962 zset *dstzset;
5963 dictIterator *di;
5964 dictEntry *de;
5965
5966 /* expect setnum input keys to be given */
5967 setnum = atoi(c->argv[2]->ptr);
5968 if (setnum < 1) {
5969 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5970 return;
5971 }
5972
5973 /* test if the expected number of keys would overflow */
5974 if (3+setnum > c->argc) {
5975 addReply(c,shared.syntaxerr);
5976 return;
5977 }
5978
5979 /* read keys to be used for input */
5980 src = zmalloc(sizeof(zsetopsrc) * setnum);
5981 for (i = 0, j = 3; i < setnum; i++, j++) {
5982 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
5983 if (!obj) {
5984 src[i].dict = NULL;
5985 } else {
5986 if (obj->type == REDIS_ZSET) {
5987 src[i].dict = ((zset*)obj->ptr)->dict;
5988 } else if (obj->type == REDIS_SET) {
5989 src[i].dict = (obj->ptr);
5990 } else {
5991 zfree(src);
5992 addReply(c,shared.wrongtypeerr);
5993 return;
5994 }
5995 }
5996
5997 /* default all weights to 1 */
5998 src[i].weight = 1.0;
5999 }
6000
6001 /* parse optional extra arguments */
6002 if (j < c->argc) {
6003 int remaining = c->argc - j;
6004
6005 while (remaining) {
6006 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6007 j++; remaining--;
6008 for (i = 0; i < setnum; i++, j++, remaining--) {
6009 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6010 return;
6011 }
6012 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6013 j++; remaining--;
6014 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6015 aggregate = REDIS_AGGR_SUM;
6016 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6017 aggregate = REDIS_AGGR_MIN;
6018 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6019 aggregate = REDIS_AGGR_MAX;
6020 } else {
6021 zfree(src);
6022 addReply(c,shared.syntaxerr);
6023 return;
6024 }
6025 j++; remaining--;
6026 } else {
6027 zfree(src);
6028 addReply(c,shared.syntaxerr);
6029 return;
6030 }
6031 }
6032 }
6033
6034 /* sort sets from the smallest to largest, this will improve our
6035 * algorithm's performance */
6036 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6037
6038 dstobj = createZsetObject();
6039 dstzset = dstobj->ptr;
6040
6041 if (op == REDIS_OP_INTER) {
6042 /* skip going over all entries if the smallest zset is NULL or empty */
6043 if (src[0].dict && dictSize(src[0].dict) > 0) {
6044 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6045 * from small to large, all src[i > 0].dict are non-empty too */
6046 di = dictGetIterator(src[0].dict);
6047 while((de = dictNext(di)) != NULL) {
6048 double *score = zmalloc(sizeof(double)), value;
6049 *score = src[0].weight * zunionInterDictValue(de);
6050
6051 for (j = 1; j < setnum; j++) {
6052 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6053 if (other) {
6054 value = src[j].weight * zunionInterDictValue(other);
6055 zunionInterAggregate(score, value, aggregate);
6056 } else {
6057 break;
6058 }
6059 }
6060
6061 /* skip entry when not present in every source dict */
6062 if (j != setnum) {
6063 zfree(score);
6064 } else {
6065 robj *o = dictGetEntryKey(de);
6066 dictAdd(dstzset->dict,o,score);
6067 incrRefCount(o); /* added to dictionary */
6068 zslInsert(dstzset->zsl,*score,o);
6069 incrRefCount(o); /* added to skiplist */
6070 }
6071 }
6072 dictReleaseIterator(di);
6073 }
6074 } else if (op == REDIS_OP_UNION) {
6075 for (i = 0; i < setnum; i++) {
6076 if (!src[i].dict) continue;
6077
6078 di = dictGetIterator(src[i].dict);
6079 while((de = dictNext(di)) != NULL) {
6080 /* skip key when already processed */
6081 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6082
6083 double *score = zmalloc(sizeof(double)), value;
6084 *score = src[i].weight * zunionInterDictValue(de);
6085
6086 /* because the zsets are sorted by size, its only possible
6087 * for sets at larger indices to hold this entry */
6088 for (j = (i+1); j < setnum; j++) {
6089 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6090 if (other) {
6091 value = src[j].weight * zunionInterDictValue(other);
6092 zunionInterAggregate(score, value, aggregate);
6093 }
6094 }
6095
6096 robj *o = dictGetEntryKey(de);
6097 dictAdd(dstzset->dict,o,score);
6098 incrRefCount(o); /* added to dictionary */
6099 zslInsert(dstzset->zsl,*score,o);
6100 incrRefCount(o); /* added to skiplist */
6101 }
6102 dictReleaseIterator(di);
6103 }
6104 } else {
6105 /* unknown operator */
6106 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6107 }
6108
6109 deleteKey(c->db,dstkey);
6110 if (dstzset->zsl->length) {
6111 dictAdd(c->db->dict,dstkey,dstobj);
6112 incrRefCount(dstkey);
6113 addReplyLongLong(c, dstzset->zsl->length);
6114 server.dirty++;
6115 } else {
6116 decrRefCount(dstobj);
6117 addReply(c, shared.czero);
6118 }
6119 zfree(src);
6120 }
6121
6122 static void zunionstoreCommand(redisClient *c) {
6123 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6124 }
6125
6126 static void zinterstoreCommand(redisClient *c) {
6127 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6128 }
6129
6130 static void zrangeGenericCommand(redisClient *c, int reverse) {
6131 robj *o;
6132 long start;
6133 long end;
6134 int withscores = 0;
6135 int llen;
6136 int rangelen, j;
6137 zset *zsetobj;
6138 zskiplist *zsl;
6139 zskiplistNode *ln;
6140 robj *ele;
6141
6142 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6143 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6144
6145 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6146 withscores = 1;
6147 } else if (c->argc >= 5) {
6148 addReply(c,shared.syntaxerr);
6149 return;
6150 }
6151
6152 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6153 || checkType(c,o,REDIS_ZSET)) return;
6154 zsetobj = o->ptr;
6155 zsl = zsetobj->zsl;
6156 llen = zsl->length;
6157
6158 /* convert negative indexes */
6159 if (start < 0) start = llen+start;
6160 if (end < 0) end = llen+end;
6161 if (start < 0) start = 0;
6162 if (end < 0) end = 0;
6163
6164 /* indexes sanity checks */
6165 if (start > end || start >= llen) {
6166 /* Out of range start or start > end result in empty list */
6167 addReply(c,shared.emptymultibulk);
6168 return;
6169 }
6170 if (end >= llen) end = llen-1;
6171 rangelen = (end-start)+1;
6172
6173 /* check if starting point is trivial, before searching
6174 * the element in log(N) time */
6175 if (reverse) {
6176 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6177 } else {
6178 ln = start == 0 ?
6179 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6180 }
6181
6182 /* Return the result in form of a multi-bulk reply */
6183 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6184 withscores ? (rangelen*2) : rangelen));
6185 for (j = 0; j < rangelen; j++) {
6186 ele = ln->obj;
6187 addReplyBulk(c,ele);
6188 if (withscores)
6189 addReplyDouble(c,ln->score);
6190 ln = reverse ? ln->backward : ln->forward[0];
6191 }
6192 }
6193
6194 static void zrangeCommand(redisClient *c) {
6195 zrangeGenericCommand(c,0);
6196 }
6197
6198 static void zrevrangeCommand(redisClient *c) {
6199 zrangeGenericCommand(c,1);
6200 }
6201
6202 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6203 * If justcount is non-zero, just the count is returned. */
6204 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6205 robj *o;
6206 double min, max;
6207 int minex = 0, maxex = 0; /* are min or max exclusive? */
6208 int offset = 0, limit = -1;
6209 int withscores = 0;
6210 int badsyntax = 0;
6211
6212 /* Parse the min-max interval. If one of the values is prefixed
6213 * by the "(" character, it's considered "open". For instance
6214 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6215 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6216 if (((char*)c->argv[2]->ptr)[0] == '(') {
6217 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6218 minex = 1;
6219 } else {
6220 min = strtod(c->argv[2]->ptr,NULL);
6221 }
6222 if (((char*)c->argv[3]->ptr)[0] == '(') {
6223 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6224 maxex = 1;
6225 } else {
6226 max = strtod(c->argv[3]->ptr,NULL);
6227 }
6228
6229 /* Parse "WITHSCORES": note that if the command was called with
6230 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6231 * enter the following paths to parse WITHSCORES and LIMIT. */
6232 if (c->argc == 5 || c->argc == 8) {
6233 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6234 withscores = 1;
6235 else
6236 badsyntax = 1;
6237 }
6238 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6239 badsyntax = 1;
6240 if (badsyntax) {
6241 addReplySds(c,
6242 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6243 return;
6244 }
6245
6246 /* Parse "LIMIT" */
6247 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6248 addReply(c,shared.syntaxerr);
6249 return;
6250 } else if (c->argc == (7 + withscores)) {
6251 offset = atoi(c->argv[5]->ptr);
6252 limit = atoi(c->argv[6]->ptr);
6253 if (offset < 0) offset = 0;
6254 }
6255
6256 /* Ok, lookup the key and get the range */
6257 o = lookupKeyRead(c->db,c->argv[1]);
6258 if (o == NULL) {
6259 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6260 } else {
6261 if (o->type != REDIS_ZSET) {
6262 addReply(c,shared.wrongtypeerr);
6263 } else {
6264 zset *zsetobj = o->ptr;
6265 zskiplist *zsl = zsetobj->zsl;
6266 zskiplistNode *ln;
6267 robj *ele, *lenobj = NULL;
6268 unsigned long rangelen = 0;
6269
6270 /* Get the first node with the score >= min, or with
6271 * score > min if 'minex' is true. */
6272 ln = zslFirstWithScore(zsl,min);
6273 while (minex && ln && ln->score == min) ln = ln->forward[0];
6274
6275 if (ln == NULL) {
6276 /* No element matching the speciifed interval */
6277 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6278 return;
6279 }
6280
6281 /* We don't know in advance how many matching elements there
6282 * are in the list, so we push this object that will represent
6283 * the multi-bulk length in the output buffer, and will "fix"
6284 * it later */
6285 if (!justcount) {
6286 lenobj = createObject(REDIS_STRING,NULL);
6287 addReply(c,lenobj);
6288 decrRefCount(lenobj);
6289 }
6290
6291 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6292 if (offset) {
6293 offset--;
6294 ln = ln->forward[0];
6295 continue;
6296 }
6297 if (limit == 0) break;
6298 if (!justcount) {
6299 ele = ln->obj;
6300 addReplyBulk(c,ele);
6301 if (withscores)
6302 addReplyDouble(c,ln->score);
6303 }
6304 ln = ln->forward[0];
6305 rangelen++;
6306 if (limit > 0) limit--;
6307 }
6308 if (justcount) {
6309 addReplyLongLong(c,(long)rangelen);
6310 } else {
6311 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6312 withscores ? (rangelen*2) : rangelen);
6313 }
6314 }
6315 }
6316 }
6317
6318 static void zrangebyscoreCommand(redisClient *c) {
6319 genericZrangebyscoreCommand(c,0);
6320 }
6321
6322 static void zcountCommand(redisClient *c) {
6323 genericZrangebyscoreCommand(c,1);
6324 }
6325
6326 static void zcardCommand(redisClient *c) {
6327 robj *o;
6328 zset *zs;
6329
6330 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6331 checkType(c,o,REDIS_ZSET)) return;
6332
6333 zs = o->ptr;
6334 addReplyUlong(c,zs->zsl->length);
6335 }
6336
6337 static void zscoreCommand(redisClient *c) {
6338 robj *o;
6339 zset *zs;
6340 dictEntry *de;
6341
6342 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6343 checkType(c,o,REDIS_ZSET)) return;
6344
6345 zs = o->ptr;
6346 de = dictFind(zs->dict,c->argv[2]);
6347 if (!de) {
6348 addReply(c,shared.nullbulk);
6349 } else {
6350 double *score = dictGetEntryVal(de);
6351
6352 addReplyDouble(c,*score);
6353 }
6354 }
6355
6356 static void zrankGenericCommand(redisClient *c, int reverse) {
6357 robj *o;
6358 zset *zs;
6359 zskiplist *zsl;
6360 dictEntry *de;
6361 unsigned long rank;
6362 double *score;
6363
6364 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6365 checkType(c,o,REDIS_ZSET)) return;
6366
6367 zs = o->ptr;
6368 zsl = zs->zsl;
6369 de = dictFind(zs->dict,c->argv[2]);
6370 if (!de) {
6371 addReply(c,shared.nullbulk);
6372 return;
6373 }
6374
6375 score = dictGetEntryVal(de);
6376 rank = zslGetRank(zsl, *score, c->argv[2]);
6377 if (rank) {
6378 if (reverse) {
6379 addReplyLongLong(c, zsl->length - rank);
6380 } else {
6381 addReplyLongLong(c, rank-1);
6382 }
6383 } else {
6384 addReply(c,shared.nullbulk);
6385 }
6386 }
6387
6388 static void zrankCommand(redisClient *c) {
6389 zrankGenericCommand(c, 0);
6390 }
6391
6392 static void zrevrankCommand(redisClient *c) {
6393 zrankGenericCommand(c, 1);
6394 }
6395
6396 /* ========================= Hashes utility functions ======================= */
/* Flags selecting what hashCurrent() returns for the current entry */
#define REDIS_HASH_KEY   1
#define REDIS_HASH_VALUE 2
6399
6400 /* Check the length of a number of objects to see if we need to convert a
6401 * zipmap to a real hash. Note that we only check string encoded objects
6402 * as their string length can be queried in constant time. */
6403 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6404 int i;
6405 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6406
6407 for (i = start; i <= end; i++) {
6408 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6409 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6410 {
6411 convertToRealHash(subject);
6412 return;
6413 }
6414 }
6415 }
6416
6417 /* Encode given objects in-place when the hash uses a dict. */
6418 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6419 if (subject->encoding == REDIS_ENCODING_HT) {
6420 if (o1) *o1 = tryObjectEncoding(*o1);
6421 if (o2) *o2 = tryObjectEncoding(*o2);
6422 }
6423 }
6424
6425 /* Get the value from a hash identified by key. Returns either a string
6426 * object or NULL if the value cannot be found. The refcount of the object
6427 * is always increased by 1 when the value was found. */
6428 static robj *hashGet(robj *o, robj *key) {
6429 robj *value = NULL;
6430 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6431 unsigned char *v;
6432 unsigned int vlen;
6433 key = getDecodedObject(key);
6434 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6435 value = createStringObject((char*)v,vlen);
6436 }
6437 decrRefCount(key);
6438 } else {
6439 dictEntry *de = dictFind(o->ptr,key);
6440 if (de != NULL) {
6441 value = dictGetEntryVal(de);
6442 incrRefCount(value);
6443 }
6444 }
6445 return value;
6446 }
6447
6448 /* Test if the key exists in the given hash. Returns 1 if the key
6449 * exists and 0 when it doesn't. */
6450 static int hashExists(robj *o, robj *key) {
6451 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6452 key = getDecodedObject(key);
6453 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6454 decrRefCount(key);
6455 return 1;
6456 }
6457 decrRefCount(key);
6458 } else {
6459 if (dictFind(o->ptr,key) != NULL) {
6460 return 1;
6461 }
6462 }
6463 return 0;
6464 }
6465
6466 /* Add an element, discard the old if the key already exists.
6467 * Return 0 on insert and 1 on update. */
6468 static int hashSet(robj *o, robj *key, robj *value) {
6469 int update = 0;
6470 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6471 key = getDecodedObject(key);
6472 value = getDecodedObject(value);
6473 o->ptr = zipmapSet(o->ptr,
6474 key->ptr,sdslen(key->ptr),
6475 value->ptr,sdslen(value->ptr), &update);
6476 decrRefCount(key);
6477 decrRefCount(value);
6478
6479 /* Check if the zipmap needs to be upgraded to a real hash table */
6480 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6481 convertToRealHash(o);
6482 } else {
6483 if (dictReplace(o->ptr,key,value)) {
6484 /* Insert */
6485 incrRefCount(key);
6486 } else {
6487 /* Update */
6488 update = 1;
6489 }
6490 incrRefCount(value);
6491 }
6492 return update;
6493 }
6494
6495 /* Delete an element from a hash.
6496 * Return 1 on deleted and 0 on not found. */
6497 static int hashDelete(robj *o, robj *key) {
6498 int deleted = 0;
6499 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6500 key = getDecodedObject(key);
6501 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6502 decrRefCount(key);
6503 } else {
6504 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6505 /* Always check if the dictionary needs a resize after a delete. */
6506 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6507 }
6508 return deleted;
6509 }
6510
6511 /* Return the number of elements in a hash. */
6512 static unsigned long hashLength(robj *o) {
6513 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6514 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6515 }
6516
6517 /* Structure to hold hash iteration abstration. Note that iteration over
6518 * hashes involves both fields and values. Because it is possible that
6519 * not both are required, store pointers in the iterator to avoid
6520 * unnecessary memory allocation for fields/values. */
6521 typedef struct {
6522 int encoding;
6523 unsigned char *zi;
6524 unsigned char *zk, *zv;
6525 unsigned int zklen, zvlen;
6526
6527 dictIterator *di;
6528 dictEntry *de;
6529 } hashIterator;
6530
6531 static hashIterator *hashInitIterator(robj *subject) {
6532 hashIterator *hi = zmalloc(sizeof(hashIterator));
6533 hi->encoding = subject->encoding;
6534 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6535 hi->zi = zipmapRewind(subject->ptr);
6536 } else if (hi->encoding == REDIS_ENCODING_HT) {
6537 hi->di = dictGetIterator(subject->ptr);
6538 } else {
6539 redisAssert(NULL);
6540 }
6541 return hi;
6542 }
6543
6544 static void hashReleaseIterator(hashIterator *hi) {
6545 if (hi->encoding == REDIS_ENCODING_HT) {
6546 dictReleaseIterator(hi->di);
6547 }
6548 zfree(hi);
6549 }
6550
6551 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6552 * could be found and REDIS_ERR when the iterator reaches the end. */
6553 static int hashNext(hashIterator *hi) {
6554 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6555 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6556 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6557 } else {
6558 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6559 }
6560 return REDIS_OK;
6561 }
6562
6563 /* Get key or value object at current iteration position.
6564 * This increases the refcount of the field object by 1. */
6565 static robj *hashCurrent(hashIterator *hi, int what) {
6566 robj *o;
6567 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6568 if (what & REDIS_HASH_KEY) {
6569 o = createStringObject((char*)hi->zk,hi->zklen);
6570 } else {
6571 o = createStringObject((char*)hi->zv,hi->zvlen);
6572 }
6573 } else {
6574 if (what & REDIS_HASH_KEY) {
6575 o = dictGetEntryKey(hi->de);
6576 } else {
6577 o = dictGetEntryVal(hi->de);
6578 }
6579 incrRefCount(o);
6580 }
6581 return o;
6582 }
6583
6584 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6585 robj *o = lookupKeyWrite(c->db,key);
6586 if (o == NULL) {
6587 o = createHashObject();
6588 dictAdd(c->db->dict,key,o);
6589 incrRefCount(key);
6590 } else {
6591 if (o->type != REDIS_HASH) {
6592 addReply(c,shared.wrongtypeerr);
6593 return NULL;
6594 }
6595 }
6596 return o;
6597 }
6598
6599 /* ============================= Hash commands ============================== */
6600 static void hsetCommand(redisClient *c) {
6601 int update;
6602 robj *o;
6603
6604 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6605 hashTryConversion(o,c->argv,2,3);
6606 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6607 update = hashSet(o,c->argv[2],c->argv[3]);
6608 addReply(c, update ? shared.czero : shared.cone);
6609 server.dirty++;
6610 }
6611
6612 static void hsetnxCommand(redisClient *c) {
6613 robj *o;
6614 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6615 hashTryConversion(o,c->argv,2,3);
6616
6617 if (hashExists(o, c->argv[2])) {
6618 addReply(c, shared.czero);
6619 } else {
6620 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6621 hashSet(o,c->argv[2],c->argv[3]);
6622 addReply(c, shared.cone);
6623 server.dirty++;
6624 }
6625 }
6626
6627 static void hmsetCommand(redisClient *c) {
6628 int i;
6629 robj *o;
6630
6631 if ((c->argc % 2) == 1) {
6632 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6633 return;
6634 }
6635
6636 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6637 hashTryConversion(o,c->argv,2,c->argc-1);
6638 for (i = 2; i < c->argc; i += 2) {
6639 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6640 hashSet(o,c->argv[i],c->argv[i+1]);
6641 }
6642 addReply(c, shared.ok);
6643 server.dirty++;
6644 }
6645
6646 static void hincrbyCommand(redisClient *c) {
6647 long long value, incr;
6648 robj *o, *current, *new;
6649
6650 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6651 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6652 if ((current = hashGet(o,c->argv[2])) != NULL) {
6653 if (getLongLongFromObjectOrReply(c,current,&value,
6654 "hash value is not an integer") != REDIS_OK) {
6655 decrRefCount(current);
6656 return;
6657 }
6658 decrRefCount(current);
6659 } else {
6660 value = 0;
6661 }
6662
6663 value += incr;
6664 new = createStringObjectFromLongLong(value);
6665 hashTryObjectEncoding(o,&c->argv[2],NULL);
6666 hashSet(o,c->argv[2],new);
6667 decrRefCount(new);
6668 addReplyLongLong(c,value);
6669 server.dirty++;
6670 }
6671
6672 static void hgetCommand(redisClient *c) {
6673 robj *o, *value;
6674 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6675 checkType(c,o,REDIS_HASH)) return;
6676
6677 if ((value = hashGet(o,c->argv[2])) != NULL) {
6678 addReplyBulk(c,value);
6679 decrRefCount(value);
6680 } else {
6681 addReply(c,shared.nullbulk);
6682 }
6683 }
6684
6685 static void hmgetCommand(redisClient *c) {
6686 int i;
6687 robj *o, *value;
6688 o = lookupKeyRead(c->db,c->argv[1]);
6689 if (o != NULL && o->type != REDIS_HASH) {
6690 addReply(c,shared.wrongtypeerr);
6691 }
6692
6693 /* Note the check for o != NULL happens inside the loop. This is
6694 * done because objects that cannot be found are considered to be
6695 * an empty hash. The reply should then be a series of NULLs. */
6696 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6697 for (i = 2; i < c->argc; i++) {
6698 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6699 addReplyBulk(c,value);
6700 decrRefCount(value);
6701 } else {
6702 addReply(c,shared.nullbulk);
6703 }
6704 }
6705 }
6706
6707 static void hdelCommand(redisClient *c) {
6708 robj *o;
6709 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6710 checkType(c,o,REDIS_HASH)) return;
6711
6712 if (hashDelete(o,c->argv[2])) {
6713 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6714 addReply(c,shared.cone);
6715 server.dirty++;
6716 } else {
6717 addReply(c,shared.czero);
6718 }
6719 }
6720
6721 static void hlenCommand(redisClient *c) {
6722 robj *o;
6723 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6724 checkType(c,o,REDIS_HASH)) return;
6725
6726 addReplyUlong(c,hashLength(o));
6727 }
6728
6729 static void genericHgetallCommand(redisClient *c, int flags) {
6730 robj *o, *lenobj, *obj;
6731 unsigned long count = 0;
6732 hashIterator *hi;
6733
6734 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6735 || checkType(c,o,REDIS_HASH)) return;
6736
6737 lenobj = createObject(REDIS_STRING,NULL);
6738 addReply(c,lenobj);
6739 decrRefCount(lenobj);
6740
6741 hi = hashInitIterator(o);
6742 while (hashNext(hi) != REDIS_ERR) {
6743 if (flags & REDIS_HASH_KEY) {
6744 obj = hashCurrent(hi,REDIS_HASH_KEY);
6745 addReplyBulk(c,obj);
6746 decrRefCount(obj);
6747 count++;
6748 }
6749 if (flags & REDIS_HASH_VALUE) {
6750 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6751 addReplyBulk(c,obj);
6752 decrRefCount(obj);
6753 count++;
6754 }
6755 }
6756 hashReleaseIterator(hi);
6757
6758 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6759 }
6760
6761 static void hkeysCommand(redisClient *c) {
6762 genericHgetallCommand(c,REDIS_HASH_KEY);
6763 }
6764
6765 static void hvalsCommand(redisClient *c) {
6766 genericHgetallCommand(c,REDIS_HASH_VALUE);
6767 }
6768
6769 static void hgetallCommand(redisClient *c) {
6770 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6771 }
6772
6773 static void hexistsCommand(redisClient *c) {
6774 robj *o;
6775 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6776 checkType(c,o,REDIS_HASH)) return;
6777
6778 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6779 }
6780
6781 static void convertToRealHash(robj *o) {
6782 unsigned char *key, *val, *p, *zm = o->ptr;
6783 unsigned int klen, vlen;
6784 dict *dict = dictCreate(&hashDictType,NULL);
6785
6786 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6787 p = zipmapRewind(zm);
6788 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6789 robj *keyobj, *valobj;
6790
6791 keyobj = createStringObject((char*)key,klen);
6792 valobj = createStringObject((char*)val,vlen);
6793 keyobj = tryObjectEncoding(keyobj);
6794 valobj = tryObjectEncoding(valobj);
6795 dictAdd(dict,keyobj,valobj);
6796 }
6797 o->encoding = REDIS_ENCODING_HT;
6798 o->ptr = dict;
6799 zfree(zm);
6800 }
6801
6802 /* ========================= Non type-specific commands ==================== */
6803
6804 static void flushdbCommand(redisClient *c) {
6805 server.dirty += dictSize(c->db->dict);
6806 touchWatchedKeysOnFlush(c->db->id);
6807 dictEmpty(c->db->dict);
6808 dictEmpty(c->db->expires);
6809 addReply(c,shared.ok);
6810 }
6811
6812 static void flushallCommand(redisClient *c) {
6813 touchWatchedKeysOnFlush(-1);
6814 server.dirty += emptyDb();
6815 addReply(c,shared.ok);
6816 if (server.bgsavechildpid != -1) {
6817 kill(server.bgsavechildpid,SIGKILL);
6818 rdbRemoveTempFile(server.bgsavechildpid);
6819 }
6820 rdbSave(server.dbfilename);
6821 server.dirty++;
6822 }
6823
6824 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6825 redisSortOperation *so = zmalloc(sizeof(*so));
6826 so->type = type;
6827 so->pattern = pattern;
6828 return so;
6829 }
6830
6831 /* Return the value associated to the key with a name obtained
6832 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6833 * The returned object will always have its refcount increased by 1
6834 * when it is non-NULL. */
6835 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6836 char *p, *f;
6837 sds spat, ssub;
6838 robj keyobj, fieldobj, *o;
6839 int prefixlen, sublen, postfixlen, fieldlen;
6840 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6841 struct {
6842 long len;
6843 long free;
6844 char buf[REDIS_SORTKEY_MAX+1];
6845 } keyname, fieldname;
6846
6847 /* If the pattern is "#" return the substitution object itself in order
6848 * to implement the "SORT ... GET #" feature. */
6849 spat = pattern->ptr;
6850 if (spat[0] == '#' && spat[1] == '\0') {
6851 incrRefCount(subst);
6852 return subst;
6853 }
6854
6855 /* The substitution object may be specially encoded. If so we create
6856 * a decoded object on the fly. Otherwise getDecodedObject will just
6857 * increment the ref count, that we'll decrement later. */
6858 subst = getDecodedObject(subst);
6859
6860 ssub = subst->ptr;
6861 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6862 p = strchr(spat,'*');
6863 if (!p) {
6864 decrRefCount(subst);
6865 return NULL;
6866 }
6867
6868 /* Find out if we're dealing with a hash dereference. */
6869 if ((f = strstr(p+1, "->")) != NULL) {
6870 fieldlen = sdslen(spat)-(f-spat);
6871 /* this also copies \0 character */
6872 memcpy(fieldname.buf,f+2,fieldlen-1);
6873 fieldname.len = fieldlen-2;
6874 } else {
6875 fieldlen = 0;
6876 }
6877
6878 prefixlen = p-spat;
6879 sublen = sdslen(ssub);
6880 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6881 memcpy(keyname.buf,spat,prefixlen);
6882 memcpy(keyname.buf+prefixlen,ssub,sublen);
6883 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6884 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6885 keyname.len = prefixlen+sublen+postfixlen;
6886 decrRefCount(subst);
6887
6888 /* Lookup substituted key */
6889 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6890 o = lookupKeyRead(db,&keyobj);
6891 if (o == NULL) return NULL;
6892
6893 if (fieldlen > 0) {
6894 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6895
6896 /* Retrieve value from hash by the field name. This operation
6897 * already increases the refcount of the returned object. */
6898 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6899 o = hashGet(o, &fieldobj);
6900 } else {
6901 if (o->type != REDIS_STRING) return NULL;
6902
6903 /* Every object that this function returns needs to have its refcount
6904 * increased. sortCommand decreases it again. */
6905 incrRefCount(o);
6906 }
6907
6908 return o;
6909 }
6910
6911 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6912 * the additional parameter is not standard but a BSD-specific we have to
6913 * pass sorting parameters via the global 'server' structure */
6914 static int sortCompare(const void *s1, const void *s2) {
6915 const redisSortObject *so1 = s1, *so2 = s2;
6916 int cmp;
6917
6918 if (!server.sort_alpha) {
6919 /* Numeric sorting. Here it's trivial as we precomputed scores */
6920 if (so1->u.score > so2->u.score) {
6921 cmp = 1;
6922 } else if (so1->u.score < so2->u.score) {
6923 cmp = -1;
6924 } else {
6925 cmp = 0;
6926 }
6927 } else {
6928 /* Alphanumeric sorting */
6929 if (server.sort_bypattern) {
6930 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6931 /* At least one compare object is NULL */
6932 if (so1->u.cmpobj == so2->u.cmpobj)
6933 cmp = 0;
6934 else if (so1->u.cmpobj == NULL)
6935 cmp = -1;
6936 else
6937 cmp = 1;
6938 } else {
6939 /* We have both the objects, use strcoll */
6940 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6941 }
6942 } else {
6943 /* Compare elements directly. */
6944 cmp = compareStringObjects(so1->obj,so2->obj);
6945 }
6946 }
6947 return server.sort_desc ? -cmp : cmp;
6948 }
6949
6950 /* The SORT command is the most complex command in Redis. Warning: this code
6951 * is optimized for speed and a bit less for readability */
6952 static void sortCommand(redisClient *c) {
6953 list *operations;
6954 int outputlen = 0;
6955 int desc = 0, alpha = 0;
6956 int limit_start = 0, limit_count = -1, start, end;
6957 int j, dontsort = 0, vectorlen;
6958 int getop = 0; /* GET operation counter */
6959 robj *sortval, *sortby = NULL, *storekey = NULL;
6960 redisSortObject *vector; /* Resulting vector to sort */
6961
6962 /* Lookup the key to sort. It must be of the right types */
6963 sortval = lookupKeyRead(c->db,c->argv[1]);
6964 if (sortval == NULL) {
6965 addReply(c,shared.emptymultibulk);
6966 return;
6967 }
6968 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6969 sortval->type != REDIS_ZSET)
6970 {
6971 addReply(c,shared.wrongtypeerr);
6972 return;
6973 }
6974
6975 /* Create a list of operations to perform for every sorted element.
6976 * Operations can be GET/DEL/INCR/DECR */
6977 operations = listCreate();
6978 listSetFreeMethod(operations,zfree);
6979 j = 2;
6980
6981 /* Now we need to protect sortval incrementing its count, in the future
6982 * SORT may have options able to overwrite/delete keys during the sorting
6983 * and the sorted key itself may get destroied */
6984 incrRefCount(sortval);
6985
6986 /* The SORT command has an SQL-alike syntax, parse it */
6987 while(j < c->argc) {
6988 int leftargs = c->argc-j-1;
6989 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6990 desc = 0;
6991 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6992 desc = 1;
6993 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6994 alpha = 1;
6995 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6996 limit_start = atoi(c->argv[j+1]->ptr);
6997 limit_count = atoi(c->argv[j+2]->ptr);
6998 j+=2;
6999 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7000 storekey = c->argv[j+1];
7001 j++;
7002 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7003 sortby = c->argv[j+1];
7004 /* If the BY pattern does not contain '*', i.e. it is constant,
7005 * we don't need to sort nor to lookup the weight keys. */
7006 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7007 j++;
7008 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7009 listAddNodeTail(operations,createSortOperation(
7010 REDIS_SORT_GET,c->argv[j+1]));
7011 getop++;
7012 j++;
7013 } else {
7014 decrRefCount(sortval);
7015 listRelease(operations);
7016 addReply(c,shared.syntaxerr);
7017 return;
7018 }
7019 j++;
7020 }
7021
7022 /* Load the sorting vector with all the objects to sort */
7023 switch(sortval->type) {
7024 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7025 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7026 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7027 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7028 }
7029 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7030 j = 0;
7031
7032 if (sortval->type == REDIS_LIST) {
7033 list *list = sortval->ptr;
7034 listNode *ln;
7035 listIter li;
7036
7037 listRewind(list,&li);
7038 while((ln = listNext(&li))) {
7039 robj *ele = ln->value;
7040 vector[j].obj = ele;
7041 vector[j].u.score = 0;
7042 vector[j].u.cmpobj = NULL;
7043 j++;
7044 }
7045 } else {
7046 dict *set;
7047 dictIterator *di;
7048 dictEntry *setele;
7049
7050 if (sortval->type == REDIS_SET) {
7051 set = sortval->ptr;
7052 } else {
7053 zset *zs = sortval->ptr;
7054 set = zs->dict;
7055 }
7056
7057 di = dictGetIterator(set);
7058 while((setele = dictNext(di)) != NULL) {
7059 vector[j].obj = dictGetEntryKey(setele);
7060 vector[j].u.score = 0;
7061 vector[j].u.cmpobj = NULL;
7062 j++;
7063 }
7064 dictReleaseIterator(di);
7065 }
7066 redisAssert(j == vectorlen);
7067
7068 /* Now it's time to load the right scores in the sorting vector */
7069 if (dontsort == 0) {
7070 for (j = 0; j < vectorlen; j++) {
7071 robj *byval;
7072 if (sortby) {
7073 /* lookup value to sort by */
7074 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7075 if (!byval) continue;
7076 } else {
7077 /* use object itself to sort by */
7078 byval = vector[j].obj;
7079 }
7080
7081 if (alpha) {
7082 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7083 } else {
7084 if (byval->encoding == REDIS_ENCODING_RAW) {
7085 vector[j].u.score = strtod(byval->ptr,NULL);
7086 } else if (byval->encoding == REDIS_ENCODING_INT) {
7087 /* Don't need to decode the object if it's
7088 * integer-encoded (the only encoding supported) so
7089 * far. We can just cast it */
7090 vector[j].u.score = (long)byval->ptr;
7091 } else {
7092 redisAssert(1 != 1);
7093 }
7094 }
7095
7096 /* when the object was retrieved using lookupKeyByPattern,
7097 * its refcount needs to be decreased. */
7098 if (sortby) {
7099 decrRefCount(byval);
7100 }
7101 }
7102 }
7103
7104 /* We are ready to sort the vector... perform a bit of sanity check
7105 * on the LIMIT option too. We'll use a partial version of quicksort. */
7106 start = (limit_start < 0) ? 0 : limit_start;
7107 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7108 if (start >= vectorlen) {
7109 start = vectorlen-1;
7110 end = vectorlen-2;
7111 }
7112 if (end >= vectorlen) end = vectorlen-1;
7113
7114 if (dontsort == 0) {
7115 server.sort_desc = desc;
7116 server.sort_alpha = alpha;
7117 server.sort_bypattern = sortby ? 1 : 0;
7118 if (sortby && (start != 0 || end != vectorlen-1))
7119 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7120 else
7121 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7122 }
7123
7124 /* Send command output to the output buffer, performing the specified
7125 * GET/DEL/INCR/DECR operations if any. */
7126 outputlen = getop ? getop*(end-start+1) : end-start+1;
7127 if (storekey == NULL) {
7128 /* STORE option not specified, sent the sorting result to client */
7129 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7130 for (j = start; j <= end; j++) {
7131 listNode *ln;
7132 listIter li;
7133
7134 if (!getop) addReplyBulk(c,vector[j].obj);
7135 listRewind(operations,&li);
7136 while((ln = listNext(&li))) {
7137 redisSortOperation *sop = ln->value;
7138 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7139 vector[j].obj);
7140
7141 if (sop->type == REDIS_SORT_GET) {
7142 if (!val) {
7143 addReply(c,shared.nullbulk);
7144 } else {
7145 addReplyBulk(c,val);
7146 decrRefCount(val);
7147 }
7148 } else {
7149 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7150 }
7151 }
7152 }
7153 } else {
7154 robj *listObject = createListObject();
7155 list *listPtr = (list*) listObject->ptr;
7156
7157 /* STORE option specified, set the sorting result as a List object */
7158 for (j = start; j <= end; j++) {
7159 listNode *ln;
7160 listIter li;
7161
7162 if (!getop) {
7163 listAddNodeTail(listPtr,vector[j].obj);
7164 incrRefCount(vector[j].obj);
7165 }
7166 listRewind(operations,&li);
7167 while((ln = listNext(&li))) {
7168 redisSortOperation *sop = ln->value;
7169 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7170 vector[j].obj);
7171
7172 if (sop->type == REDIS_SORT_GET) {
7173 if (!val) {
7174 listAddNodeTail(listPtr,createStringObject("",0));
7175 } else {
7176 /* We should do a incrRefCount on val because it is
7177 * added to the list, but also a decrRefCount because
7178 * it is returned by lookupKeyByPattern. This results
7179 * in doing nothing at all. */
7180 listAddNodeTail(listPtr,val);
7181 }
7182 } else {
7183 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7184 }
7185 }
7186 }
7187 if (dictReplace(c->db->dict,storekey,listObject)) {
7188 incrRefCount(storekey);
7189 }
7190 /* Note: we add 1 because the DB is dirty anyway since even if the
7191 * SORT result is empty a new key is set and maybe the old content
7192 * replaced. */
7193 server.dirty += 1+outputlen;
7194 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7195 }
7196
7197 /* Cleanup */
7198 decrRefCount(sortval);
7199 listRelease(operations);
7200 for (j = 0; j < vectorlen; j++) {
7201 if (alpha && vector[j].u.cmpobj)
7202 decrRefCount(vector[j].u.cmpobj);
7203 }
7204 zfree(vector);
7205 }
7206
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * output for a 64 bit value is "16384.00P" plus the terminator). Units go
 * from B up to P (1024^5); previously values of 1 TiB or more left 's'
 * untouched, so callers printed an uninitialized buffer. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else {
        /* Everything else is expressed in petabytes so that the output
         * buffer is always written. */
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    }
}
7227
7228 /* Create the string returned by the INFO command. This is decoupled
7229 * by the INFO command itself as we need to report the same information
7230 * on memory corruption problems. */
7231 static sds genRedisInfoString(void) {
7232 sds info;
7233 time_t uptime = time(NULL)-server.stat_starttime;
7234 int j;
7235 char hmem[64];
7236
7237 bytesToHuman(hmem,zmalloc_used_memory());
7238 info = sdscatprintf(sdsempty(),
7239 "redis_version:%s\r\n"
7240 "redis_git_sha1:%s\r\n"
7241 "redis_git_dirty:%d\r\n"
7242 "arch_bits:%s\r\n"
7243 "multiplexing_api:%s\r\n"
7244 "process_id:%ld\r\n"
7245 "uptime_in_seconds:%ld\r\n"
7246 "uptime_in_days:%ld\r\n"
7247 "connected_clients:%d\r\n"
7248 "connected_slaves:%d\r\n"
7249 "blocked_clients:%d\r\n"
7250 "used_memory:%zu\r\n"
7251 "used_memory_human:%s\r\n"
7252 "changes_since_last_save:%lld\r\n"
7253 "bgsave_in_progress:%d\r\n"
7254 "last_save_time:%ld\r\n"
7255 "bgrewriteaof_in_progress:%d\r\n"
7256 "total_connections_received:%lld\r\n"
7257 "total_commands_processed:%lld\r\n"
7258 "expired_keys:%lld\r\n"
7259 "hash_max_zipmap_entries:%zu\r\n"
7260 "hash_max_zipmap_value:%zu\r\n"
7261 "pubsub_channels:%ld\r\n"
7262 "pubsub_patterns:%u\r\n"
7263 "vm_enabled:%d\r\n"
7264 "role:%s\r\n"
7265 ,REDIS_VERSION,
7266 REDIS_GIT_SHA1,
7267 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7268 (sizeof(long) == 8) ? "64" : "32",
7269 aeGetApiName(),
7270 (long) getpid(),
7271 uptime,
7272 uptime/(3600*24),
7273 listLength(server.clients)-listLength(server.slaves),
7274 listLength(server.slaves),
7275 server.blpop_blocked_clients,
7276 zmalloc_used_memory(),
7277 hmem,
7278 server.dirty,
7279 server.bgsavechildpid != -1,
7280 server.lastsave,
7281 server.bgrewritechildpid != -1,
7282 server.stat_numconnections,
7283 server.stat_numcommands,
7284 server.stat_expiredkeys,
7285 server.hash_max_zipmap_entries,
7286 server.hash_max_zipmap_value,
7287 dictSize(server.pubsub_channels),
7288 listLength(server.pubsub_patterns),
7289 server.vm_enabled != 0,
7290 server.masterhost == NULL ? "master" : "slave"
7291 );
7292 if (server.masterhost) {
7293 info = sdscatprintf(info,
7294 "master_host:%s\r\n"
7295 "master_port:%d\r\n"
7296 "master_link_status:%s\r\n"
7297 "master_last_io_seconds_ago:%d\r\n"
7298 ,server.masterhost,
7299 server.masterport,
7300 (server.replstate == REDIS_REPL_CONNECTED) ?
7301 "up" : "down",
7302 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7303 );
7304 }
7305 if (server.vm_enabled) {
7306 lockThreadedIO();
7307 info = sdscatprintf(info,
7308 "vm_conf_max_memory:%llu\r\n"
7309 "vm_conf_page_size:%llu\r\n"
7310 "vm_conf_pages:%llu\r\n"
7311 "vm_stats_used_pages:%llu\r\n"
7312 "vm_stats_swapped_objects:%llu\r\n"
7313 "vm_stats_swappin_count:%llu\r\n"
7314 "vm_stats_swappout_count:%llu\r\n"
7315 "vm_stats_io_newjobs_len:%lu\r\n"
7316 "vm_stats_io_processing_len:%lu\r\n"
7317 "vm_stats_io_processed_len:%lu\r\n"
7318 "vm_stats_io_active_threads:%lu\r\n"
7319 "vm_stats_blocked_clients:%lu\r\n"
7320 ,(unsigned long long) server.vm_max_memory,
7321 (unsigned long long) server.vm_page_size,
7322 (unsigned long long) server.vm_pages,
7323 (unsigned long long) server.vm_stats_used_pages,
7324 (unsigned long long) server.vm_stats_swapped_objects,
7325 (unsigned long long) server.vm_stats_swapins,
7326 (unsigned long long) server.vm_stats_swapouts,
7327 (unsigned long) listLength(server.io_newjobs),
7328 (unsigned long) listLength(server.io_processing),
7329 (unsigned long) listLength(server.io_processed),
7330 (unsigned long) server.io_active_threads,
7331 (unsigned long) server.vm_blocked_clients
7332 );
7333 unlockThreadedIO();
7334 }
7335 for (j = 0; j < server.dbnum; j++) {
7336 long long keys, vkeys;
7337
7338 keys = dictSize(server.db[j].dict);
7339 vkeys = dictSize(server.db[j].expires);
7340 if (keys || vkeys) {
7341 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7342 j, keys, vkeys);
7343 }
7344 }
7345 return info;
7346 }
7347
7348 static void infoCommand(redisClient *c) {
7349 sds info = genRedisInfoString();
7350 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7351 (unsigned long)sdslen(info)));
7352 addReplySds(c,info);
7353 addReply(c,shared.crlf);
7354 }
7355
7356 static void monitorCommand(redisClient *c) {
7357 /* ignore MONITOR if aleady slave or in monitor mode */
7358 if (c->flags & REDIS_SLAVE) return;
7359
7360 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7361 c->slaveseldb = 0;
7362 listAddNodeTail(server.monitors,c);
7363 addReply(c,shared.ok);
7364 }
7365
7366 /* ================================= Expire ================================= */
7367 static int removeExpire(redisDb *db, robj *key) {
7368 if (dictDelete(db->expires,key) == DICT_OK) {
7369 return 1;
7370 } else {
7371 return 0;
7372 }
7373 }
7374
7375 static int setExpire(redisDb *db, robj *key, time_t when) {
7376 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7377 return 0;
7378 } else {
7379 incrRefCount(key);
7380 return 1;
7381 }
7382 }
7383
7384 /* Return the expire time of the specified key, or -1 if no expire
7385 * is associated with this key (i.e. the key is non volatile) */
7386 static time_t getExpire(redisDb *db, robj *key) {
7387 dictEntry *de;
7388
7389 /* No expire? return ASAP */
7390 if (dictSize(db->expires) == 0 ||
7391 (de = dictFind(db->expires,key)) == NULL) return -1;
7392
7393 return (time_t) dictGetEntryVal(de);
7394 }
7395
7396 static int expireIfNeeded(redisDb *db, robj *key) {
7397 time_t when;
7398 dictEntry *de;
7399
7400 /* No expire? return ASAP */
7401 if (dictSize(db->expires) == 0 ||
7402 (de = dictFind(db->expires,key)) == NULL) return 0;
7403
7404 /* Lookup the expire */
7405 when = (time_t) dictGetEntryVal(de);
7406 if (time(NULL) <= when) return 0;
7407
7408 /* Delete the key */
7409 dictDelete(db->expires,key);
7410 server.stat_expiredkeys++;
7411 return dictDelete(db->dict,key) == DICT_OK;
7412 }
7413
7414 static int deleteIfVolatile(redisDb *db, robj *key) {
7415 dictEntry *de;
7416
7417 /* No expire? return ASAP */
7418 if (dictSize(db->expires) == 0 ||
7419 (de = dictFind(db->expires,key)) == NULL) return 0;
7420
7421 /* Delete the key */
7422 server.dirty++;
7423 server.stat_expiredkeys++;
7424 dictDelete(db->expires,key);
7425 return dictDelete(db->dict,key) == DICT_OK;
7426 }
7427
7428 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7429 dictEntry *de;
7430 time_t seconds;
7431
7432 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7433
7434 seconds -= offset;
7435
7436 de = dictFind(c->db->dict,key);
7437 if (de == NULL) {
7438 addReply(c,shared.czero);
7439 return;
7440 }
7441 if (seconds <= 0) {
7442 if (deleteKey(c->db,key)) server.dirty++;
7443 addReply(c, shared.cone);
7444 return;
7445 } else {
7446 time_t when = time(NULL)+seconds;
7447 if (setExpire(c->db,key,when)) {
7448 addReply(c,shared.cone);
7449 server.dirty++;
7450 } else {
7451 addReply(c,shared.czero);
7452 }
7453 return;
7454 }
7455 }
7456
7457 static void expireCommand(redisClient *c) {
7458 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7459 }
7460
7461 static void expireatCommand(redisClient *c) {
7462 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7463 }
7464
7465 static void ttlCommand(redisClient *c) {
7466 time_t expire;
7467 int ttl = -1;
7468
7469 expire = getExpire(c->db,c->argv[1]);
7470 if (expire != -1) {
7471 ttl = (int) (expire-time(NULL));
7472 if (ttl < 0) ttl = -1;
7473 }
7474 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7475 }
7476
7477 /* ================================ MULTI/EXEC ============================== */
7478
7479 /* Client state initialization for MULTI/EXEC */
7480 static void initClientMultiState(redisClient *c) {
7481 c->mstate.commands = NULL;
7482 c->mstate.count = 0;
7483 }
7484
7485 /* Release all the resources associated with MULTI/EXEC state */
7486 static void freeClientMultiState(redisClient *c) {
7487 int j;
7488
7489 for (j = 0; j < c->mstate.count; j++) {
7490 int i;
7491 multiCmd *mc = c->mstate.commands+j;
7492
7493 for (i = 0; i < mc->argc; i++)
7494 decrRefCount(mc->argv[i]);
7495 zfree(mc->argv);
7496 }
7497 zfree(c->mstate.commands);
7498 }
7499
7500 /* Add a new command into the MULTI commands queue */
7501 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7502 multiCmd *mc;
7503 int j;
7504
7505 c->mstate.commands = zrealloc(c->mstate.commands,
7506 sizeof(multiCmd)*(c->mstate.count+1));
7507 mc = c->mstate.commands+c->mstate.count;
7508 mc->cmd = cmd;
7509 mc->argc = c->argc;
7510 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7511 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7512 for (j = 0; j < c->argc; j++)
7513 incrRefCount(mc->argv[j]);
7514 c->mstate.count++;
7515 }
7516
7517 static void multiCommand(redisClient *c) {
7518 if (c->flags & REDIS_MULTI) {
7519 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7520 return;
7521 }
7522 c->flags |= REDIS_MULTI;
7523 addReply(c,shared.ok);
7524 }
7525
7526 static void discardCommand(redisClient *c) {
7527 if (!(c->flags & REDIS_MULTI)) {
7528 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7529 return;
7530 }
7531
7532 freeClientMultiState(c);
7533 initClientMultiState(c);
7534 c->flags &= (~REDIS_MULTI);
7535 addReply(c,shared.ok);
7536 }
7537
7538 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7539 * implememntation for more information. */
7540 static void execCommandReplicateMulti(redisClient *c) {
7541 struct redisCommand *cmd;
7542 robj *multistring = createStringObject("MULTI",5);
7543
7544 cmd = lookupCommand("multi");
7545 if (server.appendonly)
7546 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7547 if (listLength(server.slaves))
7548 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7549 decrRefCount(multistring);
7550 }
7551
7552 static void execCommand(redisClient *c) {
7553 int j;
7554 robj **orig_argv;
7555 int orig_argc;
7556
7557 if (!(c->flags & REDIS_MULTI)) {
7558 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7559 return;
7560 }
7561
7562 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7563 * A failed EXEC will return a multi bulk nil object. */
7564 if (c->flags & REDIS_DIRTY_CAS) {
7565 freeClientMultiState(c);
7566 initClientMultiState(c);
7567 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7568 unwatchAllKeys(c);
7569 addReply(c,shared.nullmultibulk);
7570 return;
7571 }
7572
7573 /* Replicate a MULTI request now that we are sure the block is executed.
7574 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7575 * both the AOF and the replication link will have the same consistency
7576 * and atomicity guarantees. */
7577 execCommandReplicateMulti(c);
7578
7579 /* Exec all the queued commands */
7580 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7581 orig_argv = c->argv;
7582 orig_argc = c->argc;
7583 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7584 for (j = 0; j < c->mstate.count; j++) {
7585 c->argc = c->mstate.commands[j].argc;
7586 c->argv = c->mstate.commands[j].argv;
7587 call(c,c->mstate.commands[j].cmd);
7588 }
7589 c->argv = orig_argv;
7590 c->argc = orig_argc;
7591 freeClientMultiState(c);
7592 initClientMultiState(c);
7593 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7594 /* Make sure the EXEC command is always replicated / AOF, since we
7595 * always send the MULTI command (we can't know beforehand if the
7596 * next operations will contain at least a modification to the DB). */
7597 server.dirty++;
7598 }
7599
7600 /* =========================== Blocking Operations ========================= */
7601
7602 /* Currently Redis blocking operations support is limited to list POP ops,
7603 * so the current implementation is not fully generic, but it is also not
7604 * completely specific so it will not require a rewrite to support new
7605 * kind of blocking operations in the future.
7606 *
7607 * Still it's important to note that list blocking operations can be already
7608 * used as a notification mechanism in order to implement other blocking
7609 * operations at application level, so there must be a very strong evidence
7610 * of usefulness and generality before new blocking operations are implemented.
7611 *
7612 * This is how the current blocking POP works, we use BLPOP as example:
7613 * - If the user calls BLPOP and the key exists and contains a non empty list
7614 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
 * if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
7617 * empty we need to block. In order to do so we remove the notification for
7618 * new data to read in the client socket (so that we'll not serve new
7619 * requests if the blocking request is not served). Also we put the client
7620 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7621 * blocking for this keys.
7622 * - If a PUSH operation against a key with blocked clients waiting is
7623 * performed, we serve the first in the list: basically instead to push
7624 * the new element inside the list we return it to the (first / oldest)
 * blocking client, unblock the client, and remove it from the list.
7626 *
7627 * The above comment and the source code should be enough in order to understand
7628 * the implementation and modify / fix it later.
7629 */
7630
7631 /* Set a client in blocking mode for the specified key, with the specified
7632 * timeout */
7633 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7634 dictEntry *de;
7635 list *l;
7636 int j;
7637
7638 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7639 c->blocking_keys_num = numkeys;
7640 c->blockingto = timeout;
7641 for (j = 0; j < numkeys; j++) {
7642 /* Add the key in the client structure, to map clients -> keys */
7643 c->blocking_keys[j] = keys[j];
7644 incrRefCount(keys[j]);
7645
7646 /* And in the other "side", to map keys -> clients */
7647 de = dictFind(c->db->blocking_keys,keys[j]);
7648 if (de == NULL) {
7649 int retval;
7650
7651 /* For every key we take a list of clients blocked for it */
7652 l = listCreate();
7653 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7654 incrRefCount(keys[j]);
7655 assert(retval == DICT_OK);
7656 } else {
7657 l = dictGetEntryVal(de);
7658 }
7659 listAddNodeTail(l,c);
7660 }
7661 /* Mark the client as a blocked client */
7662 c->flags |= REDIS_BLOCKED;
7663 server.blpop_blocked_clients++;
7664 }
7665
7666 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7667 static void unblockClientWaitingData(redisClient *c) {
7668 dictEntry *de;
7669 list *l;
7670 int j;
7671
7672 assert(c->blocking_keys != NULL);
7673 /* The client may wait for multiple keys, so unblock it for every key. */
7674 for (j = 0; j < c->blocking_keys_num; j++) {
7675 /* Remove this client from the list of clients waiting for this key. */
7676 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7677 assert(de != NULL);
7678 l = dictGetEntryVal(de);
7679 listDelNode(l,listSearchKey(l,c));
7680 /* If the list is empty we need to remove it to avoid wasting memory */
7681 if (listLength(l) == 0)
7682 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7683 decrRefCount(c->blocking_keys[j]);
7684 }
7685 /* Cleanup the client structure */
7686 zfree(c->blocking_keys);
7687 c->blocking_keys = NULL;
7688 c->flags &= (~REDIS_BLOCKED);
7689 server.blpop_blocked_clients--;
7690 /* We want to process data if there is some command waiting
7691 * in the input buffer. Note that this is safe even if
7692 * unblockClientWaitingData() gets called from freeClient() because
7693 * freeClient() will be smart enough to call this function
7694 * *after* c->querybuf was set to NULL. */
7695 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7696 }
7697
7698 /* This should be called from any function PUSHing into lists.
7699 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7700 * 'ele' is the element pushed.
7701 *
7702 * If the function returns 0 there was no client waiting for a list push
7703 * against this key.
7704 *
7705 * If the function returns 1 there was a client waiting for a list push
7706 * against this key, the element was passed to this client thus it's not
7707 * needed to actually add it to the list and the caller should return asap. */
7708 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7709 struct dictEntry *de;
7710 redisClient *receiver;
7711 list *l;
7712 listNode *ln;
7713
7714 de = dictFind(c->db->blocking_keys,key);
7715 if (de == NULL) return 0;
7716 l = dictGetEntryVal(de);
7717 ln = listFirst(l);
7718 assert(ln != NULL);
7719 receiver = ln->value;
7720
7721 addReplySds(receiver,sdsnew("*2\r\n"));
7722 addReplyBulk(receiver,key);
7723 addReplyBulk(receiver,ele);
7724 unblockClientWaitingData(receiver);
7725 return 1;
7726 }
7727
7728 /* Blocking RPOP/LPOP */
7729 static void blockingPopGenericCommand(redisClient *c, int where) {
7730 robj *o;
7731 time_t timeout;
7732 int j;
7733
7734 for (j = 1; j < c->argc-1; j++) {
7735 o = lookupKeyWrite(c->db,c->argv[j]);
7736 if (o != NULL) {
7737 if (o->type != REDIS_LIST) {
7738 addReply(c,shared.wrongtypeerr);
7739 return;
7740 } else {
7741 list *list = o->ptr;
7742 if (listLength(list) != 0) {
7743 /* If the list contains elements fall back to the usual
7744 * non-blocking POP operation */
7745 robj *argv[2], **orig_argv;
7746 int orig_argc;
7747
7748 /* We need to alter the command arguments before to call
7749 * popGenericCommand() as the command takes a single key. */
7750 orig_argv = c->argv;
7751 orig_argc = c->argc;
7752 argv[1] = c->argv[j];
7753 c->argv = argv;
7754 c->argc = 2;
7755
7756 /* Also the return value is different, we need to output
7757 * the multi bulk reply header and the key name. The
7758 * "real" command will add the last element (the value)
7759 * for us. If this souds like an hack to you it's just
7760 * because it is... */
7761 addReplySds(c,sdsnew("*2\r\n"));
7762 addReplyBulk(c,argv[1]);
7763 popGenericCommand(c,where);
7764
7765 /* Fix the client structure with the original stuff */
7766 c->argv = orig_argv;
7767 c->argc = orig_argc;
7768 return;
7769 }
7770 }
7771 }
7772 }
7773 /* If the list is empty or the key does not exists we must block */
7774 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7775 if (timeout > 0) timeout += time(NULL);
7776 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7777 }
7778
7779 static void blpopCommand(redisClient *c) {
7780 blockingPopGenericCommand(c,REDIS_HEAD);
7781 }
7782
7783 static void brpopCommand(redisClient *c) {
7784 blockingPopGenericCommand(c,REDIS_TAIL);
7785 }
7786
7787 /* =============================== Replication ============================= */
7788
7789 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7790 ssize_t nwritten, ret = size;
7791 time_t start = time(NULL);
7792
7793 timeout++;
7794 while(size) {
7795 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7796 nwritten = write(fd,ptr,size);
7797 if (nwritten == -1) return -1;
7798 ptr += nwritten;
7799 size -= nwritten;
7800 }
7801 if ((time(NULL)-start) > timeout) {
7802 errno = ETIMEDOUT;
7803 return -1;
7804 }
7805 }
7806 return ret;
7807 }
7808
7809 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7810 ssize_t nread, totread = 0;
7811 time_t start = time(NULL);
7812
7813 timeout++;
7814 while(size) {
7815 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7816 nread = read(fd,ptr,size);
7817 if (nread == -1) return -1;
7818 ptr += nread;
7819 size -= nread;
7820 totread += nread;
7821 }
7822 if ((time(NULL)-start) > timeout) {
7823 errno = ETIMEDOUT;
7824 return -1;
7825 }
7826 }
7827 return totread;
7828 }
7829
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. Reading stops at the first '\n', which is not stored; a '\r'
 * right before it is overwritten with a nul byte. The buffer is always
 * nul terminated and at most size-1 bytes are stored: if the buffer fills
 * up before a newline is seen the function returns what it has so far.
 *
 * 'timeout' applies to every single byte read (see syncRead()).
 *
 * Returns the number of bytes stored before the '\n' (a stripped '\r' is
 * still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0; /* no room, not even for the terminator */
    *ptr = '\0';
    size--; /* reserve one byte for the nul terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        size--; /* one less free byte: this bounds the loop to the buffer */
    }
    return nread;
}
7850
7851 static void syncCommand(redisClient *c) {
7852 /* ignore SYNC if aleady slave or in monitor mode */
7853 if (c->flags & REDIS_SLAVE) return;
7854
7855 /* SYNC can't be issued when the server has pending data to send to
7856 * the client about already issued commands. We need a fresh reply
7857 * buffer registering the differences between the BGSAVE and the current
7858 * dataset, so that we can copy to other slaves if needed. */
7859 if (listLength(c->reply) != 0) {
7860 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7861 return;
7862 }
7863
7864 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7865 /* Here we need to check if there is a background saving operation
7866 * in progress, or if it is required to start one */
7867 if (server.bgsavechildpid != -1) {
7868 /* Ok a background save is in progress. Let's check if it is a good
7869 * one for replication, i.e. if there is another slave that is
7870 * registering differences since the server forked to save */
7871 redisClient *slave;
7872 listNode *ln;
7873 listIter li;
7874
7875 listRewind(server.slaves,&li);
7876 while((ln = listNext(&li))) {
7877 slave = ln->value;
7878 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7879 }
7880 if (ln) {
7881 /* Perfect, the server is already registering differences for
7882 * another slave. Set the right state, and copy the buffer. */
7883 listRelease(c->reply);
7884 c->reply = listDup(slave->reply);
7885 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7886 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7887 } else {
7888 /* No way, we need to wait for the next BGSAVE in order to
7889 * register differences */
7890 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7891 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7892 }
7893 } else {
7894 /* Ok we don't have a BGSAVE in progress, let's start one */
7895 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7896 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7897 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7898 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7899 return;
7900 }
7901 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7902 }
7903 c->repldbfd = -1;
7904 c->flags |= REDIS_SLAVE;
7905 c->slaveseldb = 0;
7906 listAddNodeTail(server.slaves,c);
7907 return;
7908 }
7909
7910 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7911 redisClient *slave = privdata;
7912 REDIS_NOTUSED(el);
7913 REDIS_NOTUSED(mask);
7914 char buf[REDIS_IOBUF_LEN];
7915 ssize_t nwritten, buflen;
7916
7917 if (slave->repldboff == 0) {
7918 /* Write the bulk write count before to transfer the DB. In theory here
7919 * we don't know how much room there is in the output buffer of the
7920 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7921 * operations) will never be smaller than the few bytes we need. */
7922 sds bulkcount;
7923
7924 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7925 slave->repldbsize);
7926 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7927 {
7928 sdsfree(bulkcount);
7929 freeClient(slave);
7930 return;
7931 }
7932 sdsfree(bulkcount);
7933 }
7934 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7935 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7936 if (buflen <= 0) {
7937 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7938 (buflen == 0) ? "premature EOF" : strerror(errno));
7939 freeClient(slave);
7940 return;
7941 }
7942 if ((nwritten = write(fd,buf,buflen)) == -1) {
7943 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7944 strerror(errno));
7945 freeClient(slave);
7946 return;
7947 }
7948 slave->repldboff += nwritten;
7949 if (slave->repldboff == slave->repldbsize) {
7950 close(slave->repldbfd);
7951 slave->repldbfd = -1;
7952 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7953 slave->replstate = REDIS_REPL_ONLINE;
7954 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7955 sendReplyToClient, slave) == AE_ERR) {
7956 freeClient(slave);
7957 return;
7958 }
7959 addReplySds(slave,sdsempty());
7960 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7961 }
7962 }
7963
7964 /* This function is called at the end of every backgrond saving.
7965 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7966 * otherwise REDIS_ERR is passed to the function.
7967 *
7968 * The goal of this function is to handle slaves waiting for a successful
7969 * background saving in order to perform non-blocking synchronization. */
7970 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7971 listNode *ln;
7972 int startbgsave = 0;
7973 listIter li;
7974
7975 listRewind(server.slaves,&li);
7976 while((ln = listNext(&li))) {
7977 redisClient *slave = ln->value;
7978
7979 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7980 startbgsave = 1;
7981 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7982 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7983 struct redis_stat buf;
7984
7985 if (bgsaveerr != REDIS_OK) {
7986 freeClient(slave);
7987 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7988 continue;
7989 }
7990 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7991 redis_fstat(slave->repldbfd,&buf) == -1) {
7992 freeClient(slave);
7993 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7994 continue;
7995 }
7996 slave->repldboff = 0;
7997 slave->repldbsize = buf.st_size;
7998 slave->replstate = REDIS_REPL_SEND_BULK;
7999 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8000 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8001 freeClient(slave);
8002 continue;
8003 }
8004 }
8005 }
8006 if (startbgsave) {
8007 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8008 listIter li;
8009
8010 listRewind(server.slaves,&li);
8011 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8012 while((ln = listNext(&li))) {
8013 redisClient *slave = ln->value;
8014
8015 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8016 freeClient(slave);
8017 }
8018 }
8019 }
8020 }
8021
8022 static int syncWithMaster(void) {
8023 char buf[1024], tmpfile[256], authcmd[1024];
8024 long dumpsize;
8025 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8026 int dfd, maxtries = 5;
8027
8028 if (fd == -1) {
8029 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8030 strerror(errno));
8031 return REDIS_ERR;
8032 }
8033
8034 /* AUTH with the master if required. */
8035 if(server.masterauth) {
8036 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8037 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8038 close(fd);
8039 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8040 strerror(errno));
8041 return REDIS_ERR;
8042 }
8043 /* Read the AUTH result. */
8044 if (syncReadLine(fd,buf,1024,3600) == -1) {
8045 close(fd);
8046 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8047 strerror(errno));
8048 return REDIS_ERR;
8049 }
8050 if (buf[0] != '+') {
8051 close(fd);
8052 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8053 return REDIS_ERR;
8054 }
8055 }
8056
8057 /* Issue the SYNC command */
8058 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8059 close(fd);
8060 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8061 strerror(errno));
8062 return REDIS_ERR;
8063 }
8064 /* Read the bulk write count */
8065 if (syncReadLine(fd,buf,1024,3600) == -1) {
8066 close(fd);
8067 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8068 strerror(errno));
8069 return REDIS_ERR;
8070 }
8071 if (buf[0] != '$') {
8072 close(fd);
8073 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8074 return REDIS_ERR;
8075 }
8076 dumpsize = strtol(buf+1,NULL,10);
8077 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8078 /* Read the bulk write data on a temp file */
8079 while(maxtries--) {
8080 snprintf(tmpfile,256,
8081 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8082 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8083 if (dfd != -1) break;
8084 sleep(1);
8085 }
8086 if (dfd == -1) {
8087 close(fd);
8088 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8089 return REDIS_ERR;
8090 }
8091 while(dumpsize) {
8092 int nread, nwritten;
8093
8094 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8095 if (nread == -1) {
8096 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8097 strerror(errno));
8098 close(fd);
8099 close(dfd);
8100 return REDIS_ERR;
8101 }
8102 nwritten = write(dfd,buf,nread);
8103 if (nwritten == -1) {
8104 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8105 close(fd);
8106 close(dfd);
8107 return REDIS_ERR;
8108 }
8109 dumpsize -= nread;
8110 }
8111 close(dfd);
8112 if (rename(tmpfile,server.dbfilename) == -1) {
8113 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8114 unlink(tmpfile);
8115 close(fd);
8116 return REDIS_ERR;
8117 }
8118 emptyDb();
8119 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8120 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8121 close(fd);
8122 return REDIS_ERR;
8123 }
8124 server.master = createClient(fd);
8125 server.master->flags |= REDIS_MASTER;
8126 server.master->authenticated = 1;
8127 server.replstate = REDIS_REPL_CONNECTED;
8128 return REDIS_OK;
8129 }
8130
8131 static void slaveofCommand(redisClient *c) {
8132 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8133 !strcasecmp(c->argv[2]->ptr,"one")) {
8134 if (server.masterhost) {
8135 sdsfree(server.masterhost);
8136 server.masterhost = NULL;
8137 if (server.master) freeClient(server.master);
8138 server.replstate = REDIS_REPL_NONE;
8139 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8140 }
8141 } else {
8142 sdsfree(server.masterhost);
8143 server.masterhost = sdsdup(c->argv[1]->ptr);
8144 server.masterport = atoi(c->argv[2]->ptr);
8145 if (server.master) freeClient(server.master);
8146 server.replstate = REDIS_REPL_CONNECT;
8147 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8148 server.masterhost, server.masterport);
8149 }
8150 addReply(c,shared.ok);
8151 }
8152
8153 /* ============================ Maxmemory directive ======================== */
8154
8155 /* Try to free one object form the pre-allocated objects free list.
8156 * This is useful under low mem conditions as by default we take 1 million
8157 * free objects allocated. On success REDIS_OK is returned, otherwise
8158 * REDIS_ERR. */
8159 static int tryFreeOneObjectFromFreelist(void) {
8160 robj *o;
8161
8162 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8163 if (listLength(server.objfreelist)) {
8164 listNode *head = listFirst(server.objfreelist);
8165 o = listNodeValue(head);
8166 listDelNode(server.objfreelist,head);
8167 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8168 zfree(o);
8169 return REDIS_OK;
8170 } else {
8171 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8172 return REDIS_ERR;
8173 }
8174 }
8175
8176 /* This function gets called when 'maxmemory' is set on the config file to limit
8177 * the max memory used by the server, and we are out of memory.
8178 * This function will try to, in order:
8179 *
8180 * - Free objects from the free list
8181 * - Try to remove keys with an EXPIRE set
8182 *
8183 * It is not possible to free enough memory to reach used-memory < maxmemory
8184 * the server will start refusing commands that will enlarge even more the
8185 * memory usage.
8186 */
8187 static void freeMemoryIfNeeded(void) {
8188 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8189 int j, k, freed = 0;
8190
8191 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8192 for (j = 0; j < server.dbnum; j++) {
8193 int minttl = -1;
8194 robj *minkey = NULL;
8195 struct dictEntry *de;
8196
8197 if (dictSize(server.db[j].expires)) {
8198 freed = 1;
8199 /* From a sample of three keys drop the one nearest to
8200 * the natural expire */
8201 for (k = 0; k < 3; k++) {
8202 time_t t;
8203
8204 de = dictGetRandomKey(server.db[j].expires);
8205 t = (time_t) dictGetEntryVal(de);
8206 if (minttl == -1 || t < minttl) {
8207 minkey = dictGetEntryKey(de);
8208 minttl = t;
8209 }
8210 }
8211 deleteKey(server.db+j,minkey);
8212 }
8213 }
8214 if (!freed) return; /* nothing to free... */
8215 }
8216 }
8217
8218 /* ============================== Append Only file ========================== */
8219
8220 /* Write the append only file buffer on disk.
8221 *
8222 * Since we are required to write the AOF before replying to the client,
8223 * and the only way the client socket can get a write is entering when the
8224 * the event loop, we accumulate all the AOF writes in a memory
8225 * buffer and write it on disk using this function just before entering
8226 * the event loop again. */
8227 static void flushAppendOnlyFile(void) {
8228 time_t now;
8229 ssize_t nwritten;
8230
8231 if (sdslen(server.aofbuf) == 0) return;
8232
8233 /* We want to perform a single write. This should be guaranteed atomic
8234 * at least if the filesystem we are writing is a real physical one.
8235 * While this will save us against the server being killed I don't think
8236 * there is much to do about the whole server stopping for power problems
8237 * or alike */
8238 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8239 if (nwritten != (signed)sdslen(server.aofbuf)) {
8240 /* Ooops, we are in troubles. The best thing to do for now is
8241 * aborting instead of giving the illusion that everything is
8242 * working as expected. */
8243 if (nwritten == -1) {
8244 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8245 } else {
8246 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8247 }
8248 exit(1);
8249 }
8250 sdsfree(server.aofbuf);
8251 server.aofbuf = sdsempty();
8252
8253 /* Fsync if needed */
8254 now = time(NULL);
8255 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8256 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8257 now-server.lastfsync > 1))
8258 {
8259 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8260 * flushing metadata. */
8261 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8262 server.lastfsync = now;
8263 }
8264 }
8265
8266 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8267 int j;
8268 buf = sdscatprintf(buf,"*%d\r\n",argc);
8269 for (j = 0; j < argc; j++) {
8270 robj *o = getDecodedObject(argv[j]);
8271 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8272 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8273 buf = sdscatlen(buf,"\r\n",2);
8274 decrRefCount(o);
8275 }
8276 return buf;
8277 }
8278
8279 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8280 int argc = 3;
8281 long when;
8282 robj *argv[3];
8283
8284 /* Make sure we can use strtol */
8285 seconds = getDecodedObject(seconds);
8286 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8287 decrRefCount(seconds);
8288
8289 argv[0] = createStringObject("EXPIREAT",8);
8290 argv[1] = key;
8291 argv[2] = createObject(REDIS_STRING,
8292 sdscatprintf(sdsempty(),"%ld",when));
8293 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8294 decrRefCount(argv[0]);
8295 decrRefCount(argv[2]);
8296 return buf;
8297 }
8298
8299 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8300 sds buf = sdsempty();
8301 robj *tmpargv[3];
8302
8303 /* The DB this command was targetting is not the same as the last command
8304 * we appendend. To issue a SELECT command is needed. */
8305 if (dictid != server.appendseldb) {
8306 char seldb[64];
8307
8308 snprintf(seldb,sizeof(seldb),"%d",dictid);
8309 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8310 (unsigned long)strlen(seldb),seldb);
8311 server.appendseldb = dictid;
8312 }
8313
8314 if (cmd->proc == expireCommand) {
8315 /* Translate EXPIRE into EXPIREAT */
8316 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8317 } else if (cmd->proc == setexCommand) {
8318 /* Translate SETEX to SET and EXPIREAT */
8319 tmpargv[0] = createStringObject("SET",3);
8320 tmpargv[1] = argv[1];
8321 tmpargv[2] = argv[3];
8322 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8323 decrRefCount(tmpargv[0]);
8324 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8325 } else {
8326 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8327 }
8328
8329 /* Append to the AOF buffer. This will be flushed on disk just before
8330 * of re-entering the event loop, so before the client will get a
8331 * positive reply about the operation performed. */
8332 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8333
8334 /* If a background append only file rewriting is in progress we want to
8335 * accumulate the differences between the child DB and the current one
8336 * in a buffer, so that when the child process will do its work we
8337 * can append the differences to the new append only file. */
8338 if (server.bgrewritechildpid != -1)
8339 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8340
8341 sdsfree(buf);
8342 }
8343
8344 /* In Redis commands are always executed in the context of a client, so in
8345 * order to load the append only file we need to create a fake client. */
8346 static struct redisClient *createFakeClient(void) {
8347 struct redisClient *c = zmalloc(sizeof(*c));
8348
8349 selectDb(c,0);
8350 c->fd = -1;
8351 c->querybuf = sdsempty();
8352 c->argc = 0;
8353 c->argv = NULL;
8354 c->flags = 0;
8355 /* We set the fake client as a slave waiting for the synchronization
8356 * so that Redis will not try to send replies to this client. */
8357 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8358 c->reply = listCreate();
8359 listSetFreeMethod(c->reply,decrRefCount);
8360 listSetDupMethod(c->reply,dupClientReplyValue);
8361 initClientMultiState(c);
8362 return c;
8363 }
8364
8365 static void freeFakeClient(struct redisClient *c) {
8366 sdsfree(c->querybuf);
8367 listRelease(c->reply);
8368 freeClientMultiState(c);
8369 zfree(c);
8370 }
8371
8372 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8373 * error (the append only file is zero-length) REDIS_ERR is returned. On
8374 * fatal error an error message is logged and the program exists. */
8375 int loadAppendOnlyFile(char *filename) {
8376 struct redisClient *fakeClient;
8377 FILE *fp = fopen(filename,"r");
8378 struct redis_stat sb;
8379 unsigned long long loadedkeys = 0;
8380 int appendonly = server.appendonly;
8381
8382 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8383 return REDIS_ERR;
8384
8385 if (fp == NULL) {
8386 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8387 exit(1);
8388 }
8389
8390 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8391 * to the same file we're about to read. */
8392 server.appendonly = 0;
8393
8394 fakeClient = createFakeClient();
8395 while(1) {
8396 int argc, j;
8397 unsigned long len;
8398 robj **argv;
8399 char buf[128];
8400 sds argsds;
8401 struct redisCommand *cmd;
8402
8403 if (fgets(buf,sizeof(buf),fp) == NULL) {
8404 if (feof(fp))
8405 break;
8406 else
8407 goto readerr;
8408 }
8409 if (buf[0] != '*') goto fmterr;
8410 argc = atoi(buf+1);
8411 argv = zmalloc(sizeof(robj*)*argc);
8412 for (j = 0; j < argc; j++) {
8413 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8414 if (buf[0] != '$') goto fmterr;
8415 len = strtol(buf+1,NULL,10);
8416 argsds = sdsnewlen(NULL,len);
8417 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8418 argv[j] = createObject(REDIS_STRING,argsds);
8419 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8420 }
8421
8422 /* Command lookup */
8423 cmd = lookupCommand(argv[0]->ptr);
8424 if (!cmd) {
8425 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8426 exit(1);
8427 }
8428 /* Try object encoding */
8429 if (cmd->flags & REDIS_CMD_BULK)
8430 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8431 /* Run the command in the context of a fake client */
8432 fakeClient->argc = argc;
8433 fakeClient->argv = argv;
8434 cmd->proc(fakeClient);
8435 /* Discard the reply objects list from the fake client */
8436 while(listLength(fakeClient->reply))
8437 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8438 /* Clean up, ready for the next command */
8439 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8440 zfree(argv);
8441 /* Handle swapping while loading big datasets when VM is on */
8442 loadedkeys++;
8443 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8444 while (zmalloc_used_memory() > server.vm_max_memory) {
8445 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8446 }
8447 }
8448 }
8449
8450 /* This point can only be reached when EOF is reached without errors.
8451 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8452 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8453
8454 fclose(fp);
8455 freeFakeClient(fakeClient);
8456 server.appendonly = appendonly;
8457 return REDIS_OK;
8458
8459 readerr:
8460 if (feof(fp)) {
8461 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8462 } else {
8463 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8464 }
8465 exit(1);
8466 fmterr:
8467 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8468 exit(1);
8469 }
8470
8471 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8472 static int fwriteBulkObject(FILE *fp, robj *obj) {
8473 char buf[128];
8474 int decrrc = 0;
8475
8476 /* Avoid the incr/decr ref count business if possible to help
8477 * copy-on-write (we are often in a child process when this function
8478 * is called).
8479 * Also makes sure that key objects don't get incrRefCount-ed when VM
8480 * is enabled */
8481 if (obj->encoding != REDIS_ENCODING_RAW) {
8482 obj = getDecodedObject(obj);
8483 decrrc = 1;
8484 }
8485 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8486 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8487 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8488 goto err;
8489 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8490 if (decrrc) decrRefCount(obj);
8491 return 1;
8492 err:
8493 if (decrrc) decrRefCount(obj);
8494 return 0;
8495 }
8496
/* Write the binary-safe string 's' of 'len' bytes to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 * An empty string (len == 0) is written as "$0\r\n\r\n".
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* %lu matches the unsigned long argument */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8508
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which
     * must not be counted in the advertised length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8519
/* Write the long 'l', in decimal, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which
     * must not be counted in the advertised length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8530
8531 /* Write a sequence of commands able to fully rebuild the dataset into
8532 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8533 static int rewriteAppendOnlyFile(char *filename) {
8534 dictIterator *di = NULL;
8535 dictEntry *de;
8536 FILE *fp;
8537 char tmpfile[256];
8538 int j;
8539 time_t now = time(NULL);
8540
8541 /* Note that we have to use a different temp name here compared to the
8542 * one used by rewriteAppendOnlyFileBackground() function. */
8543 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8544 fp = fopen(tmpfile,"w");
8545 if (!fp) {
8546 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8547 return REDIS_ERR;
8548 }
8549 for (j = 0; j < server.dbnum; j++) {
8550 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8551 redisDb *db = server.db+j;
8552 dict *d = db->dict;
8553 if (dictSize(d) == 0) continue;
8554 di = dictGetIterator(d);
8555 if (!di) {
8556 fclose(fp);
8557 return REDIS_ERR;
8558 }
8559
8560 /* SELECT the new DB */
8561 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8562 if (fwriteBulkLong(fp,j) == 0) goto werr;
8563
8564 /* Iterate this DB writing every entry */
8565 while((de = dictNext(di)) != NULL) {
8566 robj *key, *o;
8567 time_t expiretime;
8568 int swapped;
8569
8570 key = dictGetEntryKey(de);
8571 /* If the value for this key is swapped, load a preview in memory.
8572 * We use a "swapped" flag to remember if we need to free the
8573 * value object instead to just increment the ref count anyway
8574 * in order to avoid copy-on-write of pages if we are forked() */
8575 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8576 key->storage == REDIS_VM_SWAPPING) {
8577 o = dictGetEntryVal(de);
8578 swapped = 0;
8579 } else {
8580 o = vmPreviewObject(key);
8581 swapped = 1;
8582 }
8583 expiretime = getExpire(db,key);
8584
8585 /* Save the key and associated value */
8586 if (o->type == REDIS_STRING) {
8587 /* Emit a SET command */
8588 char cmd[]="*3\r\n$3\r\nSET\r\n";
8589 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8590 /* Key and value */
8591 if (fwriteBulkObject(fp,key) == 0) goto werr;
8592 if (fwriteBulkObject(fp,o) == 0) goto werr;
8593 } else if (o->type == REDIS_LIST) {
8594 /* Emit the RPUSHes needed to rebuild the list */
8595 list *list = o->ptr;
8596 listNode *ln;
8597 listIter li;
8598
8599 listRewind(list,&li);
8600 while((ln = listNext(&li))) {
8601 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8602 robj *eleobj = listNodeValue(ln);
8603
8604 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8605 if (fwriteBulkObject(fp,key) == 0) goto werr;
8606 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8607 }
8608 } else if (o->type == REDIS_SET) {
8609 /* Emit the SADDs needed to rebuild the set */
8610 dict *set = o->ptr;
8611 dictIterator *di = dictGetIterator(set);
8612 dictEntry *de;
8613
8614 while((de = dictNext(di)) != NULL) {
8615 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8616 robj *eleobj = dictGetEntryKey(de);
8617
8618 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8619 if (fwriteBulkObject(fp,key) == 0) goto werr;
8620 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8621 }
8622 dictReleaseIterator(di);
8623 } else if (o->type == REDIS_ZSET) {
8624 /* Emit the ZADDs needed to rebuild the sorted set */
8625 zset *zs = o->ptr;
8626 dictIterator *di = dictGetIterator(zs->dict);
8627 dictEntry *de;
8628
8629 while((de = dictNext(di)) != NULL) {
8630 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8631 robj *eleobj = dictGetEntryKey(de);
8632 double *score = dictGetEntryVal(de);
8633
8634 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8635 if (fwriteBulkObject(fp,key) == 0) goto werr;
8636 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8637 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8638 }
8639 dictReleaseIterator(di);
8640 } else if (o->type == REDIS_HASH) {
8641 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8642
8643 /* Emit the HSETs needed to rebuild the hash */
8644 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8645 unsigned char *p = zipmapRewind(o->ptr);
8646 unsigned char *field, *val;
8647 unsigned int flen, vlen;
8648
8649 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8650 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8651 if (fwriteBulkObject(fp,key) == 0) goto werr;
8652 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8653 return -1;
8654 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8655 return -1;
8656 }
8657 } else {
8658 dictIterator *di = dictGetIterator(o->ptr);
8659 dictEntry *de;
8660
8661 while((de = dictNext(di)) != NULL) {
8662 robj *field = dictGetEntryKey(de);
8663 robj *val = dictGetEntryVal(de);
8664
8665 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8666 if (fwriteBulkObject(fp,key) == 0) goto werr;
8667 if (fwriteBulkObject(fp,field) == -1) return -1;
8668 if (fwriteBulkObject(fp,val) == -1) return -1;
8669 }
8670 dictReleaseIterator(di);
8671 }
8672 } else {
8673 redisPanic("Unknown object type");
8674 }
8675 /* Save the expire time */
8676 if (expiretime != -1) {
8677 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8678 /* If this key is already expired skip it */
8679 if (expiretime < now) continue;
8680 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8681 if (fwriteBulkObject(fp,key) == 0) goto werr;
8682 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8683 }
8684 if (swapped) decrRefCount(o);
8685 }
8686 dictReleaseIterator(di);
8687 }
8688
8689 /* Make sure data will not remain on the OS's output buffers */
8690 fflush(fp);
8691 fsync(fileno(fp));
8692 fclose(fp);
8693
8694 /* Use RENAME to make sure the DB file is changed atomically only
8695 * if the generate DB file is ok. */
8696 if (rename(tmpfile,filename) == -1) {
8697 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8698 unlink(tmpfile);
8699 return REDIS_ERR;
8700 }
8701 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8702 return REDIS_OK;
8703
8704 werr:
8705 fclose(fp);
8706 unlink(tmpfile);
8707 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8708 if (di) dictReleaseIterator(di);
8709 return REDIS_ERR;
8710 }
8711
8712 /* This is how rewriting of the append only file in background works:
8713 *
8714 * 1) The user calls BGREWRITEAOF
8715 * 2) Redis calls this function, that forks():
8716 * 2a) the child rewrite the append only file in a temp file.
8717 * 2b) the parent accumulates differences in server.bgrewritebuf.
8718 * 3) When the child finished '2a' exists.
8719 * 4) The parent will trap the exit code, if it's OK, will append the
8720 * data accumulated into server.bgrewritebuf into the temp file, and
8721 * finally will rename(2) the temp file in the actual file name.
8722 * The the new file is reopened as the new append only file. Profit!
8723 */
8724 static int rewriteAppendOnlyFileBackground(void) {
8725 pid_t childpid;
8726
8727 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8728 if (server.vm_enabled) waitEmptyIOJobsQueue();
8729 if ((childpid = fork()) == 0) {
8730 /* Child */
8731 char tmpfile[256];
8732
8733 if (server.vm_enabled) vmReopenSwapFile();
8734 close(server.fd);
8735 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8736 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8737 _exit(0);
8738 } else {
8739 _exit(1);
8740 }
8741 } else {
8742 /* Parent */
8743 if (childpid == -1) {
8744 redisLog(REDIS_WARNING,
8745 "Can't rewrite append only file in background: fork: %s",
8746 strerror(errno));
8747 return REDIS_ERR;
8748 }
8749 redisLog(REDIS_NOTICE,
8750 "Background append only file rewriting started by pid %d",childpid);
8751 server.bgrewritechildpid = childpid;
8752 updateDictResizePolicy();
8753 /* We set appendseldb to -1 in order to force the next call to the
8754 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8755 * accumulated by the parent into server.bgrewritebuf will start
8756 * with a SELECT statement and it will be safe to merge. */
8757 server.appendseldb = -1;
8758 return REDIS_OK;
8759 }
8760 return REDIS_OK; /* unreached */
8761 }
8762
8763 static void bgrewriteaofCommand(redisClient *c) {
8764 if (server.bgrewritechildpid != -1) {
8765 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8766 return;
8767 }
8768 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8769 char *status = "+Background append only file rewriting started\r\n";
8770 addReplySds(c,sdsnew(status));
8771 } else {
8772 addReply(c,shared.err);
8773 }
8774 }
8775
/* Remove the temp file used by the background AOF rewrite child having
 * pid 'childpid'. The name matches the one built by the child in
 * rewriteAppendOnlyFileBackground(). The unlink(2) result is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8782
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8803
8804 /* Called when the user switches from "appendonly yes" to "appendonly no"
8805 * at runtime using the CONFIG command. */
8806 static void stopAppendOnly(void) {
8807 flushAppendOnlyFile();
8808 fsync(server.appendfd);
8809 close(server.appendfd);
8810
8811 server.appendfd = -1;
8812 server.appendseldb = -1;
8813 server.appendonly = 0;
8814 /* rewrite operation in progress? kill it, wait child exit */
8815 if (server.bgsavechildpid != -1) {
8816 int statloc;
8817
8818 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8819 wait3(&statloc,0,NULL);
8820 /* reset the buffer accumulating changes while the child saves */
8821 sdsfree(server.bgrewritebuf);
8822 server.bgrewritebuf = sdsempty();
8823 server.bgsavechildpid = -1;
8824 }
8825 }
8826
8827 /* Called when the user switches from "appendonly no" to "appendonly yes"
8828 * at runtime using the CONFIG command. */
8829 static int startAppendOnly(void) {
8830 server.appendonly = 1;
8831 server.lastfsync = time(NULL);
8832 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8833 if (server.appendfd == -1) {
8834 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8835 return REDIS_ERR;
8836 }
8837 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8838 server.appendonly = 0;
8839 close(server.appendfd);
8840 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8841 return REDIS_ERR;
8842 }
8843 return REDIS_OK;
8844 }
8845
8846 /* =================== Virtual Memory - Blocking Side ====================== */
8847
8848 static void vmInit(void) {
8849 off_t totsize;
8850 int pipefds[2];
8851 size_t stacksize;
8852 struct flock fl;
8853
8854 if (server.vm_max_threads != 0)
8855 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8856
8857 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8858 /* Try to open the old swap file, otherwise create it */
8859 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8860 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8861 }
8862 if (server.vm_fp == NULL) {
8863 redisLog(REDIS_WARNING,
8864 "Can't open the swap file: %s. Exiting.",
8865 strerror(errno));
8866 exit(1);
8867 }
8868 server.vm_fd = fileno(server.vm_fp);
8869 /* Lock the swap file for writing, this is useful in order to avoid
8870 * another instance to use the same swap file for a config error. */
8871 fl.l_type = F_WRLCK;
8872 fl.l_whence = SEEK_SET;
8873 fl.l_start = fl.l_len = 0;
8874 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8875 redisLog(REDIS_WARNING,
8876 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8877 exit(1);
8878 }
8879 /* Initialize */
8880 server.vm_next_page = 0;
8881 server.vm_near_pages = 0;
8882 server.vm_stats_used_pages = 0;
8883 server.vm_stats_swapped_objects = 0;
8884 server.vm_stats_swapouts = 0;
8885 server.vm_stats_swapins = 0;
8886 totsize = server.vm_pages*server.vm_page_size;
8887 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8888 if (ftruncate(server.vm_fd,totsize) == -1) {
8889 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8890 strerror(errno));
8891 exit(1);
8892 } else {
8893 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8894 }
8895 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8896 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8897 (long long) (server.vm_pages+7)/8, server.vm_pages);
8898 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8899
8900 /* Initialize threaded I/O (used by Virtual Memory) */
8901 server.io_newjobs = listCreate();
8902 server.io_processing = listCreate();
8903 server.io_processed = listCreate();
8904 server.io_ready_clients = listCreate();
8905 pthread_mutex_init(&server.io_mutex,NULL);
8906 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8907 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8908 server.io_active_threads = 0;
8909 if (pipe(pipefds) == -1) {
8910 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8911 ,strerror(errno));
8912 exit(1);
8913 }
8914 server.io_ready_pipe_read = pipefds[0];
8915 server.io_ready_pipe_write = pipefds[1];
8916 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8917 /* LZF requires a lot of stack */
8918 pthread_attr_init(&server.io_threads_attr);
8919 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8920 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8921 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8922 /* Listen for events in the threaded I/O pipe */
8923 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8924 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8925 oom("creating file event");
8926 }
8927
8928 /* Mark the page as used */
8929 static void vmMarkPageUsed(off_t page) {
8930 off_t byte = page/8;
8931 int bit = page&7;
8932 redisAssert(vmFreePage(page) == 1);
8933 server.vm_bitmap[byte] |= 1<<bit;
8934 }
8935
8936 /* Mark N contiguous pages as used, with 'page' being the first. */
8937 static void vmMarkPagesUsed(off_t page, off_t count) {
8938 off_t j;
8939
8940 for (j = 0; j < count; j++)
8941 vmMarkPageUsed(page+j);
8942 server.vm_stats_used_pages += count;
8943 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8944 (long long)count, (long long)page);
8945 }
8946
8947 /* Mark the page as free */
8948 static void vmMarkPageFree(off_t page) {
8949 off_t byte = page/8;
8950 int bit = page&7;
8951 redisAssert(vmFreePage(page) == 0);
8952 server.vm_bitmap[byte] &= ~(1<<bit);
8953 }
8954
8955 /* Mark N contiguous pages as free, with 'page' being the first. */
8956 static void vmMarkPagesFree(off_t page, off_t count) {
8957 off_t j;
8958
8959 for (j = 0; j < count; j++)
8960 vmMarkPageFree(page+j);
8961 server.vm_stats_used_pages -= count;
8962 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8963 (long long)count, (long long)page);
8964 }
8965
8966 /* Test if the page is free */
8967 static int vmFreePage(off_t page) {
8968 off_t byte = page/8;
8969 int bit = page&7;
8970 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8971 }
8972
8973 /* Find N contiguous free pages storing the first page of the cluster in *first.
8974 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8975 * REDIS_ERR is returned.
8976 *
8977 * This function uses a simple algorithm: we try to allocate
8978 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8979 * again from the start of the swap file searching for free spaces.
8980 *
8981 * If it looks pretty clear that there are no free pages near our offset
8982 * we try to find less populated places doing a forward jump of
8983 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8984 * without hurry, and then we jump again and so forth...
8985 *
8986 * This function can be improved using a free list to avoid to guess
8987 * too much, since we could collect data about freed pages.
8988 *
8989 * note: I implemented this function just after watching an episode of
8990 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8991 */
8992 static int vmFindContiguousPages(off_t *first, off_t n) {
8993 off_t base, offset = 0, since_jump = 0, numfree = 0;
8994
8995 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8996 server.vm_near_pages = 0;
8997 server.vm_next_page = 0;
8998 }
8999 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9000 base = server.vm_next_page;
9001
9002 while(offset < server.vm_pages) {
9003 off_t this = base+offset;
9004
9005 /* If we overflow, restart from page zero */
9006 if (this >= server.vm_pages) {
9007 this -= server.vm_pages;
9008 if (this == 0) {
9009 /* Just overflowed, what we found on tail is no longer
9010 * interesting, as it's no longer contiguous. */
9011 numfree = 0;
9012 }
9013 }
9014 if (vmFreePage(this)) {
9015 /* This is a free page */
9016 numfree++;
9017 /* Already got N free pages? Return to the caller, with success */
9018 if (numfree == n) {
9019 *first = this-(n-1);
9020 server.vm_next_page = this+1;
9021 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9022 return REDIS_OK;
9023 }
9024 } else {
9025 /* The current one is not a free page */
9026 numfree = 0;
9027 }
9028
9029 /* Fast-forward if the current page is not free and we already
9030 * searched enough near this place. */
9031 since_jump++;
9032 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9033 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9034 since_jump = 0;
9035 /* Note that even if we rewind after the jump, we are don't need
9036 * to make sure numfree is set to zero as we only jump *if* it
9037 * is set to zero. */
9038 } else {
9039 /* Otherwise just check the next page */
9040 offset++;
9041 }
9042 }
9043 return REDIS_ERR;
9044 }
9045
9046 /* Write the specified object at the specified page of the swap file */
9047 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9048 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9049 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9050 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9051 redisLog(REDIS_WARNING,
9052 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9053 strerror(errno));
9054 return REDIS_ERR;
9055 }
9056 rdbSaveObject(server.vm_fp,o);
9057 fflush(server.vm_fp);
9058 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9059 return REDIS_OK;
9060 }
9061
9062 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9063 * needed to later retrieve the object into the key object.
9064 * If we can't find enough contiguous empty pages to swap the object on disk
9065 * REDIS_ERR is returned. */
9066 static int vmSwapObjectBlocking(robj *key, robj *val) {
9067 off_t pages = rdbSavedObjectPages(val,NULL);
9068 off_t page;
9069
9070 assert(key->storage == REDIS_VM_MEMORY);
9071 assert(key->refcount == 1);
9072 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9073 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9074 key->vm.page = page;
9075 key->vm.usedpages = pages;
9076 key->storage = REDIS_VM_SWAPPED;
9077 key->vtype = val->type;
9078 decrRefCount(val); /* Deallocate the object from memory. */
9079 vmMarkPagesUsed(page,pages);
9080 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9081 (unsigned char*) key->ptr,
9082 (unsigned long long) page, (unsigned long long) pages);
9083 server.vm_stats_swapped_objects++;
9084 server.vm_stats_swapouts++;
9085 return REDIS_OK;
9086 }
9087
9088 static robj *vmReadObjectFromSwap(off_t page, int type) {
9089 robj *o;
9090
9091 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9092 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9093 redisLog(REDIS_WARNING,
9094 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9095 strerror(errno));
9096 _exit(1);
9097 }
9098 o = rdbLoadObject(type,server.vm_fp);
9099 if (o == NULL) {
9100 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9101 _exit(1);
9102 }
9103 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9104 return o;
9105 }
9106
9107 /* Load the value object relative to the 'key' object from swap to memory.
9108 * The newly allocated object is returned.
9109 *
9110 * If preview is true the unserialized object is returned to the caller but
9111 * no changes are made to the key object, nor the pages are marked as freed */
9112 static robj *vmGenericLoadObject(robj *key, int preview) {
9113 robj *val;
9114
9115 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9116 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9117 if (!preview) {
9118 key->storage = REDIS_VM_MEMORY;
9119 key->vm.atime = server.unixtime;
9120 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9121 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9122 (unsigned char*) key->ptr);
9123 server.vm_stats_swapped_objects--;
9124 } else {
9125 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9126 (unsigned char*) key->ptr);
9127 }
9128 server.vm_stats_swapins++;
9129 return val;
9130 }
9131
9132 /* Plain object loading, from swap to memory */
9133 static robj *vmLoadObject(robj *key) {
9134 /* If we are loading the object in background, stop it, we
9135 * need to load this object synchronously ASAP. */
9136 if (key->storage == REDIS_VM_LOADING)
9137 vmCancelThreadedIOJob(key);
9138 return vmGenericLoadObject(key,0);
9139 }
9140
9141 /* Just load the value on disk, without to modify the key.
9142 * This is useful when we want to perform some operation on the value
9143 * without to really bring it from swap to memory, like while saving the
9144 * dataset or rewriting the append only log. */
9145 static robj *vmPreviewObject(robj *key) {
9146 return vmGenericLoadObject(key,1);
9147 }
9148
9149 /* How a good candidate is this object for swapping?
9150 * The better candidate it is, the greater the returned value.
9151 *
9152 * Currently we try to perform a fast estimation of the object size in
9153 * memory, and combine it with aging informations.
9154 *
9155 * Basically swappability = idle-time * log(estimated size)
9156 *
9157 * Bigger objects are preferred over smaller objects, but not
9158 * proportionally, this is why we use the logarithm. This algorithm is
9159 * just a first try and will probably be tuned later. */
9160 static double computeObjectSwappability(robj *o) {
9161 time_t age = server.unixtime - o->vm.atime;
9162 long asize = 0;
9163 list *l;
9164 dict *d;
9165 struct dictEntry *de;
9166 int z;
9167
9168 if (age <= 0) return 0;
9169 switch(o->type) {
9170 case REDIS_STRING:
9171 if (o->encoding != REDIS_ENCODING_RAW) {
9172 asize = sizeof(*o);
9173 } else {
9174 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9175 }
9176 break;
9177 case REDIS_LIST:
9178 l = o->ptr;
9179 listNode *ln = listFirst(l);
9180
9181 asize = sizeof(list);
9182 if (ln) {
9183 robj *ele = ln->value;
9184 long elesize;
9185
9186 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9187 (sizeof(*o)+sdslen(ele->ptr)) :
9188 sizeof(*o);
9189 asize += (sizeof(listNode)+elesize)*listLength(l);
9190 }
9191 break;
9192 case REDIS_SET:
9193 case REDIS_ZSET:
9194 z = (o->type == REDIS_ZSET);
9195 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9196
9197 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9198 if (z) asize += sizeof(zset)-sizeof(dict);
9199 if (dictSize(d)) {
9200 long elesize;
9201 robj *ele;
9202
9203 de = dictGetRandomKey(d);
9204 ele = dictGetEntryKey(de);
9205 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9206 (sizeof(*o)+sdslen(ele->ptr)) :
9207 sizeof(*o);
9208 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9209 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9210 }
9211 break;
9212 case REDIS_HASH:
9213 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9214 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9215 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9216 unsigned int klen, vlen;
9217 unsigned char *key, *val;
9218
9219 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9220 klen = 0;
9221 vlen = 0;
9222 }
9223 asize = len*(klen+vlen+3);
9224 } else if (o->encoding == REDIS_ENCODING_HT) {
9225 d = o->ptr;
9226 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9227 if (dictSize(d)) {
9228 long elesize;
9229 robj *ele;
9230
9231 de = dictGetRandomKey(d);
9232 ele = dictGetEntryKey(de);
9233 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9234 (sizeof(*o)+sdslen(ele->ptr)) :
9235 sizeof(*o);
9236 ele = dictGetEntryVal(de);
9237 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9238 (sizeof(*o)+sdslen(ele->ptr)) :
9239 sizeof(*o);
9240 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9241 }
9242 }
9243 break;
9244 }
9245 return (double)age*log(1+asize);
9246 }
9247
9248 /* Try to swap an object that's a good candidate for swapping.
9249 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9250 * to swap any object at all.
9251 *
9252 * If 'usethreaded' is true, Redis will try to swap the object in background
9253 * using I/O threads. */
9254 static int vmSwapOneObject(int usethreads) {
9255 int j, i;
9256 struct dictEntry *best = NULL;
9257 double best_swappability = 0;
9258 redisDb *best_db = NULL;
9259 robj *key, *val;
9260
9261 for (j = 0; j < server.dbnum; j++) {
9262 redisDb *db = server.db+j;
9263 /* Why maxtries is set to 100?
9264 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9265 * are swappable objects */
9266 int maxtries = 100;
9267
9268 if (dictSize(db->dict) == 0) continue;
9269 for (i = 0; i < 5; i++) {
9270 dictEntry *de;
9271 double swappability;
9272
9273 if (maxtries) maxtries--;
9274 de = dictGetRandomKey(db->dict);
9275 key = dictGetEntryKey(de);
9276 val = dictGetEntryVal(de);
9277 /* Only swap objects that are currently in memory.
9278 *
9279 * Also don't swap shared objects if threaded VM is on, as we
9280 * try to ensure that the main thread does not touch the
9281 * object while the I/O thread is using it, but we can't
9282 * control other keys without adding additional mutex. */
9283 if (key->storage != REDIS_VM_MEMORY ||
9284 (server.vm_max_threads != 0 && val->refcount != 1)) {
9285 if (maxtries) i--; /* don't count this try */
9286 continue;
9287 }
9288 swappability = computeObjectSwappability(val);
9289 if (!best || swappability > best_swappability) {
9290 best = de;
9291 best_swappability = swappability;
9292 best_db = db;
9293 }
9294 }
9295 }
9296 if (best == NULL) return REDIS_ERR;
9297 key = dictGetEntryKey(best);
9298 val = dictGetEntryVal(best);
9299
9300 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9301 key->ptr, best_swappability);
9302
9303 /* Unshare the key if needed */
9304 if (key->refcount > 1) {
9305 robj *newkey = dupStringObject(key);
9306 decrRefCount(key);
9307 key = dictGetEntryKey(best) = newkey;
9308 }
9309 /* Swap it */
9310 if (usethreads) {
9311 vmSwapObjectThreaded(key,val,best_db);
9312 return REDIS_OK;
9313 } else {
9314 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9315 dictGetEntryVal(best) = NULL;
9316 return REDIS_OK;
9317 } else {
9318 return REDIS_ERR;
9319 }
9320 }
9321 }
9322
/* Swap one object out synchronously. See vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9326
/* Swap one object out using the I/O threads. See vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9330
9331 /* Return true if it's safe to swap out objects in a given moment.
9332 * Basically we don't want to swap objects out while there is a BGSAVE
9333 * or a BGAEOREWRITE running in backgroud. */
9334 static int vmCanSwapOut(void) {
9335 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9336 }
9337
9338 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9339 * and was deleted. Otherwise 0 is returned. */
9340 static int deleteIfSwapped(redisDb *db, robj *key) {
9341 dictEntry *de;
9342 robj *foundkey;
9343
9344 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9345 foundkey = dictGetEntryKey(de);
9346 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9347 deleteKey(db,key);
9348 return 1;
9349 }
9350
9351 /* =================== Virtual Memory - Threaded I/O ======================= */
9352
9353 static void freeIOJob(iojob *j) {
9354 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9355 j->type == REDIS_IOJOB_DO_SWAP ||
9356 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9357 decrRefCount(j->val);
9358 /* We don't decrRefCount the j->key field as we did't incremented
9359 * the count creating IO Jobs. This is because the key field here is
9360 * just used as an indentifier and if a key is removed the Job should
9361 * never be touched again. */
9362 zfree(j);
9363 }
9364
9365 /* Every time a thread finished a Job, it writes a byte into the write side
9366 * of an unix pipe in order to "awake" the main thread, and this function
9367 * is called. */
9368 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9369 int mask)
9370 {
9371 char buf[1];
9372 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9373 REDIS_NOTUSED(el);
9374 REDIS_NOTUSED(mask);
9375 REDIS_NOTUSED(privdata);
9376
9377 /* For every byte we read in the read side of the pipe, there is one
9378 * I/O job completed to process. */
9379 while((retval = read(fd,buf,1)) == 1) {
9380 iojob *j;
9381 listNode *ln;
9382 robj *key;
9383 struct dictEntry *de;
9384
9385 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9386
9387 /* Get the processed element (the oldest one) */
9388 lockThreadedIO();
9389 assert(listLength(server.io_processed) != 0);
9390 if (toprocess == -1) {
9391 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9392 if (toprocess <= 0) toprocess = 1;
9393 }
9394 ln = listFirst(server.io_processed);
9395 j = ln->value;
9396 listDelNode(server.io_processed,ln);
9397 unlockThreadedIO();
9398 /* If this job is marked as canceled, just ignore it */
9399 if (j->canceled) {
9400 freeIOJob(j);
9401 continue;
9402 }
9403 /* Post process it in the main thread, as there are things we
9404 * can do just here to avoid race conditions and/or invasive locks */
9405 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9406 de = dictFind(j->db->dict,j->key);
9407 assert(de != NULL);
9408 key = dictGetEntryKey(de);
9409 if (j->type == REDIS_IOJOB_LOAD) {
9410 redisDb *db;
9411
9412 /* Key loaded, bring it at home */
9413 key->storage = REDIS_VM_MEMORY;
9414 key->vm.atime = server.unixtime;
9415 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9416 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9417 (unsigned char*) key->ptr);
9418 server.vm_stats_swapped_objects--;
9419 server.vm_stats_swapins++;
9420 dictGetEntryVal(de) = j->val;
9421 incrRefCount(j->val);
9422 db = j->db;
9423 freeIOJob(j);
9424 /* Handle clients waiting for this key to be loaded. */
9425 handleClientsBlockedOnSwappedKey(db,key);
9426 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9427 /* Now we know the amount of pages required to swap this object.
9428 * Let's find some space for it, and queue this task again
9429 * rebranded as REDIS_IOJOB_DO_SWAP. */
9430 if (!vmCanSwapOut() ||
9431 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9432 {
9433 /* Ooops... no space or we can't swap as there is
9434 * a fork()ed Redis trying to save stuff on disk. */
9435 freeIOJob(j);
9436 key->storage = REDIS_VM_MEMORY; /* undo operation */
9437 } else {
9438 /* Note that we need to mark this pages as used now,
9439 * if the job will be canceled, we'll mark them as freed
9440 * again. */
9441 vmMarkPagesUsed(j->page,j->pages);
9442 j->type = REDIS_IOJOB_DO_SWAP;
9443 lockThreadedIO();
9444 queueIOJob(j);
9445 unlockThreadedIO();
9446 }
9447 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9448 robj *val;
9449
9450 /* Key swapped. We can finally free some memory. */
9451 if (key->storage != REDIS_VM_SWAPPING) {
9452 printf("key->storage: %d\n",key->storage);
9453 printf("key->name: %s\n",(char*)key->ptr);
9454 printf("key->refcount: %d\n",key->refcount);
9455 printf("val: %p\n",(void*)j->val);
9456 printf("val->type: %d\n",j->val->type);
9457 printf("val->ptr: %s\n",(char*)j->val->ptr);
9458 }
9459 redisAssert(key->storage == REDIS_VM_SWAPPING);
9460 val = dictGetEntryVal(de);
9461 key->vm.page = j->page;
9462 key->vm.usedpages = j->pages;
9463 key->storage = REDIS_VM_SWAPPED;
9464 key->vtype = j->val->type;
9465 decrRefCount(val); /* Deallocate the object from memory. */
9466 dictGetEntryVal(de) = NULL;
9467 redisLog(REDIS_DEBUG,
9468 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9469 (unsigned char*) key->ptr,
9470 (unsigned long long) j->page, (unsigned long long) j->pages);
9471 server.vm_stats_swapped_objects++;
9472 server.vm_stats_swapouts++;
9473 freeIOJob(j);
9474 /* Put a few more swap requests in queue if we are still
9475 * out of memory */
9476 if (trytoswap && vmCanSwapOut() &&
9477 zmalloc_used_memory() > server.vm_max_memory)
9478 {
9479 int more = 1;
9480 while(more) {
9481 lockThreadedIO();
9482 more = listLength(server.io_newjobs) <
9483 (unsigned) server.vm_max_threads;
9484 unlockThreadedIO();
9485 /* Don't waste CPU time if swappable objects are rare. */
9486 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9487 trytoswap = 0;
9488 break;
9489 }
9490 }
9491 }
9492 }
9493 processed++;
9494 if (processed == toprocess) return;
9495 }
9496 if (retval < 0 && errno != EAGAIN) {
9497 redisLog(REDIS_WARNING,
9498 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9499 strerror(errno));
9500 }
9501 }
9502
9503 static void lockThreadedIO(void) {
9504 pthread_mutex_lock(&server.io_mutex);
9505 }
9506
9507 static void unlockThreadedIO(void) {
9508 pthread_mutex_unlock(&server.io_mutex);
9509 }
9510
9511 /* Remove the specified object from the threaded I/O queue if still not
9512 * processed, otherwise make sure to flag it as canceled. */
9513 static void vmCancelThreadedIOJob(robj *o) {
9514 list *lists[3] = {
9515 server.io_newjobs, /* 0 */
9516 server.io_processing, /* 1 */
9517 server.io_processed /* 2 */
9518 };
9519 int i;
9520
9521 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9522 again:
9523 lockThreadedIO();
9524 /* Search for a matching key in one of the queues */
9525 for (i = 0; i < 3; i++) {
9526 listNode *ln;
9527 listIter li;
9528
9529 listRewind(lists[i],&li);
9530 while ((ln = listNext(&li)) != NULL) {
9531 iojob *job = ln->value;
9532
9533 if (job->canceled) continue; /* Skip this, already canceled. */
9534 if (job->key == o) {
9535 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9536 (void*)job, (char*)o->ptr, job->type, i);
9537 /* Mark the pages as free since the swap didn't happened
9538 * or happened but is now discarded. */
9539 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9540 vmMarkPagesFree(job->page,job->pages);
9541 /* Cancel the job. It depends on the list the job is
9542 * living in. */
9543 switch(i) {
9544 case 0: /* io_newjobs */
9545 /* If the job was yet not processed the best thing to do
9546 * is to remove it from the queue at all */
9547 freeIOJob(job);
9548 listDelNode(lists[i],ln);
9549 break;
9550 case 1: /* io_processing */
9551 /* Oh Shi- the thread is messing with the Job:
9552 *
9553 * Probably it's accessing the object if this is a
9554 * PREPARE_SWAP or DO_SWAP job.
9555 * If it's a LOAD job it may be reading from disk and
9556 * if we don't wait for the job to terminate before to
9557 * cancel it, maybe in a few microseconds data can be
9558 * corrupted in this pages. So the short story is:
9559 *
9560 * Better to wait for the job to move into the
9561 * next queue (processed)... */
9562
9563 /* We try again and again until the job is completed. */
9564 unlockThreadedIO();
9565 /* But let's wait some time for the I/O thread
9566 * to finish with this job. After all this condition
9567 * should be very rare. */
9568 usleep(1);
9569 goto again;
9570 case 2: /* io_processed */
9571 /* The job was already processed, that's easy...
9572 * just mark it as canceled so that we'll ignore it
9573 * when processing completed jobs. */
9574 job->canceled = 1;
9575 break;
9576 }
9577 /* Finally we have to adjust the storage type of the object
9578 * in order to "UNDO" the operaiton. */
9579 if (o->storage == REDIS_VM_LOADING)
9580 o->storage = REDIS_VM_SWAPPED;
9581 else if (o->storage == REDIS_VM_SWAPPING)
9582 o->storage = REDIS_VM_MEMORY;
9583 unlockThreadedIO();
9584 return;
9585 }
9586 }
9587 }
9588 unlockThreadedIO();
9589 assert(1 != 1); /* We should never reach this */
9590 }
9591
9592 static void *IOThreadEntryPoint(void *arg) {
9593 iojob *j;
9594 listNode *ln;
9595 REDIS_NOTUSED(arg);
9596
9597 pthread_detach(pthread_self());
9598 while(1) {
9599 /* Get a new job to process */
9600 lockThreadedIO();
9601 if (listLength(server.io_newjobs) == 0) {
9602 /* No new jobs in queue, exit. */
9603 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9604 (long) pthread_self());
9605 server.io_active_threads--;
9606 unlockThreadedIO();
9607 return NULL;
9608 }
9609 ln = listFirst(server.io_newjobs);
9610 j = ln->value;
9611 listDelNode(server.io_newjobs,ln);
9612 /* Add the job in the processing queue */
9613 j->thread = pthread_self();
9614 listAddNodeTail(server.io_processing,j);
9615 ln = listLast(server.io_processing); /* We use ln later to remove it */
9616 unlockThreadedIO();
9617 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9618 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9619
9620 /* Process the Job */
9621 if (j->type == REDIS_IOJOB_LOAD) {
9622 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9623 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9624 FILE *fp = fopen("/dev/null","w+");
9625 j->pages = rdbSavedObjectPages(j->val,fp);
9626 fclose(fp);
9627 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9628 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9629 j->canceled = 1;
9630 }
9631
9632 /* Done: insert the job into the processed queue */
9633 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9634 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9635 lockThreadedIO();
9636 listDelNode(server.io_processing,ln);
9637 listAddNodeTail(server.io_processed,j);
9638 unlockThreadedIO();
9639
9640 /* Signal the main thread there is new stuff to process */
9641 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9642 }
9643 return NULL; /* never reached */
9644 }
9645
9646 static void spawnIOThread(void) {
9647 pthread_t thread;
9648 sigset_t mask, omask;
9649 int err;
9650
9651 sigemptyset(&mask);
9652 sigaddset(&mask,SIGCHLD);
9653 sigaddset(&mask,SIGHUP);
9654 sigaddset(&mask,SIGPIPE);
9655 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9656 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9657 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9658 strerror(err));
9659 usleep(1000000);
9660 }
9661 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9662 server.io_active_threads++;
9663 }
9664
9665 /* We need to wait for the last thread to exit before we are able to
9666 * fork() in order to BGSAVE or BGREWRITEAOF. */
9667 static void waitEmptyIOJobsQueue(void) {
9668 while(1) {
9669 int io_processed_len;
9670
9671 lockThreadedIO();
9672 if (listLength(server.io_newjobs) == 0 &&
9673 listLength(server.io_processing) == 0 &&
9674 server.io_active_threads == 0)
9675 {
9676 unlockThreadedIO();
9677 return;
9678 }
9679 /* While waiting for empty jobs queue condition we post-process some
9680 * finshed job, as I/O threads may be hanging trying to write against
9681 * the io_ready_pipe_write FD but there are so much pending jobs that
9682 * it's blocking. */
9683 io_processed_len = listLength(server.io_processed);
9684 unlockThreadedIO();
9685 if (io_processed_len) {
9686 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9687 usleep(1000); /* 1 millisecond */
9688 } else {
9689 usleep(10000); /* 10 milliseconds */
9690 }
9691 }
9692 }
9693
9694 static void vmReopenSwapFile(void) {
9695 /* Note: we don't close the old one as we are in the child process
9696 * and don't want to mess at all with the original file object. */
9697 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9698 if (server.vm_fp == NULL) {
9699 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9700 server.vm_swap_file);
9701 _exit(1);
9702 }
9703 server.vm_fd = fileno(server.vm_fp);
9704 }
9705
9706 /* This function must be called while with threaded IO locked */
9707 static void queueIOJob(iojob *j) {
9708 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9709 (void*)j, j->type, (char*)j->key->ptr);
9710 listAddNodeTail(server.io_newjobs,j);
9711 if (server.io_active_threads < server.vm_max_threads)
9712 spawnIOThread();
9713 }
9714
9715 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9716 iojob *j;
9717
9718 assert(key->storage == REDIS_VM_MEMORY);
9719 assert(key->refcount == 1);
9720
9721 j = zmalloc(sizeof(*j));
9722 j->type = REDIS_IOJOB_PREPARE_SWAP;
9723 j->db = db;
9724 j->key = key;
9725 j->val = val;
9726 incrRefCount(val);
9727 j->canceled = 0;
9728 j->thread = (pthread_t) -1;
9729 key->storage = REDIS_VM_SWAPPING;
9730
9731 lockThreadedIO();
9732 queueIOJob(j);
9733 unlockThreadedIO();
9734 return REDIS_OK;
9735 }
9736
9737 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9738
9739 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9740 * If there is not already a job loading the key, it is craeted.
9741 * The key is added to the io_keys list in the client structure, and also
9742 * in the hash table mapping swapped keys to waiting clients, that is,
9743 * server.io_waited_keys. */
9744 static int waitForSwappedKey(redisClient *c, robj *key) {
9745 struct dictEntry *de;
9746 robj *o;
9747 list *l;
9748
9749 /* If the key does not exist or is already in RAM we don't need to
9750 * block the client at all. */
9751 de = dictFind(c->db->dict,key);
9752 if (de == NULL) return 0;
9753 o = dictGetEntryKey(de);
9754 if (o->storage == REDIS_VM_MEMORY) {
9755 return 0;
9756 } else if (o->storage == REDIS_VM_SWAPPING) {
9757 /* We were swapping the key, undo it! */
9758 vmCancelThreadedIOJob(o);
9759 return 0;
9760 }
9761
9762 /* OK: the key is either swapped, or being loaded just now. */
9763
9764 /* Add the key to the list of keys this client is waiting for.
9765 * This maps clients to keys they are waiting for. */
9766 listAddNodeTail(c->io_keys,key);
9767 incrRefCount(key);
9768
9769 /* Add the client to the swapped keys => clients waiting map. */
9770 de = dictFind(c->db->io_keys,key);
9771 if (de == NULL) {
9772 int retval;
9773
9774 /* For every key we take a list of clients blocked for it */
9775 l = listCreate();
9776 retval = dictAdd(c->db->io_keys,key,l);
9777 incrRefCount(key);
9778 assert(retval == DICT_OK);
9779 } else {
9780 l = dictGetEntryVal(de);
9781 }
9782 listAddNodeTail(l,c);
9783
9784 /* Are we already loading the key from disk? If not create a job */
9785 if (o->storage == REDIS_VM_SWAPPED) {
9786 iojob *j;
9787
9788 o->storage = REDIS_VM_LOADING;
9789 j = zmalloc(sizeof(*j));
9790 j->type = REDIS_IOJOB_LOAD;
9791 j->db = c->db;
9792 j->key = o;
9793 j->key->vtype = o->vtype;
9794 j->page = o->vm.page;
9795 j->val = NULL;
9796 j->canceled = 0;
9797 j->thread = (pthread_t) -1;
9798 lockThreadedIO();
9799 queueIOJob(j);
9800 unlockThreadedIO();
9801 }
9802 return 1;
9803 }
9804
9805 /* Preload keys for any command with first, last and step values for
9806 * the command keys prototype, as defined in the command table. */
9807 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9808 int j, last;
9809 if (cmd->vm_firstkey == 0) return;
9810 last = cmd->vm_lastkey;
9811 if (last < 0) last = argc+last;
9812 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9813 redisAssert(j < argc);
9814 waitForSwappedKey(c,argv[j]);
9815 }
9816 }
9817
9818 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9819 * Note that the number of keys to preload is user-defined, so we need to
9820 * apply a sanity check against argc. */
9821 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9822 int i, num;
9823 REDIS_NOTUSED(cmd);
9824
9825 num = atoi(argv[2]->ptr);
9826 if (num > (argc-3)) return;
9827 for (i = 0; i < num; i++) {
9828 waitForSwappedKey(c,argv[3+i]);
9829 }
9830 }
9831
9832 /* Preload keys needed to execute the entire MULTI/EXEC block.
9833 *
9834 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9835 * and will block the client when any command requires a swapped out value. */
9836 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9837 int i, margc;
9838 struct redisCommand *mcmd;
9839 robj **margv;
9840 REDIS_NOTUSED(cmd);
9841 REDIS_NOTUSED(argc);
9842 REDIS_NOTUSED(argv);
9843
9844 if (!(c->flags & REDIS_MULTI)) return;
9845 for (i = 0; i < c->mstate.count; i++) {
9846 mcmd = c->mstate.commands[i].cmd;
9847 margc = c->mstate.commands[i].argc;
9848 margv = c->mstate.commands[i].argv;
9849
9850 if (mcmd->vm_preload_proc != NULL) {
9851 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9852 } else {
9853 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9854 }
9855 }
9856 }
9857
9858 /* Is this client attempting to run a command against swapped keys?
9859 * If so, block it ASAP, load the keys in background, then resume it.
9860 *
9861 * The important idea about this function is that it can fail! If keys will
9862 * still be swapped when the client is resumed, this key lookups will
9863 * just block loading keys from disk. In practical terms this should only
9864 * happen with SORT BY command or if there is a bug in this function.
9865 *
9866 * Return 1 if the client is marked as blocked, 0 if the client can
9867 * continue as the keys it is going to access appear to be in memory. */
9868 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9869 if (cmd->vm_preload_proc != NULL) {
9870 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9871 } else {
9872 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9873 }
9874
9875 /* If the client was blocked for at least one key, mark it as blocked. */
9876 if (listLength(c->io_keys)) {
9877 c->flags |= REDIS_IO_WAIT;
9878 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9879 server.vm_blocked_clients++;
9880 return 1;
9881 } else {
9882 return 0;
9883 }
9884 }
9885
9886 /* Remove the 'key' from the list of blocked keys for a given client.
9887 *
9888 * The function returns 1 when there are no longer blocking keys after
9889 * the current one was removed (and the client can be unblocked). */
9890 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9891 list *l;
9892 listNode *ln;
9893 listIter li;
9894 struct dictEntry *de;
9895
9896 /* Remove the key from the list of keys this client is waiting for. */
9897 listRewind(c->io_keys,&li);
9898 while ((ln = listNext(&li)) != NULL) {
9899 if (equalStringObjects(ln->value,key)) {
9900 listDelNode(c->io_keys,ln);
9901 break;
9902 }
9903 }
9904 assert(ln != NULL);
9905
9906 /* Remove the client form the key => waiting clients map. */
9907 de = dictFind(c->db->io_keys,key);
9908 assert(de != NULL);
9909 l = dictGetEntryVal(de);
9910 ln = listSearchKey(l,c);
9911 assert(ln != NULL);
9912 listDelNode(l,ln);
9913 if (listLength(l) == 0)
9914 dictDelete(c->db->io_keys,key);
9915
9916 return listLength(c->io_keys) == 0;
9917 }
9918
9919 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9920 struct dictEntry *de;
9921 list *l;
9922 listNode *ln;
9923 int len;
9924
9925 de = dictFind(db->io_keys,key);
9926 if (!de) return;
9927
9928 l = dictGetEntryVal(de);
9929 len = listLength(l);
9930 /* Note: we can't use something like while(listLength(l)) as the list
9931 * can be freed by the calling function when we remove the last element. */
9932 while (len--) {
9933 ln = listFirst(l);
9934 redisClient *c = ln->value;
9935
9936 if (dontWaitForSwappedKey(c,key)) {
9937 /* Put the client in the list of clients ready to go as we
9938 * loaded all the keys about it. */
9939 listAddNodeTail(server.io_ready_clients,c);
9940 }
9941 }
9942 }
9943
9944 /* =========================== Remote Configuration ========================= */
9945
9946 static void configSetCommand(redisClient *c) {
9947 robj *o = getDecodedObject(c->argv[3]);
9948 long long ll;
9949
9950 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9951 zfree(server.dbfilename);
9952 server.dbfilename = zstrdup(o->ptr);
9953 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9954 zfree(server.requirepass);
9955 server.requirepass = zstrdup(o->ptr);
9956 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9957 zfree(server.masterauth);
9958 server.masterauth = zstrdup(o->ptr);
9959 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9960 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9961 ll < 0) goto badfmt;
9962 server.maxmemory = ll;
9963 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9964 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9965 ll < 0 || ll > LONG_MAX) goto badfmt;
9966 server.maxidletime = ll;
9967 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9968 if (!strcasecmp(o->ptr,"no")) {
9969 server.appendfsync = APPENDFSYNC_NO;
9970 } else if (!strcasecmp(o->ptr,"everysec")) {
9971 server.appendfsync = APPENDFSYNC_EVERYSEC;
9972 } else if (!strcasecmp(o->ptr,"always")) {
9973 server.appendfsync = APPENDFSYNC_ALWAYS;
9974 } else {
9975 goto badfmt;
9976 }
9977 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9978 int old = server.appendonly;
9979 int new = yesnotoi(o->ptr);
9980
9981 if (new == -1) goto badfmt;
9982 if (old != new) {
9983 if (new == 0) {
9984 stopAppendOnly();
9985 } else {
9986 if (startAppendOnly() == REDIS_ERR) {
9987 addReplySds(c,sdscatprintf(sdsempty(),
9988 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9989 decrRefCount(o);
9990 return;
9991 }
9992 }
9993 }
9994 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9995 int vlen, j;
9996 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9997
9998 /* Perform sanity check before setting the new config:
9999 * - Even number of args
10000 * - Seconds >= 1, changes >= 0 */
10001 if (vlen & 1) {
10002 sdsfreesplitres(v,vlen);
10003 goto badfmt;
10004 }
10005 for (j = 0; j < vlen; j++) {
10006 char *eptr;
10007 long val;
10008
10009 val = strtoll(v[j], &eptr, 10);
10010 if (eptr[0] != '\0' ||
10011 ((j & 1) == 0 && val < 1) ||
10012 ((j & 1) == 1 && val < 0)) {
10013 sdsfreesplitres(v,vlen);
10014 goto badfmt;
10015 }
10016 }
10017 /* Finally set the new config */
10018 resetServerSaveParams();
10019 for (j = 0; j < vlen; j += 2) {
10020 time_t seconds;
10021 int changes;
10022
10023 seconds = strtoll(v[j],NULL,10);
10024 changes = strtoll(v[j+1],NULL,10);
10025 appendServerSaveParams(seconds, changes);
10026 }
10027 sdsfreesplitres(v,vlen);
10028 } else {
10029 addReplySds(c,sdscatprintf(sdsempty(),
10030 "-ERR not supported CONFIG parameter %s\r\n",
10031 (char*)c->argv[2]->ptr));
10032 decrRefCount(o);
10033 return;
10034 }
10035 decrRefCount(o);
10036 addReply(c,shared.ok);
10037 return;
10038
10039 badfmt: /* Bad format errors */
10040 addReplySds(c,sdscatprintf(sdsempty(),
10041 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10042 (char*)o->ptr,
10043 (char*)c->argv[2]->ptr));
10044 decrRefCount(o);
10045 }
10046
10047 static void configGetCommand(redisClient *c) {
10048 robj *o = getDecodedObject(c->argv[2]);
10049 robj *lenobj = createObject(REDIS_STRING,NULL);
10050 char *pattern = o->ptr;
10051 int matches = 0;
10052
10053 addReply(c,lenobj);
10054 decrRefCount(lenobj);
10055
10056 if (stringmatch(pattern,"dbfilename",0)) {
10057 addReplyBulkCString(c,"dbfilename");
10058 addReplyBulkCString(c,server.dbfilename);
10059 matches++;
10060 }
10061 if (stringmatch(pattern,"requirepass",0)) {
10062 addReplyBulkCString(c,"requirepass");
10063 addReplyBulkCString(c,server.requirepass);
10064 matches++;
10065 }
10066 if (stringmatch(pattern,"masterauth",0)) {
10067 addReplyBulkCString(c,"masterauth");
10068 addReplyBulkCString(c,server.masterauth);
10069 matches++;
10070 }
10071 if (stringmatch(pattern,"maxmemory",0)) {
10072 char buf[128];
10073
10074 ll2string(buf,128,server.maxmemory);
10075 addReplyBulkCString(c,"maxmemory");
10076 addReplyBulkCString(c,buf);
10077 matches++;
10078 }
10079 if (stringmatch(pattern,"timeout",0)) {
10080 char buf[128];
10081
10082 ll2string(buf,128,server.maxidletime);
10083 addReplyBulkCString(c,"timeout");
10084 addReplyBulkCString(c,buf);
10085 matches++;
10086 }
10087 if (stringmatch(pattern,"appendonly",0)) {
10088 addReplyBulkCString(c,"appendonly");
10089 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10090 matches++;
10091 }
10092 if (stringmatch(pattern,"appendfsync",0)) {
10093 char *policy;
10094
10095 switch(server.appendfsync) {
10096 case APPENDFSYNC_NO: policy = "no"; break;
10097 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10098 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10099 default: policy = "unknown"; break; /* too harmless to panic */
10100 }
10101 addReplyBulkCString(c,"appendfsync");
10102 addReplyBulkCString(c,policy);
10103 matches++;
10104 }
10105 if (stringmatch(pattern,"save",0)) {
10106 sds buf = sdsempty();
10107 int j;
10108
10109 for (j = 0; j < server.saveparamslen; j++) {
10110 buf = sdscatprintf(buf,"%ld %d",
10111 server.saveparams[j].seconds,
10112 server.saveparams[j].changes);
10113 if (j != server.saveparamslen-1)
10114 buf = sdscatlen(buf," ",1);
10115 }
10116 addReplyBulkCString(c,"save");
10117 addReplyBulkCString(c,buf);
10118 sdsfree(buf);
10119 matches++;
10120 }
10121 decrRefCount(o);
10122 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10123 }
10124
10125 static void configCommand(redisClient *c) {
10126 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10127 if (c->argc != 4) goto badarity;
10128 configSetCommand(c);
10129 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10130 if (c->argc != 3) goto badarity;
10131 configGetCommand(c);
10132 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10133 if (c->argc != 2) goto badarity;
10134 server.stat_numcommands = 0;
10135 server.stat_numconnections = 0;
10136 server.stat_expiredkeys = 0;
10137 server.stat_starttime = time(NULL);
10138 addReply(c,shared.ok);
10139 } else {
10140 addReplySds(c,sdscatprintf(sdsempty(),
10141 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10142 }
10143 return;
10144
10145 badarity:
10146 addReplySds(c,sdscatprintf(sdsempty(),
10147 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10148 (char*) c->argv[1]->ptr));
10149 }
10150
10151 /* =========================== Pubsub implementation ======================== */
10152
10153 static void freePubsubPattern(void *p) {
10154 pubsubPattern *pat = p;
10155
10156 decrRefCount(pat->pattern);
10157 zfree(pat);
10158 }
10159
10160 static int listMatchPubsubPattern(void *a, void *b) {
10161 pubsubPattern *pa = a, *pb = b;
10162
10163 return (pa->client == pb->client) &&
10164 (equalStringObjects(pa->pattern,pb->pattern));
10165 }
10166
10167 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10168 * 0 if the client was already subscribed to that channel. */
10169 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10170 struct dictEntry *de;
10171 list *clients = NULL;
10172 int retval = 0;
10173
10174 /* Add the channel to the client -> channels hash table */
10175 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10176 retval = 1;
10177 incrRefCount(channel);
10178 /* Add the client to the channel -> list of clients hash table */
10179 de = dictFind(server.pubsub_channels,channel);
10180 if (de == NULL) {
10181 clients = listCreate();
10182 dictAdd(server.pubsub_channels,channel,clients);
10183 incrRefCount(channel);
10184 } else {
10185 clients = dictGetEntryVal(de);
10186 }
10187 listAddNodeTail(clients,c);
10188 }
10189 /* Notify the client */
10190 addReply(c,shared.mbulk3);
10191 addReply(c,shared.subscribebulk);
10192 addReplyBulk(c,channel);
10193 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10194 return retval;
10195 }
10196
10197 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10198 * 0 if the client was not subscribed to the specified channel. */
10199 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10200 struct dictEntry *de;
10201 list *clients;
10202 listNode *ln;
10203 int retval = 0;
10204
10205 /* Remove the channel from the client -> channels hash table */
10206 incrRefCount(channel); /* channel may be just a pointer to the same object
10207 we have in the hash tables. Protect it... */
10208 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10209 retval = 1;
10210 /* Remove the client from the channel -> clients list hash table */
10211 de = dictFind(server.pubsub_channels,channel);
10212 assert(de != NULL);
10213 clients = dictGetEntryVal(de);
10214 ln = listSearchKey(clients,c);
10215 assert(ln != NULL);
10216 listDelNode(clients,ln);
10217 if (listLength(clients) == 0) {
10218 /* Free the list and associated hash entry at all if this was
10219 * the latest client, so that it will be possible to abuse
10220 * Redis PUBSUB creating millions of channels. */
10221 dictDelete(server.pubsub_channels,channel);
10222 }
10223 }
10224 /* Notify the client */
10225 if (notify) {
10226 addReply(c,shared.mbulk3);
10227 addReply(c,shared.unsubscribebulk);
10228 addReplyBulk(c,channel);
10229 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10230 listLength(c->pubsub_patterns));
10231
10232 }
10233 decrRefCount(channel); /* it is finally safe to release it */
10234 return retval;
10235 }
10236
10237 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10238 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10239 int retval = 0;
10240
10241 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10242 retval = 1;
10243 pubsubPattern *pat;
10244 listAddNodeTail(c->pubsub_patterns,pattern);
10245 incrRefCount(pattern);
10246 pat = zmalloc(sizeof(*pat));
10247 pat->pattern = getDecodedObject(pattern);
10248 pat->client = c;
10249 listAddNodeTail(server.pubsub_patterns,pat);
10250 }
10251 /* Notify the client */
10252 addReply(c,shared.mbulk3);
10253 addReply(c,shared.psubscribebulk);
10254 addReplyBulk(c,pattern);
10255 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10256 return retval;
10257 }
10258
10259 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10260 * 0 if the client was not subscribed to the specified channel. */
10261 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10262 listNode *ln;
10263 pubsubPattern pat;
10264 int retval = 0;
10265
10266 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10267 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10268 retval = 1;
10269 listDelNode(c->pubsub_patterns,ln);
10270 pat.client = c;
10271 pat.pattern = pattern;
10272 ln = listSearchKey(server.pubsub_patterns,&pat);
10273 listDelNode(server.pubsub_patterns,ln);
10274 }
10275 /* Notify the client */
10276 if (notify) {
10277 addReply(c,shared.mbulk3);
10278 addReply(c,shared.punsubscribebulk);
10279 addReplyBulk(c,pattern);
10280 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10281 listLength(c->pubsub_patterns));
10282 }
10283 decrRefCount(pattern);
10284 return retval;
10285 }
10286
10287 /* Unsubscribe from all the channels. Return the number of channels the
10288 * client was subscribed from. */
10289 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10290 dictIterator *di = dictGetIterator(c->pubsub_channels);
10291 dictEntry *de;
10292 int count = 0;
10293
10294 while((de = dictNext(di)) != NULL) {
10295 robj *channel = dictGetEntryKey(de);
10296
10297 count += pubsubUnsubscribeChannel(c,channel,notify);
10298 }
10299 dictReleaseIterator(di);
10300 return count;
10301 }
10302
10303 /* Unsubscribe from all the patterns. Return the number of patterns the
10304 * client was subscribed from. */
10305 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10306 listNode *ln;
10307 listIter li;
10308 int count = 0;
10309
10310 listRewind(c->pubsub_patterns,&li);
10311 while ((ln = listNext(&li)) != NULL) {
10312 robj *pattern = ln->value;
10313
10314 count += pubsubUnsubscribePattern(c,pattern,notify);
10315 }
10316 return count;
10317 }
10318
10319 /* Publish a message */
10320 static int pubsubPublishMessage(robj *channel, robj *message) {
10321 int receivers = 0;
10322 struct dictEntry *de;
10323 listNode *ln;
10324 listIter li;
10325
10326 /* Send to clients listening for that channel */
10327 de = dictFind(server.pubsub_channels,channel);
10328 if (de) {
10329 list *list = dictGetEntryVal(de);
10330 listNode *ln;
10331 listIter li;
10332
10333 listRewind(list,&li);
10334 while ((ln = listNext(&li)) != NULL) {
10335 redisClient *c = ln->value;
10336
10337 addReply(c,shared.mbulk3);
10338 addReply(c,shared.messagebulk);
10339 addReplyBulk(c,channel);
10340 addReplyBulk(c,message);
10341 receivers++;
10342 }
10343 }
10344 /* Send to clients listening to matching channels */
10345 if (listLength(server.pubsub_patterns)) {
10346 listRewind(server.pubsub_patterns,&li);
10347 channel = getDecodedObject(channel);
10348 while ((ln = listNext(&li)) != NULL) {
10349 pubsubPattern *pat = ln->value;
10350
10351 if (stringmatchlen((char*)pat->pattern->ptr,
10352 sdslen(pat->pattern->ptr),
10353 (char*)channel->ptr,
10354 sdslen(channel->ptr),0)) {
10355 addReply(pat->client,shared.mbulk4);
10356 addReply(pat->client,shared.pmessagebulk);
10357 addReplyBulk(pat->client,pat->pattern);
10358 addReplyBulk(pat->client,channel);
10359 addReplyBulk(pat->client,message);
10360 receivers++;
10361 }
10362 }
10363 decrRefCount(channel);
10364 }
10365 return receivers;
10366 }
10367
10368 static void subscribeCommand(redisClient *c) {
10369 int j;
10370
10371 for (j = 1; j < c->argc; j++)
10372 pubsubSubscribeChannel(c,c->argv[j]);
10373 }
10374
10375 static void unsubscribeCommand(redisClient *c) {
10376 if (c->argc == 1) {
10377 pubsubUnsubscribeAllChannels(c,1);
10378 return;
10379 } else {
10380 int j;
10381
10382 for (j = 1; j < c->argc; j++)
10383 pubsubUnsubscribeChannel(c,c->argv[j],1);
10384 }
10385 }
10386
10387 static void psubscribeCommand(redisClient *c) {
10388 int j;
10389
10390 for (j = 1; j < c->argc; j++)
10391 pubsubSubscribePattern(c,c->argv[j]);
10392 }
10393
10394 static void punsubscribeCommand(redisClient *c) {
10395 if (c->argc == 1) {
10396 pubsubUnsubscribeAllPatterns(c,1);
10397 return;
10398 } else {
10399 int j;
10400
10401 for (j = 1; j < c->argc; j++)
10402 pubsubUnsubscribePattern(c,c->argv[j],1);
10403 }
10404 }
10405
10406 static void publishCommand(redisClient *c) {
10407 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10408 addReplyLongLong(c,receivers);
10409 }
10410
/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
 *
 * The implementation uses a per-DB hash table mapping keys to the list of
 * clients WATCHing those keys, so that given a key that is going to be
 * modified we can mark all the associated clients as dirty.
 *
 * Every client also contains a list of WATCHed keys, so that it is possible
 * to un-watch such keys when the client is freed or when UNWATCH is called. */
10419
10420 /* In the client->watched_keys list we need to use watchedKey structures
10421 * as in order to identify a key in Redis we need both the key name and the
10422 * DB */
10423 typedef struct watchedKey {
10424 robj *key;
10425 redisDb *db;
10426 } watchedKey;
10427
10428 /* Watch for the specified key */
10429 static void watchForKey(redisClient *c, robj *key) {
10430 list *clients = NULL;
10431 listIter li;
10432 listNode *ln;
10433 watchedKey *wk;
10434
10435 /* Check if we are already watching for this key */
10436 listRewind(c->watched_keys,&li);
10437 while((ln = listNext(&li))) {
10438 wk = listNodeValue(ln);
10439 if (wk->db == c->db && equalStringObjects(key,wk->key))
10440 return; /* Key already watched */
10441 }
10442 /* This key is not already watched in this DB. Let's add it */
10443 clients = dictFetchValue(c->db->watched_keys,key);
10444 if (!clients) {
10445 clients = listCreate();
10446 dictAdd(c->db->watched_keys,key,clients);
10447 incrRefCount(key);
10448 }
10449 listAddNodeTail(clients,c);
10450 /* Add the new key to the lits of keys watched by this client */
10451 wk = zmalloc(sizeof(*wk));
10452 wk->key = key;
10453 wk->db = c->db;
10454 incrRefCount(key);
10455 listAddNodeTail(c->watched_keys,wk);
10456 }
10457
10458 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10459 * flag is up to the caller. */
10460 static void unwatchAllKeys(redisClient *c) {
10461 listIter li;
10462 listNode *ln;
10463
10464 if (listLength(c->watched_keys) == 0) return;
10465 listRewind(c->watched_keys,&li);
10466 while((ln = listNext(&li))) {
10467 list *clients;
10468 watchedKey *wk;
10469
10470 /* Lookup the watched key -> clients list and remove the client
10471 * from the list */
10472 wk = listNodeValue(ln);
10473 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10474 assert(clients != NULL);
10475 listDelNode(clients,listSearchKey(clients,c));
10476 /* Kill the entry at all if this was the only client */
10477 if (listLength(clients) == 0)
10478 dictDelete(wk->db->watched_keys, wk->key);
10479 /* Remove this watched key from the client->watched list */
10480 listDelNode(c->watched_keys,ln);
10481 decrRefCount(wk->key);
10482 zfree(wk);
10483 }
10484 }
10485
10486 /* "Touch" a key, so that if this key is being WATCHed by some client the
10487 * next EXEC will fail. */
10488 static void touchWatchedKey(redisDb *db, robj *key) {
10489 list *clients;
10490 listIter li;
10491 listNode *ln;
10492
10493 if (dictSize(db->watched_keys) == 0) return;
10494 clients = dictFetchValue(db->watched_keys, key);
10495 if (!clients) return;
10496
10497 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10498 /* Check if we are already watching for this key */
10499 listRewind(clients,&li);
10500 while((ln = listNext(&li))) {
10501 redisClient *c = listNodeValue(ln);
10502
10503 c->flags |= REDIS_DIRTY_CAS;
10504 }
10505 }
10506
10507 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10508 * flush but will be deleted as effect of the flushing operation should
10509 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10510 * a FLUSHALL operation (all the DBs flushed). */
10511 static void touchWatchedKeysOnFlush(int dbid) {
10512 listIter li1, li2;
10513 listNode *ln;
10514
10515 /* For every client, check all the waited keys */
10516 listRewind(server.clients,&li1);
10517 while((ln = listNext(&li1))) {
10518 redisClient *c = listNodeValue(ln);
10519 listRewind(c->watched_keys,&li2);
10520 while((ln = listNext(&li2))) {
10521 watchedKey *wk = listNodeValue(ln);
10522
10523 /* For every watched key matching the specified DB, if the
10524 * key exists, mark the client as dirty, as the key will be
10525 * removed. */
10526 if (dbid == -1 || wk->db->id == dbid) {
10527 if (dictFind(wk->db->dict, wk->key) != NULL)
10528 c->flags |= REDIS_DIRTY_CAS;
10529 }
10530 }
10531 }
10532 }
10533
10534 static void watchCommand(redisClient *c) {
10535 int j;
10536
10537 if (c->flags & REDIS_MULTI) {
10538 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10539 return;
10540 }
10541 for (j = 1; j < c->argc; j++)
10542 watchForKey(c,c->argv[j]);
10543 addReply(c,shared.ok);
10544 }
10545
10546 static void unwatchCommand(redisClient *c) {
10547 unwatchAllKeys(c);
10548 c->flags &= (~REDIS_DIRTY_CAS);
10549 addReply(c,shared.ok);
10550 }
10551
10552 /* ================================= Debugging ============================== */
10553
10554 /* Compute the sha1 of string at 's' with 'len' bytes long.
10555 * The SHA1 is then xored againt the string pointed by digest.
10556 * Since xor is commutative, this operation is used in order to
10557 * "add" digests relative to unordered elements.
10558 *
10559 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10560 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10561 SHA1_CTX ctx;
10562 unsigned char hash[20], *s = ptr;
10563 int j;
10564
10565 SHA1Init(&ctx);
10566 SHA1Update(&ctx,s,len);
10567 SHA1Final(hash,&ctx);
10568
10569 for (j = 0; j < 20; j++)
10570 digest[j] ^= hash[j];
10571 }
10572
10573 static void xorObjectDigest(unsigned char *digest, robj *o) {
10574 o = getDecodedObject(o);
10575 xorDigest(digest,o->ptr,sdslen(o->ptr));
10576 decrRefCount(o);
10577 }
10578
10579 /* This function instead of just computing the SHA1 and xoring it
10580 * against diget, also perform the digest of "digest" itself and
10581 * replace the old value with the new one.
10582 *
10583 * So the final digest will be:
10584 *
10585 * digest = SHA1(digest xor SHA1(data))
10586 *
10587 * This function is used every time we want to preserve the order so
10588 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10589 *
10590 * Also note that mixdigest("foo") followed by mixdigest("bar")
10591 * will lead to a different digest compared to "fo", "obar".
10592 */
10593 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10594 SHA1_CTX ctx;
10595 char *s = ptr;
10596
10597 xorDigest(digest,s,len);
10598 SHA1Init(&ctx);
10599 SHA1Update(&ctx,digest,20);
10600 SHA1Final(digest,&ctx);
10601 }
10602
10603 static void mixObjectDigest(unsigned char *digest, robj *o) {
10604 o = getDecodedObject(o);
10605 mixDigest(digest,o->ptr,sdslen(o->ptr));
10606 decrRefCount(o);
10607 }
10608
10609 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10610 * are not ordered, we use a trick: every aggregate digest is the xor
10611 * of the digests of their elements. This way the order will not change
10612 * the result. For list instead we use a feedback entering the output digest
10613 * as input in order to ensure that a different ordered list will result in
10614 * a different digest. */
10615 static void computeDatasetDigest(unsigned char *final) {
10616 unsigned char digest[20];
10617 char buf[128];
10618 dictIterator *di = NULL;
10619 dictEntry *de;
10620 int j;
10621 uint32_t aux;
10622
10623 memset(final,0,20); /* Start with a clean result */
10624
10625 for (j = 0; j < server.dbnum; j++) {
10626 redisDb *db = server.db+j;
10627
10628 if (dictSize(db->dict) == 0) continue;
10629 di = dictGetIterator(db->dict);
10630
10631 /* hash the DB id, so the same dataset moved in a different
10632 * DB will lead to a different digest */
10633 aux = htonl(j);
10634 mixDigest(final,&aux,sizeof(aux));
10635
10636 /* Iterate this DB writing every entry */
10637 while((de = dictNext(di)) != NULL) {
10638 robj *key, *o, *kcopy;
10639 time_t expiretime;
10640
10641 memset(digest,0,20); /* This key-val digest */
10642 key = dictGetEntryKey(de);
10643
10644 if (!server.vm_enabled) {
10645 mixObjectDigest(digest,key);
10646 o = dictGetEntryVal(de);
10647 } else {
10648 /* Don't work with the key directly as when VM is active
10649 * this is unsafe: TODO: fix decrRefCount to check if the
10650 * count really reached 0 to avoid this mess */
10651 kcopy = dupStringObject(key);
10652 mixObjectDigest(digest,kcopy);
10653 o = lookupKeyRead(db,kcopy);
10654 decrRefCount(kcopy);
10655 }
10656 aux = htonl(o->type);
10657 mixDigest(digest,&aux,sizeof(aux));
10658 expiretime = getExpire(db,key);
10659
10660 /* Save the key and associated value */
10661 if (o->type == REDIS_STRING) {
10662 mixObjectDigest(digest,o);
10663 } else if (o->type == REDIS_LIST) {
10664 list *list = o->ptr;
10665 listNode *ln;
10666 listIter li;
10667
10668 listRewind(list,&li);
10669 while((ln = listNext(&li))) {
10670 robj *eleobj = listNodeValue(ln);
10671
10672 mixObjectDigest(digest,eleobj);
10673 }
10674 } else if (o->type == REDIS_SET) {
10675 dict *set = o->ptr;
10676 dictIterator *di = dictGetIterator(set);
10677 dictEntry *de;
10678
10679 while((de = dictNext(di)) != NULL) {
10680 robj *eleobj = dictGetEntryKey(de);
10681
10682 xorObjectDigest(digest,eleobj);
10683 }
10684 dictReleaseIterator(di);
10685 } else if (o->type == REDIS_ZSET) {
10686 zset *zs = o->ptr;
10687 dictIterator *di = dictGetIterator(zs->dict);
10688 dictEntry *de;
10689
10690 while((de = dictNext(di)) != NULL) {
10691 robj *eleobj = dictGetEntryKey(de);
10692 double *score = dictGetEntryVal(de);
10693 unsigned char eledigest[20];
10694
10695 snprintf(buf,sizeof(buf),"%.17g",*score);
10696 memset(eledigest,0,20);
10697 mixObjectDigest(eledigest,eleobj);
10698 mixDigest(eledigest,buf,strlen(buf));
10699 xorDigest(digest,eledigest,20);
10700 }
10701 dictReleaseIterator(di);
10702 } else if (o->type == REDIS_HASH) {
10703 hashIterator *hi;
10704 robj *obj;
10705
10706 hi = hashInitIterator(o);
10707 while (hashNext(hi) != REDIS_ERR) {
10708 unsigned char eledigest[20];
10709
10710 memset(eledigest,0,20);
10711 obj = hashCurrent(hi,REDIS_HASH_KEY);
10712 mixObjectDigest(eledigest,obj);
10713 decrRefCount(obj);
10714 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10715 mixObjectDigest(eledigest,obj);
10716 decrRefCount(obj);
10717 xorDigest(digest,eledigest,20);
10718 }
10719 hashReleaseIterator(hi);
10720 } else {
10721 redisPanic("Unknown object type");
10722 }
10723 /* If the key has an expire, add it to the mix */
10724 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10725 /* We can finally xor the key-val digest to the final digest */
10726 xorDigest(final,digest,20);
10727 }
10728 dictReleaseIterator(di);
10729 }
10730 }
10731
10732 static void debugCommand(redisClient *c) {
10733 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10734 *((char*)-1) = 'x';
10735 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10736 if (rdbSave(server.dbfilename) != REDIS_OK) {
10737 addReply(c,shared.err);
10738 return;
10739 }
10740 emptyDb();
10741 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10742 addReply(c,shared.err);
10743 return;
10744 }
10745 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10746 addReply(c,shared.ok);
10747 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10748 emptyDb();
10749 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10750 addReply(c,shared.err);
10751 return;
10752 }
10753 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10754 addReply(c,shared.ok);
10755 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10756 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10757 robj *key, *val;
10758
10759 if (!de) {
10760 addReply(c,shared.nokeyerr);
10761 return;
10762 }
10763 key = dictGetEntryKey(de);
10764 val = dictGetEntryVal(de);
10765 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10766 key->storage == REDIS_VM_SWAPPING)) {
10767 char *strenc;
10768 char buf[128];
10769
10770 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10771 strenc = strencoding[val->encoding];
10772 } else {
10773 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10774 strenc = buf;
10775 }
10776 addReplySds(c,sdscatprintf(sdsempty(),
10777 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10778 "encoding:%s serializedlength:%lld\r\n",
10779 (void*)key, key->refcount, (void*)val, val->refcount,
10780 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10781 } else {
10782 addReplySds(c,sdscatprintf(sdsempty(),
10783 "+Key at:%p refcount:%d, value swapped at: page %llu "
10784 "using %llu pages\r\n",
10785 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10786 (unsigned long long) key->vm.usedpages));
10787 }
10788 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10789 lookupKeyRead(c->db,c->argv[2]);
10790 addReply(c,shared.ok);
10791 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10792 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10793 robj *key, *val;
10794
10795 if (!server.vm_enabled) {
10796 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10797 return;
10798 }
10799 if (!de) {
10800 addReply(c,shared.nokeyerr);
10801 return;
10802 }
10803 key = dictGetEntryKey(de);
10804 val = dictGetEntryVal(de);
10805 /* If the key is shared we want to create a copy */
10806 if (key->refcount > 1) {
10807 robj *newkey = dupStringObject(key);
10808 decrRefCount(key);
10809 key = dictGetEntryKey(de) = newkey;
10810 }
10811 /* Swap it */
10812 if (key->storage != REDIS_VM_MEMORY) {
10813 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10814 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10815 dictGetEntryVal(de) = NULL;
10816 addReply(c,shared.ok);
10817 } else {
10818 addReply(c,shared.err);
10819 }
10820 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10821 long keys, j;
10822 robj *key, *val;
10823 char buf[128];
10824
10825 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10826 return;
10827 for (j = 0; j < keys; j++) {
10828 snprintf(buf,sizeof(buf),"key:%lu",j);
10829 key = createStringObject(buf,strlen(buf));
10830 if (lookupKeyRead(c->db,key) != NULL) {
10831 decrRefCount(key);
10832 continue;
10833 }
10834 snprintf(buf,sizeof(buf),"value:%lu",j);
10835 val = createStringObject(buf,strlen(buf));
10836 dictAdd(c->db->dict,key,val);
10837 }
10838 addReply(c,shared.ok);
10839 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10840 unsigned char digest[20];
10841 sds d = sdsnew("+");
10842 int j;
10843
10844 computeDatasetDigest(digest);
10845 for (j = 0; j < 20; j++)
10846 d = sdscatprintf(d, "%02x",digest[j]);
10847
10848 d = sdscatlen(d,"\r\n",2);
10849 addReplySds(c,d);
10850 } else {
10851 addReplySds(c,sdsnew(
10852 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10853 }
10854 }
10855
10856 static void _redisAssert(char *estr, char *file, int line) {
10857 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10858 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10859 #ifdef HAVE_BACKTRACE
10860 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10861 *((char*)-1) = 'x';
10862 #endif
10863 }
10864
10865 static void _redisPanic(char *msg, char *file, int line) {
10866 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10867 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10868 #ifdef HAVE_BACKTRACE
10869 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10870 *((char*)-1) = 'x';
10871 #endif
10872 }
10873
10874 /* =================================== Main! ================================ */
10875
10876 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(buf,64,fp) != NULL) value = atoi(buf);
    fclose(fp);
    return value;
}
10890
10891 void linuxOvercommitMemoryWarning(void) {
10892 if (linuxOvercommitMemoryValue() == 0) {
10893 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10894 }
10895 }
10896 #endif /* __linux__ */
10897
10898 static void daemonize(void) {
10899 int fd;
10900 FILE *fp;
10901
10902 if (fork() != 0) exit(0); /* parent exits */
10903 setsid(); /* create a new session */
10904
10905 /* Every output goes to /dev/null. If Redis is daemonized but
10906 * the 'logfile' is set to 'stdout' in the configuration file
10907 * it will not log at all. */
10908 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10909 dup2(fd, STDIN_FILENO);
10910 dup2(fd, STDOUT_FILENO);
10911 dup2(fd, STDERR_FILENO);
10912 if (fd > STDERR_FILENO) close(fd);
10913 }
10914 /* Try to write the pid file */
10915 fp = fopen(server.pidfile,"w");
10916 if (fp) {
10917 fprintf(fp,"%d\n",getpid());
10918 fclose(fp);
10919 }
10920 }
10921
10922 static void version() {
10923 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10924 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
10925 exit(0);
10926 }
10927
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10933
10934 int main(int argc, char **argv) {
10935 time_t start;
10936
10937 initServerConfig();
10938 if (argc == 2) {
10939 if (strcmp(argv[1], "-v") == 0 ||
10940 strcmp(argv[1], "--version") == 0) version();
10941 if (strcmp(argv[1], "--help") == 0) usage();
10942 resetServerSaveParams();
10943 loadServerConfig(argv[1]);
10944 } else if ((argc > 2)) {
10945 usage();
10946 } else {
10947 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10948 }
10949 if (server.daemonize) daemonize();
10950 initServer();
10951 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10952 #ifdef __linux__
10953 linuxOvercommitMemoryWarning();
10954 #endif
10955 start = time(NULL);
10956 if (server.appendonly) {
10957 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10958 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10959 } else {
10960 if (rdbLoad(server.dbfilename) == REDIS_OK)
10961 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10962 }
10963 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10964 aeSetBeforeSleepProc(server.el,beforeSleep);
10965 aeMain(server.el);
10966 aeDeleteEventLoop(server.el);
10967 return 0;
10968 }
10969
10970 /* ============================= Backtrace support ========================= */
10971
10972 #ifdef HAVE_BACKTRACE
10973 static char *findFuncName(void *pointer, unsigned long *offset);
10974
10975 static void *getMcontextEip(ucontext_t *uc) {
10976 #if defined(__FreeBSD__)
10977 return (void*) uc->uc_mcontext.mc_eip;
10978 #elif defined(__dietlibc__)
10979 return (void*) uc->uc_mcontext.eip;
10980 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10981 #if __x86_64__
10982 return (void*) uc->uc_mcontext->__ss.__rip;
10983 #else
10984 return (void*) uc->uc_mcontext->__ss.__eip;
10985 #endif
10986 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10987 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10988 return (void*) uc->uc_mcontext->__ss.__rip;
10989 #else
10990 return (void*) uc->uc_mcontext->__ss.__eip;
10991 #endif
10992 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10993 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10994 #elif defined(__ia64__) /* Linux IA64 */
10995 return (void*) uc->uc_mcontext.sc_ip;
10996 #else
10997 return NULL;
10998 #endif
10999 }
11000
11001 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11002 void *trace[100];
11003 char **messages = NULL;
11004 int i, trace_size = 0;
11005 unsigned long offset=0;
11006 ucontext_t *uc = (ucontext_t*) secret;
11007 sds infostring;
11008 REDIS_NOTUSED(info);
11009
11010 redisLog(REDIS_WARNING,
11011 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11012 infostring = genRedisInfoString();
11013 redisLog(REDIS_WARNING, "%s",infostring);
11014 /* It's not safe to sdsfree() the returned string under memory
11015 * corruption conditions. Let it leak as we are going to abort */
11016
11017 trace_size = backtrace(trace, 100);
11018 /* overwrite sigaction with caller's address */
11019 if (getMcontextEip(uc) != NULL) {
11020 trace[1] = getMcontextEip(uc);
11021 }
11022 messages = backtrace_symbols(trace, trace_size);
11023
11024 for (i=1; i<trace_size; ++i) {
11025 char *fn = findFuncName(trace[i], &offset), *p;
11026
11027 p = strchr(messages[i],'+');
11028 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11029 redisLog(REDIS_WARNING,"%s", messages[i]);
11030 } else {
11031 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11032 }
11033 }
11034 /* free(messages); Don't call free() with possibly corrupted memory. */
11035 _exit(0);
11036 }
11037
11038 static void sigtermHandler(int sig) {
11039 REDIS_NOTUSED(sig);
11040
11041 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11042 server.shutdown_asap = 1;
11043 }
11044
11045 static void setupSigSegvAction(void) {
11046 struct sigaction act;
11047
11048 sigemptyset (&act.sa_mask);
11049 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11050 * is used. Otherwise, sa_handler is used */
11051 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11052 act.sa_sigaction = segvHandler;
11053 sigaction (SIGSEGV, &act, NULL);
11054 sigaction (SIGBUS, &act, NULL);
11055 sigaction (SIGFPE, &act, NULL);
11056 sigaction (SIGILL, &act, NULL);
11057 sigaction (SIGBUS, &act, NULL);
11058
11059 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11060 act.sa_handler = sigtermHandler;
11061 sigaction (SIGTERM, &act, NULL);
11062 return;
11063 }
11064
11065 #include "staticsymbols.h"
11066 /* This function try to convert a pointer into a function name. It's used in
11067 * oreder to provide a backtrace under segmentation fault that's able to
11068 * display functions declared as static (otherwise the backtrace is useless). */
11069 static char *findFuncName(void *pointer, unsigned long *offset){
11070 int i, ret = -1;
11071 unsigned long off, minoff = 0;
11072
11073 /* Try to match against the Symbol with the smallest offset */
11074 for (i=0; symsTable[i].pointer; i++) {
11075 unsigned long lp = (unsigned long) pointer;
11076
11077 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11078 off=lp-symsTable[i].pointer;
11079 if (ret < 0 || off < minoff) {
11080 minoff=off;
11081 ret=i;
11082 }
11083 }
11084 }
11085 if (ret == -1) return NULL;
11086 *offset = minoff;
11087 return symsTable[ret].name;
11088 }
11089 #else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: no signal handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
11092 #endif /* HAVE_BACKTRACE */
11093
11094
11095
11096 /* The End */
11097
11098
11099