]> git.saurik.com Git - redis.git/blob - redis.c
08fcd978c4ee297a864377de3dd7f8d52554586b
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.0" /* Redis version string */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
81 /* Error codes */
82 #define REDIS_OK 0
83 #define REDIS_ERR -1
84
85 /* Static server configuration */
86 #define REDIS_SERVERPORT 6379 /* TCP port */
87 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
88 #define REDIS_IOBUF_LEN 1024
89 #define REDIS_LOADBUF_LEN 1024
90 #define REDIS_STATIC_ARGS 8
91 #define REDIS_DEFAULT_DBNUM 16
92 #define REDIS_CONFIGLINE_MAX 1024
93 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
95 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
96 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
97 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100 #define REDIS_WRITEV_THRESHOLD 3
101 /* Max number of iovecs used for each writev call */
102 #define REDIS_WRITEV_IOVEC_COUNT 256
103
104 /* Hash table parameters */
105 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106
107 /* Command flags (distinct bit values, combined with | in the command table) */
108 #define REDIS_CMD_BULK 1 /* Bulk write command */
109 #define REDIS_CMD_INLINE 2 /* Inline command */
110 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114 #define REDIS_CMD_DENYOOM 4
115 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
117 /* Object types (value of the 'type' field of a Redis object) */
118 #define REDIS_STRING 0
119 #define REDIS_LIST 1
120 #define REDIS_SET 2
121 #define REDIS_ZSET 3
122 #define REDIS_HASH 4
123
124 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
127 #define REDIS_ENCODING_RAW 0 /* Raw representation */
128 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
129 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
131
/* Printable names of the object encodings. The entries are listed in the
 * same order as the REDIS_ENCODING_* values 0..3 (raw, int, zipmap, ht). */
static char *strencoding[] = {
    "raw",
    "int",
    "zipmap",
    "hashtable"
};
135
136 /* Object types only used for dumping to disk */
137 #define REDIS_EXPIRETIME 253
138 #define REDIS_SELECTDB 254
139 #define REDIS_EOF 255
140
141 /* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specifies the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
151 *
152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
154 #define REDIS_RDB_6BITLEN 0
155 #define REDIS_RDB_14BITLEN 1
156 #define REDIS_RDB_32BITLEN 2
157 #define REDIS_RDB_ENCVAL 3
158 #define REDIS_RDB_LENERR UINT_MAX
159
160 /* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * according to the following defines: */
163 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
166 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
167
168 /* Virtual memory object->where field. */
169 #define REDIS_VM_MEMORY 0 /* The object is on memory */
170 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
171 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
174 /* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about these magic numbers. */
176 #define REDIS_VM_MAX_NEAR_PAGES 65536
177 #define REDIS_VM_MAX_RANDOM_JUMP 4096
178 #define REDIS_VM_MAX_THREADS 32
179 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
180 /* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
184 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
186 /* Client flags (distinct bit values, OR-ed together in the client 'flags' field) */
187 #define REDIS_SLAVE 1 /* This client is a slave server */
188 #define REDIS_MASTER 2 /* This client is a master server */
189 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190 #define REDIS_MULTI 8 /* This client is in a MULTI context */
191 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
193 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
194
195 /* Slave replication state - slave side */
196 #define REDIS_REPL_NONE 0 /* No active replication */
197 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
198 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
199
200 /* Slave replication state - from the point of view of master
201 * Note that in SEND_BULK and ONLINE state the slave receives new updates
202 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
203 * to start the next background saving in order to send updates to it. */
204 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
205 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
206 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
207 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
209 /* List related stuff */
210 #define REDIS_HEAD 0
211 #define REDIS_TAIL 1
212
213 /* Sort operations */
214 #define REDIS_SORT_GET 0
215 #define REDIS_SORT_ASC 1
216 #define REDIS_SORT_DESC 2
217 #define REDIS_SORTKEY_MAX 1024
218
219 /* Log levels */
220 #define REDIS_DEBUG 0
221 #define REDIS_VERBOSE 1
222 #define REDIS_NOTICE 2
223 #define REDIS_WARNING 3
224
225 /* Anti-warning macro: evaluates V and discards the result as void, so a
 * deliberately unused parameter or variable counts as "used". */
226 #define REDIS_NOTUSED(V) ((void) V)
227
228 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
229 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
230
231 /* Append only defines */
232 #define APPENDFSYNC_NO 0
233 #define APPENDFSYNC_ALWAYS 1
234 #define APPENDFSYNC_EVERYSEC 2
235
236 /* Hashes related defaults */
237 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
238 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): if the expression _e is false, pass its source text, the
 * current file and the current line to _redisAssert() and then terminate the
 * process with _exit(1). If _e is true the macro is a void no-op.
 *
 * redisPanic(_e): unconditionally pass the stringified argument, file and
 * line to _redisPanic() and then terminate with _exit(1). The argument is
 * stringified with '#', so a string literal is reported with its quotes.
 *
 * Both replacement lists are fully parenthesized so that each macro expands
 * to one single expression regardless of the surrounding context. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
246 /*================================= Data types ============================== */
247
248 /* A redis object, that is a type able to hold a string / list / set */
249
250 /* The VM object structure: swap file position and last access time */
251 struct redisObjectVM {
252 off_t page; /* the page at which the object is stored on disk */
253 off_t usedpages; /* number of pages used on disk */
254 time_t atime; /* Last access time */
255 } vm; /* NOTE(review): this also defines a file-scope variable named 'vm' - confirm it is intended */
256
257 /* The actual Redis Object */
258 typedef struct redisObject {
259 void *ptr; /* Pointer to the object payload */
260 unsigned char type; /* One of the object types (REDIS_STRING, REDIS_LIST, ...) */
261 unsigned char encoding; /* One of the REDIS_ENCODING_* values */
262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
266 int refcount;
267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
272 } robj;
273
/* Macro used to initialize a Redis string object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var: the object itself (an lvalue, not a pointer to it).
 * _ptr: value stored in the object's 'ptr' field.
 *
 * The object gets refcount 1, type REDIS_STRING and encoding
 * REDIS_ENCODING_RAW. The 'storage' field is set to REDIS_VM_MEMORY only
 * when server.vm_enabled is true, otherwise it is left untouched.
 *
 * The expansion is a do { ... } while(0) WITHOUT a trailing semicolon, so
 * the caller's own ';' completes the statement and the macro is safe inside
 * an unbraced if/else. Arguments are parenthesized against precedence
 * surprises. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
286 typedef struct redisDb { /* State of a single database */
287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
290 dict *io_keys; /* Keys with clients waiting for VM I/O */
291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
292 int id;
293 } redisDb;
294
295 /* Client MULTI/EXEC state */
296 typedef struct multiCmd { /* One command stored in a MULTI state */
297 robj **argv; /* Argument vector of the command */
298 int argc; /* Number of arguments in argv */
299 struct redisCommand *cmd; /* Command table entry for this command */
300 } multiCmd;
301
302 typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305 } multiState;
306
307 /* With multiplexing we need to take per-client state.
308 * Clients are taken in a linked list. */
309 typedef struct redisClient {
310 int fd;
311 redisDb *db;
312 int dictid;
313 sds querybuf;
314 robj **argv, **mbargv;
315 int argc, mbargc;
316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
317 int multibulk; /* multi bulk command format active */
318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
326 long repldboff; /* replication DB file offset */
327 off_t repldbsize; /* replication DB file size */
328 multiState mstate; /* MULTI/EXEC state */
329 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
330 * operation such as BLPOP. Otherwise NULL. */
331 int blocking_keys_num; /* Number of blocking keys */
332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
339 } redisClient;
340
341 struct saveparam { /* One entry of the redisServer 'saveparams' array */
342 time_t seconds;
343 int changes;
344 };
345
346 /* Global server state structure */
347 struct redisServer {
348 int port;
349 int fd;
350 redisDb *db;
351 long long dirty; /* changes to DB from the last save */
352 list *clients;
353 list *slaves, *monitors;
354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function ran */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of the last successful save */
359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
363 long long stat_expiredkeys; /* number of expired keys */
364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
370 int appendonly;
371 int appendfsync;
372 int shutdown_asap;
373 time_t lastfsync;
374 int appendfd;
375 int appendseldb;
376 char *pidfile;
377 pid_t bgsavechildpid;
378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
380 sds aofbuf; /* AOF buffer, written before entering the event loop */
381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
386 char *appendfilename;
387 char *requirepass;
388 int rdbcompression;
389 int activerehashing;
390 /* Replication related */
391 int isslave;
392 char *masterauth;
393 char *masterhost;
394 int masterport;
395 redisClient *master; /* client that is master for this slave */
396 int replstate;
397 unsigned int maxclients;
398 unsigned long long maxmemory;
399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
406 /* Virtual memory configuration */
407 int vm_enabled;
408 char *vm_swap_file;
409 off_t vm_page_size;
410 off_t vm_pages;
411 unsigned long long vm_max_memory;
412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
421 time_t unixtime; /* Unix time sampled every second. */
422 /* Virtual memory I/O threads stuff */
423 /* An I/O thread processes an element taken from the io_newjobs queue and
424 * puts the result of the operation in the io_processed list. While the
425 * job is being processed, it's put on the io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
436 /* Our main thread is blocked on the event loop, looking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The following are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
447 /* Pubsub */
448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
450 /* Misc */
451 FILE *devnull;
452 };
453
454 typedef struct pubsubPattern { /* Pairs a client with a pattern object */
455 redisClient *client;
456 robj *pattern;
457 } pubsubPattern;
458
459 typedef void redisCommandProc(redisClient *c); /* Signature of a command implementation */
460 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); /* Signature of a VM preload callback */
461 struct redisCommand { /* One entry of the command table */
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags; /* REDIS_CMD_* flags OR-ed together */
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
469 redisVmPreloadProc *vm_preload_proc;
470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* The last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
474 };
475
476 struct redisFunctionSym { /* Pairs a name with an unsigned long 'pointer' value */
477 char *name;
478 unsigned long pointer;
479 };
480
481 typedef struct _redisSortObject { /* An object plus the value it is compared by */
482 robj *obj;
483 union { /* Only one of the two members is meaningful at a time */
484 double score;
485 robj *cmpobj;
486 } u;
487 } redisSortObject;
488
489 typedef struct _redisSortOperation {
490 int type; /* presumably one of the REDIS_SORT_* values - TODO confirm */
491 robj *pattern;
492 } redisSortOperation;
493
494 /* ZSETs use a specialized version of Skiplists */
495
496 typedef struct zskiplistNode {
497 struct zskiplistNode **forward; /* presumably one forward pointer per level - TODO confirm */
498 struct zskiplistNode *backward;
499 unsigned int *span; /* presumably one span per level - TODO confirm */
500 double score;
501 robj *obj;
502 } zskiplistNode;
503
504 typedef struct zskiplist {
505 struct zskiplistNode *header, *tail;
506 unsigned long length;
507 int level;
508 } zskiplist;
509
510 typedef struct zset { /* A sorted set: a dict together with a skiplist */
511 dict *dict;
512 zskiplist *zsl;
513 } zset;
514
515 /* Our shared "common" objects. The struct definition below also defines
 * the global variable 'shared' that holds them. */
516
517 #define REDIS_SHARED_INTEGERS 10000 /* Number of entries in shared.integers[] */
518 struct sharedObjectsStruct {
519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
520 *colon, *nullbulk, *nullmultibulk, *queued,
521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
523 *select0, *select1, *select2, *select3, *select4,
524 *select5, *select6, *select7, *select8, *select9,
525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
528 } shared;
529
530 /* Global vars that are actually used as constants. The following double
531 * values are used for double on-disk serialization, and are initialized
532 * at runtime to avoid strange compiler optimizations. */
533
534 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
535
536 /* VM threaded I/O request message */
537 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
540 typedef struct iojob {
541 int type; /* Request type, REDIS_IOJOB_* */
542 redisDb *db;/* Redis database */
543 robj *key; /* This I/O request is about swapping this key */
544 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
545 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550 } iojob;
551
552 /*================================ Prototypes =============================== */
553
554 static void freeStringObject(robj *o);
555 static void freeListObject(robj *o);
556 static void freeSetObject(robj *o);
557 static void decrRefCount(void *o);
558 static robj *createObject(int type, void *ptr);
559 static void freeClient(redisClient *c);
560 static int rdbLoad(char *filename);
561 static void addReply(redisClient *c, robj *obj);
562 static void addReplySds(redisClient *c, sds s);
563 static void incrRefCount(robj *o);
564 static int rdbSaveBackground(char *filename);
565 static robj *createStringObject(char *ptr, size_t len);
566 static robj *dupStringObject(robj *o);
567 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
568 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
569 static void flushAppendOnlyFile(void);
570 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
571 static int syncWithMaster(void);
572 static robj *tryObjectEncoding(robj *o);
573 static robj *getDecodedObject(robj *o);
574 static int removeExpire(redisDb *db, robj *key);
575 static int expireIfNeeded(redisDb *db, robj *key);
576 static int deleteIfVolatile(redisDb *db, robj *key);
577 static int deleteIfSwapped(redisDb *db, robj *key);
578 static int deleteKey(redisDb *db, robj *key);
579 static time_t getExpire(redisDb *db, robj *key);
580 static int setExpire(redisDb *db, robj *key, time_t when);
581 static void updateSlavesWaitingBgsave(int bgsaveerr);
582 static void freeMemoryIfNeeded(void);
583 static int processCommand(redisClient *c);
584 static void setupSigSegvAction(void);
585 static void rdbRemoveTempFile(pid_t childpid);
586 static void aofRemoveTempFile(pid_t childpid);
587 static size_t stringObjectLen(robj *o);
588 static void processInputBuffer(redisClient *c);
589 static zskiplist *zslCreate(void);
590 static void zslFree(zskiplist *zsl);
591 static void zslInsert(zskiplist *zsl, double score, robj *obj);
592 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
593 static void initClientMultiState(redisClient *c);
594 static void freeClientMultiState(redisClient *c);
595 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
596 static void unblockClientWaitingData(redisClient *c);
597 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
598 static void vmInit(void);
599 static void vmMarkPagesFree(off_t page, off_t count);
600 static robj *vmLoadObject(robj *key);
601 static robj *vmPreviewObject(robj *key);
602 static int vmSwapOneObjectBlocking(void);
603 static int vmSwapOneObjectThreaded(void);
604 static int vmCanSwapOut(void);
605 static int tryFreeOneObjectFromFreelist(void);
606 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608 static void vmCancelThreadedIOJob(robj *o);
609 static void lockThreadedIO(void);
610 static void unlockThreadedIO(void);
611 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612 static void freeIOJob(iojob *j);
613 static void queueIOJob(iojob *j);
614 static int vmWriteObjectOnSwap(robj *o, off_t page);
615 static robj *vmReadObjectFromSwap(off_t page, int type);
616 static void waitEmptyIOJobsQueue(void);
617 static void vmReopenSwapFile(void);
618 static int vmFreePage(off_t page);
619 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
620 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
621 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
622 static int dontWaitForSwappedKey(redisClient *c, robj *key);
623 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625 static struct redisCommand *lookupCommand(char *name);
626 static void call(redisClient *c, struct redisCommand *cmd);
627 static void resetClient(redisClient *c);
628 static void convertToRealHash(robj *o);
629 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631 static void freePubsubPattern(void *p);
632 static int listMatchPubsubPattern(void *a, void *b);
633 static int compareStringObjects(robj *a, robj *b);
634 static int equalStringObjects(robj *a, robj *b);
635 static void usage();
636 static int rewriteAppendOnlyFileBackground(void);
637 static int vmSwapObjectBlocking(robj *key, robj *val);
638 static int prepareForShutdown();
639 static void touchWatchedKey(redisDb *db, robj *key);
640 static void unwatchAllKeys(redisClient *c);
641
/* Command implementations. Every function below has the redisCommandProc
 * signature (void, taking the client) so it can be stored in the 'proc'
 * field of a command table entry.
 * NOTE(review): rpoplpushcommand is spelled with a lowercase 'c', unlike
 * all of its siblings. */
642 static void authCommand(redisClient *c);
643 static void pingCommand(redisClient *c);
644 static void echoCommand(redisClient *c);
645 static void setCommand(redisClient *c);
646 static void setnxCommand(redisClient *c);
647 static void setexCommand(redisClient *c);
648 static void getCommand(redisClient *c);
649 static void delCommand(redisClient *c);
650 static void existsCommand(redisClient *c);
651 static void incrCommand(redisClient *c);
652 static void decrCommand(redisClient *c);
653 static void incrbyCommand(redisClient *c);
654 static void decrbyCommand(redisClient *c);
655 static void selectCommand(redisClient *c);
656 static void randomkeyCommand(redisClient *c);
657 static void keysCommand(redisClient *c);
658 static void dbsizeCommand(redisClient *c);
659 static void lastsaveCommand(redisClient *c);
660 static void saveCommand(redisClient *c);
661 static void bgsaveCommand(redisClient *c);
662 static void bgrewriteaofCommand(redisClient *c);
663 static void shutdownCommand(redisClient *c);
664 static void moveCommand(redisClient *c);
665 static void renameCommand(redisClient *c);
666 static void renamenxCommand(redisClient *c);
667 static void lpushCommand(redisClient *c);
668 static void rpushCommand(redisClient *c);
669 static void lpopCommand(redisClient *c);
670 static void rpopCommand(redisClient *c);
671 static void llenCommand(redisClient *c);
672 static void lindexCommand(redisClient *c);
673 static void lrangeCommand(redisClient *c);
674 static void ltrimCommand(redisClient *c);
675 static void typeCommand(redisClient *c);
676 static void lsetCommand(redisClient *c);
677 static void saddCommand(redisClient *c);
678 static void sremCommand(redisClient *c);
679 static void smoveCommand(redisClient *c);
680 static void sismemberCommand(redisClient *c);
681 static void scardCommand(redisClient *c);
682 static void spopCommand(redisClient *c);
683 static void srandmemberCommand(redisClient *c);
684 static void sinterCommand(redisClient *c);
685 static void sinterstoreCommand(redisClient *c);
686 static void sunionCommand(redisClient *c);
687 static void sunionstoreCommand(redisClient *c);
688 static void sdiffCommand(redisClient *c);
689 static void sdiffstoreCommand(redisClient *c);
690 static void syncCommand(redisClient *c);
691 static void flushdbCommand(redisClient *c);
692 static void flushallCommand(redisClient *c);
693 static void sortCommand(redisClient *c);
694 static void lremCommand(redisClient *c);
695 static void rpoplpushcommand(redisClient *c);
696 static void infoCommand(redisClient *c);
697 static void mgetCommand(redisClient *c);
698 static void monitorCommand(redisClient *c);
699 static void expireCommand(redisClient *c);
700 static void expireatCommand(redisClient *c);
701 static void getsetCommand(redisClient *c);
702 static void ttlCommand(redisClient *c);
703 static void slaveofCommand(redisClient *c);
704 static void debugCommand(redisClient *c);
705 static void msetCommand(redisClient *c);
706 static void msetnxCommand(redisClient *c);
707 static void zaddCommand(redisClient *c);
708 static void zincrbyCommand(redisClient *c);
709 static void zrangeCommand(redisClient *c);
710 static void zrangebyscoreCommand(redisClient *c);
711 static void zcountCommand(redisClient *c);
712 static void zrevrangeCommand(redisClient *c);
713 static void zcardCommand(redisClient *c);
714 static void zremCommand(redisClient *c);
715 static void zscoreCommand(redisClient *c);
716 static void zremrangebyscoreCommand(redisClient *c);
717 static void multiCommand(redisClient *c);
718 static void execCommand(redisClient *c);
719 static void discardCommand(redisClient *c);
720 static void blpopCommand(redisClient *c);
721 static void brpopCommand(redisClient *c);
722 static void appendCommand(redisClient *c);
723 static void substrCommand(redisClient *c);
724 static void zrankCommand(redisClient *c);
725 static void zrevrankCommand(redisClient *c);
726 static void hsetCommand(redisClient *c);
727 static void hsetnxCommand(redisClient *c);
728 static void hgetCommand(redisClient *c);
729 static void hmsetCommand(redisClient *c);
730 static void hmgetCommand(redisClient *c);
731 static void hdelCommand(redisClient *c);
732 static void hlenCommand(redisClient *c);
733 static void zremrangebyrankCommand(redisClient *c);
734 static void zunionstoreCommand(redisClient *c);
735 static void zinterstoreCommand(redisClient *c);
736 static void hkeysCommand(redisClient *c);
737 static void hvalsCommand(redisClient *c);
738 static void hgetallCommand(redisClient *c);
739 static void hexistsCommand(redisClient *c);
740 static void configCommand(redisClient *c);
741 static void hincrbyCommand(redisClient *c);
742 static void subscribeCommand(redisClient *c);
743 static void unsubscribeCommand(redisClient *c);
744 static void psubscribeCommand(redisClient *c);
745 static void punsubscribeCommand(redisClient *c);
746 static void publishCommand(redisClient *c);
747 static void watchCommand(redisClient *c);
748 static void unwatchCommand(redisClient *c);
749
750 /*================================= Globals ================================= */
751
752 /* Global vars */
753 static struct redisServer server; /* server global state */
/* Table of the commands known to the server, one initializer per command.
 *
 * Every entry holds eight values, in this order: the command name, the
 * function implementing it, an integer, a bitmask of REDIS_CMD_* flags,
 * a function pointer (NULL for most commands, a ...BlockClientOnSwappedKeys
 * function for zunionstore, zinterstore and exec) and three more integers.
 * The table is terminated by an entry whose name and function are NULL.
 *
 * NOTE(review): the first integer looks like the command arity, a negative
 * value presumably meaning "at least N arguments", and the last three look
 * like first key / last key / key step used to preload swapped keys --
 * confirm against the definition of struct redisCommand.
 * NOTE(review): "smembers" is served by sinterCommand; presumably intended
 * (intersection of a single set) -- confirm. */
754 static struct redisCommand cmdTable[] = {
755 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
757 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
758 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
759 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
760 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
762 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
764 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
765 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
766 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
775 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
778 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
779 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
781 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
782 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
787 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
788 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
789 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
790 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
791 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
792 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
796 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
799 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
800 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
806 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
807 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
810 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
811 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
814 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
820 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
821 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
824 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
825 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
837 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
843 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
845 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
850 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
853 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
856 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
861 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {NULL,NULL,0,0,NULL,0,0,0}
864 };
865
866 /*============================ Utility functions ============================ */
867
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading '^' negates the set and
 *           "a-z" inside the brackets is an inclusive range
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non zero, bytes are compared after tolower().
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * No byte past the given lengths is ever read, so the two buffers are
 * not required to be nul terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* A run of '*' is the same as a single one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may still match. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
990
/* Glob-style match of two nul terminated strings: a thin wrapper that
 * measures both arguments with strlen() and hands them to
 * stringmatchlen(). Returns what stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
994
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-', a run of decimal digits and an
 * optional unit, matched case insensitively: "b" (bytes), "k", "m", "g"
 * (powers of 1000) or "kb", "mb", "gb" (powers of 1024).
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * The errors are:
 *  - an unknown unit: the number is returned unscaled;
 *  - more digits than the internal buffer can hold: LLONG_MAX is returned;
 *  - a result that does not fit in a long long: the return value saturates
 *    at LLONG_MAX or LLONG_MIN.
 *
 * Note: errno is modified by this function. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* The cast is needed: isdigit() is undefined for negative values. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value for us. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check that val*mul can be represented. */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1041
/* Convert a long long into a string.
 *
 * At most len-1 characters are stored in 's', followed by a nul term.
 * If the buffer is too small to hold the whole number only its leading
 * characters are stored. The return value is the number of characters
 * actually stored, not counting the nul term. If len is zero nothing is
 * written and zero is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value would overflow for the most
     * negative long long. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1065
1066 static void redisLog(int level, const char *fmt, ...) {
1067 va_list ap;
1068 FILE *fp;
1069
1070 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1071 if (!fp) return;
1072
1073 va_start(ap, fmt);
1074 if (level >= server.verbosity) {
1075 char *c = ".-*#";
1076 char buf[64];
1077 time_t now;
1078
1079 now = time(NULL);
1080 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1081 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1082 vfprintf(fp, fmt, ap);
1083 fprintf(fp,"\n");
1084 fflush(fp);
1085 }
1086 va_end(ap);
1087
1088 if (server.logfile) fclose(fp);
1089 }
1090
1091 /*====================== Hash table type implementation ==================== */
1092
1093 /* This is a hash table type that uses the SDS dynamic strings library as
1094 * keys and redis objects as values (objects can hold SDS strings,
1095 * lists, sets). */
1096
/* Destructor callback that releases 'val' with zfree().
 * The private data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1102
1103 static void dictListDestructor(void *privdata, void *val)
1104 {
1105 DICT_NOTUSED(privdata);
1106 listRelease((list*)val);
1107 }
1108
1109 static int sdsDictKeyCompare(void *privdata, const void *key1,
1110 const void *key2)
1111 {
1112 int l1,l2;
1113 DICT_NOTUSED(privdata);
1114
1115 l1 = sdslen((sds)key1);
1116 l2 = sdslen((sds)key2);
1117 if (l1 != l2) return 0;
1118 return memcmp(key1, key2, l1) == 0;
1119 }
1120
/* Destructor callback for redis objects: drops one reference to 'val'
 * with decrRefCount(). The private data pointer is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL)
        decrRefCount(val);
}
1128
1129 static int dictObjKeyCompare(void *privdata, const void *key1,
1130 const void *key2)
1131 {
1132 const robj *o1 = key1, *o2 = key2;
1133 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1134 }
1135
1136 static unsigned int dictObjHash(const void *key) {
1137 const robj *o = key;
1138 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1139 }
1140
1141 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1142 const void *key2)
1143 {
1144 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1145 int cmp;
1146
1147 if (o1->encoding == REDIS_ENCODING_INT &&
1148 o2->encoding == REDIS_ENCODING_INT)
1149 return o1->ptr == o2->ptr;
1150
1151 o1 = getDecodedObject(o1);
1152 o2 = getDecodedObject(o2);
1153 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1154 decrRefCount(o1);
1155 decrRefCount(o2);
1156 return cmp;
1157 }
1158
1159 static unsigned int dictEncObjHash(const void *key) {
1160 robj *o = (robj*) key;
1161
1162 if (o->encoding == REDIS_ENCODING_RAW) {
1163 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1164 } else {
1165 if (o->encoding == REDIS_ENCODING_INT) {
1166 char buf[32];
1167 int len;
1168
1169 len = ll2string(buf,32,(long)o->ptr);
1170 return dictGenHashFunction((unsigned char*)buf, len);
1171 } else {
1172 unsigned int hash;
1173
1174 o = getDecodedObject(o);
1175 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1176 decrRefCount(o);
1177 return hash;
1178 }
1179 }
1180 }
1181
/* The dictType tables below bind the callbacks defined above to the
 * different kinds of hash tables used by the server. All of them list
 * their six slots in the same order, as annotated on every line: hash
 * function, key dup, val dup, key compare, key destructor, val destructor.
 * A NULL slot means that no callback is provided for that operation. */

1182 /* Sets type and expires */
/* NOTE(review): keyptrDictType below is the one labeled "Db->expires";
 * confirm whether this table is still used for expires as well. */
1183 static dictType setDictType = {
1184 dictEncObjHash, /* hash function */
1185 NULL, /* key dup */
1186 NULL, /* val dup */
1187 dictEncObjKeyCompare, /* key compare */
1188 dictRedisObjectDestructor, /* key destructor */
1189 NULL /* val destructor */
1190 };
1191
1192 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1193 static dictType zsetDictType = {
1194 dictEncObjHash, /* hash function */
1195 NULL, /* key dup */
1196 NULL, /* val dup */
1197 dictEncObjKeyCompare, /* key compare */
1198 dictRedisObjectDestructor, /* key destructor */
1199 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1200 };
1201
1202 /* Db->dict */
1203 static dictType dbDictType = {
1204 dictObjHash, /* hash function */
1205 NULL, /* key dup */
1206 NULL, /* val dup */
1207 dictObjKeyCompare, /* key compare */
1208 dictRedisObjectDestructor, /* key destructor */
1209 dictRedisObjectDestructor /* val destructor */
1210 };
1211
1212 /* Db->expires */
1213 static dictType keyptrDictType = {
1214 dictObjHash, /* hash function */
1215 NULL, /* key dup */
1216 NULL, /* val dup */
1217 dictObjKeyCompare, /* key compare */
1218 dictRedisObjectDestructor, /* key destructor */
1219 NULL /* val destructor */
1220 };
1221
1222 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1223 static dictType hashDictType = {
1224 dictEncObjHash, /* hash function */
1225 NULL, /* key dup */
1226 NULL, /* val dup */
1227 dictEncObjKeyCompare, /* key compare */
1228 dictRedisObjectDestructor, /* key destructor */
1229 dictRedisObjectDestructor /* val destructor */
1230 };
1231
1232 /* Keylist hash table type has unencoded redis objects as keys and
1233 * lists as values. It's used for blocking operations (BLPOP) and to
1234 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1235 static dictType keylistDictType = {
1236 dictObjHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictObjKeyCompare, /* key compare */
1240 dictRedisObjectDestructor, /* key destructor */
1241 dictListDestructor /* val destructor */
1242 };
1243
/* Forward declaration: version() is defined elsewhere in this file. */
1244 static void version();
1245
1246 /* ========================= Random utility functions ======================= */
1247
1248 /* Redis generally does not try to recover from out of memory conditions
1249 * when allocating objects or strings, it is not clear if it will be possible
1250 * to report this condition to the client since the networking layer itself
1251 * is based on heap allocation for send buffers, so we simply abort.
1252 * At least the code will be simpler to read... */
1253 static void oom(const char *msg) {
1254 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1255 sleep(1);
1256 abort();
1257 }
1258
1259 /* ====================== Redis server networking stuff ===================== */
1260 static void closeTimedoutClients(void) {
1261 redisClient *c;
1262 listNode *ln;
1263 time_t now = time(NULL);
1264 listIter li;
1265
1266 listRewind(server.clients,&li);
1267 while ((ln = listNext(&li)) != NULL) {
1268 c = listNodeValue(ln);
1269 if (server.maxidletime &&
1270 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1271 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1272 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1273 listLength(c->pubsub_patterns) == 0 &&
1274 (now - c->lastinteraction > server.maxidletime))
1275 {
1276 redisLog(REDIS_VERBOSE,"Closing idle client");
1277 freeClient(c);
1278 } else if (c->flags & REDIS_BLOCKED) {
1279 if (c->blockingto != 0 && c->blockingto < now) {
1280 addReply(c,shared.nullmultibulk);
1281 unblockClientWaitingData(c);
1282 }
1283 }
1284 }
1285 }
1286
1287 static int htNeedsResize(dict *dict) {
1288 long long size, used;
1289
1290 size = dictSlots(dict);
1291 used = dictSize(dict);
1292 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1293 (used*100/size < REDIS_HT_MINFILL));
1294 }
1295
1296 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1297 * we resize the hash table to save memory */
1298 static void tryResizeHashTables(void) {
1299 int j;
1300
1301 for (j = 0; j < server.dbnum; j++) {
1302 if (htNeedsResize(server.db[j].dict))
1303 dictResize(server.db[j].dict);
1304 if (htNeedsResize(server.db[j].expires))
1305 dictResize(server.db[j].expires);
1306 }
1307 }
1308
1309 /* Our hash table implementation performs rehashing incrementally while
1310 * we write/read from the hash table. Still if the server is idle, the hash
1311 * table will use two tables for a long time. So we try to use 1 millisecond
1312 * of CPU time at every serverCron() loop in order to rehash some key. */
1313 static void incrementallyRehash(void) {
1314 int j;
1315
1316 for (j = 0; j < server.dbnum; j++) {
1317 if (dictIsRehashing(server.db[j].dict)) {
1318 dictRehashMilliseconds(server.db[j].dict,1);
1319 break; /* already used our millisecond for this loop... */
1320 }
1321 }
1322 }
1323
1324 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1325 void backgroundSaveDoneHandler(int statloc) {
1326 int exitcode = WEXITSTATUS(statloc);
1327 int bysignal = WIFSIGNALED(statloc);
1328
1329 if (!bysignal && exitcode == 0) {
1330 redisLog(REDIS_NOTICE,
1331 "Background saving terminated with success");
1332 server.dirty = 0;
1333 server.lastsave = time(NULL);
1334 } else if (!bysignal && exitcode != 0) {
1335 redisLog(REDIS_WARNING, "Background saving error");
1336 } else {
1337 redisLog(REDIS_WARNING,
1338 "Background saving terminated by signal %d", WTERMSIG(statloc));
1339 rdbRemoveTempFile(server.bgsavechildpid);
1340 }
1341 server.bgsavechildpid = -1;
1342 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1343 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1344 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1345 }
1346
1347 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1348 * Handle this. */
1349 void backgroundRewriteDoneHandler(int statloc) {
1350 int exitcode = WEXITSTATUS(statloc);
1351 int bysignal = WIFSIGNALED(statloc);
1352
1353 if (!bysignal && exitcode == 0) {
1354 int fd;
1355 char tmpfile[256];
1356
1357 redisLog(REDIS_NOTICE,
1358 "Background append only file rewriting terminated with success");
1359 /* Now it's time to flush the differences accumulated by the parent */
1360 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1361 fd = open(tmpfile,O_WRONLY|O_APPEND);
1362 if (fd == -1) {
1363 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1364 goto cleanup;
1365 }
1366 /* Flush our data... */
1367 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1368 (signed) sdslen(server.bgrewritebuf)) {
1369 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1370 close(fd);
1371 goto cleanup;
1372 }
1373 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1374 /* Now our work is to rename the temp file into the stable file. And
1375 * switch the file descriptor used by the server for append only. */
1376 if (rename(tmpfile,server.appendfilename) == -1) {
1377 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1378 close(fd);
1379 goto cleanup;
1380 }
1381 /* Mission completed... almost */
1382 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1383 if (server.appendfd != -1) {
1384 /* If append only is actually enabled... */
1385 close(server.appendfd);
1386 server.appendfd = fd;
1387 fsync(fd);
1388 server.appendseldb = -1; /* Make sure it will issue SELECT */
1389 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1390 } else {
1391 /* If append only is disabled we just generate a dump in this
1392 * format. Why not? */
1393 close(fd);
1394 }
1395 } else if (!bysignal && exitcode != 0) {
1396 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1397 } else {
1398 redisLog(REDIS_WARNING,
1399 "Background append only file rewriting terminated by signal %d",
1400 WTERMSIG(statloc));
1401 }
1402 cleanup:
1403 sdsfree(server.bgrewritebuf);
1404 server.bgrewritebuf = sdsempty();
1405 aofRemoveTempFile(server.bgrewritechildpid);
1406 server.bgrewritechildpid = -1;
1407 }
1408
1409 /* This function is called once a background process of some kind terminates,
1410 * as we want to avoid resizing the hash tables when there is a child in order
1411 * to play well with copy-on-write (otherwise when a resize happens lots of
1412 * memory pages are copied). The goal of this function is to update the ability
1413 * for dict.c to resize the hash tables accordingly to the fact we have o not
1414 * running childs. */
1415 static void updateDictResizePolicy(void) {
1416 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1417 dictEnableResize();
1418 else
1419 dictDisableResize();
1420 }
1421
1422 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1423 int j, loops = server.cronloops++;
1424 REDIS_NOTUSED(eventLoop);
1425 REDIS_NOTUSED(id);
1426 REDIS_NOTUSED(clientData);
1427
1428 /* We take a cached value of the unix time in the global state because
1429 * with virtual memory and aging there is to store the current time
1430 * in objects at every object access, and accuracy is not needed.
1431 * To access a global var is faster than calling time(NULL) */
1432 server.unixtime = time(NULL);
1433
1434 /* We received a SIGTERM, shutting down here in a safe way, as it is
1435 * not ok doing so inside the signal handler. */
1436 if (server.shutdown_asap) {
1437 if (prepareForShutdown() == REDIS_OK) exit(0);
1438 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1439 }
1440
1441 /* Show some info about non-empty databases */
1442 for (j = 0; j < server.dbnum; j++) {
1443 long long size, used, vkeys;
1444
1445 size = dictSlots(server.db[j].dict);
1446 used = dictSize(server.db[j].dict);
1447 vkeys = dictSize(server.db[j].expires);
1448 if (!(loops % 50) && (used || vkeys)) {
1449 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1450 /* dictPrintStats(server.dict); */
1451 }
1452 }
1453
1454 /* We don't want to resize the hash tables while a bacground saving
1455 * is in progress: the saving child is created using fork() that is
1456 * implemented with a copy-on-write semantic in most modern systems, so
1457 * if we resize the HT while there is the saving child at work actually
1458 * a lot of memory movements in the parent will cause a lot of pages
1459 * copied. */
1460 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1461 if (!(loops % 10)) tryResizeHashTables();
1462 if (server.activerehashing) incrementallyRehash();
1463 }
1464
1465 /* Show information about connected clients */
1466 if (!(loops % 50)) {
1467 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1468 listLength(server.clients)-listLength(server.slaves),
1469 listLength(server.slaves),
1470 zmalloc_used_memory());
1471 }
1472
1473 /* Close connections of timedout clients */
1474 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1475 closeTimedoutClients();
1476
1477 /* Check if a background saving or AOF rewrite in progress terminated */
1478 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1479 int statloc;
1480 pid_t pid;
1481
1482 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1483 if (pid == server.bgsavechildpid) {
1484 backgroundSaveDoneHandler(statloc);
1485 } else {
1486 backgroundRewriteDoneHandler(statloc);
1487 }
1488 updateDictResizePolicy();
1489 }
1490 } else {
1491 /* If there is not a background saving in progress check if
1492 * we have to save now */
1493 time_t now = time(NULL);
1494 for (j = 0; j < server.saveparamslen; j++) {
1495 struct saveparam *sp = server.saveparams+j;
1496
1497 if (server.dirty >= sp->changes &&
1498 now-server.lastsave > sp->seconds) {
1499 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1500 sp->changes, sp->seconds);
1501 rdbSaveBackground(server.dbfilename);
1502 break;
1503 }
1504 }
1505 }
1506
1507 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1508 * will use few CPU cycles if there are few expiring keys, otherwise
1509 * it will get more aggressive to avoid that too much memory is used by
1510 * keys that can be removed from the keyspace. */
1511 for (j = 0; j < server.dbnum; j++) {
1512 int expired;
1513 redisDb *db = server.db+j;
1514
1515 /* Continue to expire if at the end of the cycle more than 25%
1516 * of the keys were expired. */
1517 do {
1518 long num = dictSize(db->expires);
1519 time_t now = time(NULL);
1520
1521 expired = 0;
1522 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1523 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1524 while (num--) {
1525 dictEntry *de;
1526 time_t t;
1527
1528 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1529 t = (time_t) dictGetEntryVal(de);
1530 if (now > t) {
1531 deleteKey(db,dictGetEntryKey(de));
1532 expired++;
1533 server.stat_expiredkeys++;
1534 }
1535 }
1536 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1537 }
1538
1539 /* Swap a few keys on disk if we are over the memory limit and VM
1540 * is enbled. Try to free objects from the free list first. */
1541 if (vmCanSwapOut()) {
1542 while (server.vm_enabled && zmalloc_used_memory() >
1543 server.vm_max_memory)
1544 {
1545 int retval;
1546
1547 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1548 retval = (server.vm_max_threads == 0) ?
1549 vmSwapOneObjectBlocking() :
1550 vmSwapOneObjectThreaded();
1551 if (retval == REDIS_ERR && !(loops % 300) &&
1552 zmalloc_used_memory() >
1553 (server.vm_max_memory+server.vm_max_memory/10))
1554 {
1555 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1556 }
1557 /* Note that when using threade I/O we free just one object,
1558 * because anyway when the I/O thread in charge to swap this
1559 * object out will finish, the handler of completed jobs
1560 * will try to swap more objects if we are still out of memory. */
1561 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1562 }
1563 }
1564
1565 /* Check if we should connect to a MASTER */
1566 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1567 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1568 if (syncWithMaster() == REDIS_OK) {
1569 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1570 if (server.appendonly) rewriteAppendOnlyFileBackground();
1571 }
1572 }
1573 return 100;
1574 }
1575
1576 /* This function gets called every time Redis is entering the
1577 * main loop of the event driven library, that is, before to sleep
1578 * for ready file descriptors. */
1579 static void beforeSleep(struct aeEventLoop *eventLoop) {
1580 REDIS_NOTUSED(eventLoop);
1581
1582 /* Awake clients that got all the swapped keys they requested */
1583 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1584 listIter li;
1585 listNode *ln;
1586
1587 listRewind(server.io_ready_clients,&li);
1588 while((ln = listNext(&li))) {
1589 redisClient *c = ln->value;
1590 struct redisCommand *cmd;
1591
1592 /* Resume the client. */
1593 listDelNode(server.io_ready_clients,ln);
1594 c->flags &= (~REDIS_IO_WAIT);
1595 server.vm_blocked_clients--;
1596 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1597 readQueryFromClient, c);
1598 cmd = lookupCommand(c->argv[0]->ptr);
1599 assert(cmd != NULL);
1600 call(c,cmd);
1601 resetClient(c);
1602 /* There may be more data to process in the input buffer. */
1603 if (c->querybuf && sdslen(c->querybuf) > 0)
1604 processInputBuffer(c);
1605 }
1606 }
1607 /* Write the AOF buffer on disk */
1608 flushAppendOnlyFile();
1609 }
1610
1611 static void createSharedObjects(void) {
1612 int j;
1613
1614 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1615 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1616 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1617 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1618 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1619 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1620 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1621 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1622 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1623 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1624 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1625 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1626 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1627 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR no such key\r\n"));
1629 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR syntax error\r\n"));
1631 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR source and destination objects are the same\r\n"));
1633 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR index out of range\r\n"));
1635 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1636 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1637 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1638 shared.select0 = createStringObject("select 0\r\n",10);
1639 shared.select1 = createStringObject("select 1\r\n",10);
1640 shared.select2 = createStringObject("select 2\r\n",10);
1641 shared.select3 = createStringObject("select 3\r\n",10);
1642 shared.select4 = createStringObject("select 4\r\n",10);
1643 shared.select5 = createStringObject("select 5\r\n",10);
1644 shared.select6 = createStringObject("select 6\r\n",10);
1645 shared.select7 = createStringObject("select 7\r\n",10);
1646 shared.select8 = createStringObject("select 8\r\n",10);
1647 shared.select9 = createStringObject("select 9\r\n",10);
1648 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1649 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1650 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1651 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1652 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1653 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1654 shared.mbulk3 = createStringObject("*3\r\n",4);
1655 shared.mbulk4 = createStringObject("*4\r\n",4);
1656 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1657 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1658 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1659 }
1660 }
1661
1662 static void appendServerSaveParams(time_t seconds, int changes) {
1663 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1664 server.saveparams[server.saveparamslen].seconds = seconds;
1665 server.saveparams[server.saveparamslen].changes = changes;
1666 server.saveparamslen++;
1667 }
1668
1669 static void resetServerSaveParams() {
1670 zfree(server.saveparams);
1671 server.saveparams = NULL;
1672 server.saveparamslen = 0;
1673 }
1674
1675 static void initServerConfig() {
1676 server.dbnum = REDIS_DEFAULT_DBNUM;
1677 server.port = REDIS_SERVERPORT;
1678 server.verbosity = REDIS_VERBOSE;
1679 server.maxidletime = REDIS_MAXIDLETIME;
1680 server.saveparams = NULL;
1681 server.logfile = NULL; /* NULL = log on standard output */
1682 server.bindaddr = NULL;
1683 server.glueoutputbuf = 1;
1684 server.daemonize = 0;
1685 server.appendonly = 0;
1686 server.appendfsync = APPENDFSYNC_EVERYSEC;
1687 server.lastfsync = time(NULL);
1688 server.appendfd = -1;
1689 server.appendseldb = -1; /* Make sure the first time will not match */
1690 server.pidfile = zstrdup("/var/run/redis.pid");
1691 server.dbfilename = zstrdup("dump.rdb");
1692 server.appendfilename = zstrdup("appendonly.aof");
1693 server.requirepass = NULL;
1694 server.rdbcompression = 1;
1695 server.activerehashing = 1;
1696 server.maxclients = 0;
1697 server.blpop_blocked_clients = 0;
1698 server.maxmemory = 0;
1699 server.vm_enabled = 0;
1700 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1701 server.vm_page_size = 256; /* 256 bytes per page */
1702 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1703 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1704 server.vm_max_threads = 4;
1705 server.vm_blocked_clients = 0;
1706 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1707 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1708 server.shutdown_asap = 0;
1709
1710 resetServerSaveParams();
1711
1712 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1713 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1714 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1715 /* Replication related */
1716 server.isslave = 0;
1717 server.masterauth = NULL;
1718 server.masterhost = NULL;
1719 server.masterport = 6379;
1720 server.master = NULL;
1721 server.replstate = REDIS_REPL_NONE;
1722
1723 /* Double constants initialization */
1724 R_Zero = 0.0;
1725 R_PosInf = 1.0/R_Zero;
1726 R_NegInf = -1.0/R_Zero;
1727 R_Nan = R_Zero/R_Zero;
1728 }
1729
1730 static void initServer() {
1731 int j;
1732
1733 signal(SIGHUP, SIG_IGN);
1734 signal(SIGPIPE, SIG_IGN);
1735 setupSigSegvAction();
1736
1737 server.devnull = fopen("/dev/null","w");
1738 if (server.devnull == NULL) {
1739 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1740 exit(1);
1741 }
1742 server.clients = listCreate();
1743 server.slaves = listCreate();
1744 server.monitors = listCreate();
1745 server.objfreelist = listCreate();
1746 createSharedObjects();
1747 server.el = aeCreateEventLoop();
1748 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1749 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1750 if (server.fd == -1) {
1751 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1752 exit(1);
1753 }
1754 for (j = 0; j < server.dbnum; j++) {
1755 server.db[j].dict = dictCreate(&dbDictType,NULL);
1756 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1757 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1758 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1759 if (server.vm_enabled)
1760 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].id = j;
1762 }
1763 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1764 server.pubsub_patterns = listCreate();
1765 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1766 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1767 server.cronloops = 0;
1768 server.bgsavechildpid = -1;
1769 server.bgrewritechildpid = -1;
1770 server.bgrewritebuf = sdsempty();
1771 server.aofbuf = sdsempty();
1772 server.lastsave = time(NULL);
1773 server.dirty = 0;
1774 server.stat_numcommands = 0;
1775 server.stat_numconnections = 0;
1776 server.stat_expiredkeys = 0;
1777 server.stat_starttime = time(NULL);
1778 server.unixtime = time(NULL);
1779 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1780 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1781 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1782
1783 if (server.appendonly) {
1784 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1785 if (server.appendfd == -1) {
1786 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1787 strerror(errno));
1788 exit(1);
1789 }
1790 }
1791
1792 if (server.vm_enabled) vmInit();
1793 }
1794
1795 /* Empty the whole database */
1796 static long long emptyDb() {
1797 int j;
1798 long long removed = 0;
1799
1800 for (j = 0; j < server.dbnum; j++) {
1801 removed += dictSize(server.db[j].dict);
1802 dictEmpty(server.db[j].dict);
1803 dictEmpty(server.db[j].expires);
1804 }
1805 return removed;
1806 }
1807
/* Translate a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1813
1814 /* I agree, this is a very rudimental way to load a configuration...
1815 will improve later if the config gets more complex */
1816 static void loadServerConfig(char *filename) {
1817 FILE *fp;
1818 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1819 int linenum = 0;
1820 sds line = NULL;
1821
1822 if (filename[0] == '-' && filename[1] == '\0')
1823 fp = stdin;
1824 else {
1825 if ((fp = fopen(filename,"r")) == NULL) {
1826 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1827 exit(1);
1828 }
1829 }
1830
1831 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1832 sds *argv;
1833 int argc, j;
1834
1835 linenum++;
1836 line = sdsnew(buf);
1837 line = sdstrim(line," \t\r\n");
1838
1839 /* Skip comments and blank lines*/
1840 if (line[0] == '#' || line[0] == '\0') {
1841 sdsfree(line);
1842 continue;
1843 }
1844
1845 /* Split into arguments */
1846 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1847 sdstolower(argv[0]);
1848
1849 /* Execute config directives */
1850 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1851 server.maxidletime = atoi(argv[1]);
1852 if (server.maxidletime < 0) {
1853 err = "Invalid timeout value"; goto loaderr;
1854 }
1855 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1856 server.port = atoi(argv[1]);
1857 if (server.port < 1 || server.port > 65535) {
1858 err = "Invalid port"; goto loaderr;
1859 }
1860 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1861 server.bindaddr = zstrdup(argv[1]);
1862 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1863 int seconds = atoi(argv[1]);
1864 int changes = atoi(argv[2]);
1865 if (seconds < 1 || changes < 0) {
1866 err = "Invalid save parameters"; goto loaderr;
1867 }
1868 appendServerSaveParams(seconds,changes);
1869 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1870 if (chdir(argv[1]) == -1) {
1871 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1872 argv[1], strerror(errno));
1873 exit(1);
1874 }
1875 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1876 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1877 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1878 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1879 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1880 else {
1881 err = "Invalid log level. Must be one of debug, notice, warning";
1882 goto loaderr;
1883 }
1884 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1885 FILE *logfp;
1886
1887 server.logfile = zstrdup(argv[1]);
1888 if (!strcasecmp(server.logfile,"stdout")) {
1889 zfree(server.logfile);
1890 server.logfile = NULL;
1891 }
1892 if (server.logfile) {
1893 /* Test if we are able to open the file. The server will not
1894 * be able to abort just for this problem later... */
1895 logfp = fopen(server.logfile,"a");
1896 if (logfp == NULL) {
1897 err = sdscatprintf(sdsempty(),
1898 "Can't open the log file: %s", strerror(errno));
1899 goto loaderr;
1900 }
1901 fclose(logfp);
1902 }
1903 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1904 server.dbnum = atoi(argv[1]);
1905 if (server.dbnum < 1) {
1906 err = "Invalid number of databases"; goto loaderr;
1907 }
1908 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1909 loadServerConfig(argv[1]);
1910 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1911 server.maxclients = atoi(argv[1]);
1912 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1913 server.maxmemory = memtoll(argv[1],NULL);
1914 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1915 server.masterhost = sdsnew(argv[1]);
1916 server.masterport = atoi(argv[2]);
1917 server.replstate = REDIS_REPL_CONNECT;
1918 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1919 server.masterauth = zstrdup(argv[1]);
1920 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1921 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1922 err = "argument must be 'yes' or 'no'"; goto loaderr;
1923 }
1924 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1925 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1926 err = "argument must be 'yes' or 'no'"; goto loaderr;
1927 }
1928 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1929 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1930 err = "argument must be 'yes' or 'no'"; goto loaderr;
1931 }
1932 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1933 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1934 err = "argument must be 'yes' or 'no'"; goto loaderr;
1935 }
1936 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1937 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1938 err = "argument must be 'yes' or 'no'"; goto loaderr;
1939 }
1940 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1941 zfree(server.appendfilename);
1942 server.appendfilename = zstrdup(argv[1]);
1943 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1944 if (!strcasecmp(argv[1],"no")) {
1945 server.appendfsync = APPENDFSYNC_NO;
1946 } else if (!strcasecmp(argv[1],"always")) {
1947 server.appendfsync = APPENDFSYNC_ALWAYS;
1948 } else if (!strcasecmp(argv[1],"everysec")) {
1949 server.appendfsync = APPENDFSYNC_EVERYSEC;
1950 } else {
1951 err = "argument must be 'no', 'always' or 'everysec'";
1952 goto loaderr;
1953 }
1954 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1955 server.requirepass = zstrdup(argv[1]);
1956 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1957 zfree(server.pidfile);
1958 server.pidfile = zstrdup(argv[1]);
1959 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1960 zfree(server.dbfilename);
1961 server.dbfilename = zstrdup(argv[1]);
1962 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1963 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1964 err = "argument must be 'yes' or 'no'"; goto loaderr;
1965 }
1966 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1967 zfree(server.vm_swap_file);
1968 server.vm_swap_file = zstrdup(argv[1]);
1969 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1970 server.vm_max_memory = memtoll(argv[1],NULL);
1971 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1972 server.vm_page_size = memtoll(argv[1], NULL);
1973 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1974 server.vm_pages = memtoll(argv[1], NULL);
1975 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1976 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1977 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1978 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1979 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1980 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1981 } else {
1982 err = "Bad directive or wrong number of arguments"; goto loaderr;
1983 }
1984 for (j = 0; j < argc; j++)
1985 sdsfree(argv[j]);
1986 zfree(argv);
1987 sdsfree(line);
1988 }
1989 if (fp != stdin) fclose(fp);
1990 return;
1991
1992 loaderr:
1993 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1994 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1995 fprintf(stderr, ">>> '%s'\n", line);
1996 fprintf(stderr, "%s\n", err);
1997 exit(1);
1998 }
1999
2000 static void freeClientArgv(redisClient *c) {
2001 int j;
2002
2003 for (j = 0; j < c->argc; j++)
2004 decrRefCount(c->argv[j]);
2005 for (j = 0; j < c->mbargc; j++)
2006 decrRefCount(c->mbargv[j]);
2007 c->argc = 0;
2008 c->mbargc = 0;
2009 }
2010
2011 static void freeClient(redisClient *c) {
2012 listNode *ln;
2013
2014 /* Note that if the client we are freeing is blocked into a blocking
2015 * call, we have to set querybuf to NULL *before* to call
2016 * unblockClientWaitingData() to avoid processInputBuffer() will get
2017 * called. Also it is important to remove the file events after
2018 * this, because this call adds the READABLE event. */
2019 sdsfree(c->querybuf);
2020 c->querybuf = NULL;
2021 if (c->flags & REDIS_BLOCKED)
2022 unblockClientWaitingData(c);
2023
2024 /* UNWATCH all the keys */
2025 unwatchAllKeys(c);
2026 listRelease(c->watched_keys);
2027 /* Unsubscribe from all the pubsub channels */
2028 pubsubUnsubscribeAllChannels(c,0);
2029 pubsubUnsubscribeAllPatterns(c,0);
2030 dictRelease(c->pubsub_channels);
2031 listRelease(c->pubsub_patterns);
2032 /* Obvious cleanup */
2033 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2034 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2035 listRelease(c->reply);
2036 freeClientArgv(c);
2037 close(c->fd);
2038 /* Remove from the list of clients */
2039 ln = listSearchKey(server.clients,c);
2040 redisAssert(ln != NULL);
2041 listDelNode(server.clients,ln);
2042 /* Remove from the list of clients that are now ready to be restarted
2043 * after waiting for swapped keys */
2044 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2045 ln = listSearchKey(server.io_ready_clients,c);
2046 if (ln) {
2047 listDelNode(server.io_ready_clients,ln);
2048 server.vm_blocked_clients--;
2049 }
2050 }
2051 /* Remove from the list of clients waiting for swapped keys */
2052 while (server.vm_enabled && listLength(c->io_keys)) {
2053 ln = listFirst(c->io_keys);
2054 dontWaitForSwappedKey(c,ln->value);
2055 }
2056 listRelease(c->io_keys);
2057 /* Master/slave cleanup */
2058 if (c->flags & REDIS_SLAVE) {
2059 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2060 close(c->repldbfd);
2061 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2062 ln = listSearchKey(l,c);
2063 redisAssert(ln != NULL);
2064 listDelNode(l,ln);
2065 }
2066 if (c->flags & REDIS_MASTER) {
2067 server.master = NULL;
2068 server.replstate = REDIS_REPL_CONNECT;
2069 }
2070 /* Release memory */
2071 zfree(c->argv);
2072 zfree(c->mbargv);
2073 freeClientMultiState(c);
2074 zfree(c);
2075 }
2076
2077 #define GLUEREPLY_UP_TO (1024)
2078 static void glueReplyBuffersIfNeeded(redisClient *c) {
2079 int copylen = 0;
2080 char buf[GLUEREPLY_UP_TO];
2081 listNode *ln;
2082 listIter li;
2083 robj *o;
2084
2085 listRewind(c->reply,&li);
2086 while((ln = listNext(&li))) {
2087 int objlen;
2088
2089 o = ln->value;
2090 objlen = sdslen(o->ptr);
2091 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2092 memcpy(buf+copylen,o->ptr,objlen);
2093 copylen += objlen;
2094 listDelNode(c->reply,ln);
2095 } else {
2096 if (copylen == 0) return;
2097 break;
2098 }
2099 }
2100 /* Now the output buffer is empty, add the new single element */
2101 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2102 listAddNodeHead(c->reply,o);
2103 }
2104
2105 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2106 redisClient *c = privdata;
2107 int nwritten = 0, totwritten = 0, objlen;
2108 robj *o;
2109 REDIS_NOTUSED(el);
2110 REDIS_NOTUSED(mask);
2111
2112 /* Use writev() if we have enough buffers to send */
2113 if (!server.glueoutputbuf &&
2114 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2115 !(c->flags & REDIS_MASTER))
2116 {
2117 sendReplyToClientWritev(el, fd, privdata, mask);
2118 return;
2119 }
2120
2121 while(listLength(c->reply)) {
2122 if (server.glueoutputbuf && listLength(c->reply) > 1)
2123 glueReplyBuffersIfNeeded(c);
2124
2125 o = listNodeValue(listFirst(c->reply));
2126 objlen = sdslen(o->ptr);
2127
2128 if (objlen == 0) {
2129 listDelNode(c->reply,listFirst(c->reply));
2130 continue;
2131 }
2132
2133 if (c->flags & REDIS_MASTER) {
2134 /* Don't reply to a master */
2135 nwritten = objlen - c->sentlen;
2136 } else {
2137 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2138 if (nwritten <= 0) break;
2139 }
2140 c->sentlen += nwritten;
2141 totwritten += nwritten;
2142 /* If we fully sent the object on head go to the next one */
2143 if (c->sentlen == objlen) {
2144 listDelNode(c->reply,listFirst(c->reply));
2145 c->sentlen = 0;
2146 }
2147 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2148 * bytes, in a single threaded server it's a good idea to serve
2149 * other clients as well, even if a very large request comes from
2150 * super fast link that is always able to accept data (in real world
2151 * scenario think about 'KEYS *' against the loopback interfae) */
2152 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2153 }
2154 if (nwritten == -1) {
2155 if (errno == EAGAIN) {
2156 nwritten = 0;
2157 } else {
2158 redisLog(REDIS_VERBOSE,
2159 "Error writing to client: %s", strerror(errno));
2160 freeClient(c);
2161 return;
2162 }
2163 }
2164 if (totwritten > 0) c->lastinteraction = time(NULL);
2165 if (listLength(c->reply) == 0) {
2166 c->sentlen = 0;
2167 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2168 }
2169 }
2170
2171 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2172 {
2173 redisClient *c = privdata;
2174 int nwritten = 0, totwritten = 0, objlen, willwrite;
2175 robj *o;
2176 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2177 int offset, ion = 0;
2178 REDIS_NOTUSED(el);
2179 REDIS_NOTUSED(mask);
2180
2181 listNode *node;
2182 while (listLength(c->reply)) {
2183 offset = c->sentlen;
2184 ion = 0;
2185 willwrite = 0;
2186
2187 /* fill-in the iov[] array */
2188 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2189 o = listNodeValue(node);
2190 objlen = sdslen(o->ptr);
2191
2192 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2193 break;
2194
2195 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2196 break; /* no more iovecs */
2197
2198 iov[ion].iov_base = ((char*)o->ptr) + offset;
2199 iov[ion].iov_len = objlen - offset;
2200 willwrite += objlen - offset;
2201 offset = 0; /* just for the first item */
2202 ion++;
2203 }
2204
2205 if(willwrite == 0)
2206 break;
2207
2208 /* write all collected blocks at once */
2209 if((nwritten = writev(fd, iov, ion)) < 0) {
2210 if (errno != EAGAIN) {
2211 redisLog(REDIS_VERBOSE,
2212 "Error writing to client: %s", strerror(errno));
2213 freeClient(c);
2214 return;
2215 }
2216 break;
2217 }
2218
2219 totwritten += nwritten;
2220 offset = c->sentlen;
2221
2222 /* remove written robjs from c->reply */
2223 while (nwritten && listLength(c->reply)) {
2224 o = listNodeValue(listFirst(c->reply));
2225 objlen = sdslen(o->ptr);
2226
2227 if(nwritten >= objlen - offset) {
2228 listDelNode(c->reply, listFirst(c->reply));
2229 nwritten -= objlen - offset;
2230 c->sentlen = 0;
2231 } else {
2232 /* partial write */
2233 c->sentlen += nwritten;
2234 break;
2235 }
2236 offset = 0;
2237 }
2238 }
2239
2240 if (totwritten > 0)
2241 c->lastinteraction = time(NULL);
2242
2243 if (listLength(c->reply) == 0) {
2244 c->sentlen = 0;
2245 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2246 }
2247 }
2248
2249 static struct redisCommand *lookupCommand(char *name) {
2250 int j = 0;
2251 while(cmdTable[j].name != NULL) {
2252 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2253 j++;
2254 }
2255 return NULL;
2256 }
2257
2258 /* resetClient prepare the client to process the next command */
2259 static void resetClient(redisClient *c) {
2260 freeClientArgv(c);
2261 c->bulklen = -1;
2262 c->multibulk = 0;
2263 }
2264
2265 /* Call() is the core of Redis execution of a command */
2266 static void call(redisClient *c, struct redisCommand *cmd) {
2267 long long dirty;
2268
2269 dirty = server.dirty;
2270 cmd->proc(c);
2271 dirty = server.dirty-dirty;
2272
2273 if (server.appendonly && dirty)
2274 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2275 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2276 listLength(server.slaves))
2277 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2278 if (listLength(server.monitors))
2279 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2280 server.stat_numcommands++;
2281 }
2282
2283 /* If this function gets called we already read a whole
2284 * command, argments are in the client argv/argc fields.
2285 * processCommand() execute the command or prepare the
2286 * server for a bulk read from the client.
2287 *
2288 * If 1 is returned the client is still alive and valid and
2289 * and other operations can be performed by the caller. Otherwise
2290 * if 0 is returned the client was destroied (i.e. after QUIT). */
2291 static int processCommand(redisClient *c) {
2292 struct redisCommand *cmd;
2293
2294 /* Free some memory if needed (maxmemory setting) */
2295 if (server.maxmemory) freeMemoryIfNeeded();
2296
2297 /* Handle the multi bulk command type. This is an alternative protocol
2298 * supported by Redis in order to receive commands that are composed of
2299 * multiple binary-safe "bulk" arguments. The latency of processing is
2300 * a bit higher but this allows things like multi-sets, so if this
2301 * protocol is used only for MSET and similar commands this is a big win. */
2302 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2303 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2304 if (c->multibulk <= 0) {
2305 resetClient(c);
2306 return 1;
2307 } else {
2308 decrRefCount(c->argv[c->argc-1]);
2309 c->argc--;
2310 return 1;
2311 }
2312 } else if (c->multibulk) {
2313 if (c->bulklen == -1) {
2314 if (((char*)c->argv[0]->ptr)[0] != '$') {
2315 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2316 resetClient(c);
2317 return 1;
2318 } else {
2319 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2320 decrRefCount(c->argv[0]);
2321 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2322 c->argc--;
2323 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2324 resetClient(c);
2325 return 1;
2326 }
2327 c->argc--;
2328 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2329 return 1;
2330 }
2331 } else {
2332 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2333 c->mbargv[c->mbargc] = c->argv[0];
2334 c->mbargc++;
2335 c->argc--;
2336 c->multibulk--;
2337 if (c->multibulk == 0) {
2338 robj **auxargv;
2339 int auxargc;
2340
2341 /* Here we need to swap the multi-bulk argc/argv with the
2342 * normal argc/argv of the client structure. */
2343 auxargv = c->argv;
2344 c->argv = c->mbargv;
2345 c->mbargv = auxargv;
2346
2347 auxargc = c->argc;
2348 c->argc = c->mbargc;
2349 c->mbargc = auxargc;
2350
2351 /* We need to set bulklen to something different than -1
2352 * in order for the code below to process the command without
2353 * to try to read the last argument of a bulk command as
2354 * a special argument. */
2355 c->bulklen = 0;
2356 /* continue below and process the command */
2357 } else {
2358 c->bulklen = -1;
2359 return 1;
2360 }
2361 }
2362 }
2363 /* -- end of multi bulk commands processing -- */
2364
2365 /* The QUIT command is handled as a special case. Normal command
2366 * procs are unable to close the client connection safely */
2367 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2368 freeClient(c);
2369 return 0;
2370 }
2371
2372 /* Now lookup the command and check ASAP about trivial error conditions
2373 * such wrong arity, bad command name and so forth. */
2374 cmd = lookupCommand(c->argv[0]->ptr);
2375 if (!cmd) {
2376 addReplySds(c,
2377 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2378 (char*)c->argv[0]->ptr));
2379 resetClient(c);
2380 return 1;
2381 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2382 (c->argc < -cmd->arity)) {
2383 addReplySds(c,
2384 sdscatprintf(sdsempty(),
2385 "-ERR wrong number of arguments for '%s' command\r\n",
2386 cmd->name));
2387 resetClient(c);
2388 return 1;
2389 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2390 /* This is a bulk command, we have to read the last argument yet. */
2391 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2392
2393 decrRefCount(c->argv[c->argc-1]);
2394 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2395 c->argc--;
2396 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2397 resetClient(c);
2398 return 1;
2399 }
2400 c->argc--;
2401 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2402 /* It is possible that the bulk read is already in the
2403 * buffer. Check this condition and handle it accordingly.
2404 * This is just a fast path, alternative to call processInputBuffer().
2405 * It's a good idea since the code is small and this condition
2406 * happens most of the times. */
2407 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2408 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2409 c->argc++;
2410 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2411 } else {
2412 /* Otherwise return... there is to read the last argument
2413 * from the socket. */
2414 return 1;
2415 }
2416 }
2417 /* Let's try to encode the bulk object to save space. */
2418 if (cmd->flags & REDIS_CMD_BULK)
2419 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2420
2421 /* Check if the user is authenticated */
2422 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2423 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2424 resetClient(c);
2425 return 1;
2426 }
2427
2428 /* Handle the maxmemory directive */
2429 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2430 zmalloc_used_memory() > server.maxmemory)
2431 {
2432 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2433 resetClient(c);
2434 return 1;
2435 }
2436
2437 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2438 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2439 &&
2440 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2441 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2442 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2443 resetClient(c);
2444 return 1;
2445 }
2446
2447 /* Exec the command */
2448 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2449 queueMultiCommand(c,cmd);
2450 addReply(c,shared.queued);
2451 } else {
2452 if (server.vm_enabled && server.vm_max_threads > 0 &&
2453 blockClientOnSwappedKeys(c,cmd)) return 1;
2454 call(c,cmd);
2455 }
2456
2457 /* Prepare the client for the next command */
2458 resetClient(c);
2459 return 1;
2460 }
2461
2462 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2463 listNode *ln;
2464 listIter li;
2465 int outc = 0, j;
2466 robj **outv;
2467 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2468 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2469 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2470 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2471 robj *lenobj;
2472
2473 if (argc <= REDIS_STATIC_ARGS) {
2474 outv = static_outv;
2475 } else {
2476 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2477 }
2478
2479 lenobj = createObject(REDIS_STRING,
2480 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2481 lenobj->refcount = 0;
2482 outv[outc++] = lenobj;
2483 for (j = 0; j < argc; j++) {
2484 lenobj = createObject(REDIS_STRING,
2485 sdscatprintf(sdsempty(),"$%lu\r\n",
2486 (unsigned long) stringObjectLen(argv[j])));
2487 lenobj->refcount = 0;
2488 outv[outc++] = lenobj;
2489 outv[outc++] = argv[j];
2490 outv[outc++] = shared.crlf;
2491 }
2492
2493 /* Increment all the refcounts at start and decrement at end in order to
2494 * be sure to free objects if there is no slave in a replication state
2495 * able to be feed with commands */
2496 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2497 listRewind(slaves,&li);
2498 while((ln = listNext(&li))) {
2499 redisClient *slave = ln->value;
2500
2501 /* Don't feed slaves that are still waiting for BGSAVE to start */
2502 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2503
2504 /* Feed all the other slaves, MONITORs and so on */
2505 if (slave->slaveseldb != dictid) {
2506 robj *selectcmd;
2507
2508 switch(dictid) {
2509 case 0: selectcmd = shared.select0; break;
2510 case 1: selectcmd = shared.select1; break;
2511 case 2: selectcmd = shared.select2; break;
2512 case 3: selectcmd = shared.select3; break;
2513 case 4: selectcmd = shared.select4; break;
2514 case 5: selectcmd = shared.select5; break;
2515 case 6: selectcmd = shared.select6; break;
2516 case 7: selectcmd = shared.select7; break;
2517 case 8: selectcmd = shared.select8; break;
2518 case 9: selectcmd = shared.select9; break;
2519 default:
2520 selectcmd = createObject(REDIS_STRING,
2521 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2522 selectcmd->refcount = 0;
2523 break;
2524 }
2525 addReply(slave,selectcmd);
2526 slave->slaveseldb = dictid;
2527 }
2528 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2529 }
2530 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2531 if (outv != static_outv) zfree(outv);
2532 }
2533
2534 static sds sdscatrepr(sds s, char *p, size_t len) {
2535 s = sdscatlen(s,"\"",1);
2536 while(len--) {
2537 switch(*p) {
2538 case '\\':
2539 case '"':
2540 s = sdscatprintf(s,"\\%c",*p);
2541 break;
2542 case '\n': s = sdscatlen(s,"\\n",1); break;
2543 case '\r': s = sdscatlen(s,"\\r",1); break;
2544 case '\t': s = sdscatlen(s,"\\t",1); break;
2545 case '\a': s = sdscatlen(s,"\\a",1); break;
2546 case '\b': s = sdscatlen(s,"\\b",1); break;
2547 default:
2548 if (isprint(*p))
2549 s = sdscatprintf(s,"%c",*p);
2550 else
2551 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2552 break;
2553 }
2554 p++;
2555 }
2556 return sdscatlen(s,"\"",1);
2557 }
2558
2559 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2560 listNode *ln;
2561 listIter li;
2562 int j;
2563 sds cmdrepr = sdsnew("+");
2564 robj *cmdobj;
2565 struct timeval tv;
2566
2567 gettimeofday(&tv,NULL);
2568 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2569 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2570
2571 for (j = 0; j < argc; j++) {
2572 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2573 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2574 } else {
2575 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2576 sdslen(argv[j]->ptr));
2577 }
2578 if (j != argc-1)
2579 cmdrepr = sdscatlen(cmdrepr," ",1);
2580 }
2581 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2582 cmdobj = createObject(REDIS_STRING,cmdrepr);
2583
2584 listRewind(monitors,&li);
2585 while((ln = listNext(&li))) {
2586 redisClient *monitor = ln->value;
2587 addReply(monitor,cmdobj);
2588 }
2589 decrRefCount(cmdobj);
2590 }
2591
2592 static void processInputBuffer(redisClient *c) {
2593 again:
2594 /* Before to process the input buffer, make sure the client is not
2595 * waitig for a blocking operation such as BLPOP. Note that the first
2596 * iteration the client is never blocked, otherwise the processInputBuffer
2597 * would not be called at all, but after the execution of the first commands
2598 * in the input buffer the client may be blocked, and the "goto again"
2599 * will try to reiterate. The following line will make it return asap. */
2600 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2601 if (c->bulklen == -1) {
2602 /* Read the first line of the query */
2603 char *p = strchr(c->querybuf,'\n');
2604 size_t querylen;
2605
2606 if (p) {
2607 sds query, *argv;
2608 int argc, j;
2609
2610 query = c->querybuf;
2611 c->querybuf = sdsempty();
2612 querylen = 1+(p-(query));
2613 if (sdslen(query) > querylen) {
2614 /* leave data after the first line of the query in the buffer */
2615 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2616 }
2617 *p = '\0'; /* remove "\n" */
2618 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2619 sdsupdatelen(query);
2620
2621 /* Now we can split the query in arguments */
2622 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2623 sdsfree(query);
2624
2625 if (c->argv) zfree(c->argv);
2626 c->argv = zmalloc(sizeof(robj*)*argc);
2627
2628 for (j = 0; j < argc; j++) {
2629 if (sdslen(argv[j])) {
2630 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2631 c->argc++;
2632 } else {
2633 sdsfree(argv[j]);
2634 }
2635 }
2636 zfree(argv);
2637 if (c->argc) {
2638 /* Execute the command. If the client is still valid
2639 * after processCommand() return and there is something
2640 * on the query buffer try to process the next command. */
2641 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2642 } else {
2643 /* Nothing to process, argc == 0. Just process the query
2644 * buffer if it's not empty or return to the caller */
2645 if (sdslen(c->querybuf)) goto again;
2646 }
2647 return;
2648 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2649 redisLog(REDIS_VERBOSE, "Client protocol error");
2650 freeClient(c);
2651 return;
2652 }
2653 } else {
2654 /* Bulk read handling. Note that if we are at this point
2655 the client already sent a command terminated with a newline,
2656 we are reading the bulk data that is actually the last
2657 argument of the command. */
2658 int qbl = sdslen(c->querybuf);
2659
2660 if (c->bulklen <= qbl) {
2661 /* Copy everything but the final CRLF as final argument */
2662 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2663 c->argc++;
2664 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2665 /* Process the command. If the client is still valid after
2666 * the processing and there is more data in the buffer
2667 * try to parse it. */
2668 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2669 return;
2670 }
2671 }
2672 }
2673
2674 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2675 redisClient *c = (redisClient*) privdata;
2676 char buf[REDIS_IOBUF_LEN];
2677 int nread;
2678 REDIS_NOTUSED(el);
2679 REDIS_NOTUSED(mask);
2680
2681 nread = read(fd, buf, REDIS_IOBUF_LEN);
2682 if (nread == -1) {
2683 if (errno == EAGAIN) {
2684 nread = 0;
2685 } else {
2686 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2687 freeClient(c);
2688 return;
2689 }
2690 } else if (nread == 0) {
2691 redisLog(REDIS_VERBOSE, "Client closed connection");
2692 freeClient(c);
2693 return;
2694 }
2695 if (nread) {
2696 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2697 c->lastinteraction = time(NULL);
2698 } else {
2699 return;
2700 }
2701 processInputBuffer(c);
2702 }
2703
2704 static int selectDb(redisClient *c, int id) {
2705 if (id < 0 || id >= server.dbnum)
2706 return REDIS_ERR;
2707 c->db = &server.db[id];
2708 return REDIS_OK;
2709 }
2710
2711 static void *dupClientReplyValue(void *o) {
2712 incrRefCount((robj*)o);
2713 return o;
2714 }
2715
/* List match method comparing two string objects through
 * equalStringObjects(). */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2719
2720 static redisClient *createClient(int fd) {
2721 redisClient *c = zmalloc(sizeof(*c));
2722
2723 anetNonBlock(NULL,fd);
2724 anetTcpNoDelay(NULL,fd);
2725 if (!c) return NULL;
2726 selectDb(c,0);
2727 c->fd = fd;
2728 c->querybuf = sdsempty();
2729 c->argc = 0;
2730 c->argv = NULL;
2731 c->bulklen = -1;
2732 c->multibulk = 0;
2733 c->mbargc = 0;
2734 c->mbargv = NULL;
2735 c->sentlen = 0;
2736 c->flags = 0;
2737 c->lastinteraction = time(NULL);
2738 c->authenticated = 0;
2739 c->replstate = REDIS_REPL_NONE;
2740 c->reply = listCreate();
2741 listSetFreeMethod(c->reply,decrRefCount);
2742 listSetDupMethod(c->reply,dupClientReplyValue);
2743 c->blocking_keys = NULL;
2744 c->blocking_keys_num = 0;
2745 c->io_keys = listCreate();
2746 c->watched_keys = listCreate();
2747 listSetFreeMethod(c->io_keys,decrRefCount);
2748 c->pubsub_channels = dictCreate(&setDictType,NULL);
2749 c->pubsub_patterns = listCreate();
2750 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2751 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2752 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2753 readQueryFromClient, c) == AE_ERR) {
2754 freeClient(c);
2755 return NULL;
2756 }
2757 listAddNodeTail(server.clients,c);
2758 initClientMultiState(c);
2759 return c;
2760 }
2761
2762 static void addReply(redisClient *c, robj *obj) {
2763 if (listLength(c->reply) == 0 &&
2764 (c->replstate == REDIS_REPL_NONE ||
2765 c->replstate == REDIS_REPL_ONLINE) &&
2766 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2767 sendReplyToClient, c) == AE_ERR) return;
2768
2769 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2770 obj = dupStringObject(obj);
2771 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2772 }
2773 listAddNodeTail(c->reply,getDecodedObject(obj));
2774 }
2775
2776 static void addReplySds(redisClient *c, sds s) {
2777 robj *o = createObject(REDIS_STRING,s);
2778 addReply(c,o);
2779 decrRefCount(o);
2780 }
2781
2782 static void addReplyDouble(redisClient *c, double d) {
2783 char buf[128];
2784
2785 snprintf(buf,sizeof(buf),"%.17g",d);
2786 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2787 (unsigned long) strlen(buf),buf));
2788 }
2789
2790 static void addReplyLongLong(redisClient *c, long long ll) {
2791 char buf[128];
2792 size_t len;
2793
2794 if (ll == 0) {
2795 addReply(c,shared.czero);
2796 return;
2797 } else if (ll == 1) {
2798 addReply(c,shared.cone);
2799 return;
2800 }
2801 buf[0] = ':';
2802 len = ll2string(buf+1,sizeof(buf)-1,ll);
2803 buf[len+1] = '\r';
2804 buf[len+2] = '\n';
2805 addReplySds(c,sdsnewlen(buf,len+3));
2806 }
2807
2808 static void addReplyUlong(redisClient *c, unsigned long ul) {
2809 char buf[128];
2810 size_t len;
2811
2812 if (ul == 0) {
2813 addReply(c,shared.czero);
2814 return;
2815 } else if (ul == 1) {
2816 addReply(c,shared.cone);
2817 return;
2818 }
2819 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2820 addReplySds(c,sdsnewlen(buf,len));
2821 }
2822
2823 static void addReplyBulkLen(redisClient *c, robj *obj) {
2824 size_t len, intlen;
2825 char buf[128];
2826
2827 if (obj->encoding == REDIS_ENCODING_RAW) {
2828 len = sdslen(obj->ptr);
2829 } else {
2830 long n = (long)obj->ptr;
2831
2832 /* Compute how many bytes will take this integer as a radix 10 string */
2833 len = 1;
2834 if (n < 0) {
2835 len++;
2836 n = -n;
2837 }
2838 while((n = n/10) != 0) {
2839 len++;
2840 }
2841 }
2842 buf[0] = '$';
2843 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2844 buf[intlen+1] = '\r';
2845 buf[intlen+2] = '\n';
2846 addReplySds(c,sdsnewlen(buf,intlen+3));
2847 }
2848
2849 static void addReplyBulk(redisClient *c, robj *obj) {
2850 addReplyBulkLen(c,obj);
2851 addReply(c,obj);
2852 addReply(c,shared.crlf);
2853 }
2854
2855 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2856 static void addReplyBulkCString(redisClient *c, char *s) {
2857 if (s == NULL) {
2858 addReply(c,shared.nullbulk);
2859 } else {
2860 robj *o = createStringObject(s,strlen(s));
2861 addReplyBulk(c,o);
2862 decrRefCount(o);
2863 }
2864 }
2865
2866 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2867 int cport, cfd;
2868 char cip[128];
2869 redisClient *c;
2870 REDIS_NOTUSED(el);
2871 REDIS_NOTUSED(mask);
2872 REDIS_NOTUSED(privdata);
2873
2874 cfd = anetAccept(server.neterr, fd, cip, &cport);
2875 if (cfd == AE_ERR) {
2876 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2877 return;
2878 }
2879 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2880 if ((c = createClient(cfd)) == NULL) {
2881 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2882 close(cfd); /* May be already closed, just ingore errors */
2883 return;
2884 }
2885 /* If maxclient directive is set and this is one client more... close the
2886 * connection. Note that we create the client instead to check before
2887 * for this condition, since now the socket is already set in nonblocking
2888 * mode and we can send an error for free using the Kernel I/O */
2889 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2890 char *err = "-ERR max number of clients reached\r\n";
2891
2892 /* That's a best effort error message, don't check write errors */
2893 if (write(c->fd,err,strlen(err)) == -1) {
2894 /* Nothing to do, Just to avoid the warning... */
2895 }
2896 freeClient(c);
2897 return;
2898 }
2899 server.stat_numconnections++;
2900 }
2901
2902 /* ======================= Redis objects implementation ===================== */
2903
2904 static robj *createObject(int type, void *ptr) {
2905 robj *o;
2906
2907 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2908 if (listLength(server.objfreelist)) {
2909 listNode *head = listFirst(server.objfreelist);
2910 o = listNodeValue(head);
2911 listDelNode(server.objfreelist,head);
2912 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2913 } else {
2914 if (server.vm_enabled) {
2915 pthread_mutex_unlock(&server.obj_freelist_mutex);
2916 o = zmalloc(sizeof(*o));
2917 } else {
2918 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2919 }
2920 }
2921 o->type = type;
2922 o->encoding = REDIS_ENCODING_RAW;
2923 o->ptr = ptr;
2924 o->refcount = 1;
2925 if (server.vm_enabled) {
2926 /* Note that this code may run in the context of an I/O thread
2927 * and accessing to server.unixtime in theory is an error
2928 * (no locks). But in practice this is safe, and even if we read
2929 * garbage Redis will not fail, as it's just a statistical info */
2930 o->vm.atime = server.unixtime;
2931 o->storage = REDIS_VM_MEMORY;
2932 }
2933 return o;
2934 }
2935
2936 static robj *createStringObject(char *ptr, size_t len) {
2937 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2938 }
2939
2940 static robj *createStringObjectFromLongLong(long long value) {
2941 robj *o;
2942 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2943 incrRefCount(shared.integers[value]);
2944 o = shared.integers[value];
2945 } else {
2946 if (value >= LONG_MIN && value <= LONG_MAX) {
2947 o = createObject(REDIS_STRING, NULL);
2948 o->encoding = REDIS_ENCODING_INT;
2949 o->ptr = (void*)((long)value);
2950 } else {
2951 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2952 }
2953 }
2954 return o;
2955 }
2956
2957 static robj *dupStringObject(robj *o) {
2958 assert(o->encoding == REDIS_ENCODING_RAW);
2959 return createStringObject(o->ptr,sdslen(o->ptr));
2960 }
2961
2962 static robj *createListObject(void) {
2963 list *l = listCreate();
2964
2965 listSetFreeMethod(l,decrRefCount);
2966 return createObject(REDIS_LIST,l);
2967 }
2968
2969 static robj *createSetObject(void) {
2970 dict *d = dictCreate(&setDictType,NULL);
2971 return createObject(REDIS_SET,d);
2972 }
2973
2974 static robj *createHashObject(void) {
2975 /* All the Hashes start as zipmaps. Will be automatically converted
2976 * into hash tables if there are enough elements or big elements
2977 * inside. */
2978 unsigned char *zm = zipmapNew();
2979 robj *o = createObject(REDIS_HASH,zm);
2980 o->encoding = REDIS_ENCODING_ZIPMAP;
2981 return o;
2982 }
2983
2984 static robj *createZsetObject(void) {
2985 zset *zs = zmalloc(sizeof(*zs));
2986
2987 zs->dict = dictCreate(&zsetDictType,NULL);
2988 zs->zsl = zslCreate();
2989 return createObject(REDIS_ZSET,zs);
2990 }
2991
2992 static void freeStringObject(robj *o) {
2993 if (o->encoding == REDIS_ENCODING_RAW) {
2994 sdsfree(o->ptr);
2995 }
2996 }
2997
2998 static void freeListObject(robj *o) {
2999 listRelease((list*) o->ptr);
3000 }
3001
3002 static void freeSetObject(robj *o) {
3003 dictRelease((dict*) o->ptr);
3004 }
3005
3006 static void freeZsetObject(robj *o) {
3007 zset *zs = o->ptr;
3008
3009 dictRelease(zs->dict);
3010 zslFree(zs->zsl);
3011 zfree(zs);
3012 }
3013
3014 static void freeHashObject(robj *o) {
3015 switch (o->encoding) {
3016 case REDIS_ENCODING_HT:
3017 dictRelease((dict*) o->ptr);
3018 break;
3019 case REDIS_ENCODING_ZIPMAP:
3020 zfree(o->ptr);
3021 break;
3022 default:
3023 redisPanic("Unknown hash encoding type");
3024 break;
3025 }
3026 }
3027
3028 static void incrRefCount(robj *o) {
3029 o->refcount++;
3030 }
3031
3032 static void decrRefCount(void *obj) {
3033 robj *o = obj;
3034
3035 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3036 /* Object is a key of a swapped out value, or in the process of being
3037 * loaded. */
3038 if (server.vm_enabled &&
3039 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3040 {
3041 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3042 redisAssert(o->type == REDIS_STRING);
3043 freeStringObject(o);
3044 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3045 pthread_mutex_lock(&server.obj_freelist_mutex);
3046 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3047 !listAddNodeHead(server.objfreelist,o))
3048 zfree(o);
3049 pthread_mutex_unlock(&server.obj_freelist_mutex);
3050 server.vm_stats_swapped_objects--;
3051 return;
3052 }
3053 /* Object is in memory, or in the process of being swapped out. */
3054 if (--(o->refcount) == 0) {
3055 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3056 vmCancelThreadedIOJob(obj);
3057 switch(o->type) {
3058 case REDIS_STRING: freeStringObject(o); break;
3059 case REDIS_LIST: freeListObject(o); break;
3060 case REDIS_SET: freeSetObject(o); break;
3061 case REDIS_ZSET: freeZsetObject(o); break;
3062 case REDIS_HASH: freeHashObject(o); break;
3063 default: redisPanic("Unknown object type"); break;
3064 }
3065 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3066 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3067 !listAddNodeHead(server.objfreelist,o))
3068 zfree(o);
3069 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3070 }
3071 }
3072
3073 static robj *lookupKey(redisDb *db, robj *key) {
3074 dictEntry *de = dictFind(db->dict,key);
3075 if (de) {
3076 robj *key = dictGetEntryKey(de);
3077 robj *val = dictGetEntryVal(de);
3078
3079 if (server.vm_enabled) {
3080 if (key->storage == REDIS_VM_MEMORY ||
3081 key->storage == REDIS_VM_SWAPPING)
3082 {
3083 /* If we were swapping the object out, stop it, this key
3084 * was requested. */
3085 if (key->storage == REDIS_VM_SWAPPING)
3086 vmCancelThreadedIOJob(key);
3087 /* Update the access time of the key for the aging algorithm. */
3088 key->vm.atime = server.unixtime;
3089 } else {
3090 int notify = (key->storage == REDIS_VM_LOADING);
3091
3092 /* Our value was swapped on disk. Bring it at home. */
3093 redisAssert(val == NULL);
3094 val = vmLoadObject(key);
3095 dictGetEntryVal(de) = val;
3096
3097 /* Clients blocked by the VM subsystem may be waiting for
3098 * this key... */
3099 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3100 }
3101 }
3102 return val;
3103 } else {
3104 return NULL;
3105 }
3106 }
3107
3108 static robj *lookupKeyRead(redisDb *db, robj *key) {
3109 expireIfNeeded(db,key);
3110 return lookupKey(db,key);
3111 }
3112
3113 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3114 deleteIfVolatile(db,key);
3115 touchWatchedKey(db,key);
3116 return lookupKey(db,key);
3117 }
3118
3119 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3120 robj *o = lookupKeyRead(c->db, key);
3121 if (!o) addReply(c,reply);
3122 return o;
3123 }
3124
3125 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3126 robj *o = lookupKeyWrite(c->db, key);
3127 if (!o) addReply(c,reply);
3128 return o;
3129 }
3130
3131 static int checkType(redisClient *c, robj *o, int type) {
3132 if (o->type != type) {
3133 addReply(c,shared.wrongtypeerr);
3134 return 1;
3135 }
3136 return 0;
3137 }
3138
3139 static int deleteKey(redisDb *db, robj *key) {
3140 int retval;
3141
3142 /* We need to protect key from destruction: after the first dictDelete()
3143 * it may happen that 'key' is no longer valid if we don't increment
3144 * it's count. This may happen when we get the object reference directly
3145 * from the hash table with dictRandomKey() or dict iterators */
3146 incrRefCount(key);
3147 if (dictSize(db->expires)) dictDelete(db->expires,key);
3148 retval = dictDelete(db->dict,key);
3149 decrRefCount(key);
3150
3151 return retval == DICT_OK;
3152 }
3153
3154 /* Check if the nul-terminated string 's' can be represented by a long
3155 * (that is, is a number that fits into long without any other space or
3156 * character before or after the digits).
3157 *
3158 * If so, the function returns REDIS_OK and *longval is set to the value
3159 * of the number. Otherwise REDIS_ERR is returned */
3160 static int isStringRepresentableAsLong(sds s, long *longval) {
3161 char buf[32], *endptr;
3162 long value;
3163 int slen;
3164
3165 value = strtol(s, &endptr, 10);
3166 if (endptr[0] != '\0') return REDIS_ERR;
3167 slen = ll2string(buf,32,value);
3168
3169 /* If the number converted back into a string is not identical
3170 * then it's not possible to encode the string as integer */
3171 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3172 if (longval) *longval = value;
3173 return REDIS_OK;
3174 }
3175
3176 /* Try to encode a string object in order to save space */
3177 static robj *tryObjectEncoding(robj *o) {
3178 long value;
3179 sds s = o->ptr;
3180
3181 if (o->encoding != REDIS_ENCODING_RAW)
3182 return o; /* Already encoded */
3183
3184 /* It's not safe to encode shared objects: shared objects can be shared
3185 * everywhere in the "object space" of Redis. Encoded objects can only
3186 * appear as "values" (and not, for instance, as keys) */
3187 if (o->refcount > 1) return o;
3188
3189 /* Currently we try to encode only strings */
3190 redisAssert(o->type == REDIS_STRING);
3191
3192 /* Check if we can represent this string as a long integer */
3193 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3194
3195 /* Ok, this object can be encoded */
3196 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3197 decrRefCount(o);
3198 incrRefCount(shared.integers[value]);
3199 return shared.integers[value];
3200 } else {
3201 o->encoding = REDIS_ENCODING_INT;
3202 sdsfree(o->ptr);
3203 o->ptr = (void*) value;
3204 return o;
3205 }
3206 }
3207
3208 /* Get a decoded version of an encoded object (returned as a new object).
3209 * If the object is already raw-encoded just increment the ref count. */
3210 static robj *getDecodedObject(robj *o) {
3211 robj *dec;
3212
3213 if (o->encoding == REDIS_ENCODING_RAW) {
3214 incrRefCount(o);
3215 return o;
3216 }
3217 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3218 char buf[32];
3219
3220 ll2string(buf,32,(long)o->ptr);
3221 dec = createStringObject(buf,strlen(buf));
3222 return dec;
3223 } else {
3224 redisPanic("Unknown encoding type");
3225 }
3226 }
3227
3228 /* Compare two string objects via strcmp() or alike.
3229 * Note that the objects may be integer-encoded. In such a case we
3230 * use ll2string() to get a string representation of the numbers on the stack
3231 * and compare the strings, it's much faster than calling getDecodedObject().
3232 *
3233 * Important note: if objects are not integer encoded, but binary-safe strings,
3234 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3235 * binary safe. */
3236 static int compareStringObjects(robj *a, robj *b) {
3237 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3238 char bufa[128], bufb[128], *astr, *bstr;
3239 int bothsds = 1;
3240
3241 if (a == b) return 0;
3242 if (a->encoding != REDIS_ENCODING_RAW) {
3243 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3244 astr = bufa;
3245 bothsds = 0;
3246 } else {
3247 astr = a->ptr;
3248 }
3249 if (b->encoding != REDIS_ENCODING_RAW) {
3250 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3251 bstr = bufb;
3252 bothsds = 0;
3253 } else {
3254 bstr = b->ptr;
3255 }
3256 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3257 }
3258
3259 /* Equal string objects return 1 if the two objects are the same from the
3260 * point of view of a string comparison, otherwise 0 is returned. Note that
3261 * this function is faster then checking for (compareStringObject(a,b) == 0)
3262 * because it can perform some more optimization. */
3263 static int equalStringObjects(robj *a, robj *b) {
3264 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3265 return a->ptr == b->ptr;
3266 } else {
3267 return compareStringObjects(a,b) == 0;
3268 }
3269 }
3270
3271 static size_t stringObjectLen(robj *o) {
3272 redisAssert(o->type == REDIS_STRING);
3273 if (o->encoding == REDIS_ENCODING_RAW) {
3274 return sdslen(o->ptr);
3275 } else {
3276 char buf[32];
3277
3278 return ll2string(buf,32,(long)o->ptr);
3279 }
3280 }
3281
3282 static int getDoubleFromObject(robj *o, double *target) {
3283 double value;
3284 char *eptr;
3285
3286 if (o == NULL) {
3287 value = 0;
3288 } else {
3289 redisAssert(o->type == REDIS_STRING);
3290 if (o->encoding == REDIS_ENCODING_RAW) {
3291 value = strtod(o->ptr, &eptr);
3292 if (eptr[0] != '\0') return REDIS_ERR;
3293 } else if (o->encoding == REDIS_ENCODING_INT) {
3294 value = (long)o->ptr;
3295 } else {
3296 redisPanic("Unknown string encoding");
3297 }
3298 }
3299
3300 *target = value;
3301 return REDIS_OK;
3302 }
3303
3304 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3305 double value;
3306 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3307 if (msg != NULL) {
3308 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3309 } else {
3310 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3311 }
3312 return REDIS_ERR;
3313 }
3314
3315 *target = value;
3316 return REDIS_OK;
3317 }
3318
3319 static int getLongLongFromObject(robj *o, long long *target) {
3320 long long value;
3321 char *eptr;
3322
3323 if (o == NULL) {
3324 value = 0;
3325 } else {
3326 redisAssert(o->type == REDIS_STRING);
3327 if (o->encoding == REDIS_ENCODING_RAW) {
3328 value = strtoll(o->ptr, &eptr, 10);
3329 if (eptr[0] != '\0') return REDIS_ERR;
3330 } else if (o->encoding == REDIS_ENCODING_INT) {
3331 value = (long)o->ptr;
3332 } else {
3333 redisPanic("Unknown string encoding");
3334 }
3335 }
3336
3337 *target = value;
3338 return REDIS_OK;
3339 }
3340
3341 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3342 long long value;
3343 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3344 if (msg != NULL) {
3345 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3346 } else {
3347 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3348 }
3349 return REDIS_ERR;
3350 }
3351
3352 *target = value;
3353 return REDIS_OK;
3354 }
3355
3356 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3357 long long value;
3358
3359 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3360 if (value < LONG_MIN || value > LONG_MAX) {
3361 if (msg != NULL) {
3362 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3363 } else {
3364 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3365 }
3366 return REDIS_ERR;
3367 }
3368
3369 *target = value;
3370 return REDIS_OK;
3371 }
3372
3373 /*============================ RDB saving/loading =========================== */
3374
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3379
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3385
3386 /* check rdbLoadLen() comments for more info */
3387 static int rdbSaveLen(FILE *fp, uint32_t len) {
3388 unsigned char buf[2];
3389
3390 if (len < (1<<6)) {
3391 /* Save a 6 bit len */
3392 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3393 if (fwrite(buf,1,1,fp) == 0) return -1;
3394 } else if (len < (1<<14)) {
3395 /* Save a 14 bit len */
3396 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3397 buf[1] = len&0xFF;
3398 if (fwrite(buf,2,1,fp) == 0) return -1;
3399 } else {
3400 /* Save a 32 bit len */
3401 buf[0] = (REDIS_RDB_32BITLEN<<6);
3402 if (fwrite(buf,1,1,fp) == 0) return -1;
3403 len = htonl(len);
3404 if (fwrite(&len,4,1,fp) == 0) return -1;
3405 }
3406 return 0;
3407 }
3408
3409 /* Encode 'value' as an integer if possible (if integer will fit the
3410 * supported range). If the function sucessful encoded the integer
3411 * then the (up to 5 bytes) encoded representation is written in the
3412 * string pointed by 'enc' and the length is returned. Otherwise
3413 * 0 is returned. */
3414 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3415 /* Finally check if it fits in our ranges */
3416 if (value >= -(1<<7) && value <= (1<<7)-1) {
3417 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3418 enc[1] = value&0xFF;
3419 return 2;
3420 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3421 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3422 enc[1] = value&0xFF;
3423 enc[2] = (value>>8)&0xFF;
3424 return 3;
3425 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3426 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3427 enc[1] = value&0xFF;
3428 enc[2] = (value>>8)&0xFF;
3429 enc[3] = (value>>16)&0xFF;
3430 enc[4] = (value>>24)&0xFF;
3431 return 5;
3432 } else {
3433 return 0;
3434 }
3435 }
3436
/* String objects in the form "2391" "-100", without any space and with a
 * value that fits in an 8, 16 or 32 bit signed integer, can be encoded as
 * integers to save space. Returns the length of the encoding written into
 * 'enc', or 0 if the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char roundtrip[32], *end;
    long long parsed;

    /* Check if it's possible to parse the whole string as a number */
    parsed = strtoll(s, &end, 10);
    if (*end != '\0') return 0;

    /* Convert the number back into a string: unless the result is
     * identical to the input, the string can't be encoded as integer. */
    ll2string(roundtrip,32,parsed);
    if (strlen(roundtrip) != len) return 0;
    if (memcmp(roundtrip,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3455
3456 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3457 size_t comprlen, outlen;
3458 unsigned char byte;
3459 void *out;
3460
3461 /* We require at least four bytes compression for this to be worth it */
3462 if (len <= 4) return 0;
3463 outlen = len-4;
3464 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3465 comprlen = lzf_compress(s, len, out, outlen);
3466 if (comprlen == 0) {
3467 zfree(out);
3468 return 0;
3469 }
3470 /* Data compressed! Let's save it on disk */
3471 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3472 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3473 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3474 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3475 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3476 zfree(out);
3477 return comprlen;
3478
3479 writeerr:
3480 zfree(out);
3481 return -1;
3482 }
3483
3484 /* Save a string objet as [len][data] on disk. If the object is a string
3485 * representation of an integer value we try to safe it in a special form */
3486 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3487 int enclen;
3488
3489 /* Try integer encoding */
3490 if (len <= 11) {
3491 unsigned char buf[5];
3492 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3493 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3494 return 0;
3495 }
3496 }
3497
3498 /* Try LZF compression - under 20 bytes it's unable to compress even
3499 * aaaaaaaaaaaaaaaaaa so skip it */
3500 if (server.rdbcompression && len > 20) {
3501 int retval;
3502
3503 retval = rdbSaveLzfStringObject(fp,s,len);
3504 if (retval == -1) return -1;
3505 if (retval > 0) return 0;
3506 /* retval == 0 means data can't be compressed, save the old way */
3507 }
3508
3509 /* Store verbatim */
3510 if (rdbSaveLen(fp,len) == -1) return -1;
3511 if (len && fwrite(s,len,1,fp) == 0) return -1;
3512 return 0;
3513 }
3514
3515 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3516 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3517 int retval;
3518
3519 /* Avoid to decode the object, then encode it again, if the
3520 * object is alrady integer encoded. */
3521 if (obj->encoding == REDIS_ENCODING_INT) {
3522 long val = (long) obj->ptr;
3523 unsigned char buf[5];
3524 int enclen;
3525
3526 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3527 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3528 return 0;
3529 }
3530 /* otherwise... fall throught and continue with the usual
3531 * code path. */
3532 }
3533
3534 /* Avoid incr/decr ref count business when possible.
3535 * This plays well with copy-on-write given that we are probably
3536 * in a child process (BGSAVE). Also this makes sure key objects
3537 * of swapped objects are not incRefCount-ed (an assert does not allow
3538 * this in order to avoid bugs) */
3539 if (obj->encoding != REDIS_ENCODING_RAW) {
3540 obj = getDecodedObject(obj);
3541 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3542 decrRefCount(obj);
3543 } else {
3544 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3545 }
3546 return retval;
3547 }
3548
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3591
3592 /* Save a Redis object. */
3593 static int rdbSaveObject(FILE *fp, robj *o) {
3594 if (o->type == REDIS_STRING) {
3595 /* Save a string value */
3596 if (rdbSaveStringObject(fp,o) == -1) return -1;
3597 } else if (o->type == REDIS_LIST) {
3598 /* Save a list value */
3599 list *list = o->ptr;
3600 listIter li;
3601 listNode *ln;
3602
3603 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3604 listRewind(list,&li);
3605 while((ln = listNext(&li))) {
3606 robj *eleobj = listNodeValue(ln);
3607
3608 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3609 }
3610 } else if (o->type == REDIS_SET) {
3611 /* Save a set value */
3612 dict *set = o->ptr;
3613 dictIterator *di = dictGetIterator(set);
3614 dictEntry *de;
3615
3616 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3617 while((de = dictNext(di)) != NULL) {
3618 robj *eleobj = dictGetEntryKey(de);
3619
3620 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3621 }
3622 dictReleaseIterator(di);
3623 } else if (o->type == REDIS_ZSET) {
3624 /* Save a set value */
3625 zset *zs = o->ptr;
3626 dictIterator *di = dictGetIterator(zs->dict);
3627 dictEntry *de;
3628
3629 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3630 while((de = dictNext(di)) != NULL) {
3631 robj *eleobj = dictGetEntryKey(de);
3632 double *score = dictGetEntryVal(de);
3633
3634 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3635 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3636 }
3637 dictReleaseIterator(di);
3638 } else if (o->type == REDIS_HASH) {
3639 /* Save a hash value */
3640 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3641 unsigned char *p = zipmapRewind(o->ptr);
3642 unsigned int count = zipmapLen(o->ptr);
3643 unsigned char *key, *val;
3644 unsigned int klen, vlen;
3645
3646 if (rdbSaveLen(fp,count) == -1) return -1;
3647 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3648 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3649 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3650 }
3651 } else {
3652 dictIterator *di = dictGetIterator(o->ptr);
3653 dictEntry *de;
3654
3655 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3656 while((de = dictNext(di)) != NULL) {
3657 robj *key = dictGetEntryKey(de);
3658 robj *val = dictGetEntryVal(de);
3659
3660 if (rdbSaveStringObject(fp,key) == -1) return -1;
3661 if (rdbSaveStringObject(fp,val) == -1) return -1;
3662 }
3663 dictReleaseIterator(di);
3664 }
3665 } else {
3666 redisPanic("Unknown object type");
3667 }
3668 return 0;
3669 }
3670
3671 /* Return the length the object will have on disk if saved with
3672 * the rdbSaveObject() function. Currently we use a trick to get
3673 * this length with very little changes to the code. In the future
3674 * we could switch to a faster solution. */
3675 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3676 if (fp == NULL) fp = server.devnull;
3677 rewind(fp);
3678 assert(rdbSaveObject(fp,o) != 1);
3679 return ftello(fp);
3680 }
3681
3682 /* Return the number of pages required to save this object in the swap file */
3683 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3684 off_t bytes = rdbSavedObjectLen(o,fp);
3685
3686 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3687 }
3688
3689 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3690 static int rdbSave(char *filename) {
3691 dictIterator *di = NULL;
3692 dictEntry *de;
3693 FILE *fp;
3694 char tmpfile[256];
3695 int j;
3696 time_t now = time(NULL);
3697
3698 /* Wait for I/O therads to terminate, just in case this is a
3699 * foreground-saving, to avoid seeking the swap file descriptor at the
3700 * same time. */
3701 if (server.vm_enabled)
3702 waitEmptyIOJobsQueue();
3703
3704 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3705 fp = fopen(tmpfile,"w");
3706 if (!fp) {
3707 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3708 return REDIS_ERR;
3709 }
3710 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3711 for (j = 0; j < server.dbnum; j++) {
3712 redisDb *db = server.db+j;
3713 dict *d = db->dict;
3714 if (dictSize(d) == 0) continue;
3715 di = dictGetIterator(d);
3716 if (!di) {
3717 fclose(fp);
3718 return REDIS_ERR;
3719 }
3720
3721 /* Write the SELECT DB opcode */
3722 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3723 if (rdbSaveLen(fp,j) == -1) goto werr;
3724
3725 /* Iterate this DB writing every entry */
3726 while((de = dictNext(di)) != NULL) {
3727 robj *key = dictGetEntryKey(de);
3728 robj *o = dictGetEntryVal(de);
3729 time_t expiretime = getExpire(db,key);
3730
3731 /* Save the expire time */
3732 if (expiretime != -1) {
3733 /* If this key is already expired skip it */
3734 if (expiretime < now) continue;
3735 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3736 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3737 }
3738 /* Save the key and associated value. This requires special
3739 * handling if the value is swapped out. */
3740 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3741 key->storage == REDIS_VM_SWAPPING) {
3742 /* Save type, key, value */
3743 if (rdbSaveType(fp,o->type) == -1) goto werr;
3744 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3745 if (rdbSaveObject(fp,o) == -1) goto werr;
3746 } else {
3747 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3748 robj *po;
3749 /* Get a preview of the object in memory */
3750 po = vmPreviewObject(key);
3751 /* Save type, key, value */
3752 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3753 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3754 if (rdbSaveObject(fp,po) == -1) goto werr;
3755 /* Remove the loaded object from memory */
3756 decrRefCount(po);
3757 }
3758 }
3759 dictReleaseIterator(di);
3760 }
3761 /* EOF opcode */
3762 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3763
3764 /* Make sure data will not remain on the OS's output buffers */
3765 fflush(fp);
3766 fsync(fileno(fp));
3767 fclose(fp);
3768
3769 /* Use RENAME to make sure the DB file is changed atomically only
3770 * if the generate DB file is ok. */
3771 if (rename(tmpfile,filename) == -1) {
3772 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3773 unlink(tmpfile);
3774 return REDIS_ERR;
3775 }
3776 redisLog(REDIS_NOTICE,"DB saved on disk");
3777 server.dirty = 0;
3778 server.lastsave = time(NULL);
3779 return REDIS_OK;
3780
3781 werr:
3782 fclose(fp);
3783 unlink(tmpfile);
3784 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3785 if (di) dictReleaseIterator(di);
3786 return REDIS_ERR;
3787 }
3788
3789 static int rdbSaveBackground(char *filename) {
3790 pid_t childpid;
3791
3792 if (server.bgsavechildpid != -1) return REDIS_ERR;
3793 if (server.vm_enabled) waitEmptyIOJobsQueue();
3794 if ((childpid = fork()) == 0) {
3795 /* Child */
3796 if (server.vm_enabled) vmReopenSwapFile();
3797 close(server.fd);
3798 if (rdbSave(filename) == REDIS_OK) {
3799 _exit(0);
3800 } else {
3801 _exit(1);
3802 }
3803 } else {
3804 /* Parent */
3805 if (childpid == -1) {
3806 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3807 strerror(errno));
3808 return REDIS_ERR;
3809 }
3810 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3811 server.bgsavechildpid = childpid;
3812 updateDictResizePolicy();
3813 return REDIS_OK;
3814 }
3815 return REDIS_OK; /* unreached */
3816 }
3817
/* Unlink "temp-<childpid>.rdb", the temporary dump file name rdbSave()
 * builds from the pid of the process writing it. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3824
/* Read a one byte type/opcode from 'fp'. Returns the byte value (0-255),
 * or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3830
/* Read a 4 byte time value from 'fp', stored in the host representation of
 * an int32_t. Returns it as time_t, or -1 on short read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3836
3837 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3838 * of this file for a description of how this are stored on disk.
3839 *
3840 * isencoded is set to 1 if the readed length is not actually a length but
3841 * an "encoding type", check the above comments for more info */
3842 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3843 unsigned char buf[2];
3844 uint32_t len;
3845 int type;
3846
3847 if (isencoded) *isencoded = 0;
3848 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3849 type = (buf[0]&0xC0)>>6;
3850 if (type == REDIS_RDB_6BITLEN) {
3851 /* Read a 6 bit len */
3852 return buf[0]&0x3F;
3853 } else if (type == REDIS_RDB_ENCVAL) {
3854 /* Read a 6 bit len encoding type */
3855 if (isencoded) *isencoded = 1;
3856 return buf[0]&0x3F;
3857 } else if (type == REDIS_RDB_14BITLEN) {
3858 /* Read a 14 bit len */
3859 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3860 return ((buf[0]&0x3F)<<8)|buf[1];
3861 } else {
3862 /* Read a 32 bit len */
3863 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3864 return ntohl(len);
3865 }
3866 }
3867
3868 /* Load an integer-encoded object from file 'fp', with the specified
3869 * encoding type 'enctype'. If encode is true the function may return
3870 * an integer-encoded object as reply, otherwise the returned object
3871 * will always be encoded as a raw string. */
3872 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3873 unsigned char enc[4];
3874 long long val;
3875
3876 if (enctype == REDIS_RDB_ENC_INT8) {
3877 if (fread(enc,1,1,fp) == 0) return NULL;
3878 val = (signed char)enc[0];
3879 } else if (enctype == REDIS_RDB_ENC_INT16) {
3880 uint16_t v;
3881 if (fread(enc,2,1,fp) == 0) return NULL;
3882 v = enc[0]|(enc[1]<<8);
3883 val = (int16_t)v;
3884 } else if (enctype == REDIS_RDB_ENC_INT32) {
3885 uint32_t v;
3886 if (fread(enc,4,1,fp) == 0) return NULL;
3887 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3888 val = (int32_t)v;
3889 } else {
3890 val = 0; /* anti-warning */
3891 redisPanic("Unknown RDB integer encoding type");
3892 }
3893 if (encode)
3894 return createStringObjectFromLongLong(val);
3895 else
3896 return createObject(REDIS_STRING,sdsfromlonglong(val));
3897 }
3898
3899 static robj *rdbLoadLzfStringObject(FILE*fp) {
3900 unsigned int len, clen;
3901 unsigned char *c = NULL;
3902 sds val = NULL;
3903
3904 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3905 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3906 if ((c = zmalloc(clen)) == NULL) goto err;
3907 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3908 if (fread(c,clen,1,fp) == 0) goto err;
3909 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3910 zfree(c);
3911 return createObject(REDIS_STRING,val);
3912 err:
3913 zfree(c);
3914 sdsfree(val);
3915 return NULL;
3916 }
3917
3918 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3919 int isencoded;
3920 uint32_t len;
3921 sds val;
3922
3923 len = rdbLoadLen(fp,&isencoded);
3924 if (isencoded) {
3925 switch(len) {
3926 case REDIS_RDB_ENC_INT8:
3927 case REDIS_RDB_ENC_INT16:
3928 case REDIS_RDB_ENC_INT32:
3929 return rdbLoadIntegerObject(fp,len,encode);
3930 case REDIS_RDB_ENC_LZF:
3931 return rdbLoadLzfStringObject(fp);
3932 default:
3933 redisPanic("Unknown RDB encoding type");
3934 }
3935 }
3936
3937 if (len == REDIS_RDB_LENERR) return NULL;
3938 val = sdsnewlen(NULL,len);
3939 if (len && fread(val,len,1,fp) == 0) {
3940 sdsfree(val);
3941 return NULL;
3942 }
3943 return createObject(REDIS_STRING,val);
3944 }
3945
3946 static robj *rdbLoadStringObject(FILE *fp) {
3947 return rdbGenericLoadStringObject(fp,0);
3948 }
3949
3950 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3951 return rdbGenericLoadStringObject(fp,1);
3952 }
3953
3954 /* For information about double serialization check rdbSaveDoubleValue() */
3955 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3956 char buf[128];
3957 unsigned char len;
3958
3959 if (fread(&len,1,1,fp) == 0) return -1;
3960 switch(len) {
3961 case 255: *val = R_NegInf; return 0;
3962 case 254: *val = R_PosInf; return 0;
3963 case 253: *val = R_Nan; return 0;
3964 default:
3965 if (fread(buf,len,1,fp) == 0) return -1;
3966 buf[len] = '\0';
3967 sscanf(buf, "%lg", val);
3968 return 0;
3969 }
3970 }
3971
3972 /* Load a Redis object of the specified type from the specified file.
3973 * On success a newly allocated object is returned, otherwise NULL. */
3974 static robj *rdbLoadObject(int type, FILE *fp) {
3975 robj *o;
3976
3977 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3978 if (type == REDIS_STRING) {
3979 /* Read string value */
3980 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3981 o = tryObjectEncoding(o);
3982 } else if (type == REDIS_LIST || type == REDIS_SET) {
3983 /* Read list/set value */
3984 uint32_t listlen;
3985
3986 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3987 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3988 /* It's faster to expand the dict to the right size asap in order
3989 * to avoid rehashing */
3990 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3991 dictExpand(o->ptr,listlen);
3992 /* Load every single element of the list/set */
3993 while(listlen--) {
3994 robj *ele;
3995
3996 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3997 ele = tryObjectEncoding(ele);
3998 if (type == REDIS_LIST) {
3999 listAddNodeTail((list*)o->ptr,ele);
4000 } else {
4001 dictAdd((dict*)o->ptr,ele,NULL);
4002 }
4003 }
4004 } else if (type == REDIS_ZSET) {
4005 /* Read list/set value */
4006 size_t zsetlen;
4007 zset *zs;
4008
4009 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4010 o = createZsetObject();
4011 zs = o->ptr;
4012 /* Load every single element of the list/set */
4013 while(zsetlen--) {
4014 robj *ele;
4015 double *score = zmalloc(sizeof(double));
4016
4017 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4018 ele = tryObjectEncoding(ele);
4019 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4020 dictAdd(zs->dict,ele,score);
4021 zslInsert(zs->zsl,*score,ele);
4022 incrRefCount(ele); /* added to skiplist */
4023 }
4024 } else if (type == REDIS_HASH) {
4025 size_t hashlen;
4026
4027 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4028 o = createHashObject();
4029 /* Too many entries? Use an hash table. */
4030 if (hashlen > server.hash_max_zipmap_entries)
4031 convertToRealHash(o);
4032 /* Load every key/value, then set it into the zipmap or hash
4033 * table, as needed. */
4034 while(hashlen--) {
4035 robj *key, *val;
4036
4037 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4038 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4039 /* If we are using a zipmap and there are too big values
4040 * the object is converted to real hash table encoding. */
4041 if (o->encoding != REDIS_ENCODING_HT &&
4042 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4043 sdslen(val->ptr) > server.hash_max_zipmap_value))
4044 {
4045 convertToRealHash(o);
4046 }
4047
4048 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4049 unsigned char *zm = o->ptr;
4050
4051 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4052 val->ptr,sdslen(val->ptr),NULL);
4053 o->ptr = zm;
4054 decrRefCount(key);
4055 decrRefCount(val);
4056 } else {
4057 key = tryObjectEncoding(key);
4058 val = tryObjectEncoding(val);
4059 dictAdd((dict*)o->ptr,key,val);
4060 }
4061 }
4062 } else {
4063 redisPanic("Unknown object type");
4064 }
4065 return o;
4066 }
4067
4068 static int rdbLoad(char *filename) {
4069 FILE *fp;
4070 uint32_t dbid;
4071 int type, retval, rdbver;
4072 int swap_all_values = 0;
4073 dict *d = server.db[0].dict;
4074 redisDb *db = server.db+0;
4075 char buf[1024];
4076 time_t expiretime, now = time(NULL);
4077 long long loadedkeys = 0;
4078
4079 fp = fopen(filename,"r");
4080 if (!fp) return REDIS_ERR;
4081 if (fread(buf,9,1,fp) == 0) goto eoferr;
4082 buf[9] = '\0';
4083 if (memcmp(buf,"REDIS",5) != 0) {
4084 fclose(fp);
4085 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4086 return REDIS_ERR;
4087 }
4088 rdbver = atoi(buf+5);
4089 if (rdbver != 1) {
4090 fclose(fp);
4091 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4092 return REDIS_ERR;
4093 }
4094 while(1) {
4095 robj *key, *val;
4096
4097 expiretime = -1;
4098 /* Read type. */
4099 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4100 if (type == REDIS_EXPIRETIME) {
4101 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4102 /* We read the time so we need to read the object type again */
4103 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4104 }
4105 if (type == REDIS_EOF) break;
4106 /* Handle SELECT DB opcode as a special case */
4107 if (type == REDIS_SELECTDB) {
4108 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4109 goto eoferr;
4110 if (dbid >= (unsigned)server.dbnum) {
4111 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4112 exit(1);
4113 }
4114 db = server.db+dbid;
4115 d = db->dict;
4116 continue;
4117 }
4118 /* Read key */
4119 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4120 /* Read value */
4121 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4122 /* Check if the key already expired */
4123 if (expiretime != -1 && expiretime < now) {
4124 decrRefCount(key);
4125 decrRefCount(val);
4126 continue;
4127 }
4128 /* Add the new object in the hash table */
4129 retval = dictAdd(d,key,val);
4130 if (retval == DICT_ERR) {
4131 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4132 exit(1);
4133 }
4134 loadedkeys++;
4135 /* Set the expire time if needed */
4136 if (expiretime != -1) setExpire(db,key,expiretime);
4137
4138 /* Handle swapping while loading big datasets when VM is on */
4139
4140 /* If we detecter we are hopeless about fitting something in memory
4141 * we just swap every new key on disk. Directly...
4142 * Note that's important to check for this condition before resorting
4143 * to random sampling, otherwise we may try to swap already
4144 * swapped keys. */
4145 if (swap_all_values) {
4146 dictEntry *de = dictFind(d,key);
4147
4148 /* de may be NULL since the key already expired */
4149 if (de) {
4150 key = dictGetEntryKey(de);
4151 val = dictGetEntryVal(de);
4152
4153 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4154 dictGetEntryVal(de) = NULL;
4155 }
4156 }
4157 continue;
4158 }
4159
4160 /* If we have still some hope of having some value fitting memory
4161 * then we try random sampling. */
4162 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4163 while (zmalloc_used_memory() > server.vm_max_memory) {
4164 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4165 }
4166 if (zmalloc_used_memory() > server.vm_max_memory)
4167 swap_all_values = 1; /* We are already using too much mem */
4168 }
4169 }
4170 fclose(fp);
4171 return REDIS_OK;
4172
4173 eoferr: /* unexpected end of file is handled here with a fatal exit */
4174 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4175 exit(1);
4176 return REDIS_ERR; /* Just to avoid warning */
4177 }
4178
4179 /*================================== Shutdown =============================== */
4180 static int prepareForShutdown() {
4181 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4182 /* Kill the saving child if there is a background saving in progress.
4183 We want to avoid race conditions, for instance our saving child may
4184 overwrite the synchronous saving did by SHUTDOWN. */
4185 if (server.bgsavechildpid != -1) {
4186 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4187 kill(server.bgsavechildpid,SIGKILL);
4188 rdbRemoveTempFile(server.bgsavechildpid);
4189 }
4190 if (server.appendonly) {
4191 /* Append only file: fsync() the AOF and exit */
4192 fsync(server.appendfd);
4193 if (server.vm_enabled) unlink(server.vm_swap_file);
4194 } else {
4195 /* Snapshotting. Perform a SYNC SAVE and exit */
4196 if (rdbSave(server.dbfilename) == REDIS_OK) {
4197 if (server.daemonize)
4198 unlink(server.pidfile);
4199 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4200 } else {
4201 /* Ooops.. error saving! The best we can do is to continue
4202 * operating. Note that if there was a background saving process,
4203 * in the next cron() Redis will be notified that the background
4204 * saving aborted, handling special stuff like slaves pending for
4205 * synchronization... */
4206 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4207 return REDIS_ERR;
4208 }
4209 }
4210 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4211 return REDIS_OK;
4212 }
4213
4214 /*================================== Commands =============================== */
4215
4216 static void authCommand(redisClient *c) {
4217 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4218 c->authenticated = 1;
4219 addReply(c,shared.ok);
4220 } else {
4221 c->authenticated = 0;
4222 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4223 }
4224 }
4225
4226 static void pingCommand(redisClient *c) {
4227 addReply(c,shared.pong);
4228 }
4229
4230 static void echoCommand(redisClient *c) {
4231 addReplyBulk(c,c->argv[1]);
4232 }
4233
4234 /*=================================== Strings =============================== */
4235
4236 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4237 int retval;
4238 long seconds = 0; /* initialized to avoid an harmness warning */
4239
4240 if (expire) {
4241 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4242 return;
4243 if (seconds <= 0) {
4244 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4245 return;
4246 }
4247 }
4248
4249 touchWatchedKey(c->db,key);
4250 if (nx) deleteIfVolatile(c->db,key);
4251 retval = dictAdd(c->db->dict,key,val);
4252 if (retval == DICT_ERR) {
4253 if (!nx) {
4254 /* If the key is about a swapped value, we want a new key object
4255 * to overwrite the old. So we delete the old key in the database.
4256 * This will also make sure that swap pages about the old object
4257 * will be marked as free. */
4258 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4259 incrRefCount(key);
4260 dictReplace(c->db->dict,key,val);
4261 incrRefCount(val);
4262 } else {
4263 addReply(c,shared.czero);
4264 return;
4265 }
4266 } else {
4267 incrRefCount(key);
4268 incrRefCount(val);
4269 }
4270 server.dirty++;
4271 removeExpire(c->db,key);
4272 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4273 addReply(c, nx ? shared.cone : shared.ok);
4274 }
4275
4276 static void setCommand(redisClient *c) {
4277 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4278 }
4279
4280 static void setnxCommand(redisClient *c) {
4281 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4282 }
4283
4284 static void setexCommand(redisClient *c) {
4285 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4286 }
4287
4288 static int getGenericCommand(redisClient *c) {
4289 robj *o;
4290
4291 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4292 return REDIS_OK;
4293
4294 if (o->type != REDIS_STRING) {
4295 addReply(c,shared.wrongtypeerr);
4296 return REDIS_ERR;
4297 } else {
4298 addReplyBulk(c,o);
4299 return REDIS_OK;
4300 }
4301 }
4302
4303 static void getCommand(redisClient *c) {
4304 getGenericCommand(c);
4305 }
4306
4307 static void getsetCommand(redisClient *c) {
4308 if (getGenericCommand(c) == REDIS_ERR) return;
4309 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4310 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4311 } else {
4312 incrRefCount(c->argv[1]);
4313 }
4314 incrRefCount(c->argv[2]);
4315 server.dirty++;
4316 removeExpire(c->db,c->argv[1]);
4317 }
4318
4319 static void mgetCommand(redisClient *c) {
4320 int j;
4321
4322 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4323 for (j = 1; j < c->argc; j++) {
4324 robj *o = lookupKeyRead(c->db,c->argv[j]);
4325 if (o == NULL) {
4326 addReply(c,shared.nullbulk);
4327 } else {
4328 if (o->type != REDIS_STRING) {
4329 addReply(c,shared.nullbulk);
4330 } else {
4331 addReplyBulk(c,o);
4332 }
4333 }
4334 }
4335 }
4336
4337 static void msetGenericCommand(redisClient *c, int nx) {
4338 int j, busykeys = 0;
4339
4340 if ((c->argc % 2) == 0) {
4341 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4342 return;
4343 }
4344 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4345 * set nothing at all if at least one already key exists. */
4346 if (nx) {
4347 for (j = 1; j < c->argc; j += 2) {
4348 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4349 busykeys++;
4350 }
4351 }
4352 }
4353 if (busykeys) {
4354 addReply(c, shared.czero);
4355 return;
4356 }
4357
4358 for (j = 1; j < c->argc; j += 2) {
4359 int retval;
4360
4361 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4362 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4363 if (retval == DICT_ERR) {
4364 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4365 incrRefCount(c->argv[j+1]);
4366 } else {
4367 incrRefCount(c->argv[j]);
4368 incrRefCount(c->argv[j+1]);
4369 }
4370 removeExpire(c->db,c->argv[j]);
4371 }
4372 server.dirty += (c->argc-1)/2;
4373 addReply(c, nx ? shared.cone : shared.ok);
4374 }
4375
4376 static void msetCommand(redisClient *c) {
4377 msetGenericCommand(c,0);
4378 }
4379
4380 static void msetnxCommand(redisClient *c) {
4381 msetGenericCommand(c,1);
4382 }
4383
4384 static void incrDecrCommand(redisClient *c, long long incr) {
4385 long long value;
4386 int retval;
4387 robj *o;
4388
4389 o = lookupKeyWrite(c->db,c->argv[1]);
4390 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4391 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4392
4393 value += incr;
4394 o = createStringObjectFromLongLong(value);
4395 retval = dictAdd(c->db->dict,c->argv[1],o);
4396 if (retval == DICT_ERR) {
4397 dictReplace(c->db->dict,c->argv[1],o);
4398 removeExpire(c->db,c->argv[1]);
4399 } else {
4400 incrRefCount(c->argv[1]);
4401 }
4402 server.dirty++;
4403 addReply(c,shared.colon);
4404 addReply(c,o);
4405 addReply(c,shared.crlf);
4406 }
4407
4408 static void incrCommand(redisClient *c) {
4409 incrDecrCommand(c,1);
4410 }
4411
4412 static void decrCommand(redisClient *c) {
4413 incrDecrCommand(c,-1);
4414 }
4415
4416 static void incrbyCommand(redisClient *c) {
4417 long long incr;
4418
4419 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4420 incrDecrCommand(c,incr);
4421 }
4422
4423 static void decrbyCommand(redisClient *c) {
4424 long long incr;
4425
4426 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4427 incrDecrCommand(c,-incr);
4428 }
4429
4430 static void appendCommand(redisClient *c) {
4431 int retval;
4432 size_t totlen;
4433 robj *o;
4434
4435 o = lookupKeyWrite(c->db,c->argv[1]);
4436 if (o == NULL) {
4437 /* Create the key */
4438 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4439 incrRefCount(c->argv[1]);
4440 incrRefCount(c->argv[2]);
4441 totlen = stringObjectLen(c->argv[2]);
4442 } else {
4443 dictEntry *de;
4444
4445 de = dictFind(c->db->dict,c->argv[1]);
4446 assert(de != NULL);
4447
4448 o = dictGetEntryVal(de);
4449 if (o->type != REDIS_STRING) {
4450 addReply(c,shared.wrongtypeerr);
4451 return;
4452 }
4453 /* If the object is specially encoded or shared we have to make
4454 * a copy */
4455 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4456 robj *decoded = getDecodedObject(o);
4457
4458 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4459 decrRefCount(decoded);
4460 dictReplace(c->db->dict,c->argv[1],o);
4461 }
4462 /* APPEND! */
4463 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4464 o->ptr = sdscatlen(o->ptr,
4465 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4466 } else {
4467 o->ptr = sdscatprintf(o->ptr, "%ld",
4468 (unsigned long) c->argv[2]->ptr);
4469 }
4470 totlen = sdslen(o->ptr);
4471 }
4472 server.dirty++;
4473 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4474 }
4475
4476 static void substrCommand(redisClient *c) {
4477 robj *o;
4478 long start = atoi(c->argv[2]->ptr);
4479 long end = atoi(c->argv[3]->ptr);
4480 size_t rangelen, strlen;
4481 sds range;
4482
4483 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4484 checkType(c,o,REDIS_STRING)) return;
4485
4486 o = getDecodedObject(o);
4487 strlen = sdslen(o->ptr);
4488
4489 /* convert negative indexes */
4490 if (start < 0) start = strlen+start;
4491 if (end < 0) end = strlen+end;
4492 if (start < 0) start = 0;
4493 if (end < 0) end = 0;
4494
4495 /* indexes sanity checks */
4496 if (start > end || (size_t)start >= strlen) {
4497 /* Out of range start or start > end result in null reply */
4498 addReply(c,shared.nullbulk);
4499 decrRefCount(o);
4500 return;
4501 }
4502 if ((size_t)end >= strlen) end = strlen-1;
4503 rangelen = (end-start)+1;
4504
4505 /* Return the result */
4506 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4507 range = sdsnewlen((char*)o->ptr+start,rangelen);
4508 addReplySds(c,range);
4509 addReply(c,shared.crlf);
4510 decrRefCount(o);
4511 }
4512
4513 /* ========================= Type agnostic commands ========================= */
4514
4515 static void delCommand(redisClient *c) {
4516 int deleted = 0, j;
4517
4518 for (j = 1; j < c->argc; j++) {
4519 if (deleteKey(c->db,c->argv[j])) {
4520 touchWatchedKey(c->db,c->argv[j]);
4521 server.dirty++;
4522 deleted++;
4523 }
4524 }
4525 addReplyLongLong(c,deleted);
4526 }
4527
4528 static void existsCommand(redisClient *c) {
4529 expireIfNeeded(c->db,c->argv[1]);
4530 if (dictFind(c->db->dict,c->argv[1])) {
4531 addReply(c, shared.cone);
4532 } else {
4533 addReply(c, shared.czero);
4534 }
4535 }
4536
4537 static void selectCommand(redisClient *c) {
4538 int id = atoi(c->argv[1]->ptr);
4539
4540 if (selectDb(c,id) == REDIS_ERR) {
4541 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4542 } else {
4543 addReply(c,shared.ok);
4544 }
4545 }
4546
4547 static void randomkeyCommand(redisClient *c) {
4548 dictEntry *de;
4549 robj *key;
4550
4551 while(1) {
4552 de = dictGetRandomKey(c->db->dict);
4553 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4554 }
4555
4556 if (de == NULL) {
4557 addReply(c,shared.nullbulk);
4558 return;
4559 }
4560
4561 key = dictGetEntryKey(de);
4562 if (server.vm_enabled) {
4563 key = dupStringObject(key);
4564 addReplyBulk(c,key);
4565 decrRefCount(key);
4566 } else {
4567 addReplyBulk(c,key);
4568 }
4569 }
4570
4571 static void keysCommand(redisClient *c) {
4572 dictIterator *di;
4573 dictEntry *de;
4574 sds pattern = c->argv[1]->ptr;
4575 int plen = sdslen(pattern);
4576 unsigned long numkeys = 0;
4577 robj *lenobj = createObject(REDIS_STRING,NULL);
4578
4579 di = dictGetIterator(c->db->dict);
4580 addReply(c,lenobj);
4581 decrRefCount(lenobj);
4582 while((de = dictNext(di)) != NULL) {
4583 robj *keyobj = dictGetEntryKey(de);
4584
4585 sds key = keyobj->ptr;
4586 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4587 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4588 if (expireIfNeeded(c->db,keyobj) == 0) {
4589 addReplyBulk(c,keyobj);
4590 numkeys++;
4591 }
4592 }
4593 }
4594 dictReleaseIterator(di);
4595 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4596 }
4597
4598 static void dbsizeCommand(redisClient *c) {
4599 addReplySds(c,
4600 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4601 }
4602
4603 static void lastsaveCommand(redisClient *c) {
4604 addReplySds(c,
4605 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4606 }
4607
4608 static void typeCommand(redisClient *c) {
4609 robj *o;
4610 char *type;
4611
4612 o = lookupKeyRead(c->db,c->argv[1]);
4613 if (o == NULL) {
4614 type = "+none";
4615 } else {
4616 switch(o->type) {
4617 case REDIS_STRING: type = "+string"; break;
4618 case REDIS_LIST: type = "+list"; break;
4619 case REDIS_SET: type = "+set"; break;
4620 case REDIS_ZSET: type = "+zset"; break;
4621 case REDIS_HASH: type = "+hash"; break;
4622 default: type = "+unknown"; break;
4623 }
4624 }
4625 addReplySds(c,sdsnew(type));
4626 addReply(c,shared.crlf);
4627 }
4628
4629 static void saveCommand(redisClient *c) {
4630 if (server.bgsavechildpid != -1) {
4631 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4632 return;
4633 }
4634 if (rdbSave(server.dbfilename) == REDIS_OK) {
4635 addReply(c,shared.ok);
4636 } else {
4637 addReply(c,shared.err);
4638 }
4639 }
4640
4641 static void bgsaveCommand(redisClient *c) {
4642 if (server.bgsavechildpid != -1) {
4643 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4644 return;
4645 }
4646 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4647 char *status = "+Background saving started\r\n";
4648 addReplySds(c,sdsnew(status));
4649 } else {
4650 addReply(c,shared.err);
4651 }
4652 }
4653
4654 static void shutdownCommand(redisClient *c) {
4655 if (prepareForShutdown() == REDIS_OK)
4656 exit(0);
4657 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4658 }
4659
4660 static void renameGenericCommand(redisClient *c, int nx) {
4661 robj *o;
4662
4663 /* To use the same key as src and dst is probably an error */
4664 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4665 addReply(c,shared.sameobjecterr);
4666 return;
4667 }
4668
4669 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4670 return;
4671
4672 incrRefCount(o);
4673 deleteIfVolatile(c->db,c->argv[2]);
4674 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4675 if (nx) {
4676 decrRefCount(o);
4677 addReply(c,shared.czero);
4678 return;
4679 }
4680 dictReplace(c->db->dict,c->argv[2],o);
4681 } else {
4682 incrRefCount(c->argv[2]);
4683 }
4684 deleteKey(c->db,c->argv[1]);
4685 server.dirty++;
4686 addReply(c,nx ? shared.cone : shared.ok);
4687 }
4688
4689 static void renameCommand(redisClient *c) {
4690 renameGenericCommand(c,0);
4691 }
4692
4693 static void renamenxCommand(redisClient *c) {
4694 renameGenericCommand(c,1);
4695 }
4696
4697 static void moveCommand(redisClient *c) {
4698 robj *o;
4699 redisDb *src, *dst;
4700 int srcid;
4701
4702 /* Obtain source and target DB pointers */
4703 src = c->db;
4704 srcid = c->db->id;
4705 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4706 addReply(c,shared.outofrangeerr);
4707 return;
4708 }
4709 dst = c->db;
4710 selectDb(c,srcid); /* Back to the source DB */
4711
4712 /* If the user is moving using as target the same
4713 * DB as the source DB it is probably an error. */
4714 if (src == dst) {
4715 addReply(c,shared.sameobjecterr);
4716 return;
4717 }
4718
4719 /* Check if the element exists and get a reference */
4720 o = lookupKeyWrite(c->db,c->argv[1]);
4721 if (!o) {
4722 addReply(c,shared.czero);
4723 return;
4724 }
4725
4726 /* Try to add the element to the target DB */
4727 deleteIfVolatile(dst,c->argv[1]);
4728 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4729 addReply(c,shared.czero);
4730 return;
4731 }
4732 incrRefCount(c->argv[1]);
4733 incrRefCount(o);
4734
4735 /* OK! key moved, free the entry in the source DB */
4736 deleteKey(src,c->argv[1]);
4737 server.dirty++;
4738 addReply(c,shared.cone);
4739 }
4740
4741 /* =================================== Lists ================================ */
4742 static void pushGenericCommand(redisClient *c, int where) {
4743 robj *lobj;
4744 list *list;
4745
4746 lobj = lookupKeyWrite(c->db,c->argv[1]);
4747 if (lobj == NULL) {
4748 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4749 addReply(c,shared.cone);
4750 return;
4751 }
4752 lobj = createListObject();
4753 list = lobj->ptr;
4754 if (where == REDIS_HEAD) {
4755 listAddNodeHead(list,c->argv[2]);
4756 } else {
4757 listAddNodeTail(list,c->argv[2]);
4758 }
4759 dictAdd(c->db->dict,c->argv[1],lobj);
4760 incrRefCount(c->argv[1]);
4761 incrRefCount(c->argv[2]);
4762 } else {
4763 if (lobj->type != REDIS_LIST) {
4764 addReply(c,shared.wrongtypeerr);
4765 return;
4766 }
4767 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4768 addReply(c,shared.cone);
4769 return;
4770 }
4771 list = lobj->ptr;
4772 if (where == REDIS_HEAD) {
4773 listAddNodeHead(list,c->argv[2]);
4774 } else {
4775 listAddNodeTail(list,c->argv[2]);
4776 }
4777 incrRefCount(c->argv[2]);
4778 }
4779 server.dirty++;
4780 addReplyLongLong(c,listLength(list));
4781 }
4782
4783 static void lpushCommand(redisClient *c) {
4784 pushGenericCommand(c,REDIS_HEAD);
4785 }
4786
4787 static void rpushCommand(redisClient *c) {
4788 pushGenericCommand(c,REDIS_TAIL);
4789 }
4790
4791 static void llenCommand(redisClient *c) {
4792 robj *o;
4793 list *l;
4794
4795 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4796 checkType(c,o,REDIS_LIST)) return;
4797
4798 l = o->ptr;
4799 addReplyUlong(c,listLength(l));
4800 }
4801
4802 static void lindexCommand(redisClient *c) {
4803 robj *o;
4804 int index = atoi(c->argv[2]->ptr);
4805 list *list;
4806 listNode *ln;
4807
4808 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4809 checkType(c,o,REDIS_LIST)) return;
4810 list = o->ptr;
4811
4812 ln = listIndex(list, index);
4813 if (ln == NULL) {
4814 addReply(c,shared.nullbulk);
4815 } else {
4816 robj *ele = listNodeValue(ln);
4817 addReplyBulk(c,ele);
4818 }
4819 }
4820
4821 static void lsetCommand(redisClient *c) {
4822 robj *o;
4823 int index = atoi(c->argv[2]->ptr);
4824 list *list;
4825 listNode *ln;
4826
4827 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4828 checkType(c,o,REDIS_LIST)) return;
4829 list = o->ptr;
4830
4831 ln = listIndex(list, index);
4832 if (ln == NULL) {
4833 addReply(c,shared.outofrangeerr);
4834 } else {
4835 robj *ele = listNodeValue(ln);
4836
4837 decrRefCount(ele);
4838 listNodeValue(ln) = c->argv[3];
4839 incrRefCount(c->argv[3]);
4840 addReply(c,shared.ok);
4841 server.dirty++;
4842 }
4843 }
4844
4845 static void popGenericCommand(redisClient *c, int where) {
4846 robj *o;
4847 list *list;
4848 listNode *ln;
4849
4850 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4851 checkType(c,o,REDIS_LIST)) return;
4852 list = o->ptr;
4853
4854 if (where == REDIS_HEAD)
4855 ln = listFirst(list);
4856 else
4857 ln = listLast(list);
4858
4859 if (ln == NULL) {
4860 addReply(c,shared.nullbulk);
4861 } else {
4862 robj *ele = listNodeValue(ln);
4863 addReplyBulk(c,ele);
4864 listDelNode(list,ln);
4865 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4866 server.dirty++;
4867 }
4868 }
4869
4870 static void lpopCommand(redisClient *c) {
4871 popGenericCommand(c,REDIS_HEAD);
4872 }
4873
4874 static void rpopCommand(redisClient *c) {
4875 popGenericCommand(c,REDIS_TAIL);
4876 }
4877
4878 static void lrangeCommand(redisClient *c) {
4879 robj *o;
4880 int start = atoi(c->argv[2]->ptr);
4881 int end = atoi(c->argv[3]->ptr);
4882 int llen;
4883 int rangelen, j;
4884 list *list;
4885 listNode *ln;
4886 robj *ele;
4887
4888 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4889 || checkType(c,o,REDIS_LIST)) return;
4890 list = o->ptr;
4891 llen = listLength(list);
4892
4893 /* convert negative indexes */
4894 if (start < 0) start = llen+start;
4895 if (end < 0) end = llen+end;
4896 if (start < 0) start = 0;
4897 if (end < 0) end = 0;
4898
4899 /* indexes sanity checks */
4900 if (start > end || start >= llen) {
4901 /* Out of range start or start > end result in empty list */
4902 addReply(c,shared.emptymultibulk);
4903 return;
4904 }
4905 if (end >= llen) end = llen-1;
4906 rangelen = (end-start)+1;
4907
4908 /* Return the result in form of a multi-bulk reply */
4909 ln = listIndex(list, start);
4910 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4911 for (j = 0; j < rangelen; j++) {
4912 ele = listNodeValue(ln);
4913 addReplyBulk(c,ele);
4914 ln = ln->next;
4915 }
4916 }
4917
4918 static void ltrimCommand(redisClient *c) {
4919 robj *o;
4920 int start = atoi(c->argv[2]->ptr);
4921 int end = atoi(c->argv[3]->ptr);
4922 int llen;
4923 int j, ltrim, rtrim;
4924 list *list;
4925 listNode *ln;
4926
4927 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4928 checkType(c,o,REDIS_LIST)) return;
4929 list = o->ptr;
4930 llen = listLength(list);
4931
4932 /* convert negative indexes */
4933 if (start < 0) start = llen+start;
4934 if (end < 0) end = llen+end;
4935 if (start < 0) start = 0;
4936 if (end < 0) end = 0;
4937
4938 /* indexes sanity checks */
4939 if (start > end || start >= llen) {
4940 /* Out of range start or start > end result in empty list */
4941 ltrim = llen;
4942 rtrim = 0;
4943 } else {
4944 if (end >= llen) end = llen-1;
4945 ltrim = start;
4946 rtrim = llen-end-1;
4947 }
4948
4949 /* Remove list elements to perform the trim */
4950 for (j = 0; j < ltrim; j++) {
4951 ln = listFirst(list);
4952 listDelNode(list,ln);
4953 }
4954 for (j = 0; j < rtrim; j++) {
4955 ln = listLast(list);
4956 listDelNode(list,ln);
4957 }
4958 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4959 server.dirty++;
4960 addReply(c,shared.ok);
4961 }
4962
4963 static void lremCommand(redisClient *c) {
4964 robj *o;
4965 list *list;
4966 listNode *ln, *next;
4967 int toremove = atoi(c->argv[2]->ptr);
4968 int removed = 0;
4969 int fromtail = 0;
4970
4971 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4972 checkType(c,o,REDIS_LIST)) return;
4973 list = o->ptr;
4974
4975 if (toremove < 0) {
4976 toremove = -toremove;
4977 fromtail = 1;
4978 }
4979 ln = fromtail ? list->tail : list->head;
4980 while (ln) {
4981 robj *ele = listNodeValue(ln);
4982
4983 next = fromtail ? ln->prev : ln->next;
4984 if (equalStringObjects(ele,c->argv[3])) {
4985 listDelNode(list,ln);
4986 server.dirty++;
4987 removed++;
4988 if (toremove && removed == toremove) break;
4989 }
4990 ln = next;
4991 }
4992 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4993 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4994 }
4995
4996 /* This is the semantic of this command:
4997 * RPOPLPUSH srclist dstlist:
4998 * IF LLEN(srclist) > 0
4999 * element = RPOP srclist
5000 * LPUSH dstlist element
5001 * RETURN element
5002 * ELSE
5003 * RETURN nil
5004 * END
5005 * END
5006 *
5007 * The idea is to be able to get an element from a list in a reliable way
5008 * since the element is not just returned but pushed against another list
5009 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5010 */
5011 static void rpoplpushcommand(redisClient *c) {
5012 robj *sobj;
5013 list *srclist;
5014 listNode *ln;
5015
5016 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5017 checkType(c,sobj,REDIS_LIST)) return;
5018 srclist = sobj->ptr;
5019 ln = listLast(srclist);
5020
5021 if (ln == NULL) {
5022 addReply(c,shared.nullbulk);
5023 } else {
5024 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5025 robj *ele = listNodeValue(ln);
5026 list *dstlist;
5027
5028 if (dobj && dobj->type != REDIS_LIST) {
5029 addReply(c,shared.wrongtypeerr);
5030 return;
5031 }
5032
5033 /* Add the element to the target list (unless it's directly
5034 * passed to some BLPOP-ing client */
5035 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5036 if (dobj == NULL) {
5037 /* Create the list if the key does not exist */
5038 dobj = createListObject();
5039 dictAdd(c->db->dict,c->argv[2],dobj);
5040 incrRefCount(c->argv[2]);
5041 }
5042 dstlist = dobj->ptr;
5043 listAddNodeHead(dstlist,ele);
5044 incrRefCount(ele);
5045 }
5046
5047 /* Send the element to the client as reply as well */
5048 addReplyBulk(c,ele);
5049
5050 /* Finally remove the element from the source list */
5051 listDelNode(srclist,ln);
5052 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5053 server.dirty++;
5054 }
5055 }
5056
5057 /* ==================================== Sets ================================ */
5058
5059 static void saddCommand(redisClient *c) {
5060 robj *set;
5061
5062 set = lookupKeyWrite(c->db,c->argv[1]);
5063 if (set == NULL) {
5064 set = createSetObject();
5065 dictAdd(c->db->dict,c->argv[1],set);
5066 incrRefCount(c->argv[1]);
5067 } else {
5068 if (set->type != REDIS_SET) {
5069 addReply(c,shared.wrongtypeerr);
5070 return;
5071 }
5072 }
5073 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5074 incrRefCount(c->argv[2]);
5075 server.dirty++;
5076 addReply(c,shared.cone);
5077 } else {
5078 addReply(c,shared.czero);
5079 }
5080 }
5081
5082 static void sremCommand(redisClient *c) {
5083 robj *set;
5084
5085 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5086 checkType(c,set,REDIS_SET)) return;
5087
5088 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5089 server.dirty++;
5090 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5091 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5092 addReply(c,shared.cone);
5093 } else {
5094 addReply(c,shared.czero);
5095 }
5096 }
5097
5098 static void smoveCommand(redisClient *c) {
5099 robj *srcset, *dstset;
5100
5101 srcset = lookupKeyWrite(c->db,c->argv[1]);
5102 dstset = lookupKeyWrite(c->db,c->argv[2]);
5103
5104 /* If the source key does not exist return 0, if it's of the wrong type
5105 * raise an error */
5106 if (srcset == NULL || srcset->type != REDIS_SET) {
5107 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5108 return;
5109 }
5110 /* Error if the destination key is not a set as well */
5111 if (dstset && dstset->type != REDIS_SET) {
5112 addReply(c,shared.wrongtypeerr);
5113 return;
5114 }
5115 /* Remove the element from the source set */
5116 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5117 /* Key not found in the src set! return zero */
5118 addReply(c,shared.czero);
5119 return;
5120 }
5121 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5122 deleteKey(c->db,c->argv[1]);
5123 server.dirty++;
5124 /* Add the element to the destination set */
5125 if (!dstset) {
5126 dstset = createSetObject();
5127 dictAdd(c->db->dict,c->argv[2],dstset);
5128 incrRefCount(c->argv[2]);
5129 }
5130 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5131 incrRefCount(c->argv[3]);
5132 addReply(c,shared.cone);
5133 }
5134
5135 static void sismemberCommand(redisClient *c) {
5136 robj *set;
5137
5138 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5139 checkType(c,set,REDIS_SET)) return;
5140
5141 if (dictFind(set->ptr,c->argv[2]))
5142 addReply(c,shared.cone);
5143 else
5144 addReply(c,shared.czero);
5145 }
5146
5147 static void scardCommand(redisClient *c) {
5148 robj *o;
5149 dict *s;
5150
5151 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5152 checkType(c,o,REDIS_SET)) return;
5153
5154 s = o->ptr;
5155 addReplyUlong(c,dictSize(s));
5156 }
5157
5158 static void spopCommand(redisClient *c) {
5159 robj *set;
5160 dictEntry *de;
5161
5162 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5163 checkType(c,set,REDIS_SET)) return;
5164
5165 de = dictGetRandomKey(set->ptr);
5166 if (de == NULL) {
5167 addReply(c,shared.nullbulk);
5168 } else {
5169 robj *ele = dictGetEntryKey(de);
5170
5171 addReplyBulk(c,ele);
5172 dictDelete(set->ptr,ele);
5173 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5174 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5175 server.dirty++;
5176 }
5177 }
5178
5179 static void srandmemberCommand(redisClient *c) {
5180 robj *set;
5181 dictEntry *de;
5182
5183 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5184 checkType(c,set,REDIS_SET)) return;
5185
5186 de = dictGetRandomKey(set->ptr);
5187 if (de == NULL) {
5188 addReply(c,shared.nullbulk);
5189 } else {
5190 robj *ele = dictGetEntryKey(de);
5191
5192 addReplyBulk(c,ele);
5193 }
5194 }
5195
5196 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5197 dict **d1 = (void*) s1, **d2 = (void*) s2;
5198
5199 return dictSize(*d1)-dictSize(*d2);
5200 }
5201
5202 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5203 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5204 dictIterator *di;
5205 dictEntry *de;
5206 robj *lenobj = NULL, *dstset = NULL;
5207 unsigned long j, cardinality = 0;
5208
5209 for (j = 0; j < setsnum; j++) {
5210 robj *setobj;
5211
5212 setobj = dstkey ?
5213 lookupKeyWrite(c->db,setskeys[j]) :
5214 lookupKeyRead(c->db,setskeys[j]);
5215 if (!setobj) {
5216 zfree(dv);
5217 if (dstkey) {
5218 if (deleteKey(c->db,dstkey))
5219 server.dirty++;
5220 addReply(c,shared.czero);
5221 } else {
5222 addReply(c,shared.emptymultibulk);
5223 }
5224 return;
5225 }
5226 if (setobj->type != REDIS_SET) {
5227 zfree(dv);
5228 addReply(c,shared.wrongtypeerr);
5229 return;
5230 }
5231 dv[j] = setobj->ptr;
5232 }
5233 /* Sort sets from the smallest to largest, this will improve our
5234 * algorithm's performace */
5235 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5236
5237 /* The first thing we should output is the total number of elements...
5238 * since this is a multi-bulk write, but at this stage we don't know
5239 * the intersection set size, so we use a trick, append an empty object
5240 * to the output list and save the pointer to later modify it with the
5241 * right length */
5242 if (!dstkey) {
5243 lenobj = createObject(REDIS_STRING,NULL);
5244 addReply(c,lenobj);
5245 decrRefCount(lenobj);
5246 } else {
5247 /* If we have a target key where to store the resulting set
5248 * create this key with an empty set inside */
5249 dstset = createSetObject();
5250 }
5251
5252 /* Iterate all the elements of the first (smallest) set, and test
5253 * the element against all the other sets, if at least one set does
5254 * not include the element it is discarded */
5255 di = dictGetIterator(dv[0]);
5256
5257 while((de = dictNext(di)) != NULL) {
5258 robj *ele;
5259
5260 for (j = 1; j < setsnum; j++)
5261 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5262 if (j != setsnum)
5263 continue; /* at least one set does not contain the member */
5264 ele = dictGetEntryKey(de);
5265 if (!dstkey) {
5266 addReplyBulk(c,ele);
5267 cardinality++;
5268 } else {
5269 dictAdd(dstset->ptr,ele,NULL);
5270 incrRefCount(ele);
5271 }
5272 }
5273 dictReleaseIterator(di);
5274
5275 if (dstkey) {
5276 /* Store the resulting set into the target, if the intersection
5277 * is not an empty set. */
5278 deleteKey(c->db,dstkey);
5279 if (dictSize((dict*)dstset->ptr) > 0) {
5280 dictAdd(c->db->dict,dstkey,dstset);
5281 incrRefCount(dstkey);
5282 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5283 } else {
5284 decrRefCount(dstset);
5285 addReply(c,shared.czero);
5286 }
5287 server.dirty++;
5288 } else {
5289 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5290 }
5291 zfree(dv);
5292 }
5293
5294 static void sinterCommand(redisClient *c) {
5295 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5296 }
5297
5298 static void sinterstoreCommand(redisClient *c) {
5299 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5300 }
5301
5302 #define REDIS_OP_UNION 0
5303 #define REDIS_OP_DIFF 1
5304 #define REDIS_OP_INTER 2
5305
5306 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5307 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5308 dictIterator *di;
5309 dictEntry *de;
5310 robj *dstset = NULL;
5311 int j, cardinality = 0;
5312
5313 for (j = 0; j < setsnum; j++) {
5314 robj *setobj;
5315
5316 setobj = dstkey ?
5317 lookupKeyWrite(c->db,setskeys[j]) :
5318 lookupKeyRead(c->db,setskeys[j]);
5319 if (!setobj) {
5320 dv[j] = NULL;
5321 continue;
5322 }
5323 if (setobj->type != REDIS_SET) {
5324 zfree(dv);
5325 addReply(c,shared.wrongtypeerr);
5326 return;
5327 }
5328 dv[j] = setobj->ptr;
5329 }
5330
5331 /* We need a temp set object to store our union. If the dstkey
5332 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5333 * this set object will be the resulting object to set into the target key*/
5334 dstset = createSetObject();
5335
5336 /* Iterate all the elements of all the sets, add every element a single
5337 * time to the result set */
5338 for (j = 0; j < setsnum; j++) {
5339 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5340 if (!dv[j]) continue; /* non existing keys are like empty sets */
5341
5342 di = dictGetIterator(dv[j]);
5343
5344 while((de = dictNext(di)) != NULL) {
5345 robj *ele;
5346
5347 /* dictAdd will not add the same element multiple times */
5348 ele = dictGetEntryKey(de);
5349 if (op == REDIS_OP_UNION || j == 0) {
5350 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5351 incrRefCount(ele);
5352 cardinality++;
5353 }
5354 } else if (op == REDIS_OP_DIFF) {
5355 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5356 cardinality--;
5357 }
5358 }
5359 }
5360 dictReleaseIterator(di);
5361
5362 /* result set is empty? Exit asap. */
5363 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5364 }
5365
5366 /* Output the content of the resulting set, if not in STORE mode */
5367 if (!dstkey) {
5368 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5369 di = dictGetIterator(dstset->ptr);
5370 while((de = dictNext(di)) != NULL) {
5371 robj *ele;
5372
5373 ele = dictGetEntryKey(de);
5374 addReplyBulk(c,ele);
5375 }
5376 dictReleaseIterator(di);
5377 decrRefCount(dstset);
5378 } else {
5379 /* If we have a target key where to store the resulting set
5380 * create this key with the result set inside */
5381 deleteKey(c->db,dstkey);
5382 if (dictSize((dict*)dstset->ptr) > 0) {
5383 dictAdd(c->db->dict,dstkey,dstset);
5384 incrRefCount(dstkey);
5385 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5386 } else {
5387 decrRefCount(dstset);
5388 addReply(c,shared.czero);
5389 }
5390 server.dirty++;
5391 }
5392 zfree(dv);
5393 }
5394
5395 static void sunionCommand(redisClient *c) {
5396 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5397 }
5398
5399 static void sunionstoreCommand(redisClient *c) {
5400 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5401 }
5402
5403 static void sdiffCommand(redisClient *c) {
5404 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5405 }
5406
5407 static void sdiffstoreCommand(redisClient *c) {
5408 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5409 }
5410
5411 /* ==================================== ZSets =============================== */
5412
5413 /* ZSETs are ordered sets using two data structures to hold the same elements
5414 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5415 * data structure.
5416 *
5417 * The elements are added to an hash table mapping Redis objects to scores.
5418 * At the same time the elements are added to a skip list mapping scores
5419 * to Redis objects (so objects are sorted by scores in this "view"). */
5420
5421 /* This skiplist implementation is almost a C translation of the original
5422 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5423 * Alternative to Balanced Trees", modified in three ways:
5424 * a) this implementation allows for repeated values.
5425 * b) the comparison is not just by key (our 'score') but by satellite data.
5426 * c) there is a back pointer, so it's a doubly linked list with the back
5427 * pointers being only at "level 1". This allows to traverse the list
5428 * from tail to head, useful for ZREVRANGE. */
5429
5430 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5431 zskiplistNode *zn = zmalloc(sizeof(*zn));
5432
5433 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5434 if (level > 1)
5435 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5436 else
5437 zn->span = NULL;
5438 zn->score = score;
5439 zn->obj = obj;
5440 return zn;
5441 }
5442
5443 static zskiplist *zslCreate(void) {
5444 int j;
5445 zskiplist *zsl;
5446
5447 zsl = zmalloc(sizeof(*zsl));
5448 zsl->level = 1;
5449 zsl->length = 0;
5450 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5451 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5452 zsl->header->forward[j] = NULL;
5453
5454 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5455 if (j < ZSKIPLIST_MAXLEVEL-1)
5456 zsl->header->span[j] = 0;
5457 }
5458 zsl->header->backward = NULL;
5459 zsl->tail = NULL;
5460 return zsl;
5461 }
5462
5463 static void zslFreeNode(zskiplistNode *node) {
5464 decrRefCount(node->obj);
5465 zfree(node->forward);
5466 zfree(node->span);
5467 zfree(node);
5468 }
5469
5470 static void zslFree(zskiplist *zsl) {
5471 zskiplistNode *node = zsl->header->forward[0], *next;
5472
5473 zfree(zsl->header->forward);
5474 zfree(zsl->header->span);
5475 zfree(zsl->header);
5476 while(node) {
5477 next = node->forward[0];
5478 zslFreeNode(node);
5479 node = next;
5480 }
5481 zfree(zsl);
5482 }
5483
5484 static int zslRandomLevel(void) {
5485 int level = 1;
5486 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5487 level += 1;
5488 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5489 }
5490
5491 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5492 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5493 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5494 int i, level;
5495
5496 x = zsl->header;
5497 for (i = zsl->level-1; i >= 0; i--) {
5498 /* store rank that is crossed to reach the insert position */
5499 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5500
5501 while (x->forward[i] &&
5502 (x->forward[i]->score < score ||
5503 (x->forward[i]->score == score &&
5504 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5505 rank[i] += i > 0 ? x->span[i-1] : 1;
5506 x = x->forward[i];
5507 }
5508 update[i] = x;
5509 }
5510 /* we assume the key is not already inside, since we allow duplicated
5511 * scores, and the re-insertion of score and redis object should never
5512 * happpen since the caller of zslInsert() should test in the hash table
5513 * if the element is already inside or not. */
5514 level = zslRandomLevel();
5515 if (level > zsl->level) {
5516 for (i = zsl->level; i < level; i++) {
5517 rank[i] = 0;
5518 update[i] = zsl->header;
5519 update[i]->span[i-1] = zsl->length;
5520 }
5521 zsl->level = level;
5522 }
5523 x = zslCreateNode(level,score,obj);
5524 for (i = 0; i < level; i++) {
5525 x->forward[i] = update[i]->forward[i];
5526 update[i]->forward[i] = x;
5527
5528 /* update span covered by update[i] as x is inserted here */
5529 if (i > 0) {
5530 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5531 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5532 }
5533 }
5534
5535 /* increment span for untouched levels */
5536 for (i = level; i < zsl->level; i++) {
5537 update[i]->span[i-1]++;
5538 }
5539
5540 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5541 if (x->forward[0])
5542 x->forward[0]->backward = x;
5543 else
5544 zsl->tail = x;
5545 zsl->length++;
5546 }
5547
5548 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5549 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5550 int i;
5551 for (i = 0; i < zsl->level; i++) {
5552 if (update[i]->forward[i] == x) {
5553 if (i > 0) {
5554 update[i]->span[i-1] += x->span[i-1] - 1;
5555 }
5556 update[i]->forward[i] = x->forward[i];
5557 } else {
5558 /* invariant: i > 0, because update[0]->forward[0]
5559 * is always equal to x */
5560 update[i]->span[i-1] -= 1;
5561 }
5562 }
5563 if (x->forward[0]) {
5564 x->forward[0]->backward = x->backward;
5565 } else {
5566 zsl->tail = x->backward;
5567 }
5568 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5569 zsl->level--;
5570 zsl->length--;
5571 }
5572
5573 /* Delete an element with matching score/object from the skiplist. */
5574 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5575 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5576 int i;
5577
5578 x = zsl->header;
5579 for (i = zsl->level-1; i >= 0; i--) {
5580 while (x->forward[i] &&
5581 (x->forward[i]->score < score ||
5582 (x->forward[i]->score == score &&
5583 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5584 x = x->forward[i];
5585 update[i] = x;
5586 }
5587 /* We may have multiple elements with the same score, what we need
5588 * is to find the element with both the right score and object. */
5589 x = x->forward[0];
5590 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5591 zslDeleteNode(zsl, x, update);
5592 zslFreeNode(x);
5593 return 1;
5594 } else {
5595 return 0; /* not found */
5596 }
5597 return 0; /* not found */
5598 }
5599
5600 /* Delete all the elements with score between min and max from the skiplist.
5601 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5602 * Note that this function takes the reference to the hash table view of the
5603 * sorted set, in order to remove the elements from the hash table too. */
5604 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5605 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5606 unsigned long removed = 0;
5607 int i;
5608
5609 x = zsl->header;
5610 for (i = zsl->level-1; i >= 0; i--) {
5611 while (x->forward[i] && x->forward[i]->score < min)
5612 x = x->forward[i];
5613 update[i] = x;
5614 }
5615 /* We may have multiple elements with the same score, what we need
5616 * is to find the element with both the right score and object. */
5617 x = x->forward[0];
5618 while (x && x->score <= max) {
5619 zskiplistNode *next = x->forward[0];
5620 zslDeleteNode(zsl, x, update);
5621 dictDelete(dict,x->obj);
5622 zslFreeNode(x);
5623 removed++;
5624 x = next;
5625 }
5626 return removed; /* not found */
5627 }
5628
5629 /* Delete all the elements with rank between start and end from the skiplist.
5630 * Start and end are inclusive. Note that start and end need to be 1-based */
5631 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5632 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5633 unsigned long traversed = 0, removed = 0;
5634 int i;
5635
5636 x = zsl->header;
5637 for (i = zsl->level-1; i >= 0; i--) {
5638 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5639 traversed += i > 0 ? x->span[i-1] : 1;
5640 x = x->forward[i];
5641 }
5642 update[i] = x;
5643 }
5644
5645 traversed++;
5646 x = x->forward[0];
5647 while (x && traversed <= end) {
5648 zskiplistNode *next = x->forward[0];
5649 zslDeleteNode(zsl, x, update);
5650 dictDelete(dict,x->obj);
5651 zslFreeNode(x);
5652 removed++;
5653 traversed++;
5654 x = next;
5655 }
5656 return removed;
5657 }
5658
5659 /* Find the first node having a score equal or greater than the specified one.
5660 * Returns NULL if there is no match. */
5661 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5662 zskiplistNode *x;
5663 int i;
5664
5665 x = zsl->header;
5666 for (i = zsl->level-1; i >= 0; i--) {
5667 while (x->forward[i] && x->forward[i]->score < score)
5668 x = x->forward[i];
5669 }
5670 /* We may have multiple elements with the same score, what we need
5671 * is to find the element with both the right score and object. */
5672 return x->forward[0];
5673 }
5674
5675 /* Find the rank for an element by both score and key.
5676 * Returns 0 when the element cannot be found, rank otherwise.
5677 * Note that the rank is 1-based due to the span of zsl->header to the
5678 * first element. */
5679 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5680 zskiplistNode *x;
5681 unsigned long rank = 0;
5682 int i;
5683
5684 x = zsl->header;
5685 for (i = zsl->level-1; i >= 0; i--) {
5686 while (x->forward[i] &&
5687 (x->forward[i]->score < score ||
5688 (x->forward[i]->score == score &&
5689 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5690 rank += i > 0 ? x->span[i-1] : 1;
5691 x = x->forward[i];
5692 }
5693
5694 /* x might be equal to zsl->header, so test if obj is non-NULL */
5695 if (x->obj && equalStringObjects(x->obj,o)) {
5696 return rank;
5697 }
5698 }
5699 return 0;
5700 }
5701
5702 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5703 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5704 zskiplistNode *x;
5705 unsigned long traversed = 0;
5706 int i;
5707
5708 x = zsl->header;
5709 for (i = zsl->level-1; i >= 0; i--) {
5710 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5711 {
5712 traversed += i > 0 ? x->span[i-1] : 1;
5713 x = x->forward[i];
5714 }
5715 if (traversed == rank) {
5716 return x;
5717 }
5718 }
5719 return NULL;
5720 }
5721
5722 /* The actual Z-commands implementations */
5723
5724 /* This generic command implements both ZADD and ZINCRBY.
5725 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5726 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5727 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5728 robj *zsetobj;
5729 zset *zs;
5730 double *score;
5731
5732 zsetobj = lookupKeyWrite(c->db,key);
5733 if (zsetobj == NULL) {
5734 zsetobj = createZsetObject();
5735 dictAdd(c->db->dict,key,zsetobj);
5736 incrRefCount(key);
5737 } else {
5738 if (zsetobj->type != REDIS_ZSET) {
5739 addReply(c,shared.wrongtypeerr);
5740 return;
5741 }
5742 }
5743 zs = zsetobj->ptr;
5744
5745 /* Ok now since we implement both ZADD and ZINCRBY here the code
5746 * needs to handle the two different conditions. It's all about setting
5747 * '*score', that is, the new score to set, to the right value. */
5748 score = zmalloc(sizeof(double));
5749 if (doincrement) {
5750 dictEntry *de;
5751
5752 /* Read the old score. If the element was not present starts from 0 */
5753 de = dictFind(zs->dict,ele);
5754 if (de) {
5755 double *oldscore = dictGetEntryVal(de);
5756 *score = *oldscore + scoreval;
5757 } else {
5758 *score = scoreval;
5759 }
5760 } else {
5761 *score = scoreval;
5762 }
5763
5764 /* What follows is a simple remove and re-insert operation that is common
5765 * to both ZADD and ZINCRBY... */
5766 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5767 /* case 1: New element */
5768 incrRefCount(ele); /* added to hash */
5769 zslInsert(zs->zsl,*score,ele);
5770 incrRefCount(ele); /* added to skiplist */
5771 server.dirty++;
5772 if (doincrement)
5773 addReplyDouble(c,*score);
5774 else
5775 addReply(c,shared.cone);
5776 } else {
5777 dictEntry *de;
5778 double *oldscore;
5779
5780 /* case 2: Score update operation */
5781 de = dictFind(zs->dict,ele);
5782 redisAssert(de != NULL);
5783 oldscore = dictGetEntryVal(de);
5784 if (*score != *oldscore) {
5785 int deleted;
5786
5787 /* Remove and insert the element in the skip list with new score */
5788 deleted = zslDelete(zs->zsl,*oldscore,ele);
5789 redisAssert(deleted != 0);
5790 zslInsert(zs->zsl,*score,ele);
5791 incrRefCount(ele);
5792 /* Update the score in the hash table */
5793 dictReplace(zs->dict,ele,score);
5794 server.dirty++;
5795 } else {
5796 zfree(score);
5797 }
5798 if (doincrement)
5799 addReplyDouble(c,*score);
5800 else
5801 addReply(c,shared.czero);
5802 }
5803 }
5804
5805 static void zaddCommand(redisClient *c) {
5806 double scoreval;
5807
5808 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5809 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5810 }
5811
5812 static void zincrbyCommand(redisClient *c) {
5813 double scoreval;
5814
5815 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5816 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5817 }
5818
5819 static void zremCommand(redisClient *c) {
5820 robj *zsetobj;
5821 zset *zs;
5822 dictEntry *de;
5823 double *oldscore;
5824 int deleted;
5825
5826 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5827 checkType(c,zsetobj,REDIS_ZSET)) return;
5828
5829 zs = zsetobj->ptr;
5830 de = dictFind(zs->dict,c->argv[2]);
5831 if (de == NULL) {
5832 addReply(c,shared.czero);
5833 return;
5834 }
5835 /* Delete from the skiplist */
5836 oldscore = dictGetEntryVal(de);
5837 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5838 redisAssert(deleted != 0);
5839
5840 /* Delete from the hash table */
5841 dictDelete(zs->dict,c->argv[2]);
5842 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5843 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5844 server.dirty++;
5845 addReply(c,shared.cone);
5846 }
5847
5848 static void zremrangebyscoreCommand(redisClient *c) {
5849 double min;
5850 double max;
5851 long deleted;
5852 robj *zsetobj;
5853 zset *zs;
5854
5855 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5856 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5857
5858 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5859 checkType(c,zsetobj,REDIS_ZSET)) return;
5860
5861 zs = zsetobj->ptr;
5862 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5863 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5864 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5865 server.dirty += deleted;
5866 addReplyLongLong(c,deleted);
5867 }
5868
5869 static void zremrangebyrankCommand(redisClient *c) {
5870 long start;
5871 long end;
5872 int llen;
5873 long deleted;
5874 robj *zsetobj;
5875 zset *zs;
5876
5877 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5878 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5879
5880 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5881 checkType(c,zsetobj,REDIS_ZSET)) return;
5882 zs = zsetobj->ptr;
5883 llen = zs->zsl->length;
5884
5885 /* convert negative indexes */
5886 if (start < 0) start = llen+start;
5887 if (end < 0) end = llen+end;
5888 if (start < 0) start = 0;
5889 if (end < 0) end = 0;
5890
5891 /* indexes sanity checks */
5892 if (start > end || start >= llen) {
5893 addReply(c,shared.czero);
5894 return;
5895 }
5896 if (end >= llen) end = llen-1;
5897
5898 /* increment start and end because zsl*Rank functions
5899 * use 1-based rank */
5900 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5901 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5902 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5903 server.dirty += deleted;
5904 addReplyLongLong(c, deleted);
5905 }
5906
5907 typedef struct {
5908 dict *dict;
5909 double weight;
5910 } zsetopsrc;
5911
5912 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5913 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5914 unsigned long size1, size2;
5915 size1 = d1->dict ? dictSize(d1->dict) : 0;
5916 size2 = d2->dict ? dictSize(d2->dict) : 0;
5917 return size1 - size2;
5918 }
5919
/* Ways of combining the scores of an element found in several inputs. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate' (one of the
 * REDIS_AGGR_* constants). Any other value makes the server panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5936
5937 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5938 int i, j, zsetnum;
5939 int aggregate = REDIS_AGGR_SUM;
5940 zsetopsrc *src;
5941 robj *dstobj;
5942 zset *dstzset;
5943 dictIterator *di;
5944 dictEntry *de;
5945
5946 /* expect zsetnum input keys to be given */
5947 zsetnum = atoi(c->argv[2]->ptr);
5948 if (zsetnum < 1) {
5949 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5950 return;
5951 }
5952
5953 /* test if the expected number of keys would overflow */
5954 if (3+zsetnum > c->argc) {
5955 addReply(c,shared.syntaxerr);
5956 return;
5957 }
5958
5959 /* read keys to be used for input */
5960 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5961 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5962 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5963 if (!zsetobj) {
5964 src[i].dict = NULL;
5965 } else {
5966 if (zsetobj->type != REDIS_ZSET) {
5967 zfree(src);
5968 addReply(c,shared.wrongtypeerr);
5969 return;
5970 }
5971 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5972 }
5973
5974 /* default all weights to 1 */
5975 src[i].weight = 1.0;
5976 }
5977
5978 /* parse optional extra arguments */
5979 if (j < c->argc) {
5980 int remaining = c->argc - j;
5981
5982 while (remaining) {
5983 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5984 j++; remaining--;
5985 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5986 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5987 return;
5988 }
5989 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5990 j++; remaining--;
5991 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5992 aggregate = REDIS_AGGR_SUM;
5993 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5994 aggregate = REDIS_AGGR_MIN;
5995 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5996 aggregate = REDIS_AGGR_MAX;
5997 } else {
5998 zfree(src);
5999 addReply(c,shared.syntaxerr);
6000 return;
6001 }
6002 j++; remaining--;
6003 } else {
6004 zfree(src);
6005 addReply(c,shared.syntaxerr);
6006 return;
6007 }
6008 }
6009 }
6010
6011 /* sort sets from the smallest to largest, this will improve our
6012 * algorithm's performance */
6013 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
6014
6015 dstobj = createZsetObject();
6016 dstzset = dstobj->ptr;
6017
6018 if (op == REDIS_OP_INTER) {
6019 /* skip going over all entries if the smallest zset is NULL or empty */
6020 if (src[0].dict && dictSize(src[0].dict) > 0) {
6021 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6022 * from small to large, all src[i > 0].dict are non-empty too */
6023 di = dictGetIterator(src[0].dict);
6024 while((de = dictNext(di)) != NULL) {
6025 double *score = zmalloc(sizeof(double)), value;
6026 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
6027
6028 for (j = 1; j < zsetnum; j++) {
6029 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6030 if (other) {
6031 value = src[j].weight * (*(double*)dictGetEntryVal(other));
6032 zunionInterAggregate(score, value, aggregate);
6033 } else {
6034 break;
6035 }
6036 }
6037
6038 /* skip entry when not present in every source dict */
6039 if (j != zsetnum) {
6040 zfree(score);
6041 } else {
6042 robj *o = dictGetEntryKey(de);
6043 dictAdd(dstzset->dict,o,score);
6044 incrRefCount(o); /* added to dictionary */
6045 zslInsert(dstzset->zsl,*score,o);
6046 incrRefCount(o); /* added to skiplist */
6047 }
6048 }
6049 dictReleaseIterator(di);
6050 }
6051 } else if (op == REDIS_OP_UNION) {
6052 for (i = 0; i < zsetnum; i++) {
6053 if (!src[i].dict) continue;
6054
6055 di = dictGetIterator(src[i].dict);
6056 while((de = dictNext(di)) != NULL) {
6057 /* skip key when already processed */
6058 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6059
6060 double *score = zmalloc(sizeof(double)), value;
6061 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
6062
6063 /* because the zsets are sorted by size, its only possible
6064 * for sets at larger indices to hold this entry */
6065 for (j = (i+1); j < zsetnum; j++) {
6066 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6067 if (other) {
6068 value = src[j].weight * (*(double*)dictGetEntryVal(other));
6069 zunionInterAggregate(score, value, aggregate);
6070 }
6071 }
6072
6073 robj *o = dictGetEntryKey(de);
6074 dictAdd(dstzset->dict,o,score);
6075 incrRefCount(o); /* added to dictionary */
6076 zslInsert(dstzset->zsl,*score,o);
6077 incrRefCount(o); /* added to skiplist */
6078 }
6079 dictReleaseIterator(di);
6080 }
6081 } else {
6082 /* unknown operator */
6083 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6084 }
6085
6086 deleteKey(c->db,dstkey);
6087 if (dstzset->zsl->length) {
6088 dictAdd(c->db->dict,dstkey,dstobj);
6089 incrRefCount(dstkey);
6090 addReplyLongLong(c, dstzset->zsl->length);
6091 server.dirty++;
6092 } else {
6093 decrRefCount(dstobj);
6094 addReply(c, shared.czero);
6095 }
6096 zfree(src);
6097 }
6098
6099 static void zunionstoreCommand(redisClient *c) {
6100 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6101 }
6102
6103 static void zinterstoreCommand(redisClient *c) {
6104 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6105 }
6106
6107 static void zrangeGenericCommand(redisClient *c, int reverse) {
6108 robj *o;
6109 long start;
6110 long end;
6111 int withscores = 0;
6112 int llen;
6113 int rangelen, j;
6114 zset *zsetobj;
6115 zskiplist *zsl;
6116 zskiplistNode *ln;
6117 robj *ele;
6118
6119 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6120 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6121
6122 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6123 withscores = 1;
6124 } else if (c->argc >= 5) {
6125 addReply(c,shared.syntaxerr);
6126 return;
6127 }
6128
6129 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6130 || checkType(c,o,REDIS_ZSET)) return;
6131 zsetobj = o->ptr;
6132 zsl = zsetobj->zsl;
6133 llen = zsl->length;
6134
6135 /* convert negative indexes */
6136 if (start < 0) start = llen+start;
6137 if (end < 0) end = llen+end;
6138 if (start < 0) start = 0;
6139 if (end < 0) end = 0;
6140
6141 /* indexes sanity checks */
6142 if (start > end || start >= llen) {
6143 /* Out of range start or start > end result in empty list */
6144 addReply(c,shared.emptymultibulk);
6145 return;
6146 }
6147 if (end >= llen) end = llen-1;
6148 rangelen = (end-start)+1;
6149
6150 /* check if starting point is trivial, before searching
6151 * the element in log(N) time */
6152 if (reverse) {
6153 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6154 } else {
6155 ln = start == 0 ?
6156 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6157 }
6158
6159 /* Return the result in form of a multi-bulk reply */
6160 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6161 withscores ? (rangelen*2) : rangelen));
6162 for (j = 0; j < rangelen; j++) {
6163 ele = ln->obj;
6164 addReplyBulk(c,ele);
6165 if (withscores)
6166 addReplyDouble(c,ln->score);
6167 ln = reverse ? ln->backward : ln->forward[0];
6168 }
6169 }
6170
6171 static void zrangeCommand(redisClient *c) {
6172 zrangeGenericCommand(c,0);
6173 }
6174
6175 static void zrevrangeCommand(redisClient *c) {
6176 zrangeGenericCommand(c,1);
6177 }
6178
6179 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6180 * If justcount is non-zero, just the count is returned. */
6181 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6182 robj *o;
6183 double min, max;
6184 int minex = 0, maxex = 0; /* are min or max exclusive? */
6185 int offset = 0, limit = -1;
6186 int withscores = 0;
6187 int badsyntax = 0;
6188
6189 /* Parse the min-max interval. If one of the values is prefixed
6190 * by the "(" character, it's considered "open". For instance
6191 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6192 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6193 if (((char*)c->argv[2]->ptr)[0] == '(') {
6194 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6195 minex = 1;
6196 } else {
6197 min = strtod(c->argv[2]->ptr,NULL);
6198 }
6199 if (((char*)c->argv[3]->ptr)[0] == '(') {
6200 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6201 maxex = 1;
6202 } else {
6203 max = strtod(c->argv[3]->ptr,NULL);
6204 }
6205
6206 /* Parse "WITHSCORES": note that if the command was called with
6207 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6208 * enter the following paths to parse WITHSCORES and LIMIT. */
6209 if (c->argc == 5 || c->argc == 8) {
6210 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6211 withscores = 1;
6212 else
6213 badsyntax = 1;
6214 }
6215 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6216 badsyntax = 1;
6217 if (badsyntax) {
6218 addReplySds(c,
6219 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6220 return;
6221 }
6222
6223 /* Parse "LIMIT" */
6224 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6225 addReply(c,shared.syntaxerr);
6226 return;
6227 } else if (c->argc == (7 + withscores)) {
6228 offset = atoi(c->argv[5]->ptr);
6229 limit = atoi(c->argv[6]->ptr);
6230 if (offset < 0) offset = 0;
6231 }
6232
6233 /* Ok, lookup the key and get the range */
6234 o = lookupKeyRead(c->db,c->argv[1]);
6235 if (o == NULL) {
6236 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6237 } else {
6238 if (o->type != REDIS_ZSET) {
6239 addReply(c,shared.wrongtypeerr);
6240 } else {
6241 zset *zsetobj = o->ptr;
6242 zskiplist *zsl = zsetobj->zsl;
6243 zskiplistNode *ln;
6244 robj *ele, *lenobj = NULL;
6245 unsigned long rangelen = 0;
6246
6247 /* Get the first node with the score >= min, or with
6248 * score > min if 'minex' is true. */
6249 ln = zslFirstWithScore(zsl,min);
6250 while (minex && ln && ln->score == min) ln = ln->forward[0];
6251
6252 if (ln == NULL) {
6253 /* No element matching the speciifed interval */
6254 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6255 return;
6256 }
6257
6258 /* We don't know in advance how many matching elements there
6259 * are in the list, so we push this object that will represent
6260 * the multi-bulk length in the output buffer, and will "fix"
6261 * it later */
6262 if (!justcount) {
6263 lenobj = createObject(REDIS_STRING,NULL);
6264 addReply(c,lenobj);
6265 decrRefCount(lenobj);
6266 }
6267
6268 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6269 if (offset) {
6270 offset--;
6271 ln = ln->forward[0];
6272 continue;
6273 }
6274 if (limit == 0) break;
6275 if (!justcount) {
6276 ele = ln->obj;
6277 addReplyBulk(c,ele);
6278 if (withscores)
6279 addReplyDouble(c,ln->score);
6280 }
6281 ln = ln->forward[0];
6282 rangelen++;
6283 if (limit > 0) limit--;
6284 }
6285 if (justcount) {
6286 addReplyLongLong(c,(long)rangelen);
6287 } else {
6288 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6289 withscores ? (rangelen*2) : rangelen);
6290 }
6291 }
6292 }
6293 }
6294
6295 static void zrangebyscoreCommand(redisClient *c) {
6296 genericZrangebyscoreCommand(c,0);
6297 }
6298
6299 static void zcountCommand(redisClient *c) {
6300 genericZrangebyscoreCommand(c,1);
6301 }
6302
6303 static void zcardCommand(redisClient *c) {
6304 robj *o;
6305 zset *zs;
6306
6307 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6308 checkType(c,o,REDIS_ZSET)) return;
6309
6310 zs = o->ptr;
6311 addReplyUlong(c,zs->zsl->length);
6312 }
6313
6314 static void zscoreCommand(redisClient *c) {
6315 robj *o;
6316 zset *zs;
6317 dictEntry *de;
6318
6319 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6320 checkType(c,o,REDIS_ZSET)) return;
6321
6322 zs = o->ptr;
6323 de = dictFind(zs->dict,c->argv[2]);
6324 if (!de) {
6325 addReply(c,shared.nullbulk);
6326 } else {
6327 double *score = dictGetEntryVal(de);
6328
6329 addReplyDouble(c,*score);
6330 }
6331 }
6332
6333 static void zrankGenericCommand(redisClient *c, int reverse) {
6334 robj *o;
6335 zset *zs;
6336 zskiplist *zsl;
6337 dictEntry *de;
6338 unsigned long rank;
6339 double *score;
6340
6341 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6342 checkType(c,o,REDIS_ZSET)) return;
6343
6344 zs = o->ptr;
6345 zsl = zs->zsl;
6346 de = dictFind(zs->dict,c->argv[2]);
6347 if (!de) {
6348 addReply(c,shared.nullbulk);
6349 return;
6350 }
6351
6352 score = dictGetEntryVal(de);
6353 rank = zslGetRank(zsl, *score, c->argv[2]);
6354 if (rank) {
6355 if (reverse) {
6356 addReplyLongLong(c, zsl->length - rank);
6357 } else {
6358 addReplyLongLong(c, rank-1);
6359 }
6360 } else {
6361 addReply(c,shared.nullbulk);
6362 }
6363 }
6364
6365 static void zrankCommand(redisClient *c) {
6366 zrankGenericCommand(c, 0);
6367 }
6368
6369 static void zrevrankCommand(redisClient *c) {
6370 zrankGenericCommand(c, 1);
6371 }
6372
6373 /* ========================= Hashes utility functions ======================= */
6374 #define REDIS_HASH_KEY 1
6375 #define REDIS_HASH_VALUE 2
6376
6377 /* Check the length of a number of objects to see if we need to convert a
6378 * zipmap to a real hash. Note that we only check string encoded objects
6379 * as their string length can be queried in constant time. */
6380 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6381 int i;
6382 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6383
6384 for (i = start; i <= end; i++) {
6385 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6386 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6387 {
6388 convertToRealHash(subject);
6389 return;
6390 }
6391 }
6392 }
6393
6394 /* Encode given objects in-place when the hash uses a dict. */
6395 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6396 if (subject->encoding == REDIS_ENCODING_HT) {
6397 if (o1) *o1 = tryObjectEncoding(*o1);
6398 if (o2) *o2 = tryObjectEncoding(*o2);
6399 }
6400 }
6401
6402 /* Get the value from a hash identified by key. Returns either a string
6403 * object or NULL if the value cannot be found. The refcount of the object
6404 * is always increased by 1 when the value was found. */
6405 static robj *hashGet(robj *o, robj *key) {
6406 robj *value = NULL;
6407 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6408 unsigned char *v;
6409 unsigned int vlen;
6410 key = getDecodedObject(key);
6411 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6412 value = createStringObject((char*)v,vlen);
6413 }
6414 decrRefCount(key);
6415 } else {
6416 dictEntry *de = dictFind(o->ptr,key);
6417 if (de != NULL) {
6418 value = dictGetEntryVal(de);
6419 incrRefCount(value);
6420 }
6421 }
6422 return value;
6423 }
6424
6425 /* Test if the key exists in the given hash. Returns 1 if the key
6426 * exists and 0 when it doesn't. */
6427 static int hashExists(robj *o, robj *key) {
6428 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6429 key = getDecodedObject(key);
6430 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6431 decrRefCount(key);
6432 return 1;
6433 }
6434 decrRefCount(key);
6435 } else {
6436 if (dictFind(o->ptr,key) != NULL) {
6437 return 1;
6438 }
6439 }
6440 return 0;
6441 }
6442
6443 /* Add an element, discard the old if the key already exists.
6444 * Return 0 on insert and 1 on update. */
6445 static int hashSet(robj *o, robj *key, robj *value) {
6446 int update = 0;
6447 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6448 key = getDecodedObject(key);
6449 value = getDecodedObject(value);
6450 o->ptr = zipmapSet(o->ptr,
6451 key->ptr,sdslen(key->ptr),
6452 value->ptr,sdslen(value->ptr), &update);
6453 decrRefCount(key);
6454 decrRefCount(value);
6455
6456 /* Check if the zipmap needs to be upgraded to a real hash table */
6457 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6458 convertToRealHash(o);
6459 } else {
6460 if (dictReplace(o->ptr,key,value)) {
6461 /* Insert */
6462 incrRefCount(key);
6463 } else {
6464 /* Update */
6465 update = 1;
6466 }
6467 incrRefCount(value);
6468 }
6469 return update;
6470 }
6471
6472 /* Delete an element from a hash.
6473 * Return 1 on deleted and 0 on not found. */
6474 static int hashDelete(robj *o, robj *key) {
6475 int deleted = 0;
6476 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6477 key = getDecodedObject(key);
6478 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6479 decrRefCount(key);
6480 } else {
6481 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6482 /* Always check if the dictionary needs a resize after a delete. */
6483 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6484 }
6485 return deleted;
6486 }
6487
6488 /* Return the number of elements in a hash. */
6489 static unsigned long hashLength(robj *o) {
6490 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6491 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6492 }
6493
6494 /* Structure to hold hash iteration abstration. Note that iteration over
6495 * hashes involves both fields and values. Because it is possible that
6496 * not both are required, store pointers in the iterator to avoid
6497 * unnecessary memory allocation for fields/values. */
6498 typedef struct {
6499 int encoding;
6500 unsigned char *zi;
6501 unsigned char *zk, *zv;
6502 unsigned int zklen, zvlen;
6503
6504 dictIterator *di;
6505 dictEntry *de;
6506 } hashIterator;
6507
6508 static hashIterator *hashInitIterator(robj *subject) {
6509 hashIterator *hi = zmalloc(sizeof(hashIterator));
6510 hi->encoding = subject->encoding;
6511 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6512 hi->zi = zipmapRewind(subject->ptr);
6513 } else if (hi->encoding == REDIS_ENCODING_HT) {
6514 hi->di = dictGetIterator(subject->ptr);
6515 } else {
6516 redisAssert(NULL);
6517 }
6518 return hi;
6519 }
6520
6521 static void hashReleaseIterator(hashIterator *hi) {
6522 if (hi->encoding == REDIS_ENCODING_HT) {
6523 dictReleaseIterator(hi->di);
6524 }
6525 zfree(hi);
6526 }
6527
6528 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6529 * could be found and REDIS_ERR when the iterator reaches the end. */
6530 static int hashNext(hashIterator *hi) {
6531 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6532 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6533 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6534 } else {
6535 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6536 }
6537 return REDIS_OK;
6538 }
6539
6540 /* Get key or value object at current iteration position.
6541 * This increases the refcount of the field object by 1. */
6542 static robj *hashCurrent(hashIterator *hi, int what) {
6543 robj *o;
6544 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6545 if (what & REDIS_HASH_KEY) {
6546 o = createStringObject((char*)hi->zk,hi->zklen);
6547 } else {
6548 o = createStringObject((char*)hi->zv,hi->zvlen);
6549 }
6550 } else {
6551 if (what & REDIS_HASH_KEY) {
6552 o = dictGetEntryKey(hi->de);
6553 } else {
6554 o = dictGetEntryVal(hi->de);
6555 }
6556 incrRefCount(o);
6557 }
6558 return o;
6559 }
6560
6561 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6562 robj *o = lookupKeyWrite(c->db,key);
6563 if (o == NULL) {
6564 o = createHashObject();
6565 dictAdd(c->db->dict,key,o);
6566 incrRefCount(key);
6567 } else {
6568 if (o->type != REDIS_HASH) {
6569 addReply(c,shared.wrongtypeerr);
6570 return NULL;
6571 }
6572 }
6573 return o;
6574 }
6575
6576 /* ============================= Hash commands ============================== */
6577 static void hsetCommand(redisClient *c) {
6578 int update;
6579 robj *o;
6580
6581 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6582 hashTryConversion(o,c->argv,2,3);
6583 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6584 update = hashSet(o,c->argv[2],c->argv[3]);
6585 addReply(c, update ? shared.czero : shared.cone);
6586 server.dirty++;
6587 }
6588
6589 static void hsetnxCommand(redisClient *c) {
6590 robj *o;
6591 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6592 hashTryConversion(o,c->argv,2,3);
6593
6594 if (hashExists(o, c->argv[2])) {
6595 addReply(c, shared.czero);
6596 } else {
6597 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6598 hashSet(o,c->argv[2],c->argv[3]);
6599 addReply(c, shared.cone);
6600 server.dirty++;
6601 }
6602 }
6603
6604 static void hmsetCommand(redisClient *c) {
6605 int i;
6606 robj *o;
6607
6608 if ((c->argc % 2) == 1) {
6609 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6610 return;
6611 }
6612
6613 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6614 hashTryConversion(o,c->argv,2,c->argc-1);
6615 for (i = 2; i < c->argc; i += 2) {
6616 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6617 hashSet(o,c->argv[i],c->argv[i+1]);
6618 }
6619 addReply(c, shared.ok);
6620 server.dirty++;
6621 }
6622
6623 static void hincrbyCommand(redisClient *c) {
6624 long long value, incr;
6625 robj *o, *current, *new;
6626
6627 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6628 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6629 if ((current = hashGet(o,c->argv[2])) != NULL) {
6630 if (getLongLongFromObjectOrReply(c,current,&value,
6631 "hash value is not an integer") != REDIS_OK) {
6632 decrRefCount(current);
6633 return;
6634 }
6635 decrRefCount(current);
6636 } else {
6637 value = 0;
6638 }
6639
6640 value += incr;
6641 new = createStringObjectFromLongLong(value);
6642 hashTryObjectEncoding(o,&c->argv[2],NULL);
6643 hashSet(o,c->argv[2],new);
6644 decrRefCount(new);
6645 addReplyLongLong(c,value);
6646 server.dirty++;
6647 }
6648
6649 static void hgetCommand(redisClient *c) {
6650 robj *o, *value;
6651 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6652 checkType(c,o,REDIS_HASH)) return;
6653
6654 if ((value = hashGet(o,c->argv[2])) != NULL) {
6655 addReplyBulk(c,value);
6656 decrRefCount(value);
6657 } else {
6658 addReply(c,shared.nullbulk);
6659 }
6660 }
6661
6662 static void hmgetCommand(redisClient *c) {
6663 int i;
6664 robj *o, *value;
6665 o = lookupKeyRead(c->db,c->argv[1]);
6666 if (o != NULL && o->type != REDIS_HASH) {
6667 addReply(c,shared.wrongtypeerr);
6668 }
6669
6670 /* Note the check for o != NULL happens inside the loop. This is
6671 * done because objects that cannot be found are considered to be
6672 * an empty hash. The reply should then be a series of NULLs. */
6673 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6674 for (i = 2; i < c->argc; i++) {
6675 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6676 addReplyBulk(c,value);
6677 decrRefCount(value);
6678 } else {
6679 addReply(c,shared.nullbulk);
6680 }
6681 }
6682 }
6683
6684 static void hdelCommand(redisClient *c) {
6685 robj *o;
6686 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6687 checkType(c,o,REDIS_HASH)) return;
6688
6689 if (hashDelete(o,c->argv[2])) {
6690 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6691 addReply(c,shared.cone);
6692 server.dirty++;
6693 } else {
6694 addReply(c,shared.czero);
6695 }
6696 }
6697
6698 static void hlenCommand(redisClient *c) {
6699 robj *o;
6700 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6701 checkType(c,o,REDIS_HASH)) return;
6702
6703 addReplyUlong(c,hashLength(o));
6704 }
6705
6706 static void genericHgetallCommand(redisClient *c, int flags) {
6707 robj *o, *lenobj, *obj;
6708 unsigned long count = 0;
6709 hashIterator *hi;
6710
6711 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6712 || checkType(c,o,REDIS_HASH)) return;
6713
6714 lenobj = createObject(REDIS_STRING,NULL);
6715 addReply(c,lenobj);
6716 decrRefCount(lenobj);
6717
6718 hi = hashInitIterator(o);
6719 while (hashNext(hi) != REDIS_ERR) {
6720 if (flags & REDIS_HASH_KEY) {
6721 obj = hashCurrent(hi,REDIS_HASH_KEY);
6722 addReplyBulk(c,obj);
6723 decrRefCount(obj);
6724 count++;
6725 }
6726 if (flags & REDIS_HASH_VALUE) {
6727 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6728 addReplyBulk(c,obj);
6729 decrRefCount(obj);
6730 count++;
6731 }
6732 }
6733 hashReleaseIterator(hi);
6734
6735 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6736 }
6737
6738 static void hkeysCommand(redisClient *c) {
6739 genericHgetallCommand(c,REDIS_HASH_KEY);
6740 }
6741
6742 static void hvalsCommand(redisClient *c) {
6743 genericHgetallCommand(c,REDIS_HASH_VALUE);
6744 }
6745
6746 static void hgetallCommand(redisClient *c) {
6747 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6748 }
6749
6750 static void hexistsCommand(redisClient *c) {
6751 robj *o;
6752 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6753 checkType(c,o,REDIS_HASH)) return;
6754
6755 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6756 }
6757
6758 static void convertToRealHash(robj *o) {
6759 unsigned char *key, *val, *p, *zm = o->ptr;
6760 unsigned int klen, vlen;
6761 dict *dict = dictCreate(&hashDictType,NULL);
6762
6763 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6764 p = zipmapRewind(zm);
6765 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6766 robj *keyobj, *valobj;
6767
6768 keyobj = createStringObject((char*)key,klen);
6769 valobj = createStringObject((char*)val,vlen);
6770 keyobj = tryObjectEncoding(keyobj);
6771 valobj = tryObjectEncoding(valobj);
6772 dictAdd(dict,keyobj,valobj);
6773 }
6774 o->encoding = REDIS_ENCODING_HT;
6775 o->ptr = dict;
6776 zfree(zm);
6777 }
6778
6779 /* ========================= Non type-specific commands ==================== */
6780
6781 static void flushdbCommand(redisClient *c) {
6782 server.dirty += dictSize(c->db->dict);
6783 dictEmpty(c->db->dict);
6784 dictEmpty(c->db->expires);
6785 addReply(c,shared.ok);
6786 }
6787
6788 static void flushallCommand(redisClient *c) {
6789 server.dirty += emptyDb();
6790 addReply(c,shared.ok);
6791 if (server.bgsavechildpid != -1) {
6792 kill(server.bgsavechildpid,SIGKILL);
6793 rdbRemoveTempFile(server.bgsavechildpid);
6794 }
6795 rdbSave(server.dbfilename);
6796 server.dirty++;
6797 }
6798
6799 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6800 redisSortOperation *so = zmalloc(sizeof(*so));
6801 so->type = type;
6802 so->pattern = pattern;
6803 return so;
6804 }
6805
6806 /* Return the value associated to the key with a name obtained
6807 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6808 * The returned object will always have its refcount increased by 1
6809 * when it is non-NULL. */
6810 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6811 char *p, *f;
6812 sds spat, ssub;
6813 robj keyobj, fieldobj, *o;
6814 int prefixlen, sublen, postfixlen, fieldlen;
6815 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6816 struct {
6817 long len;
6818 long free;
6819 char buf[REDIS_SORTKEY_MAX+1];
6820 } keyname, fieldname;
6821
6822 /* If the pattern is "#" return the substitution object itself in order
6823 * to implement the "SORT ... GET #" feature. */
6824 spat = pattern->ptr;
6825 if (spat[0] == '#' && spat[1] == '\0') {
6826 incrRefCount(subst);
6827 return subst;
6828 }
6829
6830 /* The substitution object may be specially encoded. If so we create
6831 * a decoded object on the fly. Otherwise getDecodedObject will just
6832 * increment the ref count, that we'll decrement later. */
6833 subst = getDecodedObject(subst);
6834
6835 ssub = subst->ptr;
6836 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6837 p = strchr(spat,'*');
6838 if (!p) {
6839 decrRefCount(subst);
6840 return NULL;
6841 }
6842
6843 /* Find out if we're dealing with a hash dereference. */
6844 if ((f = strstr(p+1, "->")) != NULL) {
6845 fieldlen = sdslen(spat)-(f-spat);
6846 /* this also copies \0 character */
6847 memcpy(fieldname.buf,f+2,fieldlen-1);
6848 fieldname.len = fieldlen-2;
6849 } else {
6850 fieldlen = 0;
6851 }
6852
6853 prefixlen = p-spat;
6854 sublen = sdslen(ssub);
6855 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6856 memcpy(keyname.buf,spat,prefixlen);
6857 memcpy(keyname.buf+prefixlen,ssub,sublen);
6858 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6859 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6860 keyname.len = prefixlen+sublen+postfixlen;
6861 decrRefCount(subst);
6862
6863 /* Lookup substituted key */
6864 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6865 o = lookupKeyRead(db,&keyobj);
6866 if (o == NULL) return NULL;
6867
6868 if (fieldlen > 0) {
6869 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6870
6871 /* Retrieve value from hash by the field name. This operation
6872 * already increases the refcount of the returned object. */
6873 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6874 o = hashGet(o, &fieldobj);
6875 } else {
6876 if (o->type != REDIS_STRING) return NULL;
6877
6878 /* Every object that this function returns needs to have its refcount
6879 * increased. sortCommand decreases it again. */
6880 incrRefCount(o);
6881 }
6882
6883 return o;
6884 }
6885
6886 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6887 * the additional parameter is not standard but a BSD-specific we have to
6888 * pass sorting parameters via the global 'server' structure */
6889 static int sortCompare(const void *s1, const void *s2) {
6890 const redisSortObject *so1 = s1, *so2 = s2;
6891 int cmp;
6892
6893 if (!server.sort_alpha) {
6894 /* Numeric sorting. Here it's trivial as we precomputed scores */
6895 if (so1->u.score > so2->u.score) {
6896 cmp = 1;
6897 } else if (so1->u.score < so2->u.score) {
6898 cmp = -1;
6899 } else {
6900 cmp = 0;
6901 }
6902 } else {
6903 /* Alphanumeric sorting */
6904 if (server.sort_bypattern) {
6905 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6906 /* At least one compare object is NULL */
6907 if (so1->u.cmpobj == so2->u.cmpobj)
6908 cmp = 0;
6909 else if (so1->u.cmpobj == NULL)
6910 cmp = -1;
6911 else
6912 cmp = 1;
6913 } else {
6914 /* We have both the objects, use strcoll */
6915 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6916 }
6917 } else {
6918 /* Compare elements directly. */
6919 cmp = compareStringObjects(so1->obj,so2->obj);
6920 }
6921 }
6922 return server.sort_desc ? -cmp : cmp;
6923 }
6924
6925 /* The SORT command is the most complex command in Redis. Warning: this code
6926 * is optimized for speed and a bit less for readability */
6927 static void sortCommand(redisClient *c) {
6928 list *operations;
6929 int outputlen = 0;
6930 int desc = 0, alpha = 0;
6931 int limit_start = 0, limit_count = -1, start, end;
6932 int j, dontsort = 0, vectorlen;
6933 int getop = 0; /* GET operation counter */
6934 robj *sortval, *sortby = NULL, *storekey = NULL;
6935 redisSortObject *vector; /* Resulting vector to sort */
6936
6937 /* Lookup the key to sort. It must be of the right types */
6938 sortval = lookupKeyRead(c->db,c->argv[1]);
6939 if (sortval == NULL) {
6940 addReply(c,shared.emptymultibulk);
6941 return;
6942 }
6943 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6944 sortval->type != REDIS_ZSET)
6945 {
6946 addReply(c,shared.wrongtypeerr);
6947 return;
6948 }
6949
6950 /* Create a list of operations to perform for every sorted element.
6951 * Operations can be GET/DEL/INCR/DECR */
6952 operations = listCreate();
6953 listSetFreeMethod(operations,zfree);
6954 j = 2;
6955
6956 /* Now we need to protect sortval incrementing its count, in the future
6957 * SORT may have options able to overwrite/delete keys during the sorting
6958 * and the sorted key itself may get destroied */
6959 incrRefCount(sortval);
6960
6961 /* The SORT command has an SQL-alike syntax, parse it */
6962 while(j < c->argc) {
6963 int leftargs = c->argc-j-1;
6964 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6965 desc = 0;
6966 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6967 desc = 1;
6968 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6969 alpha = 1;
6970 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6971 limit_start = atoi(c->argv[j+1]->ptr);
6972 limit_count = atoi(c->argv[j+2]->ptr);
6973 j+=2;
6974 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6975 storekey = c->argv[j+1];
6976 j++;
6977 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6978 sortby = c->argv[j+1];
6979 /* If the BY pattern does not contain '*', i.e. it is constant,
6980 * we don't need to sort nor to lookup the weight keys. */
6981 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6982 j++;
6983 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6984 listAddNodeTail(operations,createSortOperation(
6985 REDIS_SORT_GET,c->argv[j+1]));
6986 getop++;
6987 j++;
6988 } else {
6989 decrRefCount(sortval);
6990 listRelease(operations);
6991 addReply(c,shared.syntaxerr);
6992 return;
6993 }
6994 j++;
6995 }
6996
6997 /* Load the sorting vector with all the objects to sort */
6998 switch(sortval->type) {
6999 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7000 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7001 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7002 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7003 }
7004 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7005 j = 0;
7006
7007 if (sortval->type == REDIS_LIST) {
7008 list *list = sortval->ptr;
7009 listNode *ln;
7010 listIter li;
7011
7012 listRewind(list,&li);
7013 while((ln = listNext(&li))) {
7014 robj *ele = ln->value;
7015 vector[j].obj = ele;
7016 vector[j].u.score = 0;
7017 vector[j].u.cmpobj = NULL;
7018 j++;
7019 }
7020 } else {
7021 dict *set;
7022 dictIterator *di;
7023 dictEntry *setele;
7024
7025 if (sortval->type == REDIS_SET) {
7026 set = sortval->ptr;
7027 } else {
7028 zset *zs = sortval->ptr;
7029 set = zs->dict;
7030 }
7031
7032 di = dictGetIterator(set);
7033 while((setele = dictNext(di)) != NULL) {
7034 vector[j].obj = dictGetEntryKey(setele);
7035 vector[j].u.score = 0;
7036 vector[j].u.cmpobj = NULL;
7037 j++;
7038 }
7039 dictReleaseIterator(di);
7040 }
7041 redisAssert(j == vectorlen);
7042
7043 /* Now it's time to load the right scores in the sorting vector */
7044 if (dontsort == 0) {
7045 for (j = 0; j < vectorlen; j++) {
7046 robj *byval;
7047 if (sortby) {
7048 /* lookup value to sort by */
7049 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7050 if (!byval) continue;
7051 } else {
7052 /* use object itself to sort by */
7053 byval = vector[j].obj;
7054 }
7055
7056 if (alpha) {
7057 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7058 } else {
7059 if (byval->encoding == REDIS_ENCODING_RAW) {
7060 vector[j].u.score = strtod(byval->ptr,NULL);
7061 } else if (byval->encoding == REDIS_ENCODING_INT) {
7062 /* Don't need to decode the object if it's
7063 * integer-encoded (the only encoding supported) so
7064 * far. We can just cast it */
7065 vector[j].u.score = (long)byval->ptr;
7066 } else {
7067 redisAssert(1 != 1);
7068 }
7069 }
7070
7071 /* when the object was retrieved using lookupKeyByPattern,
7072 * its refcount needs to be decreased. */
7073 if (sortby) {
7074 decrRefCount(byval);
7075 }
7076 }
7077 }
7078
7079 /* We are ready to sort the vector... perform a bit of sanity check
7080 * on the LIMIT option too. We'll use a partial version of quicksort. */
7081 start = (limit_start < 0) ? 0 : limit_start;
7082 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7083 if (start >= vectorlen) {
7084 start = vectorlen-1;
7085 end = vectorlen-2;
7086 }
7087 if (end >= vectorlen) end = vectorlen-1;
7088
7089 if (dontsort == 0) {
7090 server.sort_desc = desc;
7091 server.sort_alpha = alpha;
7092 server.sort_bypattern = sortby ? 1 : 0;
7093 if (sortby && (start != 0 || end != vectorlen-1))
7094 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7095 else
7096 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7097 }
7098
7099 /* Send command output to the output buffer, performing the specified
7100 * GET/DEL/INCR/DECR operations if any. */
7101 outputlen = getop ? getop*(end-start+1) : end-start+1;
7102 if (storekey == NULL) {
7103 /* STORE option not specified, sent the sorting result to client */
7104 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7105 for (j = start; j <= end; j++) {
7106 listNode *ln;
7107 listIter li;
7108
7109 if (!getop) addReplyBulk(c,vector[j].obj);
7110 listRewind(operations,&li);
7111 while((ln = listNext(&li))) {
7112 redisSortOperation *sop = ln->value;
7113 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7114 vector[j].obj);
7115
7116 if (sop->type == REDIS_SORT_GET) {
7117 if (!val) {
7118 addReply(c,shared.nullbulk);
7119 } else {
7120 addReplyBulk(c,val);
7121 decrRefCount(val);
7122 }
7123 } else {
7124 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7125 }
7126 }
7127 }
7128 } else {
7129 robj *listObject = createListObject();
7130 list *listPtr = (list*) listObject->ptr;
7131
7132 /* STORE option specified, set the sorting result as a List object */
7133 for (j = start; j <= end; j++) {
7134 listNode *ln;
7135 listIter li;
7136
7137 if (!getop) {
7138 listAddNodeTail(listPtr,vector[j].obj);
7139 incrRefCount(vector[j].obj);
7140 }
7141 listRewind(operations,&li);
7142 while((ln = listNext(&li))) {
7143 redisSortOperation *sop = ln->value;
7144 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7145 vector[j].obj);
7146
7147 if (sop->type == REDIS_SORT_GET) {
7148 if (!val) {
7149 listAddNodeTail(listPtr,createStringObject("",0));
7150 } else {
7151 /* We should do a incrRefCount on val because it is
7152 * added to the list, but also a decrRefCount because
7153 * it is returned by lookupKeyByPattern. This results
7154 * in doing nothing at all. */
7155 listAddNodeTail(listPtr,val);
7156 }
7157 } else {
7158 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7159 }
7160 }
7161 }
7162 if (dictReplace(c->db->dict,storekey,listObject)) {
7163 incrRefCount(storekey);
7164 }
7165 /* Note: we add 1 because the DB is dirty anyway since even if the
7166 * SORT result is empty a new key is set and maybe the old content
7167 * replaced. */
7168 server.dirty += 1+outputlen;
7169 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7170 }
7171
7172 /* Cleanup */
7173 decrRefCount(sortval);
7174 listRelease(operations);
7175 for (j = 0; j < vectorlen; j++) {
7176 if (alpha && vector[j].u.cmpobj)
7177 decrRefCount(vector[j].u.cmpobj);
7178 }
7179 zfree(vector);
7180 }
7181
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * possible output is well under 32 bytes). Values below 1024 are printed
 * as an exact byte count; larger values are scaled by powers of 1024 and
 * printed with two decimals using the K, M, G, T and P suffixes.
 *
 * Every input writes to 's': sizes of 1 TiB and above are handled
 * explicitly so the buffer is never left uninitialised for the caller. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        sprintf(s,"%.2fK",(double)n/kib);
    } else if (n < gib) {
        sprintf(s,"%.2fM",(double)n/mib);
    } else if (n < tib) {
        sprintf(s,"%.2fG",(double)n/gib);
    } else if (n < pib) {
        sprintf(s,"%.2fT",(double)n/tib);
    } else {
        /* Anything bigger is expressed in pebibytes. */
        sprintf(s,"%.2fP",(double)n/pib);
    }
}
7202
7203 /* Create the string returned by the INFO command. This is decoupled
7204 * by the INFO command itself as we need to report the same information
7205 * on memory corruption problems. */
7206 static sds genRedisInfoString(void) {
7207 sds info;
7208 time_t uptime = time(NULL)-server.stat_starttime;
7209 int j;
7210 char hmem[64];
7211
7212 bytesToHuman(hmem,zmalloc_used_memory());
7213 info = sdscatprintf(sdsempty(),
7214 "redis_version:%s\r\n"
7215 "redis_git_sha1:%s\r\n"
7216 "redis_git_dirty:%d\r\n"
7217 "arch_bits:%s\r\n"
7218 "multiplexing_api:%s\r\n"
7219 "process_id:%ld\r\n"
7220 "uptime_in_seconds:%ld\r\n"
7221 "uptime_in_days:%ld\r\n"
7222 "connected_clients:%d\r\n"
7223 "connected_slaves:%d\r\n"
7224 "blocked_clients:%d\r\n"
7225 "used_memory:%zu\r\n"
7226 "used_memory_human:%s\r\n"
7227 "changes_since_last_save:%lld\r\n"
7228 "bgsave_in_progress:%d\r\n"
7229 "last_save_time:%ld\r\n"
7230 "bgrewriteaof_in_progress:%d\r\n"
7231 "total_connections_received:%lld\r\n"
7232 "total_commands_processed:%lld\r\n"
7233 "expired_keys:%lld\r\n"
7234 "hash_max_zipmap_entries:%zu\r\n"
7235 "hash_max_zipmap_value:%zu\r\n"
7236 "pubsub_channels:%ld\r\n"
7237 "pubsub_patterns:%u\r\n"
7238 "vm_enabled:%d\r\n"
7239 "role:%s\r\n"
7240 ,REDIS_VERSION,
7241 REDIS_GIT_SHA1,
7242 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7243 (sizeof(long) == 8) ? "64" : "32",
7244 aeGetApiName(),
7245 (long) getpid(),
7246 uptime,
7247 uptime/(3600*24),
7248 listLength(server.clients)-listLength(server.slaves),
7249 listLength(server.slaves),
7250 server.blpop_blocked_clients,
7251 zmalloc_used_memory(),
7252 hmem,
7253 server.dirty,
7254 server.bgsavechildpid != -1,
7255 server.lastsave,
7256 server.bgrewritechildpid != -1,
7257 server.stat_numconnections,
7258 server.stat_numcommands,
7259 server.stat_expiredkeys,
7260 server.hash_max_zipmap_entries,
7261 server.hash_max_zipmap_value,
7262 dictSize(server.pubsub_channels),
7263 listLength(server.pubsub_patterns),
7264 server.vm_enabled != 0,
7265 server.masterhost == NULL ? "master" : "slave"
7266 );
7267 if (server.masterhost) {
7268 info = sdscatprintf(info,
7269 "master_host:%s\r\n"
7270 "master_port:%d\r\n"
7271 "master_link_status:%s\r\n"
7272 "master_last_io_seconds_ago:%d\r\n"
7273 ,server.masterhost,
7274 server.masterport,
7275 (server.replstate == REDIS_REPL_CONNECTED) ?
7276 "up" : "down",
7277 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7278 );
7279 }
7280 if (server.vm_enabled) {
7281 lockThreadedIO();
7282 info = sdscatprintf(info,
7283 "vm_conf_max_memory:%llu\r\n"
7284 "vm_conf_page_size:%llu\r\n"
7285 "vm_conf_pages:%llu\r\n"
7286 "vm_stats_used_pages:%llu\r\n"
7287 "vm_stats_swapped_objects:%llu\r\n"
7288 "vm_stats_swappin_count:%llu\r\n"
7289 "vm_stats_swappout_count:%llu\r\n"
7290 "vm_stats_io_newjobs_len:%lu\r\n"
7291 "vm_stats_io_processing_len:%lu\r\n"
7292 "vm_stats_io_processed_len:%lu\r\n"
7293 "vm_stats_io_active_threads:%lu\r\n"
7294 "vm_stats_blocked_clients:%lu\r\n"
7295 ,(unsigned long long) server.vm_max_memory,
7296 (unsigned long long) server.vm_page_size,
7297 (unsigned long long) server.vm_pages,
7298 (unsigned long long) server.vm_stats_used_pages,
7299 (unsigned long long) server.vm_stats_swapped_objects,
7300 (unsigned long long) server.vm_stats_swapins,
7301 (unsigned long long) server.vm_stats_swapouts,
7302 (unsigned long) listLength(server.io_newjobs),
7303 (unsigned long) listLength(server.io_processing),
7304 (unsigned long) listLength(server.io_processed),
7305 (unsigned long) server.io_active_threads,
7306 (unsigned long) server.vm_blocked_clients
7307 );
7308 unlockThreadedIO();
7309 }
7310 for (j = 0; j < server.dbnum; j++) {
7311 long long keys, vkeys;
7312
7313 keys = dictSize(server.db[j].dict);
7314 vkeys = dictSize(server.db[j].expires);
7315 if (keys || vkeys) {
7316 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7317 j, keys, vkeys);
7318 }
7319 }
7320 return info;
7321 }
7322
7323 static void infoCommand(redisClient *c) {
7324 sds info = genRedisInfoString();
7325 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7326 (unsigned long)sdslen(info)));
7327 addReplySds(c,info);
7328 addReply(c,shared.crlf);
7329 }
7330
7331 static void monitorCommand(redisClient *c) {
7332 /* ignore MONITOR if aleady slave or in monitor mode */
7333 if (c->flags & REDIS_SLAVE) return;
7334
7335 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7336 c->slaveseldb = 0;
7337 listAddNodeTail(server.monitors,c);
7338 addReply(c,shared.ok);
7339 }
7340
7341 /* ================================= Expire ================================= */
7342 static int removeExpire(redisDb *db, robj *key) {
7343 if (dictDelete(db->expires,key) == DICT_OK) {
7344 return 1;
7345 } else {
7346 return 0;
7347 }
7348 }
7349
7350 static int setExpire(redisDb *db, robj *key, time_t when) {
7351 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7352 return 0;
7353 } else {
7354 incrRefCount(key);
7355 return 1;
7356 }
7357 }
7358
7359 /* Return the expire time of the specified key, or -1 if no expire
7360 * is associated with this key (i.e. the key is non volatile) */
7361 static time_t getExpire(redisDb *db, robj *key) {
7362 dictEntry *de;
7363
7364 /* No expire? return ASAP */
7365 if (dictSize(db->expires) == 0 ||
7366 (de = dictFind(db->expires,key)) == NULL) return -1;
7367
7368 return (time_t) dictGetEntryVal(de);
7369 }
7370
7371 static int expireIfNeeded(redisDb *db, robj *key) {
7372 time_t when;
7373 dictEntry *de;
7374
7375 /* No expire? return ASAP */
7376 if (dictSize(db->expires) == 0 ||
7377 (de = dictFind(db->expires,key)) == NULL) return 0;
7378
7379 /* Lookup the expire */
7380 when = (time_t) dictGetEntryVal(de);
7381 if (time(NULL) <= when) return 0;
7382
7383 /* Delete the key */
7384 dictDelete(db->expires,key);
7385 server.stat_expiredkeys++;
7386 return dictDelete(db->dict,key) == DICT_OK;
7387 }
7388
7389 static int deleteIfVolatile(redisDb *db, robj *key) {
7390 dictEntry *de;
7391
7392 /* No expire? return ASAP */
7393 if (dictSize(db->expires) == 0 ||
7394 (de = dictFind(db->expires,key)) == NULL) return 0;
7395
7396 /* Delete the key */
7397 server.dirty++;
7398 server.stat_expiredkeys++;
7399 dictDelete(db->expires,key);
7400 return dictDelete(db->dict,key) == DICT_OK;
7401 }
7402
7403 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7404 dictEntry *de;
7405 time_t seconds;
7406
7407 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7408
7409 seconds -= offset;
7410
7411 de = dictFind(c->db->dict,key);
7412 if (de == NULL) {
7413 addReply(c,shared.czero);
7414 return;
7415 }
7416 if (seconds <= 0) {
7417 if (deleteKey(c->db,key)) server.dirty++;
7418 addReply(c, shared.cone);
7419 return;
7420 } else {
7421 time_t when = time(NULL)+seconds;
7422 if (setExpire(c->db,key,when)) {
7423 addReply(c,shared.cone);
7424 server.dirty++;
7425 } else {
7426 addReply(c,shared.czero);
7427 }
7428 return;
7429 }
7430 }
7431
7432 static void expireCommand(redisClient *c) {
7433 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7434 }
7435
7436 static void expireatCommand(redisClient *c) {
7437 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7438 }
7439
7440 static void ttlCommand(redisClient *c) {
7441 time_t expire;
7442 int ttl = -1;
7443
7444 expire = getExpire(c->db,c->argv[1]);
7445 if (expire != -1) {
7446 ttl = (int) (expire-time(NULL));
7447 if (ttl < 0) ttl = -1;
7448 }
7449 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7450 }
7451
7452 /* ================================ MULTI/EXEC ============================== */
7453
7454 /* Client state initialization for MULTI/EXEC */
7455 static void initClientMultiState(redisClient *c) {
7456 c->mstate.commands = NULL;
7457 c->mstate.count = 0;
7458 }
7459
7460 /* Release all the resources associated with MULTI/EXEC state */
7461 static void freeClientMultiState(redisClient *c) {
7462 int j;
7463
7464 for (j = 0; j < c->mstate.count; j++) {
7465 int i;
7466 multiCmd *mc = c->mstate.commands+j;
7467
7468 for (i = 0; i < mc->argc; i++)
7469 decrRefCount(mc->argv[i]);
7470 zfree(mc->argv);
7471 }
7472 zfree(c->mstate.commands);
7473 }
7474
7475 /* Add a new command into the MULTI commands queue */
7476 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7477 multiCmd *mc;
7478 int j;
7479
7480 c->mstate.commands = zrealloc(c->mstate.commands,
7481 sizeof(multiCmd)*(c->mstate.count+1));
7482 mc = c->mstate.commands+c->mstate.count;
7483 mc->cmd = cmd;
7484 mc->argc = c->argc;
7485 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7486 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7487 for (j = 0; j < c->argc; j++)
7488 incrRefCount(mc->argv[j]);
7489 c->mstate.count++;
7490 }
7491
7492 static void multiCommand(redisClient *c) {
7493 c->flags |= REDIS_MULTI;
7494 addReply(c,shared.ok);
7495 }
7496
7497 static void discardCommand(redisClient *c) {
7498 if (!(c->flags & REDIS_MULTI)) {
7499 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7500 return;
7501 }
7502
7503 freeClientMultiState(c);
7504 initClientMultiState(c);
7505 c->flags &= (~REDIS_MULTI);
7506 addReply(c,shared.ok);
7507 }
7508
7509 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7510 * implememntation for more information. */
7511 static void execCommandReplicateMulti(redisClient *c) {
7512 struct redisCommand *cmd;
7513 robj *multistring = createStringObject("MULTI",5);
7514
7515 cmd = lookupCommand("multi");
7516 if (server.appendonly)
7517 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7518 if (listLength(server.slaves))
7519 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7520 decrRefCount(multistring);
7521 }
7522
7523 static void execCommand(redisClient *c) {
7524 int j;
7525 robj **orig_argv;
7526 int orig_argc;
7527
7528 if (!(c->flags & REDIS_MULTI)) {
7529 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7530 return;
7531 }
7532
7533 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7534 * A failed EXEC will return a multi bulk nil object. */
7535 if (c->flags & REDIS_DIRTY_CAS) {
7536 freeClientMultiState(c);
7537 initClientMultiState(c);
7538 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7539 unwatchAllKeys(c);
7540 addReply(c,shared.nullmultibulk);
7541 return;
7542 }
7543
7544 /* Replicate a MULTI request now that we are sure the block is executed.
7545 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7546 * both the AOF and the replication link will have the same consistency
7547 * and atomicity guarantees. */
7548 execCommandReplicateMulti(c);
7549
7550 /* Exec all the queued commands */
7551 orig_argv = c->argv;
7552 orig_argc = c->argc;
7553 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7554 for (j = 0; j < c->mstate.count; j++) {
7555 c->argc = c->mstate.commands[j].argc;
7556 c->argv = c->mstate.commands[j].argv;
7557 call(c,c->mstate.commands[j].cmd);
7558 }
7559 c->argv = orig_argv;
7560 c->argc = orig_argc;
7561 freeClientMultiState(c);
7562 initClientMultiState(c);
7563 c->flags &= (~REDIS_MULTI);
7564 unwatchAllKeys(c);
7565 /* Make sure the EXEC command is always replicated / AOF, since we
7566 * always send the MULTI command (we can't know beforehand if the
7567 * next operations will contain at least a modification to the DB). */
7568 server.dirty++;
7569 }
7570
7571 /* =========================== Blocking Operations ========================= */
7572
7573 /* Currently Redis blocking operations support is limited to list POP ops,
7574 * so the current implementation is not fully generic, but it is also not
7575 * completely specific so it will not require a rewrite to support new
7576 * kind of blocking operations in the future.
7577 *
7578 * Still it's important to note that list blocking operations can be already
7579 * used as a notification mechanism in order to implement other blocking
7580 * operations at application level, so there must be a very strong evidence
7581 * of usefulness and generality before new blocking operations are implemented.
7582 *
7583 * This is how the current blocking POP works, we use BLPOP as example:
7584 * - If the user calls BLPOP and the key exists and contains a non empty list
7585 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7586 * if there is no need to block.
7587 * - If instead BLPOP is called and the key does not exist or the list is
7588 * empty we need to block. In order to do so we remove the notification for
7589 * new data to read in the client socket (so that we'll not serve new
7590 * requests if the blocking request is not served). Also we put the client
7591 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7592 * blocked on these keys.
7593 * - If a PUSH operation against a key with blocked clients waiting is
7594 * performed, we serve the first in the list: basically instead of pushing
7595 * the new element inside the list we return it to the (first / oldest)
7596 * blocking client, unblock the client, and remove it from the list.
7597 *
7598 * The above comment and the source code should be enough in order to understand
7599 * the implementation and modify / fix it later.
7600 */
7601
7602 /* Set a client in blocking mode for the specified key, with the specified
7603 * timeout */
7604 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7605 dictEntry *de;
7606 list *l;
7607 int j;
7608
7609 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7610 c->blocking_keys_num = numkeys;
7611 c->blockingto = timeout;
7612 for (j = 0; j < numkeys; j++) {
7613 /* Add the key in the client structure, to map clients -> keys */
7614 c->blocking_keys[j] = keys[j];
7615 incrRefCount(keys[j]);
7616
7617 /* And in the other "side", to map keys -> clients */
7618 de = dictFind(c->db->blocking_keys,keys[j]);
7619 if (de == NULL) {
7620 int retval;
7621
7622 /* For every key we take a list of clients blocked for it */
7623 l = listCreate();
7624 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7625 incrRefCount(keys[j]);
7626 assert(retval == DICT_OK);
7627 } else {
7628 l = dictGetEntryVal(de);
7629 }
7630 listAddNodeTail(l,c);
7631 }
7632 /* Mark the client as a blocked client */
7633 c->flags |= REDIS_BLOCKED;
7634 server.blpop_blocked_clients++;
7635 }
7636
7637 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7638 static void unblockClientWaitingData(redisClient *c) {
7639 dictEntry *de;
7640 list *l;
7641 int j;
7642
7643 assert(c->blocking_keys != NULL);
7644 /* The client may wait for multiple keys, so unblock it for every key. */
7645 for (j = 0; j < c->blocking_keys_num; j++) {
7646 /* Remove this client from the list of clients waiting for this key. */
7647 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7648 assert(de != NULL);
7649 l = dictGetEntryVal(de);
7650 listDelNode(l,listSearchKey(l,c));
7651 /* If the list is empty we need to remove it to avoid wasting memory */
7652 if (listLength(l) == 0)
7653 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7654 decrRefCount(c->blocking_keys[j]);
7655 }
7656 /* Cleanup the client structure */
7657 zfree(c->blocking_keys);
7658 c->blocking_keys = NULL;
7659 c->flags &= (~REDIS_BLOCKED);
7660 server.blpop_blocked_clients--;
7661 /* We want to process data if there is some command waiting
7662 * in the input buffer. Note that this is safe even if
7663 * unblockClientWaitingData() gets called from freeClient() because
7664 * freeClient() will be smart enough to call this function
7665 * *after* c->querybuf was set to NULL. */
7666 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7667 }
7668
7669 /* This should be called from any function PUSHing into lists.
7670 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7671 * 'ele' is the element pushed.
7672 *
7673 * If the function returns 0 there was no client waiting for a list push
7674 * against this key.
7675 *
7676 * If the function returns 1 there was a client waiting for a list push
7677 * against this key, the element was passed to this client thus it's not
7678 * needed to actually add it to the list and the caller should return asap. */
7679 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7680 struct dictEntry *de;
7681 redisClient *receiver;
7682 list *l;
7683 listNode *ln;
7684
7685 de = dictFind(c->db->blocking_keys,key);
7686 if (de == NULL) return 0;
7687 l = dictGetEntryVal(de);
7688 ln = listFirst(l);
7689 assert(ln != NULL);
7690 receiver = ln->value;
7691
7692 addReplySds(receiver,sdsnew("*2\r\n"));
7693 addReplyBulk(receiver,key);
7694 addReplyBulk(receiver,ele);
7695 unblockClientWaitingData(receiver);
7696 return 1;
7697 }
7698
7699 /* Blocking RPOP/LPOP */
7700 static void blockingPopGenericCommand(redisClient *c, int where) {
7701 robj *o;
7702 time_t timeout;
7703 int j;
7704
7705 for (j = 1; j < c->argc-1; j++) {
7706 o = lookupKeyWrite(c->db,c->argv[j]);
7707 if (o != NULL) {
7708 if (o->type != REDIS_LIST) {
7709 addReply(c,shared.wrongtypeerr);
7710 return;
7711 } else {
7712 list *list = o->ptr;
7713 if (listLength(list) != 0) {
7714 /* If the list contains elements fall back to the usual
7715 * non-blocking POP operation */
7716 robj *argv[2], **orig_argv;
7717 int orig_argc;
7718
7719 /* We need to alter the command arguments before to call
7720 * popGenericCommand() as the command takes a single key. */
7721 orig_argv = c->argv;
7722 orig_argc = c->argc;
7723 argv[1] = c->argv[j];
7724 c->argv = argv;
7725 c->argc = 2;
7726
7727 /* Also the return value is different, we need to output
7728 * the multi bulk reply header and the key name. The
7729 * "real" command will add the last element (the value)
7730 * for us. If this souds like an hack to you it's just
7731 * because it is... */
7732 addReplySds(c,sdsnew("*2\r\n"));
7733 addReplyBulk(c,argv[1]);
7734 popGenericCommand(c,where);
7735
7736 /* Fix the client structure with the original stuff */
7737 c->argv = orig_argv;
7738 c->argc = orig_argc;
7739 return;
7740 }
7741 }
7742 }
7743 }
7744 /* If the list is empty or the key does not exists we must block */
7745 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7746 if (timeout > 0) timeout += time(NULL);
7747 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7748 }
7749
7750 static void blpopCommand(redisClient *c) {
7751 blockingPopGenericCommand(c,REDIS_HEAD);
7752 }
7753
7754 static void brpopCommand(redisClient *c) {
7755 blockingPopGenericCommand(c,REDIS_TAIL);
7756 }
7757
7758 /* =============================== Replication ============================= */
7759
7760 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7761 ssize_t nwritten, ret = size;
7762 time_t start = time(NULL);
7763
7764 timeout++;
7765 while(size) {
7766 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7767 nwritten = write(fd,ptr,size);
7768 if (nwritten == -1) return -1;
7769 ptr += nwritten;
7770 size -= nwritten;
7771 }
7772 if ((time(NULL)-start) > timeout) {
7773 errno = ETIMEDOUT;
7774 return -1;
7775 }
7776 }
7777 return ret;
7778 }
7779
7780 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7781 ssize_t nread, totread = 0;
7782 time_t start = time(NULL);
7783
7784 timeout++;
7785 while(size) {
7786 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7787 nread = read(fd,ptr,size);
7788 if (nread == -1) return -1;
7789 ptr += nread;
7790 size -= nread;
7791 totread += nread;
7792 }
7793 if ((time(NULL)-start) > timeout) {
7794 errno = ETIMEDOUT;
7795 return -1;
7796 }
7797 }
7798 return totread;
7799 }
7800
/* Read a line from 'fd', one byte at a time via syncRead(), storing it as a
 * nul terminated string into 'ptr', a buffer of 'size' bytes.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is replaced by the terminator. Reading also stops once size-1 bytes
 * have been stored, so the buffer can never be overrun.
 *
 * Returns the number of bytes stored before the '\n' (a stripped '\r' is
 * still counted), or -1 if syncRead() fails or 'size' is not positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return -1;
    *ptr = '\0'; /* always hand back a valid string */

    size--; /* reserve room for the nul terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One more byte of the buffer is used: without this decrement the
         * loop never stops on lines longer than the buffer. */
        size--;
    }
    return nread;
}
7821
7822 static void syncCommand(redisClient *c) {
7823 /* ignore SYNC if aleady slave or in monitor mode */
7824 if (c->flags & REDIS_SLAVE) return;
7825
7826 /* SYNC can't be issued when the server has pending data to send to
7827 * the client about already issued commands. We need a fresh reply
7828 * buffer registering the differences between the BGSAVE and the current
7829 * dataset, so that we can copy to other slaves if needed. */
7830 if (listLength(c->reply) != 0) {
7831 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7832 return;
7833 }
7834
7835 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7836 /* Here we need to check if there is a background saving operation
7837 * in progress, or if it is required to start one */
7838 if (server.bgsavechildpid != -1) {
7839 /* Ok a background save is in progress. Let's check if it is a good
7840 * one for replication, i.e. if there is another slave that is
7841 * registering differences since the server forked to save */
7842 redisClient *slave;
7843 listNode *ln;
7844 listIter li;
7845
7846 listRewind(server.slaves,&li);
7847 while((ln = listNext(&li))) {
7848 slave = ln->value;
7849 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7850 }
7851 if (ln) {
7852 /* Perfect, the server is already registering differences for
7853 * another slave. Set the right state, and copy the buffer. */
7854 listRelease(c->reply);
7855 c->reply = listDup(slave->reply);
7856 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7857 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7858 } else {
7859 /* No way, we need to wait for the next BGSAVE in order to
7860 * register differences */
7861 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7862 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7863 }
7864 } else {
7865 /* Ok we don't have a BGSAVE in progress, let's start one */
7866 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7867 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7868 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7869 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7870 return;
7871 }
7872 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7873 }
7874 c->repldbfd = -1;
7875 c->flags |= REDIS_SLAVE;
7876 c->slaveseldb = 0;
7877 listAddNodeTail(server.slaves,c);
7878 return;
7879 }
7880
7881 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7882 redisClient *slave = privdata;
7883 REDIS_NOTUSED(el);
7884 REDIS_NOTUSED(mask);
7885 char buf[REDIS_IOBUF_LEN];
7886 ssize_t nwritten, buflen;
7887
7888 if (slave->repldboff == 0) {
7889 /* Write the bulk write count before to transfer the DB. In theory here
7890 * we don't know how much room there is in the output buffer of the
7891 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7892 * operations) will never be smaller than the few bytes we need. */
7893 sds bulkcount;
7894
7895 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7896 slave->repldbsize);
7897 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7898 {
7899 sdsfree(bulkcount);
7900 freeClient(slave);
7901 return;
7902 }
7903 sdsfree(bulkcount);
7904 }
7905 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7906 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7907 if (buflen <= 0) {
7908 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7909 (buflen == 0) ? "premature EOF" : strerror(errno));
7910 freeClient(slave);
7911 return;
7912 }
7913 if ((nwritten = write(fd,buf,buflen)) == -1) {
7914 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7915 strerror(errno));
7916 freeClient(slave);
7917 return;
7918 }
7919 slave->repldboff += nwritten;
7920 if (slave->repldboff == slave->repldbsize) {
7921 close(slave->repldbfd);
7922 slave->repldbfd = -1;
7923 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7924 slave->replstate = REDIS_REPL_ONLINE;
7925 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7926 sendReplyToClient, slave) == AE_ERR) {
7927 freeClient(slave);
7928 return;
7929 }
7930 addReplySds(slave,sdsempty());
7931 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7932 }
7933 }
7934
7935 /* This function is called at the end of every backgrond saving.
7936 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7937 * otherwise REDIS_ERR is passed to the function.
7938 *
7939 * The goal of this function is to handle slaves waiting for a successful
7940 * background saving in order to perform non-blocking synchronization. */
7941 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7942 listNode *ln;
7943 int startbgsave = 0;
7944 listIter li;
7945
7946 listRewind(server.slaves,&li);
7947 while((ln = listNext(&li))) {
7948 redisClient *slave = ln->value;
7949
7950 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7951 startbgsave = 1;
7952 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7953 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7954 struct redis_stat buf;
7955
7956 if (bgsaveerr != REDIS_OK) {
7957 freeClient(slave);
7958 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7959 continue;
7960 }
7961 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7962 redis_fstat(slave->repldbfd,&buf) == -1) {
7963 freeClient(slave);
7964 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7965 continue;
7966 }
7967 slave->repldboff = 0;
7968 slave->repldbsize = buf.st_size;
7969 slave->replstate = REDIS_REPL_SEND_BULK;
7970 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7971 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7972 freeClient(slave);
7973 continue;
7974 }
7975 }
7976 }
7977 if (startbgsave) {
7978 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7979 listIter li;
7980
7981 listRewind(server.slaves,&li);
7982 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7983 while((ln = listNext(&li))) {
7984 redisClient *slave = ln->value;
7985
7986 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7987 freeClient(slave);
7988 }
7989 }
7990 }
7991 }
7992
7993 static int syncWithMaster(void) {
7994 char buf[1024], tmpfile[256], authcmd[1024];
7995 long dumpsize;
7996 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7997 int dfd, maxtries = 5;
7998
7999 if (fd == -1) {
8000 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8001 strerror(errno));
8002 return REDIS_ERR;
8003 }
8004
8005 /* AUTH with the master if required. */
8006 if(server.masterauth) {
8007 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8008 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8009 close(fd);
8010 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8011 strerror(errno));
8012 return REDIS_ERR;
8013 }
8014 /* Read the AUTH result. */
8015 if (syncReadLine(fd,buf,1024,3600) == -1) {
8016 close(fd);
8017 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8018 strerror(errno));
8019 return REDIS_ERR;
8020 }
8021 if (buf[0] != '+') {
8022 close(fd);
8023 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8024 return REDIS_ERR;
8025 }
8026 }
8027
8028 /* Issue the SYNC command */
8029 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8030 close(fd);
8031 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8032 strerror(errno));
8033 return REDIS_ERR;
8034 }
8035 /* Read the bulk write count */
8036 if (syncReadLine(fd,buf,1024,3600) == -1) {
8037 close(fd);
8038 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8039 strerror(errno));
8040 return REDIS_ERR;
8041 }
8042 if (buf[0] != '$') {
8043 close(fd);
8044 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8045 return REDIS_ERR;
8046 }
8047 dumpsize = strtol(buf+1,NULL,10);
8048 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8049 /* Read the bulk write data on a temp file */
8050 while(maxtries--) {
8051 snprintf(tmpfile,256,
8052 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8053 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8054 if (dfd != -1) break;
8055 sleep(1);
8056 }
8057 if (dfd == -1) {
8058 close(fd);
8059 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8060 return REDIS_ERR;
8061 }
8062 while(dumpsize) {
8063 int nread, nwritten;
8064
8065 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8066 if (nread == -1) {
8067 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8068 strerror(errno));
8069 close(fd);
8070 close(dfd);
8071 return REDIS_ERR;
8072 }
8073 nwritten = write(dfd,buf,nread);
8074 if (nwritten == -1) {
8075 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8076 close(fd);
8077 close(dfd);
8078 return REDIS_ERR;
8079 }
8080 dumpsize -= nread;
8081 }
8082 close(dfd);
8083 if (rename(tmpfile,server.dbfilename) == -1) {
8084 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8085 unlink(tmpfile);
8086 close(fd);
8087 return REDIS_ERR;
8088 }
8089 emptyDb();
8090 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8091 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8092 close(fd);
8093 return REDIS_ERR;
8094 }
8095 server.master = createClient(fd);
8096 server.master->flags |= REDIS_MASTER;
8097 server.master->authenticated = 1;
8098 server.replstate = REDIS_REPL_CONNECTED;
8099 return REDIS_OK;
8100 }
8101
8102 static void slaveofCommand(redisClient *c) {
8103 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8104 !strcasecmp(c->argv[2]->ptr,"one")) {
8105 if (server.masterhost) {
8106 sdsfree(server.masterhost);
8107 server.masterhost = NULL;
8108 if (server.master) freeClient(server.master);
8109 server.replstate = REDIS_REPL_NONE;
8110 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8111 }
8112 } else {
8113 sdsfree(server.masterhost);
8114 server.masterhost = sdsdup(c->argv[1]->ptr);
8115 server.masterport = atoi(c->argv[2]->ptr);
8116 if (server.master) freeClient(server.master);
8117 server.replstate = REDIS_REPL_CONNECT;
8118 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8119 server.masterhost, server.masterport);
8120 }
8121 addReply(c,shared.ok);
8122 }
8123
8124 /* ============================ Maxmemory directive ======================== */
8125
8126 /* Try to free one object form the pre-allocated objects free list.
8127 * This is useful under low mem conditions as by default we take 1 million
8128 * free objects allocated. On success REDIS_OK is returned, otherwise
8129 * REDIS_ERR. */
8130 static int tryFreeOneObjectFromFreelist(void) {
8131 robj *o;
8132
8133 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8134 if (listLength(server.objfreelist)) {
8135 listNode *head = listFirst(server.objfreelist);
8136 o = listNodeValue(head);
8137 listDelNode(server.objfreelist,head);
8138 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8139 zfree(o);
8140 return REDIS_OK;
8141 } else {
8142 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8143 return REDIS_ERR;
8144 }
8145 }
8146
8147 /* This function gets called when 'maxmemory' is set on the config file to limit
8148 * the max memory used by the server, and we are out of memory.
8149 * This function will try to, in order:
8150 *
8151 * - Free objects from the free list
8152 * - Try to remove keys with an EXPIRE set
8153 *
8154 * It is not possible to free enough memory to reach used-memory < maxmemory
8155 * the server will start refusing commands that will enlarge even more the
8156 * memory usage.
8157 */
8158 static void freeMemoryIfNeeded(void) {
8159 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8160 int j, k, freed = 0;
8161
8162 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8163 for (j = 0; j < server.dbnum; j++) {
8164 int minttl = -1;
8165 robj *minkey = NULL;
8166 struct dictEntry *de;
8167
8168 if (dictSize(server.db[j].expires)) {
8169 freed = 1;
8170 /* From a sample of three keys drop the one nearest to
8171 * the natural expire */
8172 for (k = 0; k < 3; k++) {
8173 time_t t;
8174
8175 de = dictGetRandomKey(server.db[j].expires);
8176 t = (time_t) dictGetEntryVal(de);
8177 if (minttl == -1 || t < minttl) {
8178 minkey = dictGetEntryKey(de);
8179 minttl = t;
8180 }
8181 }
8182 deleteKey(server.db+j,minkey);
8183 }
8184 }
8185 if (!freed) return; /* nothing to free... */
8186 }
8187 }
8188
8189 /* ============================== Append Only file ========================== */
8190
8191 /* Write the append only file buffer on disk.
8192 *
8193 * Since we are required to write the AOF before replying to the client,
8194 * and the only way the client socket can get a write is entering when the
8195 * the event loop, we accumulate all the AOF writes in a memory
8196 * buffer and write it on disk using this function just before entering
8197 * the event loop again. */
8198 static void flushAppendOnlyFile(void) {
8199 time_t now;
8200 ssize_t nwritten;
8201
8202 if (sdslen(server.aofbuf) == 0) return;
8203
8204 /* We want to perform a single write. This should be guaranteed atomic
8205 * at least if the filesystem we are writing is a real physical one.
8206 * While this will save us against the server being killed I don't think
8207 * there is much to do about the whole server stopping for power problems
8208 * or alike */
8209 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8210 if (nwritten != (signed)sdslen(server.aofbuf)) {
8211 /* Ooops, we are in troubles. The best thing to do for now is
8212 * aborting instead of giving the illusion that everything is
8213 * working as expected. */
8214 if (nwritten == -1) {
8215 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8216 } else {
8217 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8218 }
8219 exit(1);
8220 }
8221 sdsfree(server.aofbuf);
8222 server.aofbuf = sdsempty();
8223
8224 /* Fsync if needed */
8225 now = time(NULL);
8226 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8227 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8228 now-server.lastfsync > 1))
8229 {
8230 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8231 * flushing metadata. */
8232 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8233 server.lastfsync = now;
8234 }
8235 }
8236
8237 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8238 int j;
8239 buf = sdscatprintf(buf,"*%d\r\n",argc);
8240 for (j = 0; j < argc; j++) {
8241 robj *o = getDecodedObject(argv[j]);
8242 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8243 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8244 buf = sdscatlen(buf,"\r\n",2);
8245 decrRefCount(o);
8246 }
8247 return buf;
8248 }
8249
8250 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8251 int argc = 3;
8252 long when;
8253 robj *argv[3];
8254
8255 /* Make sure we can use strtol */
8256 seconds = getDecodedObject(seconds);
8257 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8258 decrRefCount(seconds);
8259
8260 argv[0] = createStringObject("EXPIREAT",8);
8261 argv[1] = key;
8262 argv[2] = createObject(REDIS_STRING,
8263 sdscatprintf(sdsempty(),"%ld",when));
8264 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8265 decrRefCount(argv[0]);
8266 decrRefCount(argv[2]);
8267 return buf;
8268 }
8269
8270 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8271 sds buf = sdsempty();
8272 robj *tmpargv[3];
8273
8274 /* The DB this command was targetting is not the same as the last command
8275 * we appendend. To issue a SELECT command is needed. */
8276 if (dictid != server.appendseldb) {
8277 char seldb[64];
8278
8279 snprintf(seldb,sizeof(seldb),"%d",dictid);
8280 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8281 (unsigned long)strlen(seldb),seldb);
8282 server.appendseldb = dictid;
8283 }
8284
8285 if (cmd->proc == expireCommand) {
8286 /* Translate EXPIRE into EXPIREAT */
8287 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8288 } else if (cmd->proc == setexCommand) {
8289 /* Translate SETEX to SET and EXPIREAT */
8290 tmpargv[0] = createStringObject("SET",3);
8291 tmpargv[1] = argv[1];
8292 tmpargv[2] = argv[3];
8293 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8294 decrRefCount(tmpargv[0]);
8295 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8296 } else {
8297 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8298 }
8299
8300 /* Append to the AOF buffer. This will be flushed on disk just before
8301 * of re-entering the event loop, so before the client will get a
8302 * positive reply about the operation performed. */
8303 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8304
8305 /* If a background append only file rewriting is in progress we want to
8306 * accumulate the differences between the child DB and the current one
8307 * in a buffer, so that when the child process will do its work we
8308 * can append the differences to the new append only file. */
8309 if (server.bgrewritechildpid != -1)
8310 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8311
8312 sdsfree(buf);
8313 }
8314
8315 /* In Redis commands are always executed in the context of a client, so in
8316 * order to load the append only file we need to create a fake client. */
8317 static struct redisClient *createFakeClient(void) {
8318 struct redisClient *c = zmalloc(sizeof(*c));
8319
8320 selectDb(c,0);
8321 c->fd = -1;
8322 c->querybuf = sdsempty();
8323 c->argc = 0;
8324 c->argv = NULL;
8325 c->flags = 0;
8326 /* We set the fake client as a slave waiting for the synchronization
8327 * so that Redis will not try to send replies to this client. */
8328 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8329 c->reply = listCreate();
8330 listSetFreeMethod(c->reply,decrRefCount);
8331 listSetDupMethod(c->reply,dupClientReplyValue);
8332 initClientMultiState(c);
8333 return c;
8334 }
8335
8336 static void freeFakeClient(struct redisClient *c) {
8337 sdsfree(c->querybuf);
8338 listRelease(c->reply);
8339 freeClientMultiState(c);
8340 zfree(c);
8341 }
8342
8343 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8344 * error (the append only file is zero-length) REDIS_ERR is returned. On
8345 * fatal error an error message is logged and the program exists. */
8346 int loadAppendOnlyFile(char *filename) {
8347 struct redisClient *fakeClient;
8348 FILE *fp = fopen(filename,"r");
8349 struct redis_stat sb;
8350 unsigned long long loadedkeys = 0;
8351 int appendonly = server.appendonly;
8352
8353 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8354 return REDIS_ERR;
8355
8356 if (fp == NULL) {
8357 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8358 exit(1);
8359 }
8360
8361 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8362 * to the same file we're about to read. */
8363 server.appendonly = 0;
8364
8365 fakeClient = createFakeClient();
8366 while(1) {
8367 int argc, j;
8368 unsigned long len;
8369 robj **argv;
8370 char buf[128];
8371 sds argsds;
8372 struct redisCommand *cmd;
8373
8374 if (fgets(buf,sizeof(buf),fp) == NULL) {
8375 if (feof(fp))
8376 break;
8377 else
8378 goto readerr;
8379 }
8380 if (buf[0] != '*') goto fmterr;
8381 argc = atoi(buf+1);
8382 argv = zmalloc(sizeof(robj*)*argc);
8383 for (j = 0; j < argc; j++) {
8384 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8385 if (buf[0] != '$') goto fmterr;
8386 len = strtol(buf+1,NULL,10);
8387 argsds = sdsnewlen(NULL,len);
8388 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8389 argv[j] = createObject(REDIS_STRING,argsds);
8390 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8391 }
8392
8393 /* Command lookup */
8394 cmd = lookupCommand(argv[0]->ptr);
8395 if (!cmd) {
8396 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8397 exit(1);
8398 }
8399 /* Try object encoding */
8400 if (cmd->flags & REDIS_CMD_BULK)
8401 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8402 /* Run the command in the context of a fake client */
8403 fakeClient->argc = argc;
8404 fakeClient->argv = argv;
8405 cmd->proc(fakeClient);
8406 /* Discard the reply objects list from the fake client */
8407 while(listLength(fakeClient->reply))
8408 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8409 /* Clean up, ready for the next command */
8410 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8411 zfree(argv);
8412 /* Handle swapping while loading big datasets when VM is on */
8413 loadedkeys++;
8414 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8415 while (zmalloc_used_memory() > server.vm_max_memory) {
8416 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8417 }
8418 }
8419 }
8420
8421 /* This point can only be reached when EOF is reached without errors.
8422 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8423 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8424
8425 fclose(fp);
8426 freeFakeClient(fakeClient);
8427 server.appendonly = appendonly;
8428 return REDIS_OK;
8429
8430 readerr:
8431 if (feof(fp)) {
8432 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8433 } else {
8434 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8435 }
8436 exit(1);
8437 fmterr:
8438 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8439 exit(1);
8440 }
8441
8442 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8443 static int fwriteBulkObject(FILE *fp, robj *obj) {
8444 char buf[128];
8445 int decrrc = 0;
8446
8447 /* Avoid the incr/decr ref count business if possible to help
8448 * copy-on-write (we are often in a child process when this function
8449 * is called).
8450 * Also makes sure that key objects don't get incrRefCount-ed when VM
8451 * is enabled */
8452 if (obj->encoding != REDIS_ENCODING_RAW) {
8453 obj = getDecodedObject(obj);
8454 decrrc = 1;
8455 }
8456 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8457 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8458 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8459 goto err;
8460 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8461 if (decrrc) decrRefCount(obj);
8462 return 1;
8463 err:
8464 if (decrrc) decrRefCount(obj);
8465 return 0;
8466 }
8467
/* Write the binary-safe string 's', 'len' bytes long, to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: format it with %lu (it used to be %ld, which does
     * not match the argument type). */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns 0 when asked to write zero bytes: skip the payload
     * of an empty string instead of reporting a bogus error. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8479
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t plen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the announced count: hence the -2. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8490
/* Write the long 'l', as a decimal string, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t plen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the announced count: hence the -2. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8501
8502 /* Write a sequence of commands able to fully rebuild the dataset into
8503 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8504 static int rewriteAppendOnlyFile(char *filename) {
8505 dictIterator *di = NULL;
8506 dictEntry *de;
8507 FILE *fp;
8508 char tmpfile[256];
8509 int j;
8510 time_t now = time(NULL);
8511
8512 /* Note that we have to use a different temp name here compared to the
8513 * one used by rewriteAppendOnlyFileBackground() function. */
8514 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8515 fp = fopen(tmpfile,"w");
8516 if (!fp) {
8517 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8518 return REDIS_ERR;
8519 }
8520 for (j = 0; j < server.dbnum; j++) {
8521 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8522 redisDb *db = server.db+j;
8523 dict *d = db->dict;
8524 if (dictSize(d) == 0) continue;
8525 di = dictGetIterator(d);
8526 if (!di) {
8527 fclose(fp);
8528 return REDIS_ERR;
8529 }
8530
8531 /* SELECT the new DB */
8532 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8533 if (fwriteBulkLong(fp,j) == 0) goto werr;
8534
8535 /* Iterate this DB writing every entry */
8536 while((de = dictNext(di)) != NULL) {
8537 robj *key, *o;
8538 time_t expiretime;
8539 int swapped;
8540
8541 key = dictGetEntryKey(de);
8542 /* If the value for this key is swapped, load a preview in memory.
8543 * We use a "swapped" flag to remember if we need to free the
8544 * value object instead to just increment the ref count anyway
8545 * in order to avoid copy-on-write of pages if we are forked() */
8546 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8547 key->storage == REDIS_VM_SWAPPING) {
8548 o = dictGetEntryVal(de);
8549 swapped = 0;
8550 } else {
8551 o = vmPreviewObject(key);
8552 swapped = 1;
8553 }
8554 expiretime = getExpire(db,key);
8555
8556 /* Save the key and associated value */
8557 if (o->type == REDIS_STRING) {
8558 /* Emit a SET command */
8559 char cmd[]="*3\r\n$3\r\nSET\r\n";
8560 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8561 /* Key and value */
8562 if (fwriteBulkObject(fp,key) == 0) goto werr;
8563 if (fwriteBulkObject(fp,o) == 0) goto werr;
8564 } else if (o->type == REDIS_LIST) {
8565 /* Emit the RPUSHes needed to rebuild the list */
8566 list *list = o->ptr;
8567 listNode *ln;
8568 listIter li;
8569
8570 listRewind(list,&li);
8571 while((ln = listNext(&li))) {
8572 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8573 robj *eleobj = listNodeValue(ln);
8574
8575 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8576 if (fwriteBulkObject(fp,key) == 0) goto werr;
8577 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8578 }
8579 } else if (o->type == REDIS_SET) {
8580 /* Emit the SADDs needed to rebuild the set */
8581 dict *set = o->ptr;
8582 dictIterator *di = dictGetIterator(set);
8583 dictEntry *de;
8584
8585 while((de = dictNext(di)) != NULL) {
8586 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8587 robj *eleobj = dictGetEntryKey(de);
8588
8589 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8590 if (fwriteBulkObject(fp,key) == 0) goto werr;
8591 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8592 }
8593 dictReleaseIterator(di);
8594 } else if (o->type == REDIS_ZSET) {
8595 /* Emit the ZADDs needed to rebuild the sorted set */
8596 zset *zs = o->ptr;
8597 dictIterator *di = dictGetIterator(zs->dict);
8598 dictEntry *de;
8599
8600 while((de = dictNext(di)) != NULL) {
8601 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8602 robj *eleobj = dictGetEntryKey(de);
8603 double *score = dictGetEntryVal(de);
8604
8605 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8606 if (fwriteBulkObject(fp,key) == 0) goto werr;
8607 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8608 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8609 }
8610 dictReleaseIterator(di);
8611 } else if (o->type == REDIS_HASH) {
8612 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8613
8614 /* Emit the HSETs needed to rebuild the hash */
8615 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8616 unsigned char *p = zipmapRewind(o->ptr);
8617 unsigned char *field, *val;
8618 unsigned int flen, vlen;
8619
8620 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8621 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8622 if (fwriteBulkObject(fp,key) == 0) goto werr;
8623 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8624 return -1;
8625 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8626 return -1;
8627 }
8628 } else {
8629 dictIterator *di = dictGetIterator(o->ptr);
8630 dictEntry *de;
8631
8632 while((de = dictNext(di)) != NULL) {
8633 robj *field = dictGetEntryKey(de);
8634 robj *val = dictGetEntryVal(de);
8635
8636 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8637 if (fwriteBulkObject(fp,key) == 0) goto werr;
8638 if (fwriteBulkObject(fp,field) == -1) return -1;
8639 if (fwriteBulkObject(fp,val) == -1) return -1;
8640 }
8641 dictReleaseIterator(di);
8642 }
8643 } else {
8644 redisPanic("Unknown object type");
8645 }
8646 /* Save the expire time */
8647 if (expiretime != -1) {
8648 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8649 /* If this key is already expired skip it */
8650 if (expiretime < now) continue;
8651 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8652 if (fwriteBulkObject(fp,key) == 0) goto werr;
8653 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8654 }
8655 if (swapped) decrRefCount(o);
8656 }
8657 dictReleaseIterator(di);
8658 }
8659
8660 /* Make sure data will not remain on the OS's output buffers */
8661 fflush(fp);
8662 fsync(fileno(fp));
8663 fclose(fp);
8664
8665 /* Use RENAME to make sure the DB file is changed atomically only
8666 * if the generate DB file is ok. */
8667 if (rename(tmpfile,filename) == -1) {
8668 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8669 unlink(tmpfile);
8670 return REDIS_ERR;
8671 }
8672 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8673 return REDIS_OK;
8674
8675 werr:
8676 fclose(fp);
8677 unlink(tmpfile);
8678 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8679 if (di) dictReleaseIterator(di);
8680 return REDIS_ERR;
8681 }
8682
8683 /* This is how rewriting of the append only file in background works:
8684 *
8685 * 1) The user calls BGREWRITEAOF
8686 * 2) Redis calls this function, that forks():
8687 * 2a) the child rewrite the append only file in a temp file.
8688 * 2b) the parent accumulates differences in server.bgrewritebuf.
8689 * 3) When the child finished '2a' exists.
8690 * 4) The parent will trap the exit code, if it's OK, will append the
8691 * data accumulated into server.bgrewritebuf into the temp file, and
8692 * finally will rename(2) the temp file in the actual file name.
8693 * The the new file is reopened as the new append only file. Profit!
8694 */
8695 static int rewriteAppendOnlyFileBackground(void) {
8696 pid_t childpid;
8697
8698 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8699 if (server.vm_enabled) waitEmptyIOJobsQueue();
8700 if ((childpid = fork()) == 0) {
8701 /* Child */
8702 char tmpfile[256];
8703
8704 if (server.vm_enabled) vmReopenSwapFile();
8705 close(server.fd);
8706 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8707 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8708 _exit(0);
8709 } else {
8710 _exit(1);
8711 }
8712 } else {
8713 /* Parent */
8714 if (childpid == -1) {
8715 redisLog(REDIS_WARNING,
8716 "Can't rewrite append only file in background: fork: %s",
8717 strerror(errno));
8718 return REDIS_ERR;
8719 }
8720 redisLog(REDIS_NOTICE,
8721 "Background append only file rewriting started by pid %d",childpid);
8722 server.bgrewritechildpid = childpid;
8723 updateDictResizePolicy();
8724 /* We set appendseldb to -1 in order to force the next call to the
8725 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8726 * accumulated by the parent into server.bgrewritebuf will start
8727 * with a SELECT statement and it will be safe to merge. */
8728 server.appendseldb = -1;
8729 return REDIS_OK;
8730 }
8731 return REDIS_OK; /* unreached */
8732 }
8733
8734 static void bgrewriteaofCommand(redisClient *c) {
8735 if (server.bgrewritechildpid != -1) {
8736 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8737 return;
8738 }
8739 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8740 char *status = "+Background append only file rewriting started\r\n";
8741 addReplySds(c,sdsnew(status));
8742 } else {
8743 addReply(c,shared.err);
8744 }
8745 }
8746
/* Remove the temp file used by the background rewrite child having the
 * specified pid. The unlink(2) result is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
8753
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8774
8775 /* Called when the user switches from "appendonly yes" to "appendonly no"
8776 * at runtime using the CONFIG command. */
8777 static void stopAppendOnly(void) {
8778 flushAppendOnlyFile();
8779 fsync(server.appendfd);
8780 close(server.appendfd);
8781
8782 server.appendfd = -1;
8783 server.appendseldb = -1;
8784 server.appendonly = 0;
8785 /* rewrite operation in progress? kill it, wait child exit */
8786 if (server.bgsavechildpid != -1) {
8787 int statloc;
8788
8789 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8790 wait3(&statloc,0,NULL);
8791 /* reset the buffer accumulating changes while the child saves */
8792 sdsfree(server.bgrewritebuf);
8793 server.bgrewritebuf = sdsempty();
8794 server.bgsavechildpid = -1;
8795 }
8796 }
8797
8798 /* Called when the user switches from "appendonly no" to "appendonly yes"
8799 * at runtime using the CONFIG command. */
8800 static int startAppendOnly(void) {
8801 server.appendonly = 1;
8802 server.lastfsync = time(NULL);
8803 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8804 if (server.appendfd == -1) {
8805 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8806 return REDIS_ERR;
8807 }
8808 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8809 server.appendonly = 0;
8810 close(server.appendfd);
8811 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8812 return REDIS_ERR;
8813 }
8814 return REDIS_OK;
8815 }
8816
8817 /* =================== Virtual Memory - Blocking Side ====================== */
8818
8819 static void vmInit(void) {
8820 off_t totsize;
8821 int pipefds[2];
8822 size_t stacksize;
8823 struct flock fl;
8824
8825 if (server.vm_max_threads != 0)
8826 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8827
8828 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8829 /* Try to open the old swap file, otherwise create it */
8830 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8831 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8832 }
8833 if (server.vm_fp == NULL) {
8834 redisLog(REDIS_WARNING,
8835 "Can't open the swap file: %s. Exiting.",
8836 strerror(errno));
8837 exit(1);
8838 }
8839 server.vm_fd = fileno(server.vm_fp);
8840 /* Lock the swap file for writing, this is useful in order to avoid
8841 * another instance to use the same swap file for a config error. */
8842 fl.l_type = F_WRLCK;
8843 fl.l_whence = SEEK_SET;
8844 fl.l_start = fl.l_len = 0;
8845 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8846 redisLog(REDIS_WARNING,
8847 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8848 exit(1);
8849 }
8850 /* Initialize */
8851 server.vm_next_page = 0;
8852 server.vm_near_pages = 0;
8853 server.vm_stats_used_pages = 0;
8854 server.vm_stats_swapped_objects = 0;
8855 server.vm_stats_swapouts = 0;
8856 server.vm_stats_swapins = 0;
8857 totsize = server.vm_pages*server.vm_page_size;
8858 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8859 if (ftruncate(server.vm_fd,totsize) == -1) {
8860 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8861 strerror(errno));
8862 exit(1);
8863 } else {
8864 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8865 }
8866 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8867 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8868 (long long) (server.vm_pages+7)/8, server.vm_pages);
8869 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8870
8871 /* Initialize threaded I/O (used by Virtual Memory) */
8872 server.io_newjobs = listCreate();
8873 server.io_processing = listCreate();
8874 server.io_processed = listCreate();
8875 server.io_ready_clients = listCreate();
8876 pthread_mutex_init(&server.io_mutex,NULL);
8877 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8878 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8879 server.io_active_threads = 0;
8880 if (pipe(pipefds) == -1) {
8881 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8882 ,strerror(errno));
8883 exit(1);
8884 }
8885 server.io_ready_pipe_read = pipefds[0];
8886 server.io_ready_pipe_write = pipefds[1];
8887 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8888 /* LZF requires a lot of stack */
8889 pthread_attr_init(&server.io_threads_attr);
8890 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8891 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8892 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8893 /* Listen for events in the threaded I/O pipe */
8894 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8895 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8896 oom("creating file event");
8897 }
8898
8899 /* Mark the page as used */
8900 static void vmMarkPageUsed(off_t page) {
8901 off_t byte = page/8;
8902 int bit = page&7;
8903 redisAssert(vmFreePage(page) == 1);
8904 server.vm_bitmap[byte] |= 1<<bit;
8905 }
8906
8907 /* Mark N contiguous pages as used, with 'page' being the first. */
8908 static void vmMarkPagesUsed(off_t page, off_t count) {
8909 off_t j;
8910
8911 for (j = 0; j < count; j++)
8912 vmMarkPageUsed(page+j);
8913 server.vm_stats_used_pages += count;
8914 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8915 (long long)count, (long long)page);
8916 }
8917
8918 /* Mark the page as free */
8919 static void vmMarkPageFree(off_t page) {
8920 off_t byte = page/8;
8921 int bit = page&7;
8922 redisAssert(vmFreePage(page) == 0);
8923 server.vm_bitmap[byte] &= ~(1<<bit);
8924 }
8925
8926 /* Mark N contiguous pages as free, with 'page' being the first. */
8927 static void vmMarkPagesFree(off_t page, off_t count) {
8928 off_t j;
8929
8930 for (j = 0; j < count; j++)
8931 vmMarkPageFree(page+j);
8932 server.vm_stats_used_pages -= count;
8933 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8934 (long long)count, (long long)page);
8935 }
8936
8937 /* Test if the page is free */
8938 static int vmFreePage(off_t page) {
8939 off_t byte = page/8;
8940 int bit = page&7;
8941 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8942 }
8943
8944 /* Find N contiguous free pages storing the first page of the cluster in *first.
8945 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8946 * REDIS_ERR is returned.
8947 *
8948 * This function uses a simple algorithm: we try to allocate
8949 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8950 * again from the start of the swap file searching for free spaces.
8951 *
8952 * If it looks pretty clear that there are no free pages near our offset
8953 * we try to find less populated places doing a forward jump of
8954 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8955 * without hurry, and then we jump again and so forth...
8956 *
8957 * This function can be improved using a free list to avoid to guess
8958 * too much, since we could collect data about freed pages.
8959 *
8960 * note: I implemented this function just after watching an episode of
8961 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8962 */
8963 static int vmFindContiguousPages(off_t *first, off_t n) {
8964 off_t base, offset = 0, since_jump = 0, numfree = 0;
8965
8966 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8967 server.vm_near_pages = 0;
8968 server.vm_next_page = 0;
8969 }
8970 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8971 base = server.vm_next_page;
8972
8973 while(offset < server.vm_pages) {
8974 off_t this = base+offset;
8975
8976 /* If we overflow, restart from page zero */
8977 if (this >= server.vm_pages) {
8978 this -= server.vm_pages;
8979 if (this == 0) {
8980 /* Just overflowed, what we found on tail is no longer
8981 * interesting, as it's no longer contiguous. */
8982 numfree = 0;
8983 }
8984 }
8985 if (vmFreePage(this)) {
8986 /* This is a free page */
8987 numfree++;
8988 /* Already got N free pages? Return to the caller, with success */
8989 if (numfree == n) {
8990 *first = this-(n-1);
8991 server.vm_next_page = this+1;
8992 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8993 return REDIS_OK;
8994 }
8995 } else {
8996 /* The current one is not a free page */
8997 numfree = 0;
8998 }
8999
9000 /* Fast-forward if the current page is not free and we already
9001 * searched enough near this place. */
9002 since_jump++;
9003 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9004 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9005 since_jump = 0;
9006 /* Note that even if we rewind after the jump, we are don't need
9007 * to make sure numfree is set to zero as we only jump *if* it
9008 * is set to zero. */
9009 } else {
9010 /* Otherwise just check the next page */
9011 offset++;
9012 }
9013 }
9014 return REDIS_ERR;
9015 }
9016
9017 /* Write the specified object at the specified page of the swap file */
9018 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9019 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9020 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9021 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9022 redisLog(REDIS_WARNING,
9023 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9024 strerror(errno));
9025 return REDIS_ERR;
9026 }
9027 rdbSaveObject(server.vm_fp,o);
9028 fflush(server.vm_fp);
9029 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9030 return REDIS_OK;
9031 }
9032
9033 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9034 * needed to later retrieve the object into the key object.
9035 * If we can't find enough contiguous empty pages to swap the object on disk
9036 * REDIS_ERR is returned. */
9037 static int vmSwapObjectBlocking(robj *key, robj *val) {
9038 off_t pages = rdbSavedObjectPages(val,NULL);
9039 off_t page;
9040
9041 assert(key->storage == REDIS_VM_MEMORY);
9042 assert(key->refcount == 1);
9043 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9044 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9045 key->vm.page = page;
9046 key->vm.usedpages = pages;
9047 key->storage = REDIS_VM_SWAPPED;
9048 key->vtype = val->type;
9049 decrRefCount(val); /* Deallocate the object from memory. */
9050 vmMarkPagesUsed(page,pages);
9051 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9052 (unsigned char*) key->ptr,
9053 (unsigned long long) page, (unsigned long long) pages);
9054 server.vm_stats_swapped_objects++;
9055 server.vm_stats_swapouts++;
9056 return REDIS_OK;
9057 }
9058
9059 static robj *vmReadObjectFromSwap(off_t page, int type) {
9060 robj *o;
9061
9062 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9063 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9064 redisLog(REDIS_WARNING,
9065 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9066 strerror(errno));
9067 _exit(1);
9068 }
9069 o = rdbLoadObject(type,server.vm_fp);
9070 if (o == NULL) {
9071 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9072 _exit(1);
9073 }
9074 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9075 return o;
9076 }
9077
9078 /* Load the value object relative to the 'key' object from swap to memory.
9079 * The newly allocated object is returned.
9080 *
9081 * If preview is true the unserialized object is returned to the caller but
9082 * no changes are made to the key object, nor the pages are marked as freed */
9083 static robj *vmGenericLoadObject(robj *key, int preview) {
9084 robj *val;
9085
9086 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9087 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9088 if (!preview) {
9089 key->storage = REDIS_VM_MEMORY;
9090 key->vm.atime = server.unixtime;
9091 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9092 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9093 (unsigned char*) key->ptr);
9094 server.vm_stats_swapped_objects--;
9095 } else {
9096 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9097 (unsigned char*) key->ptr);
9098 }
9099 server.vm_stats_swapins++;
9100 return val;
9101 }
9102
9103 /* Plain object loading, from swap to memory */
9104 static robj *vmLoadObject(robj *key) {
9105 /* If we are loading the object in background, stop it, we
9106 * need to load this object synchronously ASAP. */
9107 if (key->storage == REDIS_VM_LOADING)
9108 vmCancelThreadedIOJob(key);
9109 return vmGenericLoadObject(key,0);
9110 }
9111
9112 /* Just load the value on disk, without to modify the key.
9113 * This is useful when we want to perform some operation on the value
9114 * without to really bring it from swap to memory, like while saving the
9115 * dataset or rewriting the append only log. */
9116 static robj *vmPreviewObject(robj *key) {
9117 return vmGenericLoadObject(key,1);
9118 }
9119
9120 /* How a good candidate is this object for swapping?
9121 * The better candidate it is, the greater the returned value.
9122 *
9123 * Currently we try to perform a fast estimation of the object size in
9124 * memory, and combine it with aging informations.
9125 *
9126 * Basically swappability = idle-time * log(estimated size)
9127 *
9128 * Bigger objects are preferred over smaller objects, but not
9129 * proportionally, this is why we use the logarithm. This algorithm is
9130 * just a first try and will probably be tuned later. */
9131 static double computeObjectSwappability(robj *o) {
9132 time_t age = server.unixtime - o->vm.atime;
9133 long asize = 0;
9134 list *l;
9135 dict *d;
9136 struct dictEntry *de;
9137 int z;
9138
9139 if (age <= 0) return 0;
9140 switch(o->type) {
9141 case REDIS_STRING:
9142 if (o->encoding != REDIS_ENCODING_RAW) {
9143 asize = sizeof(*o);
9144 } else {
9145 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9146 }
9147 break;
9148 case REDIS_LIST:
9149 l = o->ptr;
9150 listNode *ln = listFirst(l);
9151
9152 asize = sizeof(list);
9153 if (ln) {
9154 robj *ele = ln->value;
9155 long elesize;
9156
9157 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9158 (sizeof(*o)+sdslen(ele->ptr)) :
9159 sizeof(*o);
9160 asize += (sizeof(listNode)+elesize)*listLength(l);
9161 }
9162 break;
9163 case REDIS_SET:
9164 case REDIS_ZSET:
9165 z = (o->type == REDIS_ZSET);
9166 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9167
9168 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9169 if (z) asize += sizeof(zset)-sizeof(dict);
9170 if (dictSize(d)) {
9171 long elesize;
9172 robj *ele;
9173
9174 de = dictGetRandomKey(d);
9175 ele = dictGetEntryKey(de);
9176 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9177 (sizeof(*o)+sdslen(ele->ptr)) :
9178 sizeof(*o);
9179 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9180 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9181 }
9182 break;
9183 case REDIS_HASH:
9184 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9185 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9186 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9187 unsigned int klen, vlen;
9188 unsigned char *key, *val;
9189
9190 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9191 klen = 0;
9192 vlen = 0;
9193 }
9194 asize = len*(klen+vlen+3);
9195 } else if (o->encoding == REDIS_ENCODING_HT) {
9196 d = o->ptr;
9197 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9198 if (dictSize(d)) {
9199 long elesize;
9200 robj *ele;
9201
9202 de = dictGetRandomKey(d);
9203 ele = dictGetEntryKey(de);
9204 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9205 (sizeof(*o)+sdslen(ele->ptr)) :
9206 sizeof(*o);
9207 ele = dictGetEntryVal(de);
9208 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9209 (sizeof(*o)+sdslen(ele->ptr)) :
9210 sizeof(*o);
9211 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9212 }
9213 }
9214 break;
9215 }
9216 return (double)age*log(1+asize);
9217 }
9218
9219 /* Try to swap an object that's a good candidate for swapping.
9220 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9221 * to swap any object at all.
9222 *
9223 * If 'usethreaded' is true, Redis will try to swap the object in background
9224 * using I/O threads. */
9225 static int vmSwapOneObject(int usethreads) {
9226 int j, i;
9227 struct dictEntry *best = NULL;
9228 double best_swappability = 0;
9229 redisDb *best_db = NULL;
9230 robj *key, *val;
9231
9232 for (j = 0; j < server.dbnum; j++) {
9233 redisDb *db = server.db+j;
9234 /* Why maxtries is set to 100?
9235 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9236 * are swappable objects */
9237 int maxtries = 100;
9238
9239 if (dictSize(db->dict) == 0) continue;
9240 for (i = 0; i < 5; i++) {
9241 dictEntry *de;
9242 double swappability;
9243
9244 if (maxtries) maxtries--;
9245 de = dictGetRandomKey(db->dict);
9246 key = dictGetEntryKey(de);
9247 val = dictGetEntryVal(de);
9248 /* Only swap objects that are currently in memory.
9249 *
9250 * Also don't swap shared objects if threaded VM is on, as we
9251 * try to ensure that the main thread does not touch the
9252 * object while the I/O thread is using it, but we can't
9253 * control other keys without adding additional mutex. */
9254 if (key->storage != REDIS_VM_MEMORY ||
9255 (server.vm_max_threads != 0 && val->refcount != 1)) {
9256 if (maxtries) i--; /* don't count this try */
9257 continue;
9258 }
9259 swappability = computeObjectSwappability(val);
9260 if (!best || swappability > best_swappability) {
9261 best = de;
9262 best_swappability = swappability;
9263 best_db = db;
9264 }
9265 }
9266 }
9267 if (best == NULL) return REDIS_ERR;
9268 key = dictGetEntryKey(best);
9269 val = dictGetEntryVal(best);
9270
9271 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9272 key->ptr, best_swappability);
9273
9274 /* Unshare the key if needed */
9275 if (key->refcount > 1) {
9276 robj *newkey = dupStringObject(key);
9277 decrRefCount(key);
9278 key = dictGetEntryKey(best) = newkey;
9279 }
9280 /* Swap it */
9281 if (usethreads) {
9282 vmSwapObjectThreaded(key,val,best_db);
9283 return REDIS_OK;
9284 } else {
9285 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9286 dictGetEntryVal(best) = NULL;
9287 return REDIS_OK;
9288 } else {
9289 return REDIS_ERR;
9290 }
9291 }
9292 }
9293
/* Swap one object synchronously: vmSwapOneObject() without I/O threads. */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9297
/* Swap one object in background: vmSwapOneObject() using I/O threads. */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9301
9302 /* Return true if it's safe to swap out objects in a given moment.
9303 * Basically we don't want to swap objects out while there is a BGSAVE
9304 * or a BGAEOREWRITE running in backgroud. */
9305 static int vmCanSwapOut(void) {
9306 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9307 }
9308
9309 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9310 * and was deleted. Otherwise 0 is returned. */
9311 static int deleteIfSwapped(redisDb *db, robj *key) {
9312 dictEntry *de;
9313 robj *foundkey;
9314
9315 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9316 foundkey = dictGetEntryKey(de);
9317 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9318 deleteKey(db,key);
9319 return 1;
9320 }
9321
9322 /* =================== Virtual Memory - Threaded I/O ======================= */
9323
9324 static void freeIOJob(iojob *j) {
9325 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9326 j->type == REDIS_IOJOB_DO_SWAP ||
9327 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9328 decrRefCount(j->val);
9329 /* We don't decrRefCount the j->key field as we did't incremented
9330 * the count creating IO Jobs. This is because the key field here is
9331 * just used as an indentifier and if a key is removed the Job should
9332 * never be touched again. */
9333 zfree(j);
9334 }
9335
9336 /* Every time a thread finished a Job, it writes a byte into the write side
9337 * of an unix pipe in order to "awake" the main thread, and this function
9338 * is called. */
9339 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9340 int mask)
9341 {
9342 char buf[1];
9343 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9344 REDIS_NOTUSED(el);
9345 REDIS_NOTUSED(mask);
9346 REDIS_NOTUSED(privdata);
9347
9348 /* For every byte we read in the read side of the pipe, there is one
9349 * I/O job completed to process. */
9350 while((retval = read(fd,buf,1)) == 1) {
9351 iojob *j;
9352 listNode *ln;
9353 robj *key;
9354 struct dictEntry *de;
9355
9356 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9357
9358 /* Get the processed element (the oldest one) */
9359 lockThreadedIO();
9360 assert(listLength(server.io_processed) != 0);
9361 if (toprocess == -1) {
9362 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9363 if (toprocess <= 0) toprocess = 1;
9364 }
9365 ln = listFirst(server.io_processed);
9366 j = ln->value;
9367 listDelNode(server.io_processed,ln);
9368 unlockThreadedIO();
9369 /* If this job is marked as canceled, just ignore it */
9370 if (j->canceled) {
9371 freeIOJob(j);
9372 continue;
9373 }
9374 /* Post process it in the main thread, as there are things we
9375 * can do just here to avoid race conditions and/or invasive locks */
9376 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9377 de = dictFind(j->db->dict,j->key);
9378 assert(de != NULL);
9379 key = dictGetEntryKey(de);
9380 if (j->type == REDIS_IOJOB_LOAD) {
9381 redisDb *db;
9382
9383 /* Key loaded, bring it at home */
9384 key->storage = REDIS_VM_MEMORY;
9385 key->vm.atime = server.unixtime;
9386 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9387 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9388 (unsigned char*) key->ptr);
9389 server.vm_stats_swapped_objects--;
9390 server.vm_stats_swapins++;
9391 dictGetEntryVal(de) = j->val;
9392 incrRefCount(j->val);
9393 db = j->db;
9394 freeIOJob(j);
9395 /* Handle clients waiting for this key to be loaded. */
9396 handleClientsBlockedOnSwappedKey(db,key);
9397 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9398 /* Now we know the amount of pages required to swap this object.
9399 * Let's find some space for it, and queue this task again
9400 * rebranded as REDIS_IOJOB_DO_SWAP. */
9401 if (!vmCanSwapOut() ||
9402 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9403 {
9404 /* Ooops... no space or we can't swap as there is
9405 * a fork()ed Redis trying to save stuff on disk. */
9406 freeIOJob(j);
9407 key->storage = REDIS_VM_MEMORY; /* undo operation */
9408 } else {
9409 /* Note that we need to mark this pages as used now,
9410 * if the job will be canceled, we'll mark them as freed
9411 * again. */
9412 vmMarkPagesUsed(j->page,j->pages);
9413 j->type = REDIS_IOJOB_DO_SWAP;
9414 lockThreadedIO();
9415 queueIOJob(j);
9416 unlockThreadedIO();
9417 }
9418 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9419 robj *val;
9420
9421 /* Key swapped. We can finally free some memory. */
9422 if (key->storage != REDIS_VM_SWAPPING) {
9423 printf("key->storage: %d\n",key->storage);
9424 printf("key->name: %s\n",(char*)key->ptr);
9425 printf("key->refcount: %d\n",key->refcount);
9426 printf("val: %p\n",(void*)j->val);
9427 printf("val->type: %d\n",j->val->type);
9428 printf("val->ptr: %s\n",(char*)j->val->ptr);
9429 }
9430 redisAssert(key->storage == REDIS_VM_SWAPPING);
9431 val = dictGetEntryVal(de);
9432 key->vm.page = j->page;
9433 key->vm.usedpages = j->pages;
9434 key->storage = REDIS_VM_SWAPPED;
9435 key->vtype = j->val->type;
9436 decrRefCount(val); /* Deallocate the object from memory. */
9437 dictGetEntryVal(de) = NULL;
9438 redisLog(REDIS_DEBUG,
9439 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9440 (unsigned char*) key->ptr,
9441 (unsigned long long) j->page, (unsigned long long) j->pages);
9442 server.vm_stats_swapped_objects++;
9443 server.vm_stats_swapouts++;
9444 freeIOJob(j);
9445 /* Put a few more swap requests in queue if we are still
9446 * out of memory */
9447 if (trytoswap && vmCanSwapOut() &&
9448 zmalloc_used_memory() > server.vm_max_memory)
9449 {
9450 int more = 1;
9451 while(more) {
9452 lockThreadedIO();
9453 more = listLength(server.io_newjobs) <
9454 (unsigned) server.vm_max_threads;
9455 unlockThreadedIO();
9456 /* Don't waste CPU time if swappable objects are rare. */
9457 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9458 trytoswap = 0;
9459 break;
9460 }
9461 }
9462 }
9463 }
9464 processed++;
9465 if (processed == toprocess) return;
9466 }
9467 if (retval < 0 && errno != EAGAIN) {
9468 redisLog(REDIS_WARNING,
9469 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9470 strerror(errno));
9471 }
9472 }
9473
9474 static void lockThreadedIO(void) {
9475 pthread_mutex_lock(&server.io_mutex);
9476 }
9477
9478 static void unlockThreadedIO(void) {
9479 pthread_mutex_unlock(&server.io_mutex);
9480 }
9481
9482 /* Remove the specified object from the threaded I/O queue if still not
9483 * processed, otherwise make sure to flag it as canceled. */
9484 static void vmCancelThreadedIOJob(robj *o) {
9485 list *lists[3] = {
9486 server.io_newjobs, /* 0 */
9487 server.io_processing, /* 1 */
9488 server.io_processed /* 2 */
9489 };
9490 int i;
9491
9492 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9493 again:
9494 lockThreadedIO();
9495 /* Search for a matching key in one of the queues */
9496 for (i = 0; i < 3; i++) {
9497 listNode *ln;
9498 listIter li;
9499
9500 listRewind(lists[i],&li);
9501 while ((ln = listNext(&li)) != NULL) {
9502 iojob *job = ln->value;
9503
9504 if (job->canceled) continue; /* Skip this, already canceled. */
9505 if (job->key == o) {
9506 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9507 (void*)job, (char*)o->ptr, job->type, i);
9508 /* Mark the pages as free since the swap didn't happened
9509 * or happened but is now discarded. */
9510 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9511 vmMarkPagesFree(job->page,job->pages);
9512 /* Cancel the job. It depends on the list the job is
9513 * living in. */
9514 switch(i) {
9515 case 0: /* io_newjobs */
9516 /* If the job was yet not processed the best thing to do
9517 * is to remove it from the queue at all */
9518 freeIOJob(job);
9519 listDelNode(lists[i],ln);
9520 break;
9521 case 1: /* io_processing */
9522 /* Oh Shi- the thread is messing with the Job:
9523 *
9524 * Probably it's accessing the object if this is a
9525 * PREPARE_SWAP or DO_SWAP job.
9526 * If it's a LOAD job it may be reading from disk and
9527 * if we don't wait for the job to terminate before to
9528 * cancel it, maybe in a few microseconds data can be
9529 * corrupted in this pages. So the short story is:
9530 *
9531 * Better to wait for the job to move into the
9532 * next queue (processed)... */
9533
9534 /* We try again and again until the job is completed. */
9535 unlockThreadedIO();
9536 /* But let's wait some time for the I/O thread
9537 * to finish with this job. After all this condition
9538 * should be very rare. */
9539 usleep(1);
9540 goto again;
9541 case 2: /* io_processed */
9542 /* The job was already processed, that's easy...
9543 * just mark it as canceled so that we'll ignore it
9544 * when processing completed jobs. */
9545 job->canceled = 1;
9546 break;
9547 }
9548 /* Finally we have to adjust the storage type of the object
9549 * in order to "UNDO" the operaiton. */
9550 if (o->storage == REDIS_VM_LOADING)
9551 o->storage = REDIS_VM_SWAPPED;
9552 else if (o->storage == REDIS_VM_SWAPPING)
9553 o->storage = REDIS_VM_MEMORY;
9554 unlockThreadedIO();
9555 return;
9556 }
9557 }
9558 }
9559 unlockThreadedIO();
9560 assert(1 != 1); /* We should never reach this */
9561 }
9562
9563 static void *IOThreadEntryPoint(void *arg) {
9564 iojob *j;
9565 listNode *ln;
9566 REDIS_NOTUSED(arg);
9567
9568 pthread_detach(pthread_self());
9569 while(1) {
9570 /* Get a new job to process */
9571 lockThreadedIO();
9572 if (listLength(server.io_newjobs) == 0) {
9573 /* No new jobs in queue, exit. */
9574 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9575 (long) pthread_self());
9576 server.io_active_threads--;
9577 unlockThreadedIO();
9578 return NULL;
9579 }
9580 ln = listFirst(server.io_newjobs);
9581 j = ln->value;
9582 listDelNode(server.io_newjobs,ln);
9583 /* Add the job in the processing queue */
9584 j->thread = pthread_self();
9585 listAddNodeTail(server.io_processing,j);
9586 ln = listLast(server.io_processing); /* We use ln later to remove it */
9587 unlockThreadedIO();
9588 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9589 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9590
9591 /* Process the Job */
9592 if (j->type == REDIS_IOJOB_LOAD) {
9593 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9594 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9595 FILE *fp = fopen("/dev/null","w+");
9596 j->pages = rdbSavedObjectPages(j->val,fp);
9597 fclose(fp);
9598 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9599 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9600 j->canceled = 1;
9601 }
9602
9603 /* Done: insert the job into the processed queue */
9604 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9605 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9606 lockThreadedIO();
9607 listDelNode(server.io_processing,ln);
9608 listAddNodeTail(server.io_processed,j);
9609 unlockThreadedIO();
9610
9611 /* Signal the main thread there is new stuff to process */
9612 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9613 }
9614 return NULL; /* never reached */
9615 }
9616
9617 static void spawnIOThread(void) {
9618 pthread_t thread;
9619 sigset_t mask, omask;
9620 int err;
9621
9622 sigemptyset(&mask);
9623 sigaddset(&mask,SIGCHLD);
9624 sigaddset(&mask,SIGHUP);
9625 sigaddset(&mask,SIGPIPE);
9626 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9627 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9628 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9629 strerror(err));
9630 usleep(1000000);
9631 }
9632 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9633 server.io_active_threads++;
9634 }
9635
9636 /* We need to wait for the last thread to exit before we are able to
9637 * fork() in order to BGSAVE or BGREWRITEAOF. */
9638 static void waitEmptyIOJobsQueue(void) {
9639 while(1) {
9640 int io_processed_len;
9641
9642 lockThreadedIO();
9643 if (listLength(server.io_newjobs) == 0 &&
9644 listLength(server.io_processing) == 0 &&
9645 server.io_active_threads == 0)
9646 {
9647 unlockThreadedIO();
9648 return;
9649 }
9650 /* While waiting for empty jobs queue condition we post-process some
9651 * finshed job, as I/O threads may be hanging trying to write against
9652 * the io_ready_pipe_write FD but there are so much pending jobs that
9653 * it's blocking. */
9654 io_processed_len = listLength(server.io_processed);
9655 unlockThreadedIO();
9656 if (io_processed_len) {
9657 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9658 usleep(1000); /* 1 millisecond */
9659 } else {
9660 usleep(10000); /* 10 milliseconds */
9661 }
9662 }
9663 }
9664
9665 static void vmReopenSwapFile(void) {
9666 /* Note: we don't close the old one as we are in the child process
9667 * and don't want to mess at all with the original file object. */
9668 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9669 if (server.vm_fp == NULL) {
9670 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9671 server.vm_swap_file);
9672 _exit(1);
9673 }
9674 server.vm_fd = fileno(server.vm_fp);
9675 }
9676
9677 /* This function must be called while with threaded IO locked */
9678 static void queueIOJob(iojob *j) {
9679 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9680 (void*)j, j->type, (char*)j->key->ptr);
9681 listAddNodeTail(server.io_newjobs,j);
9682 if (server.io_active_threads < server.vm_max_threads)
9683 spawnIOThread();
9684 }
9685
9686 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9687 iojob *j;
9688
9689 assert(key->storage == REDIS_VM_MEMORY);
9690 assert(key->refcount == 1);
9691
9692 j = zmalloc(sizeof(*j));
9693 j->type = REDIS_IOJOB_PREPARE_SWAP;
9694 j->db = db;
9695 j->key = key;
9696 j->val = val;
9697 incrRefCount(val);
9698 j->canceled = 0;
9699 j->thread = (pthread_t) -1;
9700 key->storage = REDIS_VM_SWAPPING;
9701
9702 lockThreadedIO();
9703 queueIOJob(j);
9704 unlockThreadedIO();
9705 return REDIS_OK;
9706 }
9707
9708 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9709
9710 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9711 * If there is not already a job loading the key, it is craeted.
9712 * The key is added to the io_keys list in the client structure, and also
9713 * in the hash table mapping swapped keys to waiting clients, that is,
9714 * server.io_waited_keys. */
9715 static int waitForSwappedKey(redisClient *c, robj *key) {
9716 struct dictEntry *de;
9717 robj *o;
9718 list *l;
9719
9720 /* If the key does not exist or is already in RAM we don't need to
9721 * block the client at all. */
9722 de = dictFind(c->db->dict,key);
9723 if (de == NULL) return 0;
9724 o = dictGetEntryKey(de);
9725 if (o->storage == REDIS_VM_MEMORY) {
9726 return 0;
9727 } else if (o->storage == REDIS_VM_SWAPPING) {
9728 /* We were swapping the key, undo it! */
9729 vmCancelThreadedIOJob(o);
9730 return 0;
9731 }
9732
9733 /* OK: the key is either swapped, or being loaded just now. */
9734
9735 /* Add the key to the list of keys this client is waiting for.
9736 * This maps clients to keys they are waiting for. */
9737 listAddNodeTail(c->io_keys,key);
9738 incrRefCount(key);
9739
9740 /* Add the client to the swapped keys => clients waiting map. */
9741 de = dictFind(c->db->io_keys,key);
9742 if (de == NULL) {
9743 int retval;
9744
9745 /* For every key we take a list of clients blocked for it */
9746 l = listCreate();
9747 retval = dictAdd(c->db->io_keys,key,l);
9748 incrRefCount(key);
9749 assert(retval == DICT_OK);
9750 } else {
9751 l = dictGetEntryVal(de);
9752 }
9753 listAddNodeTail(l,c);
9754
9755 /* Are we already loading the key from disk? If not create a job */
9756 if (o->storage == REDIS_VM_SWAPPED) {
9757 iojob *j;
9758
9759 o->storage = REDIS_VM_LOADING;
9760 j = zmalloc(sizeof(*j));
9761 j->type = REDIS_IOJOB_LOAD;
9762 j->db = c->db;
9763 j->key = o;
9764 j->key->vtype = o->vtype;
9765 j->page = o->vm.page;
9766 j->val = NULL;
9767 j->canceled = 0;
9768 j->thread = (pthread_t) -1;
9769 lockThreadedIO();
9770 queueIOJob(j);
9771 unlockThreadedIO();
9772 }
9773 return 1;
9774 }
9775
9776 /* Preload keys for any command with first, last and step values for
9777 * the command keys prototype, as defined in the command table. */
9778 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9779 int j, last;
9780 if (cmd->vm_firstkey == 0) return;
9781 last = cmd->vm_lastkey;
9782 if (last < 0) last = argc+last;
9783 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9784 redisAssert(j < argc);
9785 waitForSwappedKey(c,argv[j]);
9786 }
9787 }
9788
9789 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9790 * Note that the number of keys to preload is user-defined, so we need to
9791 * apply a sanity check against argc. */
9792 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9793 int i, num;
9794 REDIS_NOTUSED(cmd);
9795
9796 num = atoi(argv[2]->ptr);
9797 if (num > (argc-3)) return;
9798 for (i = 0; i < num; i++) {
9799 waitForSwappedKey(c,argv[3+i]);
9800 }
9801 }
9802
9803 /* Preload keys needed to execute the entire MULTI/EXEC block.
9804 *
9805 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9806 * and will block the client when any command requires a swapped out value. */
9807 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9808 int i, margc;
9809 struct redisCommand *mcmd;
9810 robj **margv;
9811 REDIS_NOTUSED(cmd);
9812 REDIS_NOTUSED(argc);
9813 REDIS_NOTUSED(argv);
9814
9815 if (!(c->flags & REDIS_MULTI)) return;
9816 for (i = 0; i < c->mstate.count; i++) {
9817 mcmd = c->mstate.commands[i].cmd;
9818 margc = c->mstate.commands[i].argc;
9819 margv = c->mstate.commands[i].argv;
9820
9821 if (mcmd->vm_preload_proc != NULL) {
9822 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9823 } else {
9824 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9825 }
9826 }
9827 }
9828
9829 /* Is this client attempting to run a command against swapped keys?
9830 * If so, block it ASAP, load the keys in background, then resume it.
9831 *
9832 * The important idea about this function is that it can fail! If keys will
9833 * still be swapped when the client is resumed, this key lookups will
9834 * just block loading keys from disk. In practical terms this should only
9835 * happen with SORT BY command or if there is a bug in this function.
9836 *
9837 * Return 1 if the client is marked as blocked, 0 if the client can
9838 * continue as the keys it is going to access appear to be in memory. */
9839 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9840 if (cmd->vm_preload_proc != NULL) {
9841 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9842 } else {
9843 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9844 }
9845
9846 /* If the client was blocked for at least one key, mark it as blocked. */
9847 if (listLength(c->io_keys)) {
9848 c->flags |= REDIS_IO_WAIT;
9849 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9850 server.vm_blocked_clients++;
9851 return 1;
9852 } else {
9853 return 0;
9854 }
9855 }
9856
9857 /* Remove the 'key' from the list of blocked keys for a given client.
9858 *
9859 * The function returns 1 when there are no longer blocking keys after
9860 * the current one was removed (and the client can be unblocked). */
9861 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9862 list *l;
9863 listNode *ln;
9864 listIter li;
9865 struct dictEntry *de;
9866
9867 /* Remove the key from the list of keys this client is waiting for. */
9868 listRewind(c->io_keys,&li);
9869 while ((ln = listNext(&li)) != NULL) {
9870 if (equalStringObjects(ln->value,key)) {
9871 listDelNode(c->io_keys,ln);
9872 break;
9873 }
9874 }
9875 assert(ln != NULL);
9876
9877 /* Remove the client form the key => waiting clients map. */
9878 de = dictFind(c->db->io_keys,key);
9879 assert(de != NULL);
9880 l = dictGetEntryVal(de);
9881 ln = listSearchKey(l,c);
9882 assert(ln != NULL);
9883 listDelNode(l,ln);
9884 if (listLength(l) == 0)
9885 dictDelete(c->db->io_keys,key);
9886
9887 return listLength(c->io_keys) == 0;
9888 }
9889
9890 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9891 struct dictEntry *de;
9892 list *l;
9893 listNode *ln;
9894 int len;
9895
9896 de = dictFind(db->io_keys,key);
9897 if (!de) return;
9898
9899 l = dictGetEntryVal(de);
9900 len = listLength(l);
9901 /* Note: we can't use something like while(listLength(l)) as the list
9902 * can be freed by the calling function when we remove the last element. */
9903 while (len--) {
9904 ln = listFirst(l);
9905 redisClient *c = ln->value;
9906
9907 if (dontWaitForSwappedKey(c,key)) {
9908 /* Put the client in the list of clients ready to go as we
9909 * loaded all the keys about it. */
9910 listAddNodeTail(server.io_ready_clients,c);
9911 }
9912 }
9913 }
9914
9915 /* =========================== Remote Configuration ========================= */
9916
9917 static void configSetCommand(redisClient *c) {
9918 robj *o = getDecodedObject(c->argv[3]);
9919 long long ll;
9920
9921 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9922 zfree(server.dbfilename);
9923 server.dbfilename = zstrdup(o->ptr);
9924 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9925 zfree(server.requirepass);
9926 server.requirepass = zstrdup(o->ptr);
9927 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9928 zfree(server.masterauth);
9929 server.masterauth = zstrdup(o->ptr);
9930 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9931 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9932 ll < 0) goto badfmt;
9933 server.maxmemory = ll;
9934 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9935 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9936 ll < 0 || ll > LONG_MAX) goto badfmt;
9937 server.maxidletime = ll;
9938 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9939 if (!strcasecmp(o->ptr,"no")) {
9940 server.appendfsync = APPENDFSYNC_NO;
9941 } else if (!strcasecmp(o->ptr,"everysec")) {
9942 server.appendfsync = APPENDFSYNC_EVERYSEC;
9943 } else if (!strcasecmp(o->ptr,"always")) {
9944 server.appendfsync = APPENDFSYNC_ALWAYS;
9945 } else {
9946 goto badfmt;
9947 }
9948 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9949 int old = server.appendonly;
9950 int new = yesnotoi(o->ptr);
9951
9952 if (new == -1) goto badfmt;
9953 if (old != new) {
9954 if (new == 0) {
9955 stopAppendOnly();
9956 } else {
9957 if (startAppendOnly() == REDIS_ERR) {
9958 addReplySds(c,sdscatprintf(sdsempty(),
9959 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9960 decrRefCount(o);
9961 return;
9962 }
9963 }
9964 }
9965 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9966 int vlen, j;
9967 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9968
9969 /* Perform sanity check before setting the new config:
9970 * - Even number of args
9971 * - Seconds >= 1, changes >= 0 */
9972 if (vlen & 1) {
9973 sdsfreesplitres(v,vlen);
9974 goto badfmt;
9975 }
9976 for (j = 0; j < vlen; j++) {
9977 char *eptr;
9978 long val;
9979
9980 val = strtoll(v[j], &eptr, 10);
9981 if (eptr[0] != '\0' ||
9982 ((j & 1) == 0 && val < 1) ||
9983 ((j & 1) == 1 && val < 0)) {
9984 sdsfreesplitres(v,vlen);
9985 goto badfmt;
9986 }
9987 }
9988 /* Finally set the new config */
9989 resetServerSaveParams();
9990 for (j = 0; j < vlen; j += 2) {
9991 time_t seconds;
9992 int changes;
9993
9994 seconds = strtoll(v[j],NULL,10);
9995 changes = strtoll(v[j+1],NULL,10);
9996 appendServerSaveParams(seconds, changes);
9997 }
9998 sdsfreesplitres(v,vlen);
9999 } else {
10000 addReplySds(c,sdscatprintf(sdsempty(),
10001 "-ERR not supported CONFIG parameter %s\r\n",
10002 (char*)c->argv[2]->ptr));
10003 decrRefCount(o);
10004 return;
10005 }
10006 decrRefCount(o);
10007 addReply(c,shared.ok);
10008 return;
10009
10010 badfmt: /* Bad format errors */
10011 addReplySds(c,sdscatprintf(sdsempty(),
10012 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10013 (char*)o->ptr,
10014 (char*)c->argv[2]->ptr));
10015 decrRefCount(o);
10016 }
10017
10018 static void configGetCommand(redisClient *c) {
10019 robj *o = getDecodedObject(c->argv[2]);
10020 robj *lenobj = createObject(REDIS_STRING,NULL);
10021 char *pattern = o->ptr;
10022 int matches = 0;
10023
10024 addReply(c,lenobj);
10025 decrRefCount(lenobj);
10026
10027 if (stringmatch(pattern,"dbfilename",0)) {
10028 addReplyBulkCString(c,"dbfilename");
10029 addReplyBulkCString(c,server.dbfilename);
10030 matches++;
10031 }
10032 if (stringmatch(pattern,"requirepass",0)) {
10033 addReplyBulkCString(c,"requirepass");
10034 addReplyBulkCString(c,server.requirepass);
10035 matches++;
10036 }
10037 if (stringmatch(pattern,"masterauth",0)) {
10038 addReplyBulkCString(c,"masterauth");
10039 addReplyBulkCString(c,server.masterauth);
10040 matches++;
10041 }
10042 if (stringmatch(pattern,"maxmemory",0)) {
10043 char buf[128];
10044
10045 ll2string(buf,128,server.maxmemory);
10046 addReplyBulkCString(c,"maxmemory");
10047 addReplyBulkCString(c,buf);
10048 matches++;
10049 }
10050 if (stringmatch(pattern,"timeout",0)) {
10051 char buf[128];
10052
10053 ll2string(buf,128,server.maxidletime);
10054 addReplyBulkCString(c,"timeout");
10055 addReplyBulkCString(c,buf);
10056 matches++;
10057 }
10058 if (stringmatch(pattern,"appendonly",0)) {
10059 addReplyBulkCString(c,"appendonly");
10060 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10061 matches++;
10062 }
10063 if (stringmatch(pattern,"appendfsync",0)) {
10064 char *policy;
10065
10066 switch(server.appendfsync) {
10067 case APPENDFSYNC_NO: policy = "no"; break;
10068 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10069 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10070 default: policy = "unknown"; break; /* too harmless to panic */
10071 }
10072 addReplyBulkCString(c,"appendfsync");
10073 addReplyBulkCString(c,policy);
10074 matches++;
10075 }
10076 if (stringmatch(pattern,"save",0)) {
10077 sds buf = sdsempty();
10078 int j;
10079
10080 for (j = 0; j < server.saveparamslen; j++) {
10081 buf = sdscatprintf(buf,"%ld %d",
10082 server.saveparams[j].seconds,
10083 server.saveparams[j].changes);
10084 if (j != server.saveparamslen-1)
10085 buf = sdscatlen(buf," ",1);
10086 }
10087 addReplyBulkCString(c,"save");
10088 addReplyBulkCString(c,buf);
10089 sdsfree(buf);
10090 matches++;
10091 }
10092 decrRefCount(o);
10093 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10094 }
10095
10096 static void configCommand(redisClient *c) {
10097 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10098 if (c->argc != 4) goto badarity;
10099 configSetCommand(c);
10100 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10101 if (c->argc != 3) goto badarity;
10102 configGetCommand(c);
10103 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10104 if (c->argc != 2) goto badarity;
10105 server.stat_numcommands = 0;
10106 server.stat_numconnections = 0;
10107 server.stat_expiredkeys = 0;
10108 server.stat_starttime = time(NULL);
10109 addReply(c,shared.ok);
10110 } else {
10111 addReplySds(c,sdscatprintf(sdsempty(),
10112 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10113 }
10114 return;
10115
10116 badarity:
10117 addReplySds(c,sdscatprintf(sdsempty(),
10118 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10119 (char*) c->argv[1]->ptr));
10120 }
10121
10122 /* =========================== Pubsub implementation ======================== */
10123
10124 static void freePubsubPattern(void *p) {
10125 pubsubPattern *pat = p;
10126
10127 decrRefCount(pat->pattern);
10128 zfree(pat);
10129 }
10130
10131 static int listMatchPubsubPattern(void *a, void *b) {
10132 pubsubPattern *pa = a, *pb = b;
10133
10134 return (pa->client == pb->client) &&
10135 (equalStringObjects(pa->pattern,pb->pattern));
10136 }
10137
10138 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10139 * 0 if the client was already subscribed to that channel. */
10140 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10141 struct dictEntry *de;
10142 list *clients = NULL;
10143 int retval = 0;
10144
10145 /* Add the channel to the client -> channels hash table */
10146 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10147 retval = 1;
10148 incrRefCount(channel);
10149 /* Add the client to the channel -> list of clients hash table */
10150 de = dictFind(server.pubsub_channels,channel);
10151 if (de == NULL) {
10152 clients = listCreate();
10153 dictAdd(server.pubsub_channels,channel,clients);
10154 incrRefCount(channel);
10155 } else {
10156 clients = dictGetEntryVal(de);
10157 }
10158 listAddNodeTail(clients,c);
10159 }
10160 /* Notify the client */
10161 addReply(c,shared.mbulk3);
10162 addReply(c,shared.subscribebulk);
10163 addReplyBulk(c,channel);
10164 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10165 return retval;
10166 }
10167
10168 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10169 * 0 if the client was not subscribed to the specified channel. */
10170 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10171 struct dictEntry *de;
10172 list *clients;
10173 listNode *ln;
10174 int retval = 0;
10175
10176 /* Remove the channel from the client -> channels hash table */
10177 incrRefCount(channel); /* channel may be just a pointer to the same object
10178 we have in the hash tables. Protect it... */
10179 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10180 retval = 1;
10181 /* Remove the client from the channel -> clients list hash table */
10182 de = dictFind(server.pubsub_channels,channel);
10183 assert(de != NULL);
10184 clients = dictGetEntryVal(de);
10185 ln = listSearchKey(clients,c);
10186 assert(ln != NULL);
10187 listDelNode(clients,ln);
10188 if (listLength(clients) == 0) {
10189 /* Free the list and associated hash entry at all if this was
10190 * the latest client, so that it will be possible to abuse
10191 * Redis PUBSUB creating millions of channels. */
10192 dictDelete(server.pubsub_channels,channel);
10193 }
10194 }
10195 /* Notify the client */
10196 if (notify) {
10197 addReply(c,shared.mbulk3);
10198 addReply(c,shared.unsubscribebulk);
10199 addReplyBulk(c,channel);
10200 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10201 listLength(c->pubsub_patterns));
10202
10203 }
10204 decrRefCount(channel); /* it is finally safe to release it */
10205 return retval;
10206 }
10207
10208 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10209 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10210 int retval = 0;
10211
10212 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10213 retval = 1;
10214 pubsubPattern *pat;
10215 listAddNodeTail(c->pubsub_patterns,pattern);
10216 incrRefCount(pattern);
10217 pat = zmalloc(sizeof(*pat));
10218 pat->pattern = getDecodedObject(pattern);
10219 pat->client = c;
10220 listAddNodeTail(server.pubsub_patterns,pat);
10221 }
10222 /* Notify the client */
10223 addReply(c,shared.mbulk3);
10224 addReply(c,shared.psubscribebulk);
10225 addReplyBulk(c,pattern);
10226 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10227 return retval;
10228 }
10229
10230 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10231 * 0 if the client was not subscribed to the specified channel. */
10232 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10233 listNode *ln;
10234 pubsubPattern pat;
10235 int retval = 0;
10236
10237 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10238 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10239 retval = 1;
10240 listDelNode(c->pubsub_patterns,ln);
10241 pat.client = c;
10242 pat.pattern = pattern;
10243 ln = listSearchKey(server.pubsub_patterns,&pat);
10244 listDelNode(server.pubsub_patterns,ln);
10245 }
10246 /* Notify the client */
10247 if (notify) {
10248 addReply(c,shared.mbulk3);
10249 addReply(c,shared.punsubscribebulk);
10250 addReplyBulk(c,pattern);
10251 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10252 listLength(c->pubsub_patterns));
10253 }
10254 decrRefCount(pattern);
10255 return retval;
10256 }
10257
10258 /* Unsubscribe from all the channels. Return the number of channels the
10259 * client was subscribed from. */
10260 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10261 dictIterator *di = dictGetIterator(c->pubsub_channels);
10262 dictEntry *de;
10263 int count = 0;
10264
10265 while((de = dictNext(di)) != NULL) {
10266 robj *channel = dictGetEntryKey(de);
10267
10268 count += pubsubUnsubscribeChannel(c,channel,notify);
10269 }
10270 dictReleaseIterator(di);
10271 return count;
10272 }
10273
10274 /* Unsubscribe from all the patterns. Return the number of patterns the
10275 * client was subscribed from. */
10276 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10277 listNode *ln;
10278 listIter li;
10279 int count = 0;
10280
10281 listRewind(c->pubsub_patterns,&li);
10282 while ((ln = listNext(&li)) != NULL) {
10283 robj *pattern = ln->value;
10284
10285 count += pubsubUnsubscribePattern(c,pattern,notify);
10286 }
10287 return count;
10288 }
10289
10290 /* Publish a message */
10291 static int pubsubPublishMessage(robj *channel, robj *message) {
10292 int receivers = 0;
10293 struct dictEntry *de;
10294 listNode *ln;
10295 listIter li;
10296
10297 /* Send to clients listening for that channel */
10298 de = dictFind(server.pubsub_channels,channel);
10299 if (de) {
10300 list *list = dictGetEntryVal(de);
10301 listNode *ln;
10302 listIter li;
10303
10304 listRewind(list,&li);
10305 while ((ln = listNext(&li)) != NULL) {
10306 redisClient *c = ln->value;
10307
10308 addReply(c,shared.mbulk3);
10309 addReply(c,shared.messagebulk);
10310 addReplyBulk(c,channel);
10311 addReplyBulk(c,message);
10312 receivers++;
10313 }
10314 }
10315 /* Send to clients listening to matching channels */
10316 if (listLength(server.pubsub_patterns)) {
10317 listRewind(server.pubsub_patterns,&li);
10318 channel = getDecodedObject(channel);
10319 while ((ln = listNext(&li)) != NULL) {
10320 pubsubPattern *pat = ln->value;
10321
10322 if (stringmatchlen((char*)pat->pattern->ptr,
10323 sdslen(pat->pattern->ptr),
10324 (char*)channel->ptr,
10325 sdslen(channel->ptr),0)) {
10326 addReply(pat->client,shared.mbulk4);
10327 addReply(pat->client,shared.pmessagebulk);
10328 addReplyBulk(pat->client,pat->pattern);
10329 addReplyBulk(pat->client,channel);
10330 addReplyBulk(pat->client,message);
10331 receivers++;
10332 }
10333 }
10334 decrRefCount(channel);
10335 }
10336 return receivers;
10337 }
10338
10339 static void subscribeCommand(redisClient *c) {
10340 int j;
10341
10342 for (j = 1; j < c->argc; j++)
10343 pubsubSubscribeChannel(c,c->argv[j]);
10344 }
10345
10346 static void unsubscribeCommand(redisClient *c) {
10347 if (c->argc == 1) {
10348 pubsubUnsubscribeAllChannels(c,1);
10349 return;
10350 } else {
10351 int j;
10352
10353 for (j = 1; j < c->argc; j++)
10354 pubsubUnsubscribeChannel(c,c->argv[j],1);
10355 }
10356 }
10357
10358 static void psubscribeCommand(redisClient *c) {
10359 int j;
10360
10361 for (j = 1; j < c->argc; j++)
10362 pubsubSubscribePattern(c,c->argv[j]);
10363 }
10364
10365 static void punsubscribeCommand(redisClient *c) {
10366 if (c->argc == 1) {
10367 pubsubUnsubscribeAllPatterns(c,1);
10368 return;
10369 } else {
10370 int j;
10371
10372 for (j = 1; j < c->argc; j++)
10373 pubsubUnsubscribePattern(c,c->argv[j],1);
10374 }
10375 }
10376
10377 static void publishCommand(redisClient *c) {
10378 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10379 addReplyLongLong(c,receivers);
10380 }
10381
10382 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10383 *
10384 * The implementation uses a per-DB hash table mapping keys to list of clients
10385 * WATCHing those keys, so that given a key that is going to be modified
10386 * we can mark all the associated clients as dirty.
10387 *
10388 * Also every client contains a list of WATCHed keys so that's possible to
10389 * un-watch such keys when the client is freed or when UNWATCH is called. */
10390
10391 /* In the client->watched_keys list we need to use watchedKey structures
10392 * as in order to identify a key in Redis we need both the key name and the
10393 * DB */
10394 typedef struct watchedKey {
10395 robj *key;
10396 redisDb *db;
10397 } watchedKey;
10398
10399 /* Watch for the specified key */
10400 static void watchForKey(redisClient *c, robj *key) {
10401 list *clients = NULL;
10402 listIter li;
10403 listNode *ln;
10404 watchedKey *wk;
10405
10406 /* Check if we are already watching for this key */
10407 listRewind(c->watched_keys,&li);
10408 while((ln = listNext(&li))) {
10409 wk = listNodeValue(ln);
10410 if (wk->db == c->db && equalStringObjects(key,wk->key))
10411 return; /* Key already watched */
10412 }
10413 /* This key is not already watched in this DB. Let's add it */
10414 clients = dictFetchValue(c->db->watched_keys,key);
10415 if (!clients) {
10416 clients = listCreate();
10417 dictAdd(c->db->watched_keys,key,clients);
10418 incrRefCount(key);
10419 }
10420 listAddNodeTail(clients,c);
10421 /* Add the new key to the lits of keys watched by this client */
10422 wk = zmalloc(sizeof(*wk));
10423 wk->key = key;
10424 wk->db = c->db;
10425 incrRefCount(key);
10426 listAddNodeTail(c->watched_keys,wk);
10427 }
10428
10429 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10430 * flag is up to the caller. */
10431 static void unwatchAllKeys(redisClient *c) {
10432 listIter li;
10433 listNode *ln;
10434
10435 if (listLength(c->watched_keys) == 0) return;
10436 listRewind(c->watched_keys,&li);
10437 while((ln = listNext(&li))) {
10438 list *clients;
10439 watchedKey *wk;
10440
10441 /* Lookup the watched key -> clients list and remove the client
10442 * from the list */
10443 wk = listNodeValue(ln);
10444 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10445 assert(clients != NULL);
10446 listDelNode(clients,listSearchKey(clients,c));
10447 /* Kill the entry at all if this was the only client */
10448 if (listLength(clients) == 0)
10449 dictDelete(wk->db->watched_keys, wk->key);
10450 /* Remove this watched key from the client->watched list */
10451 listDelNode(c->watched_keys,ln);
10452 decrRefCount(wk->key);
10453 zfree(wk);
10454 }
10455 }
10456
10457 /* "Touch" a key, so that if this key is being WATCHed by soem client the
10458 * next EXEC will fail. */
10459 static void touchWatchedKey(redisDb *db, robj *key) {
10460 list *clients;
10461 listIter li;
10462 listNode *ln;
10463
10464 if (dictSize(db->watched_keys) == 0) return;
10465 clients = dictFetchValue(db->watched_keys, key);
10466 if (!clients) return;
10467
10468 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10469 /* Check if we are already watching for this key */
10470 listRewind(clients,&li);
10471 while((ln = listNext(&li))) {
10472 redisClient *c = listNodeValue(ln);
10473
10474 c->flags |= REDIS_DIRTY_CAS;
10475 }
10476 }
10477
10478 static void watchCommand(redisClient *c) {
10479 int j;
10480
10481 for (j = 1; j < c->argc; j++)
10482 watchForKey(c,c->argv[j]);
10483 addReply(c,shared.ok);
10484 }
10485
10486 static void unwatchCommand(redisClient *c) {
10487 unwatchAllKeys(c);
10488 c->flags &= (~REDIS_DIRTY_CAS);
10489 addReply(c,shared.ok);
10490 }
10491
10492 /* ================================= Debugging ============================== */
10493
10494 /* Compute the sha1 of string at 's' with 'len' bytes long.
10495 * The SHA1 is then xored againt the string pointed by digest.
10496 * Since xor is commutative, this operation is used in order to
10497 * "add" digests relative to unordered elements.
10498 *
10499 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10500 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10501 SHA1_CTX ctx;
10502 unsigned char hash[20], *s = ptr;
10503 int j;
10504
10505 SHA1Init(&ctx);
10506 SHA1Update(&ctx,s,len);
10507 SHA1Final(hash,&ctx);
10508
10509 for (j = 0; j < 20; j++)
10510 digest[j] ^= hash[j];
10511 }
10512
10513 static void xorObjectDigest(unsigned char *digest, robj *o) {
10514 o = getDecodedObject(o);
10515 xorDigest(digest,o->ptr,sdslen(o->ptr));
10516 decrRefCount(o);
10517 }
10518
10519 /* This function instead of just computing the SHA1 and xoring it
10520 * against diget, also perform the digest of "digest" itself and
10521 * replace the old value with the new one.
10522 *
10523 * So the final digest will be:
10524 *
10525 * digest = SHA1(digest xor SHA1(data))
10526 *
10527 * This function is used every time we want to preserve the order so
10528 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10529 *
10530 * Also note that mixdigest("foo") followed by mixdigest("bar")
10531 * will lead to a different digest compared to "fo", "obar".
10532 */
10533 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10534 SHA1_CTX ctx;
10535 char *s = ptr;
10536
10537 xorDigest(digest,s,len);
10538 SHA1Init(&ctx);
10539 SHA1Update(&ctx,digest,20);
10540 SHA1Final(digest,&ctx);
10541 }
10542
10543 static void mixObjectDigest(unsigned char *digest, robj *o) {
10544 o = getDecodedObject(o);
10545 mixDigest(digest,o->ptr,sdslen(o->ptr));
10546 decrRefCount(o);
10547 }
10548
10549 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10550 * are not ordered, we use a trick: every aggregate digest is the xor
10551 * of the digests of their elements. This way the order will not change
10552 * the result. For list instead we use a feedback entering the output digest
10553 * as input in order to ensure that a different ordered list will result in
10554 * a different digest. */
10555 static void computeDatasetDigest(unsigned char *final) {
10556 unsigned char digest[20];
10557 char buf[128];
10558 dictIterator *di = NULL;
10559 dictEntry *de;
10560 int j;
10561 uint32_t aux;
10562
10563 memset(final,0,20); /* Start with a clean result */
10564
10565 for (j = 0; j < server.dbnum; j++) {
10566 redisDb *db = server.db+j;
10567
10568 if (dictSize(db->dict) == 0) continue;
10569 di = dictGetIterator(db->dict);
10570
10571 /* hash the DB id, so the same dataset moved in a different
10572 * DB will lead to a different digest */
10573 aux = htonl(j);
10574 mixDigest(final,&aux,sizeof(aux));
10575
10576 /* Iterate this DB writing every entry */
10577 while((de = dictNext(di)) != NULL) {
10578 robj *key, *o, *kcopy;
10579 time_t expiretime;
10580
10581 memset(digest,0,20); /* This key-val digest */
10582 key = dictGetEntryKey(de);
10583
10584 if (!server.vm_enabled) {
10585 mixObjectDigest(digest,key);
10586 o = dictGetEntryVal(de);
10587 } else {
10588 /* Don't work with the key directly as when VM is active
10589 * this is unsafe: TODO: fix decrRefCount to check if the
10590 * count really reached 0 to avoid this mess */
10591 kcopy = dupStringObject(key);
10592 mixObjectDigest(digest,kcopy);
10593 o = lookupKeyRead(db,kcopy);
10594 decrRefCount(kcopy);
10595 }
10596 aux = htonl(o->type);
10597 mixDigest(digest,&aux,sizeof(aux));
10598 expiretime = getExpire(db,key);
10599
10600 /* Save the key and associated value */
10601 if (o->type == REDIS_STRING) {
10602 mixObjectDigest(digest,o);
10603 } else if (o->type == REDIS_LIST) {
10604 list *list = o->ptr;
10605 listNode *ln;
10606 listIter li;
10607
10608 listRewind(list,&li);
10609 while((ln = listNext(&li))) {
10610 robj *eleobj = listNodeValue(ln);
10611
10612 mixObjectDigest(digest,eleobj);
10613 }
10614 } else if (o->type == REDIS_SET) {
10615 dict *set = o->ptr;
10616 dictIterator *di = dictGetIterator(set);
10617 dictEntry *de;
10618
10619 while((de = dictNext(di)) != NULL) {
10620 robj *eleobj = dictGetEntryKey(de);
10621
10622 xorObjectDigest(digest,eleobj);
10623 }
10624 dictReleaseIterator(di);
10625 } else if (o->type == REDIS_ZSET) {
10626 zset *zs = o->ptr;
10627 dictIterator *di = dictGetIterator(zs->dict);
10628 dictEntry *de;
10629
10630 while((de = dictNext(di)) != NULL) {
10631 robj *eleobj = dictGetEntryKey(de);
10632 double *score = dictGetEntryVal(de);
10633 unsigned char eledigest[20];
10634
10635 snprintf(buf,sizeof(buf),"%.17g",*score);
10636 memset(eledigest,0,20);
10637 mixObjectDigest(eledigest,eleobj);
10638 mixDigest(eledigest,buf,strlen(buf));
10639 xorDigest(digest,eledigest,20);
10640 }
10641 dictReleaseIterator(di);
10642 } else if (o->type == REDIS_HASH) {
10643 hashIterator *hi;
10644 robj *obj;
10645
10646 hi = hashInitIterator(o);
10647 while (hashNext(hi) != REDIS_ERR) {
10648 unsigned char eledigest[20];
10649
10650 memset(eledigest,0,20);
10651 obj = hashCurrent(hi,REDIS_HASH_KEY);
10652 mixObjectDigest(eledigest,obj);
10653 decrRefCount(obj);
10654 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10655 mixObjectDigest(eledigest,obj);
10656 decrRefCount(obj);
10657 xorDigest(digest,eledigest,20);
10658 }
10659 hashReleaseIterator(hi);
10660 } else {
10661 redisPanic("Unknown object type");
10662 }
10663 /* If the key has an expire, add it to the mix */
10664 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10665 /* We can finally xor the key-val digest to the final digest */
10666 xorDigest(final,digest,20);
10667 }
10668 dictReleaseIterator(di);
10669 }
10670 }
10671
10672 static void debugCommand(redisClient *c) {
10673 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10674 *((char*)-1) = 'x';
10675 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10676 if (rdbSave(server.dbfilename) != REDIS_OK) {
10677 addReply(c,shared.err);
10678 return;
10679 }
10680 emptyDb();
10681 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10682 addReply(c,shared.err);
10683 return;
10684 }
10685 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10686 addReply(c,shared.ok);
10687 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10688 emptyDb();
10689 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10690 addReply(c,shared.err);
10691 return;
10692 }
10693 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10694 addReply(c,shared.ok);
10695 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10696 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10697 robj *key, *val;
10698
10699 if (!de) {
10700 addReply(c,shared.nokeyerr);
10701 return;
10702 }
10703 key = dictGetEntryKey(de);
10704 val = dictGetEntryVal(de);
10705 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10706 key->storage == REDIS_VM_SWAPPING)) {
10707 char *strenc;
10708 char buf[128];
10709
10710 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10711 strenc = strencoding[val->encoding];
10712 } else {
10713 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10714 strenc = buf;
10715 }
10716 addReplySds(c,sdscatprintf(sdsempty(),
10717 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10718 "encoding:%s serializedlength:%lld\r\n",
10719 (void*)key, key->refcount, (void*)val, val->refcount,
10720 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10721 } else {
10722 addReplySds(c,sdscatprintf(sdsempty(),
10723 "+Key at:%p refcount:%d, value swapped at: page %llu "
10724 "using %llu pages\r\n",
10725 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10726 (unsigned long long) key->vm.usedpages));
10727 }
10728 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10729 lookupKeyRead(c->db,c->argv[2]);
10730 addReply(c,shared.ok);
10731 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10732 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10733 robj *key, *val;
10734
10735 if (!server.vm_enabled) {
10736 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10737 return;
10738 }
10739 if (!de) {
10740 addReply(c,shared.nokeyerr);
10741 return;
10742 }
10743 key = dictGetEntryKey(de);
10744 val = dictGetEntryVal(de);
10745 /* If the key is shared we want to create a copy */
10746 if (key->refcount > 1) {
10747 robj *newkey = dupStringObject(key);
10748 decrRefCount(key);
10749 key = dictGetEntryKey(de) = newkey;
10750 }
10751 /* Swap it */
10752 if (key->storage != REDIS_VM_MEMORY) {
10753 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10754 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10755 dictGetEntryVal(de) = NULL;
10756 addReply(c,shared.ok);
10757 } else {
10758 addReply(c,shared.err);
10759 }
10760 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10761 long keys, j;
10762 robj *key, *val;
10763 char buf[128];
10764
10765 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10766 return;
10767 for (j = 0; j < keys; j++) {
10768 snprintf(buf,sizeof(buf),"key:%lu",j);
10769 key = createStringObject(buf,strlen(buf));
10770 if (lookupKeyRead(c->db,key) != NULL) {
10771 decrRefCount(key);
10772 continue;
10773 }
10774 snprintf(buf,sizeof(buf),"value:%lu",j);
10775 val = createStringObject(buf,strlen(buf));
10776 dictAdd(c->db->dict,key,val);
10777 }
10778 addReply(c,shared.ok);
10779 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10780 unsigned char digest[20];
10781 sds d = sdsnew("+");
10782 int j;
10783
10784 computeDatasetDigest(digest);
10785 for (j = 0; j < 20; j++)
10786 d = sdscatprintf(d, "%02x",digest[j]);
10787
10788 d = sdscatlen(d,"\r\n",2);
10789 addReplySds(c,d);
10790 } else {
10791 addReplySds(c,sdsnew(
10792 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10793 }
10794 }
10795
10796 static void _redisAssert(char *estr, char *file, int line) {
10797 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10798 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10799 #ifdef HAVE_BACKTRACE
10800 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10801 *((char*)-1) = 'x';
10802 #endif
10803 }
10804
10805 static void _redisPanic(char *msg, char *file, int line) {
10806 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10807 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10808 #ifdef HAVE_BACKTRACE
10809 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10810 *((char*)-1) = 'x';
10811 #endif
10812 }
10813
10814 /* =================================== Main! ================================ */
10815
10816 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file can't be opened or no
 * line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile;

    procfile = fopen("/proc/sys/vm/overcommit_memory","r");
    if (procfile == NULL) return -1;

    /* The file is closed in any case, before looking at the result */
    got = fgets(line,sizeof(line),procfile);
    fclose(procfile);

    return (got != NULL) ? atoi(line) : -1;
}
10830
10831 void linuxOvercommitMemoryWarning(void) {
10832 if (linuxOvercommitMemoryValue() == 0) {
10833 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10834 }
10835 }
10836 #endif /* __linux__ */
10837
10838 static void daemonize(void) {
10839 int fd;
10840 FILE *fp;
10841
10842 if (fork() != 0) exit(0); /* parent exits */
10843 setsid(); /* create a new session */
10844
10845 /* Every output goes to /dev/null. If Redis is daemonized but
10846 * the 'logfile' is set to 'stdout' in the configuration file
10847 * it will not log at all. */
10848 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10849 dup2(fd, STDIN_FILENO);
10850 dup2(fd, STDOUT_FILENO);
10851 dup2(fd, STDERR_FILENO);
10852 if (fd > STDERR_FILENO) close(fd);
10853 }
10854 /* Try to write the pid file */
10855 fp = fopen(server.pidfile,"w");
10856 if (fp) {
10857 fprintf(fp,"%d\n",getpid());
10858 fclose(fp);
10859 }
10860 }
10861
10862 static void version() {
10863 printf("Redis server version %s\n", REDIS_VERSION);
10864 exit(0);
10865 }
10866
/* Print the command line usage on standard error and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10872
10873 int main(int argc, char **argv) {
10874 time_t start;
10875
10876 initServerConfig();
10877 if (argc == 2) {
10878 if (strcmp(argv[1], "-v") == 0 ||
10879 strcmp(argv[1], "--version") == 0) version();
10880 if (strcmp(argv[1], "--help") == 0) usage();
10881 resetServerSaveParams();
10882 loadServerConfig(argv[1]);
10883 } else if ((argc > 2)) {
10884 usage();
10885 } else {
10886 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10887 }
10888 if (server.daemonize) daemonize();
10889 initServer();
10890 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10891 #ifdef __linux__
10892 linuxOvercommitMemoryWarning();
10893 #endif
10894 start = time(NULL);
10895 if (server.appendonly) {
10896 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10897 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10898 } else {
10899 if (rdbLoad(server.dbfilename) == REDIS_OK)
10900 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10901 }
10902 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10903 aeSetBeforeSleepProc(server.el,beforeSleep);
10904 aeMain(server.el);
10905 aeDeleteEventLoop(server.el);
10906 return 0;
10907 }
10908
10909 /* ============================= Backtrace support ========================= */
10910
10911 #ifdef HAVE_BACKTRACE
10912 static char *findFuncName(void *pointer, unsigned long *offset);
10913
10914 static void *getMcontextEip(ucontext_t *uc) {
10915 #if defined(__FreeBSD__)
10916 return (void*) uc->uc_mcontext.mc_eip;
10917 #elif defined(__dietlibc__)
10918 return (void*) uc->uc_mcontext.eip;
10919 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10920 #if __x86_64__
10921 return (void*) uc->uc_mcontext->__ss.__rip;
10922 #else
10923 return (void*) uc->uc_mcontext->__ss.__eip;
10924 #endif
10925 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10926 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10927 return (void*) uc->uc_mcontext->__ss.__rip;
10928 #else
10929 return (void*) uc->uc_mcontext->__ss.__eip;
10930 #endif
10931 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10932 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10933 #elif defined(__ia64__) /* Linux IA64 */
10934 return (void*) uc->uc_mcontext.sc_ip;
10935 #else
10936 return NULL;
10937 #endif
10938 }
10939
10940 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10941 void *trace[100];
10942 char **messages = NULL;
10943 int i, trace_size = 0;
10944 unsigned long offset=0;
10945 ucontext_t *uc = (ucontext_t*) secret;
10946 sds infostring;
10947 REDIS_NOTUSED(info);
10948
10949 redisLog(REDIS_WARNING,
10950 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10951 infostring = genRedisInfoString();
10952 redisLog(REDIS_WARNING, "%s",infostring);
10953 /* It's not safe to sdsfree() the returned string under memory
10954 * corruption conditions. Let it leak as we are going to abort */
10955
10956 trace_size = backtrace(trace, 100);
10957 /* overwrite sigaction with caller's address */
10958 if (getMcontextEip(uc) != NULL) {
10959 trace[1] = getMcontextEip(uc);
10960 }
10961 messages = backtrace_symbols(trace, trace_size);
10962
10963 for (i=1; i<trace_size; ++i) {
10964 char *fn = findFuncName(trace[i], &offset), *p;
10965
10966 p = strchr(messages[i],'+');
10967 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10968 redisLog(REDIS_WARNING,"%s", messages[i]);
10969 } else {
10970 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10971 }
10972 }
10973 /* free(messages); Don't call free() with possibly corrupted memory. */
10974 _exit(0);
10975 }
10976
10977 static void sigtermHandler(int sig) {
10978 REDIS_NOTUSED(sig);
10979
10980 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
10981 server.shutdown_asap = 1;
10982 }
10983
10984 static void setupSigSegvAction(void) {
10985 struct sigaction act;
10986
10987 sigemptyset (&act.sa_mask);
10988 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10989 * is used. Otherwise, sa_handler is used */
10990 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10991 act.sa_sigaction = segvHandler;
10992 sigaction (SIGSEGV, &act, NULL);
10993 sigaction (SIGBUS, &act, NULL);
10994 sigaction (SIGFPE, &act, NULL);
10995 sigaction (SIGILL, &act, NULL);
10996 sigaction (SIGBUS, &act, NULL);
10997
10998 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
10999 act.sa_handler = sigtermHandler;
11000 sigaction (SIGTERM, &act, NULL);
11001 return;
11002 }
11003
11004 #include "staticsymbols.h"
11005 /* This function try to convert a pointer into a function name. It's used in
11006 * oreder to provide a backtrace under segmentation fault that's able to
11007 * display functions declared as static (otherwise the backtrace is useless). */
11008 static char *findFuncName(void *pointer, unsigned long *offset){
11009 int i, ret = -1;
11010 unsigned long off, minoff = 0;
11011
11012 /* Try to match against the Symbol with the smallest offset */
11013 for (i=0; symsTable[i].pointer; i++) {
11014 unsigned long lp = (unsigned long) pointer;
11015
11016 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11017 off=lp-symsTable[i].pointer;
11018 if (ret < 0 || off < minoff) {
11019 minoff=off;
11020 ret=i;
11021 }
11022 }
11023 }
11024 if (ret == -1) return NULL;
11025 *offset = minoff;
11026 return symsTable[ret].name;
11027 }
11028 #else /* HAVE_BACKTRACE */
/* Variant with an empty body: it installs no signal handler at all. */
static void setupSigSegvAction(void) {
}
11031 #endif /* HAVE_BACKTRACE */
11032
11033
11034
11035 /* The End */
11036
11037
11038