]> git.saurik.com Git - redis.git/blob - redis.c
949bb58eac20d52c96a953747abdcd5855e05921
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR -1

/* Static server configuration: compile-time defaults and limits. */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256
103
/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */

/* Command flags. The values are distinct bits, so they can be OR-ed together
 * (as done in the command table, e.g. REDIS_CMD_BULK|REDIS_CMD_DENYOOM). */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
#define REDIS_ENCODING_RAW 0 /* Raw representation */
#define REDIS_ENCODING_INT 1 /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */

/* Human readable encoding names. The entries are listed in the same order as
 * the REDIS_ENCODING_* values above (index 0 is "raw", index 3 is
 * "hashtable"), so the two must be kept in sync. */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
135
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 * number specify the kind of object that follows.
 * See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * accordingly to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
167
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0 /* The object is on memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
/* Client flags. Distinct bits, stored OR-ed together in the client 'flags'
 * field. */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
/* List related stuff: selects which end of a list an operation works on. */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels, in increasing numeric order from DEBUG to WARNING. */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3
224
/* Anti-warning macro: evaluates V and discards the result by casting it to
 * void. */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines: the possible fsync policies. */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression _e is false, calls _redisAssert() with
 * the stringified expression, file and line, then terminates the process
 * with _exit(1). When _e is true it evaluates to (void)0.
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then terminates the process with _exit(1). Note
 * that the argument is stringified with '#', so a string literal argument
 * keeps its surrounding double quotes in the message.
 *
 * Both expansions are fully parenthesized so that each macro behaves as a
 * single expression wherever it is used. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
246 /*================================= Data types ============================== */
247
248 /* A redis object, that is a type able to hold a string / list / set */
249
/* The VM object structure: where an object lives in the swap file and when
 * it was last accessed.
 *
 * NOTE(review): the trailing 'vm' declarator below also defines a file-scope
 * object named 'vm' of this type. No use of that object is visible in this
 * part of the file -- confirm whether it is intentional. */
struct redisObjectVM {
    off_t page; /* the page at which the object is stored on disk */
    off_t usedpages; /* number of pages used on disk */
    time_t atime; /* Last access time */
} vm;
256
/* The actual Redis Object: a typed wrapper around 'ptr' carrying a reference
 * count. The 'vm' member must stay the last field, see the comment on it. */
typedef struct redisObject {
    void *ptr; /* The payload; its meaning depends on type and encoding */
    unsigned char type; /* One of the object types, e.g. REDIS_STRING */
    unsigned char encoding; /* One of the REDIS_ENCODING_* values */
    unsigned char storage; /* If this object is a key, where is the value?
                            * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype; /* If this object is a key, and value is swapped out,
                          * this is the type of the swapped out object. */
    int refcount;
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead. */
    struct redisObjectVM vm;
} robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable (not a pointer) to initialize, _ptr the value
 * stored in its 'ptr' field. The object is set up as a raw encoded string
 * with a reference count of 1; 'storage' is only touched when the virtual
 * memory is enabled in the global server state.
 *
 * The expansion is a do { ... } while(0) block WITHOUT a trailing semicolon,
 * so that an invocation followed by ';' is exactly one statement and can be
 * used safely as the body of an if/else. Both arguments are parenthesized
 * in the expansion so that arbitrary expressions can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
/* A single Redis database: the keyspace plus the per-DB auxiliary tables. */
typedef struct redisDb {
    dict *dict; /* The keyspace for this DB */
    dict *expires; /* Timeout of keys with a timeout set */
    dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
    dict *io_keys; /* Keys with clients waiting for VM I/O */
    dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
    int id;
} redisDb;
294
/* Client MULTI/EXEC state */

/* One command recorded in a MULTI context: its argument vector and the
 * command table entry it refers to. */
typedef struct multiCmd {
    robj **argv;
    int argc;
    struct redisCommand *cmd;
} multiCmd;
301
/* The set of commands recorded by a client in a MULTI context. */
typedef struct multiState {
    multiCmd *commands; /* Array of MULTI commands */
    int count; /* Total number of MULTI commands */
} multiState;
306
/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
typedef struct redisClient {
    int fd;
    redisDb *db;
    int dictid;
    sds querybuf;
    robj **argv, **mbargv;
    int argc, mbargc;
    int bulklen; /* bulk read len. -1 if not in bulk read mode */
    int multibulk; /* multi bulk command format active */
    list *reply;
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
    int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
    int slaveseldb; /* slave selected db, if this client is a slave */
    int authenticated; /* when requirepass is non-NULL */
    int replstate; /* replication state if this is a slave */
    int repldbfd; /* replication DB file descriptor */
    long repldboff; /* replication DB file offset */
    off_t repldbsize; /* replication DB file size */
    multiState mstate; /* MULTI/EXEC state */
    robj **blocking_keys; /* The key we are waiting to terminate a blocking
                           * operation such as BLPOP. Otherwise NULL. */
    int blocking_keys_num; /* Number of blocking keys */
    time_t blockingto; /* Blocking operation timeout. If UNIX current time
                        * is >= blockingto then the operation timed out. */
    list *io_keys; /* Keys this client is waiting to be loaded from the
                    * swap file in order to continue. */
    list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
    dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
    list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
} redisClient;
340
/* A (seconds, changes) pair; the server state holds an array of these in
 * 'saveparams'. NOTE(review): presumably one automatic save rule ("save after
 * 'seconds' if at least 'changes' changes happened") -- confirm against the
 * code using it. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
/* Global server state structure */
struct redisServer {
    int port;
    int fd;
    redisDb *db;
    long long dirty; /* changes to DB from the last save */
    list *clients;
    list *slaves, *monitors;
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops; /* number of times the cron function run */
    list *objfreelist; /* A list of freed objects to avoid malloc() */
    time_t lastsave; /* Unix time of last save succeeded */
    /* Fields used only for stats */
    time_t stat_starttime; /* server start time */
    long long stat_numcommands; /* number of processed commands */
    long long stat_numconnections; /* number of connections received */
    long long stat_expiredkeys; /* number of expired keys */
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
    int appendonly;
    int appendfsync;
    int no_appendfsync_on_rewrite;
    int shutdown_asap;
    time_t lastfsync;
    int appendfd;
    int appendseldb;
    char *pidfile;
    pid_t bgsavechildpid;
    pid_t bgrewritechildpid;
    sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
    sds aofbuf; /* AOF buffer, written before entering the event loop */
    struct saveparam *saveparams;
    int saveparamslen;
    char *logfile;
    char *bindaddr;
    char *dbfilename;
    char *appendfilename;
    char *requirepass;
    int rdbcompression;
    int activerehashing;
    /* Replication related */
    int isslave;
    char *masterauth;
    char *masterhost;
    int masterport;
    redisClient *master; /* client that is master for this slave */
    int replstate;
    unsigned int maxclients;
    unsigned long long maxmemory;
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to take this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
    /* Virtual memory configuration */
    int vm_enabled;
    char *vm_swap_file;
    off_t vm_page_size;
    off_t vm_pages;
    unsigned long long vm_max_memory;
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page; /* Next probably empty page */
    off_t vm_near_pages; /* Number of pages allocated sequentially */
    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
    time_t unixtime; /* Unix time sampled every second. */
    /* Virtual memory I/O threads stuff */
    /* An I/O thread processes an element taken from the io_jobs queue and
     * puts the result of the operation in the io_done list. While the
     * job is being processed, it's put on io_processing queue. */
    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
    list *io_processing; /* List of VM I/O jobs being processed */
    list *io_processed; /* List of VM I/O jobs already processed */
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
    pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
    pthread_attr_t io_threads_attr; /* attributes for threads creation */
    int io_active_threads; /* Number of running I/O threads */
    int vm_max_threads; /* Max number of I/O threads running at the same time */
    /* Our main thread is blocked on the event loop, looking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * awake the main thread. The followings are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
    /* Pubsub */
    dict *pubsub_channels; /* Map channels to list of subscribed clients */
    list *pubsub_patterns; /* A list of pubsub_patterns */
    /* Misc */
    FILE *devnull;
};
454
/* Pairs a client with a pattern object. The server state keeps a list
 * described as "a list of pubsub_patterns". */
typedef struct pubsubPattern {
    redisClient *client;
    robj *pattern;
} pubsubPattern;
459
/* Signature of a command implementation: it receives the client. */
typedef void redisCommandProc(redisClient *c);
/* Signature of a custom VM preload function, see 'vm_preload_proc' below. */
typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* One entry of the command table. */
struct redisCommand {
    char *name;
    redisCommandProc *proc;
    int arity;
    int flags; /* OR of REDIS_CMD_* flags, as used in the command table */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisVmPreloadProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
    int vm_lastkey; /* The last argument that's a key */
    int vm_keystep; /* The step between first and last key */
};
476
/* A (name, address) pair. NOTE(review): presumably a symbol table entry used
 * to resolve function names when printing a stack trace -- confirm against
 * the code using it. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
481
/* An object together with the value it is compared by: either a double score
 * or another object, sharing storage in the union 'u'.
 * NOTE(review): presumably one element of the vector handled by SORT; which
 * union member is valid is not visible here -- confirm against sortCommand. */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;
        robj *cmpobj;
    } u;
} redisSortObject;
489
/* A typed operation carrying a pattern object.
 * NOTE(review): 'type' presumably holds one of the REDIS_SORT_* values (e.g.
 * REDIS_SORT_GET) -- confirm against sortCommand. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;
494
/* ZSETs use a specialized version of Skiplists */

/* A skiplist node: a (score, obj) pair plus its links. 'forward' and 'span'
 * are pointers to separately sized arrays.
 * NOTE(review): presumably both arrays have one slot per level of the node
 * -- confirm against the node allocation code. */
typedef struct zskiplistNode {
    struct zskiplistNode **forward;
    struct zskiplistNode *backward;
    unsigned int *span;
    double score;
    robj *obj;
} zskiplistNode;
504
/* The skiplist itself: pointers to the header and tail nodes, plus a length
 * and a level counter. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
510
/* A sorted set: a hash table paired with a skiplist. */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;
515
/* Our shared "common" objects: pointers to frequently used objects, collected
 * in the single global 'shared', including an array of
 * REDIS_SHARED_INTEGERS integer objects. */

#define REDIS_SHARED_INTEGERS 10000
struct sharedObjectsStruct {
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
    *colon, *nullbulk, *nullmultibulk, *queued,
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
    *select0, *select1, *select2, *select3, *select4,
    *select5, *select6, *select7, *select8, *select9,
    *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
    *mbulk4, *psubscribebulk, *punsubscribebulk,
    *integers[REDIS_SHARED_INTEGERS];
} shared;
530
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
536
/* VM threaded I/O request message */
#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* One job exchanged between the main thread and the VM I/O threads. */
typedef struct iojob {
    int type; /* Request type, REDIS_IOJOB_* */
    redisDb *db;/* Redis database */
    robj *key; /* This I/O request is about swapping this key */
    robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
                * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
    off_t page; /* Swap page where to read/write the object */
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
552
553 /*================================ Prototypes =============================== */
554
/* Forward declarations of the file-local (static) helper functions, so that
 * they can be referenced before their definitions. */
static void freeStringObject(robj *o);
static void freeListObject(robj *o);
static void freeSetObject(robj *o);
static void decrRefCount(void *o);
static robj *createObject(int type, void *ptr);
static void freeClient(redisClient *c);
static int rdbLoad(char *filename);
static void addReply(redisClient *c, robj *obj);
static void addReplySds(redisClient *c, sds s);
static void incrRefCount(robj *o);
static int rdbSaveBackground(char *filename);
static robj *createStringObject(char *ptr, size_t len);
static robj *dupStringObject(robj *o);
static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
static void flushAppendOnlyFile(void);
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
static int syncWithMaster(void);
static robj *tryObjectEncoding(robj *o);
static robj *getDecodedObject(robj *o);
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
static int deleteIfSwapped(redisDb *db, robj *key);
static int deleteKey(redisDb *db, robj *key);
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
static void updateSlavesWaitingBgsave(int bgsaveerr);
static void freeMemoryIfNeeded(void);
static int processCommand(redisClient *c);
static void setupSigSegvAction(void);
static void rdbRemoveTempFile(pid_t childpid);
static void aofRemoveTempFile(pid_t childpid);
static size_t stringObjectLen(robj *o);
static void processInputBuffer(redisClient *c);
static zskiplist *zslCreate(void);
static void zslFree(zskiplist *zsl);
static void zslInsert(zskiplist *zsl, double score, robj *obj);
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
static void initClientMultiState(redisClient *c);
static void freeClientMultiState(redisClient *c);
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
static void unblockClientWaitingData(redisClient *c);
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
static void vmInit(void);
static void vmMarkPagesFree(off_t page, off_t count);
static robj *vmLoadObject(robj *key);
static robj *vmPreviewObject(robj *key);
static int vmSwapOneObjectBlocking(void);
static int vmSwapOneObjectThreaded(void);
static int vmCanSwapOut(void);
static int tryFreeOneObjectFromFreelist(void);
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
static void lockThreadedIO(void);
static void unlockThreadedIO(void);
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
static void freeIOJob(iojob *j);
static void queueIOJob(iojob *j);
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
static void waitEmptyIOJobsQueue(void);
static void vmReopenSwapFile(void);
static int vmFreePage(off_t page);
/* The next two match the redisVmPreloadProc signature. */
static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
static int dontWaitForSwappedKey(redisClient *c, robj *key);
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
static struct redisCommand *lookupCommand(char *name);
static void call(redisClient *c, struct redisCommand *cmd);
static void resetClient(redisClient *c);
static void convertToRealHash(robj *o);
static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
static void freePubsubPattern(void *p);
static int listMatchPubsubPattern(void *a, void *b);
static int compareStringObjects(robj *a, robj *b);
static int equalStringObjects(robj *a, robj *b);
636 static void usage();
637 static int rewriteAppendOnlyFileBackground(void);
638 static int vmSwapObjectBlocking(robj *key, robj *val);
639 static int prepareForShutdown();
640 static void touchWatchedKey(redisDb *db, robj *key);
641 static void touchWatchedKeysOnFlush(int dbid);
642 static void unwatchAllKeys(redisClient *c);
643
/* Forward declarations of the command implementations. They all share the
 * redisCommandProc signature (a single redisClient pointer), so that they can
 * be stored in the 'proc' field of the command table entries. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void setexCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
/* Note the lowercase 'c' in the name below, unlike every other command. */
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void zrevrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hsetnxCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void hmsetCommand(redisClient *c);
static void hmgetCommand(redisClient *c);
static void hdelCommand(redisClient *c);
static void hlenCommand(redisClient *c);
static void zremrangebyrankCommand(redisClient *c);
static void zunionstoreCommand(redisClient *c);
static void zinterstoreCommand(redisClient *c);
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
static void hexistsCommand(redisClient *c);
static void configCommand(redisClient *c);
static void hincrbyCommand(redisClient *c);
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
static void psubscribeCommand(redisClient *c);
static void punsubscribeCommand(redisClient *c);
static void publishCommand(redisClient *c);
static void watchCommand(redisClient *c);
static void unwatchCommand(redisClient *c);
751
752 /*================================= Globals ================================= */
753
754 /* Global vars */
755 static struct redisServer server; /* server global state */
/* Pointer to a command table. NOTE(review): where it is assigned is not
 * visible in this part of the file -- confirm against the init code. */
756 static struct redisCommand *commandTable;
/* Static table describing every command. Each initializer lists, in order:
 *
 *   1. the command name as a lowercase string,
 *   2. the handler function (one of the xxxCommand() prototypes above),
 *   3. an integer that looks like the expected argument count; negative
 *      values presumably mean "at least N arguments" -- TODO confirm against
 *      struct redisCommand,
 *   4. a bitmask of REDIS_CMD_* flags (INLINE, BULK, DENYOOM,
 *      FORCE_REPLICATION),
 *   5. NULL or a ...BlockClientOnSwappedKeys function (only zunionstore,
 *      zinterstore and exec provide one),
 *   6-8. three integers, presumably first key / last key / key step used to
 *      locate key arguments -- TODO confirm against struct redisCommand.
 *
 * NOTE(review): "smembers" is deliberately wired to sinterCommand here. */
757 static struct redisCommand readonlyCommandTable[] = {
758 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
760 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
761 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
762 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
763 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
765 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
769 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
778 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
781 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
782 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
785 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
794 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
795 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
802 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
803 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
811 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
814 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
815 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
817 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
818 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
823 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
827 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
828 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
840 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
846 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
848 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
856 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
859 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
864 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
866 };
867
868 /*============================ Utility functions ============================ */
869
/* Lowercase one byte for case-insensitive matching.
 *
 * tolower() is only defined for EOF and for values representable as an
 * unsigned char, so the byte is converted before the call. The result is
 * converted back to char so that callers comparing plain char values keep
 * seeing the same ordering as before. */
static int stringmatchLowerByte(int c) {
    return (char)tolower((unsigned char)c);
}

/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Neither buffer needs to be NUL terminated: no byte at
 * or past the given lengths is ever read.
 *
 * Supported syntax:
 *   *        any sequence of bytes (including the empty one)
 *   ?        exactly one byte
 *   [abc]    one byte out of a set; [^abc] negates the set, a-z is a range,
 *            and \x inside the set means the literal byte x
 *   \x       the literal byte x
 *
 * If 'nocase' is non zero, literals and ranges are compared case
 * insensitively.
 *
 * Returns 1 on match, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* Collapse a run of stars, never looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte: nothing to consume, no
             * match (this also keeps stringLen from going negative). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back one byte so that the
                     * common advance after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = stringmatchLowerByte(start);
                        end = stringmatchLowerByte(end);
                        c = stringmatchLowerByte(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (stringmatchLowerByte(pattern[0]) ==
                            stringmatchLowerByte(string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (stringmatchLowerByte(pattern[0]) !=
                    stringmatchLowerByte(string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars may remain. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
992
/* Convenience wrapper around stringmatchlen() for NUL terminated inputs:
 * both lengths are measured with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
996
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * unit, matched case insensitively:
 *
 *   (none), b    x1
 *   k            x1000            kb   x1024
 *   m            x1000*1000       mb   x1024*1024
 *   g            x1000*1000*1000  gb   x1024*1024*1024
 *
 * If err is not NULL, *err is set to 1 on parsing error, otherwise to 0.
 * Errors are:
 *   - unknown unit: the numeric prefix is still returned, unscaled;
 *   - numeric prefix of 128 characters or more: LLONG_MAX is returned;
 *   - scaled value out of the long long range: the result saturates to
 *     LLONG_MAX or LLONG_MIN. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() requires a value representable as unsigned char. */
    while (*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* Signed overflow is undefined behavior: check the range before
     * multiplying (mul is always positive here). */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1043
/* Convert a long long into a string.
 *
 * At most len-1 characters are stored in 's', followed by a NUL terminator.
 * If the buffer is too small the representation is truncated on the right.
 *
 * Returns the number of characters stored (terminator excluded), which is
 * shorter than the full representation when truncation happened, and 0 when
 * len is 0 (in which case 's' is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in the unsigned domain: -value overflows
     * (undefined behavior) when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++; /* p was left one position before the first character */
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1067
1068 static void redisLog(int level, const char *fmt, ...) {
1069 va_list ap;
1070 FILE *fp;
1071
1072 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1073 if (!fp) return;
1074
1075 va_start(ap, fmt);
1076 if (level >= server.verbosity) {
1077 char *c = ".-*#";
1078 char buf[64];
1079 time_t now;
1080
1081 now = time(NULL);
1082 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1083 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1084 vfprintf(fp, fmt, ap);
1085 fprintf(fp,"\n");
1086 fflush(fp);
1087 }
1088 va_end(ap);
1089
1090 if (server.logfile) fclose(fp);
1091 }
1092
1093 /*====================== Hash table type implementation ==================== */
1094
1095 /* This is a hash table type that uses the SDS dynamic strings library as
1096 * keys and redis objects as values (objects can hold SDS strings,
1097 * lists, sets). */
1098
/* Dict value destructor that simply hands the value to zfree(). The
 * private data pointer is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1104
1105 static void dictListDestructor(void *privdata, void *val)
1106 {
1107 DICT_NOTUSED(privdata);
1108 listRelease((list*)val);
1109 }
1110
1111 static int sdsDictKeyCompare(void *privdata, const void *key1,
1112 const void *key2)
1113 {
1114 int l1,l2;
1115 DICT_NOTUSED(privdata);
1116
1117 l1 = sdslen((sds)key1);
1118 l2 = sdslen((sds)key2);
1119 if (l1 != l2) return 0;
1120 return memcmp(key1, key2, l1) == 0;
1121 }
1122
/* Dict destructor for redis objects: drops one reference from the object.
 * NULL is accepted and ignored, since values of swapped out keys are set
 * to NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1130
1131 static int dictObjKeyCompare(void *privdata, const void *key1,
1132 const void *key2)
1133 {
1134 const robj *o1 = key1, *o2 = key2;
1135 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1136 }
1137
1138 static unsigned int dictObjHash(const void *key) {
1139 const robj *o = key;
1140 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1141 }
1142
1143 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1144 const void *key2)
1145 {
1146 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1147 int cmp;
1148
1149 if (o1->encoding == REDIS_ENCODING_INT &&
1150 o2->encoding == REDIS_ENCODING_INT)
1151 return o1->ptr == o2->ptr;
1152
1153 o1 = getDecodedObject(o1);
1154 o2 = getDecodedObject(o2);
1155 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1156 decrRefCount(o1);
1157 decrRefCount(o2);
1158 return cmp;
1159 }
1160
1161 static unsigned int dictEncObjHash(const void *key) {
1162 robj *o = (robj*) key;
1163
1164 if (o->encoding == REDIS_ENCODING_RAW) {
1165 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1166 } else {
1167 if (o->encoding == REDIS_ENCODING_INT) {
1168 char buf[32];
1169 int len;
1170
1171 len = ll2string(buf,32,(long)o->ptr);
1172 return dictGenHashFunction((unsigned char*)buf, len);
1173 } else {
1174 unsigned int hash;
1175
1176 o = getDecodedObject(o);
1177 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178 decrRefCount(o);
1179 return hash;
1180 }
1181 }
1182 }
1183
/* The tables below are positional dictType initializers. As the per-line
 * comments state, the slots are, in order: hash function, key dup, val dup,
 * key compare, key destructor, val destructor. A NULL slot means that no
 * callback is provided for that operation. */

/* Sets type: keys are hashed and compared with the encoding-aware helpers
 * and released as redis objects; no value destructor is installed.
 * NOTE(review): the original title read "Sets type and expires", but the
 * "Db->expires" title further down is attached to keyptrDictType -- confirm
 * which type the expires dictionaries really use. */
1185 static dictType setDictType = {
1186 dictEncObjHash, /* hash function */
1187 NULL, /* key dup */
1188 NULL, /* val dup */
1189 dictEncObjKeyCompare, /* key compare */
1190 dictRedisObjectDestructor, /* key destructor */
1191 NULL /* val destructor */
1192 };
1193
1194 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Values are freed with dictVanillaFree(), i.e. zfree(). */
1195 static dictType zsetDictType = {
1196 dictEncObjHash, /* hash function */
1197 NULL, /* key dup */
1198 NULL, /* val dup */
1199 dictEncObjKeyCompare, /* key compare */
1200 dictRedisObjectDestructor, /* key destructor */
1201 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1202 };
1203
1204 /* Db->dict: both keys and values are released as redis objects. */
1205 static dictType dbDictType = {
1206 dictObjHash, /* hash function */
1207 NULL, /* key dup */
1208 NULL, /* val dup */
1209 dictObjKeyCompare, /* key compare */
1210 dictRedisObjectDestructor, /* key destructor */
1211 dictRedisObjectDestructor /* val destructor */
1212 };
1213
1214 /* Db->expires: keys are released as redis objects, values are left alone. */
1215 static dictType keyptrDictType = {
1216 dictObjHash, /* hash function */
1217 NULL, /* key dup */
1218 NULL, /* val dup */
1219 dictObjKeyCompare, /* key compare */
1220 dictRedisObjectDestructor, /* key destructor */
1221 NULL /* val destructor */
1222 };
1223
1224 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1225 static dictType hashDictType = {
1226 dictEncObjHash, /* hash function */
1227 NULL, /* key dup */
1228 NULL, /* val dup */
1229 dictEncObjKeyCompare, /* key compare */
1230 dictRedisObjectDestructor, /* key destructor */
1231 dictRedisObjectDestructor /* val destructor */
1232 };
1233
1234 /* Keylist hash table type has unencoded redis objects as keys and
1235 * lists as values. It's used for blocking operations (BLPOP) and to
1236 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1237 static dictType keylistDictType = {
1238 dictObjHash, /* hash function */
1239 NULL, /* key dup */
1240 NULL, /* val dup */
1241 dictObjKeyCompare, /* key compare */
1242 dictRedisObjectDestructor, /* key destructor */
1243 dictListDestructor /* val destructor */
1244 };
1245
1246 static void version();
1247
1248 /* ========================= Random utility functions ======================= */
1249
1250 /* Redis generally does not try to recover from out of memory conditions
1251 * when allocating objects or strings, it is not clear if it will be possible
1252 * to report this condition to the client since the networking layer itself
1253 * is based on heap allocation for send buffers, so we simply abort.
1254 * At least the code will be simpler to read... */
1255 static void oom(const char *msg) {
1256 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1257 sleep(1);
1258 abort();
1259 }
1260
1261 /* ====================== Redis server networking stuff ===================== */
1262 static void closeTimedoutClients(void) {
1263 redisClient *c;
1264 listNode *ln;
1265 time_t now = time(NULL);
1266 listIter li;
1267
1268 listRewind(server.clients,&li);
1269 while ((ln = listNext(&li)) != NULL) {
1270 c = listNodeValue(ln);
1271 if (server.maxidletime &&
1272 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1273 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1274 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1275 listLength(c->pubsub_patterns) == 0 &&
1276 (now - c->lastinteraction > server.maxidletime))
1277 {
1278 redisLog(REDIS_VERBOSE,"Closing idle client");
1279 freeClient(c);
1280 } else if (c->flags & REDIS_BLOCKED) {
1281 if (c->blockingto != 0 && c->blockingto < now) {
1282 addReply(c,shared.nullmultibulk);
1283 unblockClientWaitingData(c);
1284 }
1285 }
1286 }
1287 }
1288
1289 static int htNeedsResize(dict *dict) {
1290 long long size, used;
1291
1292 size = dictSlots(dict);
1293 used = dictSize(dict);
1294 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1295 (used*100/size < REDIS_HT_MINFILL));
1296 }
1297
1298 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1299 * we resize the hash table to save memory */
1300 static void tryResizeHashTables(void) {
1301 int j;
1302
1303 for (j = 0; j < server.dbnum; j++) {
1304 if (htNeedsResize(server.db[j].dict))
1305 dictResize(server.db[j].dict);
1306 if (htNeedsResize(server.db[j].expires))
1307 dictResize(server.db[j].expires);
1308 }
1309 }
1310
1311 /* Our hash table implementation performs rehashing incrementally while
1312 * we write/read from the hash table. Still if the server is idle, the hash
1313 * table will use two tables for a long time. So we try to use 1 millisecond
1314 * of CPU time at every serverCron() loop in order to rehash some key. */
1315 static void incrementallyRehash(void) {
1316 int j;
1317
1318 for (j = 0; j < server.dbnum; j++) {
1319 if (dictIsRehashing(server.db[j].dict)) {
1320 dictRehashMilliseconds(server.db[j].dict,1);
1321 break; /* already used our millisecond for this loop... */
1322 }
1323 }
1324 }
1325
1326 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1327 void backgroundSaveDoneHandler(int statloc) {
1328 int exitcode = WEXITSTATUS(statloc);
1329 int bysignal = WIFSIGNALED(statloc);
1330
1331 if (!bysignal && exitcode == 0) {
1332 redisLog(REDIS_NOTICE,
1333 "Background saving terminated with success");
1334 server.dirty = 0;
1335 server.lastsave = time(NULL);
1336 } else if (!bysignal && exitcode != 0) {
1337 redisLog(REDIS_WARNING, "Background saving error");
1338 } else {
1339 redisLog(REDIS_WARNING,
1340 "Background saving terminated by signal %d", WTERMSIG(statloc));
1341 rdbRemoveTempFile(server.bgsavechildpid);
1342 }
1343 server.bgsavechildpid = -1;
1344 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1345 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1346 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1347 }
1348
1349 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1350 * Handle this. */
1351 void backgroundRewriteDoneHandler(int statloc) {
1352 int exitcode = WEXITSTATUS(statloc);
1353 int bysignal = WIFSIGNALED(statloc);
1354
1355 if (!bysignal && exitcode == 0) {
1356 int fd;
1357 char tmpfile[256];
1358
1359 redisLog(REDIS_NOTICE,
1360 "Background append only file rewriting terminated with success");
1361 /* Now it's time to flush the differences accumulated by the parent */
1362 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1363 fd = open(tmpfile,O_WRONLY|O_APPEND);
1364 if (fd == -1) {
1365 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1366 goto cleanup;
1367 }
1368 /* Flush our data... */
1369 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1370 (signed) sdslen(server.bgrewritebuf)) {
1371 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1372 close(fd);
1373 goto cleanup;
1374 }
1375 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1376 /* Now our work is to rename the temp file into the stable file. And
1377 * switch the file descriptor used by the server for append only. */
1378 if (rename(tmpfile,server.appendfilename) == -1) {
1379 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1380 close(fd);
1381 goto cleanup;
1382 }
1383 /* Mission completed... almost */
1384 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1385 if (server.appendfd != -1) {
1386 /* If append only is actually enabled... */
1387 close(server.appendfd);
1388 server.appendfd = fd;
1389 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1390 server.appendseldb = -1; /* Make sure it will issue SELECT */
1391 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1392 } else {
1393 /* If append only is disabled we just generate a dump in this
1394 * format. Why not? */
1395 close(fd);
1396 }
1397 } else if (!bysignal && exitcode != 0) {
1398 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1399 } else {
1400 redisLog(REDIS_WARNING,
1401 "Background append only file rewriting terminated by signal %d",
1402 WTERMSIG(statloc));
1403 }
1404 cleanup:
1405 sdsfree(server.bgrewritebuf);
1406 server.bgrewritebuf = sdsempty();
1407 aofRemoveTempFile(server.bgrewritechildpid);
1408 server.bgrewritechildpid = -1;
1409 }
1410
1411 /* This function is called once a background process of some kind terminates,
1412 * as we want to avoid resizing the hash tables when there is a child in order
1413 * to play well with copy-on-write (otherwise when a resize happens lots of
1414 * memory pages are copied). The goal of this function is to update the ability
1415 * for dict.c to resize the hash tables accordingly to the fact we have o not
1416 * running childs. */
1417 static void updateDictResizePolicy(void) {
1418 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1419 dictEnableResize();
1420 else
1421 dictDisableResize();
1422 }
1423
1424 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1425 int j, loops = server.cronloops++;
1426 REDIS_NOTUSED(eventLoop);
1427 REDIS_NOTUSED(id);
1428 REDIS_NOTUSED(clientData);
1429
1430 /* We take a cached value of the unix time in the global state because
1431 * with virtual memory and aging there is to store the current time
1432 * in objects at every object access, and accuracy is not needed.
1433 * To access a global var is faster than calling time(NULL) */
1434 server.unixtime = time(NULL);
1435
1436 /* We received a SIGTERM, shutting down here in a safe way, as it is
1437 * not ok doing so inside the signal handler. */
1438 if (server.shutdown_asap) {
1439 if (prepareForShutdown() == REDIS_OK) exit(0);
1440 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1441 }
1442
1443 /* Show some info about non-empty databases */
1444 for (j = 0; j < server.dbnum; j++) {
1445 long long size, used, vkeys;
1446
1447 size = dictSlots(server.db[j].dict);
1448 used = dictSize(server.db[j].dict);
1449 vkeys = dictSize(server.db[j].expires);
1450 if (!(loops % 50) && (used || vkeys)) {
1451 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1452 /* dictPrintStats(server.dict); */
1453 }
1454 }
1455
1456 /* We don't want to resize the hash tables while a bacground saving
1457 * is in progress: the saving child is created using fork() that is
1458 * implemented with a copy-on-write semantic in most modern systems, so
1459 * if we resize the HT while there is the saving child at work actually
1460 * a lot of memory movements in the parent will cause a lot of pages
1461 * copied. */
1462 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1463 if (!(loops % 10)) tryResizeHashTables();
1464 if (server.activerehashing) incrementallyRehash();
1465 }
1466
1467 /* Show information about connected clients */
1468 if (!(loops % 50)) {
1469 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1470 listLength(server.clients)-listLength(server.slaves),
1471 listLength(server.slaves),
1472 zmalloc_used_memory());
1473 }
1474
1475 /* Close connections of timedout clients */
1476 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1477 closeTimedoutClients();
1478
1479 /* Check if a background saving or AOF rewrite in progress terminated */
1480 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1481 int statloc;
1482 pid_t pid;
1483
1484 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1485 if (pid == server.bgsavechildpid) {
1486 backgroundSaveDoneHandler(statloc);
1487 } else {
1488 backgroundRewriteDoneHandler(statloc);
1489 }
1490 updateDictResizePolicy();
1491 }
1492 } else {
1493 /* If there is not a background saving in progress check if
1494 * we have to save now */
1495 time_t now = time(NULL);
1496 for (j = 0; j < server.saveparamslen; j++) {
1497 struct saveparam *sp = server.saveparams+j;
1498
1499 if (server.dirty >= sp->changes &&
1500 now-server.lastsave > sp->seconds) {
1501 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1502 sp->changes, sp->seconds);
1503 rdbSaveBackground(server.dbfilename);
1504 break;
1505 }
1506 }
1507 }
1508
1509 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1510 * will use few CPU cycles if there are few expiring keys, otherwise
1511 * it will get more aggressive to avoid that too much memory is used by
1512 * keys that can be removed from the keyspace. */
1513 for (j = 0; j < server.dbnum; j++) {
1514 int expired;
1515 redisDb *db = server.db+j;
1516
1517 /* Continue to expire if at the end of the cycle more than 25%
1518 * of the keys were expired. */
1519 do {
1520 long num = dictSize(db->expires);
1521 time_t now = time(NULL);
1522
1523 expired = 0;
1524 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1525 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1526 while (num--) {
1527 dictEntry *de;
1528 time_t t;
1529
1530 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1531 t = (time_t) dictGetEntryVal(de);
1532 if (now > t) {
1533 deleteKey(db,dictGetEntryKey(de));
1534 expired++;
1535 server.stat_expiredkeys++;
1536 }
1537 }
1538 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1539 }
1540
1541 /* Swap a few keys on disk if we are over the memory limit and VM
1542 * is enbled. Try to free objects from the free list first. */
1543 if (vmCanSwapOut()) {
1544 while (server.vm_enabled && zmalloc_used_memory() >
1545 server.vm_max_memory)
1546 {
1547 int retval;
1548
1549 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1550 retval = (server.vm_max_threads == 0) ?
1551 vmSwapOneObjectBlocking() :
1552 vmSwapOneObjectThreaded();
1553 if (retval == REDIS_ERR && !(loops % 300) &&
1554 zmalloc_used_memory() >
1555 (server.vm_max_memory+server.vm_max_memory/10))
1556 {
1557 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1558 }
1559 /* Note that when using threade I/O we free just one object,
1560 * because anyway when the I/O thread in charge to swap this
1561 * object out will finish, the handler of completed jobs
1562 * will try to swap more objects if we are still out of memory. */
1563 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1564 }
1565 }
1566
1567 /* Check if we should connect to a MASTER */
1568 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1569 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1570 if (syncWithMaster() == REDIS_OK) {
1571 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1572 if (server.appendonly) rewriteAppendOnlyFileBackground();
1573 }
1574 }
1575 return 100;
1576 }
1577
1578 /* This function gets called every time Redis is entering the
1579 * main loop of the event driven library, that is, before to sleep
1580 * for ready file descriptors. */
1581 static void beforeSleep(struct aeEventLoop *eventLoop) {
1582 REDIS_NOTUSED(eventLoop);
1583
1584 /* Awake clients that got all the swapped keys they requested */
1585 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1586 listIter li;
1587 listNode *ln;
1588
1589 listRewind(server.io_ready_clients,&li);
1590 while((ln = listNext(&li))) {
1591 redisClient *c = ln->value;
1592 struct redisCommand *cmd;
1593
1594 /* Resume the client. */
1595 listDelNode(server.io_ready_clients,ln);
1596 c->flags &= (~REDIS_IO_WAIT);
1597 server.vm_blocked_clients--;
1598 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1599 readQueryFromClient, c);
1600 cmd = lookupCommand(c->argv[0]->ptr);
1601 assert(cmd != NULL);
1602 call(c,cmd);
1603 resetClient(c);
1604 /* There may be more data to process in the input buffer. */
1605 if (c->querybuf && sdslen(c->querybuf) > 0)
1606 processInputBuffer(c);
1607 }
1608 }
1609 /* Write the AOF buffer on disk */
1610 flushAppendOnlyFile();
1611 }
1612
1613 static void createSharedObjects(void) {
1614 int j;
1615
1616 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1617 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1618 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1619 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1620 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1621 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1622 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1623 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1624 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1625 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1626 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1627 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1629 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR no such key\r\n"));
1631 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR syntax error\r\n"));
1633 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR source and destination objects are the same\r\n"));
1635 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1636 "-ERR index out of range\r\n"));
1637 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1638 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1639 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1640 shared.select0 = createStringObject("select 0\r\n",10);
1641 shared.select1 = createStringObject("select 1\r\n",10);
1642 shared.select2 = createStringObject("select 2\r\n",10);
1643 shared.select3 = createStringObject("select 3\r\n",10);
1644 shared.select4 = createStringObject("select 4\r\n",10);
1645 shared.select5 = createStringObject("select 5\r\n",10);
1646 shared.select6 = createStringObject("select 6\r\n",10);
1647 shared.select7 = createStringObject("select 7\r\n",10);
1648 shared.select8 = createStringObject("select 8\r\n",10);
1649 shared.select9 = createStringObject("select 9\r\n",10);
1650 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1651 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1652 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1653 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1654 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1655 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1656 shared.mbulk3 = createStringObject("*3\r\n",4);
1657 shared.mbulk4 = createStringObject("*4\r\n",4);
1658 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1659 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1660 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1661 }
1662 }
1663
1664 static void appendServerSaveParams(time_t seconds, int changes) {
1665 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1666 server.saveparams[server.saveparamslen].seconds = seconds;
1667 server.saveparams[server.saveparamslen].changes = changes;
1668 server.saveparamslen++;
1669 }
1670
1671 static void resetServerSaveParams() {
1672 zfree(server.saveparams);
1673 server.saveparams = NULL;
1674 server.saveparamslen = 0;
1675 }
1676
1677 static void initServerConfig() {
1678 server.dbnum = REDIS_DEFAULT_DBNUM;
1679 server.port = REDIS_SERVERPORT;
1680 server.verbosity = REDIS_VERBOSE;
1681 server.maxidletime = REDIS_MAXIDLETIME;
1682 server.saveparams = NULL;
1683 server.logfile = NULL; /* NULL = log on standard output */
1684 server.bindaddr = NULL;
1685 server.glueoutputbuf = 1;
1686 server.daemonize = 0;
1687 server.appendonly = 0;
1688 server.appendfsync = APPENDFSYNC_EVERYSEC;
1689 server.no_appendfsync_on_rewrite = 0;
1690 server.lastfsync = time(NULL);
1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
1696 server.requirepass = NULL;
1697 server.rdbcompression = 1;
1698 server.activerehashing = 1;
1699 server.maxclients = 0;
1700 server.blpop_blocked_clients = 0;
1701 server.maxmemory = 0;
1702 server.vm_enabled = 0;
1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1707 server.vm_max_threads = 4;
1708 server.vm_blocked_clients = 0;
1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1711 server.shutdown_asap = 0;
1712
1713 resetServerSaveParams();
1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
1720 server.masterauth = NULL;
1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
1731 }
1732
1733 static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
1738 setupSigSegvAction();
1739
1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
1745 server.clients = listCreate();
1746 server.slaves = listCreate();
1747 server.monitors = listCreate();
1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
1757 for (j = 0; j < server.dbnum; j++) {
1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1764 server.db[j].id = j;
1765 }
1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1770 server.cronloops = 0;
1771 server.bgsavechildpid = -1;
1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
1774 server.aofbuf = sdsempty();
1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
1779 server.stat_expiredkeys = 0;
1780 server.stat_starttime = time(NULL);
1781 server.unixtime = time(NULL);
1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1785
1786 if (server.appendonly) {
1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
1794
1795 if (server.vm_enabled) vmInit();
1796 }
1797
1798 /* Empty the whole database */
1799 static long long emptyDb() {
1800 int j;
1801 long long removed = 0;
1802
1803 for (j = 0; j < server.dbnum; j++) {
1804 removed += dictSize(server.db[j].dict);
1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
1808 return removed;
1809 }
1810
/* Convert a "yes" / "no" string (compared case insensitively) into a
 * boolean: returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
1817 /* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819 static void loadServerConfig(char *filename) {
1820 FILE *fp;
1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1830 exit(1);
1831 }
1832 }
1833
1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1854 server.maxidletime = atoi(argv[1]);
1855 if (server.maxidletime < 0) {
1856 err = "Invalid timeout value"; goto loaderr;
1857 }
1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1864 server.bindaddr = zstrdup(argv[1]);
1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1888 FILE *logfp;
1889
1890 server.logfile = zstrdup(argv[1]);
1891 if (!strcasecmp(server.logfile,"stdout")) {
1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
1904 fclose(logfp);
1905 }
1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1916 server.maxmemory = memtoll(argv[1],NULL);
1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
1946 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1947 && argc == 2) {
1948 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1949 err = "argument must be 'yes' or 'no'"; goto loaderr;
1950 }
1951 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1952 if (!strcasecmp(argv[1],"no")) {
1953 server.appendfsync = APPENDFSYNC_NO;
1954 } else if (!strcasecmp(argv[1],"always")) {
1955 server.appendfsync = APPENDFSYNC_ALWAYS;
1956 } else if (!strcasecmp(argv[1],"everysec")) {
1957 server.appendfsync = APPENDFSYNC_EVERYSEC;
1958 } else {
1959 err = "argument must be 'no', 'always' or 'everysec'";
1960 goto loaderr;
1961 }
1962 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1963 server.requirepass = zstrdup(argv[1]);
1964 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1965 zfree(server.pidfile);
1966 server.pidfile = zstrdup(argv[1]);
1967 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1968 zfree(server.dbfilename);
1969 server.dbfilename = zstrdup(argv[1]);
1970 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1971 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1972 err = "argument must be 'yes' or 'no'"; goto loaderr;
1973 }
1974 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1975 zfree(server.vm_swap_file);
1976 server.vm_swap_file = zstrdup(argv[1]);
1977 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1978 server.vm_max_memory = memtoll(argv[1],NULL);
1979 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1980 server.vm_page_size = memtoll(argv[1], NULL);
1981 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1982 server.vm_pages = memtoll(argv[1], NULL);
1983 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1984 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1985 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1986 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1987 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1988 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1989 } else {
1990 err = "Bad directive or wrong number of arguments"; goto loaderr;
1991 }
1992 for (j = 0; j < argc; j++)
1993 sdsfree(argv[j]);
1994 zfree(argv);
1995 sdsfree(line);
1996 }
1997 if (fp != stdin) fclose(fp);
1998 return;
1999
2000 loaderr:
2001 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2002 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2003 fprintf(stderr, ">>> '%s'\n", line);
2004 fprintf(stderr, "%s\n", err);
2005 exit(1);
2006 }
2007
2008 static void freeClientArgv(redisClient *c) {
2009 int j;
2010
2011 for (j = 0; j < c->argc; j++)
2012 decrRefCount(c->argv[j]);
2013 for (j = 0; j < c->mbargc; j++)
2014 decrRefCount(c->mbargv[j]);
2015 c->argc = 0;
2016 c->mbargc = 0;
2017 }
2018
2019 static void freeClient(redisClient *c) {
2020 listNode *ln;
2021
2022 /* Note that if the client we are freeing is blocked into a blocking
2023 * call, we have to set querybuf to NULL *before* to call
2024 * unblockClientWaitingData() to avoid processInputBuffer() will get
2025 * called. Also it is important to remove the file events after
2026 * this, because this call adds the READABLE event. */
2027 sdsfree(c->querybuf);
2028 c->querybuf = NULL;
2029 if (c->flags & REDIS_BLOCKED)
2030 unblockClientWaitingData(c);
2031
2032 /* UNWATCH all the keys */
2033 unwatchAllKeys(c);
2034 listRelease(c->watched_keys);
2035 /* Unsubscribe from all the pubsub channels */
2036 pubsubUnsubscribeAllChannels(c,0);
2037 pubsubUnsubscribeAllPatterns(c,0);
2038 dictRelease(c->pubsub_channels);
2039 listRelease(c->pubsub_patterns);
2040 /* Obvious cleanup */
2041 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2042 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2043 listRelease(c->reply);
2044 freeClientArgv(c);
2045 close(c->fd);
2046 /* Remove from the list of clients */
2047 ln = listSearchKey(server.clients,c);
2048 redisAssert(ln != NULL);
2049 listDelNode(server.clients,ln);
2050 /* Remove from the list of clients that are now ready to be restarted
2051 * after waiting for swapped keys */
2052 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2053 ln = listSearchKey(server.io_ready_clients,c);
2054 if (ln) {
2055 listDelNode(server.io_ready_clients,ln);
2056 server.vm_blocked_clients--;
2057 }
2058 }
2059 /* Remove from the list of clients waiting for swapped keys */
2060 while (server.vm_enabled && listLength(c->io_keys)) {
2061 ln = listFirst(c->io_keys);
2062 dontWaitForSwappedKey(c,ln->value);
2063 }
2064 listRelease(c->io_keys);
2065 /* Master/slave cleanup */
2066 if (c->flags & REDIS_SLAVE) {
2067 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2068 close(c->repldbfd);
2069 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2070 ln = listSearchKey(l,c);
2071 redisAssert(ln != NULL);
2072 listDelNode(l,ln);
2073 }
2074 if (c->flags & REDIS_MASTER) {
2075 server.master = NULL;
2076 server.replstate = REDIS_REPL_CONNECT;
2077 }
2078 /* Release memory */
2079 zfree(c->argv);
2080 zfree(c->mbargv);
2081 freeClientMultiState(c);
2082 zfree(c);
2083 }
2084
2085 #define GLUEREPLY_UP_TO (1024)
2086 static void glueReplyBuffersIfNeeded(redisClient *c) {
2087 int copylen = 0;
2088 char buf[GLUEREPLY_UP_TO];
2089 listNode *ln;
2090 listIter li;
2091 robj *o;
2092
2093 listRewind(c->reply,&li);
2094 while((ln = listNext(&li))) {
2095 int objlen;
2096
2097 o = ln->value;
2098 objlen = sdslen(o->ptr);
2099 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2100 memcpy(buf+copylen,o->ptr,objlen);
2101 copylen += objlen;
2102 listDelNode(c->reply,ln);
2103 } else {
2104 if (copylen == 0) return;
2105 break;
2106 }
2107 }
2108 /* Now the output buffer is empty, add the new single element */
2109 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2110 listAddNodeHead(c->reply,o);
2111 }
2112
2113 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2114 redisClient *c = privdata;
2115 int nwritten = 0, totwritten = 0, objlen;
2116 robj *o;
2117 REDIS_NOTUSED(el);
2118 REDIS_NOTUSED(mask);
2119
2120 /* Use writev() if we have enough buffers to send */
2121 if (!server.glueoutputbuf &&
2122 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2123 !(c->flags & REDIS_MASTER))
2124 {
2125 sendReplyToClientWritev(el, fd, privdata, mask);
2126 return;
2127 }
2128
2129 while(listLength(c->reply)) {
2130 if (server.glueoutputbuf && listLength(c->reply) > 1)
2131 glueReplyBuffersIfNeeded(c);
2132
2133 o = listNodeValue(listFirst(c->reply));
2134 objlen = sdslen(o->ptr);
2135
2136 if (objlen == 0) {
2137 listDelNode(c->reply,listFirst(c->reply));
2138 continue;
2139 }
2140
2141 if (c->flags & REDIS_MASTER) {
2142 /* Don't reply to a master */
2143 nwritten = objlen - c->sentlen;
2144 } else {
2145 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2146 if (nwritten <= 0) break;
2147 }
2148 c->sentlen += nwritten;
2149 totwritten += nwritten;
2150 /* If we fully sent the object on head go to the next one */
2151 if (c->sentlen == objlen) {
2152 listDelNode(c->reply,listFirst(c->reply));
2153 c->sentlen = 0;
2154 }
2155 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2156 * bytes, in a single threaded server it's a good idea to serve
2157 * other clients as well, even if a very large request comes from
2158 * super fast link that is always able to accept data (in real world
2159 * scenario think about 'KEYS *' against the loopback interfae) */
2160 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2161 }
2162 if (nwritten == -1) {
2163 if (errno == EAGAIN) {
2164 nwritten = 0;
2165 } else {
2166 redisLog(REDIS_VERBOSE,
2167 "Error writing to client: %s", strerror(errno));
2168 freeClient(c);
2169 return;
2170 }
2171 }
2172 if (totwritten > 0) c->lastinteraction = time(NULL);
2173 if (listLength(c->reply) == 0) {
2174 c->sentlen = 0;
2175 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2176 }
2177 }
2178
2179 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2180 {
2181 redisClient *c = privdata;
2182 int nwritten = 0, totwritten = 0, objlen, willwrite;
2183 robj *o;
2184 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2185 int offset, ion = 0;
2186 REDIS_NOTUSED(el);
2187 REDIS_NOTUSED(mask);
2188
2189 listNode *node;
2190 while (listLength(c->reply)) {
2191 offset = c->sentlen;
2192 ion = 0;
2193 willwrite = 0;
2194
2195 /* fill-in the iov[] array */
2196 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2197 o = listNodeValue(node);
2198 objlen = sdslen(o->ptr);
2199
2200 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2201 break;
2202
2203 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2204 break; /* no more iovecs */
2205
2206 iov[ion].iov_base = ((char*)o->ptr) + offset;
2207 iov[ion].iov_len = objlen - offset;
2208 willwrite += objlen - offset;
2209 offset = 0; /* just for the first item */
2210 ion++;
2211 }
2212
2213 if(willwrite == 0)
2214 break;
2215
2216 /* write all collected blocks at once */
2217 if((nwritten = writev(fd, iov, ion)) < 0) {
2218 if (errno != EAGAIN) {
2219 redisLog(REDIS_VERBOSE,
2220 "Error writing to client: %s", strerror(errno));
2221 freeClient(c);
2222 return;
2223 }
2224 break;
2225 }
2226
2227 totwritten += nwritten;
2228 offset = c->sentlen;
2229
2230 /* remove written robjs from c->reply */
2231 while (nwritten && listLength(c->reply)) {
2232 o = listNodeValue(listFirst(c->reply));
2233 objlen = sdslen(o->ptr);
2234
2235 if(nwritten >= objlen - offset) {
2236 listDelNode(c->reply, listFirst(c->reply));
2237 nwritten -= objlen - offset;
2238 c->sentlen = 0;
2239 } else {
2240 /* partial write */
2241 c->sentlen += nwritten;
2242 break;
2243 }
2244 offset = 0;
2245 }
2246 }
2247
2248 if (totwritten > 0)
2249 c->lastinteraction = time(NULL);
2250
2251 if (listLength(c->reply) == 0) {
2252 c->sentlen = 0;
2253 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2254 }
2255 }
2256
2257 static int qsortRedisCommands(const void *r1, const void *r2) {
2258 return strcasecmp(
2259 ((struct redisCommand*)r1)->name,
2260 ((struct redisCommand*)r2)->name);
2261 }
2262
2263 static void sortCommandTable() {
2264 /* Copy and sort the read-only version of the command table */
2265 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2266 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2267 qsort(commandTable,
2268 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2269 sizeof(struct redisCommand),qsortRedisCommands);
2270 }
2271
2272 static struct redisCommand *lookupCommand(char *name) {
2273 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2274 return bsearch(
2275 &tmp,
2276 commandTable,
2277 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2278 sizeof(struct redisCommand),
2279 qsortRedisCommands);
2280 }
2281
2282 /* resetClient prepare the client to process the next command */
2283 static void resetClient(redisClient *c) {
2284 freeClientArgv(c);
2285 c->bulklen = -1;
2286 c->multibulk = 0;
2287 }
2288
2289 /* Call() is the core of Redis execution of a command */
2290 static void call(redisClient *c, struct redisCommand *cmd) {
2291 long long dirty;
2292
2293 dirty = server.dirty;
2294 cmd->proc(c);
2295 dirty = server.dirty-dirty;
2296
2297 if (server.appendonly && dirty)
2298 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2299 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2300 listLength(server.slaves))
2301 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2302 if (listLength(server.monitors))
2303 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2304 server.stat_numcommands++;
2305 }
2306
2307 /* If this function gets called we already read a whole
2308 * command, argments are in the client argv/argc fields.
2309 * processCommand() execute the command or prepare the
2310 * server for a bulk read from the client.
2311 *
2312 * If 1 is returned the client is still alive and valid and
2313 * and other operations can be performed by the caller. Otherwise
2314 * if 0 is returned the client was destroied (i.e. after QUIT). */
2315 static int processCommand(redisClient *c) {
2316 struct redisCommand *cmd;
2317
2318 /* Free some memory if needed (maxmemory setting) */
2319 if (server.maxmemory) freeMemoryIfNeeded();
2320
2321 /* Handle the multi bulk command type. This is an alternative protocol
2322 * supported by Redis in order to receive commands that are composed of
2323 * multiple binary-safe "bulk" arguments. The latency of processing is
2324 * a bit higher but this allows things like multi-sets, so if this
2325 * protocol is used only for MSET and similar commands this is a big win. */
2326 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2327 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2328 if (c->multibulk <= 0) {
2329 resetClient(c);
2330 return 1;
2331 } else {
2332 decrRefCount(c->argv[c->argc-1]);
2333 c->argc--;
2334 return 1;
2335 }
2336 } else if (c->multibulk) {
2337 if (c->bulklen == -1) {
2338 if (((char*)c->argv[0]->ptr)[0] != '$') {
2339 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2340 resetClient(c);
2341 return 1;
2342 } else {
2343 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2344 decrRefCount(c->argv[0]);
2345 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2346 c->argc--;
2347 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2348 resetClient(c);
2349 return 1;
2350 }
2351 c->argc--;
2352 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2353 return 1;
2354 }
2355 } else {
2356 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2357 c->mbargv[c->mbargc] = c->argv[0];
2358 c->mbargc++;
2359 c->argc--;
2360 c->multibulk--;
2361 if (c->multibulk == 0) {
2362 robj **auxargv;
2363 int auxargc;
2364
2365 /* Here we need to swap the multi-bulk argc/argv with the
2366 * normal argc/argv of the client structure. */
2367 auxargv = c->argv;
2368 c->argv = c->mbargv;
2369 c->mbargv = auxargv;
2370
2371 auxargc = c->argc;
2372 c->argc = c->mbargc;
2373 c->mbargc = auxargc;
2374
2375 /* We need to set bulklen to something different than -1
2376 * in order for the code below to process the command without
2377 * to try to read the last argument of a bulk command as
2378 * a special argument. */
2379 c->bulklen = 0;
2380 /* continue below and process the command */
2381 } else {
2382 c->bulklen = -1;
2383 return 1;
2384 }
2385 }
2386 }
2387 /* -- end of multi bulk commands processing -- */
2388
2389 /* The QUIT command is handled as a special case. Normal command
2390 * procs are unable to close the client connection safely */
2391 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2392 freeClient(c);
2393 return 0;
2394 }
2395
2396 /* Now lookup the command and check ASAP about trivial error conditions
2397 * such wrong arity, bad command name and so forth. */
2398 cmd = lookupCommand(c->argv[0]->ptr);
2399 if (!cmd) {
2400 addReplySds(c,
2401 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2402 (char*)c->argv[0]->ptr));
2403 resetClient(c);
2404 return 1;
2405 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2406 (c->argc < -cmd->arity)) {
2407 addReplySds(c,
2408 sdscatprintf(sdsempty(),
2409 "-ERR wrong number of arguments for '%s' command\r\n",
2410 cmd->name));
2411 resetClient(c);
2412 return 1;
2413 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2414 /* This is a bulk command, we have to read the last argument yet. */
2415 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2416
2417 decrRefCount(c->argv[c->argc-1]);
2418 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2419 c->argc--;
2420 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2421 resetClient(c);
2422 return 1;
2423 }
2424 c->argc--;
2425 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2426 /* It is possible that the bulk read is already in the
2427 * buffer. Check this condition and handle it accordingly.
2428 * This is just a fast path, alternative to call processInputBuffer().
2429 * It's a good idea since the code is small and this condition
2430 * happens most of the times. */
2431 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2432 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2433 c->argc++;
2434 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2435 } else {
2436 /* Otherwise return... there is to read the last argument
2437 * from the socket. */
2438 return 1;
2439 }
2440 }
2441 /* Let's try to encode the bulk object to save space. */
2442 if (cmd->flags & REDIS_CMD_BULK)
2443 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2444
2445 /* Check if the user is authenticated */
2446 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2447 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2448 resetClient(c);
2449 return 1;
2450 }
2451
2452 /* Handle the maxmemory directive */
2453 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2454 zmalloc_used_memory() > server.maxmemory)
2455 {
2456 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2457 resetClient(c);
2458 return 1;
2459 }
2460
2461 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2462 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2463 &&
2464 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2465 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2466 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2467 resetClient(c);
2468 return 1;
2469 }
2470
2471 /* Exec the command */
2472 if (c->flags & REDIS_MULTI &&
2473 cmd->proc != execCommand && cmd->proc != discardCommand &&
2474 cmd->proc != multiCommand && cmd->proc != watchCommand)
2475 {
2476 queueMultiCommand(c,cmd);
2477 addReply(c,shared.queued);
2478 } else {
2479 if (server.vm_enabled && server.vm_max_threads > 0 &&
2480 blockClientOnSwappedKeys(c,cmd)) return 1;
2481 call(c,cmd);
2482 }
2483
2484 /* Prepare the client for the next command */
2485 resetClient(c);
2486 return 1;
2487 }
2488
2489 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2490 listNode *ln;
2491 listIter li;
2492 int outc = 0, j;
2493 robj **outv;
2494 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2495 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2496 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2497 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2498 robj *lenobj;
2499
2500 if (argc <= REDIS_STATIC_ARGS) {
2501 outv = static_outv;
2502 } else {
2503 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2504 }
2505
2506 lenobj = createObject(REDIS_STRING,
2507 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2508 lenobj->refcount = 0;
2509 outv[outc++] = lenobj;
2510 for (j = 0; j < argc; j++) {
2511 lenobj = createObject(REDIS_STRING,
2512 sdscatprintf(sdsempty(),"$%lu\r\n",
2513 (unsigned long) stringObjectLen(argv[j])));
2514 lenobj->refcount = 0;
2515 outv[outc++] = lenobj;
2516 outv[outc++] = argv[j];
2517 outv[outc++] = shared.crlf;
2518 }
2519
2520 /* Increment all the refcounts at start and decrement at end in order to
2521 * be sure to free objects if there is no slave in a replication state
2522 * able to be feed with commands */
2523 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2524 listRewind(slaves,&li);
2525 while((ln = listNext(&li))) {
2526 redisClient *slave = ln->value;
2527
2528 /* Don't feed slaves that are still waiting for BGSAVE to start */
2529 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2530
2531 /* Feed all the other slaves, MONITORs and so on */
2532 if (slave->slaveseldb != dictid) {
2533 robj *selectcmd;
2534
2535 switch(dictid) {
2536 case 0: selectcmd = shared.select0; break;
2537 case 1: selectcmd = shared.select1; break;
2538 case 2: selectcmd = shared.select2; break;
2539 case 3: selectcmd = shared.select3; break;
2540 case 4: selectcmd = shared.select4; break;
2541 case 5: selectcmd = shared.select5; break;
2542 case 6: selectcmd = shared.select6; break;
2543 case 7: selectcmd = shared.select7; break;
2544 case 8: selectcmd = shared.select8; break;
2545 case 9: selectcmd = shared.select9; break;
2546 default:
2547 selectcmd = createObject(REDIS_STRING,
2548 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2549 selectcmd->refcount = 0;
2550 break;
2551 }
2552 addReply(slave,selectcmd);
2553 slave->slaveseldb = dictid;
2554 }
2555 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2556 }
2557 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2558 if (outv != static_outv) zfree(outv);
2559 }
2560
2561 static sds sdscatrepr(sds s, char *p, size_t len) {
2562 s = sdscatlen(s,"\"",1);
2563 while(len--) {
2564 switch(*p) {
2565 case '\\':
2566 case '"':
2567 s = sdscatprintf(s,"\\%c",*p);
2568 break;
2569 case '\n': s = sdscatlen(s,"\\n",1); break;
2570 case '\r': s = sdscatlen(s,"\\r",1); break;
2571 case '\t': s = sdscatlen(s,"\\t",1); break;
2572 case '\a': s = sdscatlen(s,"\\a",1); break;
2573 case '\b': s = sdscatlen(s,"\\b",1); break;
2574 default:
2575 if (isprint(*p))
2576 s = sdscatprintf(s,"%c",*p);
2577 else
2578 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2579 break;
2580 }
2581 p++;
2582 }
2583 return sdscatlen(s,"\"",1);
2584 }
2585
2586 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2587 listNode *ln;
2588 listIter li;
2589 int j;
2590 sds cmdrepr = sdsnew("+");
2591 robj *cmdobj;
2592 struct timeval tv;
2593
2594 gettimeofday(&tv,NULL);
2595 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2596 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2597
2598 for (j = 0; j < argc; j++) {
2599 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2600 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2601 } else {
2602 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2603 sdslen(argv[j]->ptr));
2604 }
2605 if (j != argc-1)
2606 cmdrepr = sdscatlen(cmdrepr," ",1);
2607 }
2608 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2609 cmdobj = createObject(REDIS_STRING,cmdrepr);
2610
2611 listRewind(monitors,&li);
2612 while((ln = listNext(&li))) {
2613 redisClient *monitor = ln->value;
2614 addReply(monitor,cmdobj);
2615 }
2616 decrRefCount(cmdobj);
2617 }
2618
2619 static void processInputBuffer(redisClient *c) {
2620 again:
2621 /* Before to process the input buffer, make sure the client is not
2622 * waitig for a blocking operation such as BLPOP. Note that the first
2623 * iteration the client is never blocked, otherwise the processInputBuffer
2624 * would not be called at all, but after the execution of the first commands
2625 * in the input buffer the client may be blocked, and the "goto again"
2626 * will try to reiterate. The following line will make it return asap. */
2627 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2628 if (c->bulklen == -1) {
2629 /* Read the first line of the query */
2630 char *p = strchr(c->querybuf,'\n');
2631 size_t querylen;
2632
2633 if (p) {
2634 sds query, *argv;
2635 int argc, j;
2636
2637 query = c->querybuf;
2638 c->querybuf = sdsempty();
2639 querylen = 1+(p-(query));
2640 if (sdslen(query) > querylen) {
2641 /* leave data after the first line of the query in the buffer */
2642 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2643 }
2644 *p = '\0'; /* remove "\n" */
2645 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2646 sdsupdatelen(query);
2647
2648 /* Now we can split the query in arguments */
2649 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2650 sdsfree(query);
2651
2652 if (c->argv) zfree(c->argv);
2653 c->argv = zmalloc(sizeof(robj*)*argc);
2654
2655 for (j = 0; j < argc; j++) {
2656 if (sdslen(argv[j])) {
2657 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2658 c->argc++;
2659 } else {
2660 sdsfree(argv[j]);
2661 }
2662 }
2663 zfree(argv);
2664 if (c->argc) {
2665 /* Execute the command. If the client is still valid
2666 * after processCommand() return and there is something
2667 * on the query buffer try to process the next command. */
2668 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2669 } else {
2670 /* Nothing to process, argc == 0. Just process the query
2671 * buffer if it's not empty or return to the caller */
2672 if (sdslen(c->querybuf)) goto again;
2673 }
2674 return;
2675 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2676 redisLog(REDIS_VERBOSE, "Client protocol error");
2677 freeClient(c);
2678 return;
2679 }
2680 } else {
2681 /* Bulk read handling. Note that if we are at this point
2682 the client already sent a command terminated with a newline,
2683 we are reading the bulk data that is actually the last
2684 argument of the command. */
2685 int qbl = sdslen(c->querybuf);
2686
2687 if (c->bulklen <= qbl) {
2688 /* Copy everything but the final CRLF as final argument */
2689 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2690 c->argc++;
2691 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2692 /* Process the command. If the client is still valid after
2693 * the processing and there is more data in the buffer
2694 * try to parse it. */
2695 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2696 return;
2697 }
2698 }
2699 }
2700
2701 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2702 redisClient *c = (redisClient*) privdata;
2703 char buf[REDIS_IOBUF_LEN];
2704 int nread;
2705 REDIS_NOTUSED(el);
2706 REDIS_NOTUSED(mask);
2707
2708 nread = read(fd, buf, REDIS_IOBUF_LEN);
2709 if (nread == -1) {
2710 if (errno == EAGAIN) {
2711 nread = 0;
2712 } else {
2713 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2714 freeClient(c);
2715 return;
2716 }
2717 } else if (nread == 0) {
2718 redisLog(REDIS_VERBOSE, "Client closed connection");
2719 freeClient(c);
2720 return;
2721 }
2722 if (nread) {
2723 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2724 c->lastinteraction = time(NULL);
2725 } else {
2726 return;
2727 }
2728 processInputBuffer(c);
2729 }
2730
2731 static int selectDb(redisClient *c, int id) {
2732 if (id < 0 || id >= server.dbnum)
2733 return REDIS_ERR;
2734 c->db = &server.db[id];
2735 return REDIS_OK;
2736 }
2737
2738 static void *dupClientReplyValue(void *o) {
2739 incrRefCount((robj*)o);
2740 return o;
2741 }
2742
/* Match method for lists of string objects: two elements match when
 * equalStringObjects() says they are equal. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2746
2747 static redisClient *createClient(int fd) {
2748 redisClient *c = zmalloc(sizeof(*c));
2749
2750 anetNonBlock(NULL,fd);
2751 anetTcpNoDelay(NULL,fd);
2752 if (!c) return NULL;
2753 selectDb(c,0);
2754 c->fd = fd;
2755 c->querybuf = sdsempty();
2756 c->argc = 0;
2757 c->argv = NULL;
2758 c->bulklen = -1;
2759 c->multibulk = 0;
2760 c->mbargc = 0;
2761 c->mbargv = NULL;
2762 c->sentlen = 0;
2763 c->flags = 0;
2764 c->lastinteraction = time(NULL);
2765 c->authenticated = 0;
2766 c->replstate = REDIS_REPL_NONE;
2767 c->reply = listCreate();
2768 listSetFreeMethod(c->reply,decrRefCount);
2769 listSetDupMethod(c->reply,dupClientReplyValue);
2770 c->blocking_keys = NULL;
2771 c->blocking_keys_num = 0;
2772 c->io_keys = listCreate();
2773 c->watched_keys = listCreate();
2774 listSetFreeMethod(c->io_keys,decrRefCount);
2775 c->pubsub_channels = dictCreate(&setDictType,NULL);
2776 c->pubsub_patterns = listCreate();
2777 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2778 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2779 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2780 readQueryFromClient, c) == AE_ERR) {
2781 freeClient(c);
2782 return NULL;
2783 }
2784 listAddNodeTail(server.clients,c);
2785 initClientMultiState(c);
2786 return c;
2787 }
2788
2789 static void addReply(redisClient *c, robj *obj) {
2790 if (listLength(c->reply) == 0 &&
2791 (c->replstate == REDIS_REPL_NONE ||
2792 c->replstate == REDIS_REPL_ONLINE) &&
2793 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2794 sendReplyToClient, c) == AE_ERR) return;
2795
2796 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2797 obj = dupStringObject(obj);
2798 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2799 }
2800 listAddNodeTail(c->reply,getDecodedObject(obj));
2801 }
2802
2803 static void addReplySds(redisClient *c, sds s) {
2804 robj *o = createObject(REDIS_STRING,s);
2805 addReply(c,o);
2806 decrRefCount(o);
2807 }
2808
2809 static void addReplyDouble(redisClient *c, double d) {
2810 char buf[128];
2811
2812 snprintf(buf,sizeof(buf),"%.17g",d);
2813 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2814 (unsigned long) strlen(buf),buf));
2815 }
2816
2817 static void addReplyLongLong(redisClient *c, long long ll) {
2818 char buf[128];
2819 size_t len;
2820
2821 if (ll == 0) {
2822 addReply(c,shared.czero);
2823 return;
2824 } else if (ll == 1) {
2825 addReply(c,shared.cone);
2826 return;
2827 }
2828 buf[0] = ':';
2829 len = ll2string(buf+1,sizeof(buf)-1,ll);
2830 buf[len+1] = '\r';
2831 buf[len+2] = '\n';
2832 addReplySds(c,sdsnewlen(buf,len+3));
2833 }
2834
2835 static void addReplyUlong(redisClient *c, unsigned long ul) {
2836 char buf[128];
2837 size_t len;
2838
2839 if (ul == 0) {
2840 addReply(c,shared.czero);
2841 return;
2842 } else if (ul == 1) {
2843 addReply(c,shared.cone);
2844 return;
2845 }
2846 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2847 addReplySds(c,sdsnewlen(buf,len));
2848 }
2849
2850 static void addReplyBulkLen(redisClient *c, robj *obj) {
2851 size_t len, intlen;
2852 char buf[128];
2853
2854 if (obj->encoding == REDIS_ENCODING_RAW) {
2855 len = sdslen(obj->ptr);
2856 } else {
2857 long n = (long)obj->ptr;
2858
2859 /* Compute how many bytes will take this integer as a radix 10 string */
2860 len = 1;
2861 if (n < 0) {
2862 len++;
2863 n = -n;
2864 }
2865 while((n = n/10) != 0) {
2866 len++;
2867 }
2868 }
2869 buf[0] = '$';
2870 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2871 buf[intlen+1] = '\r';
2872 buf[intlen+2] = '\n';
2873 addReplySds(c,sdsnewlen(buf,intlen+3));
2874 }
2875
2876 static void addReplyBulk(redisClient *c, robj *obj) {
2877 addReplyBulkLen(c,obj);
2878 addReply(c,obj);
2879 addReply(c,shared.crlf);
2880 }
2881
2882 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2883 static void addReplyBulkCString(redisClient *c, char *s) {
2884 if (s == NULL) {
2885 addReply(c,shared.nullbulk);
2886 } else {
2887 robj *o = createStringObject(s,strlen(s));
2888 addReplyBulk(c,o);
2889 decrRefCount(o);
2890 }
2891 }
2892
2893 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2894 int cport, cfd;
2895 char cip[128];
2896 redisClient *c;
2897 REDIS_NOTUSED(el);
2898 REDIS_NOTUSED(mask);
2899 REDIS_NOTUSED(privdata);
2900
2901 cfd = anetAccept(server.neterr, fd, cip, &cport);
2902 if (cfd == AE_ERR) {
2903 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2904 return;
2905 }
2906 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2907 if ((c = createClient(cfd)) == NULL) {
2908 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2909 close(cfd); /* May be already closed, just ingore errors */
2910 return;
2911 }
2912 /* If maxclient directive is set and this is one client more... close the
2913 * connection. Note that we create the client instead to check before
2914 * for this condition, since now the socket is already set in nonblocking
2915 * mode and we can send an error for free using the Kernel I/O */
2916 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2917 char *err = "-ERR max number of clients reached\r\n";
2918
2919 /* That's a best effort error message, don't check write errors */
2920 if (write(c->fd,err,strlen(err)) == -1) {
2921 /* Nothing to do, Just to avoid the warning... */
2922 }
2923 freeClient(c);
2924 return;
2925 }
2926 server.stat_numconnections++;
2927 }
2928
2929 /* ======================= Redis objects implementation ===================== */
2930
2931 static robj *createObject(int type, void *ptr) {
2932 robj *o;
2933
2934 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2935 if (listLength(server.objfreelist)) {
2936 listNode *head = listFirst(server.objfreelist);
2937 o = listNodeValue(head);
2938 listDelNode(server.objfreelist,head);
2939 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2940 } else {
2941 if (server.vm_enabled) {
2942 pthread_mutex_unlock(&server.obj_freelist_mutex);
2943 o = zmalloc(sizeof(*o));
2944 } else {
2945 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2946 }
2947 }
2948 o->type = type;
2949 o->encoding = REDIS_ENCODING_RAW;
2950 o->ptr = ptr;
2951 o->refcount = 1;
2952 if (server.vm_enabled) {
2953 /* Note that this code may run in the context of an I/O thread
2954 * and accessing to server.unixtime in theory is an error
2955 * (no locks). But in practice this is safe, and even if we read
2956 * garbage Redis will not fail, as it's just a statistical info */
2957 o->vm.atime = server.unixtime;
2958 o->storage = REDIS_VM_MEMORY;
2959 }
2960 return o;
2961 }
2962
2963 static robj *createStringObject(char *ptr, size_t len) {
2964 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2965 }
2966
2967 static robj *createStringObjectFromLongLong(long long value) {
2968 robj *o;
2969 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2970 incrRefCount(shared.integers[value]);
2971 o = shared.integers[value];
2972 } else {
2973 if (value >= LONG_MIN && value <= LONG_MAX) {
2974 o = createObject(REDIS_STRING, NULL);
2975 o->encoding = REDIS_ENCODING_INT;
2976 o->ptr = (void*)((long)value);
2977 } else {
2978 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2979 }
2980 }
2981 return o;
2982 }
2983
2984 static robj *dupStringObject(robj *o) {
2985 assert(o->encoding == REDIS_ENCODING_RAW);
2986 return createStringObject(o->ptr,sdslen(o->ptr));
2987 }
2988
2989 static robj *createListObject(void) {
2990 list *l = listCreate();
2991
2992 listSetFreeMethod(l,decrRefCount);
2993 return createObject(REDIS_LIST,l);
2994 }
2995
2996 static robj *createSetObject(void) {
2997 dict *d = dictCreate(&setDictType,NULL);
2998 return createObject(REDIS_SET,d);
2999 }
3000
3001 static robj *createHashObject(void) {
3002 /* All the Hashes start as zipmaps. Will be automatically converted
3003 * into hash tables if there are enough elements or big elements
3004 * inside. */
3005 unsigned char *zm = zipmapNew();
3006 robj *o = createObject(REDIS_HASH,zm);
3007 o->encoding = REDIS_ENCODING_ZIPMAP;
3008 return o;
3009 }
3010
3011 static robj *createZsetObject(void) {
3012 zset *zs = zmalloc(sizeof(*zs));
3013
3014 zs->dict = dictCreate(&zsetDictType,NULL);
3015 zs->zsl = zslCreate();
3016 return createObject(REDIS_ZSET,zs);
3017 }
3018
3019 static void freeStringObject(robj *o) {
3020 if (o->encoding == REDIS_ENCODING_RAW) {
3021 sdsfree(o->ptr);
3022 }
3023 }
3024
3025 static void freeListObject(robj *o) {
3026 listRelease((list*) o->ptr);
3027 }
3028
3029 static void freeSetObject(robj *o) {
3030 dictRelease((dict*) o->ptr);
3031 }
3032
3033 static void freeZsetObject(robj *o) {
3034 zset *zs = o->ptr;
3035
3036 dictRelease(zs->dict);
3037 zslFree(zs->zsl);
3038 zfree(zs);
3039 }
3040
3041 static void freeHashObject(robj *o) {
3042 switch (o->encoding) {
3043 case REDIS_ENCODING_HT:
3044 dictRelease((dict*) o->ptr);
3045 break;
3046 case REDIS_ENCODING_ZIPMAP:
3047 zfree(o->ptr);
3048 break;
3049 default:
3050 redisPanic("Unknown hash encoding type");
3051 break;
3052 }
3053 }
3054
3055 static void incrRefCount(robj *o) {
3056 o->refcount++;
3057 }
3058
3059 static void decrRefCount(void *obj) {
3060 robj *o = obj;
3061
3062 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3063 /* Object is a key of a swapped out value, or in the process of being
3064 * loaded. */
3065 if (server.vm_enabled &&
3066 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3067 {
3068 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3069 redisAssert(o->type == REDIS_STRING);
3070 freeStringObject(o);
3071 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3072 pthread_mutex_lock(&server.obj_freelist_mutex);
3073 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3074 !listAddNodeHead(server.objfreelist,o))
3075 zfree(o);
3076 pthread_mutex_unlock(&server.obj_freelist_mutex);
3077 server.vm_stats_swapped_objects--;
3078 return;
3079 }
3080 /* Object is in memory, or in the process of being swapped out. */
3081 if (--(o->refcount) == 0) {
3082 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3083 vmCancelThreadedIOJob(obj);
3084 switch(o->type) {
3085 case REDIS_STRING: freeStringObject(o); break;
3086 case REDIS_LIST: freeListObject(o); break;
3087 case REDIS_SET: freeSetObject(o); break;
3088 case REDIS_ZSET: freeZsetObject(o); break;
3089 case REDIS_HASH: freeHashObject(o); break;
3090 default: redisPanic("Unknown object type"); break;
3091 }
3092 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3093 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3094 !listAddNodeHead(server.objfreelist,o))
3095 zfree(o);
3096 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3097 }
3098 }
3099
3100 static robj *lookupKey(redisDb *db, robj *key) {
3101 dictEntry *de = dictFind(db->dict,key);
3102 if (de) {
3103 robj *key = dictGetEntryKey(de);
3104 robj *val = dictGetEntryVal(de);
3105
3106 if (server.vm_enabled) {
3107 if (key->storage == REDIS_VM_MEMORY ||
3108 key->storage == REDIS_VM_SWAPPING)
3109 {
3110 /* If we were swapping the object out, stop it, this key
3111 * was requested. */
3112 if (key->storage == REDIS_VM_SWAPPING)
3113 vmCancelThreadedIOJob(key);
3114 /* Update the access time of the key for the aging algorithm. */
3115 key->vm.atime = server.unixtime;
3116 } else {
3117 int notify = (key->storage == REDIS_VM_LOADING);
3118
3119 /* Our value was swapped on disk. Bring it at home. */
3120 redisAssert(val == NULL);
3121 val = vmLoadObject(key);
3122 dictGetEntryVal(de) = val;
3123
3124 /* Clients blocked by the VM subsystem may be waiting for
3125 * this key... */
3126 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3127 }
3128 }
3129 return val;
3130 } else {
3131 return NULL;
3132 }
3133 }
3134
3135 static robj *lookupKeyRead(redisDb *db, robj *key) {
3136 expireIfNeeded(db,key);
3137 return lookupKey(db,key);
3138 }
3139
3140 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3141 deleteIfVolatile(db,key);
3142 touchWatchedKey(db,key);
3143 return lookupKey(db,key);
3144 }
3145
3146 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3147 robj *o = lookupKeyRead(c->db, key);
3148 if (!o) addReply(c,reply);
3149 return o;
3150 }
3151
3152 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3153 robj *o = lookupKeyWrite(c->db, key);
3154 if (!o) addReply(c,reply);
3155 return o;
3156 }
3157
3158 static int checkType(redisClient *c, robj *o, int type) {
3159 if (o->type != type) {
3160 addReply(c,shared.wrongtypeerr);
3161 return 1;
3162 }
3163 return 0;
3164 }
3165
3166 static int deleteKey(redisDb *db, robj *key) {
3167 int retval;
3168
3169 /* We need to protect key from destruction: after the first dictDelete()
3170 * it may happen that 'key' is no longer valid if we don't increment
3171 * it's count. This may happen when we get the object reference directly
3172 * from the hash table with dictRandomKey() or dict iterators */
3173 incrRefCount(key);
3174 if (dictSize(db->expires)) dictDelete(db->expires,key);
3175 retval = dictDelete(db->dict,key);
3176 decrRefCount(key);
3177
3178 return retval == DICT_OK;
3179 }
3180
3181 /* Check if the nul-terminated string 's' can be represented by a long
3182 * (that is, is a number that fits into long without any other space or
3183 * character before or after the digits).
3184 *
3185 * If so, the function returns REDIS_OK and *longval is set to the value
3186 * of the number. Otherwise REDIS_ERR is returned */
3187 static int isStringRepresentableAsLong(sds s, long *longval) {
3188 char buf[32], *endptr;
3189 long value;
3190 int slen;
3191
3192 value = strtol(s, &endptr, 10);
3193 if (endptr[0] != '\0') return REDIS_ERR;
3194 slen = ll2string(buf,32,value);
3195
3196 /* If the number converted back into a string is not identical
3197 * then it's not possible to encode the string as integer */
3198 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3199 if (longval) *longval = value;
3200 return REDIS_OK;
3201 }
3202
3203 /* Try to encode a string object in order to save space */
3204 static robj *tryObjectEncoding(robj *o) {
3205 long value;
3206 sds s = o->ptr;
3207
3208 if (o->encoding != REDIS_ENCODING_RAW)
3209 return o; /* Already encoded */
3210
3211 /* It's not safe to encode shared objects: shared objects can be shared
3212 * everywhere in the "object space" of Redis. Encoded objects can only
3213 * appear as "values" (and not, for instance, as keys) */
3214 if (o->refcount > 1) return o;
3215
3216 /* Currently we try to encode only strings */
3217 redisAssert(o->type == REDIS_STRING);
3218
3219 /* Check if we can represent this string as a long integer */
3220 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3221
3222 /* Ok, this object can be encoded */
3223 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3224 decrRefCount(o);
3225 incrRefCount(shared.integers[value]);
3226 return shared.integers[value];
3227 } else {
3228 o->encoding = REDIS_ENCODING_INT;
3229 sdsfree(o->ptr);
3230 o->ptr = (void*) value;
3231 return o;
3232 }
3233 }
3234
3235 /* Get a decoded version of an encoded object (returned as a new object).
3236 * If the object is already raw-encoded just increment the ref count. */
3237 static robj *getDecodedObject(robj *o) {
3238 robj *dec;
3239
3240 if (o->encoding == REDIS_ENCODING_RAW) {
3241 incrRefCount(o);
3242 return o;
3243 }
3244 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3245 char buf[32];
3246
3247 ll2string(buf,32,(long)o->ptr);
3248 dec = createStringObject(buf,strlen(buf));
3249 return dec;
3250 } else {
3251 redisPanic("Unknown encoding type");
3252 }
3253 }
3254
3255 /* Compare two string objects via strcmp() or alike.
3256 * Note that the objects may be integer-encoded. In such a case we
3257 * use ll2string() to get a string representation of the numbers on the stack
3258 * and compare the strings, it's much faster than calling getDecodedObject().
3259 *
3260 * Important note: if objects are not integer encoded, but binary-safe strings,
3261 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3262 * binary safe. */
3263 static int compareStringObjects(robj *a, robj *b) {
3264 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3265 char bufa[128], bufb[128], *astr, *bstr;
3266 int bothsds = 1;
3267
3268 if (a == b) return 0;
3269 if (a->encoding != REDIS_ENCODING_RAW) {
3270 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3271 astr = bufa;
3272 bothsds = 0;
3273 } else {
3274 astr = a->ptr;
3275 }
3276 if (b->encoding != REDIS_ENCODING_RAW) {
3277 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3278 bstr = bufb;
3279 bothsds = 0;
3280 } else {
3281 bstr = b->ptr;
3282 }
3283 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3284 }
3285
3286 /* Equal string objects return 1 if the two objects are the same from the
3287 * point of view of a string comparison, otherwise 0 is returned. Note that
3288 * this function is faster then checking for (compareStringObject(a,b) == 0)
3289 * because it can perform some more optimization. */
3290 static int equalStringObjects(robj *a, robj *b) {
3291 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3292 return a->ptr == b->ptr;
3293 } else {
3294 return compareStringObjects(a,b) == 0;
3295 }
3296 }
3297
3298 static size_t stringObjectLen(robj *o) {
3299 redisAssert(o->type == REDIS_STRING);
3300 if (o->encoding == REDIS_ENCODING_RAW) {
3301 return sdslen(o->ptr);
3302 } else {
3303 char buf[32];
3304
3305 return ll2string(buf,32,(long)o->ptr);
3306 }
3307 }
3308
3309 static int getDoubleFromObject(robj *o, double *target) {
3310 double value;
3311 char *eptr;
3312
3313 if (o == NULL) {
3314 value = 0;
3315 } else {
3316 redisAssert(o->type == REDIS_STRING);
3317 if (o->encoding == REDIS_ENCODING_RAW) {
3318 value = strtod(o->ptr, &eptr);
3319 if (eptr[0] != '\0') return REDIS_ERR;
3320 } else if (o->encoding == REDIS_ENCODING_INT) {
3321 value = (long)o->ptr;
3322 } else {
3323 redisPanic("Unknown string encoding");
3324 }
3325 }
3326
3327 *target = value;
3328 return REDIS_OK;
3329 }
3330
3331 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3332 double value;
3333 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3334 if (msg != NULL) {
3335 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3336 } else {
3337 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3338 }
3339 return REDIS_ERR;
3340 }
3341
3342 *target = value;
3343 return REDIS_OK;
3344 }
3345
3346 static int getLongLongFromObject(robj *o, long long *target) {
3347 long long value;
3348 char *eptr;
3349
3350 if (o == NULL) {
3351 value = 0;
3352 } else {
3353 redisAssert(o->type == REDIS_STRING);
3354 if (o->encoding == REDIS_ENCODING_RAW) {
3355 value = strtoll(o->ptr, &eptr, 10);
3356 if (eptr[0] != '\0') return REDIS_ERR;
3357 } else if (o->encoding == REDIS_ENCODING_INT) {
3358 value = (long)o->ptr;
3359 } else {
3360 redisPanic("Unknown string encoding");
3361 }
3362 }
3363
3364 *target = value;
3365 return REDIS_OK;
3366 }
3367
3368 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3369 long long value;
3370 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3371 if (msg != NULL) {
3372 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3373 } else {
3374 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3375 }
3376 return REDIS_ERR;
3377 }
3378
3379 *target = value;
3380 return REDIS_OK;
3381 }
3382
3383 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3384 long long value;
3385
3386 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3387 if (value < LONG_MIN || value > LONG_MAX) {
3388 if (msg != NULL) {
3389 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3390 } else {
3391 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3392 }
3393 return REDIS_ERR;
3394 }
3395
3396 *target = value;
3397 return REDIS_OK;
3398 }
3399
3400 /*============================ RDB saving/loading =========================== */
3401
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3406
/* Write the time 't' to 'fp' as a 4 byte integer (the value is converted
 * to int32_t and written in the byte order of the host).
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3412
3413 /* check rdbLoadLen() comments for more info */
3414 static int rdbSaveLen(FILE *fp, uint32_t len) {
3415 unsigned char buf[2];
3416
3417 if (len < (1<<6)) {
3418 /* Save a 6 bit len */
3419 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3420 if (fwrite(buf,1,1,fp) == 0) return -1;
3421 } else if (len < (1<<14)) {
3422 /* Save a 14 bit len */
3423 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3424 buf[1] = len&0xFF;
3425 if (fwrite(buf,2,1,fp) == 0) return -1;
3426 } else {
3427 /* Save a 32 bit len */
3428 buf[0] = (REDIS_RDB_32BITLEN<<6);
3429 if (fwrite(buf,1,1,fp) == 0) return -1;
3430 len = htonl(len);
3431 if (fwrite(&len,4,1,fp) == 0) return -1;
3432 }
3433 return 0;
3434 }
3435
3436 /* Encode 'value' as an integer if possible (if integer will fit the
3437 * supported range). If the function sucessful encoded the integer
3438 * then the (up to 5 bytes) encoded representation is written in the
3439 * string pointed by 'enc' and the length is returned. Otherwise
3440 * 0 is returned. */
3441 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3442 /* Finally check if it fits in our ranges */
3443 if (value >= -(1<<7) && value <= (1<<7)-1) {
3444 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3445 enc[1] = value&0xFF;
3446 return 2;
3447 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3448 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3449 enc[1] = value&0xFF;
3450 enc[2] = (value>>8)&0xFF;
3451 return 3;
3452 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3453 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3454 enc[1] = value&0xFF;
3455 enc[2] = (value>>8)&0xFF;
3456 enc[3] = (value>>16)&0xFF;
3457 enc[4] = (value>>24)&0xFF;
3458 return 5;
3459 } else {
3460 return 0;
3461 }
3462 }
3463
/* String objects in the form "2391" "-100", without any space and with a
 * value that fits in an 8, 16 or 32 bit signed integer, can be encoded as
 * integers to save space.
 *
 * 's' is the string (it is parsed with strtoll(), so it is expected to be
 * nul-terminated) and 'len' its length. Returns the length of the encoding
 * written into 'enc', or 0 if the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char canonical[32], *end;
    long long parsed;

    /* Check if it's possible to encode this value as a number */
    parsed = strtoll(s, &end, 10);
    if (*end != '\0') return 0;

    /* Convert the number back into a string: unless the result is
     * identical to the input, the input can't be encoded as an integer. */
    ll2string(canonical,32,parsed);
    if (strlen(canonical) != len) return 0;
    if (memcmp(canonical,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3482
3483 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3484 size_t comprlen, outlen;
3485 unsigned char byte;
3486 void *out;
3487
3488 /* We require at least four bytes compression for this to be worth it */
3489 if (len <= 4) return 0;
3490 outlen = len-4;
3491 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3492 comprlen = lzf_compress(s, len, out, outlen);
3493 if (comprlen == 0) {
3494 zfree(out);
3495 return 0;
3496 }
3497 /* Data compressed! Let's save it on disk */
3498 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3499 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3500 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3501 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3502 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3503 zfree(out);
3504 return comprlen;
3505
3506 writeerr:
3507 zfree(out);
3508 return -1;
3509 }
3510
3511 /* Save a string objet as [len][data] on disk. If the object is a string
3512 * representation of an integer value we try to safe it in a special form */
3513 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3514 int enclen;
3515
3516 /* Try integer encoding */
3517 if (len <= 11) {
3518 unsigned char buf[5];
3519 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3520 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3521 return 0;
3522 }
3523 }
3524
3525 /* Try LZF compression - under 20 bytes it's unable to compress even
3526 * aaaaaaaaaaaaaaaaaa so skip it */
3527 if (server.rdbcompression && len > 20) {
3528 int retval;
3529
3530 retval = rdbSaveLzfStringObject(fp,s,len);
3531 if (retval == -1) return -1;
3532 if (retval > 0) return 0;
3533 /* retval == 0 means data can't be compressed, save the old way */
3534 }
3535
3536 /* Store verbatim */
3537 if (rdbSaveLen(fp,len) == -1) return -1;
3538 if (len && fwrite(s,len,1,fp) == 0) return -1;
3539 return 0;
3540 }
3541
3542 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3543 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3544 int retval;
3545
3546 /* Avoid to decode the object, then encode it again, if the
3547 * object is alrady integer encoded. */
3548 if (obj->encoding == REDIS_ENCODING_INT) {
3549 long val = (long) obj->ptr;
3550 unsigned char buf[5];
3551 int enclen;
3552
3553 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3554 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3555 return 0;
3556 }
3557 /* otherwise... fall throught and continue with the usual
3558 * code path. */
3559 }
3560
3561 /* Avoid incr/decr ref count business when possible.
3562 * This plays well with copy-on-write given that we are probably
3563 * in a child process (BGSAVE). Also this makes sure key objects
3564 * of swapped objects are not incRefCount-ed (an assert does not allow
3565 * this in order to avoid bugs) */
3566 if (obj->encoding != REDIS_ENCODING_RAW) {
3567 obj = getDecodedObject(obj);
3568 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3569 decrRefCount(obj);
3570 } else {
3571 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3572 }
3573 return retval;
3574 }
3575
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1, so only sizeof(buf)-1 bytes
             * are available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3618
3619 /* Save a Redis object. */
3620 static int rdbSaveObject(FILE *fp, robj *o) {
3621 if (o->type == REDIS_STRING) {
3622 /* Save a string value */
3623 if (rdbSaveStringObject(fp,o) == -1) return -1;
3624 } else if (o->type == REDIS_LIST) {
3625 /* Save a list value */
3626 list *list = o->ptr;
3627 listIter li;
3628 listNode *ln;
3629
3630 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3631 listRewind(list,&li);
3632 while((ln = listNext(&li))) {
3633 robj *eleobj = listNodeValue(ln);
3634
3635 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3636 }
3637 } else if (o->type == REDIS_SET) {
3638 /* Save a set value */
3639 dict *set = o->ptr;
3640 dictIterator *di = dictGetIterator(set);
3641 dictEntry *de;
3642
3643 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3644 while((de = dictNext(di)) != NULL) {
3645 robj *eleobj = dictGetEntryKey(de);
3646
3647 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3648 }
3649 dictReleaseIterator(di);
3650 } else if (o->type == REDIS_ZSET) {
3651 /* Save a set value */
3652 zset *zs = o->ptr;
3653 dictIterator *di = dictGetIterator(zs->dict);
3654 dictEntry *de;
3655
3656 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3657 while((de = dictNext(di)) != NULL) {
3658 robj *eleobj = dictGetEntryKey(de);
3659 double *score = dictGetEntryVal(de);
3660
3661 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3662 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3663 }
3664 dictReleaseIterator(di);
3665 } else if (o->type == REDIS_HASH) {
3666 /* Save a hash value */
3667 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3668 unsigned char *p = zipmapRewind(o->ptr);
3669 unsigned int count = zipmapLen(o->ptr);
3670 unsigned char *key, *val;
3671 unsigned int klen, vlen;
3672
3673 if (rdbSaveLen(fp,count) == -1) return -1;
3674 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3675 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3676 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3677 }
3678 } else {
3679 dictIterator *di = dictGetIterator(o->ptr);
3680 dictEntry *de;
3681
3682 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3683 while((de = dictNext(di)) != NULL) {
3684 robj *key = dictGetEntryKey(de);
3685 robj *val = dictGetEntryVal(de);
3686
3687 if (rdbSaveStringObject(fp,key) == -1) return -1;
3688 if (rdbSaveStringObject(fp,val) == -1) return -1;
3689 }
3690 dictReleaseIterator(di);
3691 }
3692 } else {
3693 redisPanic("Unknown object type");
3694 }
3695 return 0;
3696 }
3697
3698 /* Return the length the object will have on disk if saved with
3699 * the rdbSaveObject() function. Currently we use a trick to get
3700 * this length with very little changes to the code. In the future
3701 * we could switch to a faster solution. */
3702 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3703 if (fp == NULL) fp = server.devnull;
3704 rewind(fp);
3705 assert(rdbSaveObject(fp,o) != 1);
3706 return ftello(fp);
3707 }
3708
3709 /* Return the number of pages required to save this object in the swap file */
3710 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3711 off_t bytes = rdbSavedObjectLen(o,fp);
3712
3713 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3714 }
3715
3716 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3717 static int rdbSave(char *filename) {
3718 dictIterator *di = NULL;
3719 dictEntry *de;
3720 FILE *fp;
3721 char tmpfile[256];
3722 int j;
3723 time_t now = time(NULL);
3724
3725 /* Wait for I/O therads to terminate, just in case this is a
3726 * foreground-saving, to avoid seeking the swap file descriptor at the
3727 * same time. */
3728 if (server.vm_enabled)
3729 waitEmptyIOJobsQueue();
3730
3731 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3732 fp = fopen(tmpfile,"w");
3733 if (!fp) {
3734 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3735 return REDIS_ERR;
3736 }
3737 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3738 for (j = 0; j < server.dbnum; j++) {
3739 redisDb *db = server.db+j;
3740 dict *d = db->dict;
3741 if (dictSize(d) == 0) continue;
3742 di = dictGetIterator(d);
3743 if (!di) {
3744 fclose(fp);
3745 return REDIS_ERR;
3746 }
3747
3748 /* Write the SELECT DB opcode */
3749 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3750 if (rdbSaveLen(fp,j) == -1) goto werr;
3751
3752 /* Iterate this DB writing every entry */
3753 while((de = dictNext(di)) != NULL) {
3754 robj *key = dictGetEntryKey(de);
3755 robj *o = dictGetEntryVal(de);
3756 time_t expiretime = getExpire(db,key);
3757
3758 /* Save the expire time */
3759 if (expiretime != -1) {
3760 /* If this key is already expired skip it */
3761 if (expiretime < now) continue;
3762 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3763 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3764 }
3765 /* Save the key and associated value. This requires special
3766 * handling if the value is swapped out. */
3767 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3768 key->storage == REDIS_VM_SWAPPING) {
3769 /* Save type, key, value */
3770 if (rdbSaveType(fp,o->type) == -1) goto werr;
3771 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3772 if (rdbSaveObject(fp,o) == -1) goto werr;
3773 } else {
3774 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3775 robj *po;
3776 /* Get a preview of the object in memory */
3777 po = vmPreviewObject(key);
3778 /* Save type, key, value */
3779 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3780 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3781 if (rdbSaveObject(fp,po) == -1) goto werr;
3782 /* Remove the loaded object from memory */
3783 decrRefCount(po);
3784 }
3785 }
3786 dictReleaseIterator(di);
3787 }
3788 /* EOF opcode */
3789 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3790
3791 /* Make sure data will not remain on the OS's output buffers */
3792 fflush(fp);
3793 fsync(fileno(fp));
3794 fclose(fp);
3795
3796 /* Use RENAME to make sure the DB file is changed atomically only
3797 * if the generate DB file is ok. */
3798 if (rename(tmpfile,filename) == -1) {
3799 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3800 unlink(tmpfile);
3801 return REDIS_ERR;
3802 }
3803 redisLog(REDIS_NOTICE,"DB saved on disk");
3804 server.dirty = 0;
3805 server.lastsave = time(NULL);
3806 return REDIS_OK;
3807
3808 werr:
3809 fclose(fp);
3810 unlink(tmpfile);
3811 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3812 if (di) dictReleaseIterator(di);
3813 return REDIS_ERR;
3814 }
3815
3816 static int rdbSaveBackground(char *filename) {
3817 pid_t childpid;
3818
3819 if (server.bgsavechildpid != -1) return REDIS_ERR;
3820 if (server.vm_enabled) waitEmptyIOJobsQueue();
3821 if ((childpid = fork()) == 0) {
3822 /* Child */
3823 if (server.vm_enabled) vmReopenSwapFile();
3824 close(server.fd);
3825 if (rdbSave(filename) == REDIS_OK) {
3826 _exit(0);
3827 } else {
3828 _exit(1);
3829 }
3830 } else {
3831 /* Parent */
3832 if (childpid == -1) {
3833 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3834 strerror(errno));
3835 return REDIS_ERR;
3836 }
3837 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3838 server.bgsavechildpid = childpid;
3839 updateDictResizePolicy();
3840 return REDIS_OK;
3841 }
3842 return REDIS_OK; /* unreached */
3843 }
3844
/* Remove the temp RDB file named after the pid 'childpid'
 * ("temp-<pid>.rdb" in the current directory). */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3851
/* Read a single type byte from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if it can't be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3857
/* Read a 32 bit signed time value from 'fp', stored in the byte order of
 * the host. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,4,1,fp) != 1) return -1;
    return (time_t) raw;
}
3863
3864 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3865 * of this file for a description of how this are stored on disk.
3866 *
3867 * isencoded is set to 1 if the readed length is not actually a length but
3868 * an "encoding type", check the above comments for more info */
3869 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3870 unsigned char buf[2];
3871 uint32_t len;
3872 int type;
3873
3874 if (isencoded) *isencoded = 0;
3875 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3876 type = (buf[0]&0xC0)>>6;
3877 if (type == REDIS_RDB_6BITLEN) {
3878 /* Read a 6 bit len */
3879 return buf[0]&0x3F;
3880 } else if (type == REDIS_RDB_ENCVAL) {
3881 /* Read a 6 bit len encoding type */
3882 if (isencoded) *isencoded = 1;
3883 return buf[0]&0x3F;
3884 } else if (type == REDIS_RDB_14BITLEN) {
3885 /* Read a 14 bit len */
3886 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3887 return ((buf[0]&0x3F)<<8)|buf[1];
3888 } else {
3889 /* Read a 32 bit len */
3890 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3891 return ntohl(len);
3892 }
3893 }
3894
3895 /* Load an integer-encoded object from file 'fp', with the specified
3896 * encoding type 'enctype'. If encode is true the function may return
3897 * an integer-encoded object as reply, otherwise the returned object
3898 * will always be encoded as a raw string. */
3899 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3900 unsigned char enc[4];
3901 long long val;
3902
3903 if (enctype == REDIS_RDB_ENC_INT8) {
3904 if (fread(enc,1,1,fp) == 0) return NULL;
3905 val = (signed char)enc[0];
3906 } else if (enctype == REDIS_RDB_ENC_INT16) {
3907 uint16_t v;
3908 if (fread(enc,2,1,fp) == 0) return NULL;
3909 v = enc[0]|(enc[1]<<8);
3910 val = (int16_t)v;
3911 } else if (enctype == REDIS_RDB_ENC_INT32) {
3912 uint32_t v;
3913 if (fread(enc,4,1,fp) == 0) return NULL;
3914 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3915 val = (int32_t)v;
3916 } else {
3917 val = 0; /* anti-warning */
3918 redisPanic("Unknown RDB integer encoding type");
3919 }
3920 if (encode)
3921 return createStringObjectFromLongLong(val);
3922 else
3923 return createObject(REDIS_STRING,sdsfromlonglong(val));
3924 }
3925
3926 static robj *rdbLoadLzfStringObject(FILE*fp) {
3927 unsigned int len, clen;
3928 unsigned char *c = NULL;
3929 sds val = NULL;
3930
3931 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3932 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3933 if ((c = zmalloc(clen)) == NULL) goto err;
3934 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3935 if (fread(c,clen,1,fp) == 0) goto err;
3936 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3937 zfree(c);
3938 return createObject(REDIS_STRING,val);
3939 err:
3940 zfree(c);
3941 sdsfree(val);
3942 return NULL;
3943 }
3944
3945 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3946 int isencoded;
3947 uint32_t len;
3948 sds val;
3949
3950 len = rdbLoadLen(fp,&isencoded);
3951 if (isencoded) {
3952 switch(len) {
3953 case REDIS_RDB_ENC_INT8:
3954 case REDIS_RDB_ENC_INT16:
3955 case REDIS_RDB_ENC_INT32:
3956 return rdbLoadIntegerObject(fp,len,encode);
3957 case REDIS_RDB_ENC_LZF:
3958 return rdbLoadLzfStringObject(fp);
3959 default:
3960 redisPanic("Unknown RDB encoding type");
3961 }
3962 }
3963
3964 if (len == REDIS_RDB_LENERR) return NULL;
3965 val = sdsnewlen(NULL,len);
3966 if (len && fread(val,len,1,fp) == 0) {
3967 sdsfree(val);
3968 return NULL;
3969 }
3970 return createObject(REDIS_STRING,val);
3971 }
3972
3973 static robj *rdbLoadStringObject(FILE *fp) {
3974 return rdbGenericLoadStringObject(fp,0);
3975 }
3976
3977 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3978 return rdbGenericLoadStringObject(fp,1);
3979 }
3980
3981 /* For information about double serialization check rdbSaveDoubleValue() */
3982 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3983 char buf[128];
3984 unsigned char len;
3985
3986 if (fread(&len,1,1,fp) == 0) return -1;
3987 switch(len) {
3988 case 255: *val = R_NegInf; return 0;
3989 case 254: *val = R_PosInf; return 0;
3990 case 253: *val = R_Nan; return 0;
3991 default:
3992 if (fread(buf,len,1,fp) == 0) return -1;
3993 buf[len] = '\0';
3994 sscanf(buf, "%lg", val);
3995 return 0;
3996 }
3997 }
3998
3999 /* Load a Redis object of the specified type from the specified file.
4000 * On success a newly allocated object is returned, otherwise NULL. */
4001 static robj *rdbLoadObject(int type, FILE *fp) {
4002 robj *o;
4003
4004 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4005 if (type == REDIS_STRING) {
4006 /* Read string value */
4007 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4008 o = tryObjectEncoding(o);
4009 } else if (type == REDIS_LIST || type == REDIS_SET) {
4010 /* Read list/set value */
4011 uint32_t listlen;
4012
4013 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4014 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
4015 /* It's faster to expand the dict to the right size asap in order
4016 * to avoid rehashing */
4017 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4018 dictExpand(o->ptr,listlen);
4019 /* Load every single element of the list/set */
4020 while(listlen--) {
4021 robj *ele;
4022
4023 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4024 ele = tryObjectEncoding(ele);
4025 if (type == REDIS_LIST) {
4026 listAddNodeTail((list*)o->ptr,ele);
4027 } else {
4028 dictAdd((dict*)o->ptr,ele,NULL);
4029 }
4030 }
4031 } else if (type == REDIS_ZSET) {
4032 /* Read list/set value */
4033 size_t zsetlen;
4034 zset *zs;
4035
4036 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4037 o = createZsetObject();
4038 zs = o->ptr;
4039 /* Load every single element of the list/set */
4040 while(zsetlen--) {
4041 robj *ele;
4042 double *score = zmalloc(sizeof(double));
4043
4044 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4045 ele = tryObjectEncoding(ele);
4046 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4047 dictAdd(zs->dict,ele,score);
4048 zslInsert(zs->zsl,*score,ele);
4049 incrRefCount(ele); /* added to skiplist */
4050 }
4051 } else if (type == REDIS_HASH) {
4052 size_t hashlen;
4053
4054 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4055 o = createHashObject();
4056 /* Too many entries? Use an hash table. */
4057 if (hashlen > server.hash_max_zipmap_entries)
4058 convertToRealHash(o);
4059 /* Load every key/value, then set it into the zipmap or hash
4060 * table, as needed. */
4061 while(hashlen--) {
4062 robj *key, *val;
4063
4064 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4065 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4066 /* If we are using a zipmap and there are too big values
4067 * the object is converted to real hash table encoding. */
4068 if (o->encoding != REDIS_ENCODING_HT &&
4069 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4070 sdslen(val->ptr) > server.hash_max_zipmap_value))
4071 {
4072 convertToRealHash(o);
4073 }
4074
4075 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4076 unsigned char *zm = o->ptr;
4077
4078 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4079 val->ptr,sdslen(val->ptr),NULL);
4080 o->ptr = zm;
4081 decrRefCount(key);
4082 decrRefCount(val);
4083 } else {
4084 key = tryObjectEncoding(key);
4085 val = tryObjectEncoding(val);
4086 dictAdd((dict*)o->ptr,key,val);
4087 }
4088 }
4089 } else {
4090 redisPanic("Unknown object type");
4091 }
4092 return o;
4093 }
4094
4095 static int rdbLoad(char *filename) {
4096 FILE *fp;
4097 uint32_t dbid;
4098 int type, retval, rdbver;
4099 int swap_all_values = 0;
4100 dict *d = server.db[0].dict;
4101 redisDb *db = server.db+0;
4102 char buf[1024];
4103 time_t expiretime, now = time(NULL);
4104 long long loadedkeys = 0;
4105
4106 fp = fopen(filename,"r");
4107 if (!fp) return REDIS_ERR;
4108 if (fread(buf,9,1,fp) == 0) goto eoferr;
4109 buf[9] = '\0';
4110 if (memcmp(buf,"REDIS",5) != 0) {
4111 fclose(fp);
4112 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4113 return REDIS_ERR;
4114 }
4115 rdbver = atoi(buf+5);
4116 if (rdbver != 1) {
4117 fclose(fp);
4118 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4119 return REDIS_ERR;
4120 }
4121 while(1) {
4122 robj *key, *val;
4123
4124 expiretime = -1;
4125 /* Read type. */
4126 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4127 if (type == REDIS_EXPIRETIME) {
4128 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4129 /* We read the time so we need to read the object type again */
4130 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4131 }
4132 if (type == REDIS_EOF) break;
4133 /* Handle SELECT DB opcode as a special case */
4134 if (type == REDIS_SELECTDB) {
4135 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4136 goto eoferr;
4137 if (dbid >= (unsigned)server.dbnum) {
4138 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4139 exit(1);
4140 }
4141 db = server.db+dbid;
4142 d = db->dict;
4143 continue;
4144 }
4145 /* Read key */
4146 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4147 /* Read value */
4148 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4149 /* Check if the key already expired */
4150 if (expiretime != -1 && expiretime < now) {
4151 decrRefCount(key);
4152 decrRefCount(val);
4153 continue;
4154 }
4155 /* Add the new object in the hash table */
4156 retval = dictAdd(d,key,val);
4157 if (retval == DICT_ERR) {
4158 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4159 exit(1);
4160 }
4161 loadedkeys++;
4162 /* Set the expire time if needed */
4163 if (expiretime != -1) setExpire(db,key,expiretime);
4164
4165 /* Handle swapping while loading big datasets when VM is on */
4166
4167 /* If we detecter we are hopeless about fitting something in memory
4168 * we just swap every new key on disk. Directly...
4169 * Note that's important to check for this condition before resorting
4170 * to random sampling, otherwise we may try to swap already
4171 * swapped keys. */
4172 if (swap_all_values) {
4173 dictEntry *de = dictFind(d,key);
4174
4175 /* de may be NULL since the key already expired */
4176 if (de) {
4177 key = dictGetEntryKey(de);
4178 val = dictGetEntryVal(de);
4179
4180 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4181 dictGetEntryVal(de) = NULL;
4182 }
4183 }
4184 continue;
4185 }
4186
4187 /* If we have still some hope of having some value fitting memory
4188 * then we try random sampling. */
4189 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4190 while (zmalloc_used_memory() > server.vm_max_memory) {
4191 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4192 }
4193 if (zmalloc_used_memory() > server.vm_max_memory)
4194 swap_all_values = 1; /* We are already using too much mem */
4195 }
4196 }
4197 fclose(fp);
4198 return REDIS_OK;
4199
4200 eoferr: /* unexpected end of file is handled here with a fatal exit */
4201 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4202 exit(1);
4203 return REDIS_ERR; /* Just to avoid warning */
4204 }
4205
4206 /*================================== Shutdown =============================== */
4207 static int prepareForShutdown() {
4208 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4209 /* Kill the saving child if there is a background saving in progress.
4210 We want to avoid race conditions, for instance our saving child may
4211 overwrite the synchronous saving did by SHUTDOWN. */
4212 if (server.bgsavechildpid != -1) {
4213 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4214 kill(server.bgsavechildpid,SIGKILL);
4215 rdbRemoveTempFile(server.bgsavechildpid);
4216 }
4217 if (server.appendonly) {
4218 /* Append only file: fsync() the AOF and exit */
4219 aof_fsync(server.appendfd);
4220 if (server.vm_enabled) unlink(server.vm_swap_file);
4221 } else {
4222 /* Snapshotting. Perform a SYNC SAVE and exit */
4223 if (rdbSave(server.dbfilename) == REDIS_OK) {
4224 if (server.daemonize)
4225 unlink(server.pidfile);
4226 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4227 } else {
4228 /* Ooops.. error saving! The best we can do is to continue
4229 * operating. Note that if there was a background saving process,
4230 * in the next cron() Redis will be notified that the background
4231 * saving aborted, handling special stuff like slaves pending for
4232 * synchronization... */
4233 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4234 return REDIS_ERR;
4235 }
4236 }
4237 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4238 return REDIS_OK;
4239 }
4240
4241 /*================================== Commands =============================== */
4242
4243 static void authCommand(redisClient *c) {
4244 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4245 c->authenticated = 1;
4246 addReply(c,shared.ok);
4247 } else {
4248 c->authenticated = 0;
4249 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4250 }
4251 }
4252
4253 static void pingCommand(redisClient *c) {
4254 addReply(c,shared.pong);
4255 }
4256
4257 static void echoCommand(redisClient *c) {
4258 addReplyBulk(c,c->argv[1]);
4259 }
4260
4261 /*=================================== Strings =============================== */
4262
4263 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4264 int retval;
4265 long seconds = 0; /* initialized to avoid an harmness warning */
4266
4267 if (expire) {
4268 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4269 return;
4270 if (seconds <= 0) {
4271 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4272 return;
4273 }
4274 }
4275
4276 touchWatchedKey(c->db,key);
4277 if (nx) deleteIfVolatile(c->db,key);
4278 retval = dictAdd(c->db->dict,key,val);
4279 if (retval == DICT_ERR) {
4280 if (!nx) {
4281 /* If the key is about a swapped value, we want a new key object
4282 * to overwrite the old. So we delete the old key in the database.
4283 * This will also make sure that swap pages about the old object
4284 * will be marked as free. */
4285 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4286 incrRefCount(key);
4287 dictReplace(c->db->dict,key,val);
4288 incrRefCount(val);
4289 } else {
4290 addReply(c,shared.czero);
4291 return;
4292 }
4293 } else {
4294 incrRefCount(key);
4295 incrRefCount(val);
4296 }
4297 server.dirty++;
4298 removeExpire(c->db,key);
4299 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4300 addReply(c, nx ? shared.cone : shared.ok);
4301 }
4302
4303 static void setCommand(redisClient *c) {
4304 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4305 }
4306
4307 static void setnxCommand(redisClient *c) {
4308 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4309 }
4310
4311 static void setexCommand(redisClient *c) {
4312 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4313 }
4314
4315 static int getGenericCommand(redisClient *c) {
4316 robj *o;
4317
4318 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4319 return REDIS_OK;
4320
4321 if (o->type != REDIS_STRING) {
4322 addReply(c,shared.wrongtypeerr);
4323 return REDIS_ERR;
4324 } else {
4325 addReplyBulk(c,o);
4326 return REDIS_OK;
4327 }
4328 }
4329
4330 static void getCommand(redisClient *c) {
4331 getGenericCommand(c);
4332 }
4333
4334 static void getsetCommand(redisClient *c) {
4335 if (getGenericCommand(c) == REDIS_ERR) return;
4336 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4337 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4338 } else {
4339 incrRefCount(c->argv[1]);
4340 }
4341 incrRefCount(c->argv[2]);
4342 server.dirty++;
4343 removeExpire(c->db,c->argv[1]);
4344 }
4345
4346 static void mgetCommand(redisClient *c) {
4347 int j;
4348
4349 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4350 for (j = 1; j < c->argc; j++) {
4351 robj *o = lookupKeyRead(c->db,c->argv[j]);
4352 if (o == NULL) {
4353 addReply(c,shared.nullbulk);
4354 } else {
4355 if (o->type != REDIS_STRING) {
4356 addReply(c,shared.nullbulk);
4357 } else {
4358 addReplyBulk(c,o);
4359 }
4360 }
4361 }
4362 }
4363
4364 static void msetGenericCommand(redisClient *c, int nx) {
4365 int j, busykeys = 0;
4366
4367 if ((c->argc % 2) == 0) {
4368 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4369 return;
4370 }
4371 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4372 * set nothing at all if at least one already key exists. */
4373 if (nx) {
4374 for (j = 1; j < c->argc; j += 2) {
4375 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4376 busykeys++;
4377 }
4378 }
4379 }
4380 if (busykeys) {
4381 addReply(c, shared.czero);
4382 return;
4383 }
4384
4385 for (j = 1; j < c->argc; j += 2) {
4386 int retval;
4387
4388 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4389 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4390 if (retval == DICT_ERR) {
4391 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4392 incrRefCount(c->argv[j+1]);
4393 } else {
4394 incrRefCount(c->argv[j]);
4395 incrRefCount(c->argv[j+1]);
4396 }
4397 removeExpire(c->db,c->argv[j]);
4398 }
4399 server.dirty += (c->argc-1)/2;
4400 addReply(c, nx ? shared.cone : shared.ok);
4401 }
4402
4403 static void msetCommand(redisClient *c) {
4404 msetGenericCommand(c,0);
4405 }
4406
4407 static void msetnxCommand(redisClient *c) {
4408 msetGenericCommand(c,1);
4409 }
4410
4411 static void incrDecrCommand(redisClient *c, long long incr) {
4412 long long value;
4413 int retval;
4414 robj *o;
4415
4416 o = lookupKeyWrite(c->db,c->argv[1]);
4417 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4418 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4419
4420 value += incr;
4421 o = createStringObjectFromLongLong(value);
4422 retval = dictAdd(c->db->dict,c->argv[1],o);
4423 if (retval == DICT_ERR) {
4424 dictReplace(c->db->dict,c->argv[1],o);
4425 removeExpire(c->db,c->argv[1]);
4426 } else {
4427 incrRefCount(c->argv[1]);
4428 }
4429 server.dirty++;
4430 addReply(c,shared.colon);
4431 addReply(c,o);
4432 addReply(c,shared.crlf);
4433 }
4434
4435 static void incrCommand(redisClient *c) {
4436 incrDecrCommand(c,1);
4437 }
4438
4439 static void decrCommand(redisClient *c) {
4440 incrDecrCommand(c,-1);
4441 }
4442
4443 static void incrbyCommand(redisClient *c) {
4444 long long incr;
4445
4446 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4447 incrDecrCommand(c,incr);
4448 }
4449
4450 static void decrbyCommand(redisClient *c) {
4451 long long incr;
4452
4453 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4454 incrDecrCommand(c,-incr);
4455 }
4456
4457 static void appendCommand(redisClient *c) {
4458 int retval;
4459 size_t totlen;
4460 robj *o;
4461
4462 o = lookupKeyWrite(c->db,c->argv[1]);
4463 if (o == NULL) {
4464 /* Create the key */
4465 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4466 incrRefCount(c->argv[1]);
4467 incrRefCount(c->argv[2]);
4468 totlen = stringObjectLen(c->argv[2]);
4469 } else {
4470 dictEntry *de;
4471
4472 de = dictFind(c->db->dict,c->argv[1]);
4473 assert(de != NULL);
4474
4475 o = dictGetEntryVal(de);
4476 if (o->type != REDIS_STRING) {
4477 addReply(c,shared.wrongtypeerr);
4478 return;
4479 }
4480 /* If the object is specially encoded or shared we have to make
4481 * a copy */
4482 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4483 robj *decoded = getDecodedObject(o);
4484
4485 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4486 decrRefCount(decoded);
4487 dictReplace(c->db->dict,c->argv[1],o);
4488 }
4489 /* APPEND! */
4490 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4491 o->ptr = sdscatlen(o->ptr,
4492 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4493 } else {
4494 o->ptr = sdscatprintf(o->ptr, "%ld",
4495 (unsigned long) c->argv[2]->ptr);
4496 }
4497 totlen = sdslen(o->ptr);
4498 }
4499 server.dirty++;
4500 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4501 }
4502
4503 static void substrCommand(redisClient *c) {
4504 robj *o;
4505 long start = atoi(c->argv[2]->ptr);
4506 long end = atoi(c->argv[3]->ptr);
4507 size_t rangelen, strlen;
4508 sds range;
4509
4510 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4511 checkType(c,o,REDIS_STRING)) return;
4512
4513 o = getDecodedObject(o);
4514 strlen = sdslen(o->ptr);
4515
4516 /* convert negative indexes */
4517 if (start < 0) start = strlen+start;
4518 if (end < 0) end = strlen+end;
4519 if (start < 0) start = 0;
4520 if (end < 0) end = 0;
4521
4522 /* indexes sanity checks */
4523 if (start > end || (size_t)start >= strlen) {
4524 /* Out of range start or start > end result in null reply */
4525 addReply(c,shared.nullbulk);
4526 decrRefCount(o);
4527 return;
4528 }
4529 if ((size_t)end >= strlen) end = strlen-1;
4530 rangelen = (end-start)+1;
4531
4532 /* Return the result */
4533 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4534 range = sdsnewlen((char*)o->ptr+start,rangelen);
4535 addReplySds(c,range);
4536 addReply(c,shared.crlf);
4537 decrRefCount(o);
4538 }
4539
4540 /* ========================= Type agnostic commands ========================= */
4541
4542 static void delCommand(redisClient *c) {
4543 int deleted = 0, j;
4544
4545 for (j = 1; j < c->argc; j++) {
4546 if (deleteKey(c->db,c->argv[j])) {
4547 touchWatchedKey(c->db,c->argv[j]);
4548 server.dirty++;
4549 deleted++;
4550 }
4551 }
4552 addReplyLongLong(c,deleted);
4553 }
4554
4555 static void existsCommand(redisClient *c) {
4556 expireIfNeeded(c->db,c->argv[1]);
4557 if (dictFind(c->db->dict,c->argv[1])) {
4558 addReply(c, shared.cone);
4559 } else {
4560 addReply(c, shared.czero);
4561 }
4562 }
4563
4564 static void selectCommand(redisClient *c) {
4565 int id = atoi(c->argv[1]->ptr);
4566
4567 if (selectDb(c,id) == REDIS_ERR) {
4568 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4569 } else {
4570 addReply(c,shared.ok);
4571 }
4572 }
4573
4574 static void randomkeyCommand(redisClient *c) {
4575 dictEntry *de;
4576 robj *key;
4577
4578 while(1) {
4579 de = dictGetRandomKey(c->db->dict);
4580 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4581 }
4582
4583 if (de == NULL) {
4584 addReply(c,shared.nullbulk);
4585 return;
4586 }
4587
4588 key = dictGetEntryKey(de);
4589 if (server.vm_enabled) {
4590 key = dupStringObject(key);
4591 addReplyBulk(c,key);
4592 decrRefCount(key);
4593 } else {
4594 addReplyBulk(c,key);
4595 }
4596 }
4597
4598 static void keysCommand(redisClient *c) {
4599 dictIterator *di;
4600 dictEntry *de;
4601 sds pattern = c->argv[1]->ptr;
4602 int plen = sdslen(pattern);
4603 unsigned long numkeys = 0;
4604 robj *lenobj = createObject(REDIS_STRING,NULL);
4605
4606 di = dictGetIterator(c->db->dict);
4607 addReply(c,lenobj);
4608 decrRefCount(lenobj);
4609 while((de = dictNext(di)) != NULL) {
4610 robj *keyobj = dictGetEntryKey(de);
4611
4612 sds key = keyobj->ptr;
4613 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4614 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4615 if (expireIfNeeded(c->db,keyobj) == 0) {
4616 addReplyBulk(c,keyobj);
4617 numkeys++;
4618 }
4619 }
4620 }
4621 dictReleaseIterator(di);
4622 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4623 }
4624
4625 static void dbsizeCommand(redisClient *c) {
4626 addReplySds(c,
4627 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4628 }
4629
4630 static void lastsaveCommand(redisClient *c) {
4631 addReplySds(c,
4632 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4633 }
4634
4635 static void typeCommand(redisClient *c) {
4636 robj *o;
4637 char *type;
4638
4639 o = lookupKeyRead(c->db,c->argv[1]);
4640 if (o == NULL) {
4641 type = "+none";
4642 } else {
4643 switch(o->type) {
4644 case REDIS_STRING: type = "+string"; break;
4645 case REDIS_LIST: type = "+list"; break;
4646 case REDIS_SET: type = "+set"; break;
4647 case REDIS_ZSET: type = "+zset"; break;
4648 case REDIS_HASH: type = "+hash"; break;
4649 default: type = "+unknown"; break;
4650 }
4651 }
4652 addReplySds(c,sdsnew(type));
4653 addReply(c,shared.crlf);
4654 }
4655
4656 static void saveCommand(redisClient *c) {
4657 if (server.bgsavechildpid != -1) {
4658 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4659 return;
4660 }
4661 if (rdbSave(server.dbfilename) == REDIS_OK) {
4662 addReply(c,shared.ok);
4663 } else {
4664 addReply(c,shared.err);
4665 }
4666 }
4667
4668 static void bgsaveCommand(redisClient *c) {
4669 if (server.bgsavechildpid != -1) {
4670 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4671 return;
4672 }
4673 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4674 char *status = "+Background saving started\r\n";
4675 addReplySds(c,sdsnew(status));
4676 } else {
4677 addReply(c,shared.err);
4678 }
4679 }
4680
4681 static void shutdownCommand(redisClient *c) {
4682 if (prepareForShutdown() == REDIS_OK)
4683 exit(0);
4684 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4685 }
4686
4687 static void renameGenericCommand(redisClient *c, int nx) {
4688 robj *o;
4689
4690 /* To use the same key as src and dst is probably an error */
4691 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4692 addReply(c,shared.sameobjecterr);
4693 return;
4694 }
4695
4696 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4697 return;
4698
4699 incrRefCount(o);
4700 deleteIfVolatile(c->db,c->argv[2]);
4701 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4702 if (nx) {
4703 decrRefCount(o);
4704 addReply(c,shared.czero);
4705 return;
4706 }
4707 dictReplace(c->db->dict,c->argv[2],o);
4708 } else {
4709 incrRefCount(c->argv[2]);
4710 }
4711 deleteKey(c->db,c->argv[1]);
4712 touchWatchedKey(c->db,c->argv[2]);
4713 server.dirty++;
4714 addReply(c,nx ? shared.cone : shared.ok);
4715 }
4716
4717 static void renameCommand(redisClient *c) {
4718 renameGenericCommand(c,0);
4719 }
4720
4721 static void renamenxCommand(redisClient *c) {
4722 renameGenericCommand(c,1);
4723 }
4724
4725 static void moveCommand(redisClient *c) {
4726 robj *o;
4727 redisDb *src, *dst;
4728 int srcid;
4729
4730 /* Obtain source and target DB pointers */
4731 src = c->db;
4732 srcid = c->db->id;
4733 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4734 addReply(c,shared.outofrangeerr);
4735 return;
4736 }
4737 dst = c->db;
4738 selectDb(c,srcid); /* Back to the source DB */
4739
4740 /* If the user is moving using as target the same
4741 * DB as the source DB it is probably an error. */
4742 if (src == dst) {
4743 addReply(c,shared.sameobjecterr);
4744 return;
4745 }
4746
4747 /* Check if the element exists and get a reference */
4748 o = lookupKeyWrite(c->db,c->argv[1]);
4749 if (!o) {
4750 addReply(c,shared.czero);
4751 return;
4752 }
4753
4754 /* Try to add the element to the target DB */
4755 deleteIfVolatile(dst,c->argv[1]);
4756 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4757 addReply(c,shared.czero);
4758 return;
4759 }
4760 incrRefCount(c->argv[1]);
4761 incrRefCount(o);
4762
4763 /* OK! key moved, free the entry in the source DB */
4764 deleteKey(src,c->argv[1]);
4765 server.dirty++;
4766 addReply(c,shared.cone);
4767 }
4768
4769 /* =================================== Lists ================================ */
4770 static void pushGenericCommand(redisClient *c, int where) {
4771 robj *lobj;
4772 list *list;
4773
4774 lobj = lookupKeyWrite(c->db,c->argv[1]);
4775 if (lobj == NULL) {
4776 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4777 addReply(c,shared.cone);
4778 return;
4779 }
4780 lobj = createListObject();
4781 list = lobj->ptr;
4782 if (where == REDIS_HEAD) {
4783 listAddNodeHead(list,c->argv[2]);
4784 } else {
4785 listAddNodeTail(list,c->argv[2]);
4786 }
4787 dictAdd(c->db->dict,c->argv[1],lobj);
4788 incrRefCount(c->argv[1]);
4789 incrRefCount(c->argv[2]);
4790 } else {
4791 if (lobj->type != REDIS_LIST) {
4792 addReply(c,shared.wrongtypeerr);
4793 return;
4794 }
4795 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4796 addReply(c,shared.cone);
4797 return;
4798 }
4799 list = lobj->ptr;
4800 if (where == REDIS_HEAD) {
4801 listAddNodeHead(list,c->argv[2]);
4802 } else {
4803 listAddNodeTail(list,c->argv[2]);
4804 }
4805 incrRefCount(c->argv[2]);
4806 }
4807 server.dirty++;
4808 addReplyLongLong(c,listLength(list));
4809 }
4810
4811 static void lpushCommand(redisClient *c) {
4812 pushGenericCommand(c,REDIS_HEAD);
4813 }
4814
4815 static void rpushCommand(redisClient *c) {
4816 pushGenericCommand(c,REDIS_TAIL);
4817 }
4818
4819 static void llenCommand(redisClient *c) {
4820 robj *o;
4821 list *l;
4822
4823 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4824 checkType(c,o,REDIS_LIST)) return;
4825
4826 l = o->ptr;
4827 addReplyUlong(c,listLength(l));
4828 }
4829
4830 static void lindexCommand(redisClient *c) {
4831 robj *o;
4832 int index = atoi(c->argv[2]->ptr);
4833 list *list;
4834 listNode *ln;
4835
4836 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4837 checkType(c,o,REDIS_LIST)) return;
4838 list = o->ptr;
4839
4840 ln = listIndex(list, index);
4841 if (ln == NULL) {
4842 addReply(c,shared.nullbulk);
4843 } else {
4844 robj *ele = listNodeValue(ln);
4845 addReplyBulk(c,ele);
4846 }
4847 }
4848
4849 static void lsetCommand(redisClient *c) {
4850 robj *o;
4851 int index = atoi(c->argv[2]->ptr);
4852 list *list;
4853 listNode *ln;
4854
4855 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4856 checkType(c,o,REDIS_LIST)) return;
4857 list = o->ptr;
4858
4859 ln = listIndex(list, index);
4860 if (ln == NULL) {
4861 addReply(c,shared.outofrangeerr);
4862 } else {
4863 robj *ele = listNodeValue(ln);
4864
4865 decrRefCount(ele);
4866 listNodeValue(ln) = c->argv[3];
4867 incrRefCount(c->argv[3]);
4868 addReply(c,shared.ok);
4869 server.dirty++;
4870 }
4871 }
4872
4873 static void popGenericCommand(redisClient *c, int where) {
4874 robj *o;
4875 list *list;
4876 listNode *ln;
4877
4878 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4879 checkType(c,o,REDIS_LIST)) return;
4880 list = o->ptr;
4881
4882 if (where == REDIS_HEAD)
4883 ln = listFirst(list);
4884 else
4885 ln = listLast(list);
4886
4887 if (ln == NULL) {
4888 addReply(c,shared.nullbulk);
4889 } else {
4890 robj *ele = listNodeValue(ln);
4891 addReplyBulk(c,ele);
4892 listDelNode(list,ln);
4893 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4894 server.dirty++;
4895 }
4896 }
4897
4898 static void lpopCommand(redisClient *c) {
4899 popGenericCommand(c,REDIS_HEAD);
4900 }
4901
4902 static void rpopCommand(redisClient *c) {
4903 popGenericCommand(c,REDIS_TAIL);
4904 }
4905
4906 static void lrangeCommand(redisClient *c) {
4907 robj *o;
4908 int start = atoi(c->argv[2]->ptr);
4909 int end = atoi(c->argv[3]->ptr);
4910 int llen;
4911 int rangelen, j;
4912 list *list;
4913 listNode *ln;
4914 robj *ele;
4915
4916 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4917 || checkType(c,o,REDIS_LIST)) return;
4918 list = o->ptr;
4919 llen = listLength(list);
4920
4921 /* convert negative indexes */
4922 if (start < 0) start = llen+start;
4923 if (end < 0) end = llen+end;
4924 if (start < 0) start = 0;
4925 if (end < 0) end = 0;
4926
4927 /* indexes sanity checks */
4928 if (start > end || start >= llen) {
4929 /* Out of range start or start > end result in empty list */
4930 addReply(c,shared.emptymultibulk);
4931 return;
4932 }
4933 if (end >= llen) end = llen-1;
4934 rangelen = (end-start)+1;
4935
4936 /* Return the result in form of a multi-bulk reply */
4937 ln = listIndex(list, start);
4938 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4939 for (j = 0; j < rangelen; j++) {
4940 ele = listNodeValue(ln);
4941 addReplyBulk(c,ele);
4942 ln = ln->next;
4943 }
4944 }
4945
4946 static void ltrimCommand(redisClient *c) {
4947 robj *o;
4948 int start = atoi(c->argv[2]->ptr);
4949 int end = atoi(c->argv[3]->ptr);
4950 int llen;
4951 int j, ltrim, rtrim;
4952 list *list;
4953 listNode *ln;
4954
4955 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4956 checkType(c,o,REDIS_LIST)) return;
4957 list = o->ptr;
4958 llen = listLength(list);
4959
4960 /* convert negative indexes */
4961 if (start < 0) start = llen+start;
4962 if (end < 0) end = llen+end;
4963 if (start < 0) start = 0;
4964 if (end < 0) end = 0;
4965
4966 /* indexes sanity checks */
4967 if (start > end || start >= llen) {
4968 /* Out of range start or start > end result in empty list */
4969 ltrim = llen;
4970 rtrim = 0;
4971 } else {
4972 if (end >= llen) end = llen-1;
4973 ltrim = start;
4974 rtrim = llen-end-1;
4975 }
4976
4977 /* Remove list elements to perform the trim */
4978 for (j = 0; j < ltrim; j++) {
4979 ln = listFirst(list);
4980 listDelNode(list,ln);
4981 }
4982 for (j = 0; j < rtrim; j++) {
4983 ln = listLast(list);
4984 listDelNode(list,ln);
4985 }
4986 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4987 server.dirty++;
4988 addReply(c,shared.ok);
4989 }
4990
4991 static void lremCommand(redisClient *c) {
4992 robj *o;
4993 list *list;
4994 listNode *ln, *next;
4995 int toremove = atoi(c->argv[2]->ptr);
4996 int removed = 0;
4997 int fromtail = 0;
4998
4999 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5000 checkType(c,o,REDIS_LIST)) return;
5001 list = o->ptr;
5002
5003 if (toremove < 0) {
5004 toremove = -toremove;
5005 fromtail = 1;
5006 }
5007 ln = fromtail ? list->tail : list->head;
5008 while (ln) {
5009 robj *ele = listNodeValue(ln);
5010
5011 next = fromtail ? ln->prev : ln->next;
5012 if (equalStringObjects(ele,c->argv[3])) {
5013 listDelNode(list,ln);
5014 server.dirty++;
5015 removed++;
5016 if (toremove && removed == toremove) break;
5017 }
5018 ln = next;
5019 }
5020 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
5021 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5022 }
5023
5024 /* This is the semantic of this command:
5025 * RPOPLPUSH srclist dstlist:
5026 * IF LLEN(srclist) > 0
5027 * element = RPOP srclist
5028 * LPUSH dstlist element
5029 * RETURN element
5030 * ELSE
5031 * RETURN nil
5032 * END
5033 * END
5034 *
5035 * The idea is to be able to get an element from a list in a reliable way
5036 * since the element is not just returned but pushed against another list
5037 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5038 */
5039 static void rpoplpushcommand(redisClient *c) {
5040 robj *sobj;
5041 list *srclist;
5042 listNode *ln;
5043
5044 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5045 checkType(c,sobj,REDIS_LIST)) return;
5046 srclist = sobj->ptr;
5047 ln = listLast(srclist);
5048
5049 if (ln == NULL) {
5050 addReply(c,shared.nullbulk);
5051 } else {
5052 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5053 robj *ele = listNodeValue(ln);
5054 list *dstlist;
5055
5056 if (dobj && dobj->type != REDIS_LIST) {
5057 addReply(c,shared.wrongtypeerr);
5058 return;
5059 }
5060
5061 /* Add the element to the target list (unless it's directly
5062 * passed to some BLPOP-ing client */
5063 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5064 if (dobj == NULL) {
5065 /* Create the list if the key does not exist */
5066 dobj = createListObject();
5067 dictAdd(c->db->dict,c->argv[2],dobj);
5068 incrRefCount(c->argv[2]);
5069 }
5070 dstlist = dobj->ptr;
5071 listAddNodeHead(dstlist,ele);
5072 incrRefCount(ele);
5073 }
5074
5075 /* Send the element to the client as reply as well */
5076 addReplyBulk(c,ele);
5077
5078 /* Finally remove the element from the source list */
5079 listDelNode(srclist,ln);
5080 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5081 server.dirty++;
5082 }
5083 }
5084
5085 /* ==================================== Sets ================================ */
5086
5087 static void saddCommand(redisClient *c) {
5088 robj *set;
5089
5090 set = lookupKeyWrite(c->db,c->argv[1]);
5091 if (set == NULL) {
5092 set = createSetObject();
5093 dictAdd(c->db->dict,c->argv[1],set);
5094 incrRefCount(c->argv[1]);
5095 } else {
5096 if (set->type != REDIS_SET) {
5097 addReply(c,shared.wrongtypeerr);
5098 return;
5099 }
5100 }
5101 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5102 incrRefCount(c->argv[2]);
5103 server.dirty++;
5104 addReply(c,shared.cone);
5105 } else {
5106 addReply(c,shared.czero);
5107 }
5108 }
5109
5110 static void sremCommand(redisClient *c) {
5111 robj *set;
5112
5113 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5114 checkType(c,set,REDIS_SET)) return;
5115
5116 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5117 server.dirty++;
5118 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5119 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5120 addReply(c,shared.cone);
5121 } else {
5122 addReply(c,shared.czero);
5123 }
5124 }
5125
5126 static void smoveCommand(redisClient *c) {
5127 robj *srcset, *dstset;
5128
5129 srcset = lookupKeyWrite(c->db,c->argv[1]);
5130 dstset = lookupKeyWrite(c->db,c->argv[2]);
5131
5132 /* If the source key does not exist return 0, if it's of the wrong type
5133 * raise an error */
5134 if (srcset == NULL || srcset->type != REDIS_SET) {
5135 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5136 return;
5137 }
5138 /* Error if the destination key is not a set as well */
5139 if (dstset && dstset->type != REDIS_SET) {
5140 addReply(c,shared.wrongtypeerr);
5141 return;
5142 }
5143 /* Remove the element from the source set */
5144 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5145 /* Key not found in the src set! return zero */
5146 addReply(c,shared.czero);
5147 return;
5148 }
5149 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5150 deleteKey(c->db,c->argv[1]);
5151 server.dirty++;
5152 /* Add the element to the destination set */
5153 if (!dstset) {
5154 dstset = createSetObject();
5155 dictAdd(c->db->dict,c->argv[2],dstset);
5156 incrRefCount(c->argv[2]);
5157 }
5158 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5159 incrRefCount(c->argv[3]);
5160 addReply(c,shared.cone);
5161 }
5162
5163 static void sismemberCommand(redisClient *c) {
5164 robj *set;
5165
5166 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5167 checkType(c,set,REDIS_SET)) return;
5168
5169 if (dictFind(set->ptr,c->argv[2]))
5170 addReply(c,shared.cone);
5171 else
5172 addReply(c,shared.czero);
5173 }
5174
5175 static void scardCommand(redisClient *c) {
5176 robj *o;
5177 dict *s;
5178
5179 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5180 checkType(c,o,REDIS_SET)) return;
5181
5182 s = o->ptr;
5183 addReplyUlong(c,dictSize(s));
5184 }
5185
5186 static void spopCommand(redisClient *c) {
5187 robj *set;
5188 dictEntry *de;
5189
5190 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5191 checkType(c,set,REDIS_SET)) return;
5192
5193 de = dictGetRandomKey(set->ptr);
5194 if (de == NULL) {
5195 addReply(c,shared.nullbulk);
5196 } else {
5197 robj *ele = dictGetEntryKey(de);
5198
5199 addReplyBulk(c,ele);
5200 dictDelete(set->ptr,ele);
5201 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5202 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5203 server.dirty++;
5204 }
5205 }
5206
5207 static void srandmemberCommand(redisClient *c) {
5208 robj *set;
5209 dictEntry *de;
5210
5211 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5212 checkType(c,set,REDIS_SET)) return;
5213
5214 de = dictGetRandomKey(set->ptr);
5215 if (de == NULL) {
5216 addReply(c,shared.nullbulk);
5217 } else {
5218 robj *ele = dictGetEntryKey(de);
5219
5220 addReplyBulk(c,ele);
5221 }
5222 }
5223
5224 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5225 dict **d1 = (void*) s1, **d2 = (void*) s2;
5226
5227 return dictSize(*d1)-dictSize(*d2);
5228 }
5229
5230 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5231 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5232 dictIterator *di;
5233 dictEntry *de;
5234 robj *lenobj = NULL, *dstset = NULL;
5235 unsigned long j, cardinality = 0;
5236
5237 for (j = 0; j < setsnum; j++) {
5238 robj *setobj;
5239
5240 setobj = dstkey ?
5241 lookupKeyWrite(c->db,setskeys[j]) :
5242 lookupKeyRead(c->db,setskeys[j]);
5243 if (!setobj) {
5244 zfree(dv);
5245 if (dstkey) {
5246 if (deleteKey(c->db,dstkey))
5247 server.dirty++;
5248 addReply(c,shared.czero);
5249 } else {
5250 addReply(c,shared.emptymultibulk);
5251 }
5252 return;
5253 }
5254 if (setobj->type != REDIS_SET) {
5255 zfree(dv);
5256 addReply(c,shared.wrongtypeerr);
5257 return;
5258 }
5259 dv[j] = setobj->ptr;
5260 }
5261 /* Sort sets from the smallest to largest, this will improve our
5262 * algorithm's performace */
5263 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5264
5265 /* The first thing we should output is the total number of elements...
5266 * since this is a multi-bulk write, but at this stage we don't know
5267 * the intersection set size, so we use a trick, append an empty object
5268 * to the output list and save the pointer to later modify it with the
5269 * right length */
5270 if (!dstkey) {
5271 lenobj = createObject(REDIS_STRING,NULL);
5272 addReply(c,lenobj);
5273 decrRefCount(lenobj);
5274 } else {
5275 /* If we have a target key where to store the resulting set
5276 * create this key with an empty set inside */
5277 dstset = createSetObject();
5278 }
5279
5280 /* Iterate all the elements of the first (smallest) set, and test
5281 * the element against all the other sets, if at least one set does
5282 * not include the element it is discarded */
5283 di = dictGetIterator(dv[0]);
5284
5285 while((de = dictNext(di)) != NULL) {
5286 robj *ele;
5287
5288 for (j = 1; j < setsnum; j++)
5289 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5290 if (j != setsnum)
5291 continue; /* at least one set does not contain the member */
5292 ele = dictGetEntryKey(de);
5293 if (!dstkey) {
5294 addReplyBulk(c,ele);
5295 cardinality++;
5296 } else {
5297 dictAdd(dstset->ptr,ele,NULL);
5298 incrRefCount(ele);
5299 }
5300 }
5301 dictReleaseIterator(di);
5302
5303 if (dstkey) {
5304 /* Store the resulting set into the target, if the intersection
5305 * is not an empty set. */
5306 deleteKey(c->db,dstkey);
5307 if (dictSize((dict*)dstset->ptr) > 0) {
5308 dictAdd(c->db->dict,dstkey,dstset);
5309 incrRefCount(dstkey);
5310 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5311 } else {
5312 decrRefCount(dstset);
5313 addReply(c,shared.czero);
5314 }
5315 server.dirty++;
5316 } else {
5317 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5318 }
5319 zfree(dv);
5320 }
5321
5322 static void sinterCommand(redisClient *c) {
5323 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5324 }
5325
5326 static void sinterstoreCommand(redisClient *c) {
5327 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5328 }
5329
5330 #define REDIS_OP_UNION 0
5331 #define REDIS_OP_DIFF 1
5332 #define REDIS_OP_INTER 2
5333
5334 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5335 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5336 dictIterator *di;
5337 dictEntry *de;
5338 robj *dstset = NULL;
5339 int j, cardinality = 0;
5340
5341 for (j = 0; j < setsnum; j++) {
5342 robj *setobj;
5343
5344 setobj = dstkey ?
5345 lookupKeyWrite(c->db,setskeys[j]) :
5346 lookupKeyRead(c->db,setskeys[j]);
5347 if (!setobj) {
5348 dv[j] = NULL;
5349 continue;
5350 }
5351 if (setobj->type != REDIS_SET) {
5352 zfree(dv);
5353 addReply(c,shared.wrongtypeerr);
5354 return;
5355 }
5356 dv[j] = setobj->ptr;
5357 }
5358
5359 /* We need a temp set object to store our union. If the dstkey
5360 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5361 * this set object will be the resulting object to set into the target key*/
5362 dstset = createSetObject();
5363
5364 /* Iterate all the elements of all the sets, add every element a single
5365 * time to the result set */
5366 for (j = 0; j < setsnum; j++) {
5367 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5368 if (!dv[j]) continue; /* non existing keys are like empty sets */
5369
5370 di = dictGetIterator(dv[j]);
5371
5372 while((de = dictNext(di)) != NULL) {
5373 robj *ele;
5374
5375 /* dictAdd will not add the same element multiple times */
5376 ele = dictGetEntryKey(de);
5377 if (op == REDIS_OP_UNION || j == 0) {
5378 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5379 incrRefCount(ele);
5380 cardinality++;
5381 }
5382 } else if (op == REDIS_OP_DIFF) {
5383 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5384 cardinality--;
5385 }
5386 }
5387 }
5388 dictReleaseIterator(di);
5389
5390 /* result set is empty? Exit asap. */
5391 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5392 }
5393
5394 /* Output the content of the resulting set, if not in STORE mode */
5395 if (!dstkey) {
5396 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5397 di = dictGetIterator(dstset->ptr);
5398 while((de = dictNext(di)) != NULL) {
5399 robj *ele;
5400
5401 ele = dictGetEntryKey(de);
5402 addReplyBulk(c,ele);
5403 }
5404 dictReleaseIterator(di);
5405 decrRefCount(dstset);
5406 } else {
5407 /* If we have a target key where to store the resulting set
5408 * create this key with the result set inside */
5409 deleteKey(c->db,dstkey);
5410 if (dictSize((dict*)dstset->ptr) > 0) {
5411 dictAdd(c->db->dict,dstkey,dstset);
5412 incrRefCount(dstkey);
5413 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5414 } else {
5415 decrRefCount(dstset);
5416 addReply(c,shared.czero);
5417 }
5418 server.dirty++;
5419 }
5420 zfree(dv);
5421 }
5422
5423 static void sunionCommand(redisClient *c) {
5424 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5425 }
5426
5427 static void sunionstoreCommand(redisClient *c) {
5428 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5429 }
5430
5431 static void sdiffCommand(redisClient *c) {
5432 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5433 }
5434
5435 static void sdiffstoreCommand(redisClient *c) {
5436 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5437 }
5438
5439 /* ==================================== ZSets =============================== */
5440
5441 /* ZSETs are ordered sets using two data structures to hold the same elements
5442 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5443 * data structure.
5444 *
5445 * The elements are added to a hash table mapping Redis objects to scores.
5446 * At the same time the elements are added to a skip list mapping scores
5447 * to Redis objects (so objects are sorted by scores in this "view"). */
5448
5449 /* This skiplist implementation is almost a C translation of the original
5450 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5451 * Alternative to Balanced Trees", modified in three ways:
5452 * a) this implementation allows for repeated values.
5453 * b) the comparison is not just by key (our 'score') but by satellite data.
5454 * c) there is a back pointer, so it's a doubly linked list with the back
5455 * pointers being only at "level 1". This allows to traverse the list
5456 * from tail to head, useful for ZREVRANGE. */
5457
5458 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5459 zskiplistNode *zn = zmalloc(sizeof(*zn));
5460
5461 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5462 if (level > 1)
5463 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5464 else
5465 zn->span = NULL;
5466 zn->score = score;
5467 zn->obj = obj;
5468 return zn;
5469 }
5470
5471 static zskiplist *zslCreate(void) {
5472 int j;
5473 zskiplist *zsl;
5474
5475 zsl = zmalloc(sizeof(*zsl));
5476 zsl->level = 1;
5477 zsl->length = 0;
5478 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5479 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5480 zsl->header->forward[j] = NULL;
5481
5482 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5483 if (j < ZSKIPLIST_MAXLEVEL-1)
5484 zsl->header->span[j] = 0;
5485 }
5486 zsl->header->backward = NULL;
5487 zsl->tail = NULL;
5488 return zsl;
5489 }
5490
5491 static void zslFreeNode(zskiplistNode *node) {
5492 decrRefCount(node->obj);
5493 zfree(node->forward);
5494 zfree(node->span);
5495 zfree(node);
5496 }
5497
5498 static void zslFree(zskiplist *zsl) {
5499 zskiplistNode *node = zsl->header->forward[0], *next;
5500
5501 zfree(zsl->header->forward);
5502 zfree(zsl->header->span);
5503 zfree(zsl->header);
5504 while(node) {
5505 next = node->forward[0];
5506 zslFreeNode(node);
5507 node = next;
5508 }
5509 zfree(zsl);
5510 }
5511
5512 static int zslRandomLevel(void) {
5513 int level = 1;
5514 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5515 level += 1;
5516 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5517 }
5518
5519 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5520 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5521 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5522 int i, level;
5523
5524 x = zsl->header;
5525 for (i = zsl->level-1; i >= 0; i--) {
5526 /* store rank that is crossed to reach the insert position */
5527 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5528
5529 while (x->forward[i] &&
5530 (x->forward[i]->score < score ||
5531 (x->forward[i]->score == score &&
5532 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5533 rank[i] += i > 0 ? x->span[i-1] : 1;
5534 x = x->forward[i];
5535 }
5536 update[i] = x;
5537 }
5538 /* we assume the key is not already inside, since we allow duplicated
5539 * scores, and the re-insertion of score and redis object should never
5540 * happpen since the caller of zslInsert() should test in the hash table
5541 * if the element is already inside or not. */
5542 level = zslRandomLevel();
5543 if (level > zsl->level) {
5544 for (i = zsl->level; i < level; i++) {
5545 rank[i] = 0;
5546 update[i] = zsl->header;
5547 update[i]->span[i-1] = zsl->length;
5548 }
5549 zsl->level = level;
5550 }
5551 x = zslCreateNode(level,score,obj);
5552 for (i = 0; i < level; i++) {
5553 x->forward[i] = update[i]->forward[i];
5554 update[i]->forward[i] = x;
5555
5556 /* update span covered by update[i] as x is inserted here */
5557 if (i > 0) {
5558 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5559 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5560 }
5561 }
5562
5563 /* increment span for untouched levels */
5564 for (i = level; i < zsl->level; i++) {
5565 update[i]->span[i-1]++;
5566 }
5567
5568 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5569 if (x->forward[0])
5570 x->forward[0]->backward = x;
5571 else
5572 zsl->tail = x;
5573 zsl->length++;
5574 }
5575
5576 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5577 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5578 int i;
5579 for (i = 0; i < zsl->level; i++) {
5580 if (update[i]->forward[i] == x) {
5581 if (i > 0) {
5582 update[i]->span[i-1] += x->span[i-1] - 1;
5583 }
5584 update[i]->forward[i] = x->forward[i];
5585 } else {
5586 /* invariant: i > 0, because update[0]->forward[0]
5587 * is always equal to x */
5588 update[i]->span[i-1] -= 1;
5589 }
5590 }
5591 if (x->forward[0]) {
5592 x->forward[0]->backward = x->backward;
5593 } else {
5594 zsl->tail = x->backward;
5595 }
5596 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5597 zsl->level--;
5598 zsl->length--;
5599 }
5600
5601 /* Delete an element with matching score/object from the skiplist. */
5602 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5603 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5604 int i;
5605
5606 x = zsl->header;
5607 for (i = zsl->level-1; i >= 0; i--) {
5608 while (x->forward[i] &&
5609 (x->forward[i]->score < score ||
5610 (x->forward[i]->score == score &&
5611 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5612 x = x->forward[i];
5613 update[i] = x;
5614 }
5615 /* We may have multiple elements with the same score, what we need
5616 * is to find the element with both the right score and object. */
5617 x = x->forward[0];
5618 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5619 zslDeleteNode(zsl, x, update);
5620 zslFreeNode(x);
5621 return 1;
5622 } else {
5623 return 0; /* not found */
5624 }
5625 return 0; /* not found */
5626 }
5627
5628 /* Delete all the elements with score between min and max from the skiplist.
5629 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5630 * Note that this function takes the reference to the hash table view of the
5631 * sorted set, in order to remove the elements from the hash table too. */
5632 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5633 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5634 unsigned long removed = 0;
5635 int i;
5636
5637 x = zsl->header;
5638 for (i = zsl->level-1; i >= 0; i--) {
5639 while (x->forward[i] && x->forward[i]->score < min)
5640 x = x->forward[i];
5641 update[i] = x;
5642 }
5643 /* We may have multiple elements with the same score, what we need
5644 * is to find the element with both the right score and object. */
5645 x = x->forward[0];
5646 while (x && x->score <= max) {
5647 zskiplistNode *next = x->forward[0];
5648 zslDeleteNode(zsl, x, update);
5649 dictDelete(dict,x->obj);
5650 zslFreeNode(x);
5651 removed++;
5652 x = next;
5653 }
5654 return removed; /* not found */
5655 }
5656
5657 /* Delete all the elements with rank between start and end from the skiplist.
5658 * Start and end are inclusive. Note that start and end need to be 1-based */
5659 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5660 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5661 unsigned long traversed = 0, removed = 0;
5662 int i;
5663
5664 x = zsl->header;
5665 for (i = zsl->level-1; i >= 0; i--) {
5666 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5667 traversed += i > 0 ? x->span[i-1] : 1;
5668 x = x->forward[i];
5669 }
5670 update[i] = x;
5671 }
5672
5673 traversed++;
5674 x = x->forward[0];
5675 while (x && traversed <= end) {
5676 zskiplistNode *next = x->forward[0];
5677 zslDeleteNode(zsl, x, update);
5678 dictDelete(dict,x->obj);
5679 zslFreeNode(x);
5680 removed++;
5681 traversed++;
5682 x = next;
5683 }
5684 return removed;
5685 }
5686
5687 /* Find the first node having a score equal or greater than the specified one.
5688 * Returns NULL if there is no match. */
5689 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5690 zskiplistNode *x;
5691 int i;
5692
5693 x = zsl->header;
5694 for (i = zsl->level-1; i >= 0; i--) {
5695 while (x->forward[i] && x->forward[i]->score < score)
5696 x = x->forward[i];
5697 }
5698 /* We may have multiple elements with the same score, what we need
5699 * is to find the element with both the right score and object. */
5700 return x->forward[0];
5701 }
5702
5703 /* Find the rank for an element by both score and key.
5704 * Returns 0 when the element cannot be found, rank otherwise.
5705 * Note that the rank is 1-based due to the span of zsl->header to the
5706 * first element. */
5707 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5708 zskiplistNode *x;
5709 unsigned long rank = 0;
5710 int i;
5711
5712 x = zsl->header;
5713 for (i = zsl->level-1; i >= 0; i--) {
5714 while (x->forward[i] &&
5715 (x->forward[i]->score < score ||
5716 (x->forward[i]->score == score &&
5717 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5718 rank += i > 0 ? x->span[i-1] : 1;
5719 x = x->forward[i];
5720 }
5721
5722 /* x might be equal to zsl->header, so test if obj is non-NULL */
5723 if (x->obj && equalStringObjects(x->obj,o)) {
5724 return rank;
5725 }
5726 }
5727 return 0;
5728 }
5729
5730 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5731 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5732 zskiplistNode *x;
5733 unsigned long traversed = 0;
5734 int i;
5735
5736 x = zsl->header;
5737 for (i = zsl->level-1; i >= 0; i--) {
5738 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5739 {
5740 traversed += i > 0 ? x->span[i-1] : 1;
5741 x = x->forward[i];
5742 }
5743 if (traversed == rank) {
5744 return x;
5745 }
5746 }
5747 return NULL;
5748 }
5749
5750 /* The actual Z-commands implementations */
5751
5752 /* This generic command implements both ZADD and ZINCRBY.
5753 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5754 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5755 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5756 robj *zsetobj;
5757 zset *zs;
5758 double *score;
5759
5760 if (isnan(scoreval)) {
5761 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5762 return;
5763 }
5764
5765 zsetobj = lookupKeyWrite(c->db,key);
5766 if (zsetobj == NULL) {
5767 zsetobj = createZsetObject();
5768 dictAdd(c->db->dict,key,zsetobj);
5769 incrRefCount(key);
5770 } else {
5771 if (zsetobj->type != REDIS_ZSET) {
5772 addReply(c,shared.wrongtypeerr);
5773 return;
5774 }
5775 }
5776 zs = zsetobj->ptr;
5777
5778 /* Ok now since we implement both ZADD and ZINCRBY here the code
5779 * needs to handle the two different conditions. It's all about setting
5780 * '*score', that is, the new score to set, to the right value. */
5781 score = zmalloc(sizeof(double));
5782 if (doincrement) {
5783 dictEntry *de;
5784
5785 /* Read the old score. If the element was not present starts from 0 */
5786 de = dictFind(zs->dict,ele);
5787 if (de) {
5788 double *oldscore = dictGetEntryVal(de);
5789 *score = *oldscore + scoreval;
5790 } else {
5791 *score = scoreval;
5792 }
5793 if (isnan(*score)) {
5794 addReplySds(c,
5795 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5796 zfree(score);
5797 /* Note that we don't need to check if the zset may be empty and
5798 * should be removed here, as we can only obtain Nan as score if
5799 * there was already an element in the sorted set. */
5800 return;
5801 }
5802 } else {
5803 *score = scoreval;
5804 }
5805
5806 /* What follows is a simple remove and re-insert operation that is common
5807 * to both ZADD and ZINCRBY... */
5808 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5809 /* case 1: New element */
5810 incrRefCount(ele); /* added to hash */
5811 zslInsert(zs->zsl,*score,ele);
5812 incrRefCount(ele); /* added to skiplist */
5813 server.dirty++;
5814 if (doincrement)
5815 addReplyDouble(c,*score);
5816 else
5817 addReply(c,shared.cone);
5818 } else {
5819 dictEntry *de;
5820 double *oldscore;
5821
5822 /* case 2: Score update operation */
5823 de = dictFind(zs->dict,ele);
5824 redisAssert(de != NULL);
5825 oldscore = dictGetEntryVal(de);
5826 if (*score != *oldscore) {
5827 int deleted;
5828
5829 /* Remove and insert the element in the skip list with new score */
5830 deleted = zslDelete(zs->zsl,*oldscore,ele);
5831 redisAssert(deleted != 0);
5832 zslInsert(zs->zsl,*score,ele);
5833 incrRefCount(ele);
5834 /* Update the score in the hash table */
5835 dictReplace(zs->dict,ele,score);
5836 server.dirty++;
5837 } else {
5838 zfree(score);
5839 }
5840 if (doincrement)
5841 addReplyDouble(c,*score);
5842 else
5843 addReply(c,shared.czero);
5844 }
5845 }
5846
5847 static void zaddCommand(redisClient *c) {
5848 double scoreval;
5849
5850 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5851 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5852 }
5853
5854 static void zincrbyCommand(redisClient *c) {
5855 double scoreval;
5856
5857 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5858 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5859 }
5860
5861 static void zremCommand(redisClient *c) {
5862 robj *zsetobj;
5863 zset *zs;
5864 dictEntry *de;
5865 double *oldscore;
5866 int deleted;
5867
5868 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5869 checkType(c,zsetobj,REDIS_ZSET)) return;
5870
5871 zs = zsetobj->ptr;
5872 de = dictFind(zs->dict,c->argv[2]);
5873 if (de == NULL) {
5874 addReply(c,shared.czero);
5875 return;
5876 }
5877 /* Delete from the skiplist */
5878 oldscore = dictGetEntryVal(de);
5879 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5880 redisAssert(deleted != 0);
5881
5882 /* Delete from the hash table */
5883 dictDelete(zs->dict,c->argv[2]);
5884 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5885 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5886 server.dirty++;
5887 addReply(c,shared.cone);
5888 }
5889
5890 static void zremrangebyscoreCommand(redisClient *c) {
5891 double min;
5892 double max;
5893 long deleted;
5894 robj *zsetobj;
5895 zset *zs;
5896
5897 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5898 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5899
5900 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5901 checkType(c,zsetobj,REDIS_ZSET)) return;
5902
5903 zs = zsetobj->ptr;
5904 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5905 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5906 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5907 server.dirty += deleted;
5908 addReplyLongLong(c,deleted);
5909 }
5910
5911 static void zremrangebyrankCommand(redisClient *c) {
5912 long start;
5913 long end;
5914 int llen;
5915 long deleted;
5916 robj *zsetobj;
5917 zset *zs;
5918
5919 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5920 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5921
5922 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5923 checkType(c,zsetobj,REDIS_ZSET)) return;
5924 zs = zsetobj->ptr;
5925 llen = zs->zsl->length;
5926
5927 /* convert negative indexes */
5928 if (start < 0) start = llen+start;
5929 if (end < 0) end = llen+end;
5930 if (start < 0) start = 0;
5931 if (end < 0) end = 0;
5932
5933 /* indexes sanity checks */
5934 if (start > end || start >= llen) {
5935 addReply(c,shared.czero);
5936 return;
5937 }
5938 if (end >= llen) end = llen-1;
5939
5940 /* increment start and end because zsl*Rank functions
5941 * use 1-based rank */
5942 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5943 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5944 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5945 server.dirty += deleted;
5946 addReplyLongLong(c, deleted);
5947 }
5948
5949 typedef struct {
5950 dict *dict;
5951 double weight;
5952 } zsetopsrc;
5953
5954 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5955 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5956 unsigned long size1, size2;
5957 size1 = d1->dict ? dictSize(d1->dict) : 0;
5958 size2 = d2->dict ? dictSize(d2->dict) : 0;
5959 return size1 - size2;
5960 }
5961
/* How scores of the same element coming from different inputs are merged. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Score stored in a dict entry; an entry with a NULL value counts as 1.0. */
#define zunionInterDictValue(_e) \
    (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Merge 'val' into '*target' according to 'aggregate' (one of the
 * REDIS_AGGR_* values). Any other value triggers a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
        break;
    }
}
5979
5980 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5981 int i, j, setnum;
5982 int aggregate = REDIS_AGGR_SUM;
5983 zsetopsrc *src;
5984 robj *dstobj;
5985 zset *dstzset;
5986 dictIterator *di;
5987 dictEntry *de;
5988
5989 /* expect setnum input keys to be given */
5990 setnum = atoi(c->argv[2]->ptr);
5991 if (setnum < 1) {
5992 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5993 return;
5994 }
5995
5996 /* test if the expected number of keys would overflow */
5997 if (3+setnum > c->argc) {
5998 addReply(c,shared.syntaxerr);
5999 return;
6000 }
6001
6002 /* read keys to be used for input */
6003 src = zmalloc(sizeof(zsetopsrc) * setnum);
6004 for (i = 0, j = 3; i < setnum; i++, j++) {
6005 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6006 if (!obj) {
6007 src[i].dict = NULL;
6008 } else {
6009 if (obj->type == REDIS_ZSET) {
6010 src[i].dict = ((zset*)obj->ptr)->dict;
6011 } else if (obj->type == REDIS_SET) {
6012 src[i].dict = (obj->ptr);
6013 } else {
6014 zfree(src);
6015 addReply(c,shared.wrongtypeerr);
6016 return;
6017 }
6018 }
6019
6020 /* default all weights to 1 */
6021 src[i].weight = 1.0;
6022 }
6023
6024 /* parse optional extra arguments */
6025 if (j < c->argc) {
6026 int remaining = c->argc - j;
6027
6028 while (remaining) {
6029 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6030 j++; remaining--;
6031 for (i = 0; i < setnum; i++, j++, remaining--) {
6032 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6033 return;
6034 }
6035 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6036 j++; remaining--;
6037 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6038 aggregate = REDIS_AGGR_SUM;
6039 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6040 aggregate = REDIS_AGGR_MIN;
6041 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6042 aggregate = REDIS_AGGR_MAX;
6043 } else {
6044 zfree(src);
6045 addReply(c,shared.syntaxerr);
6046 return;
6047 }
6048 j++; remaining--;
6049 } else {
6050 zfree(src);
6051 addReply(c,shared.syntaxerr);
6052 return;
6053 }
6054 }
6055 }
6056
6057 /* sort sets from the smallest to largest, this will improve our
6058 * algorithm's performance */
6059 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6060
6061 dstobj = createZsetObject();
6062 dstzset = dstobj->ptr;
6063
6064 if (op == REDIS_OP_INTER) {
6065 /* skip going over all entries if the smallest zset is NULL or empty */
6066 if (src[0].dict && dictSize(src[0].dict) > 0) {
6067 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6068 * from small to large, all src[i > 0].dict are non-empty too */
6069 di = dictGetIterator(src[0].dict);
6070 while((de = dictNext(di)) != NULL) {
6071 double *score = zmalloc(sizeof(double)), value;
6072 *score = src[0].weight * zunionInterDictValue(de);
6073
6074 for (j = 1; j < setnum; j++) {
6075 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6076 if (other) {
6077 value = src[j].weight * zunionInterDictValue(other);
6078 zunionInterAggregate(score, value, aggregate);
6079 } else {
6080 break;
6081 }
6082 }
6083
6084 /* skip entry when not present in every source dict */
6085 if (j != setnum) {
6086 zfree(score);
6087 } else {
6088 robj *o = dictGetEntryKey(de);
6089 dictAdd(dstzset->dict,o,score);
6090 incrRefCount(o); /* added to dictionary */
6091 zslInsert(dstzset->zsl,*score,o);
6092 incrRefCount(o); /* added to skiplist */
6093 }
6094 }
6095 dictReleaseIterator(di);
6096 }
6097 } else if (op == REDIS_OP_UNION) {
6098 for (i = 0; i < setnum; i++) {
6099 if (!src[i].dict) continue;
6100
6101 di = dictGetIterator(src[i].dict);
6102 while((de = dictNext(di)) != NULL) {
6103 /* skip key when already processed */
6104 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6105
6106 double *score = zmalloc(sizeof(double)), value;
6107 *score = src[i].weight * zunionInterDictValue(de);
6108
6109 /* because the zsets are sorted by size, its only possible
6110 * for sets at larger indices to hold this entry */
6111 for (j = (i+1); j < setnum; j++) {
6112 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6113 if (other) {
6114 value = src[j].weight * zunionInterDictValue(other);
6115 zunionInterAggregate(score, value, aggregate);
6116 }
6117 }
6118
6119 robj *o = dictGetEntryKey(de);
6120 dictAdd(dstzset->dict,o,score);
6121 incrRefCount(o); /* added to dictionary */
6122 zslInsert(dstzset->zsl,*score,o);
6123 incrRefCount(o); /* added to skiplist */
6124 }
6125 dictReleaseIterator(di);
6126 }
6127 } else {
6128 /* unknown operator */
6129 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6130 }
6131
6132 deleteKey(c->db,dstkey);
6133 if (dstzset->zsl->length) {
6134 dictAdd(c->db->dict,dstkey,dstobj);
6135 incrRefCount(dstkey);
6136 addReplyLongLong(c, dstzset->zsl->length);
6137 server.dirty++;
6138 } else {
6139 decrRefCount(dstobj);
6140 addReply(c, shared.czero);
6141 }
6142 zfree(src);
6143 }
6144
6145 static void zunionstoreCommand(redisClient *c) {
6146 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6147 }
6148
6149 static void zinterstoreCommand(redisClient *c) {
6150 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6151 }
6152
6153 static void zrangeGenericCommand(redisClient *c, int reverse) {
6154 robj *o;
6155 long start;
6156 long end;
6157 int withscores = 0;
6158 int llen;
6159 int rangelen, j;
6160 zset *zsetobj;
6161 zskiplist *zsl;
6162 zskiplistNode *ln;
6163 robj *ele;
6164
6165 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6166 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6167
6168 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6169 withscores = 1;
6170 } else if (c->argc >= 5) {
6171 addReply(c,shared.syntaxerr);
6172 return;
6173 }
6174
6175 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6176 || checkType(c,o,REDIS_ZSET)) return;
6177 zsetobj = o->ptr;
6178 zsl = zsetobj->zsl;
6179 llen = zsl->length;
6180
6181 /* convert negative indexes */
6182 if (start < 0) start = llen+start;
6183 if (end < 0) end = llen+end;
6184 if (start < 0) start = 0;
6185 if (end < 0) end = 0;
6186
6187 /* indexes sanity checks */
6188 if (start > end || start >= llen) {
6189 /* Out of range start or start > end result in empty list */
6190 addReply(c,shared.emptymultibulk);
6191 return;
6192 }
6193 if (end >= llen) end = llen-1;
6194 rangelen = (end-start)+1;
6195
6196 /* check if starting point is trivial, before searching
6197 * the element in log(N) time */
6198 if (reverse) {
6199 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6200 } else {
6201 ln = start == 0 ?
6202 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6203 }
6204
6205 /* Return the result in form of a multi-bulk reply */
6206 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6207 withscores ? (rangelen*2) : rangelen));
6208 for (j = 0; j < rangelen; j++) {
6209 ele = ln->obj;
6210 addReplyBulk(c,ele);
6211 if (withscores)
6212 addReplyDouble(c,ln->score);
6213 ln = reverse ? ln->backward : ln->forward[0];
6214 }
6215 }
6216
6217 static void zrangeCommand(redisClient *c) {
6218 zrangeGenericCommand(c,0);
6219 }
6220
6221 static void zrevrangeCommand(redisClient *c) {
6222 zrangeGenericCommand(c,1);
6223 }
6224
6225 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6226 * If justcount is non-zero, just the count is returned. */
6227 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6228 robj *o;
6229 double min, max;
6230 int minex = 0, maxex = 0; /* are min or max exclusive? */
6231 int offset = 0, limit = -1;
6232 int withscores = 0;
6233 int badsyntax = 0;
6234
6235 /* Parse the min-max interval. If one of the values is prefixed
6236 * by the "(" character, it's considered "open". For instance
6237 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6238 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6239 if (((char*)c->argv[2]->ptr)[0] == '(') {
6240 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6241 minex = 1;
6242 } else {
6243 min = strtod(c->argv[2]->ptr,NULL);
6244 }
6245 if (((char*)c->argv[3]->ptr)[0] == '(') {
6246 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6247 maxex = 1;
6248 } else {
6249 max = strtod(c->argv[3]->ptr,NULL);
6250 }
6251
6252 /* Parse "WITHSCORES": note that if the command was called with
6253 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6254 * enter the following paths to parse WITHSCORES and LIMIT. */
6255 if (c->argc == 5 || c->argc == 8) {
6256 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6257 withscores = 1;
6258 else
6259 badsyntax = 1;
6260 }
6261 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6262 badsyntax = 1;
6263 if (badsyntax) {
6264 addReplySds(c,
6265 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6266 return;
6267 }
6268
6269 /* Parse "LIMIT" */
6270 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6271 addReply(c,shared.syntaxerr);
6272 return;
6273 } else if (c->argc == (7 + withscores)) {
6274 offset = atoi(c->argv[5]->ptr);
6275 limit = atoi(c->argv[6]->ptr);
6276 if (offset < 0) offset = 0;
6277 }
6278
6279 /* Ok, lookup the key and get the range */
6280 o = lookupKeyRead(c->db,c->argv[1]);
6281 if (o == NULL) {
6282 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6283 } else {
6284 if (o->type != REDIS_ZSET) {
6285 addReply(c,shared.wrongtypeerr);
6286 } else {
6287 zset *zsetobj = o->ptr;
6288 zskiplist *zsl = zsetobj->zsl;
6289 zskiplistNode *ln;
6290 robj *ele, *lenobj = NULL;
6291 unsigned long rangelen = 0;
6292
6293 /* Get the first node with the score >= min, or with
6294 * score > min if 'minex' is true. */
6295 ln = zslFirstWithScore(zsl,min);
6296 while (minex && ln && ln->score == min) ln = ln->forward[0];
6297
6298 if (ln == NULL) {
6299 /* No element matching the speciifed interval */
6300 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6301 return;
6302 }
6303
6304 /* We don't know in advance how many matching elements there
6305 * are in the list, so we push this object that will represent
6306 * the multi-bulk length in the output buffer, and will "fix"
6307 * it later */
6308 if (!justcount) {
6309 lenobj = createObject(REDIS_STRING,NULL);
6310 addReply(c,lenobj);
6311 decrRefCount(lenobj);
6312 }
6313
6314 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6315 if (offset) {
6316 offset--;
6317 ln = ln->forward[0];
6318 continue;
6319 }
6320 if (limit == 0) break;
6321 if (!justcount) {
6322 ele = ln->obj;
6323 addReplyBulk(c,ele);
6324 if (withscores)
6325 addReplyDouble(c,ln->score);
6326 }
6327 ln = ln->forward[0];
6328 rangelen++;
6329 if (limit > 0) limit--;
6330 }
6331 if (justcount) {
6332 addReplyLongLong(c,(long)rangelen);
6333 } else {
6334 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6335 withscores ? (rangelen*2) : rangelen);
6336 }
6337 }
6338 }
6339 }
6340
6341 static void zrangebyscoreCommand(redisClient *c) {
6342 genericZrangebyscoreCommand(c,0);
6343 }
6344
6345 static void zcountCommand(redisClient *c) {
6346 genericZrangebyscoreCommand(c,1);
6347 }
6348
6349 static void zcardCommand(redisClient *c) {
6350 robj *o;
6351 zset *zs;
6352
6353 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6354 checkType(c,o,REDIS_ZSET)) return;
6355
6356 zs = o->ptr;
6357 addReplyUlong(c,zs->zsl->length);
6358 }
6359
6360 static void zscoreCommand(redisClient *c) {
6361 robj *o;
6362 zset *zs;
6363 dictEntry *de;
6364
6365 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6366 checkType(c,o,REDIS_ZSET)) return;
6367
6368 zs = o->ptr;
6369 de = dictFind(zs->dict,c->argv[2]);
6370 if (!de) {
6371 addReply(c,shared.nullbulk);
6372 } else {
6373 double *score = dictGetEntryVal(de);
6374
6375 addReplyDouble(c,*score);
6376 }
6377 }
6378
6379 static void zrankGenericCommand(redisClient *c, int reverse) {
6380 robj *o;
6381 zset *zs;
6382 zskiplist *zsl;
6383 dictEntry *de;
6384 unsigned long rank;
6385 double *score;
6386
6387 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6388 checkType(c,o,REDIS_ZSET)) return;
6389
6390 zs = o->ptr;
6391 zsl = zs->zsl;
6392 de = dictFind(zs->dict,c->argv[2]);
6393 if (!de) {
6394 addReply(c,shared.nullbulk);
6395 return;
6396 }
6397
6398 score = dictGetEntryVal(de);
6399 rank = zslGetRank(zsl, *score, c->argv[2]);
6400 if (rank) {
6401 if (reverse) {
6402 addReplyLongLong(c, zsl->length - rank);
6403 } else {
6404 addReplyLongLong(c, rank-1);
6405 }
6406 } else {
6407 addReply(c,shared.nullbulk);
6408 }
6409 }
6410
6411 static void zrankCommand(redisClient *c) {
6412 zrankGenericCommand(c, 0);
6413 }
6414
6415 static void zrevrankCommand(redisClient *c) {
6416 zrankGenericCommand(c, 1);
6417 }
6418
6419 /* ========================= Hashes utility functions ======================= */
6420 #define REDIS_HASH_KEY 1
6421 #define REDIS_HASH_VALUE 2
6422
6423 /* Check the length of a number of objects to see if we need to convert a
6424 * zipmap to a real hash. Note that we only check string encoded objects
6425 * as their string length can be queried in constant time. */
6426 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6427 int i;
6428 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6429
6430 for (i = start; i <= end; i++) {
6431 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6432 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6433 {
6434 convertToRealHash(subject);
6435 return;
6436 }
6437 }
6438 }
6439
6440 /* Encode given objects in-place when the hash uses a dict. */
6441 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6442 if (subject->encoding == REDIS_ENCODING_HT) {
6443 if (o1) *o1 = tryObjectEncoding(*o1);
6444 if (o2) *o2 = tryObjectEncoding(*o2);
6445 }
6446 }
6447
6448 /* Get the value from a hash identified by key. Returns either a string
6449 * object or NULL if the value cannot be found. The refcount of the object
6450 * is always increased by 1 when the value was found. */
6451 static robj *hashGet(robj *o, robj *key) {
6452 robj *value = NULL;
6453 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6454 unsigned char *v;
6455 unsigned int vlen;
6456 key = getDecodedObject(key);
6457 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6458 value = createStringObject((char*)v,vlen);
6459 }
6460 decrRefCount(key);
6461 } else {
6462 dictEntry *de = dictFind(o->ptr,key);
6463 if (de != NULL) {
6464 value = dictGetEntryVal(de);
6465 incrRefCount(value);
6466 }
6467 }
6468 return value;
6469 }
6470
6471 /* Test if the key exists in the given hash. Returns 1 if the key
6472 * exists and 0 when it doesn't. */
6473 static int hashExists(robj *o, robj *key) {
6474 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6475 key = getDecodedObject(key);
6476 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6477 decrRefCount(key);
6478 return 1;
6479 }
6480 decrRefCount(key);
6481 } else {
6482 if (dictFind(o->ptr,key) != NULL) {
6483 return 1;
6484 }
6485 }
6486 return 0;
6487 }
6488
6489 /* Add an element, discard the old if the key already exists.
6490 * Return 0 on insert and 1 on update. */
6491 static int hashSet(robj *o, robj *key, robj *value) {
6492 int update = 0;
6493 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6494 key = getDecodedObject(key);
6495 value = getDecodedObject(value);
6496 o->ptr = zipmapSet(o->ptr,
6497 key->ptr,sdslen(key->ptr),
6498 value->ptr,sdslen(value->ptr), &update);
6499 decrRefCount(key);
6500 decrRefCount(value);
6501
6502 /* Check if the zipmap needs to be upgraded to a real hash table */
6503 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6504 convertToRealHash(o);
6505 } else {
6506 if (dictReplace(o->ptr,key,value)) {
6507 /* Insert */
6508 incrRefCount(key);
6509 } else {
6510 /* Update */
6511 update = 1;
6512 }
6513 incrRefCount(value);
6514 }
6515 return update;
6516 }
6517
6518 /* Delete an element from a hash.
6519 * Return 1 on deleted and 0 on not found. */
6520 static int hashDelete(robj *o, robj *key) {
6521 int deleted = 0;
6522 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6523 key = getDecodedObject(key);
6524 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6525 decrRefCount(key);
6526 } else {
6527 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6528 /* Always check if the dictionary needs a resize after a delete. */
6529 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6530 }
6531 return deleted;
6532 }
6533
6534 /* Return the number of elements in a hash. */
6535 static unsigned long hashLength(robj *o) {
6536 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6537 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6538 }
6539
6540 /* Structure to hold hash iteration abstration. Note that iteration over
6541 * hashes involves both fields and values. Because it is possible that
6542 * not both are required, store pointers in the iterator to avoid
6543 * unnecessary memory allocation for fields/values. */
6544 typedef struct {
6545 int encoding;
6546 unsigned char *zi;
6547 unsigned char *zk, *zv;
6548 unsigned int zklen, zvlen;
6549
6550 dictIterator *di;
6551 dictEntry *de;
6552 } hashIterator;
6553
6554 static hashIterator *hashInitIterator(robj *subject) {
6555 hashIterator *hi = zmalloc(sizeof(hashIterator));
6556 hi->encoding = subject->encoding;
6557 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6558 hi->zi = zipmapRewind(subject->ptr);
6559 } else if (hi->encoding == REDIS_ENCODING_HT) {
6560 hi->di = dictGetIterator(subject->ptr);
6561 } else {
6562 redisAssert(NULL);
6563 }
6564 return hi;
6565 }
6566
6567 static void hashReleaseIterator(hashIterator *hi) {
6568 if (hi->encoding == REDIS_ENCODING_HT) {
6569 dictReleaseIterator(hi->di);
6570 }
6571 zfree(hi);
6572 }
6573
6574 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6575 * could be found and REDIS_ERR when the iterator reaches the end. */
6576 static int hashNext(hashIterator *hi) {
6577 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6578 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6579 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6580 } else {
6581 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6582 }
6583 return REDIS_OK;
6584 }
6585
6586 /* Get key or value object at current iteration position.
6587 * This increases the refcount of the field object by 1. */
6588 static robj *hashCurrent(hashIterator *hi, int what) {
6589 robj *o;
6590 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6591 if (what & REDIS_HASH_KEY) {
6592 o = createStringObject((char*)hi->zk,hi->zklen);
6593 } else {
6594 o = createStringObject((char*)hi->zv,hi->zvlen);
6595 }
6596 } else {
6597 if (what & REDIS_HASH_KEY) {
6598 o = dictGetEntryKey(hi->de);
6599 } else {
6600 o = dictGetEntryVal(hi->de);
6601 }
6602 incrRefCount(o);
6603 }
6604 return o;
6605 }
6606
6607 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6608 robj *o = lookupKeyWrite(c->db,key);
6609 if (o == NULL) {
6610 o = createHashObject();
6611 dictAdd(c->db->dict,key,o);
6612 incrRefCount(key);
6613 } else {
6614 if (o->type != REDIS_HASH) {
6615 addReply(c,shared.wrongtypeerr);
6616 return NULL;
6617 }
6618 }
6619 return o;
6620 }
6621
6622 /* ============================= Hash commands ============================== */
6623 static void hsetCommand(redisClient *c) {
6624 int update;
6625 robj *o;
6626
6627 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6628 hashTryConversion(o,c->argv,2,3);
6629 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6630 update = hashSet(o,c->argv[2],c->argv[3]);
6631 addReply(c, update ? shared.czero : shared.cone);
6632 server.dirty++;
6633 }
6634
6635 static void hsetnxCommand(redisClient *c) {
6636 robj *o;
6637 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6638 hashTryConversion(o,c->argv,2,3);
6639
6640 if (hashExists(o, c->argv[2])) {
6641 addReply(c, shared.czero);
6642 } else {
6643 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6644 hashSet(o,c->argv[2],c->argv[3]);
6645 addReply(c, shared.cone);
6646 server.dirty++;
6647 }
6648 }
6649
6650 static void hmsetCommand(redisClient *c) {
6651 int i;
6652 robj *o;
6653
6654 if ((c->argc % 2) == 1) {
6655 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6656 return;
6657 }
6658
6659 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6660 hashTryConversion(o,c->argv,2,c->argc-1);
6661 for (i = 2; i < c->argc; i += 2) {
6662 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6663 hashSet(o,c->argv[i],c->argv[i+1]);
6664 }
6665 addReply(c, shared.ok);
6666 server.dirty++;
6667 }
6668
6669 static void hincrbyCommand(redisClient *c) {
6670 long long value, incr;
6671 robj *o, *current, *new;
6672
6673 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6674 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6675 if ((current = hashGet(o,c->argv[2])) != NULL) {
6676 if (getLongLongFromObjectOrReply(c,current,&value,
6677 "hash value is not an integer") != REDIS_OK) {
6678 decrRefCount(current);
6679 return;
6680 }
6681 decrRefCount(current);
6682 } else {
6683 value = 0;
6684 }
6685
6686 value += incr;
6687 new = createStringObjectFromLongLong(value);
6688 hashTryObjectEncoding(o,&c->argv[2],NULL);
6689 hashSet(o,c->argv[2],new);
6690 decrRefCount(new);
6691 addReplyLongLong(c,value);
6692 server.dirty++;
6693 }
6694
6695 static void hgetCommand(redisClient *c) {
6696 robj *o, *value;
6697 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6698 checkType(c,o,REDIS_HASH)) return;
6699
6700 if ((value = hashGet(o,c->argv[2])) != NULL) {
6701 addReplyBulk(c,value);
6702 decrRefCount(value);
6703 } else {
6704 addReply(c,shared.nullbulk);
6705 }
6706 }
6707
6708 static void hmgetCommand(redisClient *c) {
6709 int i;
6710 robj *o, *value;
6711 o = lookupKeyRead(c->db,c->argv[1]);
6712 if (o != NULL && o->type != REDIS_HASH) {
6713 addReply(c,shared.wrongtypeerr);
6714 }
6715
6716 /* Note the check for o != NULL happens inside the loop. This is
6717 * done because objects that cannot be found are considered to be
6718 * an empty hash. The reply should then be a series of NULLs. */
6719 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6720 for (i = 2; i < c->argc; i++) {
6721 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6722 addReplyBulk(c,value);
6723 decrRefCount(value);
6724 } else {
6725 addReply(c,shared.nullbulk);
6726 }
6727 }
6728 }
6729
6730 static void hdelCommand(redisClient *c) {
6731 robj *o;
6732 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6733 checkType(c,o,REDIS_HASH)) return;
6734
6735 if (hashDelete(o,c->argv[2])) {
6736 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6737 addReply(c,shared.cone);
6738 server.dirty++;
6739 } else {
6740 addReply(c,shared.czero);
6741 }
6742 }
6743
6744 static void hlenCommand(redisClient *c) {
6745 robj *o;
6746 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6747 checkType(c,o,REDIS_HASH)) return;
6748
6749 addReplyUlong(c,hashLength(o));
6750 }
6751
6752 static void genericHgetallCommand(redisClient *c, int flags) {
6753 robj *o, *lenobj, *obj;
6754 unsigned long count = 0;
6755 hashIterator *hi;
6756
6757 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6758 || checkType(c,o,REDIS_HASH)) return;
6759
6760 lenobj = createObject(REDIS_STRING,NULL);
6761 addReply(c,lenobj);
6762 decrRefCount(lenobj);
6763
6764 hi = hashInitIterator(o);
6765 while (hashNext(hi) != REDIS_ERR) {
6766 if (flags & REDIS_HASH_KEY) {
6767 obj = hashCurrent(hi,REDIS_HASH_KEY);
6768 addReplyBulk(c,obj);
6769 decrRefCount(obj);
6770 count++;
6771 }
6772 if (flags & REDIS_HASH_VALUE) {
6773 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6774 addReplyBulk(c,obj);
6775 decrRefCount(obj);
6776 count++;
6777 }
6778 }
6779 hashReleaseIterator(hi);
6780
6781 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6782 }
6783
6784 static void hkeysCommand(redisClient *c) {
6785 genericHgetallCommand(c,REDIS_HASH_KEY);
6786 }
6787
6788 static void hvalsCommand(redisClient *c) {
6789 genericHgetallCommand(c,REDIS_HASH_VALUE);
6790 }
6791
6792 static void hgetallCommand(redisClient *c) {
6793 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6794 }
6795
6796 static void hexistsCommand(redisClient *c) {
6797 robj *o;
6798 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6799 checkType(c,o,REDIS_HASH)) return;
6800
6801 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6802 }
6803
6804 static void convertToRealHash(robj *o) {
6805 unsigned char *key, *val, *p, *zm = o->ptr;
6806 unsigned int klen, vlen;
6807 dict *dict = dictCreate(&hashDictType,NULL);
6808
6809 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6810 p = zipmapRewind(zm);
6811 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6812 robj *keyobj, *valobj;
6813
6814 keyobj = createStringObject((char*)key,klen);
6815 valobj = createStringObject((char*)val,vlen);
6816 keyobj = tryObjectEncoding(keyobj);
6817 valobj = tryObjectEncoding(valobj);
6818 dictAdd(dict,keyobj,valobj);
6819 }
6820 o->encoding = REDIS_ENCODING_HT;
6821 o->ptr = dict;
6822 zfree(zm);
6823 }
6824
6825 /* ========================= Non type-specific commands ==================== */
6826
6827 static void flushdbCommand(redisClient *c) {
6828 server.dirty += dictSize(c->db->dict);
6829 touchWatchedKeysOnFlush(c->db->id);
6830 dictEmpty(c->db->dict);
6831 dictEmpty(c->db->expires);
6832 addReply(c,shared.ok);
6833 }
6834
6835 static void flushallCommand(redisClient *c) {
6836 touchWatchedKeysOnFlush(-1);
6837 server.dirty += emptyDb();
6838 addReply(c,shared.ok);
6839 if (server.bgsavechildpid != -1) {
6840 kill(server.bgsavechildpid,SIGKILL);
6841 rdbRemoveTempFile(server.bgsavechildpid);
6842 }
6843 rdbSave(server.dbfilename);
6844 server.dirty++;
6845 }
6846
6847 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6848 redisSortOperation *so = zmalloc(sizeof(*so));
6849 so->type = type;
6850 so->pattern = pattern;
6851 return so;
6852 }
6853
6854 /* Return the value associated to the key with a name obtained
6855 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6856 * The returned object will always have its refcount increased by 1
6857 * when it is non-NULL. */
6858 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6859 char *p, *f;
6860 sds spat, ssub;
6861 robj keyobj, fieldobj, *o;
6862 int prefixlen, sublen, postfixlen, fieldlen;
6863 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6864 struct {
6865 long len;
6866 long free;
6867 char buf[REDIS_SORTKEY_MAX+1];
6868 } keyname, fieldname;
6869
6870 /* If the pattern is "#" return the substitution object itself in order
6871 * to implement the "SORT ... GET #" feature. */
6872 spat = pattern->ptr;
6873 if (spat[0] == '#' && spat[1] == '\0') {
6874 incrRefCount(subst);
6875 return subst;
6876 }
6877
6878 /* The substitution object may be specially encoded. If so we create
6879 * a decoded object on the fly. Otherwise getDecodedObject will just
6880 * increment the ref count, that we'll decrement later. */
6881 subst = getDecodedObject(subst);
6882
6883 ssub = subst->ptr;
6884 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6885 p = strchr(spat,'*');
6886 if (!p) {
6887 decrRefCount(subst);
6888 return NULL;
6889 }
6890
6891 /* Find out if we're dealing with a hash dereference. */
6892 if ((f = strstr(p+1, "->")) != NULL) {
6893 fieldlen = sdslen(spat)-(f-spat);
6894 /* this also copies \0 character */
6895 memcpy(fieldname.buf,f+2,fieldlen-1);
6896 fieldname.len = fieldlen-2;
6897 } else {
6898 fieldlen = 0;
6899 }
6900
6901 prefixlen = p-spat;
6902 sublen = sdslen(ssub);
6903 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6904 memcpy(keyname.buf,spat,prefixlen);
6905 memcpy(keyname.buf+prefixlen,ssub,sublen);
6906 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6907 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6908 keyname.len = prefixlen+sublen+postfixlen;
6909 decrRefCount(subst);
6910
6911 /* Lookup substituted key */
6912 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6913 o = lookupKeyRead(db,&keyobj);
6914 if (o == NULL) return NULL;
6915
6916 if (fieldlen > 0) {
6917 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6918
6919 /* Retrieve value from hash by the field name. This operation
6920 * already increases the refcount of the returned object. */
6921 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6922 o = hashGet(o, &fieldobj);
6923 } else {
6924 if (o->type != REDIS_STRING) return NULL;
6925
6926 /* Every object that this function returns needs to have its refcount
6927 * increased. sortCommand decreases it again. */
6928 incrRefCount(o);
6929 }
6930
6931 return o;
6932 }
6933
6934 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6935 * the additional parameter is not standard but a BSD-specific we have to
6936 * pass sorting parameters via the global 'server' structure */
6937 static int sortCompare(const void *s1, const void *s2) {
6938 const redisSortObject *so1 = s1, *so2 = s2;
6939 int cmp;
6940
6941 if (!server.sort_alpha) {
6942 /* Numeric sorting. Here it's trivial as we precomputed scores */
6943 if (so1->u.score > so2->u.score) {
6944 cmp = 1;
6945 } else if (so1->u.score < so2->u.score) {
6946 cmp = -1;
6947 } else {
6948 cmp = 0;
6949 }
6950 } else {
6951 /* Alphanumeric sorting */
6952 if (server.sort_bypattern) {
6953 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6954 /* At least one compare object is NULL */
6955 if (so1->u.cmpobj == so2->u.cmpobj)
6956 cmp = 0;
6957 else if (so1->u.cmpobj == NULL)
6958 cmp = -1;
6959 else
6960 cmp = 1;
6961 } else {
6962 /* We have both the objects, use strcoll */
6963 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6964 }
6965 } else {
6966 /* Compare elements directly. */
6967 cmp = compareStringObjects(so1->obj,so2->obj);
6968 }
6969 }
6970 return server.sort_desc ? -cmp : cmp;
6971 }
6972
6973 /* The SORT command is the most complex command in Redis. Warning: this code
6974 * is optimized for speed and a bit less for readability */
6975 static void sortCommand(redisClient *c) {
6976 list *operations;
6977 int outputlen = 0;
6978 int desc = 0, alpha = 0;
6979 int limit_start = 0, limit_count = -1, start, end;
6980 int j, dontsort = 0, vectorlen;
6981 int getop = 0; /* GET operation counter */
6982 robj *sortval, *sortby = NULL, *storekey = NULL;
6983 redisSortObject *vector; /* Resulting vector to sort */
6984
6985 /* Lookup the key to sort. It must be of the right types */
6986 sortval = lookupKeyRead(c->db,c->argv[1]);
6987 if (sortval == NULL) {
6988 addReply(c,shared.emptymultibulk);
6989 return;
6990 }
6991 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6992 sortval->type != REDIS_ZSET)
6993 {
6994 addReply(c,shared.wrongtypeerr);
6995 return;
6996 }
6997
6998 /* Create a list of operations to perform for every sorted element.
6999 * Operations can be GET/DEL/INCR/DECR */
7000 operations = listCreate();
7001 listSetFreeMethod(operations,zfree);
7002 j = 2;
7003
7004 /* Now we need to protect sortval incrementing its count, in the future
7005 * SORT may have options able to overwrite/delete keys during the sorting
7006 * and the sorted key itself may get destroied */
7007 incrRefCount(sortval);
7008
7009 /* The SORT command has an SQL-alike syntax, parse it */
7010 while(j < c->argc) {
7011 int leftargs = c->argc-j-1;
7012 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7013 desc = 0;
7014 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7015 desc = 1;
7016 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7017 alpha = 1;
7018 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7019 limit_start = atoi(c->argv[j+1]->ptr);
7020 limit_count = atoi(c->argv[j+2]->ptr);
7021 j+=2;
7022 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7023 storekey = c->argv[j+1];
7024 j++;
7025 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7026 sortby = c->argv[j+1];
7027 /* If the BY pattern does not contain '*', i.e. it is constant,
7028 * we don't need to sort nor to lookup the weight keys. */
7029 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7030 j++;
7031 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7032 listAddNodeTail(operations,createSortOperation(
7033 REDIS_SORT_GET,c->argv[j+1]));
7034 getop++;
7035 j++;
7036 } else {
7037 decrRefCount(sortval);
7038 listRelease(operations);
7039 addReply(c,shared.syntaxerr);
7040 return;
7041 }
7042 j++;
7043 }
7044
7045 /* Load the sorting vector with all the objects to sort */
7046 switch(sortval->type) {
7047 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7048 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7049 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7050 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7051 }
7052 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7053 j = 0;
7054
7055 if (sortval->type == REDIS_LIST) {
7056 list *list = sortval->ptr;
7057 listNode *ln;
7058 listIter li;
7059
7060 listRewind(list,&li);
7061 while((ln = listNext(&li))) {
7062 robj *ele = ln->value;
7063 vector[j].obj = ele;
7064 vector[j].u.score = 0;
7065 vector[j].u.cmpobj = NULL;
7066 j++;
7067 }
7068 } else {
7069 dict *set;
7070 dictIterator *di;
7071 dictEntry *setele;
7072
7073 if (sortval->type == REDIS_SET) {
7074 set = sortval->ptr;
7075 } else {
7076 zset *zs = sortval->ptr;
7077 set = zs->dict;
7078 }
7079
7080 di = dictGetIterator(set);
7081 while((setele = dictNext(di)) != NULL) {
7082 vector[j].obj = dictGetEntryKey(setele);
7083 vector[j].u.score = 0;
7084 vector[j].u.cmpobj = NULL;
7085 j++;
7086 }
7087 dictReleaseIterator(di);
7088 }
7089 redisAssert(j == vectorlen);
7090
7091 /* Now it's time to load the right scores in the sorting vector */
7092 if (dontsort == 0) {
7093 for (j = 0; j < vectorlen; j++) {
7094 robj *byval;
7095 if (sortby) {
7096 /* lookup value to sort by */
7097 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7098 if (!byval) continue;
7099 } else {
7100 /* use object itself to sort by */
7101 byval = vector[j].obj;
7102 }
7103
7104 if (alpha) {
7105 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7106 } else {
7107 if (byval->encoding == REDIS_ENCODING_RAW) {
7108 vector[j].u.score = strtod(byval->ptr,NULL);
7109 } else if (byval->encoding == REDIS_ENCODING_INT) {
7110 /* Don't need to decode the object if it's
7111 * integer-encoded (the only encoding supported) so
7112 * far. We can just cast it */
7113 vector[j].u.score = (long)byval->ptr;
7114 } else {
7115 redisAssert(1 != 1);
7116 }
7117 }
7118
7119 /* when the object was retrieved using lookupKeyByPattern,
7120 * its refcount needs to be decreased. */
7121 if (sortby) {
7122 decrRefCount(byval);
7123 }
7124 }
7125 }
7126
7127 /* We are ready to sort the vector... perform a bit of sanity check
7128 * on the LIMIT option too. We'll use a partial version of quicksort. */
7129 start = (limit_start < 0) ? 0 : limit_start;
7130 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7131 if (start >= vectorlen) {
7132 start = vectorlen-1;
7133 end = vectorlen-2;
7134 }
7135 if (end >= vectorlen) end = vectorlen-1;
7136
7137 if (dontsort == 0) {
7138 server.sort_desc = desc;
7139 server.sort_alpha = alpha;
7140 server.sort_bypattern = sortby ? 1 : 0;
7141 if (sortby && (start != 0 || end != vectorlen-1))
7142 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7143 else
7144 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7145 }
7146
7147 /* Send command output to the output buffer, performing the specified
7148 * GET/DEL/INCR/DECR operations if any. */
7149 outputlen = getop ? getop*(end-start+1) : end-start+1;
7150 if (storekey == NULL) {
7151 /* STORE option not specified, sent the sorting result to client */
7152 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7153 for (j = start; j <= end; j++) {
7154 listNode *ln;
7155 listIter li;
7156
7157 if (!getop) addReplyBulk(c,vector[j].obj);
7158 listRewind(operations,&li);
7159 while((ln = listNext(&li))) {
7160 redisSortOperation *sop = ln->value;
7161 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7162 vector[j].obj);
7163
7164 if (sop->type == REDIS_SORT_GET) {
7165 if (!val) {
7166 addReply(c,shared.nullbulk);
7167 } else {
7168 addReplyBulk(c,val);
7169 decrRefCount(val);
7170 }
7171 } else {
7172 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7173 }
7174 }
7175 }
7176 } else {
7177 robj *listObject = createListObject();
7178 list *listPtr = (list*) listObject->ptr;
7179
7180 /* STORE option specified, set the sorting result as a List object */
7181 for (j = start; j <= end; j++) {
7182 listNode *ln;
7183 listIter li;
7184
7185 if (!getop) {
7186 listAddNodeTail(listPtr,vector[j].obj);
7187 incrRefCount(vector[j].obj);
7188 }
7189 listRewind(operations,&li);
7190 while((ln = listNext(&li))) {
7191 redisSortOperation *sop = ln->value;
7192 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7193 vector[j].obj);
7194
7195 if (sop->type == REDIS_SORT_GET) {
7196 if (!val) {
7197 listAddNodeTail(listPtr,createStringObject("",0));
7198 } else {
7199 /* We should do a incrRefCount on val because it is
7200 * added to the list, but also a decrRefCount because
7201 * it is returned by lookupKeyByPattern. This results
7202 * in doing nothing at all. */
7203 listAddNodeTail(listPtr,val);
7204 }
7205 } else {
7206 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7207 }
7208 }
7209 }
7210 if (dictReplace(c->db->dict,storekey,listObject)) {
7211 incrRefCount(storekey);
7212 }
7213 /* Note: we add 1 because the DB is dirty anyway since even if the
7214 * SORT result is empty a new key is set and maybe the old content
7215 * replaced. */
7216 server.dirty += 1+outputlen;
7217 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7218 }
7219
7220 /* Cleanup */
7221 decrRefCount(sortval);
7222 listRelease(operations);
7223 for (j = 0; j < vectorlen; j++) {
7224 if (alpha && vector[j].u.cmpobj)
7225 decrRefCount(vector[j].u.cmpobj);
7226 }
7227 zfree(vector);
7228 }
7229
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, 3.00P and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * output is a 20 digit number followed by 'B' and the terminator).
 * Values below 1024 are printed as exact byte counts, larger ones with
 * two decimals in the biggest unit that keeps the number below 1024.
 * Values too large even for the 'P' unit fall back to an exact byte
 * count, so 's' is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Let's hope we never need this */
        sprintf(s,"%lluB",n);
    }
}
7250
7251 /* Create the string returned by the INFO command. This is decoupled
7252 * by the INFO command itself as we need to report the same information
7253 * on memory corruption problems. */
7254 static sds genRedisInfoString(void) {
7255 sds info;
7256 time_t uptime = time(NULL)-server.stat_starttime;
7257 int j;
7258 char hmem[64];
7259
7260 bytesToHuman(hmem,zmalloc_used_memory());
7261 info = sdscatprintf(sdsempty(),
7262 "redis_version:%s\r\n"
7263 "redis_git_sha1:%s\r\n"
7264 "redis_git_dirty:%d\r\n"
7265 "arch_bits:%s\r\n"
7266 "multiplexing_api:%s\r\n"
7267 "process_id:%ld\r\n"
7268 "uptime_in_seconds:%ld\r\n"
7269 "uptime_in_days:%ld\r\n"
7270 "connected_clients:%d\r\n"
7271 "connected_slaves:%d\r\n"
7272 "blocked_clients:%d\r\n"
7273 "used_memory:%zu\r\n"
7274 "used_memory_human:%s\r\n"
7275 "changes_since_last_save:%lld\r\n"
7276 "bgsave_in_progress:%d\r\n"
7277 "last_save_time:%ld\r\n"
7278 "bgrewriteaof_in_progress:%d\r\n"
7279 "total_connections_received:%lld\r\n"
7280 "total_commands_processed:%lld\r\n"
7281 "expired_keys:%lld\r\n"
7282 "hash_max_zipmap_entries:%zu\r\n"
7283 "hash_max_zipmap_value:%zu\r\n"
7284 "pubsub_channels:%ld\r\n"
7285 "pubsub_patterns:%u\r\n"
7286 "vm_enabled:%d\r\n"
7287 "role:%s\r\n"
7288 ,REDIS_VERSION,
7289 REDIS_GIT_SHA1,
7290 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7291 (sizeof(long) == 8) ? "64" : "32",
7292 aeGetApiName(),
7293 (long) getpid(),
7294 uptime,
7295 uptime/(3600*24),
7296 listLength(server.clients)-listLength(server.slaves),
7297 listLength(server.slaves),
7298 server.blpop_blocked_clients,
7299 zmalloc_used_memory(),
7300 hmem,
7301 server.dirty,
7302 server.bgsavechildpid != -1,
7303 server.lastsave,
7304 server.bgrewritechildpid != -1,
7305 server.stat_numconnections,
7306 server.stat_numcommands,
7307 server.stat_expiredkeys,
7308 server.hash_max_zipmap_entries,
7309 server.hash_max_zipmap_value,
7310 dictSize(server.pubsub_channels),
7311 listLength(server.pubsub_patterns),
7312 server.vm_enabled != 0,
7313 server.masterhost == NULL ? "master" : "slave"
7314 );
7315 if (server.masterhost) {
7316 info = sdscatprintf(info,
7317 "master_host:%s\r\n"
7318 "master_port:%d\r\n"
7319 "master_link_status:%s\r\n"
7320 "master_last_io_seconds_ago:%d\r\n"
7321 ,server.masterhost,
7322 server.masterport,
7323 (server.replstate == REDIS_REPL_CONNECTED) ?
7324 "up" : "down",
7325 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7326 );
7327 }
7328 if (server.vm_enabled) {
7329 lockThreadedIO();
7330 info = sdscatprintf(info,
7331 "vm_conf_max_memory:%llu\r\n"
7332 "vm_conf_page_size:%llu\r\n"
7333 "vm_conf_pages:%llu\r\n"
7334 "vm_stats_used_pages:%llu\r\n"
7335 "vm_stats_swapped_objects:%llu\r\n"
7336 "vm_stats_swappin_count:%llu\r\n"
7337 "vm_stats_swappout_count:%llu\r\n"
7338 "vm_stats_io_newjobs_len:%lu\r\n"
7339 "vm_stats_io_processing_len:%lu\r\n"
7340 "vm_stats_io_processed_len:%lu\r\n"
7341 "vm_stats_io_active_threads:%lu\r\n"
7342 "vm_stats_blocked_clients:%lu\r\n"
7343 ,(unsigned long long) server.vm_max_memory,
7344 (unsigned long long) server.vm_page_size,
7345 (unsigned long long) server.vm_pages,
7346 (unsigned long long) server.vm_stats_used_pages,
7347 (unsigned long long) server.vm_stats_swapped_objects,
7348 (unsigned long long) server.vm_stats_swapins,
7349 (unsigned long long) server.vm_stats_swapouts,
7350 (unsigned long) listLength(server.io_newjobs),
7351 (unsigned long) listLength(server.io_processing),
7352 (unsigned long) listLength(server.io_processed),
7353 (unsigned long) server.io_active_threads,
7354 (unsigned long) server.vm_blocked_clients
7355 );
7356 unlockThreadedIO();
7357 }
7358 for (j = 0; j < server.dbnum; j++) {
7359 long long keys, vkeys;
7360
7361 keys = dictSize(server.db[j].dict);
7362 vkeys = dictSize(server.db[j].expires);
7363 if (keys || vkeys) {
7364 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7365 j, keys, vkeys);
7366 }
7367 }
7368 return info;
7369 }
7370
7371 static void infoCommand(redisClient *c) {
7372 sds info = genRedisInfoString();
7373 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7374 (unsigned long)sdslen(info)));
7375 addReplySds(c,info);
7376 addReply(c,shared.crlf);
7377 }
7378
7379 static void monitorCommand(redisClient *c) {
7380 /* ignore MONITOR if aleady slave or in monitor mode */
7381 if (c->flags & REDIS_SLAVE) return;
7382
7383 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7384 c->slaveseldb = 0;
7385 listAddNodeTail(server.monitors,c);
7386 addReply(c,shared.ok);
7387 }
7388
7389 /* ================================= Expire ================================= */
7390 static int removeExpire(redisDb *db, robj *key) {
7391 if (dictDelete(db->expires,key) == DICT_OK) {
7392 return 1;
7393 } else {
7394 return 0;
7395 }
7396 }
7397
7398 static int setExpire(redisDb *db, robj *key, time_t when) {
7399 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7400 return 0;
7401 } else {
7402 incrRefCount(key);
7403 return 1;
7404 }
7405 }
7406
7407 /* Return the expire time of the specified key, or -1 if no expire
7408 * is associated with this key (i.e. the key is non volatile) */
7409 static time_t getExpire(redisDb *db, robj *key) {
7410 dictEntry *de;
7411
7412 /* No expire? return ASAP */
7413 if (dictSize(db->expires) == 0 ||
7414 (de = dictFind(db->expires,key)) == NULL) return -1;
7415
7416 return (time_t) dictGetEntryVal(de);
7417 }
7418
7419 static int expireIfNeeded(redisDb *db, robj *key) {
7420 time_t when;
7421 dictEntry *de;
7422
7423 /* No expire? return ASAP */
7424 if (dictSize(db->expires) == 0 ||
7425 (de = dictFind(db->expires,key)) == NULL) return 0;
7426
7427 /* Lookup the expire */
7428 when = (time_t) dictGetEntryVal(de);
7429 if (time(NULL) <= when) return 0;
7430
7431 /* Delete the key */
7432 dictDelete(db->expires,key);
7433 server.stat_expiredkeys++;
7434 return dictDelete(db->dict,key) == DICT_OK;
7435 }
7436
7437 static int deleteIfVolatile(redisDb *db, robj *key) {
7438 dictEntry *de;
7439
7440 /* No expire? return ASAP */
7441 if (dictSize(db->expires) == 0 ||
7442 (de = dictFind(db->expires,key)) == NULL) return 0;
7443
7444 /* Delete the key */
7445 server.dirty++;
7446 server.stat_expiredkeys++;
7447 dictDelete(db->expires,key);
7448 return dictDelete(db->dict,key) == DICT_OK;
7449 }
7450
7451 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7452 dictEntry *de;
7453 time_t seconds;
7454
7455 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7456
7457 seconds -= offset;
7458
7459 de = dictFind(c->db->dict,key);
7460 if (de == NULL) {
7461 addReply(c,shared.czero);
7462 return;
7463 }
7464 if (seconds <= 0) {
7465 if (deleteKey(c->db,key)) server.dirty++;
7466 addReply(c, shared.cone);
7467 return;
7468 } else {
7469 time_t when = time(NULL)+seconds;
7470 if (setExpire(c->db,key,when)) {
7471 addReply(c,shared.cone);
7472 server.dirty++;
7473 } else {
7474 addReply(c,shared.czero);
7475 }
7476 return;
7477 }
7478 }
7479
7480 static void expireCommand(redisClient *c) {
7481 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7482 }
7483
7484 static void expireatCommand(redisClient *c) {
7485 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7486 }
7487
7488 static void ttlCommand(redisClient *c) {
7489 time_t expire;
7490 int ttl = -1;
7491
7492 expire = getExpire(c->db,c->argv[1]);
7493 if (expire != -1) {
7494 ttl = (int) (expire-time(NULL));
7495 if (ttl < 0) ttl = -1;
7496 }
7497 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7498 }
7499
7500 /* ================================ MULTI/EXEC ============================== */
7501
7502 /* Client state initialization for MULTI/EXEC */
7503 static void initClientMultiState(redisClient *c) {
7504 c->mstate.commands = NULL;
7505 c->mstate.count = 0;
7506 }
7507
7508 /* Release all the resources associated with MULTI/EXEC state */
7509 static void freeClientMultiState(redisClient *c) {
7510 int j;
7511
7512 for (j = 0; j < c->mstate.count; j++) {
7513 int i;
7514 multiCmd *mc = c->mstate.commands+j;
7515
7516 for (i = 0; i < mc->argc; i++)
7517 decrRefCount(mc->argv[i]);
7518 zfree(mc->argv);
7519 }
7520 zfree(c->mstate.commands);
7521 }
7522
7523 /* Add a new command into the MULTI commands queue */
7524 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7525 multiCmd *mc;
7526 int j;
7527
7528 c->mstate.commands = zrealloc(c->mstate.commands,
7529 sizeof(multiCmd)*(c->mstate.count+1));
7530 mc = c->mstate.commands+c->mstate.count;
7531 mc->cmd = cmd;
7532 mc->argc = c->argc;
7533 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7534 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7535 for (j = 0; j < c->argc; j++)
7536 incrRefCount(mc->argv[j]);
7537 c->mstate.count++;
7538 }
7539
7540 static void multiCommand(redisClient *c) {
7541 if (c->flags & REDIS_MULTI) {
7542 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7543 return;
7544 }
7545 c->flags |= REDIS_MULTI;
7546 addReply(c,shared.ok);
7547 }
7548
7549 static void discardCommand(redisClient *c) {
7550 if (!(c->flags & REDIS_MULTI)) {
7551 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7552 return;
7553 }
7554
7555 freeClientMultiState(c);
7556 initClientMultiState(c);
7557 c->flags &= (~REDIS_MULTI);
7558 addReply(c,shared.ok);
7559 }
7560
7561 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7562 * implememntation for more information. */
7563 static void execCommandReplicateMulti(redisClient *c) {
7564 struct redisCommand *cmd;
7565 robj *multistring = createStringObject("MULTI",5);
7566
7567 cmd = lookupCommand("multi");
7568 if (server.appendonly)
7569 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7570 if (listLength(server.slaves))
7571 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7572 decrRefCount(multistring);
7573 }
7574
7575 static void execCommand(redisClient *c) {
7576 int j;
7577 robj **orig_argv;
7578 int orig_argc;
7579
7580 if (!(c->flags & REDIS_MULTI)) {
7581 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7582 return;
7583 }
7584
7585 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7586 * A failed EXEC will return a multi bulk nil object. */
7587 if (c->flags & REDIS_DIRTY_CAS) {
7588 freeClientMultiState(c);
7589 initClientMultiState(c);
7590 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7591 unwatchAllKeys(c);
7592 addReply(c,shared.nullmultibulk);
7593 return;
7594 }
7595
7596 /* Replicate a MULTI request now that we are sure the block is executed.
7597 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7598 * both the AOF and the replication link will have the same consistency
7599 * and atomicity guarantees. */
7600 execCommandReplicateMulti(c);
7601
7602 /* Exec all the queued commands */
7603 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7604 orig_argv = c->argv;
7605 orig_argc = c->argc;
7606 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7607 for (j = 0; j < c->mstate.count; j++) {
7608 c->argc = c->mstate.commands[j].argc;
7609 c->argv = c->mstate.commands[j].argv;
7610 call(c,c->mstate.commands[j].cmd);
7611 }
7612 c->argv = orig_argv;
7613 c->argc = orig_argc;
7614 freeClientMultiState(c);
7615 initClientMultiState(c);
7616 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7617 /* Make sure the EXEC command is always replicated / AOF, since we
7618 * always send the MULTI command (we can't know beforehand if the
7619 * next operations will contain at least a modification to the DB). */
7620 server.dirty++;
7621 }
7622
7623 /* =========================== Blocking Operations ========================= */
7624
7625 /* Currently Redis blocking operations support is limited to list POP ops,
7626 * so the current implementation is not fully generic, but it is also not
7627 * completely specific so it will not require a rewrite to support new
7628 * kind of blocking operations in the future.
7629 *
7630 * Still it's important to note that list blocking operations can be already
7631 * used as a notification mechanism in order to implement other blocking
7632 * operations at application level, so there must be a very strong evidence
7633 * of usefulness and generality before new blocking operations are implemented.
7634 *
7635 * This is how the current blocking POP works, we use BLPOP as example:
7636 * - If the user calls BLPOP and the key exists and contains a non empty list
7637 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7638 * if there is not to block.
7639 * - If instead BLPOP is called and the key does not exist or the list is
7640 * empty we need to block. In order to do so we remove the notification for
7641 * new data to read in the client socket (so that we'll not serve new
7642 * requests if the blocking request is not served). Also we put the client
7643 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7644 * blocking for these keys.
7645 * - If a PUSH operation against a key with blocked clients waiting is
7646 * performed, we serve the first in the list: basically instead of pushing
7647 * the new element inside the list we return it to the (first / oldest)
7648 * blocking client, unblock the client, and remove it from the list.
7649 *
7650 * The above comment and the source code should be enough in order to understand
7651 * the implementation and modify / fix it later.
7652 */
7653
7654 /* Set a client in blocking mode for the specified key, with the specified
7655 * timeout */
7656 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7657 dictEntry *de;
7658 list *l;
7659 int j;
7660
7661 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7662 c->blocking_keys_num = numkeys;
7663 c->blockingto = timeout;
7664 for (j = 0; j < numkeys; j++) {
7665 /* Add the key in the client structure, to map clients -> keys */
7666 c->blocking_keys[j] = keys[j];
7667 incrRefCount(keys[j]);
7668
7669 /* And in the other "side", to map keys -> clients */
7670 de = dictFind(c->db->blocking_keys,keys[j]);
7671 if (de == NULL) {
7672 int retval;
7673
7674 /* For every key we take a list of clients blocked for it */
7675 l = listCreate();
7676 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7677 incrRefCount(keys[j]);
7678 assert(retval == DICT_OK);
7679 } else {
7680 l = dictGetEntryVal(de);
7681 }
7682 listAddNodeTail(l,c);
7683 }
7684 /* Mark the client as a blocked client */
7685 c->flags |= REDIS_BLOCKED;
7686 server.blpop_blocked_clients++;
7687 }
7688
7689 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7690 static void unblockClientWaitingData(redisClient *c) {
7691 dictEntry *de;
7692 list *l;
7693 int j;
7694
7695 assert(c->blocking_keys != NULL);
7696 /* The client may wait for multiple keys, so unblock it for every key. */
7697 for (j = 0; j < c->blocking_keys_num; j++) {
7698 /* Remove this client from the list of clients waiting for this key. */
7699 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7700 assert(de != NULL);
7701 l = dictGetEntryVal(de);
7702 listDelNode(l,listSearchKey(l,c));
7703 /* If the list is empty we need to remove it to avoid wasting memory */
7704 if (listLength(l) == 0)
7705 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7706 decrRefCount(c->blocking_keys[j]);
7707 }
7708 /* Cleanup the client structure */
7709 zfree(c->blocking_keys);
7710 c->blocking_keys = NULL;
7711 c->flags &= (~REDIS_BLOCKED);
7712 server.blpop_blocked_clients--;
7713 /* We want to process data if there is some command waiting
7714 * in the input buffer. Note that this is safe even if
7715 * unblockClientWaitingData() gets called from freeClient() because
7716 * freeClient() will be smart enough to call this function
7717 * *after* c->querybuf was set to NULL. */
7718 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7719 }
7720
7721 /* This should be called from any function PUSHing into lists.
7722 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7723 * 'ele' is the element pushed.
7724 *
7725 * If the function returns 0 there was no client waiting for a list push
7726 * against this key.
7727 *
7728 * If the function returns 1 there was a client waiting for a list push
7729 * against this key, the element was passed to this client thus it's not
7730 * needed to actually add it to the list and the caller should return asap. */
7731 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7732 struct dictEntry *de;
7733 redisClient *receiver;
7734 list *l;
7735 listNode *ln;
7736
7737 de = dictFind(c->db->blocking_keys,key);
7738 if (de == NULL) return 0;
7739 l = dictGetEntryVal(de);
7740 ln = listFirst(l);
7741 assert(ln != NULL);
7742 receiver = ln->value;
7743
7744 addReplySds(receiver,sdsnew("*2\r\n"));
7745 addReplyBulk(receiver,key);
7746 addReplyBulk(receiver,ele);
7747 unblockClientWaitingData(receiver);
7748 return 1;
7749 }
7750
7751 /* Blocking RPOP/LPOP */
7752 static void blockingPopGenericCommand(redisClient *c, int where) {
7753 robj *o;
7754 time_t timeout;
7755 int j;
7756
7757 for (j = 1; j < c->argc-1; j++) {
7758 o = lookupKeyWrite(c->db,c->argv[j]);
7759 if (o != NULL) {
7760 if (o->type != REDIS_LIST) {
7761 addReply(c,shared.wrongtypeerr);
7762 return;
7763 } else {
7764 list *list = o->ptr;
7765 if (listLength(list) != 0) {
7766 /* If the list contains elements fall back to the usual
7767 * non-blocking POP operation */
7768 robj *argv[2], **orig_argv;
7769 int orig_argc;
7770
7771 /* We need to alter the command arguments before to call
7772 * popGenericCommand() as the command takes a single key. */
7773 orig_argv = c->argv;
7774 orig_argc = c->argc;
7775 argv[1] = c->argv[j];
7776 c->argv = argv;
7777 c->argc = 2;
7778
7779 /* Also the return value is different, we need to output
7780 * the multi bulk reply header and the key name. The
7781 * "real" command will add the last element (the value)
7782 * for us. If this souds like an hack to you it's just
7783 * because it is... */
7784 addReplySds(c,sdsnew("*2\r\n"));
7785 addReplyBulk(c,argv[1]);
7786 popGenericCommand(c,where);
7787
7788 /* Fix the client structure with the original stuff */
7789 c->argv = orig_argv;
7790 c->argc = orig_argc;
7791 return;
7792 }
7793 }
7794 }
7795 }
7796 /* If the list is empty or the key does not exists we must block */
7797 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7798 if (timeout > 0) timeout += time(NULL);
7799 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7800 }
7801
7802 static void blpopCommand(redisClient *c) {
7803 blockingPopGenericCommand(c,REDIS_HEAD);
7804 }
7805
7806 static void brpopCommand(redisClient *c) {
7807 blockingPopGenericCommand(c,REDIS_TAIL);
7808 }
7809
7810 /* =============================== Replication ============================= */
7811
7812 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7813 ssize_t nwritten, ret = size;
7814 time_t start = time(NULL);
7815
7816 timeout++;
7817 while(size) {
7818 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7819 nwritten = write(fd,ptr,size);
7820 if (nwritten == -1) return -1;
7821 ptr += nwritten;
7822 size -= nwritten;
7823 }
7824 if ((time(NULL)-start) > timeout) {
7825 errno = ETIMEDOUT;
7826 return -1;
7827 }
7828 }
7829 return ret;
7830 }
7831
7832 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7833 ssize_t nread, totread = 0;
7834 time_t start = time(NULL);
7835
7836 timeout++;
7837 while(size) {
7838 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7839 nread = read(fd,ptr,size);
7840 if (nread == -1) return -1;
7841 ptr += nread;
7842 size -= nread;
7843 totread += nread;
7844 }
7845 if ((time(NULL)-start) > timeout) {
7846 errno = ETIMEDOUT;
7847 return -1;
7848 }
7849 }
7850 return totread;
7851 }
7852
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. 'timeout' is applied by syncRead() to every single byte.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * before it is overwritten with the terminator but is still included in the
 * returned count. Reading also stops after size-1 bytes have been stored, so
 * the buffer is never overrun and is always nul terminated when size > 0.
 *
 * Returns the number of bytes stored, or -1 if syncRead() fails. When
 * size <= 0 nothing is read or written and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* reserve room for the nul terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One more byte of the buffer is used: without this decrement a
         * line longer than the buffer would overflow it. */
        size--;
    }
    return nread;
}
7873
7874 static void syncCommand(redisClient *c) {
7875 /* ignore SYNC if aleady slave or in monitor mode */
7876 if (c->flags & REDIS_SLAVE) return;
7877
7878 /* SYNC can't be issued when the server has pending data to send to
7879 * the client about already issued commands. We need a fresh reply
7880 * buffer registering the differences between the BGSAVE and the current
7881 * dataset, so that we can copy to other slaves if needed. */
7882 if (listLength(c->reply) != 0) {
7883 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7884 return;
7885 }
7886
7887 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7888 /* Here we need to check if there is a background saving operation
7889 * in progress, or if it is required to start one */
7890 if (server.bgsavechildpid != -1) {
7891 /* Ok a background save is in progress. Let's check if it is a good
7892 * one for replication, i.e. if there is another slave that is
7893 * registering differences since the server forked to save */
7894 redisClient *slave;
7895 listNode *ln;
7896 listIter li;
7897
7898 listRewind(server.slaves,&li);
7899 while((ln = listNext(&li))) {
7900 slave = ln->value;
7901 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7902 }
7903 if (ln) {
7904 /* Perfect, the server is already registering differences for
7905 * another slave. Set the right state, and copy the buffer. */
7906 listRelease(c->reply);
7907 c->reply = listDup(slave->reply);
7908 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7909 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7910 } else {
7911 /* No way, we need to wait for the next BGSAVE in order to
7912 * register differences */
7913 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7914 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7915 }
7916 } else {
7917 /* Ok we don't have a BGSAVE in progress, let's start one */
7918 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7919 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7920 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7921 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7922 return;
7923 }
7924 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7925 }
7926 c->repldbfd = -1;
7927 c->flags |= REDIS_SLAVE;
7928 c->slaveseldb = 0;
7929 listAddNodeTail(server.slaves,c);
7930 return;
7931 }
7932
7933 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7934 redisClient *slave = privdata;
7935 REDIS_NOTUSED(el);
7936 REDIS_NOTUSED(mask);
7937 char buf[REDIS_IOBUF_LEN];
7938 ssize_t nwritten, buflen;
7939
7940 if (slave->repldboff == 0) {
7941 /* Write the bulk write count before to transfer the DB. In theory here
7942 * we don't know how much room there is in the output buffer of the
7943 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7944 * operations) will never be smaller than the few bytes we need. */
7945 sds bulkcount;
7946
7947 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7948 slave->repldbsize);
7949 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7950 {
7951 sdsfree(bulkcount);
7952 freeClient(slave);
7953 return;
7954 }
7955 sdsfree(bulkcount);
7956 }
7957 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7958 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7959 if (buflen <= 0) {
7960 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7961 (buflen == 0) ? "premature EOF" : strerror(errno));
7962 freeClient(slave);
7963 return;
7964 }
7965 if ((nwritten = write(fd,buf,buflen)) == -1) {
7966 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7967 strerror(errno));
7968 freeClient(slave);
7969 return;
7970 }
7971 slave->repldboff += nwritten;
7972 if (slave->repldboff == slave->repldbsize) {
7973 close(slave->repldbfd);
7974 slave->repldbfd = -1;
7975 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7976 slave->replstate = REDIS_REPL_ONLINE;
7977 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7978 sendReplyToClient, slave) == AE_ERR) {
7979 freeClient(slave);
7980 return;
7981 }
7982 addReplySds(slave,sdsempty());
7983 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7984 }
7985 }
7986
7987 /* This function is called at the end of every backgrond saving.
7988 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7989 * otherwise REDIS_ERR is passed to the function.
7990 *
7991 * The goal of this function is to handle slaves waiting for a successful
7992 * background saving in order to perform non-blocking synchronization. */
7993 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7994 listNode *ln;
7995 int startbgsave = 0;
7996 listIter li;
7997
7998 listRewind(server.slaves,&li);
7999 while((ln = listNext(&li))) {
8000 redisClient *slave = ln->value;
8001
8002 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8003 startbgsave = 1;
8004 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8005 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8006 struct redis_stat buf;
8007
8008 if (bgsaveerr != REDIS_OK) {
8009 freeClient(slave);
8010 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8011 continue;
8012 }
8013 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8014 redis_fstat(slave->repldbfd,&buf) == -1) {
8015 freeClient(slave);
8016 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8017 continue;
8018 }
8019 slave->repldboff = 0;
8020 slave->repldbsize = buf.st_size;
8021 slave->replstate = REDIS_REPL_SEND_BULK;
8022 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8023 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8024 freeClient(slave);
8025 continue;
8026 }
8027 }
8028 }
8029 if (startbgsave) {
8030 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8031 listIter li;
8032
8033 listRewind(server.slaves,&li);
8034 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8035 while((ln = listNext(&li))) {
8036 redisClient *slave = ln->value;
8037
8038 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8039 freeClient(slave);
8040 }
8041 }
8042 }
8043 }
8044
8045 static int syncWithMaster(void) {
8046 char buf[1024], tmpfile[256], authcmd[1024];
8047 long dumpsize;
8048 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8049 int dfd, maxtries = 5;
8050
8051 if (fd == -1) {
8052 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8053 strerror(errno));
8054 return REDIS_ERR;
8055 }
8056
8057 /* AUTH with the master if required. */
8058 if(server.masterauth) {
8059 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8060 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8061 close(fd);
8062 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8063 strerror(errno));
8064 return REDIS_ERR;
8065 }
8066 /* Read the AUTH result. */
8067 if (syncReadLine(fd,buf,1024,3600) == -1) {
8068 close(fd);
8069 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8070 strerror(errno));
8071 return REDIS_ERR;
8072 }
8073 if (buf[0] != '+') {
8074 close(fd);
8075 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8076 return REDIS_ERR;
8077 }
8078 }
8079
8080 /* Issue the SYNC command */
8081 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8082 close(fd);
8083 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8084 strerror(errno));
8085 return REDIS_ERR;
8086 }
8087 /* Read the bulk write count */
8088 if (syncReadLine(fd,buf,1024,3600) == -1) {
8089 close(fd);
8090 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8091 strerror(errno));
8092 return REDIS_ERR;
8093 }
8094 if (buf[0] != '$') {
8095 close(fd);
8096 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8097 return REDIS_ERR;
8098 }
8099 dumpsize = strtol(buf+1,NULL,10);
8100 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8101 /* Read the bulk write data on a temp file */
8102 while(maxtries--) {
8103 snprintf(tmpfile,256,
8104 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8105 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8106 if (dfd != -1) break;
8107 sleep(1);
8108 }
8109 if (dfd == -1) {
8110 close(fd);
8111 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8112 return REDIS_ERR;
8113 }
8114 while(dumpsize) {
8115 int nread, nwritten;
8116
8117 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8118 if (nread == -1) {
8119 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8120 strerror(errno));
8121 close(fd);
8122 close(dfd);
8123 return REDIS_ERR;
8124 }
8125 nwritten = write(dfd,buf,nread);
8126 if (nwritten == -1) {
8127 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8128 close(fd);
8129 close(dfd);
8130 return REDIS_ERR;
8131 }
8132 dumpsize -= nread;
8133 }
8134 close(dfd);
8135 if (rename(tmpfile,server.dbfilename) == -1) {
8136 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8137 unlink(tmpfile);
8138 close(fd);
8139 return REDIS_ERR;
8140 }
8141 emptyDb();
8142 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8143 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8144 close(fd);
8145 return REDIS_ERR;
8146 }
8147 server.master = createClient(fd);
8148 server.master->flags |= REDIS_MASTER;
8149 server.master->authenticated = 1;
8150 server.replstate = REDIS_REPL_CONNECTED;
8151 return REDIS_OK;
8152 }
8153
8154 static void slaveofCommand(redisClient *c) {
8155 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8156 !strcasecmp(c->argv[2]->ptr,"one")) {
8157 if (server.masterhost) {
8158 sdsfree(server.masterhost);
8159 server.masterhost = NULL;
8160 if (server.master) freeClient(server.master);
8161 server.replstate = REDIS_REPL_NONE;
8162 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8163 }
8164 } else {
8165 sdsfree(server.masterhost);
8166 server.masterhost = sdsdup(c->argv[1]->ptr);
8167 server.masterport = atoi(c->argv[2]->ptr);
8168 if (server.master) freeClient(server.master);
8169 server.replstate = REDIS_REPL_CONNECT;
8170 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8171 server.masterhost, server.masterport);
8172 }
8173 addReply(c,shared.ok);
8174 }
8175
8176 /* ============================ Maxmemory directive ======================== */
8177
8178 /* Try to free one object form the pre-allocated objects free list.
8179 * This is useful under low mem conditions as by default we take 1 million
8180 * free objects allocated. On success REDIS_OK is returned, otherwise
8181 * REDIS_ERR. */
8182 static int tryFreeOneObjectFromFreelist(void) {
8183 robj *o;
8184
8185 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8186 if (listLength(server.objfreelist)) {
8187 listNode *head = listFirst(server.objfreelist);
8188 o = listNodeValue(head);
8189 listDelNode(server.objfreelist,head);
8190 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8191 zfree(o);
8192 return REDIS_OK;
8193 } else {
8194 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8195 return REDIS_ERR;
8196 }
8197 }
8198
8199 /* This function gets called when 'maxmemory' is set on the config file to limit
8200 * the max memory used by the server, and we are out of memory.
8201 * This function will try to, in order:
8202 *
8203 * - Free objects from the free list
8204 * - Try to remove keys with an EXPIRE set
8205 *
8206 * It is not possible to free enough memory to reach used-memory < maxmemory
8207 * the server will start refusing commands that will enlarge even more the
8208 * memory usage.
8209 */
8210 static void freeMemoryIfNeeded(void) {
8211 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8212 int j, k, freed = 0;
8213
8214 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8215 for (j = 0; j < server.dbnum; j++) {
8216 int minttl = -1;
8217 robj *minkey = NULL;
8218 struct dictEntry *de;
8219
8220 if (dictSize(server.db[j].expires)) {
8221 freed = 1;
8222 /* From a sample of three keys drop the one nearest to
8223 * the natural expire */
8224 for (k = 0; k < 3; k++) {
8225 time_t t;
8226
8227 de = dictGetRandomKey(server.db[j].expires);
8228 t = (time_t) dictGetEntryVal(de);
8229 if (minttl == -1 || t < minttl) {
8230 minkey = dictGetEntryKey(de);
8231 minttl = t;
8232 }
8233 }
8234 deleteKey(server.db+j,minkey);
8235 }
8236 }
8237 if (!freed) return; /* nothing to free... */
8238 }
8239 }
8240
8241 /* ============================== Append Only file ========================== */
8242
8243 /* Write the append only file buffer on disk.
8244 *
8245 * Since we are required to write the AOF before replying to the client,
8246 * and the only way the client socket can get a write is entering when the
8247 * the event loop, we accumulate all the AOF writes in a memory
8248 * buffer and write it on disk using this function just before entering
8249 * the event loop again. */
8250 static void flushAppendOnlyFile(void) {
8251 time_t now;
8252 ssize_t nwritten;
8253
8254 if (sdslen(server.aofbuf) == 0) return;
8255
8256 /* We want to perform a single write. This should be guaranteed atomic
8257 * at least if the filesystem we are writing is a real physical one.
8258 * While this will save us against the server being killed I don't think
8259 * there is much to do about the whole server stopping for power problems
8260 * or alike */
8261 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8262 if (nwritten != (signed)sdslen(server.aofbuf)) {
8263 /* Ooops, we are in troubles. The best thing to do for now is
8264 * aborting instead of giving the illusion that everything is
8265 * working as expected. */
8266 if (nwritten == -1) {
8267 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8268 } else {
8269 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8270 }
8271 exit(1);
8272 }
8273 sdsfree(server.aofbuf);
8274 server.aofbuf = sdsempty();
8275
8276 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8277 * childs performing heavy I/O on disk. */
8278 if (server.no_appendfsync_on_rewrite &&
8279 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8280 return;
8281 /* Fsync if needed */
8282 now = time(NULL);
8283 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8284 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8285 now-server.lastfsync > 1))
8286 {
8287 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8288 * flushing metadata. */
8289 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8290 server.lastfsync = now;
8291 }
8292 }
8293
8294 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8295 int j;
8296 buf = sdscatprintf(buf,"*%d\r\n",argc);
8297 for (j = 0; j < argc; j++) {
8298 robj *o = getDecodedObject(argv[j]);
8299 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8300 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8301 buf = sdscatlen(buf,"\r\n",2);
8302 decrRefCount(o);
8303 }
8304 return buf;
8305 }
8306
8307 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8308 int argc = 3;
8309 long when;
8310 robj *argv[3];
8311
8312 /* Make sure we can use strtol */
8313 seconds = getDecodedObject(seconds);
8314 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8315 decrRefCount(seconds);
8316
8317 argv[0] = createStringObject("EXPIREAT",8);
8318 argv[1] = key;
8319 argv[2] = createObject(REDIS_STRING,
8320 sdscatprintf(sdsempty(),"%ld",when));
8321 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8322 decrRefCount(argv[0]);
8323 decrRefCount(argv[2]);
8324 return buf;
8325 }
8326
8327 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8328 sds buf = sdsempty();
8329 robj *tmpargv[3];
8330
8331 /* The DB this command was targetting is not the same as the last command
8332 * we appendend. To issue a SELECT command is needed. */
8333 if (dictid != server.appendseldb) {
8334 char seldb[64];
8335
8336 snprintf(seldb,sizeof(seldb),"%d",dictid);
8337 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8338 (unsigned long)strlen(seldb),seldb);
8339 server.appendseldb = dictid;
8340 }
8341
8342 if (cmd->proc == expireCommand) {
8343 /* Translate EXPIRE into EXPIREAT */
8344 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8345 } else if (cmd->proc == setexCommand) {
8346 /* Translate SETEX to SET and EXPIREAT */
8347 tmpargv[0] = createStringObject("SET",3);
8348 tmpargv[1] = argv[1];
8349 tmpargv[2] = argv[3];
8350 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8351 decrRefCount(tmpargv[0]);
8352 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8353 } else {
8354 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8355 }
8356
8357 /* Append to the AOF buffer. This will be flushed on disk just before
8358 * of re-entering the event loop, so before the client will get a
8359 * positive reply about the operation performed. */
8360 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8361
8362 /* If a background append only file rewriting is in progress we want to
8363 * accumulate the differences between the child DB and the current one
8364 * in a buffer, so that when the child process will do its work we
8365 * can append the differences to the new append only file. */
8366 if (server.bgrewritechildpid != -1)
8367 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8368
8369 sdsfree(buf);
8370 }
8371
8372 /* In Redis commands are always executed in the context of a client, so in
8373 * order to load the append only file we need to create a fake client. */
8374 static struct redisClient *createFakeClient(void) {
8375 struct redisClient *c = zmalloc(sizeof(*c));
8376
8377 selectDb(c,0);
8378 c->fd = -1;
8379 c->querybuf = sdsempty();
8380 c->argc = 0;
8381 c->argv = NULL;
8382 c->flags = 0;
8383 /* We set the fake client as a slave waiting for the synchronization
8384 * so that Redis will not try to send replies to this client. */
8385 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8386 c->reply = listCreate();
8387 listSetFreeMethod(c->reply,decrRefCount);
8388 listSetDupMethod(c->reply,dupClientReplyValue);
8389 initClientMultiState(c);
8390 return c;
8391 }
8392
8393 static void freeFakeClient(struct redisClient *c) {
8394 sdsfree(c->querybuf);
8395 listRelease(c->reply);
8396 freeClientMultiState(c);
8397 zfree(c);
8398 }
8399
8400 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8401 * error (the append only file is zero-length) REDIS_ERR is returned. On
8402 * fatal error an error message is logged and the program exists. */
8403 int loadAppendOnlyFile(char *filename) {
8404 struct redisClient *fakeClient;
8405 FILE *fp = fopen(filename,"r");
8406 struct redis_stat sb;
8407 unsigned long long loadedkeys = 0;
8408 int appendonly = server.appendonly;
8409
8410 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8411 return REDIS_ERR;
8412
8413 if (fp == NULL) {
8414 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8415 exit(1);
8416 }
8417
8418 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8419 * to the same file we're about to read. */
8420 server.appendonly = 0;
8421
8422 fakeClient = createFakeClient();
8423 while(1) {
8424 int argc, j;
8425 unsigned long len;
8426 robj **argv;
8427 char buf[128];
8428 sds argsds;
8429 struct redisCommand *cmd;
8430
8431 if (fgets(buf,sizeof(buf),fp) == NULL) {
8432 if (feof(fp))
8433 break;
8434 else
8435 goto readerr;
8436 }
8437 if (buf[0] != '*') goto fmterr;
8438 argc = atoi(buf+1);
8439 argv = zmalloc(sizeof(robj*)*argc);
8440 for (j = 0; j < argc; j++) {
8441 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8442 if (buf[0] != '$') goto fmterr;
8443 len = strtol(buf+1,NULL,10);
8444 argsds = sdsnewlen(NULL,len);
8445 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8446 argv[j] = createObject(REDIS_STRING,argsds);
8447 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8448 }
8449
8450 /* Command lookup */
8451 cmd = lookupCommand(argv[0]->ptr);
8452 if (!cmd) {
8453 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8454 exit(1);
8455 }
8456 /* Try object encoding */
8457 if (cmd->flags & REDIS_CMD_BULK)
8458 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8459 /* Run the command in the context of a fake client */
8460 fakeClient->argc = argc;
8461 fakeClient->argv = argv;
8462 cmd->proc(fakeClient);
8463 /* Discard the reply objects list from the fake client */
8464 while(listLength(fakeClient->reply))
8465 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8466 /* Clean up, ready for the next command */
8467 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8468 zfree(argv);
8469 /* Handle swapping while loading big datasets when VM is on */
8470 loadedkeys++;
8471 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8472 while (zmalloc_used_memory() > server.vm_max_memory) {
8473 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8474 }
8475 }
8476 }
8477
8478 /* This point can only be reached when EOF is reached without errors.
8479 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8480 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8481
8482 fclose(fp);
8483 freeFakeClient(fakeClient);
8484 server.appendonly = appendonly;
8485 return REDIS_OK;
8486
8487 readerr:
8488 if (feof(fp)) {
8489 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8490 } else {
8491 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8492 }
8493 exit(1);
8494 fmterr:
8495 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8496 exit(1);
8497 }
8498
8499 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8500 static int fwriteBulkObject(FILE *fp, robj *obj) {
8501 char buf[128];
8502 int decrrc = 0;
8503
8504 /* Avoid the incr/decr ref count business if possible to help
8505 * copy-on-write (we are often in a child process when this function
8506 * is called).
8507 * Also makes sure that key objects don't get incrRefCount-ed when VM
8508 * is enabled */
8509 if (obj->encoding != REDIS_ENCODING_RAW) {
8510 obj = getDecodedObject(obj);
8511 decrrc = 1;
8512 }
8513 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8514 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8515 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8516 goto err;
8517 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8518 if (decrrc) decrRefCount(obj);
8519 return 1;
8520 err:
8521 if (decrrc) decrRefCount(obj);
8522 return 0;
8523 }
8524
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n. The payload is skipped when 'len' is
 * zero. Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8536
/* Write the double 'd' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the value formatted with
 * "%.17g". Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which must
     * not be counted in the advertised length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
8547
/* Write the long 'l' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the decimal
 * representation of the value. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which must
     * not be counted in the advertised length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
8558
8559 /* Write a sequence of commands able to fully rebuild the dataset into
8560 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8561 static int rewriteAppendOnlyFile(char *filename) {
8562 dictIterator *di = NULL;
8563 dictEntry *de;
8564 FILE *fp;
8565 char tmpfile[256];
8566 int j;
8567 time_t now = time(NULL);
8568
8569 /* Note that we have to use a different temp name here compared to the
8570 * one used by rewriteAppendOnlyFileBackground() function. */
8571 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8572 fp = fopen(tmpfile,"w");
8573 if (!fp) {
8574 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8575 return REDIS_ERR;
8576 }
8577 for (j = 0; j < server.dbnum; j++) {
8578 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8579 redisDb *db = server.db+j;
8580 dict *d = db->dict;
8581 if (dictSize(d) == 0) continue;
8582 di = dictGetIterator(d);
8583 if (!di) {
8584 fclose(fp);
8585 return REDIS_ERR;
8586 }
8587
8588 /* SELECT the new DB */
8589 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8590 if (fwriteBulkLong(fp,j) == 0) goto werr;
8591
8592 /* Iterate this DB writing every entry */
8593 while((de = dictNext(di)) != NULL) {
8594 robj *key, *o;
8595 time_t expiretime;
8596 int swapped;
8597
8598 key = dictGetEntryKey(de);
8599 /* If the value for this key is swapped, load a preview in memory.
8600 * We use a "swapped" flag to remember if we need to free the
8601 * value object instead to just increment the ref count anyway
8602 * in order to avoid copy-on-write of pages if we are forked() */
8603 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8604 key->storage == REDIS_VM_SWAPPING) {
8605 o = dictGetEntryVal(de);
8606 swapped = 0;
8607 } else {
8608 o = vmPreviewObject(key);
8609 swapped = 1;
8610 }
8611 expiretime = getExpire(db,key);
8612
8613 /* Save the key and associated value */
8614 if (o->type == REDIS_STRING) {
8615 /* Emit a SET command */
8616 char cmd[]="*3\r\n$3\r\nSET\r\n";
8617 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8618 /* Key and value */
8619 if (fwriteBulkObject(fp,key) == 0) goto werr;
8620 if (fwriteBulkObject(fp,o) == 0) goto werr;
8621 } else if (o->type == REDIS_LIST) {
8622 /* Emit the RPUSHes needed to rebuild the list */
8623 list *list = o->ptr;
8624 listNode *ln;
8625 listIter li;
8626
8627 listRewind(list,&li);
8628 while((ln = listNext(&li))) {
8629 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8630 robj *eleobj = listNodeValue(ln);
8631
8632 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8633 if (fwriteBulkObject(fp,key) == 0) goto werr;
8634 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8635 }
8636 } else if (o->type == REDIS_SET) {
8637 /* Emit the SADDs needed to rebuild the set */
8638 dict *set = o->ptr;
8639 dictIterator *di = dictGetIterator(set);
8640 dictEntry *de;
8641
8642 while((de = dictNext(di)) != NULL) {
8643 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8644 robj *eleobj = dictGetEntryKey(de);
8645
8646 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8647 if (fwriteBulkObject(fp,key) == 0) goto werr;
8648 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8649 }
8650 dictReleaseIterator(di);
8651 } else if (o->type == REDIS_ZSET) {
8652 /* Emit the ZADDs needed to rebuild the sorted set */
8653 zset *zs = o->ptr;
8654 dictIterator *di = dictGetIterator(zs->dict);
8655 dictEntry *de;
8656
8657 while((de = dictNext(di)) != NULL) {
8658 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8659 robj *eleobj = dictGetEntryKey(de);
8660 double *score = dictGetEntryVal(de);
8661
8662 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8663 if (fwriteBulkObject(fp,key) == 0) goto werr;
8664 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8665 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8666 }
8667 dictReleaseIterator(di);
8668 } else if (o->type == REDIS_HASH) {
8669 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8670
8671 /* Emit the HSETs needed to rebuild the hash */
8672 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8673 unsigned char *p = zipmapRewind(o->ptr);
8674 unsigned char *field, *val;
8675 unsigned int flen, vlen;
8676
8677 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8678 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8679 if (fwriteBulkObject(fp,key) == 0) goto werr;
8680 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8681 return -1;
8682 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8683 return -1;
8684 }
8685 } else {
8686 dictIterator *di = dictGetIterator(o->ptr);
8687 dictEntry *de;
8688
8689 while((de = dictNext(di)) != NULL) {
8690 robj *field = dictGetEntryKey(de);
8691 robj *val = dictGetEntryVal(de);
8692
8693 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8694 if (fwriteBulkObject(fp,key) == 0) goto werr;
8695 if (fwriteBulkObject(fp,field) == -1) return -1;
8696 if (fwriteBulkObject(fp,val) == -1) return -1;
8697 }
8698 dictReleaseIterator(di);
8699 }
8700 } else {
8701 redisPanic("Unknown object type");
8702 }
8703 /* Save the expire time */
8704 if (expiretime != -1) {
8705 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8706 /* If this key is already expired skip it */
8707 if (expiretime < now) continue;
8708 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8709 if (fwriteBulkObject(fp,key) == 0) goto werr;
8710 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8711 }
8712 if (swapped) decrRefCount(o);
8713 }
8714 dictReleaseIterator(di);
8715 }
8716
8717 /* Make sure data will not remain on the OS's output buffers */
8718 fflush(fp);
8719 aof_fsync(fileno(fp));
8720 fclose(fp);
8721
8722 /* Use RENAME to make sure the DB file is changed atomically only
8723 * if the generate DB file is ok. */
8724 if (rename(tmpfile,filename) == -1) {
8725 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8726 unlink(tmpfile);
8727 return REDIS_ERR;
8728 }
8729 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8730 return REDIS_OK;
8731
8732 werr:
8733 fclose(fp);
8734 unlink(tmpfile);
8735 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8736 if (di) dictReleaseIterator(di);
8737 return REDIS_ERR;
8738 }
8739
8740 /* This is how rewriting of the append only file in background works:
8741 *
8742 * 1) The user calls BGREWRITEAOF
8743 * 2) Redis calls this function, that forks():
8744 * 2a) the child rewrite the append only file in a temp file.
8745 * 2b) the parent accumulates differences in server.bgrewritebuf.
8746 * 3) When the child finished '2a' exists.
8747 * 4) The parent will trap the exit code, if it's OK, will append the
8748 * data accumulated into server.bgrewritebuf into the temp file, and
8749 * finally will rename(2) the temp file in the actual file name.
8750 * The the new file is reopened as the new append only file. Profit!
8751 */
8752 static int rewriteAppendOnlyFileBackground(void) {
8753 pid_t childpid;
8754
8755 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8756 if (server.vm_enabled) waitEmptyIOJobsQueue();
8757 if ((childpid = fork()) == 0) {
8758 /* Child */
8759 char tmpfile[256];
8760
8761 if (server.vm_enabled) vmReopenSwapFile();
8762 close(server.fd);
8763 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8764 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8765 _exit(0);
8766 } else {
8767 _exit(1);
8768 }
8769 } else {
8770 /* Parent */
8771 if (childpid == -1) {
8772 redisLog(REDIS_WARNING,
8773 "Can't rewrite append only file in background: fork: %s",
8774 strerror(errno));
8775 return REDIS_ERR;
8776 }
8777 redisLog(REDIS_NOTICE,
8778 "Background append only file rewriting started by pid %d",childpid);
8779 server.bgrewritechildpid = childpid;
8780 updateDictResizePolicy();
8781 /* We set appendseldb to -1 in order to force the next call to the
8782 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8783 * accumulated by the parent into server.bgrewritebuf will start
8784 * with a SELECT statement and it will be safe to merge. */
8785 server.appendseldb = -1;
8786 return REDIS_OK;
8787 }
8788 return REDIS_OK; /* unreached */
8789 }
8790
8791 static void bgrewriteaofCommand(redisClient *c) {
8792 if (server.bgrewritechildpid != -1) {
8793 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8794 return;
8795 }
8796 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8797 char *status = "+Background append only file rewriting started\r\n";
8798 addReplySds(c,sdsnew(status));
8799 } else {
8800 addReply(c,shared.err);
8801 }
8802 }
8803
/* Remove the temp file used by the background AOF rewrite child having
 * pid 'childpid'. The name is built with the same pattern used by
 * rewriteAppendOnlyFileBackground(). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8810
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8831
8832 /* Called when the user switches from "appendonly yes" to "appendonly no"
8833 * at runtime using the CONFIG command. */
8834 static void stopAppendOnly(void) {
8835 flushAppendOnlyFile();
8836 aof_fsync(server.appendfd);
8837 close(server.appendfd);
8838
8839 server.appendfd = -1;
8840 server.appendseldb = -1;
8841 server.appendonly = 0;
8842 /* rewrite operation in progress? kill it, wait child exit */
8843 if (server.bgsavechildpid != -1) {
8844 int statloc;
8845
8846 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8847 wait3(&statloc,0,NULL);
8848 /* reset the buffer accumulating changes while the child saves */
8849 sdsfree(server.bgrewritebuf);
8850 server.bgrewritebuf = sdsempty();
8851 server.bgsavechildpid = -1;
8852 }
8853 }
8854
8855 /* Called when the user switches from "appendonly no" to "appendonly yes"
8856 * at runtime using the CONFIG command. */
8857 static int startAppendOnly(void) {
8858 server.appendonly = 1;
8859 server.lastfsync = time(NULL);
8860 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8861 if (server.appendfd == -1) {
8862 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8863 return REDIS_ERR;
8864 }
8865 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8866 server.appendonly = 0;
8867 close(server.appendfd);
8868 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8869 return REDIS_ERR;
8870 }
8871 return REDIS_OK;
8872 }
8873
8874 /* =================== Virtual Memory - Blocking Side ====================== */
8875
8876 static void vmInit(void) {
8877 off_t totsize;
8878 int pipefds[2];
8879 size_t stacksize;
8880 struct flock fl;
8881
8882 if (server.vm_max_threads != 0)
8883 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8884
8885 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8886 /* Try to open the old swap file, otherwise create it */
8887 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8888 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8889 }
8890 if (server.vm_fp == NULL) {
8891 redisLog(REDIS_WARNING,
8892 "Can't open the swap file: %s. Exiting.",
8893 strerror(errno));
8894 exit(1);
8895 }
8896 server.vm_fd = fileno(server.vm_fp);
8897 /* Lock the swap file for writing, this is useful in order to avoid
8898 * another instance to use the same swap file for a config error. */
8899 fl.l_type = F_WRLCK;
8900 fl.l_whence = SEEK_SET;
8901 fl.l_start = fl.l_len = 0;
8902 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8903 redisLog(REDIS_WARNING,
8904 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8905 exit(1);
8906 }
8907 /* Initialize */
8908 server.vm_next_page = 0;
8909 server.vm_near_pages = 0;
8910 server.vm_stats_used_pages = 0;
8911 server.vm_stats_swapped_objects = 0;
8912 server.vm_stats_swapouts = 0;
8913 server.vm_stats_swapins = 0;
8914 totsize = server.vm_pages*server.vm_page_size;
8915 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8916 if (ftruncate(server.vm_fd,totsize) == -1) {
8917 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8918 strerror(errno));
8919 exit(1);
8920 } else {
8921 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8922 }
8923 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8924 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8925 (long long) (server.vm_pages+7)/8, server.vm_pages);
8926 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8927
8928 /* Initialize threaded I/O (used by Virtual Memory) */
8929 server.io_newjobs = listCreate();
8930 server.io_processing = listCreate();
8931 server.io_processed = listCreate();
8932 server.io_ready_clients = listCreate();
8933 pthread_mutex_init(&server.io_mutex,NULL);
8934 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8935 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8936 server.io_active_threads = 0;
8937 if (pipe(pipefds) == -1) {
8938 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8939 ,strerror(errno));
8940 exit(1);
8941 }
8942 server.io_ready_pipe_read = pipefds[0];
8943 server.io_ready_pipe_write = pipefds[1];
8944 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8945 /* LZF requires a lot of stack */
8946 pthread_attr_init(&server.io_threads_attr);
8947 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8948 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8949 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8950 /* Listen for events in the threaded I/O pipe */
8951 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8952 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8953 oom("creating file event");
8954 }
8955
8956 /* Mark the page as used */
8957 static void vmMarkPageUsed(off_t page) {
8958 off_t byte = page/8;
8959 int bit = page&7;
8960 redisAssert(vmFreePage(page) == 1);
8961 server.vm_bitmap[byte] |= 1<<bit;
8962 }
8963
8964 /* Mark N contiguous pages as used, with 'page' being the first. */
8965 static void vmMarkPagesUsed(off_t page, off_t count) {
8966 off_t j;
8967
8968 for (j = 0; j < count; j++)
8969 vmMarkPageUsed(page+j);
8970 server.vm_stats_used_pages += count;
8971 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8972 (long long)count, (long long)page);
8973 }
8974
8975 /* Mark the page as free */
8976 static void vmMarkPageFree(off_t page) {
8977 off_t byte = page/8;
8978 int bit = page&7;
8979 redisAssert(vmFreePage(page) == 0);
8980 server.vm_bitmap[byte] &= ~(1<<bit);
8981 }
8982
8983 /* Mark N contiguous pages as free, with 'page' being the first. */
8984 static void vmMarkPagesFree(off_t page, off_t count) {
8985 off_t j;
8986
8987 for (j = 0; j < count; j++)
8988 vmMarkPageFree(page+j);
8989 server.vm_stats_used_pages -= count;
8990 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8991 (long long)count, (long long)page);
8992 }
8993
8994 /* Test if the page is free */
8995 static int vmFreePage(off_t page) {
8996 off_t byte = page/8;
8997 int bit = page&7;
8998 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8999 }
9000
9001 /* Find N contiguous free pages storing the first page of the cluster in *first.
9002 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9003 * REDIS_ERR is returned.
9004 *
9005 * This function uses a simple algorithm: we try to allocate
9006 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9007 * again from the start of the swap file searching for free spaces.
9008 *
9009 * If it looks pretty clear that there are no free pages near our offset
9010 * we try to find less populated places doing a forward jump of
9011 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9012 * without hurry, and then we jump again and so forth...
9013 *
9014 * This function can be improved using a free list to avoid to guess
9015 * too much, since we could collect data about freed pages.
9016 *
9017 * note: I implemented this function just after watching an episode of
9018 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9019 */
9020 static int vmFindContiguousPages(off_t *first, off_t n) {
9021 off_t base, offset = 0, since_jump = 0, numfree = 0;
9022
9023 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9024 server.vm_near_pages = 0;
9025 server.vm_next_page = 0;
9026 }
9027 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9028 base = server.vm_next_page;
9029
9030 while(offset < server.vm_pages) {
9031 off_t this = base+offset;
9032
9033 /* If we overflow, restart from page zero */
9034 if (this >= server.vm_pages) {
9035 this -= server.vm_pages;
9036 if (this == 0) {
9037 /* Just overflowed, what we found on tail is no longer
9038 * interesting, as it's no longer contiguous. */
9039 numfree = 0;
9040 }
9041 }
9042 if (vmFreePage(this)) {
9043 /* This is a free page */
9044 numfree++;
9045 /* Already got N free pages? Return to the caller, with success */
9046 if (numfree == n) {
9047 *first = this-(n-1);
9048 server.vm_next_page = this+1;
9049 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9050 return REDIS_OK;
9051 }
9052 } else {
9053 /* The current one is not a free page */
9054 numfree = 0;
9055 }
9056
9057 /* Fast-forward if the current page is not free and we already
9058 * searched enough near this place. */
9059 since_jump++;
9060 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9061 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9062 since_jump = 0;
9063 /* Note that even if we rewind after the jump, we are don't need
9064 * to make sure numfree is set to zero as we only jump *if* it
9065 * is set to zero. */
9066 } else {
9067 /* Otherwise just check the next page */
9068 offset++;
9069 }
9070 }
9071 return REDIS_ERR;
9072 }
9073
9074 /* Write the specified object at the specified page of the swap file */
9075 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9076 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9077 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9078 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9079 redisLog(REDIS_WARNING,
9080 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9081 strerror(errno));
9082 return REDIS_ERR;
9083 }
9084 rdbSaveObject(server.vm_fp,o);
9085 fflush(server.vm_fp);
9086 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9087 return REDIS_OK;
9088 }
9089
9090 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9091 * needed to later retrieve the object into the key object.
9092 * If we can't find enough contiguous empty pages to swap the object on disk
9093 * REDIS_ERR is returned. */
9094 static int vmSwapObjectBlocking(robj *key, robj *val) {
9095 off_t pages = rdbSavedObjectPages(val,NULL);
9096 off_t page;
9097
9098 assert(key->storage == REDIS_VM_MEMORY);
9099 assert(key->refcount == 1);
9100 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9101 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9102 key->vm.page = page;
9103 key->vm.usedpages = pages;
9104 key->storage = REDIS_VM_SWAPPED;
9105 key->vtype = val->type;
9106 decrRefCount(val); /* Deallocate the object from memory. */
9107 vmMarkPagesUsed(page,pages);
9108 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9109 (unsigned char*) key->ptr,
9110 (unsigned long long) page, (unsigned long long) pages);
9111 server.vm_stats_swapped_objects++;
9112 server.vm_stats_swapouts++;
9113 return REDIS_OK;
9114 }
9115
9116 static robj *vmReadObjectFromSwap(off_t page, int type) {
9117 robj *o;
9118
9119 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9120 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9121 redisLog(REDIS_WARNING,
9122 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9123 strerror(errno));
9124 _exit(1);
9125 }
9126 o = rdbLoadObject(type,server.vm_fp);
9127 if (o == NULL) {
9128 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9129 _exit(1);
9130 }
9131 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9132 return o;
9133 }
9134
9135 /* Load the value object relative to the 'key' object from swap to memory.
9136 * The newly allocated object is returned.
9137 *
9138 * If preview is true the unserialized object is returned to the caller but
9139 * no changes are made to the key object, nor the pages are marked as freed */
9140 static robj *vmGenericLoadObject(robj *key, int preview) {
9141 robj *val;
9142
9143 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9144 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9145 if (!preview) {
9146 key->storage = REDIS_VM_MEMORY;
9147 key->vm.atime = server.unixtime;
9148 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9149 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9150 (unsigned char*) key->ptr);
9151 server.vm_stats_swapped_objects--;
9152 } else {
9153 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9154 (unsigned char*) key->ptr);
9155 }
9156 server.vm_stats_swapins++;
9157 return val;
9158 }
9159
9160 /* Plain object loading, from swap to memory */
9161 static robj *vmLoadObject(robj *key) {
9162 /* If we are loading the object in background, stop it, we
9163 * need to load this object synchronously ASAP. */
9164 if (key->storage == REDIS_VM_LOADING)
9165 vmCancelThreadedIOJob(key);
9166 return vmGenericLoadObject(key,0);
9167 }
9168
9169 /* Just load the value on disk, without to modify the key.
9170 * This is useful when we want to perform some operation on the value
9171 * without to really bring it from swap to memory, like while saving the
9172 * dataset or rewriting the append only log. */
9173 static robj *vmPreviewObject(robj *key) {
9174 return vmGenericLoadObject(key,1);
9175 }
9176
9177 /* How a good candidate is this object for swapping?
9178 * The better candidate it is, the greater the returned value.
9179 *
9180 * Currently we try to perform a fast estimation of the object size in
9181 * memory, and combine it with aging informations.
9182 *
9183 * Basically swappability = idle-time * log(estimated size)
9184 *
9185 * Bigger objects are preferred over smaller objects, but not
9186 * proportionally, this is why we use the logarithm. This algorithm is
9187 * just a first try and will probably be tuned later. */
9188 static double computeObjectSwappability(robj *o) {
9189 time_t age = server.unixtime - o->vm.atime;
9190 long asize = 0;
9191 list *l;
9192 dict *d;
9193 struct dictEntry *de;
9194 int z;
9195
9196 if (age <= 0) return 0;
9197 switch(o->type) {
9198 case REDIS_STRING:
9199 if (o->encoding != REDIS_ENCODING_RAW) {
9200 asize = sizeof(*o);
9201 } else {
9202 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9203 }
9204 break;
9205 case REDIS_LIST:
9206 l = o->ptr;
9207 listNode *ln = listFirst(l);
9208
9209 asize = sizeof(list);
9210 if (ln) {
9211 robj *ele = ln->value;
9212 long elesize;
9213
9214 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9215 (sizeof(*o)+sdslen(ele->ptr)) :
9216 sizeof(*o);
9217 asize += (sizeof(listNode)+elesize)*listLength(l);
9218 }
9219 break;
9220 case REDIS_SET:
9221 case REDIS_ZSET:
9222 z = (o->type == REDIS_ZSET);
9223 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9224
9225 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9226 if (z) asize += sizeof(zset)-sizeof(dict);
9227 if (dictSize(d)) {
9228 long elesize;
9229 robj *ele;
9230
9231 de = dictGetRandomKey(d);
9232 ele = dictGetEntryKey(de);
9233 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9234 (sizeof(*o)+sdslen(ele->ptr)) :
9235 sizeof(*o);
9236 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9237 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9238 }
9239 break;
9240 case REDIS_HASH:
9241 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9242 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9243 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9244 unsigned int klen, vlen;
9245 unsigned char *key, *val;
9246
9247 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9248 klen = 0;
9249 vlen = 0;
9250 }
9251 asize = len*(klen+vlen+3);
9252 } else if (o->encoding == REDIS_ENCODING_HT) {
9253 d = o->ptr;
9254 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9255 if (dictSize(d)) {
9256 long elesize;
9257 robj *ele;
9258
9259 de = dictGetRandomKey(d);
9260 ele = dictGetEntryKey(de);
9261 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9262 (sizeof(*o)+sdslen(ele->ptr)) :
9263 sizeof(*o);
9264 ele = dictGetEntryVal(de);
9265 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9266 (sizeof(*o)+sdslen(ele->ptr)) :
9267 sizeof(*o);
9268 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9269 }
9270 }
9271 break;
9272 }
9273 return (double)age*log(1+asize);
9274 }
9275
9276 /* Try to swap an object that's a good candidate for swapping.
9277 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9278 * to swap any object at all.
9279 *
9280 * If 'usethreaded' is true, Redis will try to swap the object in background
9281 * using I/O threads. */
9282 static int vmSwapOneObject(int usethreads) {
9283 int j, i;
9284 struct dictEntry *best = NULL;
9285 double best_swappability = 0;
9286 redisDb *best_db = NULL;
9287 robj *key, *val;
9288
9289 for (j = 0; j < server.dbnum; j++) {
9290 redisDb *db = server.db+j;
9291 /* Why maxtries is set to 100?
9292 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9293 * are swappable objects */
9294 int maxtries = 100;
9295
9296 if (dictSize(db->dict) == 0) continue;
9297 for (i = 0; i < 5; i++) {
9298 dictEntry *de;
9299 double swappability;
9300
9301 if (maxtries) maxtries--;
9302 de = dictGetRandomKey(db->dict);
9303 key = dictGetEntryKey(de);
9304 val = dictGetEntryVal(de);
9305 /* Only swap objects that are currently in memory.
9306 *
9307 * Also don't swap shared objects if threaded VM is on, as we
9308 * try to ensure that the main thread does not touch the
9309 * object while the I/O thread is using it, but we can't
9310 * control other keys without adding additional mutex. */
9311 if (key->storage != REDIS_VM_MEMORY ||
9312 (server.vm_max_threads != 0 && val->refcount != 1)) {
9313 if (maxtries) i--; /* don't count this try */
9314 continue;
9315 }
9316 swappability = computeObjectSwappability(val);
9317 if (!best || swappability > best_swappability) {
9318 best = de;
9319 best_swappability = swappability;
9320 best_db = db;
9321 }
9322 }
9323 }
9324 if (best == NULL) return REDIS_ERR;
9325 key = dictGetEntryKey(best);
9326 val = dictGetEntryVal(best);
9327
9328 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9329 key->ptr, best_swappability);
9330
9331 /* Unshare the key if needed */
9332 if (key->refcount > 1) {
9333 robj *newkey = dupStringObject(key);
9334 decrRefCount(key);
9335 key = dictGetEntryKey(best) = newkey;
9336 }
9337 /* Swap it */
9338 if (usethreads) {
9339 vmSwapObjectThreaded(key,val,best_db);
9340 return REDIS_OK;
9341 } else {
9342 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9343 dictGetEntryVal(best) = NULL;
9344 return REDIS_OK;
9345 } else {
9346 return REDIS_ERR;
9347 }
9348 }
9349 }
9350
/* Swap one object out synchronously. See vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    int retval = vmSwapOneObject(0);

    return retval;
}
9354
/* Swap one object out using the I/O threads. See vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    int retval = vmSwapOneObject(1);

    return retval;
}
9358
9359 /* Return true if it's safe to swap out objects in a given moment.
9360 * Basically we don't want to swap objects out while there is a BGSAVE
9361 * or a BGAEOREWRITE running in backgroud. */
9362 static int vmCanSwapOut(void) {
9363 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9364 }
9365
9366 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9367 * and was deleted. Otherwise 0 is returned. */
9368 static int deleteIfSwapped(redisDb *db, robj *key) {
9369 dictEntry *de;
9370 robj *foundkey;
9371
9372 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9373 foundkey = dictGetEntryKey(de);
9374 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9375 deleteKey(db,key);
9376 return 1;
9377 }
9378
9379 /* =================== Virtual Memory - Threaded I/O ======================= */
9380
9381 static void freeIOJob(iojob *j) {
9382 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9383 j->type == REDIS_IOJOB_DO_SWAP ||
9384 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9385 decrRefCount(j->val);
9386 /* We don't decrRefCount the j->key field as we did't incremented
9387 * the count creating IO Jobs. This is because the key field here is
9388 * just used as an indentifier and if a key is removed the Job should
9389 * never be touched again. */
9390 zfree(j);
9391 }
9392
9393 /* Every time a thread finished a Job, it writes a byte into the write side
9394 * of an unix pipe in order to "awake" the main thread, and this function
9395 * is called. */
9396 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9397 int mask)
9398 {
9399 char buf[1];
9400 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9401 REDIS_NOTUSED(el);
9402 REDIS_NOTUSED(mask);
9403 REDIS_NOTUSED(privdata);
9404
9405 /* For every byte we read in the read side of the pipe, there is one
9406 * I/O job completed to process. */
9407 while((retval = read(fd,buf,1)) == 1) {
9408 iojob *j;
9409 listNode *ln;
9410 robj *key;
9411 struct dictEntry *de;
9412
9413 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9414
9415 /* Get the processed element (the oldest one) */
9416 lockThreadedIO();
9417 assert(listLength(server.io_processed) != 0);
9418 if (toprocess == -1) {
9419 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9420 if (toprocess <= 0) toprocess = 1;
9421 }
9422 ln = listFirst(server.io_processed);
9423 j = ln->value;
9424 listDelNode(server.io_processed,ln);
9425 unlockThreadedIO();
9426 /* If this job is marked as canceled, just ignore it */
9427 if (j->canceled) {
9428 freeIOJob(j);
9429 continue;
9430 }
9431 /* Post process it in the main thread, as there are things we
9432 * can do just here to avoid race conditions and/or invasive locks */
9433 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9434 de = dictFind(j->db->dict,j->key);
9435 assert(de != NULL);
9436 key = dictGetEntryKey(de);
9437 if (j->type == REDIS_IOJOB_LOAD) {
9438 redisDb *db;
9439
9440 /* Key loaded, bring it at home */
9441 key->storage = REDIS_VM_MEMORY;
9442 key->vm.atime = server.unixtime;
9443 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9444 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9445 (unsigned char*) key->ptr);
9446 server.vm_stats_swapped_objects--;
9447 server.vm_stats_swapins++;
9448 dictGetEntryVal(de) = j->val;
9449 incrRefCount(j->val);
9450 db = j->db;
9451 freeIOJob(j);
9452 /* Handle clients waiting for this key to be loaded. */
9453 handleClientsBlockedOnSwappedKey(db,key);
9454 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9455 /* Now we know the amount of pages required to swap this object.
9456 * Let's find some space for it, and queue this task again
9457 * rebranded as REDIS_IOJOB_DO_SWAP. */
9458 if (!vmCanSwapOut() ||
9459 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9460 {
9461 /* Ooops... no space or we can't swap as there is
9462 * a fork()ed Redis trying to save stuff on disk. */
9463 freeIOJob(j);
9464 key->storage = REDIS_VM_MEMORY; /* undo operation */
9465 } else {
9466 /* Note that we need to mark this pages as used now,
9467 * if the job will be canceled, we'll mark them as freed
9468 * again. */
9469 vmMarkPagesUsed(j->page,j->pages);
9470 j->type = REDIS_IOJOB_DO_SWAP;
9471 lockThreadedIO();
9472 queueIOJob(j);
9473 unlockThreadedIO();
9474 }
9475 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9476 robj *val;
9477
9478 /* Key swapped. We can finally free some memory. */
9479 if (key->storage != REDIS_VM_SWAPPING) {
9480 printf("key->storage: %d\n",key->storage);
9481 printf("key->name: %s\n",(char*)key->ptr);
9482 printf("key->refcount: %d\n",key->refcount);
9483 printf("val: %p\n",(void*)j->val);
9484 printf("val->type: %d\n",j->val->type);
9485 printf("val->ptr: %s\n",(char*)j->val->ptr);
9486 }
9487 redisAssert(key->storage == REDIS_VM_SWAPPING);
9488 val = dictGetEntryVal(de);
9489 key->vm.page = j->page;
9490 key->vm.usedpages = j->pages;
9491 key->storage = REDIS_VM_SWAPPED;
9492 key->vtype = j->val->type;
9493 decrRefCount(val); /* Deallocate the object from memory. */
9494 dictGetEntryVal(de) = NULL;
9495 redisLog(REDIS_DEBUG,
9496 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9497 (unsigned char*) key->ptr,
9498 (unsigned long long) j->page, (unsigned long long) j->pages);
9499 server.vm_stats_swapped_objects++;
9500 server.vm_stats_swapouts++;
9501 freeIOJob(j);
9502 /* Put a few more swap requests in queue if we are still
9503 * out of memory */
9504 if (trytoswap && vmCanSwapOut() &&
9505 zmalloc_used_memory() > server.vm_max_memory)
9506 {
9507 int more = 1;
9508 while(more) {
9509 lockThreadedIO();
9510 more = listLength(server.io_newjobs) <
9511 (unsigned) server.vm_max_threads;
9512 unlockThreadedIO();
9513 /* Don't waste CPU time if swappable objects are rare. */
9514 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9515 trytoswap = 0;
9516 break;
9517 }
9518 }
9519 }
9520 }
9521 processed++;
9522 if (processed == toprocess) return;
9523 }
9524 if (retval < 0 && errno != EAGAIN) {
9525 redisLog(REDIS_WARNING,
9526 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9527 strerror(errno));
9528 }
9529 }
9530
9531 static void lockThreadedIO(void) {
9532 pthread_mutex_lock(&server.io_mutex);
9533 }
9534
9535 static void unlockThreadedIO(void) {
9536 pthread_mutex_unlock(&server.io_mutex);
9537 }
9538
9539 /* Remove the specified object from the threaded I/O queue if still not
9540 * processed, otherwise make sure to flag it as canceled. */
9541 static void vmCancelThreadedIOJob(robj *o) {
9542 list *lists[3] = {
9543 server.io_newjobs, /* 0 */
9544 server.io_processing, /* 1 */
9545 server.io_processed /* 2 */
9546 };
9547 int i;
9548
9549 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9550 again:
9551 lockThreadedIO();
9552 /* Search for a matching key in one of the queues */
9553 for (i = 0; i < 3; i++) {
9554 listNode *ln;
9555 listIter li;
9556
9557 listRewind(lists[i],&li);
9558 while ((ln = listNext(&li)) != NULL) {
9559 iojob *job = ln->value;
9560
9561 if (job->canceled) continue; /* Skip this, already canceled. */
9562 if (job->key == o) {
9563 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9564 (void*)job, (char*)o->ptr, job->type, i);
9565 /* Mark the pages as free since the swap didn't happened
9566 * or happened but is now discarded. */
9567 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9568 vmMarkPagesFree(job->page,job->pages);
9569 /* Cancel the job. It depends on the list the job is
9570 * living in. */
9571 switch(i) {
9572 case 0: /* io_newjobs */
9573 /* If the job was yet not processed the best thing to do
9574 * is to remove it from the queue at all */
9575 freeIOJob(job);
9576 listDelNode(lists[i],ln);
9577 break;
9578 case 1: /* io_processing */
9579 /* Oh Shi- the thread is messing with the Job:
9580 *
9581 * Probably it's accessing the object if this is a
9582 * PREPARE_SWAP or DO_SWAP job.
9583 * If it's a LOAD job it may be reading from disk and
9584 * if we don't wait for the job to terminate before to
9585 * cancel it, maybe in a few microseconds data can be
9586 * corrupted in this pages. So the short story is:
9587 *
9588 * Better to wait for the job to move into the
9589 * next queue (processed)... */
9590
9591 /* We try again and again until the job is completed. */
9592 unlockThreadedIO();
9593 /* But let's wait some time for the I/O thread
9594 * to finish with this job. After all this condition
9595 * should be very rare. */
9596 usleep(1);
9597 goto again;
9598 case 2: /* io_processed */
9599 /* The job was already processed, that's easy...
9600 * just mark it as canceled so that we'll ignore it
9601 * when processing completed jobs. */
9602 job->canceled = 1;
9603 break;
9604 }
9605 /* Finally we have to adjust the storage type of the object
9606 * in order to "UNDO" the operaiton. */
9607 if (o->storage == REDIS_VM_LOADING)
9608 o->storage = REDIS_VM_SWAPPED;
9609 else if (o->storage == REDIS_VM_SWAPPING)
9610 o->storage = REDIS_VM_MEMORY;
9611 unlockThreadedIO();
9612 return;
9613 }
9614 }
9615 }
9616 unlockThreadedIO();
9617 assert(1 != 1); /* We should never reach this */
9618 }
9619
9620 static void *IOThreadEntryPoint(void *arg) {
9621 iojob *j;
9622 listNode *ln;
9623 REDIS_NOTUSED(arg);
9624
9625 pthread_detach(pthread_self());
9626 while(1) {
9627 /* Get a new job to process */
9628 lockThreadedIO();
9629 if (listLength(server.io_newjobs) == 0) {
9630 /* No new jobs in queue, exit. */
9631 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9632 (long) pthread_self());
9633 server.io_active_threads--;
9634 unlockThreadedIO();
9635 return NULL;
9636 }
9637 ln = listFirst(server.io_newjobs);
9638 j = ln->value;
9639 listDelNode(server.io_newjobs,ln);
9640 /* Add the job in the processing queue */
9641 j->thread = pthread_self();
9642 listAddNodeTail(server.io_processing,j);
9643 ln = listLast(server.io_processing); /* We use ln later to remove it */
9644 unlockThreadedIO();
9645 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9646 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9647
9648 /* Process the Job */
9649 if (j->type == REDIS_IOJOB_LOAD) {
9650 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9651 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9652 FILE *fp = fopen("/dev/null","w+");
9653 j->pages = rdbSavedObjectPages(j->val,fp);
9654 fclose(fp);
9655 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9656 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9657 j->canceled = 1;
9658 }
9659
9660 /* Done: insert the job into the processed queue */
9661 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9662 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9663 lockThreadedIO();
9664 listDelNode(server.io_processing,ln);
9665 listAddNodeTail(server.io_processed,j);
9666 unlockThreadedIO();
9667
9668 /* Signal the main thread there is new stuff to process */
9669 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9670 }
9671 return NULL; /* never reached */
9672 }
9673
9674 static void spawnIOThread(void) {
9675 pthread_t thread;
9676 sigset_t mask, omask;
9677 int err;
9678
9679 sigemptyset(&mask);
9680 sigaddset(&mask,SIGCHLD);
9681 sigaddset(&mask,SIGHUP);
9682 sigaddset(&mask,SIGPIPE);
9683 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9684 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9685 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9686 strerror(err));
9687 usleep(1000000);
9688 }
9689 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9690 server.io_active_threads++;
9691 }
9692
9693 /* We need to wait for the last thread to exit before we are able to
9694 * fork() in order to BGSAVE or BGREWRITEAOF. */
9695 static void waitEmptyIOJobsQueue(void) {
9696 while(1) {
9697 int io_processed_len;
9698
9699 lockThreadedIO();
9700 if (listLength(server.io_newjobs) == 0 &&
9701 listLength(server.io_processing) == 0 &&
9702 server.io_active_threads == 0)
9703 {
9704 unlockThreadedIO();
9705 return;
9706 }
9707 /* While waiting for empty jobs queue condition we post-process some
9708 * finshed job, as I/O threads may be hanging trying to write against
9709 * the io_ready_pipe_write FD but there are so much pending jobs that
9710 * it's blocking. */
9711 io_processed_len = listLength(server.io_processed);
9712 unlockThreadedIO();
9713 if (io_processed_len) {
9714 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9715 usleep(1000); /* 1 millisecond */
9716 } else {
9717 usleep(10000); /* 10 milliseconds */
9718 }
9719 }
9720 }
9721
9722 static void vmReopenSwapFile(void) {
9723 /* Note: we don't close the old one as we are in the child process
9724 * and don't want to mess at all with the original file object. */
9725 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9726 if (server.vm_fp == NULL) {
9727 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9728 server.vm_swap_file);
9729 _exit(1);
9730 }
9731 server.vm_fd = fileno(server.vm_fp);
9732 }
9733
9734 /* This function must be called while with threaded IO locked */
9735 static void queueIOJob(iojob *j) {
9736 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9737 (void*)j, j->type, (char*)j->key->ptr);
9738 listAddNodeTail(server.io_newjobs,j);
9739 if (server.io_active_threads < server.vm_max_threads)
9740 spawnIOThread();
9741 }
9742
9743 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9744 iojob *j;
9745
9746 assert(key->storage == REDIS_VM_MEMORY);
9747 assert(key->refcount == 1);
9748
9749 j = zmalloc(sizeof(*j));
9750 j->type = REDIS_IOJOB_PREPARE_SWAP;
9751 j->db = db;
9752 j->key = key;
9753 j->val = val;
9754 incrRefCount(val);
9755 j->canceled = 0;
9756 j->thread = (pthread_t) -1;
9757 key->storage = REDIS_VM_SWAPPING;
9758
9759 lockThreadedIO();
9760 queueIOJob(j);
9761 unlockThreadedIO();
9762 return REDIS_OK;
9763 }
9764
9765 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9766
9767 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9768 * If there is not already a job loading the key, it is craeted.
9769 * The key is added to the io_keys list in the client structure, and also
9770 * in the hash table mapping swapped keys to waiting clients, that is,
9771 * server.io_waited_keys. */
9772 static int waitForSwappedKey(redisClient *c, robj *key) {
9773 struct dictEntry *de;
9774 robj *o;
9775 list *l;
9776
9777 /* If the key does not exist or is already in RAM we don't need to
9778 * block the client at all. */
9779 de = dictFind(c->db->dict,key);
9780 if (de == NULL) return 0;
9781 o = dictGetEntryKey(de);
9782 if (o->storage == REDIS_VM_MEMORY) {
9783 return 0;
9784 } else if (o->storage == REDIS_VM_SWAPPING) {
9785 /* We were swapping the key, undo it! */
9786 vmCancelThreadedIOJob(o);
9787 return 0;
9788 }
9789
9790 /* OK: the key is either swapped, or being loaded just now. */
9791
9792 /* Add the key to the list of keys this client is waiting for.
9793 * This maps clients to keys they are waiting for. */
9794 listAddNodeTail(c->io_keys,key);
9795 incrRefCount(key);
9796
9797 /* Add the client to the swapped keys => clients waiting map. */
9798 de = dictFind(c->db->io_keys,key);
9799 if (de == NULL) {
9800 int retval;
9801
9802 /* For every key we take a list of clients blocked for it */
9803 l = listCreate();
9804 retval = dictAdd(c->db->io_keys,key,l);
9805 incrRefCount(key);
9806 assert(retval == DICT_OK);
9807 } else {
9808 l = dictGetEntryVal(de);
9809 }
9810 listAddNodeTail(l,c);
9811
9812 /* Are we already loading the key from disk? If not create a job */
9813 if (o->storage == REDIS_VM_SWAPPED) {
9814 iojob *j;
9815
9816 o->storage = REDIS_VM_LOADING;
9817 j = zmalloc(sizeof(*j));
9818 j->type = REDIS_IOJOB_LOAD;
9819 j->db = c->db;
9820 j->key = o;
9821 j->key->vtype = o->vtype;
9822 j->page = o->vm.page;
9823 j->val = NULL;
9824 j->canceled = 0;
9825 j->thread = (pthread_t) -1;
9826 lockThreadedIO();
9827 queueIOJob(j);
9828 unlockThreadedIO();
9829 }
9830 return 1;
9831 }
9832
9833 /* Preload keys for any command with first, last and step values for
9834 * the command keys prototype, as defined in the command table. */
9835 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9836 int j, last;
9837 if (cmd->vm_firstkey == 0) return;
9838 last = cmd->vm_lastkey;
9839 if (last < 0) last = argc+last;
9840 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9841 redisAssert(j < argc);
9842 waitForSwappedKey(c,argv[j]);
9843 }
9844 }
9845
9846 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9847 * Note that the number of keys to preload is user-defined, so we need to
9848 * apply a sanity check against argc. */
9849 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9850 int i, num;
9851 REDIS_NOTUSED(cmd);
9852
9853 num = atoi(argv[2]->ptr);
9854 if (num > (argc-3)) return;
9855 for (i = 0; i < num; i++) {
9856 waitForSwappedKey(c,argv[3+i]);
9857 }
9858 }
9859
9860 /* Preload keys needed to execute the entire MULTI/EXEC block.
9861 *
9862 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9863 * and will block the client when any command requires a swapped out value. */
9864 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9865 int i, margc;
9866 struct redisCommand *mcmd;
9867 robj **margv;
9868 REDIS_NOTUSED(cmd);
9869 REDIS_NOTUSED(argc);
9870 REDIS_NOTUSED(argv);
9871
9872 if (!(c->flags & REDIS_MULTI)) return;
9873 for (i = 0; i < c->mstate.count; i++) {
9874 mcmd = c->mstate.commands[i].cmd;
9875 margc = c->mstate.commands[i].argc;
9876 margv = c->mstate.commands[i].argv;
9877
9878 if (mcmd->vm_preload_proc != NULL) {
9879 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9880 } else {
9881 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9882 }
9883 }
9884 }
9885
9886 /* Is this client attempting to run a command against swapped keys?
9887 * If so, block it ASAP, load the keys in background, then resume it.
9888 *
9889 * The important idea about this function is that it can fail! If keys will
9890 * still be swapped when the client is resumed, this key lookups will
9891 * just block loading keys from disk. In practical terms this should only
9892 * happen with SORT BY command or if there is a bug in this function.
9893 *
9894 * Return 1 if the client is marked as blocked, 0 if the client can
9895 * continue as the keys it is going to access appear to be in memory. */
9896 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9897 if (cmd->vm_preload_proc != NULL) {
9898 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9899 } else {
9900 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9901 }
9902
9903 /* If the client was blocked for at least one key, mark it as blocked. */
9904 if (listLength(c->io_keys)) {
9905 c->flags |= REDIS_IO_WAIT;
9906 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9907 server.vm_blocked_clients++;
9908 return 1;
9909 } else {
9910 return 0;
9911 }
9912 }
9913
9914 /* Remove the 'key' from the list of blocked keys for a given client.
9915 *
9916 * The function returns 1 when there are no longer blocking keys after
9917 * the current one was removed (and the client can be unblocked). */
9918 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9919 list *l;
9920 listNode *ln;
9921 listIter li;
9922 struct dictEntry *de;
9923
9924 /* Remove the key from the list of keys this client is waiting for. */
9925 listRewind(c->io_keys,&li);
9926 while ((ln = listNext(&li)) != NULL) {
9927 if (equalStringObjects(ln->value,key)) {
9928 listDelNode(c->io_keys,ln);
9929 break;
9930 }
9931 }
9932 assert(ln != NULL);
9933
9934 /* Remove the client form the key => waiting clients map. */
9935 de = dictFind(c->db->io_keys,key);
9936 assert(de != NULL);
9937 l = dictGetEntryVal(de);
9938 ln = listSearchKey(l,c);
9939 assert(ln != NULL);
9940 listDelNode(l,ln);
9941 if (listLength(l) == 0)
9942 dictDelete(c->db->io_keys,key);
9943
9944 return listLength(c->io_keys) == 0;
9945 }
9946
9947 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9948 struct dictEntry *de;
9949 list *l;
9950 listNode *ln;
9951 int len;
9952
9953 de = dictFind(db->io_keys,key);
9954 if (!de) return;
9955
9956 l = dictGetEntryVal(de);
9957 len = listLength(l);
9958 /* Note: we can't use something like while(listLength(l)) as the list
9959 * can be freed by the calling function when we remove the last element. */
9960 while (len--) {
9961 ln = listFirst(l);
9962 redisClient *c = ln->value;
9963
9964 if (dontWaitForSwappedKey(c,key)) {
9965 /* Put the client in the list of clients ready to go as we
9966 * loaded all the keys about it. */
9967 listAddNodeTail(server.io_ready_clients,c);
9968 }
9969 }
9970 }
9971
9972 /* =========================== Remote Configuration ========================= */
9973
9974 static void configSetCommand(redisClient *c) {
9975 robj *o = getDecodedObject(c->argv[3]);
9976 long long ll;
9977
9978 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9979 zfree(server.dbfilename);
9980 server.dbfilename = zstrdup(o->ptr);
9981 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9982 zfree(server.requirepass);
9983 server.requirepass = zstrdup(o->ptr);
9984 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9985 zfree(server.masterauth);
9986 server.masterauth = zstrdup(o->ptr);
9987 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9988 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9989 ll < 0) goto badfmt;
9990 server.maxmemory = ll;
9991 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9992 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9993 ll < 0 || ll > LONG_MAX) goto badfmt;
9994 server.maxidletime = ll;
9995 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9996 if (!strcasecmp(o->ptr,"no")) {
9997 server.appendfsync = APPENDFSYNC_NO;
9998 } else if (!strcasecmp(o->ptr,"everysec")) {
9999 server.appendfsync = APPENDFSYNC_EVERYSEC;
10000 } else if (!strcasecmp(o->ptr,"always")) {
10001 server.appendfsync = APPENDFSYNC_ALWAYS;
10002 } else {
10003 goto badfmt;
10004 }
10005 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10006 int yn = yesnotoi(o->ptr);
10007
10008 if (yn == -1) goto badfmt;
10009 server.no_appendfsync_on_rewrite = yn;
10010 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10011 int old = server.appendonly;
10012 int new = yesnotoi(o->ptr);
10013
10014 if (new == -1) goto badfmt;
10015 if (old != new) {
10016 if (new == 0) {
10017 stopAppendOnly();
10018 } else {
10019 if (startAppendOnly() == REDIS_ERR) {
10020 addReplySds(c,sdscatprintf(sdsempty(),
10021 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10022 decrRefCount(o);
10023 return;
10024 }
10025 }
10026 }
10027 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10028 int vlen, j;
10029 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10030
10031 /* Perform sanity check before setting the new config:
10032 * - Even number of args
10033 * - Seconds >= 1, changes >= 0 */
10034 if (vlen & 1) {
10035 sdsfreesplitres(v,vlen);
10036 goto badfmt;
10037 }
10038 for (j = 0; j < vlen; j++) {
10039 char *eptr;
10040 long val;
10041
10042 val = strtoll(v[j], &eptr, 10);
10043 if (eptr[0] != '\0' ||
10044 ((j & 1) == 0 && val < 1) ||
10045 ((j & 1) == 1 && val < 0)) {
10046 sdsfreesplitres(v,vlen);
10047 goto badfmt;
10048 }
10049 }
10050 /* Finally set the new config */
10051 resetServerSaveParams();
10052 for (j = 0; j < vlen; j += 2) {
10053 time_t seconds;
10054 int changes;
10055
10056 seconds = strtoll(v[j],NULL,10);
10057 changes = strtoll(v[j+1],NULL,10);
10058 appendServerSaveParams(seconds, changes);
10059 }
10060 sdsfreesplitres(v,vlen);
10061 } else {
10062 addReplySds(c,sdscatprintf(sdsempty(),
10063 "-ERR not supported CONFIG parameter %s\r\n",
10064 (char*)c->argv[2]->ptr));
10065 decrRefCount(o);
10066 return;
10067 }
10068 decrRefCount(o);
10069 addReply(c,shared.ok);
10070 return;
10071
10072 badfmt: /* Bad format errors */
10073 addReplySds(c,sdscatprintf(sdsempty(),
10074 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10075 (char*)o->ptr,
10076 (char*)c->argv[2]->ptr));
10077 decrRefCount(o);
10078 }
10079
10080 static void configGetCommand(redisClient *c) {
10081 robj *o = getDecodedObject(c->argv[2]);
10082 robj *lenobj = createObject(REDIS_STRING,NULL);
10083 char *pattern = o->ptr;
10084 int matches = 0;
10085
10086 addReply(c,lenobj);
10087 decrRefCount(lenobj);
10088
10089 if (stringmatch(pattern,"dbfilename",0)) {
10090 addReplyBulkCString(c,"dbfilename");
10091 addReplyBulkCString(c,server.dbfilename);
10092 matches++;
10093 }
10094 if (stringmatch(pattern,"requirepass",0)) {
10095 addReplyBulkCString(c,"requirepass");
10096 addReplyBulkCString(c,server.requirepass);
10097 matches++;
10098 }
10099 if (stringmatch(pattern,"masterauth",0)) {
10100 addReplyBulkCString(c,"masterauth");
10101 addReplyBulkCString(c,server.masterauth);
10102 matches++;
10103 }
10104 if (stringmatch(pattern,"maxmemory",0)) {
10105 char buf[128];
10106
10107 ll2string(buf,128,server.maxmemory);
10108 addReplyBulkCString(c,"maxmemory");
10109 addReplyBulkCString(c,buf);
10110 matches++;
10111 }
10112 if (stringmatch(pattern,"timeout",0)) {
10113 char buf[128];
10114
10115 ll2string(buf,128,server.maxidletime);
10116 addReplyBulkCString(c,"timeout");
10117 addReplyBulkCString(c,buf);
10118 matches++;
10119 }
10120 if (stringmatch(pattern,"appendonly",0)) {
10121 addReplyBulkCString(c,"appendonly");
10122 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10123 matches++;
10124 }
10125 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10126 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10127 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10128 matches++;
10129 }
10130 if (stringmatch(pattern,"appendfsync",0)) {
10131 char *policy;
10132
10133 switch(server.appendfsync) {
10134 case APPENDFSYNC_NO: policy = "no"; break;
10135 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10136 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10137 default: policy = "unknown"; break; /* too harmless to panic */
10138 }
10139 addReplyBulkCString(c,"appendfsync");
10140 addReplyBulkCString(c,policy);
10141 matches++;
10142 }
10143 if (stringmatch(pattern,"save",0)) {
10144 sds buf = sdsempty();
10145 int j;
10146
10147 for (j = 0; j < server.saveparamslen; j++) {
10148 buf = sdscatprintf(buf,"%ld %d",
10149 server.saveparams[j].seconds,
10150 server.saveparams[j].changes);
10151 if (j != server.saveparamslen-1)
10152 buf = sdscatlen(buf," ",1);
10153 }
10154 addReplyBulkCString(c,"save");
10155 addReplyBulkCString(c,buf);
10156 sdsfree(buf);
10157 matches++;
10158 }
10159 decrRefCount(o);
10160 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10161 }
10162
10163 static void configCommand(redisClient *c) {
10164 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10165 if (c->argc != 4) goto badarity;
10166 configSetCommand(c);
10167 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10168 if (c->argc != 3) goto badarity;
10169 configGetCommand(c);
10170 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10171 if (c->argc != 2) goto badarity;
10172 server.stat_numcommands = 0;
10173 server.stat_numconnections = 0;
10174 server.stat_expiredkeys = 0;
10175 server.stat_starttime = time(NULL);
10176 addReply(c,shared.ok);
10177 } else {
10178 addReplySds(c,sdscatprintf(sdsempty(),
10179 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10180 }
10181 return;
10182
10183 badarity:
10184 addReplySds(c,sdscatprintf(sdsempty(),
10185 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10186 (char*) c->argv[1]->ptr));
10187 }
10188
10189 /* =========================== Pubsub implementation ======================== */
10190
10191 static void freePubsubPattern(void *p) {
10192 pubsubPattern *pat = p;
10193
10194 decrRefCount(pat->pattern);
10195 zfree(pat);
10196 }
10197
10198 static int listMatchPubsubPattern(void *a, void *b) {
10199 pubsubPattern *pa = a, *pb = b;
10200
10201 return (pa->client == pb->client) &&
10202 (equalStringObjects(pa->pattern,pb->pattern));
10203 }
10204
10205 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10206 * 0 if the client was already subscribed to that channel. */
10207 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10208 struct dictEntry *de;
10209 list *clients = NULL;
10210 int retval = 0;
10211
10212 /* Add the channel to the client -> channels hash table */
10213 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10214 retval = 1;
10215 incrRefCount(channel);
10216 /* Add the client to the channel -> list of clients hash table */
10217 de = dictFind(server.pubsub_channels,channel);
10218 if (de == NULL) {
10219 clients = listCreate();
10220 dictAdd(server.pubsub_channels,channel,clients);
10221 incrRefCount(channel);
10222 } else {
10223 clients = dictGetEntryVal(de);
10224 }
10225 listAddNodeTail(clients,c);
10226 }
10227 /* Notify the client */
10228 addReply(c,shared.mbulk3);
10229 addReply(c,shared.subscribebulk);
10230 addReplyBulk(c,channel);
10231 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10232 return retval;
10233 }
10234
10235 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10236 * 0 if the client was not subscribed to the specified channel. */
10237 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10238 struct dictEntry *de;
10239 list *clients;
10240 listNode *ln;
10241 int retval = 0;
10242
10243 /* Remove the channel from the client -> channels hash table */
10244 incrRefCount(channel); /* channel may be just a pointer to the same object
10245 we have in the hash tables. Protect it... */
10246 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10247 retval = 1;
10248 /* Remove the client from the channel -> clients list hash table */
10249 de = dictFind(server.pubsub_channels,channel);
10250 assert(de != NULL);
10251 clients = dictGetEntryVal(de);
10252 ln = listSearchKey(clients,c);
10253 assert(ln != NULL);
10254 listDelNode(clients,ln);
10255 if (listLength(clients) == 0) {
10256 /* Free the list and associated hash entry at all if this was
10257 * the latest client, so that it will be possible to abuse
10258 * Redis PUBSUB creating millions of channels. */
10259 dictDelete(server.pubsub_channels,channel);
10260 }
10261 }
10262 /* Notify the client */
10263 if (notify) {
10264 addReply(c,shared.mbulk3);
10265 addReply(c,shared.unsubscribebulk);
10266 addReplyBulk(c,channel);
10267 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10268 listLength(c->pubsub_patterns));
10269
10270 }
10271 decrRefCount(channel); /* it is finally safe to release it */
10272 return retval;
10273 }
10274
10275 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10276 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10277 int retval = 0;
10278
10279 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10280 retval = 1;
10281 pubsubPattern *pat;
10282 listAddNodeTail(c->pubsub_patterns,pattern);
10283 incrRefCount(pattern);
10284 pat = zmalloc(sizeof(*pat));
10285 pat->pattern = getDecodedObject(pattern);
10286 pat->client = c;
10287 listAddNodeTail(server.pubsub_patterns,pat);
10288 }
10289 /* Notify the client */
10290 addReply(c,shared.mbulk3);
10291 addReply(c,shared.psubscribebulk);
10292 addReplyBulk(c,pattern);
10293 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10294 return retval;
10295 }
10296
10297 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10298 * 0 if the client was not subscribed to the specified channel. */
10299 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10300 listNode *ln;
10301 pubsubPattern pat;
10302 int retval = 0;
10303
10304 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10305 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10306 retval = 1;
10307 listDelNode(c->pubsub_patterns,ln);
10308 pat.client = c;
10309 pat.pattern = pattern;
10310 ln = listSearchKey(server.pubsub_patterns,&pat);
10311 listDelNode(server.pubsub_patterns,ln);
10312 }
10313 /* Notify the client */
10314 if (notify) {
10315 addReply(c,shared.mbulk3);
10316 addReply(c,shared.punsubscribebulk);
10317 addReplyBulk(c,pattern);
10318 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10319 listLength(c->pubsub_patterns));
10320 }
10321 decrRefCount(pattern);
10322 return retval;
10323 }
10324
10325 /* Unsubscribe from all the channels. Return the number of channels the
10326 * client was subscribed from. */
10327 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10328 dictIterator *di = dictGetIterator(c->pubsub_channels);
10329 dictEntry *de;
10330 int count = 0;
10331
10332 while((de = dictNext(di)) != NULL) {
10333 robj *channel = dictGetEntryKey(de);
10334
10335 count += pubsubUnsubscribeChannel(c,channel,notify);
10336 }
10337 dictReleaseIterator(di);
10338 return count;
10339 }
10340
10341 /* Unsubscribe from all the patterns. Return the number of patterns the
10342 * client was subscribed from. */
10343 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10344 listNode *ln;
10345 listIter li;
10346 int count = 0;
10347
10348 listRewind(c->pubsub_patterns,&li);
10349 while ((ln = listNext(&li)) != NULL) {
10350 robj *pattern = ln->value;
10351
10352 count += pubsubUnsubscribePattern(c,pattern,notify);
10353 }
10354 return count;
10355 }
10356
10357 /* Publish a message */
10358 static int pubsubPublishMessage(robj *channel, robj *message) {
10359 int receivers = 0;
10360 struct dictEntry *de;
10361 listNode *ln;
10362 listIter li;
10363
10364 /* Send to clients listening for that channel */
10365 de = dictFind(server.pubsub_channels,channel);
10366 if (de) {
10367 list *list = dictGetEntryVal(de);
10368 listNode *ln;
10369 listIter li;
10370
10371 listRewind(list,&li);
10372 while ((ln = listNext(&li)) != NULL) {
10373 redisClient *c = ln->value;
10374
10375 addReply(c,shared.mbulk3);
10376 addReply(c,shared.messagebulk);
10377 addReplyBulk(c,channel);
10378 addReplyBulk(c,message);
10379 receivers++;
10380 }
10381 }
10382 /* Send to clients listening to matching channels */
10383 if (listLength(server.pubsub_patterns)) {
10384 listRewind(server.pubsub_patterns,&li);
10385 channel = getDecodedObject(channel);
10386 while ((ln = listNext(&li)) != NULL) {
10387 pubsubPattern *pat = ln->value;
10388
10389 if (stringmatchlen((char*)pat->pattern->ptr,
10390 sdslen(pat->pattern->ptr),
10391 (char*)channel->ptr,
10392 sdslen(channel->ptr),0)) {
10393 addReply(pat->client,shared.mbulk4);
10394 addReply(pat->client,shared.pmessagebulk);
10395 addReplyBulk(pat->client,pat->pattern);
10396 addReplyBulk(pat->client,channel);
10397 addReplyBulk(pat->client,message);
10398 receivers++;
10399 }
10400 }
10401 decrRefCount(channel);
10402 }
10403 return receivers;
10404 }
10405
10406 static void subscribeCommand(redisClient *c) {
10407 int j;
10408
10409 for (j = 1; j < c->argc; j++)
10410 pubsubSubscribeChannel(c,c->argv[j]);
10411 }
10412
10413 static void unsubscribeCommand(redisClient *c) {
10414 if (c->argc == 1) {
10415 pubsubUnsubscribeAllChannels(c,1);
10416 return;
10417 } else {
10418 int j;
10419
10420 for (j = 1; j < c->argc; j++)
10421 pubsubUnsubscribeChannel(c,c->argv[j],1);
10422 }
10423 }
10424
10425 static void psubscribeCommand(redisClient *c) {
10426 int j;
10427
10428 for (j = 1; j < c->argc; j++)
10429 pubsubSubscribePattern(c,c->argv[j]);
10430 }
10431
10432 static void punsubscribeCommand(redisClient *c) {
10433 if (c->argc == 1) {
10434 pubsubUnsubscribeAllPatterns(c,1);
10435 return;
10436 } else {
10437 int j;
10438
10439 for (j = 1; j < c->argc; j++)
10440 pubsubUnsubscribePattern(c,c->argv[j],1);
10441 }
10442 }
10443
10444 static void publishCommand(redisClient *c) {
10445 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10446 addReplyLongLong(c,receivers);
10447 }
10448
/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
 *
 * The implementation uses a per-DB hash table mapping each key to the list of
 * clients WATCHing it, so that given a key that is going to be modified
 * we can mark all the associated clients as dirty.
 *
 * Every client also keeps a list of the keys it is WATCHing, so that it is
 * possible to un-watch those keys when the client is freed or when UNWATCH
 * is called. */
10457
10458 /* In the client->watched_keys list we need to use watchedKey structures
10459 * as in order to identify a key in Redis we need both the key name and the
10460 * DB */
10461 typedef struct watchedKey {
10462 robj *key;
10463 redisDb *db;
10464 } watchedKey;
10465
10466 /* Watch for the specified key */
10467 static void watchForKey(redisClient *c, robj *key) {
10468 list *clients = NULL;
10469 listIter li;
10470 listNode *ln;
10471 watchedKey *wk;
10472
10473 /* Check if we are already watching for this key */
10474 listRewind(c->watched_keys,&li);
10475 while((ln = listNext(&li))) {
10476 wk = listNodeValue(ln);
10477 if (wk->db == c->db && equalStringObjects(key,wk->key))
10478 return; /* Key already watched */
10479 }
10480 /* This key is not already watched in this DB. Let's add it */
10481 clients = dictFetchValue(c->db->watched_keys,key);
10482 if (!clients) {
10483 clients = listCreate();
10484 dictAdd(c->db->watched_keys,key,clients);
10485 incrRefCount(key);
10486 }
10487 listAddNodeTail(clients,c);
10488 /* Add the new key to the lits of keys watched by this client */
10489 wk = zmalloc(sizeof(*wk));
10490 wk->key = key;
10491 wk->db = c->db;
10492 incrRefCount(key);
10493 listAddNodeTail(c->watched_keys,wk);
10494 }
10495
10496 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10497 * flag is up to the caller. */
10498 static void unwatchAllKeys(redisClient *c) {
10499 listIter li;
10500 listNode *ln;
10501
10502 if (listLength(c->watched_keys) == 0) return;
10503 listRewind(c->watched_keys,&li);
10504 while((ln = listNext(&li))) {
10505 list *clients;
10506 watchedKey *wk;
10507
10508 /* Lookup the watched key -> clients list and remove the client
10509 * from the list */
10510 wk = listNodeValue(ln);
10511 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10512 assert(clients != NULL);
10513 listDelNode(clients,listSearchKey(clients,c));
10514 /* Kill the entry at all if this was the only client */
10515 if (listLength(clients) == 0)
10516 dictDelete(wk->db->watched_keys, wk->key);
10517 /* Remove this watched key from the client->watched list */
10518 listDelNode(c->watched_keys,ln);
10519 decrRefCount(wk->key);
10520 zfree(wk);
10521 }
10522 }
10523
10524 /* "Touch" a key, so that if this key is being WATCHed by some client the
10525 * next EXEC will fail. */
10526 static void touchWatchedKey(redisDb *db, robj *key) {
10527 list *clients;
10528 listIter li;
10529 listNode *ln;
10530
10531 if (dictSize(db->watched_keys) == 0) return;
10532 clients = dictFetchValue(db->watched_keys, key);
10533 if (!clients) return;
10534
10535 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10536 /* Check if we are already watching for this key */
10537 listRewind(clients,&li);
10538 while((ln = listNext(&li))) {
10539 redisClient *c = listNodeValue(ln);
10540
10541 c->flags |= REDIS_DIRTY_CAS;
10542 }
10543 }
10544
10545 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10546 * flush but will be deleted as effect of the flushing operation should
10547 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10548 * a FLUSHALL operation (all the DBs flushed). */
10549 static void touchWatchedKeysOnFlush(int dbid) {
10550 listIter li1, li2;
10551 listNode *ln;
10552
10553 /* For every client, check all the waited keys */
10554 listRewind(server.clients,&li1);
10555 while((ln = listNext(&li1))) {
10556 redisClient *c = listNodeValue(ln);
10557 listRewind(c->watched_keys,&li2);
10558 while((ln = listNext(&li2))) {
10559 watchedKey *wk = listNodeValue(ln);
10560
10561 /* For every watched key matching the specified DB, if the
10562 * key exists, mark the client as dirty, as the key will be
10563 * removed. */
10564 if (dbid == -1 || wk->db->id == dbid) {
10565 if (dictFind(wk->db->dict, wk->key) != NULL)
10566 c->flags |= REDIS_DIRTY_CAS;
10567 }
10568 }
10569 }
10570 }
10571
10572 static void watchCommand(redisClient *c) {
10573 int j;
10574
10575 if (c->flags & REDIS_MULTI) {
10576 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10577 return;
10578 }
10579 for (j = 1; j < c->argc; j++)
10580 watchForKey(c,c->argv[j]);
10581 addReply(c,shared.ok);
10582 }
10583
10584 static void unwatchCommand(redisClient *c) {
10585 unwatchAllKeys(c);
10586 c->flags &= (~REDIS_DIRTY_CAS);
10587 addReply(c,shared.ok);
10588 }
10589
10590 /* ================================= Debugging ============================== */
10591
10592 /* Compute the sha1 of string at 's' with 'len' bytes long.
10593 * The SHA1 is then xored againt the string pointed by digest.
10594 * Since xor is commutative, this operation is used in order to
10595 * "add" digests relative to unordered elements.
10596 *
10597 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10598 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10599 SHA1_CTX ctx;
10600 unsigned char hash[20], *s = ptr;
10601 int j;
10602
10603 SHA1Init(&ctx);
10604 SHA1Update(&ctx,s,len);
10605 SHA1Final(hash,&ctx);
10606
10607 for (j = 0; j < 20; j++)
10608 digest[j] ^= hash[j];
10609 }
10610
10611 static void xorObjectDigest(unsigned char *digest, robj *o) {
10612 o = getDecodedObject(o);
10613 xorDigest(digest,o->ptr,sdslen(o->ptr));
10614 decrRefCount(o);
10615 }
10616
10617 /* This function instead of just computing the SHA1 and xoring it
10618 * against diget, also perform the digest of "digest" itself and
10619 * replace the old value with the new one.
10620 *
10621 * So the final digest will be:
10622 *
10623 * digest = SHA1(digest xor SHA1(data))
10624 *
10625 * This function is used every time we want to preserve the order so
10626 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10627 *
10628 * Also note that mixdigest("foo") followed by mixdigest("bar")
10629 * will lead to a different digest compared to "fo", "obar".
10630 */
10631 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10632 SHA1_CTX ctx;
10633 char *s = ptr;
10634
10635 xorDigest(digest,s,len);
10636 SHA1Init(&ctx);
10637 SHA1Update(&ctx,digest,20);
10638 SHA1Final(digest,&ctx);
10639 }
10640
10641 static void mixObjectDigest(unsigned char *digest, robj *o) {
10642 o = getDecodedObject(o);
10643 mixDigest(digest,o->ptr,sdslen(o->ptr));
10644 decrRefCount(o);
10645 }
10646
10647 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10648 * are not ordered, we use a trick: every aggregate digest is the xor
10649 * of the digests of their elements. This way the order will not change
10650 * the result. For list instead we use a feedback entering the output digest
10651 * as input in order to ensure that a different ordered list will result in
10652 * a different digest. */
10653 static void computeDatasetDigest(unsigned char *final) {
10654 unsigned char digest[20];
10655 char buf[128];
10656 dictIterator *di = NULL;
10657 dictEntry *de;
10658 int j;
10659 uint32_t aux;
10660
10661 memset(final,0,20); /* Start with a clean result */
10662
10663 for (j = 0; j < server.dbnum; j++) {
10664 redisDb *db = server.db+j;
10665
10666 if (dictSize(db->dict) == 0) continue;
10667 di = dictGetIterator(db->dict);
10668
10669 /* hash the DB id, so the same dataset moved in a different
10670 * DB will lead to a different digest */
10671 aux = htonl(j);
10672 mixDigest(final,&aux,sizeof(aux));
10673
10674 /* Iterate this DB writing every entry */
10675 while((de = dictNext(di)) != NULL) {
10676 robj *key, *o, *kcopy;
10677 time_t expiretime;
10678
10679 memset(digest,0,20); /* This key-val digest */
10680 key = dictGetEntryKey(de);
10681
10682 if (!server.vm_enabled) {
10683 mixObjectDigest(digest,key);
10684 o = dictGetEntryVal(de);
10685 } else {
10686 /* Don't work with the key directly as when VM is active
10687 * this is unsafe: TODO: fix decrRefCount to check if the
10688 * count really reached 0 to avoid this mess */
10689 kcopy = dupStringObject(key);
10690 mixObjectDigest(digest,kcopy);
10691 o = lookupKeyRead(db,kcopy);
10692 decrRefCount(kcopy);
10693 }
10694 aux = htonl(o->type);
10695 mixDigest(digest,&aux,sizeof(aux));
10696 expiretime = getExpire(db,key);
10697
10698 /* Save the key and associated value */
10699 if (o->type == REDIS_STRING) {
10700 mixObjectDigest(digest,o);
10701 } else if (o->type == REDIS_LIST) {
10702 list *list = o->ptr;
10703 listNode *ln;
10704 listIter li;
10705
10706 listRewind(list,&li);
10707 while((ln = listNext(&li))) {
10708 robj *eleobj = listNodeValue(ln);
10709
10710 mixObjectDigest(digest,eleobj);
10711 }
10712 } else if (o->type == REDIS_SET) {
10713 dict *set = o->ptr;
10714 dictIterator *di = dictGetIterator(set);
10715 dictEntry *de;
10716
10717 while((de = dictNext(di)) != NULL) {
10718 robj *eleobj = dictGetEntryKey(de);
10719
10720 xorObjectDigest(digest,eleobj);
10721 }
10722 dictReleaseIterator(di);
10723 } else if (o->type == REDIS_ZSET) {
10724 zset *zs = o->ptr;
10725 dictIterator *di = dictGetIterator(zs->dict);
10726 dictEntry *de;
10727
10728 while((de = dictNext(di)) != NULL) {
10729 robj *eleobj = dictGetEntryKey(de);
10730 double *score = dictGetEntryVal(de);
10731 unsigned char eledigest[20];
10732
10733 snprintf(buf,sizeof(buf),"%.17g",*score);
10734 memset(eledigest,0,20);
10735 mixObjectDigest(eledigest,eleobj);
10736 mixDigest(eledigest,buf,strlen(buf));
10737 xorDigest(digest,eledigest,20);
10738 }
10739 dictReleaseIterator(di);
10740 } else if (o->type == REDIS_HASH) {
10741 hashIterator *hi;
10742 robj *obj;
10743
10744 hi = hashInitIterator(o);
10745 while (hashNext(hi) != REDIS_ERR) {
10746 unsigned char eledigest[20];
10747
10748 memset(eledigest,0,20);
10749 obj = hashCurrent(hi,REDIS_HASH_KEY);
10750 mixObjectDigest(eledigest,obj);
10751 decrRefCount(obj);
10752 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10753 mixObjectDigest(eledigest,obj);
10754 decrRefCount(obj);
10755 xorDigest(digest,eledigest,20);
10756 }
10757 hashReleaseIterator(hi);
10758 } else {
10759 redisPanic("Unknown object type");
10760 }
10761 /* If the key has an expire, add it to the mix */
10762 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10763 /* We can finally xor the key-val digest to the final digest */
10764 xorDigest(final,digest,20);
10765 }
10766 dictReleaseIterator(di);
10767 }
10768 }
10769
10770 static void debugCommand(redisClient *c) {
10771 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10772 *((char*)-1) = 'x';
10773 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10774 if (rdbSave(server.dbfilename) != REDIS_OK) {
10775 addReply(c,shared.err);
10776 return;
10777 }
10778 emptyDb();
10779 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10780 addReply(c,shared.err);
10781 return;
10782 }
10783 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10784 addReply(c,shared.ok);
10785 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10786 emptyDb();
10787 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10788 addReply(c,shared.err);
10789 return;
10790 }
10791 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10792 addReply(c,shared.ok);
10793 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10794 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10795 robj *key, *val;
10796
10797 if (!de) {
10798 addReply(c,shared.nokeyerr);
10799 return;
10800 }
10801 key = dictGetEntryKey(de);
10802 val = dictGetEntryVal(de);
10803 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10804 key->storage == REDIS_VM_SWAPPING)) {
10805 char *strenc;
10806 char buf[128];
10807
10808 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10809 strenc = strencoding[val->encoding];
10810 } else {
10811 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10812 strenc = buf;
10813 }
10814 addReplySds(c,sdscatprintf(sdsempty(),
10815 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10816 "encoding:%s serializedlength:%lld\r\n",
10817 (void*)key, key->refcount, (void*)val, val->refcount,
10818 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10819 } else {
10820 addReplySds(c,sdscatprintf(sdsempty(),
10821 "+Key at:%p refcount:%d, value swapped at: page %llu "
10822 "using %llu pages\r\n",
10823 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10824 (unsigned long long) key->vm.usedpages));
10825 }
10826 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10827 lookupKeyRead(c->db,c->argv[2]);
10828 addReply(c,shared.ok);
10829 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10830 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10831 robj *key, *val;
10832
10833 if (!server.vm_enabled) {
10834 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10835 return;
10836 }
10837 if (!de) {
10838 addReply(c,shared.nokeyerr);
10839 return;
10840 }
10841 key = dictGetEntryKey(de);
10842 val = dictGetEntryVal(de);
10843 /* If the key is shared we want to create a copy */
10844 if (key->refcount > 1) {
10845 robj *newkey = dupStringObject(key);
10846 decrRefCount(key);
10847 key = dictGetEntryKey(de) = newkey;
10848 }
10849 /* Swap it */
10850 if (key->storage != REDIS_VM_MEMORY) {
10851 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10852 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10853 dictGetEntryVal(de) = NULL;
10854 addReply(c,shared.ok);
10855 } else {
10856 addReply(c,shared.err);
10857 }
10858 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10859 long keys, j;
10860 robj *key, *val;
10861 char buf[128];
10862
10863 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10864 return;
10865 for (j = 0; j < keys; j++) {
10866 snprintf(buf,sizeof(buf),"key:%lu",j);
10867 key = createStringObject(buf,strlen(buf));
10868 if (lookupKeyRead(c->db,key) != NULL) {
10869 decrRefCount(key);
10870 continue;
10871 }
10872 snprintf(buf,sizeof(buf),"value:%lu",j);
10873 val = createStringObject(buf,strlen(buf));
10874 dictAdd(c->db->dict,key,val);
10875 }
10876 addReply(c,shared.ok);
10877 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10878 unsigned char digest[20];
10879 sds d = sdsnew("+");
10880 int j;
10881
10882 computeDatasetDigest(digest);
10883 for (j = 0; j < 20; j++)
10884 d = sdscatprintf(d, "%02x",digest[j]);
10885
10886 d = sdscatlen(d,"\r\n",2);
10887 addReplySds(c,d);
10888 } else {
10889 addReplySds(c,sdsnew(
10890 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10891 }
10892 }
10893
10894 static void _redisAssert(char *estr, char *file, int line) {
10895 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10896 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10897 #ifdef HAVE_BACKTRACE
10898 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10899 *((char*)-1) = 'x';
10900 #endif
10901 }
10902
10903 static void _redisPanic(char *msg, char *file, int line) {
10904 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10905 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10906 #ifdef HAVE_BACKTRACE
10907 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10908 *((char*)-1) = 'x';
10909 #endif
10910 }
10911
10912 /* =================================== Main! ================================ */
10913
10914 #ifdef __linux__
/* Return the integer found on the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file can't be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10928
10929 void linuxOvercommitMemoryWarning(void) {
10930 if (linuxOvercommitMemoryValue() == 0) {
10931 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10932 }
10933 }
10934 #endif /* __linux__ */
10935
10936 static void daemonize(void) {
10937 int fd;
10938 FILE *fp;
10939
10940 if (fork() != 0) exit(0); /* parent exits */
10941 setsid(); /* create a new session */
10942
10943 /* Every output goes to /dev/null. If Redis is daemonized but
10944 * the 'logfile' is set to 'stdout' in the configuration file
10945 * it will not log at all. */
10946 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10947 dup2(fd, STDIN_FILENO);
10948 dup2(fd, STDOUT_FILENO);
10949 dup2(fd, STDERR_FILENO);
10950 if (fd > STDERR_FILENO) close(fd);
10951 }
10952 /* Try to write the pid file */
10953 fp = fopen(server.pidfile,"w");
10954 if (fp) {
10955 fprintf(fp,"%d\n",getpid());
10956 fclose(fp);
10957 }
10958 }
10959
10960 static void version() {
10961 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10962 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
10963 exit(0);
10964 }
10965
/* Print the command line help on stderr, then exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10971
10972 int main(int argc, char **argv) {
10973 time_t start;
10974
10975 initServerConfig();
10976 sortCommandTable();
10977 if (argc == 2) {
10978 if (strcmp(argv[1], "-v") == 0 ||
10979 strcmp(argv[1], "--version") == 0) version();
10980 if (strcmp(argv[1], "--help") == 0) usage();
10981 resetServerSaveParams();
10982 loadServerConfig(argv[1]);
10983 } else if ((argc > 2)) {
10984 usage();
10985 } else {
10986 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10987 }
10988 if (server.daemonize) daemonize();
10989 initServer();
10990 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10991 #ifdef __linux__
10992 linuxOvercommitMemoryWarning();
10993 #endif
10994 start = time(NULL);
10995 if (server.appendonly) {
10996 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10997 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10998 } else {
10999 if (rdbLoad(server.dbfilename) == REDIS_OK)
11000 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11001 }
11002 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11003 aeSetBeforeSleepProc(server.el,beforeSleep);
11004 aeMain(server.el);
11005 aeDeleteEventLoop(server.el);
11006 return 0;
11007 }
11008
11009 /* ============================= Backtrace support ========================= */
11010
11011 #ifdef HAVE_BACKTRACE
11012 static char *findFuncName(void *pointer, unsigned long *offset);
11013
11014 static void *getMcontextEip(ucontext_t *uc) {
11015 #if defined(__FreeBSD__)
11016 return (void*) uc->uc_mcontext.mc_eip;
11017 #elif defined(__dietlibc__)
11018 return (void*) uc->uc_mcontext.eip;
11019 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11020 #if __x86_64__
11021 return (void*) uc->uc_mcontext->__ss.__rip;
11022 #else
11023 return (void*) uc->uc_mcontext->__ss.__eip;
11024 #endif
11025 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11026 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11027 return (void*) uc->uc_mcontext->__ss.__rip;
11028 #else
11029 return (void*) uc->uc_mcontext->__ss.__eip;
11030 #endif
11031 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11032 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11033 #elif defined(__ia64__) /* Linux IA64 */
11034 return (void*) uc->uc_mcontext.sc_ip;
11035 #else
11036 return NULL;
11037 #endif
11038 }
11039
11040 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11041 void *trace[100];
11042 char **messages = NULL;
11043 int i, trace_size = 0;
11044 unsigned long offset=0;
11045 ucontext_t *uc = (ucontext_t*) secret;
11046 sds infostring;
11047 REDIS_NOTUSED(info);
11048
11049 redisLog(REDIS_WARNING,
11050 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11051 infostring = genRedisInfoString();
11052 redisLog(REDIS_WARNING, "%s",infostring);
11053 /* It's not safe to sdsfree() the returned string under memory
11054 * corruption conditions. Let it leak as we are going to abort */
11055
11056 trace_size = backtrace(trace, 100);
11057 /* overwrite sigaction with caller's address */
11058 if (getMcontextEip(uc) != NULL) {
11059 trace[1] = getMcontextEip(uc);
11060 }
11061 messages = backtrace_symbols(trace, trace_size);
11062
11063 for (i=1; i<trace_size; ++i) {
11064 char *fn = findFuncName(trace[i], &offset), *p;
11065
11066 p = strchr(messages[i],'+');
11067 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11068 redisLog(REDIS_WARNING,"%s", messages[i]);
11069 } else {
11070 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11071 }
11072 }
11073 /* free(messages); Don't call free() with possibly corrupted memory. */
11074 _exit(0);
11075 }
11076
11077 static void sigtermHandler(int sig) {
11078 REDIS_NOTUSED(sig);
11079
11080 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11081 server.shutdown_asap = 1;
11082 }
11083
11084 static void setupSigSegvAction(void) {
11085 struct sigaction act;
11086
11087 sigemptyset (&act.sa_mask);
11088 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11089 * is used. Otherwise, sa_handler is used */
11090 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11091 act.sa_sigaction = segvHandler;
11092 sigaction (SIGSEGV, &act, NULL);
11093 sigaction (SIGBUS, &act, NULL);
11094 sigaction (SIGFPE, &act, NULL);
11095 sigaction (SIGILL, &act, NULL);
11096 sigaction (SIGBUS, &act, NULL);
11097
11098 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11099 act.sa_handler = sigtermHandler;
11100 sigaction (SIGTERM, &act, NULL);
11101 return;
11102 }
11103
11104 #include "staticsymbols.h"
11105 /* This function try to convert a pointer into a function name. It's used in
11106 * oreder to provide a backtrace under segmentation fault that's able to
11107 * display functions declared as static (otherwise the backtrace is useless). */
11108 static char *findFuncName(void *pointer, unsigned long *offset){
11109 int i, ret = -1;
11110 unsigned long off, minoff = 0;
11111
11112 /* Try to match against the Symbol with the smallest offset */
11113 for (i=0; symsTable[i].pointer; i++) {
11114 unsigned long lp = (unsigned long) pointer;
11115
11116 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11117 off=lp-symsTable[i].pointer;
11118 if (ret < 0 || off < minoff) {
11119 minoff=off;
11120 ret=i;
11121 }
11122 }
11123 }
11124 if (ret == -1) return NULL;
11125 *offset = minoff;
11126 return symsTable[ret].name;
11127 }
11128 #else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* Built without HAVE_BACKTRACE: there is no handler to install. */
}
11131 #endif /* HAVE_BACKTRACE */
11132
11133
11134
11135 /* The End */
11136
11137
11138