]> git.saurik.com Git - redis.git/blob - redis.c
9352356d2904a78368bf44dd1ada6b7787f58dcb
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Generic status codes. */
#define REDIS_OK                        0
#define REDIS_ERR                       -1

/* Static (compile-time) server configuration. */
#define REDIS_SERVERPORT                6379            /* TCP port */
#define REDIS_MAXIDLETIME               (60*5)          /* Default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               8
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000         /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME             60              /* A slave can't take longer than this to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10              /* Look up 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT       (1024*64)
#define REDIS_REQUEST_MAX_SIZE          (1024*1024*256) /* Max bytes in an inline command */

/* writev is used when more than REDIS_WRITEV_THRESHOLD write packets
 * are pending. */
#define REDIS_WRITEV_THRESHOLD          3
/* Upper bound on the number of iovecs passed to a single writev call. */
#define REDIS_WRITEV_IOVEC_COUNT        256
103
/* Hash table parameters. */
#define REDIS_HT_MINFILL                10  /* Minimal hash table fill: 10% */

/* Command flags (bit values, combined with |). */
#define REDIS_CMD_BULK                  1   /* Bulk write command */
#define REDIS_CMD_INLINE                2   /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied under low memory conditions. */
#define REDIS_CMD_DENYOOM               4
#define REDIS_CMD_FORCE_REPLICATION     8   /* Force replication even if dirty is 0 */
116
/* Object types. */
#define REDIS_STRING            0
#define REDIS_LIST              1
#define REDIS_SET               2
#define REDIS_ZSET              3
#define REDIS_HASH              4

/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * represented internally in more than one way. The 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* Encoded as a hash table */

/* Printable name of each encoding, indexed by REDIS_ENCODING_* value. */
static char* strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
135
/* Object types only used when dumping to disk. */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys would waste a lot of space, so the two most significant bits of the
 * first byte tell how to interpret the length:
 *
 * 00|000000 => the length is the 6 low bits of this byte
 * 01|000000 00000000 => the length is 14 bits: 6 bits of this byte plus the
 *                       8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit length follows
 * 11|000000 => a specially encoded object follows. The six bits number
 *              specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN       0
#define REDIS_RDB_14BITLEN      1
#define REDIS_RDB_32BITLEN      2
#define REDIS_RDB_ENCVAL        3
#define REDIS_RDB_LENERR        UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining bits select a special encoding for the object,
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8      0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16     1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32     2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF       3   /* string compressed with FASTLZ */
167
/* Values of the virtual memory object->where field. */
#define REDIS_VM_MEMORY     0   /* The object is in memory */
#define REDIS_VM_SWAPPED    1   /* The object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration.
 * See vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. Virtual Memory I/O operations are performed by
 * threads, but once completed they must be processed by the main thread in
 * order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
/* Client flags (bit values). */
#define REDIS_SLAVE         1   /* This client is a slave server */
#define REDIS_MASTER        2   /* This client is a master server */
#define REDIS_MONITOR       4   /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI         8   /* This client is in a MULTI context */
#define REDIS_BLOCKED       16  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT       32  /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS     64  /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side. */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state, instead, the server
 * is waiting to start the next background saving in order to send updates
 * to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */
208
/* List ends. */
#define REDIS_HEAD                      0
#define REDIS_TAIL                      1

/* Sort operations. */
#define REDIS_SORT_GET                  0
#define REDIS_SORT_ASC                  1
#define REDIS_SORT_DESC                 2
#define REDIS_SORTKEY_MAX               1024

/* Log levels. */
#define REDIS_DEBUG                     0
#define REDIS_VERBOSE                   1
#define REDIS_NOTICE                    2
#define REDIS_WARNING                   3

/* Silences "unused" warnings by evaluating V as a void expression. */
#define REDIS_NOTUSED(V) ((void) V)

/* Skiplist tuning. */
#define ZSKIPLIST_MAXLEVEL              32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P                     0.25    /* Skiplist P = 1/4 */

/* Append only file fsync policies. */
#define APPENDFSYNC_NO                  0
#define APPENDFSYNC_ALWAYS              1
#define APPENDFSYNC_EVERYSEC            2

/* Hash related defaults. */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
239
/* Assertion and panic macros.
 *
 * redisAssert(_e): when _e evaluates to false, calls _redisAssert() with the
 * stringified expression, the source file and the line number, then
 * terminates the process with _exit(1). When _e is true it expands to a
 * void no-op.
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, the source file and the line number, then terminates the
 * process with _exit(1).
 *
 * Both replacement lists are fully parenthesized so the macros behave as a
 * single void expression wherever they are expanded. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
246 /*================================= Data types ============================== */
247
248 /* A redis object, that is a type able to hold a string / list / set */
249
/* Virtual memory bookkeeping attached to a Redis object.
 *
 * NOTE(review): the trailing declarator also defines a file-scope object
 * named 'vm' of this type; confirm whether that object is intentional. */
struct redisObjectVM {
    off_t page;         /* The page at which the object is stored on disk */
    off_t usedpages;    /* Number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
256
257 /* The actual Redis Object */
258 typedef struct redisObject {
259 void *ptr;
260 unsigned char type;
261 unsigned char encoding;
262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
266 int refcount;
267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
272 } robj;
273
/* Initialize a Redis object allocated on the stack as a raw string: the
 * reference count is set to 1, the type to REDIS_STRING, the encoding to
 * REDIS_ENCODING_RAW and the ptr field to _ptr. The storage field is set
 * (to REDIS_VM_MEMORY) only when server.vm_enabled is true.
 *
 * _var is the object itself (an lvalue, not a pointer to it); _ptr is the
 * value stored in its ptr field.
 *
 * This macro is kept near the structure definition to make sure it gets
 * updated when the structure is changed, to avoid bugs like bug #85,
 * introduced exactly in this way.
 *
 * The expansion deliberately has no trailing semicolon, so an invocation
 * followed by ';' is exactly one statement and is safe in an unbraced
 * if/else. Both arguments are parenthesized so that expressions such as
 * '*p' can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
286 typedef struct redisDb {
287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
290 dict *io_keys; /* Keys with clients waiting for VM I/O */
291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
292 int id;
293 } redisDb;
294
295 /* Client MULTI/EXEC state */
296 typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300 } multiCmd;
301
302 typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305 } multiState;
306
307 /* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309 typedef struct redisClient {
310 int fd;
311 redisDb *db;
312 int dictid;
313 sds querybuf;
314 robj **argv, **mbargv;
315 int argc, mbargc;
316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
317 int multibulk; /* multi bulk command format active */
318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
326 long repldboff; /* replication DB file offset */
327 off_t repldbsize; /* replication DB file size */
328 multiState mstate; /* MULTI/EXEC state */
329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
330 * operation such as BLPOP. Otherwise NULL. */
331 int blocking_keys_num; /* Number of blocking keys */
332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
339 } redisClient;
340
/* A (seconds, changes) pair; the server keeps an array of these in its
 * saveparams field. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
346 /* Global server state structure */
347 struct redisServer {
348 int port;
349 int fd;
350 redisDb *db;
351 long long dirty; /* changes to DB from the last save */
352 list *clients;
353 list *slaves, *monitors;
354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
363 long long stat_expiredkeys; /* number of expired keys */
364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
370 int appendonly;
371 int appendfsync;
372 int no_appendfsync_on_rewrite;
373 int shutdown_asap;
374 time_t lastfsync;
375 int appendfd;
376 int appendseldb;
377 char *pidfile;
378 pid_t bgsavechildpid;
379 pid_t bgrewritechildpid;
380 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
381 sds aofbuf; /* AOF buffer, written before entering the event loop */
382 struct saveparam *saveparams;
383 int saveparamslen;
384 char *logfile;
385 char *bindaddr;
386 char *dbfilename;
387 char *appendfilename;
388 char *requirepass;
389 int rdbcompression;
390 int activerehashing;
391 /* Replication related */
392 int isslave;
393 char *masterauth;
394 char *masterhost;
395 int masterport;
396 redisClient *master; /* client that is master for this slave */
397 int replstate;
398 unsigned int maxclients;
399 unsigned long long maxmemory;
400 unsigned int blpop_blocked_clients;
401 unsigned int vm_blocked_clients;
402 /* Sort parameters - qsort_r() is only available under BSD so we
403 * have to take this state global, in order to pass it to sortCompare() */
404 int sort_desc;
405 int sort_alpha;
406 int sort_bypattern;
407 /* Virtual memory configuration */
408 int vm_enabled;
409 char *vm_swap_file;
410 off_t vm_page_size;
411 off_t vm_pages;
412 unsigned long long vm_max_memory;
413 /* Hashes config */
414 size_t hash_max_zipmap_entries;
415 size_t hash_max_zipmap_value;
416 /* Virtual memory state */
417 FILE *vm_fp;
418 int vm_fd;
419 off_t vm_next_page; /* Next probably empty page */
420 off_t vm_near_pages; /* Number of pages allocated sequentially */
421 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
422 time_t unixtime; /* Unix time sampled every second. */
423 /* Virtual memory I/O threads stuff */
424 /* An I/O thread process an element taken from the io_jobs queue and
425 * put the result of the operation in the io_done list. While the
426 * job is being processed, it's put on io_processing queue. */
427 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
428 list *io_processing; /* List of VM I/O jobs being processed */
429 list *io_processed; /* List of VM I/O jobs already processed */
430 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
431 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
432 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
433 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
434 pthread_attr_t io_threads_attr; /* attributes for threads creation */
435 int io_active_threads; /* Number of running I/O threads */
436 int vm_max_threads; /* Max number of I/O threads running at the same time */
437 /* Our main thread is blocked on the event loop, locking for sockets ready
438 * to be read or written, so when a threaded I/O operation is ready to be
439 * processed by the main thread, the I/O thread will use a unix pipe to
440 * awake the main thread. The followings are the two pipe FDs. */
441 int io_ready_pipe_read;
442 int io_ready_pipe_write;
443 /* Virtual memory stats */
444 unsigned long long vm_stats_used_pages;
445 unsigned long long vm_stats_swapped_objects;
446 unsigned long long vm_stats_swapouts;
447 unsigned long long vm_stats_swapins;
448 /* Pubsub */
449 dict *pubsub_channels; /* Map channels to list of subscribed clients */
450 list *pubsub_patterns; /* A list of pubsub_patterns */
451 /* Misc */
452 FILE *devnull;
453 };
454
455 typedef struct pubsubPattern {
456 redisClient *client;
457 robj *pattern;
458 } pubsubPattern;
459
460 typedef void redisCommandProc(redisClient *c);
461 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
462 struct redisCommand {
463 char *name;
464 redisCommandProc *proc;
465 int arity;
466 int flags;
467 /* Use a function to determine which keys need to be loaded
468 * in the background prior to executing this command. Takes precedence
469 * over vm_firstkey and others, ignored when NULL */
470 redisVmPreloadProc *vm_preload_proc;
471 /* What keys should be loaded in background when calling this command? */
472 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
473 int vm_lastkey; /* THe last argument that's a key */
474 int vm_keystep; /* The step between first and last key */
475 };
476
/* Associates a name with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
481
482 typedef struct _redisSortObject {
483 robj *obj;
484 union {
485 double score;
486 robj *cmpobj;
487 } u;
488 } redisSortObject;
489
490 typedef struct _redisSortOperation {
491 int type;
492 robj *pattern;
493 } redisSortOperation;
494
495 /* ZSETs use a specialized version of Skiplists */
496
497 typedef struct zskiplistNode {
498 struct zskiplistNode **forward;
499 struct zskiplistNode *backward;
500 unsigned int *span;
501 double score;
502 robj *obj;
503 } zskiplistNode;
504
/* The skiplist itself: header and tail nodes, element count and level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
510
511 typedef struct zset {
512 dict *dict;
513 zskiplist *zsl;
514 } zset;
515
516 /* Our shared "common" objects */
517
518 #define REDIS_SHARED_INTEGERS 10000
519 struct sharedObjectsStruct {
520 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
521 *colon, *nullbulk, *nullmultibulk, *queued,
522 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
523 *outofrangeerr, *plus,
524 *select0, *select1, *select2, *select3, *select4,
525 *select5, *select6, *select7, *select8, *select9,
526 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
527 *mbulk4, *psubscribebulk, *punsubscribebulk,
528 *integers[REDIS_SHARED_INTEGERS];
529 } shared;
530
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
536
537 /* VM threaded I/O request message */
538 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
539 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
540 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
541 typedef struct iojob {
542 int type; /* Request type, REDIS_IOJOB_* */
543 redisDb *db;/* Redis database */
544 robj *key; /* This I/O request is about swapping this key */
545 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
546 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
547 off_t page; /* Swap page where to read/write the object */
548 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
549 int canceled; /* True if this command was canceled by blocking side of VM */
550 pthread_t thread; /* ID of the thread processing this entry */
551 } iojob;
552
553 /*================================ Prototypes =============================== */
554
555 static void freeStringObject(robj *o);
556 static void freeListObject(robj *o);
557 static void freeSetObject(robj *o);
558 static void decrRefCount(void *o);
559 static robj *createObject(int type, void *ptr);
560 static void freeClient(redisClient *c);
561 static int rdbLoad(char *filename);
562 static void addReply(redisClient *c, robj *obj);
563 static void addReplySds(redisClient *c, sds s);
564 static void incrRefCount(robj *o);
565 static int rdbSaveBackground(char *filename);
566 static robj *createStringObject(char *ptr, size_t len);
567 static robj *dupStringObject(robj *o);
568 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
569 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
570 static void flushAppendOnlyFile(void);
571 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
572 static int syncWithMaster(void);
573 static robj *tryObjectEncoding(robj *o);
574 static robj *getDecodedObject(robj *o);
575 static int removeExpire(redisDb *db, robj *key);
576 static int expireIfNeeded(redisDb *db, robj *key);
577 static int deleteIfVolatile(redisDb *db, robj *key);
578 static int deleteIfSwapped(redisDb *db, robj *key);
579 static int deleteKey(redisDb *db, robj *key);
580 static time_t getExpire(redisDb *db, robj *key);
581 static int setExpire(redisDb *db, robj *key, time_t when);
582 static void updateSlavesWaitingBgsave(int bgsaveerr);
583 static void freeMemoryIfNeeded(void);
584 static int processCommand(redisClient *c);
585 static void setupSigSegvAction(void);
586 static void rdbRemoveTempFile(pid_t childpid);
587 static void aofRemoveTempFile(pid_t childpid);
588 static size_t stringObjectLen(robj *o);
589 static void processInputBuffer(redisClient *c);
590 static zskiplist *zslCreate(void);
591 static void zslFree(zskiplist *zsl);
592 static void zslInsert(zskiplist *zsl, double score, robj *obj);
593 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
594 static void initClientMultiState(redisClient *c);
595 static void freeClientMultiState(redisClient *c);
596 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
597 static void unblockClientWaitingData(redisClient *c);
598 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
599 static void vmInit(void);
600 static void vmMarkPagesFree(off_t page, off_t count);
601 static robj *vmLoadObject(robj *key);
602 static robj *vmPreviewObject(robj *key);
603 static int vmSwapOneObjectBlocking(void);
604 static int vmSwapOneObjectThreaded(void);
605 static int vmCanSwapOut(void);
606 static int tryFreeOneObjectFromFreelist(void);
607 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
608 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
609 static void vmCancelThreadedIOJob(robj *o);
610 static void lockThreadedIO(void);
611 static void unlockThreadedIO(void);
612 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
613 static void freeIOJob(iojob *j);
614 static void queueIOJob(iojob *j);
615 static int vmWriteObjectOnSwap(robj *o, off_t page);
616 static robj *vmReadObjectFromSwap(off_t page, int type);
617 static void waitEmptyIOJobsQueue(void);
618 static void vmReopenSwapFile(void);
619 static int vmFreePage(off_t page);
620 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
621 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
622 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
623 static int dontWaitForSwappedKey(redisClient *c, robj *key);
624 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
625 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
626 static struct redisCommand *lookupCommand(char *name);
627 static void call(redisClient *c, struct redisCommand *cmd);
628 static void resetClient(redisClient *c);
629 static void convertToRealHash(robj *o);
630 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
631 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
632 static void freePubsubPattern(void *p);
633 static int listMatchPubsubPattern(void *a, void *b);
634 static int compareStringObjects(robj *a, robj *b);
635 static int equalStringObjects(robj *a, robj *b);
636 static void usage();
637 static int rewriteAppendOnlyFileBackground(void);
638 static int vmSwapObjectBlocking(robj *key, robj *val);
639 static int prepareForShutdown();
640 static void touchWatchedKey(redisDb *db, robj *key);
641 static void touchWatchedKeysOnFlush(int dbid);
642 static void unwatchAllKeys(redisClient *c);
643
644 static void authCommand(redisClient *c);
645 static void pingCommand(redisClient *c);
646 static void echoCommand(redisClient *c);
647 static void setCommand(redisClient *c);
648 static void setnxCommand(redisClient *c);
649 static void setexCommand(redisClient *c);
650 static void getCommand(redisClient *c);
651 static void delCommand(redisClient *c);
652 static void existsCommand(redisClient *c);
653 static void incrCommand(redisClient *c);
654 static void decrCommand(redisClient *c);
655 static void incrbyCommand(redisClient *c);
656 static void decrbyCommand(redisClient *c);
657 static void selectCommand(redisClient *c);
658 static void randomkeyCommand(redisClient *c);
659 static void keysCommand(redisClient *c);
660 static void dbsizeCommand(redisClient *c);
661 static void lastsaveCommand(redisClient *c);
662 static void saveCommand(redisClient *c);
663 static void bgsaveCommand(redisClient *c);
664 static void bgrewriteaofCommand(redisClient *c);
665 static void shutdownCommand(redisClient *c);
666 static void moveCommand(redisClient *c);
667 static void renameCommand(redisClient *c);
668 static void renamenxCommand(redisClient *c);
669 static void lpushCommand(redisClient *c);
670 static void rpushCommand(redisClient *c);
671 static void lpopCommand(redisClient *c);
672 static void rpopCommand(redisClient *c);
673 static void llenCommand(redisClient *c);
674 static void lindexCommand(redisClient *c);
675 static void lrangeCommand(redisClient *c);
676 static void ltrimCommand(redisClient *c);
677 static void typeCommand(redisClient *c);
678 static void lsetCommand(redisClient *c);
679 static void saddCommand(redisClient *c);
680 static void sremCommand(redisClient *c);
681 static void smoveCommand(redisClient *c);
682 static void sismemberCommand(redisClient *c);
683 static void scardCommand(redisClient *c);
684 static void spopCommand(redisClient *c);
685 static void srandmemberCommand(redisClient *c);
686 static void sinterCommand(redisClient *c);
687 static void sinterstoreCommand(redisClient *c);
688 static void sunionCommand(redisClient *c);
689 static void sunionstoreCommand(redisClient *c);
690 static void sdiffCommand(redisClient *c);
691 static void sdiffstoreCommand(redisClient *c);
692 static void syncCommand(redisClient *c);
693 static void flushdbCommand(redisClient *c);
694 static void flushallCommand(redisClient *c);
695 static void sortCommand(redisClient *c);
696 static void lremCommand(redisClient *c);
697 static void rpoplpushcommand(redisClient *c);
698 static void infoCommand(redisClient *c);
699 static void mgetCommand(redisClient *c);
700 static void monitorCommand(redisClient *c);
701 static void expireCommand(redisClient *c);
702 static void expireatCommand(redisClient *c);
703 static void getsetCommand(redisClient *c);
704 static void ttlCommand(redisClient *c);
705 static void slaveofCommand(redisClient *c);
706 static void debugCommand(redisClient *c);
707 static void msetCommand(redisClient *c);
708 static void msetnxCommand(redisClient *c);
709 static void zaddCommand(redisClient *c);
710 static void zincrbyCommand(redisClient *c);
711 static void zrangeCommand(redisClient *c);
712 static void zrangebyscoreCommand(redisClient *c);
713 static void zcountCommand(redisClient *c);
714 static void zrevrangeCommand(redisClient *c);
715 static void zcardCommand(redisClient *c);
716 static void zremCommand(redisClient *c);
717 static void zscoreCommand(redisClient *c);
718 static void zremrangebyscoreCommand(redisClient *c);
719 static void multiCommand(redisClient *c);
720 static void execCommand(redisClient *c);
721 static void discardCommand(redisClient *c);
722 static void blpopCommand(redisClient *c);
723 static void brpopCommand(redisClient *c);
724 static void appendCommand(redisClient *c);
725 static void substrCommand(redisClient *c);
726 static void zrankCommand(redisClient *c);
727 static void zrevrankCommand(redisClient *c);
728 static void hsetCommand(redisClient *c);
729 static void hsetnxCommand(redisClient *c);
730 static void hgetCommand(redisClient *c);
731 static void hmsetCommand(redisClient *c);
732 static void hmgetCommand(redisClient *c);
733 static void hdelCommand(redisClient *c);
734 static void hlenCommand(redisClient *c);
735 static void zremrangebyrankCommand(redisClient *c);
736 static void zunionstoreCommand(redisClient *c);
737 static void zinterstoreCommand(redisClient *c);
738 static void hkeysCommand(redisClient *c);
739 static void hvalsCommand(redisClient *c);
740 static void hgetallCommand(redisClient *c);
741 static void hexistsCommand(redisClient *c);
742 static void configCommand(redisClient *c);
743 static void hincrbyCommand(redisClient *c);
744 static void subscribeCommand(redisClient *c);
745 static void unsubscribeCommand(redisClient *c);
746 static void psubscribeCommand(redisClient *c);
747 static void punsubscribeCommand(redisClient *c);
748 static void publishCommand(redisClient *c);
749 static void watchCommand(redisClient *c);
750 static void unwatchCommand(redisClient *c);
751
752 /*================================= Globals ================================= */
753
754 /* Global vars */
755 static struct redisServer server; /* server global state */
756 static struct redisCommand *commandTable;
757 static struct redisCommand readonlyCommandTable[] = {
758 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
760 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
761 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
762 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
763 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
765 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
769 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
778 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
781 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
782 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
785 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
794 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
795 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
802 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
803 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
811 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
814 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
815 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
817 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
818 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
823 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
827 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
828 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
840 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
846 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
848 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
856 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
859 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
864 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
866 };
867
868 /*============================ Utility functions ============================ */
869
/* Lowercase helper for stringmatchlen(). tolower() is only defined for
 * values representable as unsigned char (or EOF), so negative char values
 * are returned unchanged instead of being handed to tolower(). */
static int stringmatchLower(int ch) {
    return (ch < 0) ? ch : tolower(ch);
}

/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first
 * patternLen bytes of 'pattern'. Neither buffer needs to be nul terminated:
 * no byte at or after the given lengths is ever read.
 *
 * Supported syntax:
 *   *        any sequence of bytes, including the empty one
 *   ?        exactly one byte
 *   [abc]    one byte out of the listed ones
 *   [^abc]   one byte NOT among the listed ones
 *   [a-z]    one byte in the inclusive range
 *   \x       the byte x taken literally
 *
 * If 'nocase' is non zero literal bytes, class members and range bounds are
 * compared after lowercasing (an escaped byte inside a class is always
 * compared verbatim).
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A class always consumes one byte: nothing to consume means
             * no match (and string[0] must not be read). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = stringmatchLower(start);
                        end = stringmatchLower(end);
                        c = stringmatchLower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (stringmatchLower(pattern[0]) ==
                            stringmatchLower(string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (stringmatchLower(pattern[0]) !=
                    stringmatchLower(string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Only trailing '*' can still match the empty remainder. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
992
/* Glob-style match of two nul terminated C strings: the lengths are taken
 * with strlen() and the actual work is delegated to stringmatchlen().
 * Returns whatever stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
996
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * unit, matched case insensitively:
 *
 *   (none), b    bytes
 *   k            1000 bytes          kb    1024 bytes
 *   m            1000*1000 bytes     mb    1024*1024 bytes
 *   g            1000^3 bytes        gb    1024^3 bytes
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * Parsing errors are:
 *   - an unknown unit: the number is returned unscaled;
 *   - more than 127 characters before the unit: LLONG_MAX is returned;
 *   - a scaled value that does not fit in a long long: LLONG_MAX (or
 *     LLONG_MIN for negative numbers) is returned. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* The cast keeps isdigit() inside its defined domain for high bytes. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000LL*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024LL*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000LL*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024LL*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = (size_t)(u-p);
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: refuse to multiply when the product would
     * overflow, as signed overflow is undefined behavior. */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1043
/* Convert a long long into a nul terminated decimal string stored in 's',
 * a buffer of 'len' bytes.
 *
 * Returns the number of characters stored, not counting the nul term. If the
 * buffer is too small the output is truncated to the first len-1 characters
 * of the representation. With len == 0 nothing is written and 0 is
 * returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = (char)('0'+(v%10));
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return (int)l;
}
1067
1068 static void redisLog(int level, const char *fmt, ...) {
1069 va_list ap;
1070 FILE *fp;
1071
1072 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1073 if (!fp) return;
1074
1075 va_start(ap, fmt);
1076 if (level >= server.verbosity) {
1077 char *c = ".-*#";
1078 char buf[64];
1079 time_t now;
1080
1081 now = time(NULL);
1082 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1083 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1084 vfprintf(fp, fmt, ap);
1085 fprintf(fp,"\n");
1086 fflush(fp);
1087 }
1088 va_end(ap);
1089
1090 if (server.logfile) fclose(fp);
1091 }
1092
1093 /*====================== Hash table type implementation ==================== */
1094
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1098
/* Dict destructor callback that releases 'val' with zfree().
 * The privdata argument is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1104
1105 static void dictListDestructor(void *privdata, void *val)
1106 {
1107 DICT_NOTUSED(privdata);
1108 listRelease((list*)val);
1109 }
1110
1111 static int sdsDictKeyCompare(void *privdata, const void *key1,
1112 const void *key2)
1113 {
1114 int l1,l2;
1115 DICT_NOTUSED(privdata);
1116
1117 l1 = sdslen((sds)key1);
1118 l2 = sdslen((sds)key2);
1119 if (l1 != l2) return 0;
1120 return memcmp(key1, key2, l1) == 0;
1121 }
1122
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). The privdata argument is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1130
1131 static int dictObjKeyCompare(void *privdata, const void *key1,
1132 const void *key2)
1133 {
1134 const robj *o1 = key1, *o2 = key2;
1135 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1136 }
1137
1138 static unsigned int dictObjHash(const void *key) {
1139 const robj *o = key;
1140 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1141 }
1142
1143 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1144 const void *key2)
1145 {
1146 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1147 int cmp;
1148
1149 if (o1->encoding == REDIS_ENCODING_INT &&
1150 o2->encoding == REDIS_ENCODING_INT)
1151 return o1->ptr == o2->ptr;
1152
1153 o1 = getDecodedObject(o1);
1154 o2 = getDecodedObject(o2);
1155 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1156 decrRefCount(o1);
1157 decrRefCount(o2);
1158 return cmp;
1159 }
1160
1161 static unsigned int dictEncObjHash(const void *key) {
1162 robj *o = (robj*) key;
1163
1164 if (o->encoding == REDIS_ENCODING_RAW) {
1165 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1166 } else {
1167 if (o->encoding == REDIS_ENCODING_INT) {
1168 char buf[32];
1169 int len;
1170
1171 len = ll2string(buf,32,(long)o->ptr);
1172 return dictGenHashFunction((unsigned char*)buf, len);
1173 } else {
1174 unsigned int hash;
1175
1176 o = getDecodedObject(o);
1177 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178 decrRefCount(o);
1179 return hash;
1180 }
1181 }
1182 }
1183
1184 /* Sets type and expires */
1185 static dictType setDictType = {
1186 dictEncObjHash, /* hash function */
1187 NULL, /* key dup */
1188 NULL, /* val dup */
1189 dictEncObjKeyCompare, /* key compare */
1190 dictRedisObjectDestructor, /* key destructor */
1191 NULL /* val destructor */
1192 };
1193
1194 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1195 static dictType zsetDictType = {
1196 dictEncObjHash, /* hash function */
1197 NULL, /* key dup */
1198 NULL, /* val dup */
1199 dictEncObjKeyCompare, /* key compare */
1200 dictRedisObjectDestructor, /* key destructor */
1201 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1202 };
1203
1204 /* Db->dict */
1205 static dictType dbDictType = {
1206 dictObjHash, /* hash function */
1207 NULL, /* key dup */
1208 NULL, /* val dup */
1209 dictObjKeyCompare, /* key compare */
1210 dictRedisObjectDestructor, /* key destructor */
1211 dictRedisObjectDestructor /* val destructor */
1212 };
1213
1214 /* Db->expires */
1215 static dictType keyptrDictType = {
1216 dictObjHash, /* hash function */
1217 NULL, /* key dup */
1218 NULL, /* val dup */
1219 dictObjKeyCompare, /* key compare */
1220 dictRedisObjectDestructor, /* key destructor */
1221 NULL /* val destructor */
1222 };
1223
1224 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1225 static dictType hashDictType = {
1226 dictEncObjHash, /* hash function */
1227 NULL, /* key dup */
1228 NULL, /* val dup */
1229 dictEncObjKeyCompare, /* key compare */
1230 dictRedisObjectDestructor, /* key destructor */
1231 dictRedisObjectDestructor /* val destructor */
1232 };
1233
1234 /* Keylist hash table type has unencoded redis objects as keys and
1235 * lists as values. It's used for blocking operations (BLPOP) and to
1236 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1237 static dictType keylistDictType = {
1238 dictObjHash, /* hash function */
1239 NULL, /* key dup */
1240 NULL, /* val dup */
1241 dictObjKeyCompare, /* key compare */
1242 dictRedisObjectDestructor, /* key destructor */
1243 dictListDestructor /* val destructor */
1244 };
1245
1246 static void version();
1247
1248 /* ========================= Random utility functions ======================= */
1249
1250 /* Redis generally does not try to recover from out of memory conditions
1251 * when allocating objects or strings, it is not clear if it will be possible
1252 * to report this condition to the client since the networking layer itself
1253 * is based on heap allocation for send buffers, so we simply abort.
1254 * At least the code will be simpler to read... */
1255 static void oom(const char *msg) {
1256 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1257 sleep(1);
1258 abort();
1259 }
1260
1261 /* ====================== Redis server networking stuff ===================== */
1262 static void closeTimedoutClients(void) {
1263 redisClient *c;
1264 listNode *ln;
1265 time_t now = time(NULL);
1266 listIter li;
1267
1268 listRewind(server.clients,&li);
1269 while ((ln = listNext(&li)) != NULL) {
1270 c = listNodeValue(ln);
1271 if (server.maxidletime &&
1272 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1273 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1274 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1275 listLength(c->pubsub_patterns) == 0 &&
1276 (now - c->lastinteraction > server.maxidletime))
1277 {
1278 redisLog(REDIS_VERBOSE,"Closing idle client");
1279 freeClient(c);
1280 } else if (c->flags & REDIS_BLOCKED) {
1281 if (c->blockingto != 0 && c->blockingto < now) {
1282 addReply(c,shared.nullmultibulk);
1283 unblockClientWaitingData(c);
1284 }
1285 }
1286 }
1287 }
1288
1289 static int htNeedsResize(dict *dict) {
1290 long long size, used;
1291
1292 size = dictSlots(dict);
1293 used = dictSize(dict);
1294 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1295 (used*100/size < REDIS_HT_MINFILL));
1296 }
1297
1298 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1299 * we resize the hash table to save memory */
1300 static void tryResizeHashTables(void) {
1301 int j;
1302
1303 for (j = 0; j < server.dbnum; j++) {
1304 if (htNeedsResize(server.db[j].dict))
1305 dictResize(server.db[j].dict);
1306 if (htNeedsResize(server.db[j].expires))
1307 dictResize(server.db[j].expires);
1308 }
1309 }
1310
1311 /* Our hash table implementation performs rehashing incrementally while
1312 * we write/read from the hash table. Still if the server is idle, the hash
1313 * table will use two tables for a long time. So we try to use 1 millisecond
1314 * of CPU time at every serverCron() loop in order to rehash some key. */
1315 static void incrementallyRehash(void) {
1316 int j;
1317
1318 for (j = 0; j < server.dbnum; j++) {
1319 if (dictIsRehashing(server.db[j].dict)) {
1320 dictRehashMilliseconds(server.db[j].dict,1);
1321 break; /* already used our millisecond for this loop... */
1322 }
1323 }
1324 }
1325
1326 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1327 void backgroundSaveDoneHandler(int statloc) {
1328 int exitcode = WEXITSTATUS(statloc);
1329 int bysignal = WIFSIGNALED(statloc);
1330
1331 if (!bysignal && exitcode == 0) {
1332 redisLog(REDIS_NOTICE,
1333 "Background saving terminated with success");
1334 server.dirty = 0;
1335 server.lastsave = time(NULL);
1336 } else if (!bysignal && exitcode != 0) {
1337 redisLog(REDIS_WARNING, "Background saving error");
1338 } else {
1339 redisLog(REDIS_WARNING,
1340 "Background saving terminated by signal %d", WTERMSIG(statloc));
1341 rdbRemoveTempFile(server.bgsavechildpid);
1342 }
1343 server.bgsavechildpid = -1;
1344 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1345 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1346 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1347 }
1348
1349 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1350 * Handle this. */
1351 void backgroundRewriteDoneHandler(int statloc) {
1352 int exitcode = WEXITSTATUS(statloc);
1353 int bysignal = WIFSIGNALED(statloc);
1354
1355 if (!bysignal && exitcode == 0) {
1356 int fd;
1357 char tmpfile[256];
1358
1359 redisLog(REDIS_NOTICE,
1360 "Background append only file rewriting terminated with success");
1361 /* Now it's time to flush the differences accumulated by the parent */
1362 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1363 fd = open(tmpfile,O_WRONLY|O_APPEND);
1364 if (fd == -1) {
1365 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1366 goto cleanup;
1367 }
1368 /* Flush our data... */
1369 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1370 (signed) sdslen(server.bgrewritebuf)) {
1371 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1372 close(fd);
1373 goto cleanup;
1374 }
1375 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1376 /* Now our work is to rename the temp file into the stable file. And
1377 * switch the file descriptor used by the server for append only. */
1378 if (rename(tmpfile,server.appendfilename) == -1) {
1379 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1380 close(fd);
1381 goto cleanup;
1382 }
1383 /* Mission completed... almost */
1384 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1385 if (server.appendfd != -1) {
1386 /* If append only is actually enabled... */
1387 close(server.appendfd);
1388 server.appendfd = fd;
1389 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1390 server.appendseldb = -1; /* Make sure it will issue SELECT */
1391 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1392 } else {
1393 /* If append only is disabled we just generate a dump in this
1394 * format. Why not? */
1395 close(fd);
1396 }
1397 } else if (!bysignal && exitcode != 0) {
1398 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1399 } else {
1400 redisLog(REDIS_WARNING,
1401 "Background append only file rewriting terminated by signal %d",
1402 WTERMSIG(statloc));
1403 }
1404 cleanup:
1405 sdsfree(server.bgrewritebuf);
1406 server.bgrewritebuf = sdsempty();
1407 aofRemoveTempFile(server.bgrewritechildpid);
1408 server.bgrewritechildpid = -1;
1409 }
1410
1411 /* This function is called once a background process of some kind terminates,
1412 * as we want to avoid resizing the hash tables when there is a child in order
1413 * to play well with copy-on-write (otherwise when a resize happens lots of
1414 * memory pages are copied). The goal of this function is to update the ability
1415 * for dict.c to resize the hash tables accordingly to the fact we have o not
1416 * running childs. */
1417 static void updateDictResizePolicy(void) {
1418 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1419 dictEnableResize();
1420 else
1421 dictDisableResize();
1422 }
1423
1424 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1425 int j, loops = server.cronloops++;
1426 REDIS_NOTUSED(eventLoop);
1427 REDIS_NOTUSED(id);
1428 REDIS_NOTUSED(clientData);
1429
1430 /* We take a cached value of the unix time in the global state because
1431 * with virtual memory and aging there is to store the current time
1432 * in objects at every object access, and accuracy is not needed.
1433 * To access a global var is faster than calling time(NULL) */
1434 server.unixtime = time(NULL);
1435
1436 /* We received a SIGTERM, shutting down here in a safe way, as it is
1437 * not ok doing so inside the signal handler. */
1438 if (server.shutdown_asap) {
1439 if (prepareForShutdown() == REDIS_OK) exit(0);
1440 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1441 }
1442
1443 /* Show some info about non-empty databases */
1444 for (j = 0; j < server.dbnum; j++) {
1445 long long size, used, vkeys;
1446
1447 size = dictSlots(server.db[j].dict);
1448 used = dictSize(server.db[j].dict);
1449 vkeys = dictSize(server.db[j].expires);
1450 if (!(loops % 50) && (used || vkeys)) {
1451 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1452 /* dictPrintStats(server.dict); */
1453 }
1454 }
1455
1456 /* We don't want to resize the hash tables while a bacground saving
1457 * is in progress: the saving child is created using fork() that is
1458 * implemented with a copy-on-write semantic in most modern systems, so
1459 * if we resize the HT while there is the saving child at work actually
1460 * a lot of memory movements in the parent will cause a lot of pages
1461 * copied. */
1462 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1463 if (!(loops % 10)) tryResizeHashTables();
1464 if (server.activerehashing) incrementallyRehash();
1465 }
1466
1467 /* Show information about connected clients */
1468 if (!(loops % 50)) {
1469 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1470 listLength(server.clients)-listLength(server.slaves),
1471 listLength(server.slaves),
1472 zmalloc_used_memory());
1473 }
1474
1475 /* Close connections of timedout clients */
1476 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1477 closeTimedoutClients();
1478
1479 /* Check if a background saving or AOF rewrite in progress terminated */
1480 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1481 int statloc;
1482 pid_t pid;
1483
1484 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1485 if (pid == server.bgsavechildpid) {
1486 backgroundSaveDoneHandler(statloc);
1487 } else {
1488 backgroundRewriteDoneHandler(statloc);
1489 }
1490 updateDictResizePolicy();
1491 }
1492 } else {
1493 /* If there is not a background saving in progress check if
1494 * we have to save now */
1495 time_t now = time(NULL);
1496 for (j = 0; j < server.saveparamslen; j++) {
1497 struct saveparam *sp = server.saveparams+j;
1498
1499 if (server.dirty >= sp->changes &&
1500 now-server.lastsave > sp->seconds) {
1501 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1502 sp->changes, sp->seconds);
1503 rdbSaveBackground(server.dbfilename);
1504 break;
1505 }
1506 }
1507 }
1508
1509 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1510 * will use few CPU cycles if there are few expiring keys, otherwise
1511 * it will get more aggressive to avoid that too much memory is used by
1512 * keys that can be removed from the keyspace. */
1513 for (j = 0; j < server.dbnum; j++) {
1514 int expired;
1515 redisDb *db = server.db+j;
1516
1517 /* Continue to expire if at the end of the cycle more than 25%
1518 * of the keys were expired. */
1519 do {
1520 long num = dictSize(db->expires);
1521 time_t now = time(NULL);
1522
1523 expired = 0;
1524 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1525 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1526 while (num--) {
1527 dictEntry *de;
1528 time_t t;
1529
1530 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1531 t = (time_t) dictGetEntryVal(de);
1532 if (now > t) {
1533 deleteKey(db,dictGetEntryKey(de));
1534 expired++;
1535 server.stat_expiredkeys++;
1536 }
1537 }
1538 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1539 }
1540
1541 /* Swap a few keys on disk if we are over the memory limit and VM
1542 * is enbled. Try to free objects from the free list first. */
1543 if (vmCanSwapOut()) {
1544 while (server.vm_enabled && zmalloc_used_memory() >
1545 server.vm_max_memory)
1546 {
1547 int retval;
1548
1549 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1550 retval = (server.vm_max_threads == 0) ?
1551 vmSwapOneObjectBlocking() :
1552 vmSwapOneObjectThreaded();
1553 if (retval == REDIS_ERR && !(loops % 300) &&
1554 zmalloc_used_memory() >
1555 (server.vm_max_memory+server.vm_max_memory/10))
1556 {
1557 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1558 }
1559 /* Note that when using threade I/O we free just one object,
1560 * because anyway when the I/O thread in charge to swap this
1561 * object out will finish, the handler of completed jobs
1562 * will try to swap more objects if we are still out of memory. */
1563 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1564 }
1565 }
1566
1567 /* Check if we should connect to a MASTER */
1568 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1569 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1570 if (syncWithMaster() == REDIS_OK) {
1571 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1572 if (server.appendonly) rewriteAppendOnlyFileBackground();
1573 }
1574 }
1575 return 100;
1576 }
1577
1578 /* This function gets called every time Redis is entering the
1579 * main loop of the event driven library, that is, before to sleep
1580 * for ready file descriptors. */
1581 static void beforeSleep(struct aeEventLoop *eventLoop) {
1582 REDIS_NOTUSED(eventLoop);
1583
1584 /* Awake clients that got all the swapped keys they requested */
1585 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1586 listIter li;
1587 listNode *ln;
1588
1589 listRewind(server.io_ready_clients,&li);
1590 while((ln = listNext(&li))) {
1591 redisClient *c = ln->value;
1592 struct redisCommand *cmd;
1593
1594 /* Resume the client. */
1595 listDelNode(server.io_ready_clients,ln);
1596 c->flags &= (~REDIS_IO_WAIT);
1597 server.vm_blocked_clients--;
1598 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1599 readQueryFromClient, c);
1600 cmd = lookupCommand(c->argv[0]->ptr);
1601 assert(cmd != NULL);
1602 call(c,cmd);
1603 resetClient(c);
1604 /* There may be more data to process in the input buffer. */
1605 if (c->querybuf && sdslen(c->querybuf) > 0)
1606 processInputBuffer(c);
1607 }
1608 }
1609 /* Write the AOF buffer on disk */
1610 flushAppendOnlyFile();
1611 }
1612
1613 static void createSharedObjects(void) {
1614 int j;
1615
1616 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1617 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1618 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1619 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1620 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1621 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1622 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1623 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1624 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1625 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1626 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1627 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1629 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR no such key\r\n"));
1631 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR syntax error\r\n"));
1633 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR source and destination objects are the same\r\n"));
1635 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1636 "-ERR index out of range\r\n"));
1637 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1638 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1639 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1640 shared.select0 = createStringObject("select 0\r\n",10);
1641 shared.select1 = createStringObject("select 1\r\n",10);
1642 shared.select2 = createStringObject("select 2\r\n",10);
1643 shared.select3 = createStringObject("select 3\r\n",10);
1644 shared.select4 = createStringObject("select 4\r\n",10);
1645 shared.select5 = createStringObject("select 5\r\n",10);
1646 shared.select6 = createStringObject("select 6\r\n",10);
1647 shared.select7 = createStringObject("select 7\r\n",10);
1648 shared.select8 = createStringObject("select 8\r\n",10);
1649 shared.select9 = createStringObject("select 9\r\n",10);
1650 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1651 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1652 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1653 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1654 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1655 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1656 shared.mbulk3 = createStringObject("*3\r\n",4);
1657 shared.mbulk4 = createStringObject("*4\r\n",4);
1658 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1659 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1660 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1661 }
1662 }
1663
1664 static void appendServerSaveParams(time_t seconds, int changes) {
1665 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1666 server.saveparams[server.saveparamslen].seconds = seconds;
1667 server.saveparams[server.saveparamslen].changes = changes;
1668 server.saveparamslen++;
1669 }
1670
1671 static void resetServerSaveParams() {
1672 zfree(server.saveparams);
1673 server.saveparams = NULL;
1674 server.saveparamslen = 0;
1675 }
1676
1677 static void initServerConfig() {
1678 server.dbnum = REDIS_DEFAULT_DBNUM;
1679 server.port = REDIS_SERVERPORT;
1680 server.verbosity = REDIS_VERBOSE;
1681 server.maxidletime = REDIS_MAXIDLETIME;
1682 server.saveparams = NULL;
1683 server.logfile = NULL; /* NULL = log on standard output */
1684 server.bindaddr = NULL;
1685 server.glueoutputbuf = 1;
1686 server.daemonize = 0;
1687 server.appendonly = 0;
1688 server.appendfsync = APPENDFSYNC_EVERYSEC;
1689 server.no_appendfsync_on_rewrite = 0;
1690 server.lastfsync = time(NULL);
1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
1696 server.requirepass = NULL;
1697 server.rdbcompression = 1;
1698 server.activerehashing = 1;
1699 server.maxclients = 0;
1700 server.blpop_blocked_clients = 0;
1701 server.maxmemory = 0;
1702 server.vm_enabled = 0;
1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1707 server.vm_max_threads = 4;
1708 server.vm_blocked_clients = 0;
1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1711 server.shutdown_asap = 0;
1712
1713 resetServerSaveParams();
1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
1720 server.masterauth = NULL;
1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
1731 }
1732
1733 static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
1738 setupSigSegvAction();
1739
1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
1745 server.clients = listCreate();
1746 server.slaves = listCreate();
1747 server.monitors = listCreate();
1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
1757 for (j = 0; j < server.dbnum; j++) {
1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1764 server.db[j].id = j;
1765 }
1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1770 server.cronloops = 0;
1771 server.bgsavechildpid = -1;
1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
1774 server.aofbuf = sdsempty();
1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
1779 server.stat_expiredkeys = 0;
1780 server.stat_starttime = time(NULL);
1781 server.unixtime = time(NULL);
1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1785
1786 if (server.appendonly) {
1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
1794
1795 if (server.vm_enabled) vmInit();
1796 }
1797
1798 /* Empty the whole database */
1799 static long long emptyDb() {
1800 int j;
1801 long long removed = 0;
1802
1803 for (j = 0; j < server.dbnum; j++) {
1804 removed += dictSize(server.db[j].dict);
1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
1808 return removed;
1809 }
1810
/* Translate a case-insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
1817 /* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819 static void loadServerConfig(char *filename) {
1820 FILE *fp;
1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1830 exit(1);
1831 }
1832 }
1833
1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1854 server.maxidletime = atoi(argv[1]);
1855 if (server.maxidletime < 0) {
1856 err = "Invalid timeout value"; goto loaderr;
1857 }
1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1864 server.bindaddr = zstrdup(argv[1]);
1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1888 FILE *logfp;
1889
1890 server.logfile = zstrdup(argv[1]);
1891 if (!strcasecmp(server.logfile,"stdout")) {
1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
1904 fclose(logfp);
1905 }
1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1916 server.maxmemory = memtoll(argv[1],NULL);
1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
1946 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1947 && argc == 2) {
1948 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1949 err = "argument must be 'yes' or 'no'"; goto loaderr;
1950 }
1951 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1952 if (!strcasecmp(argv[1],"no")) {
1953 server.appendfsync = APPENDFSYNC_NO;
1954 } else if (!strcasecmp(argv[1],"always")) {
1955 server.appendfsync = APPENDFSYNC_ALWAYS;
1956 } else if (!strcasecmp(argv[1],"everysec")) {
1957 server.appendfsync = APPENDFSYNC_EVERYSEC;
1958 } else {
1959 err = "argument must be 'no', 'always' or 'everysec'";
1960 goto loaderr;
1961 }
1962 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1963 server.requirepass = zstrdup(argv[1]);
1964 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1965 zfree(server.pidfile);
1966 server.pidfile = zstrdup(argv[1]);
1967 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1968 zfree(server.dbfilename);
1969 server.dbfilename = zstrdup(argv[1]);
1970 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1971 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1972 err = "argument must be 'yes' or 'no'"; goto loaderr;
1973 }
1974 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1975 zfree(server.vm_swap_file);
1976 server.vm_swap_file = zstrdup(argv[1]);
1977 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1978 server.vm_max_memory = memtoll(argv[1],NULL);
1979 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1980 server.vm_page_size = memtoll(argv[1], NULL);
1981 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1982 server.vm_pages = memtoll(argv[1], NULL);
1983 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1984 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1985 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1986 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1987 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1988 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1989 } else {
1990 err = "Bad directive or wrong number of arguments"; goto loaderr;
1991 }
1992 for (j = 0; j < argc; j++)
1993 sdsfree(argv[j]);
1994 zfree(argv);
1995 sdsfree(line);
1996 }
1997 if (fp != stdin) fclose(fp);
1998 return;
1999
2000 loaderr:
2001 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2002 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2003 fprintf(stderr, ">>> '%s'\n", line);
2004 fprintf(stderr, "%s\n", err);
2005 exit(1);
2006 }
2007
2008 static void freeClientArgv(redisClient *c) {
2009 int j;
2010
2011 for (j = 0; j < c->argc; j++)
2012 decrRefCount(c->argv[j]);
2013 for (j = 0; j < c->mbargc; j++)
2014 decrRefCount(c->mbargv[j]);
2015 c->argc = 0;
2016 c->mbargc = 0;
2017 }
2018
2019 static void freeClient(redisClient *c) {
2020 listNode *ln;
2021
2022 /* Note that if the client we are freeing is blocked into a blocking
2023 * call, we have to set querybuf to NULL *before* to call
2024 * unblockClientWaitingData() to avoid processInputBuffer() will get
2025 * called. Also it is important to remove the file events after
2026 * this, because this call adds the READABLE event. */
2027 sdsfree(c->querybuf);
2028 c->querybuf = NULL;
2029 if (c->flags & REDIS_BLOCKED)
2030 unblockClientWaitingData(c);
2031
2032 /* UNWATCH all the keys */
2033 unwatchAllKeys(c);
2034 listRelease(c->watched_keys);
2035 /* Unsubscribe from all the pubsub channels */
2036 pubsubUnsubscribeAllChannels(c,0);
2037 pubsubUnsubscribeAllPatterns(c,0);
2038 dictRelease(c->pubsub_channels);
2039 listRelease(c->pubsub_patterns);
2040 /* Obvious cleanup */
2041 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2042 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2043 listRelease(c->reply);
2044 freeClientArgv(c);
2045 close(c->fd);
2046 /* Remove from the list of clients */
2047 ln = listSearchKey(server.clients,c);
2048 redisAssert(ln != NULL);
2049 listDelNode(server.clients,ln);
2050 /* Remove from the list of clients that are now ready to be restarted
2051 * after waiting for swapped keys */
2052 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2053 ln = listSearchKey(server.io_ready_clients,c);
2054 if (ln) {
2055 listDelNode(server.io_ready_clients,ln);
2056 server.vm_blocked_clients--;
2057 }
2058 }
2059 /* Remove from the list of clients waiting for swapped keys */
2060 while (server.vm_enabled && listLength(c->io_keys)) {
2061 ln = listFirst(c->io_keys);
2062 dontWaitForSwappedKey(c,ln->value);
2063 }
2064 listRelease(c->io_keys);
2065 /* Master/slave cleanup */
2066 if (c->flags & REDIS_SLAVE) {
2067 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2068 close(c->repldbfd);
2069 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2070 ln = listSearchKey(l,c);
2071 redisAssert(ln != NULL);
2072 listDelNode(l,ln);
2073 }
2074 if (c->flags & REDIS_MASTER) {
2075 server.master = NULL;
2076 server.replstate = REDIS_REPL_CONNECT;
2077 }
2078 /* Release memory */
2079 zfree(c->argv);
2080 zfree(c->mbargv);
2081 freeClientMultiState(c);
2082 zfree(c);
2083 }
2084
2085 #define GLUEREPLY_UP_TO (1024)
2086 static void glueReplyBuffersIfNeeded(redisClient *c) {
2087 int copylen = 0;
2088 char buf[GLUEREPLY_UP_TO];
2089 listNode *ln;
2090 listIter li;
2091 robj *o;
2092
2093 listRewind(c->reply,&li);
2094 while((ln = listNext(&li))) {
2095 int objlen;
2096
2097 o = ln->value;
2098 objlen = sdslen(o->ptr);
2099 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2100 memcpy(buf+copylen,o->ptr,objlen);
2101 copylen += objlen;
2102 listDelNode(c->reply,ln);
2103 } else {
2104 if (copylen == 0) return;
2105 break;
2106 }
2107 }
2108 /* Now the output buffer is empty, add the new single element */
2109 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2110 listAddNodeHead(c->reply,o);
2111 }
2112
2113 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2114 redisClient *c = privdata;
2115 int nwritten = 0, totwritten = 0, objlen;
2116 robj *o;
2117 REDIS_NOTUSED(el);
2118 REDIS_NOTUSED(mask);
2119
2120 /* Use writev() if we have enough buffers to send */
2121 if (!server.glueoutputbuf &&
2122 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2123 !(c->flags & REDIS_MASTER))
2124 {
2125 sendReplyToClientWritev(el, fd, privdata, mask);
2126 return;
2127 }
2128
2129 while(listLength(c->reply)) {
2130 if (server.glueoutputbuf && listLength(c->reply) > 1)
2131 glueReplyBuffersIfNeeded(c);
2132
2133 o = listNodeValue(listFirst(c->reply));
2134 objlen = sdslen(o->ptr);
2135
2136 if (objlen == 0) {
2137 listDelNode(c->reply,listFirst(c->reply));
2138 continue;
2139 }
2140
2141 if (c->flags & REDIS_MASTER) {
2142 /* Don't reply to a master */
2143 nwritten = objlen - c->sentlen;
2144 } else {
2145 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2146 if (nwritten <= 0) break;
2147 }
2148 c->sentlen += nwritten;
2149 totwritten += nwritten;
2150 /* If we fully sent the object on head go to the next one */
2151 if (c->sentlen == objlen) {
2152 listDelNode(c->reply,listFirst(c->reply));
2153 c->sentlen = 0;
2154 }
2155 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2156 * bytes, in a single threaded server it's a good idea to serve
2157 * other clients as well, even if a very large request comes from
2158 * super fast link that is always able to accept data (in real world
2159 * scenario think about 'KEYS *' against the loopback interfae) */
2160 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2161 }
2162 if (nwritten == -1) {
2163 if (errno == EAGAIN) {
2164 nwritten = 0;
2165 } else {
2166 redisLog(REDIS_VERBOSE,
2167 "Error writing to client: %s", strerror(errno));
2168 freeClient(c);
2169 return;
2170 }
2171 }
2172 if (totwritten > 0) c->lastinteraction = time(NULL);
2173 if (listLength(c->reply) == 0) {
2174 c->sentlen = 0;
2175 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2176 }
2177 }
2178
2179 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2180 {
2181 redisClient *c = privdata;
2182 int nwritten = 0, totwritten = 0, objlen, willwrite;
2183 robj *o;
2184 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2185 int offset, ion = 0;
2186 REDIS_NOTUSED(el);
2187 REDIS_NOTUSED(mask);
2188
2189 listNode *node;
2190 while (listLength(c->reply)) {
2191 offset = c->sentlen;
2192 ion = 0;
2193 willwrite = 0;
2194
2195 /* fill-in the iov[] array */
2196 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2197 o = listNodeValue(node);
2198 objlen = sdslen(o->ptr);
2199
2200 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2201 break;
2202
2203 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2204 break; /* no more iovecs */
2205
2206 iov[ion].iov_base = ((char*)o->ptr) + offset;
2207 iov[ion].iov_len = objlen - offset;
2208 willwrite += objlen - offset;
2209 offset = 0; /* just for the first item */
2210 ion++;
2211 }
2212
2213 if(willwrite == 0)
2214 break;
2215
2216 /* write all collected blocks at once */
2217 if((nwritten = writev(fd, iov, ion)) < 0) {
2218 if (errno != EAGAIN) {
2219 redisLog(REDIS_VERBOSE,
2220 "Error writing to client: %s", strerror(errno));
2221 freeClient(c);
2222 return;
2223 }
2224 break;
2225 }
2226
2227 totwritten += nwritten;
2228 offset = c->sentlen;
2229
2230 /* remove written robjs from c->reply */
2231 while (nwritten && listLength(c->reply)) {
2232 o = listNodeValue(listFirst(c->reply));
2233 objlen = sdslen(o->ptr);
2234
2235 if(nwritten >= objlen - offset) {
2236 listDelNode(c->reply, listFirst(c->reply));
2237 nwritten -= objlen - offset;
2238 c->sentlen = 0;
2239 } else {
2240 /* partial write */
2241 c->sentlen += nwritten;
2242 break;
2243 }
2244 offset = 0;
2245 }
2246 }
2247
2248 if (totwritten > 0)
2249 c->lastinteraction = time(NULL);
2250
2251 if (listLength(c->reply) == 0) {
2252 c->sentlen = 0;
2253 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2254 }
2255 }
2256
2257 static int qsortRedisCommands(const void *r1, const void *r2) {
2258 return strcasecmp(
2259 ((struct redisCommand*)r1)->name,
2260 ((struct redisCommand*)r2)->name);
2261 }
2262
2263 static void sortCommandTable() {
2264 /* Copy and sort the read-only version of the command table */
2265 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2266 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2267 qsort(commandTable,
2268 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2269 sizeof(struct redisCommand),qsortRedisCommands);
2270 }
2271
2272 static struct redisCommand *lookupCommand(char *name) {
2273 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2274 return bsearch(
2275 &tmp,
2276 commandTable,
2277 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2278 sizeof(struct redisCommand),
2279 qsortRedisCommands);
2280 }
2281
2282 /* resetClient prepare the client to process the next command */
2283 static void resetClient(redisClient *c) {
2284 freeClientArgv(c);
2285 c->bulklen = -1;
2286 c->multibulk = 0;
2287 }
2288
2289 /* Call() is the core of Redis execution of a command */
2290 static void call(redisClient *c, struct redisCommand *cmd) {
2291 long long dirty;
2292
2293 dirty = server.dirty;
2294 cmd->proc(c);
2295 dirty = server.dirty-dirty;
2296
2297 if (server.appendonly && dirty)
2298 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2299 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2300 listLength(server.slaves))
2301 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2302 if (listLength(server.monitors))
2303 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2304 server.stat_numcommands++;
2305 }
2306
2307 /* If this function gets called we already read a whole
2308 * command, argments are in the client argv/argc fields.
2309 * processCommand() execute the command or prepare the
2310 * server for a bulk read from the client.
2311 *
2312 * If 1 is returned the client is still alive and valid and
2313 * and other operations can be performed by the caller. Otherwise
2314 * if 0 is returned the client was destroied (i.e. after QUIT). */
2315 static int processCommand(redisClient *c) {
2316 struct redisCommand *cmd;
2317
2318 /* Free some memory if needed (maxmemory setting) */
2319 if (server.maxmemory) freeMemoryIfNeeded();
2320
2321 /* Handle the multi bulk command type. This is an alternative protocol
2322 * supported by Redis in order to receive commands that are composed of
2323 * multiple binary-safe "bulk" arguments. The latency of processing is
2324 * a bit higher but this allows things like multi-sets, so if this
2325 * protocol is used only for MSET and similar commands this is a big win. */
2326 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2327 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2328 if (c->multibulk <= 0) {
2329 resetClient(c);
2330 return 1;
2331 } else {
2332 decrRefCount(c->argv[c->argc-1]);
2333 c->argc--;
2334 return 1;
2335 }
2336 } else if (c->multibulk) {
2337 if (c->bulklen == -1) {
2338 if (((char*)c->argv[0]->ptr)[0] != '$') {
2339 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2340 resetClient(c);
2341 return 1;
2342 } else {
2343 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2344 decrRefCount(c->argv[0]);
2345 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2346 c->argc--;
2347 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2348 resetClient(c);
2349 return 1;
2350 }
2351 c->argc--;
2352 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2353 return 1;
2354 }
2355 } else {
2356 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2357 c->mbargv[c->mbargc] = c->argv[0];
2358 c->mbargc++;
2359 c->argc--;
2360 c->multibulk--;
2361 if (c->multibulk == 0) {
2362 robj **auxargv;
2363 int auxargc;
2364
2365 /* Here we need to swap the multi-bulk argc/argv with the
2366 * normal argc/argv of the client structure. */
2367 auxargv = c->argv;
2368 c->argv = c->mbargv;
2369 c->mbargv = auxargv;
2370
2371 auxargc = c->argc;
2372 c->argc = c->mbargc;
2373 c->mbargc = auxargc;
2374
2375 /* We need to set bulklen to something different than -1
2376 * in order for the code below to process the command without
2377 * to try to read the last argument of a bulk command as
2378 * a special argument. */
2379 c->bulklen = 0;
2380 /* continue below and process the command */
2381 } else {
2382 c->bulklen = -1;
2383 return 1;
2384 }
2385 }
2386 }
2387 /* -- end of multi bulk commands processing -- */
2388
2389 /* The QUIT command is handled as a special case. Normal command
2390 * procs are unable to close the client connection safely */
2391 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2392 freeClient(c);
2393 return 0;
2394 }
2395
2396 /* Now lookup the command and check ASAP about trivial error conditions
2397 * such wrong arity, bad command name and so forth. */
2398 cmd = lookupCommand(c->argv[0]->ptr);
2399 if (!cmd) {
2400 addReplySds(c,
2401 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2402 (char*)c->argv[0]->ptr));
2403 resetClient(c);
2404 return 1;
2405 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2406 (c->argc < -cmd->arity)) {
2407 addReplySds(c,
2408 sdscatprintf(sdsempty(),
2409 "-ERR wrong number of arguments for '%s' command\r\n",
2410 cmd->name));
2411 resetClient(c);
2412 return 1;
2413 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2414 /* This is a bulk command, we have to read the last argument yet. */
2415 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2416
2417 decrRefCount(c->argv[c->argc-1]);
2418 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2419 c->argc--;
2420 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2421 resetClient(c);
2422 return 1;
2423 }
2424 c->argc--;
2425 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2426 /* It is possible that the bulk read is already in the
2427 * buffer. Check this condition and handle it accordingly.
2428 * This is just a fast path, alternative to call processInputBuffer().
2429 * It's a good idea since the code is small and this condition
2430 * happens most of the times. */
2431 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2432 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2433 c->argc++;
2434 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2435 } else {
2436 /* Otherwise return... there is to read the last argument
2437 * from the socket. */
2438 return 1;
2439 }
2440 }
2441 /* Let's try to encode the bulk object to save space. */
2442 if (cmd->flags & REDIS_CMD_BULK)
2443 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2444
2445 /* Check if the user is authenticated */
2446 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2447 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2448 resetClient(c);
2449 return 1;
2450 }
2451
2452 /* Handle the maxmemory directive */
2453 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2454 zmalloc_used_memory() > server.maxmemory)
2455 {
2456 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2457 resetClient(c);
2458 return 1;
2459 }
2460
2461 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2462 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2463 &&
2464 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2465 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2466 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2467 resetClient(c);
2468 return 1;
2469 }
2470
2471 /* Exec the command */
2472 if (c->flags & REDIS_MULTI &&
2473 cmd->proc != execCommand && cmd->proc != discardCommand &&
2474 cmd->proc != multiCommand && cmd->proc != watchCommand)
2475 {
2476 queueMultiCommand(c,cmd);
2477 addReply(c,shared.queued);
2478 } else {
2479 if (server.vm_enabled && server.vm_max_threads > 0 &&
2480 blockClientOnSwappedKeys(c,cmd)) return 1;
2481 call(c,cmd);
2482 }
2483
2484 /* Prepare the client for the next command */
2485 resetClient(c);
2486 return 1;
2487 }
2488
2489 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2490 listNode *ln;
2491 listIter li;
2492 int outc = 0, j;
2493 robj **outv;
2494 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2495 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2496 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2497 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2498 robj *lenobj;
2499
2500 if (argc <= REDIS_STATIC_ARGS) {
2501 outv = static_outv;
2502 } else {
2503 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2504 }
2505
2506 lenobj = createObject(REDIS_STRING,
2507 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2508 lenobj->refcount = 0;
2509 outv[outc++] = lenobj;
2510 for (j = 0; j < argc; j++) {
2511 lenobj = createObject(REDIS_STRING,
2512 sdscatprintf(sdsempty(),"$%lu\r\n",
2513 (unsigned long) stringObjectLen(argv[j])));
2514 lenobj->refcount = 0;
2515 outv[outc++] = lenobj;
2516 outv[outc++] = argv[j];
2517 outv[outc++] = shared.crlf;
2518 }
2519
2520 /* Increment all the refcounts at start and decrement at end in order to
2521 * be sure to free objects if there is no slave in a replication state
2522 * able to be feed with commands */
2523 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2524 listRewind(slaves,&li);
2525 while((ln = listNext(&li))) {
2526 redisClient *slave = ln->value;
2527
2528 /* Don't feed slaves that are still waiting for BGSAVE to start */
2529 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2530
2531 /* Feed all the other slaves, MONITORs and so on */
2532 if (slave->slaveseldb != dictid) {
2533 robj *selectcmd;
2534
2535 switch(dictid) {
2536 case 0: selectcmd = shared.select0; break;
2537 case 1: selectcmd = shared.select1; break;
2538 case 2: selectcmd = shared.select2; break;
2539 case 3: selectcmd = shared.select3; break;
2540 case 4: selectcmd = shared.select4; break;
2541 case 5: selectcmd = shared.select5; break;
2542 case 6: selectcmd = shared.select6; break;
2543 case 7: selectcmd = shared.select7; break;
2544 case 8: selectcmd = shared.select8; break;
2545 case 9: selectcmd = shared.select9; break;
2546 default:
2547 selectcmd = createObject(REDIS_STRING,
2548 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2549 selectcmd->refcount = 0;
2550 break;
2551 }
2552 addReply(slave,selectcmd);
2553 slave->slaveseldb = dictid;
2554 }
2555 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2556 }
2557 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2558 if (outv != static_outv) zfree(outv);
2559 }
2560
2561 static sds sdscatrepr(sds s, char *p, size_t len) {
2562 s = sdscatlen(s,"\"",1);
2563 while(len--) {
2564 switch(*p) {
2565 case '\\':
2566 case '"':
2567 s = sdscatprintf(s,"\\%c",*p);
2568 break;
2569 case '\n': s = sdscatlen(s,"\\n",1); break;
2570 case '\r': s = sdscatlen(s,"\\r",1); break;
2571 case '\t': s = sdscatlen(s,"\\t",1); break;
2572 case '\a': s = sdscatlen(s,"\\a",1); break;
2573 case '\b': s = sdscatlen(s,"\\b",1); break;
2574 default:
2575 if (isprint(*p))
2576 s = sdscatprintf(s,"%c",*p);
2577 else
2578 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2579 break;
2580 }
2581 p++;
2582 }
2583 return sdscatlen(s,"\"",1);
2584 }
2585
2586 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2587 listNode *ln;
2588 listIter li;
2589 int j;
2590 sds cmdrepr = sdsnew("+");
2591 robj *cmdobj;
2592 struct timeval tv;
2593
2594 gettimeofday(&tv,NULL);
2595 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2596 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2597
2598 for (j = 0; j < argc; j++) {
2599 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2600 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2601 } else {
2602 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2603 sdslen(argv[j]->ptr));
2604 }
2605 if (j != argc-1)
2606 cmdrepr = sdscatlen(cmdrepr," ",1);
2607 }
2608 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2609 cmdobj = createObject(REDIS_STRING,cmdrepr);
2610
2611 listRewind(monitors,&li);
2612 while((ln = listNext(&li))) {
2613 redisClient *monitor = ln->value;
2614 addReply(monitor,cmdobj);
2615 }
2616 decrRefCount(cmdobj);
2617 }
2618
2619 static void processInputBuffer(redisClient *c) {
2620 again:
2621 /* Before to process the input buffer, make sure the client is not
2622 * waitig for a blocking operation such as BLPOP. Note that the first
2623 * iteration the client is never blocked, otherwise the processInputBuffer
2624 * would not be called at all, but after the execution of the first commands
2625 * in the input buffer the client may be blocked, and the "goto again"
2626 * will try to reiterate. The following line will make it return asap. */
2627 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2628 if (c->bulklen == -1) {
2629 /* Read the first line of the query */
2630 char *p = strchr(c->querybuf,'\n');
2631 size_t querylen;
2632
2633 if (p) {
2634 sds query, *argv;
2635 int argc, j;
2636
2637 query = c->querybuf;
2638 c->querybuf = sdsempty();
2639 querylen = 1+(p-(query));
2640 if (sdslen(query) > querylen) {
2641 /* leave data after the first line of the query in the buffer */
2642 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2643 }
2644 *p = '\0'; /* remove "\n" */
2645 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2646 sdsupdatelen(query);
2647
2648 /* Now we can split the query in arguments */
2649 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2650 sdsfree(query);
2651
2652 if (c->argv) zfree(c->argv);
2653 c->argv = zmalloc(sizeof(robj*)*argc);
2654
2655 for (j = 0; j < argc; j++) {
2656 if (sdslen(argv[j])) {
2657 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2658 c->argc++;
2659 } else {
2660 sdsfree(argv[j]);
2661 }
2662 }
2663 zfree(argv);
2664 if (c->argc) {
2665 /* Execute the command. If the client is still valid
2666 * after processCommand() return and there is something
2667 * on the query buffer try to process the next command. */
2668 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2669 } else {
2670 /* Nothing to process, argc == 0. Just process the query
2671 * buffer if it's not empty or return to the caller */
2672 if (sdslen(c->querybuf)) goto again;
2673 }
2674 return;
2675 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2676 redisLog(REDIS_VERBOSE, "Client protocol error");
2677 freeClient(c);
2678 return;
2679 }
2680 } else {
2681 /* Bulk read handling. Note that if we are at this point
2682 the client already sent a command terminated with a newline,
2683 we are reading the bulk data that is actually the last
2684 argument of the command. */
2685 int qbl = sdslen(c->querybuf);
2686
2687 if (c->bulklen <= qbl) {
2688 /* Copy everything but the final CRLF as final argument */
2689 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2690 c->argc++;
2691 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2692 /* Process the command. If the client is still valid after
2693 * the processing and there is more data in the buffer
2694 * try to parse it. */
2695 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2696 return;
2697 }
2698 }
2699 }
2700
2701 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2702 redisClient *c = (redisClient*) privdata;
2703 char buf[REDIS_IOBUF_LEN];
2704 int nread;
2705 REDIS_NOTUSED(el);
2706 REDIS_NOTUSED(mask);
2707
2708 nread = read(fd, buf, REDIS_IOBUF_LEN);
2709 if (nread == -1) {
2710 if (errno == EAGAIN) {
2711 nread = 0;
2712 } else {
2713 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2714 freeClient(c);
2715 return;
2716 }
2717 } else if (nread == 0) {
2718 redisLog(REDIS_VERBOSE, "Client closed connection");
2719 freeClient(c);
2720 return;
2721 }
2722 if (nread) {
2723 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2724 c->lastinteraction = time(NULL);
2725 } else {
2726 return;
2727 }
2728 processInputBuffer(c);
2729 }
2730
2731 static int selectDb(redisClient *c, int id) {
2732 if (id < 0 || id >= server.dbnum)
2733 return REDIS_ERR;
2734 c->db = &server.db[id];
2735 return REDIS_OK;
2736 }
2737
2738 static void *dupClientReplyValue(void *o) {
2739 incrRefCount((robj*)o);
2740 return o;
2741 }
2742
2743 static int listMatchObjects(void *a, void *b) {
2744 return equalStringObjects(a,b);
2745 }
2746
2747 static redisClient *createClient(int fd) {
2748 redisClient *c = zmalloc(sizeof(*c));
2749
2750 anetNonBlock(NULL,fd);
2751 anetTcpNoDelay(NULL,fd);
2752 if (!c) return NULL;
2753 selectDb(c,0);
2754 c->fd = fd;
2755 c->querybuf = sdsempty();
2756 c->argc = 0;
2757 c->argv = NULL;
2758 c->bulklen = -1;
2759 c->multibulk = 0;
2760 c->mbargc = 0;
2761 c->mbargv = NULL;
2762 c->sentlen = 0;
2763 c->flags = 0;
2764 c->lastinteraction = time(NULL);
2765 c->authenticated = 0;
2766 c->replstate = REDIS_REPL_NONE;
2767 c->reply = listCreate();
2768 listSetFreeMethod(c->reply,decrRefCount);
2769 listSetDupMethod(c->reply,dupClientReplyValue);
2770 c->blocking_keys = NULL;
2771 c->blocking_keys_num = 0;
2772 c->io_keys = listCreate();
2773 c->watched_keys = listCreate();
2774 listSetFreeMethod(c->io_keys,decrRefCount);
2775 c->pubsub_channels = dictCreate(&setDictType,NULL);
2776 c->pubsub_patterns = listCreate();
2777 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2778 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2779 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2780 readQueryFromClient, c) == AE_ERR) {
2781 freeClient(c);
2782 return NULL;
2783 }
2784 listAddNodeTail(server.clients,c);
2785 initClientMultiState(c);
2786 return c;
2787 }
2788
2789 static void addReply(redisClient *c, robj *obj) {
2790 if (listLength(c->reply) == 0 &&
2791 (c->replstate == REDIS_REPL_NONE ||
2792 c->replstate == REDIS_REPL_ONLINE) &&
2793 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2794 sendReplyToClient, c) == AE_ERR) return;
2795
2796 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2797 obj = dupStringObject(obj);
2798 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2799 }
2800 listAddNodeTail(c->reply,getDecodedObject(obj));
2801 }
2802
2803 static void addReplySds(redisClient *c, sds s) {
2804 robj *o = createObject(REDIS_STRING,s);
2805 addReply(c,o);
2806 decrRefCount(o);
2807 }
2808
2809 static void addReplyDouble(redisClient *c, double d) {
2810 char buf[128];
2811
2812 snprintf(buf,sizeof(buf),"%.17g",d);
2813 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2814 (unsigned long) strlen(buf),buf));
2815 }
2816
2817 static void addReplyLongLong(redisClient *c, long long ll) {
2818 char buf[128];
2819 size_t len;
2820
2821 if (ll == 0) {
2822 addReply(c,shared.czero);
2823 return;
2824 } else if (ll == 1) {
2825 addReply(c,shared.cone);
2826 return;
2827 }
2828 buf[0] = ':';
2829 len = ll2string(buf+1,sizeof(buf)-1,ll);
2830 buf[len+1] = '\r';
2831 buf[len+2] = '\n';
2832 addReplySds(c,sdsnewlen(buf,len+3));
2833 }
2834
2835 static void addReplyUlong(redisClient *c, unsigned long ul) {
2836 char buf[128];
2837 size_t len;
2838
2839 if (ul == 0) {
2840 addReply(c,shared.czero);
2841 return;
2842 } else if (ul == 1) {
2843 addReply(c,shared.cone);
2844 return;
2845 }
2846 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2847 addReplySds(c,sdsnewlen(buf,len));
2848 }
2849
2850 static void addReplyBulkLen(redisClient *c, robj *obj) {
2851 size_t len, intlen;
2852 char buf[128];
2853
2854 if (obj->encoding == REDIS_ENCODING_RAW) {
2855 len = sdslen(obj->ptr);
2856 } else {
2857 long n = (long)obj->ptr;
2858
2859 /* Compute how many bytes will take this integer as a radix 10 string */
2860 len = 1;
2861 if (n < 0) {
2862 len++;
2863 n = -n;
2864 }
2865 while((n = n/10) != 0) {
2866 len++;
2867 }
2868 }
2869 buf[0] = '$';
2870 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2871 buf[intlen+1] = '\r';
2872 buf[intlen+2] = '\n';
2873 addReplySds(c,sdsnewlen(buf,intlen+3));
2874 }
2875
2876 static void addReplyBulk(redisClient *c, robj *obj) {
2877 addReplyBulkLen(c,obj);
2878 addReply(c,obj);
2879 addReply(c,shared.crlf);
2880 }
2881
2882 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2883 static void addReplyBulkCString(redisClient *c, char *s) {
2884 if (s == NULL) {
2885 addReply(c,shared.nullbulk);
2886 } else {
2887 robj *o = createStringObject(s,strlen(s));
2888 addReplyBulk(c,o);
2889 decrRefCount(o);
2890 }
2891 }
2892
2893 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2894 int cport, cfd;
2895 char cip[128];
2896 redisClient *c;
2897 REDIS_NOTUSED(el);
2898 REDIS_NOTUSED(mask);
2899 REDIS_NOTUSED(privdata);
2900
2901 cfd = anetAccept(server.neterr, fd, cip, &cport);
2902 if (cfd == AE_ERR) {
2903 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2904 return;
2905 }
2906 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2907 if ((c = createClient(cfd)) == NULL) {
2908 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2909 close(cfd); /* May be already closed, just ingore errors */
2910 return;
2911 }
2912 /* If maxclient directive is set and this is one client more... close the
2913 * connection. Note that we create the client instead to check before
2914 * for this condition, since now the socket is already set in nonblocking
2915 * mode and we can send an error for free using the Kernel I/O */
2916 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2917 char *err = "-ERR max number of clients reached\r\n";
2918
2919 /* That's a best effort error message, don't check write errors */
2920 if (write(c->fd,err,strlen(err)) == -1) {
2921 /* Nothing to do, Just to avoid the warning... */
2922 }
2923 freeClient(c);
2924 return;
2925 }
2926 server.stat_numconnections++;
2927 }
2928
2929 /* ======================= Redis objects implementation ===================== */
2930
2931 static robj *createObject(int type, void *ptr) {
2932 robj *o;
2933
2934 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2935 if (listLength(server.objfreelist)) {
2936 listNode *head = listFirst(server.objfreelist);
2937 o = listNodeValue(head);
2938 listDelNode(server.objfreelist,head);
2939 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2940 } else {
2941 if (server.vm_enabled) {
2942 pthread_mutex_unlock(&server.obj_freelist_mutex);
2943 o = zmalloc(sizeof(*o));
2944 } else {
2945 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2946 }
2947 }
2948 o->type = type;
2949 o->encoding = REDIS_ENCODING_RAW;
2950 o->ptr = ptr;
2951 o->refcount = 1;
2952 if (server.vm_enabled) {
2953 /* Note that this code may run in the context of an I/O thread
2954 * and accessing to server.unixtime in theory is an error
2955 * (no locks). But in practice this is safe, and even if we read
2956 * garbage Redis will not fail, as it's just a statistical info */
2957 o->vm.atime = server.unixtime;
2958 o->storage = REDIS_VM_MEMORY;
2959 }
2960 return o;
2961 }
2962
2963 static robj *createStringObject(char *ptr, size_t len) {
2964 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2965 }
2966
2967 static robj *createStringObjectFromLongLong(long long value) {
2968 robj *o;
2969 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2970 incrRefCount(shared.integers[value]);
2971 o = shared.integers[value];
2972 } else {
2973 if (value >= LONG_MIN && value <= LONG_MAX) {
2974 o = createObject(REDIS_STRING, NULL);
2975 o->encoding = REDIS_ENCODING_INT;
2976 o->ptr = (void*)((long)value);
2977 } else {
2978 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2979 }
2980 }
2981 return o;
2982 }
2983
2984 static robj *dupStringObject(robj *o) {
2985 assert(o->encoding == REDIS_ENCODING_RAW);
2986 return createStringObject(o->ptr,sdslen(o->ptr));
2987 }
2988
2989 static robj *createListObject(void) {
2990 list *l = listCreate();
2991
2992 listSetFreeMethod(l,decrRefCount);
2993 return createObject(REDIS_LIST,l);
2994 }
2995
2996 static robj *createSetObject(void) {
2997 dict *d = dictCreate(&setDictType,NULL);
2998 return createObject(REDIS_SET,d);
2999 }
3000
3001 static robj *createHashObject(void) {
3002 /* All the Hashes start as zipmaps. Will be automatically converted
3003 * into hash tables if there are enough elements or big elements
3004 * inside. */
3005 unsigned char *zm = zipmapNew();
3006 robj *o = createObject(REDIS_HASH,zm);
3007 o->encoding = REDIS_ENCODING_ZIPMAP;
3008 return o;
3009 }
3010
3011 static robj *createZsetObject(void) {
3012 zset *zs = zmalloc(sizeof(*zs));
3013
3014 zs->dict = dictCreate(&zsetDictType,NULL);
3015 zs->zsl = zslCreate();
3016 return createObject(REDIS_ZSET,zs);
3017 }
3018
3019 static void freeStringObject(robj *o) {
3020 if (o->encoding == REDIS_ENCODING_RAW) {
3021 sdsfree(o->ptr);
3022 }
3023 }
3024
3025 static void freeListObject(robj *o) {
3026 listRelease((list*) o->ptr);
3027 }
3028
3029 static void freeSetObject(robj *o) {
3030 dictRelease((dict*) o->ptr);
3031 }
3032
3033 static void freeZsetObject(robj *o) {
3034 zset *zs = o->ptr;
3035
3036 dictRelease(zs->dict);
3037 zslFree(zs->zsl);
3038 zfree(zs);
3039 }
3040
3041 static void freeHashObject(robj *o) {
3042 switch (o->encoding) {
3043 case REDIS_ENCODING_HT:
3044 dictRelease((dict*) o->ptr);
3045 break;
3046 case REDIS_ENCODING_ZIPMAP:
3047 zfree(o->ptr);
3048 break;
3049 default:
3050 redisPanic("Unknown hash encoding type");
3051 break;
3052 }
3053 }
3054
3055 static void incrRefCount(robj *o) {
3056 o->refcount++;
3057 }
3058
3059 static void decrRefCount(void *obj) {
3060 robj *o = obj;
3061
3062 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3063 /* Object is a key of a swapped out value, or in the process of being
3064 * loaded. */
3065 if (server.vm_enabled &&
3066 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3067 {
3068 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3069 redisAssert(o->type == REDIS_STRING);
3070 freeStringObject(o);
3071 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3072 pthread_mutex_lock(&server.obj_freelist_mutex);
3073 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3074 !listAddNodeHead(server.objfreelist,o))
3075 zfree(o);
3076 pthread_mutex_unlock(&server.obj_freelist_mutex);
3077 server.vm_stats_swapped_objects--;
3078 return;
3079 }
3080 /* Object is in memory, or in the process of being swapped out. */
3081 if (--(o->refcount) == 0) {
3082 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3083 vmCancelThreadedIOJob(obj);
3084 switch(o->type) {
3085 case REDIS_STRING: freeStringObject(o); break;
3086 case REDIS_LIST: freeListObject(o); break;
3087 case REDIS_SET: freeSetObject(o); break;
3088 case REDIS_ZSET: freeZsetObject(o); break;
3089 case REDIS_HASH: freeHashObject(o); break;
3090 default: redisPanic("Unknown object type"); break;
3091 }
3092 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3093 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3094 !listAddNodeHead(server.objfreelist,o))
3095 zfree(o);
3096 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3097 }
3098 }
3099
3100 static robj *lookupKey(redisDb *db, robj *key) {
3101 dictEntry *de = dictFind(db->dict,key);
3102 if (de) {
3103 robj *key = dictGetEntryKey(de);
3104 robj *val = dictGetEntryVal(de);
3105
3106 if (server.vm_enabled) {
3107 if (key->storage == REDIS_VM_MEMORY ||
3108 key->storage == REDIS_VM_SWAPPING)
3109 {
3110 /* If we were swapping the object out, stop it, this key
3111 * was requested. */
3112 if (key->storage == REDIS_VM_SWAPPING)
3113 vmCancelThreadedIOJob(key);
3114 /* Update the access time of the key for the aging algorithm. */
3115 key->vm.atime = server.unixtime;
3116 } else {
3117 int notify = (key->storage == REDIS_VM_LOADING);
3118
3119 /* Our value was swapped on disk. Bring it at home. */
3120 redisAssert(val == NULL);
3121 val = vmLoadObject(key);
3122 dictGetEntryVal(de) = val;
3123
3124 /* Clients blocked by the VM subsystem may be waiting for
3125 * this key... */
3126 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3127 }
3128 }
3129 return val;
3130 } else {
3131 return NULL;
3132 }
3133 }
3134
3135 static robj *lookupKeyRead(redisDb *db, robj *key) {
3136 expireIfNeeded(db,key);
3137 return lookupKey(db,key);
3138 }
3139
3140 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3141 deleteIfVolatile(db,key);
3142 touchWatchedKey(db,key);
3143 return lookupKey(db,key);
3144 }
3145
3146 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3147 robj *o = lookupKeyRead(c->db, key);
3148 if (!o) addReply(c,reply);
3149 return o;
3150 }
3151
3152 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3153 robj *o = lookupKeyWrite(c->db, key);
3154 if (!o) addReply(c,reply);
3155 return o;
3156 }
3157
3158 static int checkType(redisClient *c, robj *o, int type) {
3159 if (o->type != type) {
3160 addReply(c,shared.wrongtypeerr);
3161 return 1;
3162 }
3163 return 0;
3164 }
3165
3166 static int deleteKey(redisDb *db, robj *key) {
3167 int retval;
3168
3169 /* We need to protect key from destruction: after the first dictDelete()
3170 * it may happen that 'key' is no longer valid if we don't increment
3171 * it's count. This may happen when we get the object reference directly
3172 * from the hash table with dictRandomKey() or dict iterators */
3173 incrRefCount(key);
3174 if (dictSize(db->expires)) dictDelete(db->expires,key);
3175 retval = dictDelete(db->dict,key);
3176 decrRefCount(key);
3177
3178 return retval == DICT_OK;
3179 }
3180
3181 /* Check if the nul-terminated string 's' can be represented by a long
3182 * (that is, is a number that fits into long without any other space or
3183 * character before or after the digits).
3184 *
3185 * If so, the function returns REDIS_OK and *longval is set to the value
3186 * of the number. Otherwise REDIS_ERR is returned */
3187 static int isStringRepresentableAsLong(sds s, long *longval) {
3188 char buf[32], *endptr;
3189 long value;
3190 int slen;
3191
3192 value = strtol(s, &endptr, 10);
3193 if (endptr[0] != '\0') return REDIS_ERR;
3194 slen = ll2string(buf,32,value);
3195
3196 /* If the number converted back into a string is not identical
3197 * then it's not possible to encode the string as integer */
3198 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3199 if (longval) *longval = value;
3200 return REDIS_OK;
3201 }
3202
3203 /* Try to encode a string object in order to save space */
3204 static robj *tryObjectEncoding(robj *o) {
3205 long value;
3206 sds s = o->ptr;
3207
3208 if (o->encoding != REDIS_ENCODING_RAW)
3209 return o; /* Already encoded */
3210
3211 /* It's not safe to encode shared objects: shared objects can be shared
3212 * everywhere in the "object space" of Redis. Encoded objects can only
3213 * appear as "values" (and not, for instance, as keys) */
3214 if (o->refcount > 1) return o;
3215
3216 /* Currently we try to encode only strings */
3217 redisAssert(o->type == REDIS_STRING);
3218
3219 /* Check if we can represent this string as a long integer */
3220 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3221
3222 /* Ok, this object can be encoded */
3223 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3224 decrRefCount(o);
3225 incrRefCount(shared.integers[value]);
3226 return shared.integers[value];
3227 } else {
3228 o->encoding = REDIS_ENCODING_INT;
3229 sdsfree(o->ptr);
3230 o->ptr = (void*) value;
3231 return o;
3232 }
3233 }
3234
3235 /* Get a decoded version of an encoded object (returned as a new object).
3236 * If the object is already raw-encoded just increment the ref count. */
3237 static robj *getDecodedObject(robj *o) {
3238 robj *dec;
3239
3240 if (o->encoding == REDIS_ENCODING_RAW) {
3241 incrRefCount(o);
3242 return o;
3243 }
3244 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3245 char buf[32];
3246
3247 ll2string(buf,32,(long)o->ptr);
3248 dec = createStringObject(buf,strlen(buf));
3249 return dec;
3250 } else {
3251 redisPanic("Unknown encoding type");
3252 }
3253 }
3254
3255 /* Compare two string objects via strcmp() or alike.
3256 * Note that the objects may be integer-encoded. In such a case we
3257 * use ll2string() to get a string representation of the numbers on the stack
3258 * and compare the strings, it's much faster than calling getDecodedObject().
3259 *
3260 * Important note: if objects are not integer encoded, but binary-safe strings,
3261 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3262 * binary safe. */
3263 static int compareStringObjects(robj *a, robj *b) {
3264 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3265 char bufa[128], bufb[128], *astr, *bstr;
3266 int bothsds = 1;
3267
3268 if (a == b) return 0;
3269 if (a->encoding != REDIS_ENCODING_RAW) {
3270 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3271 astr = bufa;
3272 bothsds = 0;
3273 } else {
3274 astr = a->ptr;
3275 }
3276 if (b->encoding != REDIS_ENCODING_RAW) {
3277 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3278 bstr = bufb;
3279 bothsds = 0;
3280 } else {
3281 bstr = b->ptr;
3282 }
3283 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3284 }
3285
3286 /* Equal string objects return 1 if the two objects are the same from the
3287 * point of view of a string comparison, otherwise 0 is returned. Note that
3288 * this function is faster then checking for (compareStringObject(a,b) == 0)
3289 * because it can perform some more optimization. */
3290 static int equalStringObjects(robj *a, robj *b) {
3291 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3292 return a->ptr == b->ptr;
3293 } else {
3294 return compareStringObjects(a,b) == 0;
3295 }
3296 }
3297
3298 static size_t stringObjectLen(robj *o) {
3299 redisAssert(o->type == REDIS_STRING);
3300 if (o->encoding == REDIS_ENCODING_RAW) {
3301 return sdslen(o->ptr);
3302 } else {
3303 char buf[32];
3304
3305 return ll2string(buf,32,(long)o->ptr);
3306 }
3307 }
3308
3309 static int getDoubleFromObject(robj *o, double *target) {
3310 double value;
3311 char *eptr;
3312
3313 if (o == NULL) {
3314 value = 0;
3315 } else {
3316 redisAssert(o->type == REDIS_STRING);
3317 if (o->encoding == REDIS_ENCODING_RAW) {
3318 value = strtod(o->ptr, &eptr);
3319 if (eptr[0] != '\0') return REDIS_ERR;
3320 } else if (o->encoding == REDIS_ENCODING_INT) {
3321 value = (long)o->ptr;
3322 } else {
3323 redisPanic("Unknown string encoding");
3324 }
3325 }
3326
3327 *target = value;
3328 return REDIS_OK;
3329 }
3330
3331 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3332 double value;
3333 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3334 if (msg != NULL) {
3335 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3336 } else {
3337 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3338 }
3339 return REDIS_ERR;
3340 }
3341
3342 *target = value;
3343 return REDIS_OK;
3344 }
3345
3346 static int getLongLongFromObject(robj *o, long long *target) {
3347 long long value;
3348 char *eptr;
3349
3350 if (o == NULL) {
3351 value = 0;
3352 } else {
3353 redisAssert(o->type == REDIS_STRING);
3354 if (o->encoding == REDIS_ENCODING_RAW) {
3355 value = strtoll(o->ptr, &eptr, 10);
3356 if (eptr[0] != '\0') return REDIS_ERR;
3357 } else if (o->encoding == REDIS_ENCODING_INT) {
3358 value = (long)o->ptr;
3359 } else {
3360 redisPanic("Unknown string encoding");
3361 }
3362 }
3363
3364 *target = value;
3365 return REDIS_OK;
3366 }
3367
3368 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3369 long long value;
3370 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3371 if (msg != NULL) {
3372 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3373 } else {
3374 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3375 }
3376 return REDIS_ERR;
3377 }
3378
3379 *target = value;
3380 return REDIS_OK;
3381 }
3382
3383 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3384 long long value;
3385
3386 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3387 if (value < LONG_MIN || value > LONG_MAX) {
3388 if (msg != NULL) {
3389 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3390 } else {
3391 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3392 }
3393 return REDIS_ERR;
3394 }
3395
3396 *target = value;
3397 return REDIS_OK;
3398 }
3399
3400 /*============================ RDB saving/loading =========================== */
3401
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * write failed. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3406
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3412
3413 /* check rdbLoadLen() comments for more info */
3414 static int rdbSaveLen(FILE *fp, uint32_t len) {
3415 unsigned char buf[2];
3416
3417 if (len < (1<<6)) {
3418 /* Save a 6 bit len */
3419 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3420 if (fwrite(buf,1,1,fp) == 0) return -1;
3421 } else if (len < (1<<14)) {
3422 /* Save a 14 bit len */
3423 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3424 buf[1] = len&0xFF;
3425 if (fwrite(buf,2,1,fp) == 0) return -1;
3426 } else {
3427 /* Save a 32 bit len */
3428 buf[0] = (REDIS_RDB_32BITLEN<<6);
3429 if (fwrite(buf,1,1,fp) == 0) return -1;
3430 len = htonl(len);
3431 if (fwrite(&len,4,1,fp) == 0) return -1;
3432 }
3433 return 0;
3434 }
3435
3436 /* Encode 'value' as an integer if possible (if integer will fit the
3437 * supported range). If the function sucessful encoded the integer
3438 * then the (up to 5 bytes) encoded representation is written in the
3439 * string pointed by 'enc' and the length is returned. Otherwise
3440 * 0 is returned. */
3441 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3442 /* Finally check if it fits in our ranges */
3443 if (value >= -(1<<7) && value <= (1<<7)-1) {
3444 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3445 enc[1] = value&0xFF;
3446 return 2;
3447 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3448 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3449 enc[1] = value&0xFF;
3450 enc[2] = (value>>8)&0xFF;
3451 return 3;
3452 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3453 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3454 enc[1] = value&0xFF;
3455 enc[2] = (value>>8)&0xFF;
3456 enc[3] = (value>>16)&0xFF;
3457 enc[4] = (value>>24)&0xFF;
3458 return 5;
3459 } else {
3460 return 0;
3461 }
3462 }
3463
/* Try to encode the string 's' of length 'len' as an integer.
 *
 * Strings like "2391" or "-100", without spaces, whose value fits an 8,
 * 16 or 32 bit signed integer can be saved as integers to spare space.
 * The string must be identical to the canonical representation of the
 * parsed number. Returns the encoded length written in 'enc', or 0 when
 * the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char canonical[32], *end;
    long long parsed;

    /* Check if it's possible to parse the whole string as a number. */
    parsed = strtoll(s, &end, 10);
    if (end[0] != '\0') return 0;

    /* The number converted back into a string must match the input
     * exactly, both in length and in content. */
    ll2string(canonical,32,parsed);
    if (strlen(canonical) != len) return 0;
    if (memcmp(canonical,s,len) != 0) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3482
3483 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3484 size_t comprlen, outlen;
3485 unsigned char byte;
3486 void *out;
3487
3488 /* We require at least four bytes compression for this to be worth it */
3489 if (len <= 4) return 0;
3490 outlen = len-4;
3491 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3492 comprlen = lzf_compress(s, len, out, outlen);
3493 if (comprlen == 0) {
3494 zfree(out);
3495 return 0;
3496 }
3497 /* Data compressed! Let's save it on disk */
3498 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3499 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3500 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3501 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3502 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3503 zfree(out);
3504 return comprlen;
3505
3506 writeerr:
3507 zfree(out);
3508 return -1;
3509 }
3510
3511 /* Save a string objet as [len][data] on disk. If the object is a string
3512 * representation of an integer value we try to safe it in a special form */
3513 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3514 int enclen;
3515
3516 /* Try integer encoding */
3517 if (len <= 11) {
3518 unsigned char buf[5];
3519 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3520 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3521 return 0;
3522 }
3523 }
3524
3525 /* Try LZF compression - under 20 bytes it's unable to compress even
3526 * aaaaaaaaaaaaaaaaaa so skip it */
3527 if (server.rdbcompression && len > 20) {
3528 int retval;
3529
3530 retval = rdbSaveLzfStringObject(fp,s,len);
3531 if (retval == -1) return -1;
3532 if (retval > 0) return 0;
3533 /* retval == 0 means data can't be compressed, save the old way */
3534 }
3535
3536 /* Store verbatim */
3537 if (rdbSaveLen(fp,len) == -1) return -1;
3538 if (len && fwrite(s,len,1,fp) == 0) return -1;
3539 return 0;
3540 }
3541
3542 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3543 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3544 int retval;
3545
3546 /* Avoid to decode the object, then encode it again, if the
3547 * object is alrady integer encoded. */
3548 if (obj->encoding == REDIS_ENCODING_INT) {
3549 long val = (long) obj->ptr;
3550 unsigned char buf[5];
3551 int enclen;
3552
3553 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3554 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3555 return 0;
3556 }
3557 /* otherwise... fall throught and continue with the usual
3558 * code path. */
3559 }
3560
3561 /* Avoid incr/decr ref count business when possible.
3562 * This plays well with copy-on-write given that we are probably
3563 * in a child process (BGSAVE). Also this makes sure key objects
3564 * of swapped objects are not incRefCount-ed (an assert does not allow
3565 * this in order to avoid bugs) */
3566 if (obj->encoding != REDIS_ENCODING_RAW) {
3567 obj = getDecodedObject(obj);
3568 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3569 decrRefCount(obj);
3570 } else {
3571 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3572 }
3573 return retval;
3574 }
3575
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3618
3619 /* Save a Redis object. */
3620 static int rdbSaveObject(FILE *fp, robj *o) {
3621 if (o->type == REDIS_STRING) {
3622 /* Save a string value */
3623 if (rdbSaveStringObject(fp,o) == -1) return -1;
3624 } else if (o->type == REDIS_LIST) {
3625 /* Save a list value */
3626 list *list = o->ptr;
3627 listIter li;
3628 listNode *ln;
3629
3630 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3631 listRewind(list,&li);
3632 while((ln = listNext(&li))) {
3633 robj *eleobj = listNodeValue(ln);
3634
3635 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3636 }
3637 } else if (o->type == REDIS_SET) {
3638 /* Save a set value */
3639 dict *set = o->ptr;
3640 dictIterator *di = dictGetIterator(set);
3641 dictEntry *de;
3642
3643 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3644 while((de = dictNext(di)) != NULL) {
3645 robj *eleobj = dictGetEntryKey(de);
3646
3647 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3648 }
3649 dictReleaseIterator(di);
3650 } else if (o->type == REDIS_ZSET) {
3651 /* Save a set value */
3652 zset *zs = o->ptr;
3653 dictIterator *di = dictGetIterator(zs->dict);
3654 dictEntry *de;
3655
3656 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3657 while((de = dictNext(di)) != NULL) {
3658 robj *eleobj = dictGetEntryKey(de);
3659 double *score = dictGetEntryVal(de);
3660
3661 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3662 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3663 }
3664 dictReleaseIterator(di);
3665 } else if (o->type == REDIS_HASH) {
3666 /* Save a hash value */
3667 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3668 unsigned char *p = zipmapRewind(o->ptr);
3669 unsigned int count = zipmapLen(o->ptr);
3670 unsigned char *key, *val;
3671 unsigned int klen, vlen;
3672
3673 if (rdbSaveLen(fp,count) == -1) return -1;
3674 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3675 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3676 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3677 }
3678 } else {
3679 dictIterator *di = dictGetIterator(o->ptr);
3680 dictEntry *de;
3681
3682 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3683 while((de = dictNext(di)) != NULL) {
3684 robj *key = dictGetEntryKey(de);
3685 robj *val = dictGetEntryVal(de);
3686
3687 if (rdbSaveStringObject(fp,key) == -1) return -1;
3688 if (rdbSaveStringObject(fp,val) == -1) return -1;
3689 }
3690 dictReleaseIterator(di);
3691 }
3692 } else {
3693 redisPanic("Unknown object type");
3694 }
3695 return 0;
3696 }
3697
3698 /* Return the length the object will have on disk if saved with
3699 * the rdbSaveObject() function. Currently we use a trick to get
3700 * this length with very little changes to the code. In the future
3701 * we could switch to a faster solution. */
3702 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3703 if (fp == NULL) fp = server.devnull;
3704 rewind(fp);
3705 assert(rdbSaveObject(fp,o) != 1);
3706 return ftello(fp);
3707 }
3708
3709 /* Return the number of pages required to save this object in the swap file */
3710 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3711 off_t bytes = rdbSavedObjectLen(o,fp);
3712
3713 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3714 }
3715
3716 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3717 static int rdbSave(char *filename) {
3718 dictIterator *di = NULL;
3719 dictEntry *de;
3720 FILE *fp;
3721 char tmpfile[256];
3722 int j;
3723 time_t now = time(NULL);
3724
3725 /* Wait for I/O therads to terminate, just in case this is a
3726 * foreground-saving, to avoid seeking the swap file descriptor at the
3727 * same time. */
3728 if (server.vm_enabled)
3729 waitEmptyIOJobsQueue();
3730
3731 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3732 fp = fopen(tmpfile,"w");
3733 if (!fp) {
3734 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3735 return REDIS_ERR;
3736 }
3737 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3738 for (j = 0; j < server.dbnum; j++) {
3739 redisDb *db = server.db+j;
3740 dict *d = db->dict;
3741 if (dictSize(d) == 0) continue;
3742 di = dictGetIterator(d);
3743 if (!di) {
3744 fclose(fp);
3745 return REDIS_ERR;
3746 }
3747
3748 /* Write the SELECT DB opcode */
3749 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3750 if (rdbSaveLen(fp,j) == -1) goto werr;
3751
3752 /* Iterate this DB writing every entry */
3753 while((de = dictNext(di)) != NULL) {
3754 robj *key = dictGetEntryKey(de);
3755 robj *o = dictGetEntryVal(de);
3756 time_t expiretime = getExpire(db,key);
3757
3758 /* Save the expire time */
3759 if (expiretime != -1) {
3760 /* If this key is already expired skip it */
3761 if (expiretime < now) continue;
3762 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3763 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3764 }
3765 /* Save the key and associated value. This requires special
3766 * handling if the value is swapped out. */
3767 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3768 key->storage == REDIS_VM_SWAPPING) {
3769 /* Save type, key, value */
3770 if (rdbSaveType(fp,o->type) == -1) goto werr;
3771 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3772 if (rdbSaveObject(fp,o) == -1) goto werr;
3773 } else {
3774 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3775 robj *po;
3776 /* Get a preview of the object in memory */
3777 po = vmPreviewObject(key);
3778 /* Save type, key, value */
3779 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3780 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3781 if (rdbSaveObject(fp,po) == -1) goto werr;
3782 /* Remove the loaded object from memory */
3783 decrRefCount(po);
3784 }
3785 }
3786 dictReleaseIterator(di);
3787 }
3788 /* EOF opcode */
3789 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3790
3791 /* Make sure data will not remain on the OS's output buffers */
3792 fflush(fp);
3793 fsync(fileno(fp));
3794 fclose(fp);
3795
3796 /* Use RENAME to make sure the DB file is changed atomically only
3797 * if the generate DB file is ok. */
3798 if (rename(tmpfile,filename) == -1) {
3799 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3800 unlink(tmpfile);
3801 return REDIS_ERR;
3802 }
3803 redisLog(REDIS_NOTICE,"DB saved on disk");
3804 server.dirty = 0;
3805 server.lastsave = time(NULL);
3806 return REDIS_OK;
3807
3808 werr:
3809 fclose(fp);
3810 unlink(tmpfile);
3811 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3812 if (di) dictReleaseIterator(di);
3813 return REDIS_ERR;
3814 }
3815
3816 static int rdbSaveBackground(char *filename) {
3817 pid_t childpid;
3818
3819 if (server.bgsavechildpid != -1) return REDIS_ERR;
3820 if (server.vm_enabled) waitEmptyIOJobsQueue();
3821 if ((childpid = fork()) == 0) {
3822 /* Child */
3823 if (server.vm_enabled) vmReopenSwapFile();
3824 close(server.fd);
3825 if (rdbSave(filename) == REDIS_OK) {
3826 _exit(0);
3827 } else {
3828 _exit(1);
3829 }
3830 } else {
3831 /* Parent */
3832 if (childpid == -1) {
3833 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3834 strerror(errno));
3835 return REDIS_ERR;
3836 }
3837 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3838 server.bgsavechildpid = childpid;
3839 updateDictResizePolicy();
3840 return REDIS_OK;
3841 }
3842 return REDIS_OK; /* unreached */
3843 }
3844
/* Remove the temporary RDB file "temp-<pid>.rdb" that belongs to the
 * process 'childpid'. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3851
/* Read a one byte type/opcode from 'fp'. Returns the byte value, or -1
 * when nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3857
/* Read a 4 byte time value from 'fp', as stored in memory by the host
 * (no byte order conversion is applied). Returns -1 when the four bytes
 * could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,4,1,fp) != 1) return -1;
    return (time_t) raw;
}
3863
3864 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3865 * of this file for a description of how this are stored on disk.
3866 *
3867 * isencoded is set to 1 if the readed length is not actually a length but
3868 * an "encoding type", check the above comments for more info */
3869 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3870 unsigned char buf[2];
3871 uint32_t len;
3872 int type;
3873
3874 if (isencoded) *isencoded = 0;
3875 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3876 type = (buf[0]&0xC0)>>6;
3877 if (type == REDIS_RDB_6BITLEN) {
3878 /* Read a 6 bit len */
3879 return buf[0]&0x3F;
3880 } else if (type == REDIS_RDB_ENCVAL) {
3881 /* Read a 6 bit len encoding type */
3882 if (isencoded) *isencoded = 1;
3883 return buf[0]&0x3F;
3884 } else if (type == REDIS_RDB_14BITLEN) {
3885 /* Read a 14 bit len */
3886 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3887 return ((buf[0]&0x3F)<<8)|buf[1];
3888 } else {
3889 /* Read a 32 bit len */
3890 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3891 return ntohl(len);
3892 }
3893 }
3894
3895 /* Load an integer-encoded object from file 'fp', with the specified
3896 * encoding type 'enctype'. If encode is true the function may return
3897 * an integer-encoded object as reply, otherwise the returned object
3898 * will always be encoded as a raw string. */
3899 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3900 unsigned char enc[4];
3901 long long val;
3902
3903 if (enctype == REDIS_RDB_ENC_INT8) {
3904 if (fread(enc,1,1,fp) == 0) return NULL;
3905 val = (signed char)enc[0];
3906 } else if (enctype == REDIS_RDB_ENC_INT16) {
3907 uint16_t v;
3908 if (fread(enc,2,1,fp) == 0) return NULL;
3909 v = enc[0]|(enc[1]<<8);
3910 val = (int16_t)v;
3911 } else if (enctype == REDIS_RDB_ENC_INT32) {
3912 uint32_t v;
3913 if (fread(enc,4,1,fp) == 0) return NULL;
3914 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3915 val = (int32_t)v;
3916 } else {
3917 val = 0; /* anti-warning */
3918 redisPanic("Unknown RDB integer encoding type");
3919 }
3920 if (encode)
3921 return createStringObjectFromLongLong(val);
3922 else
3923 return createObject(REDIS_STRING,sdsfromlonglong(val));
3924 }
3925
3926 static robj *rdbLoadLzfStringObject(FILE*fp) {
3927 unsigned int len, clen;
3928 unsigned char *c = NULL;
3929 sds val = NULL;
3930
3931 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3932 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3933 if ((c = zmalloc(clen)) == NULL) goto err;
3934 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3935 if (fread(c,clen,1,fp) == 0) goto err;
3936 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3937 zfree(c);
3938 return createObject(REDIS_STRING,val);
3939 err:
3940 zfree(c);
3941 sdsfree(val);
3942 return NULL;
3943 }
3944
3945 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3946 int isencoded;
3947 uint32_t len;
3948 sds val;
3949
3950 len = rdbLoadLen(fp,&isencoded);
3951 if (isencoded) {
3952 switch(len) {
3953 case REDIS_RDB_ENC_INT8:
3954 case REDIS_RDB_ENC_INT16:
3955 case REDIS_RDB_ENC_INT32:
3956 return rdbLoadIntegerObject(fp,len,encode);
3957 case REDIS_RDB_ENC_LZF:
3958 return rdbLoadLzfStringObject(fp);
3959 default:
3960 redisPanic("Unknown RDB encoding type");
3961 }
3962 }
3963
3964 if (len == REDIS_RDB_LENERR) return NULL;
3965 val = sdsnewlen(NULL,len);
3966 if (len && fread(val,len,1,fp) == 0) {
3967 sdsfree(val);
3968 return NULL;
3969 }
3970 return createObject(REDIS_STRING,val);
3971 }
3972
3973 static robj *rdbLoadStringObject(FILE *fp) {
3974 return rdbGenericLoadStringObject(fp,0);
3975 }
3976
3977 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3978 return rdbGenericLoadStringObject(fp,1);
3979 }
3980
3981 /* For information about double serialization check rdbSaveDoubleValue() */
3982 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3983 char buf[128];
3984 unsigned char len;
3985
3986 if (fread(&len,1,1,fp) == 0) return -1;
3987 switch(len) {
3988 case 255: *val = R_NegInf; return 0;
3989 case 254: *val = R_PosInf; return 0;
3990 case 253: *val = R_Nan; return 0;
3991 default:
3992 if (fread(buf,len,1,fp) == 0) return -1;
3993 buf[len] = '\0';
3994 sscanf(buf, "%lg", val);
3995 return 0;
3996 }
3997 }
3998
3999 /* Load a Redis object of the specified type from the specified file.
4000 * On success a newly allocated object is returned, otherwise NULL. */
4001 static robj *rdbLoadObject(int type, FILE *fp) {
4002 robj *o;
4003
4004 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4005 if (type == REDIS_STRING) {
4006 /* Read string value */
4007 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4008 o = tryObjectEncoding(o);
4009 } else if (type == REDIS_LIST || type == REDIS_SET) {
4010 /* Read list/set value */
4011 uint32_t listlen;
4012
4013 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4014 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
4015 /* It's faster to expand the dict to the right size asap in order
4016 * to avoid rehashing */
4017 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4018 dictExpand(o->ptr,listlen);
4019 /* Load every single element of the list/set */
4020 while(listlen--) {
4021 robj *ele;
4022
4023 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4024 ele = tryObjectEncoding(ele);
4025 if (type == REDIS_LIST) {
4026 listAddNodeTail((list*)o->ptr,ele);
4027 } else {
4028 dictAdd((dict*)o->ptr,ele,NULL);
4029 }
4030 }
4031 } else if (type == REDIS_ZSET) {
4032 /* Read list/set value */
4033 size_t zsetlen;
4034 zset *zs;
4035
4036 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4037 o = createZsetObject();
4038 zs = o->ptr;
4039 /* Load every single element of the list/set */
4040 while(zsetlen--) {
4041 robj *ele;
4042 double *score = zmalloc(sizeof(double));
4043
4044 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4045 ele = tryObjectEncoding(ele);
4046 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4047 dictAdd(zs->dict,ele,score);
4048 zslInsert(zs->zsl,*score,ele);
4049 incrRefCount(ele); /* added to skiplist */
4050 }
4051 } else if (type == REDIS_HASH) {
4052 size_t hashlen;
4053
4054 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4055 o = createHashObject();
4056 /* Too many entries? Use an hash table. */
4057 if (hashlen > server.hash_max_zipmap_entries)
4058 convertToRealHash(o);
4059 /* Load every key/value, then set it into the zipmap or hash
4060 * table, as needed. */
4061 while(hashlen--) {
4062 robj *key, *val;
4063
4064 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4065 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4066 /* If we are using a zipmap and there are too big values
4067 * the object is converted to real hash table encoding. */
4068 if (o->encoding != REDIS_ENCODING_HT &&
4069 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4070 sdslen(val->ptr) > server.hash_max_zipmap_value))
4071 {
4072 convertToRealHash(o);
4073 }
4074
4075 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4076 unsigned char *zm = o->ptr;
4077
4078 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4079 val->ptr,sdslen(val->ptr),NULL);
4080 o->ptr = zm;
4081 decrRefCount(key);
4082 decrRefCount(val);
4083 } else {
4084 key = tryObjectEncoding(key);
4085 val = tryObjectEncoding(val);
4086 dictAdd((dict*)o->ptr,key,val);
4087 }
4088 }
4089 } else {
4090 redisPanic("Unknown object type");
4091 }
4092 return o;
4093 }
4094
4095 static int rdbLoad(char *filename) {
4096 FILE *fp;
4097 uint32_t dbid;
4098 int type, retval, rdbver;
4099 int swap_all_values = 0;
4100 dict *d = server.db[0].dict;
4101 redisDb *db = server.db+0;
4102 char buf[1024];
4103 time_t expiretime, now = time(NULL);
4104 long long loadedkeys = 0;
4105
4106 fp = fopen(filename,"r");
4107 if (!fp) return REDIS_ERR;
4108 if (fread(buf,9,1,fp) == 0) goto eoferr;
4109 buf[9] = '\0';
4110 if (memcmp(buf,"REDIS",5) != 0) {
4111 fclose(fp);
4112 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4113 return REDIS_ERR;
4114 }
4115 rdbver = atoi(buf+5);
4116 if (rdbver != 1) {
4117 fclose(fp);
4118 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4119 return REDIS_ERR;
4120 }
4121 while(1) {
4122 robj *key, *val;
4123 int force_swapout;
4124
4125 expiretime = -1;
4126 /* Read type. */
4127 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4128 if (type == REDIS_EXPIRETIME) {
4129 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4130 /* We read the time so we need to read the object type again */
4131 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4132 }
4133 if (type == REDIS_EOF) break;
4134 /* Handle SELECT DB opcode as a special case */
4135 if (type == REDIS_SELECTDB) {
4136 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4137 goto eoferr;
4138 if (dbid >= (unsigned)server.dbnum) {
4139 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4140 exit(1);
4141 }
4142 db = server.db+dbid;
4143 d = db->dict;
4144 continue;
4145 }
4146 /* Read key */
4147 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4148 /* Read value */
4149 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4150 /* Check if the key already expired */
4151 if (expiretime != -1 && expiretime < now) {
4152 decrRefCount(key);
4153 decrRefCount(val);
4154 continue;
4155 }
4156 /* Add the new object in the hash table */
4157 retval = dictAdd(d,key,val);
4158 if (retval == DICT_ERR) {
4159 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4160 exit(1);
4161 }
4162 loadedkeys++;
4163 /* Set the expire time if needed */
4164 if (expiretime != -1) setExpire(db,key,expiretime);
4165
4166 /* Handle swapping while loading big datasets when VM is on */
4167
4168 /* If we detecter we are hopeless about fitting something in memory
4169 * we just swap every new key on disk. Directly...
4170 * Note that's important to check for this condition before resorting
4171 * to random sampling, otherwise we may try to swap already
4172 * swapped keys. */
4173 if (swap_all_values) {
4174 dictEntry *de = dictFind(d,key);
4175
4176 /* de may be NULL since the key already expired */
4177 if (de) {
4178 key = dictGetEntryKey(de);
4179 val = dictGetEntryVal(de);
4180
4181 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4182 dictGetEntryVal(de) = NULL;
4183 }
4184 }
4185 continue;
4186 }
4187
4188 force_swapout = 0;
4189 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4190 force_swapout = 1;
4191
4192 /* If we have still some hope of having some value fitting memory
4193 * then we try random sampling. */
4194 if (!swap_all_values && server.vm_enabled && force_swapout) {
4195 while (zmalloc_used_memory() > server.vm_max_memory) {
4196 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4197 }
4198 if (zmalloc_used_memory() > server.vm_max_memory)
4199 swap_all_values = 1; /* We are already using too much mem */
4200 }
4201 }
4202 fclose(fp);
4203 return REDIS_OK;
4204
4205 eoferr: /* unexpected end of file is handled here with a fatal exit */
4206 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4207 exit(1);
4208 return REDIS_ERR; /* Just to avoid warning */
4209 }
4210
4211 /*================================== Shutdown =============================== */
4212 static int prepareForShutdown() {
4213 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4214 /* Kill the saving child if there is a background saving in progress.
4215 We want to avoid race conditions, for instance our saving child may
4216 overwrite the synchronous saving did by SHUTDOWN. */
4217 if (server.bgsavechildpid != -1) {
4218 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4219 kill(server.bgsavechildpid,SIGKILL);
4220 rdbRemoveTempFile(server.bgsavechildpid);
4221 }
4222 if (server.appendonly) {
4223 /* Append only file: fsync() the AOF and exit */
4224 aof_fsync(server.appendfd);
4225 if (server.vm_enabled) unlink(server.vm_swap_file);
4226 } else {
4227 /* Snapshotting. Perform a SYNC SAVE and exit */
4228 if (rdbSave(server.dbfilename) == REDIS_OK) {
4229 if (server.daemonize)
4230 unlink(server.pidfile);
4231 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4232 } else {
4233 /* Ooops.. error saving! The best we can do is to continue
4234 * operating. Note that if there was a background saving process,
4235 * in the next cron() Redis will be notified that the background
4236 * saving aborted, handling special stuff like slaves pending for
4237 * synchronization... */
4238 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4239 return REDIS_ERR;
4240 }
4241 }
4242 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4243 return REDIS_OK;
4244 }
4245
4246 /*================================== Commands =============================== */
4247
4248 static void authCommand(redisClient *c) {
4249 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4250 c->authenticated = 1;
4251 addReply(c,shared.ok);
4252 } else {
4253 c->authenticated = 0;
4254 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4255 }
4256 }
4257
4258 static void pingCommand(redisClient *c) {
4259 addReply(c,shared.pong);
4260 }
4261
4262 static void echoCommand(redisClient *c) {
4263 addReplyBulk(c,c->argv[1]);
4264 }
4265
4266 /*=================================== Strings =============================== */
4267
4268 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4269 int retval;
4270 long seconds = 0; /* initialized to avoid an harmness warning */
4271
4272 if (expire) {
4273 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4274 return;
4275 if (seconds <= 0) {
4276 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4277 return;
4278 }
4279 }
4280
4281 touchWatchedKey(c->db,key);
4282 if (nx) deleteIfVolatile(c->db,key);
4283 retval = dictAdd(c->db->dict,key,val);
4284 if (retval == DICT_ERR) {
4285 if (!nx) {
4286 /* If the key is about a swapped value, we want a new key object
4287 * to overwrite the old. So we delete the old key in the database.
4288 * This will also make sure that swap pages about the old object
4289 * will be marked as free. */
4290 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4291 incrRefCount(key);
4292 dictReplace(c->db->dict,key,val);
4293 incrRefCount(val);
4294 } else {
4295 addReply(c,shared.czero);
4296 return;
4297 }
4298 } else {
4299 incrRefCount(key);
4300 incrRefCount(val);
4301 }
4302 server.dirty++;
4303 removeExpire(c->db,key);
4304 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4305 addReply(c, nx ? shared.cone : shared.ok);
4306 }
4307
4308 static void setCommand(redisClient *c) {
4309 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4310 }
4311
4312 static void setnxCommand(redisClient *c) {
4313 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4314 }
4315
4316 static void setexCommand(redisClient *c) {
4317 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4318 }
4319
4320 static int getGenericCommand(redisClient *c) {
4321 robj *o;
4322
4323 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4324 return REDIS_OK;
4325
4326 if (o->type != REDIS_STRING) {
4327 addReply(c,shared.wrongtypeerr);
4328 return REDIS_ERR;
4329 } else {
4330 addReplyBulk(c,o);
4331 return REDIS_OK;
4332 }
4333 }
4334
4335 static void getCommand(redisClient *c) {
4336 getGenericCommand(c);
4337 }
4338
4339 static void getsetCommand(redisClient *c) {
4340 if (getGenericCommand(c) == REDIS_ERR) return;
4341 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4342 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4343 } else {
4344 incrRefCount(c->argv[1]);
4345 }
4346 incrRefCount(c->argv[2]);
4347 server.dirty++;
4348 removeExpire(c->db,c->argv[1]);
4349 }
4350
4351 static void mgetCommand(redisClient *c) {
4352 int j;
4353
4354 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4355 for (j = 1; j < c->argc; j++) {
4356 robj *o = lookupKeyRead(c->db,c->argv[j]);
4357 if (o == NULL) {
4358 addReply(c,shared.nullbulk);
4359 } else {
4360 if (o->type != REDIS_STRING) {
4361 addReply(c,shared.nullbulk);
4362 } else {
4363 addReplyBulk(c,o);
4364 }
4365 }
4366 }
4367 }
4368
4369 static void msetGenericCommand(redisClient *c, int nx) {
4370 int j, busykeys = 0;
4371
4372 if ((c->argc % 2) == 0) {
4373 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4374 return;
4375 }
4376 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4377 * set nothing at all if at least one already key exists. */
4378 if (nx) {
4379 for (j = 1; j < c->argc; j += 2) {
4380 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4381 busykeys++;
4382 }
4383 }
4384 }
4385 if (busykeys) {
4386 addReply(c, shared.czero);
4387 return;
4388 }
4389
4390 for (j = 1; j < c->argc; j += 2) {
4391 int retval;
4392
4393 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4394 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4395 if (retval == DICT_ERR) {
4396 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4397 incrRefCount(c->argv[j+1]);
4398 } else {
4399 incrRefCount(c->argv[j]);
4400 incrRefCount(c->argv[j+1]);
4401 }
4402 removeExpire(c->db,c->argv[j]);
4403 }
4404 server.dirty += (c->argc-1)/2;
4405 addReply(c, nx ? shared.cone : shared.ok);
4406 }
4407
4408 static void msetCommand(redisClient *c) {
4409 msetGenericCommand(c,0);
4410 }
4411
4412 static void msetnxCommand(redisClient *c) {
4413 msetGenericCommand(c,1);
4414 }
4415
4416 static void incrDecrCommand(redisClient *c, long long incr) {
4417 long long value;
4418 int retval;
4419 robj *o;
4420
4421 o = lookupKeyWrite(c->db,c->argv[1]);
4422 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4423 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4424
4425 value += incr;
4426 o = createStringObjectFromLongLong(value);
4427 retval = dictAdd(c->db->dict,c->argv[1],o);
4428 if (retval == DICT_ERR) {
4429 dictReplace(c->db->dict,c->argv[1],o);
4430 removeExpire(c->db,c->argv[1]);
4431 } else {
4432 incrRefCount(c->argv[1]);
4433 }
4434 server.dirty++;
4435 addReply(c,shared.colon);
4436 addReply(c,o);
4437 addReply(c,shared.crlf);
4438 }
4439
4440 static void incrCommand(redisClient *c) {
4441 incrDecrCommand(c,1);
4442 }
4443
4444 static void decrCommand(redisClient *c) {
4445 incrDecrCommand(c,-1);
4446 }
4447
4448 static void incrbyCommand(redisClient *c) {
4449 long long incr;
4450
4451 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4452 incrDecrCommand(c,incr);
4453 }
4454
4455 static void decrbyCommand(redisClient *c) {
4456 long long incr;
4457
4458 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4459 incrDecrCommand(c,-incr);
4460 }
4461
4462 static void appendCommand(redisClient *c) {
4463 int retval;
4464 size_t totlen;
4465 robj *o;
4466
4467 o = lookupKeyWrite(c->db,c->argv[1]);
4468 if (o == NULL) {
4469 /* Create the key */
4470 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4471 incrRefCount(c->argv[1]);
4472 incrRefCount(c->argv[2]);
4473 totlen = stringObjectLen(c->argv[2]);
4474 } else {
4475 dictEntry *de;
4476
4477 de = dictFind(c->db->dict,c->argv[1]);
4478 assert(de != NULL);
4479
4480 o = dictGetEntryVal(de);
4481 if (o->type != REDIS_STRING) {
4482 addReply(c,shared.wrongtypeerr);
4483 return;
4484 }
4485 /* If the object is specially encoded or shared we have to make
4486 * a copy */
4487 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4488 robj *decoded = getDecodedObject(o);
4489
4490 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4491 decrRefCount(decoded);
4492 dictReplace(c->db->dict,c->argv[1],o);
4493 }
4494 /* APPEND! */
4495 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4496 o->ptr = sdscatlen(o->ptr,
4497 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4498 } else {
4499 o->ptr = sdscatprintf(o->ptr, "%ld",
4500 (unsigned long) c->argv[2]->ptr);
4501 }
4502 totlen = sdslen(o->ptr);
4503 }
4504 server.dirty++;
4505 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4506 }
4507
4508 static void substrCommand(redisClient *c) {
4509 robj *o;
4510 long start = atoi(c->argv[2]->ptr);
4511 long end = atoi(c->argv[3]->ptr);
4512 size_t rangelen, strlen;
4513 sds range;
4514
4515 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4516 checkType(c,o,REDIS_STRING)) return;
4517
4518 o = getDecodedObject(o);
4519 strlen = sdslen(o->ptr);
4520
4521 /* convert negative indexes */
4522 if (start < 0) start = strlen+start;
4523 if (end < 0) end = strlen+end;
4524 if (start < 0) start = 0;
4525 if (end < 0) end = 0;
4526
4527 /* indexes sanity checks */
4528 if (start > end || (size_t)start >= strlen) {
4529 /* Out of range start or start > end result in null reply */
4530 addReply(c,shared.nullbulk);
4531 decrRefCount(o);
4532 return;
4533 }
4534 if ((size_t)end >= strlen) end = strlen-1;
4535 rangelen = (end-start)+1;
4536
4537 /* Return the result */
4538 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4539 range = sdsnewlen((char*)o->ptr+start,rangelen);
4540 addReplySds(c,range);
4541 addReply(c,shared.crlf);
4542 decrRefCount(o);
4543 }
4544
4545 /* ========================= Type agnostic commands ========================= */
4546
4547 static void delCommand(redisClient *c) {
4548 int deleted = 0, j;
4549
4550 for (j = 1; j < c->argc; j++) {
4551 if (deleteKey(c->db,c->argv[j])) {
4552 touchWatchedKey(c->db,c->argv[j]);
4553 server.dirty++;
4554 deleted++;
4555 }
4556 }
4557 addReplyLongLong(c,deleted);
4558 }
4559
4560 static void existsCommand(redisClient *c) {
4561 expireIfNeeded(c->db,c->argv[1]);
4562 if (dictFind(c->db->dict,c->argv[1])) {
4563 addReply(c, shared.cone);
4564 } else {
4565 addReply(c, shared.czero);
4566 }
4567 }
4568
4569 static void selectCommand(redisClient *c) {
4570 int id = atoi(c->argv[1]->ptr);
4571
4572 if (selectDb(c,id) == REDIS_ERR) {
4573 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4574 } else {
4575 addReply(c,shared.ok);
4576 }
4577 }
4578
4579 static void randomkeyCommand(redisClient *c) {
4580 dictEntry *de;
4581 robj *key;
4582
4583 while(1) {
4584 de = dictGetRandomKey(c->db->dict);
4585 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4586 }
4587
4588 if (de == NULL) {
4589 addReply(c,shared.nullbulk);
4590 return;
4591 }
4592
4593 key = dictGetEntryKey(de);
4594 if (server.vm_enabled) {
4595 key = dupStringObject(key);
4596 addReplyBulk(c,key);
4597 decrRefCount(key);
4598 } else {
4599 addReplyBulk(c,key);
4600 }
4601 }
4602
4603 static void keysCommand(redisClient *c) {
4604 dictIterator *di;
4605 dictEntry *de;
4606 sds pattern = c->argv[1]->ptr;
4607 int plen = sdslen(pattern);
4608 unsigned long numkeys = 0;
4609 robj *lenobj = createObject(REDIS_STRING,NULL);
4610
4611 di = dictGetIterator(c->db->dict);
4612 addReply(c,lenobj);
4613 decrRefCount(lenobj);
4614 while((de = dictNext(di)) != NULL) {
4615 robj *keyobj = dictGetEntryKey(de);
4616
4617 sds key = keyobj->ptr;
4618 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4619 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4620 if (expireIfNeeded(c->db,keyobj) == 0) {
4621 addReplyBulk(c,keyobj);
4622 numkeys++;
4623 }
4624 }
4625 }
4626 dictReleaseIterator(di);
4627 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4628 }
4629
4630 static void dbsizeCommand(redisClient *c) {
4631 addReplySds(c,
4632 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4633 }
4634
4635 static void lastsaveCommand(redisClient *c) {
4636 addReplySds(c,
4637 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4638 }
4639
4640 static void typeCommand(redisClient *c) {
4641 robj *o;
4642 char *type;
4643
4644 o = lookupKeyRead(c->db,c->argv[1]);
4645 if (o == NULL) {
4646 type = "+none";
4647 } else {
4648 switch(o->type) {
4649 case REDIS_STRING: type = "+string"; break;
4650 case REDIS_LIST: type = "+list"; break;
4651 case REDIS_SET: type = "+set"; break;
4652 case REDIS_ZSET: type = "+zset"; break;
4653 case REDIS_HASH: type = "+hash"; break;
4654 default: type = "+unknown"; break;
4655 }
4656 }
4657 addReplySds(c,sdsnew(type));
4658 addReply(c,shared.crlf);
4659 }
4660
4661 static void saveCommand(redisClient *c) {
4662 if (server.bgsavechildpid != -1) {
4663 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4664 return;
4665 }
4666 if (rdbSave(server.dbfilename) == REDIS_OK) {
4667 addReply(c,shared.ok);
4668 } else {
4669 addReply(c,shared.err);
4670 }
4671 }
4672
4673 static void bgsaveCommand(redisClient *c) {
4674 if (server.bgsavechildpid != -1) {
4675 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4676 return;
4677 }
4678 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4679 char *status = "+Background saving started\r\n";
4680 addReplySds(c,sdsnew(status));
4681 } else {
4682 addReply(c,shared.err);
4683 }
4684 }
4685
4686 static void shutdownCommand(redisClient *c) {
4687 if (prepareForShutdown() == REDIS_OK)
4688 exit(0);
4689 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4690 }
4691
4692 static void renameGenericCommand(redisClient *c, int nx) {
4693 robj *o;
4694
4695 /* To use the same key as src and dst is probably an error */
4696 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4697 addReply(c,shared.sameobjecterr);
4698 return;
4699 }
4700
4701 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4702 return;
4703
4704 incrRefCount(o);
4705 deleteIfVolatile(c->db,c->argv[2]);
4706 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4707 if (nx) {
4708 decrRefCount(o);
4709 addReply(c,shared.czero);
4710 return;
4711 }
4712 dictReplace(c->db->dict,c->argv[2],o);
4713 } else {
4714 incrRefCount(c->argv[2]);
4715 }
4716 deleteKey(c->db,c->argv[1]);
4717 touchWatchedKey(c->db,c->argv[2]);
4718 server.dirty++;
4719 addReply(c,nx ? shared.cone : shared.ok);
4720 }
4721
4722 static void renameCommand(redisClient *c) {
4723 renameGenericCommand(c,0);
4724 }
4725
4726 static void renamenxCommand(redisClient *c) {
4727 renameGenericCommand(c,1);
4728 }
4729
4730 static void moveCommand(redisClient *c) {
4731 robj *o;
4732 redisDb *src, *dst;
4733 int srcid;
4734
4735 /* Obtain source and target DB pointers */
4736 src = c->db;
4737 srcid = c->db->id;
4738 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4739 addReply(c,shared.outofrangeerr);
4740 return;
4741 }
4742 dst = c->db;
4743 selectDb(c,srcid); /* Back to the source DB */
4744
4745 /* If the user is moving using as target the same
4746 * DB as the source DB it is probably an error. */
4747 if (src == dst) {
4748 addReply(c,shared.sameobjecterr);
4749 return;
4750 }
4751
4752 /* Check if the element exists and get a reference */
4753 o = lookupKeyWrite(c->db,c->argv[1]);
4754 if (!o) {
4755 addReply(c,shared.czero);
4756 return;
4757 }
4758
4759 /* Try to add the element to the target DB */
4760 deleteIfVolatile(dst,c->argv[1]);
4761 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4762 addReply(c,shared.czero);
4763 return;
4764 }
4765 incrRefCount(c->argv[1]);
4766 incrRefCount(o);
4767
4768 /* OK! key moved, free the entry in the source DB */
4769 deleteKey(src,c->argv[1]);
4770 server.dirty++;
4771 addReply(c,shared.cone);
4772 }
4773
4774 /* =================================== Lists ================================ */
4775 static void pushGenericCommand(redisClient *c, int where) {
4776 robj *lobj;
4777 list *list;
4778
4779 lobj = lookupKeyWrite(c->db,c->argv[1]);
4780 if (lobj == NULL) {
4781 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4782 addReply(c,shared.cone);
4783 return;
4784 }
4785 lobj = createListObject();
4786 list = lobj->ptr;
4787 if (where == REDIS_HEAD) {
4788 listAddNodeHead(list,c->argv[2]);
4789 } else {
4790 listAddNodeTail(list,c->argv[2]);
4791 }
4792 dictAdd(c->db->dict,c->argv[1],lobj);
4793 incrRefCount(c->argv[1]);
4794 incrRefCount(c->argv[2]);
4795 } else {
4796 if (lobj->type != REDIS_LIST) {
4797 addReply(c,shared.wrongtypeerr);
4798 return;
4799 }
4800 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4801 addReply(c,shared.cone);
4802 return;
4803 }
4804 list = lobj->ptr;
4805 if (where == REDIS_HEAD) {
4806 listAddNodeHead(list,c->argv[2]);
4807 } else {
4808 listAddNodeTail(list,c->argv[2]);
4809 }
4810 incrRefCount(c->argv[2]);
4811 }
4812 server.dirty++;
4813 addReplyLongLong(c,listLength(list));
4814 }
4815
4816 static void lpushCommand(redisClient *c) {
4817 pushGenericCommand(c,REDIS_HEAD);
4818 }
4819
4820 static void rpushCommand(redisClient *c) {
4821 pushGenericCommand(c,REDIS_TAIL);
4822 }
4823
4824 static void llenCommand(redisClient *c) {
4825 robj *o;
4826 list *l;
4827
4828 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4829 checkType(c,o,REDIS_LIST)) return;
4830
4831 l = o->ptr;
4832 addReplyUlong(c,listLength(l));
4833 }
4834
4835 static void lindexCommand(redisClient *c) {
4836 robj *o;
4837 int index = atoi(c->argv[2]->ptr);
4838 list *list;
4839 listNode *ln;
4840
4841 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4842 checkType(c,o,REDIS_LIST)) return;
4843 list = o->ptr;
4844
4845 ln = listIndex(list, index);
4846 if (ln == NULL) {
4847 addReply(c,shared.nullbulk);
4848 } else {
4849 robj *ele = listNodeValue(ln);
4850 addReplyBulk(c,ele);
4851 }
4852 }
4853
4854 static void lsetCommand(redisClient *c) {
4855 robj *o;
4856 int index = atoi(c->argv[2]->ptr);
4857 list *list;
4858 listNode *ln;
4859
4860 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4861 checkType(c,o,REDIS_LIST)) return;
4862 list = o->ptr;
4863
4864 ln = listIndex(list, index);
4865 if (ln == NULL) {
4866 addReply(c,shared.outofrangeerr);
4867 } else {
4868 robj *ele = listNodeValue(ln);
4869
4870 decrRefCount(ele);
4871 listNodeValue(ln) = c->argv[3];
4872 incrRefCount(c->argv[3]);
4873 addReply(c,shared.ok);
4874 server.dirty++;
4875 }
4876 }
4877
4878 static void popGenericCommand(redisClient *c, int where) {
4879 robj *o;
4880 list *list;
4881 listNode *ln;
4882
4883 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4884 checkType(c,o,REDIS_LIST)) return;
4885 list = o->ptr;
4886
4887 if (where == REDIS_HEAD)
4888 ln = listFirst(list);
4889 else
4890 ln = listLast(list);
4891
4892 if (ln == NULL) {
4893 addReply(c,shared.nullbulk);
4894 } else {
4895 robj *ele = listNodeValue(ln);
4896 addReplyBulk(c,ele);
4897 listDelNode(list,ln);
4898 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4899 server.dirty++;
4900 }
4901 }
4902
4903 static void lpopCommand(redisClient *c) {
4904 popGenericCommand(c,REDIS_HEAD);
4905 }
4906
4907 static void rpopCommand(redisClient *c) {
4908 popGenericCommand(c,REDIS_TAIL);
4909 }
4910
4911 static void lrangeCommand(redisClient *c) {
4912 robj *o;
4913 int start = atoi(c->argv[2]->ptr);
4914 int end = atoi(c->argv[3]->ptr);
4915 int llen;
4916 int rangelen, j;
4917 list *list;
4918 listNode *ln;
4919 robj *ele;
4920
4921 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4922 || checkType(c,o,REDIS_LIST)) return;
4923 list = o->ptr;
4924 llen = listLength(list);
4925
4926 /* convert negative indexes */
4927 if (start < 0) start = llen+start;
4928 if (end < 0) end = llen+end;
4929 if (start < 0) start = 0;
4930 if (end < 0) end = 0;
4931
4932 /* indexes sanity checks */
4933 if (start > end || start >= llen) {
4934 /* Out of range start or start > end result in empty list */
4935 addReply(c,shared.emptymultibulk);
4936 return;
4937 }
4938 if (end >= llen) end = llen-1;
4939 rangelen = (end-start)+1;
4940
4941 /* Return the result in form of a multi-bulk reply */
4942 ln = listIndex(list, start);
4943 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4944 for (j = 0; j < rangelen; j++) {
4945 ele = listNodeValue(ln);
4946 addReplyBulk(c,ele);
4947 ln = ln->next;
4948 }
4949 }
4950
4951 static void ltrimCommand(redisClient *c) {
4952 robj *o;
4953 int start = atoi(c->argv[2]->ptr);
4954 int end = atoi(c->argv[3]->ptr);
4955 int llen;
4956 int j, ltrim, rtrim;
4957 list *list;
4958 listNode *ln;
4959
4960 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4961 checkType(c,o,REDIS_LIST)) return;
4962 list = o->ptr;
4963 llen = listLength(list);
4964
4965 /* convert negative indexes */
4966 if (start < 0) start = llen+start;
4967 if (end < 0) end = llen+end;
4968 if (start < 0) start = 0;
4969 if (end < 0) end = 0;
4970
4971 /* indexes sanity checks */
4972 if (start > end || start >= llen) {
4973 /* Out of range start or start > end result in empty list */
4974 ltrim = llen;
4975 rtrim = 0;
4976 } else {
4977 if (end >= llen) end = llen-1;
4978 ltrim = start;
4979 rtrim = llen-end-1;
4980 }
4981
4982 /* Remove list elements to perform the trim */
4983 for (j = 0; j < ltrim; j++) {
4984 ln = listFirst(list);
4985 listDelNode(list,ln);
4986 }
4987 for (j = 0; j < rtrim; j++) {
4988 ln = listLast(list);
4989 listDelNode(list,ln);
4990 }
4991 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4992 server.dirty++;
4993 addReply(c,shared.ok);
4994 }
4995
4996 static void lremCommand(redisClient *c) {
4997 robj *o;
4998 list *list;
4999 listNode *ln, *next;
5000 int toremove = atoi(c->argv[2]->ptr);
5001 int removed = 0;
5002 int fromtail = 0;
5003
5004 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5005 checkType(c,o,REDIS_LIST)) return;
5006 list = o->ptr;
5007
5008 if (toremove < 0) {
5009 toremove = -toremove;
5010 fromtail = 1;
5011 }
5012 ln = fromtail ? list->tail : list->head;
5013 while (ln) {
5014 robj *ele = listNodeValue(ln);
5015
5016 next = fromtail ? ln->prev : ln->next;
5017 if (equalStringObjects(ele,c->argv[3])) {
5018 listDelNode(list,ln);
5019 server.dirty++;
5020 removed++;
5021 if (toremove && removed == toremove) break;
5022 }
5023 ln = next;
5024 }
5025 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
5026 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5027 }
5028
5029 /* This is the semantic of this command:
5030 * RPOPLPUSH srclist dstlist:
5031 * IF LLEN(srclist) > 0
5032 * element = RPOP srclist
5033 * LPUSH dstlist element
5034 * RETURN element
5035 * ELSE
5036 * RETURN nil
5037 * END
5038 * END
5039 *
5040 * The idea is to be able to get an element from a list in a reliable way
5041 * since the element is not just returned but pushed against another list
5042 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5043 */
5044 static void rpoplpushcommand(redisClient *c) {
5045 robj *sobj;
5046 list *srclist;
5047 listNode *ln;
5048
5049 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5050 checkType(c,sobj,REDIS_LIST)) return;
5051 srclist = sobj->ptr;
5052 ln = listLast(srclist);
5053
5054 if (ln == NULL) {
5055 addReply(c,shared.nullbulk);
5056 } else {
5057 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5058 robj *ele = listNodeValue(ln);
5059 list *dstlist;
5060
5061 if (dobj && dobj->type != REDIS_LIST) {
5062 addReply(c,shared.wrongtypeerr);
5063 return;
5064 }
5065
5066 /* Add the element to the target list (unless it's directly
5067 * passed to some BLPOP-ing client */
5068 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5069 if (dobj == NULL) {
5070 /* Create the list if the key does not exist */
5071 dobj = createListObject();
5072 dictAdd(c->db->dict,c->argv[2],dobj);
5073 incrRefCount(c->argv[2]);
5074 }
5075 dstlist = dobj->ptr;
5076 listAddNodeHead(dstlist,ele);
5077 incrRefCount(ele);
5078 }
5079
5080 /* Send the element to the client as reply as well */
5081 addReplyBulk(c,ele);
5082
5083 /* Finally remove the element from the source list */
5084 listDelNode(srclist,ln);
5085 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5086 server.dirty++;
5087 }
5088 }
5089
5090 /* ==================================== Sets ================================ */
5091
5092 static void saddCommand(redisClient *c) {
5093 robj *set;
5094
5095 set = lookupKeyWrite(c->db,c->argv[1]);
5096 if (set == NULL) {
5097 set = createSetObject();
5098 dictAdd(c->db->dict,c->argv[1],set);
5099 incrRefCount(c->argv[1]);
5100 } else {
5101 if (set->type != REDIS_SET) {
5102 addReply(c,shared.wrongtypeerr);
5103 return;
5104 }
5105 }
5106 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5107 incrRefCount(c->argv[2]);
5108 server.dirty++;
5109 addReply(c,shared.cone);
5110 } else {
5111 addReply(c,shared.czero);
5112 }
5113 }
5114
5115 static void sremCommand(redisClient *c) {
5116 robj *set;
5117
5118 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5119 checkType(c,set,REDIS_SET)) return;
5120
5121 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5122 server.dirty++;
5123 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5124 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5125 addReply(c,shared.cone);
5126 } else {
5127 addReply(c,shared.czero);
5128 }
5129 }
5130
5131 static void smoveCommand(redisClient *c) {
5132 robj *srcset, *dstset;
5133
5134 srcset = lookupKeyWrite(c->db,c->argv[1]);
5135 dstset = lookupKeyWrite(c->db,c->argv[2]);
5136
5137 /* If the source key does not exist return 0, if it's of the wrong type
5138 * raise an error */
5139 if (srcset == NULL || srcset->type != REDIS_SET) {
5140 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5141 return;
5142 }
5143 /* Error if the destination key is not a set as well */
5144 if (dstset && dstset->type != REDIS_SET) {
5145 addReply(c,shared.wrongtypeerr);
5146 return;
5147 }
5148 /* Remove the element from the source set */
5149 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5150 /* Key not found in the src set! return zero */
5151 addReply(c,shared.czero);
5152 return;
5153 }
5154 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5155 deleteKey(c->db,c->argv[1]);
5156 server.dirty++;
5157 /* Add the element to the destination set */
5158 if (!dstset) {
5159 dstset = createSetObject();
5160 dictAdd(c->db->dict,c->argv[2],dstset);
5161 incrRefCount(c->argv[2]);
5162 }
5163 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5164 incrRefCount(c->argv[3]);
5165 addReply(c,shared.cone);
5166 }
5167
5168 static void sismemberCommand(redisClient *c) {
5169 robj *set;
5170
5171 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5172 checkType(c,set,REDIS_SET)) return;
5173
5174 if (dictFind(set->ptr,c->argv[2]))
5175 addReply(c,shared.cone);
5176 else
5177 addReply(c,shared.czero);
5178 }
5179
5180 static void scardCommand(redisClient *c) {
5181 robj *o;
5182 dict *s;
5183
5184 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5185 checkType(c,o,REDIS_SET)) return;
5186
5187 s = o->ptr;
5188 addReplyUlong(c,dictSize(s));
5189 }
5190
5191 static void spopCommand(redisClient *c) {
5192 robj *set;
5193 dictEntry *de;
5194
5195 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5196 checkType(c,set,REDIS_SET)) return;
5197
5198 de = dictGetRandomKey(set->ptr);
5199 if (de == NULL) {
5200 addReply(c,shared.nullbulk);
5201 } else {
5202 robj *ele = dictGetEntryKey(de);
5203
5204 addReplyBulk(c,ele);
5205 dictDelete(set->ptr,ele);
5206 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5207 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5208 server.dirty++;
5209 }
5210 }
5211
5212 static void srandmemberCommand(redisClient *c) {
5213 robj *set;
5214 dictEntry *de;
5215
5216 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5217 checkType(c,set,REDIS_SET)) return;
5218
5219 de = dictGetRandomKey(set->ptr);
5220 if (de == NULL) {
5221 addReply(c,shared.nullbulk);
5222 } else {
5223 robj *ele = dictGetEntryKey(de);
5224
5225 addReplyBulk(c,ele);
5226 }
5227 }
5228
5229 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5230 dict **d1 = (void*) s1, **d2 = (void*) s2;
5231
5232 return dictSize(*d1)-dictSize(*d2);
5233 }
5234
5235 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5236 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5237 dictIterator *di;
5238 dictEntry *de;
5239 robj *lenobj = NULL, *dstset = NULL;
5240 unsigned long j, cardinality = 0;
5241
5242 for (j = 0; j < setsnum; j++) {
5243 robj *setobj;
5244
5245 setobj = dstkey ?
5246 lookupKeyWrite(c->db,setskeys[j]) :
5247 lookupKeyRead(c->db,setskeys[j]);
5248 if (!setobj) {
5249 zfree(dv);
5250 if (dstkey) {
5251 if (deleteKey(c->db,dstkey))
5252 server.dirty++;
5253 addReply(c,shared.czero);
5254 } else {
5255 addReply(c,shared.emptymultibulk);
5256 }
5257 return;
5258 }
5259 if (setobj->type != REDIS_SET) {
5260 zfree(dv);
5261 addReply(c,shared.wrongtypeerr);
5262 return;
5263 }
5264 dv[j] = setobj->ptr;
5265 }
5266 /* Sort sets from the smallest to largest, this will improve our
5267 * algorithm's performace */
5268 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5269
5270 /* The first thing we should output is the total number of elements...
5271 * since this is a multi-bulk write, but at this stage we don't know
5272 * the intersection set size, so we use a trick, append an empty object
5273 * to the output list and save the pointer to later modify it with the
5274 * right length */
5275 if (!dstkey) {
5276 lenobj = createObject(REDIS_STRING,NULL);
5277 addReply(c,lenobj);
5278 decrRefCount(lenobj);
5279 } else {
5280 /* If we have a target key where to store the resulting set
5281 * create this key with an empty set inside */
5282 dstset = createSetObject();
5283 }
5284
5285 /* Iterate all the elements of the first (smallest) set, and test
5286 * the element against all the other sets, if at least one set does
5287 * not include the element it is discarded */
5288 di = dictGetIterator(dv[0]);
5289
5290 while((de = dictNext(di)) != NULL) {
5291 robj *ele;
5292
5293 for (j = 1; j < setsnum; j++)
5294 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5295 if (j != setsnum)
5296 continue; /* at least one set does not contain the member */
5297 ele = dictGetEntryKey(de);
5298 if (!dstkey) {
5299 addReplyBulk(c,ele);
5300 cardinality++;
5301 } else {
5302 dictAdd(dstset->ptr,ele,NULL);
5303 incrRefCount(ele);
5304 }
5305 }
5306 dictReleaseIterator(di);
5307
5308 if (dstkey) {
5309 /* Store the resulting set into the target, if the intersection
5310 * is not an empty set. */
5311 deleteKey(c->db,dstkey);
5312 if (dictSize((dict*)dstset->ptr) > 0) {
5313 dictAdd(c->db->dict,dstkey,dstset);
5314 incrRefCount(dstkey);
5315 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5316 } else {
5317 decrRefCount(dstset);
5318 addReply(c,shared.czero);
5319 }
5320 server.dirty++;
5321 } else {
5322 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5323 }
5324 zfree(dv);
5325 }
5326
5327 static void sinterCommand(redisClient *c) {
5328 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5329 }
5330
5331 static void sinterstoreCommand(redisClient *c) {
5332 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5333 }
5334
5335 #define REDIS_OP_UNION 0
5336 #define REDIS_OP_DIFF 1
5337 #define REDIS_OP_INTER 2
5338
5339 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5340 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5341 dictIterator *di;
5342 dictEntry *de;
5343 robj *dstset = NULL;
5344 int j, cardinality = 0;
5345
5346 for (j = 0; j < setsnum; j++) {
5347 robj *setobj;
5348
5349 setobj = dstkey ?
5350 lookupKeyWrite(c->db,setskeys[j]) :
5351 lookupKeyRead(c->db,setskeys[j]);
5352 if (!setobj) {
5353 dv[j] = NULL;
5354 continue;
5355 }
5356 if (setobj->type != REDIS_SET) {
5357 zfree(dv);
5358 addReply(c,shared.wrongtypeerr);
5359 return;
5360 }
5361 dv[j] = setobj->ptr;
5362 }
5363
5364 /* We need a temp set object to store our union. If the dstkey
5365 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5366 * this set object will be the resulting object to set into the target key*/
5367 dstset = createSetObject();
5368
5369 /* Iterate all the elements of all the sets, add every element a single
5370 * time to the result set */
5371 for (j = 0; j < setsnum; j++) {
5372 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5373 if (!dv[j]) continue; /* non existing keys are like empty sets */
5374
5375 di = dictGetIterator(dv[j]);
5376
5377 while((de = dictNext(di)) != NULL) {
5378 robj *ele;
5379
5380 /* dictAdd will not add the same element multiple times */
5381 ele = dictGetEntryKey(de);
5382 if (op == REDIS_OP_UNION || j == 0) {
5383 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5384 incrRefCount(ele);
5385 cardinality++;
5386 }
5387 } else if (op == REDIS_OP_DIFF) {
5388 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5389 cardinality--;
5390 }
5391 }
5392 }
5393 dictReleaseIterator(di);
5394
5395 /* result set is empty? Exit asap. */
5396 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5397 }
5398
5399 /* Output the content of the resulting set, if not in STORE mode */
5400 if (!dstkey) {
5401 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5402 di = dictGetIterator(dstset->ptr);
5403 while((de = dictNext(di)) != NULL) {
5404 robj *ele;
5405
5406 ele = dictGetEntryKey(de);
5407 addReplyBulk(c,ele);
5408 }
5409 dictReleaseIterator(di);
5410 decrRefCount(dstset);
5411 } else {
5412 /* If we have a target key where to store the resulting set
5413 * create this key with the result set inside */
5414 deleteKey(c->db,dstkey);
5415 if (dictSize((dict*)dstset->ptr) > 0) {
5416 dictAdd(c->db->dict,dstkey,dstset);
5417 incrRefCount(dstkey);
5418 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5419 } else {
5420 decrRefCount(dstset);
5421 addReply(c,shared.czero);
5422 }
5423 server.dirty++;
5424 }
5425 zfree(dv);
5426 }
5427
5428 static void sunionCommand(redisClient *c) {
5429 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5430 }
5431
5432 static void sunionstoreCommand(redisClient *c) {
5433 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5434 }
5435
5436 static void sdiffCommand(redisClient *c) {
5437 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5438 }
5439
5440 static void sdiffstoreCommand(redisClient *c) {
5441 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5442 }
5443
5444 /* ==================================== ZSets =============================== */
5445
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5453
5454 /* This skiplist implementation is almost a C translation of the original
5455 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5456 * Alternative to Balanced Trees", modified in three ways:
5457 * a) this implementation allows for repeated values.
5458 * b) the comparison is not just by key (our 'score') but by satellite data.
5459 * c) there is a back pointer, so it's a doubly linked list with the back
5460 * pointers being only at "level 1". This allows to traverse the list
5461 * from tail to head, useful for ZREVRANGE. */
5462
5463 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5464 zskiplistNode *zn = zmalloc(sizeof(*zn));
5465
5466 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5467 if (level > 1)
5468 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5469 else
5470 zn->span = NULL;
5471 zn->score = score;
5472 zn->obj = obj;
5473 return zn;
5474 }
5475
5476 static zskiplist *zslCreate(void) {
5477 int j;
5478 zskiplist *zsl;
5479
5480 zsl = zmalloc(sizeof(*zsl));
5481 zsl->level = 1;
5482 zsl->length = 0;
5483 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5484 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5485 zsl->header->forward[j] = NULL;
5486
5487 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5488 if (j < ZSKIPLIST_MAXLEVEL-1)
5489 zsl->header->span[j] = 0;
5490 }
5491 zsl->header->backward = NULL;
5492 zsl->tail = NULL;
5493 return zsl;
5494 }
5495
5496 static void zslFreeNode(zskiplistNode *node) {
5497 decrRefCount(node->obj);
5498 zfree(node->forward);
5499 zfree(node->span);
5500 zfree(node);
5501 }
5502
5503 static void zslFree(zskiplist *zsl) {
5504 zskiplistNode *node = zsl->header->forward[0], *next;
5505
5506 zfree(zsl->header->forward);
5507 zfree(zsl->header->span);
5508 zfree(zsl->header);
5509 while(node) {
5510 next = node->forward[0];
5511 zslFreeNode(node);
5512 node = next;
5513 }
5514 zfree(zsl);
5515 }
5516
5517 static int zslRandomLevel(void) {
5518 int level = 1;
5519 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5520 level += 1;
5521 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5522 }
5523
5524 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5525 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5526 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5527 int i, level;
5528
5529 x = zsl->header;
5530 for (i = zsl->level-1; i >= 0; i--) {
5531 /* store rank that is crossed to reach the insert position */
5532 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5533
5534 while (x->forward[i] &&
5535 (x->forward[i]->score < score ||
5536 (x->forward[i]->score == score &&
5537 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5538 rank[i] += i > 0 ? x->span[i-1] : 1;
5539 x = x->forward[i];
5540 }
5541 update[i] = x;
5542 }
5543 /* we assume the key is not already inside, since we allow duplicated
5544 * scores, and the re-insertion of score and redis object should never
5545 * happpen since the caller of zslInsert() should test in the hash table
5546 * if the element is already inside or not. */
5547 level = zslRandomLevel();
5548 if (level > zsl->level) {
5549 for (i = zsl->level; i < level; i++) {
5550 rank[i] = 0;
5551 update[i] = zsl->header;
5552 update[i]->span[i-1] = zsl->length;
5553 }
5554 zsl->level = level;
5555 }
5556 x = zslCreateNode(level,score,obj);
5557 for (i = 0; i < level; i++) {
5558 x->forward[i] = update[i]->forward[i];
5559 update[i]->forward[i] = x;
5560
5561 /* update span covered by update[i] as x is inserted here */
5562 if (i > 0) {
5563 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5564 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5565 }
5566 }
5567
5568 /* increment span for untouched levels */
5569 for (i = level; i < zsl->level; i++) {
5570 update[i]->span[i-1]++;
5571 }
5572
5573 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5574 if (x->forward[0])
5575 x->forward[0]->backward = x;
5576 else
5577 zsl->tail = x;
5578 zsl->length++;
5579 }
5580
5581 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5582 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5583 int i;
5584 for (i = 0; i < zsl->level; i++) {
5585 if (update[i]->forward[i] == x) {
5586 if (i > 0) {
5587 update[i]->span[i-1] += x->span[i-1] - 1;
5588 }
5589 update[i]->forward[i] = x->forward[i];
5590 } else {
5591 /* invariant: i > 0, because update[0]->forward[0]
5592 * is always equal to x */
5593 update[i]->span[i-1] -= 1;
5594 }
5595 }
5596 if (x->forward[0]) {
5597 x->forward[0]->backward = x->backward;
5598 } else {
5599 zsl->tail = x->backward;
5600 }
5601 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5602 zsl->level--;
5603 zsl->length--;
5604 }
5605
5606 /* Delete an element with matching score/object from the skiplist. */
5607 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5608 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5609 int i;
5610
5611 x = zsl->header;
5612 for (i = zsl->level-1; i >= 0; i--) {
5613 while (x->forward[i] &&
5614 (x->forward[i]->score < score ||
5615 (x->forward[i]->score == score &&
5616 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5617 x = x->forward[i];
5618 update[i] = x;
5619 }
5620 /* We may have multiple elements with the same score, what we need
5621 * is to find the element with both the right score and object. */
5622 x = x->forward[0];
5623 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5624 zslDeleteNode(zsl, x, update);
5625 zslFreeNode(x);
5626 return 1;
5627 } else {
5628 return 0; /* not found */
5629 }
5630 return 0; /* not found */
5631 }
5632
5633 /* Delete all the elements with score between min and max from the skiplist.
5634 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5635 * Note that this function takes the reference to the hash table view of the
5636 * sorted set, in order to remove the elements from the hash table too. */
5637 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5638 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5639 unsigned long removed = 0;
5640 int i;
5641
5642 x = zsl->header;
5643 for (i = zsl->level-1; i >= 0; i--) {
5644 while (x->forward[i] && x->forward[i]->score < min)
5645 x = x->forward[i];
5646 update[i] = x;
5647 }
5648 /* We may have multiple elements with the same score, what we need
5649 * is to find the element with both the right score and object. */
5650 x = x->forward[0];
5651 while (x && x->score <= max) {
5652 zskiplistNode *next = x->forward[0];
5653 zslDeleteNode(zsl, x, update);
5654 dictDelete(dict,x->obj);
5655 zslFreeNode(x);
5656 removed++;
5657 x = next;
5658 }
5659 return removed; /* not found */
5660 }
5661
5662 /* Delete all the elements with rank between start and end from the skiplist.
5663 * Start and end are inclusive. Note that start and end need to be 1-based */
5664 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5665 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5666 unsigned long traversed = 0, removed = 0;
5667 int i;
5668
5669 x = zsl->header;
5670 for (i = zsl->level-1; i >= 0; i--) {
5671 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5672 traversed += i > 0 ? x->span[i-1] : 1;
5673 x = x->forward[i];
5674 }
5675 update[i] = x;
5676 }
5677
5678 traversed++;
5679 x = x->forward[0];
5680 while (x && traversed <= end) {
5681 zskiplistNode *next = x->forward[0];
5682 zslDeleteNode(zsl, x, update);
5683 dictDelete(dict,x->obj);
5684 zslFreeNode(x);
5685 removed++;
5686 traversed++;
5687 x = next;
5688 }
5689 return removed;
5690 }
5691
5692 /* Find the first node having a score equal or greater than the specified one.
5693 * Returns NULL if there is no match. */
5694 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5695 zskiplistNode *x;
5696 int i;
5697
5698 x = zsl->header;
5699 for (i = zsl->level-1; i >= 0; i--) {
5700 while (x->forward[i] && x->forward[i]->score < score)
5701 x = x->forward[i];
5702 }
5703 /* We may have multiple elements with the same score, what we need
5704 * is to find the element with both the right score and object. */
5705 return x->forward[0];
5706 }
5707
5708 /* Find the rank for an element by both score and key.
5709 * Returns 0 when the element cannot be found, rank otherwise.
5710 * Note that the rank is 1-based due to the span of zsl->header to the
5711 * first element. */
5712 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5713 zskiplistNode *x;
5714 unsigned long rank = 0;
5715 int i;
5716
5717 x = zsl->header;
5718 for (i = zsl->level-1; i >= 0; i--) {
5719 while (x->forward[i] &&
5720 (x->forward[i]->score < score ||
5721 (x->forward[i]->score == score &&
5722 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5723 rank += i > 0 ? x->span[i-1] : 1;
5724 x = x->forward[i];
5725 }
5726
5727 /* x might be equal to zsl->header, so test if obj is non-NULL */
5728 if (x->obj && equalStringObjects(x->obj,o)) {
5729 return rank;
5730 }
5731 }
5732 return 0;
5733 }
5734
5735 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5736 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5737 zskiplistNode *x;
5738 unsigned long traversed = 0;
5739 int i;
5740
5741 x = zsl->header;
5742 for (i = zsl->level-1; i >= 0; i--) {
5743 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5744 {
5745 traversed += i > 0 ? x->span[i-1] : 1;
5746 x = x->forward[i];
5747 }
5748 if (traversed == rank) {
5749 return x;
5750 }
5751 }
5752 return NULL;
5753 }
5754
5755 /* The actual Z-commands implementations */
5756
5757 /* This generic command implements both ZADD and ZINCRBY.
5758 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5759 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5760 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5761 robj *zsetobj;
5762 zset *zs;
5763 double *score;
5764
5765 if (isnan(scoreval)) {
5766 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5767 return;
5768 }
5769
5770 zsetobj = lookupKeyWrite(c->db,key);
5771 if (zsetobj == NULL) {
5772 zsetobj = createZsetObject();
5773 dictAdd(c->db->dict,key,zsetobj);
5774 incrRefCount(key);
5775 } else {
5776 if (zsetobj->type != REDIS_ZSET) {
5777 addReply(c,shared.wrongtypeerr);
5778 return;
5779 }
5780 }
5781 zs = zsetobj->ptr;
5782
5783 /* Ok now since we implement both ZADD and ZINCRBY here the code
5784 * needs to handle the two different conditions. It's all about setting
5785 * '*score', that is, the new score to set, to the right value. */
5786 score = zmalloc(sizeof(double));
5787 if (doincrement) {
5788 dictEntry *de;
5789
5790 /* Read the old score. If the element was not present starts from 0 */
5791 de = dictFind(zs->dict,ele);
5792 if (de) {
5793 double *oldscore = dictGetEntryVal(de);
5794 *score = *oldscore + scoreval;
5795 } else {
5796 *score = scoreval;
5797 }
5798 if (isnan(*score)) {
5799 addReplySds(c,
5800 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5801 zfree(score);
5802 /* Note that we don't need to check if the zset may be empty and
5803 * should be removed here, as we can only obtain Nan as score if
5804 * there was already an element in the sorted set. */
5805 return;
5806 }
5807 } else {
5808 *score = scoreval;
5809 }
5810
5811 /* What follows is a simple remove and re-insert operation that is common
5812 * to both ZADD and ZINCRBY... */
5813 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5814 /* case 1: New element */
5815 incrRefCount(ele); /* added to hash */
5816 zslInsert(zs->zsl,*score,ele);
5817 incrRefCount(ele); /* added to skiplist */
5818 server.dirty++;
5819 if (doincrement)
5820 addReplyDouble(c,*score);
5821 else
5822 addReply(c,shared.cone);
5823 } else {
5824 dictEntry *de;
5825 double *oldscore;
5826
5827 /* case 2: Score update operation */
5828 de = dictFind(zs->dict,ele);
5829 redisAssert(de != NULL);
5830 oldscore = dictGetEntryVal(de);
5831 if (*score != *oldscore) {
5832 int deleted;
5833
5834 /* Remove and insert the element in the skip list with new score */
5835 deleted = zslDelete(zs->zsl,*oldscore,ele);
5836 redisAssert(deleted != 0);
5837 zslInsert(zs->zsl,*score,ele);
5838 incrRefCount(ele);
5839 /* Update the score in the hash table */
5840 dictReplace(zs->dict,ele,score);
5841 server.dirty++;
5842 } else {
5843 zfree(score);
5844 }
5845 if (doincrement)
5846 addReplyDouble(c,*score);
5847 else
5848 addReply(c,shared.czero);
5849 }
5850 }
5851
5852 static void zaddCommand(redisClient *c) {
5853 double scoreval;
5854
5855 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5856 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5857 }
5858
5859 static void zincrbyCommand(redisClient *c) {
5860 double scoreval;
5861
5862 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5863 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5864 }
5865
5866 static void zremCommand(redisClient *c) {
5867 robj *zsetobj;
5868 zset *zs;
5869 dictEntry *de;
5870 double *oldscore;
5871 int deleted;
5872
5873 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5874 checkType(c,zsetobj,REDIS_ZSET)) return;
5875
5876 zs = zsetobj->ptr;
5877 de = dictFind(zs->dict,c->argv[2]);
5878 if (de == NULL) {
5879 addReply(c,shared.czero);
5880 return;
5881 }
5882 /* Delete from the skiplist */
5883 oldscore = dictGetEntryVal(de);
5884 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5885 redisAssert(deleted != 0);
5886
5887 /* Delete from the hash table */
5888 dictDelete(zs->dict,c->argv[2]);
5889 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5890 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5891 server.dirty++;
5892 addReply(c,shared.cone);
5893 }
5894
5895 static void zremrangebyscoreCommand(redisClient *c) {
5896 double min;
5897 double max;
5898 long deleted;
5899 robj *zsetobj;
5900 zset *zs;
5901
5902 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5903 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5904
5905 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5906 checkType(c,zsetobj,REDIS_ZSET)) return;
5907
5908 zs = zsetobj->ptr;
5909 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5910 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5911 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5912 server.dirty += deleted;
5913 addReplyLongLong(c,deleted);
5914 }
5915
5916 static void zremrangebyrankCommand(redisClient *c) {
5917 long start;
5918 long end;
5919 int llen;
5920 long deleted;
5921 robj *zsetobj;
5922 zset *zs;
5923
5924 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5925 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5926
5927 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5928 checkType(c,zsetobj,REDIS_ZSET)) return;
5929 zs = zsetobj->ptr;
5930 llen = zs->zsl->length;
5931
5932 /* convert negative indexes */
5933 if (start < 0) start = llen+start;
5934 if (end < 0) end = llen+end;
5935 if (start < 0) start = 0;
5936 if (end < 0) end = 0;
5937
5938 /* indexes sanity checks */
5939 if (start > end || start >= llen) {
5940 addReply(c,shared.czero);
5941 return;
5942 }
5943 if (end >= llen) end = llen-1;
5944
5945 /* increment start and end because zsl*Rank functions
5946 * use 1-based rank */
5947 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5948 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5949 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5950 server.dirty += deleted;
5951 addReplyLongLong(c, deleted);
5952 }
5953
5954 typedef struct {
5955 dict *dict;
5956 double weight;
5957 } zsetopsrc;
5958
5959 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5960 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5961 unsigned long size1, size2;
5962 size1 = d1->dict ? dictSize(d1->dict) : 0;
5963 size2 = d2->dict ? dictSize(d2->dict) : 0;
5964 return size1 - size2;
5965 }
5966
/* Aggregation modes accepted by zunionInterAggregate() */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry: 1.0 when the entry has no value attached,
 * otherwise the double the value points to. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': the two values are
 * summed for REDIS_AGGR_SUM, the smaller one is kept for REDIS_AGGR_MIN and
 * the greater one for REDIS_AGGR_MAX. Any other mode is a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5984
5985 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5986 int i, j, setnum;
5987 int aggregate = REDIS_AGGR_SUM;
5988 zsetopsrc *src;
5989 robj *dstobj;
5990 zset *dstzset;
5991 dictIterator *di;
5992 dictEntry *de;
5993
5994 /* expect setnum input keys to be given */
5995 setnum = atoi(c->argv[2]->ptr);
5996 if (setnum < 1) {
5997 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5998 return;
5999 }
6000
6001 /* test if the expected number of keys would overflow */
6002 if (3+setnum > c->argc) {
6003 addReply(c,shared.syntaxerr);
6004 return;
6005 }
6006
6007 /* read keys to be used for input */
6008 src = zmalloc(sizeof(zsetopsrc) * setnum);
6009 for (i = 0, j = 3; i < setnum; i++, j++) {
6010 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6011 if (!obj) {
6012 src[i].dict = NULL;
6013 } else {
6014 if (obj->type == REDIS_ZSET) {
6015 src[i].dict = ((zset*)obj->ptr)->dict;
6016 } else if (obj->type == REDIS_SET) {
6017 src[i].dict = (obj->ptr);
6018 } else {
6019 zfree(src);
6020 addReply(c,shared.wrongtypeerr);
6021 return;
6022 }
6023 }
6024
6025 /* default all weights to 1 */
6026 src[i].weight = 1.0;
6027 }
6028
6029 /* parse optional extra arguments */
6030 if (j < c->argc) {
6031 int remaining = c->argc - j;
6032
6033 while (remaining) {
6034 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6035 j++; remaining--;
6036 for (i = 0; i < setnum; i++, j++, remaining--) {
6037 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6038 return;
6039 }
6040 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6041 j++; remaining--;
6042 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6043 aggregate = REDIS_AGGR_SUM;
6044 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6045 aggregate = REDIS_AGGR_MIN;
6046 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6047 aggregate = REDIS_AGGR_MAX;
6048 } else {
6049 zfree(src);
6050 addReply(c,shared.syntaxerr);
6051 return;
6052 }
6053 j++; remaining--;
6054 } else {
6055 zfree(src);
6056 addReply(c,shared.syntaxerr);
6057 return;
6058 }
6059 }
6060 }
6061
6062 /* sort sets from the smallest to largest, this will improve our
6063 * algorithm's performance */
6064 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6065
6066 dstobj = createZsetObject();
6067 dstzset = dstobj->ptr;
6068
6069 if (op == REDIS_OP_INTER) {
6070 /* skip going over all entries if the smallest zset is NULL or empty */
6071 if (src[0].dict && dictSize(src[0].dict) > 0) {
6072 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6073 * from small to large, all src[i > 0].dict are non-empty too */
6074 di = dictGetIterator(src[0].dict);
6075 while((de = dictNext(di)) != NULL) {
6076 double *score = zmalloc(sizeof(double)), value;
6077 *score = src[0].weight * zunionInterDictValue(de);
6078
6079 for (j = 1; j < setnum; j++) {
6080 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6081 if (other) {
6082 value = src[j].weight * zunionInterDictValue(other);
6083 zunionInterAggregate(score, value, aggregate);
6084 } else {
6085 break;
6086 }
6087 }
6088
6089 /* skip entry when not present in every source dict */
6090 if (j != setnum) {
6091 zfree(score);
6092 } else {
6093 robj *o = dictGetEntryKey(de);
6094 dictAdd(dstzset->dict,o,score);
6095 incrRefCount(o); /* added to dictionary */
6096 zslInsert(dstzset->zsl,*score,o);
6097 incrRefCount(o); /* added to skiplist */
6098 }
6099 }
6100 dictReleaseIterator(di);
6101 }
6102 } else if (op == REDIS_OP_UNION) {
6103 for (i = 0; i < setnum; i++) {
6104 if (!src[i].dict) continue;
6105
6106 di = dictGetIterator(src[i].dict);
6107 while((de = dictNext(di)) != NULL) {
6108 /* skip key when already processed */
6109 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6110
6111 double *score = zmalloc(sizeof(double)), value;
6112 *score = src[i].weight * zunionInterDictValue(de);
6113
6114 /* because the zsets are sorted by size, its only possible
6115 * for sets at larger indices to hold this entry */
6116 for (j = (i+1); j < setnum; j++) {
6117 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6118 if (other) {
6119 value = src[j].weight * zunionInterDictValue(other);
6120 zunionInterAggregate(score, value, aggregate);
6121 }
6122 }
6123
6124 robj *o = dictGetEntryKey(de);
6125 dictAdd(dstzset->dict,o,score);
6126 incrRefCount(o); /* added to dictionary */
6127 zslInsert(dstzset->zsl,*score,o);
6128 incrRefCount(o); /* added to skiplist */
6129 }
6130 dictReleaseIterator(di);
6131 }
6132 } else {
6133 /* unknown operator */
6134 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6135 }
6136
6137 deleteKey(c->db,dstkey);
6138 if (dstzset->zsl->length) {
6139 dictAdd(c->db->dict,dstkey,dstobj);
6140 incrRefCount(dstkey);
6141 addReplyLongLong(c, dstzset->zsl->length);
6142 server.dirty++;
6143 } else {
6144 decrRefCount(dstobj);
6145 addReply(c, shared.czero);
6146 }
6147 zfree(src);
6148 }
6149
6150 static void zunionstoreCommand(redisClient *c) {
6151 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6152 }
6153
6154 static void zinterstoreCommand(redisClient *c) {
6155 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6156 }
6157
6158 static void zrangeGenericCommand(redisClient *c, int reverse) {
6159 robj *o;
6160 long start;
6161 long end;
6162 int withscores = 0;
6163 int llen;
6164 int rangelen, j;
6165 zset *zsetobj;
6166 zskiplist *zsl;
6167 zskiplistNode *ln;
6168 robj *ele;
6169
6170 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6171 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6172
6173 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6174 withscores = 1;
6175 } else if (c->argc >= 5) {
6176 addReply(c,shared.syntaxerr);
6177 return;
6178 }
6179
6180 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6181 || checkType(c,o,REDIS_ZSET)) return;
6182 zsetobj = o->ptr;
6183 zsl = zsetobj->zsl;
6184 llen = zsl->length;
6185
6186 /* convert negative indexes */
6187 if (start < 0) start = llen+start;
6188 if (end < 0) end = llen+end;
6189 if (start < 0) start = 0;
6190 if (end < 0) end = 0;
6191
6192 /* indexes sanity checks */
6193 if (start > end || start >= llen) {
6194 /* Out of range start or start > end result in empty list */
6195 addReply(c,shared.emptymultibulk);
6196 return;
6197 }
6198 if (end >= llen) end = llen-1;
6199 rangelen = (end-start)+1;
6200
6201 /* check if starting point is trivial, before searching
6202 * the element in log(N) time */
6203 if (reverse) {
6204 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6205 } else {
6206 ln = start == 0 ?
6207 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6208 }
6209
6210 /* Return the result in form of a multi-bulk reply */
6211 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6212 withscores ? (rangelen*2) : rangelen));
6213 for (j = 0; j < rangelen; j++) {
6214 ele = ln->obj;
6215 addReplyBulk(c,ele);
6216 if (withscores)
6217 addReplyDouble(c,ln->score);
6218 ln = reverse ? ln->backward : ln->forward[0];
6219 }
6220 }
6221
6222 static void zrangeCommand(redisClient *c) {
6223 zrangeGenericCommand(c,0);
6224 }
6225
6226 static void zrevrangeCommand(redisClient *c) {
6227 zrangeGenericCommand(c,1);
6228 }
6229
6230 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6231 * If justcount is non-zero, just the count is returned. */
6232 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6233 robj *o;
6234 double min, max;
6235 int minex = 0, maxex = 0; /* are min or max exclusive? */
6236 int offset = 0, limit = -1;
6237 int withscores = 0;
6238 int badsyntax = 0;
6239
6240 /* Parse the min-max interval. If one of the values is prefixed
6241 * by the "(" character, it's considered "open". For instance
6242 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6243 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6244 if (((char*)c->argv[2]->ptr)[0] == '(') {
6245 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6246 minex = 1;
6247 } else {
6248 min = strtod(c->argv[2]->ptr,NULL);
6249 }
6250 if (((char*)c->argv[3]->ptr)[0] == '(') {
6251 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6252 maxex = 1;
6253 } else {
6254 max = strtod(c->argv[3]->ptr,NULL);
6255 }
6256
6257 /* Parse "WITHSCORES": note that if the command was called with
6258 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6259 * enter the following paths to parse WITHSCORES and LIMIT. */
6260 if (c->argc == 5 || c->argc == 8) {
6261 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6262 withscores = 1;
6263 else
6264 badsyntax = 1;
6265 }
6266 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6267 badsyntax = 1;
6268 if (badsyntax) {
6269 addReplySds(c,
6270 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6271 return;
6272 }
6273
6274 /* Parse "LIMIT" */
6275 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6276 addReply(c,shared.syntaxerr);
6277 return;
6278 } else if (c->argc == (7 + withscores)) {
6279 offset = atoi(c->argv[5]->ptr);
6280 limit = atoi(c->argv[6]->ptr);
6281 if (offset < 0) offset = 0;
6282 }
6283
6284 /* Ok, lookup the key and get the range */
6285 o = lookupKeyRead(c->db,c->argv[1]);
6286 if (o == NULL) {
6287 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6288 } else {
6289 if (o->type != REDIS_ZSET) {
6290 addReply(c,shared.wrongtypeerr);
6291 } else {
6292 zset *zsetobj = o->ptr;
6293 zskiplist *zsl = zsetobj->zsl;
6294 zskiplistNode *ln;
6295 robj *ele, *lenobj = NULL;
6296 unsigned long rangelen = 0;
6297
6298 /* Get the first node with the score >= min, or with
6299 * score > min if 'minex' is true. */
6300 ln = zslFirstWithScore(zsl,min);
6301 while (minex && ln && ln->score == min) ln = ln->forward[0];
6302
6303 if (ln == NULL) {
6304 /* No element matching the speciifed interval */
6305 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6306 return;
6307 }
6308
6309 /* We don't know in advance how many matching elements there
6310 * are in the list, so we push this object that will represent
6311 * the multi-bulk length in the output buffer, and will "fix"
6312 * it later */
6313 if (!justcount) {
6314 lenobj = createObject(REDIS_STRING,NULL);
6315 addReply(c,lenobj);
6316 decrRefCount(lenobj);
6317 }
6318
6319 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6320 if (offset) {
6321 offset--;
6322 ln = ln->forward[0];
6323 continue;
6324 }
6325 if (limit == 0) break;
6326 if (!justcount) {
6327 ele = ln->obj;
6328 addReplyBulk(c,ele);
6329 if (withscores)
6330 addReplyDouble(c,ln->score);
6331 }
6332 ln = ln->forward[0];
6333 rangelen++;
6334 if (limit > 0) limit--;
6335 }
6336 if (justcount) {
6337 addReplyLongLong(c,(long)rangelen);
6338 } else {
6339 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6340 withscores ? (rangelen*2) : rangelen);
6341 }
6342 }
6343 }
6344 }
6345
6346 static void zrangebyscoreCommand(redisClient *c) {
6347 genericZrangebyscoreCommand(c,0);
6348 }
6349
6350 static void zcountCommand(redisClient *c) {
6351 genericZrangebyscoreCommand(c,1);
6352 }
6353
6354 static void zcardCommand(redisClient *c) {
6355 robj *o;
6356 zset *zs;
6357
6358 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6359 checkType(c,o,REDIS_ZSET)) return;
6360
6361 zs = o->ptr;
6362 addReplyUlong(c,zs->zsl->length);
6363 }
6364
6365 static void zscoreCommand(redisClient *c) {
6366 robj *o;
6367 zset *zs;
6368 dictEntry *de;
6369
6370 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6371 checkType(c,o,REDIS_ZSET)) return;
6372
6373 zs = o->ptr;
6374 de = dictFind(zs->dict,c->argv[2]);
6375 if (!de) {
6376 addReply(c,shared.nullbulk);
6377 } else {
6378 double *score = dictGetEntryVal(de);
6379
6380 addReplyDouble(c,*score);
6381 }
6382 }
6383
6384 static void zrankGenericCommand(redisClient *c, int reverse) {
6385 robj *o;
6386 zset *zs;
6387 zskiplist *zsl;
6388 dictEntry *de;
6389 unsigned long rank;
6390 double *score;
6391
6392 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6393 checkType(c,o,REDIS_ZSET)) return;
6394
6395 zs = o->ptr;
6396 zsl = zs->zsl;
6397 de = dictFind(zs->dict,c->argv[2]);
6398 if (!de) {
6399 addReply(c,shared.nullbulk);
6400 return;
6401 }
6402
6403 score = dictGetEntryVal(de);
6404 rank = zslGetRank(zsl, *score, c->argv[2]);
6405 if (rank) {
6406 if (reverse) {
6407 addReplyLongLong(c, zsl->length - rank);
6408 } else {
6409 addReplyLongLong(c, rank-1);
6410 }
6411 } else {
6412 addReply(c,shared.nullbulk);
6413 }
6414 }
6415
6416 static void zrankCommand(redisClient *c) {
6417 zrankGenericCommand(c, 0);
6418 }
6419
6420 static void zrevrankCommand(redisClient *c) {
6421 zrankGenericCommand(c, 1);
6422 }
6423
6424 /* ========================= Hashes utility functions ======================= */
6425 #define REDIS_HASH_KEY 1
6426 #define REDIS_HASH_VALUE 2
6427
6428 /* Check the length of a number of objects to see if we need to convert a
6429 * zipmap to a real hash. Note that we only check string encoded objects
6430 * as their string length can be queried in constant time. */
6431 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6432 int i;
6433 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6434
6435 for (i = start; i <= end; i++) {
6436 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6437 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6438 {
6439 convertToRealHash(subject);
6440 return;
6441 }
6442 }
6443 }
6444
6445 /* Encode given objects in-place when the hash uses a dict. */
6446 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6447 if (subject->encoding == REDIS_ENCODING_HT) {
6448 if (o1) *o1 = tryObjectEncoding(*o1);
6449 if (o2) *o2 = tryObjectEncoding(*o2);
6450 }
6451 }
6452
6453 /* Get the value from a hash identified by key. Returns either a string
6454 * object or NULL if the value cannot be found. The refcount of the object
6455 * is always increased by 1 when the value was found. */
6456 static robj *hashGet(robj *o, robj *key) {
6457 robj *value = NULL;
6458 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6459 unsigned char *v;
6460 unsigned int vlen;
6461 key = getDecodedObject(key);
6462 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6463 value = createStringObject((char*)v,vlen);
6464 }
6465 decrRefCount(key);
6466 } else {
6467 dictEntry *de = dictFind(o->ptr,key);
6468 if (de != NULL) {
6469 value = dictGetEntryVal(de);
6470 incrRefCount(value);
6471 }
6472 }
6473 return value;
6474 }
6475
6476 /* Test if the key exists in the given hash. Returns 1 if the key
6477 * exists and 0 when it doesn't. */
6478 static int hashExists(robj *o, robj *key) {
6479 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6480 key = getDecodedObject(key);
6481 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6482 decrRefCount(key);
6483 return 1;
6484 }
6485 decrRefCount(key);
6486 } else {
6487 if (dictFind(o->ptr,key) != NULL) {
6488 return 1;
6489 }
6490 }
6491 return 0;
6492 }
6493
6494 /* Add an element, discard the old if the key already exists.
6495 * Return 0 on insert and 1 on update. */
6496 static int hashSet(robj *o, robj *key, robj *value) {
6497 int update = 0;
6498 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6499 key = getDecodedObject(key);
6500 value = getDecodedObject(value);
6501 o->ptr = zipmapSet(o->ptr,
6502 key->ptr,sdslen(key->ptr),
6503 value->ptr,sdslen(value->ptr), &update);
6504 decrRefCount(key);
6505 decrRefCount(value);
6506
6507 /* Check if the zipmap needs to be upgraded to a real hash table */
6508 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6509 convertToRealHash(o);
6510 } else {
6511 if (dictReplace(o->ptr,key,value)) {
6512 /* Insert */
6513 incrRefCount(key);
6514 } else {
6515 /* Update */
6516 update = 1;
6517 }
6518 incrRefCount(value);
6519 }
6520 return update;
6521 }
6522
6523 /* Delete an element from a hash.
6524 * Return 1 on deleted and 0 on not found. */
6525 static int hashDelete(robj *o, robj *key) {
6526 int deleted = 0;
6527 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6528 key = getDecodedObject(key);
6529 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6530 decrRefCount(key);
6531 } else {
6532 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6533 /* Always check if the dictionary needs a resize after a delete. */
6534 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6535 }
6536 return deleted;
6537 }
6538
6539 /* Return the number of elements in a hash. */
6540 static unsigned long hashLength(robj *o) {
6541 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6542 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6543 }
6544
6545 /* Structure to hold hash iteration abstration. Note that iteration over
6546 * hashes involves both fields and values. Because it is possible that
6547 * not both are required, store pointers in the iterator to avoid
6548 * unnecessary memory allocation for fields/values. */
6549 typedef struct {
6550 int encoding;
6551 unsigned char *zi;
6552 unsigned char *zk, *zv;
6553 unsigned int zklen, zvlen;
6554
6555 dictIterator *di;
6556 dictEntry *de;
6557 } hashIterator;
6558
6559 static hashIterator *hashInitIterator(robj *subject) {
6560 hashIterator *hi = zmalloc(sizeof(hashIterator));
6561 hi->encoding = subject->encoding;
6562 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6563 hi->zi = zipmapRewind(subject->ptr);
6564 } else if (hi->encoding == REDIS_ENCODING_HT) {
6565 hi->di = dictGetIterator(subject->ptr);
6566 } else {
6567 redisAssert(NULL);
6568 }
6569 return hi;
6570 }
6571
6572 static void hashReleaseIterator(hashIterator *hi) {
6573 if (hi->encoding == REDIS_ENCODING_HT) {
6574 dictReleaseIterator(hi->di);
6575 }
6576 zfree(hi);
6577 }
6578
6579 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6580 * could be found and REDIS_ERR when the iterator reaches the end. */
6581 static int hashNext(hashIterator *hi) {
6582 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6583 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6584 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6585 } else {
6586 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6587 }
6588 return REDIS_OK;
6589 }
6590
6591 /* Get key or value object at current iteration position.
6592 * This increases the refcount of the field object by 1. */
6593 static robj *hashCurrent(hashIterator *hi, int what) {
6594 robj *o;
6595 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6596 if (what & REDIS_HASH_KEY) {
6597 o = createStringObject((char*)hi->zk,hi->zklen);
6598 } else {
6599 o = createStringObject((char*)hi->zv,hi->zvlen);
6600 }
6601 } else {
6602 if (what & REDIS_HASH_KEY) {
6603 o = dictGetEntryKey(hi->de);
6604 } else {
6605 o = dictGetEntryVal(hi->de);
6606 }
6607 incrRefCount(o);
6608 }
6609 return o;
6610 }
6611
6612 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6613 robj *o = lookupKeyWrite(c->db,key);
6614 if (o == NULL) {
6615 o = createHashObject();
6616 dictAdd(c->db->dict,key,o);
6617 incrRefCount(key);
6618 } else {
6619 if (o->type != REDIS_HASH) {
6620 addReply(c,shared.wrongtypeerr);
6621 return NULL;
6622 }
6623 }
6624 return o;
6625 }
6626
6627 /* ============================= Hash commands ============================== */
6628 static void hsetCommand(redisClient *c) {
6629 int update;
6630 robj *o;
6631
6632 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6633 hashTryConversion(o,c->argv,2,3);
6634 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6635 update = hashSet(o,c->argv[2],c->argv[3]);
6636 addReply(c, update ? shared.czero : shared.cone);
6637 server.dirty++;
6638 }
6639
6640 static void hsetnxCommand(redisClient *c) {
6641 robj *o;
6642 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6643 hashTryConversion(o,c->argv,2,3);
6644
6645 if (hashExists(o, c->argv[2])) {
6646 addReply(c, shared.czero);
6647 } else {
6648 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6649 hashSet(o,c->argv[2],c->argv[3]);
6650 addReply(c, shared.cone);
6651 server.dirty++;
6652 }
6653 }
6654
6655 static void hmsetCommand(redisClient *c) {
6656 int i;
6657 robj *o;
6658
6659 if ((c->argc % 2) == 1) {
6660 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6661 return;
6662 }
6663
6664 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6665 hashTryConversion(o,c->argv,2,c->argc-1);
6666 for (i = 2; i < c->argc; i += 2) {
6667 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6668 hashSet(o,c->argv[i],c->argv[i+1]);
6669 }
6670 addReply(c, shared.ok);
6671 server.dirty++;
6672 }
6673
6674 static void hincrbyCommand(redisClient *c) {
6675 long long value, incr;
6676 robj *o, *current, *new;
6677
6678 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6679 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6680 if ((current = hashGet(o,c->argv[2])) != NULL) {
6681 if (getLongLongFromObjectOrReply(c,current,&value,
6682 "hash value is not an integer") != REDIS_OK) {
6683 decrRefCount(current);
6684 return;
6685 }
6686 decrRefCount(current);
6687 } else {
6688 value = 0;
6689 }
6690
6691 value += incr;
6692 new = createStringObjectFromLongLong(value);
6693 hashTryObjectEncoding(o,&c->argv[2],NULL);
6694 hashSet(o,c->argv[2],new);
6695 decrRefCount(new);
6696 addReplyLongLong(c,value);
6697 server.dirty++;
6698 }
6699
6700 static void hgetCommand(redisClient *c) {
6701 robj *o, *value;
6702 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6703 checkType(c,o,REDIS_HASH)) return;
6704
6705 if ((value = hashGet(o,c->argv[2])) != NULL) {
6706 addReplyBulk(c,value);
6707 decrRefCount(value);
6708 } else {
6709 addReply(c,shared.nullbulk);
6710 }
6711 }
6712
6713 static void hmgetCommand(redisClient *c) {
6714 int i;
6715 robj *o, *value;
6716 o = lookupKeyRead(c->db,c->argv[1]);
6717 if (o != NULL && o->type != REDIS_HASH) {
6718 addReply(c,shared.wrongtypeerr);
6719 }
6720
6721 /* Note the check for o != NULL happens inside the loop. This is
6722 * done because objects that cannot be found are considered to be
6723 * an empty hash. The reply should then be a series of NULLs. */
6724 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6725 for (i = 2; i < c->argc; i++) {
6726 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6727 addReplyBulk(c,value);
6728 decrRefCount(value);
6729 } else {
6730 addReply(c,shared.nullbulk);
6731 }
6732 }
6733 }
6734
6735 static void hdelCommand(redisClient *c) {
6736 robj *o;
6737 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6738 checkType(c,o,REDIS_HASH)) return;
6739
6740 if (hashDelete(o,c->argv[2])) {
6741 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6742 addReply(c,shared.cone);
6743 server.dirty++;
6744 } else {
6745 addReply(c,shared.czero);
6746 }
6747 }
6748
6749 static void hlenCommand(redisClient *c) {
6750 robj *o;
6751 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6752 checkType(c,o,REDIS_HASH)) return;
6753
6754 addReplyUlong(c,hashLength(o));
6755 }
6756
6757 static void genericHgetallCommand(redisClient *c, int flags) {
6758 robj *o, *lenobj, *obj;
6759 unsigned long count = 0;
6760 hashIterator *hi;
6761
6762 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6763 || checkType(c,o,REDIS_HASH)) return;
6764
6765 lenobj = createObject(REDIS_STRING,NULL);
6766 addReply(c,lenobj);
6767 decrRefCount(lenobj);
6768
6769 hi = hashInitIterator(o);
6770 while (hashNext(hi) != REDIS_ERR) {
6771 if (flags & REDIS_HASH_KEY) {
6772 obj = hashCurrent(hi,REDIS_HASH_KEY);
6773 addReplyBulk(c,obj);
6774 decrRefCount(obj);
6775 count++;
6776 }
6777 if (flags & REDIS_HASH_VALUE) {
6778 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6779 addReplyBulk(c,obj);
6780 decrRefCount(obj);
6781 count++;
6782 }
6783 }
6784 hashReleaseIterator(hi);
6785
6786 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6787 }
6788
6789 static void hkeysCommand(redisClient *c) {
6790 genericHgetallCommand(c,REDIS_HASH_KEY);
6791 }
6792
6793 static void hvalsCommand(redisClient *c) {
6794 genericHgetallCommand(c,REDIS_HASH_VALUE);
6795 }
6796
6797 static void hgetallCommand(redisClient *c) {
6798 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6799 }
6800
6801 static void hexistsCommand(redisClient *c) {
6802 robj *o;
6803 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6804 checkType(c,o,REDIS_HASH)) return;
6805
6806 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6807 }
6808
6809 static void convertToRealHash(robj *o) {
6810 unsigned char *key, *val, *p, *zm = o->ptr;
6811 unsigned int klen, vlen;
6812 dict *dict = dictCreate(&hashDictType,NULL);
6813
6814 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6815 p = zipmapRewind(zm);
6816 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6817 robj *keyobj, *valobj;
6818
6819 keyobj = createStringObject((char*)key,klen);
6820 valobj = createStringObject((char*)val,vlen);
6821 keyobj = tryObjectEncoding(keyobj);
6822 valobj = tryObjectEncoding(valobj);
6823 dictAdd(dict,keyobj,valobj);
6824 }
6825 o->encoding = REDIS_ENCODING_HT;
6826 o->ptr = dict;
6827 zfree(zm);
6828 }
6829
6830 /* ========================= Non type-specific commands ==================== */
6831
6832 static void flushdbCommand(redisClient *c) {
6833 server.dirty += dictSize(c->db->dict);
6834 touchWatchedKeysOnFlush(c->db->id);
6835 dictEmpty(c->db->dict);
6836 dictEmpty(c->db->expires);
6837 addReply(c,shared.ok);
6838 }
6839
6840 static void flushallCommand(redisClient *c) {
6841 touchWatchedKeysOnFlush(-1);
6842 server.dirty += emptyDb();
6843 addReply(c,shared.ok);
6844 if (server.bgsavechildpid != -1) {
6845 kill(server.bgsavechildpid,SIGKILL);
6846 rdbRemoveTempFile(server.bgsavechildpid);
6847 }
6848 rdbSave(server.dbfilename);
6849 server.dirty++;
6850 }
6851
6852 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6853 redisSortOperation *so = zmalloc(sizeof(*so));
6854 so->type = type;
6855 so->pattern = pattern;
6856 return so;
6857 }
6858
6859 /* Return the value associated to the key with a name obtained
6860 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6861 * The returned object will always have its refcount increased by 1
6862 * when it is non-NULL. */
6863 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6864 char *p, *f;
6865 sds spat, ssub;
6866 robj keyobj, fieldobj, *o;
6867 int prefixlen, sublen, postfixlen, fieldlen;
6868 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6869 struct {
6870 long len;
6871 long free;
6872 char buf[REDIS_SORTKEY_MAX+1];
6873 } keyname, fieldname;
6874
6875 /* If the pattern is "#" return the substitution object itself in order
6876 * to implement the "SORT ... GET #" feature. */
6877 spat = pattern->ptr;
6878 if (spat[0] == '#' && spat[1] == '\0') {
6879 incrRefCount(subst);
6880 return subst;
6881 }
6882
6883 /* The substitution object may be specially encoded. If so we create
6884 * a decoded object on the fly. Otherwise getDecodedObject will just
6885 * increment the ref count, that we'll decrement later. */
6886 subst = getDecodedObject(subst);
6887
6888 ssub = subst->ptr;
6889 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6890 p = strchr(spat,'*');
6891 if (!p) {
6892 decrRefCount(subst);
6893 return NULL;
6894 }
6895
6896 /* Find out if we're dealing with a hash dereference. */
6897 if ((f = strstr(p+1, "->")) != NULL) {
6898 fieldlen = sdslen(spat)-(f-spat);
6899 /* this also copies \0 character */
6900 memcpy(fieldname.buf,f+2,fieldlen-1);
6901 fieldname.len = fieldlen-2;
6902 } else {
6903 fieldlen = 0;
6904 }
6905
6906 prefixlen = p-spat;
6907 sublen = sdslen(ssub);
6908 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6909 memcpy(keyname.buf,spat,prefixlen);
6910 memcpy(keyname.buf+prefixlen,ssub,sublen);
6911 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6912 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6913 keyname.len = prefixlen+sublen+postfixlen;
6914 decrRefCount(subst);
6915
6916 /* Lookup substituted key */
6917 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6918 o = lookupKeyRead(db,&keyobj);
6919 if (o == NULL) return NULL;
6920
6921 if (fieldlen > 0) {
6922 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6923
6924 /* Retrieve value from hash by the field name. This operation
6925 * already increases the refcount of the returned object. */
6926 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6927 o = hashGet(o, &fieldobj);
6928 } else {
6929 if (o->type != REDIS_STRING) return NULL;
6930
6931 /* Every object that this function returns needs to have its refcount
6932 * increased. sortCommand decreases it again. */
6933 incrRefCount(o);
6934 }
6935
6936 return o;
6937 }
6938
6939 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6940 * the additional parameter is not standard but a BSD-specific we have to
6941 * pass sorting parameters via the global 'server' structure */
6942 static int sortCompare(const void *s1, const void *s2) {
6943 const redisSortObject *so1 = s1, *so2 = s2;
6944 int cmp;
6945
6946 if (!server.sort_alpha) {
6947 /* Numeric sorting. Here it's trivial as we precomputed scores */
6948 if (so1->u.score > so2->u.score) {
6949 cmp = 1;
6950 } else if (so1->u.score < so2->u.score) {
6951 cmp = -1;
6952 } else {
6953 cmp = 0;
6954 }
6955 } else {
6956 /* Alphanumeric sorting */
6957 if (server.sort_bypattern) {
6958 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6959 /* At least one compare object is NULL */
6960 if (so1->u.cmpobj == so2->u.cmpobj)
6961 cmp = 0;
6962 else if (so1->u.cmpobj == NULL)
6963 cmp = -1;
6964 else
6965 cmp = 1;
6966 } else {
6967 /* We have both the objects, use strcoll */
6968 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6969 }
6970 } else {
6971 /* Compare elements directly. */
6972 cmp = compareStringObjects(so1->obj,so2->obj);
6973 }
6974 }
6975 return server.sort_desc ? -cmp : cmp;
6976 }
6977
6978 /* The SORT command is the most complex command in Redis. Warning: this code
6979 * is optimized for speed and a bit less for readability.
 *
 * Options recognized by the parser below, in any order after the key:
 * ASC | DESC | ALPHA | LIMIT start count | STORE dstkey | BY pattern |
 * GET pattern (may be repeated).
 *
 * Outline: look up and type check the key, parse the options, copy the
 * element pointers into a vector, load the sort weights, sort the vector,
 * then either reply with the selected range or store it as a list under
 * the STORE key. */
6980 static void sortCommand(redisClient *c) {
6981 list *operations;
6982 int outputlen = 0;
6983 int desc = 0, alpha = 0;
6984 int limit_start = 0, limit_count = -1, start, end;
6985 int j, dontsort = 0, vectorlen;
6986 int getop = 0; /* GET operation counter */
6987 robj *sortval, *sortby = NULL, *storekey = NULL;
6988 redisSortObject *vector; /* Resulting vector to sort */
6989
6990 /* Lookup the key to sort. It must be of the right types */
6991 sortval = lookupKeyRead(c->db,c->argv[1]);
6992 if (sortval == NULL) {
6993 addReply(c,shared.emptymultibulk);
6994 return;
6995 }
6996 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6997 sortval->type != REDIS_ZSET)
6998 {
6999 addReply(c,shared.wrongtypeerr);
7000 return;
7001 }
7002
7003 /* Create a list of operations to perform for every sorted element.
7004 * Operations can be GET/DEL/INCR/DECR (the parser below only ever creates
 * REDIS_SORT_GET operations). The list frees its nodes' values with zfree. */
7005 operations = listCreate();
7006 listSetFreeMethod(operations,zfree);
7007 j = 2;
7008
7009 /* Now we need to protect sortval incrementing its count, in the future
7010 * SORT may have options able to overwrite/delete keys during the sorting
7011 * and the sorted key itself may get destroyed */
7012 incrRefCount(sortval);
7013
7014 /* The SORT command has an SQL-alike syntax, parse it */
7015 while(j < c->argc) {
/* Number of arguments that follow the current one. */
7016 int leftargs = c->argc-j-1;
7017 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7018 desc = 0;
7019 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7020 desc = 1;
7021 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7022 alpha = 1;
7023 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7024 limit_start = atoi(c->argv[j+1]->ptr);
7025 limit_count = atoi(c->argv[j+2]->ptr);
7026 j+=2;
7027 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7028 storekey = c->argv[j+1];
7029 j++;
7030 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7031 sortby = c->argv[j+1];
7032 /* If the BY pattern does not contain '*', i.e. it is constant,
7033 * we don't need to sort nor to lookup the weight keys. */
7034 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7035 j++;
7036 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7037 listAddNodeTail(operations,createSortOperation(
7038 REDIS_SORT_GET,c->argv[j+1]));
7039 getop++;
7040 j++;
7041 } else {
/* Unknown option, or an option missing its arguments: release what
 * was acquired so far and report a syntax error. */
7042 decrRefCount(sortval);
7043 listRelease(operations);
7044 addReply(c,shared.syntaxerr);
7045 return;
7046 }
7047 j++;
7048 }
7049
7050 /* Load the sorting vector with all the objects to sort */
7051 switch(sortval->type) {
7052 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7053 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7054 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7055 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7056 }
7057 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7058 j = 0;
7059
/* The vector stores pointers to the elements of sortval, no reference is
 * taken on them here. */
7060 if (sortval->type == REDIS_LIST) {
7061 list *list = sortval->ptr;
7062 listNode *ln;
7063 listIter li;
7064
7065 listRewind(list,&li);
7066 while((ln = listNext(&li))) {
7067 robj *ele = ln->value;
7068 vector[j].obj = ele;
7069 vector[j].u.score = 0;
7070 vector[j].u.cmpobj = NULL;
7071 j++;
7072 }
7073 } else {
7074 dict *set;
7075 dictIterator *di;
7076 dictEntry *setele;
7077
7078 if (sortval->type == REDIS_SET) {
7079 set = sortval->ptr;
7080 } else {
7081 zset *zs = sortval->ptr;
7082 set = zs->dict;
7083 }
7084
7085 di = dictGetIterator(set);
7086 while((setele = dictNext(di)) != NULL) {
7087 vector[j].obj = dictGetEntryKey(setele);
7088 vector[j].u.score = 0;
7089 vector[j].u.cmpobj = NULL;
7090 j++;
7091 }
7092 dictReleaseIterator(di);
7093 }
7094 redisAssert(j == vectorlen);
7095
7096 /* Now it's time to load the right scores in the sorting vector */
7097 if (dontsort == 0) {
7098 for (j = 0; j < vectorlen; j++) {
7099 robj *byval;
7100 if (sortby) {
7101 /* lookup value to sort by; when the lookup fails the element keeps the
 * score / cmpobj it was initialized with above */
7102 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7103 if (!byval) continue;
7104 } else {
7105 /* use object itself to sort by */
7106 byval = vector[j].obj;
7107 }
7108
7109 if (alpha) {
/* The decoded compare object is released in the cleanup loop at the end
 * of the function. */
7110 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7111 } else {
7112 if (byval->encoding == REDIS_ENCODING_RAW) {
7113 vector[j].u.score = strtod(byval->ptr,NULL);
7114 } else if (byval->encoding == REDIS_ENCODING_INT) {
7115 /* Don't need to decode the object if it's
7116 * integer-encoded (the only encoding supported) so
7117 * far. We can just cast it */
7118 vector[j].u.score = (long)byval->ptr;
7119 } else {
7120 redisAssert(1 != 1);
7121 }
7122 }
7123
7124 /* when the object was retrieved using lookupKeyByPattern,
7125 * its refcount needs to be decreased. */
7126 if (sortby) {
7127 decrRefCount(byval);
7128 }
7129 }
7130 }
7131
7132 /* We are ready to sort the vector... perform a bit of sanity check
7133 * on the LIMIT option too. We'll use a partial version of quicksort.
 * [start,end] is the inclusive range of vector indexes to output; a start
 * past the last element is turned into an empty range (end == start-1). */
7134 start = (limit_start < 0) ? 0 : limit_start;
7135 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7136 if (start >= vectorlen) {
7137 start = vectorlen-1;
7138 end = vectorlen-2;
7139 }
7140 if (end >= vectorlen) end = vectorlen-1;
7141
7142 if (dontsort == 0) {
/* sortCompare() reads these parameters from the global server struct. */
7143 server.sort_desc = desc;
7144 server.sort_alpha = alpha;
7145 server.sort_bypattern = sortby ? 1 : 0;
/* pqsort is only used with BY when the requested range does not cover the
 * whole vector; every other case uses qsort. */
7146 if (sortby && (start != 0 || end != vectorlen-1))
7147 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7148 else
7149 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7150 }
7151
7152 /* Send command output to the output buffer, performing the specified
7153 * GET/DEL/INCR/DECR operations if any. With GET operations the output
 * holds one item per GET for every element in the range, otherwise one
 * item per element. */
7154 outputlen = getop ? getop*(end-start+1) : end-start+1;
7155 if (storekey == NULL) {
7156 /* STORE option not specified, send the sorting result to client */
7157 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7158 for (j = start; j <= end; j++) {
7159 listNode *ln;
7160 listIter li;
7161
7162 if (!getop) addReplyBulk(c,vector[j].obj);
7163 listRewind(operations,&li);
7164 while((ln = listNext(&li))) {
7165 redisSortOperation *sop = ln->value;
7166 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7167 vector[j].obj);
7168
7169 if (sop->type == REDIS_SORT_GET) {
7170 if (!val) {
7171 addReply(c,shared.nullbulk);
7172 } else {
7173 addReplyBulk(c,val);
7174 decrRefCount(val);
7175 }
7176 } else {
7177 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7178 }
7179 }
7180 }
7181 } else {
7182 robj *listObject = createListObject();
7183 list *listPtr = (list*) listObject->ptr;
7184
7185 /* STORE option specified, set the sorting result as a List object */
7186 for (j = start; j <= end; j++) {
7187 listNode *ln;
7188 listIter li;
7189
7190 if (!getop) {
7191 listAddNodeTail(listPtr,vector[j].obj);
7192 incrRefCount(vector[j].obj);
7193 }
7194 listRewind(operations,&li);
7195 while((ln = listNext(&li))) {
7196 redisSortOperation *sop = ln->value;
7197 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7198 vector[j].obj);
7199
7200 if (sop->type == REDIS_SORT_GET) {
7201 if (!val) {
/* A missing GET value is stored as an empty string. */
7202 listAddNodeTail(listPtr,createStringObject("",0));
7203 } else {
7204 /* We should do a incrRefCount on val because it is
7205 * added to the list, but also a decrRefCount because
7206 * it is returned by lookupKeyByPattern. This results
7207 * in doing nothing at all. */
7208 listAddNodeTail(listPtr,val);
7209 }
7210 } else {
7211 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7212 }
7213 }
7214 }
/* NOTE(review): the key reference is taken only when dictReplace() returns
 * non-zero, presumably meaning a new entry was added - confirm. */
7215 if (dictReplace(c->db->dict,storekey,listObject)) {
7216 incrRefCount(storekey);
7217 }
7218 /* Note: we add 1 because the DB is dirty anyway since even if the
7219 * SORT result is empty a new key is set and maybe the old content
7220 * replaced. */
7221 server.dirty += 1+outputlen;
7222 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7223 }
7224
7225 /* Cleanup: drop the protection reference on sortval, the operations list
 * and the compare objects created for ALPHA + BY sorting. */
7226 decrRefCount(sortval);
7227 listRelease(operations);
7228 for (j = 0; j < vectorlen; j++) {
7229 if (alpha && vector[j].u.cmpobj)
7230 decrRefCount(vector[j].u.cmpobj);
7231 }
7232 zfree(vector);
7233 }
7234
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes. Values below 1024
 * are printed as an integer followed by 'B'; larger values are printed with
 * two decimals and a K, M, G or T suffix (units are powers of 1024).
 * Everything from 1 TiB up is expressed in T, so the buffer is written for
 * every possible 'n'. The longest output is 13 characters plus the
 * terminator; the caller must provide at least that much room. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* 1 TiB and above: without this branch 's' was left untouched. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7255
7256 /* Create the string returned by the INFO command. This is decoupled
7257 * by the INFO command itself as we need to report the same information
7258 * on memory corruption problems.
 *
 * Returns a newly built sds string made of "name:value\r\n" lines. The
 * general section is always present; the master_* fields are appended only
 * when server.masterhost is set, the vm_* fields only when server.vm_enabled
 * is set, and one "dbN:" line is appended for every database that has keys
 * or expires. */
7259 static sds genRedisInfoString(void) {
7260 sds info;
7261 time_t uptime = time(NULL)-server.stat_starttime;
7262 int j;
7263 char hmem[64];
7264
/* Human readable form of the used memory, reported as used_memory_human. */
7265 bytesToHuman(hmem,zmalloc_used_memory());
/* The arguments below must stay in the same order as the format
 * specifiers: one argument per field. */
7266 info = sdscatprintf(sdsempty(),
7267 "redis_version:%s\r\n"
7268 "redis_git_sha1:%s\r\n"
7269 "redis_git_dirty:%d\r\n"
7270 "arch_bits:%s\r\n"
7271 "multiplexing_api:%s\r\n"
7272 "process_id:%ld\r\n"
7273 "uptime_in_seconds:%ld\r\n"
7274 "uptime_in_days:%ld\r\n"
7275 "connected_clients:%d\r\n"
7276 "connected_slaves:%d\r\n"
7277 "blocked_clients:%d\r\n"
7278 "used_memory:%zu\r\n"
7279 "used_memory_human:%s\r\n"
7280 "changes_since_last_save:%lld\r\n"
7281 "bgsave_in_progress:%d\r\n"
7282 "last_save_time:%ld\r\n"
7283 "bgrewriteaof_in_progress:%d\r\n"
7284 "total_connections_received:%lld\r\n"
7285 "total_commands_processed:%lld\r\n"
7286 "expired_keys:%lld\r\n"
7287 "hash_max_zipmap_entries:%zu\r\n"
7288 "hash_max_zipmap_value:%zu\r\n"
7289 "pubsub_channels:%ld\r\n"
7290 "pubsub_patterns:%u\r\n"
7291 "vm_enabled:%d\r\n"
7292 "role:%s\r\n"
7293 ,REDIS_VERSION,
7294 REDIS_GIT_SHA1,
7295 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7296 (sizeof(long) == 8) ? "64" : "32",
7297 aeGetApiName(),
7298 (long) getpid(),
7299 uptime,
7300 uptime/(3600*24),
7301 listLength(server.clients)-listLength(server.slaves),
7302 listLength(server.slaves),
7303 server.blpop_blocked_clients,
7304 zmalloc_used_memory(),
7305 hmem,
7306 server.dirty,
7307 server.bgsavechildpid != -1,
7308 server.lastsave,
7309 server.bgrewritechildpid != -1,
7310 server.stat_numconnections,
7311 server.stat_numcommands,
7312 server.stat_expiredkeys,
7313 server.hash_max_zipmap_entries,
7314 server.hash_max_zipmap_value,
7315 dictSize(server.pubsub_channels),
7316 listLength(server.pubsub_patterns),
7317 server.vm_enabled != 0,
7318 server.masterhost == NULL ? "master" : "slave"
7319 );
/* Replication section, only when a master host is configured. The last
 * interaction age is reported as -1 when there is no master client. */
7320 if (server.masterhost) {
7321 info = sdscatprintf(info,
7322 "master_host:%s\r\n"
7323 "master_port:%d\r\n"
7324 "master_link_status:%s\r\n"
7325 "master_last_io_seconds_ago:%d\r\n"
7326 ,server.masterhost,
7327 server.masterport,
7328 (server.replstate == REDIS_REPL_CONNECTED) ?
7329 "up" : "down",
7330 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7331 );
7332 }
/* Virtual memory section. The values are read between lockThreadedIO()
 * and unlockThreadedIO(). */
7333 if (server.vm_enabled) {
7334 lockThreadedIO();
7335 info = sdscatprintf(info,
7336 "vm_conf_max_memory:%llu\r\n"
7337 "vm_conf_page_size:%llu\r\n"
7338 "vm_conf_pages:%llu\r\n"
7339 "vm_stats_used_pages:%llu\r\n"
7340 "vm_stats_swapped_objects:%llu\r\n"
7341 "vm_stats_swappin_count:%llu\r\n"
7342 "vm_stats_swappout_count:%llu\r\n"
7343 "vm_stats_io_newjobs_len:%lu\r\n"
7344 "vm_stats_io_processing_len:%lu\r\n"
7345 "vm_stats_io_processed_len:%lu\r\n"
7346 "vm_stats_io_active_threads:%lu\r\n"
7347 "vm_stats_blocked_clients:%lu\r\n"
7348 ,(unsigned long long) server.vm_max_memory,
7349 (unsigned long long) server.vm_page_size,
7350 (unsigned long long) server.vm_pages,
7351 (unsigned long long) server.vm_stats_used_pages,
7352 (unsigned long long) server.vm_stats_swapped_objects,
7353 (unsigned long long) server.vm_stats_swapins,
7354 (unsigned long long) server.vm_stats_swapouts,
7355 (unsigned long) listLength(server.io_newjobs),
7356 (unsigned long) listLength(server.io_processing),
7357 (unsigned long) listLength(server.io_processed),
7358 (unsigned long) server.io_active_threads,
7359 (unsigned long) server.vm_blocked_clients
7360 );
7361 unlockThreadedIO();
7362 }
/* Per database statistics: databases with neither keys nor expires are
 * omitted from the output. */
7363 for (j = 0; j < server.dbnum; j++) {
7364 long long keys, vkeys;
7365
7366 keys = dictSize(server.db[j].dict);
7367 vkeys = dictSize(server.db[j].expires);
7368 if (keys || vkeys) {
7369 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7370 j, keys, vkeys);
7371 }
7372 }
7373 return info;
7374 }
7375
7376 static void infoCommand(redisClient *c) {
7377 sds info = genRedisInfoString();
7378 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7379 (unsigned long)sdslen(info)));
7380 addReplySds(c,info);
7381 addReply(c,shared.crlf);
7382 }
7383
7384 static void monitorCommand(redisClient *c) {
7385 /* ignore MONITOR if aleady slave or in monitor mode */
7386 if (c->flags & REDIS_SLAVE) return;
7387
7388 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7389 c->slaveseldb = 0;
7390 listAddNodeTail(server.monitors,c);
7391 addReply(c,shared.ok);
7392 }
7393
7394 /* ================================= Expire ================================= */
7395 static int removeExpire(redisDb *db, robj *key) {
7396 if (dictDelete(db->expires,key) == DICT_OK) {
7397 return 1;
7398 } else {
7399 return 0;
7400 }
7401 }
7402
7403 static int setExpire(redisDb *db, robj *key, time_t when) {
7404 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7405 return 0;
7406 } else {
7407 incrRefCount(key);
7408 return 1;
7409 }
7410 }
7411
7412 /* Return the expire time of the specified key, or -1 if no expire
7413 * is associated with this key (i.e. the key is non volatile) */
7414 static time_t getExpire(redisDb *db, robj *key) {
7415 dictEntry *de;
7416
7417 /* No expire? return ASAP */
7418 if (dictSize(db->expires) == 0 ||
7419 (de = dictFind(db->expires,key)) == NULL) return -1;
7420
7421 return (time_t) dictGetEntryVal(de);
7422 }
7423
7424 static int expireIfNeeded(redisDb *db, robj *key) {
7425 time_t when;
7426 dictEntry *de;
7427
7428 /* No expire? return ASAP */
7429 if (dictSize(db->expires) == 0 ||
7430 (de = dictFind(db->expires,key)) == NULL) return 0;
7431
7432 /* Lookup the expire */
7433 when = (time_t) dictGetEntryVal(de);
7434 if (time(NULL) <= when) return 0;
7435
7436 /* Delete the key */
7437 dictDelete(db->expires,key);
7438 server.stat_expiredkeys++;
7439 return dictDelete(db->dict,key) == DICT_OK;
7440 }
7441
7442 static int deleteIfVolatile(redisDb *db, robj *key) {
7443 dictEntry *de;
7444
7445 /* No expire? return ASAP */
7446 if (dictSize(db->expires) == 0 ||
7447 (de = dictFind(db->expires,key)) == NULL) return 0;
7448
7449 /* Delete the key */
7450 server.dirty++;
7451 server.stat_expiredkeys++;
7452 dictDelete(db->expires,key);
7453 return dictDelete(db->dict,key) == DICT_OK;
7454 }
7455
7456 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7457 dictEntry *de;
7458 time_t seconds;
7459
7460 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7461
7462 seconds -= offset;
7463
7464 de = dictFind(c->db->dict,key);
7465 if (de == NULL) {
7466 addReply(c,shared.czero);
7467 return;
7468 }
7469 if (seconds <= 0) {
7470 if (deleteKey(c->db,key)) server.dirty++;
7471 addReply(c, shared.cone);
7472 return;
7473 } else {
7474 time_t when = time(NULL)+seconds;
7475 if (setExpire(c->db,key,when)) {
7476 addReply(c,shared.cone);
7477 server.dirty++;
7478 } else {
7479 addReply(c,shared.czero);
7480 }
7481 return;
7482 }
7483 }
7484
7485 static void expireCommand(redisClient *c) {
7486 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7487 }
7488
7489 static void expireatCommand(redisClient *c) {
7490 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7491 }
7492
7493 static void ttlCommand(redisClient *c) {
7494 time_t expire;
7495 int ttl = -1;
7496
7497 expire = getExpire(c->db,c->argv[1]);
7498 if (expire != -1) {
7499 ttl = (int) (expire-time(NULL));
7500 if (ttl < 0) ttl = -1;
7501 }
7502 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7503 }
7504
7505 /* ================================ MULTI/EXEC ============================== */
7506
7507 /* Client state initialization for MULTI/EXEC */
7508 static void initClientMultiState(redisClient *c) {
7509 c->mstate.commands = NULL;
7510 c->mstate.count = 0;
7511 }
7512
7513 /* Release all the resources associated with MULTI/EXEC state */
7514 static void freeClientMultiState(redisClient *c) {
7515 int j;
7516
7517 for (j = 0; j < c->mstate.count; j++) {
7518 int i;
7519 multiCmd *mc = c->mstate.commands+j;
7520
7521 for (i = 0; i < mc->argc; i++)
7522 decrRefCount(mc->argv[i]);
7523 zfree(mc->argv);
7524 }
7525 zfree(c->mstate.commands);
7526 }
7527
7528 /* Add a new command into the MULTI commands queue */
7529 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7530 multiCmd *mc;
7531 int j;
7532
7533 c->mstate.commands = zrealloc(c->mstate.commands,
7534 sizeof(multiCmd)*(c->mstate.count+1));
7535 mc = c->mstate.commands+c->mstate.count;
7536 mc->cmd = cmd;
7537 mc->argc = c->argc;
7538 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7539 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7540 for (j = 0; j < c->argc; j++)
7541 incrRefCount(mc->argv[j]);
7542 c->mstate.count++;
7543 }
7544
7545 static void multiCommand(redisClient *c) {
7546 if (c->flags & REDIS_MULTI) {
7547 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7548 return;
7549 }
7550 c->flags |= REDIS_MULTI;
7551 addReply(c,shared.ok);
7552 }
7553
7554 static void discardCommand(redisClient *c) {
7555 if (!(c->flags & REDIS_MULTI)) {
7556 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7557 return;
7558 }
7559
7560 freeClientMultiState(c);
7561 initClientMultiState(c);
7562 c->flags &= (~REDIS_MULTI);
7563 addReply(c,shared.ok);
7564 }
7565
7566 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7567 * implememntation for more information. */
7568 static void execCommandReplicateMulti(redisClient *c) {
7569 struct redisCommand *cmd;
7570 robj *multistring = createStringObject("MULTI",5);
7571
7572 cmd = lookupCommand("multi");
7573 if (server.appendonly)
7574 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7575 if (listLength(server.slaves))
7576 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7577 decrRefCount(multistring);
7578 }
7579
7580 static void execCommand(redisClient *c) {
7581 int j;
7582 robj **orig_argv;
7583 int orig_argc;
7584
7585 if (!(c->flags & REDIS_MULTI)) {
7586 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7587 return;
7588 }
7589
7590 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7591 * A failed EXEC will return a multi bulk nil object. */
7592 if (c->flags & REDIS_DIRTY_CAS) {
7593 freeClientMultiState(c);
7594 initClientMultiState(c);
7595 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7596 unwatchAllKeys(c);
7597 addReply(c,shared.nullmultibulk);
7598 return;
7599 }
7600
7601 /* Replicate a MULTI request now that we are sure the block is executed.
7602 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7603 * both the AOF and the replication link will have the same consistency
7604 * and atomicity guarantees. */
7605 execCommandReplicateMulti(c);
7606
7607 /* Exec all the queued commands */
7608 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7609 orig_argv = c->argv;
7610 orig_argc = c->argc;
7611 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7612 for (j = 0; j < c->mstate.count; j++) {
7613 c->argc = c->mstate.commands[j].argc;
7614 c->argv = c->mstate.commands[j].argv;
7615 call(c,c->mstate.commands[j].cmd);
7616 }
7617 c->argv = orig_argv;
7618 c->argc = orig_argc;
7619 freeClientMultiState(c);
7620 initClientMultiState(c);
7621 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7622 /* Make sure the EXEC command is always replicated / AOF, since we
7623 * always send the MULTI command (we can't know beforehand if the
7624 * next operations will contain at least a modification to the DB). */
7625 server.dirty++;
7626 }
7627
7628 /* =========================== Blocking Operations ========================= */
7629
7630 /* Currently Redis blocking operations support is limited to list POP ops,
7631 * so the current implementation is not fully generic, but it is also not
7632 * completely specific so it will not require a rewrite to support new
7633 * kind of blocking operations in the future.
7634 *
7635 * Still it's important to note that list blocking operations can be already
7636 * used as a notification mechanism in order to implement other blocking
7637 * operations at application level, so there must be a very strong evidence
7638 * of usefulness and generality before new blocking operations are implemented.
7639 *
7640 * This is how the current blocking POP works, we use BLPOP as example:
7641 * - If the user calls BLPOP and the key exists and contains a non empty list
7642 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7643 * if there is no need to block.
7644 * - If instead BLPOP is called and the key does not exist or the list is
7645 * empty we need to block. In order to do so we remove the notification for
7646 * new data to read in the client socket (so that we'll not serve new
7647 * requests if the blocking request is not served). Also we put the client
7648 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7649 * blocking for these keys.
7650 * - If a PUSH operation against a key with blocked clients waiting is
7651 * performed, we serve the first in the list: basically instead of pushing
7652 * the new element inside the list we return it to the (first / oldest)
7653 * blocking client, unblock the client, and remove it from the list.
7654 *
7655 * The above comment and the source code should be enough in order to understand
7656 * the implementation and modify / fix it later.
7657 */
7658
7659 /* Set a client in blocking mode for the specified key, with the specified
7660 * timeout */
7661 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7662 dictEntry *de;
7663 list *l;
7664 int j;
7665
7666 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7667 c->blocking_keys_num = numkeys;
7668 c->blockingto = timeout;
7669 for (j = 0; j < numkeys; j++) {
7670 /* Add the key in the client structure, to map clients -> keys */
7671 c->blocking_keys[j] = keys[j];
7672 incrRefCount(keys[j]);
7673
7674 /* And in the other "side", to map keys -> clients */
7675 de = dictFind(c->db->blocking_keys,keys[j]);
7676 if (de == NULL) {
7677 int retval;
7678
7679 /* For every key we take a list of clients blocked for it */
7680 l = listCreate();
7681 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7682 incrRefCount(keys[j]);
7683 assert(retval == DICT_OK);
7684 } else {
7685 l = dictGetEntryVal(de);
7686 }
7687 listAddNodeTail(l,c);
7688 }
7689 /* Mark the client as a blocked client */
7690 c->flags |= REDIS_BLOCKED;
7691 server.blpop_blocked_clients++;
7692 }
7693
7694 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7695 static void unblockClientWaitingData(redisClient *c) {
7696 dictEntry *de;
7697 list *l;
7698 int j;
7699
7700 assert(c->blocking_keys != NULL);
7701 /* The client may wait for multiple keys, so unblock it for every key. */
7702 for (j = 0; j < c->blocking_keys_num; j++) {
7703 /* Remove this client from the list of clients waiting for this key. */
7704 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7705 assert(de != NULL);
7706 l = dictGetEntryVal(de);
7707 listDelNode(l,listSearchKey(l,c));
7708 /* If the list is empty we need to remove it to avoid wasting memory */
7709 if (listLength(l) == 0)
7710 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7711 decrRefCount(c->blocking_keys[j]);
7712 }
7713 /* Cleanup the client structure */
7714 zfree(c->blocking_keys);
7715 c->blocking_keys = NULL;
7716 c->flags &= (~REDIS_BLOCKED);
7717 server.blpop_blocked_clients--;
7718 /* We want to process data if there is some command waiting
7719 * in the input buffer. Note that this is safe even if
7720 * unblockClientWaitingData() gets called from freeClient() because
7721 * freeClient() will be smart enough to call this function
7722 * *after* c->querybuf was set to NULL. */
7723 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7724 }
7725
7726 /* This should be called from any function PUSHing into lists.
7727 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7728 * 'ele' is the element pushed.
7729 *
7730 * If the function returns 0 there was no client waiting for a list push
7731 * against this key.
7732 *
7733 * If the function returns 1 there was a client waiting for a list push
7734 * against this key, the element was passed to this client thus it's not
7735 * needed to actually add it to the list and the caller should return asap. */
7736 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7737 struct dictEntry *de;
7738 redisClient *receiver;
7739 list *l;
7740 listNode *ln;
7741
7742 de = dictFind(c->db->blocking_keys,key);
7743 if (de == NULL) return 0;
7744 l = dictGetEntryVal(de);
7745 ln = listFirst(l);
7746 assert(ln != NULL);
7747 receiver = ln->value;
7748
7749 addReplySds(receiver,sdsnew("*2\r\n"));
7750 addReplyBulk(receiver,key);
7751 addReplyBulk(receiver,ele);
7752 unblockClientWaitingData(receiver);
7753 return 1;
7754 }
7755
7756 /* Blocking RPOP/LPOP */
7757 static void blockingPopGenericCommand(redisClient *c, int where) {
7758 robj *o;
7759 time_t timeout;
7760 int j;
7761
7762 for (j = 1; j < c->argc-1; j++) {
7763 o = lookupKeyWrite(c->db,c->argv[j]);
7764 if (o != NULL) {
7765 if (o->type != REDIS_LIST) {
7766 addReply(c,shared.wrongtypeerr);
7767 return;
7768 } else {
7769 list *list = o->ptr;
7770 if (listLength(list) != 0) {
7771 /* If the list contains elements fall back to the usual
7772 * non-blocking POP operation */
7773 robj *argv[2], **orig_argv;
7774 int orig_argc;
7775
7776 /* We need to alter the command arguments before to call
7777 * popGenericCommand() as the command takes a single key. */
7778 orig_argv = c->argv;
7779 orig_argc = c->argc;
7780 argv[1] = c->argv[j];
7781 c->argv = argv;
7782 c->argc = 2;
7783
7784 /* Also the return value is different, we need to output
7785 * the multi bulk reply header and the key name. The
7786 * "real" command will add the last element (the value)
7787 * for us. If this souds like an hack to you it's just
7788 * because it is... */
7789 addReplySds(c,sdsnew("*2\r\n"));
7790 addReplyBulk(c,argv[1]);
7791 popGenericCommand(c,where);
7792
7793 /* Fix the client structure with the original stuff */
7794 c->argv = orig_argv;
7795 c->argc = orig_argc;
7796 return;
7797 }
7798 }
7799 }
7800 }
7801 /* If the list is empty or the key does not exists we must block */
7802 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7803 if (timeout > 0) timeout += time(NULL);
7804 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7805 }
7806
7807 static void blpopCommand(redisClient *c) {
7808 blockingPopGenericCommand(c,REDIS_HEAD);
7809 }
7810
7811 static void brpopCommand(redisClient *c) {
7812 blockingPopGenericCommand(c,REDIS_TAIL);
7813 }
7814
7815 /* =============================== Replication ============================= */
7816
7817 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7818 ssize_t nwritten, ret = size;
7819 time_t start = time(NULL);
7820
7821 timeout++;
7822 while(size) {
7823 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7824 nwritten = write(fd,ptr,size);
7825 if (nwritten == -1) return -1;
7826 ptr += nwritten;
7827 size -= nwritten;
7828 }
7829 if ((time(NULL)-start) > timeout) {
7830 errno = ETIMEDOUT;
7831 return -1;
7832 }
7833 }
7834 return ret;
7835 }
7836
7837 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7838 ssize_t nread, totread = 0;
7839 time_t start = time(NULL);
7840
7841 timeout++;
7842 while(size) {
7843 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7844 nread = read(fd,ptr,size);
7845 if (nread == -1) return -1;
7846 ptr += nread;
7847 size -= nread;
7848 totread += nread;
7849 }
7850 if ((time(NULL)-start) > timeout) {
7851 errno = ETIMEDOUT;
7852 return -1;
7853 }
7854 }
7855 return totread;
7856 }
7857
/* Read a line from 'fd' into 'ptr', one byte at a time via syncRead().
 *
 * 'size' is the capacity of the buffer pointed to by 'ptr', including the
 * room needed for the nul terminator. Reading stops at the first '\n',
 * which is not stored; a '\r' immediately before it is overwritten with a
 * nul byte. If the line does not fit, reading stops after 'size'-1 bytes
 * and the truncated, nul terminated content is returned.
 *
 * Returns the number of bytes stored before the terminator was written
 * (a stripped '\r' is still counted), or -1 if syncRead() fails.
 * 'timeout' is passed to every single syncRead() call. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the nul terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored, otherwise a line longer
             * than the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
7878
7879 static void syncCommand(redisClient *c) {
7880 /* ignore SYNC if aleady slave or in monitor mode */
7881 if (c->flags & REDIS_SLAVE) return;
7882
7883 /* SYNC can't be issued when the server has pending data to send to
7884 * the client about already issued commands. We need a fresh reply
7885 * buffer registering the differences between the BGSAVE and the current
7886 * dataset, so that we can copy to other slaves if needed. */
7887 if (listLength(c->reply) != 0) {
7888 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7889 return;
7890 }
7891
7892 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7893 /* Here we need to check if there is a background saving operation
7894 * in progress, or if it is required to start one */
7895 if (server.bgsavechildpid != -1) {
7896 /* Ok a background save is in progress. Let's check if it is a good
7897 * one for replication, i.e. if there is another slave that is
7898 * registering differences since the server forked to save */
7899 redisClient *slave;
7900 listNode *ln;
7901 listIter li;
7902
7903 listRewind(server.slaves,&li);
7904 while((ln = listNext(&li))) {
7905 slave = ln->value;
7906 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7907 }
7908 if (ln) {
7909 /* Perfect, the server is already registering differences for
7910 * another slave. Set the right state, and copy the buffer. */
7911 listRelease(c->reply);
7912 c->reply = listDup(slave->reply);
7913 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7914 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7915 } else {
7916 /* No way, we need to wait for the next BGSAVE in order to
7917 * register differences */
7918 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7919 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7920 }
7921 } else {
7922 /* Ok we don't have a BGSAVE in progress, let's start one */
7923 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7924 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7925 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7926 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7927 return;
7928 }
7929 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7930 }
7931 c->repldbfd = -1;
7932 c->flags |= REDIS_SLAVE;
7933 c->slaveseldb = 0;
7934 listAddNodeTail(server.slaves,c);
7935 return;
7936 }
7937
7938 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7939 redisClient *slave = privdata;
7940 REDIS_NOTUSED(el);
7941 REDIS_NOTUSED(mask);
7942 char buf[REDIS_IOBUF_LEN];
7943 ssize_t nwritten, buflen;
7944
7945 if (slave->repldboff == 0) {
7946 /* Write the bulk write count before to transfer the DB. In theory here
7947 * we don't know how much room there is in the output buffer of the
7948 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7949 * operations) will never be smaller than the few bytes we need. */
7950 sds bulkcount;
7951
7952 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7953 slave->repldbsize);
7954 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7955 {
7956 sdsfree(bulkcount);
7957 freeClient(slave);
7958 return;
7959 }
7960 sdsfree(bulkcount);
7961 }
7962 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7963 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7964 if (buflen <= 0) {
7965 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7966 (buflen == 0) ? "premature EOF" : strerror(errno));
7967 freeClient(slave);
7968 return;
7969 }
7970 if ((nwritten = write(fd,buf,buflen)) == -1) {
7971 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7972 strerror(errno));
7973 freeClient(slave);
7974 return;
7975 }
7976 slave->repldboff += nwritten;
7977 if (slave->repldboff == slave->repldbsize) {
7978 close(slave->repldbfd);
7979 slave->repldbfd = -1;
7980 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7981 slave->replstate = REDIS_REPL_ONLINE;
7982 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7983 sendReplyToClient, slave) == AE_ERR) {
7984 freeClient(slave);
7985 return;
7986 }
7987 addReplySds(slave,sdsempty());
7988 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7989 }
7990 }
7991
7992 /* This function is called at the end of every backgrond saving.
7993 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7994 * otherwise REDIS_ERR is passed to the function.
7995 *
7996 * The goal of this function is to handle slaves waiting for a successful
7997 * background saving in order to perform non-blocking synchronization. */
7998 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7999 listNode *ln;
8000 int startbgsave = 0;
8001 listIter li;
8002
8003 listRewind(server.slaves,&li);
8004 while((ln = listNext(&li))) {
8005 redisClient *slave = ln->value;
8006
8007 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8008 startbgsave = 1;
8009 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8010 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8011 struct redis_stat buf;
8012
8013 if (bgsaveerr != REDIS_OK) {
8014 freeClient(slave);
8015 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8016 continue;
8017 }
8018 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8019 redis_fstat(slave->repldbfd,&buf) == -1) {
8020 freeClient(slave);
8021 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8022 continue;
8023 }
8024 slave->repldboff = 0;
8025 slave->repldbsize = buf.st_size;
8026 slave->replstate = REDIS_REPL_SEND_BULK;
8027 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8028 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8029 freeClient(slave);
8030 continue;
8031 }
8032 }
8033 }
8034 if (startbgsave) {
8035 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8036 listIter li;
8037
8038 listRewind(server.slaves,&li);
8039 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8040 while((ln = listNext(&li))) {
8041 redisClient *slave = ln->value;
8042
8043 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8044 freeClient(slave);
8045 }
8046 }
8047 }
8048 }
8049
8050 static int syncWithMaster(void) {
8051 char buf[1024], tmpfile[256], authcmd[1024];
8052 long dumpsize;
8053 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8054 int dfd, maxtries = 5;
8055
8056 if (fd == -1) {
8057 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8058 strerror(errno));
8059 return REDIS_ERR;
8060 }
8061
8062 /* AUTH with the master if required. */
8063 if(server.masterauth) {
8064 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8065 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8066 close(fd);
8067 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8068 strerror(errno));
8069 return REDIS_ERR;
8070 }
8071 /* Read the AUTH result. */
8072 if (syncReadLine(fd,buf,1024,3600) == -1) {
8073 close(fd);
8074 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8075 strerror(errno));
8076 return REDIS_ERR;
8077 }
8078 if (buf[0] != '+') {
8079 close(fd);
8080 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8081 return REDIS_ERR;
8082 }
8083 }
8084
8085 /* Issue the SYNC command */
8086 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8087 close(fd);
8088 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8089 strerror(errno));
8090 return REDIS_ERR;
8091 }
8092 /* Read the bulk write count */
8093 if (syncReadLine(fd,buf,1024,3600) == -1) {
8094 close(fd);
8095 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8096 strerror(errno));
8097 return REDIS_ERR;
8098 }
8099 if (buf[0] != '$') {
8100 close(fd);
8101 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8102 return REDIS_ERR;
8103 }
8104 dumpsize = strtol(buf+1,NULL,10);
8105 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8106 /* Read the bulk write data on a temp file */
8107 while(maxtries--) {
8108 snprintf(tmpfile,256,
8109 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8110 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8111 if (dfd != -1) break;
8112 sleep(1);
8113 }
8114 if (dfd == -1) {
8115 close(fd);
8116 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8117 return REDIS_ERR;
8118 }
8119 while(dumpsize) {
8120 int nread, nwritten;
8121
8122 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8123 if (nread == -1) {
8124 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8125 strerror(errno));
8126 close(fd);
8127 close(dfd);
8128 return REDIS_ERR;
8129 }
8130 nwritten = write(dfd,buf,nread);
8131 if (nwritten == -1) {
8132 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8133 close(fd);
8134 close(dfd);
8135 return REDIS_ERR;
8136 }
8137 dumpsize -= nread;
8138 }
8139 close(dfd);
8140 if (rename(tmpfile,server.dbfilename) == -1) {
8141 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8142 unlink(tmpfile);
8143 close(fd);
8144 return REDIS_ERR;
8145 }
8146 emptyDb();
8147 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8148 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8149 close(fd);
8150 return REDIS_ERR;
8151 }
8152 server.master = createClient(fd);
8153 server.master->flags |= REDIS_MASTER;
8154 server.master->authenticated = 1;
8155 server.replstate = REDIS_REPL_CONNECTED;
8156 return REDIS_OK;
8157 }
8158
8159 static void slaveofCommand(redisClient *c) {
8160 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8161 !strcasecmp(c->argv[2]->ptr,"one")) {
8162 if (server.masterhost) {
8163 sdsfree(server.masterhost);
8164 server.masterhost = NULL;
8165 if (server.master) freeClient(server.master);
8166 server.replstate = REDIS_REPL_NONE;
8167 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8168 }
8169 } else {
8170 sdsfree(server.masterhost);
8171 server.masterhost = sdsdup(c->argv[1]->ptr);
8172 server.masterport = atoi(c->argv[2]->ptr);
8173 if (server.master) freeClient(server.master);
8174 server.replstate = REDIS_REPL_CONNECT;
8175 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8176 server.masterhost, server.masterport);
8177 }
8178 addReply(c,shared.ok);
8179 }
8180
8181 /* ============================ Maxmemory directive ======================== */
8182
8183 /* Try to free one object form the pre-allocated objects free list.
8184 * This is useful under low mem conditions as by default we take 1 million
8185 * free objects allocated. On success REDIS_OK is returned, otherwise
8186 * REDIS_ERR. */
8187 static int tryFreeOneObjectFromFreelist(void) {
8188 robj *o;
8189
8190 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8191 if (listLength(server.objfreelist)) {
8192 listNode *head = listFirst(server.objfreelist);
8193 o = listNodeValue(head);
8194 listDelNode(server.objfreelist,head);
8195 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8196 zfree(o);
8197 return REDIS_OK;
8198 } else {
8199 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8200 return REDIS_ERR;
8201 }
8202 }
8203
8204 /* This function gets called when 'maxmemory' is set on the config file to limit
8205 * the max memory used by the server, and we are out of memory.
8206 * This function will try to, in order:
8207 *
8208 * - Free objects from the free list
8209 * - Try to remove keys with an EXPIRE set
8210 *
8211 * It is not possible to free enough memory to reach used-memory < maxmemory
8212 * the server will start refusing commands that will enlarge even more the
8213 * memory usage.
8214 */
8215 static void freeMemoryIfNeeded(void) {
8216 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8217 int j, k, freed = 0;
8218
8219 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8220 for (j = 0; j < server.dbnum; j++) {
8221 int minttl = -1;
8222 robj *minkey = NULL;
8223 struct dictEntry *de;
8224
8225 if (dictSize(server.db[j].expires)) {
8226 freed = 1;
8227 /* From a sample of three keys drop the one nearest to
8228 * the natural expire */
8229 for (k = 0; k < 3; k++) {
8230 time_t t;
8231
8232 de = dictGetRandomKey(server.db[j].expires);
8233 t = (time_t) dictGetEntryVal(de);
8234 if (minttl == -1 || t < minttl) {
8235 minkey = dictGetEntryKey(de);
8236 minttl = t;
8237 }
8238 }
8239 deleteKey(server.db+j,minkey);
8240 }
8241 }
8242 if (!freed) return; /* nothing to free... */
8243 }
8244 }
8245
8246 /* ============================== Append Only file ========================== */
8247
8248 /* Write the append only file buffer on disk.
8249 *
8250 * Since we are required to write the AOF before replying to the client,
8251 * and the only way the client socket can get a write is entering when the
8252 * the event loop, we accumulate all the AOF writes in a memory
8253 * buffer and write it on disk using this function just before entering
8254 * the event loop again. */
8255 static void flushAppendOnlyFile(void) {
8256 time_t now;
8257 ssize_t nwritten;
8258
8259 if (sdslen(server.aofbuf) == 0) return;
8260
8261 /* We want to perform a single write. This should be guaranteed atomic
8262 * at least if the filesystem we are writing is a real physical one.
8263 * While this will save us against the server being killed I don't think
8264 * there is much to do about the whole server stopping for power problems
8265 * or alike */
8266 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8267 if (nwritten != (signed)sdslen(server.aofbuf)) {
8268 /* Ooops, we are in troubles. The best thing to do for now is
8269 * aborting instead of giving the illusion that everything is
8270 * working as expected. */
8271 if (nwritten == -1) {
8272 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8273 } else {
8274 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8275 }
8276 exit(1);
8277 }
8278 sdsfree(server.aofbuf);
8279 server.aofbuf = sdsempty();
8280
8281 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8282 * childs performing heavy I/O on disk. */
8283 if (server.no_appendfsync_on_rewrite &&
8284 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8285 return;
8286 /* Fsync if needed */
8287 now = time(NULL);
8288 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8289 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8290 now-server.lastfsync > 1))
8291 {
8292 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8293 * flushing metadata. */
8294 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8295 server.lastfsync = now;
8296 }
8297 }
8298
8299 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8300 int j;
8301 buf = sdscatprintf(buf,"*%d\r\n",argc);
8302 for (j = 0; j < argc; j++) {
8303 robj *o = getDecodedObject(argv[j]);
8304 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8305 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8306 buf = sdscatlen(buf,"\r\n",2);
8307 decrRefCount(o);
8308 }
8309 return buf;
8310 }
8311
8312 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8313 int argc = 3;
8314 long when;
8315 robj *argv[3];
8316
8317 /* Make sure we can use strtol */
8318 seconds = getDecodedObject(seconds);
8319 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8320 decrRefCount(seconds);
8321
8322 argv[0] = createStringObject("EXPIREAT",8);
8323 argv[1] = key;
8324 argv[2] = createObject(REDIS_STRING,
8325 sdscatprintf(sdsempty(),"%ld",when));
8326 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8327 decrRefCount(argv[0]);
8328 decrRefCount(argv[2]);
8329 return buf;
8330 }
8331
8332 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8333 sds buf = sdsempty();
8334 robj *tmpargv[3];
8335
8336 /* The DB this command was targetting is not the same as the last command
8337 * we appendend. To issue a SELECT command is needed. */
8338 if (dictid != server.appendseldb) {
8339 char seldb[64];
8340
8341 snprintf(seldb,sizeof(seldb),"%d",dictid);
8342 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8343 (unsigned long)strlen(seldb),seldb);
8344 server.appendseldb = dictid;
8345 }
8346
8347 if (cmd->proc == expireCommand) {
8348 /* Translate EXPIRE into EXPIREAT */
8349 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8350 } else if (cmd->proc == setexCommand) {
8351 /* Translate SETEX to SET and EXPIREAT */
8352 tmpargv[0] = createStringObject("SET",3);
8353 tmpargv[1] = argv[1];
8354 tmpargv[2] = argv[3];
8355 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8356 decrRefCount(tmpargv[0]);
8357 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8358 } else {
8359 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8360 }
8361
8362 /* Append to the AOF buffer. This will be flushed on disk just before
8363 * of re-entering the event loop, so before the client will get a
8364 * positive reply about the operation performed. */
8365 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8366
8367 /* If a background append only file rewriting is in progress we want to
8368 * accumulate the differences between the child DB and the current one
8369 * in a buffer, so that when the child process will do its work we
8370 * can append the differences to the new append only file. */
8371 if (server.bgrewritechildpid != -1)
8372 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8373
8374 sdsfree(buf);
8375 }
8376
8377 /* In Redis commands are always executed in the context of a client, so in
8378 * order to load the append only file we need to create a fake client. */
8379 static struct redisClient *createFakeClient(void) {
8380 struct redisClient *c = zmalloc(sizeof(*c));
8381
8382 selectDb(c,0);
8383 c->fd = -1;
8384 c->querybuf = sdsempty();
8385 c->argc = 0;
8386 c->argv = NULL;
8387 c->flags = 0;
8388 /* We set the fake client as a slave waiting for the synchronization
8389 * so that Redis will not try to send replies to this client. */
8390 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8391 c->reply = listCreate();
8392 listSetFreeMethod(c->reply,decrRefCount);
8393 listSetDupMethod(c->reply,dupClientReplyValue);
8394 initClientMultiState(c);
8395 return c;
8396 }
8397
8398 static void freeFakeClient(struct redisClient *c) {
8399 sdsfree(c->querybuf);
8400 listRelease(c->reply);
8401 freeClientMultiState(c);
8402 zfree(c);
8403 }
8404
8405 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8406 * error (the append only file is zero-length) REDIS_ERR is returned. On
8407 * fatal error an error message is logged and the program exists. */
8408 int loadAppendOnlyFile(char *filename) {
8409 struct redisClient *fakeClient;
8410 FILE *fp = fopen(filename,"r");
8411 struct redis_stat sb;
8412 unsigned long long loadedkeys = 0;
8413 int appendonly = server.appendonly;
8414
8415 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8416 return REDIS_ERR;
8417
8418 if (fp == NULL) {
8419 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8420 exit(1);
8421 }
8422
8423 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8424 * to the same file we're about to read. */
8425 server.appendonly = 0;
8426
8427 fakeClient = createFakeClient();
8428 while(1) {
8429 int argc, j;
8430 unsigned long len;
8431 robj **argv;
8432 char buf[128];
8433 sds argsds;
8434 struct redisCommand *cmd;
8435
8436 if (fgets(buf,sizeof(buf),fp) == NULL) {
8437 if (feof(fp))
8438 break;
8439 else
8440 goto readerr;
8441 }
8442 if (buf[0] != '*') goto fmterr;
8443 argc = atoi(buf+1);
8444 argv = zmalloc(sizeof(robj*)*argc);
8445 for (j = 0; j < argc; j++) {
8446 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8447 if (buf[0] != '$') goto fmterr;
8448 len = strtol(buf+1,NULL,10);
8449 argsds = sdsnewlen(NULL,len);
8450 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8451 argv[j] = createObject(REDIS_STRING,argsds);
8452 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8453 }
8454
8455 /* Command lookup */
8456 cmd = lookupCommand(argv[0]->ptr);
8457 if (!cmd) {
8458 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8459 exit(1);
8460 }
8461 /* Try object encoding */
8462 if (cmd->flags & REDIS_CMD_BULK)
8463 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8464 /* Run the command in the context of a fake client */
8465 fakeClient->argc = argc;
8466 fakeClient->argv = argv;
8467 cmd->proc(fakeClient);
8468 /* Discard the reply objects list from the fake client */
8469 while(listLength(fakeClient->reply))
8470 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8471 /* Clean up, ready for the next command */
8472 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8473 zfree(argv);
8474 /* Handle swapping while loading big datasets when VM is on */
8475 loadedkeys++;
8476 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8477 while (zmalloc_used_memory() > server.vm_max_memory) {
8478 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8479 }
8480 }
8481 }
8482
8483 /* This point can only be reached when EOF is reached without errors.
8484 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8485 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8486
8487 fclose(fp);
8488 freeFakeClient(fakeClient);
8489 server.appendonly = appendonly;
8490 return REDIS_OK;
8491
8492 readerr:
8493 if (feof(fp)) {
8494 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8495 } else {
8496 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8497 }
8498 exit(1);
8499 fmterr:
8500 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8501 exit(1);
8502 }
8503
8504 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8505 static int fwriteBulkObject(FILE *fp, robj *obj) {
8506 char buf[128];
8507 int decrrc = 0;
8508
8509 /* Avoid the incr/decr ref count business if possible to help
8510 * copy-on-write (we are often in a child process when this function
8511 * is called).
8512 * Also makes sure that key objects don't get incrRefCount-ed when VM
8513 * is enabled */
8514 if (obj->encoding != REDIS_ENCODING_RAW) {
8515 obj = getDecodedObject(obj);
8516 decrrc = 1;
8517 }
8518 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8519 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8520 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8521 goto err;
8522 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8523 if (decrrc) decrRefCount(obj);
8524 return 1;
8525 err:
8526 if (decrrc) decrRefCount(obj);
8527 return 0;
8528 }
8529
/* Write the binary-safe string 's' of 'len' bytes to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * The payload write is skipped when 'len' is zero.
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8541
/* Write the double 'd' to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the value formatted
 * with "%.17g". Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is built together with its trailing CRLF, which is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8552
/* Write the long 'l' to 'fp' in the bulk format $<count>\r\n<payload>\r\n,
 * where the payload is the value in decimal. Returns 1 on success, 0 if
 * any of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is built together with its trailing CRLF, which is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8563
8564 /* Write a sequence of commands able to fully rebuild the dataset into
8565 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8566 static int rewriteAppendOnlyFile(char *filename) {
8567 dictIterator *di = NULL;
8568 dictEntry *de;
8569 FILE *fp;
8570 char tmpfile[256];
8571 int j;
8572 time_t now = time(NULL);
8573
8574 /* Note that we have to use a different temp name here compared to the
8575 * one used by rewriteAppendOnlyFileBackground() function. */
8576 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8577 fp = fopen(tmpfile,"w");
8578 if (!fp) {
8579 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8580 return REDIS_ERR;
8581 }
8582 for (j = 0; j < server.dbnum; j++) {
8583 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8584 redisDb *db = server.db+j;
8585 dict *d = db->dict;
8586 if (dictSize(d) == 0) continue;
8587 di = dictGetIterator(d);
8588 if (!di) {
8589 fclose(fp);
8590 return REDIS_ERR;
8591 }
8592
8593 /* SELECT the new DB */
8594 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8595 if (fwriteBulkLong(fp,j) == 0) goto werr;
8596
8597 /* Iterate this DB writing every entry */
8598 while((de = dictNext(di)) != NULL) {
8599 robj *key, *o;
8600 time_t expiretime;
8601 int swapped;
8602
8603 key = dictGetEntryKey(de);
8604 /* If the value for this key is swapped, load a preview in memory.
8605 * We use a "swapped" flag to remember if we need to free the
8606 * value object instead to just increment the ref count anyway
8607 * in order to avoid copy-on-write of pages if we are forked() */
8608 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8609 key->storage == REDIS_VM_SWAPPING) {
8610 o = dictGetEntryVal(de);
8611 swapped = 0;
8612 } else {
8613 o = vmPreviewObject(key);
8614 swapped = 1;
8615 }
8616 expiretime = getExpire(db,key);
8617
8618 /* Save the key and associated value */
8619 if (o->type == REDIS_STRING) {
8620 /* Emit a SET command */
8621 char cmd[]="*3\r\n$3\r\nSET\r\n";
8622 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8623 /* Key and value */
8624 if (fwriteBulkObject(fp,key) == 0) goto werr;
8625 if (fwriteBulkObject(fp,o) == 0) goto werr;
8626 } else if (o->type == REDIS_LIST) {
8627 /* Emit the RPUSHes needed to rebuild the list */
8628 list *list = o->ptr;
8629 listNode *ln;
8630 listIter li;
8631
8632 listRewind(list,&li);
8633 while((ln = listNext(&li))) {
8634 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8635 robj *eleobj = listNodeValue(ln);
8636
8637 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8638 if (fwriteBulkObject(fp,key) == 0) goto werr;
8639 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8640 }
8641 } else if (o->type == REDIS_SET) {
8642 /* Emit the SADDs needed to rebuild the set */
8643 dict *set = o->ptr;
8644 dictIterator *di = dictGetIterator(set);
8645 dictEntry *de;
8646
8647 while((de = dictNext(di)) != NULL) {
8648 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8649 robj *eleobj = dictGetEntryKey(de);
8650
8651 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8652 if (fwriteBulkObject(fp,key) == 0) goto werr;
8653 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8654 }
8655 dictReleaseIterator(di);
8656 } else if (o->type == REDIS_ZSET) {
8657 /* Emit the ZADDs needed to rebuild the sorted set */
8658 zset *zs = o->ptr;
8659 dictIterator *di = dictGetIterator(zs->dict);
8660 dictEntry *de;
8661
8662 while((de = dictNext(di)) != NULL) {
8663 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8664 robj *eleobj = dictGetEntryKey(de);
8665 double *score = dictGetEntryVal(de);
8666
8667 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8668 if (fwriteBulkObject(fp,key) == 0) goto werr;
8669 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8670 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8671 }
8672 dictReleaseIterator(di);
8673 } else if (o->type == REDIS_HASH) {
8674 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8675
8676 /* Emit the HSETs needed to rebuild the hash */
8677 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8678 unsigned char *p = zipmapRewind(o->ptr);
8679 unsigned char *field, *val;
8680 unsigned int flen, vlen;
8681
8682 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8683 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8684 if (fwriteBulkObject(fp,key) == 0) goto werr;
8685 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8686 return -1;
8687 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8688 return -1;
8689 }
8690 } else {
8691 dictIterator *di = dictGetIterator(o->ptr);
8692 dictEntry *de;
8693
8694 while((de = dictNext(di)) != NULL) {
8695 robj *field = dictGetEntryKey(de);
8696 robj *val = dictGetEntryVal(de);
8697
8698 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8699 if (fwriteBulkObject(fp,key) == 0) goto werr;
8700 if (fwriteBulkObject(fp,field) == -1) return -1;
8701 if (fwriteBulkObject(fp,val) == -1) return -1;
8702 }
8703 dictReleaseIterator(di);
8704 }
8705 } else {
8706 redisPanic("Unknown object type");
8707 }
8708 /* Save the expire time */
8709 if (expiretime != -1) {
8710 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8711 /* If this key is already expired skip it */
8712 if (expiretime < now) continue;
8713 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8714 if (fwriteBulkObject(fp,key) == 0) goto werr;
8715 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8716 }
8717 if (swapped) decrRefCount(o);
8718 }
8719 dictReleaseIterator(di);
8720 }
8721
8722 /* Make sure data will not remain on the OS's output buffers */
8723 fflush(fp);
8724 aof_fsync(fileno(fp));
8725 fclose(fp);
8726
8727 /* Use RENAME to make sure the DB file is changed atomically only
8728 * if the generate DB file is ok. */
8729 if (rename(tmpfile,filename) == -1) {
8730 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8731 unlink(tmpfile);
8732 return REDIS_ERR;
8733 }
8734 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8735 return REDIS_OK;
8736
8737 werr:
8738 fclose(fp);
8739 unlink(tmpfile);
8740 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8741 if (di) dictReleaseIterator(di);
8742 return REDIS_ERR;
8743 }
8744
8745 /* This is how rewriting of the append only file in background works:
8746 *
8747 * 1) The user calls BGREWRITEAOF
8748 * 2) Redis calls this function, that forks():
8749 * 2a) the child rewrite the append only file in a temp file.
8750 * 2b) the parent accumulates differences in server.bgrewritebuf.
8751 * 3) When the child finished '2a' exists.
8752 * 4) The parent will trap the exit code, if it's OK, will append the
8753 * data accumulated into server.bgrewritebuf into the temp file, and
8754 * finally will rename(2) the temp file in the actual file name.
8755 * The the new file is reopened as the new append only file. Profit!
8756 */
8757 static int rewriteAppendOnlyFileBackground(void) {
8758 pid_t childpid;
8759
8760 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8761 if (server.vm_enabled) waitEmptyIOJobsQueue();
8762 if ((childpid = fork()) == 0) {
8763 /* Child */
8764 char tmpfile[256];
8765
8766 if (server.vm_enabled) vmReopenSwapFile();
8767 close(server.fd);
8768 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8769 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8770 _exit(0);
8771 } else {
8772 _exit(1);
8773 }
8774 } else {
8775 /* Parent */
8776 if (childpid == -1) {
8777 redisLog(REDIS_WARNING,
8778 "Can't rewrite append only file in background: fork: %s",
8779 strerror(errno));
8780 return REDIS_ERR;
8781 }
8782 redisLog(REDIS_NOTICE,
8783 "Background append only file rewriting started by pid %d",childpid);
8784 server.bgrewritechildpid = childpid;
8785 updateDictResizePolicy();
8786 /* We set appendseldb to -1 in order to force the next call to the
8787 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8788 * accumulated by the parent into server.bgrewritebuf will start
8789 * with a SELECT statement and it will be safe to merge. */
8790 server.appendseldb = -1;
8791 return REDIS_OK;
8792 }
8793 return REDIS_OK; /* unreached */
8794 }
8795
8796 static void bgrewriteaofCommand(redisClient *c) {
8797 if (server.bgrewritechildpid != -1) {
8798 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8799 return;
8800 }
8801 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8802 char *status = "+Background append only file rewriting started\r\n";
8803 addReplySds(c,sdsnew(status));
8804 } else {
8805 addReply(c,shared.err);
8806 }
8807 }
8808
/* Remove the temp file a background AOF rewrite child with pid 'childpid'
 * writes to. The name is built with the same "temp-rewriteaof-bg-<pid>.aof"
 * pattern used by rewriteAppendOnlyFileBackground(). The result of
 * unlink() is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8815
8816 /* Virtual Memory is composed mainly of two subsystems:
8817 * - Blocking Virtual Memory
8818 * - Threaded Virtual Memory I/O
8819 * The two parts are not fully decoupled, but functions are split among two
8820 * different sections of the source code (delimited by comments) in order to
8821 * make more clear what functionality is about the blocking VM and what about
8822 * the threaded (not blocking) VM.
8823 *
8824 * Redis VM design:
8825 *
8826 * Redis VM is a blocking VM (one that blocks reading swapped values from
8827 * disk into memory when a value swapped out is needed in memory) that is made
8828 * unblocking by trying to examine the command argument vector in order to
8829 * load in background values that will likely be needed in order to exec
8830 * the command. The command is executed only once all the relevant keys
8831 * are loaded into memory.
8832 *
8833 * This basically is almost as simple as a blocking VM, but almost as parallel
8834 * as a fully non-blocking VM.
8835 */
8836
8837 /* Called when the user switches from "appendonly yes" to "appendonly no"
8838 * at runtime using the CONFIG command. */
8839 static void stopAppendOnly(void) {
8840 flushAppendOnlyFile();
8841 aof_fsync(server.appendfd);
8842 close(server.appendfd);
8843
8844 server.appendfd = -1;
8845 server.appendseldb = -1;
8846 server.appendonly = 0;
8847 /* rewrite operation in progress? kill it, wait child exit */
8848 if (server.bgsavechildpid != -1) {
8849 int statloc;
8850
8851 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8852 wait3(&statloc,0,NULL);
8853 /* reset the buffer accumulating changes while the child saves */
8854 sdsfree(server.bgrewritebuf);
8855 server.bgrewritebuf = sdsempty();
8856 server.bgsavechildpid = -1;
8857 }
8858 }
8859
8860 /* Called when the user switches from "appendonly no" to "appendonly yes"
8861 * at runtime using the CONFIG command. */
8862 static int startAppendOnly(void) {
8863 server.appendonly = 1;
8864 server.lastfsync = time(NULL);
8865 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8866 if (server.appendfd == -1) {
8867 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8868 return REDIS_ERR;
8869 }
8870 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8871 server.appendonly = 0;
8872 close(server.appendfd);
8873 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8874 return REDIS_ERR;
8875 }
8876 return REDIS_OK;
8877 }
8878
8879 /* =================== Virtual Memory - Blocking Side ====================== */
8880
8881 static void vmInit(void) {
8882 off_t totsize;
8883 int pipefds[2];
8884 size_t stacksize;
8885 struct flock fl;
8886
8887 if (server.vm_max_threads != 0)
8888 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8889
8890 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8891 /* Try to open the old swap file, otherwise create it */
8892 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8893 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8894 }
8895 if (server.vm_fp == NULL) {
8896 redisLog(REDIS_WARNING,
8897 "Can't open the swap file: %s. Exiting.",
8898 strerror(errno));
8899 exit(1);
8900 }
8901 server.vm_fd = fileno(server.vm_fp);
8902 /* Lock the swap file for writing, this is useful in order to avoid
8903 * another instance to use the same swap file for a config error. */
8904 fl.l_type = F_WRLCK;
8905 fl.l_whence = SEEK_SET;
8906 fl.l_start = fl.l_len = 0;
8907 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8908 redisLog(REDIS_WARNING,
8909 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8910 exit(1);
8911 }
8912 /* Initialize */
8913 server.vm_next_page = 0;
8914 server.vm_near_pages = 0;
8915 server.vm_stats_used_pages = 0;
8916 server.vm_stats_swapped_objects = 0;
8917 server.vm_stats_swapouts = 0;
8918 server.vm_stats_swapins = 0;
8919 totsize = server.vm_pages*server.vm_page_size;
8920 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8921 if (ftruncate(server.vm_fd,totsize) == -1) {
8922 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8923 strerror(errno));
8924 exit(1);
8925 } else {
8926 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8927 }
8928 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8929 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8930 (long long) (server.vm_pages+7)/8, server.vm_pages);
8931 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8932
8933 /* Initialize threaded I/O (used by Virtual Memory) */
8934 server.io_newjobs = listCreate();
8935 server.io_processing = listCreate();
8936 server.io_processed = listCreate();
8937 server.io_ready_clients = listCreate();
8938 pthread_mutex_init(&server.io_mutex,NULL);
8939 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8940 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8941 server.io_active_threads = 0;
8942 if (pipe(pipefds) == -1) {
8943 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8944 ,strerror(errno));
8945 exit(1);
8946 }
8947 server.io_ready_pipe_read = pipefds[0];
8948 server.io_ready_pipe_write = pipefds[1];
8949 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8950 /* LZF requires a lot of stack */
8951 pthread_attr_init(&server.io_threads_attr);
8952 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8953 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8954 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8955 /* Listen for events in the threaded I/O pipe */
8956 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8957 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8958 oom("creating file event");
8959 }
8960
8961 /* Mark the page as used */
8962 static void vmMarkPageUsed(off_t page) {
8963 off_t byte = page/8;
8964 int bit = page&7;
8965 redisAssert(vmFreePage(page) == 1);
8966 server.vm_bitmap[byte] |= 1<<bit;
8967 }
8968
8969 /* Mark N contiguous pages as used, with 'page' being the first. */
8970 static void vmMarkPagesUsed(off_t page, off_t count) {
8971 off_t j;
8972
8973 for (j = 0; j < count; j++)
8974 vmMarkPageUsed(page+j);
8975 server.vm_stats_used_pages += count;
8976 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8977 (long long)count, (long long)page);
8978 }
8979
8980 /* Mark the page as free */
8981 static void vmMarkPageFree(off_t page) {
8982 off_t byte = page/8;
8983 int bit = page&7;
8984 redisAssert(vmFreePage(page) == 0);
8985 server.vm_bitmap[byte] &= ~(1<<bit);
8986 }
8987
8988 /* Mark N contiguous pages as free, with 'page' being the first. */
8989 static void vmMarkPagesFree(off_t page, off_t count) {
8990 off_t j;
8991
8992 for (j = 0; j < count; j++)
8993 vmMarkPageFree(page+j);
8994 server.vm_stats_used_pages -= count;
8995 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8996 (long long)count, (long long)page);
8997 }
8998
8999 /* Test if the page is free */
9000 static int vmFreePage(off_t page) {
9001 off_t byte = page/8;
9002 int bit = page&7;
9003 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9004 }
9005
9006 /* Find N contiguous free pages storing the first page of the cluster in *first.
9007 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9008 * REDIS_ERR is returned.
9009 *
9010 * This function uses a simple algorithm: we try to allocate
9011 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9012 * again from the start of the swap file searching for free spaces.
9013 *
9014 * If it looks pretty clear that there are no free pages near our offset
9015 * we try to find less populated places doing a forward jump of
9016 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9017 * without hurry, and then we jump again and so forth...
9018 *
9019 * This function can be improved using a free list to avoid to guess
9020 * too much, since we could collect data about freed pages.
9021 *
9022 * note: I implemented this function just after watching an episode of
9023 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9024 */
9025 static int vmFindContiguousPages(off_t *first, off_t n) {
9026 off_t base, offset = 0, since_jump = 0, numfree = 0;
9027
9028 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9029 server.vm_near_pages = 0;
9030 server.vm_next_page = 0;
9031 }
9032 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9033 base = server.vm_next_page;
9034
9035 while(offset < server.vm_pages) {
9036 off_t this = base+offset;
9037
9038 /* If we overflow, restart from page zero */
9039 if (this >= server.vm_pages) {
9040 this -= server.vm_pages;
9041 if (this == 0) {
9042 /* Just overflowed, what we found on tail is no longer
9043 * interesting, as it's no longer contiguous. */
9044 numfree = 0;
9045 }
9046 }
9047 if (vmFreePage(this)) {
9048 /* This is a free page */
9049 numfree++;
9050 /* Already got N free pages? Return to the caller, with success */
9051 if (numfree == n) {
9052 *first = this-(n-1);
9053 server.vm_next_page = this+1;
9054 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9055 return REDIS_OK;
9056 }
9057 } else {
9058 /* The current one is not a free page */
9059 numfree = 0;
9060 }
9061
9062 /* Fast-forward if the current page is not free and we already
9063 * searched enough near this place. */
9064 since_jump++;
9065 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9066 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9067 since_jump = 0;
9068 /* Note that even if we rewind after the jump, we are don't need
9069 * to make sure numfree is set to zero as we only jump *if* it
9070 * is set to zero. */
9071 } else {
9072 /* Otherwise just check the next page */
9073 offset++;
9074 }
9075 }
9076 return REDIS_ERR;
9077 }
9078
9079 /* Write the specified object at the specified page of the swap file */
9080 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9081 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9082 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9083 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9084 redisLog(REDIS_WARNING,
9085 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9086 strerror(errno));
9087 return REDIS_ERR;
9088 }
9089 rdbSaveObject(server.vm_fp,o);
9090 fflush(server.vm_fp);
9091 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9092 return REDIS_OK;
9093 }
9094
9095 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9096 * needed to later retrieve the object into the key object.
9097 * If we can't find enough contiguous empty pages to swap the object on disk
9098 * REDIS_ERR is returned. */
9099 static int vmSwapObjectBlocking(robj *key, robj *val) {
9100 off_t pages = rdbSavedObjectPages(val,NULL);
9101 off_t page;
9102
9103 assert(key->storage == REDIS_VM_MEMORY);
9104 assert(key->refcount == 1);
9105 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9106 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9107 key->vm.page = page;
9108 key->vm.usedpages = pages;
9109 key->storage = REDIS_VM_SWAPPED;
9110 key->vtype = val->type;
9111 decrRefCount(val); /* Deallocate the object from memory. */
9112 vmMarkPagesUsed(page,pages);
9113 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9114 (unsigned char*) key->ptr,
9115 (unsigned long long) page, (unsigned long long) pages);
9116 server.vm_stats_swapped_objects++;
9117 server.vm_stats_swapouts++;
9118 return REDIS_OK;
9119 }
9120
9121 static robj *vmReadObjectFromSwap(off_t page, int type) {
9122 robj *o;
9123
9124 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9125 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9126 redisLog(REDIS_WARNING,
9127 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9128 strerror(errno));
9129 _exit(1);
9130 }
9131 o = rdbLoadObject(type,server.vm_fp);
9132 if (o == NULL) {
9133 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9134 _exit(1);
9135 }
9136 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9137 return o;
9138 }
9139
9140 /* Load the value object relative to the 'key' object from swap to memory.
9141 * The newly allocated object is returned.
9142 *
9143 * If preview is true the unserialized object is returned to the caller but
9144 * no changes are made to the key object, nor the pages are marked as freed */
9145 static robj *vmGenericLoadObject(robj *key, int preview) {
9146 robj *val;
9147
9148 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9149 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9150 if (!preview) {
9151 key->storage = REDIS_VM_MEMORY;
9152 key->vm.atime = server.unixtime;
9153 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9154 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9155 (unsigned char*) key->ptr);
9156 server.vm_stats_swapped_objects--;
9157 } else {
9158 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9159 (unsigned char*) key->ptr);
9160 }
9161 server.vm_stats_swapins++;
9162 return val;
9163 }
9164
9165 /* Plain object loading, from swap to memory */
9166 static robj *vmLoadObject(robj *key) {
9167 /* If we are loading the object in background, stop it, we
9168 * need to load this object synchronously ASAP. */
9169 if (key->storage == REDIS_VM_LOADING)
9170 vmCancelThreadedIOJob(key);
9171 return vmGenericLoadObject(key,0);
9172 }
9173
9174 /* Just load the value on disk, without to modify the key.
9175 * This is useful when we want to perform some operation on the value
9176 * without to really bring it from swap to memory, like while saving the
9177 * dataset or rewriting the append only log. */
9178 static robj *vmPreviewObject(robj *key) {
9179 return vmGenericLoadObject(key,1);
9180 }
9181
9182 /* How a good candidate is this object for swapping?
9183 * The better candidate it is, the greater the returned value.
9184 *
9185 * Currently we try to perform a fast estimation of the object size in
9186 * memory, and combine it with aging informations.
9187 *
9188 * Basically swappability = idle-time * log(estimated size)
9189 *
9190 * Bigger objects are preferred over smaller objects, but not
9191 * proportionally, this is why we use the logarithm. This algorithm is
9192 * just a first try and will probably be tuned later. */
9193 static double computeObjectSwappability(robj *o) {
9194 time_t age = server.unixtime - o->vm.atime;
9195 long asize = 0;
9196 list *l;
9197 dict *d;
9198 struct dictEntry *de;
9199 int z;
9200
9201 if (age <= 0) return 0;
9202 switch(o->type) {
9203 case REDIS_STRING:
9204 if (o->encoding != REDIS_ENCODING_RAW) {
9205 asize = sizeof(*o);
9206 } else {
9207 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9208 }
9209 break;
9210 case REDIS_LIST:
9211 l = o->ptr;
9212 listNode *ln = listFirst(l);
9213
9214 asize = sizeof(list);
9215 if (ln) {
9216 robj *ele = ln->value;
9217 long elesize;
9218
9219 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9220 (sizeof(*o)+sdslen(ele->ptr)) :
9221 sizeof(*o);
9222 asize += (sizeof(listNode)+elesize)*listLength(l);
9223 }
9224 break;
9225 case REDIS_SET:
9226 case REDIS_ZSET:
9227 z = (o->type == REDIS_ZSET);
9228 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9229
9230 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9231 if (z) asize += sizeof(zset)-sizeof(dict);
9232 if (dictSize(d)) {
9233 long elesize;
9234 robj *ele;
9235
9236 de = dictGetRandomKey(d);
9237 ele = dictGetEntryKey(de);
9238 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9239 (sizeof(*o)+sdslen(ele->ptr)) :
9240 sizeof(*o);
9241 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9242 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9243 }
9244 break;
9245 case REDIS_HASH:
9246 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9247 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9248 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9249 unsigned int klen, vlen;
9250 unsigned char *key, *val;
9251
9252 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9253 klen = 0;
9254 vlen = 0;
9255 }
9256 asize = len*(klen+vlen+3);
9257 } else if (o->encoding == REDIS_ENCODING_HT) {
9258 d = o->ptr;
9259 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9260 if (dictSize(d)) {
9261 long elesize;
9262 robj *ele;
9263
9264 de = dictGetRandomKey(d);
9265 ele = dictGetEntryKey(de);
9266 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9267 (sizeof(*o)+sdslen(ele->ptr)) :
9268 sizeof(*o);
9269 ele = dictGetEntryVal(de);
9270 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9271 (sizeof(*o)+sdslen(ele->ptr)) :
9272 sizeof(*o);
9273 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9274 }
9275 }
9276 break;
9277 }
9278 return (double)age*log(1+asize);
9279 }
9280
9281 /* Try to swap an object that's a good candidate for swapping.
9282 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9283 * to swap any object at all.
9284 *
9285 * If 'usethreaded' is true, Redis will try to swap the object in background
9286 * using I/O threads. */
9287 static int vmSwapOneObject(int usethreads) {
9288 int j, i;
9289 struct dictEntry *best = NULL;
9290 double best_swappability = 0;
9291 redisDb *best_db = NULL;
9292 robj *key, *val;
9293
9294 for (j = 0; j < server.dbnum; j++) {
9295 redisDb *db = server.db+j;
9296 /* Why maxtries is set to 100?
9297 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9298 * are swappable objects */
9299 int maxtries = 100;
9300
9301 if (dictSize(db->dict) == 0) continue;
9302 for (i = 0; i < 5; i++) {
9303 dictEntry *de;
9304 double swappability;
9305
9306 if (maxtries) maxtries--;
9307 de = dictGetRandomKey(db->dict);
9308 key = dictGetEntryKey(de);
9309 val = dictGetEntryVal(de);
9310 /* Only swap objects that are currently in memory.
9311 *
9312 * Also don't swap shared objects if threaded VM is on, as we
9313 * try to ensure that the main thread does not touch the
9314 * object while the I/O thread is using it, but we can't
9315 * control other keys without adding additional mutex. */
9316 if (key->storage != REDIS_VM_MEMORY ||
9317 (server.vm_max_threads != 0 && val->refcount != 1)) {
9318 if (maxtries) i--; /* don't count this try */
9319 continue;
9320 }
9321 swappability = computeObjectSwappability(val);
9322 if (!best || swappability > best_swappability) {
9323 best = de;
9324 best_swappability = swappability;
9325 best_db = db;
9326 }
9327 }
9328 }
9329 if (best == NULL) return REDIS_ERR;
9330 key = dictGetEntryKey(best);
9331 val = dictGetEntryVal(best);
9332
9333 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9334 key->ptr, best_swappability);
9335
9336 /* Unshare the key if needed */
9337 if (key->refcount > 1) {
9338 robj *newkey = dupStringObject(key);
9339 decrRefCount(key);
9340 key = dictGetEntryKey(best) = newkey;
9341 }
9342 /* Swap it */
9343 if (usethreads) {
9344 vmSwapObjectThreaded(key,val,best_db);
9345 return REDIS_OK;
9346 } else {
9347 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9348 dictGetEntryVal(best) = NULL;
9349 return REDIS_OK;
9350 } else {
9351 return REDIS_ERR;
9352 }
9353 }
9354 }
9355
/* Swap one object out synchronously: vmSwapOneObject() with the threaded
 * I/O disabled. Returns what vmSwapOneObject() returns.
 *
 * The parameter list is declared as (void) so that the definition is a
 * real prototype, and calls passing arguments are rejected by the
 * compiler. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9359
/* Swap one object out using the I/O threads: vmSwapOneObject() with the
 * threaded I/O enabled. Returns what vmSwapOneObject() returns.
 *
 * The parameter list is declared as (void) so that the definition is a
 * real prototype, and calls passing arguments are rejected by the
 * compiler. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9363
9364 /* Return true if it's safe to swap out objects in a given moment.
9365 * Basically we don't want to swap objects out while there is a BGSAVE
9366 * or a BGAEOREWRITE running in backgroud. */
9367 static int vmCanSwapOut(void) {
9368 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9369 }
9370
9371 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9372 * and was deleted. Otherwise 0 is returned. */
9373 static int deleteIfSwapped(redisDb *db, robj *key) {
9374 dictEntry *de;
9375 robj *foundkey;
9376
9377 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9378 foundkey = dictGetEntryKey(de);
9379 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9380 deleteKey(db,key);
9381 return 1;
9382 }
9383
9384 /* =================== Virtual Memory - Threaded I/O ======================= */
9385
9386 static void freeIOJob(iojob *j) {
9387 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9388 j->type == REDIS_IOJOB_DO_SWAP ||
9389 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9390 decrRefCount(j->val);
9391 /* We don't decrRefCount the j->key field as we did't incremented
9392 * the count creating IO Jobs. This is because the key field here is
9393 * just used as an indentifier and if a key is removed the Job should
9394 * never be touched again. */
9395 zfree(j);
9396 }
9397
9398 /* Every time a thread finished a Job, it writes a byte into the write side
9399 * of an unix pipe in order to "awake" the main thread, and this function
9400 * is called. */
9401 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9402 int mask)
9403 {
9404 char buf[1];
9405 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9406 REDIS_NOTUSED(el);
9407 REDIS_NOTUSED(mask);
9408 REDIS_NOTUSED(privdata);
9409
9410 /* For every byte we read in the read side of the pipe, there is one
9411 * I/O job completed to process. */
9412 while((retval = read(fd,buf,1)) == 1) {
9413 iojob *j;
9414 listNode *ln;
9415 robj *key;
9416 struct dictEntry *de;
9417
9418 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9419
9420 /* Get the processed element (the oldest one) */
9421 lockThreadedIO();
9422 assert(listLength(server.io_processed) != 0);
9423 if (toprocess == -1) {
9424 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9425 if (toprocess <= 0) toprocess = 1;
9426 }
9427 ln = listFirst(server.io_processed);
9428 j = ln->value;
9429 listDelNode(server.io_processed,ln);
9430 unlockThreadedIO();
9431 /* If this job is marked as canceled, just ignore it */
9432 if (j->canceled) {
9433 freeIOJob(j);
9434 continue;
9435 }
9436 /* Post process it in the main thread, as there are things we
9437 * can do just here to avoid race conditions and/or invasive locks */
9438 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9439 de = dictFind(j->db->dict,j->key);
9440 assert(de != NULL);
9441 key = dictGetEntryKey(de);
9442 if (j->type == REDIS_IOJOB_LOAD) {
9443 redisDb *db;
9444
9445 /* Key loaded, bring it at home */
9446 key->storage = REDIS_VM_MEMORY;
9447 key->vm.atime = server.unixtime;
9448 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9449 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9450 (unsigned char*) key->ptr);
9451 server.vm_stats_swapped_objects--;
9452 server.vm_stats_swapins++;
9453 dictGetEntryVal(de) = j->val;
9454 incrRefCount(j->val);
9455 db = j->db;
9456 freeIOJob(j);
9457 /* Handle clients waiting for this key to be loaded. */
9458 handleClientsBlockedOnSwappedKey(db,key);
9459 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9460 /* Now we know the amount of pages required to swap this object.
9461 * Let's find some space for it, and queue this task again
9462 * rebranded as REDIS_IOJOB_DO_SWAP. */
9463 if (!vmCanSwapOut() ||
9464 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9465 {
9466 /* Ooops... no space or we can't swap as there is
9467 * a fork()ed Redis trying to save stuff on disk. */
9468 freeIOJob(j);
9469 key->storage = REDIS_VM_MEMORY; /* undo operation */
9470 } else {
9471 /* Note that we need to mark this pages as used now,
9472 * if the job will be canceled, we'll mark them as freed
9473 * again. */
9474 vmMarkPagesUsed(j->page,j->pages);
9475 j->type = REDIS_IOJOB_DO_SWAP;
9476 lockThreadedIO();
9477 queueIOJob(j);
9478 unlockThreadedIO();
9479 }
9480 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9481 robj *val;
9482
9483 /* Key swapped. We can finally free some memory. */
9484 if (key->storage != REDIS_VM_SWAPPING) {
9485 printf("key->storage: %d\n",key->storage);
9486 printf("key->name: %s\n",(char*)key->ptr);
9487 printf("key->refcount: %d\n",key->refcount);
9488 printf("val: %p\n",(void*)j->val);
9489 printf("val->type: %d\n",j->val->type);
9490 printf("val->ptr: %s\n",(char*)j->val->ptr);
9491 }
9492 redisAssert(key->storage == REDIS_VM_SWAPPING);
9493 val = dictGetEntryVal(de);
9494 key->vm.page = j->page;
9495 key->vm.usedpages = j->pages;
9496 key->storage = REDIS_VM_SWAPPED;
9497 key->vtype = j->val->type;
9498 decrRefCount(val); /* Deallocate the object from memory. */
9499 dictGetEntryVal(de) = NULL;
9500 redisLog(REDIS_DEBUG,
9501 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9502 (unsigned char*) key->ptr,
9503 (unsigned long long) j->page, (unsigned long long) j->pages);
9504 server.vm_stats_swapped_objects++;
9505 server.vm_stats_swapouts++;
9506 freeIOJob(j);
9507 /* Put a few more swap requests in queue if we are still
9508 * out of memory */
9509 if (trytoswap && vmCanSwapOut() &&
9510 zmalloc_used_memory() > server.vm_max_memory)
9511 {
9512 int more = 1;
9513 while(more) {
9514 lockThreadedIO();
9515 more = listLength(server.io_newjobs) <
9516 (unsigned) server.vm_max_threads;
9517 unlockThreadedIO();
9518 /* Don't waste CPU time if swappable objects are rare. */
9519 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9520 trytoswap = 0;
9521 break;
9522 }
9523 }
9524 }
9525 }
9526 processed++;
9527 if (processed == toprocess) return;
9528 }
9529 if (retval < 0 && errno != EAGAIN) {
9530 redisLog(REDIS_WARNING,
9531 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9532 strerror(errno));
9533 }
9534 }
9535
9536 static void lockThreadedIO(void) {
9537 pthread_mutex_lock(&server.io_mutex);
9538 }
9539
9540 static void unlockThreadedIO(void) {
9541 pthread_mutex_unlock(&server.io_mutex);
9542 }
9543
9544 /* Remove the specified object from the threaded I/O queue if still not
9545 * processed, otherwise make sure to flag it as canceled. */
9546 static void vmCancelThreadedIOJob(robj *o) {
9547 list *lists[3] = {
9548 server.io_newjobs, /* 0 */
9549 server.io_processing, /* 1 */
9550 server.io_processed /* 2 */
9551 };
9552 int i;
9553
9554 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9555 again:
9556 lockThreadedIO();
9557 /* Search for a matching key in one of the queues */
9558 for (i = 0; i < 3; i++) {
9559 listNode *ln;
9560 listIter li;
9561
9562 listRewind(lists[i],&li);
9563 while ((ln = listNext(&li)) != NULL) {
9564 iojob *job = ln->value;
9565
9566 if (job->canceled) continue; /* Skip this, already canceled. */
9567 if (job->key == o) {
9568 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9569 (void*)job, (char*)o->ptr, job->type, i);
9570 /* Mark the pages as free since the swap didn't happened
9571 * or happened but is now discarded. */
9572 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9573 vmMarkPagesFree(job->page,job->pages);
9574 /* Cancel the job. It depends on the list the job is
9575 * living in. */
9576 switch(i) {
9577 case 0: /* io_newjobs */
9578 /* If the job was yet not processed the best thing to do
9579 * is to remove it from the queue at all */
9580 freeIOJob(job);
9581 listDelNode(lists[i],ln);
9582 break;
9583 case 1: /* io_processing */
9584 /* Oh Shi- the thread is messing with the Job:
9585 *
9586 * Probably it's accessing the object if this is a
9587 * PREPARE_SWAP or DO_SWAP job.
9588 * If it's a LOAD job it may be reading from disk and
9589 * if we don't wait for the job to terminate before to
9590 * cancel it, maybe in a few microseconds data can be
9591 * corrupted in this pages. So the short story is:
9592 *
9593 * Better to wait for the job to move into the
9594 * next queue (processed)... */
9595
9596 /* We try again and again until the job is completed. */
9597 unlockThreadedIO();
9598 /* But let's wait some time for the I/O thread
9599 * to finish with this job. After all this condition
9600 * should be very rare. */
9601 usleep(1);
9602 goto again;
9603 case 2: /* io_processed */
9604 /* The job was already processed, that's easy...
9605 * just mark it as canceled so that we'll ignore it
9606 * when processing completed jobs. */
9607 job->canceled = 1;
9608 break;
9609 }
9610 /* Finally we have to adjust the storage type of the object
9611 * in order to "UNDO" the operaiton. */
9612 if (o->storage == REDIS_VM_LOADING)
9613 o->storage = REDIS_VM_SWAPPED;
9614 else if (o->storage == REDIS_VM_SWAPPING)
9615 o->storage = REDIS_VM_MEMORY;
9616 unlockThreadedIO();
9617 return;
9618 }
9619 }
9620 }
9621 unlockThreadedIO();
9622 assert(1 != 1); /* We should never reach this */
9623 }
9624
9625 static void *IOThreadEntryPoint(void *arg) {
9626 iojob *j;
9627 listNode *ln;
9628 REDIS_NOTUSED(arg);
9629
9630 pthread_detach(pthread_self());
9631 while(1) {
9632 /* Get a new job to process */
9633 lockThreadedIO();
9634 if (listLength(server.io_newjobs) == 0) {
9635 /* No new jobs in queue, exit. */
9636 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9637 (long) pthread_self());
9638 server.io_active_threads--;
9639 unlockThreadedIO();
9640 return NULL;
9641 }
9642 ln = listFirst(server.io_newjobs);
9643 j = ln->value;
9644 listDelNode(server.io_newjobs,ln);
9645 /* Add the job in the processing queue */
9646 j->thread = pthread_self();
9647 listAddNodeTail(server.io_processing,j);
9648 ln = listLast(server.io_processing); /* We use ln later to remove it */
9649 unlockThreadedIO();
9650 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9651 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9652
9653 /* Process the Job */
9654 if (j->type == REDIS_IOJOB_LOAD) {
9655 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9656 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9657 FILE *fp = fopen("/dev/null","w+");
9658 j->pages = rdbSavedObjectPages(j->val,fp);
9659 fclose(fp);
9660 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9661 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9662 j->canceled = 1;
9663 }
9664
9665 /* Done: insert the job into the processed queue */
9666 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9667 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9668 lockThreadedIO();
9669 listDelNode(server.io_processing,ln);
9670 listAddNodeTail(server.io_processed,j);
9671 unlockThreadedIO();
9672
9673 /* Signal the main thread there is new stuff to process */
9674 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9675 }
9676 return NULL; /* never reached */
9677 }
9678
9679 static void spawnIOThread(void) {
9680 pthread_t thread;
9681 sigset_t mask, omask;
9682 int err;
9683
9684 sigemptyset(&mask);
9685 sigaddset(&mask,SIGCHLD);
9686 sigaddset(&mask,SIGHUP);
9687 sigaddset(&mask,SIGPIPE);
9688 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9689 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9690 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9691 strerror(err));
9692 usleep(1000000);
9693 }
9694 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9695 server.io_active_threads++;
9696 }
9697
9698 /* We need to wait for the last thread to exit before we are able to
9699 * fork() in order to BGSAVE or BGREWRITEAOF. */
9700 static void waitEmptyIOJobsQueue(void) {
9701 while(1) {
9702 int io_processed_len;
9703
9704 lockThreadedIO();
9705 if (listLength(server.io_newjobs) == 0 &&
9706 listLength(server.io_processing) == 0 &&
9707 server.io_active_threads == 0)
9708 {
9709 unlockThreadedIO();
9710 return;
9711 }
9712 /* While waiting for empty jobs queue condition we post-process some
9713 * finshed job, as I/O threads may be hanging trying to write against
9714 * the io_ready_pipe_write FD but there are so much pending jobs that
9715 * it's blocking. */
9716 io_processed_len = listLength(server.io_processed);
9717 unlockThreadedIO();
9718 if (io_processed_len) {
9719 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9720 usleep(1000); /* 1 millisecond */
9721 } else {
9722 usleep(10000); /* 10 milliseconds */
9723 }
9724 }
9725 }
9726
9727 static void vmReopenSwapFile(void) {
9728 /* Note: we don't close the old one as we are in the child process
9729 * and don't want to mess at all with the original file object. */
9730 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9731 if (server.vm_fp == NULL) {
9732 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9733 server.vm_swap_file);
9734 _exit(1);
9735 }
9736 server.vm_fd = fileno(server.vm_fp);
9737 }
9738
9739 /* This function must be called while with threaded IO locked */
9740 static void queueIOJob(iojob *j) {
9741 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9742 (void*)j, j->type, (char*)j->key->ptr);
9743 listAddNodeTail(server.io_newjobs,j);
9744 if (server.io_active_threads < server.vm_max_threads)
9745 spawnIOThread();
9746 }
9747
9748 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9749 iojob *j;
9750
9751 assert(key->storage == REDIS_VM_MEMORY);
9752 assert(key->refcount == 1);
9753
9754 j = zmalloc(sizeof(*j));
9755 j->type = REDIS_IOJOB_PREPARE_SWAP;
9756 j->db = db;
9757 j->key = key;
9758 j->val = val;
9759 incrRefCount(val);
9760 j->canceled = 0;
9761 j->thread = (pthread_t) -1;
9762 key->storage = REDIS_VM_SWAPPING;
9763
9764 lockThreadedIO();
9765 queueIOJob(j);
9766 unlockThreadedIO();
9767 return REDIS_OK;
9768 }
9769
9770 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9771
9772 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9773 * If there is not already a job loading the key, it is craeted.
9774 * The key is added to the io_keys list in the client structure, and also
9775 * in the hash table mapping swapped keys to waiting clients, that is,
9776 * server.io_waited_keys. */
9777 static int waitForSwappedKey(redisClient *c, robj *key) {
9778 struct dictEntry *de;
9779 robj *o;
9780 list *l;
9781
9782 /* If the key does not exist or is already in RAM we don't need to
9783 * block the client at all. */
9784 de = dictFind(c->db->dict,key);
9785 if (de == NULL) return 0;
9786 o = dictGetEntryKey(de);
9787 if (o->storage == REDIS_VM_MEMORY) {
9788 return 0;
9789 } else if (o->storage == REDIS_VM_SWAPPING) {
9790 /* We were swapping the key, undo it! */
9791 vmCancelThreadedIOJob(o);
9792 return 0;
9793 }
9794
9795 /* OK: the key is either swapped, or being loaded just now. */
9796
9797 /* Add the key to the list of keys this client is waiting for.
9798 * This maps clients to keys they are waiting for. */
9799 listAddNodeTail(c->io_keys,key);
9800 incrRefCount(key);
9801
9802 /* Add the client to the swapped keys => clients waiting map. */
9803 de = dictFind(c->db->io_keys,key);
9804 if (de == NULL) {
9805 int retval;
9806
9807 /* For every key we take a list of clients blocked for it */
9808 l = listCreate();
9809 retval = dictAdd(c->db->io_keys,key,l);
9810 incrRefCount(key);
9811 assert(retval == DICT_OK);
9812 } else {
9813 l = dictGetEntryVal(de);
9814 }
9815 listAddNodeTail(l,c);
9816
9817 /* Are we already loading the key from disk? If not create a job */
9818 if (o->storage == REDIS_VM_SWAPPED) {
9819 iojob *j;
9820
9821 o->storage = REDIS_VM_LOADING;
9822 j = zmalloc(sizeof(*j));
9823 j->type = REDIS_IOJOB_LOAD;
9824 j->db = c->db;
9825 j->key = o;
9826 j->key->vtype = o->vtype;
9827 j->page = o->vm.page;
9828 j->val = NULL;
9829 j->canceled = 0;
9830 j->thread = (pthread_t) -1;
9831 lockThreadedIO();
9832 queueIOJob(j);
9833 unlockThreadedIO();
9834 }
9835 return 1;
9836 }
9837
9838 /* Preload keys for any command with first, last and step values for
9839 * the command keys prototype, as defined in the command table. */
9840 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9841 int j, last;
9842 if (cmd->vm_firstkey == 0) return;
9843 last = cmd->vm_lastkey;
9844 if (last < 0) last = argc+last;
9845 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9846 redisAssert(j < argc);
9847 waitForSwappedKey(c,argv[j]);
9848 }
9849 }
9850
9851 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9852 * Note that the number of keys to preload is user-defined, so we need to
9853 * apply a sanity check against argc. */
9854 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9855 int i, num;
9856 REDIS_NOTUSED(cmd);
9857
9858 num = atoi(argv[2]->ptr);
9859 if (num > (argc-3)) return;
9860 for (i = 0; i < num; i++) {
9861 waitForSwappedKey(c,argv[3+i]);
9862 }
9863 }
9864
9865 /* Preload keys needed to execute the entire MULTI/EXEC block.
9866 *
9867 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9868 * and will block the client when any command requires a swapped out value. */
9869 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9870 int i, margc;
9871 struct redisCommand *mcmd;
9872 robj **margv;
9873 REDIS_NOTUSED(cmd);
9874 REDIS_NOTUSED(argc);
9875 REDIS_NOTUSED(argv);
9876
9877 if (!(c->flags & REDIS_MULTI)) return;
9878 for (i = 0; i < c->mstate.count; i++) {
9879 mcmd = c->mstate.commands[i].cmd;
9880 margc = c->mstate.commands[i].argc;
9881 margv = c->mstate.commands[i].argv;
9882
9883 if (mcmd->vm_preload_proc != NULL) {
9884 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9885 } else {
9886 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9887 }
9888 }
9889 }
9890
9891 /* Is this client attempting to run a command against swapped keys?
9892 * If so, block it ASAP, load the keys in background, then resume it.
9893 *
9894 * The important idea about this function is that it can fail! If keys will
9895 * still be swapped when the client is resumed, this key lookups will
9896 * just block loading keys from disk. In practical terms this should only
9897 * happen with SORT BY command or if there is a bug in this function.
9898 *
9899 * Return 1 if the client is marked as blocked, 0 if the client can
9900 * continue as the keys it is going to access appear to be in memory. */
9901 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9902 if (cmd->vm_preload_proc != NULL) {
9903 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9904 } else {
9905 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9906 }
9907
9908 /* If the client was blocked for at least one key, mark it as blocked. */
9909 if (listLength(c->io_keys)) {
9910 c->flags |= REDIS_IO_WAIT;
9911 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9912 server.vm_blocked_clients++;
9913 return 1;
9914 } else {
9915 return 0;
9916 }
9917 }
9918
9919 /* Remove the 'key' from the list of blocked keys for a given client.
9920 *
9921 * The function returns 1 when there are no longer blocking keys after
9922 * the current one was removed (and the client can be unblocked). */
9923 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9924 list *l;
9925 listNode *ln;
9926 listIter li;
9927 struct dictEntry *de;
9928
9929 /* Remove the key from the list of keys this client is waiting for. */
9930 listRewind(c->io_keys,&li);
9931 while ((ln = listNext(&li)) != NULL) {
9932 if (equalStringObjects(ln->value,key)) {
9933 listDelNode(c->io_keys,ln);
9934 break;
9935 }
9936 }
9937 assert(ln != NULL);
9938
9939 /* Remove the client form the key => waiting clients map. */
9940 de = dictFind(c->db->io_keys,key);
9941 assert(de != NULL);
9942 l = dictGetEntryVal(de);
9943 ln = listSearchKey(l,c);
9944 assert(ln != NULL);
9945 listDelNode(l,ln);
9946 if (listLength(l) == 0)
9947 dictDelete(c->db->io_keys,key);
9948
9949 return listLength(c->io_keys) == 0;
9950 }
9951
9952 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9953 struct dictEntry *de;
9954 list *l;
9955 listNode *ln;
9956 int len;
9957
9958 de = dictFind(db->io_keys,key);
9959 if (!de) return;
9960
9961 l = dictGetEntryVal(de);
9962 len = listLength(l);
9963 /* Note: we can't use something like while(listLength(l)) as the list
9964 * can be freed by the calling function when we remove the last element. */
9965 while (len--) {
9966 ln = listFirst(l);
9967 redisClient *c = ln->value;
9968
9969 if (dontWaitForSwappedKey(c,key)) {
9970 /* Put the client in the list of clients ready to go as we
9971 * loaded all the keys about it. */
9972 listAddNodeTail(server.io_ready_clients,c);
9973 }
9974 }
9975 }
9976
9977 /* =========================== Remote Configuration ========================= */
9978
9979 static void configSetCommand(redisClient *c) {
9980 robj *o = getDecodedObject(c->argv[3]);
9981 long long ll;
9982
9983 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9984 zfree(server.dbfilename);
9985 server.dbfilename = zstrdup(o->ptr);
9986 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9987 zfree(server.requirepass);
9988 server.requirepass = zstrdup(o->ptr);
9989 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9990 zfree(server.masterauth);
9991 server.masterauth = zstrdup(o->ptr);
9992 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9993 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9994 ll < 0) goto badfmt;
9995 server.maxmemory = ll;
9996 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9997 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9998 ll < 0 || ll > LONG_MAX) goto badfmt;
9999 server.maxidletime = ll;
10000 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10001 if (!strcasecmp(o->ptr,"no")) {
10002 server.appendfsync = APPENDFSYNC_NO;
10003 } else if (!strcasecmp(o->ptr,"everysec")) {
10004 server.appendfsync = APPENDFSYNC_EVERYSEC;
10005 } else if (!strcasecmp(o->ptr,"always")) {
10006 server.appendfsync = APPENDFSYNC_ALWAYS;
10007 } else {
10008 goto badfmt;
10009 }
10010 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10011 int yn = yesnotoi(o->ptr);
10012
10013 if (yn == -1) goto badfmt;
10014 server.no_appendfsync_on_rewrite = yn;
10015 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10016 int old = server.appendonly;
10017 int new = yesnotoi(o->ptr);
10018
10019 if (new == -1) goto badfmt;
10020 if (old != new) {
10021 if (new == 0) {
10022 stopAppendOnly();
10023 } else {
10024 if (startAppendOnly() == REDIS_ERR) {
10025 addReplySds(c,sdscatprintf(sdsempty(),
10026 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10027 decrRefCount(o);
10028 return;
10029 }
10030 }
10031 }
10032 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10033 int vlen, j;
10034 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10035
10036 /* Perform sanity check before setting the new config:
10037 * - Even number of args
10038 * - Seconds >= 1, changes >= 0 */
10039 if (vlen & 1) {
10040 sdsfreesplitres(v,vlen);
10041 goto badfmt;
10042 }
10043 for (j = 0; j < vlen; j++) {
10044 char *eptr;
10045 long val;
10046
10047 val = strtoll(v[j], &eptr, 10);
10048 if (eptr[0] != '\0' ||
10049 ((j & 1) == 0 && val < 1) ||
10050 ((j & 1) == 1 && val < 0)) {
10051 sdsfreesplitres(v,vlen);
10052 goto badfmt;
10053 }
10054 }
10055 /* Finally set the new config */
10056 resetServerSaveParams();
10057 for (j = 0; j < vlen; j += 2) {
10058 time_t seconds;
10059 int changes;
10060
10061 seconds = strtoll(v[j],NULL,10);
10062 changes = strtoll(v[j+1],NULL,10);
10063 appendServerSaveParams(seconds, changes);
10064 }
10065 sdsfreesplitres(v,vlen);
10066 } else {
10067 addReplySds(c,sdscatprintf(sdsempty(),
10068 "-ERR not supported CONFIG parameter %s\r\n",
10069 (char*)c->argv[2]->ptr));
10070 decrRefCount(o);
10071 return;
10072 }
10073 decrRefCount(o);
10074 addReply(c,shared.ok);
10075 return;
10076
10077 badfmt: /* Bad format errors */
10078 addReplySds(c,sdscatprintf(sdsempty(),
10079 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10080 (char*)o->ptr,
10081 (char*)c->argv[2]->ptr));
10082 decrRefCount(o);
10083 }
10084
10085 static void configGetCommand(redisClient *c) {
10086 robj *o = getDecodedObject(c->argv[2]);
10087 robj *lenobj = createObject(REDIS_STRING,NULL);
10088 char *pattern = o->ptr;
10089 int matches = 0;
10090
10091 addReply(c,lenobj);
10092 decrRefCount(lenobj);
10093
10094 if (stringmatch(pattern,"dbfilename",0)) {
10095 addReplyBulkCString(c,"dbfilename");
10096 addReplyBulkCString(c,server.dbfilename);
10097 matches++;
10098 }
10099 if (stringmatch(pattern,"requirepass",0)) {
10100 addReplyBulkCString(c,"requirepass");
10101 addReplyBulkCString(c,server.requirepass);
10102 matches++;
10103 }
10104 if (stringmatch(pattern,"masterauth",0)) {
10105 addReplyBulkCString(c,"masterauth");
10106 addReplyBulkCString(c,server.masterauth);
10107 matches++;
10108 }
10109 if (stringmatch(pattern,"maxmemory",0)) {
10110 char buf[128];
10111
10112 ll2string(buf,128,server.maxmemory);
10113 addReplyBulkCString(c,"maxmemory");
10114 addReplyBulkCString(c,buf);
10115 matches++;
10116 }
10117 if (stringmatch(pattern,"timeout",0)) {
10118 char buf[128];
10119
10120 ll2string(buf,128,server.maxidletime);
10121 addReplyBulkCString(c,"timeout");
10122 addReplyBulkCString(c,buf);
10123 matches++;
10124 }
10125 if (stringmatch(pattern,"appendonly",0)) {
10126 addReplyBulkCString(c,"appendonly");
10127 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10128 matches++;
10129 }
10130 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10131 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10132 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10133 matches++;
10134 }
10135 if (stringmatch(pattern,"appendfsync",0)) {
10136 char *policy;
10137
10138 switch(server.appendfsync) {
10139 case APPENDFSYNC_NO: policy = "no"; break;
10140 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10141 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10142 default: policy = "unknown"; break; /* too harmless to panic */
10143 }
10144 addReplyBulkCString(c,"appendfsync");
10145 addReplyBulkCString(c,policy);
10146 matches++;
10147 }
10148 if (stringmatch(pattern,"save",0)) {
10149 sds buf = sdsempty();
10150 int j;
10151
10152 for (j = 0; j < server.saveparamslen; j++) {
10153 buf = sdscatprintf(buf,"%ld %d",
10154 server.saveparams[j].seconds,
10155 server.saveparams[j].changes);
10156 if (j != server.saveparamslen-1)
10157 buf = sdscatlen(buf," ",1);
10158 }
10159 addReplyBulkCString(c,"save");
10160 addReplyBulkCString(c,buf);
10161 sdsfree(buf);
10162 matches++;
10163 }
10164 decrRefCount(o);
10165 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10166 }
10167
10168 static void configCommand(redisClient *c) {
10169 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10170 if (c->argc != 4) goto badarity;
10171 configSetCommand(c);
10172 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10173 if (c->argc != 3) goto badarity;
10174 configGetCommand(c);
10175 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10176 if (c->argc != 2) goto badarity;
10177 server.stat_numcommands = 0;
10178 server.stat_numconnections = 0;
10179 server.stat_expiredkeys = 0;
10180 server.stat_starttime = time(NULL);
10181 addReply(c,shared.ok);
10182 } else {
10183 addReplySds(c,sdscatprintf(sdsempty(),
10184 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10185 }
10186 return;
10187
10188 badarity:
10189 addReplySds(c,sdscatprintf(sdsempty(),
10190 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10191 (char*) c->argv[1]->ptr));
10192 }
10193
10194 /* =========================== Pubsub implementation ======================== */
10195
10196 static void freePubsubPattern(void *p) {
10197 pubsubPattern *pat = p;
10198
10199 decrRefCount(pat->pattern);
10200 zfree(pat);
10201 }
10202
10203 static int listMatchPubsubPattern(void *a, void *b) {
10204 pubsubPattern *pa = a, *pb = b;
10205
10206 return (pa->client == pb->client) &&
10207 (equalStringObjects(pa->pattern,pb->pattern));
10208 }
10209
10210 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10211 * 0 if the client was already subscribed to that channel. */
10212 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10213 struct dictEntry *de;
10214 list *clients = NULL;
10215 int retval = 0;
10216
10217 /* Add the channel to the client -> channels hash table */
10218 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10219 retval = 1;
10220 incrRefCount(channel);
10221 /* Add the client to the channel -> list of clients hash table */
10222 de = dictFind(server.pubsub_channels,channel);
10223 if (de == NULL) {
10224 clients = listCreate();
10225 dictAdd(server.pubsub_channels,channel,clients);
10226 incrRefCount(channel);
10227 } else {
10228 clients = dictGetEntryVal(de);
10229 }
10230 listAddNodeTail(clients,c);
10231 }
10232 /* Notify the client */
10233 addReply(c,shared.mbulk3);
10234 addReply(c,shared.subscribebulk);
10235 addReplyBulk(c,channel);
10236 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10237 return retval;
10238 }
10239
10240 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10241 * 0 if the client was not subscribed to the specified channel. */
10242 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10243 struct dictEntry *de;
10244 list *clients;
10245 listNode *ln;
10246 int retval = 0;
10247
10248 /* Remove the channel from the client -> channels hash table */
10249 incrRefCount(channel); /* channel may be just a pointer to the same object
10250 we have in the hash tables. Protect it... */
10251 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10252 retval = 1;
10253 /* Remove the client from the channel -> clients list hash table */
10254 de = dictFind(server.pubsub_channels,channel);
10255 assert(de != NULL);
10256 clients = dictGetEntryVal(de);
10257 ln = listSearchKey(clients,c);
10258 assert(ln != NULL);
10259 listDelNode(clients,ln);
10260 if (listLength(clients) == 0) {
10261 /* Free the list and associated hash entry at all if this was
10262 * the latest client, so that it will be possible to abuse
10263 * Redis PUBSUB creating millions of channels. */
10264 dictDelete(server.pubsub_channels,channel);
10265 }
10266 }
10267 /* Notify the client */
10268 if (notify) {
10269 addReply(c,shared.mbulk3);
10270 addReply(c,shared.unsubscribebulk);
10271 addReplyBulk(c,channel);
10272 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10273 listLength(c->pubsub_patterns));
10274
10275 }
10276 decrRefCount(channel); /* it is finally safe to release it */
10277 return retval;
10278 }
10279
10280 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10281 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10282 int retval = 0;
10283
10284 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10285 retval = 1;
10286 pubsubPattern *pat;
10287 listAddNodeTail(c->pubsub_patterns,pattern);
10288 incrRefCount(pattern);
10289 pat = zmalloc(sizeof(*pat));
10290 pat->pattern = getDecodedObject(pattern);
10291 pat->client = c;
10292 listAddNodeTail(server.pubsub_patterns,pat);
10293 }
10294 /* Notify the client */
10295 addReply(c,shared.mbulk3);
10296 addReply(c,shared.psubscribebulk);
10297 addReplyBulk(c,pattern);
10298 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10299 return retval;
10300 }
10301
10302 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10303 * 0 if the client was not subscribed to the specified channel. */
10304 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10305 listNode *ln;
10306 pubsubPattern pat;
10307 int retval = 0;
10308
10309 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10310 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10311 retval = 1;
10312 listDelNode(c->pubsub_patterns,ln);
10313 pat.client = c;
10314 pat.pattern = pattern;
10315 ln = listSearchKey(server.pubsub_patterns,&pat);
10316 listDelNode(server.pubsub_patterns,ln);
10317 }
10318 /* Notify the client */
10319 if (notify) {
10320 addReply(c,shared.mbulk3);
10321 addReply(c,shared.punsubscribebulk);
10322 addReplyBulk(c,pattern);
10323 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10324 listLength(c->pubsub_patterns));
10325 }
10326 decrRefCount(pattern);
10327 return retval;
10328 }
10329
10330 /* Unsubscribe from all the channels. Return the number of channels the
10331 * client was subscribed from. */
10332 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10333 dictIterator *di = dictGetIterator(c->pubsub_channels);
10334 dictEntry *de;
10335 int count = 0;
10336
10337 while((de = dictNext(di)) != NULL) {
10338 robj *channel = dictGetEntryKey(de);
10339
10340 count += pubsubUnsubscribeChannel(c,channel,notify);
10341 }
10342 dictReleaseIterator(di);
10343 return count;
10344 }
10345
10346 /* Unsubscribe from all the patterns. Return the number of patterns the
10347 * client was subscribed from. */
10348 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10349 listNode *ln;
10350 listIter li;
10351 int count = 0;
10352
10353 listRewind(c->pubsub_patterns,&li);
10354 while ((ln = listNext(&li)) != NULL) {
10355 robj *pattern = ln->value;
10356
10357 count += pubsubUnsubscribePattern(c,pattern,notify);
10358 }
10359 return count;
10360 }
10361
10362 /* Publish a message */
10363 static int pubsubPublishMessage(robj *channel, robj *message) {
10364 int receivers = 0;
10365 struct dictEntry *de;
10366 listNode *ln;
10367 listIter li;
10368
10369 /* Send to clients listening for that channel */
10370 de = dictFind(server.pubsub_channels,channel);
10371 if (de) {
10372 list *list = dictGetEntryVal(de);
10373 listNode *ln;
10374 listIter li;
10375
10376 listRewind(list,&li);
10377 while ((ln = listNext(&li)) != NULL) {
10378 redisClient *c = ln->value;
10379
10380 addReply(c,shared.mbulk3);
10381 addReply(c,shared.messagebulk);
10382 addReplyBulk(c,channel);
10383 addReplyBulk(c,message);
10384 receivers++;
10385 }
10386 }
10387 /* Send to clients listening to matching channels */
10388 if (listLength(server.pubsub_patterns)) {
10389 listRewind(server.pubsub_patterns,&li);
10390 channel = getDecodedObject(channel);
10391 while ((ln = listNext(&li)) != NULL) {
10392 pubsubPattern *pat = ln->value;
10393
10394 if (stringmatchlen((char*)pat->pattern->ptr,
10395 sdslen(pat->pattern->ptr),
10396 (char*)channel->ptr,
10397 sdslen(channel->ptr),0)) {
10398 addReply(pat->client,shared.mbulk4);
10399 addReply(pat->client,shared.pmessagebulk);
10400 addReplyBulk(pat->client,pat->pattern);
10401 addReplyBulk(pat->client,channel);
10402 addReplyBulk(pat->client,message);
10403 receivers++;
10404 }
10405 }
10406 decrRefCount(channel);
10407 }
10408 return receivers;
10409 }
10410
10411 static void subscribeCommand(redisClient *c) {
10412 int j;
10413
10414 for (j = 1; j < c->argc; j++)
10415 pubsubSubscribeChannel(c,c->argv[j]);
10416 }
10417
10418 static void unsubscribeCommand(redisClient *c) {
10419 if (c->argc == 1) {
10420 pubsubUnsubscribeAllChannels(c,1);
10421 return;
10422 } else {
10423 int j;
10424
10425 for (j = 1; j < c->argc; j++)
10426 pubsubUnsubscribeChannel(c,c->argv[j],1);
10427 }
10428 }
10429
10430 static void psubscribeCommand(redisClient *c) {
10431 int j;
10432
10433 for (j = 1; j < c->argc; j++)
10434 pubsubSubscribePattern(c,c->argv[j]);
10435 }
10436
10437 static void punsubscribeCommand(redisClient *c) {
10438 if (c->argc == 1) {
10439 pubsubUnsubscribeAllPatterns(c,1);
10440 return;
10441 } else {
10442 int j;
10443
10444 for (j = 1; j < c->argc; j++)
10445 pubsubUnsubscribePattern(c,c->argv[j],1);
10446 }
10447 }
10448
10449 static void publishCommand(redisClient *c) {
10450 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10451 addReplyLongLong(c,receivers);
10452 }
10453
10454 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10455 *
10456 * The implementation uses a per-DB hash table mapping keys to lists of
10457 * clients WATCHing those keys, so that given a key that is going to be
10458 * modified we can mark all the associated clients as dirty.
10459 *
10460 * Also every client contains a list of WATCHed keys so that it is possible
10461 * to un-watch such keys when the client is freed or when UNWATCH is called. */
10462
10463 /* In the client->watched_keys list we need to use watchedKey structures
10464 * as in order to identify a key in Redis we need both the key name and the
10465 * DB */
10466 typedef struct watchedKey {
10467 robj *key;
10468 redisDb *db;
10469 } watchedKey;
10470
10471 /* Watch for the specified key */
10472 static void watchForKey(redisClient *c, robj *key) {
10473 list *clients = NULL;
10474 listIter li;
10475 listNode *ln;
10476 watchedKey *wk;
10477
10478 /* Check if we are already watching for this key */
10479 listRewind(c->watched_keys,&li);
10480 while((ln = listNext(&li))) {
10481 wk = listNodeValue(ln);
10482 if (wk->db == c->db && equalStringObjects(key,wk->key))
10483 return; /* Key already watched */
10484 }
10485 /* This key is not already watched in this DB. Let's add it */
10486 clients = dictFetchValue(c->db->watched_keys,key);
10487 if (!clients) {
10488 clients = listCreate();
10489 dictAdd(c->db->watched_keys,key,clients);
10490 incrRefCount(key);
10491 }
10492 listAddNodeTail(clients,c);
10493 /* Add the new key to the lits of keys watched by this client */
10494 wk = zmalloc(sizeof(*wk));
10495 wk->key = key;
10496 wk->db = c->db;
10497 incrRefCount(key);
10498 listAddNodeTail(c->watched_keys,wk);
10499 }
10500
10501 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10502 * flag is up to the caller. */
10503 static void unwatchAllKeys(redisClient *c) {
10504 listIter li;
10505 listNode *ln;
10506
10507 if (listLength(c->watched_keys) == 0) return;
10508 listRewind(c->watched_keys,&li);
10509 while((ln = listNext(&li))) {
10510 list *clients;
10511 watchedKey *wk;
10512
10513 /* Lookup the watched key -> clients list and remove the client
10514 * from the list */
10515 wk = listNodeValue(ln);
10516 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10517 assert(clients != NULL);
10518 listDelNode(clients,listSearchKey(clients,c));
10519 /* Kill the entry at all if this was the only client */
10520 if (listLength(clients) == 0)
10521 dictDelete(wk->db->watched_keys, wk->key);
10522 /* Remove this watched key from the client->watched list */
10523 listDelNode(c->watched_keys,ln);
10524 decrRefCount(wk->key);
10525 zfree(wk);
10526 }
10527 }
10528
10529 /* "Touch" a key, so that if this key is being WATCHed by some client the
10530 * next EXEC will fail. */
10531 static void touchWatchedKey(redisDb *db, robj *key) {
10532 list *clients;
10533 listIter li;
10534 listNode *ln;
10535
10536 if (dictSize(db->watched_keys) == 0) return;
10537 clients = dictFetchValue(db->watched_keys, key);
10538 if (!clients) return;
10539
10540 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10541 /* Check if we are already watching for this key */
10542 listRewind(clients,&li);
10543 while((ln = listNext(&li))) {
10544 redisClient *c = listNodeValue(ln);
10545
10546 c->flags |= REDIS_DIRTY_CAS;
10547 }
10548 }
10549
10550 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10551 * flush but will be deleted as effect of the flushing operation should
10552 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10553 * a FLUSHALL operation (all the DBs flushed). */
10554 static void touchWatchedKeysOnFlush(int dbid) {
10555 listIter li1, li2;
10556 listNode *ln;
10557
10558 /* For every client, check all the waited keys */
10559 listRewind(server.clients,&li1);
10560 while((ln = listNext(&li1))) {
10561 redisClient *c = listNodeValue(ln);
10562 listRewind(c->watched_keys,&li2);
10563 while((ln = listNext(&li2))) {
10564 watchedKey *wk = listNodeValue(ln);
10565
10566 /* For every watched key matching the specified DB, if the
10567 * key exists, mark the client as dirty, as the key will be
10568 * removed. */
10569 if (dbid == -1 || wk->db->id == dbid) {
10570 if (dictFind(wk->db->dict, wk->key) != NULL)
10571 c->flags |= REDIS_DIRTY_CAS;
10572 }
10573 }
10574 }
10575 }
10576
10577 static void watchCommand(redisClient *c) {
10578 int j;
10579
10580 if (c->flags & REDIS_MULTI) {
10581 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10582 return;
10583 }
10584 for (j = 1; j < c->argc; j++)
10585 watchForKey(c,c->argv[j]);
10586 addReply(c,shared.ok);
10587 }
10588
10589 static void unwatchCommand(redisClient *c) {
10590 unwatchAllKeys(c);
10591 c->flags &= (~REDIS_DIRTY_CAS);
10592 addReply(c,shared.ok);
10593 }
10594
10595 /* ================================= Debugging ============================== */
10596
10597 /* Compute the sha1 of string at 's' with 'len' bytes long.
10598 * The SHA1 is then xored againt the string pointed by digest.
10599 * Since xor is commutative, this operation is used in order to
10600 * "add" digests relative to unordered elements.
10601 *
10602 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10603 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10604 SHA1_CTX ctx;
10605 unsigned char hash[20], *s = ptr;
10606 int j;
10607
10608 SHA1Init(&ctx);
10609 SHA1Update(&ctx,s,len);
10610 SHA1Final(hash,&ctx);
10611
10612 for (j = 0; j < 20; j++)
10613 digest[j] ^= hash[j];
10614 }
10615
10616 static void xorObjectDigest(unsigned char *digest, robj *o) {
10617 o = getDecodedObject(o);
10618 xorDigest(digest,o->ptr,sdslen(o->ptr));
10619 decrRefCount(o);
10620 }
10621
10622 /* This function instead of just computing the SHA1 and xoring it
10623 * against diget, also perform the digest of "digest" itself and
10624 * replace the old value with the new one.
10625 *
10626 * So the final digest will be:
10627 *
10628 * digest = SHA1(digest xor SHA1(data))
10629 *
10630 * This function is used every time we want to preserve the order so
10631 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10632 *
10633 * Also note that mixdigest("foo") followed by mixdigest("bar")
10634 * will lead to a different digest compared to "fo", "obar".
10635 */
10636 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10637 SHA1_CTX ctx;
10638 char *s = ptr;
10639
10640 xorDigest(digest,s,len);
10641 SHA1Init(&ctx);
10642 SHA1Update(&ctx,digest,20);
10643 SHA1Final(digest,&ctx);
10644 }
10645
10646 static void mixObjectDigest(unsigned char *digest, robj *o) {
10647 o = getDecodedObject(o);
10648 mixDigest(digest,o->ptr,sdslen(o->ptr));
10649 decrRefCount(o);
10650 }
10651
10652 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10653 * are not ordered, we use a trick: every aggregate digest is the xor
10654 * of the digests of their elements. This way the order will not change
10655 * the result. For list instead we use a feedback entering the output digest
10656 * as input in order to ensure that a different ordered list will result in
10657 * a different digest. */
10658 static void computeDatasetDigest(unsigned char *final) {
10659 unsigned char digest[20];
10660 char buf[128];
10661 dictIterator *di = NULL;
10662 dictEntry *de;
10663 int j;
10664 uint32_t aux;
10665
10666 memset(final,0,20); /* Start with a clean result */
10667
10668 for (j = 0; j < server.dbnum; j++) {
10669 redisDb *db = server.db+j;
10670
10671 if (dictSize(db->dict) == 0) continue;
10672 di = dictGetIterator(db->dict);
10673
10674 /* hash the DB id, so the same dataset moved in a different
10675 * DB will lead to a different digest */
10676 aux = htonl(j);
10677 mixDigest(final,&aux,sizeof(aux));
10678
10679 /* Iterate this DB writing every entry */
10680 while((de = dictNext(di)) != NULL) {
10681 robj *key, *o, *kcopy;
10682 time_t expiretime;
10683
10684 memset(digest,0,20); /* This key-val digest */
10685 key = dictGetEntryKey(de);
10686
10687 if (!server.vm_enabled) {
10688 mixObjectDigest(digest,key);
10689 o = dictGetEntryVal(de);
10690 } else {
10691 /* Don't work with the key directly as when VM is active
10692 * this is unsafe: TODO: fix decrRefCount to check if the
10693 * count really reached 0 to avoid this mess */
10694 kcopy = dupStringObject(key);
10695 mixObjectDigest(digest,kcopy);
10696 o = lookupKeyRead(db,kcopy);
10697 decrRefCount(kcopy);
10698 }
10699 aux = htonl(o->type);
10700 mixDigest(digest,&aux,sizeof(aux));
10701 expiretime = getExpire(db,key);
10702
10703 /* Save the key and associated value */
10704 if (o->type == REDIS_STRING) {
10705 mixObjectDigest(digest,o);
10706 } else if (o->type == REDIS_LIST) {
10707 list *list = o->ptr;
10708 listNode *ln;
10709 listIter li;
10710
10711 listRewind(list,&li);
10712 while((ln = listNext(&li))) {
10713 robj *eleobj = listNodeValue(ln);
10714
10715 mixObjectDigest(digest,eleobj);
10716 }
10717 } else if (o->type == REDIS_SET) {
10718 dict *set = o->ptr;
10719 dictIterator *di = dictGetIterator(set);
10720 dictEntry *de;
10721
10722 while((de = dictNext(di)) != NULL) {
10723 robj *eleobj = dictGetEntryKey(de);
10724
10725 xorObjectDigest(digest,eleobj);
10726 }
10727 dictReleaseIterator(di);
10728 } else if (o->type == REDIS_ZSET) {
10729 zset *zs = o->ptr;
10730 dictIterator *di = dictGetIterator(zs->dict);
10731 dictEntry *de;
10732
10733 while((de = dictNext(di)) != NULL) {
10734 robj *eleobj = dictGetEntryKey(de);
10735 double *score = dictGetEntryVal(de);
10736 unsigned char eledigest[20];
10737
10738 snprintf(buf,sizeof(buf),"%.17g",*score);
10739 memset(eledigest,0,20);
10740 mixObjectDigest(eledigest,eleobj);
10741 mixDigest(eledigest,buf,strlen(buf));
10742 xorDigest(digest,eledigest,20);
10743 }
10744 dictReleaseIterator(di);
10745 } else if (o->type == REDIS_HASH) {
10746 hashIterator *hi;
10747 robj *obj;
10748
10749 hi = hashInitIterator(o);
10750 while (hashNext(hi) != REDIS_ERR) {
10751 unsigned char eledigest[20];
10752
10753 memset(eledigest,0,20);
10754 obj = hashCurrent(hi,REDIS_HASH_KEY);
10755 mixObjectDigest(eledigest,obj);
10756 decrRefCount(obj);
10757 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10758 mixObjectDigest(eledigest,obj);
10759 decrRefCount(obj);
10760 xorDigest(digest,eledigest,20);
10761 }
10762 hashReleaseIterator(hi);
10763 } else {
10764 redisPanic("Unknown object type");
10765 }
10766 /* If the key has an expire, add it to the mix */
10767 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10768 /* We can finally xor the key-val digest to the final digest */
10769 xorDigest(final,digest,20);
10770 }
10771 dictReleaseIterator(di);
10772 }
10773 }
10774
10775 static void debugCommand(redisClient *c) {
10776 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10777 *((char*)-1) = 'x';
10778 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10779 if (rdbSave(server.dbfilename) != REDIS_OK) {
10780 addReply(c,shared.err);
10781 return;
10782 }
10783 emptyDb();
10784 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10785 addReply(c,shared.err);
10786 return;
10787 }
10788 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10789 addReply(c,shared.ok);
10790 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10791 emptyDb();
10792 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10793 addReply(c,shared.err);
10794 return;
10795 }
10796 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10797 addReply(c,shared.ok);
10798 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10799 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10800 robj *key, *val;
10801
10802 if (!de) {
10803 addReply(c,shared.nokeyerr);
10804 return;
10805 }
10806 key = dictGetEntryKey(de);
10807 val = dictGetEntryVal(de);
10808 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10809 key->storage == REDIS_VM_SWAPPING)) {
10810 char *strenc;
10811 char buf[128];
10812
10813 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10814 strenc = strencoding[val->encoding];
10815 } else {
10816 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10817 strenc = buf;
10818 }
10819 addReplySds(c,sdscatprintf(sdsempty(),
10820 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10821 "encoding:%s serializedlength:%lld\r\n",
10822 (void*)key, key->refcount, (void*)val, val->refcount,
10823 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10824 } else {
10825 addReplySds(c,sdscatprintf(sdsempty(),
10826 "+Key at:%p refcount:%d, value swapped at: page %llu "
10827 "using %llu pages\r\n",
10828 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10829 (unsigned long long) key->vm.usedpages));
10830 }
10831 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10832 lookupKeyRead(c->db,c->argv[2]);
10833 addReply(c,shared.ok);
10834 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10835 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10836 robj *key, *val;
10837
10838 if (!server.vm_enabled) {
10839 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10840 return;
10841 }
10842 if (!de) {
10843 addReply(c,shared.nokeyerr);
10844 return;
10845 }
10846 key = dictGetEntryKey(de);
10847 val = dictGetEntryVal(de);
10848 /* If the key is shared we want to create a copy */
10849 if (key->refcount > 1) {
10850 robj *newkey = dupStringObject(key);
10851 decrRefCount(key);
10852 key = dictGetEntryKey(de) = newkey;
10853 }
10854 /* Swap it */
10855 if (key->storage != REDIS_VM_MEMORY) {
10856 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10857 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10858 dictGetEntryVal(de) = NULL;
10859 addReply(c,shared.ok);
10860 } else {
10861 addReply(c,shared.err);
10862 }
10863 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10864 long keys, j;
10865 robj *key, *val;
10866 char buf[128];
10867
10868 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10869 return;
10870 for (j = 0; j < keys; j++) {
10871 snprintf(buf,sizeof(buf),"key:%lu",j);
10872 key = createStringObject(buf,strlen(buf));
10873 if (lookupKeyRead(c->db,key) != NULL) {
10874 decrRefCount(key);
10875 continue;
10876 }
10877 snprintf(buf,sizeof(buf),"value:%lu",j);
10878 val = createStringObject(buf,strlen(buf));
10879 dictAdd(c->db->dict,key,val);
10880 }
10881 addReply(c,shared.ok);
10882 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10883 unsigned char digest[20];
10884 sds d = sdsnew("+");
10885 int j;
10886
10887 computeDatasetDigest(digest);
10888 for (j = 0; j < 20; j++)
10889 d = sdscatprintf(d, "%02x",digest[j]);
10890
10891 d = sdscatlen(d,"\r\n",2);
10892 addReplySds(c,d);
10893 } else {
10894 addReplySds(c,sdsnew(
10895 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10896 }
10897 }
10898
10899 static void _redisAssert(char *estr, char *file, int line) {
10900 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10901 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10902 #ifdef HAVE_BACKTRACE
10903 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10904 *((char*)-1) = 'x';
10905 #endif
10906 }
10907
10908 static void _redisPanic(char *msg, char *file, int line) {
10909 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10910 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10911 #ifdef HAVE_BACKTRACE
10912 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10913 *((char*)-1) = 'x';
10914 #endif
10915 }
10916
10917 /* =================================== Main! ================================ */
10918
10919 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file cannot be opened or no
 * line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10933
10934 void linuxOvercommitMemoryWarning(void) {
10935 if (linuxOvercommitMemoryValue() == 0) {
10936 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10937 }
10938 }
10939 #endif /* __linux__ */
10940
10941 static void daemonize(void) {
10942 int fd;
10943 FILE *fp;
10944
10945 if (fork() != 0) exit(0); /* parent exits */
10946 setsid(); /* create a new session */
10947
10948 /* Every output goes to /dev/null. If Redis is daemonized but
10949 * the 'logfile' is set to 'stdout' in the configuration file
10950 * it will not log at all. */
10951 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10952 dup2(fd, STDIN_FILENO);
10953 dup2(fd, STDOUT_FILENO);
10954 dup2(fd, STDERR_FILENO);
10955 if (fd > STDERR_FILENO) close(fd);
10956 }
10957 /* Try to write the pid file */
10958 fp = fopen(server.pidfile,"w");
10959 if (fp) {
10960 fprintf(fp,"%d\n",getpid());
10961 fclose(fp);
10962 }
10963 }
10964
10965 static void version() {
10966 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10967 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
10968 exit(0);
10969 }
10970
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10976
10977 int main(int argc, char **argv) {
10978 time_t start;
10979
10980 initServerConfig();
10981 sortCommandTable();
10982 if (argc == 2) {
10983 if (strcmp(argv[1], "-v") == 0 ||
10984 strcmp(argv[1], "--version") == 0) version();
10985 if (strcmp(argv[1], "--help") == 0) usage();
10986 resetServerSaveParams();
10987 loadServerConfig(argv[1]);
10988 } else if ((argc > 2)) {
10989 usage();
10990 } else {
10991 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10992 }
10993 if (server.daemonize) daemonize();
10994 initServer();
10995 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10996 #ifdef __linux__
10997 linuxOvercommitMemoryWarning();
10998 #endif
10999 start = time(NULL);
11000 if (server.appendonly) {
11001 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11002 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11003 } else {
11004 if (rdbLoad(server.dbfilename) == REDIS_OK)
11005 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11006 }
11007 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11008 aeSetBeforeSleepProc(server.el,beforeSleep);
11009 aeMain(server.el);
11010 aeDeleteEventLoop(server.el);
11011 return 0;
11012 }
11013
11014 /* ============================= Backtrace support ========================= */
11015
11016 #ifdef HAVE_BACKTRACE
11017 static char *findFuncName(void *pointer, unsigned long *offset);
11018
11019 static void *getMcontextEip(ucontext_t *uc) {
11020 #if defined(__FreeBSD__)
11021 return (void*) uc->uc_mcontext.mc_eip;
11022 #elif defined(__dietlibc__)
11023 return (void*) uc->uc_mcontext.eip;
11024 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11025 #if __x86_64__
11026 return (void*) uc->uc_mcontext->__ss.__rip;
11027 #else
11028 return (void*) uc->uc_mcontext->__ss.__eip;
11029 #endif
11030 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11031 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11032 return (void*) uc->uc_mcontext->__ss.__rip;
11033 #else
11034 return (void*) uc->uc_mcontext->__ss.__eip;
11035 #endif
11036 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11037 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11038 #elif defined(__ia64__) /* Linux IA64 */
11039 return (void*) uc->uc_mcontext.sc_ip;
11040 #else
11041 return NULL;
11042 #endif
11043 }
11044
11045 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11046 void *trace[100];
11047 char **messages = NULL;
11048 int i, trace_size = 0;
11049 unsigned long offset=0;
11050 ucontext_t *uc = (ucontext_t*) secret;
11051 sds infostring;
11052 REDIS_NOTUSED(info);
11053
11054 redisLog(REDIS_WARNING,
11055 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11056 infostring = genRedisInfoString();
11057 redisLog(REDIS_WARNING, "%s",infostring);
11058 /* It's not safe to sdsfree() the returned string under memory
11059 * corruption conditions. Let it leak as we are going to abort */
11060
11061 trace_size = backtrace(trace, 100);
11062 /* overwrite sigaction with caller's address */
11063 if (getMcontextEip(uc) != NULL) {
11064 trace[1] = getMcontextEip(uc);
11065 }
11066 messages = backtrace_symbols(trace, trace_size);
11067
11068 for (i=1; i<trace_size; ++i) {
11069 char *fn = findFuncName(trace[i], &offset), *p;
11070
11071 p = strchr(messages[i],'+');
11072 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11073 redisLog(REDIS_WARNING,"%s", messages[i]);
11074 } else {
11075 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11076 }
11077 }
11078 /* free(messages); Don't call free() with possibly corrupted memory. */
11079 _exit(0);
11080 }
11081
11082 static void sigtermHandler(int sig) {
11083 REDIS_NOTUSED(sig);
11084
11085 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11086 server.shutdown_asap = 1;
11087 }
11088
11089 static void setupSigSegvAction(void) {
11090 struct sigaction act;
11091
11092 sigemptyset (&act.sa_mask);
11093 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11094 * is used. Otherwise, sa_handler is used */
11095 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11096 act.sa_sigaction = segvHandler;
11097 sigaction (SIGSEGV, &act, NULL);
11098 sigaction (SIGBUS, &act, NULL);
11099 sigaction (SIGFPE, &act, NULL);
11100 sigaction (SIGILL, &act, NULL);
11101 sigaction (SIGBUS, &act, NULL);
11102
11103 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11104 act.sa_handler = sigtermHandler;
11105 sigaction (SIGTERM, &act, NULL);
11106 return;
11107 }
11108
11109 #include "staticsymbols.h"
11110 /* This function try to convert a pointer into a function name. It's used in
11111 * oreder to provide a backtrace under segmentation fault that's able to
11112 * display functions declared as static (otherwise the backtrace is useless). */
11113 static char *findFuncName(void *pointer, unsigned long *offset){
11114 int i, ret = -1;
11115 unsigned long off, minoff = 0;
11116
11117 /* Try to match against the Symbol with the smallest offset */
11118 for (i=0; symsTable[i].pointer; i++) {
11119 unsigned long lp = (unsigned long) pointer;
11120
11121 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11122 off=lp-symsTable[i].pointer;
11123 if (ret < 0 || off < minoff) {
11124 minoff=off;
11125 ret=i;
11126 }
11127 }
11128 }
11129 if (ret == -1) return NULL;
11130 *offset = minoff;
11131 return symsTable[ret].name;
11132 }
11133 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is
 * installed. */
static void setupSigSegvAction(void) {
    return;
}
11136 #endif /* HAVE_BACKTRACE */
11137
11138
11139
11140 /* The End */
11141
11142
11143