]> git.saurik.com Git - redis.git/blob - redis.c
0cf768ac895c559cf4de4e565e69d890edebaa26
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this source tree. */
30 #define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "intset.h" /* Compact integer set structure */
80 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
81 #include "release.h" /* Release and/or git repository information */
82
83 /* Error codes: 0 means success, -1 means failure. */
84 #define REDIS_OK 0
85 #define REDIS_ERR -1
86
87 /* Static server configuration (compile-time constants). */
88 #define REDIS_SERVERPORT 6379 /* TCP port */
89 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (60*5 = 300; presumably seconds, confirm) */
90 #define REDIS_IOBUF_LEN 1024
91 #define REDIS_LOADBUF_LEN 1024
92 #define REDIS_STATIC_ARGS 8
93 #define REDIS_DEFAULT_DBNUM 16
94 #define REDIS_CONFIGLINE_MAX 1024
95 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
96 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
97 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
98 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) /* 64 KiB */
99 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MiB) */
100
101 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
102 #define REDIS_WRITEV_THRESHOLD 3
103 /* Max number of iovecs used for each writev call */
104 #define REDIS_WRITEV_IOVEC_COUNT 256
105
106 /* Hash table parameters */
107 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill, as a percentage (10%) */
108
109 /* Command flags. The values are distinct bits (1, 2, 4, 8). */
110 #define REDIS_CMD_BULK 1 /* Bulk write command */
111 #define REDIS_CMD_INLINE 2 /* Inline command */
112 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
113 this flag will return an error when the 'maxmemory' option is set in the
114 config file and the server is using more than maxmemory bytes of memory.
115 In short these commands are denied on low memory conditions. */
116 #define REDIS_CMD_DENYOOM 4
117 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
118
119 /* Object types. These are the values stored in the 4 bit 'type' field of
 * struct redisObject and struct vmPointer. */
120 #define REDIS_STRING 0
121 #define REDIS_LIST 1
122 #define REDIS_SET 2
123 #define REDIS_ZSET 3
124 #define REDIS_HASH 4
125 #define REDIS_VMPOINTER 8 /* type value used by vmPointer structures */
126
127 /* Object encodings. Some kinds of objects like Strings and Hashes can be
128 * internally represented in multiple ways. The 'encoding' field of the object
129 * is set to one of these values. The values are also used as indexes into
 * the strencoding[] name table, so the two must be kept in sync. */
130 #define REDIS_ENCODING_RAW 0 /* Raw representation */
131 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
132 #define REDIS_ENCODING_HT 2 /* Encoded as hash table */
133 #define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
134 #define REDIS_ENCODING_LIST 4 /* Encoded as list ("list" in strencoding; presumably the adlist linked list) */
135 #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
136 #define REDIS_ENCODING_INTSET 6 /* Encoded as intset */
137
/* Human readable name of each encoding. The position of every string matches
 * the numeric value of the corresponding REDIS_ENCODING_* define (0..6), so
 * a new encoding must be appended here in the same order. */
138 static char* strencoding[] = {
139 "raw", "int", "hashtable", "zipmap", "list", "ziplist", "intset"
140 };
141
142 /* Object types only used for dumping to disk. The values sit at the top of
 * the one byte range (253..255), well away from the in-memory object types. */
143 #define REDIS_EXPIRETIME 253
144 #define REDIS_SELECTDB 254
145 #define REDIS_EOF 255
146
147 /* Defines related to the dump file format. Storing 32 bit lengths for short
148 * keys requires a lot of space, so we check the most significant 2 bits of
149 * the first byte to interpret the length:
150 *
151 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
152 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
153 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
154 * 11|000000 this means: specially encoded object will follow. The six bits
155 * number specify the kind of object that follows.
156 * See the REDIS_RDB_ENC_* defines.
157 *
158 * Lengths up to 63 are stored using a single byte, most DB keys, and many
159 * values, will fit inside. */
160 #define REDIS_RDB_6BITLEN 0
161 #define REDIS_RDB_14BITLEN 1
162 #define REDIS_RDB_32BITLEN 2
163 #define REDIS_RDB_ENCVAL 3
164 #define REDIS_RDB_LENERR UINT_MAX /* NOTE(review): looks like the "length could not be read" sentinel; confirm against the length loader */
165
166 /* When a length of a string object stored on disk has the first two bits
167 * set, the remaining six bits specify a special encoding for the object
168 * accordingly to the following defines: */
169 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
170 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
171 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
172 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
173
174 /* Virtual memory object->storage field. The four states fit the 2 bit
 * 'storage' bitfield of struct redisObject / struct vmPointer. */
175 #define REDIS_VM_MEMORY 0 /* The object is on memory */
176 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
177 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
178 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
179
180 /* Virtual memory static configuration stuff.
181 * Check vmFindContiguousPages() to know more about these magic numbers. */
182 #define REDIS_VM_MAX_NEAR_PAGES 65536
183 #define REDIS_VM_MAX_RANDOM_JUMP 4096
184 #define REDIS_VM_MAX_THREADS 32
185 #define REDIS_THREAD_STACK_SIZE (1024*1024*4) /* 4 MiB */
186 /* The following is the *percentage* of completed I/O jobs to process when the
187 * handler is called. While Virtual Memory I/O operations are performed by
188 * threads, these operations must be processed by the main thread when completed
189 * in order to take effect. */
190 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
191
192 /* Client flags. Distinct bits that are OR-ed together in redisClient.flags. */
193 #define REDIS_SLAVE 1 /* This client is a slave server */
194 #define REDIS_MASTER 2 /* This client is a master server */
195 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
196 #define REDIS_MULTI 8 /* This client is in a MULTI context */
197 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
198 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
199 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
200
201 /* Slave replication state - slave side */
202 #define REDIS_REPL_NONE 0 /* No active replication */
203 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
204 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
205
206 /* Slave replication state - from the point of view of master.
207 * Note that in SEND_BULK and ONLINE state the slave receives new updates
208 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
209 * to start the next background saving in order to send updates to it.
 * The numbering continues from the slave side states above (3..6). */
210 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
211 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to end to start bulk DB transmission (NOTE(review): wording inferred from the name, confirm) */
212 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
213 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
214
215 /* List related stuff: selectors for the two ends of a list. */
216 #define REDIS_HEAD 0
217 #define REDIS_TAIL 1
218
219 /* Sort operations */
220 #define REDIS_SORT_GET 0
221 #define REDIS_SORT_ASC 1
222 #define REDIS_SORT_DESC 2
223 #define REDIS_SORTKEY_MAX 1024 /* NOTE(review): presumably the max length of a sort key name; confirm in the SORT code */
224
225 /* Log levels, from the most verbose (0) to the least verbose (3). */
226 #define REDIS_DEBUG 0
227 #define REDIS_VERBOSE 1
228 #define REDIS_NOTICE 2
229 #define REDIS_WARNING 3
230
/* Anti-warning macro: evaluates V and discards the result, so the compiler
 * does not warn about an otherwise unused variable or parameter.
 * The argument is parenthesized so that an expression argument such as
 * REDIS_NOTUSED(a + b) is cast to void as a whole instead of expanding to
 * ((void) a + b), which does not compile. */
#define REDIS_NOTUSED(V) ((void)(V))
233
/* Skiplist parameters used by the sorted set implementation. */
234 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
235 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
236
237 /* Append only defines: possible values of server.appendfsync. */
238 #define APPENDFSYNC_NO 0
239 #define APPENDFSYNC_ALWAYS 1
240 #define APPENDFSYNC_EVERYSEC 2
241
242 /* Zip structure related defaults. The names mirror the "Zip structure
 * config" fields of struct redisServer (hash_max_zipmap_entries, ...). */
243 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
244 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
245 #define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
246 #define REDIS_LIST_MAX_ZIPLIST_VALUE 32
247 #define REDIS_SET_MAX_INTSET_ENTRIES 4096
248
/* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(_e) evaluates _e once; if it is false, _redisAssert() is called
 * with the stringified expression, the file name and the line number, and
 * then the process terminates with _exit(1).
 *
 * redisPanic(_e) unconditionally calls _redisPanic() with the stringified
 * argument (a string literal argument keeps its quotes), the file name and
 * the line number, and then terminates with _exit(1).
 *
 * Both expansions are fully parenthesized expressions of type void. Without
 * the outer parentheses the comma in redisPanic() would bind looser than a
 * surrounding operator: in "c ? x : redisPanic(m)" the _exit(1) would escape
 * the conditional and always run. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
254
255 /*================================= Data types ============================== */
256
257 /* A redis object, that is a type able to hold a string / list / set */
258
259 /* The actual Redis Object.
 * The four bitfields add up to 32 bits (4 + 2 + 4 + 22). */
260 typedef struct redisObject {
261 unsigned type:4; /* one of the "Object types" defines (REDIS_STRING, ...) */
262 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
263 unsigned encoding:4; /* one of the REDIS_ENCODING_* defines */
264 unsigned lru:22; /* lru time (relative to server.lruclock) */
265 int refcount;
266 void *ptr;
267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead.
 * NOTE(review): no VM fields follow in this structure, so the comment
 * above looks stale; confirm and remove. */
271 } robj;
272
273 /* The VM pointer structure - identifies an object in the swap file.
274 *
275 * This object is stored in place of the value
276 * object in the main key->value hash table representing a database.
277 * Note that the first fields (type, storage) are the same as the redisObject
278 * structure so that vmPointer structures can be accessed even when cast
279 * to redisObject structures.
280 *
281 * This is useful as we don't know if a value object is or not on disk, but we
282 * are always able to read obj->storage to check this. For vmPointer
283 * structures "type" is set to REDIS_VMPOINTER (even if without this field
284 * it is still possible to check the kind of object from the value of 'storage').*/
285 typedef struct vmPointer {
286 unsigned type:4;
287 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
288 unsigned notused:26; /* pads the bitfields to 32 bits (4 + 2 + 26) */
289 unsigned int vtype; /* type of the object stored in the swap file */
290 off_t page; /* the page at which the object is stored on disk */
291 off_t usedpages; /* number of pages used on disk */
292 } vmpointer;
293
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it) and _ptr is the value
 * stored in its 'ptr' field. The object gets refcount 1, type REDIS_STRING,
 * encoding REDIS_ENCODING_RAW and storage REDIS_VM_MEMORY; the 'lru' field
 * is left untouched.
 *
 * The expansion is a single statement WITHOUT a trailing semicolon: the
 * caller writes the ';'. This way the macro can be the unbraced body of an
 * if that is followed by an else. Both arguments are parenthesized so
 * that expression arguments expand as a unit. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
305
/* A single Redis database: the keyspace plus the per-database side tables. */
306 typedef struct redisDb {
307 dict *dict; /* The keyspace for this DB */
308 dict *expires; /* Timeout of keys with a timeout set */
309 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
310 dict *io_keys; /* Keys with clients waiting for VM I/O */
311 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
312 int id; /* NOTE(review): presumably the index of this DB in server.db; confirm */
313 } redisDb;
314
315 /* Client MULTI/EXEC state: one command stored in a multiState, i.e. its
 * argument vector, argument count and command table entry. */
316 typedef struct multiCmd {
317 robj **argv;
318 int argc;
319 struct redisCommand *cmd;
320 } multiCmd;
321
/* The MULTI/EXEC state of a client (see redisClient.mstate). */
322 typedef struct multiState {
323 multiCmd *commands; /* Array of MULTI commands */
324 int count; /* Total number of MULTI commands */
325 } multiState;
326
327 /* With multiplexing we need to keep per-client state.
328 * Clients are kept in a linked list. */
329 typedef struct redisClient {
330 int fd;
331 redisDb *db;
332 int dictid;
333 sds querybuf;
334 robj **argv, **mbargv;
335 int argc, mbargc;
336 int bulklen; /* bulk read len. -1 if not in bulk read mode */
337 int multibulk; /* multi bulk command format active */
338 list *reply;
339 int sentlen;
340 time_t lastinteraction; /* time of the last interaction, used for timeout */
341 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
342 int slaveseldb; /* slave selected db, if this client is a slave */
343 int authenticated; /* when requirepass is non-NULL */
344 int replstate; /* replication state if this is a slave */
345 int repldbfd; /* replication DB file descriptor */
346 long repldboff; /* replication DB file offset */
347 off_t repldbsize; /* replication DB file size */
348 multiState mstate; /* MULTI/EXEC state */
349 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
350 * operation such as BLPOP. Otherwise NULL. */
351 int blocking_keys_num; /* Number of blocking keys */
352 time_t blockingto; /* Blocking operation timeout. If UNIX current time
353 * is >= blockingto then the operation timed out. */
354 list *io_keys; /* Keys this client is waiting to be loaded from the
355 * swap file in order to continue. */
356 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
357 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
358 list *pubsub_patterns; /* patterns a client is interested in (NOTE(review): presumably PSUBSCRIBE rather than SUBSCRIBE) */
359 } redisClient;
360
/* One save point: a time span in seconds paired with a number of changes.
 * The server keeps an array of these in server.saveparams. */
361 struct saveparam {
362 time_t seconds;
363 int changes;
364 };
365
366 /* Global server state structure. A single instance named 'server' is
 * defined in the Globals section of this file. */
367 struct redisServer {
368 int port;
369 int fd;
370 redisDb *db;
371 long long dirty; /* changes to DB from the last save */
372 list *clients;
373 list *slaves, *monitors;
374 char neterr[ANET_ERR_LEN];
375 aeEventLoop *el;
376 int cronloops; /* number of times the cron function run */
377 list *objfreelist; /* A list of freed objects to avoid malloc() */
378 time_t lastsave; /* Unix time of last successful save */
379 /* Fields used only for stats */
380 time_t stat_starttime; /* server start time */
381 long long stat_numcommands; /* number of processed commands */
382 long long stat_numconnections; /* number of connections received */
383 long long stat_expiredkeys; /* number of expired keys */
384 /* Configuration */
385 int verbosity;
386 int glueoutputbuf;
387 int maxidletime;
388 int dbnum;
389 int daemonize;
390 int appendonly;
391 int appendfsync; /* one of the APPENDFSYNC_* defines */
392 int no_appendfsync_on_rewrite;
393 int shutdown_asap;
394 time_t lastfsync;
395 int appendfd;
396 int appendseldb;
397 char *pidfile;
398 pid_t bgsavechildpid;
399 pid_t bgrewritechildpid;
400 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
401 sds aofbuf; /* AOF buffer, written before entering the event loop */
402 struct saveparam *saveparams;
403 int saveparamslen;
404 char *logfile;
405 char *bindaddr;
406 char *dbfilename;
407 char *appendfilename;
408 char *requirepass;
409 int rdbcompression;
410 int activerehashing;
411 /* Replication related */
412 int isslave;
413 char *masterauth;
414 char *masterhost;
415 int masterport;
416 redisClient *master; /* client that is master for this slave */
417 int replstate;
418 unsigned int maxclients;
419 unsigned long long maxmemory;
420 unsigned int blpop_blocked_clients;
421 unsigned int vm_blocked_clients;
422 /* Sort parameters - qsort_r() is only available under BSD so we
423 * have to keep this state global, in order to pass it to sortCompare() */
424 int sort_desc;
425 int sort_alpha;
426 int sort_bypattern;
427 /* Virtual memory configuration */
428 int vm_enabled;
429 char *vm_swap_file;
430 off_t vm_page_size;
431 off_t vm_pages;
432 unsigned long long vm_max_memory;
433 /* Zip structure config */
434 size_t hash_max_zipmap_entries;
435 size_t hash_max_zipmap_value;
436 size_t list_max_ziplist_entries;
437 size_t list_max_ziplist_value;
438 size_t set_max_intset_entries;
439 /* Virtual memory state */
440 FILE *vm_fp;
441 int vm_fd;
442 off_t vm_next_page; /* Next probably empty page */
443 off_t vm_near_pages; /* Number of pages allocated sequentially */
444 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
445 time_t unixtime; /* Unix time sampled every second. */
446 /* Virtual memory I/O threads stuff */
447 /* An I/O thread processes an element taken from the io_jobs queue and
448 * puts the result of the operation in the io_done list. While the
449 * job is being processed, it's put on io_processing queue.
 * NOTE(review): "io_jobs" / "io_done" do not match any field name below;
 * they presumably refer to io_newjobs and io_processed. */
450 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
451 list *io_processing; /* List of VM I/O jobs being processed */
452 list *io_processed; /* List of VM I/O jobs already processed */
453 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
454 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
455 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
456 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
457 pthread_attr_t io_threads_attr; /* attributes for threads creation */
458 int io_active_threads; /* Number of running I/O threads */
459 int vm_max_threads; /* Max number of I/O threads running at the same time */
460 /* Our main thread is blocked on the event loop, looking for sockets ready
461 * to be read or written, so when a threaded I/O operation is ready to be
462 * processed by the main thread, the I/O thread will use a unix pipe to
463 * awake the main thread. The following are the two pipe FDs. */
464 int io_ready_pipe_read;
465 int io_ready_pipe_write;
466 /* Virtual memory stats */
467 unsigned long long vm_stats_used_pages;
468 unsigned long long vm_stats_swapped_objects;
469 unsigned long long vm_stats_swapouts;
470 unsigned long long vm_stats_swapins;
471 /* Pubsub */
472 dict *pubsub_channels; /* Map channels to list of subscribed clients */
473 list *pubsub_patterns; /* A list of pubsub_patterns */
474 /* Misc */
475 FILE *devnull;
476 unsigned lruclock:22; /* clock incrementing every minute, for LRU; same width as redisObject.lru */
477 unsigned lruclock_padding:10; /* pads lruclock to 32 bits (22 + 10) */
478 };
479
/* Pairs a client with a pattern object.
 * NOTE(review): presumably the element type of the pubsub_patterns lists;
 * confirm against the code that fills them. */
480 typedef struct pubsubPattern {
481 redisClient *client;
482 robj *pattern;
483 } pubsubPattern;
484
/* Signature of a command implementation: it receives the client only. */
485 typedef void redisCommandProc(redisClient *c);
/* Signature of a VM preload hook (see redisCommand.vm_preload_proc). */
486 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* Description of one command: its name, implementation, arity, flags and
 * the information needed to preload its keys when VM is in use. */
487 struct redisCommand {
488 char *name;
489 redisCommandProc *proc;
490 int arity;
491 int flags; /* presumably a combination of the REDIS_CMD_* flags; confirm */
492 /* Use a function to determine which keys need to be loaded
493 * in the background prior to executing this command. Takes precedence
494 * over vm_firstkey and others, ignored when NULL */
495 redisVmPreloadProc *vm_preload_proc;
496 /* What keys should be loaded in background when calling this command? */
497 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
498 int vm_lastkey; /* The last argument that's a key */
499 int vm_keystep; /* The step between first and last key */
500 };
501
/* A function name paired with an address stored as an unsigned long.
 * NOTE(review): presumably a symbol table entry used when printing stack
 * traces; confirm against its users. */
502 struct redisFunctionSym {
503 char *name;
504 unsigned long pointer;
505 };
506
/* An element handled by SORT: the object itself plus either a numeric score
 * or a comparison object, sharing storage in the union 'u'. */
507 typedef struct _redisSortObject {
508 robj *obj;
509 union {
510 double score;
511 robj *cmpobj;
512 } u;
513 } redisSortObject;
514
/* An operation requested to SORT: an integer type (presumably one of the
 * REDIS_SORT_* defines, confirm) and the pattern it applies to. */
515 typedef struct _redisSortOperation {
516 int type;
517 robj *pattern;
518 } redisSortOperation;
519
520 /* ZSETs use a specialized version of Skiplists */
521
/* A skiplist node: a (score, obj) pair with a backward pointer and the
 * 'forward' and 'span' arrays.
 * NOTE(review): forward/span are presumably per-level arrays sized by the
 * node level; confirm in the zsl* functions. */
522 typedef struct zskiplistNode {
523 struct zskiplistNode **forward;
524 struct zskiplistNode *backward;
525 unsigned int *span;
526 double score;
527 robj *obj;
528 } zskiplistNode;
529
/* A skiplist: header and tail nodes, number of elements and level. */
530 typedef struct zskiplist {
531 struct zskiplistNode *header, *tail;
532 unsigned long length;
533 int level;
534 } zskiplist;
535
/* A sorted set: a hash table paired with a skiplist. */
536 typedef struct zset {
537 dict *dict;
538 zskiplist *zsl;
539 } zset;
540
541 /* Our shared "common" objects */
542
/* Number of entries in the shared.integers[] array. */
543 #define REDIS_SHARED_INTEGERS 10000
/* Pointers to the shared "common" objects: frequently used replies, error
 * messages, the select0..select9 objects, pubsub reply pieces and an array
 * of REDIS_SHARED_INTEGERS integer objects. The single instance is the
 * global 'shared' defined right here (note: it is not declared static). */
544 struct sharedObjectsStruct {
545 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
546 *colon, *nullbulk, *nullmultibulk, *queued,
547 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
548 *outofrangeerr, *plus,
549 *select0, *select1, *select2, *select3, *select4,
550 *select5, *select6, *select7, *select8, *select9,
551 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
552 *mbulk4, *psubscribebulk, *punsubscribebulk,
553 *integers[REDIS_SHARED_INTEGERS];
554 } shared;
555
556 /* Global vars that are actually used as constants. The following double
557 * values are used for double on-disk serialization, and are initialized
558 * at runtime to avoid strange compiler optimizations. */
559
560 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
561
562 /* VM threaded I/O request message */
563 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
564 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
565 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
566 typedef struct iojob {
567 int type; /* Request type, REDIS_IOJOB_* */
568 redisDb *db;/* Redis database */
569 robj *key; /* This I/O request is about swapping this key */
570 robj *id; /* Unique identifier of this job:
571 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
572 vmpointer object for REDIS_IOJOB_LOAD. */
573 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
574 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
575 off_t page; /* Swap page where to read/write the object */
576 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
577 int canceled; /* True if this command was canceled by blocking side of VM */
578 pthread_t thread; /* ID of the thread processing this entry */
579 } iojob;
580
581 /*================================ Prototypes =============================== */
582
/* Forward declarations of the file-local helper functions. They are all
 * 'static', so every one of them is expected to be defined later in this
 * same file. */
583 static void freeStringObject(robj *o);
584 static void freeListObject(robj *o);
585 static void freeSetObject(robj *o);
586 static void decrRefCount(void *o);
587 static robj *createObject(int type, void *ptr);
588 static void freeClient(redisClient *c);
589 static int rdbLoad(char *filename);
590 static void addReply(redisClient *c, robj *obj);
591 static void addReplySds(redisClient *c, sds s);
592 static void incrRefCount(robj *o);
593 static int rdbSaveBackground(char *filename);
594 static robj *createStringObject(char *ptr, size_t len);
595 static robj *dupStringObject(robj *o);
596 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
597 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
598 static void flushAppendOnlyFile(void);
599 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
600 static int syncWithMaster(void);
601 static robj *tryObjectEncoding(robj *o);
602 static robj *getDecodedObject(robj *o);
603 static int removeExpire(redisDb *db, robj *key);
604 static int expireIfNeeded(redisDb *db, robj *key);
605 static int deleteIfVolatile(redisDb *db, robj *key);
606 static int dbDelete(redisDb *db, robj *key);
607 static time_t getExpire(redisDb *db, robj *key);
608 static int setExpire(redisDb *db, robj *key, time_t when);
609 static void updateSlavesWaitingBgsave(int bgsaveerr);
610 static void freeMemoryIfNeeded(void);
611 static int processCommand(redisClient *c);
612 static void setupSigSegvAction(void);
613 static void rdbRemoveTempFile(pid_t childpid);
614 static void aofRemoveTempFile(pid_t childpid);
615 static size_t stringObjectLen(robj *o);
616 static void processInputBuffer(redisClient *c);
617 static zskiplist *zslCreate(void);
618 static void zslFree(zskiplist *zsl);
619 static void zslInsert(zskiplist *zsl, double score, robj *obj);
620 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
621 static void initClientMultiState(redisClient *c);
622 static void freeClientMultiState(redisClient *c);
623 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
624 static void unblockClientWaitingData(redisClient *c);
625 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
626 static void vmInit(void);
627 static void vmMarkPagesFree(off_t page, off_t count);
628 static robj *vmLoadObject(robj *o);
629 static robj *vmPreviewObject(robj *o);
630 static int vmSwapOneObjectBlocking(void);
631 static int vmSwapOneObjectThreaded(void);
632 static int vmCanSwapOut(void);
633 static int tryFreeOneObjectFromFreelist(void);
634 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
635 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
636 static void vmCancelThreadedIOJob(robj *o);
637 static void lockThreadedIO(void);
638 static void unlockThreadedIO(void);
639 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
640 static void freeIOJob(iojob *j);
641 static void queueIOJob(iojob *j);
642 static int vmWriteObjectOnSwap(robj *o, off_t page);
643 static robj *vmReadObjectFromSwap(off_t page, int type);
644 static void waitEmptyIOJobsQueue(void);
645 static void vmReopenSwapFile(void);
646 static int vmFreePage(off_t page);
/* The next two share the redisVmPreloadProc signature. */
647 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
648 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
649 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
650 static int dontWaitForSwappedKey(redisClient *c, robj *key);
651 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
652 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
653 static struct redisCommand *lookupCommand(char *name);
654 static void call(redisClient *c, struct redisCommand *cmd);
655 static void resetClient(redisClient *c);
656 static void convertToRealHash(robj *o);
657 static void listTypeConvert(robj *o, int enc);
658 static void setTypeConvert(robj *o, int enc);
659 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
660 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
661 static void freePubsubPattern(void *p);
662 static int listMatchPubsubPattern(void *a, void *b);
663 static int compareStringObjects(robj *a, robj *b);
664 static int equalStringObjects(robj *a, robj *b);
/* Forward declaration of usage(). The parameter list is spelled (void) so
 * that this is a real prototype: with an empty "()" the compiler would not
 * check the arguments of a call. It stays compatible with a definition
 * written with an empty parameter list. */
static void usage(void);
/* Forward declarations: background append only file rewrite, and the
 * blocking swap of a value that returns a vmpointer. */
666 static int rewriteAppendOnlyFileBackground(void);
667 static vmpointer *vmSwapObjectBlocking(robj *val);
/* Forward declaration of prepareForShutdown(). The parameter list is spelled
 * (void) so that this is a real prototype and calls with arguments are
 * rejected at compile time. It stays compatible with a definition written
 * with an empty parameter list. */
static int prepareForShutdown(void);
/* Forward declarations of the WATCH (MULTI/EXEC CAS) helpers. */
669 static void touchWatchedKey(redisDb *db, robj *key);
670 static void touchWatchedKeysOnFlush(int dbid);
671 static void unwatchAllKeys(redisClient *c);
672
/* Forward declarations of the command implementations. Every function has
 * the redisCommandProc signature: it takes the client and returns nothing.
 * NOTE(review): rpoplpushcommand is spelled with a lowercase 'c' in
 * "command", unlike all the other names; it cannot be renamed here without
 * also renaming its definition and users elsewhere in the file. */
673 static void authCommand(redisClient *c);
674 static void pingCommand(redisClient *c);
675 static void echoCommand(redisClient *c);
676 static void setCommand(redisClient *c);
677 static void setnxCommand(redisClient *c);
678 static void setexCommand(redisClient *c);
679 static void getCommand(redisClient *c);
680 static void delCommand(redisClient *c);
681 static void existsCommand(redisClient *c);
682 static void incrCommand(redisClient *c);
683 static void decrCommand(redisClient *c);
684 static void incrbyCommand(redisClient *c);
685 static void decrbyCommand(redisClient *c);
686 static void selectCommand(redisClient *c);
687 static void randomkeyCommand(redisClient *c);
688 static void keysCommand(redisClient *c);
689 static void dbsizeCommand(redisClient *c);
690 static void lastsaveCommand(redisClient *c);
691 static void saveCommand(redisClient *c);
692 static void bgsaveCommand(redisClient *c);
693 static void bgrewriteaofCommand(redisClient *c);
694 static void shutdownCommand(redisClient *c);
695 static void moveCommand(redisClient *c);
696 static void renameCommand(redisClient *c);
697 static void renamenxCommand(redisClient *c);
698 static void lpushCommand(redisClient *c);
699 static void rpushCommand(redisClient *c);
700 static void lpopCommand(redisClient *c);
701 static void rpopCommand(redisClient *c);
702 static void llenCommand(redisClient *c);
703 static void lindexCommand(redisClient *c);
704 static void lrangeCommand(redisClient *c);
705 static void ltrimCommand(redisClient *c);
706 static void typeCommand(redisClient *c);
707 static void lsetCommand(redisClient *c);
708 static void saddCommand(redisClient *c);
709 static void sremCommand(redisClient *c);
710 static void smoveCommand(redisClient *c);
711 static void sismemberCommand(redisClient *c);
712 static void scardCommand(redisClient *c);
713 static void spopCommand(redisClient *c);
714 static void srandmemberCommand(redisClient *c);
715 static void sinterCommand(redisClient *c);
716 static void sinterstoreCommand(redisClient *c);
717 static void sunionCommand(redisClient *c);
718 static void sunionstoreCommand(redisClient *c);
719 static void sdiffCommand(redisClient *c);
720 static void sdiffstoreCommand(redisClient *c);
721 static void syncCommand(redisClient *c);
722 static void flushdbCommand(redisClient *c);
723 static void flushallCommand(redisClient *c);
724 static void sortCommand(redisClient *c);
725 static void lremCommand(redisClient *c);
726 static void rpoplpushcommand(redisClient *c);
727 static void infoCommand(redisClient *c);
728 static void mgetCommand(redisClient *c);
729 static void monitorCommand(redisClient *c);
730 static void expireCommand(redisClient *c);
731 static void expireatCommand(redisClient *c);
732 static void getsetCommand(redisClient *c);
733 static void ttlCommand(redisClient *c);
734 static void slaveofCommand(redisClient *c);
735 static void debugCommand(redisClient *c);
736 static void msetCommand(redisClient *c);
737 static void msetnxCommand(redisClient *c);
738 static void zaddCommand(redisClient *c);
739 static void zincrbyCommand(redisClient *c);
740 static void zrangeCommand(redisClient *c);
741 static void zrangebyscoreCommand(redisClient *c);
742 static void zcountCommand(redisClient *c);
743 static void zrevrangeCommand(redisClient *c);
744 static void zcardCommand(redisClient *c);
745 static void zremCommand(redisClient *c);
746 static void zscoreCommand(redisClient *c);
747 static void zremrangebyscoreCommand(redisClient *c);
748 static void multiCommand(redisClient *c);
749 static void execCommand(redisClient *c);
750 static void discardCommand(redisClient *c);
751 static void blpopCommand(redisClient *c);
752 static void brpopCommand(redisClient *c);
753 static void appendCommand(redisClient *c);
754 static void substrCommand(redisClient *c);
755 static void zrankCommand(redisClient *c);
756 static void zrevrankCommand(redisClient *c);
757 static void hsetCommand(redisClient *c);
758 static void hsetnxCommand(redisClient *c);
759 static void hgetCommand(redisClient *c);
760 static void hmsetCommand(redisClient *c);
761 static void hmgetCommand(redisClient *c);
762 static void hdelCommand(redisClient *c);
763 static void hlenCommand(redisClient *c);
764 static void zremrangebyrankCommand(redisClient *c);
765 static void zunionstoreCommand(redisClient *c);
766 static void zinterstoreCommand(redisClient *c);
767 static void hkeysCommand(redisClient *c);
768 static void hvalsCommand(redisClient *c);
769 static void hgetallCommand(redisClient *c);
770 static void hexistsCommand(redisClient *c);
771 static void configCommand(redisClient *c);
772 static void hincrbyCommand(redisClient *c);
773 static void subscribeCommand(redisClient *c);
774 static void unsubscribeCommand(redisClient *c);
775 static void psubscribeCommand(redisClient *c);
776 static void punsubscribeCommand(redisClient *c);
777 static void publishCommand(redisClient *c);
778 static void watchCommand(redisClient *c);
779 static void unwatchCommand(redisClient *c);
780
781 /*================================= Globals ================================= */
782
783 /* Global vars */
784 static struct redisServer server; /* server global state */
/* Pointer to a command table; it is not assigned in this part of the file. */
785 static struct redisCommand *commandTable;
/* Static table with one initializer per command known to the server.
 * Reading the rows below, every entry lists in this order:
 *
 *  - the command name as a lowercase string,
 *  - the function implementing the command,
 *  - an integer that is negative for commands such as DEL or MGET
 *    (presumably the arity including the command name, negative meaning
 *    "at least" -- TODO confirm against struct redisCommand),
 *  - a mask of REDIS_CMD_* flags,
 *  - a function pointer that is NULL for most commands (judging from the
 *    names used it looks like a VM key preloading hook -- confirm),
 *  - three integers that look like the first key / last key / key step
 *    argument positions (assumption, the struct is not visible here).
 */
786 static struct redisCommand readonlyCommandTable[] = {
787 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
789 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
790 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
791 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
798 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
/* Note: the handler really is spelled rpoplpushcommand (lowercase 'c'),
 * matching its forward declaration earlier in the file. */
810 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
811 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
814 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
819 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
820 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
821 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
822 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
823 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
/* SMEMBERS shares its implementation with SINTER. */
824 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
827 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
828 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
831 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
832 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
/* NOTE(review): zscore carries REDIS_CMD_DENYOOM while zrank, zrevrank,
 * zcount and zcard in this table do not -- confirm this is intended. */
837 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
838 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
839 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
842 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
843 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
844 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
845 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
846 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
847 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
851 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
852 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
854 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
855 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
856 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
857 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
860 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
861 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
862 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
869 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
870 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
875 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
876 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
877 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
882 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
883 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
885 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
886 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
887 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
888 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
889 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
890 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
891 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
892 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
893 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
894 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
895 };
896
897 /*============================ Utility functions ============================ */
898
/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first
 * patternLen bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading ^ negates the set, a-z is a
 *           range and \x stands for a literal x inside the brackets
 *   \x      a literal x outside of the brackets
 *
 * When 'nocase' is non zero, literals and ranges are compared after
 * tolower(). Ranges are always evaluated on unsigned byte values so the
 * result does not depend on the signedness of char.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 * Both lengths are expected to be >= 0. The buffers do not need to be nul
 * terminated: no byte past the given lengths is read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte: without this check string[0]
             * would be read past the end and stringLen would go negative. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped byte: always compared case sensitively. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the increment
                     * after the switch leaves us exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may be left over. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1021
/* Convenience wrapper around stringmatchlen() for nul terminated C strings:
 * the lengths of both arguments are computed with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
1025
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' followed by decimal digits and an optional,
 * case insensitive unit: "b" (or no unit) for bytes, "k", "m", "g" for
 * powers of 1000 and "kb", "mb", "gb" for powers of 1024.
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * An unknown unit is an error, but the numeric prefix is still converted
 * and returned as a number of bytes. A numeric part of 128 characters or
 * more is an error and LLONG_MAX is returned. A value that does not fit in
 * a long long once multiplied by the unit is an error too, and the result
 * saturates to LLONG_MAX (or LLONG_MIN for negative values). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    size_t digits;
    int saved_errno, outofrange;

    if (err) *err = 0;
    /* Search the first non digit character. isdigit() is only defined for
     * unsigned char values, hence the cast. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000LL*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024LL*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy the numeric part into a buffer so that strtoll() does not see
     * the unit. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';

    /* Detect strtoll() overflow without clobbering the caller's errno. */
    saved_errno = errno;
    errno = 0;
    val = strtoll(buf,NULL,10);
    outofrange = (errno == ERANGE);
    errno = saved_errno;

    /* mul is always positive, so these divisions are safe and tell us
     * whether val*mul would overflow. */
    if (outofrange || val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1072
/* Convert a long long into a string.
 *
 * At most len-1 characters of the decimal representation of 'value' (with a
 * leading '-' for negative numbers) are stored in 's', followed by a nul
 * term. If the buffer is too small the representation is truncated on the
 * right. Returns the number of characters stored, not counting the nul
 * term, or 0 without touching 's' if len is zero. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
    v = (value < 0) ? -(unsigned long long)value : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits backward, starting from the least significant one. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1096
1097 static void redisLog(int level, const char *fmt, ...) {
1098 va_list ap;
1099 FILE *fp;
1100
1101 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1102 if (!fp) return;
1103
1104 va_start(ap, fmt);
1105 if (level >= server.verbosity) {
1106 char *c = ".-*#";
1107 char buf[64];
1108 time_t now;
1109
1110 now = time(NULL);
1111 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1112 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1113 vfprintf(fp, fmt, ap);
1114 fprintf(fp,"\n");
1115 fflush(fp);
1116 }
1117 va_end(ap);
1118
1119 if (server.logfile) fclose(fp);
1120 }
1121
1122 /*====================== Hash table type implementation ==================== */
1123
1124 /* This is a hash table type that uses the SDS dynamic strings library as
1125 * keys and redis objects as values (objects can hold SDS strings,
1126 * lists, sets). */
1127
/* Generic destructor: releases 'val' with zfree(). The privdata argument is
 * not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1133
1134 static void dictListDestructor(void *privdata, void *val)
1135 {
1136 DICT_NOTUSED(privdata);
1137 listRelease((list*)val);
1138 }
1139
1140 static int dictSdsKeyCompare(void *privdata, const void *key1,
1141 const void *key2)
1142 {
1143 int l1,l2;
1144 DICT_NOTUSED(privdata);
1145
1146 l1 = sdslen((sds)key1);
1147 l2 = sdslen((sds)key2);
1148 if (l1 != l2) return 0;
1149 return memcmp(key1, key2, l1) == 0;
1150 }
1151
/* Destructor for entries holding a redis object: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, as the values of swapped
 * out keys are set to NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1159
/* Destructor for entries holding an sds string: the string is released
 * with sdsfree(). The privdata argument is not used. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1166
1167 static int dictObjKeyCompare(void *privdata, const void *key1,
1168 const void *key2)
1169 {
1170 const robj *o1 = key1, *o2 = key2;
1171 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1172 }
1173
1174 static unsigned int dictObjHash(const void *key) {
1175 const robj *o = key;
1176 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177 }
1178
/* Hash function for sds strings: hashes all the sdslen() bytes of the
 * key. */
static unsigned int dictSdsHash(const void *key) {
    unsigned char *bytes = (unsigned char*)key;

    return dictGenHashFunction(bytes, sdslen((char*)key));
}
1182
1183 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1184 const void *key2)
1185 {
1186 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1187 int cmp;
1188
1189 if (o1->encoding == REDIS_ENCODING_INT &&
1190 o2->encoding == REDIS_ENCODING_INT)
1191 return o1->ptr == o2->ptr;
1192
1193 o1 = getDecodedObject(o1);
1194 o2 = getDecodedObject(o2);
1195 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1196 decrRefCount(o1);
1197 decrRefCount(o2);
1198 return cmp;
1199 }
1200
1201 static unsigned int dictEncObjHash(const void *key) {
1202 robj *o = (robj*) key;
1203
1204 if (o->encoding == REDIS_ENCODING_RAW) {
1205 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1206 } else {
1207 if (o->encoding == REDIS_ENCODING_INT) {
1208 char buf[32];
1209 int len;
1210
1211 len = ll2string(buf,32,(long)o->ptr);
1212 return dictGenHashFunction((unsigned char*)buf, len);
1213 } else {
1214 unsigned int hash;
1215
1216 o = getDecodedObject(o);
1217 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1218 decrRefCount(o);
1219 return hash;
1220 }
1221 }
1222 }
1223
1224 /* Sets type. Keys are hashed and compared with the dictEncObj* callbacks
 * and released with dictRedisObjectDestructor; no key/value duplication
 * callbacks and no value destructor are installed. */
1225 static dictType setDictType = {
1226 dictEncObjHash, /* hash function */
1227 NULL, /* key dup */
1228 NULL, /* val dup */
1229 dictEncObjKeyCompare, /* key compare */
1230 dictRedisObjectDestructor, /* key destructor */
1231 NULL /* val destructor */
1232 };
1233
1234 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Same key callbacks as the sets type; values are released with
 * dictVanillaFree, that is, with zfree(). */
1235 static dictType zsetDictType = {
1236 dictEncObjHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictEncObjKeyCompare, /* key compare */
1240 dictRedisObjectDestructor, /* key destructor */
1241 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1242 };
1243
1244 /* Db->dict, keys are sds strings, vals are Redis objects. Keys are
 * released with sdsfree() and values with decrRefCount() through the two
 * destructors below. */
1245 static dictType dbDictType = {
1246 dictSdsHash, /* hash function */
1247 NULL, /* key dup */
1248 NULL, /* val dup */
1249 dictSdsKeyCompare, /* key compare */
1250 dictSdsDestructor, /* key destructor */
1251 dictRedisObjectDestructor /* val destructor */
1252 };
1253
1254 /* Db->expires. Keys are sds strings handled like in dbDictType; no value
 * destructor is installed, so values are never released by the dict. */
1255 static dictType keyptrDictType = {
1256 dictSdsHash, /* hash function */
1257 NULL, /* key dup */
1258 NULL, /* val dup */
1259 dictSdsKeyCompare, /* key compare */
1260 dictSdsDestructor, /* key destructor */
1261 NULL /* val destructor */
1262 };
1263
1264 /* Hash type hash table (note that small hashes are represented with
 * zipmaps). Both keys and values are redis objects released with
 * dictRedisObjectDestructor. */
1265 static dictType hashDictType = {
1266 dictEncObjHash, /* hash function */
1267 NULL, /* key dup */
1268 NULL, /* val dup */
1269 dictEncObjKeyCompare, /* key compare */
1270 dictRedisObjectDestructor, /* key destructor */
1271 dictRedisObjectDestructor /* val destructor */
1272 };
1273
1274 /* Keylist hash table type has unencoded redis objects as keys and
1275 * lists as values. It's used for blocking operations (BLPOP) and to
1276 * map swapped keys to a list of clients waiting for these keys to be
 * loaded. Values are released with listRelease() through
 * dictListDestructor. */
1277 static dictType keylistDictType = {
1278 dictObjHash, /* hash function */
1279 NULL, /* key dup */
1280 NULL, /* val dup */
1281 dictObjKeyCompare, /* key compare */
1282 dictRedisObjectDestructor, /* key destructor */
1283 dictListDestructor /* val destructor */
1284 };
1285
/* Forward declaration of version(); its definition is not in this part of
 * the file. */
1286 static void version();
1287
1288 /* ========================= Random utility functions ======================= */
1289
1290 /* Redis generally does not try to recover from out of memory conditions
1291 * when allocating objects or strings, it is not clear if it will be possible
1292 * to report this condition to the client since the networking layer itself
1293 * is based on heap allocation for send buffers, so we simply abort.
1294 * At least the code will be simpler to read... */
1295 static void oom(const char *msg) {
1296 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1297 sleep(1);
1298 abort();
1299 }
1300
1301 /* ====================== Redis server networking stuff ===================== */
1302 static void closeTimedoutClients(void) {
1303 redisClient *c;
1304 listNode *ln;
1305 time_t now = time(NULL);
1306 listIter li;
1307
1308 listRewind(server.clients,&li);
1309 while ((ln = listNext(&li)) != NULL) {
1310 c = listNodeValue(ln);
1311 if (server.maxidletime &&
1312 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1313 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1314 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1315 listLength(c->pubsub_patterns) == 0 &&
1316 (now - c->lastinteraction > server.maxidletime))
1317 {
1318 redisLog(REDIS_VERBOSE,"Closing idle client");
1319 freeClient(c);
1320 } else if (c->flags & REDIS_BLOCKED) {
1321 if (c->blockingto != 0 && c->blockingto < now) {
1322 addReply(c,shared.nullmultibulk);
1323 unblockClientWaitingData(c);
1324 }
1325 }
1326 }
1327 }
1328
1329 static int htNeedsResize(dict *dict) {
1330 long long size, used;
1331
1332 size = dictSlots(dict);
1333 used = dictSize(dict);
1334 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1335 (used*100/size < REDIS_HT_MINFILL));
1336 }
1337
1338 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1339 * we resize the hash table to save memory */
1340 static void tryResizeHashTables(void) {
1341 int j;
1342
1343 for (j = 0; j < server.dbnum; j++) {
1344 if (htNeedsResize(server.db[j].dict))
1345 dictResize(server.db[j].dict);
1346 if (htNeedsResize(server.db[j].expires))
1347 dictResize(server.db[j].expires);
1348 }
1349 }
1350
1351 /* Our hash table implementation performs rehashing incrementally while
1352 * we write/read from the hash table. Still if the server is idle, the hash
1353 * table will use two tables for a long time. So we try to use 1 millisecond
1354 * of CPU time at every serverCron() loop in order to rehash some key. */
1355 static void incrementallyRehash(void) {
1356 int j;
1357
1358 for (j = 0; j < server.dbnum; j++) {
1359 if (dictIsRehashing(server.db[j].dict)) {
1360 dictRehashMilliseconds(server.db[j].dict,1);
1361 break; /* already used our millisecond for this loop... */
1362 }
1363 }
1364 }
1365
1366 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1367 void backgroundSaveDoneHandler(int statloc) {
1368 int exitcode = WEXITSTATUS(statloc);
1369 int bysignal = WIFSIGNALED(statloc);
1370
1371 if (!bysignal && exitcode == 0) {
1372 redisLog(REDIS_NOTICE,
1373 "Background saving terminated with success");
1374 server.dirty = 0;
1375 server.lastsave = time(NULL);
1376 } else if (!bysignal && exitcode != 0) {
1377 redisLog(REDIS_WARNING, "Background saving error");
1378 } else {
1379 redisLog(REDIS_WARNING,
1380 "Background saving terminated by signal %d", WTERMSIG(statloc));
1381 rdbRemoveTempFile(server.bgsavechildpid);
1382 }
1383 server.bgsavechildpid = -1;
1384 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1385 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1386 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1387 }
1388
1389 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1390 * Handle this. */
1391 void backgroundRewriteDoneHandler(int statloc) {
1392 int exitcode = WEXITSTATUS(statloc);
1393 int bysignal = WIFSIGNALED(statloc);
1394
1395 if (!bysignal && exitcode == 0) {
1396 int fd;
1397 char tmpfile[256];
1398
1399 redisLog(REDIS_NOTICE,
1400 "Background append only file rewriting terminated with success");
1401 /* Now it's time to flush the differences accumulated by the parent */
1402 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1403 fd = open(tmpfile,O_WRONLY|O_APPEND);
1404 if (fd == -1) {
1405 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1406 goto cleanup;
1407 }
1408 /* Flush our data... */
1409 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1410 (signed) sdslen(server.bgrewritebuf)) {
1411 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1412 close(fd);
1413 goto cleanup;
1414 }
1415 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1416 /* Now our work is to rename the temp file into the stable file. And
1417 * switch the file descriptor used by the server for append only. */
1418 if (rename(tmpfile,server.appendfilename) == -1) {
1419 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1420 close(fd);
1421 goto cleanup;
1422 }
1423 /* Mission completed... almost */
1424 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1425 if (server.appendfd != -1) {
1426 /* If append only is actually enabled... */
1427 close(server.appendfd);
1428 server.appendfd = fd;
1429 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1430 server.appendseldb = -1; /* Make sure it will issue SELECT */
1431 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1432 } else {
1433 /* If append only is disabled we just generate a dump in this
1434 * format. Why not? */
1435 close(fd);
1436 }
1437 } else if (!bysignal && exitcode != 0) {
1438 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1439 } else {
1440 redisLog(REDIS_WARNING,
1441 "Background append only file rewriting terminated by signal %d",
1442 WTERMSIG(statloc));
1443 }
1444 cleanup:
1445 sdsfree(server.bgrewritebuf);
1446 server.bgrewritebuf = sdsempty();
1447 aofRemoveTempFile(server.bgrewritechildpid);
1448 server.bgrewritechildpid = -1;
1449 }
1450
1451 /* This function is called once a background process of some kind terminates,
1452 * as we want to avoid resizing the hash tables when there is a child in order
1453 * to play well with copy-on-write (otherwise when a resize happens lots of
1454 * memory pages are copied). The goal of this function is to update the ability
1455 * for dict.c to resize the hash tables accordingly to the fact we have o not
1456 * running childs. */
1457 static void updateDictResizePolicy(void) {
1458 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1459 dictEnableResize();
1460 else
1461 dictDisableResize();
1462 }
1463
1464 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1465 int j, loops = server.cronloops++;
1466 REDIS_NOTUSED(eventLoop);
1467 REDIS_NOTUSED(id);
1468 REDIS_NOTUSED(clientData);
1469
1470 /* We take a cached value of the unix time in the global state because
1471 * with virtual memory and aging there is to store the current time
1472 * in objects at every object access, and accuracy is not needed.
1473 * To access a global var is faster than calling time(NULL) */
1474 server.unixtime = time(NULL);
1475 /* We have just 21 bits per object for LRU information.
1476 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1477 *
1478 * When we need to select what object to swap, we compute the minimum
1479 * time distance between the current lruclock and the object last access
1480 * lruclock info. Even if clocks will wrap on overflow, there is
1481 * the interesting property that we are sure that at least
1482 * ABS(A-B) minutes passed between current time and timestamp B.
1483 *
1484 * This is not precise but we don't need at all precision, but just
1485 * something statistically reasonable.
1486 */
1487 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1488
1489 /* We received a SIGTERM, shutting down here in a safe way, as it is
1490 * not ok doing so inside the signal handler. */
1491 if (server.shutdown_asap) {
1492 if (prepareForShutdown() == REDIS_OK) exit(0);
1493 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1494 }
1495
1496 /* Show some info about non-empty databases */
1497 for (j = 0; j < server.dbnum; j++) {
1498 long long size, used, vkeys;
1499
1500 size = dictSlots(server.db[j].dict);
1501 used = dictSize(server.db[j].dict);
1502 vkeys = dictSize(server.db[j].expires);
1503 if (!(loops % 50) && (used || vkeys)) {
1504 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1505 /* dictPrintStats(server.dict); */
1506 }
1507 }
1508
1509 /* We don't want to resize the hash tables while a bacground saving
1510 * is in progress: the saving child is created using fork() that is
1511 * implemented with a copy-on-write semantic in most modern systems, so
1512 * if we resize the HT while there is the saving child at work actually
1513 * a lot of memory movements in the parent will cause a lot of pages
1514 * copied. */
1515 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1516 if (!(loops % 10)) tryResizeHashTables();
1517 if (server.activerehashing) incrementallyRehash();
1518 }
1519
1520 /* Show information about connected clients */
1521 if (!(loops % 50)) {
1522 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1523 listLength(server.clients)-listLength(server.slaves),
1524 listLength(server.slaves),
1525 zmalloc_used_memory());
1526 }
1527
1528 /* Close connections of timedout clients */
1529 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1530 closeTimedoutClients();
1531
1532 /* Check if a background saving or AOF rewrite in progress terminated */
1533 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1534 int statloc;
1535 pid_t pid;
1536
1537 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1538 if (pid == server.bgsavechildpid) {
1539 backgroundSaveDoneHandler(statloc);
1540 } else {
1541 backgroundRewriteDoneHandler(statloc);
1542 }
1543 updateDictResizePolicy();
1544 }
1545 } else {
1546 /* If there is not a background saving in progress check if
1547 * we have to save now */
1548 time_t now = time(NULL);
1549 for (j = 0; j < server.saveparamslen; j++) {
1550 struct saveparam *sp = server.saveparams+j;
1551
1552 if (server.dirty >= sp->changes &&
1553 now-server.lastsave > sp->seconds) {
1554 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1555 sp->changes, sp->seconds);
1556 rdbSaveBackground(server.dbfilename);
1557 break;
1558 }
1559 }
1560 }
1561
1562 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1563 * will use few CPU cycles if there are few expiring keys, otherwise
1564 * it will get more aggressive to avoid that too much memory is used by
1565 * keys that can be removed from the keyspace. */
1566 for (j = 0; j < server.dbnum; j++) {
1567 int expired;
1568 redisDb *db = server.db+j;
1569
1570 /* Continue to expire if at the end of the cycle more than 25%
1571 * of the keys were expired. */
1572 do {
1573 long num = dictSize(db->expires);
1574 time_t now = time(NULL);
1575
1576 expired = 0;
1577 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1578 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1579 while (num--) {
1580 dictEntry *de;
1581 time_t t;
1582
1583 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1584 t = (time_t) dictGetEntryVal(de);
1585 if (now > t) {
1586 sds key = dictGetEntryKey(de);
1587 robj *keyobj = createStringObject(key,sdslen(key));
1588
1589 dbDelete(db,keyobj);
1590 decrRefCount(keyobj);
1591 expired++;
1592 server.stat_expiredkeys++;
1593 }
1594 }
1595 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1596 }
1597
1598 /* Swap a few keys on disk if we are over the memory limit and VM
1599 * is enabled. Try to free objects from the free list first. */
1600 if (vmCanSwapOut()) {
1601 while (server.vm_enabled && zmalloc_used_memory() >
1602 server.vm_max_memory)
1603 {
1604 int retval;
1605
1606 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1607 retval = (server.vm_max_threads == 0) ?
1608 vmSwapOneObjectBlocking() :
1609 vmSwapOneObjectThreaded();
1610 if (retval == REDIS_ERR && !(loops % 300) &&
1611 zmalloc_used_memory() >
1612 (server.vm_max_memory+server.vm_max_memory/10))
1613 {
1614 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1615 }
1616 /* Note that when using threaded I/O we free just one object,
1617 * because anyway when the I/O thread in charge to swap this
1618 * object out will finish, the handler of completed jobs
1619 * will try to swap more objects if we are still out of memory. */
1620 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1621 }
1622 }
1623
1624 /* Check if we should connect to a MASTER */
1625 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1626 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1627 if (syncWithMaster() == REDIS_OK) {
1628 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1629 if (server.appendonly) rewriteAppendOnlyFileBackground();
1630 }
1631 }
1632 return 100;
1633 }
1634
1635 /* This function gets called every time Redis is entering the
1636 * main loop of the event driven library, that is, before to sleep
1637 * for ready file descriptors. */
1638 static void beforeSleep(struct aeEventLoop *eventLoop) {
1639 REDIS_NOTUSED(eventLoop);
1640
1641 /* Awake clients that got all the swapped keys they requested */
1642 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1643 listIter li;
1644 listNode *ln;
1645
1646 listRewind(server.io_ready_clients,&li);
1647 while((ln = listNext(&li))) {
1648 redisClient *c = ln->value;
1649 struct redisCommand *cmd;
1650
1651 /* Resume the client. */
1652 listDelNode(server.io_ready_clients,ln);
1653 c->flags &= (~REDIS_IO_WAIT);
1654 server.vm_blocked_clients--;
1655 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1656 readQueryFromClient, c);
1657 cmd = lookupCommand(c->argv[0]->ptr);
1658 assert(cmd != NULL);
1659 call(c,cmd);
1660 resetClient(c);
1661 /* There may be more data to process in the input buffer. */
1662 if (c->querybuf && sdslen(c->querybuf) > 0)
1663 processInputBuffer(c);
1664 }
1665 }
1666 /* Write the AOF buffer on disk */
1667 flushAppendOnlyFile();
1668 }
1669
1670 static void createSharedObjects(void) {
1671 int j;
1672
1673 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1674 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1675 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1676 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1677 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1678 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1679 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1680 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1681 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1682 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1683 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1684 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1685 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1686 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1687 "-ERR no such key\r\n"));
1688 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR syntax error\r\n"));
1690 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR source and destination objects are the same\r\n"));
1692 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR index out of range\r\n"));
1694 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1695 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1696 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1697 shared.select0 = createStringObject("select 0\r\n",10);
1698 shared.select1 = createStringObject("select 1\r\n",10);
1699 shared.select2 = createStringObject("select 2\r\n",10);
1700 shared.select3 = createStringObject("select 3\r\n",10);
1701 shared.select4 = createStringObject("select 4\r\n",10);
1702 shared.select5 = createStringObject("select 5\r\n",10);
1703 shared.select6 = createStringObject("select 6\r\n",10);
1704 shared.select7 = createStringObject("select 7\r\n",10);
1705 shared.select8 = createStringObject("select 8\r\n",10);
1706 shared.select9 = createStringObject("select 9\r\n",10);
1707 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1708 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1709 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1710 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1711 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1712 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1713 shared.mbulk3 = createStringObject("*3\r\n",4);
1714 shared.mbulk4 = createStringObject("*4\r\n",4);
1715 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1716 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1717 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1718 }
1719 }
1720
1721 static void appendServerSaveParams(time_t seconds, int changes) {
1722 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1723 server.saveparams[server.saveparamslen].seconds = seconds;
1724 server.saveparams[server.saveparamslen].changes = changes;
1725 server.saveparamslen++;
1726 }
1727
1728 static void resetServerSaveParams() {
1729 zfree(server.saveparams);
1730 server.saveparams = NULL;
1731 server.saveparamslen = 0;
1732 }
1733
1734 static void initServerConfig() {
1735 server.dbnum = REDIS_DEFAULT_DBNUM;
1736 server.port = REDIS_SERVERPORT;
1737 server.verbosity = REDIS_VERBOSE;
1738 server.maxidletime = REDIS_MAXIDLETIME;
1739 server.saveparams = NULL;
1740 server.logfile = NULL; /* NULL = log on standard output */
1741 server.bindaddr = NULL;
1742 server.glueoutputbuf = 1;
1743 server.daemonize = 0;
1744 server.appendonly = 0;
1745 server.appendfsync = APPENDFSYNC_EVERYSEC;
1746 server.no_appendfsync_on_rewrite = 0;
1747 server.lastfsync = time(NULL);
1748 server.appendfd = -1;
1749 server.appendseldb = -1; /* Make sure the first time will not match */
1750 server.pidfile = zstrdup("/var/run/redis.pid");
1751 server.dbfilename = zstrdup("dump.rdb");
1752 server.appendfilename = zstrdup("appendonly.aof");
1753 server.requirepass = NULL;
1754 server.rdbcompression = 1;
1755 server.activerehashing = 1;
1756 server.maxclients = 0;
1757 server.blpop_blocked_clients = 0;
1758 server.maxmemory = 0;
1759 server.vm_enabled = 0;
1760 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1761 server.vm_page_size = 256; /* 256 bytes per page */
1762 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1763 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1764 server.vm_max_threads = 4;
1765 server.vm_blocked_clients = 0;
1766 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1767 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1768 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1769 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
1770 server.set_max_intset_entries = REDIS_SET_MAX_INTSET_ENTRIES;
1771 server.shutdown_asap = 0;
1772
1773 resetServerSaveParams();
1774
1775 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1776 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1777 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1778 /* Replication related */
1779 server.isslave = 0;
1780 server.masterauth = NULL;
1781 server.masterhost = NULL;
1782 server.masterport = 6379;
1783 server.master = NULL;
1784 server.replstate = REDIS_REPL_NONE;
1785
1786 /* Double constants initialization */
1787 R_Zero = 0.0;
1788 R_PosInf = 1.0/R_Zero;
1789 R_NegInf = -1.0/R_Zero;
1790 R_Nan = R_Zero/R_Zero;
1791 }
1792
1793 static void initServer() {
1794 int j;
1795
1796 signal(SIGHUP, SIG_IGN);
1797 signal(SIGPIPE, SIG_IGN);
1798 setupSigSegvAction();
1799
1800 server.devnull = fopen("/dev/null","w");
1801 if (server.devnull == NULL) {
1802 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1803 exit(1);
1804 }
1805 server.clients = listCreate();
1806 server.slaves = listCreate();
1807 server.monitors = listCreate();
1808 server.objfreelist = listCreate();
1809 createSharedObjects();
1810 server.el = aeCreateEventLoop();
1811 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1812 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1813 if (server.fd == -1) {
1814 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1815 exit(1);
1816 }
1817 for (j = 0; j < server.dbnum; j++) {
1818 server.db[j].dict = dictCreate(&dbDictType,NULL);
1819 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1820 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1821 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1822 if (server.vm_enabled)
1823 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1824 server.db[j].id = j;
1825 }
1826 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1827 server.pubsub_patterns = listCreate();
1828 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1829 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1830 server.cronloops = 0;
1831 server.bgsavechildpid = -1;
1832 server.bgrewritechildpid = -1;
1833 server.bgrewritebuf = sdsempty();
1834 server.aofbuf = sdsempty();
1835 server.lastsave = time(NULL);
1836 server.dirty = 0;
1837 server.stat_numcommands = 0;
1838 server.stat_numconnections = 0;
1839 server.stat_expiredkeys = 0;
1840 server.stat_starttime = time(NULL);
1841 server.unixtime = time(NULL);
1842 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1843 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1844 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1845
1846 if (server.appendonly) {
1847 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1848 if (server.appendfd == -1) {
1849 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1850 strerror(errno));
1851 exit(1);
1852 }
1853 }
1854
1855 if (server.vm_enabled) vmInit();
1856 }
1857
1858 /* Empty the whole database */
1859 static long long emptyDb() {
1860 int j;
1861 long long removed = 0;
1862
1863 for (j = 0; j < server.dbnum; j++) {
1864 removed += dictSize(server.db[j].dict);
1865 dictEmpty(server.db[j].dict);
1866 dictEmpty(server.db[j].expires);
1867 }
1868 return removed;
1869 }
1870
/* Map a "yes"/"no" string (compared case-insensitively) to an integer:
 * 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1876
1877 /* I agree, this is a very rudimental way to load a configuration...
1878 will improve later if the config gets more complex */
1879 static void loadServerConfig(char *filename) {
1880 FILE *fp;
1881 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1882 int linenum = 0;
1883 sds line = NULL;
1884
1885 if (filename[0] == '-' && filename[1] == '\0')
1886 fp = stdin;
1887 else {
1888 if ((fp = fopen(filename,"r")) == NULL) {
1889 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1890 exit(1);
1891 }
1892 }
1893
1894 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1895 sds *argv;
1896 int argc, j;
1897
1898 linenum++;
1899 line = sdsnew(buf);
1900 line = sdstrim(line," \t\r\n");
1901
1902 /* Skip comments and blank lines*/
1903 if (line[0] == '#' || line[0] == '\0') {
1904 sdsfree(line);
1905 continue;
1906 }
1907
1908 /* Split into arguments */
1909 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1910 sdstolower(argv[0]);
1911
1912 /* Execute config directives */
1913 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1914 server.maxidletime = atoi(argv[1]);
1915 if (server.maxidletime < 0) {
1916 err = "Invalid timeout value"; goto loaderr;
1917 }
1918 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1919 server.port = atoi(argv[1]);
1920 if (server.port < 1 || server.port > 65535) {
1921 err = "Invalid port"; goto loaderr;
1922 }
1923 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1924 server.bindaddr = zstrdup(argv[1]);
1925 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1926 int seconds = atoi(argv[1]);
1927 int changes = atoi(argv[2]);
1928 if (seconds < 1 || changes < 0) {
1929 err = "Invalid save parameters"; goto loaderr;
1930 }
1931 appendServerSaveParams(seconds,changes);
1932 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1933 if (chdir(argv[1]) == -1) {
1934 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1935 argv[1], strerror(errno));
1936 exit(1);
1937 }
1938 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1939 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1940 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1941 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1942 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1943 else {
1944 err = "Invalid log level. Must be one of debug, notice, warning";
1945 goto loaderr;
1946 }
1947 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1948 FILE *logfp;
1949
1950 server.logfile = zstrdup(argv[1]);
1951 if (!strcasecmp(server.logfile,"stdout")) {
1952 zfree(server.logfile);
1953 server.logfile = NULL;
1954 }
1955 if (server.logfile) {
1956 /* Test if we are able to open the file. The server will not
1957 * be able to abort just for this problem later... */
1958 logfp = fopen(server.logfile,"a");
1959 if (logfp == NULL) {
1960 err = sdscatprintf(sdsempty(),
1961 "Can't open the log file: %s", strerror(errno));
1962 goto loaderr;
1963 }
1964 fclose(logfp);
1965 }
1966 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1967 server.dbnum = atoi(argv[1]);
1968 if (server.dbnum < 1) {
1969 err = "Invalid number of databases"; goto loaderr;
1970 }
1971 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1972 loadServerConfig(argv[1]);
1973 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1974 server.maxclients = atoi(argv[1]);
1975 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1976 server.maxmemory = memtoll(argv[1],NULL);
1977 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1978 server.masterhost = sdsnew(argv[1]);
1979 server.masterport = atoi(argv[2]);
1980 server.replstate = REDIS_REPL_CONNECT;
1981 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1982 server.masterauth = zstrdup(argv[1]);
1983 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1984 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1985 err = "argument must be 'yes' or 'no'"; goto loaderr;
1986 }
1987 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1988 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1989 err = "argument must be 'yes' or 'no'"; goto loaderr;
1990 }
1991 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1992 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1993 err = "argument must be 'yes' or 'no'"; goto loaderr;
1994 }
1995 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1996 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1997 err = "argument must be 'yes' or 'no'"; goto loaderr;
1998 }
1999 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2000 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2001 err = "argument must be 'yes' or 'no'"; goto loaderr;
2002 }
2003 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2004 zfree(server.appendfilename);
2005 server.appendfilename = zstrdup(argv[1]);
2006 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2007 && argc == 2) {
2008 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2009 err = "argument must be 'yes' or 'no'"; goto loaderr;
2010 }
2011 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
2012 if (!strcasecmp(argv[1],"no")) {
2013 server.appendfsync = APPENDFSYNC_NO;
2014 } else if (!strcasecmp(argv[1],"always")) {
2015 server.appendfsync = APPENDFSYNC_ALWAYS;
2016 } else if (!strcasecmp(argv[1],"everysec")) {
2017 server.appendfsync = APPENDFSYNC_EVERYSEC;
2018 } else {
2019 err = "argument must be 'no', 'always' or 'everysec'";
2020 goto loaderr;
2021 }
2022 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2023 server.requirepass = zstrdup(argv[1]);
2024 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2025 zfree(server.pidfile);
2026 server.pidfile = zstrdup(argv[1]);
2027 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2028 zfree(server.dbfilename);
2029 server.dbfilename = zstrdup(argv[1]);
2030 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2031 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2032 err = "argument must be 'yes' or 'no'"; goto loaderr;
2033 }
2034 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2035 zfree(server.vm_swap_file);
2036 server.vm_swap_file = zstrdup(argv[1]);
2037 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2038 server.vm_max_memory = memtoll(argv[1],NULL);
2039 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2040 server.vm_page_size = memtoll(argv[1], NULL);
2041 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2042 server.vm_pages = memtoll(argv[1], NULL);
2043 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2044 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2045 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2046 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2047 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2048 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2049 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2050 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2051 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2052 server.list_max_ziplist_value = memtoll(argv[1], NULL);
2053 } else if (!strcasecmp(argv[0],"set-max-intset-entries") && argc == 2){
2054 server.set_max_intset_entries = memtoll(argv[1], NULL);
2055 } else {
2056 err = "Bad directive or wrong number of arguments"; goto loaderr;
2057 }
2058 for (j = 0; j < argc; j++)
2059 sdsfree(argv[j]);
2060 zfree(argv);
2061 sdsfree(line);
2062 }
2063 if (fp != stdin) fclose(fp);
2064 return;
2065
2066 loaderr:
2067 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2068 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2069 fprintf(stderr, ">>> '%s'\n", line);
2070 fprintf(stderr, "%s\n", err);
2071 exit(1);
2072 }
2073
2074 static void freeClientArgv(redisClient *c) {
2075 int j;
2076
2077 for (j = 0; j < c->argc; j++)
2078 decrRefCount(c->argv[j]);
2079 for (j = 0; j < c->mbargc; j++)
2080 decrRefCount(c->mbargv[j]);
2081 c->argc = 0;
2082 c->mbargc = 0;
2083 }
2084
2085 static void freeClient(redisClient *c) {
2086 listNode *ln;
2087
2088 /* Note that if the client we are freeing is blocked into a blocking
2089 * call, we have to set querybuf to NULL *before* to call
2090 * unblockClientWaitingData() to avoid processInputBuffer() will get
2091 * called. Also it is important to remove the file events after
2092 * this, because this call adds the READABLE event. */
2093 sdsfree(c->querybuf);
2094 c->querybuf = NULL;
2095 if (c->flags & REDIS_BLOCKED)
2096 unblockClientWaitingData(c);
2097
2098 /* UNWATCH all the keys */
2099 unwatchAllKeys(c);
2100 listRelease(c->watched_keys);
2101 /* Unsubscribe from all the pubsub channels */
2102 pubsubUnsubscribeAllChannels(c,0);
2103 pubsubUnsubscribeAllPatterns(c,0);
2104 dictRelease(c->pubsub_channels);
2105 listRelease(c->pubsub_patterns);
2106 /* Obvious cleanup */
2107 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2108 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2109 listRelease(c->reply);
2110 freeClientArgv(c);
2111 close(c->fd);
2112 /* Remove from the list of clients */
2113 ln = listSearchKey(server.clients,c);
2114 redisAssert(ln != NULL);
2115 listDelNode(server.clients,ln);
2116 /* Remove from the list of clients that are now ready to be restarted
2117 * after waiting for swapped keys */
2118 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2119 ln = listSearchKey(server.io_ready_clients,c);
2120 if (ln) {
2121 listDelNode(server.io_ready_clients,ln);
2122 server.vm_blocked_clients--;
2123 }
2124 }
2125 /* Remove from the list of clients waiting for swapped keys */
2126 while (server.vm_enabled && listLength(c->io_keys)) {
2127 ln = listFirst(c->io_keys);
2128 dontWaitForSwappedKey(c,ln->value);
2129 }
2130 listRelease(c->io_keys);
2131 /* Master/slave cleanup */
2132 if (c->flags & REDIS_SLAVE) {
2133 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2134 close(c->repldbfd);
2135 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2136 ln = listSearchKey(l,c);
2137 redisAssert(ln != NULL);
2138 listDelNode(l,ln);
2139 }
2140 if (c->flags & REDIS_MASTER) {
2141 server.master = NULL;
2142 server.replstate = REDIS_REPL_CONNECT;
2143 }
2144 /* Release memory */
2145 zfree(c->argv);
2146 zfree(c->mbargv);
2147 freeClientMultiState(c);
2148 zfree(c);
2149 }
2150
2151 #define GLUEREPLY_UP_TO (1024)
2152 static void glueReplyBuffersIfNeeded(redisClient *c) {
2153 int copylen = 0;
2154 char buf[GLUEREPLY_UP_TO];
2155 listNode *ln;
2156 listIter li;
2157 robj *o;
2158
2159 listRewind(c->reply,&li);
2160 while((ln = listNext(&li))) {
2161 int objlen;
2162
2163 o = ln->value;
2164 objlen = sdslen(o->ptr);
2165 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2166 memcpy(buf+copylen,o->ptr,objlen);
2167 copylen += objlen;
2168 listDelNode(c->reply,ln);
2169 } else {
2170 if (copylen == 0) return;
2171 break;
2172 }
2173 }
2174 /* Now the output buffer is empty, add the new single element */
2175 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2176 listAddNodeHead(c->reply,o);
2177 }
2178
2179 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2180 redisClient *c = privdata;
2181 int nwritten = 0, totwritten = 0, objlen;
2182 robj *o;
2183 REDIS_NOTUSED(el);
2184 REDIS_NOTUSED(mask);
2185
2186 /* Use writev() if we have enough buffers to send */
2187 if (!server.glueoutputbuf &&
2188 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2189 !(c->flags & REDIS_MASTER))
2190 {
2191 sendReplyToClientWritev(el, fd, privdata, mask);
2192 return;
2193 }
2194
2195 while(listLength(c->reply)) {
2196 if (server.glueoutputbuf && listLength(c->reply) > 1)
2197 glueReplyBuffersIfNeeded(c);
2198
2199 o = listNodeValue(listFirst(c->reply));
2200 objlen = sdslen(o->ptr);
2201
2202 if (objlen == 0) {
2203 listDelNode(c->reply,listFirst(c->reply));
2204 continue;
2205 }
2206
2207 if (c->flags & REDIS_MASTER) {
2208 /* Don't reply to a master */
2209 nwritten = objlen - c->sentlen;
2210 } else {
2211 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2212 if (nwritten <= 0) break;
2213 }
2214 c->sentlen += nwritten;
2215 totwritten += nwritten;
2216 /* If we fully sent the object on head go to the next one */
2217 if (c->sentlen == objlen) {
2218 listDelNode(c->reply,listFirst(c->reply));
2219 c->sentlen = 0;
2220 }
2221 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2222 * bytes, in a single threaded server it's a good idea to serve
2223 * other clients as well, even if a very large request comes from
2224 * super fast link that is always able to accept data (in real world
2225 * scenario think about 'KEYS *' against the loopback interfae) */
2226 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2227 }
2228 if (nwritten == -1) {
2229 if (errno == EAGAIN) {
2230 nwritten = 0;
2231 } else {
2232 redisLog(REDIS_VERBOSE,
2233 "Error writing to client: %s", strerror(errno));
2234 freeClient(c);
2235 return;
2236 }
2237 }
2238 if (totwritten > 0) c->lastinteraction = time(NULL);
2239 if (listLength(c->reply) == 0) {
2240 c->sentlen = 0;
2241 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2242 }
2243 }
2244
2245 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2246 {
2247 redisClient *c = privdata;
2248 int nwritten = 0, totwritten = 0, objlen, willwrite;
2249 robj *o;
2250 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2251 int offset, ion = 0;
2252 REDIS_NOTUSED(el);
2253 REDIS_NOTUSED(mask);
2254
2255 listNode *node;
2256 while (listLength(c->reply)) {
2257 offset = c->sentlen;
2258 ion = 0;
2259 willwrite = 0;
2260
2261 /* fill-in the iov[] array */
2262 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2263 o = listNodeValue(node);
2264 objlen = sdslen(o->ptr);
2265
2266 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2267 break;
2268
2269 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2270 break; /* no more iovecs */
2271
2272 iov[ion].iov_base = ((char*)o->ptr) + offset;
2273 iov[ion].iov_len = objlen - offset;
2274 willwrite += objlen - offset;
2275 offset = 0; /* just for the first item */
2276 ion++;
2277 }
2278
2279 if(willwrite == 0)
2280 break;
2281
2282 /* write all collected blocks at once */
2283 if((nwritten = writev(fd, iov, ion)) < 0) {
2284 if (errno != EAGAIN) {
2285 redisLog(REDIS_VERBOSE,
2286 "Error writing to client: %s", strerror(errno));
2287 freeClient(c);
2288 return;
2289 }
2290 break;
2291 }
2292
2293 totwritten += nwritten;
2294 offset = c->sentlen;
2295
2296 /* remove written robjs from c->reply */
2297 while (nwritten && listLength(c->reply)) {
2298 o = listNodeValue(listFirst(c->reply));
2299 objlen = sdslen(o->ptr);
2300
2301 if(nwritten >= objlen - offset) {
2302 listDelNode(c->reply, listFirst(c->reply));
2303 nwritten -= objlen - offset;
2304 c->sentlen = 0;
2305 } else {
2306 /* partial write */
2307 c->sentlen += nwritten;
2308 break;
2309 }
2310 offset = 0;
2311 }
2312 }
2313
2314 if (totwritten > 0)
2315 c->lastinteraction = time(NULL);
2316
2317 if (listLength(c->reply) == 0) {
2318 c->sentlen = 0;
2319 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2320 }
2321 }
2322
2323 static int qsortRedisCommands(const void *r1, const void *r2) {
2324 return strcasecmp(
2325 ((struct redisCommand*)r1)->name,
2326 ((struct redisCommand*)r2)->name);
2327 }
2328
2329 static void sortCommandTable() {
2330 /* Copy and sort the read-only version of the command table */
2331 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2332 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2333 qsort(commandTable,
2334 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2335 sizeof(struct redisCommand),qsortRedisCommands);
2336 }
2337
2338 static struct redisCommand *lookupCommand(char *name) {
2339 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2340 return bsearch(
2341 &tmp,
2342 commandTable,
2343 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2344 sizeof(struct redisCommand),
2345 qsortRedisCommands);
2346 }
2347
2348 /* resetClient prepare the client to process the next command */
2349 static void resetClient(redisClient *c) {
2350 freeClientArgv(c);
2351 c->bulklen = -1;
2352 c->multibulk = 0;
2353 }
2354
2355 /* Call() is the core of Redis execution of a command */
2356 static void call(redisClient *c, struct redisCommand *cmd) {
2357 long long dirty;
2358
2359 dirty = server.dirty;
2360 cmd->proc(c);
2361 dirty = server.dirty-dirty;
2362
2363 if (server.appendonly && dirty)
2364 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2365 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2366 listLength(server.slaves))
2367 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2368 if (listLength(server.monitors))
2369 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2370 server.stat_numcommands++;
2371 }
2372
2373 /* If this function gets called we already read a whole
2374 * command, arguments are in the client argv/argc fields.
2375 * processCommand() executes the command or prepares the
2376 * server for a bulk read from the client.
2377 *
2378 * If 1 is returned the client is still alive and valid and
2379 * other operations can be performed by the caller. Otherwise
2380 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2381 static int processCommand(redisClient *c) {
2382 struct redisCommand *cmd;
2383
2384 /* Free some memory if needed (maxmemory setting) */
2385 if (server.maxmemory) freeMemoryIfNeeded();
2386
2387 /* Handle the multi bulk command type. This is an alternative protocol
2388 * supported by Redis in order to receive commands that are composed of
2389 * multiple binary-safe "bulk" arguments. The latency of processing is
2390 * a bit higher but this allows things like multi-sets, so if this
2391 * protocol is used only for MSET and similar commands this is a big win. */
2392 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2393 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2394 if (c->multibulk <= 0) {
2395 resetClient(c);
2396 return 1;
2397 } else {
2398 decrRefCount(c->argv[c->argc-1]);
2399 c->argc--;
2400 return 1;
2401 }
2402 } else if (c->multibulk) {
2403 if (c->bulklen == -1) {
2404 if (((char*)c->argv[0]->ptr)[0] != '$') {
2405 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2406 resetClient(c);
2407 return 1;
2408 } else {
2409 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2410 decrRefCount(c->argv[0]);
2411 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2412 c->argc--;
2413 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2414 resetClient(c);
2415 return 1;
2416 }
2417 c->argc--;
2418 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2419 return 1;
2420 }
2421 } else {
2422 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2423 c->mbargv[c->mbargc] = c->argv[0];
2424 c->mbargc++;
2425 c->argc--;
2426 c->multibulk--;
2427 if (c->multibulk == 0) {
2428 robj **auxargv;
2429 int auxargc;
2430
2431 /* Here we need to swap the multi-bulk argc/argv with the
2432 * normal argc/argv of the client structure. */
2433 auxargv = c->argv;
2434 c->argv = c->mbargv;
2435 c->mbargv = auxargv;
2436
2437 auxargc = c->argc;
2438 c->argc = c->mbargc;
2439 c->mbargc = auxargc;
2440
2441 /* We need to set bulklen to something different than -1
2442 * in order for the code below to process the command without
2443 * to try to read the last argument of a bulk command as
2444 * a special argument. */
2445 c->bulklen = 0;
2446 /* continue below and process the command */
2447 } else {
2448 c->bulklen = -1;
2449 return 1;
2450 }
2451 }
2452 }
2453 /* -- end of multi bulk commands processing -- */
2454
2455 /* The QUIT command is handled as a special case. Normal command
2456 * procs are unable to close the client connection safely */
2457 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2458 freeClient(c);
2459 return 0;
2460 }
2461
2462 /* Now lookup the command and check ASAP about trivial error conditions
2463 * such wrong arity, bad command name and so forth. */
2464 cmd = lookupCommand(c->argv[0]->ptr);
2465 if (!cmd) {
2466 addReplySds(c,
2467 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2468 (char*)c->argv[0]->ptr));
2469 resetClient(c);
2470 return 1;
2471 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2472 (c->argc < -cmd->arity)) {
2473 addReplySds(c,
2474 sdscatprintf(sdsempty(),
2475 "-ERR wrong number of arguments for '%s' command\r\n",
2476 cmd->name));
2477 resetClient(c);
2478 return 1;
2479 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2480 /* This is a bulk command, we have to read the last argument yet. */
2481 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2482
2483 decrRefCount(c->argv[c->argc-1]);
2484 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2485 c->argc--;
2486 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2487 resetClient(c);
2488 return 1;
2489 }
2490 c->argc--;
2491 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2492 /* It is possible that the bulk read is already in the
2493 * buffer. Check this condition and handle it accordingly.
2494 * This is just a fast path, alternative to call processInputBuffer().
2495 * It's a good idea since the code is small and this condition
2496 * happens most of the times. */
2497 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2498 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2499 c->argc++;
2500 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2501 } else {
2502 /* Otherwise return... there is to read the last argument
2503 * from the socket. */
2504 return 1;
2505 }
2506 }
2507 /* Let's try to encode the bulk object to save space. */
2508 if (cmd->flags & REDIS_CMD_BULK)
2509 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2510
2511 /* Check if the user is authenticated */
2512 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2513 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2514 resetClient(c);
2515 return 1;
2516 }
2517
2518 /* Handle the maxmemory directive */
2519 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2520 zmalloc_used_memory() > server.maxmemory)
2521 {
2522 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2523 resetClient(c);
2524 return 1;
2525 }
2526
2527 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2528 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2529 &&
2530 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2531 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2532 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2533 resetClient(c);
2534 return 1;
2535 }
2536
2537 /* Exec the command */
2538 if (c->flags & REDIS_MULTI &&
2539 cmd->proc != execCommand && cmd->proc != discardCommand &&
2540 cmd->proc != multiCommand && cmd->proc != watchCommand)
2541 {
2542 queueMultiCommand(c,cmd);
2543 addReply(c,shared.queued);
2544 } else {
2545 if (server.vm_enabled && server.vm_max_threads > 0 &&
2546 blockClientOnSwappedKeys(c,cmd)) return 1;
2547 call(c,cmd);
2548 }
2549
2550 /* Prepare the client for the next command */
2551 resetClient(c);
2552 return 1;
2553 }
2554
2555 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2556 listNode *ln;
2557 listIter li;
2558 int outc = 0, j;
2559 robj **outv;
2560 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2561 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2562 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2563 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2564 robj *lenobj;
2565
2566 if (argc <= REDIS_STATIC_ARGS) {
2567 outv = static_outv;
2568 } else {
2569 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2570 }
2571
2572 lenobj = createObject(REDIS_STRING,
2573 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2574 lenobj->refcount = 0;
2575 outv[outc++] = lenobj;
2576 for (j = 0; j < argc; j++) {
2577 lenobj = createObject(REDIS_STRING,
2578 sdscatprintf(sdsempty(),"$%lu\r\n",
2579 (unsigned long) stringObjectLen(argv[j])));
2580 lenobj->refcount = 0;
2581 outv[outc++] = lenobj;
2582 outv[outc++] = argv[j];
2583 outv[outc++] = shared.crlf;
2584 }
2585
2586 /* Increment all the refcounts at start and decrement at end in order to
2587 * be sure to free objects if there is no slave in a replication state
2588 * able to be feed with commands */
2589 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2590 listRewind(slaves,&li);
2591 while((ln = listNext(&li))) {
2592 redisClient *slave = ln->value;
2593
2594 /* Don't feed slaves that are still waiting for BGSAVE to start */
2595 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2596
2597 /* Feed all the other slaves, MONITORs and so on */
2598 if (slave->slaveseldb != dictid) {
2599 robj *selectcmd;
2600
2601 switch(dictid) {
2602 case 0: selectcmd = shared.select0; break;
2603 case 1: selectcmd = shared.select1; break;
2604 case 2: selectcmd = shared.select2; break;
2605 case 3: selectcmd = shared.select3; break;
2606 case 4: selectcmd = shared.select4; break;
2607 case 5: selectcmd = shared.select5; break;
2608 case 6: selectcmd = shared.select6; break;
2609 case 7: selectcmd = shared.select7; break;
2610 case 8: selectcmd = shared.select8; break;
2611 case 9: selectcmd = shared.select9; break;
2612 default:
2613 selectcmd = createObject(REDIS_STRING,
2614 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2615 selectcmd->refcount = 0;
2616 break;
2617 }
2618 addReply(slave,selectcmd);
2619 slave->slaveseldb = dictid;
2620 }
2621 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2622 }
2623 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2624 if (outv != static_outv) zfree(outv);
2625 }
2626
2627 static sds sdscatrepr(sds s, char *p, size_t len) {
2628 s = sdscatlen(s,"\"",1);
2629 while(len--) {
2630 switch(*p) {
2631 case '\\':
2632 case '"':
2633 s = sdscatprintf(s,"\\%c",*p);
2634 break;
2635 case '\n': s = sdscatlen(s,"\\n",1); break;
2636 case '\r': s = sdscatlen(s,"\\r",1); break;
2637 case '\t': s = sdscatlen(s,"\\t",1); break;
2638 case '\a': s = sdscatlen(s,"\\a",1); break;
2639 case '\b': s = sdscatlen(s,"\\b",1); break;
2640 default:
2641 if (isprint(*p))
2642 s = sdscatprintf(s,"%c",*p);
2643 else
2644 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2645 break;
2646 }
2647 p++;
2648 }
2649 return sdscatlen(s,"\"",1);
2650 }
2651
2652 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2653 listNode *ln;
2654 listIter li;
2655 int j;
2656 sds cmdrepr = sdsnew("+");
2657 robj *cmdobj;
2658 struct timeval tv;
2659
2660 gettimeofday(&tv,NULL);
2661 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2662 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2663
2664 for (j = 0; j < argc; j++) {
2665 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2666 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2667 } else {
2668 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2669 sdslen(argv[j]->ptr));
2670 }
2671 if (j != argc-1)
2672 cmdrepr = sdscatlen(cmdrepr," ",1);
2673 }
2674 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2675 cmdobj = createObject(REDIS_STRING,cmdrepr);
2676
2677 listRewind(monitors,&li);
2678 while((ln = listNext(&li))) {
2679 redisClient *monitor = ln->value;
2680 addReply(monitor,cmdobj);
2681 }
2682 decrRefCount(cmdobj);
2683 }
2684
2685 static void processInputBuffer(redisClient *c) {
2686 again:
2687 /* Before to process the input buffer, make sure the client is not
2688 * waitig for a blocking operation such as BLPOP. Note that the first
2689 * iteration the client is never blocked, otherwise the processInputBuffer
2690 * would not be called at all, but after the execution of the first commands
2691 * in the input buffer the client may be blocked, and the "goto again"
2692 * will try to reiterate. The following line will make it return asap. */
2693 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2694 if (c->bulklen == -1) {
2695 /* Read the first line of the query */
2696 char *p = strchr(c->querybuf,'\n');
2697 size_t querylen;
2698
2699 if (p) {
2700 sds query, *argv;
2701 int argc, j;
2702
2703 query = c->querybuf;
2704 c->querybuf = sdsempty();
2705 querylen = 1+(p-(query));
2706 if (sdslen(query) > querylen) {
2707 /* leave data after the first line of the query in the buffer */
2708 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2709 }
2710 *p = '\0'; /* remove "\n" */
2711 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2712 sdsupdatelen(query);
2713
2714 /* Now we can split the query in arguments */
2715 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2716 sdsfree(query);
2717
2718 if (c->argv) zfree(c->argv);
2719 c->argv = zmalloc(sizeof(robj*)*argc);
2720
2721 for (j = 0; j < argc; j++) {
2722 if (sdslen(argv[j])) {
2723 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2724 c->argc++;
2725 } else {
2726 sdsfree(argv[j]);
2727 }
2728 }
2729 zfree(argv);
2730 if (c->argc) {
2731 /* Execute the command. If the client is still valid
2732 * after processCommand() return and there is something
2733 * on the query buffer try to process the next command. */
2734 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2735 } else {
2736 /* Nothing to process, argc == 0. Just process the query
2737 * buffer if it's not empty or return to the caller */
2738 if (sdslen(c->querybuf)) goto again;
2739 }
2740 return;
2741 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2742 redisLog(REDIS_VERBOSE, "Client protocol error");
2743 freeClient(c);
2744 return;
2745 }
2746 } else {
2747 /* Bulk read handling. Note that if we are at this point
2748 the client already sent a command terminated with a newline,
2749 we are reading the bulk data that is actually the last
2750 argument of the command. */
2751 int qbl = sdslen(c->querybuf);
2752
2753 if (c->bulklen <= qbl) {
2754 /* Copy everything but the final CRLF as final argument */
2755 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2756 c->argc++;
2757 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2758 /* Process the command. If the client is still valid after
2759 * the processing and there is more data in the buffer
2760 * try to parse it. */
2761 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2762 return;
2763 }
2764 }
2765 }
2766
2767 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2768 redisClient *c = (redisClient*) privdata;
2769 char buf[REDIS_IOBUF_LEN];
2770 int nread;
2771 REDIS_NOTUSED(el);
2772 REDIS_NOTUSED(mask);
2773
2774 nread = read(fd, buf, REDIS_IOBUF_LEN);
2775 if (nread == -1) {
2776 if (errno == EAGAIN) {
2777 nread = 0;
2778 } else {
2779 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2780 freeClient(c);
2781 return;
2782 }
2783 } else if (nread == 0) {
2784 redisLog(REDIS_VERBOSE, "Client closed connection");
2785 freeClient(c);
2786 return;
2787 }
2788 if (nread) {
2789 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2790 c->lastinteraction = time(NULL);
2791 } else {
2792 return;
2793 }
2794 processInputBuffer(c);
2795 }
2796
2797 static int selectDb(redisClient *c, int id) {
2798 if (id < 0 || id >= server.dbnum)
2799 return REDIS_ERR;
2800 c->db = &server.db[id];
2801 return REDIS_OK;
2802 }
2803
2804 static void *dupClientReplyValue(void *o) {
2805 incrRefCount((robj*)o);
2806 return o;
2807 }
2808
/* List match method: two values match when equalStringObjects() says the
 * objects are equal. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2812
2813 static redisClient *createClient(int fd) {
2814 redisClient *c = zmalloc(sizeof(*c));
2815
2816 anetNonBlock(NULL,fd);
2817 anetTcpNoDelay(NULL,fd);
2818 if (!c) return NULL;
2819 selectDb(c,0);
2820 c->fd = fd;
2821 c->querybuf = sdsempty();
2822 c->argc = 0;
2823 c->argv = NULL;
2824 c->bulklen = -1;
2825 c->multibulk = 0;
2826 c->mbargc = 0;
2827 c->mbargv = NULL;
2828 c->sentlen = 0;
2829 c->flags = 0;
2830 c->lastinteraction = time(NULL);
2831 c->authenticated = 0;
2832 c->replstate = REDIS_REPL_NONE;
2833 c->reply = listCreate();
2834 listSetFreeMethod(c->reply,decrRefCount);
2835 listSetDupMethod(c->reply,dupClientReplyValue);
2836 c->blocking_keys = NULL;
2837 c->blocking_keys_num = 0;
2838 c->io_keys = listCreate();
2839 c->watched_keys = listCreate();
2840 listSetFreeMethod(c->io_keys,decrRefCount);
2841 c->pubsub_channels = dictCreate(&setDictType,NULL);
2842 c->pubsub_patterns = listCreate();
2843 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2844 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2845 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2846 readQueryFromClient, c) == AE_ERR) {
2847 freeClient(c);
2848 return NULL;
2849 }
2850 listAddNodeTail(server.clients,c);
2851 initClientMultiState(c);
2852 return c;
2853 }
2854
2855 static void addReply(redisClient *c, robj *obj) {
2856 if (listLength(c->reply) == 0 &&
2857 (c->replstate == REDIS_REPL_NONE ||
2858 c->replstate == REDIS_REPL_ONLINE) &&
2859 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2860 sendReplyToClient, c) == AE_ERR) return;
2861
2862 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2863 obj = dupStringObject(obj);
2864 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2865 }
2866 listAddNodeTail(c->reply,getDecodedObject(obj));
2867 }
2868
2869 static void addReplySds(redisClient *c, sds s) {
2870 robj *o = createObject(REDIS_STRING,s);
2871 addReply(c,o);
2872 decrRefCount(o);
2873 }
2874
2875 static void addReplyDouble(redisClient *c, double d) {
2876 char buf[128];
2877
2878 snprintf(buf,sizeof(buf),"%.17g",d);
2879 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2880 (unsigned long) strlen(buf),buf));
2881 }
2882
2883 static void addReplyLongLong(redisClient *c, long long ll) {
2884 char buf[128];
2885 size_t len;
2886
2887 if (ll == 0) {
2888 addReply(c,shared.czero);
2889 return;
2890 } else if (ll == 1) {
2891 addReply(c,shared.cone);
2892 return;
2893 }
2894 buf[0] = ':';
2895 len = ll2string(buf+1,sizeof(buf)-1,ll);
2896 buf[len+1] = '\r';
2897 buf[len+2] = '\n';
2898 addReplySds(c,sdsnewlen(buf,len+3));
2899 }
2900
2901 static void addReplyUlong(redisClient *c, unsigned long ul) {
2902 char buf[128];
2903 size_t len;
2904
2905 if (ul == 0) {
2906 addReply(c,shared.czero);
2907 return;
2908 } else if (ul == 1) {
2909 addReply(c,shared.cone);
2910 return;
2911 }
2912 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2913 addReplySds(c,sdsnewlen(buf,len));
2914 }
2915
2916 static void addReplyBulkLen(redisClient *c, robj *obj) {
2917 size_t len, intlen;
2918 char buf[128];
2919
2920 if (obj->encoding == REDIS_ENCODING_RAW) {
2921 len = sdslen(obj->ptr);
2922 } else {
2923 long n = (long)obj->ptr;
2924
2925 /* Compute how many bytes will take this integer as a radix 10 string */
2926 len = 1;
2927 if (n < 0) {
2928 len++;
2929 n = -n;
2930 }
2931 while((n = n/10) != 0) {
2932 len++;
2933 }
2934 }
2935 buf[0] = '$';
2936 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2937 buf[intlen+1] = '\r';
2938 buf[intlen+2] = '\n';
2939 addReplySds(c,sdsnewlen(buf,intlen+3));
2940 }
2941
2942 static void addReplyBulk(redisClient *c, robj *obj) {
2943 addReplyBulkLen(c,obj);
2944 addReply(c,obj);
2945 addReply(c,shared.crlf);
2946 }
2947
2948 static void addReplyBulkSds(redisClient *c, sds s) {
2949 robj *o = createStringObject(s, sdslen(s));
2950 addReplyBulk(c,o);
2951 decrRefCount(o);
2952 }
2953
2954 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2955 static void addReplyBulkCString(redisClient *c, char *s) {
2956 if (s == NULL) {
2957 addReply(c,shared.nullbulk);
2958 } else {
2959 robj *o = createStringObject(s,strlen(s));
2960 addReplyBulk(c,o);
2961 decrRefCount(o);
2962 }
2963 }
2964
2965 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2966 int cport, cfd;
2967 char cip[128];
2968 redisClient *c;
2969 REDIS_NOTUSED(el);
2970 REDIS_NOTUSED(mask);
2971 REDIS_NOTUSED(privdata);
2972
2973 cfd = anetAccept(server.neterr, fd, cip, &cport);
2974 if (cfd == AE_ERR) {
2975 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2976 return;
2977 }
2978 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2979 if ((c = createClient(cfd)) == NULL) {
2980 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2981 close(cfd); /* May be already closed, just ingore errors */
2982 return;
2983 }
2984 /* If maxclient directive is set and this is one client more... close the
2985 * connection. Note that we create the client instead to check before
2986 * for this condition, since now the socket is already set in nonblocking
2987 * mode and we can send an error for free using the Kernel I/O */
2988 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2989 char *err = "-ERR max number of clients reached\r\n";
2990
2991 /* That's a best effort error message, don't check write errors */
2992 if (write(c->fd,err,strlen(err)) == -1) {
2993 /* Nothing to do, Just to avoid the warning... */
2994 }
2995 freeClient(c);
2996 return;
2997 }
2998 server.stat_numconnections++;
2999 }
3000
3001 /* ======================= Redis objects implementation ===================== */
3002
3003 static robj *createObject(int type, void *ptr) {
3004 robj *o;
3005
3006 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3007 if (listLength(server.objfreelist)) {
3008 listNode *head = listFirst(server.objfreelist);
3009 o = listNodeValue(head);
3010 listDelNode(server.objfreelist,head);
3011 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3012 } else {
3013 if (server.vm_enabled)
3014 pthread_mutex_unlock(&server.obj_freelist_mutex);
3015 o = zmalloc(sizeof(*o));
3016 }
3017 o->type = type;
3018 o->encoding = REDIS_ENCODING_RAW;
3019 o->ptr = ptr;
3020 o->refcount = 1;
3021 if (server.vm_enabled) {
3022 /* Note that this code may run in the context of an I/O thread
3023 * and accessing server.lruclock in theory is an error
3024 * (no locks). But in practice this is safe, and even if we read
3025 * garbage Redis will not fail. */
3026 o->lru = server.lruclock;
3027 o->storage = REDIS_VM_MEMORY;
3028 }
3029 return o;
3030 }
3031
3032 static robj *createStringObject(char *ptr, size_t len) {
3033 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3034 }
3035
3036 static robj *createStringObjectFromLongLong(long long value) {
3037 robj *o;
3038 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3039 incrRefCount(shared.integers[value]);
3040 o = shared.integers[value];
3041 } else {
3042 if (value >= LONG_MIN && value <= LONG_MAX) {
3043 o = createObject(REDIS_STRING, NULL);
3044 o->encoding = REDIS_ENCODING_INT;
3045 o->ptr = (void*)((long)value);
3046 } else {
3047 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3048 }
3049 }
3050 return o;
3051 }
3052
3053 static robj *dupStringObject(robj *o) {
3054 assert(o->encoding == REDIS_ENCODING_RAW);
3055 return createStringObject(o->ptr,sdslen(o->ptr));
3056 }
3057
3058 static robj *createListObject(void) {
3059 list *l = listCreate();
3060 robj *o = createObject(REDIS_LIST,l);
3061 listSetFreeMethod(l,decrRefCount);
3062 o->encoding = REDIS_ENCODING_LIST;
3063 return o;
3064 }
3065
3066 static robj *createZiplistObject(void) {
3067 unsigned char *zl = ziplistNew();
3068 robj *o = createObject(REDIS_LIST,zl);
3069 o->encoding = REDIS_ENCODING_ZIPLIST;
3070 return o;
3071 }
3072
3073 static robj *createSetObject(void) {
3074 dict *d = dictCreate(&setDictType,NULL);
3075 robj *o = createObject(REDIS_SET,d);
3076 o->encoding = REDIS_ENCODING_HT;
3077 return o;
3078 }
3079
3080 static robj *createIntsetObject(void) {
3081 intset *is = intsetNew();
3082 robj *o = createObject(REDIS_SET,is);
3083 o->encoding = REDIS_ENCODING_INTSET;
3084 return o;
3085 }
3086
3087 static robj *createHashObject(void) {
3088 /* All the Hashes start as zipmaps. Will be automatically converted
3089 * into hash tables if there are enough elements or big elements
3090 * inside. */
3091 unsigned char *zm = zipmapNew();
3092 robj *o = createObject(REDIS_HASH,zm);
3093 o->encoding = REDIS_ENCODING_ZIPMAP;
3094 return o;
3095 }
3096
3097 static robj *createZsetObject(void) {
3098 zset *zs = zmalloc(sizeof(*zs));
3099
3100 zs->dict = dictCreate(&zsetDictType,NULL);
3101 zs->zsl = zslCreate();
3102 return createObject(REDIS_ZSET,zs);
3103 }
3104
3105 static void freeStringObject(robj *o) {
3106 if (o->encoding == REDIS_ENCODING_RAW) {
3107 sdsfree(o->ptr);
3108 }
3109 }
3110
3111 static void freeListObject(robj *o) {
3112 switch (o->encoding) {
3113 case REDIS_ENCODING_LIST:
3114 listRelease((list*) o->ptr);
3115 break;
3116 case REDIS_ENCODING_ZIPLIST:
3117 zfree(o->ptr);
3118 break;
3119 default:
3120 redisPanic("Unknown list encoding type");
3121 }
3122 }
3123
3124 static void freeSetObject(robj *o) {
3125 switch (o->encoding) {
3126 case REDIS_ENCODING_HT:
3127 dictRelease((dict*) o->ptr);
3128 break;
3129 case REDIS_ENCODING_INTSET:
3130 zfree(o->ptr);
3131 break;
3132 default:
3133 redisPanic("Unknown set encoding type");
3134 }
3135 }
3136
3137 static void freeZsetObject(robj *o) {
3138 zset *zs = o->ptr;
3139
3140 dictRelease(zs->dict);
3141 zslFree(zs->zsl);
3142 zfree(zs);
3143 }
3144
3145 static void freeHashObject(robj *o) {
3146 switch (o->encoding) {
3147 case REDIS_ENCODING_HT:
3148 dictRelease((dict*) o->ptr);
3149 break;
3150 case REDIS_ENCODING_ZIPMAP:
3151 zfree(o->ptr);
3152 break;
3153 default:
3154 redisPanic("Unknown hash encoding type");
3155 break;
3156 }
3157 }
3158
3159 static void incrRefCount(robj *o) {
3160 o->refcount++;
3161 }
3162
3163 static void decrRefCount(void *obj) {
3164 robj *o = obj;
3165
3166 /* Object is a swapped out value, or in the process of being loaded. */
3167 if (server.vm_enabled &&
3168 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3169 {
3170 vmpointer *vp = obj;
3171 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3172 vmMarkPagesFree(vp->page,vp->usedpages);
3173 server.vm_stats_swapped_objects--;
3174 zfree(vp);
3175 return;
3176 }
3177
3178 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3179 /* Object is in memory, or in the process of being swapped out.
3180 *
3181 * If the object is being swapped out, abort the operation on
3182 * decrRefCount even if the refcount does not drop to 0: the object
3183 * is referenced at least two times, as value of the key AND as
3184 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3185 * done but the relevant key was removed in the meantime, the
3186 * complete jobs handler will not find the key about the job and the
3187 * assert will fail. */
3188 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3189 vmCancelThreadedIOJob(o);
3190 if (--(o->refcount) == 0) {
3191 switch(o->type) {
3192 case REDIS_STRING: freeStringObject(o); break;
3193 case REDIS_LIST: freeListObject(o); break;
3194 case REDIS_SET: freeSetObject(o); break;
3195 case REDIS_ZSET: freeZsetObject(o); break;
3196 case REDIS_HASH: freeHashObject(o); break;
3197 default: redisPanic("Unknown object type"); break;
3198 }
3199 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3200 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3201 !listAddNodeHead(server.objfreelist,o))
3202 zfree(o);
3203 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3204 }
3205 }
3206
3207 static int checkType(redisClient *c, robj *o, int type) {
3208 if (o->type != type) {
3209 addReply(c,shared.wrongtypeerr);
3210 return 1;
3211 }
3212 return 0;
3213 }
3214
3215 /* Check if the nul-terminated string 's' can be represented by a long
3216 * (that is, is a number that fits into long without any other space or
3217 * character before or after the digits).
3218 *
3219 * If so, the function returns REDIS_OK and *longval is set to the value
3220 * of the number. Otherwise REDIS_ERR is returned */
3221 static int isStringRepresentableAsLong(sds s, long *longval) {
3222 char buf[32], *endptr;
3223 long value;
3224 int slen;
3225
3226 value = strtol(s, &endptr, 10);
3227 if (endptr[0] != '\0') return REDIS_ERR;
3228 slen = ll2string(buf,32,value);
3229
3230 /* If the number converted back into a string is not identical
3231 * then it's not possible to encode the string as integer */
3232 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3233 if (longval) *longval = value;
3234 return REDIS_OK;
3235 }
3236
3237 /* Try to encode a string object in order to save space */
3238 static robj *tryObjectEncoding(robj *o) {
3239 long value;
3240 sds s = o->ptr;
3241
3242 if (o->encoding != REDIS_ENCODING_RAW)
3243 return o; /* Already encoded */
3244
3245 /* It's not safe to encode shared objects: shared objects can be shared
3246 * everywhere in the "object space" of Redis. Encoded objects can only
3247 * appear as "values" (and not, for instance, as keys) */
3248 if (o->refcount > 1) return o;
3249
3250 /* Currently we try to encode only strings */
3251 redisAssert(o->type == REDIS_STRING);
3252
3253 /* Check if we can represent this string as a long integer */
3254 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3255
3256 /* Ok, this object can be encoded */
3257 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3258 decrRefCount(o);
3259 incrRefCount(shared.integers[value]);
3260 return shared.integers[value];
3261 } else {
3262 o->encoding = REDIS_ENCODING_INT;
3263 sdsfree(o->ptr);
3264 o->ptr = (void*) value;
3265 return o;
3266 }
3267 }
3268
3269 /* Get a decoded version of an encoded object (returned as a new object).
3270 * If the object is already raw-encoded just increment the ref count. */
3271 static robj *getDecodedObject(robj *o) {
3272 robj *dec;
3273
3274 if (o->encoding == REDIS_ENCODING_RAW) {
3275 incrRefCount(o);
3276 return o;
3277 }
3278 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3279 char buf[32];
3280
3281 ll2string(buf,32,(long)o->ptr);
3282 dec = createStringObject(buf,strlen(buf));
3283 return dec;
3284 } else {
3285 redisPanic("Unknown encoding type");
3286 }
3287 }
3288
3289 /* Compare two string objects via strcmp() or alike.
3290 * Note that the objects may be integer-encoded. In such a case we
3291 * use ll2string() to get a string representation of the numbers on the stack
3292 * and compare the strings, it's much faster than calling getDecodedObject().
3293 *
3294 * Important note: if objects are not integer encoded, but binary-safe strings,
3295 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3296 * binary safe. */
3297 static int compareStringObjects(robj *a, robj *b) {
3298 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3299 char bufa[128], bufb[128], *astr, *bstr;
3300 int bothsds = 1;
3301
3302 if (a == b) return 0;
3303 if (a->encoding != REDIS_ENCODING_RAW) {
3304 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3305 astr = bufa;
3306 bothsds = 0;
3307 } else {
3308 astr = a->ptr;
3309 }
3310 if (b->encoding != REDIS_ENCODING_RAW) {
3311 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3312 bstr = bufb;
3313 bothsds = 0;
3314 } else {
3315 bstr = b->ptr;
3316 }
3317 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3318 }
3319
3320 /* Equal string objects return 1 if the two objects are the same from the
3321 * point of view of a string comparison, otherwise 0 is returned. Note that
3322 * this function is faster then checking for (compareStringObject(a,b) == 0)
3323 * because it can perform some more optimization. */
3324 static int equalStringObjects(robj *a, robj *b) {
3325 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3326 return a->ptr == b->ptr;
3327 } else {
3328 return compareStringObjects(a,b) == 0;
3329 }
3330 }
3331
3332 static size_t stringObjectLen(robj *o) {
3333 redisAssert(o->type == REDIS_STRING);
3334 if (o->encoding == REDIS_ENCODING_RAW) {
3335 return sdslen(o->ptr);
3336 } else {
3337 char buf[32];
3338
3339 return ll2string(buf,32,(long)o->ptr);
3340 }
3341 }
3342
3343 static int getDoubleFromObject(robj *o, double *target) {
3344 double value;
3345 char *eptr;
3346
3347 if (o == NULL) {
3348 value = 0;
3349 } else {
3350 redisAssert(o->type == REDIS_STRING);
3351 if (o->encoding == REDIS_ENCODING_RAW) {
3352 value = strtod(o->ptr, &eptr);
3353 if (eptr[0] != '\0') return REDIS_ERR;
3354 } else if (o->encoding == REDIS_ENCODING_INT) {
3355 value = (long)o->ptr;
3356 } else {
3357 redisPanic("Unknown string encoding");
3358 }
3359 }
3360
3361 *target = value;
3362 return REDIS_OK;
3363 }
3364
3365 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3366 double value;
3367 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3368 if (msg != NULL) {
3369 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3370 } else {
3371 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3372 }
3373 return REDIS_ERR;
3374 }
3375
3376 *target = value;
3377 return REDIS_OK;
3378 }
3379
3380 static int getLongLongFromObject(robj *o, long long *target) {
3381 long long value;
3382 char *eptr;
3383
3384 if (o == NULL) {
3385 value = 0;
3386 } else {
3387 redisAssert(o->type == REDIS_STRING);
3388 if (o->encoding == REDIS_ENCODING_RAW) {
3389 value = strtoll(o->ptr, &eptr, 10);
3390 if (eptr[0] != '\0') return REDIS_ERR;
3391 } else if (o->encoding == REDIS_ENCODING_INT) {
3392 value = (long)o->ptr;
3393 } else {
3394 redisPanic("Unknown string encoding");
3395 }
3396 }
3397
3398 if (target) *target = value;
3399 return REDIS_OK;
3400 }
3401
3402 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3403 long long value;
3404 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3405 if (msg != NULL) {
3406 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3407 } else {
3408 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3409 }
3410 return REDIS_ERR;
3411 }
3412
3413 *target = value;
3414 return REDIS_OK;
3415 }
3416
3417 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3418 long long value;
3419
3420 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3421 if (value < LONG_MIN || value > LONG_MAX) {
3422 if (msg != NULL) {
3423 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3424 } else {
3425 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3426 }
3427 return REDIS_ERR;
3428 }
3429
3430 *target = value;
3431 return REDIS_OK;
3432 }
3433
3434 /* =========================== Keyspace access API ========================== */
3435
3436 static robj *lookupKey(redisDb *db, robj *key) {
3437 dictEntry *de = dictFind(db->dict,key->ptr);
3438 if (de) {
3439 robj *val = dictGetEntryVal(de);
3440
3441 if (server.vm_enabled) {
3442 if (val->storage == REDIS_VM_MEMORY ||
3443 val->storage == REDIS_VM_SWAPPING)
3444 {
3445 /* If we were swapping the object out, cancel the operation */
3446 if (val->storage == REDIS_VM_SWAPPING)
3447 vmCancelThreadedIOJob(val);
3448 /* Update the access time for the aging algorithm. */
3449 val->lru = server.lruclock;
3450 } else {
3451 int notify = (val->storage == REDIS_VM_LOADING);
3452
3453 /* Our value was swapped on disk. Bring it at home. */
3454 redisAssert(val->type == REDIS_VMPOINTER);
3455 val = vmLoadObject(val);
3456 dictGetEntryVal(de) = val;
3457
3458 /* Clients blocked by the VM subsystem may be waiting for
3459 * this key... */
3460 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3461 }
3462 }
3463 return val;
3464 } else {
3465 return NULL;
3466 }
3467 }
3468
3469 static robj *lookupKeyRead(redisDb *db, robj *key) {
3470 expireIfNeeded(db,key);
3471 return lookupKey(db,key);
3472 }
3473
3474 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3475 deleteIfVolatile(db,key);
3476 touchWatchedKey(db,key);
3477 return lookupKey(db,key);
3478 }
3479
3480 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3481 robj *o = lookupKeyRead(c->db, key);
3482 if (!o) addReply(c,reply);
3483 return o;
3484 }
3485
3486 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3487 robj *o = lookupKeyWrite(c->db, key);
3488 if (!o) addReply(c,reply);
3489 return o;
3490 }
3491
3492 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3493 * otherwise REDIS_OK is returned, and the caller should increment the
3494 * refcount of 'val'. */
3495 static int dbAdd(redisDb *db, robj *key, robj *val) {
3496 /* Perform a lookup before adding the key, as we need to copy the
3497 * key value. */
3498 if (dictFind(db->dict, key->ptr) != NULL) {
3499 return REDIS_ERR;
3500 } else {
3501 sds copy = sdsdup(key->ptr);
3502 dictAdd(db->dict, copy, val);
3503 return REDIS_OK;
3504 }
3505 }
3506
3507 /* If the key does not exist, this is just like dbAdd(). Otherwise
3508 * the value associated to the key is replaced with the new one.
3509 *
3510 * On update (key already existed) 0 is returned. Otherwise 1. */
3511 static int dbReplace(redisDb *db, robj *key, robj *val) {
3512 if (dictFind(db->dict,key->ptr) == NULL) {
3513 sds copy = sdsdup(key->ptr);
3514 dictAdd(db->dict, copy, val);
3515 return 1;
3516 } else {
3517 dictReplace(db->dict, key->ptr, val);
3518 return 0;
3519 }
3520 }
3521
3522 static int dbExists(redisDb *db, robj *key) {
3523 return dictFind(db->dict,key->ptr) != NULL;
3524 }
3525
3526 /* Return a random key, in form of a Redis object.
3527 * If there are no keys, NULL is returned.
3528 *
3529 * The function makes sure to return keys not already expired. */
3530 static robj *dbRandomKey(redisDb *db) {
3531 struct dictEntry *de;
3532
3533 while(1) {
3534 sds key;
3535 robj *keyobj;
3536
3537 de = dictGetRandomKey(db->dict);
3538 if (de == NULL) return NULL;
3539
3540 key = dictGetEntryKey(de);
3541 keyobj = createStringObject(key,sdslen(key));
3542 if (dictFind(db->expires,key)) {
3543 if (expireIfNeeded(db,keyobj)) {
3544 decrRefCount(keyobj);
3545 continue; /* search for another key. This expired. */
3546 }
3547 }
3548 return keyobj;
3549 }
3550 }
3551
3552 /* Delete a key, value, and associated expiration entry if any, from the DB */
3553 static int dbDelete(redisDb *db, robj *key) {
3554 int retval;
3555
3556 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3557 retval = dictDelete(db->dict,key->ptr);
3558
3559 return retval == DICT_OK;
3560 }
3561
3562 /*============================ RDB saving/loading =========================== */
3563
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3568
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;
    size_t written = fwrite(&seconds,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3574
3575 /* check rdbLoadLen() comments for more info */
3576 static int rdbSaveLen(FILE *fp, uint32_t len) {
3577 unsigned char buf[2];
3578
3579 if (len < (1<<6)) {
3580 /* Save a 6 bit len */
3581 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3582 if (fwrite(buf,1,1,fp) == 0) return -1;
3583 } else if (len < (1<<14)) {
3584 /* Save a 14 bit len */
3585 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3586 buf[1] = len&0xFF;
3587 if (fwrite(buf,2,1,fp) == 0) return -1;
3588 } else {
3589 /* Save a 32 bit len */
3590 buf[0] = (REDIS_RDB_32BITLEN<<6);
3591 if (fwrite(buf,1,1,fp) == 0) return -1;
3592 len = htonl(len);
3593 if (fwrite(&len,4,1,fp) == 0) return -1;
3594 }
3595 return 0;
3596 }
3597
3598 /* Encode 'value' as an integer if possible (if integer will fit the
3599 * supported range). If the function sucessful encoded the integer
3600 * then the (up to 5 bytes) encoded representation is written in the
3601 * string pointed by 'enc' and the length is returned. Otherwise
3602 * 0 is returned. */
3603 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3604 /* Finally check if it fits in our ranges */
3605 if (value >= -(1<<7) && value <= (1<<7)-1) {
3606 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3607 enc[1] = value&0xFF;
3608 return 2;
3609 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3610 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3611 enc[1] = value&0xFF;
3612 enc[2] = (value>>8)&0xFF;
3613 return 3;
3614 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3615 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3616 enc[1] = value&0xFF;
3617 enc[2] = (value>>8)&0xFF;
3618 enc[3] = (value>>16)&0xFF;
3619 enc[4] = (value>>24)&0xFF;
3620 return 5;
3621 } else {
3622 return 0;
3623 }
3624 }
3625
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes and is NOT required to be NUL terminated:
 * rdbSaveObject() passes ziplist and zipmap payloads that carry an
 * explicit length only. For this reason the bytes are copied into a
 * local NUL terminated buffer before being handed to strtoll().
 *
 * Returns the length of the encoding written in 'enc', or 0 if the
 * string can't be encoded as an integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], tmp[32];

    /* An empty string is not a number, and a string that does not fit
     * in 'tmp' is longer than any decimal representation of a long long,
     * so it could never match the round trip check below. */
    if (len == 0 || len >= sizeof(tmp)) return 0;
    memcpy(tmp,s,len);
    tmp[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(tmp, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3644
3645 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3646 size_t comprlen, outlen;
3647 unsigned char byte;
3648 void *out;
3649
3650 /* We require at least four bytes compression for this to be worth it */
3651 if (len <= 4) return 0;
3652 outlen = len-4;
3653 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3654 comprlen = lzf_compress(s, len, out, outlen);
3655 if (comprlen == 0) {
3656 zfree(out);
3657 return 0;
3658 }
3659 /* Data compressed! Let's save it on disk */
3660 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3661 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3662 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3663 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3664 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3665 zfree(out);
3666 return comprlen;
3667
3668 writeerr:
3669 zfree(out);
3670 return -1;
3671 }
3672
3673 /* Save a string objet as [len][data] on disk. If the object is a string
3674 * representation of an integer value we try to safe it in a special form */
3675 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3676 int enclen;
3677
3678 /* Try integer encoding */
3679 if (len <= 11) {
3680 unsigned char buf[5];
3681 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3682 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3683 return 0;
3684 }
3685 }
3686
3687 /* Try LZF compression - under 20 bytes it's unable to compress even
3688 * aaaaaaaaaaaaaaaaaa so skip it */
3689 if (server.rdbcompression && len > 20) {
3690 int retval;
3691
3692 retval = rdbSaveLzfStringObject(fp,s,len);
3693 if (retval == -1) return -1;
3694 if (retval > 0) return 0;
3695 /* retval == 0 means data can't be compressed, save the old way */
3696 }
3697
3698 /* Store verbatim */
3699 if (rdbSaveLen(fp,len) == -1) return -1;
3700 if (len && fwrite(s,len,1,fp) == 0) return -1;
3701 return 0;
3702 }
3703
/* Save a long long value either in the special integer encoding, when
 * rdbEncodeInteger() can represent it, or as a length prefixed decimal
 * string. Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int enclen = rdbEncodeInteger(value,buf);

    if (enclen <= 0) {
        /* Encode as string: the length prefix goes first. */
        enclen = ll2string((char*)buf,32,value);
        redisAssert(enclen < 32);
        if (rdbSaveLen(fp,enclen) == -1) return -1;
    }
    /* In both cases 'buf' now holds 'enclen' bytes to write. */
    return (fwrite(buf,enclen,1,fp) == 0) ? -1 : 0;
}
3719
3720 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3721 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3722 /* Avoid to decode the object, then encode it again, if the
3723 * object is alrady integer encoded. */
3724 if (obj->encoding == REDIS_ENCODING_INT) {
3725 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3726 } else {
3727 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3728 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3729 }
3730 }
3731
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes
             * are available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3774
3775 /* Save a Redis object. */
3776 static int rdbSaveObject(FILE *fp, robj *o) {
3777 if (o->type == REDIS_STRING) {
3778 /* Save a string value */
3779 if (rdbSaveStringObject(fp,o) == -1) return -1;
3780 } else if (o->type == REDIS_LIST) {
3781 /* Save a list value */
3782 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3783 unsigned char *p;
3784 unsigned char *vstr;
3785 unsigned int vlen;
3786 long long vlong;
3787
3788 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3789 p = ziplistIndex(o->ptr,0);
3790 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3791 if (vstr) {
3792 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3793 return -1;
3794 } else {
3795 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3796 return -1;
3797 }
3798 p = ziplistNext(o->ptr,p);
3799 }
3800 } else if (o->encoding == REDIS_ENCODING_LIST) {
3801 list *list = o->ptr;
3802 listIter li;
3803 listNode *ln;
3804
3805 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3806 listRewind(list,&li);
3807 while((ln = listNext(&li))) {
3808 robj *eleobj = listNodeValue(ln);
3809 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3810 }
3811 } else {
3812 redisPanic("Unknown list encoding");
3813 }
3814 } else if (o->type == REDIS_SET) {
3815 /* Save a set value */
3816 if (o->encoding == REDIS_ENCODING_HT) {
3817 dict *set = o->ptr;
3818 dictIterator *di = dictGetIterator(set);
3819 dictEntry *de;
3820
3821 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3822 while((de = dictNext(di)) != NULL) {
3823 robj *eleobj = dictGetEntryKey(de);
3824 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3825 }
3826 dictReleaseIterator(di);
3827 } else if (o->encoding == REDIS_ENCODING_INTSET) {
3828 intset *is = o->ptr;
3829 long long llval;
3830 int i = 0;
3831
3832 if (rdbSaveLen(fp,intsetLen(is)) == -1) return -1;
3833 while(intsetGet(is,i++,&llval)) {
3834 if (rdbSaveLongLongAsStringObject(fp,llval) == -1) return -1;
3835 }
3836 } else {
3837 redisPanic("Unknown set encoding");
3838 }
3839 } else if (o->type == REDIS_ZSET) {
3840 /* Save a set value */
3841 zset *zs = o->ptr;
3842 dictIterator *di = dictGetIterator(zs->dict);
3843 dictEntry *de;
3844
3845 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3846 while((de = dictNext(di)) != NULL) {
3847 robj *eleobj = dictGetEntryKey(de);
3848 double *score = dictGetEntryVal(de);
3849
3850 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3851 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3852 }
3853 dictReleaseIterator(di);
3854 } else if (o->type == REDIS_HASH) {
3855 /* Save a hash value */
3856 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3857 unsigned char *p = zipmapRewind(o->ptr);
3858 unsigned int count = zipmapLen(o->ptr);
3859 unsigned char *key, *val;
3860 unsigned int klen, vlen;
3861
3862 if (rdbSaveLen(fp,count) == -1) return -1;
3863 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3864 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3865 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3866 }
3867 } else {
3868 dictIterator *di = dictGetIterator(o->ptr);
3869 dictEntry *de;
3870
3871 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3872 while((de = dictNext(di)) != NULL) {
3873 robj *key = dictGetEntryKey(de);
3874 robj *val = dictGetEntryVal(de);
3875
3876 if (rdbSaveStringObject(fp,key) == -1) return -1;
3877 if (rdbSaveStringObject(fp,val) == -1) return -1;
3878 }
3879 dictReleaseIterator(di);
3880 }
3881 } else {
3882 redisPanic("Unknown object type");
3883 }
3884 return 0;
3885 }
3886
3887 /* Return the length the object will have on disk if saved with
3888 * the rdbSaveObject() function. Currently we use a trick to get
3889 * this length with very little changes to the code. In the future
3890 * we could switch to a faster solution. */
3891 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3892 if (fp == NULL) fp = server.devnull;
3893 rewind(fp);
3894 assert(rdbSaveObject(fp,o) != 1);
3895 return ftello(fp);
3896 }
3897
3898 /* Return the number of pages required to save this object in the swap file */
3899 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3900 off_t bytes = rdbSavedObjectLen(o,fp);
3901
3902 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3903 }
3904
3905 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3906 static int rdbSave(char *filename) {
3907 dictIterator *di = NULL;
3908 dictEntry *de;
3909 FILE *fp;
3910 char tmpfile[256];
3911 int j;
3912 time_t now = time(NULL);
3913
3914 /* Wait for I/O therads to terminate, just in case this is a
3915 * foreground-saving, to avoid seeking the swap file descriptor at the
3916 * same time. */
3917 if (server.vm_enabled)
3918 waitEmptyIOJobsQueue();
3919
3920 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3921 fp = fopen(tmpfile,"w");
3922 if (!fp) {
3923 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3924 return REDIS_ERR;
3925 }
3926 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3927 for (j = 0; j < server.dbnum; j++) {
3928 redisDb *db = server.db+j;
3929 dict *d = db->dict;
3930 if (dictSize(d) == 0) continue;
3931 di = dictGetIterator(d);
3932 if (!di) {
3933 fclose(fp);
3934 return REDIS_ERR;
3935 }
3936
3937 /* Write the SELECT DB opcode */
3938 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3939 if (rdbSaveLen(fp,j) == -1) goto werr;
3940
3941 /* Iterate this DB writing every entry */
3942 while((de = dictNext(di)) != NULL) {
3943 sds keystr = dictGetEntryKey(de);
3944 robj key, *o = dictGetEntryVal(de);
3945 time_t expiretime;
3946
3947 initStaticStringObject(key,keystr);
3948 expiretime = getExpire(db,&key);
3949
3950 /* Save the expire time */
3951 if (expiretime != -1) {
3952 /* If this key is already expired skip it */
3953 if (expiretime < now) continue;
3954 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3955 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3956 }
3957 /* Save the key and associated value. This requires special
3958 * handling if the value is swapped out. */
3959 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3960 o->storage == REDIS_VM_SWAPPING) {
3961 /* Save type, key, value */
3962 if (rdbSaveType(fp,o->type) == -1) goto werr;
3963 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3964 if (rdbSaveObject(fp,o) == -1) goto werr;
3965 } else {
3966 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3967 robj *po;
3968 /* Get a preview of the object in memory */
3969 po = vmPreviewObject(o);
3970 /* Save type, key, value */
3971 if (rdbSaveType(fp,po->type) == -1) goto werr;
3972 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3973 if (rdbSaveObject(fp,po) == -1) goto werr;
3974 /* Remove the loaded object from memory */
3975 decrRefCount(po);
3976 }
3977 }
3978 dictReleaseIterator(di);
3979 }
3980 /* EOF opcode */
3981 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3982
3983 /* Make sure data will not remain on the OS's output buffers */
3984 fflush(fp);
3985 fsync(fileno(fp));
3986 fclose(fp);
3987
3988 /* Use RENAME to make sure the DB file is changed atomically only
3989 * if the generate DB file is ok. */
3990 if (rename(tmpfile,filename) == -1) {
3991 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3992 unlink(tmpfile);
3993 return REDIS_ERR;
3994 }
3995 redisLog(REDIS_NOTICE,"DB saved on disk");
3996 server.dirty = 0;
3997 server.lastsave = time(NULL);
3998 return REDIS_OK;
3999
4000 werr:
4001 fclose(fp);
4002 unlink(tmpfile);
4003 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
4004 if (di) dictReleaseIterator(di);
4005 return REDIS_ERR;
4006 }
4007
4008 static int rdbSaveBackground(char *filename) {
4009 pid_t childpid;
4010
4011 if (server.bgsavechildpid != -1) return REDIS_ERR;
4012 if (server.vm_enabled) waitEmptyIOJobsQueue();
4013 if ((childpid = fork()) == 0) {
4014 /* Child */
4015 if (server.vm_enabled) vmReopenSwapFile();
4016 close(server.fd);
4017 if (rdbSave(filename) == REDIS_OK) {
4018 _exit(0);
4019 } else {
4020 _exit(1);
4021 }
4022 } else {
4023 /* Parent */
4024 if (childpid == -1) {
4025 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
4026 strerror(errno));
4027 return REDIS_ERR;
4028 }
4029 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
4030 server.bgsavechildpid = childpid;
4031 updateDictResizePolicy();
4032 return REDIS_OK;
4033 }
4034 return REDIS_OK; /* unreached */
4035 }
4036
/* Remove the temp file used by rdbSave() when running as the process
 * with pid 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4043
/* Read a one byte type from 'fp'. Returns the byte as a non negative
 * integer, or -1 if it was not possible to read it. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,1,1,fp) != 0) return byte;
    return -1;
}
4049
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the
 * byte order of the host. Returns -1 if it was not possible to read it. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    if (fread(&seconds,4,1,fp) != 0) return (time_t) seconds;
    return -1;
}
4055
4056 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4057 * of this file for a description of how this are stored on disk.
4058 *
4059 * isencoded is set to 1 if the readed length is not actually a length but
4060 * an "encoding type", check the above comments for more info */
4061 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4062 unsigned char buf[2];
4063 uint32_t len;
4064 int type;
4065
4066 if (isencoded) *isencoded = 0;
4067 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4068 type = (buf[0]&0xC0)>>6;
4069 if (type == REDIS_RDB_6BITLEN) {
4070 /* Read a 6 bit len */
4071 return buf[0]&0x3F;
4072 } else if (type == REDIS_RDB_ENCVAL) {
4073 /* Read a 6 bit len encoding type */
4074 if (isencoded) *isencoded = 1;
4075 return buf[0]&0x3F;
4076 } else if (type == REDIS_RDB_14BITLEN) {
4077 /* Read a 14 bit len */
4078 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4079 return ((buf[0]&0x3F)<<8)|buf[1];
4080 } else {
4081 /* Read a 32 bit len */
4082 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4083 return ntohl(len);
4084 }
4085 }
4086
4087 /* Load an integer-encoded object from file 'fp', with the specified
4088 * encoding type 'enctype'. If encode is true the function may return
4089 * an integer-encoded object as reply, otherwise the returned object
4090 * will always be encoded as a raw string. */
4091 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4092 unsigned char enc[4];
4093 long long val;
4094
4095 if (enctype == REDIS_RDB_ENC_INT8) {
4096 if (fread(enc,1,1,fp) == 0) return NULL;
4097 val = (signed char)enc[0];
4098 } else if (enctype == REDIS_RDB_ENC_INT16) {
4099 uint16_t v;
4100 if (fread(enc,2,1,fp) == 0) return NULL;
4101 v = enc[0]|(enc[1]<<8);
4102 val = (int16_t)v;
4103 } else if (enctype == REDIS_RDB_ENC_INT32) {
4104 uint32_t v;
4105 if (fread(enc,4,1,fp) == 0) return NULL;
4106 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4107 val = (int32_t)v;
4108 } else {
4109 val = 0; /* anti-warning */
4110 redisPanic("Unknown RDB integer encoding type");
4111 }
4112 if (encode)
4113 return createStringObjectFromLongLong(val);
4114 else
4115 return createObject(REDIS_STRING,sdsfromlonglong(val));
4116 }
4117
4118 static robj *rdbLoadLzfStringObject(FILE*fp) {
4119 unsigned int len, clen;
4120 unsigned char *c = NULL;
4121 sds val = NULL;
4122
4123 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4124 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4125 if ((c = zmalloc(clen)) == NULL) goto err;
4126 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4127 if (fread(c,clen,1,fp) == 0) goto err;
4128 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4129 zfree(c);
4130 return createObject(REDIS_STRING,val);
4131 err:
4132 zfree(c);
4133 sdsfree(val);
4134 return NULL;
4135 }
4136
4137 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4138 int isencoded;
4139 uint32_t len;
4140 sds val;
4141
4142 len = rdbLoadLen(fp,&isencoded);
4143 if (isencoded) {
4144 switch(len) {
4145 case REDIS_RDB_ENC_INT8:
4146 case REDIS_RDB_ENC_INT16:
4147 case REDIS_RDB_ENC_INT32:
4148 return rdbLoadIntegerObject(fp,len,encode);
4149 case REDIS_RDB_ENC_LZF:
4150 return rdbLoadLzfStringObject(fp);
4151 default:
4152 redisPanic("Unknown RDB encoding type");
4153 }
4154 }
4155
4156 if (len == REDIS_RDB_LENERR) return NULL;
4157 val = sdsnewlen(NULL,len);
4158 if (len && fread(val,len,1,fp) == 0) {
4159 sdsfree(val);
4160 return NULL;
4161 }
4162 return createObject(REDIS_STRING,val);
4163 }
4164
4165 static robj *rdbLoadStringObject(FILE *fp) {
4166 return rdbGenericLoadStringObject(fp,0);
4167 }
4168
4169 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4170 return rdbGenericLoadStringObject(fp,1);
4171 }
4172
4173 /* For information about double serialization check rdbSaveDoubleValue() */
4174 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4175 char buf[128];
4176 unsigned char len;
4177
4178 if (fread(&len,1,1,fp) == 0) return -1;
4179 switch(len) {
4180 case 255: *val = R_NegInf; return 0;
4181 case 254: *val = R_PosInf; return 0;
4182 case 253: *val = R_Nan; return 0;
4183 default:
4184 if (fread(buf,len,1,fp) == 0) return -1;
4185 buf[len] = '\0';
4186 sscanf(buf, "%lg", val);
4187 return 0;
4188 }
4189 }
4190
4191 /* Load a Redis object of the specified type from the specified file.
4192 * On success a newly allocated object is returned, otherwise NULL. */
4193 static robj *rdbLoadObject(int type, FILE *fp) {
4194 robj *o, *ele, *dec;
4195 size_t len;
4196
4197 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4198 if (type == REDIS_STRING) {
4199 /* Read string value */
4200 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4201 o = tryObjectEncoding(o);
4202 } else if (type == REDIS_LIST) {
4203 /* Read list value */
4204 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4205
4206 /* Use a real list when there are too many entries */
4207 if (len > server.list_max_ziplist_entries) {
4208 o = createListObject();
4209 } else {
4210 o = createZiplistObject();
4211 }
4212
4213 /* Load every single element of the list */
4214 while(len--) {
4215 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4216
4217 /* If we are using a ziplist and the value is too big, convert
4218 * the object to a real list. */
4219 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4220 ele->encoding == REDIS_ENCODING_RAW &&
4221 sdslen(ele->ptr) > server.list_max_ziplist_value)
4222 listTypeConvert(o,REDIS_ENCODING_LIST);
4223
4224 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4225 dec = getDecodedObject(ele);
4226 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4227 decrRefCount(dec);
4228 decrRefCount(ele);
4229 } else {
4230 ele = tryObjectEncoding(ele);
4231 listAddNodeTail(o->ptr,ele);
4232 }
4233 }
4234 } else if (type == REDIS_SET) {
4235 /* Read list/set value */
4236 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4237 o = createSetObject();
4238 /* It's faster to expand the dict to the right size asap in order
4239 * to avoid rehashing */
4240 if (len > DICT_HT_INITIAL_SIZE)
4241 dictExpand(o->ptr,len);
4242 /* Load every single element of the list/set */
4243 while(len--) {
4244 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4245 ele = tryObjectEncoding(ele);
4246 dictAdd((dict*)o->ptr,ele,NULL);
4247 }
4248 } else if (type == REDIS_ZSET) {
4249 /* Read list/set value */
4250 size_t zsetlen;
4251 zset *zs;
4252
4253 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4254 o = createZsetObject();
4255 zs = o->ptr;
4256 /* Load every single element of the list/set */
4257 while(zsetlen--) {
4258 robj *ele;
4259 double *score = zmalloc(sizeof(double));
4260
4261 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4262 ele = tryObjectEncoding(ele);
4263 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4264 dictAdd(zs->dict,ele,score);
4265 zslInsert(zs->zsl,*score,ele);
4266 incrRefCount(ele); /* added to skiplist */
4267 }
4268 } else if (type == REDIS_HASH) {
4269 size_t hashlen;
4270
4271 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4272 o = createHashObject();
4273 /* Too many entries? Use an hash table. */
4274 if (hashlen > server.hash_max_zipmap_entries)
4275 convertToRealHash(o);
4276 /* Load every key/value, then set it into the zipmap or hash
4277 * table, as needed. */
4278 while(hashlen--) {
4279 robj *key, *val;
4280
4281 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4282 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4283 /* If we are using a zipmap and there are too big values
4284 * the object is converted to real hash table encoding. */
4285 if (o->encoding != REDIS_ENCODING_HT &&
4286 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4287 sdslen(val->ptr) > server.hash_max_zipmap_value))
4288 {
4289 convertToRealHash(o);
4290 }
4291
4292 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4293 unsigned char *zm = o->ptr;
4294
4295 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4296 val->ptr,sdslen(val->ptr),NULL);
4297 o->ptr = zm;
4298 decrRefCount(key);
4299 decrRefCount(val);
4300 } else {
4301 key = tryObjectEncoding(key);
4302 val = tryObjectEncoding(val);
4303 dictAdd((dict*)o->ptr,key,val);
4304 }
4305 }
4306 } else {
4307 redisPanic("Unknown object type");
4308 }
4309 return o;
4310 }
4311
4312 static int rdbLoad(char *filename) {
4313 FILE *fp;
4314 uint32_t dbid;
4315 int type, retval, rdbver;
4316 int swap_all_values = 0;
4317 redisDb *db = server.db+0;
4318 char buf[1024];
4319 time_t expiretime, now = time(NULL);
4320
4321 fp = fopen(filename,"r");
4322 if (!fp) return REDIS_ERR;
4323 if (fread(buf,9,1,fp) == 0) goto eoferr;
4324 buf[9] = '\0';
4325 if (memcmp(buf,"REDIS",5) != 0) {
4326 fclose(fp);
4327 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4328 return REDIS_ERR;
4329 }
4330 rdbver = atoi(buf+5);
4331 if (rdbver != 1) {
4332 fclose(fp);
4333 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4334 return REDIS_ERR;
4335 }
4336 while(1) {
4337 robj *key, *val;
4338 int force_swapout;
4339
4340 expiretime = -1;
4341 /* Read type. */
4342 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4343 if (type == REDIS_EXPIRETIME) {
4344 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4345 /* We read the time so we need to read the object type again */
4346 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4347 }
4348 if (type == REDIS_EOF) break;
4349 /* Handle SELECT DB opcode as a special case */
4350 if (type == REDIS_SELECTDB) {
4351 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4352 goto eoferr;
4353 if (dbid >= (unsigned)server.dbnum) {
4354 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4355 exit(1);
4356 }
4357 db = server.db+dbid;
4358 continue;
4359 }
4360 /* Read key */
4361 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4362 /* Read value */
4363 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4364 /* Check if the key already expired */
4365 if (expiretime != -1 && expiretime < now) {
4366 decrRefCount(key);
4367 decrRefCount(val);
4368 continue;
4369 }
4370 /* Add the new object in the hash table */
4371 retval = dbAdd(db,key,val);
4372 if (retval == REDIS_ERR) {
4373 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4374 exit(1);
4375 }
4376 /* Set the expire time if needed */
4377 if (expiretime != -1) setExpire(db,key,expiretime);
4378
4379 /* Handle swapping while loading big datasets when VM is on */
4380
4381 /* If we detecter we are hopeless about fitting something in memory
4382 * we just swap every new key on disk. Directly...
4383 * Note that's important to check for this condition before resorting
4384 * to random sampling, otherwise we may try to swap already
4385 * swapped keys. */
4386 if (swap_all_values) {
4387 dictEntry *de = dictFind(db->dict,key->ptr);
4388
4389 /* de may be NULL since the key already expired */
4390 if (de) {
4391 vmpointer *vp;
4392 val = dictGetEntryVal(de);
4393
4394 if (val->refcount == 1 &&
4395 (vp = vmSwapObjectBlocking(val)) != NULL)
4396 dictGetEntryVal(de) = vp;
4397 }
4398 decrRefCount(key);
4399 continue;
4400 }
4401 decrRefCount(key);
4402
4403 /* Flush data on disk once 32 MB of additional RAM are used... */
4404 force_swapout = 0;
4405 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4406 force_swapout = 1;
4407
4408 /* If we have still some hope of having some value fitting memory
4409 * then we try random sampling. */
4410 if (!swap_all_values && server.vm_enabled && force_swapout) {
4411 while (zmalloc_used_memory() > server.vm_max_memory) {
4412 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4413 }
4414 if (zmalloc_used_memory() > server.vm_max_memory)
4415 swap_all_values = 1; /* We are already using too much mem */
4416 }
4417 }
4418 fclose(fp);
4419 return REDIS_OK;
4420
4421 eoferr: /* unexpected end of file is handled here with a fatal exit */
4422 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4423 exit(1);
4424 return REDIS_ERR; /* Just to avoid warning */
4425 }
4426
4427 /*================================== Shutdown =============================== */
4428 static int prepareForShutdown() {
4429 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4430 /* Kill the saving child if there is a background saving in progress.
4431 We want to avoid race conditions, for instance our saving child may
4432 overwrite the synchronous saving did by SHUTDOWN. */
4433 if (server.bgsavechildpid != -1) {
4434 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4435 kill(server.bgsavechildpid,SIGKILL);
4436 rdbRemoveTempFile(server.bgsavechildpid);
4437 }
4438 if (server.appendonly) {
4439 /* Append only file: fsync() the AOF and exit */
4440 aof_fsync(server.appendfd);
4441 if (server.vm_enabled) unlink(server.vm_swap_file);
4442 } else {
4443 /* Snapshotting. Perform a SYNC SAVE and exit */
4444 if (rdbSave(server.dbfilename) == REDIS_OK) {
4445 if (server.daemonize)
4446 unlink(server.pidfile);
4447 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4448 } else {
4449 /* Ooops.. error saving! The best we can do is to continue
4450 * operating. Note that if there was a background saving process,
4451 * in the next cron() Redis will be notified that the background
4452 * saving aborted, handling special stuff like slaves pending for
4453 * synchronization... */
4454 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4455 return REDIS_ERR;
4456 }
4457 }
4458 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4459 return REDIS_OK;
4460 }
4461
4462 /*================================== Commands =============================== */
4463
4464 static void authCommand(redisClient *c) {
4465 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4466 c->authenticated = 1;
4467 addReply(c,shared.ok);
4468 } else {
4469 c->authenticated = 0;
4470 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4471 }
4472 }
4473
4474 static void pingCommand(redisClient *c) {
4475 addReply(c,shared.pong);
4476 }
4477
4478 static void echoCommand(redisClient *c) {
4479 addReplyBulk(c,c->argv[1]);
4480 }
4481
4482 /*=================================== Strings =============================== */
4483
4484 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4485 int retval;
4486 long seconds = 0; /* initialized to avoid an harmness warning */
4487
4488 if (expire) {
4489 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4490 return;
4491 if (seconds <= 0) {
4492 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4493 return;
4494 }
4495 }
4496
4497 touchWatchedKey(c->db,key);
4498 if (nx) deleteIfVolatile(c->db,key);
4499 retval = dbAdd(c->db,key,val);
4500 if (retval == REDIS_ERR) {
4501 if (!nx) {
4502 dbReplace(c->db,key,val);
4503 incrRefCount(val);
4504 } else {
4505 addReply(c,shared.czero);
4506 return;
4507 }
4508 } else {
4509 incrRefCount(val);
4510 }
4511 server.dirty++;
4512 removeExpire(c->db,key);
4513 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4514 addReply(c, nx ? shared.cone : shared.ok);
4515 }
4516
4517 static void setCommand(redisClient *c) {
4518 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4519 }
4520
4521 static void setnxCommand(redisClient *c) {
4522 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4523 }
4524
4525 static void setexCommand(redisClient *c) {
4526 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4527 }
4528
4529 static int getGenericCommand(redisClient *c) {
4530 robj *o;
4531
4532 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4533 return REDIS_OK;
4534
4535 if (o->type != REDIS_STRING) {
4536 addReply(c,shared.wrongtypeerr);
4537 return REDIS_ERR;
4538 } else {
4539 addReplyBulk(c,o);
4540 return REDIS_OK;
4541 }
4542 }
4543
4544 static void getCommand(redisClient *c) {
4545 getGenericCommand(c);
4546 }
4547
4548 static void getsetCommand(redisClient *c) {
4549 if (getGenericCommand(c) == REDIS_ERR) return;
4550 dbReplace(c->db,c->argv[1],c->argv[2]);
4551 incrRefCount(c->argv[2]);
4552 server.dirty++;
4553 removeExpire(c->db,c->argv[1]);
4554 }
4555
4556 static void mgetCommand(redisClient *c) {
4557 int j;
4558
4559 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4560 for (j = 1; j < c->argc; j++) {
4561 robj *o = lookupKeyRead(c->db,c->argv[j]);
4562 if (o == NULL) {
4563 addReply(c,shared.nullbulk);
4564 } else {
4565 if (o->type != REDIS_STRING) {
4566 addReply(c,shared.nullbulk);
4567 } else {
4568 addReplyBulk(c,o);
4569 }
4570 }
4571 }
4572 }
4573
4574 static void msetGenericCommand(redisClient *c, int nx) {
4575 int j, busykeys = 0;
4576
4577 if ((c->argc % 2) == 0) {
4578 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4579 return;
4580 }
4581 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4582 * set nothing at all if at least one already key exists. */
4583 if (nx) {
4584 for (j = 1; j < c->argc; j += 2) {
4585 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4586 busykeys++;
4587 }
4588 }
4589 }
4590 if (busykeys) {
4591 addReply(c, shared.czero);
4592 return;
4593 }
4594
4595 for (j = 1; j < c->argc; j += 2) {
4596 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4597 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4598 incrRefCount(c->argv[j+1]);
4599 removeExpire(c->db,c->argv[j]);
4600 }
4601 server.dirty += (c->argc-1)/2;
4602 addReply(c, nx ? shared.cone : shared.ok);
4603 }
4604
4605 static void msetCommand(redisClient *c) {
4606 msetGenericCommand(c,0);
4607 }
4608
4609 static void msetnxCommand(redisClient *c) {
4610 msetGenericCommand(c,1);
4611 }
4612
4613 static void incrDecrCommand(redisClient *c, long long incr) {
4614 long long value;
4615 robj *o;
4616
4617 o = lookupKeyWrite(c->db,c->argv[1]);
4618 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4619 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4620
4621 value += incr;
4622 o = createStringObjectFromLongLong(value);
4623 dbReplace(c->db,c->argv[1],o);
4624 server.dirty++;
4625 addReply(c,shared.colon);
4626 addReply(c,o);
4627 addReply(c,shared.crlf);
4628 }
4629
4630 static void incrCommand(redisClient *c) {
4631 incrDecrCommand(c,1);
4632 }
4633
4634 static void decrCommand(redisClient *c) {
4635 incrDecrCommand(c,-1);
4636 }
4637
4638 static void incrbyCommand(redisClient *c) {
4639 long long incr;
4640
4641 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4642 incrDecrCommand(c,incr);
4643 }
4644
4645 static void decrbyCommand(redisClient *c) {
4646 long long incr;
4647
4648 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4649 incrDecrCommand(c,-incr);
4650 }
4651
4652 static void appendCommand(redisClient *c) {
4653 int retval;
4654 size_t totlen;
4655 robj *o;
4656
4657 o = lookupKeyWrite(c->db,c->argv[1]);
4658 if (o == NULL) {
4659 /* Create the key */
4660 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4661 incrRefCount(c->argv[2]);
4662 totlen = stringObjectLen(c->argv[2]);
4663 } else {
4664 if (o->type != REDIS_STRING) {
4665 addReply(c,shared.wrongtypeerr);
4666 return;
4667 }
4668 /* If the object is specially encoded or shared we have to make
4669 * a copy */
4670 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4671 robj *decoded = getDecodedObject(o);
4672
4673 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4674 decrRefCount(decoded);
4675 dbReplace(c->db,c->argv[1],o);
4676 }
4677 /* APPEND! */
4678 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4679 o->ptr = sdscatlen(o->ptr,
4680 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4681 } else {
4682 o->ptr = sdscatprintf(o->ptr, "%ld",
4683 (unsigned long) c->argv[2]->ptr);
4684 }
4685 totlen = sdslen(o->ptr);
4686 }
4687 server.dirty++;
4688 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4689 }
4690
4691 static void substrCommand(redisClient *c) {
4692 robj *o;
4693 long start = atoi(c->argv[2]->ptr);
4694 long end = atoi(c->argv[3]->ptr);
4695 size_t rangelen, strlen;
4696 sds range;
4697
4698 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4699 checkType(c,o,REDIS_STRING)) return;
4700
4701 o = getDecodedObject(o);
4702 strlen = sdslen(o->ptr);
4703
4704 /* convert negative indexes */
4705 if (start < 0) start = strlen+start;
4706 if (end < 0) end = strlen+end;
4707 if (start < 0) start = 0;
4708 if (end < 0) end = 0;
4709
4710 /* indexes sanity checks */
4711 if (start > end || (size_t)start >= strlen) {
4712 /* Out of range start or start > end result in null reply */
4713 addReply(c,shared.nullbulk);
4714 decrRefCount(o);
4715 return;
4716 }
4717 if ((size_t)end >= strlen) end = strlen-1;
4718 rangelen = (end-start)+1;
4719
4720 /* Return the result */
4721 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4722 range = sdsnewlen((char*)o->ptr+start,rangelen);
4723 addReplySds(c,range);
4724 addReply(c,shared.crlf);
4725 decrRefCount(o);
4726 }
4727
4728 /* ========================= Type agnostic commands ========================= */
4729
4730 static void delCommand(redisClient *c) {
4731 int deleted = 0, j;
4732
4733 for (j = 1; j < c->argc; j++) {
4734 if (dbDelete(c->db,c->argv[j])) {
4735 touchWatchedKey(c->db,c->argv[j]);
4736 server.dirty++;
4737 deleted++;
4738 }
4739 }
4740 addReplyLongLong(c,deleted);
4741 }
4742
4743 static void existsCommand(redisClient *c) {
4744 expireIfNeeded(c->db,c->argv[1]);
4745 if (dbExists(c->db,c->argv[1])) {
4746 addReply(c, shared.cone);
4747 } else {
4748 addReply(c, shared.czero);
4749 }
4750 }
4751
4752 static void selectCommand(redisClient *c) {
4753 int id = atoi(c->argv[1]->ptr);
4754
4755 if (selectDb(c,id) == REDIS_ERR) {
4756 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4757 } else {
4758 addReply(c,shared.ok);
4759 }
4760 }
4761
4762 static void randomkeyCommand(redisClient *c) {
4763 robj *key;
4764
4765 if ((key = dbRandomKey(c->db)) == NULL) {
4766 addReply(c,shared.nullbulk);
4767 return;
4768 }
4769
4770 addReplyBulk(c,key);
4771 decrRefCount(key);
4772 }
4773
4774 static void keysCommand(redisClient *c) {
4775 dictIterator *di;
4776 dictEntry *de;
4777 sds pattern = c->argv[1]->ptr;
4778 int plen = sdslen(pattern);
4779 unsigned long numkeys = 0;
4780 robj *lenobj = createObject(REDIS_STRING,NULL);
4781
4782 di = dictGetIterator(c->db->dict);
4783 addReply(c,lenobj);
4784 decrRefCount(lenobj);
4785 while((de = dictNext(di)) != NULL) {
4786 sds key = dictGetEntryKey(de);
4787 robj *keyobj;
4788
4789 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4790 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4791 keyobj = createStringObject(key,sdslen(key));
4792 if (expireIfNeeded(c->db,keyobj) == 0) {
4793 addReplyBulk(c,keyobj);
4794 numkeys++;
4795 }
4796 decrRefCount(keyobj);
4797 }
4798 }
4799 dictReleaseIterator(di);
4800 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4801 }
4802
4803 static void dbsizeCommand(redisClient *c) {
4804 addReplySds(c,
4805 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4806 }
4807
4808 static void lastsaveCommand(redisClient *c) {
4809 addReplySds(c,
4810 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4811 }
4812
4813 static void typeCommand(redisClient *c) {
4814 robj *o;
4815 char *type;
4816
4817 o = lookupKeyRead(c->db,c->argv[1]);
4818 if (o == NULL) {
4819 type = "+none";
4820 } else {
4821 switch(o->type) {
4822 case REDIS_STRING: type = "+string"; break;
4823 case REDIS_LIST: type = "+list"; break;
4824 case REDIS_SET: type = "+set"; break;
4825 case REDIS_ZSET: type = "+zset"; break;
4826 case REDIS_HASH: type = "+hash"; break;
4827 default: type = "+unknown"; break;
4828 }
4829 }
4830 addReplySds(c,sdsnew(type));
4831 addReply(c,shared.crlf);
4832 }
4833
4834 static void saveCommand(redisClient *c) {
4835 if (server.bgsavechildpid != -1) {
4836 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4837 return;
4838 }
4839 if (rdbSave(server.dbfilename) == REDIS_OK) {
4840 addReply(c,shared.ok);
4841 } else {
4842 addReply(c,shared.err);
4843 }
4844 }
4845
4846 static void bgsaveCommand(redisClient *c) {
4847 if (server.bgsavechildpid != -1) {
4848 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4849 return;
4850 }
4851 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4852 char *status = "+Background saving started\r\n";
4853 addReplySds(c,sdsnew(status));
4854 } else {
4855 addReply(c,shared.err);
4856 }
4857 }
4858
4859 static void shutdownCommand(redisClient *c) {
4860 if (prepareForShutdown() == REDIS_OK)
4861 exit(0);
4862 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4863 }
4864
4865 static void renameGenericCommand(redisClient *c, int nx) {
4866 robj *o;
4867
4868 /* To use the same key as src and dst is probably an error */
4869 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4870 addReply(c,shared.sameobjecterr);
4871 return;
4872 }
4873
4874 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4875 return;
4876
4877 incrRefCount(o);
4878 deleteIfVolatile(c->db,c->argv[2]);
4879 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4880 if (nx) {
4881 decrRefCount(o);
4882 addReply(c,shared.czero);
4883 return;
4884 }
4885 dbReplace(c->db,c->argv[2],o);
4886 }
4887 dbDelete(c->db,c->argv[1]);
4888 touchWatchedKey(c->db,c->argv[2]);
4889 server.dirty++;
4890 addReply(c,nx ? shared.cone : shared.ok);
4891 }
4892
4893 static void renameCommand(redisClient *c) {
4894 renameGenericCommand(c,0);
4895 }
4896
4897 static void renamenxCommand(redisClient *c) {
4898 renameGenericCommand(c,1);
4899 }
4900
4901 static void moveCommand(redisClient *c) {
4902 robj *o;
4903 redisDb *src, *dst;
4904 int srcid;
4905
4906 /* Obtain source and target DB pointers */
4907 src = c->db;
4908 srcid = c->db->id;
4909 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4910 addReply(c,shared.outofrangeerr);
4911 return;
4912 }
4913 dst = c->db;
4914 selectDb(c,srcid); /* Back to the source DB */
4915
4916 /* If the user is moving using as target the same
4917 * DB as the source DB it is probably an error. */
4918 if (src == dst) {
4919 addReply(c,shared.sameobjecterr);
4920 return;
4921 }
4922
4923 /* Check if the element exists and get a reference */
4924 o = lookupKeyWrite(c->db,c->argv[1]);
4925 if (!o) {
4926 addReply(c,shared.czero);
4927 return;
4928 }
4929
4930 /* Try to add the element to the target DB */
4931 deleteIfVolatile(dst,c->argv[1]);
4932 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4933 addReply(c,shared.czero);
4934 return;
4935 }
4936 incrRefCount(o);
4937
4938 /* OK! key moved, free the entry in the source DB */
4939 dbDelete(src,c->argv[1]);
4940 server.dirty++;
4941 addReply(c,shared.cone);
4942 }
4943
4944 /* =================================== Lists ================================ */
4945
4946
4947 /* Check the argument length to see if it requires us to convert the ziplist
4948 * to a real list. Only check raw-encoded objects because integer encoded
4949 * objects are never too long. */
4950 static void listTypeTryConversion(robj *subject, robj *value) {
4951 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4952 if (value->encoding == REDIS_ENCODING_RAW &&
4953 sdslen(value->ptr) > server.list_max_ziplist_value)
4954 listTypeConvert(subject,REDIS_ENCODING_LIST);
4955 }
4956
4957 static void listTypePush(robj *subject, robj *value, int where) {
4958 /* Check if we need to convert the ziplist */
4959 listTypeTryConversion(subject,value);
4960 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4961 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
4962 listTypeConvert(subject,REDIS_ENCODING_LIST);
4963
4964 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4965 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4966 value = getDecodedObject(value);
4967 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4968 decrRefCount(value);
4969 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4970 if (where == REDIS_HEAD) {
4971 listAddNodeHead(subject->ptr,value);
4972 } else {
4973 listAddNodeTail(subject->ptr,value);
4974 }
4975 incrRefCount(value);
4976 } else {
4977 redisPanic("Unknown list encoding");
4978 }
4979 }
4980
4981 static robj *listTypePop(robj *subject, int where) {
4982 robj *value = NULL;
4983 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4984 unsigned char *p;
4985 unsigned char *vstr;
4986 unsigned int vlen;
4987 long long vlong;
4988 int pos = (where == REDIS_HEAD) ? 0 : -1;
4989 p = ziplistIndex(subject->ptr,pos);
4990 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4991 if (vstr) {
4992 value = createStringObject((char*)vstr,vlen);
4993 } else {
4994 value = createStringObjectFromLongLong(vlong);
4995 }
4996 /* We only need to delete an element when it exists */
4997 subject->ptr = ziplistDelete(subject->ptr,&p);
4998 }
4999 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5000 list *list = subject->ptr;
5001 listNode *ln;
5002 if (where == REDIS_HEAD) {
5003 ln = listFirst(list);
5004 } else {
5005 ln = listLast(list);
5006 }
5007 if (ln != NULL) {
5008 value = listNodeValue(ln);
5009 incrRefCount(value);
5010 listDelNode(list,ln);
5011 }
5012 } else {
5013 redisPanic("Unknown list encoding");
5014 }
5015 return value;
5016 }
5017
5018 static unsigned long listTypeLength(robj *subject) {
5019 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
5020 return ziplistLen(subject->ptr);
5021 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5022 return listLength((list*)subject->ptr);
5023 } else {
5024 redisPanic("Unknown list encoding");
5025 }
5026 }
5027
5028 /* Structure to hold set iteration abstraction. */
5029 typedef struct {
5030 robj *subject;
5031 unsigned char encoding;
5032 unsigned char direction; /* Iteration direction */
5033 unsigned char *zi;
5034 listNode *ln;
5035 } listTypeIterator;
5036
5037 /* Structure for an entry while iterating over a list. */
5038 typedef struct {
5039 listTypeIterator *li;
5040 unsigned char *zi; /* Entry in ziplist */
5041 listNode *ln; /* Entry in linked list */
5042 } listTypeEntry;
5043
5044 /* Initialize an iterator at the specified index. */
5045 static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5046 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
5047 li->subject = subject;
5048 li->encoding = subject->encoding;
5049 li->direction = direction;
5050 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5051 li->zi = ziplistIndex(subject->ptr,index);
5052 } else if (li->encoding == REDIS_ENCODING_LIST) {
5053 li->ln = listIndex(subject->ptr,index);
5054 } else {
5055 redisPanic("Unknown list encoding");
5056 }
5057 return li;
5058 }
5059
5060 /* Clean up the iterator. */
5061 static void listTypeReleaseIterator(listTypeIterator *li) {
5062 zfree(li);
5063 }
5064
5065 /* Stores pointer to current the entry in the provided entry structure
5066 * and advances the position of the iterator. Returns 1 when the current
5067 * entry is in fact an entry, 0 otherwise. */
5068 static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
5069 /* Protect from converting when iterating */
5070 redisAssert(li->subject->encoding == li->encoding);
5071
5072 entry->li = li;
5073 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5074 entry->zi = li->zi;
5075 if (entry->zi != NULL) {
5076 if (li->direction == REDIS_TAIL)
5077 li->zi = ziplistNext(li->subject->ptr,li->zi);
5078 else
5079 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5080 return 1;
5081 }
5082 } else if (li->encoding == REDIS_ENCODING_LIST) {
5083 entry->ln = li->ln;
5084 if (entry->ln != NULL) {
5085 if (li->direction == REDIS_TAIL)
5086 li->ln = li->ln->next;
5087 else
5088 li->ln = li->ln->prev;
5089 return 1;
5090 }
5091 } else {
5092 redisPanic("Unknown list encoding");
5093 }
5094 return 0;
5095 }
5096
5097 /* Return entry or NULL at the current position of the iterator. */
5098 static robj *listTypeGet(listTypeEntry *entry) {
5099 listTypeIterator *li = entry->li;
5100 robj *value = NULL;
5101 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5102 unsigned char *vstr;
5103 unsigned int vlen;
5104 long long vlong;
5105 redisAssert(entry->zi != NULL);
5106 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5107 if (vstr) {
5108 value = createStringObject((char*)vstr,vlen);
5109 } else {
5110 value = createStringObjectFromLongLong(vlong);
5111 }
5112 }
5113 } else if (li->encoding == REDIS_ENCODING_LIST) {
5114 redisAssert(entry->ln != NULL);
5115 value = listNodeValue(entry->ln);
5116 incrRefCount(value);
5117 } else {
5118 redisPanic("Unknown list encoding");
5119 }
5120 return value;
5121 }
5122
5123 /* Compare the given object with the entry at the current position. */
5124 static int listTypeEqual(listTypeEntry *entry, robj *o) {
5125 listTypeIterator *li = entry->li;
5126 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5127 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5128 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5129 } else if (li->encoding == REDIS_ENCODING_LIST) {
5130 return equalStringObjects(o,listNodeValue(entry->ln));
5131 } else {
5132 redisPanic("Unknown list encoding");
5133 }
5134 }
5135
5136 /* Delete the element pointed to. */
5137 static void listTypeDelete(listTypeEntry *entry) {
5138 listTypeIterator *li = entry->li;
5139 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5140 unsigned char *p = entry->zi;
5141 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5142
5143 /* Update position of the iterator depending on the direction */
5144 if (li->direction == REDIS_TAIL)
5145 li->zi = p;
5146 else
5147 li->zi = ziplistPrev(li->subject->ptr,p);
5148 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5149 listNode *next;
5150 if (li->direction == REDIS_TAIL)
5151 next = entry->ln->next;
5152 else
5153 next = entry->ln->prev;
5154 listDelNode(li->subject->ptr,entry->ln);
5155 li->ln = next;
5156 } else {
5157 redisPanic("Unknown list encoding");
5158 }
5159 }
5160
5161 static void listTypeConvert(robj *subject, int enc) {
5162 listTypeIterator *li;
5163 listTypeEntry entry;
5164 redisAssert(subject->type == REDIS_LIST);
5165
5166 if (enc == REDIS_ENCODING_LIST) {
5167 list *l = listCreate();
5168 listSetFreeMethod(l,decrRefCount);
5169
5170 /* listTypeGet returns a robj with incremented refcount */
5171 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5172 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5173 listTypeReleaseIterator(li);
5174
5175 subject->encoding = REDIS_ENCODING_LIST;
5176 zfree(subject->ptr);
5177 subject->ptr = l;
5178 } else {
5179 redisPanic("Unsupported list conversion");
5180 }
5181 }
5182
5183 static void pushGenericCommand(redisClient *c, int where) {
5184 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5185 if (lobj == NULL) {
5186 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5187 addReply(c,shared.cone);
5188 return;
5189 }
5190 lobj = createZiplistObject();
5191 dbAdd(c->db,c->argv[1],lobj);
5192 } else {
5193 if (lobj->type != REDIS_LIST) {
5194 addReply(c,shared.wrongtypeerr);
5195 return;
5196 }
5197 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5198 addReply(c,shared.cone);
5199 return;
5200 }
5201 }
5202 listTypePush(lobj,c->argv[2],where);
5203 addReplyLongLong(c,listTypeLength(lobj));
5204 server.dirty++;
5205 }
5206
5207 static void lpushCommand(redisClient *c) {
5208 pushGenericCommand(c,REDIS_HEAD);
5209 }
5210
5211 static void rpushCommand(redisClient *c) {
5212 pushGenericCommand(c,REDIS_TAIL);
5213 }
5214
5215 static void llenCommand(redisClient *c) {
5216 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5217 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5218 addReplyUlong(c,listTypeLength(o));
5219 }
5220
5221 static void lindexCommand(redisClient *c) {
5222 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5223 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5224 int index = atoi(c->argv[2]->ptr);
5225 robj *value = NULL;
5226
5227 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5228 unsigned char *p;
5229 unsigned char *vstr;
5230 unsigned int vlen;
5231 long long vlong;
5232 p = ziplistIndex(o->ptr,index);
5233 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5234 if (vstr) {
5235 value = createStringObject((char*)vstr,vlen);
5236 } else {
5237 value = createStringObjectFromLongLong(vlong);
5238 }
5239 addReplyBulk(c,value);
5240 decrRefCount(value);
5241 } else {
5242 addReply(c,shared.nullbulk);
5243 }
5244 } else if (o->encoding == REDIS_ENCODING_LIST) {
5245 listNode *ln = listIndex(o->ptr,index);
5246 if (ln != NULL) {
5247 value = listNodeValue(ln);
5248 addReplyBulk(c,value);
5249 } else {
5250 addReply(c,shared.nullbulk);
5251 }
5252 } else {
5253 redisPanic("Unknown list encoding");
5254 }
5255 }
5256
5257 static void lsetCommand(redisClient *c) {
5258 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5259 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5260 int index = atoi(c->argv[2]->ptr);
5261 robj *value = c->argv[3];
5262
5263 listTypeTryConversion(o,value);
5264 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5265 unsigned char *p, *zl = o->ptr;
5266 p = ziplistIndex(zl,index);
5267 if (p == NULL) {
5268 addReply(c,shared.outofrangeerr);
5269 } else {
5270 o->ptr = ziplistDelete(o->ptr,&p);
5271 value = getDecodedObject(value);
5272 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5273 decrRefCount(value);
5274 addReply(c,shared.ok);
5275 server.dirty++;
5276 }
5277 } else if (o->encoding == REDIS_ENCODING_LIST) {
5278 listNode *ln = listIndex(o->ptr,index);
5279 if (ln == NULL) {
5280 addReply(c,shared.outofrangeerr);
5281 } else {
5282 decrRefCount((robj*)listNodeValue(ln));
5283 listNodeValue(ln) = value;
5284 incrRefCount(value);
5285 addReply(c,shared.ok);
5286 server.dirty++;
5287 }
5288 } else {
5289 redisPanic("Unknown list encoding");
5290 }
5291 }
5292
5293 static void popGenericCommand(redisClient *c, int where) {
5294 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5295 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5296
5297 robj *value = listTypePop(o,where);
5298 if (value == NULL) {
5299 addReply(c,shared.nullbulk);
5300 } else {
5301 addReplyBulk(c,value);
5302 decrRefCount(value);
5303 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5304 server.dirty++;
5305 }
5306 }
5307
5308 static void lpopCommand(redisClient *c) {
5309 popGenericCommand(c,REDIS_HEAD);
5310 }
5311
5312 static void rpopCommand(redisClient *c) {
5313 popGenericCommand(c,REDIS_TAIL);
5314 }
5315
5316 static void lrangeCommand(redisClient *c) {
5317 robj *o, *value;
5318 int start = atoi(c->argv[2]->ptr);
5319 int end = atoi(c->argv[3]->ptr);
5320 int llen;
5321 int rangelen, j;
5322 listTypeEntry entry;
5323
5324 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5325 || checkType(c,o,REDIS_LIST)) return;
5326 llen = listTypeLength(o);
5327
5328 /* convert negative indexes */
5329 if (start < 0) start = llen+start;
5330 if (end < 0) end = llen+end;
5331 if (start < 0) start = 0;
5332 if (end < 0) end = 0;
5333
5334 /* indexes sanity checks */
5335 if (start > end || start >= llen) {
5336 /* Out of range start or start > end result in empty list */
5337 addReply(c,shared.emptymultibulk);
5338 return;
5339 }
5340 if (end >= llen) end = llen-1;
5341 rangelen = (end-start)+1;
5342
5343 /* Return the result in form of a multi-bulk reply */
5344 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5345 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
5346 for (j = 0; j < rangelen; j++) {
5347 redisAssert(listTypeNext(li,&entry));
5348 value = listTypeGet(&entry);
5349 addReplyBulk(c,value);
5350 decrRefCount(value);
5351 }
5352 listTypeReleaseIterator(li);
5353 }
5354
5355 static void ltrimCommand(redisClient *c) {
5356 robj *o;
5357 int start = atoi(c->argv[2]->ptr);
5358 int end = atoi(c->argv[3]->ptr);
5359 int llen;
5360 int j, ltrim, rtrim;
5361 list *list;
5362 listNode *ln;
5363
5364 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5365 checkType(c,o,REDIS_LIST)) return;
5366 llen = listTypeLength(o);
5367
5368 /* convert negative indexes */
5369 if (start < 0) start = llen+start;
5370 if (end < 0) end = llen+end;
5371 if (start < 0) start = 0;
5372 if (end < 0) end = 0;
5373
5374 /* indexes sanity checks */
5375 if (start > end || start >= llen) {
5376 /* Out of range start or start > end result in empty list */
5377 ltrim = llen;
5378 rtrim = 0;
5379 } else {
5380 if (end >= llen) end = llen-1;
5381 ltrim = start;
5382 rtrim = llen-end-1;
5383 }
5384
5385 /* Remove list elements to perform the trim */
5386 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5387 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5388 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5389 } else if (o->encoding == REDIS_ENCODING_LIST) {
5390 list = o->ptr;
5391 for (j = 0; j < ltrim; j++) {
5392 ln = listFirst(list);
5393 listDelNode(list,ln);
5394 }
5395 for (j = 0; j < rtrim; j++) {
5396 ln = listLast(list);
5397 listDelNode(list,ln);
5398 }
5399 } else {
5400 redisPanic("Unknown list encoding");
5401 }
5402 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5403 server.dirty++;
5404 addReply(c,shared.ok);
5405 }
5406
5407 static void lremCommand(redisClient *c) {
5408 robj *subject, *obj = c->argv[3];
5409 int toremove = atoi(c->argv[2]->ptr);
5410 int removed = 0;
5411 listTypeEntry entry;
5412
5413 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5414 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5415
5416 /* Make sure obj is raw when we're dealing with a ziplist */
5417 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5418 obj = getDecodedObject(obj);
5419
5420 listTypeIterator *li;
5421 if (toremove < 0) {
5422 toremove = -toremove;
5423 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
5424 } else {
5425 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5426 }
5427
5428 while (listTypeNext(li,&entry)) {
5429 if (listTypeEqual(&entry,obj)) {
5430 listTypeDelete(&entry);
5431 server.dirty++;
5432 removed++;
5433 if (toremove && removed == toremove) break;
5434 }
5435 }
5436 listTypeReleaseIterator(li);
5437
5438 /* Clean up raw encoded object */
5439 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5440 decrRefCount(obj);
5441
5442 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5443 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5444 }
5445
5446 /* This is the semantic of this command:
5447 * RPOPLPUSH srclist dstlist:
5448 * IF LLEN(srclist) > 0
5449 * element = RPOP srclist
5450 * LPUSH dstlist element
5451 * RETURN element
5452 * ELSE
5453 * RETURN nil
5454 * END
5455 * END
5456 *
5457 * The idea is to be able to get an element from a list in a reliable way
5458 * since the element is not just returned but pushed against another list
5459 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5460 */
5461 static void rpoplpushcommand(redisClient *c) {
5462 robj *sobj, *value;
5463 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5464 checkType(c,sobj,REDIS_LIST)) return;
5465
5466 if (listTypeLength(sobj) == 0) {
5467 addReply(c,shared.nullbulk);
5468 } else {
5469 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5470 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5471 value = listTypePop(sobj,REDIS_TAIL);
5472
5473 /* Add the element to the target list (unless it's directly
5474 * passed to some BLPOP-ing client */
5475 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5476 /* Create the list if the key does not exist */
5477 if (!dobj) {
5478 dobj = createZiplistObject();
5479 dbAdd(c->db,c->argv[2],dobj);
5480 }
5481 listTypePush(dobj,value,REDIS_HEAD);
5482 }
5483
5484 /* Send the element to the client as reply as well */
5485 addReplyBulk(c,value);
5486
5487 /* listTypePop returns an object with its refcount incremented */
5488 decrRefCount(value);
5489
5490 /* Delete the source list when it is empty */
5491 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5492 server.dirty++;
5493 }
5494 }
5495
5496 /* ==================================== Sets ================================ */
5497
5498 /* Factory method to return a set that *can* hold "value". When the object has
5499 * an integer-encodable value, an intset will be returned. Otherwise a regular
5500 * hash table. */
5501 static robj *setTypeCreate(robj *value) {
5502 if (getLongLongFromObject(value,NULL) == REDIS_OK)
5503 return createIntsetObject();
5504 return createSetObject();
5505 }
5506
5507 static int setTypeAdd(robj *subject, robj *value) {
5508 long long llval;
5509 if (subject->encoding == REDIS_ENCODING_HT) {
5510 if (dictAdd(subject->ptr,value,NULL) == DICT_OK) {
5511 incrRefCount(value);
5512 return 1;
5513 }
5514 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5515 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5516 uint8_t success = 0;
5517 subject->ptr = intsetAdd(subject->ptr,llval,&success);
5518 if (success) {
5519 /* Convert to regular set when the intset contains
5520 * too many entries. */
5521 if (intsetLen(subject->ptr) > server.set_max_intset_entries)
5522 setTypeConvert(subject,REDIS_ENCODING_HT);
5523 return 1;
5524 }
5525 } else {
5526 /* Failed to get integer from object, convert to regular set. */
5527 setTypeConvert(subject,REDIS_ENCODING_HT);
5528
5529 /* The set *was* an intset and this value is not integer
5530 * encodable, so dictAdd should always work. */
5531 redisAssert(dictAdd(subject->ptr,value,NULL) == DICT_OK);
5532 incrRefCount(value);
5533 return 1;
5534 }
5535 } else {
5536 redisPanic("Unknown set encoding");
5537 }
5538 return 0;
5539 }
5540
5541 static int setTypeRemove(robj *subject, robj *value) {
5542 long long llval;
5543 if (subject->encoding == REDIS_ENCODING_HT) {
5544 if (dictDelete(subject->ptr,value) == DICT_OK) {
5545 if (htNeedsResize(subject->ptr)) dictResize(subject->ptr);
5546 return 1;
5547 }
5548 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5549 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5550 uint8_t success;
5551 subject->ptr = intsetRemove(subject->ptr,llval,&success);
5552 if (success) return 1;
5553 }
5554 } else {
5555 redisPanic("Unknown set encoding");
5556 }
5557 return 0;
5558 }
5559
5560 static int setTypeIsMember(robj *subject, robj *value) {
5561 long long llval;
5562 if (subject->encoding == REDIS_ENCODING_HT) {
5563 return dictFind((dict*)subject->ptr,value) != NULL;
5564 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5565 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5566 return intsetFind((intset*)subject->ptr,llval);
5567 }
5568 } else {
5569 redisPanic("Unknown set encoding");
5570 }
5571 return 0;
5572 }
5573
5574 /* Structure to hold set iteration abstraction. */
5575 typedef struct {
5576 robj *subject;
5577 int encoding;
5578 int ii; /* intset iterator */
5579 dictIterator *di;
5580 } setIterator;
5581
5582 static setIterator *setTypeInitIterator(robj *subject) {
5583 setIterator *si = zmalloc(sizeof(setIterator));
5584 si->subject = subject;
5585 si->encoding = subject->encoding;
5586 if (si->encoding == REDIS_ENCODING_HT) {
5587 si->di = dictGetIterator(subject->ptr);
5588 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5589 si->ii = 0;
5590 } else {
5591 redisPanic("Unknown set encoding");
5592 }
5593 return si;
5594 }
5595
5596 static void setTypeReleaseIterator(setIterator *si) {
5597 if (si->encoding == REDIS_ENCODING_HT)
5598 dictReleaseIterator(si->di);
5599 zfree(si);
5600 }
5601
5602 /* Move to the next entry in the set. Returns the object at the current
5603 * position, or NULL when the end is reached. This object will have its
5604 * refcount incremented, so the caller needs to take care of this. */
5605 static robj *setTypeNext(setIterator *si) {
5606 robj *ret = NULL;
5607 if (si->encoding == REDIS_ENCODING_HT) {
5608 dictEntry *de = dictNext(si->di);
5609 if (de != NULL) {
5610 ret = dictGetEntryKey(de);
5611 incrRefCount(ret);
5612 }
5613 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5614 long long llval;
5615 if (intsetGet(si->subject->ptr,si->ii++,&llval))
5616 ret = createStringObjectFromLongLong(llval);
5617 }
5618 return ret;
5619 }
5620
5621
5622 /* Return random element from set. The returned object will always have
5623 * an incremented refcount. */
5624 robj *setTypeRandomElement(robj *subject) {
5625 robj *ret = NULL;
5626 if (subject->encoding == REDIS_ENCODING_HT) {
5627 dictEntry *de = dictGetRandomKey(subject->ptr);
5628 ret = dictGetEntryKey(de);
5629 incrRefCount(ret);
5630 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5631 long long llval = intsetRandom(subject->ptr);
5632 ret = createStringObjectFromLongLong(llval);
5633 } else {
5634 redisPanic("Unknown set encoding");
5635 }
5636 return ret;
5637 }
5638
5639 static unsigned long setTypeSize(robj *subject) {
5640 if (subject->encoding == REDIS_ENCODING_HT) {
5641 return dictSize((dict*)subject->ptr);
5642 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5643 return intsetLen((intset*)subject->ptr);
5644 } else {
5645 redisPanic("Unknown set encoding");
5646 }
5647 }
5648
5649 static void setTypeConvert(robj *subject, int enc) {
5650 setIterator *si;
5651 robj *element;
5652 redisAssert(subject->type == REDIS_SET);
5653
5654 if (enc == REDIS_ENCODING_HT) {
5655 dict *d = dictCreate(&setDictType,NULL);
5656
5657 /* setTypeGet returns a robj with incremented refcount */
5658 si = setTypeInitIterator(subject);
5659 while ((element = setTypeNext(si)) != NULL)
5660 redisAssert(dictAdd(d,element,NULL) == DICT_OK);
5661 setTypeReleaseIterator(si);
5662
5663 subject->encoding = REDIS_ENCODING_HT;
5664 zfree(subject->ptr);
5665 subject->ptr = d;
5666 } else {
5667 redisPanic("Unsupported set conversion");
5668 }
5669 }
5670
5671 static void saddCommand(redisClient *c) {
5672 robj *set;
5673
5674 set = lookupKeyWrite(c->db,c->argv[1]);
5675 if (set == NULL) {
5676 set = setTypeCreate(c->argv[2]);
5677 dbAdd(c->db,c->argv[1],set);
5678 } else {
5679 if (set->type != REDIS_SET) {
5680 addReply(c,shared.wrongtypeerr);
5681 return;
5682 }
5683 }
5684 if (setTypeAdd(set,c->argv[2])) {
5685 server.dirty++;
5686 addReply(c,shared.cone);
5687 } else {
5688 addReply(c,shared.czero);
5689 }
5690 }
5691
5692 static void sremCommand(redisClient *c) {
5693 robj *set;
5694
5695 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5696 checkType(c,set,REDIS_SET)) return;
5697
5698 if (setTypeRemove(set,c->argv[2])) {
5699 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
5700 server.dirty++;
5701 addReply(c,shared.cone);
5702 } else {
5703 addReply(c,shared.czero);
5704 }
5705 }
5706
5707 static void smoveCommand(redisClient *c) {
5708 robj *srcset, *dstset, *ele;
5709 srcset = lookupKeyWrite(c->db,c->argv[1]);
5710 dstset = lookupKeyWrite(c->db,c->argv[2]);
5711 ele = c->argv[3];
5712
5713 /* If the source key does not exist return 0 */
5714 if (srcset == NULL) {
5715 addReply(c,shared.czero);
5716 return;
5717 }
5718
5719 /* If the source key has the wrong type, or the destination key
5720 * is set and has the wrong type, return with an error. */
5721 if (checkType(c,srcset,REDIS_SET) ||
5722 (dstset && checkType(c,dstset,REDIS_SET))) return;
5723
5724 /* If srcset and dstset are equal, SMOVE is a no-op */
5725 if (srcset == dstset) {
5726 addReply(c,shared.cone);
5727 return;
5728 }
5729
5730 /* If the element cannot be removed from the src set, return 0. */
5731 if (!setTypeRemove(srcset,ele)) {
5732 addReply(c,shared.czero);
5733 return;
5734 }
5735
5736 /* Remove the src set from the database when empty */
5737 if (setTypeSize(srcset) == 0) dbDelete(c->db,c->argv[1]);
5738 server.dirty++;
5739
5740 /* Create the destination set when it doesn't exist */
5741 if (!dstset) {
5742 dstset = setTypeCreate(ele);
5743 dbAdd(c->db,c->argv[2],dstset);
5744 }
5745
5746 /* An extra key has changed when ele was successfully added to dstset */
5747 if (setTypeAdd(dstset,ele)) server.dirty++;
5748 addReply(c,shared.cone);
5749 }
5750
5751 static void sismemberCommand(redisClient *c) {
5752 robj *set;
5753
5754 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5755 checkType(c,set,REDIS_SET)) return;
5756
5757 if (setTypeIsMember(set,c->argv[2]))
5758 addReply(c,shared.cone);
5759 else
5760 addReply(c,shared.czero);
5761 }
5762
5763 static void scardCommand(redisClient *c) {
5764 robj *o;
5765
5766 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5767 checkType(c,o,REDIS_SET)) return;
5768
5769 addReplyUlong(c,setTypeSize(o));
5770 }
5771
5772 static void spopCommand(redisClient *c) {
5773 robj *set, *ele;
5774
5775 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5776 checkType(c,set,REDIS_SET)) return;
5777
5778 ele = setTypeRandomElement(set);
5779 if (ele == NULL) {
5780 addReply(c,shared.nullbulk);
5781 } else {
5782 setTypeRemove(set,ele);
5783 addReplyBulk(c,ele);
5784 decrRefCount(ele);
5785 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
5786 server.dirty++;
5787 }
5788 }
5789
5790 static void srandmemberCommand(redisClient *c) {
5791 robj *set, *ele;
5792
5793 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5794 checkType(c,set,REDIS_SET)) return;
5795
5796 ele = setTypeRandomElement(set);
5797 if (ele == NULL) {
5798 addReply(c,shared.nullbulk);
5799 } else {
5800 addReplyBulk(c,ele);
5801 decrRefCount(ele);
5802 }
5803 }
5804
5805 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5806 return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2);
5807 }
5808
5809 static void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) {
5810 robj **sets = zmalloc(sizeof(robj*)*setnum);
5811 setIterator *si;
5812 robj *ele, *lenobj = NULL, *dstset = NULL;
5813 unsigned long j, cardinality = 0;
5814
5815 for (j = 0; j < setnum; j++) {
5816 robj *setobj = dstkey ?
5817 lookupKeyWrite(c->db,setkeys[j]) :
5818 lookupKeyRead(c->db,setkeys[j]);
5819 if (!setobj) {
5820 zfree(sets);
5821 if (dstkey) {
5822 if (dbDelete(c->db,dstkey))
5823 server.dirty++;
5824 addReply(c,shared.czero);
5825 } else {
5826 addReply(c,shared.emptymultibulk);
5827 }
5828 return;
5829 }
5830 if (checkType(c,setobj,REDIS_SET)) {
5831 zfree(sets);
5832 return;
5833 }
5834 sets[j] = setobj;
5835 }
5836 /* Sort sets from the smallest to largest, this will improve our
5837 * algorithm's performace */
5838 qsort(sets,setnum,sizeof(robj*),qsortCompareSetsByCardinality);
5839
5840 /* The first thing we should output is the total number of elements...
5841 * since this is a multi-bulk write, but at this stage we don't know
5842 * the intersection set size, so we use a trick, append an empty object
5843 * to the output list and save the pointer to later modify it with the
5844 * right length */
5845 if (!dstkey) {
5846 lenobj = createObject(REDIS_STRING,NULL);
5847 addReply(c,lenobj);
5848 decrRefCount(lenobj);
5849 } else {
5850 /* If we have a target key where to store the resulting set
5851 * create this key with an empty set inside */
5852 dstset = createIntsetObject();
5853 }
5854
5855 /* Iterate all the elements of the first (smallest) set, and test
5856 * the element against all the other sets, if at least one set does
5857 * not include the element it is discarded */
5858 si = setTypeInitIterator(sets[0]);
5859 while((ele = setTypeNext(si)) != NULL) {
5860 for (j = 1; j < setnum; j++)
5861 if (!setTypeIsMember(sets[j],ele)) break;
5862
5863 /* Only take action when all sets contain the member */
5864 if (j == setnum) {
5865 if (!dstkey) {
5866 addReplyBulk(c,ele);
5867 cardinality++;
5868 } else {
5869 setTypeAdd(dstset,ele);
5870 }
5871 }
5872 decrRefCount(ele);
5873 }
5874 setTypeReleaseIterator(si);
5875
5876 if (dstkey) {
5877 /* Store the resulting set into the target, if the intersection
5878 * is not an empty set. */
5879 dbDelete(c->db,dstkey);
5880 if (setTypeSize(dstset) > 0) {
5881 dbAdd(c->db,dstkey,dstset);
5882 addReplyLongLong(c,setTypeSize(dstset));
5883 } else {
5884 decrRefCount(dstset);
5885 addReply(c,shared.czero);
5886 }
5887 server.dirty++;
5888 } else {
5889 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5890 }
5891 zfree(sets);
5892 }
5893
5894 static void sinterCommand(redisClient *c) {
5895 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5896 }
5897
5898 static void sinterstoreCommand(redisClient *c) {
5899 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5900 }
5901
5902 #define REDIS_OP_UNION 0
5903 #define REDIS_OP_DIFF 1
5904 #define REDIS_OP_INTER 2
5905
5906 static void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) {
5907 robj **sets = zmalloc(sizeof(robj*)*setnum);
5908 setIterator *si;
5909 robj *ele, *dstset = NULL;
5910 int j, cardinality = 0;
5911
5912 for (j = 0; j < setnum; j++) {
5913 robj *setobj = dstkey ?
5914 lookupKeyWrite(c->db,setkeys[j]) :
5915 lookupKeyRead(c->db,setkeys[j]);
5916 if (!setobj) {
5917 sets[j] = NULL;
5918 continue;
5919 }
5920 if (checkType(c,setobj,REDIS_SET)) {
5921 zfree(sets);
5922 return;
5923 }
5924 sets[j] = setobj;
5925 }
5926
5927 /* We need a temp set object to store our union. If the dstkey
5928 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5929 * this set object will be the resulting object to set into the target key*/
5930 dstset = createIntsetObject();
5931
5932 /* Iterate all the elements of all the sets, add every element a single
5933 * time to the result set */
5934 for (j = 0; j < setnum; j++) {
5935 if (op == REDIS_OP_DIFF && j == 0 && !sets[j]) break; /* result set is empty */
5936 if (!sets[j]) continue; /* non existing keys are like empty sets */
5937
5938 si = setTypeInitIterator(sets[j]);
5939 while((ele = setTypeNext(si)) != NULL) {
5940 if (op == REDIS_OP_UNION || j == 0) {
5941 if (setTypeAdd(dstset,ele)) {
5942 cardinality++;
5943 }
5944 } else if (op == REDIS_OP_DIFF) {
5945 if (setTypeRemove(dstset,ele)) {
5946 cardinality--;
5947 }
5948 }
5949 decrRefCount(ele);
5950 }
5951 setTypeReleaseIterator(si);
5952
5953 /* Exit when result set is empty. */
5954 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5955 }
5956
5957 /* Output the content of the resulting set, if not in STORE mode */
5958 if (!dstkey) {
5959 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5960 si = setTypeInitIterator(dstset);
5961 while((ele = setTypeNext(si)) != NULL) {
5962 addReplyBulk(c,ele);
5963 decrRefCount(ele);
5964 }
5965 setTypeReleaseIterator(si);
5966 decrRefCount(dstset);
5967 } else {
5968 /* If we have a target key where to store the resulting set
5969 * create this key with the result set inside */
5970 dbDelete(c->db,dstkey);
5971 if (setTypeSize(dstset) > 0) {
5972 dbAdd(c->db,dstkey,dstset);
5973 addReplyLongLong(c,setTypeSize(dstset));
5974 } else {
5975 decrRefCount(dstset);
5976 addReply(c,shared.czero);
5977 }
5978 server.dirty++;
5979 }
5980 zfree(sets);
5981 }
5982
5983 static void sunionCommand(redisClient *c) {
5984 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5985 }
5986
5987 static void sunionstoreCommand(redisClient *c) {
5988 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5989 }
5990
5991 static void sdiffCommand(redisClient *c) {
5992 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5993 }
5994
5995 static void sdiffstoreCommand(redisClient *c) {
5996 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5997 }
5998
5999 /* ==================================== ZSets =============================== */
6000
6001 /* ZSETs are ordered sets using two data structures to hold the same elements
6002 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
6003 * data structure.
6004 *
6005 * The elements are added to a hash table mapping Redis objects to scores.
6006 * At the same time the elements are added to a skip list mapping scores
6007 * to Redis objects (so objects are sorted by scores in this "view"). */
6008
6009 /* This skiplist implementation is almost a C translation of the original
6010 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
6011 * Alternative to Balanced Trees", modified in three ways:
6012 * a) this implementation allows for repeated values.
6013 * b) the comparison is not just by key (our 'score') but by satellite data.
6014 * c) there is a back pointer, so it's a doubly linked list with the back
6015 * pointers being only at "level 1". This allows to traverse the list
6016 * from tail to head, useful for ZREVRANGE. */
6017
6018 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
6019 zskiplistNode *zn = zmalloc(sizeof(*zn));
6020
6021 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
6022 if (level > 1)
6023 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6024 else
6025 zn->span = NULL;
6026 zn->score = score;
6027 zn->obj = obj;
6028 return zn;
6029 }
6030
6031 static zskiplist *zslCreate(void) {
6032 int j;
6033 zskiplist *zsl;
6034
6035 zsl = zmalloc(sizeof(*zsl));
6036 zsl->level = 1;
6037 zsl->length = 0;
6038 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
6039 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6040 zsl->header->forward[j] = NULL;
6041
6042 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
6043 if (j < ZSKIPLIST_MAXLEVEL-1)
6044 zsl->header->span[j] = 0;
6045 }
6046 zsl->header->backward = NULL;
6047 zsl->tail = NULL;
6048 return zsl;
6049 }
6050
6051 static void zslFreeNode(zskiplistNode *node) {
6052 decrRefCount(node->obj);
6053 zfree(node->forward);
6054 zfree(node->span);
6055 zfree(node);
6056 }
6057
6058 static void zslFree(zskiplist *zsl) {
6059 zskiplistNode *node = zsl->header->forward[0], *next;
6060
6061 zfree(zsl->header->forward);
6062 zfree(zsl->header->span);
6063 zfree(zsl->header);
6064 while(node) {
6065 next = node->forward[0];
6066 zslFreeNode(node);
6067 node = next;
6068 }
6069 zfree(zsl);
6070 }
6071
6072 static int zslRandomLevel(void) {
6073 int level = 1;
6074 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
6075 level += 1;
6076 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6077 }
6078
6079 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
6080 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6081 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6082 int i, level;
6083
6084 x = zsl->header;
6085 for (i = zsl->level-1; i >= 0; i--) {
6086 /* store rank that is crossed to reach the insert position */
6087 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
6088
6089 while (x->forward[i] &&
6090 (x->forward[i]->score < score ||
6091 (x->forward[i]->score == score &&
6092 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
6093 rank[i] += i > 0 ? x->span[i-1] : 1;
6094 x = x->forward[i];
6095 }
6096 update[i] = x;
6097 }
6098 /* we assume the key is not already inside, since we allow duplicated
6099 * scores, and the re-insertion of score and redis object should never
6100 * happpen since the caller of zslInsert() should test in the hash table
6101 * if the element is already inside or not. */
6102 level = zslRandomLevel();
6103 if (level > zsl->level) {
6104 for (i = zsl->level; i < level; i++) {
6105 rank[i] = 0;
6106 update[i] = zsl->header;
6107 update[i]->span[i-1] = zsl->length;
6108 }
6109 zsl->level = level;
6110 }
6111 x = zslCreateNode(level,score,obj);
6112 for (i = 0; i < level; i++) {
6113 x->forward[i] = update[i]->forward[i];
6114 update[i]->forward[i] = x;
6115
6116 /* update span covered by update[i] as x is inserted here */
6117 if (i > 0) {
6118 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6119 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6120 }
6121 }
6122
6123 /* increment span for untouched levels */
6124 for (i = level; i < zsl->level; i++) {
6125 update[i]->span[i-1]++;
6126 }
6127
6128 x->backward = (update[0] == zsl->header) ? NULL : update[0];
6129 if (x->forward[0])
6130 x->forward[0]->backward = x;
6131 else
6132 zsl->tail = x;
6133 zsl->length++;
6134 }
6135
6136 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6137 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6138 int i;
6139 for (i = 0; i < zsl->level; i++) {
6140 if (update[i]->forward[i] == x) {
6141 if (i > 0) {
6142 update[i]->span[i-1] += x->span[i-1] - 1;
6143 }
6144 update[i]->forward[i] = x->forward[i];
6145 } else {
6146 /* invariant: i > 0, because update[0]->forward[0]
6147 * is always equal to x */
6148 update[i]->span[i-1] -= 1;
6149 }
6150 }
6151 if (x->forward[0]) {
6152 x->forward[0]->backward = x->backward;
6153 } else {
6154 zsl->tail = x->backward;
6155 }
6156 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6157 zsl->level--;
6158 zsl->length--;
6159 }
6160
6161 /* Delete an element with matching score/object from the skiplist. */
6162 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
6163 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6164 int i;
6165
6166 x = zsl->header;
6167 for (i = zsl->level-1; i >= 0; i--) {
6168 while (x->forward[i] &&
6169 (x->forward[i]->score < score ||
6170 (x->forward[i]->score == score &&
6171 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6172 x = x->forward[i];
6173 update[i] = x;
6174 }
6175 /* We may have multiple elements with the same score, what we need
6176 * is to find the element with both the right score and object. */
6177 x = x->forward[0];
6178 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
6179 zslDeleteNode(zsl, x, update);
6180 zslFreeNode(x);
6181 return 1;
6182 } else {
6183 return 0; /* not found */
6184 }
6185 return 0; /* not found */
6186 }
6187
6188 /* Delete all the elements with score between min and max from the skiplist.
6189 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6190 * Note that this function takes the reference to the hash table view of the
6191 * sorted set, in order to remove the elements from the hash table too. */
6192 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
6193 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6194 unsigned long removed = 0;
6195 int i;
6196
6197 x = zsl->header;
6198 for (i = zsl->level-1; i >= 0; i--) {
6199 while (x->forward[i] && x->forward[i]->score < min)
6200 x = x->forward[i];
6201 update[i] = x;
6202 }
6203 /* We may have multiple elements with the same score, what we need
6204 * is to find the element with both the right score and object. */
6205 x = x->forward[0];
6206 while (x && x->score <= max) {
6207 zskiplistNode *next = x->forward[0];
6208 zslDeleteNode(zsl, x, update);
6209 dictDelete(dict,x->obj);
6210 zslFreeNode(x);
6211 removed++;
6212 x = next;
6213 }
6214 return removed; /* not found */
6215 }
6216
6217 /* Delete all the elements with rank between start and end from the skiplist.
6218 * Start and end are inclusive. Note that start and end need to be 1-based */
6219 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6220 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6221 unsigned long traversed = 0, removed = 0;
6222 int i;
6223
6224 x = zsl->header;
6225 for (i = zsl->level-1; i >= 0; i--) {
6226 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6227 traversed += i > 0 ? x->span[i-1] : 1;
6228 x = x->forward[i];
6229 }
6230 update[i] = x;
6231 }
6232
6233 traversed++;
6234 x = x->forward[0];
6235 while (x && traversed <= end) {
6236 zskiplistNode *next = x->forward[0];
6237 zslDeleteNode(zsl, x, update);
6238 dictDelete(dict,x->obj);
6239 zslFreeNode(x);
6240 removed++;
6241 traversed++;
6242 x = next;
6243 }
6244 return removed;
6245 }
6246
6247 /* Find the first node having a score equal or greater than the specified one.
6248 * Returns NULL if there is no match. */
6249 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6250 zskiplistNode *x;
6251 int i;
6252
6253 x = zsl->header;
6254 for (i = zsl->level-1; i >= 0; i--) {
6255 while (x->forward[i] && x->forward[i]->score < score)
6256 x = x->forward[i];
6257 }
6258 /* We may have multiple elements with the same score, what we need
6259 * is to find the element with both the right score and object. */
6260 return x->forward[0];
6261 }
6262
6263 /* Find the rank for an element by both score and key.
6264 * Returns 0 when the element cannot be found, rank otherwise.
6265 * Note that the rank is 1-based due to the span of zsl->header to the
6266 * first element. */
6267 static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
6268 zskiplistNode *x;
6269 unsigned long rank = 0;
6270 int i;
6271
6272 x = zsl->header;
6273 for (i = zsl->level-1; i >= 0; i--) {
6274 while (x->forward[i] &&
6275 (x->forward[i]->score < score ||
6276 (x->forward[i]->score == score &&
6277 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6278 rank += i > 0 ? x->span[i-1] : 1;
6279 x = x->forward[i];
6280 }
6281
6282 /* x might be equal to zsl->header, so test if obj is non-NULL */
6283 if (x->obj && equalStringObjects(x->obj,o)) {
6284 return rank;
6285 }
6286 }
6287 return 0;
6288 }
6289
6290 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6291 zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
6292 zskiplistNode *x;
6293 unsigned long traversed = 0;
6294 int i;
6295
6296 x = zsl->header;
6297 for (i = zsl->level-1; i >= 0; i--) {
6298 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6299 {
6300 traversed += i > 0 ? x->span[i-1] : 1;
6301 x = x->forward[i];
6302 }
6303 if (traversed == rank) {
6304 return x;
6305 }
6306 }
6307 return NULL;
6308 }
6309
6310 /* The actual Z-commands implementations */
6311
6312 /* This generic command implements both ZADD and ZINCRBY.
6313 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6314 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6315 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6316 robj *zsetobj;
6317 zset *zs;
6318 double *score;
6319
6320 if (isnan(scoreval)) {
6321 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6322 return;
6323 }
6324
6325 zsetobj = lookupKeyWrite(c->db,key);
6326 if (zsetobj == NULL) {
6327 zsetobj = createZsetObject();
6328 dbAdd(c->db,key,zsetobj);
6329 } else {
6330 if (zsetobj->type != REDIS_ZSET) {
6331 addReply(c,shared.wrongtypeerr);
6332 return;
6333 }
6334 }
6335 zs = zsetobj->ptr;
6336
6337 /* Ok now since we implement both ZADD and ZINCRBY here the code
6338 * needs to handle the two different conditions. It's all about setting
6339 * '*score', that is, the new score to set, to the right value. */
6340 score = zmalloc(sizeof(double));
6341 if (doincrement) {
6342 dictEntry *de;
6343
6344 /* Read the old score. If the element was not present starts from 0 */
6345 de = dictFind(zs->dict,ele);
6346 if (de) {
6347 double *oldscore = dictGetEntryVal(de);
6348 *score = *oldscore + scoreval;
6349 } else {
6350 *score = scoreval;
6351 }
6352 if (isnan(*score)) {
6353 addReplySds(c,
6354 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6355 zfree(score);
6356 /* Note that we don't need to check if the zset may be empty and
6357 * should be removed here, as we can only obtain Nan as score if
6358 * there was already an element in the sorted set. */
6359 return;
6360 }
6361 } else {
6362 *score = scoreval;
6363 }
6364
6365 /* What follows is a simple remove and re-insert operation that is common
6366 * to both ZADD and ZINCRBY... */
6367 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6368 /* case 1: New element */
6369 incrRefCount(ele); /* added to hash */
6370 zslInsert(zs->zsl,*score,ele);
6371 incrRefCount(ele); /* added to skiplist */
6372 server.dirty++;
6373 if (doincrement)
6374 addReplyDouble(c,*score);
6375 else
6376 addReply(c,shared.cone);
6377 } else {
6378 dictEntry *de;
6379 double *oldscore;
6380
6381 /* case 2: Score update operation */
6382 de = dictFind(zs->dict,ele);
6383 redisAssert(de != NULL);
6384 oldscore = dictGetEntryVal(de);
6385 if (*score != *oldscore) {
6386 int deleted;
6387
6388 /* Remove and insert the element in the skip list with new score */
6389 deleted = zslDelete(zs->zsl,*oldscore,ele);
6390 redisAssert(deleted != 0);
6391 zslInsert(zs->zsl,*score,ele);
6392 incrRefCount(ele);
6393 /* Update the score in the hash table */
6394 dictReplace(zs->dict,ele,score);
6395 server.dirty++;
6396 } else {
6397 zfree(score);
6398 }
6399 if (doincrement)
6400 addReplyDouble(c,*score);
6401 else
6402 addReply(c,shared.czero);
6403 }
6404 }
6405
6406 static void zaddCommand(redisClient *c) {
6407 double scoreval;
6408
6409 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6410 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6411 }
6412
6413 static void zincrbyCommand(redisClient *c) {
6414 double scoreval;
6415
6416 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6417 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6418 }
6419
6420 static void zremCommand(redisClient *c) {
6421 robj *zsetobj;
6422 zset *zs;
6423 dictEntry *de;
6424 double *oldscore;
6425 int deleted;
6426
6427 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6428 checkType(c,zsetobj,REDIS_ZSET)) return;
6429
6430 zs = zsetobj->ptr;
6431 de = dictFind(zs->dict,c->argv[2]);
6432 if (de == NULL) {
6433 addReply(c,shared.czero);
6434 return;
6435 }
6436 /* Delete from the skiplist */
6437 oldscore = dictGetEntryVal(de);
6438 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6439 redisAssert(deleted != 0);
6440
6441 /* Delete from the hash table */
6442 dictDelete(zs->dict,c->argv[2]);
6443 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6444 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6445 server.dirty++;
6446 addReply(c,shared.cone);
6447 }
6448
6449 static void zremrangebyscoreCommand(redisClient *c) {
6450 double min;
6451 double max;
6452 long deleted;
6453 robj *zsetobj;
6454 zset *zs;
6455
6456 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6457 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6458
6459 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6460 checkType(c,zsetobj,REDIS_ZSET)) return;
6461
6462 zs = zsetobj->ptr;
6463 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6464 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6465 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6466 server.dirty += deleted;
6467 addReplyLongLong(c,deleted);
6468 }
6469
6470 static void zremrangebyrankCommand(redisClient *c) {
6471 long start;
6472 long end;
6473 int llen;
6474 long deleted;
6475 robj *zsetobj;
6476 zset *zs;
6477
6478 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6479 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6480
6481 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6482 checkType(c,zsetobj,REDIS_ZSET)) return;
6483 zs = zsetobj->ptr;
6484 llen = zs->zsl->length;
6485
6486 /* convert negative indexes */
6487 if (start < 0) start = llen+start;
6488 if (end < 0) end = llen+end;
6489 if (start < 0) start = 0;
6490 if (end < 0) end = 0;
6491
6492 /* indexes sanity checks */
6493 if (start > end || start >= llen) {
6494 addReply(c,shared.czero);
6495 return;
6496 }
6497 if (end >= llen) end = llen-1;
6498
6499 /* increment start and end because zsl*Rank functions
6500 * use 1-based rank */
6501 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6502 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6503 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6504 server.dirty += deleted;
6505 addReplyLongLong(c, deleted);
6506 }
6507
6508 typedef struct {
6509 dict *dict;
6510 double weight;
6511 } zsetopsrc;
6512
6513 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6514 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6515 unsigned long size1, size2;
6516 size1 = d1->dict ? dictSize(d1->dict) : 0;
6517 size2 = d2->dict ? dictSize(d2->dict) : 0;
6518 return size1 - size2;
6519 }
6520
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold "val" into "*target" according to the aggregate mode: sum of the
 * two, smallest of the two or largest of the two. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6538
6539 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6540 int i, j, setnum;
6541 int aggregate = REDIS_AGGR_SUM;
6542 zsetopsrc *src;
6543 robj *dstobj;
6544 zset *dstzset;
6545 dictIterator *di;
6546 dictEntry *de;
6547
6548 /* expect setnum input keys to be given */
6549 setnum = atoi(c->argv[2]->ptr);
6550 if (setnum < 1) {
6551 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6552 return;
6553 }
6554
6555 /* test if the expected number of keys would overflow */
6556 if (3+setnum > c->argc) {
6557 addReply(c,shared.syntaxerr);
6558 return;
6559 }
6560
6561 /* read keys to be used for input */
6562 src = zmalloc(sizeof(zsetopsrc) * setnum);
6563 for (i = 0, j = 3; i < setnum; i++, j++) {
6564 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6565 if (!obj) {
6566 src[i].dict = NULL;
6567 } else {
6568 if (obj->type == REDIS_ZSET) {
6569 src[i].dict = ((zset*)obj->ptr)->dict;
6570 } else if (obj->type == REDIS_SET) {
6571 src[i].dict = (obj->ptr);
6572 } else {
6573 zfree(src);
6574 addReply(c,shared.wrongtypeerr);
6575 return;
6576 }
6577 }
6578
6579 /* default all weights to 1 */
6580 src[i].weight = 1.0;
6581 }
6582
6583 /* parse optional extra arguments */
6584 if (j < c->argc) {
6585 int remaining = c->argc - j;
6586
6587 while (remaining) {
6588 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6589 j++; remaining--;
6590 for (i = 0; i < setnum; i++, j++, remaining--) {
6591 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6592 return;
6593 }
6594 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6595 j++; remaining--;
6596 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6597 aggregate = REDIS_AGGR_SUM;
6598 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6599 aggregate = REDIS_AGGR_MIN;
6600 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6601 aggregate = REDIS_AGGR_MAX;
6602 } else {
6603 zfree(src);
6604 addReply(c,shared.syntaxerr);
6605 return;
6606 }
6607 j++; remaining--;
6608 } else {
6609 zfree(src);
6610 addReply(c,shared.syntaxerr);
6611 return;
6612 }
6613 }
6614 }
6615
6616 /* sort sets from the smallest to largest, this will improve our
6617 * algorithm's performance */
6618 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6619
6620 dstobj = createZsetObject();
6621 dstzset = dstobj->ptr;
6622
6623 if (op == REDIS_OP_INTER) {
6624 /* skip going over all entries if the smallest zset is NULL or empty */
6625 if (src[0].dict && dictSize(src[0].dict) > 0) {
6626 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6627 * from small to large, all src[i > 0].dict are non-empty too */
6628 di = dictGetIterator(src[0].dict);
6629 while((de = dictNext(di)) != NULL) {
6630 double *score = zmalloc(sizeof(double)), value;
6631 *score = src[0].weight * zunionInterDictValue(de);
6632
6633 for (j = 1; j < setnum; j++) {
6634 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6635 if (other) {
6636 value = src[j].weight * zunionInterDictValue(other);
6637 zunionInterAggregate(score, value, aggregate);
6638 } else {
6639 break;
6640 }
6641 }
6642
6643 /* skip entry when not present in every source dict */
6644 if (j != setnum) {
6645 zfree(score);
6646 } else {
6647 robj *o = dictGetEntryKey(de);
6648 dictAdd(dstzset->dict,o,score);
6649 incrRefCount(o); /* added to dictionary */
6650 zslInsert(dstzset->zsl,*score,o);
6651 incrRefCount(o); /* added to skiplist */
6652 }
6653 }
6654 dictReleaseIterator(di);
6655 }
6656 } else if (op == REDIS_OP_UNION) {
6657 for (i = 0; i < setnum; i++) {
6658 if (!src[i].dict) continue;
6659
6660 di = dictGetIterator(src[i].dict);
6661 while((de = dictNext(di)) != NULL) {
6662 /* skip key when already processed */
6663 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6664
6665 double *score = zmalloc(sizeof(double)), value;
6666 *score = src[i].weight * zunionInterDictValue(de);
6667
6668 /* because the zsets are sorted by size, its only possible
6669 * for sets at larger indices to hold this entry */
6670 for (j = (i+1); j < setnum; j++) {
6671 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6672 if (other) {
6673 value = src[j].weight * zunionInterDictValue(other);
6674 zunionInterAggregate(score, value, aggregate);
6675 }
6676 }
6677
6678 robj *o = dictGetEntryKey(de);
6679 dictAdd(dstzset->dict,o,score);
6680 incrRefCount(o); /* added to dictionary */
6681 zslInsert(dstzset->zsl,*score,o);
6682 incrRefCount(o); /* added to skiplist */
6683 }
6684 dictReleaseIterator(di);
6685 }
6686 } else {
6687 /* unknown operator */
6688 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6689 }
6690
6691 dbDelete(c->db,dstkey);
6692 if (dstzset->zsl->length) {
6693 dbAdd(c->db,dstkey,dstobj);
6694 addReplyLongLong(c, dstzset->zsl->length);
6695 server.dirty++;
6696 } else {
6697 decrRefCount(dstobj);
6698 addReply(c, shared.czero);
6699 }
6700 zfree(src);
6701 }
6702
6703 static void zunionstoreCommand(redisClient *c) {
6704 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6705 }
6706
6707 static void zinterstoreCommand(redisClient *c) {
6708 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6709 }
6710
6711 static void zrangeGenericCommand(redisClient *c, int reverse) {
6712 robj *o;
6713 long start;
6714 long end;
6715 int withscores = 0;
6716 int llen;
6717 int rangelen, j;
6718 zset *zsetobj;
6719 zskiplist *zsl;
6720 zskiplistNode *ln;
6721 robj *ele;
6722
6723 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6724 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6725
6726 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6727 withscores = 1;
6728 } else if (c->argc >= 5) {
6729 addReply(c,shared.syntaxerr);
6730 return;
6731 }
6732
6733 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6734 || checkType(c,o,REDIS_ZSET)) return;
6735 zsetobj = o->ptr;
6736 zsl = zsetobj->zsl;
6737 llen = zsl->length;
6738
6739 /* convert negative indexes */
6740 if (start < 0) start = llen+start;
6741 if (end < 0) end = llen+end;
6742 if (start < 0) start = 0;
6743 if (end < 0) end = 0;
6744
6745 /* indexes sanity checks */
6746 if (start > end || start >= llen) {
6747 /* Out of range start or start > end result in empty list */
6748 addReply(c,shared.emptymultibulk);
6749 return;
6750 }
6751 if (end >= llen) end = llen-1;
6752 rangelen = (end-start)+1;
6753
6754 /* check if starting point is trivial, before searching
6755 * the element in log(N) time */
6756 if (reverse) {
6757 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
6758 } else {
6759 ln = start == 0 ?
6760 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
6761 }
6762
6763 /* Return the result in form of a multi-bulk reply */
6764 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6765 withscores ? (rangelen*2) : rangelen));
6766 for (j = 0; j < rangelen; j++) {
6767 ele = ln->obj;
6768 addReplyBulk(c,ele);
6769 if (withscores)
6770 addReplyDouble(c,ln->score);
6771 ln = reverse ? ln->backward : ln->forward[0];
6772 }
6773 }
6774
6775 static void zrangeCommand(redisClient *c) {
6776 zrangeGenericCommand(c,0);
6777 }
6778
6779 static void zrevrangeCommand(redisClient *c) {
6780 zrangeGenericCommand(c,1);
6781 }
6782
6783 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6784 * If justcount is non-zero, just the count is returned. */
6785 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6786 robj *o;
6787 double min, max;
6788 int minex = 0, maxex = 0; /* are min or max exclusive? */
6789 int offset = 0, limit = -1;
6790 int withscores = 0;
6791 int badsyntax = 0;
6792
6793 /* Parse the min-max interval. If one of the values is prefixed
6794 * by the "(" character, it's considered "open". For instance
6795 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6796 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6797 if (((char*)c->argv[2]->ptr)[0] == '(') {
6798 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6799 minex = 1;
6800 } else {
6801 min = strtod(c->argv[2]->ptr,NULL);
6802 }
6803 if (((char*)c->argv[3]->ptr)[0] == '(') {
6804 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6805 maxex = 1;
6806 } else {
6807 max = strtod(c->argv[3]->ptr,NULL);
6808 }
6809
6810 /* Parse "WITHSCORES": note that if the command was called with
6811 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6812 * enter the following paths to parse WITHSCORES and LIMIT. */
6813 if (c->argc == 5 || c->argc == 8) {
6814 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6815 withscores = 1;
6816 else
6817 badsyntax = 1;
6818 }
6819 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6820 badsyntax = 1;
6821 if (badsyntax) {
6822 addReplySds(c,
6823 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6824 return;
6825 }
6826
6827 /* Parse "LIMIT" */
6828 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6829 addReply(c,shared.syntaxerr);
6830 return;
6831 } else if (c->argc == (7 + withscores)) {
6832 offset = atoi(c->argv[5]->ptr);
6833 limit = atoi(c->argv[6]->ptr);
6834 if (offset < 0) offset = 0;
6835 }
6836
6837 /* Ok, lookup the key and get the range */
6838 o = lookupKeyRead(c->db,c->argv[1]);
6839 if (o == NULL) {
6840 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6841 } else {
6842 if (o->type != REDIS_ZSET) {
6843 addReply(c,shared.wrongtypeerr);
6844 } else {
6845 zset *zsetobj = o->ptr;
6846 zskiplist *zsl = zsetobj->zsl;
6847 zskiplistNode *ln;
6848 robj *ele, *lenobj = NULL;
6849 unsigned long rangelen = 0;
6850
6851 /* Get the first node with the score >= min, or with
6852 * score > min if 'minex' is true. */
6853 ln = zslFirstWithScore(zsl,min);
6854 while (minex && ln && ln->score == min) ln = ln->forward[0];
6855
6856 if (ln == NULL) {
6857 /* No element matching the speciifed interval */
6858 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6859 return;
6860 }
6861
6862 /* We don't know in advance how many matching elements there
6863 * are in the list, so we push this object that will represent
6864 * the multi-bulk length in the output buffer, and will "fix"
6865 * it later */
6866 if (!justcount) {
6867 lenobj = createObject(REDIS_STRING,NULL);
6868 addReply(c,lenobj);
6869 decrRefCount(lenobj);
6870 }
6871
6872 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6873 if (offset) {
6874 offset--;
6875 ln = ln->forward[0];
6876 continue;
6877 }
6878 if (limit == 0) break;
6879 if (!justcount) {
6880 ele = ln->obj;
6881 addReplyBulk(c,ele);
6882 if (withscores)
6883 addReplyDouble(c,ln->score);
6884 }
6885 ln = ln->forward[0];
6886 rangelen++;
6887 if (limit > 0) limit--;
6888 }
6889 if (justcount) {
6890 addReplyLongLong(c,(long)rangelen);
6891 } else {
6892 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6893 withscores ? (rangelen*2) : rangelen);
6894 }
6895 }
6896 }
6897 }
6898
6899 static void zrangebyscoreCommand(redisClient *c) {
6900 genericZrangebyscoreCommand(c,0);
6901 }
6902
6903 static void zcountCommand(redisClient *c) {
6904 genericZrangebyscoreCommand(c,1);
6905 }
6906
6907 static void zcardCommand(redisClient *c) {
6908 robj *o;
6909 zset *zs;
6910
6911 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6912 checkType(c,o,REDIS_ZSET)) return;
6913
6914 zs = o->ptr;
6915 addReplyUlong(c,zs->zsl->length);
6916 }
6917
6918 static void zscoreCommand(redisClient *c) {
6919 robj *o;
6920 zset *zs;
6921 dictEntry *de;
6922
6923 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6924 checkType(c,o,REDIS_ZSET)) return;
6925
6926 zs = o->ptr;
6927 de = dictFind(zs->dict,c->argv[2]);
6928 if (!de) {
6929 addReply(c,shared.nullbulk);
6930 } else {
6931 double *score = dictGetEntryVal(de);
6932
6933 addReplyDouble(c,*score);
6934 }
6935 }
6936
6937 static void zrankGenericCommand(redisClient *c, int reverse) {
6938 robj *o;
6939 zset *zs;
6940 zskiplist *zsl;
6941 dictEntry *de;
6942 unsigned long rank;
6943 double *score;
6944
6945 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6946 checkType(c,o,REDIS_ZSET)) return;
6947
6948 zs = o->ptr;
6949 zsl = zs->zsl;
6950 de = dictFind(zs->dict,c->argv[2]);
6951 if (!de) {
6952 addReply(c,shared.nullbulk);
6953 return;
6954 }
6955
6956 score = dictGetEntryVal(de);
6957 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
6958 if (rank) {
6959 if (reverse) {
6960 addReplyLongLong(c, zsl->length - rank);
6961 } else {
6962 addReplyLongLong(c, rank-1);
6963 }
6964 } else {
6965 addReply(c,shared.nullbulk);
6966 }
6967 }
6968
6969 static void zrankCommand(redisClient *c) {
6970 zrankGenericCommand(c, 0);
6971 }
6972
6973 static void zrevrankCommand(redisClient *c) {
6974 zrankGenericCommand(c, 1);
6975 }
6976
6977 /* ========================= Hashes utility functions ======================= */
6978 #define REDIS_HASH_KEY 1
6979 #define REDIS_HASH_VALUE 2
6980
6981 /* Check the length of a number of objects to see if we need to convert a
6982 * zipmap to a real hash. Note that we only check string encoded objects
6983 * as their string length can be queried in constant time. */
6984 static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
6985 int i;
6986 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6987
6988 for (i = start; i <= end; i++) {
6989 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6990 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6991 {
6992 convertToRealHash(subject);
6993 return;
6994 }
6995 }
6996 }
6997
6998 /* Encode given objects in-place when the hash uses a dict. */
6999 static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
7000 if (subject->encoding == REDIS_ENCODING_HT) {
7001 if (o1) *o1 = tryObjectEncoding(*o1);
7002 if (o2) *o2 = tryObjectEncoding(*o2);
7003 }
7004 }
7005
7006 /* Get the value from a hash identified by key. Returns either a string
7007 * object or NULL if the value cannot be found. The refcount of the object
7008 * is always increased by 1 when the value was found. */
7009 static robj *hashTypeGet(robj *o, robj *key) {
7010 robj *value = NULL;
7011 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7012 unsigned char *v;
7013 unsigned int vlen;
7014 key = getDecodedObject(key);
7015 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
7016 value = createStringObject((char*)v,vlen);
7017 }
7018 decrRefCount(key);
7019 } else {
7020 dictEntry *de = dictFind(o->ptr,key);
7021 if (de != NULL) {
7022 value = dictGetEntryVal(de);
7023 incrRefCount(value);
7024 }
7025 }
7026 return value;
7027 }
7028
7029 /* Test if the key exists in the given hash. Returns 1 if the key
7030 * exists and 0 when it doesn't. */
7031 static int hashTypeExists(robj *o, robj *key) {
7032 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7033 key = getDecodedObject(key);
7034 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
7035 decrRefCount(key);
7036 return 1;
7037 }
7038 decrRefCount(key);
7039 } else {
7040 if (dictFind(o->ptr,key) != NULL) {
7041 return 1;
7042 }
7043 }
7044 return 0;
7045 }
7046
7047 /* Add an element, discard the old if the key already exists.
7048 * Return 0 on insert and 1 on update. */
7049 static int hashTypeSet(robj *o, robj *key, robj *value) {
7050 int update = 0;
7051 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7052 key = getDecodedObject(key);
7053 value = getDecodedObject(value);
7054 o->ptr = zipmapSet(o->ptr,
7055 key->ptr,sdslen(key->ptr),
7056 value->ptr,sdslen(value->ptr), &update);
7057 decrRefCount(key);
7058 decrRefCount(value);
7059
7060 /* Check if the zipmap needs to be upgraded to a real hash table */
7061 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
7062 convertToRealHash(o);
7063 } else {
7064 if (dictReplace(o->ptr,key,value)) {
7065 /* Insert */
7066 incrRefCount(key);
7067 } else {
7068 /* Update */
7069 update = 1;
7070 }
7071 incrRefCount(value);
7072 }
7073 return update;
7074 }
7075
7076 /* Delete an element from a hash.
7077 * Return 1 on deleted and 0 on not found. */
7078 static int hashTypeDelete(robj *o, robj *key) {
7079 int deleted = 0;
7080 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7081 key = getDecodedObject(key);
7082 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
7083 decrRefCount(key);
7084 } else {
7085 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
7086 /* Always check if the dictionary needs a resize after a delete. */
7087 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
7088 }
7089 return deleted;
7090 }
7091
7092 /* Return the number of elements in a hash. */
7093 static unsigned long hashTypeLength(robj *o) {
7094 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7095 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7096 }
7097
7098 /* Structure to hold hash iteration abstration. Note that iteration over
7099 * hashes involves both fields and values. Because it is possible that
7100 * not both are required, store pointers in the iterator to avoid
7101 * unnecessary memory allocation for fields/values. */
7102 typedef struct {
7103 int encoding;
7104 unsigned char *zi;
7105 unsigned char *zk, *zv;
7106 unsigned int zklen, zvlen;
7107
7108 dictIterator *di;
7109 dictEntry *de;
7110 } hashTypeIterator;
7111
7112 static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7113 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7114 hi->encoding = subject->encoding;
7115 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7116 hi->zi = zipmapRewind(subject->ptr);
7117 } else if (hi->encoding == REDIS_ENCODING_HT) {
7118 hi->di = dictGetIterator(subject->ptr);
7119 } else {
7120 redisAssert(NULL);
7121 }
7122 return hi;
7123 }
7124
7125 static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7126 if (hi->encoding == REDIS_ENCODING_HT) {
7127 dictReleaseIterator(hi->di);
7128 }
7129 zfree(hi);
7130 }
7131
7132 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7133 * could be found and REDIS_ERR when the iterator reaches the end. */
7134 static int hashTypeNext(hashTypeIterator *hi) {
7135 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7136 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7137 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7138 } else {
7139 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7140 }
7141 return REDIS_OK;
7142 }
7143
7144 /* Get key or value object at current iteration position.
7145 * This increases the refcount of the field object by 1. */
7146 static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7147 robj *o;
7148 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7149 if (what & REDIS_HASH_KEY) {
7150 o = createStringObject((char*)hi->zk,hi->zklen);
7151 } else {
7152 o = createStringObject((char*)hi->zv,hi->zvlen);
7153 }
7154 } else {
7155 if (what & REDIS_HASH_KEY) {
7156 o = dictGetEntryKey(hi->de);
7157 } else {
7158 o = dictGetEntryVal(hi->de);
7159 }
7160 incrRefCount(o);
7161 }
7162 return o;
7163 }
7164
7165 static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7166 robj *o = lookupKeyWrite(c->db,key);
7167 if (o == NULL) {
7168 o = createHashObject();
7169 dbAdd(c->db,key,o);
7170 } else {
7171 if (o->type != REDIS_HASH) {
7172 addReply(c,shared.wrongtypeerr);
7173 return NULL;
7174 }
7175 }
7176 return o;
7177 }
7178
7179 /* ============================= Hash commands ============================== */
7180 static void hsetCommand(redisClient *c) {
7181 int update;
7182 robj *o;
7183
7184 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7185 hashTypeTryConversion(o,c->argv,2,3);
7186 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7187 update = hashTypeSet(o,c->argv[2],c->argv[3]);
7188 addReply(c, update ? shared.czero : shared.cone);
7189 server.dirty++;
7190 }
7191
7192 static void hsetnxCommand(redisClient *c) {
7193 robj *o;
7194 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7195 hashTypeTryConversion(o,c->argv,2,3);
7196
7197 if (hashTypeExists(o, c->argv[2])) {
7198 addReply(c, shared.czero);
7199 } else {
7200 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7201 hashTypeSet(o,c->argv[2],c->argv[3]);
7202 addReply(c, shared.cone);
7203 server.dirty++;
7204 }
7205 }
7206
7207 static void hmsetCommand(redisClient *c) {
7208 int i;
7209 robj *o;
7210
7211 if ((c->argc % 2) == 1) {
7212 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7213 return;
7214 }
7215
7216 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7217 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7218 for (i = 2; i < c->argc; i += 2) {
7219 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7220 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7221 }
7222 addReply(c, shared.ok);
7223 server.dirty++;
7224 }
7225
7226 static void hincrbyCommand(redisClient *c) {
7227 long long value, incr;
7228 robj *o, *current, *new;
7229
7230 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7231 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7232 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
7233 if (getLongLongFromObjectOrReply(c,current,&value,
7234 "hash value is not an integer") != REDIS_OK) {
7235 decrRefCount(current);
7236 return;
7237 }
7238 decrRefCount(current);
7239 } else {
7240 value = 0;
7241 }
7242
7243 value += incr;
7244 new = createStringObjectFromLongLong(value);
7245 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7246 hashTypeSet(o,c->argv[2],new);
7247 decrRefCount(new);
7248 addReplyLongLong(c,value);
7249 server.dirty++;
7250 }
7251
7252 static void hgetCommand(redisClient *c) {
7253 robj *o, *value;
7254 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7255 checkType(c,o,REDIS_HASH)) return;
7256
7257 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7258 addReplyBulk(c,value);
7259 decrRefCount(value);
7260 } else {
7261 addReply(c,shared.nullbulk);
7262 }
7263 }
7264
7265 static void hmgetCommand(redisClient *c) {
7266 int i;
7267 robj *o, *value;
7268 o = lookupKeyRead(c->db,c->argv[1]);
7269 if (o != NULL && o->type != REDIS_HASH) {
7270 addReply(c,shared.wrongtypeerr);
7271 }
7272
7273 /* Note the check for o != NULL happens inside the loop. This is
7274 * done because objects that cannot be found are considered to be
7275 * an empty hash. The reply should then be a series of NULLs. */
7276 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7277 for (i = 2; i < c->argc; i++) {
7278 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7279 addReplyBulk(c,value);
7280 decrRefCount(value);
7281 } else {
7282 addReply(c,shared.nullbulk);
7283 }
7284 }
7285 }
7286
7287 static void hdelCommand(redisClient *c) {
7288 robj *o;
7289 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7290 checkType(c,o,REDIS_HASH)) return;
7291
7292 if (hashTypeDelete(o,c->argv[2])) {
7293 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7294 addReply(c,shared.cone);
7295 server.dirty++;
7296 } else {
7297 addReply(c,shared.czero);
7298 }
7299 }
7300
7301 static void hlenCommand(redisClient *c) {
7302 robj *o;
7303 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7304 checkType(c,o,REDIS_HASH)) return;
7305
7306 addReplyUlong(c,hashTypeLength(o));
7307 }
7308
7309 static void genericHgetallCommand(redisClient *c, int flags) {
7310 robj *o, *lenobj, *obj;
7311 unsigned long count = 0;
7312 hashTypeIterator *hi;
7313
7314 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7315 || checkType(c,o,REDIS_HASH)) return;
7316
7317 lenobj = createObject(REDIS_STRING,NULL);
7318 addReply(c,lenobj);
7319 decrRefCount(lenobj);
7320
7321 hi = hashTypeInitIterator(o);
7322 while (hashTypeNext(hi) != REDIS_ERR) {
7323 if (flags & REDIS_HASH_KEY) {
7324 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7325 addReplyBulk(c,obj);
7326 decrRefCount(obj);
7327 count++;
7328 }
7329 if (flags & REDIS_HASH_VALUE) {
7330 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7331 addReplyBulk(c,obj);
7332 decrRefCount(obj);
7333 count++;
7334 }
7335 }
7336 hashTypeReleaseIterator(hi);
7337
7338 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7339 }
7340
7341 static void hkeysCommand(redisClient *c) {
7342 genericHgetallCommand(c,REDIS_HASH_KEY);
7343 }
7344
7345 static void hvalsCommand(redisClient *c) {
7346 genericHgetallCommand(c,REDIS_HASH_VALUE);
7347 }
7348
7349 static void hgetallCommand(redisClient *c) {
7350 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7351 }
7352
7353 static void hexistsCommand(redisClient *c) {
7354 robj *o;
7355 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7356 checkType(c,o,REDIS_HASH)) return;
7357
7358 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
7359 }
7360
7361 static void convertToRealHash(robj *o) {
7362 unsigned char *key, *val, *p, *zm = o->ptr;
7363 unsigned int klen, vlen;
7364 dict *dict = dictCreate(&hashDictType,NULL);
7365
7366 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7367 p = zipmapRewind(zm);
7368 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7369 robj *keyobj, *valobj;
7370
7371 keyobj = createStringObject((char*)key,klen);
7372 valobj = createStringObject((char*)val,vlen);
7373 keyobj = tryObjectEncoding(keyobj);
7374 valobj = tryObjectEncoding(valobj);
7375 dictAdd(dict,keyobj,valobj);
7376 }
7377 o->encoding = REDIS_ENCODING_HT;
7378 o->ptr = dict;
7379 zfree(zm);
7380 }
7381
7382 /* ========================= Non type-specific commands ==================== */
7383
7384 static void flushdbCommand(redisClient *c) {
7385 server.dirty += dictSize(c->db->dict);
7386 touchWatchedKeysOnFlush(c->db->id);
7387 dictEmpty(c->db->dict);
7388 dictEmpty(c->db->expires);
7389 addReply(c,shared.ok);
7390 }
7391
7392 static void flushallCommand(redisClient *c) {
7393 touchWatchedKeysOnFlush(-1);
7394 server.dirty += emptyDb();
7395 addReply(c,shared.ok);
7396 if (server.bgsavechildpid != -1) {
7397 kill(server.bgsavechildpid,SIGKILL);
7398 rdbRemoveTempFile(server.bgsavechildpid);
7399 }
7400 rdbSave(server.dbfilename);
7401 server.dirty++;
7402 }
7403
7404 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7405 redisSortOperation *so = zmalloc(sizeof(*so));
7406 so->type = type;
7407 so->pattern = pattern;
7408 return so;
7409 }
7410
7411 /* Return the value associated to the key with a name obtained
7412 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7413 * The returned object will always have its refcount increased by 1
7414 * when it is non-NULL. */
7415 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7416 char *p, *f;
7417 sds spat, ssub;
7418 robj keyobj, fieldobj, *o;
7419 int prefixlen, sublen, postfixlen, fieldlen;
7420 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7421 struct {
7422 long len;
7423 long free;
7424 char buf[REDIS_SORTKEY_MAX+1];
7425 } keyname, fieldname;
7426
7427 /* If the pattern is "#" return the substitution object itself in order
7428 * to implement the "SORT ... GET #" feature. */
7429 spat = pattern->ptr;
7430 if (spat[0] == '#' && spat[1] == '\0') {
7431 incrRefCount(subst);
7432 return subst;
7433 }
7434
7435 /* The substitution object may be specially encoded. If so we create
7436 * a decoded object on the fly. Otherwise getDecodedObject will just
7437 * increment the ref count, that we'll decrement later. */
7438 subst = getDecodedObject(subst);
7439
7440 ssub = subst->ptr;
7441 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7442 p = strchr(spat,'*');
7443 if (!p) {
7444 decrRefCount(subst);
7445 return NULL;
7446 }
7447
7448 /* Find out if we're dealing with a hash dereference. */
7449 if ((f = strstr(p+1, "->")) != NULL) {
7450 fieldlen = sdslen(spat)-(f-spat);
7451 /* this also copies \0 character */
7452 memcpy(fieldname.buf,f+2,fieldlen-1);
7453 fieldname.len = fieldlen-2;
7454 } else {
7455 fieldlen = 0;
7456 }
7457
7458 prefixlen = p-spat;
7459 sublen = sdslen(ssub);
7460 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7461 memcpy(keyname.buf,spat,prefixlen);
7462 memcpy(keyname.buf+prefixlen,ssub,sublen);
7463 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7464 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7465 keyname.len = prefixlen+sublen+postfixlen;
7466 decrRefCount(subst);
7467
7468 /* Lookup substituted key */
7469 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7470 o = lookupKeyRead(db,&keyobj);
7471 if (o == NULL) return NULL;
7472
7473 if (fieldlen > 0) {
7474 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7475
7476 /* Retrieve value from hash by the field name. This operation
7477 * already increases the refcount of the returned object. */
7478 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7479 o = hashTypeGet(o, &fieldobj);
7480 } else {
7481 if (o->type != REDIS_STRING) return NULL;
7482
7483 /* Every object that this function returns needs to have its refcount
7484 * increased. sortCommand decreases it again. */
7485 incrRefCount(o);
7486 }
7487
7488 return o;
7489 }
7490
7491 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7492 * the additional parameter is not standard but a BSD-specific we have to
7493 * pass sorting parameters via the global 'server' structure */
7494 static int sortCompare(const void *s1, const void *s2) {
7495 const redisSortObject *so1 = s1, *so2 = s2;
7496 int cmp;
7497
7498 if (!server.sort_alpha) {
7499 /* Numeric sorting. Here it's trivial as we precomputed scores */
7500 if (so1->u.score > so2->u.score) {
7501 cmp = 1;
7502 } else if (so1->u.score < so2->u.score) {
7503 cmp = -1;
7504 } else {
7505 cmp = 0;
7506 }
7507 } else {
7508 /* Alphanumeric sorting */
7509 if (server.sort_bypattern) {
7510 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7511 /* At least one compare object is NULL */
7512 if (so1->u.cmpobj == so2->u.cmpobj)
7513 cmp = 0;
7514 else if (so1->u.cmpobj == NULL)
7515 cmp = -1;
7516 else
7517 cmp = 1;
7518 } else {
7519 /* We have both the objects, use strcoll */
7520 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7521 }
7522 } else {
7523 /* Compare elements directly. */
7524 cmp = compareStringObjects(so1->obj,so2->obj);
7525 }
7526 }
7527 return server.sort_desc ? -cmp : cmp;
7528 }
7529
7530 /* The SORT command is the most complex command in Redis. Warning: this code
7531 * is optimized for speed and a bit less for readability */
7532 static void sortCommand(redisClient *c) {
7533 list *operations;
7534 unsigned int outputlen = 0;
7535 int desc = 0, alpha = 0;
7536 int limit_start = 0, limit_count = -1, start, end;
7537 int j, dontsort = 0, vectorlen;
7538 int getop = 0; /* GET operation counter */
7539 robj *sortval, *sortby = NULL, *storekey = NULL;
7540 redisSortObject *vector; /* Resulting vector to sort */
7541
7542 /* Lookup the key to sort. It must be of the right types */
7543 sortval = lookupKeyRead(c->db,c->argv[1]);
7544 if (sortval == NULL) {
7545 addReply(c,shared.emptymultibulk);
7546 return;
7547 }
7548 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7549 sortval->type != REDIS_ZSET)
7550 {
7551 addReply(c,shared.wrongtypeerr);
7552 return;
7553 }
7554
7555 /* Create a list of operations to perform for every sorted element.
7556 * Operations can be GET/DEL/INCR/DECR */
7557 operations = listCreate();
7558 listSetFreeMethod(operations,zfree);
7559 j = 2;
7560
7561 /* Now we need to protect sortval incrementing its count, in the future
7562 * SORT may have options able to overwrite/delete keys during the sorting
7563 * and the sorted key itself may get destroied */
7564 incrRefCount(sortval);
7565
7566 /* The SORT command has an SQL-alike syntax, parse it */
7567 while(j < c->argc) {
7568 int leftargs = c->argc-j-1;
7569 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7570 desc = 0;
7571 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7572 desc = 1;
7573 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7574 alpha = 1;
7575 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7576 limit_start = atoi(c->argv[j+1]->ptr);
7577 limit_count = atoi(c->argv[j+2]->ptr);
7578 j+=2;
7579 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7580 storekey = c->argv[j+1];
7581 j++;
7582 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7583 sortby = c->argv[j+1];
7584 /* If the BY pattern does not contain '*', i.e. it is constant,
7585 * we don't need to sort nor to lookup the weight keys. */
7586 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7587 j++;
7588 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7589 listAddNodeTail(operations,createSortOperation(
7590 REDIS_SORT_GET,c->argv[j+1]));
7591 getop++;
7592 j++;
7593 } else {
7594 decrRefCount(sortval);
7595 listRelease(operations);
7596 addReply(c,shared.syntaxerr);
7597 return;
7598 }
7599 j++;
7600 }
7601
7602 /* Load the sorting vector with all the objects to sort */
7603 switch(sortval->type) {
7604 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
7605 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7606 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7607 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7608 }
7609 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7610 j = 0;
7611
7612 if (sortval->type == REDIS_LIST) {
7613 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7614 listTypeEntry entry;
7615 while(listTypeNext(li,&entry)) {
7616 vector[j].obj = listTypeGet(&entry);
7617 vector[j].u.score = 0;
7618 vector[j].u.cmpobj = NULL;
7619 j++;
7620 }
7621 listTypeReleaseIterator(li);
7622 } else {
7623 dict *set;
7624 dictIterator *di;
7625 dictEntry *setele;
7626
7627 if (sortval->type == REDIS_SET) {
7628 set = sortval->ptr;
7629 } else {
7630 zset *zs = sortval->ptr;
7631 set = zs->dict;
7632 }
7633
7634 di = dictGetIterator(set);
7635 while((setele = dictNext(di)) != NULL) {
7636 vector[j].obj = dictGetEntryKey(setele);
7637 vector[j].u.score = 0;
7638 vector[j].u.cmpobj = NULL;
7639 j++;
7640 }
7641 dictReleaseIterator(di);
7642 }
7643 redisAssert(j == vectorlen);
7644
7645 /* Now it's time to load the right scores in the sorting vector */
7646 if (dontsort == 0) {
7647 for (j = 0; j < vectorlen; j++) {
7648 robj *byval;
7649 if (sortby) {
7650 /* lookup value to sort by */
7651 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7652 if (!byval) continue;
7653 } else {
7654 /* use object itself to sort by */
7655 byval = vector[j].obj;
7656 }
7657
7658 if (alpha) {
7659 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7660 } else {
7661 if (byval->encoding == REDIS_ENCODING_RAW) {
7662 vector[j].u.score = strtod(byval->ptr,NULL);
7663 } else if (byval->encoding == REDIS_ENCODING_INT) {
7664 /* Don't need to decode the object if it's
7665 * integer-encoded (the only encoding supported) so
7666 * far. We can just cast it */
7667 vector[j].u.score = (long)byval->ptr;
7668 } else {
7669 redisAssert(1 != 1);
7670 }
7671 }
7672
7673 /* when the object was retrieved using lookupKeyByPattern,
7674 * its refcount needs to be decreased. */
7675 if (sortby) {
7676 decrRefCount(byval);
7677 }
7678 }
7679 }
7680
7681 /* We are ready to sort the vector... perform a bit of sanity check
7682 * on the LIMIT option too. We'll use a partial version of quicksort. */
7683 start = (limit_start < 0) ? 0 : limit_start;
7684 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7685 if (start >= vectorlen) {
7686 start = vectorlen-1;
7687 end = vectorlen-2;
7688 }
7689 if (end >= vectorlen) end = vectorlen-1;
7690
7691 if (dontsort == 0) {
7692 server.sort_desc = desc;
7693 server.sort_alpha = alpha;
7694 server.sort_bypattern = sortby ? 1 : 0;
7695 if (sortby && (start != 0 || end != vectorlen-1))
7696 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7697 else
7698 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7699 }
7700
7701 /* Send command output to the output buffer, performing the specified
7702 * GET/DEL/INCR/DECR operations if any. */
7703 outputlen = getop ? getop*(end-start+1) : end-start+1;
7704 if (storekey == NULL) {
7705 /* STORE option not specified, sent the sorting result to client */
7706 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7707 for (j = start; j <= end; j++) {
7708 listNode *ln;
7709 listIter li;
7710
7711 if (!getop) addReplyBulk(c,vector[j].obj);
7712 listRewind(operations,&li);
7713 while((ln = listNext(&li))) {
7714 redisSortOperation *sop = ln->value;
7715 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7716 vector[j].obj);
7717
7718 if (sop->type == REDIS_SORT_GET) {
7719 if (!val) {
7720 addReply(c,shared.nullbulk);
7721 } else {
7722 addReplyBulk(c,val);
7723 decrRefCount(val);
7724 }
7725 } else {
7726 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7727 }
7728 }
7729 }
7730 } else {
7731 robj *sobj = createZiplistObject();
7732
7733 /* STORE option specified, set the sorting result as a List object */
7734 for (j = start; j <= end; j++) {
7735 listNode *ln;
7736 listIter li;
7737
7738 if (!getop) {
7739 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
7740 } else {
7741 listRewind(operations,&li);
7742 while((ln = listNext(&li))) {
7743 redisSortOperation *sop = ln->value;
7744 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7745 vector[j].obj);
7746
7747 if (sop->type == REDIS_SORT_GET) {
7748 if (!val) val = createStringObject("",0);
7749
7750 /* listTypePush does an incrRefCount, so we should take care
7751 * care of the incremented refcount caused by either
7752 * lookupKeyByPattern or createStringObject("",0) */
7753 listTypePush(sobj,val,REDIS_TAIL);
7754 decrRefCount(val);
7755 } else {
7756 /* always fails */
7757 redisAssert(sop->type == REDIS_SORT_GET);
7758 }
7759 }
7760 }
7761 }
7762 dbReplace(c->db,storekey,sobj);
7763 /* Note: we add 1 because the DB is dirty anyway since even if the
7764 * SORT result is empty a new key is set and maybe the old content
7765 * replaced. */
7766 server.dirty += 1+outputlen;
7767 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7768 }
7769
7770 /* Cleanup */
7771 if (sortval->type == REDIS_LIST)
7772 for (j = 0; j < vectorlen; j++)
7773 decrRefCount(vector[j].obj);
7774 decrRefCount(sortval);
7775 listRelease(operations);
7776 for (j = 0; j < vectorlen; j++) {
7777 if (alpha && vector[j].u.cmpobj)
7778 decrRefCount(vector[j].u.cmpobj);
7779 }
7780 zfree(vector);
7781 }
7782
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4K, 100M, 2G, 1.5T and so forth.
 *
 * 's' is the destination buffer and must be provided by the caller; the
 * function has no way to check its size, so it has to be large enough for
 * the longest output (a 20 digit byte count plus the unit and terminator).
 * 'n' is the number of bytes to format.
 *
 * Values below 1024 are printed as an integer followed by 'B'. Larger values
 * are divided by the biggest fitting power of 1024 and printed with two
 * decimals. The buffer is always written, whatever the magnitude of 'n':
 * values too large for the known units fall back to the plain byte count. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Beyond the known units: never leave the buffer unwritten. */
        sprintf(s,"%lluB",n);
    }
}
7803
7804 /* Create the string returned by the INFO command. This is decoupled
7805 * by the INFO command itself as we need to report the same information
7806 * on memory corruption problems. */
7807 static sds genRedisInfoString(void) {
7808 sds info;
7809 time_t uptime = time(NULL)-server.stat_starttime;
7810 int j;
7811 char hmem[64];
7812
7813 bytesToHuman(hmem,zmalloc_used_memory());
7814 info = sdscatprintf(sdsempty(),
7815 "redis_version:%s\r\n"
7816 "redis_git_sha1:%s\r\n"
7817 "redis_git_dirty:%d\r\n"
7818 "arch_bits:%s\r\n"
7819 "multiplexing_api:%s\r\n"
7820 "process_id:%ld\r\n"
7821 "uptime_in_seconds:%ld\r\n"
7822 "uptime_in_days:%ld\r\n"
7823 "connected_clients:%d\r\n"
7824 "connected_slaves:%d\r\n"
7825 "blocked_clients:%d\r\n"
7826 "used_memory:%zu\r\n"
7827 "used_memory_human:%s\r\n"
7828 "changes_since_last_save:%lld\r\n"
7829 "bgsave_in_progress:%d\r\n"
7830 "last_save_time:%ld\r\n"
7831 "bgrewriteaof_in_progress:%d\r\n"
7832 "total_connections_received:%lld\r\n"
7833 "total_commands_processed:%lld\r\n"
7834 "expired_keys:%lld\r\n"
7835 "hash_max_zipmap_entries:%zu\r\n"
7836 "hash_max_zipmap_value:%zu\r\n"
7837 "pubsub_channels:%ld\r\n"
7838 "pubsub_patterns:%u\r\n"
7839 "vm_enabled:%d\r\n"
7840 "role:%s\r\n"
7841 ,REDIS_VERSION,
7842 REDIS_GIT_SHA1,
7843 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7844 (sizeof(long) == 8) ? "64" : "32",
7845 aeGetApiName(),
7846 (long) getpid(),
7847 uptime,
7848 uptime/(3600*24),
7849 listLength(server.clients)-listLength(server.slaves),
7850 listLength(server.slaves),
7851 server.blpop_blocked_clients,
7852 zmalloc_used_memory(),
7853 hmem,
7854 server.dirty,
7855 server.bgsavechildpid != -1,
7856 server.lastsave,
7857 server.bgrewritechildpid != -1,
7858 server.stat_numconnections,
7859 server.stat_numcommands,
7860 server.stat_expiredkeys,
7861 server.hash_max_zipmap_entries,
7862 server.hash_max_zipmap_value,
7863 dictSize(server.pubsub_channels),
7864 listLength(server.pubsub_patterns),
7865 server.vm_enabled != 0,
7866 server.masterhost == NULL ? "master" : "slave"
7867 );
7868 if (server.masterhost) {
7869 info = sdscatprintf(info,
7870 "master_host:%s\r\n"
7871 "master_port:%d\r\n"
7872 "master_link_status:%s\r\n"
7873 "master_last_io_seconds_ago:%d\r\n"
7874 ,server.masterhost,
7875 server.masterport,
7876 (server.replstate == REDIS_REPL_CONNECTED) ?
7877 "up" : "down",
7878 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7879 );
7880 }
7881 if (server.vm_enabled) {
7882 lockThreadedIO();
7883 info = sdscatprintf(info,
7884 "vm_conf_max_memory:%llu\r\n"
7885 "vm_conf_page_size:%llu\r\n"
7886 "vm_conf_pages:%llu\r\n"
7887 "vm_stats_used_pages:%llu\r\n"
7888 "vm_stats_swapped_objects:%llu\r\n"
7889 "vm_stats_swappin_count:%llu\r\n"
7890 "vm_stats_swappout_count:%llu\r\n"
7891 "vm_stats_io_newjobs_len:%lu\r\n"
7892 "vm_stats_io_processing_len:%lu\r\n"
7893 "vm_stats_io_processed_len:%lu\r\n"
7894 "vm_stats_io_active_threads:%lu\r\n"
7895 "vm_stats_blocked_clients:%lu\r\n"
7896 ,(unsigned long long) server.vm_max_memory,
7897 (unsigned long long) server.vm_page_size,
7898 (unsigned long long) server.vm_pages,
7899 (unsigned long long) server.vm_stats_used_pages,
7900 (unsigned long long) server.vm_stats_swapped_objects,
7901 (unsigned long long) server.vm_stats_swapins,
7902 (unsigned long long) server.vm_stats_swapouts,
7903 (unsigned long) listLength(server.io_newjobs),
7904 (unsigned long) listLength(server.io_processing),
7905 (unsigned long) listLength(server.io_processed),
7906 (unsigned long) server.io_active_threads,
7907 (unsigned long) server.vm_blocked_clients
7908 );
7909 unlockThreadedIO();
7910 }
7911 for (j = 0; j < server.dbnum; j++) {
7912 long long keys, vkeys;
7913
7914 keys = dictSize(server.db[j].dict);
7915 vkeys = dictSize(server.db[j].expires);
7916 if (keys || vkeys) {
7917 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7918 j, keys, vkeys);
7919 }
7920 }
7921 return info;
7922 }
7923
7924 static void infoCommand(redisClient *c) {
7925 sds info = genRedisInfoString();
7926 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7927 (unsigned long)sdslen(info)));
7928 addReplySds(c,info);
7929 addReply(c,shared.crlf);
7930 }
7931
7932 static void monitorCommand(redisClient *c) {
7933 /* ignore MONITOR if aleady slave or in monitor mode */
7934 if (c->flags & REDIS_SLAVE) return;
7935
7936 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7937 c->slaveseldb = 0;
7938 listAddNodeTail(server.monitors,c);
7939 addReply(c,shared.ok);
7940 }
7941
7942 /* ================================= Expire ================================= */
7943 static int removeExpire(redisDb *db, robj *key) {
7944 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7945 return 1;
7946 } else {
7947 return 0;
7948 }
7949 }
7950
7951 static int setExpire(redisDb *db, robj *key, time_t when) {
7952 sds copy = sdsdup(key->ptr);
7953 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7954 sdsfree(copy);
7955 return 0;
7956 } else {
7957 return 1;
7958 }
7959 }
7960
7961 /* Return the expire time of the specified key, or -1 if no expire
7962 * is associated with this key (i.e. the key is non volatile) */
7963 static time_t getExpire(redisDb *db, robj *key) {
7964 dictEntry *de;
7965
7966 /* No expire? return ASAP */
7967 if (dictSize(db->expires) == 0 ||
7968 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7969
7970 return (time_t) dictGetEntryVal(de);
7971 }
7972
7973 static int expireIfNeeded(redisDb *db, robj *key) {
7974 time_t when;
7975 dictEntry *de;
7976
7977 /* No expire? return ASAP */
7978 if (dictSize(db->expires) == 0 ||
7979 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7980
7981 /* Lookup the expire */
7982 when = (time_t) dictGetEntryVal(de);
7983 if (time(NULL) <= when) return 0;
7984
7985 /* Delete the key */
7986 dbDelete(db,key);
7987 server.stat_expiredkeys++;
7988 return 1;
7989 }
7990
7991 static int deleteIfVolatile(redisDb *db, robj *key) {
7992 dictEntry *de;
7993
7994 /* No expire? return ASAP */
7995 if (dictSize(db->expires) == 0 ||
7996 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7997
7998 /* Delete the key */
7999 server.dirty++;
8000 server.stat_expiredkeys++;
8001 dictDelete(db->expires,key->ptr);
8002 return dictDelete(db->dict,key->ptr) == DICT_OK;
8003 }
8004
8005 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
8006 dictEntry *de;
8007 time_t seconds;
8008
8009 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
8010
8011 seconds -= offset;
8012
8013 de = dictFind(c->db->dict,key->ptr);
8014 if (de == NULL) {
8015 addReply(c,shared.czero);
8016 return;
8017 }
8018 if (seconds <= 0) {
8019 if (dbDelete(c->db,key)) server.dirty++;
8020 addReply(c, shared.cone);
8021 return;
8022 } else {
8023 time_t when = time(NULL)+seconds;
8024 if (setExpire(c->db,key,when)) {
8025 addReply(c,shared.cone);
8026 server.dirty++;
8027 } else {
8028 addReply(c,shared.czero);
8029 }
8030 return;
8031 }
8032 }
8033
8034 static void expireCommand(redisClient *c) {
8035 expireGenericCommand(c,c->argv[1],c->argv[2],0);
8036 }
8037
8038 static void expireatCommand(redisClient *c) {
8039 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
8040 }
8041
8042 static void ttlCommand(redisClient *c) {
8043 time_t expire;
8044 int ttl = -1;
8045
8046 expire = getExpire(c->db,c->argv[1]);
8047 if (expire != -1) {
8048 ttl = (int) (expire-time(NULL));
8049 if (ttl < 0) ttl = -1;
8050 }
8051 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
8052 }
8053
8054 /* ================================ MULTI/EXEC ============================== */
8055
8056 /* Client state initialization for MULTI/EXEC */
8057 static void initClientMultiState(redisClient *c) {
8058 c->mstate.commands = NULL;
8059 c->mstate.count = 0;
8060 }
8061
8062 /* Release all the resources associated with MULTI/EXEC state */
8063 static void freeClientMultiState(redisClient *c) {
8064 int j;
8065
8066 for (j = 0; j < c->mstate.count; j++) {
8067 int i;
8068 multiCmd *mc = c->mstate.commands+j;
8069
8070 for (i = 0; i < mc->argc; i++)
8071 decrRefCount(mc->argv[i]);
8072 zfree(mc->argv);
8073 }
8074 zfree(c->mstate.commands);
8075 }
8076
8077 /* Add a new command into the MULTI commands queue */
8078 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
8079 multiCmd *mc;
8080 int j;
8081
8082 c->mstate.commands = zrealloc(c->mstate.commands,
8083 sizeof(multiCmd)*(c->mstate.count+1));
8084 mc = c->mstate.commands+c->mstate.count;
8085 mc->cmd = cmd;
8086 mc->argc = c->argc;
8087 mc->argv = zmalloc(sizeof(robj*)*c->argc);
8088 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8089 for (j = 0; j < c->argc; j++)
8090 incrRefCount(mc->argv[j]);
8091 c->mstate.count++;
8092 }
8093
8094 static void multiCommand(redisClient *c) {
8095 if (c->flags & REDIS_MULTI) {
8096 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8097 return;
8098 }
8099 c->flags |= REDIS_MULTI;
8100 addReply(c,shared.ok);
8101 }
8102
8103 static void discardCommand(redisClient *c) {
8104 if (!(c->flags & REDIS_MULTI)) {
8105 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8106 return;
8107 }
8108
8109 freeClientMultiState(c);
8110 initClientMultiState(c);
8111 c->flags &= (~REDIS_MULTI);
8112 unwatchAllKeys(c);
8113 addReply(c,shared.ok);
8114 }
8115
8116 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8117 * implememntation for more information. */
8118 static void execCommandReplicateMulti(redisClient *c) {
8119 struct redisCommand *cmd;
8120 robj *multistring = createStringObject("MULTI",5);
8121
8122 cmd = lookupCommand("multi");
8123 if (server.appendonly)
8124 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8125 if (listLength(server.slaves))
8126 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8127 decrRefCount(multistring);
8128 }
8129
8130 static void execCommand(redisClient *c) {
8131 int j;
8132 robj **orig_argv;
8133 int orig_argc;
8134
8135 if (!(c->flags & REDIS_MULTI)) {
8136 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8137 return;
8138 }
8139
8140 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8141 * A failed EXEC will return a multi bulk nil object. */
8142 if (c->flags & REDIS_DIRTY_CAS) {
8143 freeClientMultiState(c);
8144 initClientMultiState(c);
8145 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8146 unwatchAllKeys(c);
8147 addReply(c,shared.nullmultibulk);
8148 return;
8149 }
8150
8151 /* Replicate a MULTI request now that we are sure the block is executed.
8152 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8153 * both the AOF and the replication link will have the same consistency
8154 * and atomicity guarantees. */
8155 execCommandReplicateMulti(c);
8156
8157 /* Exec all the queued commands */
8158 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8159 orig_argv = c->argv;
8160 orig_argc = c->argc;
8161 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8162 for (j = 0; j < c->mstate.count; j++) {
8163 c->argc = c->mstate.commands[j].argc;
8164 c->argv = c->mstate.commands[j].argv;
8165 call(c,c->mstate.commands[j].cmd);
8166 }
8167 c->argv = orig_argv;
8168 c->argc = orig_argc;
8169 freeClientMultiState(c);
8170 initClientMultiState(c);
8171 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8172 /* Make sure the EXEC command is always replicated / AOF, since we
8173 * always send the MULTI command (we can't know beforehand if the
8174 * next operations will contain at least a modification to the DB). */
8175 server.dirty++;
8176 }
8177
8178 /* =========================== Blocking Operations ========================= */
8179
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8208
8209 /* Set a client in blocking mode for the specified key, with the specified
8210 * timeout */
8211 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
8212 dictEntry *de;
8213 list *l;
8214 int j;
8215
8216 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8217 c->blocking_keys_num = numkeys;
8218 c->blockingto = timeout;
8219 for (j = 0; j < numkeys; j++) {
8220 /* Add the key in the client structure, to map clients -> keys */
8221 c->blocking_keys[j] = keys[j];
8222 incrRefCount(keys[j]);
8223
8224 /* And in the other "side", to map keys -> clients */
8225 de = dictFind(c->db->blocking_keys,keys[j]);
8226 if (de == NULL) {
8227 int retval;
8228
8229 /* For every key we take a list of clients blocked for it */
8230 l = listCreate();
8231 retval = dictAdd(c->db->blocking_keys,keys[j],l);
8232 incrRefCount(keys[j]);
8233 assert(retval == DICT_OK);
8234 } else {
8235 l = dictGetEntryVal(de);
8236 }
8237 listAddNodeTail(l,c);
8238 }
8239 /* Mark the client as a blocked client */
8240 c->flags |= REDIS_BLOCKED;
8241 server.blpop_blocked_clients++;
8242 }
8243
8244 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8245 static void unblockClientWaitingData(redisClient *c) {
8246 dictEntry *de;
8247 list *l;
8248 int j;
8249
8250 assert(c->blocking_keys != NULL);
8251 /* The client may wait for multiple keys, so unblock it for every key. */
8252 for (j = 0; j < c->blocking_keys_num; j++) {
8253 /* Remove this client from the list of clients waiting for this key. */
8254 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
8255 assert(de != NULL);
8256 l = dictGetEntryVal(de);
8257 listDelNode(l,listSearchKey(l,c));
8258 /* If the list is empty we need to remove it to avoid wasting memory */
8259 if (listLength(l) == 0)
8260 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8261 decrRefCount(c->blocking_keys[j]);
8262 }
8263 /* Cleanup the client structure */
8264 zfree(c->blocking_keys);
8265 c->blocking_keys = NULL;
8266 c->flags &= (~REDIS_BLOCKED);
8267 server.blpop_blocked_clients--;
8268 /* We want to process data if there is some command waiting
8269 * in the input buffer. Note that this is safe even if
8270 * unblockClientWaitingData() gets called from freeClient() because
8271 * freeClient() will be smart enough to call this function
8272 * *after* c->querybuf was set to NULL. */
8273 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8274 }
8275
8276 /* This should be called from any function PUSHing into lists.
8277 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8278 * 'ele' is the element pushed.
8279 *
8280 * If the function returns 0 there was no client waiting for a list push
8281 * against this key.
8282 *
8283 * If the function returns 1 there was a client waiting for a list push
8284 * against this key, the element was passed to this client thus it's not
8285 * needed to actually add it to the list and the caller should return asap. */
8286 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8287 struct dictEntry *de;
8288 redisClient *receiver;
8289 list *l;
8290 listNode *ln;
8291
8292 de = dictFind(c->db->blocking_keys,key);
8293 if (de == NULL) return 0;
8294 l = dictGetEntryVal(de);
8295 ln = listFirst(l);
8296 assert(ln != NULL);
8297 receiver = ln->value;
8298
8299 addReplySds(receiver,sdsnew("*2\r\n"));
8300 addReplyBulk(receiver,key);
8301 addReplyBulk(receiver,ele);
8302 unblockClientWaitingData(receiver);
8303 return 1;
8304 }
8305
8306 /* Blocking RPOP/LPOP */
8307 static void blockingPopGenericCommand(redisClient *c, int where) {
8308 robj *o;
8309 time_t timeout;
8310 int j;
8311
8312 for (j = 1; j < c->argc-1; j++) {
8313 o = lookupKeyWrite(c->db,c->argv[j]);
8314 if (o != NULL) {
8315 if (o->type != REDIS_LIST) {
8316 addReply(c,shared.wrongtypeerr);
8317 return;
8318 } else {
8319 list *list = o->ptr;
8320 if (listLength(list) != 0) {
8321 /* If the list contains elements fall back to the usual
8322 * non-blocking POP operation */
8323 robj *argv[2], **orig_argv;
8324 int orig_argc;
8325
8326 /* We need to alter the command arguments before to call
8327 * popGenericCommand() as the command takes a single key. */
8328 orig_argv = c->argv;
8329 orig_argc = c->argc;
8330 argv[1] = c->argv[j];
8331 c->argv = argv;
8332 c->argc = 2;
8333
8334 /* Also the return value is different, we need to output
8335 * the multi bulk reply header and the key name. The
8336 * "real" command will add the last element (the value)
8337 * for us. If this souds like an hack to you it's just
8338 * because it is... */
8339 addReplySds(c,sdsnew("*2\r\n"));
8340 addReplyBulk(c,argv[1]);
8341 popGenericCommand(c,where);
8342
8343 /* Fix the client structure with the original stuff */
8344 c->argv = orig_argv;
8345 c->argc = orig_argc;
8346 return;
8347 }
8348 }
8349 }
8350 }
8351 /* If the list is empty or the key does not exists we must block */
8352 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8353 if (timeout > 0) timeout += time(NULL);
8354 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8355 }
8356
8357 static void blpopCommand(redisClient *c) {
8358 blockingPopGenericCommand(c,REDIS_HEAD);
8359 }
8360
8361 static void brpopCommand(redisClient *c) {
8362 blockingPopGenericCommand(c,REDIS_TAIL);
8363 }
8364
8365 /* =============================== Replication ============================= */
8366
8367 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8368 ssize_t nwritten, ret = size;
8369 time_t start = time(NULL);
8370
8371 timeout++;
8372 while(size) {
8373 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8374 nwritten = write(fd,ptr,size);
8375 if (nwritten == -1) return -1;
8376 ptr += nwritten;
8377 size -= nwritten;
8378 }
8379 if ((time(NULL)-start) > timeout) {
8380 errno = ETIMEDOUT;
8381 return -1;
8382 }
8383 }
8384 return ret;
8385 }
8386
8387 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8388 ssize_t nread, totread = 0;
8389 time_t start = time(NULL);
8390
8391 timeout++;
8392 while(size) {
8393 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8394 nread = read(fd,ptr,size);
8395 if (nread == -1) return -1;
8396 ptr += nread;
8397 size -= nread;
8398 totread += nread;
8399 }
8400 if ((time(NULL)-start) > timeout) {
8401 errno = ETIMEDOUT;
8402 return -1;
8403 }
8404 }
8405 return totread;
8406 }
8407
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time through syncRead(), each read using 'timeout'.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * preceding it is stripped as well. Reading also stops when the buffer is
 * full, that is after 'size'-1 bytes have been stored. The buffer is always
 * null terminated.
 *
 * Returns the number of bytes stored (a stripped '\r' is still counted),
 * or -1 if syncRead() fails or if 'size' is not positive (errno is set to
 * EINVAL in the latter case). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* There must be room at least for the null terminator. */
    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';

    size--; /* Reserve one byte for the terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored: without this the loop
             * never terminates on its own and a line longer than the
             * buffer is written past its end. */
            size--;
        }
    }
    return nread;
}
8428
8429 static void syncCommand(redisClient *c) {
8430 /* ignore SYNC if aleady slave or in monitor mode */
8431 if (c->flags & REDIS_SLAVE) return;
8432
8433 /* SYNC can't be issued when the server has pending data to send to
8434 * the client about already issued commands. We need a fresh reply
8435 * buffer registering the differences between the BGSAVE and the current
8436 * dataset, so that we can copy to other slaves if needed. */
8437 if (listLength(c->reply) != 0) {
8438 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8439 return;
8440 }
8441
8442 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8443 /* Here we need to check if there is a background saving operation
8444 * in progress, or if it is required to start one */
8445 if (server.bgsavechildpid != -1) {
8446 /* Ok a background save is in progress. Let's check if it is a good
8447 * one for replication, i.e. if there is another slave that is
8448 * registering differences since the server forked to save */
8449 redisClient *slave;
8450 listNode *ln;
8451 listIter li;
8452
8453 listRewind(server.slaves,&li);
8454 while((ln = listNext(&li))) {
8455 slave = ln->value;
8456 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8457 }
8458 if (ln) {
8459 /* Perfect, the server is already registering differences for
8460 * another slave. Set the right state, and copy the buffer. */
8461 listRelease(c->reply);
8462 c->reply = listDup(slave->reply);
8463 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8464 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8465 } else {
8466 /* No way, we need to wait for the next BGSAVE in order to
8467 * register differences */
8468 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8469 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8470 }
8471 } else {
8472 /* Ok we don't have a BGSAVE in progress, let's start one */
8473 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8474 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8475 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8476 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8477 return;
8478 }
8479 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8480 }
8481 c->repldbfd = -1;
8482 c->flags |= REDIS_SLAVE;
8483 c->slaveseldb = 0;
8484 listAddNodeTail(server.slaves,c);
8485 return;
8486 }
8487
8488 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8489 redisClient *slave = privdata;
8490 REDIS_NOTUSED(el);
8491 REDIS_NOTUSED(mask);
8492 char buf[REDIS_IOBUF_LEN];
8493 ssize_t nwritten, buflen;
8494
8495 if (slave->repldboff == 0) {
8496 /* Write the bulk write count before to transfer the DB. In theory here
8497 * we don't know how much room there is in the output buffer of the
8498 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8499 * operations) will never be smaller than the few bytes we need. */
8500 sds bulkcount;
8501
8502 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8503 slave->repldbsize);
8504 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8505 {
8506 sdsfree(bulkcount);
8507 freeClient(slave);
8508 return;
8509 }
8510 sdsfree(bulkcount);
8511 }
8512 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8513 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8514 if (buflen <= 0) {
8515 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8516 (buflen == 0) ? "premature EOF" : strerror(errno));
8517 freeClient(slave);
8518 return;
8519 }
8520 if ((nwritten = write(fd,buf,buflen)) == -1) {
8521 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8522 strerror(errno));
8523 freeClient(slave);
8524 return;
8525 }
8526 slave->repldboff += nwritten;
8527 if (slave->repldboff == slave->repldbsize) {
8528 close(slave->repldbfd);
8529 slave->repldbfd = -1;
8530 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8531 slave->replstate = REDIS_REPL_ONLINE;
8532 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8533 sendReplyToClient, slave) == AE_ERR) {
8534 freeClient(slave);
8535 return;
8536 }
8537 addReplySds(slave,sdsempty());
8538 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8539 }
8540 }
8541
8542 /* This function is called at the end of every backgrond saving.
8543 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8544 * otherwise REDIS_ERR is passed to the function.
8545 *
8546 * The goal of this function is to handle slaves waiting for a successful
8547 * background saving in order to perform non-blocking synchronization. */
8548 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8549 listNode *ln;
8550 int startbgsave = 0;
8551 listIter li;
8552
8553 listRewind(server.slaves,&li);
8554 while((ln = listNext(&li))) {
8555 redisClient *slave = ln->value;
8556
8557 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8558 startbgsave = 1;
8559 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8560 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8561 struct redis_stat buf;
8562
8563 if (bgsaveerr != REDIS_OK) {
8564 freeClient(slave);
8565 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8566 continue;
8567 }
8568 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8569 redis_fstat(slave->repldbfd,&buf) == -1) {
8570 freeClient(slave);
8571 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8572 continue;
8573 }
8574 slave->repldboff = 0;
8575 slave->repldbsize = buf.st_size;
8576 slave->replstate = REDIS_REPL_SEND_BULK;
8577 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8578 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8579 freeClient(slave);
8580 continue;
8581 }
8582 }
8583 }
8584 if (startbgsave) {
8585 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8586 listIter li;
8587
8588 listRewind(server.slaves,&li);
8589 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8590 while((ln = listNext(&li))) {
8591 redisClient *slave = ln->value;
8592
8593 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8594 freeClient(slave);
8595 }
8596 }
8597 }
8598 }
8599
8600 static int syncWithMaster(void) {
8601 char buf[1024], tmpfile[256], authcmd[1024];
8602 long dumpsize;
8603 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8604 int dfd, maxtries = 5;
8605
8606 if (fd == -1) {
8607 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8608 strerror(errno));
8609 return REDIS_ERR;
8610 }
8611
8612 /* AUTH with the master if required. */
8613 if(server.masterauth) {
8614 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8615 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8616 close(fd);
8617 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8618 strerror(errno));
8619 return REDIS_ERR;
8620 }
8621 /* Read the AUTH result. */
8622 if (syncReadLine(fd,buf,1024,3600) == -1) {
8623 close(fd);
8624 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8625 strerror(errno));
8626 return REDIS_ERR;
8627 }
8628 if (buf[0] != '+') {
8629 close(fd);
8630 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8631 return REDIS_ERR;
8632 }
8633 }
8634
8635 /* Issue the SYNC command */
8636 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8637 close(fd);
8638 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8639 strerror(errno));
8640 return REDIS_ERR;
8641 }
8642 /* Read the bulk write count */
8643 if (syncReadLine(fd,buf,1024,3600) == -1) {
8644 close(fd);
8645 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8646 strerror(errno));
8647 return REDIS_ERR;
8648 }
8649 if (buf[0] != '$') {
8650 close(fd);
8651 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8652 return REDIS_ERR;
8653 }
8654 dumpsize = strtol(buf+1,NULL,10);
8655 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8656 /* Read the bulk write data on a temp file */
8657 while(maxtries--) {
8658 snprintf(tmpfile,256,
8659 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8660 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8661 if (dfd != -1) break;
8662 sleep(1);
8663 }
8664 if (dfd == -1) {
8665 close(fd);
8666 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8667 return REDIS_ERR;
8668 }
8669 while(dumpsize) {
8670 int nread, nwritten;
8671
8672 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8673 if (nread == -1) {
8674 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8675 strerror(errno));
8676 close(fd);
8677 close(dfd);
8678 return REDIS_ERR;
8679 }
8680 nwritten = write(dfd,buf,nread);
8681 if (nwritten == -1) {
8682 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8683 close(fd);
8684 close(dfd);
8685 return REDIS_ERR;
8686 }
8687 dumpsize -= nread;
8688 }
8689 close(dfd);
8690 if (rename(tmpfile,server.dbfilename) == -1) {
8691 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8692 unlink(tmpfile);
8693 close(fd);
8694 return REDIS_ERR;
8695 }
8696 emptyDb();
8697 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8698 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8699 close(fd);
8700 return REDIS_ERR;
8701 }
8702 server.master = createClient(fd);
8703 server.master->flags |= REDIS_MASTER;
8704 server.master->authenticated = 1;
8705 server.replstate = REDIS_REPL_CONNECTED;
8706 return REDIS_OK;
8707 }
8708
8709 static void slaveofCommand(redisClient *c) {
8710 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8711 !strcasecmp(c->argv[2]->ptr,"one")) {
8712 if (server.masterhost) {
8713 sdsfree(server.masterhost);
8714 server.masterhost = NULL;
8715 if (server.master) freeClient(server.master);
8716 server.replstate = REDIS_REPL_NONE;
8717 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8718 }
8719 } else {
8720 sdsfree(server.masterhost);
8721 server.masterhost = sdsdup(c->argv[1]->ptr);
8722 server.masterport = atoi(c->argv[2]->ptr);
8723 if (server.master) freeClient(server.master);
8724 server.replstate = REDIS_REPL_CONNECT;
8725 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8726 server.masterhost, server.masterport);
8727 }
8728 addReply(c,shared.ok);
8729 }
8730
8731 /* ============================ Maxmemory directive ======================== */
8732
8733 /* Try to free one object form the pre-allocated objects free list.
8734 * This is useful under low mem conditions as by default we take 1 million
8735 * free objects allocated. On success REDIS_OK is returned, otherwise
8736 * REDIS_ERR. */
8737 static int tryFreeOneObjectFromFreelist(void) {
8738 robj *o;
8739
8740 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8741 if (listLength(server.objfreelist)) {
8742 listNode *head = listFirst(server.objfreelist);
8743 o = listNodeValue(head);
8744 listDelNode(server.objfreelist,head);
8745 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8746 zfree(o);
8747 return REDIS_OK;
8748 } else {
8749 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8750 return REDIS_ERR;
8751 }
8752 }
8753
8754 /* This function gets called when 'maxmemory' is set on the config file to limit
8755 * the max memory used by the server, and we are out of memory.
8756 * This function will try to, in order:
8757 *
8758 * - Free objects from the free list
8759 * - Try to remove keys with an EXPIRE set
8760 *
8761 * It is not possible to free enough memory to reach used-memory < maxmemory
8762 * the server will start refusing commands that will enlarge even more the
8763 * memory usage.
8764 */
8765 static void freeMemoryIfNeeded(void) {
8766 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8767 int j, k, freed = 0;
8768
8769 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8770 for (j = 0; j < server.dbnum; j++) {
8771 int minttl = -1;
8772 robj *minkey = NULL;
8773 struct dictEntry *de;
8774
8775 if (dictSize(server.db[j].expires)) {
8776 freed = 1;
8777 /* From a sample of three keys drop the one nearest to
8778 * the natural expire */
8779 for (k = 0; k < 3; k++) {
8780 time_t t;
8781
8782 de = dictGetRandomKey(server.db[j].expires);
8783 t = (time_t) dictGetEntryVal(de);
8784 if (minttl == -1 || t < minttl) {
8785 minkey = dictGetEntryKey(de);
8786 minttl = t;
8787 }
8788 }
8789 dbDelete(server.db+j,minkey);
8790 }
8791 }
8792 if (!freed) return; /* nothing to free... */
8793 }
8794 }
8795
8796 /* ============================== Append Only file ========================== */
8797
8798 /* Called when the user switches from "appendonly yes" to "appendonly no"
8799 * at runtime using the CONFIG command. */
8800 static void stopAppendOnly(void) {
8801 flushAppendOnlyFile();
8802 aof_fsync(server.appendfd);
8803 close(server.appendfd);
8804
8805 server.appendfd = -1;
8806 server.appendseldb = -1;
8807 server.appendonly = 0;
8808 /* rewrite operation in progress? kill it, wait child exit */
8809 if (server.bgsavechildpid != -1) {
8810 int statloc;
8811
8812 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8813 wait3(&statloc,0,NULL);
8814 /* reset the buffer accumulating changes while the child saves */
8815 sdsfree(server.bgrewritebuf);
8816 server.bgrewritebuf = sdsempty();
8817 server.bgsavechildpid = -1;
8818 }
8819 }
8820
8821 /* Called when the user switches from "appendonly no" to "appendonly yes"
8822 * at runtime using the CONFIG command. */
8823 static int startAppendOnly(void) {
8824 server.appendonly = 1;
8825 server.lastfsync = time(NULL);
8826 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8827 if (server.appendfd == -1) {
8828 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8829 return REDIS_ERR;
8830 }
8831 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8832 server.appendonly = 0;
8833 close(server.appendfd);
8834 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8835 return REDIS_ERR;
8836 }
8837 return REDIS_OK;
8838 }
8839
8840 /* Write the append only file buffer on disk.
8841 *
8842 * Since we are required to write the AOF before replying to the client,
8843 * and the only way the client socket can get a write is entering when the
8844 * the event loop, we accumulate all the AOF writes in a memory
8845 * buffer and write it on disk using this function just before entering
8846 * the event loop again. */
8847 static void flushAppendOnlyFile(void) {
8848 time_t now;
8849 ssize_t nwritten;
8850
8851 if (sdslen(server.aofbuf) == 0) return;
8852
8853 /* We want to perform a single write. This should be guaranteed atomic
8854 * at least if the filesystem we are writing is a real physical one.
8855 * While this will save us against the server being killed I don't think
8856 * there is much to do about the whole server stopping for power problems
8857 * or alike */
8858 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8859 if (nwritten != (signed)sdslen(server.aofbuf)) {
8860 /* Ooops, we are in troubles. The best thing to do for now is
8861 * aborting instead of giving the illusion that everything is
8862 * working as expected. */
8863 if (nwritten == -1) {
8864 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8865 } else {
8866 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8867 }
8868 exit(1);
8869 }
8870 sdsfree(server.aofbuf);
8871 server.aofbuf = sdsempty();
8872
8873 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8874 * childs performing heavy I/O on disk. */
8875 if (server.no_appendfsync_on_rewrite &&
8876 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8877 return;
8878 /* Fsync if needed */
8879 now = time(NULL);
8880 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8881 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8882 now-server.lastfsync > 1))
8883 {
8884 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8885 * flushing metadata. */
8886 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8887 server.lastfsync = now;
8888 }
8889 }
8890
8891 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8892 int j;
8893 buf = sdscatprintf(buf,"*%d\r\n",argc);
8894 for (j = 0; j < argc; j++) {
8895 robj *o = getDecodedObject(argv[j]);
8896 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8897 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8898 buf = sdscatlen(buf,"\r\n",2);
8899 decrRefCount(o);
8900 }
8901 return buf;
8902 }
8903
8904 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8905 int argc = 3;
8906 long when;
8907 robj *argv[3];
8908
8909 /* Make sure we can use strtol */
8910 seconds = getDecodedObject(seconds);
8911 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8912 decrRefCount(seconds);
8913
8914 argv[0] = createStringObject("EXPIREAT",8);
8915 argv[1] = key;
8916 argv[2] = createObject(REDIS_STRING,
8917 sdscatprintf(sdsempty(),"%ld",when));
8918 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8919 decrRefCount(argv[0]);
8920 decrRefCount(argv[2]);
8921 return buf;
8922 }
8923
8924 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8925 sds buf = sdsempty();
8926 robj *tmpargv[3];
8927
8928 /* The DB this command was targetting is not the same as the last command
8929 * we appendend. To issue a SELECT command is needed. */
8930 if (dictid != server.appendseldb) {
8931 char seldb[64];
8932
8933 snprintf(seldb,sizeof(seldb),"%d",dictid);
8934 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8935 (unsigned long)strlen(seldb),seldb);
8936 server.appendseldb = dictid;
8937 }
8938
8939 if (cmd->proc == expireCommand) {
8940 /* Translate EXPIRE into EXPIREAT */
8941 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8942 } else if (cmd->proc == setexCommand) {
8943 /* Translate SETEX to SET and EXPIREAT */
8944 tmpargv[0] = createStringObject("SET",3);
8945 tmpargv[1] = argv[1];
8946 tmpargv[2] = argv[3];
8947 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8948 decrRefCount(tmpargv[0]);
8949 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8950 } else {
8951 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8952 }
8953
8954 /* Append to the AOF buffer. This will be flushed on disk just before
8955 * of re-entering the event loop, so before the client will get a
8956 * positive reply about the operation performed. */
8957 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8958
8959 /* If a background append only file rewriting is in progress we want to
8960 * accumulate the differences between the child DB and the current one
8961 * in a buffer, so that when the child process will do its work we
8962 * can append the differences to the new append only file. */
8963 if (server.bgrewritechildpid != -1)
8964 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8965
8966 sdsfree(buf);
8967 }
8968
8969 /* In Redis commands are always executed in the context of a client, so in
8970 * order to load the append only file we need to create a fake client. */
8971 static struct redisClient *createFakeClient(void) {
8972 struct redisClient *c = zmalloc(sizeof(*c));
8973
8974 selectDb(c,0);
8975 c->fd = -1;
8976 c->querybuf = sdsempty();
8977 c->argc = 0;
8978 c->argv = NULL;
8979 c->flags = 0;
8980 /* We set the fake client as a slave waiting for the synchronization
8981 * so that Redis will not try to send replies to this client. */
8982 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8983 c->reply = listCreate();
8984 listSetFreeMethod(c->reply,decrRefCount);
8985 listSetDupMethod(c->reply,dupClientReplyValue);
8986 initClientMultiState(c);
8987 return c;
8988 }
8989
8990 static void freeFakeClient(struct redisClient *c) {
8991 sdsfree(c->querybuf);
8992 listRelease(c->reply);
8993 freeClientMultiState(c);
8994 zfree(c);
8995 }
8996
8997 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8998 * error (the append only file is zero-length) REDIS_ERR is returned. On
8999 * fatal error an error message is logged and the program exists. */
9000 int loadAppendOnlyFile(char *filename) {
9001 struct redisClient *fakeClient;
9002 FILE *fp = fopen(filename,"r");
9003 struct redis_stat sb;
9004 int appendonly = server.appendonly;
9005
9006 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
9007 return REDIS_ERR;
9008
9009 if (fp == NULL) {
9010 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
9011 exit(1);
9012 }
9013
9014 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
9015 * to the same file we're about to read. */
9016 server.appendonly = 0;
9017
9018 fakeClient = createFakeClient();
9019 while(1) {
9020 int argc, j;
9021 unsigned long len;
9022 robj **argv;
9023 char buf[128];
9024 sds argsds;
9025 struct redisCommand *cmd;
9026 int force_swapout;
9027
9028 if (fgets(buf,sizeof(buf),fp) == NULL) {
9029 if (feof(fp))
9030 break;
9031 else
9032 goto readerr;
9033 }
9034 if (buf[0] != '*') goto fmterr;
9035 argc = atoi(buf+1);
9036 argv = zmalloc(sizeof(robj*)*argc);
9037 for (j = 0; j < argc; j++) {
9038 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
9039 if (buf[0] != '$') goto fmterr;
9040 len = strtol(buf+1,NULL,10);
9041 argsds = sdsnewlen(NULL,len);
9042 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
9043 argv[j] = createObject(REDIS_STRING,argsds);
9044 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
9045 }
9046
9047 /* Command lookup */
9048 cmd = lookupCommand(argv[0]->ptr);
9049 if (!cmd) {
9050 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
9051 exit(1);
9052 }
9053 /* Try object encoding */
9054 if (cmd->flags & REDIS_CMD_BULK)
9055 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
9056 /* Run the command in the context of a fake client */
9057 fakeClient->argc = argc;
9058 fakeClient->argv = argv;
9059 cmd->proc(fakeClient);
9060 /* Discard the reply objects list from the fake client */
9061 while(listLength(fakeClient->reply))
9062 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
9063 /* Clean up, ready for the next command */
9064 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
9065 zfree(argv);
9066 /* Handle swapping while loading big datasets when VM is on */
9067 force_swapout = 0;
9068 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
9069 force_swapout = 1;
9070
9071 if (server.vm_enabled && force_swapout) {
9072 while (zmalloc_used_memory() > server.vm_max_memory) {
9073 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
9074 }
9075 }
9076 }
9077
9078 /* This point can only be reached when EOF is reached without errors.
9079 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
9080 if (fakeClient->flags & REDIS_MULTI) goto readerr;
9081
9082 fclose(fp);
9083 freeFakeClient(fakeClient);
9084 server.appendonly = appendonly;
9085 return REDIS_OK;
9086
9087 readerr:
9088 if (feof(fp)) {
9089 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9090 } else {
9091 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9092 }
9093 exit(1);
9094 fmterr:
9095 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9096 exit(1);
9097 }
9098
/* Write the binary-safe string 's' of 'len' bytes into 'fp' using the bulk
 * format: $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen;

    /* Build the "$<count>\r\n" prefix by hand. */
    header[0] = '$';
    hlen = 1+ll2string(header+1,sizeof(header)-1,len);
    header[hlen] = '\r';
    header[hlen+1] = '\n';
    hlen += 2;

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* Nothing to write for the payload of an empty string. */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
9113
/* Write the double 'd' into 'fp' in bulk format $<count>\r\n<payload>\r\n
 * where the payload is the "%.17g" representation of the value.
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen, hlen;

    /* The payload buffer already includes the trailing CRLF, that must not
     * be counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    hlen = strlen(header);

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
9124
/* Write the integer 'l' into 'fp' in bulk format $<count>\r\n<payload>\r\n
 * using a single fwrite() call. Returns 1 on success, 0 on write error. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,outlen,1,fp) == 0) ? 0 : 1;
}
9134
9135 /* Delegate writing an object to writing a bulk string or bulk long long. */
9136 static int fwriteBulkObject(FILE *fp, robj *obj) {
9137 /* Avoid using getDecodedObject to help copy-on-write (we are often
9138 * in a child process when this function is called). */
9139 if (obj->encoding == REDIS_ENCODING_INT) {
9140 return fwriteBulkLongLong(fp,(long)obj->ptr);
9141 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9142 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9143 } else {
9144 redisPanic("Unknown string encoding");
9145 }
9146 }
9147
9148 /* Write a sequence of commands able to fully rebuild the dataset into
9149 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9150 static int rewriteAppendOnlyFile(char *filename) {
9151 dictIterator *di = NULL;
9152 dictEntry *de;
9153 FILE *fp;
9154 char tmpfile[256];
9155 int j;
9156 time_t now = time(NULL);
9157
9158 /* Note that we have to use a different temp name here compared to the
9159 * one used by rewriteAppendOnlyFileBackground() function. */
9160 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9161 fp = fopen(tmpfile,"w");
9162 if (!fp) {
9163 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9164 return REDIS_ERR;
9165 }
9166 for (j = 0; j < server.dbnum; j++) {
9167 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9168 redisDb *db = server.db+j;
9169 dict *d = db->dict;
9170 if (dictSize(d) == 0) continue;
9171 di = dictGetIterator(d);
9172 if (!di) {
9173 fclose(fp);
9174 return REDIS_ERR;
9175 }
9176
9177 /* SELECT the new DB */
9178 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9179 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9180
9181 /* Iterate this DB writing every entry */
9182 while((de = dictNext(di)) != NULL) {
9183 sds keystr = dictGetEntryKey(de);
9184 robj key, *o;
9185 time_t expiretime;
9186 int swapped;
9187
9188 keystr = dictGetEntryKey(de);
9189 o = dictGetEntryVal(de);
9190 initStaticStringObject(key,keystr);
9191 /* If the value for this key is swapped, load a preview in memory.
9192 * We use a "swapped" flag to remember if we need to free the
9193 * value object instead to just increment the ref count anyway
9194 * in order to avoid copy-on-write of pages if we are forked() */
9195 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9196 o->storage == REDIS_VM_SWAPPING) {
9197 swapped = 0;
9198 } else {
9199 o = vmPreviewObject(o);
9200 swapped = 1;
9201 }
9202 expiretime = getExpire(db,&key);
9203
9204 /* Save the key and associated value */
9205 if (o->type == REDIS_STRING) {
9206 /* Emit a SET command */
9207 char cmd[]="*3\r\n$3\r\nSET\r\n";
9208 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9209 /* Key and value */
9210 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9211 if (fwriteBulkObject(fp,o) == 0) goto werr;
9212 } else if (o->type == REDIS_LIST) {
9213 /* Emit the RPUSHes needed to rebuild the list */
9214 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9215 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9216 unsigned char *zl = o->ptr;
9217 unsigned char *p = ziplistIndex(zl,0);
9218 unsigned char *vstr;
9219 unsigned int vlen;
9220 long long vlong;
9221
9222 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9223 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9224 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9225 if (vstr) {
9226 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9227 goto werr;
9228 } else {
9229 if (fwriteBulkLongLong(fp,vlong) == 0)
9230 goto werr;
9231 }
9232 p = ziplistNext(zl,p);
9233 }
9234 } else if (o->encoding == REDIS_ENCODING_LIST) {
9235 list *list = o->ptr;
9236 listNode *ln;
9237 listIter li;
9238
9239 listRewind(list,&li);
9240 while((ln = listNext(&li))) {
9241 robj *eleobj = listNodeValue(ln);
9242
9243 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9244 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9245 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9246 }
9247 } else {
9248 redisPanic("Unknown list encoding");
9249 }
9250 } else if (o->type == REDIS_SET) {
9251 /* Emit the SADDs needed to rebuild the set */
9252 dict *set = o->ptr;
9253 dictIterator *di = dictGetIterator(set);
9254 dictEntry *de;
9255
9256 while((de = dictNext(di)) != NULL) {
9257 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9258 robj *eleobj = dictGetEntryKey(de);
9259
9260 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9261 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9262 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9263 }
9264 dictReleaseIterator(di);
9265 } else if (o->type == REDIS_ZSET) {
9266 /* Emit the ZADDs needed to rebuild the sorted set */
9267 zset *zs = o->ptr;
9268 dictIterator *di = dictGetIterator(zs->dict);
9269 dictEntry *de;
9270
9271 while((de = dictNext(di)) != NULL) {
9272 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9273 robj *eleobj = dictGetEntryKey(de);
9274 double *score = dictGetEntryVal(de);
9275
9276 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9277 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9278 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9279 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9280 }
9281 dictReleaseIterator(di);
9282 } else if (o->type == REDIS_HASH) {
9283 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9284
9285 /* Emit the HSETs needed to rebuild the hash */
9286 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9287 unsigned char *p = zipmapRewind(o->ptr);
9288 unsigned char *field, *val;
9289 unsigned int flen, vlen;
9290
9291 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9292 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9293 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9294 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9295 return -1;
9296 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9297 return -1;
9298 }
9299 } else {
9300 dictIterator *di = dictGetIterator(o->ptr);
9301 dictEntry *de;
9302
9303 while((de = dictNext(di)) != NULL) {
9304 robj *field = dictGetEntryKey(de);
9305 robj *val = dictGetEntryVal(de);
9306
9307 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9308 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9309 if (fwriteBulkObject(fp,field) == -1) return -1;
9310 if (fwriteBulkObject(fp,val) == -1) return -1;
9311 }
9312 dictReleaseIterator(di);
9313 }
9314 } else {
9315 redisPanic("Unknown object type");
9316 }
9317 /* Save the expire time */
9318 if (expiretime != -1) {
9319 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9320 /* If this key is already expired skip it */
9321 if (expiretime < now) continue;
9322 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9323 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9324 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9325 }
9326 if (swapped) decrRefCount(o);
9327 }
9328 dictReleaseIterator(di);
9329 }
9330
9331 /* Make sure data will not remain on the OS's output buffers */
9332 fflush(fp);
9333 aof_fsync(fileno(fp));
9334 fclose(fp);
9335
9336 /* Use RENAME to make sure the DB file is changed atomically only
9337 * if the generate DB file is ok. */
9338 if (rename(tmpfile,filename) == -1) {
9339 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9340 unlink(tmpfile);
9341 return REDIS_ERR;
9342 }
9343 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9344 return REDIS_OK;
9345
9346 werr:
9347 fclose(fp);
9348 unlink(tmpfile);
9349 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9350 if (di) dictReleaseIterator(di);
9351 return REDIS_ERR;
9352 }
9353
9354 /* This is how rewriting of the append only file in background works:
9355 *
9356 * 1) The user calls BGREWRITEAOF
9357 * 2) Redis calls this function, that forks():
9358 * 2a) the child rewrite the append only file in a temp file.
9359 * 2b) the parent accumulates differences in server.bgrewritebuf.
9360 * 3) When the child finished '2a' exists.
9361 * 4) The parent will trap the exit code, if it's OK, will append the
9362 * data accumulated into server.bgrewritebuf into the temp file, and
9363 * finally will rename(2) the temp file in the actual file name.
9364 * The the new file is reopened as the new append only file. Profit!
9365 */
9366 static int rewriteAppendOnlyFileBackground(void) {
9367 pid_t childpid;
9368
9369 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9370 if (server.vm_enabled) waitEmptyIOJobsQueue();
9371 if ((childpid = fork()) == 0) {
9372 /* Child */
9373 char tmpfile[256];
9374
9375 if (server.vm_enabled) vmReopenSwapFile();
9376 close(server.fd);
9377 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9378 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9379 _exit(0);
9380 } else {
9381 _exit(1);
9382 }
9383 } else {
9384 /* Parent */
9385 if (childpid == -1) {
9386 redisLog(REDIS_WARNING,
9387 "Can't rewrite append only file in background: fork: %s",
9388 strerror(errno));
9389 return REDIS_ERR;
9390 }
9391 redisLog(REDIS_NOTICE,
9392 "Background append only file rewriting started by pid %d",childpid);
9393 server.bgrewritechildpid = childpid;
9394 updateDictResizePolicy();
9395 /* We set appendseldb to -1 in order to force the next call to the
9396 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9397 * accumulated by the parent into server.bgrewritebuf will start
9398 * with a SELECT statement and it will be safe to merge. */
9399 server.appendseldb = -1;
9400 return REDIS_OK;
9401 }
9402 return REDIS_OK; /* unreached */
9403 }
9404
9405 static void bgrewriteaofCommand(redisClient *c) {
9406 if (server.bgrewritechildpid != -1) {
9407 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9408 return;
9409 }
9410 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9411 char *status = "+Background append only file rewriting started\r\n";
9412 addReplySds(c,sdsnew(status));
9413 } else {
9414 addReply(c,shared.err);
9415 }
9416 }
9417
/* Remove the temp file used by the background AOF rewrite child having pid
 * 'childpid'. The name is built in the same way
 * rewriteAppendOnlyFileBackground() does in the child. Errors removing the
 * file are ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
9424
9425 /* Virtual Memory is composed mainly of two subsystems:
9426 * - Blocking Virutal Memory
9427 * - Threaded Virtual Memory I/O
9428 * The two parts are not fully decoupled, but functions are split among two
9429 * different sections of the source code (delimited by comments) in order to
9430 * make more clear what functionality is about the blocking VM and what about
9431 * the threaded (not blocking) VM.
9432 *
9433 * Redis VM design:
9434 *
9435 * Redis VM is a blocking VM (one that blocks reading swapped values from
9436 * disk into memory when a value swapped out is needed in memory) that is made
9437 * unblocking by trying to examine the command argument vector in order to
9438 * load in background values that will likely be needed in order to exec
9439 * the command. The command is executed only once all the relevant keys
9440 * are loaded into memory.
9441 *
9442 * This is basically almost as simple as a blocking VM, but almost as
9443 * parallel as a fully non-blocking VM.
9444 */
9445
9446 /* =================== Virtual Memory - Blocking Side ====================== */
9447
9448 /* Create a VM pointer object. This kind of objects are used in place of
9449 * values in the key -> value hash table, for swapped out objects. */
9450 static vmpointer *createVmPointer(int vtype) {
9451 vmpointer *vp = zmalloc(sizeof(vmpointer));
9452
9453 vp->type = REDIS_VMPOINTER;
9454 vp->storage = REDIS_VM_SWAPPED;
9455 vp->vtype = vtype;
9456 return vp;
9457 }
9458
9459 static void vmInit(void) {
9460 off_t totsize;
9461 int pipefds[2];
9462 size_t stacksize;
9463 struct flock fl;
9464
9465 if (server.vm_max_threads != 0)
9466 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9467
9468 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9469 /* Try to open the old swap file, otherwise create it */
9470 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9471 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9472 }
9473 if (server.vm_fp == NULL) {
9474 redisLog(REDIS_WARNING,
9475 "Can't open the swap file: %s. Exiting.",
9476 strerror(errno));
9477 exit(1);
9478 }
9479 server.vm_fd = fileno(server.vm_fp);
9480 /* Lock the swap file for writing, this is useful in order to avoid
9481 * another instance to use the same swap file for a config error. */
9482 fl.l_type = F_WRLCK;
9483 fl.l_whence = SEEK_SET;
9484 fl.l_start = fl.l_len = 0;
9485 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9486 redisLog(REDIS_WARNING,
9487 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9488 exit(1);
9489 }
9490 /* Initialize */
9491 server.vm_next_page = 0;
9492 server.vm_near_pages = 0;
9493 server.vm_stats_used_pages = 0;
9494 server.vm_stats_swapped_objects = 0;
9495 server.vm_stats_swapouts = 0;
9496 server.vm_stats_swapins = 0;
9497 totsize = server.vm_pages*server.vm_page_size;
9498 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9499 if (ftruncate(server.vm_fd,totsize) == -1) {
9500 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9501 strerror(errno));
9502 exit(1);
9503 } else {
9504 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9505 }
9506 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9507 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9508 (long long) (server.vm_pages+7)/8, server.vm_pages);
9509 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9510
9511 /* Initialize threaded I/O (used by Virtual Memory) */
9512 server.io_newjobs = listCreate();
9513 server.io_processing = listCreate();
9514 server.io_processed = listCreate();
9515 server.io_ready_clients = listCreate();
9516 pthread_mutex_init(&server.io_mutex,NULL);
9517 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9518 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9519 server.io_active_threads = 0;
9520 if (pipe(pipefds) == -1) {
9521 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9522 ,strerror(errno));
9523 exit(1);
9524 }
9525 server.io_ready_pipe_read = pipefds[0];
9526 server.io_ready_pipe_write = pipefds[1];
9527 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9528 /* LZF requires a lot of stack */
9529 pthread_attr_init(&server.io_threads_attr);
9530 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9531 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9532 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9533 /* Listen for events in the threaded I/O pipe */
9534 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9535 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9536 oom("creating file event");
9537 }
9538
9539 /* Mark the page as used */
9540 static void vmMarkPageUsed(off_t page) {
9541 off_t byte = page/8;
9542 int bit = page&7;
9543 redisAssert(vmFreePage(page) == 1);
9544 server.vm_bitmap[byte] |= 1<<bit;
9545 }
9546
9547 /* Mark N contiguous pages as used, with 'page' being the first. */
9548 static void vmMarkPagesUsed(off_t page, off_t count) {
9549 off_t j;
9550
9551 for (j = 0; j < count; j++)
9552 vmMarkPageUsed(page+j);
9553 server.vm_stats_used_pages += count;
9554 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9555 (long long)count, (long long)page);
9556 }
9557
9558 /* Mark the page as free */
9559 static void vmMarkPageFree(off_t page) {
9560 off_t byte = page/8;
9561 int bit = page&7;
9562 redisAssert(vmFreePage(page) == 0);
9563 server.vm_bitmap[byte] &= ~(1<<bit);
9564 }
9565
9566 /* Mark N contiguous pages as free, with 'page' being the first. */
9567 static void vmMarkPagesFree(off_t page, off_t count) {
9568 off_t j;
9569
9570 for (j = 0; j < count; j++)
9571 vmMarkPageFree(page+j);
9572 server.vm_stats_used_pages -= count;
9573 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9574 (long long)count, (long long)page);
9575 }
9576
9577 /* Test if the page is free */
9578 static int vmFreePage(off_t page) {
9579 off_t byte = page/8;
9580 int bit = page&7;
9581 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9582 }
9583
9584 /* Find N contiguous free pages storing the first page of the cluster in *first.
9585 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9586 * REDIS_ERR is returned.
9587 *
9588 * This function uses a simple algorithm: we try to allocate
9589 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9590 * again from the start of the swap file searching for free spaces.
9591 *
9592 * If it looks pretty clear that there are no free pages near our offset
9593 * we try to find less populated places doing a forward jump of
9594 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9595 * without hurry, and then we jump again and so forth...
9596 *
9597 * This function can be improved using a free list to avoid to guess
9598 * too much, since we could collect data about freed pages.
9599 *
9600 * note: I implemented this function just after watching an episode of
9601 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9602 */
9603 static int vmFindContiguousPages(off_t *first, off_t n) {
9604 off_t base, offset = 0, since_jump = 0, numfree = 0;
9605
9606 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9607 server.vm_near_pages = 0;
9608 server.vm_next_page = 0;
9609 }
9610 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9611 base = server.vm_next_page;
9612
9613 while(offset < server.vm_pages) {
9614 off_t this = base+offset;
9615
9616 /* If we overflow, restart from page zero */
9617 if (this >= server.vm_pages) {
9618 this -= server.vm_pages;
9619 if (this == 0) {
9620 /* Just overflowed, what we found on tail is no longer
9621 * interesting, as it's no longer contiguous. */
9622 numfree = 0;
9623 }
9624 }
9625 if (vmFreePage(this)) {
9626 /* This is a free page */
9627 numfree++;
9628 /* Already got N free pages? Return to the caller, with success */
9629 if (numfree == n) {
9630 *first = this-(n-1);
9631 server.vm_next_page = this+1;
9632 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9633 return REDIS_OK;
9634 }
9635 } else {
9636 /* The current one is not a free page */
9637 numfree = 0;
9638 }
9639
9640 /* Fast-forward if the current page is not free and we already
9641 * searched enough near this place. */
9642 since_jump++;
9643 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9644 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9645 since_jump = 0;
9646 /* Note that even if we rewind after the jump, we are don't need
9647 * to make sure numfree is set to zero as we only jump *if* it
9648 * is set to zero. */
9649 } else {
9650 /* Otherwise just check the next page */
9651 offset++;
9652 }
9653 }
9654 return REDIS_ERR;
9655 }
9656
9657 /* Write the specified object at the specified page of the swap file */
9658 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9659 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9660 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9661 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9662 redisLog(REDIS_WARNING,
9663 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9664 strerror(errno));
9665 return REDIS_ERR;
9666 }
9667 rdbSaveObject(server.vm_fp,o);
9668 fflush(server.vm_fp);
9669 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9670 return REDIS_OK;
9671 }
9672
9673 /* Transfers the 'val' object to disk. Store all the information
9674 * a 'vmpointer' object containing all the information needed to load the
9675 * object back later is returned.
9676 *
9677 * If we can't find enough contiguous empty pages to swap the object on disk
9678 * NULL is returned. */
9679 static vmpointer *vmSwapObjectBlocking(robj *val) {
9680 off_t pages = rdbSavedObjectPages(val,NULL);
9681 off_t page;
9682 vmpointer *vp;
9683
9684 assert(val->storage == REDIS_VM_MEMORY);
9685 assert(val->refcount == 1);
9686 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9687 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9688
9689 vp = createVmPointer(val->type);
9690 vp->page = page;
9691 vp->usedpages = pages;
9692 decrRefCount(val); /* Deallocate the object from memory. */
9693 vmMarkPagesUsed(page,pages);
9694 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9695 (void*) val,
9696 (unsigned long long) page, (unsigned long long) pages);
9697 server.vm_stats_swapped_objects++;
9698 server.vm_stats_swapouts++;
9699 return vp;
9700 }
9701
9702 static robj *vmReadObjectFromSwap(off_t page, int type) {
9703 robj *o;
9704
9705 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9706 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9707 redisLog(REDIS_WARNING,
9708 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9709 strerror(errno));
9710 _exit(1);
9711 }
9712 o = rdbLoadObject(type,server.vm_fp);
9713 if (o == NULL) {
9714 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9715 _exit(1);
9716 }
9717 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9718 return o;
9719 }
9720
9721 /* Load the specified object from swap to memory.
9722 * The newly allocated object is returned.
9723 *
9724 * If preview is true the unserialized object is returned to the caller but
9725 * the pages are not marked as freed, nor the vp object is freed. */
9726 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9727 robj *val;
9728
9729 redisAssert(vp->type == REDIS_VMPOINTER &&
9730 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9731 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9732 if (!preview) {
9733 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9734 vmMarkPagesFree(vp->page,vp->usedpages);
9735 zfree(vp);
9736 server.vm_stats_swapped_objects--;
9737 } else {
9738 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9739 }
9740 server.vm_stats_swapins++;
9741 return val;
9742 }
9743
9744 /* Plain object loading, from swap to memory.
9745 *
9746 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9747 * The return value is the loaded object. */
9748 static robj *vmLoadObject(robj *o) {
9749 /* If we are loading the object in background, stop it, we
9750 * need to load this object synchronously ASAP. */
9751 if (o->storage == REDIS_VM_LOADING)
9752 vmCancelThreadedIOJob(o);
9753 return vmGenericLoadObject((vmpointer*)o,0);
9754 }
9755
9756 /* Just load the value on disk, without to modify the key.
9757 * This is useful when we want to perform some operation on the value
9758 * without to really bring it from swap to memory, like while saving the
9759 * dataset or rewriting the append only log. */
9760 static robj *vmPreviewObject(robj *o) {
9761 return vmGenericLoadObject((vmpointer*)o,1);
9762 }
9763
9764 /* How a good candidate is this object for swapping?
9765 * The better candidate it is, the greater the returned value.
9766 *
9767 * Currently we try to perform a fast estimation of the object size in
9768 * memory, and combine it with aging informations.
9769 *
9770 * Basically swappability = idle-time * log(estimated size)
9771 *
9772 * Bigger objects are preferred over smaller objects, but not
9773 * proportionally, this is why we use the logarithm. This algorithm is
9774 * just a first try and will probably be tuned later. */
9775 static double computeObjectSwappability(robj *o) {
9776 /* actual age can be >= minage, but not < minage. As we use wrapping
9777 * 21 bit clocks with minutes resolution for the LRU. */
9778 time_t minage = abs(server.lruclock - o->lru);
9779 long asize = 0;
9780 list *l;
9781 dict *d;
9782 struct dictEntry *de;
9783 int z;
9784
9785 if (minage <= 0) return 0;
9786 switch(o->type) {
9787 case REDIS_STRING:
9788 if (o->encoding != REDIS_ENCODING_RAW) {
9789 asize = sizeof(*o);
9790 } else {
9791 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9792 }
9793 break;
9794 case REDIS_LIST:
9795 l = o->ptr;
9796 listNode *ln = listFirst(l);
9797
9798 asize = sizeof(list);
9799 if (ln) {
9800 robj *ele = ln->value;
9801 long elesize;
9802
9803 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9804 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9805 asize += (sizeof(listNode)+elesize)*listLength(l);
9806 }
9807 break;
9808 case REDIS_SET:
9809 case REDIS_ZSET:
9810 z = (o->type == REDIS_ZSET);
9811 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9812
9813 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9814 if (z) asize += sizeof(zset)-sizeof(dict);
9815 if (dictSize(d)) {
9816 long elesize;
9817 robj *ele;
9818
9819 de = dictGetRandomKey(d);
9820 ele = dictGetEntryKey(de);
9821 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9822 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9823 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9824 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9825 }
9826 break;
9827 case REDIS_HASH:
9828 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9829 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9830 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9831 unsigned int klen, vlen;
9832 unsigned char *key, *val;
9833
9834 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9835 klen = 0;
9836 vlen = 0;
9837 }
9838 asize = len*(klen+vlen+3);
9839 } else if (o->encoding == REDIS_ENCODING_HT) {
9840 d = o->ptr;
9841 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9842 if (dictSize(d)) {
9843 long elesize;
9844 robj *ele;
9845
9846 de = dictGetRandomKey(d);
9847 ele = dictGetEntryKey(de);
9848 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9849 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9850 ele = dictGetEntryVal(de);
9851 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9852 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9853 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9854 }
9855 }
9856 break;
9857 }
9858 return (double)minage*log(1+asize);
9859 }
9860
9861 /* Try to swap an object that's a good candidate for swapping.
9862 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9863 * to swap any object at all.
9864 *
9865 * If 'usethreaded' is true, Redis will try to swap the object in background
9866 * using I/O threads. */
9867 static int vmSwapOneObject(int usethreads) {
9868 int j, i;
9869 struct dictEntry *best = NULL;
9870 double best_swappability = 0;
9871 redisDb *best_db = NULL;
9872 robj *val;
9873 sds key;
9874
9875 for (j = 0; j < server.dbnum; j++) {
9876 redisDb *db = server.db+j;
9877 /* Why maxtries is set to 100?
9878 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9879 * are swappable objects */
9880 int maxtries = 100;
9881
9882 if (dictSize(db->dict) == 0) continue;
9883 for (i = 0; i < 5; i++) {
9884 dictEntry *de;
9885 double swappability;
9886
9887 if (maxtries) maxtries--;
9888 de = dictGetRandomKey(db->dict);
9889 val = dictGetEntryVal(de);
9890 /* Only swap objects that are currently in memory.
9891 *
9892 * Also don't swap shared objects: not a good idea in general and
9893 * we need to ensure that the main thread does not touch the
9894 * object while the I/O thread is using it, but we can't
9895 * control other keys without adding additional mutex. */
9896 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9897 if (maxtries) i--; /* don't count this try */
9898 continue;
9899 }
9900 swappability = computeObjectSwappability(val);
9901 if (!best || swappability > best_swappability) {
9902 best = de;
9903 best_swappability = swappability;
9904 best_db = db;
9905 }
9906 }
9907 }
9908 if (best == NULL) return REDIS_ERR;
9909 key = dictGetEntryKey(best);
9910 val = dictGetEntryVal(best);
9911
9912 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9913 key, best_swappability);
9914
9915 /* Swap it */
9916 if (usethreads) {
9917 robj *keyobj = createStringObject(key,sdslen(key));
9918 vmSwapObjectThreaded(keyobj,val,best_db);
9919 decrRefCount(keyobj);
9920 return REDIS_OK;
9921 } else {
9922 vmpointer *vp;
9923
9924 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9925 dictGetEntryVal(best) = vp;
9926 return REDIS_OK;
9927 } else {
9928 return REDIS_ERR;
9929 }
9930 }
9931 }
9932
/* Swap out one object synchronously, in the calling thread.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9936
/* Swap out one object in background, using the I/O threads.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9940
9941 /* Return true if it's safe to swap out objects in a given moment.
9942 * Basically we don't want to swap objects out while there is a BGSAVE
9943 * or a BGAEOREWRITE running in backgroud. */
9944 static int vmCanSwapOut(void) {
9945 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9946 }
9947
9948 /* =================== Virtual Memory - Threaded I/O ======================= */
9949
9950 static void freeIOJob(iojob *j) {
9951 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9952 j->type == REDIS_IOJOB_DO_SWAP ||
9953 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9954 {
9955 /* we fix the storage type, otherwise decrRefCount() will try to
9956 * kill the I/O thread Job (that does no longer exists). */
9957 if (j->val->storage == REDIS_VM_SWAPPING)
9958 j->val->storage = REDIS_VM_MEMORY;
9959 decrRefCount(j->val);
9960 }
9961 decrRefCount(j->key);
9962 zfree(j);
9963 }
9964
9965 /* Every time a thread finished a Job, it writes a byte into the write side
9966 * of an unix pipe in order to "awake" the main thread, and this function
9967 * is called. */
9968 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9969 int mask)
9970 {
9971 char buf[1];
9972 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9973 REDIS_NOTUSED(el);
9974 REDIS_NOTUSED(mask);
9975 REDIS_NOTUSED(privdata);
9976
9977 /* For every byte we read in the read side of the pipe, there is one
9978 * I/O job completed to process. */
9979 while((retval = read(fd,buf,1)) == 1) {
9980 iojob *j;
9981 listNode *ln;
9982 struct dictEntry *de;
9983
9984 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9985
9986 /* Get the processed element (the oldest one) */
9987 lockThreadedIO();
9988 assert(listLength(server.io_processed) != 0);
9989 if (toprocess == -1) {
9990 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9991 if (toprocess <= 0) toprocess = 1;
9992 }
9993 ln = listFirst(server.io_processed);
9994 j = ln->value;
9995 listDelNode(server.io_processed,ln);
9996 unlockThreadedIO();
9997 /* If this job is marked as canceled, just ignore it */
9998 if (j->canceled) {
9999 freeIOJob(j);
10000 continue;
10001 }
10002 /* Post process it in the main thread, as there are things we
10003 * can do just here to avoid race conditions and/or invasive locks */
10004 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
10005 de = dictFind(j->db->dict,j->key->ptr);
10006 redisAssert(de != NULL);
10007 if (j->type == REDIS_IOJOB_LOAD) {
10008 redisDb *db;
10009 vmpointer *vp = dictGetEntryVal(de);
10010
10011 /* Key loaded, bring it at home */
10012 vmMarkPagesFree(vp->page,vp->usedpages);
10013 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
10014 (unsigned char*) j->key->ptr);
10015 server.vm_stats_swapped_objects--;
10016 server.vm_stats_swapins++;
10017 dictGetEntryVal(de) = j->val;
10018 incrRefCount(j->val);
10019 db = j->db;
10020 /* Handle clients waiting for this key to be loaded. */
10021 handleClientsBlockedOnSwappedKey(db,j->key);
10022 freeIOJob(j);
10023 zfree(vp);
10024 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10025 /* Now we know the amount of pages required to swap this object.
10026 * Let's find some space for it, and queue this task again
10027 * rebranded as REDIS_IOJOB_DO_SWAP. */
10028 if (!vmCanSwapOut() ||
10029 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
10030 {
10031 /* Ooops... no space or we can't swap as there is
10032 * a fork()ed Redis trying to save stuff on disk. */
10033 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
10034 freeIOJob(j);
10035 } else {
10036 /* Note that we need to mark this pages as used now,
10037 * if the job will be canceled, we'll mark them as freed
10038 * again. */
10039 vmMarkPagesUsed(j->page,j->pages);
10040 j->type = REDIS_IOJOB_DO_SWAP;
10041 lockThreadedIO();
10042 queueIOJob(j);
10043 unlockThreadedIO();
10044 }
10045 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10046 vmpointer *vp;
10047
10048 /* Key swapped. We can finally free some memory. */
10049 if (j->val->storage != REDIS_VM_SWAPPING) {
10050 vmpointer *vp = (vmpointer*) j->id;
10051 printf("storage: %d\n",vp->storage);
10052 printf("key->name: %s\n",(char*)j->key->ptr);
10053 printf("val: %p\n",(void*)j->val);
10054 printf("val->type: %d\n",j->val->type);
10055 printf("val->ptr: %s\n",(char*)j->val->ptr);
10056 }
10057 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
10058 vp = createVmPointer(j->val->type);
10059 vp->page = j->page;
10060 vp->usedpages = j->pages;
10061 dictGetEntryVal(de) = vp;
10062 /* Fix the storage otherwise decrRefCount will attempt to
10063 * remove the associated I/O job */
10064 j->val->storage = REDIS_VM_MEMORY;
10065 decrRefCount(j->val);
10066 redisLog(REDIS_DEBUG,
10067 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
10068 (unsigned char*) j->key->ptr,
10069 (unsigned long long) j->page, (unsigned long long) j->pages);
10070 server.vm_stats_swapped_objects++;
10071 server.vm_stats_swapouts++;
10072 freeIOJob(j);
10073 /* Put a few more swap requests in queue if we are still
10074 * out of memory */
10075 if (trytoswap && vmCanSwapOut() &&
10076 zmalloc_used_memory() > server.vm_max_memory)
10077 {
10078 int more = 1;
10079 while(more) {
10080 lockThreadedIO();
10081 more = listLength(server.io_newjobs) <
10082 (unsigned) server.vm_max_threads;
10083 unlockThreadedIO();
10084 /* Don't waste CPU time if swappable objects are rare. */
10085 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
10086 trytoswap = 0;
10087 break;
10088 }
10089 }
10090 }
10091 }
10092 processed++;
10093 if (processed == toprocess) return;
10094 }
10095 if (retval < 0 && errno != EAGAIN) {
10096 redisLog(REDIS_WARNING,
10097 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
10098 strerror(errno));
10099 }
10100 }
10101
10102 static void lockThreadedIO(void) {
10103 pthread_mutex_lock(&server.io_mutex);
10104 }
10105
10106 static void unlockThreadedIO(void) {
10107 pthread_mutex_unlock(&server.io_mutex);
10108 }
10109
10110 /* Remove the specified object from the threaded I/O queue if still not
10111 * processed, otherwise make sure to flag it as canceled. */
10112 static void vmCancelThreadedIOJob(robj *o) {
10113 list *lists[3] = {
10114 server.io_newjobs, /* 0 */
10115 server.io_processing, /* 1 */
10116 server.io_processed /* 2 */
10117 };
10118 int i;
10119
10120 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
10121 again:
10122 lockThreadedIO();
10123 /* Search for a matching object in one of the queues */
10124 for (i = 0; i < 3; i++) {
10125 listNode *ln;
10126 listIter li;
10127
10128 listRewind(lists[i],&li);
10129 while ((ln = listNext(&li)) != NULL) {
10130 iojob *job = ln->value;
10131
10132 if (job->canceled) continue; /* Skip this, already canceled. */
10133 if (job->id == o) {
10134 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10135 (void*)job, (char*)job->key->ptr, job->type, i);
10136 /* Mark the pages as free since the swap didn't happened
10137 * or happened but is now discarded. */
10138 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
10139 vmMarkPagesFree(job->page,job->pages);
10140 /* Cancel the job. It depends on the list the job is
10141 * living in. */
10142 switch(i) {
10143 case 0: /* io_newjobs */
10144 /* If the job was yet not processed the best thing to do
10145 * is to remove it from the queue at all */
10146 freeIOJob(job);
10147 listDelNode(lists[i],ln);
10148 break;
10149 case 1: /* io_processing */
10150 /* Oh Shi- the thread is messing with the Job:
10151 *
10152 * Probably it's accessing the object if this is a
10153 * PREPARE_SWAP or DO_SWAP job.
10154 * If it's a LOAD job it may be reading from disk and
10155 * if we don't wait for the job to terminate before to
10156 * cancel it, maybe in a few microseconds data can be
10157 * corrupted in this pages. So the short story is:
10158 *
10159 * Better to wait for the job to move into the
10160 * next queue (processed)... */
10161
10162 /* We try again and again until the job is completed. */
10163 unlockThreadedIO();
10164 /* But let's wait some time for the I/O thread
10165 * to finish with this job. After all this condition
10166 * should be very rare. */
10167 usleep(1);
10168 goto again;
10169 case 2: /* io_processed */
10170 /* The job was already processed, that's easy...
10171 * just mark it as canceled so that we'll ignore it
10172 * when processing completed jobs. */
10173 job->canceled = 1;
10174 break;
10175 }
10176 /* Finally we have to adjust the storage type of the object
10177 * in order to "UNDO" the operaiton. */
10178 if (o->storage == REDIS_VM_LOADING)
10179 o->storage = REDIS_VM_SWAPPED;
10180 else if (o->storage == REDIS_VM_SWAPPING)
10181 o->storage = REDIS_VM_MEMORY;
10182 unlockThreadedIO();
10183 redisLog(REDIS_DEBUG,"*** DONE");
10184 return;
10185 }
10186 }
10187 }
10188 unlockThreadedIO();
10189 printf("Not found: %p\n", (void*)o);
10190 redisAssert(1 != 1); /* We should never reach this */
10191 }
10192
10193 static void *IOThreadEntryPoint(void *arg) {
10194 iojob *j;
10195 listNode *ln;
10196 REDIS_NOTUSED(arg);
10197
10198 pthread_detach(pthread_self());
10199 while(1) {
10200 /* Get a new job to process */
10201 lockThreadedIO();
10202 if (listLength(server.io_newjobs) == 0) {
10203 /* No new jobs in queue, exit. */
10204 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10205 (long) pthread_self());
10206 server.io_active_threads--;
10207 unlockThreadedIO();
10208 return NULL;
10209 }
10210 ln = listFirst(server.io_newjobs);
10211 j = ln->value;
10212 listDelNode(server.io_newjobs,ln);
10213 /* Add the job in the processing queue */
10214 j->thread = pthread_self();
10215 listAddNodeTail(server.io_processing,j);
10216 ln = listLast(server.io_processing); /* We use ln later to remove it */
10217 unlockThreadedIO();
10218 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10219 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
10220
10221 /* Process the Job */
10222 if (j->type == REDIS_IOJOB_LOAD) {
10223 vmpointer *vp = (vmpointer*)j->id;
10224 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
10225 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10226 FILE *fp = fopen("/dev/null","w+");
10227 j->pages = rdbSavedObjectPages(j->val,fp);
10228 fclose(fp);
10229 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10230 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10231 j->canceled = 1;
10232 }
10233
10234 /* Done: insert the job into the processed queue */
10235 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10236 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
10237 lockThreadedIO();
10238 listDelNode(server.io_processing,ln);
10239 listAddNodeTail(server.io_processed,j);
10240 unlockThreadedIO();
10241
10242 /* Signal the main thread there is new stuff to process */
10243 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10244 }
10245 return NULL; /* never reached */
10246 }
10247
10248 static void spawnIOThread(void) {
10249 pthread_t thread;
10250 sigset_t mask, omask;
10251 int err;
10252
10253 sigemptyset(&mask);
10254 sigaddset(&mask,SIGCHLD);
10255 sigaddset(&mask,SIGHUP);
10256 sigaddset(&mask,SIGPIPE);
10257 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10258 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10259 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10260 strerror(err));
10261 usleep(1000000);
10262 }
10263 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10264 server.io_active_threads++;
10265 }
10266
10267 /* We need to wait for the last thread to exit before we are able to
10268 * fork() in order to BGSAVE or BGREWRITEAOF. */
10269 static void waitEmptyIOJobsQueue(void) {
10270 while(1) {
10271 int io_processed_len;
10272
10273 lockThreadedIO();
10274 if (listLength(server.io_newjobs) == 0 &&
10275 listLength(server.io_processing) == 0 &&
10276 server.io_active_threads == 0)
10277 {
10278 unlockThreadedIO();
10279 return;
10280 }
10281 /* While waiting for empty jobs queue condition we post-process some
10282 * finshed job, as I/O threads may be hanging trying to write against
10283 * the io_ready_pipe_write FD but there are so much pending jobs that
10284 * it's blocking. */
10285 io_processed_len = listLength(server.io_processed);
10286 unlockThreadedIO();
10287 if (io_processed_len) {
10288 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10289 usleep(1000); /* 1 millisecond */
10290 } else {
10291 usleep(10000); /* 10 milliseconds */
10292 }
10293 }
10294 }
10295
10296 static void vmReopenSwapFile(void) {
10297 /* Note: we don't close the old one as we are in the child process
10298 * and don't want to mess at all with the original file object. */
10299 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10300 if (server.vm_fp == NULL) {
10301 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10302 server.vm_swap_file);
10303 _exit(1);
10304 }
10305 server.vm_fd = fileno(server.vm_fp);
10306 }
10307
10308 /* This function must be called while with threaded IO locked */
10309 static void queueIOJob(iojob *j) {
10310 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10311 (void*)j, j->type, (char*)j->key->ptr);
10312 listAddNodeTail(server.io_newjobs,j);
10313 if (server.io_active_threads < server.vm_max_threads)
10314 spawnIOThread();
10315 }
10316
10317 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10318 iojob *j;
10319
10320 j = zmalloc(sizeof(*j));
10321 j->type = REDIS_IOJOB_PREPARE_SWAP;
10322 j->db = db;
10323 j->key = key;
10324 incrRefCount(key);
10325 j->id = j->val = val;
10326 incrRefCount(val);
10327 j->canceled = 0;
10328 j->thread = (pthread_t) -1;
10329 val->storage = REDIS_VM_SWAPPING;
10330
10331 lockThreadedIO();
10332 queueIOJob(j);
10333 unlockThreadedIO();
10334 return REDIS_OK;
10335 }
10336
10337 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10338
10339 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10340 * If there is not already a job loading the key, it is craeted.
10341 * The key is added to the io_keys list in the client structure, and also
10342 * in the hash table mapping swapped keys to waiting clients, that is,
10343 * server.io_waited_keys. */
10344 static int waitForSwappedKey(redisClient *c, robj *key) {
10345 struct dictEntry *de;
10346 robj *o;
10347 list *l;
10348
10349 /* If the key does not exist or is already in RAM we don't need to
10350 * block the client at all. */
10351 de = dictFind(c->db->dict,key->ptr);
10352 if (de == NULL) return 0;
10353 o = dictGetEntryVal(de);
10354 if (o->storage == REDIS_VM_MEMORY) {
10355 return 0;
10356 } else if (o->storage == REDIS_VM_SWAPPING) {
10357 /* We were swapping the key, undo it! */
10358 vmCancelThreadedIOJob(o);
10359 return 0;
10360 }
10361
10362 /* OK: the key is either swapped, or being loaded just now. */
10363
10364 /* Add the key to the list of keys this client is waiting for.
10365 * This maps clients to keys they are waiting for. */
10366 listAddNodeTail(c->io_keys,key);
10367 incrRefCount(key);
10368
10369 /* Add the client to the swapped keys => clients waiting map. */
10370 de = dictFind(c->db->io_keys,key);
10371 if (de == NULL) {
10372 int retval;
10373
10374 /* For every key we take a list of clients blocked for it */
10375 l = listCreate();
10376 retval = dictAdd(c->db->io_keys,key,l);
10377 incrRefCount(key);
10378 assert(retval == DICT_OK);
10379 } else {
10380 l = dictGetEntryVal(de);
10381 }
10382 listAddNodeTail(l,c);
10383
10384 /* Are we already loading the key from disk? If not create a job */
10385 if (o->storage == REDIS_VM_SWAPPED) {
10386 iojob *j;
10387 vmpointer *vp = (vmpointer*)o;
10388
10389 o->storage = REDIS_VM_LOADING;
10390 j = zmalloc(sizeof(*j));
10391 j->type = REDIS_IOJOB_LOAD;
10392 j->db = c->db;
10393 j->id = (robj*)vp;
10394 j->key = key;
10395 incrRefCount(key);
10396 j->page = vp->page;
10397 j->val = NULL;
10398 j->canceled = 0;
10399 j->thread = (pthread_t) -1;
10400 lockThreadedIO();
10401 queueIOJob(j);
10402 unlockThreadedIO();
10403 }
10404 return 1;
10405 }
10406
10407 /* Preload keys for any command with first, last and step values for
10408 * the command keys prototype, as defined in the command table. */
10409 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10410 int j, last;
10411 if (cmd->vm_firstkey == 0) return;
10412 last = cmd->vm_lastkey;
10413 if (last < 0) last = argc+last;
10414 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10415 redisAssert(j < argc);
10416 waitForSwappedKey(c,argv[j]);
10417 }
10418 }
10419
10420 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10421 * Note that the number of keys to preload is user-defined, so we need to
10422 * apply a sanity check against argc. */
10423 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10424 int i, num;
10425 REDIS_NOTUSED(cmd);
10426
10427 num = atoi(argv[2]->ptr);
10428 if (num > (argc-3)) return;
10429 for (i = 0; i < num; i++) {
10430 waitForSwappedKey(c,argv[3+i]);
10431 }
10432 }
10433
10434 /* Preload keys needed to execute the entire MULTI/EXEC block.
10435 *
10436 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10437 * and will block the client when any command requires a swapped out value. */
10438 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10439 int i, margc;
10440 struct redisCommand *mcmd;
10441 robj **margv;
10442 REDIS_NOTUSED(cmd);
10443 REDIS_NOTUSED(argc);
10444 REDIS_NOTUSED(argv);
10445
10446 if (!(c->flags & REDIS_MULTI)) return;
10447 for (i = 0; i < c->mstate.count; i++) {
10448 mcmd = c->mstate.commands[i].cmd;
10449 margc = c->mstate.commands[i].argc;
10450 margv = c->mstate.commands[i].argv;
10451
10452 if (mcmd->vm_preload_proc != NULL) {
10453 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10454 } else {
10455 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10456 }
10457 }
10458 }
10459
10460 /* Is this client attempting to run a command against swapped keys?
10461 * If so, block it ASAP, load the keys in background, then resume it.
10462 *
10463 * The important idea about this function is that it can fail! If keys will
10464 * still be swapped when the client is resumed, this key lookups will
10465 * just block loading keys from disk. In practical terms this should only
10466 * happen with SORT BY command or if there is a bug in this function.
10467 *
10468 * Return 1 if the client is marked as blocked, 0 if the client can
10469 * continue as the keys it is going to access appear to be in memory. */
10470 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10471 if (cmd->vm_preload_proc != NULL) {
10472 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10473 } else {
10474 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10475 }
10476
10477 /* If the client was blocked for at least one key, mark it as blocked. */
10478 if (listLength(c->io_keys)) {
10479 c->flags |= REDIS_IO_WAIT;
10480 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10481 server.vm_blocked_clients++;
10482 return 1;
10483 } else {
10484 return 0;
10485 }
10486 }
10487
10488 /* Remove the 'key' from the list of blocked keys for a given client.
10489 *
10490 * The function returns 1 when there are no longer blocking keys after
10491 * the current one was removed (and the client can be unblocked). */
10492 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10493 list *l;
10494 listNode *ln;
10495 listIter li;
10496 struct dictEntry *de;
10497
10498 /* Remove the key from the list of keys this client is waiting for. */
10499 listRewind(c->io_keys,&li);
10500 while ((ln = listNext(&li)) != NULL) {
10501 if (equalStringObjects(ln->value,key)) {
10502 listDelNode(c->io_keys,ln);
10503 break;
10504 }
10505 }
10506 assert(ln != NULL);
10507
10508 /* Remove the client form the key => waiting clients map. */
10509 de = dictFind(c->db->io_keys,key);
10510 assert(de != NULL);
10511 l = dictGetEntryVal(de);
10512 ln = listSearchKey(l,c);
10513 assert(ln != NULL);
10514 listDelNode(l,ln);
10515 if (listLength(l) == 0)
10516 dictDelete(c->db->io_keys,key);
10517
10518 return listLength(c->io_keys) == 0;
10519 }
10520
10521 /* Every time we now a key was loaded back in memory, we handle clients
10522 * waiting for this key if any. */
10523 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10524 struct dictEntry *de;
10525 list *l;
10526 listNode *ln;
10527 int len;
10528
10529 de = dictFind(db->io_keys,key);
10530 if (!de) return;
10531
10532 l = dictGetEntryVal(de);
10533 len = listLength(l);
10534 /* Note: we can't use something like while(listLength(l)) as the list
10535 * can be freed by the calling function when we remove the last element. */
10536 while (len--) {
10537 ln = listFirst(l);
10538 redisClient *c = ln->value;
10539
10540 if (dontWaitForSwappedKey(c,key)) {
10541 /* Put the client in the list of clients ready to go as we
10542 * loaded all the keys about it. */
10543 listAddNodeTail(server.io_ready_clients,c);
10544 }
10545 }
10546 }
10547
10548 /* =========================== Remote Configuration ========================= */
10549
10550 static void configSetCommand(redisClient *c) {
10551 robj *o = getDecodedObject(c->argv[3]);
10552 long long ll;
10553
10554 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10555 zfree(server.dbfilename);
10556 server.dbfilename = zstrdup(o->ptr);
10557 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10558 zfree(server.requirepass);
10559 server.requirepass = zstrdup(o->ptr);
10560 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10561 zfree(server.masterauth);
10562 server.masterauth = zstrdup(o->ptr);
10563 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10564 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10565 ll < 0) goto badfmt;
10566 server.maxmemory = ll;
10567 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10568 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10569 ll < 0 || ll > LONG_MAX) goto badfmt;
10570 server.maxidletime = ll;
10571 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10572 if (!strcasecmp(o->ptr,"no")) {
10573 server.appendfsync = APPENDFSYNC_NO;
10574 } else if (!strcasecmp(o->ptr,"everysec")) {
10575 server.appendfsync = APPENDFSYNC_EVERYSEC;
10576 } else if (!strcasecmp(o->ptr,"always")) {
10577 server.appendfsync = APPENDFSYNC_ALWAYS;
10578 } else {
10579 goto badfmt;
10580 }
10581 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10582 int yn = yesnotoi(o->ptr);
10583
10584 if (yn == -1) goto badfmt;
10585 server.no_appendfsync_on_rewrite = yn;
10586 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10587 int old = server.appendonly;
10588 int new = yesnotoi(o->ptr);
10589
10590 if (new == -1) goto badfmt;
10591 if (old != new) {
10592 if (new == 0) {
10593 stopAppendOnly();
10594 } else {
10595 if (startAppendOnly() == REDIS_ERR) {
10596 addReplySds(c,sdscatprintf(sdsempty(),
10597 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10598 decrRefCount(o);
10599 return;
10600 }
10601 }
10602 }
10603 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10604 int vlen, j;
10605 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10606
10607 /* Perform sanity check before setting the new config:
10608 * - Even number of args
10609 * - Seconds >= 1, changes >= 0 */
10610 if (vlen & 1) {
10611 sdsfreesplitres(v,vlen);
10612 goto badfmt;
10613 }
10614 for (j = 0; j < vlen; j++) {
10615 char *eptr;
10616 long val;
10617
10618 val = strtoll(v[j], &eptr, 10);
10619 if (eptr[0] != '\0' ||
10620 ((j & 1) == 0 && val < 1) ||
10621 ((j & 1) == 1 && val < 0)) {
10622 sdsfreesplitres(v,vlen);
10623 goto badfmt;
10624 }
10625 }
10626 /* Finally set the new config */
10627 resetServerSaveParams();
10628 for (j = 0; j < vlen; j += 2) {
10629 time_t seconds;
10630 int changes;
10631
10632 seconds = strtoll(v[j],NULL,10);
10633 changes = strtoll(v[j+1],NULL,10);
10634 appendServerSaveParams(seconds, changes);
10635 }
10636 sdsfreesplitres(v,vlen);
10637 } else {
10638 addReplySds(c,sdscatprintf(sdsempty(),
10639 "-ERR not supported CONFIG parameter %s\r\n",
10640 (char*)c->argv[2]->ptr));
10641 decrRefCount(o);
10642 return;
10643 }
10644 decrRefCount(o);
10645 addReply(c,shared.ok);
10646 return;
10647
10648 badfmt: /* Bad format errors */
10649 addReplySds(c,sdscatprintf(sdsempty(),
10650 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10651 (char*)o->ptr,
10652 (char*)c->argv[2]->ptr));
10653 decrRefCount(o);
10654 }
10655
10656 static void configGetCommand(redisClient *c) {
10657 robj *o = getDecodedObject(c->argv[2]);
10658 robj *lenobj = createObject(REDIS_STRING,NULL);
10659 char *pattern = o->ptr;
10660 int matches = 0;
10661
10662 addReply(c,lenobj);
10663 decrRefCount(lenobj);
10664
10665 if (stringmatch(pattern,"dbfilename",0)) {
10666 addReplyBulkCString(c,"dbfilename");
10667 addReplyBulkCString(c,server.dbfilename);
10668 matches++;
10669 }
10670 if (stringmatch(pattern,"requirepass",0)) {
10671 addReplyBulkCString(c,"requirepass");
10672 addReplyBulkCString(c,server.requirepass);
10673 matches++;
10674 }
10675 if (stringmatch(pattern,"masterauth",0)) {
10676 addReplyBulkCString(c,"masterauth");
10677 addReplyBulkCString(c,server.masterauth);
10678 matches++;
10679 }
10680 if (stringmatch(pattern,"maxmemory",0)) {
10681 char buf[128];
10682
10683 ll2string(buf,128,server.maxmemory);
10684 addReplyBulkCString(c,"maxmemory");
10685 addReplyBulkCString(c,buf);
10686 matches++;
10687 }
10688 if (stringmatch(pattern,"timeout",0)) {
10689 char buf[128];
10690
10691 ll2string(buf,128,server.maxidletime);
10692 addReplyBulkCString(c,"timeout");
10693 addReplyBulkCString(c,buf);
10694 matches++;
10695 }
10696 if (stringmatch(pattern,"appendonly",0)) {
10697 addReplyBulkCString(c,"appendonly");
10698 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10699 matches++;
10700 }
10701 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10702 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10703 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10704 matches++;
10705 }
10706 if (stringmatch(pattern,"appendfsync",0)) {
10707 char *policy;
10708
10709 switch(server.appendfsync) {
10710 case APPENDFSYNC_NO: policy = "no"; break;
10711 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10712 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10713 default: policy = "unknown"; break; /* too harmless to panic */
10714 }
10715 addReplyBulkCString(c,"appendfsync");
10716 addReplyBulkCString(c,policy);
10717 matches++;
10718 }
10719 if (stringmatch(pattern,"save",0)) {
10720 sds buf = sdsempty();
10721 int j;
10722
10723 for (j = 0; j < server.saveparamslen; j++) {
10724 buf = sdscatprintf(buf,"%ld %d",
10725 server.saveparams[j].seconds,
10726 server.saveparams[j].changes);
10727 if (j != server.saveparamslen-1)
10728 buf = sdscatlen(buf," ",1);
10729 }
10730 addReplyBulkCString(c,"save");
10731 addReplyBulkCString(c,buf);
10732 sdsfree(buf);
10733 matches++;
10734 }
10735 decrRefCount(o);
10736 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10737 }
10738
10739 static void configCommand(redisClient *c) {
10740 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10741 if (c->argc != 4) goto badarity;
10742 configSetCommand(c);
10743 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10744 if (c->argc != 3) goto badarity;
10745 configGetCommand(c);
10746 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10747 if (c->argc != 2) goto badarity;
10748 server.stat_numcommands = 0;
10749 server.stat_numconnections = 0;
10750 server.stat_expiredkeys = 0;
10751 server.stat_starttime = time(NULL);
10752 addReply(c,shared.ok);
10753 } else {
10754 addReplySds(c,sdscatprintf(sdsempty(),
10755 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10756 }
10757 return;
10758
10759 badarity:
10760 addReplySds(c,sdscatprintf(sdsempty(),
10761 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10762 (char*) c->argv[1]->ptr));
10763 }
10764
10765 /* =========================== Pubsub implementation ======================== */
10766
10767 static void freePubsubPattern(void *p) {
10768 pubsubPattern *pat = p;
10769
10770 decrRefCount(pat->pattern);
10771 zfree(pat);
10772 }
10773
10774 static int listMatchPubsubPattern(void *a, void *b) {
10775 pubsubPattern *pa = a, *pb = b;
10776
10777 return (pa->client == pb->client) &&
10778 (equalStringObjects(pa->pattern,pb->pattern));
10779 }
10780
10781 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10782 * 0 if the client was already subscribed to that channel. */
10783 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10784 struct dictEntry *de;
10785 list *clients = NULL;
10786 int retval = 0;
10787
10788 /* Add the channel to the client -> channels hash table */
10789 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10790 retval = 1;
10791 incrRefCount(channel);
10792 /* Add the client to the channel -> list of clients hash table */
10793 de = dictFind(server.pubsub_channels,channel);
10794 if (de == NULL) {
10795 clients = listCreate();
10796 dictAdd(server.pubsub_channels,channel,clients);
10797 incrRefCount(channel);
10798 } else {
10799 clients = dictGetEntryVal(de);
10800 }
10801 listAddNodeTail(clients,c);
10802 }
10803 /* Notify the client */
10804 addReply(c,shared.mbulk3);
10805 addReply(c,shared.subscribebulk);
10806 addReplyBulk(c,channel);
10807 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10808 return retval;
10809 }
10810
10811 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10812 * 0 if the client was not subscribed to the specified channel. */
10813 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10814 struct dictEntry *de;
10815 list *clients;
10816 listNode *ln;
10817 int retval = 0;
10818
10819 /* Remove the channel from the client -> channels hash table */
10820 incrRefCount(channel); /* channel may be just a pointer to the same object
10821 we have in the hash tables. Protect it... */
10822 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10823 retval = 1;
10824 /* Remove the client from the channel -> clients list hash table */
10825 de = dictFind(server.pubsub_channels,channel);
10826 assert(de != NULL);
10827 clients = dictGetEntryVal(de);
10828 ln = listSearchKey(clients,c);
10829 assert(ln != NULL);
10830 listDelNode(clients,ln);
10831 if (listLength(clients) == 0) {
10832 /* Free the list and associated hash entry at all if this was
10833 * the latest client, so that it will be possible to abuse
10834 * Redis PUBSUB creating millions of channels. */
10835 dictDelete(server.pubsub_channels,channel);
10836 }
10837 }
10838 /* Notify the client */
10839 if (notify) {
10840 addReply(c,shared.mbulk3);
10841 addReply(c,shared.unsubscribebulk);
10842 addReplyBulk(c,channel);
10843 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10844 listLength(c->pubsub_patterns));
10845
10846 }
10847 decrRefCount(channel); /* it is finally safe to release it */
10848 return retval;
10849 }
10850
10851 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10852 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10853 int retval = 0;
10854
10855 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10856 retval = 1;
10857 pubsubPattern *pat;
10858 listAddNodeTail(c->pubsub_patterns,pattern);
10859 incrRefCount(pattern);
10860 pat = zmalloc(sizeof(*pat));
10861 pat->pattern = getDecodedObject(pattern);
10862 pat->client = c;
10863 listAddNodeTail(server.pubsub_patterns,pat);
10864 }
10865 /* Notify the client */
10866 addReply(c,shared.mbulk3);
10867 addReply(c,shared.psubscribebulk);
10868 addReplyBulk(c,pattern);
10869 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10870 return retval;
10871 }
10872
10873 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10874 * 0 if the client was not subscribed to the specified channel. */
10875 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10876 listNode *ln;
10877 pubsubPattern pat;
10878 int retval = 0;
10879
10880 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10881 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10882 retval = 1;
10883 listDelNode(c->pubsub_patterns,ln);
10884 pat.client = c;
10885 pat.pattern = pattern;
10886 ln = listSearchKey(server.pubsub_patterns,&pat);
10887 listDelNode(server.pubsub_patterns,ln);
10888 }
10889 /* Notify the client */
10890 if (notify) {
10891 addReply(c,shared.mbulk3);
10892 addReply(c,shared.punsubscribebulk);
10893 addReplyBulk(c,pattern);
10894 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10895 listLength(c->pubsub_patterns));
10896 }
10897 decrRefCount(pattern);
10898 return retval;
10899 }
10900
10901 /* Unsubscribe from all the channels. Return the number of channels the
10902 * client was subscribed from. */
10903 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10904 dictIterator *di = dictGetIterator(c->pubsub_channels);
10905 dictEntry *de;
10906 int count = 0;
10907
10908 while((de = dictNext(di)) != NULL) {
10909 robj *channel = dictGetEntryKey(de);
10910
10911 count += pubsubUnsubscribeChannel(c,channel,notify);
10912 }
10913 dictReleaseIterator(di);
10914 return count;
10915 }
10916
10917 /* Unsubscribe from all the patterns. Return the number of patterns the
10918 * client was subscribed from. */
10919 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10920 listNode *ln;
10921 listIter li;
10922 int count = 0;
10923
10924 listRewind(c->pubsub_patterns,&li);
10925 while ((ln = listNext(&li)) != NULL) {
10926 robj *pattern = ln->value;
10927
10928 count += pubsubUnsubscribePattern(c,pattern,notify);
10929 }
10930 return count;
10931 }
10932
10933 /* Publish a message */
10934 static int pubsubPublishMessage(robj *channel, robj *message) {
10935 int receivers = 0;
10936 struct dictEntry *de;
10937 listNode *ln;
10938 listIter li;
10939
10940 /* Send to clients listening for that channel */
10941 de = dictFind(server.pubsub_channels,channel);
10942 if (de) {
10943 list *list = dictGetEntryVal(de);
10944 listNode *ln;
10945 listIter li;
10946
10947 listRewind(list,&li);
10948 while ((ln = listNext(&li)) != NULL) {
10949 redisClient *c = ln->value;
10950
10951 addReply(c,shared.mbulk3);
10952 addReply(c,shared.messagebulk);
10953 addReplyBulk(c,channel);
10954 addReplyBulk(c,message);
10955 receivers++;
10956 }
10957 }
10958 /* Send to clients listening to matching channels */
10959 if (listLength(server.pubsub_patterns)) {
10960 listRewind(server.pubsub_patterns,&li);
10961 channel = getDecodedObject(channel);
10962 while ((ln = listNext(&li)) != NULL) {
10963 pubsubPattern *pat = ln->value;
10964
10965 if (stringmatchlen((char*)pat->pattern->ptr,
10966 sdslen(pat->pattern->ptr),
10967 (char*)channel->ptr,
10968 sdslen(channel->ptr),0)) {
10969 addReply(pat->client,shared.mbulk4);
10970 addReply(pat->client,shared.pmessagebulk);
10971 addReplyBulk(pat->client,pat->pattern);
10972 addReplyBulk(pat->client,channel);
10973 addReplyBulk(pat->client,message);
10974 receivers++;
10975 }
10976 }
10977 decrRefCount(channel);
10978 }
10979 return receivers;
10980 }
10981
10982 static void subscribeCommand(redisClient *c) {
10983 int j;
10984
10985 for (j = 1; j < c->argc; j++)
10986 pubsubSubscribeChannel(c,c->argv[j]);
10987 }
10988
10989 static void unsubscribeCommand(redisClient *c) {
10990 if (c->argc == 1) {
10991 pubsubUnsubscribeAllChannels(c,1);
10992 return;
10993 } else {
10994 int j;
10995
10996 for (j = 1; j < c->argc; j++)
10997 pubsubUnsubscribeChannel(c,c->argv[j],1);
10998 }
10999 }
11000
11001 static void psubscribeCommand(redisClient *c) {
11002 int j;
11003
11004 for (j = 1; j < c->argc; j++)
11005 pubsubSubscribePattern(c,c->argv[j]);
11006 }
11007
11008 static void punsubscribeCommand(redisClient *c) {
11009 if (c->argc == 1) {
11010 pubsubUnsubscribeAllPatterns(c,1);
11011 return;
11012 } else {
11013 int j;
11014
11015 for (j = 1; j < c->argc; j++)
11016 pubsubUnsubscribePattern(c,c->argv[j],1);
11017 }
11018 }
11019
11020 static void publishCommand(redisClient *c) {
11021 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
11022 addReplyLongLong(c,receivers);
11023 }
11024
11025 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
11026 *
11027 * The implementation uses a per-DB hash table mapping keys to list of clients
11028 * WATCHing those keys, so that given a key that is going to be modified
11029 * we can mark all the associated clients as dirty.
11030 *
11031 * Also every client contains a list of WATCHed keys so that's possible to
11032 * un-watch such keys when the client is freed or when UNWATCH is called. */
11033
11034 /* In the client->watched_keys list we need to use watchedKey structures
11035 * as in order to identify a key in Redis we need both the key name and the
11036 * DB */
11037 typedef struct watchedKey {
11038 robj *key;
11039 redisDb *db;
11040 } watchedKey;
11041
11042 /* Watch for the specified key */
11043 static void watchForKey(redisClient *c, robj *key) {
11044 list *clients = NULL;
11045 listIter li;
11046 listNode *ln;
11047 watchedKey *wk;
11048
11049 /* Check if we are already watching for this key */
11050 listRewind(c->watched_keys,&li);
11051 while((ln = listNext(&li))) {
11052 wk = listNodeValue(ln);
11053 if (wk->db == c->db && equalStringObjects(key,wk->key))
11054 return; /* Key already watched */
11055 }
11056 /* This key is not already watched in this DB. Let's add it */
11057 clients = dictFetchValue(c->db->watched_keys,key);
11058 if (!clients) {
11059 clients = listCreate();
11060 dictAdd(c->db->watched_keys,key,clients);
11061 incrRefCount(key);
11062 }
11063 listAddNodeTail(clients,c);
11064 /* Add the new key to the lits of keys watched by this client */
11065 wk = zmalloc(sizeof(*wk));
11066 wk->key = key;
11067 wk->db = c->db;
11068 incrRefCount(key);
11069 listAddNodeTail(c->watched_keys,wk);
11070 }
11071
11072 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
11073 * flag is up to the caller. */
11074 static void unwatchAllKeys(redisClient *c) {
11075 listIter li;
11076 listNode *ln;
11077
11078 if (listLength(c->watched_keys) == 0) return;
11079 listRewind(c->watched_keys,&li);
11080 while((ln = listNext(&li))) {
11081 list *clients;
11082 watchedKey *wk;
11083
11084 /* Lookup the watched key -> clients list and remove the client
11085 * from the list */
11086 wk = listNodeValue(ln);
11087 clients = dictFetchValue(wk->db->watched_keys, wk->key);
11088 assert(clients != NULL);
11089 listDelNode(clients,listSearchKey(clients,c));
11090 /* Kill the entry at all if this was the only client */
11091 if (listLength(clients) == 0)
11092 dictDelete(wk->db->watched_keys, wk->key);
11093 /* Remove this watched key from the client->watched list */
11094 listDelNode(c->watched_keys,ln);
11095 decrRefCount(wk->key);
11096 zfree(wk);
11097 }
11098 }
11099
11100 /* "Touch" a key, so that if this key is being WATCHed by some client the
11101 * next EXEC will fail. */
11102 static void touchWatchedKey(redisDb *db, robj *key) {
11103 list *clients;
11104 listIter li;
11105 listNode *ln;
11106
11107 if (dictSize(db->watched_keys) == 0) return;
11108 clients = dictFetchValue(db->watched_keys, key);
11109 if (!clients) return;
11110
11111 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11112 /* Check if we are already watching for this key */
11113 listRewind(clients,&li);
11114 while((ln = listNext(&li))) {
11115 redisClient *c = listNodeValue(ln);
11116
11117 c->flags |= REDIS_DIRTY_CAS;
11118 }
11119 }
11120
11121 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11122 * flush but will be deleted as effect of the flushing operation should
11123 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11124 * a FLUSHALL operation (all the DBs flushed). */
11125 static void touchWatchedKeysOnFlush(int dbid) {
11126 listIter li1, li2;
11127 listNode *ln;
11128
11129 /* For every client, check all the waited keys */
11130 listRewind(server.clients,&li1);
11131 while((ln = listNext(&li1))) {
11132 redisClient *c = listNodeValue(ln);
11133 listRewind(c->watched_keys,&li2);
11134 while((ln = listNext(&li2))) {
11135 watchedKey *wk = listNodeValue(ln);
11136
11137 /* For every watched key matching the specified DB, if the
11138 * key exists, mark the client as dirty, as the key will be
11139 * removed. */
11140 if (dbid == -1 || wk->db->id == dbid) {
11141 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
11142 c->flags |= REDIS_DIRTY_CAS;
11143 }
11144 }
11145 }
11146 }
11147
11148 static void watchCommand(redisClient *c) {
11149 int j;
11150
11151 if (c->flags & REDIS_MULTI) {
11152 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11153 return;
11154 }
11155 for (j = 1; j < c->argc; j++)
11156 watchForKey(c,c->argv[j]);
11157 addReply(c,shared.ok);
11158 }
11159
11160 static void unwatchCommand(redisClient *c) {
11161 unwatchAllKeys(c);
11162 c->flags &= (~REDIS_DIRTY_CAS);
11163 addReply(c,shared.ok);
11164 }
11165
11166 /* ================================= Debugging ============================== */
11167
11168 /* Compute the sha1 of string at 's' with 'len' bytes long.
11169 * The SHA1 is then xored againt the string pointed by digest.
11170 * Since xor is commutative, this operation is used in order to
11171 * "add" digests relative to unordered elements.
11172 *
11173 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11174 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11175 SHA1_CTX ctx;
11176 unsigned char hash[20], *s = ptr;
11177 int j;
11178
11179 SHA1Init(&ctx);
11180 SHA1Update(&ctx,s,len);
11181 SHA1Final(hash,&ctx);
11182
11183 for (j = 0; j < 20; j++)
11184 digest[j] ^= hash[j];
11185 }
11186
11187 static void xorObjectDigest(unsigned char *digest, robj *o) {
11188 o = getDecodedObject(o);
11189 xorDigest(digest,o->ptr,sdslen(o->ptr));
11190 decrRefCount(o);
11191 }
11192
11193 /* This function instead of just computing the SHA1 and xoring it
11194 * against diget, also perform the digest of "digest" itself and
11195 * replace the old value with the new one.
11196 *
11197 * So the final digest will be:
11198 *
11199 * digest = SHA1(digest xor SHA1(data))
11200 *
11201 * This function is used every time we want to preserve the order so
11202 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11203 *
11204 * Also note that mixdigest("foo") followed by mixdigest("bar")
11205 * will lead to a different digest compared to "fo", "obar".
11206 */
11207 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11208 SHA1_CTX ctx;
11209 char *s = ptr;
11210
11211 xorDigest(digest,s,len);
11212 SHA1Init(&ctx);
11213 SHA1Update(&ctx,digest,20);
11214 SHA1Final(digest,&ctx);
11215 }
11216
11217 static void mixObjectDigest(unsigned char *digest, robj *o) {
11218 o = getDecodedObject(o);
11219 mixDigest(digest,o->ptr,sdslen(o->ptr));
11220 decrRefCount(o);
11221 }
11222
11223 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11224 * are not ordered, we use a trick: every aggregate digest is the xor
11225 * of the digests of their elements. This way the order will not change
11226 * the result. For list instead we use a feedback entering the output digest
11227 * as input in order to ensure that a different ordered list will result in
11228 * a different digest. */
11229 static void computeDatasetDigest(unsigned char *final) {
11230 unsigned char digest[20];
11231 char buf[128];
11232 dictIterator *di = NULL;
11233 dictEntry *de;
11234 int j;
11235 uint32_t aux;
11236
11237 memset(final,0,20); /* Start with a clean result */
11238
11239 for (j = 0; j < server.dbnum; j++) {
11240 redisDb *db = server.db+j;
11241
11242 if (dictSize(db->dict) == 0) continue;
11243 di = dictGetIterator(db->dict);
11244
11245 /* hash the DB id, so the same dataset moved in a different
11246 * DB will lead to a different digest */
11247 aux = htonl(j);
11248 mixDigest(final,&aux,sizeof(aux));
11249
11250 /* Iterate this DB writing every entry */
11251 while((de = dictNext(di)) != NULL) {
11252 sds key;
11253 robj *keyobj, *o;
11254 time_t expiretime;
11255
11256 memset(digest,0,20); /* This key-val digest */
11257 key = dictGetEntryKey(de);
11258 keyobj = createStringObject(key,sdslen(key));
11259
11260 mixDigest(digest,key,sdslen(key));
11261
11262 /* Make sure the key is loaded if VM is active */
11263 o = lookupKeyRead(db,keyobj);
11264
11265 aux = htonl(o->type);
11266 mixDigest(digest,&aux,sizeof(aux));
11267 expiretime = getExpire(db,keyobj);
11268
11269 /* Save the key and associated value */
11270 if (o->type == REDIS_STRING) {
11271 mixObjectDigest(digest,o);
11272 } else if (o->type == REDIS_LIST) {
11273 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11274 listTypeEntry entry;
11275 while(listTypeNext(li,&entry)) {
11276 robj *eleobj = listTypeGet(&entry);
11277 mixObjectDigest(digest,eleobj);
11278 decrRefCount(eleobj);
11279 }
11280 listTypeReleaseIterator(li);
11281 } else if (o->type == REDIS_SET) {
11282 dict *set = o->ptr;
11283 dictIterator *di = dictGetIterator(set);
11284 dictEntry *de;
11285
11286 while((de = dictNext(di)) != NULL) {
11287 robj *eleobj = dictGetEntryKey(de);
11288
11289 xorObjectDigest(digest,eleobj);
11290 }
11291 dictReleaseIterator(di);
11292 } else if (o->type == REDIS_ZSET) {
11293 zset *zs = o->ptr;
11294 dictIterator *di = dictGetIterator(zs->dict);
11295 dictEntry *de;
11296
11297 while((de = dictNext(di)) != NULL) {
11298 robj *eleobj = dictGetEntryKey(de);
11299 double *score = dictGetEntryVal(de);
11300 unsigned char eledigest[20];
11301
11302 snprintf(buf,sizeof(buf),"%.17g",*score);
11303 memset(eledigest,0,20);
11304 mixObjectDigest(eledigest,eleobj);
11305 mixDigest(eledigest,buf,strlen(buf));
11306 xorDigest(digest,eledigest,20);
11307 }
11308 dictReleaseIterator(di);
11309 } else if (o->type == REDIS_HASH) {
11310 hashTypeIterator *hi;
11311 robj *obj;
11312
11313 hi = hashTypeInitIterator(o);
11314 while (hashTypeNext(hi) != REDIS_ERR) {
11315 unsigned char eledigest[20];
11316
11317 memset(eledigest,0,20);
11318 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
11319 mixObjectDigest(eledigest,obj);
11320 decrRefCount(obj);
11321 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
11322 mixObjectDigest(eledigest,obj);
11323 decrRefCount(obj);
11324 xorDigest(digest,eledigest,20);
11325 }
11326 hashTypeReleaseIterator(hi);
11327 } else {
11328 redisPanic("Unknown object type");
11329 }
11330 /* If the key has an expire, add it to the mix */
11331 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11332 /* We can finally xor the key-val digest to the final digest */
11333 xorDigest(final,digest,20);
11334 decrRefCount(keyobj);
11335 }
11336 dictReleaseIterator(di);
11337 }
11338 }
11339
11340 static void debugCommand(redisClient *c) {
11341 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11342 *((char*)-1) = 'x';
11343 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11344 if (rdbSave(server.dbfilename) != REDIS_OK) {
11345 addReply(c,shared.err);
11346 return;
11347 }
11348 emptyDb();
11349 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11350 addReply(c,shared.err);
11351 return;
11352 }
11353 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11354 addReply(c,shared.ok);
11355 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11356 emptyDb();
11357 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11358 addReply(c,shared.err);
11359 return;
11360 }
11361 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11362 addReply(c,shared.ok);
11363 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11364 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11365 robj *val;
11366
11367 if (!de) {
11368 addReply(c,shared.nokeyerr);
11369 return;
11370 }
11371 val = dictGetEntryVal(de);
11372 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11373 val->storage == REDIS_VM_SWAPPING)) {
11374 char *strenc;
11375 char buf[128];
11376
11377 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11378 strenc = strencoding[val->encoding];
11379 } else {
11380 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11381 strenc = buf;
11382 }
11383 addReplySds(c,sdscatprintf(sdsempty(),
11384 "+Value at:%p refcount:%d "
11385 "encoding:%s serializedlength:%lld\r\n",
11386 (void*)val, val->refcount,
11387 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11388 } else {
11389 vmpointer *vp = (vmpointer*) val;
11390 addReplySds(c,sdscatprintf(sdsempty(),
11391 "+Value swapped at: page %llu "
11392 "using %llu pages\r\n",
11393 (unsigned long long) vp->page,
11394 (unsigned long long) vp->usedpages));
11395 }
11396 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11397 lookupKeyRead(c->db,c->argv[2]);
11398 addReply(c,shared.ok);
11399 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11400 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11401 robj *val;
11402 vmpointer *vp;
11403
11404 if (!server.vm_enabled) {
11405 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11406 return;
11407 }
11408 if (!de) {
11409 addReply(c,shared.nokeyerr);
11410 return;
11411 }
11412 val = dictGetEntryVal(de);
11413 /* Swap it */
11414 if (val->storage != REDIS_VM_MEMORY) {
11415 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11416 } else if (val->refcount != 1) {
11417 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11418 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11419 dictGetEntryVal(de) = vp;
11420 addReply(c,shared.ok);
11421 } else {
11422 addReply(c,shared.err);
11423 }
11424 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11425 long keys, j;
11426 robj *key, *val;
11427 char buf[128];
11428
11429 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11430 return;
11431 for (j = 0; j < keys; j++) {
11432 snprintf(buf,sizeof(buf),"key:%lu",j);
11433 key = createStringObject(buf,strlen(buf));
11434 if (lookupKeyRead(c->db,key) != NULL) {
11435 decrRefCount(key);
11436 continue;
11437 }
11438 snprintf(buf,sizeof(buf),"value:%lu",j);
11439 val = createStringObject(buf,strlen(buf));
11440 dbAdd(c->db,key,val);
11441 decrRefCount(key);
11442 }
11443 addReply(c,shared.ok);
11444 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11445 unsigned char digest[20];
11446 sds d = sdsnew("+");
11447 int j;
11448
11449 computeDatasetDigest(digest);
11450 for (j = 0; j < 20; j++)
11451 d = sdscatprintf(d, "%02x",digest[j]);
11452
11453 d = sdscatlen(d,"\r\n",2);
11454 addReplySds(c,d);
11455 } else {
11456 addReplySds(c,sdsnew(
11457 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11458 }
11459 }
11460
11461 static void _redisAssert(char *estr, char *file, int line) {
11462 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11463 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11464 #ifdef HAVE_BACKTRACE
11465 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11466 *((char*)-1) = 'x';
11467 #endif
11468 }
11469
11470 static void _redisPanic(char *msg, char *file, int line) {
11471 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11472 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11473 #ifdef HAVE_BACKTRACE
11474 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11475 *((char*)-1) = 'x';
11476 #endif
11477 }
11478
11479 /* =================================== Main! ================================ */
11480
11481 #ifdef __linux__
/* Return the number stored in /proc/sys/vm/overcommit_memory.
 *
 * -1 is returned if the file can't be opened, if nothing can be read from
 * it, or if what was read does not start with a number. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    char *end;
    long value;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* strtol() reports whether any digit was consumed: this way content
     * that is not a number is not mistaken for the value 0, which is the
     * one value callers warn about. */
    value = strtol(buf,&end,10);
    if (end == buf) return -1;
    return (int) value;
}
11495
11496 void linuxOvercommitMemoryWarning(void) {
11497 if (linuxOvercommitMemoryValue() == 0) {
11498 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11499 }
11500 }
11501 #endif /* __linux__ */
11502
11503 static void daemonize(void) {
11504 int fd;
11505 FILE *fp;
11506
11507 if (fork() != 0) exit(0); /* parent exits */
11508 setsid(); /* create a new session */
11509
11510 /* Every output goes to /dev/null. If Redis is daemonized but
11511 * the 'logfile' is set to 'stdout' in the configuration file
11512 * it will not log at all. */
11513 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11514 dup2(fd, STDIN_FILENO);
11515 dup2(fd, STDOUT_FILENO);
11516 dup2(fd, STDERR_FILENO);
11517 if (fd > STDERR_FILENO) close(fd);
11518 }
11519 /* Try to write the pid file */
11520 fp = fopen(server.pidfile,"w");
11521 if (fp) {
11522 fprintf(fp,"%d\n",getpid());
11523 fclose(fp);
11524 }
11525 }
11526
11527 static void version() {
11528 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11529 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11530 exit(0);
11531 }
11532
/* Print the command line usage on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11538
11539 int main(int argc, char **argv) {
11540 time_t start;
11541
11542 initServerConfig();
11543 sortCommandTable();
11544 if (argc == 2) {
11545 if (strcmp(argv[1], "-v") == 0 ||
11546 strcmp(argv[1], "--version") == 0) version();
11547 if (strcmp(argv[1], "--help") == 0) usage();
11548 resetServerSaveParams();
11549 loadServerConfig(argv[1]);
11550 } else if ((argc > 2)) {
11551 usage();
11552 } else {
11553 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11554 }
11555 if (server.daemonize) daemonize();
11556 initServer();
11557 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11558 #ifdef __linux__
11559 linuxOvercommitMemoryWarning();
11560 #endif
11561 start = time(NULL);
11562 if (server.appendonly) {
11563 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11564 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11565 } else {
11566 if (rdbLoad(server.dbfilename) == REDIS_OK)
11567 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11568 }
11569 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11570 aeSetBeforeSleepProc(server.el,beforeSleep);
11571 aeMain(server.el);
11572 aeDeleteEventLoop(server.el);
11573 return 0;
11574 }
11575
11576 /* ============================= Backtrace support ========================= */
11577
11578 #ifdef HAVE_BACKTRACE
11579 static char *findFuncName(void *pointer, unsigned long *offset);
11580
11581 static void *getMcontextEip(ucontext_t *uc) {
11582 #if defined(__FreeBSD__)
11583 return (void*) uc->uc_mcontext.mc_eip;
11584 #elif defined(__dietlibc__)
11585 return (void*) uc->uc_mcontext.eip;
11586 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11587 #if __x86_64__
11588 return (void*) uc->uc_mcontext->__ss.__rip;
11589 #else
11590 return (void*) uc->uc_mcontext->__ss.__eip;
11591 #endif
11592 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11593 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11594 return (void*) uc->uc_mcontext->__ss.__rip;
11595 #else
11596 return (void*) uc->uc_mcontext->__ss.__eip;
11597 #endif
11598 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11599 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11600 #elif defined(__ia64__) /* Linux IA64 */
11601 return (void*) uc->uc_mcontext.sc_ip;
11602 #else
11603 return NULL;
11604 #endif
11605 }
11606
11607 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11608 void *trace[100];
11609 char **messages = NULL;
11610 int i, trace_size = 0;
11611 unsigned long offset=0;
11612 ucontext_t *uc = (ucontext_t*) secret;
11613 sds infostring;
11614 REDIS_NOTUSED(info);
11615
11616 redisLog(REDIS_WARNING,
11617 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11618 infostring = genRedisInfoString();
11619 redisLog(REDIS_WARNING, "%s",infostring);
11620 /* It's not safe to sdsfree() the returned string under memory
11621 * corruption conditions. Let it leak as we are going to abort */
11622
11623 trace_size = backtrace(trace, 100);
11624 /* overwrite sigaction with caller's address */
11625 if (getMcontextEip(uc) != NULL) {
11626 trace[1] = getMcontextEip(uc);
11627 }
11628 messages = backtrace_symbols(trace, trace_size);
11629
11630 for (i=1; i<trace_size; ++i) {
11631 char *fn = findFuncName(trace[i], &offset), *p;
11632
11633 p = strchr(messages[i],'+');
11634 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11635 redisLog(REDIS_WARNING,"%s", messages[i]);
11636 } else {
11637 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11638 }
11639 }
11640 /* free(messages); Don't call free() with possibly corrupted memory. */
11641 _exit(0);
11642 }
11643
11644 static void sigtermHandler(int sig) {
11645 REDIS_NOTUSED(sig);
11646
11647 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11648 server.shutdown_asap = 1;
11649 }
11650
11651 static void setupSigSegvAction(void) {
11652 struct sigaction act;
11653
11654 sigemptyset (&act.sa_mask);
11655 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11656 * is used. Otherwise, sa_handler is used */
11657 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11658 act.sa_sigaction = segvHandler;
11659 sigaction (SIGSEGV, &act, NULL);
11660 sigaction (SIGBUS, &act, NULL);
11661 sigaction (SIGFPE, &act, NULL);
11662 sigaction (SIGILL, &act, NULL);
11663 sigaction (SIGBUS, &act, NULL);
11664
11665 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11666 act.sa_handler = sigtermHandler;
11667 sigaction (SIGTERM, &act, NULL);
11668 return;
11669 }
11670
11671 #include "staticsymbols.h"
11672 /* This function try to convert a pointer into a function name. It's used in
11673 * oreder to provide a backtrace under segmentation fault that's able to
11674 * display functions declared as static (otherwise the backtrace is useless). */
11675 static char *findFuncName(void *pointer, unsigned long *offset){
11676 int i, ret = -1;
11677 unsigned long off, minoff = 0;
11678
11679 /* Try to match against the Symbol with the smallest offset */
11680 for (i=0; symsTable[i].pointer; i++) {
11681 unsigned long lp = (unsigned long) pointer;
11682
11683 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11684 off=lp-symsTable[i].pointer;
11685 if (ret < 0 || off < minoff) {
11686 minoff=off;
11687 ret=i;
11688 }
11689 }
11690 }
11691 if (ret == -1) return NULL;
11692 *offset = minoff;
11693 return symsTable[ret].name;
11694 }
11695 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* Nothing to do. */
}
11698 #endif /* HAVE_BACKTRACE */
11699
11700
11701
11702 /* The End */
11703
11704
11705