]> git.saurik.com Git - redis.git/blob - redis.c
cb7bca8e100a65ced0daa9942f822ecfb5bbd0ea
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.1" /* version string of this source tree */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
80 #include "release.h" /* Release and/or git repository information */
81
82 /* Generic status codes: REDIS_OK is 0, REDIS_ERR is -1 */
83 #define REDIS_OK 0
84 #define REDIS_ERR -1
85
86 /* Static server configuration: compile-time defaults and limits */
87 #define REDIS_SERVERPORT 6379 /* TCP port */
88 #define REDIS_MAXIDLETIME (60*5) /* default client timeout, 300 (NOTE(review): presumably seconds - confirm) */
89 #define REDIS_IOBUF_LEN 1024
90 #define REDIS_LOADBUF_LEN 1024
91 #define REDIS_STATIC_ARGS 8
92 #define REDIS_DEFAULT_DBNUM 16
93 #define REDIS_CONFIGLINE_MAX 1024
94 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
95 #define REDIS_MAX_SYNC_TIME 60 /* A slave can't take longer than this to sync */
96 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* look up 10 expires per loop */
97 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) /* 64 KiB */
98 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* 256 MiB: max bytes in inline command */
99
100 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
101 #define REDIS_WRITEV_THRESHOLD 3
102 /* Max number of iovecs used for each writev call */
103 #define REDIS_WRITEV_IOVEC_COUNT 256
104
105 /* Hash table parameters */
106 #define REDIS_HT_MINFILL 10 /* Minimum hash table fill, as a percentage (10%) */
107
108 /* Command flags: distinct bits that are OR-ed together in redisCommand.flags */
109 #define REDIS_CMD_BULK 1 /* Bulk write command */
110 #define REDIS_CMD_INLINE 2 /* Inline command */
111 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
112 this flag will return an error when the 'maxmemory' option is set in the
113 config file and the server is using more than maxmemory bytes of memory.
114 In short, these commands are denied on low memory conditions. */
115 #define REDIS_CMD_DENYOOM 4
116 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
117
118 /* Object types, stored in the 4-bit 'type' bit-field of redisObject and vmPointer */
119 #define REDIS_STRING 0
120 #define REDIS_LIST 1
121 #define REDIS_SET 2
122 #define REDIS_ZSET 3
123 #define REDIS_HASH 4
124 #define REDIS_VMPOINTER 8 /* 'type' value carried by vmPointer structures */
125
126 /* Object encodings. Some kinds of objects, like Strings and Hashes, can be
127 * internally represented in multiple ways. The 'encoding' field of the object
128 * is set to one of these values. */
129 #define REDIS_ENCODING_RAW 0 /* Raw representation */
130 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
131 #define REDIS_ENCODING_HT 2 /* Encoded as hash table */
132 #define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
133 #define REDIS_ENCODING_LIST 4 /* Encoded as linked list */
134 #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
135
136 static char* strencoding[] = { /* encoding names, listed in REDIS_ENCODING_* order (0 = raw ... 5 = ziplist) */
137 "raw", "int", "hashtable", "zipmap", "list", "ziplist"
138 };
139
140 /* Object types only used for dumping to disk (the values do not fit the 4-bit in-memory 'type' bit-field) */
141 #define REDIS_EXPIRETIME 253
142 #define REDIS_SELECTDB 254
143 #define REDIS_EOF 255
144
145 /* Defines related to the dump file format. Storing 32 bit lengths for short
146 * keys requires a lot of space, so we check the most significant 2 bits of
147 * the first byte to interpret the length:
148 *
149 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
150 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
151 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
152 * 11|000000 this means: a specially encoded object will follow. The six bit
153 * number specifies the kind of object that follows.
154 * See the REDIS_RDB_ENC_* defines.
155 *
156 * Lengths up to 63 are stored using a single byte; most DB keys, and many
157 * values, will fit inside. */
158 #define REDIS_RDB_6BITLEN 0
159 #define REDIS_RDB_14BITLEN 1
160 #define REDIS_RDB_32BITLEN 2
161 #define REDIS_RDB_ENCVAL 3
162 #define REDIS_RDB_LENERR UINT_MAX
163
164 /* When the length of a string object stored on disk has the first two bits
165 * set, the remaining six bits specify a special encoding for the object
166 * according to the following defines: */
167 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
168 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
169 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
170 #define REDIS_RDB_ENC_LZF 3 /* LZF-compressed string (see lzf.h) */
171
172 /* Virtual memory object->storage field. */
173 #define REDIS_VM_MEMORY 0 /* The object is in memory */
174 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
175 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object to disk */
176 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
177
178 /* Virtual memory static configuration stuff.
179 * Check vmFindContiguousPages() to know more about these magic numbers. */
180 #define REDIS_VM_MAX_NEAR_PAGES 65536
181 #define REDIS_VM_MAX_RANDOM_JUMP 4096
182 #define REDIS_VM_MAX_THREADS 32
183 #define REDIS_THREAD_STACK_SIZE (1024*1024*4) /* 4 MiB */
184 /* The following is the *percentage* of completed I/O jobs to process when the
185 * handler is called. While Virtual Memory I/O operations are performed by
186 * threads, these operations must be processed by the main thread when completed
187 * in order to take effect. */
188 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
189
190 /* Client flags: distinct bits that are OR-ed together in redisClient.flags */
191 #define REDIS_SLAVE 1 /* This client is a slave server */
192 #define REDIS_MASTER 2 /* This client is a master server */
193 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
194 #define REDIS_MULTI 8 /* This client is in a MULTI context */
195 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
196 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
197 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
198
199 /* Slave replication state - slave side */
200 #define REDIS_REPL_NONE 0 /* No active replication */
201 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
202 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
203
204 /* Slave replication state - from the point of view of the master.
205 * Note that in the SEND_BULK and ONLINE states the slave receives new updates
206 * in its output queue. In the WAIT_BGSAVE states instead the server is waiting
207 * to start the next background saving in order to send updates to it. */
208 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits for a bgsave to start feeding it */
209 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits for the bgsave to end to start the bulk DB transmission */
210 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
211 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
212
213 /* List related stuff: selectors for the two ends of a list */
214 #define REDIS_HEAD 0
215 #define REDIS_TAIL 1
216
217 /* Sort operations (NOTE(review): presumably the values stored in redisSortOperation.type - confirm) */
218 #define REDIS_SORT_GET 0
219 #define REDIS_SORT_ASC 1
220 #define REDIS_SORT_DESC 2
221 #define REDIS_SORTKEY_MAX 1024
222
223 /* Log levels, in increasing numeric order from REDIS_DEBUG (0) to REDIS_WARNING (3) */
224 #define REDIS_DEBUG 0
225 #define REDIS_VERBOSE 1
226 #define REDIS_NOTICE 2
227 #define REDIS_WARNING 3
228
/* Anti-warning macro: evaluates V and discards the result, which silences
 * "unused parameter/variable" warnings. The argument is parenthesized so
 * that any expression (for instance a + b) is cast to void as a whole
 * instead of only its first operand. */
#define REDIS_NOTUSED(V) ((void)(V))
231
232 #define ZSKIPLIST_MAXLEVEL 32 /* Max skiplist level; should be enough for 2^32 elements */
233 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
234
235 /* Append only defines (NOTE(review): presumably the values of server.appendfsync - confirm) */
236 #define APPENDFSYNC_NO 0
237 #define APPENDFSYNC_ALWAYS 1
238 #define APPENDFSYNC_EVERYSEC 2
239
240 /* Zip structure related defaults (NOTE(review): presumably the initial values of the server.hash_max_zipmap_* and server.list_max_ziplist_* fields - confirm) */
241 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
242 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
243 #define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
244 #define REDIS_LIST_MAX_ZIPLIST_VALUE 32
245
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, passes the stringified expression plus
 * __FILE__/__LINE__ to _redisAssert() and then calls _exit(1); otherwise it
 * evaluates to (void)0.
 * redisPanic(_e): always passes the stringified argument plus
 * __FILE__/__LINE__ to _redisPanic() and then calls _exit(1).
 *
 * Both expansions are fully parenthesized so that each one is a single
 * expression in every context. Without the outer parentheses a use such as
 * "cond ? (void)0 : redisPanic(x)" would parse as
 * "(cond ? (void)0 : _redisPanic(...)), _exit(1)" and exit unconditionally. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
251
252 /*================================= Data types ============================== */
253
254 /* A redis object, that is a type able to hold a string / list / set / sorted set / hash */
255
256 /* The actual Redis Object: a typed, reference-counted header in front of the value */
257 typedef struct redisObject {
258 unsigned type:4; /* one of the object type defines (REDIS_STRING, REDIS_LIST, ...) */
259 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
260 unsigned encoding:4; /* one of the REDIS_ENCODING_* values */
261 unsigned lru:22; /* lru time (relative to server.lruclock) */
262 int refcount; /* reference count, see incrRefCount() / decrRefCount() */
263 void *ptr; /* the value itself (NOTE(review): presumably interpreted according to type/encoding - confirm) */
264 /* NOTE(review): no VM-specific fields follow in this struct. A value that
265 * lives in the swap file is described by the separate vmPointer structure,
266 * which shares the leading type/storage bit-fields with this one, so
267 * there is no extra per-object VM tail declared here. */
268 } robj;
269
270 /* The VM pointer structure - identifies an object in the swap file.
271 *
272 * This object is stored in place of the value
273 * object in the main key->value hash table representing a database.
274 * Note that the first fields (type, storage) are the same as the redisObject
275 * structure so that vmPointer structures can be accessed even when cast
276 * to redisObject structures.
277 *
278 * This is useful as we don't know whether a value object is on disk or not, but
279 * we are always able to read obj->storage to check this. For vmPointer
280 * structures "type" is set to REDIS_VMPOINTER (even if without this field it
281 * is still possible to check the kind of object from the value of 'storage').*/
282 typedef struct vmPointer {
283 unsigned type:4;
284 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
285 unsigned notused:26; /* padding: 4 + 2 + 26 = 32 bits */
286 unsigned int vtype; /* type of the object stored in the swap file */
287 off_t page; /* the page at which the object is stored on disk */
288 off_t usedpages; /* number of pages used on disk */
289 } vmpointer;
290
/* Macro used to initialize a Redis object allocated on the stack.
 *
 * _var is the robj itself (an lvalue, not a pointer) and _ptr becomes its
 * 'ptr' member. The object is set up as a REDIS_STRING with
 * REDIS_ENCODING_RAW encoding, REDIS_VM_MEMORY storage and a refcount of 1.
 * The 'lru' bit-field is not touched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion has no trailing semicolon: the caller's own ';' ends the
 * do/while(0), so the macro is exactly one statement and stays valid in an
 * unbraced if/else. Both arguments are parenthesized in the body. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
302
303 typedef struct redisDb { /* One database: the keyspace plus its per-key side tables */
304 dict *dict; /* The keyspace for this DB */
305 dict *expires; /* Timeout of keys with a timeout set */
306 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
307 dict *io_keys; /* Keys with clients waiting for VM I/O */
308 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
309 int id; /* NOTE(review): presumably the index of this DB - confirm */
310 } redisDb;
311
312 /* Client MULTI/EXEC state: one entry of multiState.commands */
313 typedef struct multiCmd {
314 robj **argv; /* argument vector of the command */
315 int argc; /* number of entries in argv */
316 struct redisCommand *cmd; /* command table entry for this command */
317 } multiCmd;
318
319 typedef struct multiState { /* Per-client MULTI/EXEC state, embedded as redisClient.mstate */
320 multiCmd *commands; /* Array of MULTI commands */
321 int count; /* Total number of MULTI commands */
322 } multiState;
323
324 /* With multiplexing we need to keep per-client state.
325 * Clients are kept in a linked list. */
326 typedef struct redisClient {
327 int fd; /* NOTE(review): presumably the client socket descriptor - confirm */
328 redisDb *db;
329 int dictid;
330 sds querybuf;
331 robj **argv, **mbargv;
332 int argc, mbargc;
333 int bulklen; /* bulk read len. -1 if not in bulk read mode */
334 int multibulk; /* multi bulk command format active */
335 list *reply;
336 int sentlen;
337 time_t lastinteraction; /* time of the last interaction, used for timeout */
338 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
339 int slaveseldb; /* slave selected db, if this client is a slave */
340 int authenticated; /* when requirepass is non-NULL */
341 int replstate; /* replication state if this is a slave */
342 int repldbfd; /* replication DB file descriptor */
343 long repldboff; /* replication DB file offset (NOTE(review): long, while repldbsize is off_t) */
344 off_t repldbsize; /* replication DB file size */
345 multiState mstate; /* MULTI/EXEC state */
346 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
347 * operation such as BLPOP. Otherwise NULL. */
348 int blocking_keys_num; /* Number of blocking keys */
349 time_t blockingto; /* Blocking operation timeout. If UNIX current time
350 * is >= blockingto then the operation timed out. */
351 list *io_keys; /* Keys this client is waiting to be loaded from the
352 * swap file in order to continue. */
353 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
354 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
355 list *pubsub_patterns; /* patterns a client is interested in (PSUBSCRIBE) */
356 } redisClient;
357
358 struct saveparam { /* One entry of server.saveparams (NOTE(review): presumably "save after <seconds> if at least <changes> changes" - confirm) */
359 time_t seconds;
360 int changes;
361 };
362
363 /* Global server state structure */
364 struct redisServer {
365 int port;
366 int fd;
367 redisDb *db;
368 long long dirty; /* changes to DB from the last save */
369 list *clients;
370 list *slaves, *monitors;
371 char neterr[ANET_ERR_LEN];
372 aeEventLoop *el;
373 int cronloops; /* number of times the cron function run */
374 list *objfreelist; /* A list of freed objects to avoid malloc() */
375 time_t lastsave; /* Unix time of the last successful save */
376 /* Fields used only for stats */
377 time_t stat_starttime; /* server start time */
378 long long stat_numcommands; /* number of processed commands */
379 long long stat_numconnections; /* number of connections received */
380 long long stat_expiredkeys; /* number of expired keys */
381 /* Configuration */
382 int verbosity;
383 int glueoutputbuf;
384 int maxidletime;
385 int dbnum;
386 int daemonize;
387 int appendonly;
388 int appendfsync;
389 int no_appendfsync_on_rewrite;
390 int shutdown_asap;
391 time_t lastfsync;
392 int appendfd;
393 int appendseldb;
394 char *pidfile;
395 pid_t bgsavechildpid;
396 pid_t bgrewritechildpid;
397 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
398 sds aofbuf; /* AOF buffer, written before entering the event loop */
399 struct saveparam *saveparams;
400 int saveparamslen;
401 char *logfile;
402 char *bindaddr;
403 char *dbfilename;
404 char *appendfilename;
405 char *requirepass;
406 int rdbcompression;
407 int activerehashing;
408 /* Replication related */
409 int isslave;
410 char *masterauth;
411 char *masterhost;
412 int masterport;
413 redisClient *master; /* client that is master for this slave */
414 int replstate;
415 unsigned int maxclients;
416 unsigned long long maxmemory;
417 unsigned int blpop_blocked_clients;
418 unsigned int vm_blocked_clients;
419 /* Sort parameters - qsort_r() is only available under BSD so we
420 * have to keep this state global, in order to pass it to sortCompare() */
421 int sort_desc;
422 int sort_alpha;
423 int sort_bypattern;
424 /* Virtual memory configuration */
425 int vm_enabled;
426 char *vm_swap_file;
427 off_t vm_page_size;
428 off_t vm_pages;
429 unsigned long long vm_max_memory;
430 /* Zip structure config */
431 size_t hash_max_zipmap_entries;
432 size_t hash_max_zipmap_value;
433 size_t list_max_ziplist_entries;
434 size_t list_max_ziplist_value;
435 /* Virtual memory state */
436 FILE *vm_fp;
437 int vm_fd;
438 off_t vm_next_page; /* Next probably empty page */
439 off_t vm_near_pages; /* Number of pages allocated sequentially */
440 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
441 time_t unixtime; /* Unix time sampled every second. */
442 /* Virtual memory I/O threads stuff */
443 /* An I/O thread processes an element taken from the io_newjobs queue and
444 * puts the result of the operation in the io_processed list. While the
445 * job is being processed, it's put on the io_processing queue. */
446 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
447 list *io_processing; /* List of VM I/O jobs being processed */
448 list *io_processed; /* List of VM I/O jobs already processed */
449 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
450 pthread_mutex_t io_mutex; /* lock to access the I/O job lists (NOTE(review): presumably io_newjobs/io_processing/io_processed - confirm) */
451 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
452 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
453 pthread_attr_t io_threads_attr; /* attributes for threads creation */
454 int io_active_threads; /* Number of running I/O threads */
455 int vm_max_threads; /* Max number of I/O threads running at the same time */
456 /* Our main thread is blocked on the event loop, looking for sockets ready
457 * to be read or written, so when a threaded I/O operation is ready to be
458 * processed by the main thread, the I/O thread will use a unix pipe to
459 * awake the main thread. The following are the two pipe FDs. */
460 int io_ready_pipe_read;
461 int io_ready_pipe_write;
462 /* Virtual memory stats */
463 unsigned long long vm_stats_used_pages;
464 unsigned long long vm_stats_swapped_objects;
465 unsigned long long vm_stats_swapouts;
466 unsigned long long vm_stats_swapins;
467 /* Pubsub */
468 dict *pubsub_channels; /* Map channels to list of subscribed clients */
469 list *pubsub_patterns; /* A list of pubsub_patterns */
470 /* Misc */
471 FILE *devnull;
472 unsigned lruclock:22; /* clock incrementing every minute, for LRU; same width as redisObject.lru */
473 unsigned lruclock_padding:10; /* pads the 22-bit clock to 32 bits */
474 };
475
476 typedef struct pubsubPattern { /* A (client, pattern) pair; see the pubsub_patterns lists */
477 redisClient *client;
478 robj *pattern;
479 } pubsubPattern;
480
481 typedef void redisCommandProc(redisClient *c); /* signature shared by every command handler */
482 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
483 struct redisCommand { /* One entry of the command table */
484 char *name; /* command name, e.g. "get" */
485 redisCommandProc *proc; /* handler implementing the command */
486 int arity; /* NOTE(review): presumably the argument count including the command name - confirm */
487 int flags; /* OR of REDIS_CMD_* flags */
488 /* Use a function to determine which keys need to be loaded
489 * in the background prior to executing this command. Takes precedence
490 * over vm_firstkey and others, ignored when NULL */
491 redisVmPreloadProc *vm_preload_proc;
492 /* What keys should be loaded in background when calling this command? */
493 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
494 int vm_lastkey; /* The last argument that's a key */
495 int vm_keystep; /* The step between first and last key */
496 };
497
498 struct redisFunctionSym { /* A function name paired with an address-sized value */
499 char *name;
500 unsigned long pointer; /* NOTE(review): presumably the function's address, used for stack traces - confirm */
501 };
502
503 typedef struct _redisSortObject { /* One element being sorted plus its sort key */
504 robj *obj;
505 union { /* only one member is meaningful at a time */
506 double score;
507 robj *cmpobj;
508 } u;
509 } redisSortObject;
510
511 typedef struct _redisSortOperation { /* One SORT operation and the pattern it applies */
512 int type; /* NOTE(review): presumably one of the REDIS_SORT_* values - confirm */
513 robj *pattern;
514 } redisSortOperation;
515
516 /* ZSETs use a specialized version of Skiplists */
517
518 typedef struct zskiplistNode {
519 struct zskiplistNode **forward; /* NOTE(review): presumably one forward pointer per level - confirm */
520 struct zskiplistNode *backward;
521 unsigned int *span; /* NOTE(review): presumably per-level distances used for rank queries - confirm */
522 double score;
523 robj *obj;
524 } zskiplistNode;
525
526 typedef struct zskiplist { /* Skiplist header: end nodes, element count and level */
527 struct zskiplistNode *header, *tail;
528 unsigned long length;
529 int level;
530 } zskiplist;
531
532 typedef struct zset { /* A sorted set: a dict paired with a skiplist */
533 dict *dict;
534 zskiplist *zsl;
535 } zset;
536
537 /* Our shared "common" objects, collected in the single global 'shared' */
538
539 #define REDIS_SHARED_INTEGERS 10000 /* number of entries in shared.integers[] */
540 struct sharedObjectsStruct {
541 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
542 *colon, *nullbulk, *nullmultibulk, *queued,
543 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
544 *outofrangeerr, *plus,
545 *select0, *select1, *select2, *select3, *select4,
546 *select5, *select6, *select7, *select8, *select9,
547 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
548 *mbulk4, *psubscribebulk, *punsubscribebulk,
549 *integers[REDIS_SHARED_INTEGERS];
550 } shared;
551
552 /* Global vars that are actually used as constants. The following double
553 * values are used for double on-disk serialization, and are initialized
554 * at runtime to avoid strange compiler optimizations. */
555
556 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
557
558 /* VM threaded I/O request message */
559 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
560 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
561 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
562 typedef struct iojob {
563 int type; /* Request type, REDIS_IOJOB_* */
564 redisDb *db;/* Redis database */
565 robj *key; /* This I/O request is about swapping this key */
566 robj *id; /* Unique identifier of this job:
567 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
568 vmpointer object for REDIS_IOJOB_LOAD. */
569 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
570 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
571 off_t page; /* Swap page where to read/write the object */
572 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
573 int canceled; /* True if this command was canceled by blocking side of VM */
574 pthread_t thread; /* ID of the thread processing this entry */
575 } iojob;
576
577 /*================================ Prototypes =============================== */ /* forward declarations of file-local (static) functions */
578
579 static void freeStringObject(robj *o);
580 static void freeListObject(robj *o);
581 static void freeSetObject(robj *o);
582 static void decrRefCount(void *o); /* takes void *, unlike incrRefCount(robj *) */
583 static robj *createObject(int type, void *ptr);
584 static void freeClient(redisClient *c);
585 static int rdbLoad(char *filename);
586 static void addReply(redisClient *c, robj *obj);
587 static void addReplySds(redisClient *c, sds s);
588 static void incrRefCount(robj *o);
589 static int rdbSaveBackground(char *filename);
590 static robj *createStringObject(char *ptr, size_t len);
591 static robj *dupStringObject(robj *o);
592 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
593 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
594 static void flushAppendOnlyFile(void);
595 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
596 static int syncWithMaster(void);
597 static robj *tryObjectEncoding(robj *o);
598 static robj *getDecodedObject(robj *o);
599 static int removeExpire(redisDb *db, robj *key);
600 static int expireIfNeeded(redisDb *db, robj *key);
601 static int deleteIfVolatile(redisDb *db, robj *key);
602 static int dbDelete(redisDb *db, robj *key);
603 static time_t getExpire(redisDb *db, robj *key);
604 static int setExpire(redisDb *db, robj *key, time_t when);
605 static void updateSlavesWaitingBgsave(int bgsaveerr);
606 static void freeMemoryIfNeeded(void);
607 static int processCommand(redisClient *c);
608 static void setupSigSegvAction(void);
609 static void rdbRemoveTempFile(pid_t childpid);
610 static void aofRemoveTempFile(pid_t childpid);
611 static size_t stringObjectLen(robj *o);
612 static void processInputBuffer(redisClient *c);
613 static zskiplist *zslCreate(void);
614 static void zslFree(zskiplist *zsl);
615 static void zslInsert(zskiplist *zsl, double score, robj *obj);
616 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask); /* same (el, fd, privdata, mask) shape as acceptHandler() */
617 static void initClientMultiState(redisClient *c);
618 static void freeClientMultiState(redisClient *c);
619 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
620 static void unblockClientWaitingData(redisClient *c);
621 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
622 static void vmInit(void);
623 static void vmMarkPagesFree(off_t page, off_t count);
624 static robj *vmLoadObject(robj *o);
625 static robj *vmPreviewObject(robj *o);
626 static int vmSwapOneObjectBlocking(void);
627 static int vmSwapOneObjectThreaded(void);
628 static int vmCanSwapOut(void);
629 static int tryFreeOneObjectFromFreelist(void);
630 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
631 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
632 static void vmCancelThreadedIOJob(robj *o);
633 static void lockThreadedIO(void);
634 static void unlockThreadedIO(void);
635 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
636 static void freeIOJob(iojob *j);
637 static void queueIOJob(iojob *j);
638 static int vmWriteObjectOnSwap(robj *o, off_t page);
639 static robj *vmReadObjectFromSwap(off_t page, int type);
640 static void waitEmptyIOJobsQueue(void);
641 static void vmReopenSwapFile(void);
642 static int vmFreePage(off_t page);
643 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); /* matches redisVmPreloadProc */
644 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); /* matches redisVmPreloadProc */
645 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
646 static int dontWaitForSwappedKey(redisClient *c, robj *key);
647 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
648 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
649 static struct redisCommand *lookupCommand(char *name);
650 static void call(redisClient *c, struct redisCommand *cmd);
651 static void resetClient(redisClient *c);
652 static void convertToRealHash(robj *o);
653 static void listTypeConvert(robj *o, int enc);
654 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
655 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
656 static void freePubsubPattern(void *p);
657 static int listMatchPubsubPattern(void *a, void *b);
658 static int compareStringObjects(robj *a, robj *b);
659 static int equalStringObjects(robj *a, robj *b);
/* Declared with an explicit (void) parameter list: before C23 an empty ()
 * declarator leaves the parameters unspecified, so calls with stray
 * arguments would not be diagnosed. */
static void usage(void);
661 static int rewriteAppendOnlyFileBackground(void); /* unlike rdbSaveBackground(), takes no filename argument */
662 static vmpointer *vmSwapObjectBlocking(robj *val); /* returns a vmpointer * (NOTE(review): failure value not visible here - confirm) */
/* Declared with an explicit (void) parameter list: before C23 an empty ()
 * declarator leaves the parameters unspecified, so calls with stray
 * arguments would not be diagnosed. */
static int prepareForShutdown(void);
664 static void touchWatchedKey(redisDb *db, robj *key); /* WATCH (MULTI/EXEC CAS) helpers; see REDIS_DIRTY_CAS and the watched_keys fields */
665 static void touchWatchedKeysOnFlush(int dbid);
666 static void unwatchAllKeys(redisClient *c);
667
668 static void authCommand(redisClient *c); /* command handlers: each one below has the redisCommandProc signature */
669 static void pingCommand(redisClient *c);
670 static void echoCommand(redisClient *c);
671 static void setCommand(redisClient *c);
672 static void setnxCommand(redisClient *c);
673 static void setexCommand(redisClient *c);
674 static void getCommand(redisClient *c);
675 static void delCommand(redisClient *c);
676 static void existsCommand(redisClient *c);
677 static void incrCommand(redisClient *c);
678 static void decrCommand(redisClient *c);
679 static void incrbyCommand(redisClient *c);
680 static void decrbyCommand(redisClient *c);
681 static void selectCommand(redisClient *c);
682 static void randomkeyCommand(redisClient *c);
683 static void keysCommand(redisClient *c);
684 static void dbsizeCommand(redisClient *c);
685 static void lastsaveCommand(redisClient *c);
686 static void saveCommand(redisClient *c);
687 static void bgsaveCommand(redisClient *c);
688 static void bgrewriteaofCommand(redisClient *c);
689 static void shutdownCommand(redisClient *c);
690 static void moveCommand(redisClient *c);
691 static void renameCommand(redisClient *c);
692 static void renamenxCommand(redisClient *c);
693 static void lpushCommand(redisClient *c);
694 static void rpushCommand(redisClient *c);
695 static void lpopCommand(redisClient *c);
696 static void rpopCommand(redisClient *c);
697 static void llenCommand(redisClient *c);
698 static void lindexCommand(redisClient *c);
699 static void lrangeCommand(redisClient *c);
700 static void ltrimCommand(redisClient *c);
701 static void typeCommand(redisClient *c);
702 static void lsetCommand(redisClient *c);
703 static void saddCommand(redisClient *c);
704 static void sremCommand(redisClient *c);
705 static void smoveCommand(redisClient *c);
706 static void sismemberCommand(redisClient *c);
707 static void scardCommand(redisClient *c);
708 static void spopCommand(redisClient *c);
709 static void srandmemberCommand(redisClient *c);
710 static void sinterCommand(redisClient *c);
711 static void sinterstoreCommand(redisClient *c);
712 static void sunionCommand(redisClient *c);
713 static void sunionstoreCommand(redisClient *c);
714 static void sdiffCommand(redisClient *c);
715 static void sdiffstoreCommand(redisClient *c);
716 static void syncCommand(redisClient *c);
717 static void flushdbCommand(redisClient *c);
718 static void flushallCommand(redisClient *c);
719 static void sortCommand(redisClient *c);
720 static void lremCommand(redisClient *c);
721 static void rpoplpushcommand(redisClient *c); /* NOTE(review): lowercase "command", unlike every other handler name here */
722 static void infoCommand(redisClient *c);
723 static void mgetCommand(redisClient *c);
724 static void monitorCommand(redisClient *c);
725 static void expireCommand(redisClient *c);
726 static void expireatCommand(redisClient *c);
727 static void getsetCommand(redisClient *c);
728 static void ttlCommand(redisClient *c);
729 static void slaveofCommand(redisClient *c);
730 static void debugCommand(redisClient *c);
731 static void msetCommand(redisClient *c);
732 static void msetnxCommand(redisClient *c);
733 static void zaddCommand(redisClient *c);
734 static void zincrbyCommand(redisClient *c);
735 static void zrangeCommand(redisClient *c);
736 static void zrangebyscoreCommand(redisClient *c);
737 static void zcountCommand(redisClient *c);
738 static void zrevrangeCommand(redisClient *c);
739 static void zcardCommand(redisClient *c);
740 static void zremCommand(redisClient *c);
741 static void zscoreCommand(redisClient *c);
742 static void zremrangebyscoreCommand(redisClient *c);
743 static void multiCommand(redisClient *c);
744 static void execCommand(redisClient *c);
745 static void discardCommand(redisClient *c);
746 static void blpopCommand(redisClient *c);
747 static void brpopCommand(redisClient *c);
748 static void appendCommand(redisClient *c);
749 static void substrCommand(redisClient *c);
750 static void zrankCommand(redisClient *c);
751 static void zrevrankCommand(redisClient *c);
752 static void hsetCommand(redisClient *c);
753 static void hsetnxCommand(redisClient *c);
754 static void hgetCommand(redisClient *c);
755 static void hmsetCommand(redisClient *c);
756 static void hmgetCommand(redisClient *c);
757 static void hdelCommand(redisClient *c);
758 static void hlenCommand(redisClient *c);
759 static void zremrangebyrankCommand(redisClient *c);
760 static void zunionstoreCommand(redisClient *c);
761 static void zinterstoreCommand(redisClient *c);
762 static void hkeysCommand(redisClient *c);
763 static void hvalsCommand(redisClient *c);
764 static void hgetallCommand(redisClient *c);
765 static void hexistsCommand(redisClient *c);
766 static void configCommand(redisClient *c);
767 static void hincrbyCommand(redisClient *c);
768 static void subscribeCommand(redisClient *c);
769 static void unsubscribeCommand(redisClient *c);
770 static void psubscribeCommand(redisClient *c);
771 static void punsubscribeCommand(redisClient *c);
772 static void publishCommand(redisClient *c);
773 static void watchCommand(redisClient *c);
774 static void unwatchCommand(redisClient *c);
775
776 /*================================= Globals ================================= */
777
778 /* Global vars */
779 static struct redisServer server; /* server global state */
780 static struct redisCommand *commandTable;
781 static struct redisCommand readonlyCommandTable[] = {
782 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
784 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
785 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
786 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
789 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
793 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
802 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
805 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
806 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
809 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
814 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
815 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
816 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
817 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
818 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
819 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
821 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
823 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
826 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
827 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
833 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
834 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
835 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
836 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
837 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
838 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
839 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
842 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
843 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
844 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
845 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
846 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
847 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
848 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
849 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
850 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
851 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
852 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
855 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
856 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
857 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
864 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
869 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
870 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
872 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
875 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
876 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
877 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
880 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
882 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
883 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
885 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
886 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
887 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
888 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
889 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
890 };
891
892 /*============================ Utility functions ============================ */
893
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer needs to be NUL
 * terminated: the function never reads past the given lengths.
 *
 * Supported syntax: '*' (any sequence, also empty), '?' (any single
 * character), '[...]' character sets with '^' negation, 'a-z' ranges and
 * '\' escapes inside the set, and '\x' to match 'x' literally.
 * If 'nocase' is non zero, characters are compared after tolower().
 *
 * Returns 1 if the whole string matches the whole pattern, otherwise 0. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', staying inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one character: with nothing left in
             * the string there is nothing it can match. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance at the end of the loop lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1016
/* Glob-style match of two NUL terminated strings: measures both arguments
 * and hands them to stringmatchlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
1020
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional,
 * case insensitive unit: "b" (bytes), "k"/"m"/"g" (powers of 1000) or
 * "kb"/"mb"/"gb" (powers of 1024).
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
 * otherwise. With an unknown unit the plain number is returned. If the
 * number has too many digits, or the result does not fit in a long long,
 * the value is saturated to LLONG_MAX (LLONG_MIN for negative numbers)
 * and the error is flagged. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values and EOF. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check the range before multiplying, as a
     * signed overflow is undefined behavior. */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1067
/* Convert a long long into a string.
 *
 * Writes the decimal representation of 'value' into 's', a buffer of 'len'
 * bytes, always adding the nul term when len > 0. If the buffer is too
 * small only the first len-1 characters of the number are stored.
 *
 * Returns the number of characters stored, not counting the nul term
 * (0 when len is 0, in which case 's' is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value overflows for LLONG_MIN when
     * computed as a signed long long. */
    v = (value < 0) ? -(unsigned long long)value : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1091
1092 static void redisLog(int level, const char *fmt, ...) {
1093 va_list ap;
1094 FILE *fp;
1095
1096 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1097 if (!fp) return;
1098
1099 va_start(ap, fmt);
1100 if (level >= server.verbosity) {
1101 char *c = ".-*#";
1102 char buf[64];
1103 time_t now;
1104
1105 now = time(NULL);
1106 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1107 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1108 vfprintf(fp, fmt, ap);
1109 fprintf(fp,"\n");
1110 fflush(fp);
1111 }
1112 va_end(ap);
1113
1114 if (server.logfile) fclose(fp);
1115 }
1116
1117 /*====================== Hash table type implementation ==================== */
1118
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1122
/* Dict destructor for entries that are plain zmalloc-ed blocks: the block
 * is released with zfree(). 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    zfree(val);
    DICT_NOTUSED(privdata);
}
1128
1129 static void dictListDestructor(void *privdata, void *val)
1130 {
1131 DICT_NOTUSED(privdata);
1132 listRelease((list*)val);
1133 }
1134
1135 static int dictSdsKeyCompare(void *privdata, const void *key1,
1136 const void *key2)
1137 {
1138 int l1,l2;
1139 DICT_NOTUSED(privdata);
1140
1141 l1 = sdslen((sds)key1);
1142 l2 = sdslen((sds)key2);
1143 if (l1 != l2) return 0;
1144 return memcmp(key1, key2, l1) == 0;
1145 }
1146
/* Dict destructor for redis objects: drops one reference to the object.
 * A NULL value is ignored, as the values of swapped out keys are set to
 * NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1154
/* Dict destructor for sds strings: the string is released with sdsfree().
 * 'privdata' is not used. */
static void dictSdsDestructor(void *privdata, void *val) {
    sdsfree(val);
    DICT_NOTUSED(privdata);
}
1161
1162 static int dictObjKeyCompare(void *privdata, const void *key1,
1163 const void *key2)
1164 {
1165 const robj *o1 = key1, *o2 = key2;
1166 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1167 }
1168
1169 static unsigned int dictObjHash(const void *key) {
1170 const robj *o = key;
1171 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1172 }
1173
/* Dict hash function for sds strings: hashes all the bytes of the key. */
static unsigned int dictSdsHash(const void *key) {
    int len = sdslen((char*)key);

    return dictGenHashFunction((unsigned char*)key, len);
}
1177
1178 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1179 const void *key2)
1180 {
1181 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1182 int cmp;
1183
1184 if (o1->encoding == REDIS_ENCODING_INT &&
1185 o2->encoding == REDIS_ENCODING_INT)
1186 return o1->ptr == o2->ptr;
1187
1188 o1 = getDecodedObject(o1);
1189 o2 = getDecodedObject(o2);
1190 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1191 decrRefCount(o1);
1192 decrRefCount(o2);
1193 return cmp;
1194 }
1195
1196 static unsigned int dictEncObjHash(const void *key) {
1197 robj *o = (robj*) key;
1198
1199 if (o->encoding == REDIS_ENCODING_RAW) {
1200 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1201 } else {
1202 if (o->encoding == REDIS_ENCODING_INT) {
1203 char buf[32];
1204 int len;
1205
1206 len = ll2string(buf,32,(long)o->ptr);
1207 return dictGenHashFunction((unsigned char*)buf, len);
1208 } else {
1209 unsigned int hash;
1210
1211 o = getDecodedObject(o);
1212 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1213 decrRefCount(o);
1214 return hash;
1215 }
1216 }
1217 }
1218
1219 /* Sets type */
1220 static dictType setDictType = {
1221 dictEncObjHash, /* hash function */
1222 NULL, /* key dup */
1223 NULL, /* val dup */
1224 dictEncObjKeyCompare, /* key compare */
1225 dictRedisObjectDestructor, /* key destructor */
1226 NULL /* val destructor */
1227 };
1228
1229 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1230 static dictType zsetDictType = {
1231 dictEncObjHash, /* hash function */
1232 NULL, /* key dup */
1233 NULL, /* val dup */
1234 dictEncObjKeyCompare, /* key compare */
1235 dictRedisObjectDestructor, /* key destructor */
1236 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1237 };
1238
1239 /* Db->dict, keys are sds strings, vals are Redis objects. */
1240 static dictType dbDictType = {
1241 dictSdsHash, /* hash function */
1242 NULL, /* key dup */
1243 NULL, /* val dup */
1244 dictSdsKeyCompare, /* key compare */
1245 dictSdsDestructor, /* key destructor */
1246 dictRedisObjectDestructor /* val destructor */
1247 };
1248
1249 /* Db->expires */
1250 static dictType keyptrDictType = {
1251 dictSdsHash, /* hash function */
1252 NULL, /* key dup */
1253 NULL, /* val dup */
1254 dictSdsKeyCompare, /* key compare */
1255 dictSdsDestructor, /* key destructor */
1256 NULL /* val destructor */
1257 };
1258
1259 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1260 static dictType hashDictType = {
1261 dictEncObjHash, /* hash function */
1262 NULL, /* key dup */
1263 NULL, /* val dup */
1264 dictEncObjKeyCompare, /* key compare */
1265 dictRedisObjectDestructor, /* key destructor */
1266 dictRedisObjectDestructor /* val destructor */
1267 };
1268
1269 /* Keylist hash table type has unencoded redis objects as keys and
1270 * lists as values. It's used for blocking operations (BLPOP) and to
1271 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1272 static dictType keylistDictType = {
1273 dictObjHash, /* hash function */
1274 NULL, /* key dup */
1275 NULL, /* val dup */
1276 dictObjKeyCompare, /* key compare */
1277 dictRedisObjectDestructor, /* key destructor */
1278 dictListDestructor /* val destructor */
1279 };
1280
1281 static void version();
1282
1283 /* ========================= Random utility functions ======================= */
1284
1285 /* Redis generally does not try to recover from out of memory conditions
1286 * when allocating objects or strings, it is not clear if it will be possible
1287 * to report this condition to the client since the networking layer itself
1288 * is based on heap allocation for send buffers, so we simply abort.
1289 * At least the code will be simpler to read... */
1290 static void oom(const char *msg) {
1291 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1292 sleep(1);
1293 abort();
1294 }
1295
1296 /* ====================== Redis server networking stuff ===================== */
1297 static void closeTimedoutClients(void) {
1298 redisClient *c;
1299 listNode *ln;
1300 time_t now = time(NULL);
1301 listIter li;
1302
1303 listRewind(server.clients,&li);
1304 while ((ln = listNext(&li)) != NULL) {
1305 c = listNodeValue(ln);
1306 if (server.maxidletime &&
1307 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1308 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1309 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1310 listLength(c->pubsub_patterns) == 0 &&
1311 (now - c->lastinteraction > server.maxidletime))
1312 {
1313 redisLog(REDIS_VERBOSE,"Closing idle client");
1314 freeClient(c);
1315 } else if (c->flags & REDIS_BLOCKED) {
1316 if (c->blockingto != 0 && c->blockingto < now) {
1317 addReply(c,shared.nullmultibulk);
1318 unblockClientWaitingData(c);
1319 }
1320 }
1321 }
1322 }
1323
1324 static int htNeedsResize(dict *dict) {
1325 long long size, used;
1326
1327 size = dictSlots(dict);
1328 used = dictSize(dict);
1329 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1330 (used*100/size < REDIS_HT_MINFILL));
1331 }
1332
1333 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1334 * we resize the hash table to save memory */
1335 static void tryResizeHashTables(void) {
1336 int j;
1337
1338 for (j = 0; j < server.dbnum; j++) {
1339 if (htNeedsResize(server.db[j].dict))
1340 dictResize(server.db[j].dict);
1341 if (htNeedsResize(server.db[j].expires))
1342 dictResize(server.db[j].expires);
1343 }
1344 }
1345
1346 /* Our hash table implementation performs rehashing incrementally while
1347 * we write/read from the hash table. Still if the server is idle, the hash
1348 * table will use two tables for a long time. So we try to use 1 millisecond
1349 * of CPU time at every serverCron() loop in order to rehash some key. */
1350 static void incrementallyRehash(void) {
1351 int j;
1352
1353 for (j = 0; j < server.dbnum; j++) {
1354 if (dictIsRehashing(server.db[j].dict)) {
1355 dictRehashMilliseconds(server.db[j].dict,1);
1356 break; /* already used our millisecond for this loop... */
1357 }
1358 }
1359 }
1360
1361 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1362 void backgroundSaveDoneHandler(int statloc) {
1363 int exitcode = WEXITSTATUS(statloc);
1364 int bysignal = WIFSIGNALED(statloc);
1365
1366 if (!bysignal && exitcode == 0) {
1367 redisLog(REDIS_NOTICE,
1368 "Background saving terminated with success");
1369 server.dirty = 0;
1370 server.lastsave = time(NULL);
1371 } else if (!bysignal && exitcode != 0) {
1372 redisLog(REDIS_WARNING, "Background saving error");
1373 } else {
1374 redisLog(REDIS_WARNING,
1375 "Background saving terminated by signal %d", WTERMSIG(statloc));
1376 rdbRemoveTempFile(server.bgsavechildpid);
1377 }
1378 server.bgsavechildpid = -1;
1379 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1380 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1381 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1382 }
1383
1384 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1385 * Handle this. */
1386 void backgroundRewriteDoneHandler(int statloc) {
1387 int exitcode = WEXITSTATUS(statloc);
1388 int bysignal = WIFSIGNALED(statloc);
1389
1390 if (!bysignal && exitcode == 0) {
1391 int fd;
1392 char tmpfile[256];
1393
1394 redisLog(REDIS_NOTICE,
1395 "Background append only file rewriting terminated with success");
1396 /* Now it's time to flush the differences accumulated by the parent */
1397 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1398 fd = open(tmpfile,O_WRONLY|O_APPEND);
1399 if (fd == -1) {
1400 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1401 goto cleanup;
1402 }
1403 /* Flush our data... */
1404 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1405 (signed) sdslen(server.bgrewritebuf)) {
1406 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1407 close(fd);
1408 goto cleanup;
1409 }
1410 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1411 /* Now our work is to rename the temp file into the stable file. And
1412 * switch the file descriptor used by the server for append only. */
1413 if (rename(tmpfile,server.appendfilename) == -1) {
1414 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1415 close(fd);
1416 goto cleanup;
1417 }
1418 /* Mission completed... almost */
1419 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1420 if (server.appendfd != -1) {
1421 /* If append only is actually enabled... */
1422 close(server.appendfd);
1423 server.appendfd = fd;
1424 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1425 server.appendseldb = -1; /* Make sure it will issue SELECT */
1426 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1427 } else {
1428 /* If append only is disabled we just generate a dump in this
1429 * format. Why not? */
1430 close(fd);
1431 }
1432 } else if (!bysignal && exitcode != 0) {
1433 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1434 } else {
1435 redisLog(REDIS_WARNING,
1436 "Background append only file rewriting terminated by signal %d",
1437 WTERMSIG(statloc));
1438 }
1439 cleanup:
1440 sdsfree(server.bgrewritebuf);
1441 server.bgrewritebuf = sdsempty();
1442 aofRemoveTempFile(server.bgrewritechildpid);
1443 server.bgrewritechildpid = -1;
1444 }
1445
1446 /* This function is called once a background process of some kind terminates,
1447 * as we want to avoid resizing the hash tables when there is a child in order
1448 * to play well with copy-on-write (otherwise when a resize happens lots of
1449 * memory pages are copied). The goal of this function is to update the ability
1450 * for dict.c to resize the hash tables accordingly to the fact we have o not
1451 * running childs. */
1452 static void updateDictResizePolicy(void) {
1453 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1454 dictEnableResize();
1455 else
1456 dictDisableResize();
1457 }
1458
1459 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1460 int j, loops = server.cronloops++;
1461 REDIS_NOTUSED(eventLoop);
1462 REDIS_NOTUSED(id);
1463 REDIS_NOTUSED(clientData);
1464
1465 /* We take a cached value of the unix time in the global state because
1466 * with virtual memory and aging there is to store the current time
1467 * in objects at every object access, and accuracy is not needed.
1468 * To access a global var is faster than calling time(NULL) */
1469 server.unixtime = time(NULL);
1470 /* We have just 21 bits per object for LRU information.
1471 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1472 *
1473 * When we need to select what object to swap, we compute the minimum
1474 * time distance between the current lruclock and the object last access
1475 * lruclock info. Even if clocks will wrap on overflow, there is
1476 * the interesting property that we are sure that at least
1477 * ABS(A-B) minutes passed between current time and timestamp B.
1478 *
1479 * This is not precise but we don't need at all precision, but just
1480 * something statistically reasonable.
1481 */
1482 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1483
1484 /* We received a SIGTERM, shutting down here in a safe way, as it is
1485 * not ok doing so inside the signal handler. */
1486 if (server.shutdown_asap) {
1487 if (prepareForShutdown() == REDIS_OK) exit(0);
1488 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1489 }
1490
1491 /* Show some info about non-empty databases */
1492 for (j = 0; j < server.dbnum; j++) {
1493 long long size, used, vkeys;
1494
1495 size = dictSlots(server.db[j].dict);
1496 used = dictSize(server.db[j].dict);
1497 vkeys = dictSize(server.db[j].expires);
1498 if (!(loops % 50) && (used || vkeys)) {
1499 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1500 /* dictPrintStats(server.dict); */
1501 }
1502 }
1503
1504 /* We don't want to resize the hash tables while a bacground saving
1505 * is in progress: the saving child is created using fork() that is
1506 * implemented with a copy-on-write semantic in most modern systems, so
1507 * if we resize the HT while there is the saving child at work actually
1508 * a lot of memory movements in the parent will cause a lot of pages
1509 * copied. */
1510 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1511 if (!(loops % 10)) tryResizeHashTables();
1512 if (server.activerehashing) incrementallyRehash();
1513 }
1514
1515 /* Show information about connected clients */
1516 if (!(loops % 50)) {
1517 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1518 listLength(server.clients)-listLength(server.slaves),
1519 listLength(server.slaves),
1520 zmalloc_used_memory());
1521 }
1522
1523 /* Close connections of timedout clients */
1524 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1525 closeTimedoutClients();
1526
1527 /* Check if a background saving or AOF rewrite in progress terminated */
1528 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1529 int statloc;
1530 pid_t pid;
1531
1532 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1533 if (pid == server.bgsavechildpid) {
1534 backgroundSaveDoneHandler(statloc);
1535 } else {
1536 backgroundRewriteDoneHandler(statloc);
1537 }
1538 updateDictResizePolicy();
1539 }
1540 } else {
1541 /* If there is not a background saving in progress check if
1542 * we have to save now */
1543 time_t now = time(NULL);
1544 for (j = 0; j < server.saveparamslen; j++) {
1545 struct saveparam *sp = server.saveparams+j;
1546
1547 if (server.dirty >= sp->changes &&
1548 now-server.lastsave > sp->seconds) {
1549 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1550 sp->changes, sp->seconds);
1551 rdbSaveBackground(server.dbfilename);
1552 break;
1553 }
1554 }
1555 }
1556
1557 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1558 * will use few CPU cycles if there are few expiring keys, otherwise
1559 * it will get more aggressive to avoid that too much memory is used by
1560 * keys that can be removed from the keyspace. */
1561 for (j = 0; j < server.dbnum; j++) {
1562 int expired;
1563 redisDb *db = server.db+j;
1564
1565 /* Continue to expire if at the end of the cycle more than 25%
1566 * of the keys were expired. */
1567 do {
1568 long num = dictSize(db->expires);
1569 time_t now = time(NULL);
1570
1571 expired = 0;
1572 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1573 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1574 while (num--) {
1575 dictEntry *de;
1576 time_t t;
1577
1578 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1579 t = (time_t) dictGetEntryVal(de);
1580 if (now > t) {
1581 sds key = dictGetEntryKey(de);
1582 robj *keyobj = createStringObject(key,sdslen(key));
1583
1584 dbDelete(db,keyobj);
1585 decrRefCount(keyobj);
1586 expired++;
1587 server.stat_expiredkeys++;
1588 }
1589 }
1590 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1591 }
1592
1593 /* Swap a few keys on disk if we are over the memory limit and VM
1594 * is enbled. Try to free objects from the free list first. */
1595 if (vmCanSwapOut()) {
1596 while (server.vm_enabled && zmalloc_used_memory() >
1597 server.vm_max_memory)
1598 {
1599 int retval;
1600
1601 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1602 retval = (server.vm_max_threads == 0) ?
1603 vmSwapOneObjectBlocking() :
1604 vmSwapOneObjectThreaded();
1605 if (retval == REDIS_ERR && !(loops % 300) &&
1606 zmalloc_used_memory() >
1607 (server.vm_max_memory+server.vm_max_memory/10))
1608 {
1609 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1610 }
1611 /* Note that when using threaded I/O we free just one object,
1612 * because anyway when the I/O thread in charge to swap this
1613 * object out will finish, the handler of completed jobs
1614 * will try to swap more objects if we are still out of memory. */
1615 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1616 }
1617 }
1618
1619 /* Check if we should connect to a MASTER */
1620 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1621 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1622 if (syncWithMaster() == REDIS_OK) {
1623 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1624 if (server.appendonly) rewriteAppendOnlyFileBackground();
1625 }
1626 }
1627 return 100;
1628 }
1629
1630 /* This function gets called every time Redis is entering the
1631 * main loop of the event driven library, that is, before to sleep
1632 * for ready file descriptors. */
1633 static void beforeSleep(struct aeEventLoop *eventLoop) {
1634 REDIS_NOTUSED(eventLoop);
1635
1636 /* Awake clients that got all the swapped keys they requested */
1637 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1638 listIter li;
1639 listNode *ln;
1640
1641 listRewind(server.io_ready_clients,&li);
1642 while((ln = listNext(&li))) {
1643 redisClient *c = ln->value;
1644 struct redisCommand *cmd;
1645
1646 /* Resume the client. */
1647 listDelNode(server.io_ready_clients,ln);
1648 c->flags &= (~REDIS_IO_WAIT);
1649 server.vm_blocked_clients--;
1650 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1651 readQueryFromClient, c);
1652 cmd = lookupCommand(c->argv[0]->ptr);
1653 assert(cmd != NULL);
1654 call(c,cmd);
1655 resetClient(c);
1656 /* There may be more data to process in the input buffer. */
1657 if (c->querybuf && sdslen(c->querybuf) > 0)
1658 processInputBuffer(c);
1659 }
1660 }
1661 /* Write the AOF buffer on disk */
1662 flushAppendOnlyFile();
1663 }
1664
1665 static void createSharedObjects(void) {
1666 int j;
1667
1668 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1669 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1670 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1671 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1672 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1673 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1674 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1675 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1676 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1677 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1678 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1679 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1680 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1681 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1682 "-ERR no such key\r\n"));
1683 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1684 "-ERR syntax error\r\n"));
1685 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1686 "-ERR source and destination objects are the same\r\n"));
1687 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1688 "-ERR index out of range\r\n"));
1689 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1690 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1691 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1692 shared.select0 = createStringObject("select 0\r\n",10);
1693 shared.select1 = createStringObject("select 1\r\n",10);
1694 shared.select2 = createStringObject("select 2\r\n",10);
1695 shared.select3 = createStringObject("select 3\r\n",10);
1696 shared.select4 = createStringObject("select 4\r\n",10);
1697 shared.select5 = createStringObject("select 5\r\n",10);
1698 shared.select6 = createStringObject("select 6\r\n",10);
1699 shared.select7 = createStringObject("select 7\r\n",10);
1700 shared.select8 = createStringObject("select 8\r\n",10);
1701 shared.select9 = createStringObject("select 9\r\n",10);
1702 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1703 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1704 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1705 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1706 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1707 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1708 shared.mbulk3 = createStringObject("*3\r\n",4);
1709 shared.mbulk4 = createStringObject("*4\r\n",4);
1710 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1711 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1712 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1713 }
1714 }
1715
1716 static void appendServerSaveParams(time_t seconds, int changes) {
1717 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1718 server.saveparams[server.saveparamslen].seconds = seconds;
1719 server.saveparams[server.saveparamslen].changes = changes;
1720 server.saveparamslen++;
1721 }
1722
1723 static void resetServerSaveParams() {
1724 zfree(server.saveparams);
1725 server.saveparams = NULL;
1726 server.saveparamslen = 0;
1727 }
1728
1729 static void initServerConfig() {
1730 server.dbnum = REDIS_DEFAULT_DBNUM;
1731 server.port = REDIS_SERVERPORT;
1732 server.verbosity = REDIS_VERBOSE;
1733 server.maxidletime = REDIS_MAXIDLETIME;
1734 server.saveparams = NULL;
1735 server.logfile = NULL; /* NULL = log on standard output */
1736 server.bindaddr = NULL;
1737 server.glueoutputbuf = 1;
1738 server.daemonize = 0;
1739 server.appendonly = 0;
1740 server.appendfsync = APPENDFSYNC_EVERYSEC;
1741 server.no_appendfsync_on_rewrite = 0;
1742 server.lastfsync = time(NULL);
1743 server.appendfd = -1;
1744 server.appendseldb = -1; /* Make sure the first time will not match */
1745 server.pidfile = zstrdup("/var/run/redis.pid");
1746 server.dbfilename = zstrdup("dump.rdb");
1747 server.appendfilename = zstrdup("appendonly.aof");
1748 server.requirepass = NULL;
1749 server.rdbcompression = 1;
1750 server.activerehashing = 1;
1751 server.maxclients = 0;
1752 server.blpop_blocked_clients = 0;
1753 server.maxmemory = 0;
1754 server.vm_enabled = 0;
1755 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1756 server.vm_page_size = 256; /* 256 bytes per page */
1757 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1758 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1759 server.vm_max_threads = 4;
1760 server.vm_blocked_clients = 0;
1761 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1762 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1763 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1764 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
1765 server.shutdown_asap = 0;
1766
1767 resetServerSaveParams();
1768
1769 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1770 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1771 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1772 /* Replication related */
1773 server.isslave = 0;
1774 server.masterauth = NULL;
1775 server.masterhost = NULL;
1776 server.masterport = 6379;
1777 server.master = NULL;
1778 server.replstate = REDIS_REPL_NONE;
1779
1780 /* Double constants initialization */
1781 R_Zero = 0.0;
1782 R_PosInf = 1.0/R_Zero;
1783 R_NegInf = -1.0/R_Zero;
1784 R_Nan = R_Zero/R_Zero;
1785 }
1786
1787 static void initServer() {
1788 int j;
1789
1790 signal(SIGHUP, SIG_IGN);
1791 signal(SIGPIPE, SIG_IGN);
1792 setupSigSegvAction();
1793
1794 server.devnull = fopen("/dev/null","w");
1795 if (server.devnull == NULL) {
1796 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1797 exit(1);
1798 }
1799 server.clients = listCreate();
1800 server.slaves = listCreate();
1801 server.monitors = listCreate();
1802 server.objfreelist = listCreate();
1803 createSharedObjects();
1804 server.el = aeCreateEventLoop();
1805 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1806 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1807 if (server.fd == -1) {
1808 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1809 exit(1);
1810 }
1811 for (j = 0; j < server.dbnum; j++) {
1812 server.db[j].dict = dictCreate(&dbDictType,NULL);
1813 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1814 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1815 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1816 if (server.vm_enabled)
1817 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1818 server.db[j].id = j;
1819 }
1820 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1821 server.pubsub_patterns = listCreate();
1822 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1823 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1824 server.cronloops = 0;
1825 server.bgsavechildpid = -1;
1826 server.bgrewritechildpid = -1;
1827 server.bgrewritebuf = sdsempty();
1828 server.aofbuf = sdsempty();
1829 server.lastsave = time(NULL);
1830 server.dirty = 0;
1831 server.stat_numcommands = 0;
1832 server.stat_numconnections = 0;
1833 server.stat_expiredkeys = 0;
1834 server.stat_starttime = time(NULL);
1835 server.unixtime = time(NULL);
1836 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1837 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1838 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1839
1840 if (server.appendonly) {
1841 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1842 if (server.appendfd == -1) {
1843 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1844 strerror(errno));
1845 exit(1);
1846 }
1847 }
1848
1849 if (server.vm_enabled) vmInit();
1850 }
1851
1852 /* Empty the whole database */
1853 static long long emptyDb() {
1854 int j;
1855 long long removed = 0;
1856
1857 for (j = 0; j < server.dbnum; j++) {
1858 removed += dictSize(server.db[j].dict);
1859 dictEmpty(server.db[j].dict);
1860 dictEmpty(server.db[j].expires);
1861 }
1862 return removed;
1863 }
1864
/* Maps a "yes"/"no" string, compared ignoring case, to an integer:
 * 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1870
1871 /* I agree, this is a very rudimental way to load a configuration...
1872 will improve later if the config gets more complex */
1873 static void loadServerConfig(char *filename) {
1874 FILE *fp;
1875 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1876 int linenum = 0;
1877 sds line = NULL;
1878
1879 if (filename[0] == '-' && filename[1] == '\0')
1880 fp = stdin;
1881 else {
1882 if ((fp = fopen(filename,"r")) == NULL) {
1883 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1884 exit(1);
1885 }
1886 }
1887
1888 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1889 sds *argv;
1890 int argc, j;
1891
1892 linenum++;
1893 line = sdsnew(buf);
1894 line = sdstrim(line," \t\r\n");
1895
1896 /* Skip comments and blank lines*/
1897 if (line[0] == '#' || line[0] == '\0') {
1898 sdsfree(line);
1899 continue;
1900 }
1901
1902 /* Split into arguments */
1903 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1904 sdstolower(argv[0]);
1905
1906 /* Execute config directives */
1907 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1908 server.maxidletime = atoi(argv[1]);
1909 if (server.maxidletime < 0) {
1910 err = "Invalid timeout value"; goto loaderr;
1911 }
1912 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1913 server.port = atoi(argv[1]);
1914 if (server.port < 1 || server.port > 65535) {
1915 err = "Invalid port"; goto loaderr;
1916 }
1917 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1918 server.bindaddr = zstrdup(argv[1]);
1919 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1920 int seconds = atoi(argv[1]);
1921 int changes = atoi(argv[2]);
1922 if (seconds < 1 || changes < 0) {
1923 err = "Invalid save parameters"; goto loaderr;
1924 }
1925 appendServerSaveParams(seconds,changes);
1926 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1927 if (chdir(argv[1]) == -1) {
1928 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1929 argv[1], strerror(errno));
1930 exit(1);
1931 }
1932 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1933 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1934 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1935 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1936 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1937 else {
1938 err = "Invalid log level. Must be one of debug, notice, warning";
1939 goto loaderr;
1940 }
1941 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1942 FILE *logfp;
1943
1944 server.logfile = zstrdup(argv[1]);
1945 if (!strcasecmp(server.logfile,"stdout")) {
1946 zfree(server.logfile);
1947 server.logfile = NULL;
1948 }
1949 if (server.logfile) {
1950 /* Test if we are able to open the file. The server will not
1951 * be able to abort just for this problem later... */
1952 logfp = fopen(server.logfile,"a");
1953 if (logfp == NULL) {
1954 err = sdscatprintf(sdsempty(),
1955 "Can't open the log file: %s", strerror(errno));
1956 goto loaderr;
1957 }
1958 fclose(logfp);
1959 }
1960 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1961 server.dbnum = atoi(argv[1]);
1962 if (server.dbnum < 1) {
1963 err = "Invalid number of databases"; goto loaderr;
1964 }
1965 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1966 loadServerConfig(argv[1]);
1967 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1968 server.maxclients = atoi(argv[1]);
1969 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1970 server.maxmemory = memtoll(argv[1],NULL);
1971 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1972 server.masterhost = sdsnew(argv[1]);
1973 server.masterport = atoi(argv[2]);
1974 server.replstate = REDIS_REPL_CONNECT;
1975 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1976 server.masterauth = zstrdup(argv[1]);
1977 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1978 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1979 err = "argument must be 'yes' or 'no'"; goto loaderr;
1980 }
1981 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1982 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1983 err = "argument must be 'yes' or 'no'"; goto loaderr;
1984 }
1985 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1986 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1987 err = "argument must be 'yes' or 'no'"; goto loaderr;
1988 }
1989 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1990 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1991 err = "argument must be 'yes' or 'no'"; goto loaderr;
1992 }
1993 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1994 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1995 err = "argument must be 'yes' or 'no'"; goto loaderr;
1996 }
1997 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1998 zfree(server.appendfilename);
1999 server.appendfilename = zstrdup(argv[1]);
2000 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2001 && argc == 2) {
2002 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2003 err = "argument must be 'yes' or 'no'"; goto loaderr;
2004 }
2005 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
2006 if (!strcasecmp(argv[1],"no")) {
2007 server.appendfsync = APPENDFSYNC_NO;
2008 } else if (!strcasecmp(argv[1],"always")) {
2009 server.appendfsync = APPENDFSYNC_ALWAYS;
2010 } else if (!strcasecmp(argv[1],"everysec")) {
2011 server.appendfsync = APPENDFSYNC_EVERYSEC;
2012 } else {
2013 err = "argument must be 'no', 'always' or 'everysec'";
2014 goto loaderr;
2015 }
2016 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2017 server.requirepass = zstrdup(argv[1]);
2018 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2019 zfree(server.pidfile);
2020 server.pidfile = zstrdup(argv[1]);
2021 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2022 zfree(server.dbfilename);
2023 server.dbfilename = zstrdup(argv[1]);
2024 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2025 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2026 err = "argument must be 'yes' or 'no'"; goto loaderr;
2027 }
2028 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2029 zfree(server.vm_swap_file);
2030 server.vm_swap_file = zstrdup(argv[1]);
2031 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2032 server.vm_max_memory = memtoll(argv[1],NULL);
2033 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2034 server.vm_page_size = memtoll(argv[1], NULL);
2035 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2036 server.vm_pages = memtoll(argv[1], NULL);
2037 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2038 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2039 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2040 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2041 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2042 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2043 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2044 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2045 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2046 server.list_max_ziplist_value = memtoll(argv[1], NULL);
2047 } else {
2048 err = "Bad directive or wrong number of arguments"; goto loaderr;
2049 }
2050 for (j = 0; j < argc; j++)
2051 sdsfree(argv[j]);
2052 zfree(argv);
2053 sdsfree(line);
2054 }
2055 if (fp != stdin) fclose(fp);
2056 return;
2057
2058 loaderr:
2059 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2060 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2061 fprintf(stderr, ">>> '%s'\n", line);
2062 fprintf(stderr, "%s\n", err);
2063 exit(1);
2064 }
2065
2066 static void freeClientArgv(redisClient *c) {
2067 int j;
2068
2069 for (j = 0; j < c->argc; j++)
2070 decrRefCount(c->argv[j]);
2071 for (j = 0; j < c->mbargc; j++)
2072 decrRefCount(c->mbargv[j]);
2073 c->argc = 0;
2074 c->mbargc = 0;
2075 }
2076
2077 static void freeClient(redisClient *c) {
2078 listNode *ln;
2079
2080 /* Note that if the client we are freeing is blocked into a blocking
2081 * call, we have to set querybuf to NULL *before* to call
2082 * unblockClientWaitingData() to avoid processInputBuffer() will get
2083 * called. Also it is important to remove the file events after
2084 * this, because this call adds the READABLE event. */
2085 sdsfree(c->querybuf);
2086 c->querybuf = NULL;
2087 if (c->flags & REDIS_BLOCKED)
2088 unblockClientWaitingData(c);
2089
2090 /* UNWATCH all the keys */
2091 unwatchAllKeys(c);
2092 listRelease(c->watched_keys);
2093 /* Unsubscribe from all the pubsub channels */
2094 pubsubUnsubscribeAllChannels(c,0);
2095 pubsubUnsubscribeAllPatterns(c,0);
2096 dictRelease(c->pubsub_channels);
2097 listRelease(c->pubsub_patterns);
2098 /* Obvious cleanup */
2099 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2100 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2101 listRelease(c->reply);
2102 freeClientArgv(c);
2103 close(c->fd);
2104 /* Remove from the list of clients */
2105 ln = listSearchKey(server.clients,c);
2106 redisAssert(ln != NULL);
2107 listDelNode(server.clients,ln);
2108 /* Remove from the list of clients that are now ready to be restarted
2109 * after waiting for swapped keys */
2110 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2111 ln = listSearchKey(server.io_ready_clients,c);
2112 if (ln) {
2113 listDelNode(server.io_ready_clients,ln);
2114 server.vm_blocked_clients--;
2115 }
2116 }
2117 /* Remove from the list of clients waiting for swapped keys */
2118 while (server.vm_enabled && listLength(c->io_keys)) {
2119 ln = listFirst(c->io_keys);
2120 dontWaitForSwappedKey(c,ln->value);
2121 }
2122 listRelease(c->io_keys);
2123 /* Master/slave cleanup */
2124 if (c->flags & REDIS_SLAVE) {
2125 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2126 close(c->repldbfd);
2127 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2128 ln = listSearchKey(l,c);
2129 redisAssert(ln != NULL);
2130 listDelNode(l,ln);
2131 }
2132 if (c->flags & REDIS_MASTER) {
2133 server.master = NULL;
2134 server.replstate = REDIS_REPL_CONNECT;
2135 }
2136 /* Release memory */
2137 zfree(c->argv);
2138 zfree(c->mbargv);
2139 freeClientMultiState(c);
2140 zfree(c);
2141 }
2142
2143 #define GLUEREPLY_UP_TO (1024)
2144 static void glueReplyBuffersIfNeeded(redisClient *c) {
2145 int copylen = 0;
2146 char buf[GLUEREPLY_UP_TO];
2147 listNode *ln;
2148 listIter li;
2149 robj *o;
2150
2151 listRewind(c->reply,&li);
2152 while((ln = listNext(&li))) {
2153 int objlen;
2154
2155 o = ln->value;
2156 objlen = sdslen(o->ptr);
2157 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2158 memcpy(buf+copylen,o->ptr,objlen);
2159 copylen += objlen;
2160 listDelNode(c->reply,ln);
2161 } else {
2162 if (copylen == 0) return;
2163 break;
2164 }
2165 }
2166 /* Now the output buffer is empty, add the new single element */
2167 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2168 listAddNodeHead(c->reply,o);
2169 }
2170
2171 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2172 redisClient *c = privdata;
2173 int nwritten = 0, totwritten = 0, objlen;
2174 robj *o;
2175 REDIS_NOTUSED(el);
2176 REDIS_NOTUSED(mask);
2177
2178 /* Use writev() if we have enough buffers to send */
2179 if (!server.glueoutputbuf &&
2180 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2181 !(c->flags & REDIS_MASTER))
2182 {
2183 sendReplyToClientWritev(el, fd, privdata, mask);
2184 return;
2185 }
2186
2187 while(listLength(c->reply)) {
2188 if (server.glueoutputbuf && listLength(c->reply) > 1)
2189 glueReplyBuffersIfNeeded(c);
2190
2191 o = listNodeValue(listFirst(c->reply));
2192 objlen = sdslen(o->ptr);
2193
2194 if (objlen == 0) {
2195 listDelNode(c->reply,listFirst(c->reply));
2196 continue;
2197 }
2198
2199 if (c->flags & REDIS_MASTER) {
2200 /* Don't reply to a master */
2201 nwritten = objlen - c->sentlen;
2202 } else {
2203 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2204 if (nwritten <= 0) break;
2205 }
2206 c->sentlen += nwritten;
2207 totwritten += nwritten;
2208 /* If we fully sent the object on head go to the next one */
2209 if (c->sentlen == objlen) {
2210 listDelNode(c->reply,listFirst(c->reply));
2211 c->sentlen = 0;
2212 }
2213 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2214 * bytes, in a single threaded server it's a good idea to serve
2215 * other clients as well, even if a very large request comes from
2216 * super fast link that is always able to accept data (in real world
2217 * scenario think about 'KEYS *' against the loopback interfae) */
2218 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2219 }
2220 if (nwritten == -1) {
2221 if (errno == EAGAIN) {
2222 nwritten = 0;
2223 } else {
2224 redisLog(REDIS_VERBOSE,
2225 "Error writing to client: %s", strerror(errno));
2226 freeClient(c);
2227 return;
2228 }
2229 }
2230 if (totwritten > 0) c->lastinteraction = time(NULL);
2231 if (listLength(c->reply) == 0) {
2232 c->sentlen = 0;
2233 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2234 }
2235 }
2236
2237 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2238 {
2239 redisClient *c = privdata;
2240 int nwritten = 0, totwritten = 0, objlen, willwrite;
2241 robj *o;
2242 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2243 int offset, ion = 0;
2244 REDIS_NOTUSED(el);
2245 REDIS_NOTUSED(mask);
2246
2247 listNode *node;
2248 while (listLength(c->reply)) {
2249 offset = c->sentlen;
2250 ion = 0;
2251 willwrite = 0;
2252
2253 /* fill-in the iov[] array */
2254 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2255 o = listNodeValue(node);
2256 objlen = sdslen(o->ptr);
2257
2258 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2259 break;
2260
2261 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2262 break; /* no more iovecs */
2263
2264 iov[ion].iov_base = ((char*)o->ptr) + offset;
2265 iov[ion].iov_len = objlen - offset;
2266 willwrite += objlen - offset;
2267 offset = 0; /* just for the first item */
2268 ion++;
2269 }
2270
2271 if(willwrite == 0)
2272 break;
2273
2274 /* write all collected blocks at once */
2275 if((nwritten = writev(fd, iov, ion)) < 0) {
2276 if (errno != EAGAIN) {
2277 redisLog(REDIS_VERBOSE,
2278 "Error writing to client: %s", strerror(errno));
2279 freeClient(c);
2280 return;
2281 }
2282 break;
2283 }
2284
2285 totwritten += nwritten;
2286 offset = c->sentlen;
2287
2288 /* remove written robjs from c->reply */
2289 while (nwritten && listLength(c->reply)) {
2290 o = listNodeValue(listFirst(c->reply));
2291 objlen = sdslen(o->ptr);
2292
2293 if(nwritten >= objlen - offset) {
2294 listDelNode(c->reply, listFirst(c->reply));
2295 nwritten -= objlen - offset;
2296 c->sentlen = 0;
2297 } else {
2298 /* partial write */
2299 c->sentlen += nwritten;
2300 break;
2301 }
2302 offset = 0;
2303 }
2304 }
2305
2306 if (totwritten > 0)
2307 c->lastinteraction = time(NULL);
2308
2309 if (listLength(c->reply) == 0) {
2310 c->sentlen = 0;
2311 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2312 }
2313 }
2314
2315 static int qsortRedisCommands(const void *r1, const void *r2) {
2316 return strcasecmp(
2317 ((struct redisCommand*)r1)->name,
2318 ((struct redisCommand*)r2)->name);
2319 }
2320
2321 static void sortCommandTable() {
2322 /* Copy and sort the read-only version of the command table */
2323 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2324 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2325 qsort(commandTable,
2326 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2327 sizeof(struct redisCommand),qsortRedisCommands);
2328 }
2329
2330 static struct redisCommand *lookupCommand(char *name) {
2331 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2332 return bsearch(
2333 &tmp,
2334 commandTable,
2335 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2336 sizeof(struct redisCommand),
2337 qsortRedisCommands);
2338 }
2339
2340 /* resetClient prepare the client to process the next command */
2341 static void resetClient(redisClient *c) {
2342 freeClientArgv(c);
2343 c->bulklen = -1;
2344 c->multibulk = 0;
2345 }
2346
2347 /* Call() is the core of Redis execution of a command */
2348 static void call(redisClient *c, struct redisCommand *cmd) {
2349 long long dirty;
2350
2351 dirty = server.dirty;
2352 cmd->proc(c);
2353 dirty = server.dirty-dirty;
2354
2355 if (server.appendonly && dirty)
2356 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2357 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2358 listLength(server.slaves))
2359 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2360 if (listLength(server.monitors))
2361 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2362 server.stat_numcommands++;
2363 }
2364
2365 /* If this function gets called we already read a whole
2366 * command, arguments are in the client argv/argc fields.
2367 * processCommand() executes the command or prepares the
2368 * server for a bulk read from the client.
2369 *
2370 * If 1 is returned the client is still alive and valid and
2371 * other operations can be performed by the caller. Otherwise
2372 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2373 static int processCommand(redisClient *c) {
2374 struct redisCommand *cmd;
2375
2376 /* Free some memory if needed (maxmemory setting) */
2377 if (server.maxmemory) freeMemoryIfNeeded();
2378
2379 /* Handle the multi bulk command type. This is an alternative protocol
2380 * supported by Redis in order to receive commands that are composed of
2381 * multiple binary-safe "bulk" arguments. The latency of processing is
2382 * a bit higher but this allows things like multi-sets, so if this
2383 * protocol is used only for MSET and similar commands this is a big win. */
2384 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2385 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2386 if (c->multibulk <= 0) {
2387 resetClient(c);
2388 return 1;
2389 } else {
2390 decrRefCount(c->argv[c->argc-1]);
2391 c->argc--;
2392 return 1;
2393 }
2394 } else if (c->multibulk) {
2395 if (c->bulklen == -1) {
2396 if (((char*)c->argv[0]->ptr)[0] != '$') {
2397 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2398 resetClient(c);
2399 return 1;
2400 } else {
2401 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2402 decrRefCount(c->argv[0]);
2403 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2404 c->argc--;
2405 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2406 resetClient(c);
2407 return 1;
2408 }
2409 c->argc--;
2410 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2411 return 1;
2412 }
2413 } else {
2414 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2415 c->mbargv[c->mbargc] = c->argv[0];
2416 c->mbargc++;
2417 c->argc--;
2418 c->multibulk--;
2419 if (c->multibulk == 0) {
2420 robj **auxargv;
2421 int auxargc;
2422
2423 /* Here we need to swap the multi-bulk argc/argv with the
2424 * normal argc/argv of the client structure. */
2425 auxargv = c->argv;
2426 c->argv = c->mbargv;
2427 c->mbargv = auxargv;
2428
2429 auxargc = c->argc;
2430 c->argc = c->mbargc;
2431 c->mbargc = auxargc;
2432
2433 /* We need to set bulklen to something different than -1
2434 * in order for the code below to process the command without
2435 * to try to read the last argument of a bulk command as
2436 * a special argument. */
2437 c->bulklen = 0;
2438 /* continue below and process the command */
2439 } else {
2440 c->bulklen = -1;
2441 return 1;
2442 }
2443 }
2444 }
2445 /* -- end of multi bulk commands processing -- */
2446
2447 /* The QUIT command is handled as a special case. Normal command
2448 * procs are unable to close the client connection safely */
2449 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2450 freeClient(c);
2451 return 0;
2452 }
2453
2454 /* Now lookup the command and check ASAP about trivial error conditions
2455 * such wrong arity, bad command name and so forth. */
2456 cmd = lookupCommand(c->argv[0]->ptr);
2457 if (!cmd) {
2458 addReplySds(c,
2459 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2460 (char*)c->argv[0]->ptr));
2461 resetClient(c);
2462 return 1;
2463 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2464 (c->argc < -cmd->arity)) {
2465 addReplySds(c,
2466 sdscatprintf(sdsempty(),
2467 "-ERR wrong number of arguments for '%s' command\r\n",
2468 cmd->name));
2469 resetClient(c);
2470 return 1;
2471 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2472 /* This is a bulk command, we have to read the last argument yet. */
2473 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2474
2475 decrRefCount(c->argv[c->argc-1]);
2476 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2477 c->argc--;
2478 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2479 resetClient(c);
2480 return 1;
2481 }
2482 c->argc--;
2483 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2484 /* It is possible that the bulk read is already in the
2485 * buffer. Check this condition and handle it accordingly.
2486 * This is just a fast path, alternative to call processInputBuffer().
2487 * It's a good idea since the code is small and this condition
2488 * happens most of the times. */
2489 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2490 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2491 c->argc++;
2492 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2493 } else {
2494 /* Otherwise return... there is to read the last argument
2495 * from the socket. */
2496 return 1;
2497 }
2498 }
2499 /* Let's try to encode the bulk object to save space. */
2500 if (cmd->flags & REDIS_CMD_BULK)
2501 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2502
2503 /* Check if the user is authenticated */
2504 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2505 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2506 resetClient(c);
2507 return 1;
2508 }
2509
2510 /* Handle the maxmemory directive */
2511 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2512 zmalloc_used_memory() > server.maxmemory)
2513 {
2514 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2515 resetClient(c);
2516 return 1;
2517 }
2518
2519 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2520 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2521 &&
2522 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2523 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2524 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2525 resetClient(c);
2526 return 1;
2527 }
2528
2529 /* Exec the command */
2530 if (c->flags & REDIS_MULTI &&
2531 cmd->proc != execCommand && cmd->proc != discardCommand &&
2532 cmd->proc != multiCommand && cmd->proc != watchCommand)
2533 {
2534 queueMultiCommand(c,cmd);
2535 addReply(c,shared.queued);
2536 } else {
2537 if (server.vm_enabled && server.vm_max_threads > 0 &&
2538 blockClientOnSwappedKeys(c,cmd)) return 1;
2539 call(c,cmd);
2540 }
2541
2542 /* Prepare the client for the next command */
2543 resetClient(c);
2544 return 1;
2545 }
2546
2547 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2548 listNode *ln;
2549 listIter li;
2550 int outc = 0, j;
2551 robj **outv;
2552 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2553 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2554 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2555 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2556 robj *lenobj;
2557
2558 if (argc <= REDIS_STATIC_ARGS) {
2559 outv = static_outv;
2560 } else {
2561 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2562 }
2563
2564 lenobj = createObject(REDIS_STRING,
2565 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2566 lenobj->refcount = 0;
2567 outv[outc++] = lenobj;
2568 for (j = 0; j < argc; j++) {
2569 lenobj = createObject(REDIS_STRING,
2570 sdscatprintf(sdsempty(),"$%lu\r\n",
2571 (unsigned long) stringObjectLen(argv[j])));
2572 lenobj->refcount = 0;
2573 outv[outc++] = lenobj;
2574 outv[outc++] = argv[j];
2575 outv[outc++] = shared.crlf;
2576 }
2577
2578 /* Increment all the refcounts at start and decrement at end in order to
2579 * be sure to free objects if there is no slave in a replication state
2580 * able to be feed with commands */
2581 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2582 listRewind(slaves,&li);
2583 while((ln = listNext(&li))) {
2584 redisClient *slave = ln->value;
2585
2586 /* Don't feed slaves that are still waiting for BGSAVE to start */
2587 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2588
2589 /* Feed all the other slaves, MONITORs and so on */
2590 if (slave->slaveseldb != dictid) {
2591 robj *selectcmd;
2592
2593 switch(dictid) {
2594 case 0: selectcmd = shared.select0; break;
2595 case 1: selectcmd = shared.select1; break;
2596 case 2: selectcmd = shared.select2; break;
2597 case 3: selectcmd = shared.select3; break;
2598 case 4: selectcmd = shared.select4; break;
2599 case 5: selectcmd = shared.select5; break;
2600 case 6: selectcmd = shared.select6; break;
2601 case 7: selectcmd = shared.select7; break;
2602 case 8: selectcmd = shared.select8; break;
2603 case 9: selectcmd = shared.select9; break;
2604 default:
2605 selectcmd = createObject(REDIS_STRING,
2606 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2607 selectcmd->refcount = 0;
2608 break;
2609 }
2610 addReply(slave,selectcmd);
2611 slave->slaveseldb = dictid;
2612 }
2613 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2614 }
2615 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2616 if (outv != static_outv) zfree(outv);
2617 }
2618
2619 static sds sdscatrepr(sds s, char *p, size_t len) {
2620 s = sdscatlen(s,"\"",1);
2621 while(len--) {
2622 switch(*p) {
2623 case '\\':
2624 case '"':
2625 s = sdscatprintf(s,"\\%c",*p);
2626 break;
2627 case '\n': s = sdscatlen(s,"\\n",1); break;
2628 case '\r': s = sdscatlen(s,"\\r",1); break;
2629 case '\t': s = sdscatlen(s,"\\t",1); break;
2630 case '\a': s = sdscatlen(s,"\\a",1); break;
2631 case '\b': s = sdscatlen(s,"\\b",1); break;
2632 default:
2633 if (isprint(*p))
2634 s = sdscatprintf(s,"%c",*p);
2635 else
2636 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2637 break;
2638 }
2639 p++;
2640 }
2641 return sdscatlen(s,"\"",1);
2642 }
2643
2644 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2645 listNode *ln;
2646 listIter li;
2647 int j;
2648 sds cmdrepr = sdsnew("+");
2649 robj *cmdobj;
2650 struct timeval tv;
2651
2652 gettimeofday(&tv,NULL);
2653 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2654 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2655
2656 for (j = 0; j < argc; j++) {
2657 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2658 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2659 } else {
2660 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2661 sdslen(argv[j]->ptr));
2662 }
2663 if (j != argc-1)
2664 cmdrepr = sdscatlen(cmdrepr," ",1);
2665 }
2666 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2667 cmdobj = createObject(REDIS_STRING,cmdrepr);
2668
2669 listRewind(monitors,&li);
2670 while((ln = listNext(&li))) {
2671 redisClient *monitor = ln->value;
2672 addReply(monitor,cmdobj);
2673 }
2674 decrRefCount(cmdobj);
2675 }
2676
2677 static void processInputBuffer(redisClient *c) {
2678 again:
2679 /* Before to process the input buffer, make sure the client is not
2680 * waitig for a blocking operation such as BLPOP. Note that the first
2681 * iteration the client is never blocked, otherwise the processInputBuffer
2682 * would not be called at all, but after the execution of the first commands
2683 * in the input buffer the client may be blocked, and the "goto again"
2684 * will try to reiterate. The following line will make it return asap. */
2685 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2686 if (c->bulklen == -1) {
2687 /* Read the first line of the query */
2688 char *p = strchr(c->querybuf,'\n');
2689 size_t querylen;
2690
2691 if (p) {
2692 sds query, *argv;
2693 int argc, j;
2694
2695 query = c->querybuf;
2696 c->querybuf = sdsempty();
2697 querylen = 1+(p-(query));
2698 if (sdslen(query) > querylen) {
2699 /* leave data after the first line of the query in the buffer */
2700 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2701 }
2702 *p = '\0'; /* remove "\n" */
2703 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2704 sdsupdatelen(query);
2705
2706 /* Now we can split the query in arguments */
2707 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2708 sdsfree(query);
2709
2710 if (c->argv) zfree(c->argv);
2711 c->argv = zmalloc(sizeof(robj*)*argc);
2712
2713 for (j = 0; j < argc; j++) {
2714 if (sdslen(argv[j])) {
2715 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2716 c->argc++;
2717 } else {
2718 sdsfree(argv[j]);
2719 }
2720 }
2721 zfree(argv);
2722 if (c->argc) {
2723 /* Execute the command. If the client is still valid
2724 * after processCommand() return and there is something
2725 * on the query buffer try to process the next command. */
2726 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2727 } else {
2728 /* Nothing to process, argc == 0. Just process the query
2729 * buffer if it's not empty or return to the caller */
2730 if (sdslen(c->querybuf)) goto again;
2731 }
2732 return;
2733 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2734 redisLog(REDIS_VERBOSE, "Client protocol error");
2735 freeClient(c);
2736 return;
2737 }
2738 } else {
2739 /* Bulk read handling. Note that if we are at this point
2740 the client already sent a command terminated with a newline,
2741 we are reading the bulk data that is actually the last
2742 argument of the command. */
2743 int qbl = sdslen(c->querybuf);
2744
2745 if (c->bulklen <= qbl) {
2746 /* Copy everything but the final CRLF as final argument */
2747 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2748 c->argc++;
2749 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2750 /* Process the command. If the client is still valid after
2751 * the processing and there is more data in the buffer
2752 * try to parse it. */
2753 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2754 return;
2755 }
2756 }
2757 }
2758
2759 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2760 redisClient *c = (redisClient*) privdata;
2761 char buf[REDIS_IOBUF_LEN];
2762 int nread;
2763 REDIS_NOTUSED(el);
2764 REDIS_NOTUSED(mask);
2765
2766 nread = read(fd, buf, REDIS_IOBUF_LEN);
2767 if (nread == -1) {
2768 if (errno == EAGAIN) {
2769 nread = 0;
2770 } else {
2771 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2772 freeClient(c);
2773 return;
2774 }
2775 } else if (nread == 0) {
2776 redisLog(REDIS_VERBOSE, "Client closed connection");
2777 freeClient(c);
2778 return;
2779 }
2780 if (nread) {
2781 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2782 c->lastinteraction = time(NULL);
2783 } else {
2784 return;
2785 }
2786 processInputBuffer(c);
2787 }
2788
2789 static int selectDb(redisClient *c, int id) {
2790 if (id < 0 || id >= server.dbnum)
2791 return REDIS_ERR;
2792 c->db = &server.db[id];
2793 return REDIS_OK;
2794 }
2795
2796 static void *dupClientReplyValue(void *o) {
2797 incrRefCount((robj*)o);
2798 return o;
2799 }
2800
2801 static int listMatchObjects(void *a, void *b) {
2802 return equalStringObjects(a,b);
2803 }
2804
2805 static redisClient *createClient(int fd) {
2806 redisClient *c = zmalloc(sizeof(*c));
2807
2808 anetNonBlock(NULL,fd);
2809 anetTcpNoDelay(NULL,fd);
2810 if (!c) return NULL;
2811 selectDb(c,0);
2812 c->fd = fd;
2813 c->querybuf = sdsempty();
2814 c->argc = 0;
2815 c->argv = NULL;
2816 c->bulklen = -1;
2817 c->multibulk = 0;
2818 c->mbargc = 0;
2819 c->mbargv = NULL;
2820 c->sentlen = 0;
2821 c->flags = 0;
2822 c->lastinteraction = time(NULL);
2823 c->authenticated = 0;
2824 c->replstate = REDIS_REPL_NONE;
2825 c->reply = listCreate();
2826 listSetFreeMethod(c->reply,decrRefCount);
2827 listSetDupMethod(c->reply,dupClientReplyValue);
2828 c->blocking_keys = NULL;
2829 c->blocking_keys_num = 0;
2830 c->io_keys = listCreate();
2831 c->watched_keys = listCreate();
2832 listSetFreeMethod(c->io_keys,decrRefCount);
2833 c->pubsub_channels = dictCreate(&setDictType,NULL);
2834 c->pubsub_patterns = listCreate();
2835 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2836 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2837 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2838 readQueryFromClient, c) == AE_ERR) {
2839 freeClient(c);
2840 return NULL;
2841 }
2842 listAddNodeTail(server.clients,c);
2843 initClientMultiState(c);
2844 return c;
2845 }
2846
2847 static void addReply(redisClient *c, robj *obj) {
2848 if (listLength(c->reply) == 0 &&
2849 (c->replstate == REDIS_REPL_NONE ||
2850 c->replstate == REDIS_REPL_ONLINE) &&
2851 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2852 sendReplyToClient, c) == AE_ERR) return;
2853
2854 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2855 obj = dupStringObject(obj);
2856 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2857 }
2858 listAddNodeTail(c->reply,getDecodedObject(obj));
2859 }
2860
2861 static void addReplySds(redisClient *c, sds s) {
2862 robj *o = createObject(REDIS_STRING,s);
2863 addReply(c,o);
2864 decrRefCount(o);
2865 }
2866
2867 static void addReplyDouble(redisClient *c, double d) {
2868 char buf[128];
2869
2870 snprintf(buf,sizeof(buf),"%.17g",d);
2871 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2872 (unsigned long) strlen(buf),buf));
2873 }
2874
2875 static void addReplyLongLong(redisClient *c, long long ll) {
2876 char buf[128];
2877 size_t len;
2878
2879 if (ll == 0) {
2880 addReply(c,shared.czero);
2881 return;
2882 } else if (ll == 1) {
2883 addReply(c,shared.cone);
2884 return;
2885 }
2886 buf[0] = ':';
2887 len = ll2string(buf+1,sizeof(buf)-1,ll);
2888 buf[len+1] = '\r';
2889 buf[len+2] = '\n';
2890 addReplySds(c,sdsnewlen(buf,len+3));
2891 }
2892
2893 static void addReplyUlong(redisClient *c, unsigned long ul) {
2894 char buf[128];
2895 size_t len;
2896
2897 if (ul == 0) {
2898 addReply(c,shared.czero);
2899 return;
2900 } else if (ul == 1) {
2901 addReply(c,shared.cone);
2902 return;
2903 }
2904 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2905 addReplySds(c,sdsnewlen(buf,len));
2906 }
2907
2908 static void addReplyBulkLen(redisClient *c, robj *obj) {
2909 size_t len, intlen;
2910 char buf[128];
2911
2912 if (obj->encoding == REDIS_ENCODING_RAW) {
2913 len = sdslen(obj->ptr);
2914 } else {
2915 long n = (long)obj->ptr;
2916
2917 /* Compute how many bytes will take this integer as a radix 10 string */
2918 len = 1;
2919 if (n < 0) {
2920 len++;
2921 n = -n;
2922 }
2923 while((n = n/10) != 0) {
2924 len++;
2925 }
2926 }
2927 buf[0] = '$';
2928 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2929 buf[intlen+1] = '\r';
2930 buf[intlen+2] = '\n';
2931 addReplySds(c,sdsnewlen(buf,intlen+3));
2932 }
2933
2934 static void addReplyBulk(redisClient *c, robj *obj) {
2935 addReplyBulkLen(c,obj);
2936 addReply(c,obj);
2937 addReply(c,shared.crlf);
2938 }
2939
2940 static void addReplyBulkSds(redisClient *c, sds s) {
2941 robj *o = createStringObject(s, sdslen(s));
2942 addReplyBulk(c,o);
2943 decrRefCount(o);
2944 }
2945
2946 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2947 static void addReplyBulkCString(redisClient *c, char *s) {
2948 if (s == NULL) {
2949 addReply(c,shared.nullbulk);
2950 } else {
2951 robj *o = createStringObject(s,strlen(s));
2952 addReplyBulk(c,o);
2953 decrRefCount(o);
2954 }
2955 }
2956
2957 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2958 int cport, cfd;
2959 char cip[128];
2960 redisClient *c;
2961 REDIS_NOTUSED(el);
2962 REDIS_NOTUSED(mask);
2963 REDIS_NOTUSED(privdata);
2964
2965 cfd = anetAccept(server.neterr, fd, cip, &cport);
2966 if (cfd == AE_ERR) {
2967 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2968 return;
2969 }
2970 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2971 if ((c = createClient(cfd)) == NULL) {
2972 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2973 close(cfd); /* May be already closed, just ingore errors */
2974 return;
2975 }
2976 /* If maxclient directive is set and this is one client more... close the
2977 * connection. Note that we create the client instead to check before
2978 * for this condition, since now the socket is already set in nonblocking
2979 * mode and we can send an error for free using the Kernel I/O */
2980 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2981 char *err = "-ERR max number of clients reached\r\n";
2982
2983 /* That's a best effort error message, don't check write errors */
2984 if (write(c->fd,err,strlen(err)) == -1) {
2985 /* Nothing to do, Just to avoid the warning... */
2986 }
2987 freeClient(c);
2988 return;
2989 }
2990 server.stat_numconnections++;
2991 }
2992
2993 /* ======================= Redis objects implementation ===================== */
2994
2995 static robj *createObject(int type, void *ptr) {
2996 robj *o;
2997
2998 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2999 if (listLength(server.objfreelist)) {
3000 listNode *head = listFirst(server.objfreelist);
3001 o = listNodeValue(head);
3002 listDelNode(server.objfreelist,head);
3003 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3004 } else {
3005 if (server.vm_enabled)
3006 pthread_mutex_unlock(&server.obj_freelist_mutex);
3007 o = zmalloc(sizeof(*o));
3008 }
3009 o->type = type;
3010 o->encoding = REDIS_ENCODING_RAW;
3011 o->ptr = ptr;
3012 o->refcount = 1;
3013 if (server.vm_enabled) {
3014 /* Note that this code may run in the context of an I/O thread
3015 * and accessing server.lruclock in theory is an error
3016 * (no locks). But in practice this is safe, and even if we read
3017 * garbage Redis will not fail. */
3018 o->lru = server.lruclock;
3019 o->storage = REDIS_VM_MEMORY;
3020 }
3021 return o;
3022 }
3023
3024 static robj *createStringObject(char *ptr, size_t len) {
3025 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3026 }
3027
3028 static robj *createStringObjectFromLongLong(long long value) {
3029 robj *o;
3030 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3031 incrRefCount(shared.integers[value]);
3032 o = shared.integers[value];
3033 } else {
3034 if (value >= LONG_MIN && value <= LONG_MAX) {
3035 o = createObject(REDIS_STRING, NULL);
3036 o->encoding = REDIS_ENCODING_INT;
3037 o->ptr = (void*)((long)value);
3038 } else {
3039 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3040 }
3041 }
3042 return o;
3043 }
3044
3045 static robj *dupStringObject(robj *o) {
3046 assert(o->encoding == REDIS_ENCODING_RAW);
3047 return createStringObject(o->ptr,sdslen(o->ptr));
3048 }
3049
3050 static robj *createListObject(void) {
3051 list *l = listCreate();
3052 robj *o = createObject(REDIS_LIST,l);
3053 listSetFreeMethod(l,decrRefCount);
3054 o->encoding = REDIS_ENCODING_LIST;
3055 return o;
3056 }
3057
3058 static robj *createZiplistObject(void) {
3059 unsigned char *zl = ziplistNew();
3060 robj *o = createObject(REDIS_LIST,zl);
3061 o->encoding = REDIS_ENCODING_ZIPLIST;
3062 return o;
3063 }
3064
3065 static robj *createSetObject(void) {
3066 dict *d = dictCreate(&setDictType,NULL);
3067 robj *o = createObject(REDIS_SET,d);
3068 o->encoding = REDIS_ENCODING_HT;
3069 return o;
3070 }
3071
3072 static robj *createHashObject(void) {
3073 /* All the Hashes start as zipmaps. Will be automatically converted
3074 * into hash tables if there are enough elements or big elements
3075 * inside. */
3076 unsigned char *zm = zipmapNew();
3077 robj *o = createObject(REDIS_HASH,zm);
3078 o->encoding = REDIS_ENCODING_ZIPMAP;
3079 return o;
3080 }
3081
3082 static robj *createZsetObject(void) {
3083 zset *zs = zmalloc(sizeof(*zs));
3084
3085 zs->dict = dictCreate(&zsetDictType,NULL);
3086 zs->zsl = zslCreate();
3087 return createObject(REDIS_ZSET,zs);
3088 }
3089
3090 static void freeStringObject(robj *o) {
3091 if (o->encoding == REDIS_ENCODING_RAW) {
3092 sdsfree(o->ptr);
3093 }
3094 }
3095
3096 static void freeListObject(robj *o) {
3097 switch (o->encoding) {
3098 case REDIS_ENCODING_LIST:
3099 listRelease((list*) o->ptr);
3100 break;
3101 case REDIS_ENCODING_ZIPLIST:
3102 zfree(o->ptr);
3103 break;
3104 default:
3105 redisPanic("Unknown list encoding type");
3106 }
3107 }
3108
3109 static void freeSetObject(robj *o) {
3110 dictRelease((dict*) o->ptr);
3111 }
3112
3113 static void freeZsetObject(robj *o) {
3114 zset *zs = o->ptr;
3115
3116 dictRelease(zs->dict);
3117 zslFree(zs->zsl);
3118 zfree(zs);
3119 }
3120
3121 static void freeHashObject(robj *o) {
3122 switch (o->encoding) {
3123 case REDIS_ENCODING_HT:
3124 dictRelease((dict*) o->ptr);
3125 break;
3126 case REDIS_ENCODING_ZIPMAP:
3127 zfree(o->ptr);
3128 break;
3129 default:
3130 redisPanic("Unknown hash encoding type");
3131 break;
3132 }
3133 }
3134
3135 static void incrRefCount(robj *o) {
3136 o->refcount++;
3137 }
3138
3139 static void decrRefCount(void *obj) {
3140 robj *o = obj;
3141
3142 /* Object is a swapped out value, or in the process of being loaded. */
3143 if (server.vm_enabled &&
3144 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3145 {
3146 vmpointer *vp = obj;
3147 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3148 vmMarkPagesFree(vp->page,vp->usedpages);
3149 server.vm_stats_swapped_objects--;
3150 zfree(vp);
3151 return;
3152 }
3153
3154 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3155 /* Object is in memory, or in the process of being swapped out.
3156 *
3157 * If the object is being swapped out, abort the operation on
3158 * decrRefCount even if the refcount does not drop to 0: the object
3159 * is referenced at least two times, as value of the key AND as
3160 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3161 * done but the relevant key was removed in the meantime, the
3162 * complete jobs handler will not find the key about the job and the
3163 * assert will fail. */
3164 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3165 vmCancelThreadedIOJob(o);
3166 if (--(o->refcount) == 0) {
3167 switch(o->type) {
3168 case REDIS_STRING: freeStringObject(o); break;
3169 case REDIS_LIST: freeListObject(o); break;
3170 case REDIS_SET: freeSetObject(o); break;
3171 case REDIS_ZSET: freeZsetObject(o); break;
3172 case REDIS_HASH: freeHashObject(o); break;
3173 default: redisPanic("Unknown object type"); break;
3174 }
3175 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3176 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3177 !listAddNodeHead(server.objfreelist,o))
3178 zfree(o);
3179 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3180 }
3181 }
3182
3183 static int checkType(redisClient *c, robj *o, int type) {
3184 if (o->type != type) {
3185 addReply(c,shared.wrongtypeerr);
3186 return 1;
3187 }
3188 return 0;
3189 }
3190
3191 /* Check if the nul-terminated string 's' can be represented by a long
3192 * (that is, is a number that fits into long without any other space or
3193 * character before or after the digits).
3194 *
3195 * If so, the function returns REDIS_OK and *longval is set to the value
3196 * of the number. Otherwise REDIS_ERR is returned */
3197 static int isStringRepresentableAsLong(sds s, long *longval) {
3198 char buf[32], *endptr;
3199 long value;
3200 int slen;
3201
3202 value = strtol(s, &endptr, 10);
3203 if (endptr[0] != '\0') return REDIS_ERR;
3204 slen = ll2string(buf,32,value);
3205
3206 /* If the number converted back into a string is not identical
3207 * then it's not possible to encode the string as integer */
3208 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3209 if (longval) *longval = value;
3210 return REDIS_OK;
3211 }
3212
3213 /* Try to encode a string object in order to save space */
3214 static robj *tryObjectEncoding(robj *o) {
3215 long value;
3216 sds s = o->ptr;
3217
3218 if (o->encoding != REDIS_ENCODING_RAW)
3219 return o; /* Already encoded */
3220
3221 /* It's not safe to encode shared objects: shared objects can be shared
3222 * everywhere in the "object space" of Redis. Encoded objects can only
3223 * appear as "values" (and not, for instance, as keys) */
3224 if (o->refcount > 1) return o;
3225
3226 /* Currently we try to encode only strings */
3227 redisAssert(o->type == REDIS_STRING);
3228
3229 /* Check if we can represent this string as a long integer */
3230 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3231
3232 /* Ok, this object can be encoded */
3233 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3234 decrRefCount(o);
3235 incrRefCount(shared.integers[value]);
3236 return shared.integers[value];
3237 } else {
3238 o->encoding = REDIS_ENCODING_INT;
3239 sdsfree(o->ptr);
3240 o->ptr = (void*) value;
3241 return o;
3242 }
3243 }
3244
3245 /* Get a decoded version of an encoded object (returned as a new object).
3246 * If the object is already raw-encoded just increment the ref count. */
3247 static robj *getDecodedObject(robj *o) {
3248 robj *dec;
3249
3250 if (o->encoding == REDIS_ENCODING_RAW) {
3251 incrRefCount(o);
3252 return o;
3253 }
3254 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3255 char buf[32];
3256
3257 ll2string(buf,32,(long)o->ptr);
3258 dec = createStringObject(buf,strlen(buf));
3259 return dec;
3260 } else {
3261 redisPanic("Unknown encoding type");
3262 }
3263 }
3264
3265 /* Compare two string objects via strcmp() or alike.
3266 * Note that the objects may be integer-encoded. In such a case we
3267 * use ll2string() to get a string representation of the numbers on the stack
3268 * and compare the strings, it's much faster than calling getDecodedObject().
3269 *
3270 * Important note: if objects are not integer encoded, but binary-safe strings,
3271 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3272 * binary safe. */
3273 static int compareStringObjects(robj *a, robj *b) {
3274 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3275 char bufa[128], bufb[128], *astr, *bstr;
3276 int bothsds = 1;
3277
3278 if (a == b) return 0;
3279 if (a->encoding != REDIS_ENCODING_RAW) {
3280 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3281 astr = bufa;
3282 bothsds = 0;
3283 } else {
3284 astr = a->ptr;
3285 }
3286 if (b->encoding != REDIS_ENCODING_RAW) {
3287 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3288 bstr = bufb;
3289 bothsds = 0;
3290 } else {
3291 bstr = b->ptr;
3292 }
3293 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3294 }
3295
3296 /* Equal string objects return 1 if the two objects are the same from the
3297 * point of view of a string comparison, otherwise 0 is returned. Note that
3298 * this function is faster then checking for (compareStringObject(a,b) == 0)
3299 * because it can perform some more optimization. */
3300 static int equalStringObjects(robj *a, robj *b) {
3301 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3302 return a->ptr == b->ptr;
3303 } else {
3304 return compareStringObjects(a,b) == 0;
3305 }
3306 }
3307
3308 static size_t stringObjectLen(robj *o) {
3309 redisAssert(o->type == REDIS_STRING);
3310 if (o->encoding == REDIS_ENCODING_RAW) {
3311 return sdslen(o->ptr);
3312 } else {
3313 char buf[32];
3314
3315 return ll2string(buf,32,(long)o->ptr);
3316 }
3317 }
3318
3319 static int getDoubleFromObject(robj *o, double *target) {
3320 double value;
3321 char *eptr;
3322
3323 if (o == NULL) {
3324 value = 0;
3325 } else {
3326 redisAssert(o->type == REDIS_STRING);
3327 if (o->encoding == REDIS_ENCODING_RAW) {
3328 value = strtod(o->ptr, &eptr);
3329 if (eptr[0] != '\0') return REDIS_ERR;
3330 } else if (o->encoding == REDIS_ENCODING_INT) {
3331 value = (long)o->ptr;
3332 } else {
3333 redisPanic("Unknown string encoding");
3334 }
3335 }
3336
3337 *target = value;
3338 return REDIS_OK;
3339 }
3340
3341 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3342 double value;
3343 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3344 if (msg != NULL) {
3345 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3346 } else {
3347 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3348 }
3349 return REDIS_ERR;
3350 }
3351
3352 *target = value;
3353 return REDIS_OK;
3354 }
3355
3356 static int getLongLongFromObject(robj *o, long long *target) {
3357 long long value;
3358 char *eptr;
3359
3360 if (o == NULL) {
3361 value = 0;
3362 } else {
3363 redisAssert(o->type == REDIS_STRING);
3364 if (o->encoding == REDIS_ENCODING_RAW) {
3365 value = strtoll(o->ptr, &eptr, 10);
3366 if (eptr[0] != '\0') return REDIS_ERR;
3367 } else if (o->encoding == REDIS_ENCODING_INT) {
3368 value = (long)o->ptr;
3369 } else {
3370 redisPanic("Unknown string encoding");
3371 }
3372 }
3373
3374 *target = value;
3375 return REDIS_OK;
3376 }
3377
3378 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3379 long long value;
3380 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3381 if (msg != NULL) {
3382 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3383 } else {
3384 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3385 }
3386 return REDIS_ERR;
3387 }
3388
3389 *target = value;
3390 return REDIS_OK;
3391 }
3392
3393 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3394 long long value;
3395
3396 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3397 if (value < LONG_MIN || value > LONG_MAX) {
3398 if (msg != NULL) {
3399 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3400 } else {
3401 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3402 }
3403 return REDIS_ERR;
3404 }
3405
3406 *target = value;
3407 return REDIS_OK;
3408 }
3409
3410 /* =========================== Keyspace access API ========================== */
3411
3412 static robj *lookupKey(redisDb *db, robj *key) {
3413 dictEntry *de = dictFind(db->dict,key->ptr);
3414 if (de) {
3415 robj *val = dictGetEntryVal(de);
3416
3417 if (server.vm_enabled) {
3418 if (val->storage == REDIS_VM_MEMORY ||
3419 val->storage == REDIS_VM_SWAPPING)
3420 {
3421 /* If we were swapping the object out, cancel the operation */
3422 if (val->storage == REDIS_VM_SWAPPING)
3423 vmCancelThreadedIOJob(val);
3424 /* Update the access time for the aging algorithm. */
3425 val->lru = server.lruclock;
3426 } else {
3427 int notify = (val->storage == REDIS_VM_LOADING);
3428
3429 /* Our value was swapped on disk. Bring it at home. */
3430 redisAssert(val->type == REDIS_VMPOINTER);
3431 val = vmLoadObject(val);
3432 dictGetEntryVal(de) = val;
3433
3434 /* Clients blocked by the VM subsystem may be waiting for
3435 * this key... */
3436 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3437 }
3438 }
3439 return val;
3440 } else {
3441 return NULL;
3442 }
3443 }
3444
3445 static robj *lookupKeyRead(redisDb *db, robj *key) {
3446 expireIfNeeded(db,key);
3447 return lookupKey(db,key);
3448 }
3449
3450 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3451 deleteIfVolatile(db,key);
3452 touchWatchedKey(db,key);
3453 return lookupKey(db,key);
3454 }
3455
3456 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3457 robj *o = lookupKeyRead(c->db, key);
3458 if (!o) addReply(c,reply);
3459 return o;
3460 }
3461
3462 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3463 robj *o = lookupKeyWrite(c->db, key);
3464 if (!o) addReply(c,reply);
3465 return o;
3466 }
3467
3468 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3469 * otherwise REDIS_OK is returned, and the caller should increment the
3470 * refcount of 'val'. */
3471 static int dbAdd(redisDb *db, robj *key, robj *val) {
3472 /* Perform a lookup before adding the key, as we need to copy the
3473 * key value. */
3474 if (dictFind(db->dict, key->ptr) != NULL) {
3475 return REDIS_ERR;
3476 } else {
3477 sds copy = sdsdup(key->ptr);
3478 dictAdd(db->dict, copy, val);
3479 return REDIS_OK;
3480 }
3481 }
3482
3483 /* If the key does not exist, this is just like dbAdd(). Otherwise
3484 * the value associated to the key is replaced with the new one.
3485 *
3486 * On update (key already existed) 0 is returned. Otherwise 1. */
3487 static int dbReplace(redisDb *db, robj *key, robj *val) {
3488 if (dictFind(db->dict,key->ptr) == NULL) {
3489 sds copy = sdsdup(key->ptr);
3490 dictAdd(db->dict, copy, val);
3491 return 1;
3492 } else {
3493 dictReplace(db->dict, key->ptr, val);
3494 return 0;
3495 }
3496 }
3497
3498 static int dbExists(redisDb *db, robj *key) {
3499 return dictFind(db->dict,key->ptr) != NULL;
3500 }
3501
3502 /* Return a random key, in form of a Redis object.
3503 * If there are no keys, NULL is returned.
3504 *
3505 * The function makes sure to return keys not already expired. */
3506 static robj *dbRandomKey(redisDb *db) {
3507 struct dictEntry *de;
3508
3509 while(1) {
3510 sds key;
3511 robj *keyobj;
3512
3513 de = dictGetRandomKey(db->dict);
3514 if (de == NULL) return NULL;
3515
3516 key = dictGetEntryKey(de);
3517 keyobj = createStringObject(key,sdslen(key));
3518 if (dictFind(db->expires,key)) {
3519 if (expireIfNeeded(db,keyobj)) {
3520 decrRefCount(keyobj);
3521 continue; /* search for another key. This expired. */
3522 }
3523 }
3524 return keyobj;
3525 }
3526 }
3527
3528 /* Delete a key, value, and associated expiration entry if any, from the DB */
3529 static int dbDelete(redisDb *db, robj *key) {
3530 int retval;
3531
3532 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3533 retval = dictDelete(db->dict,key->ptr);
3534
3535 return retval == DICT_OK;
3536 }
3537
3538 /*============================ RDB saving/loading =========================== */
3539
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,sizeof(type),1,fp);

    return written ? 0 : -1;
}
3544
/* Write 't' to 'fp' as a 4 byte integer: the value is first converted to
 * int32_t and its in-memory representation is written as is.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,sizeof(stamp),1,fp) == 0) ? -1 : 0;
}
3550
3551 /* check rdbLoadLen() comments for more info */
3552 static int rdbSaveLen(FILE *fp, uint32_t len) {
3553 unsigned char buf[2];
3554
3555 if (len < (1<<6)) {
3556 /* Save a 6 bit len */
3557 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3558 if (fwrite(buf,1,1,fp) == 0) return -1;
3559 } else if (len < (1<<14)) {
3560 /* Save a 14 bit len */
3561 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3562 buf[1] = len&0xFF;
3563 if (fwrite(buf,2,1,fp) == 0) return -1;
3564 } else {
3565 /* Save a 32 bit len */
3566 buf[0] = (REDIS_RDB_32BITLEN<<6);
3567 if (fwrite(buf,1,1,fp) == 0) return -1;
3568 len = htonl(len);
3569 if (fwrite(&len,4,1,fp) == 0) return -1;
3570 }
3571 return 0;
3572 }
3573
3574 /* Encode 'value' as an integer if possible (if integer will fit the
3575 * supported range). If the function sucessful encoded the integer
3576 * then the (up to 5 bytes) encoded representation is written in the
3577 * string pointed by 'enc' and the length is returned. Otherwise
3578 * 0 is returned. */
3579 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3580 /* Finally check if it fits in our ranges */
3581 if (value >= -(1<<7) && value <= (1<<7)-1) {
3582 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3583 enc[1] = value&0xFF;
3584 return 2;
3585 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3586 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3587 enc[1] = value&0xFF;
3588 enc[2] = (value>>8)&0xFF;
3589 return 3;
3590 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3591 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3592 enc[1] = value&0xFF;
3593 enc[2] = (value>>8)&0xFF;
3594 enc[3] = (value>>16)&0xFF;
3595 enc[4] = (value>>24)&0xFF;
3596 return 5;
3597 } else {
3598 return 0;
3599 }
3600 }
3601
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes of data. On success the encoded form is stored
 * in 'enc' and its length in bytes is returned; 0 is returned when the
 * string can't be represented as an encoded integer.
 *
 * 's' is treated as a counted buffer: callers such as rdbSaveRawString()
 * pass pointer/length pairs coming from ziplist and zipmap entries, which
 * are not guaranteed to be NUL terminated, so the number is parsed from a
 * bounded local copy instead of running strtoll() on 's' directly. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], num[32];

    /* A string this long can never equal the decimal representation of a
     * long long, and it would not fit the local copy anyway. */
    if (len >= sizeof(num)) return 0;
    memcpy(num,s,len);
    num[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(num, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3620
3621 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3622 size_t comprlen, outlen;
3623 unsigned char byte;
3624 void *out;
3625
3626 /* We require at least four bytes compression for this to be worth it */
3627 if (len <= 4) return 0;
3628 outlen = len-4;
3629 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3630 comprlen = lzf_compress(s, len, out, outlen);
3631 if (comprlen == 0) {
3632 zfree(out);
3633 return 0;
3634 }
3635 /* Data compressed! Let's save it on disk */
3636 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3637 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3638 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3639 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3640 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3641 zfree(out);
3642 return comprlen;
3643
3644 writeerr:
3645 zfree(out);
3646 return -1;
3647 }
3648
3649 /* Save a string objet as [len][data] on disk. If the object is a string
3650 * representation of an integer value we try to safe it in a special form */
3651 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3652 int enclen;
3653
3654 /* Try integer encoding */
3655 if (len <= 11) {
3656 unsigned char buf[5];
3657 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3658 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3659 return 0;
3660 }
3661 }
3662
3663 /* Try LZF compression - under 20 bytes it's unable to compress even
3664 * aaaaaaaaaaaaaaaaaa so skip it */
3665 if (server.rdbcompression && len > 20) {
3666 int retval;
3667
3668 retval = rdbSaveLzfStringObject(fp,s,len);
3669 if (retval == -1) return -1;
3670 if (retval > 0) return 0;
3671 /* retval == 0 means data can't be compressed, save the old way */
3672 }
3673
3674 /* Store verbatim */
3675 if (rdbSaveLen(fp,len) == -1) return -1;
3676 if (len && fwrite(s,len,1,fp) == 0) return -1;
3677 return 0;
3678 }
3679
/* Save 'value' using the compact integer encoding when
 * rdbEncodeInteger() accepts it, otherwise as the [len][data] string
 * holding its decimal representation.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int n = rdbEncodeInteger(value,buf);

    if (n > 0) {
        /* Encoded integer */
        if (fwrite(buf,n,1,fp) == 0) return -1;
        return 0;
    }

    /* Encode as string */
    n = ll2string((char*)buf,32,value);
    redisAssert(n < 32);
    if (rdbSaveLen(fp,n) == -1) return -1;
    if (fwrite(buf,n,1,fp) == 0) return -1;
    return 0;
}
3695
3696 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3697 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3698 /* Avoid to decode the object, then encode it again, if the
3699 * object is alrady integer encoded. */
3700 if (obj->encoding == REDIS_ENCODING_INT) {
3701 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3702 } else {
3703 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3704 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3705 }
3706 }
3707
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available for it. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* buf[0] is the length prefix, the text follows. */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3750
3751 /* Save a Redis object. */
3752 static int rdbSaveObject(FILE *fp, robj *o) {
3753 if (o->type == REDIS_STRING) {
3754 /* Save a string value */
3755 if (rdbSaveStringObject(fp,o) == -1) return -1;
3756 } else if (o->type == REDIS_LIST) {
3757 /* Save a list value */
3758 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3759 unsigned char *p;
3760 unsigned char *vstr;
3761 unsigned int vlen;
3762 long long vlong;
3763
3764 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3765 p = ziplistIndex(o->ptr,0);
3766 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3767 if (vstr) {
3768 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3769 return -1;
3770 } else {
3771 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3772 return -1;
3773 }
3774 p = ziplistNext(o->ptr,p);
3775 }
3776 } else if (o->encoding == REDIS_ENCODING_LIST) {
3777 list *list = o->ptr;
3778 listIter li;
3779 listNode *ln;
3780
3781 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3782 listRewind(list,&li);
3783 while((ln = listNext(&li))) {
3784 robj *eleobj = listNodeValue(ln);
3785 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3786 }
3787 } else {
3788 redisPanic("Unknown list encoding");
3789 }
3790 } else if (o->type == REDIS_SET) {
3791 /* Save a set value */
3792 dict *set = o->ptr;
3793 dictIterator *di = dictGetIterator(set);
3794 dictEntry *de;
3795
3796 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3797 while((de = dictNext(di)) != NULL) {
3798 robj *eleobj = dictGetEntryKey(de);
3799
3800 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3801 }
3802 dictReleaseIterator(di);
3803 } else if (o->type == REDIS_ZSET) {
3804 /* Save a set value */
3805 zset *zs = o->ptr;
3806 dictIterator *di = dictGetIterator(zs->dict);
3807 dictEntry *de;
3808
3809 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3810 while((de = dictNext(di)) != NULL) {
3811 robj *eleobj = dictGetEntryKey(de);
3812 double *score = dictGetEntryVal(de);
3813
3814 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3815 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3816 }
3817 dictReleaseIterator(di);
3818 } else if (o->type == REDIS_HASH) {
3819 /* Save a hash value */
3820 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3821 unsigned char *p = zipmapRewind(o->ptr);
3822 unsigned int count = zipmapLen(o->ptr);
3823 unsigned char *key, *val;
3824 unsigned int klen, vlen;
3825
3826 if (rdbSaveLen(fp,count) == -1) return -1;
3827 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3828 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3829 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3830 }
3831 } else {
3832 dictIterator *di = dictGetIterator(o->ptr);
3833 dictEntry *de;
3834
3835 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3836 while((de = dictNext(di)) != NULL) {
3837 robj *key = dictGetEntryKey(de);
3838 robj *val = dictGetEntryVal(de);
3839
3840 if (rdbSaveStringObject(fp,key) == -1) return -1;
3841 if (rdbSaveStringObject(fp,val) == -1) return -1;
3842 }
3843 dictReleaseIterator(di);
3844 }
3845 } else {
3846 redisPanic("Unknown object type");
3847 }
3848 return 0;
3849 }
3850
3851 /* Return the length the object will have on disk if saved with
3852 * the rdbSaveObject() function. Currently we use a trick to get
3853 * this length with very little changes to the code. In the future
3854 * we could switch to a faster solution. */
3855 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3856 if (fp == NULL) fp = server.devnull;
3857 rewind(fp);
3858 assert(rdbSaveObject(fp,o) != 1);
3859 return ftello(fp);
3860 }
3861
3862 /* Return the number of pages required to save this object in the swap file */
3863 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3864 off_t bytes = rdbSavedObjectLen(o,fp);
3865
3866 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3867 }
3868
3869 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3870 static int rdbSave(char *filename) {
3871 dictIterator *di = NULL;
3872 dictEntry *de;
3873 FILE *fp;
3874 char tmpfile[256];
3875 int j;
3876 time_t now = time(NULL);
3877
3878 /* Wait for I/O therads to terminate, just in case this is a
3879 * foreground-saving, to avoid seeking the swap file descriptor at the
3880 * same time. */
3881 if (server.vm_enabled)
3882 waitEmptyIOJobsQueue();
3883
3884 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3885 fp = fopen(tmpfile,"w");
3886 if (!fp) {
3887 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3888 return REDIS_ERR;
3889 }
3890 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3891 for (j = 0; j < server.dbnum; j++) {
3892 redisDb *db = server.db+j;
3893 dict *d = db->dict;
3894 if (dictSize(d) == 0) continue;
3895 di = dictGetIterator(d);
3896 if (!di) {
3897 fclose(fp);
3898 return REDIS_ERR;
3899 }
3900
3901 /* Write the SELECT DB opcode */
3902 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3903 if (rdbSaveLen(fp,j) == -1) goto werr;
3904
3905 /* Iterate this DB writing every entry */
3906 while((de = dictNext(di)) != NULL) {
3907 sds keystr = dictGetEntryKey(de);
3908 robj key, *o = dictGetEntryVal(de);
3909 time_t expiretime;
3910
3911 initStaticStringObject(key,keystr);
3912 expiretime = getExpire(db,&key);
3913
3914 /* Save the expire time */
3915 if (expiretime != -1) {
3916 /* If this key is already expired skip it */
3917 if (expiretime < now) continue;
3918 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3919 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3920 }
3921 /* Save the key and associated value. This requires special
3922 * handling if the value is swapped out. */
3923 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3924 o->storage == REDIS_VM_SWAPPING) {
3925 /* Save type, key, value */
3926 if (rdbSaveType(fp,o->type) == -1) goto werr;
3927 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3928 if (rdbSaveObject(fp,o) == -1) goto werr;
3929 } else {
3930 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3931 robj *po;
3932 /* Get a preview of the object in memory */
3933 po = vmPreviewObject(o);
3934 /* Save type, key, value */
3935 if (rdbSaveType(fp,po->type) == -1) goto werr;
3936 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3937 if (rdbSaveObject(fp,po) == -1) goto werr;
3938 /* Remove the loaded object from memory */
3939 decrRefCount(po);
3940 }
3941 }
3942 dictReleaseIterator(di);
3943 }
3944 /* EOF opcode */
3945 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3946
3947 /* Make sure data will not remain on the OS's output buffers */
3948 fflush(fp);
3949 fsync(fileno(fp));
3950 fclose(fp);
3951
3952 /* Use RENAME to make sure the DB file is changed atomically only
3953 * if the generate DB file is ok. */
3954 if (rename(tmpfile,filename) == -1) {
3955 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3956 unlink(tmpfile);
3957 return REDIS_ERR;
3958 }
3959 redisLog(REDIS_NOTICE,"DB saved on disk");
3960 server.dirty = 0;
3961 server.lastsave = time(NULL);
3962 return REDIS_OK;
3963
3964 werr:
3965 fclose(fp);
3966 unlink(tmpfile);
3967 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3968 if (di) dictReleaseIterator(di);
3969 return REDIS_ERR;
3970 }
3971
3972 static int rdbSaveBackground(char *filename) {
3973 pid_t childpid;
3974
3975 if (server.bgsavechildpid != -1) return REDIS_ERR;
3976 if (server.vm_enabled) waitEmptyIOJobsQueue();
3977 if ((childpid = fork()) == 0) {
3978 /* Child */
3979 if (server.vm_enabled) vmReopenSwapFile();
3980 close(server.fd);
3981 if (rdbSave(filename) == REDIS_OK) {
3982 _exit(0);
3983 } else {
3984 _exit(1);
3985 }
3986 } else {
3987 /* Parent */
3988 if (childpid == -1) {
3989 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3990 strerror(errno));
3991 return REDIS_ERR;
3992 }
3993 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3994 server.bgsavechildpid = childpid;
3995 updateDictResizePolicy();
3996 return REDIS_OK;
3997 }
3998 return REDIS_OK; /* unreached */
3999 }
4000
/* Remove the temp file "temp-<childpid>.rdb", the name rdbSave() uses when
 * running as the process with that pid. The result of unlink() is
 * ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4007
/* Read one byte from 'fp' and return it as a value in the 0-255 range,
 * or -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
4013
/* Read the 4 byte integer written by rdbSaveTime() and return it as a
 * time_t. -1 is returned when the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,sizeof(stamp),1,fp) != 1) return -1;
    return (time_t) stamp;
}
4019
4020 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4021 * of this file for a description of how this are stored on disk.
4022 *
4023 * isencoded is set to 1 if the readed length is not actually a length but
4024 * an "encoding type", check the above comments for more info */
4025 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4026 unsigned char buf[2];
4027 uint32_t len;
4028 int type;
4029
4030 if (isencoded) *isencoded = 0;
4031 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4032 type = (buf[0]&0xC0)>>6;
4033 if (type == REDIS_RDB_6BITLEN) {
4034 /* Read a 6 bit len */
4035 return buf[0]&0x3F;
4036 } else if (type == REDIS_RDB_ENCVAL) {
4037 /* Read a 6 bit len encoding type */
4038 if (isencoded) *isencoded = 1;
4039 return buf[0]&0x3F;
4040 } else if (type == REDIS_RDB_14BITLEN) {
4041 /* Read a 14 bit len */
4042 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4043 return ((buf[0]&0x3F)<<8)|buf[1];
4044 } else {
4045 /* Read a 32 bit len */
4046 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4047 return ntohl(len);
4048 }
4049 }
4050
4051 /* Load an integer-encoded object from file 'fp', with the specified
4052 * encoding type 'enctype'. If encode is true the function may return
4053 * an integer-encoded object as reply, otherwise the returned object
4054 * will always be encoded as a raw string. */
4055 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4056 unsigned char enc[4];
4057 long long val;
4058
4059 if (enctype == REDIS_RDB_ENC_INT8) {
4060 if (fread(enc,1,1,fp) == 0) return NULL;
4061 val = (signed char)enc[0];
4062 } else if (enctype == REDIS_RDB_ENC_INT16) {
4063 uint16_t v;
4064 if (fread(enc,2,1,fp) == 0) return NULL;
4065 v = enc[0]|(enc[1]<<8);
4066 val = (int16_t)v;
4067 } else if (enctype == REDIS_RDB_ENC_INT32) {
4068 uint32_t v;
4069 if (fread(enc,4,1,fp) == 0) return NULL;
4070 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4071 val = (int32_t)v;
4072 } else {
4073 val = 0; /* anti-warning */
4074 redisPanic("Unknown RDB integer encoding type");
4075 }
4076 if (encode)
4077 return createStringObjectFromLongLong(val);
4078 else
4079 return createObject(REDIS_STRING,sdsfromlonglong(val));
4080 }
4081
4082 static robj *rdbLoadLzfStringObject(FILE*fp) {
4083 unsigned int len, clen;
4084 unsigned char *c = NULL;
4085 sds val = NULL;
4086
4087 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4088 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4089 if ((c = zmalloc(clen)) == NULL) goto err;
4090 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4091 if (fread(c,clen,1,fp) == 0) goto err;
4092 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4093 zfree(c);
4094 return createObject(REDIS_STRING,val);
4095 err:
4096 zfree(c);
4097 sdsfree(val);
4098 return NULL;
4099 }
4100
4101 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4102 int isencoded;
4103 uint32_t len;
4104 sds val;
4105
4106 len = rdbLoadLen(fp,&isencoded);
4107 if (isencoded) {
4108 switch(len) {
4109 case REDIS_RDB_ENC_INT8:
4110 case REDIS_RDB_ENC_INT16:
4111 case REDIS_RDB_ENC_INT32:
4112 return rdbLoadIntegerObject(fp,len,encode);
4113 case REDIS_RDB_ENC_LZF:
4114 return rdbLoadLzfStringObject(fp);
4115 default:
4116 redisPanic("Unknown RDB encoding type");
4117 }
4118 }
4119
4120 if (len == REDIS_RDB_LENERR) return NULL;
4121 val = sdsnewlen(NULL,len);
4122 if (len && fread(val,len,1,fp) == 0) {
4123 sdsfree(val);
4124 return NULL;
4125 }
4126 return createObject(REDIS_STRING,val);
4127 }
4128
4129 static robj *rdbLoadStringObject(FILE *fp) {
4130 return rdbGenericLoadStringObject(fp,0);
4131 }
4132
4133 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4134 return rdbGenericLoadStringObject(fp,1);
4135 }
4136
4137 /* For information about double serialization check rdbSaveDoubleValue() */
4138 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4139 char buf[128];
4140 unsigned char len;
4141
4142 if (fread(&len,1,1,fp) == 0) return -1;
4143 switch(len) {
4144 case 255: *val = R_NegInf; return 0;
4145 case 254: *val = R_PosInf; return 0;
4146 case 253: *val = R_Nan; return 0;
4147 default:
4148 if (fread(buf,len,1,fp) == 0) return -1;
4149 buf[len] = '\0';
4150 sscanf(buf, "%lg", val);
4151 return 0;
4152 }
4153 }
4154
4155 /* Load a Redis object of the specified type from the specified file.
4156 * On success a newly allocated object is returned, otherwise NULL. */
4157 static robj *rdbLoadObject(int type, FILE *fp) {
4158 robj *o, *ele, *dec;
4159 size_t len;
4160
4161 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4162 if (type == REDIS_STRING) {
4163 /* Read string value */
4164 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4165 o = tryObjectEncoding(o);
4166 } else if (type == REDIS_LIST) {
4167 /* Read list value */
4168 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4169
4170 /* Use a real list when there are too many entries */
4171 if (len > server.list_max_ziplist_entries) {
4172 o = createListObject();
4173 } else {
4174 o = createZiplistObject();
4175 }
4176
4177 /* Load every single element of the list */
4178 while(len--) {
4179 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4180
4181 /* If we are using a ziplist and the value is too big, convert
4182 * the object to a real list. */
4183 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4184 ele->encoding == REDIS_ENCODING_RAW &&
4185 sdslen(ele->ptr) > server.list_max_ziplist_value)
4186 listTypeConvert(o,REDIS_ENCODING_LIST);
4187
4188 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4189 dec = getDecodedObject(ele);
4190 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4191 decrRefCount(dec);
4192 decrRefCount(ele);
4193 } else {
4194 ele = tryObjectEncoding(ele);
4195 listAddNodeTail(o->ptr,ele);
4196 }
4197 }
4198 } else if (type == REDIS_SET) {
4199 /* Read list/set value */
4200 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4201 o = createSetObject();
4202 /* It's faster to expand the dict to the right size asap in order
4203 * to avoid rehashing */
4204 if (len > DICT_HT_INITIAL_SIZE)
4205 dictExpand(o->ptr,len);
4206 /* Load every single element of the list/set */
4207 while(len--) {
4208 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4209 ele = tryObjectEncoding(ele);
4210 dictAdd((dict*)o->ptr,ele,NULL);
4211 }
4212 } else if (type == REDIS_ZSET) {
4213 /* Read list/set value */
4214 size_t zsetlen;
4215 zset *zs;
4216
4217 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4218 o = createZsetObject();
4219 zs = o->ptr;
4220 /* Load every single element of the list/set */
4221 while(zsetlen--) {
4222 robj *ele;
4223 double *score = zmalloc(sizeof(double));
4224
4225 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4226 ele = tryObjectEncoding(ele);
4227 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4228 dictAdd(zs->dict,ele,score);
4229 zslInsert(zs->zsl,*score,ele);
4230 incrRefCount(ele); /* added to skiplist */
4231 }
4232 } else if (type == REDIS_HASH) {
4233 size_t hashlen;
4234
4235 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4236 o = createHashObject();
4237 /* Too many entries? Use an hash table. */
4238 if (hashlen > server.hash_max_zipmap_entries)
4239 convertToRealHash(o);
4240 /* Load every key/value, then set it into the zipmap or hash
4241 * table, as needed. */
4242 while(hashlen--) {
4243 robj *key, *val;
4244
4245 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4246 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4247 /* If we are using a zipmap and there are too big values
4248 * the object is converted to real hash table encoding. */
4249 if (o->encoding != REDIS_ENCODING_HT &&
4250 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4251 sdslen(val->ptr) > server.hash_max_zipmap_value))
4252 {
4253 convertToRealHash(o);
4254 }
4255
4256 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4257 unsigned char *zm = o->ptr;
4258
4259 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4260 val->ptr,sdslen(val->ptr),NULL);
4261 o->ptr = zm;
4262 decrRefCount(key);
4263 decrRefCount(val);
4264 } else {
4265 key = tryObjectEncoding(key);
4266 val = tryObjectEncoding(val);
4267 dictAdd((dict*)o->ptr,key,val);
4268 }
4269 }
4270 } else {
4271 redisPanic("Unknown object type");
4272 }
4273 return o;
4274 }
4275
4276 static int rdbLoad(char *filename) {
4277 FILE *fp;
4278 uint32_t dbid;
4279 int type, retval, rdbver;
4280 int swap_all_values = 0;
4281 redisDb *db = server.db+0;
4282 char buf[1024];
4283 time_t expiretime, now = time(NULL);
4284
4285 fp = fopen(filename,"r");
4286 if (!fp) return REDIS_ERR;
4287 if (fread(buf,9,1,fp) == 0) goto eoferr;
4288 buf[9] = '\0';
4289 if (memcmp(buf,"REDIS",5) != 0) {
4290 fclose(fp);
4291 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4292 return REDIS_ERR;
4293 }
4294 rdbver = atoi(buf+5);
4295 if (rdbver != 1) {
4296 fclose(fp);
4297 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4298 return REDIS_ERR;
4299 }
4300 while(1) {
4301 robj *key, *val;
4302 int force_swapout;
4303
4304 expiretime = -1;
4305 /* Read type. */
4306 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4307 if (type == REDIS_EXPIRETIME) {
4308 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4309 /* We read the time so we need to read the object type again */
4310 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4311 }
4312 if (type == REDIS_EOF) break;
4313 /* Handle SELECT DB opcode as a special case */
4314 if (type == REDIS_SELECTDB) {
4315 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4316 goto eoferr;
4317 if (dbid >= (unsigned)server.dbnum) {
4318 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4319 exit(1);
4320 }
4321 db = server.db+dbid;
4322 continue;
4323 }
4324 /* Read key */
4325 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4326 /* Read value */
4327 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4328 /* Check if the key already expired */
4329 if (expiretime != -1 && expiretime < now) {
4330 decrRefCount(key);
4331 decrRefCount(val);
4332 continue;
4333 }
4334 /* Add the new object in the hash table */
4335 retval = dbAdd(db,key,val);
4336 if (retval == REDIS_ERR) {
4337 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4338 exit(1);
4339 }
4340 /* Set the expire time if needed */
4341 if (expiretime != -1) setExpire(db,key,expiretime);
4342
4343 /* Handle swapping while loading big datasets when VM is on */
4344
4345 /* If we detecter we are hopeless about fitting something in memory
4346 * we just swap every new key on disk. Directly...
4347 * Note that's important to check for this condition before resorting
4348 * to random sampling, otherwise we may try to swap already
4349 * swapped keys. */
4350 if (swap_all_values) {
4351 dictEntry *de = dictFind(db->dict,key->ptr);
4352
4353 /* de may be NULL since the key already expired */
4354 if (de) {
4355 vmpointer *vp;
4356 val = dictGetEntryVal(de);
4357
4358 if (val->refcount == 1 &&
4359 (vp = vmSwapObjectBlocking(val)) != NULL)
4360 dictGetEntryVal(de) = vp;
4361 }
4362 decrRefCount(key);
4363 continue;
4364 }
4365 decrRefCount(key);
4366
4367 /* Flush data on disk once 32 MB of additional RAM are used... */
4368 force_swapout = 0;
4369 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4370 force_swapout = 1;
4371
4372 /* If we have still some hope of having some value fitting memory
4373 * then we try random sampling. */
4374 if (!swap_all_values && server.vm_enabled && force_swapout) {
4375 while (zmalloc_used_memory() > server.vm_max_memory) {
4376 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4377 }
4378 if (zmalloc_used_memory() > server.vm_max_memory)
4379 swap_all_values = 1; /* We are already using too much mem */
4380 }
4381 }
4382 fclose(fp);
4383 return REDIS_OK;
4384
4385 eoferr: /* unexpected end of file is handled here with a fatal exit */
4386 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4387 exit(1);
4388 return REDIS_ERR; /* Just to avoid warning */
4389 }
4390
4391 /*================================== Shutdown =============================== */
4392 static int prepareForShutdown() {
4393 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4394 /* Kill the saving child if there is a background saving in progress.
4395 We want to avoid race conditions, for instance our saving child may
4396 overwrite the synchronous saving did by SHUTDOWN. */
4397 if (server.bgsavechildpid != -1) {
4398 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4399 kill(server.bgsavechildpid,SIGKILL);
4400 rdbRemoveTempFile(server.bgsavechildpid);
4401 }
4402 if (server.appendonly) {
4403 /* Append only file: fsync() the AOF and exit */
4404 aof_fsync(server.appendfd);
4405 if (server.vm_enabled) unlink(server.vm_swap_file);
4406 } else {
4407 /* Snapshotting. Perform a SYNC SAVE and exit */
4408 if (rdbSave(server.dbfilename) == REDIS_OK) {
4409 if (server.daemonize)
4410 unlink(server.pidfile);
4411 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4412 } else {
4413 /* Ooops.. error saving! The best we can do is to continue
4414 * operating. Note that if there was a background saving process,
4415 * in the next cron() Redis will be notified that the background
4416 * saving aborted, handling special stuff like slaves pending for
4417 * synchronization... */
4418 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4419 return REDIS_ERR;
4420 }
4421 }
4422 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4423 return REDIS_OK;
4424 }
4425
4426 /*================================== Commands =============================== */
4427
4428 static void authCommand(redisClient *c) {
4429 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4430 c->authenticated = 1;
4431 addReply(c,shared.ok);
4432 } else {
4433 c->authenticated = 0;
4434 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4435 }
4436 }
4437
4438 static void pingCommand(redisClient *c) {
4439 addReply(c,shared.pong);
4440 }
4441
4442 static void echoCommand(redisClient *c) {
4443 addReplyBulk(c,c->argv[1]);
4444 }
4445
4446 /*=================================== Strings =============================== */
4447
4448 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4449 int retval;
4450 long seconds = 0; /* initialized to avoid an harmness warning */
4451
4452 if (expire) {
4453 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4454 return;
4455 if (seconds <= 0) {
4456 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4457 return;
4458 }
4459 }
4460
4461 touchWatchedKey(c->db,key);
4462 if (nx) deleteIfVolatile(c->db,key);
4463 retval = dbAdd(c->db,key,val);
4464 if (retval == REDIS_ERR) {
4465 if (!nx) {
4466 dbReplace(c->db,key,val);
4467 incrRefCount(val);
4468 } else {
4469 addReply(c,shared.czero);
4470 return;
4471 }
4472 } else {
4473 incrRefCount(val);
4474 }
4475 server.dirty++;
4476 removeExpire(c->db,key);
4477 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4478 addReply(c, nx ? shared.cone : shared.ok);
4479 }
4480
4481 static void setCommand(redisClient *c) {
4482 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4483 }
4484
4485 static void setnxCommand(redisClient *c) {
4486 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4487 }
4488
4489 static void setexCommand(redisClient *c) {
4490 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4491 }
4492
4493 static int getGenericCommand(redisClient *c) {
4494 robj *o;
4495
4496 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4497 return REDIS_OK;
4498
4499 if (o->type != REDIS_STRING) {
4500 addReply(c,shared.wrongtypeerr);
4501 return REDIS_ERR;
4502 } else {
4503 addReplyBulk(c,o);
4504 return REDIS_OK;
4505 }
4506 }
4507
4508 static void getCommand(redisClient *c) {
4509 getGenericCommand(c);
4510 }
4511
4512 static void getsetCommand(redisClient *c) {
4513 if (getGenericCommand(c) == REDIS_ERR) return;
4514 dbReplace(c->db,c->argv[1],c->argv[2]);
4515 incrRefCount(c->argv[2]);
4516 server.dirty++;
4517 removeExpire(c->db,c->argv[1]);
4518 }
4519
4520 static void mgetCommand(redisClient *c) {
4521 int j;
4522
4523 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4524 for (j = 1; j < c->argc; j++) {
4525 robj *o = lookupKeyRead(c->db,c->argv[j]);
4526 if (o == NULL) {
4527 addReply(c,shared.nullbulk);
4528 } else {
4529 if (o->type != REDIS_STRING) {
4530 addReply(c,shared.nullbulk);
4531 } else {
4532 addReplyBulk(c,o);
4533 }
4534 }
4535 }
4536 }
4537
4538 static void msetGenericCommand(redisClient *c, int nx) {
4539 int j, busykeys = 0;
4540
4541 if ((c->argc % 2) == 0) {
4542 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4543 return;
4544 }
4545 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4546 * set nothing at all if at least one already key exists. */
4547 if (nx) {
4548 for (j = 1; j < c->argc; j += 2) {
4549 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4550 busykeys++;
4551 }
4552 }
4553 }
4554 if (busykeys) {
4555 addReply(c, shared.czero);
4556 return;
4557 }
4558
4559 for (j = 1; j < c->argc; j += 2) {
4560 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4561 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4562 incrRefCount(c->argv[j+1]);
4563 removeExpire(c->db,c->argv[j]);
4564 }
4565 server.dirty += (c->argc-1)/2;
4566 addReply(c, nx ? shared.cone : shared.ok);
4567 }
4568
4569 static void msetCommand(redisClient *c) {
4570 msetGenericCommand(c,0);
4571 }
4572
4573 static void msetnxCommand(redisClient *c) {
4574 msetGenericCommand(c,1);
4575 }
4576
4577 static void incrDecrCommand(redisClient *c, long long incr) {
4578 long long value;
4579 robj *o;
4580
4581 o = lookupKeyWrite(c->db,c->argv[1]);
4582 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4583 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4584
4585 value += incr;
4586 o = createStringObjectFromLongLong(value);
4587 dbReplace(c->db,c->argv[1],o);
4588 server.dirty++;
4589 addReply(c,shared.colon);
4590 addReply(c,o);
4591 addReply(c,shared.crlf);
4592 }
4593
4594 static void incrCommand(redisClient *c) {
4595 incrDecrCommand(c,1);
4596 }
4597
4598 static void decrCommand(redisClient *c) {
4599 incrDecrCommand(c,-1);
4600 }
4601
4602 static void incrbyCommand(redisClient *c) {
4603 long long incr;
4604
4605 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4606 incrDecrCommand(c,incr);
4607 }
4608
4609 static void decrbyCommand(redisClient *c) {
4610 long long incr;
4611
4612 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4613 incrDecrCommand(c,-incr);
4614 }
4615
4616 static void appendCommand(redisClient *c) {
4617 int retval;
4618 size_t totlen;
4619 robj *o;
4620
4621 o = lookupKeyWrite(c->db,c->argv[1]);
4622 if (o == NULL) {
4623 /* Create the key */
4624 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4625 incrRefCount(c->argv[2]);
4626 totlen = stringObjectLen(c->argv[2]);
4627 } else {
4628 if (o->type != REDIS_STRING) {
4629 addReply(c,shared.wrongtypeerr);
4630 return;
4631 }
4632 /* If the object is specially encoded or shared we have to make
4633 * a copy */
4634 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4635 robj *decoded = getDecodedObject(o);
4636
4637 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4638 decrRefCount(decoded);
4639 dbReplace(c->db,c->argv[1],o);
4640 }
4641 /* APPEND! */
4642 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4643 o->ptr = sdscatlen(o->ptr,
4644 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4645 } else {
4646 o->ptr = sdscatprintf(o->ptr, "%ld",
4647 (unsigned long) c->argv[2]->ptr);
4648 }
4649 totlen = sdslen(o->ptr);
4650 }
4651 server.dirty++;
4652 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4653 }
4654
4655 static void substrCommand(redisClient *c) {
4656 robj *o;
4657 long start = atoi(c->argv[2]->ptr);
4658 long end = atoi(c->argv[3]->ptr);
4659 size_t rangelen, strlen;
4660 sds range;
4661
4662 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4663 checkType(c,o,REDIS_STRING)) return;
4664
4665 o = getDecodedObject(o);
4666 strlen = sdslen(o->ptr);
4667
4668 /* convert negative indexes */
4669 if (start < 0) start = strlen+start;
4670 if (end < 0) end = strlen+end;
4671 if (start < 0) start = 0;
4672 if (end < 0) end = 0;
4673
4674 /* indexes sanity checks */
4675 if (start > end || (size_t)start >= strlen) {
4676 /* Out of range start or start > end result in null reply */
4677 addReply(c,shared.nullbulk);
4678 decrRefCount(o);
4679 return;
4680 }
4681 if ((size_t)end >= strlen) end = strlen-1;
4682 rangelen = (end-start)+1;
4683
4684 /* Return the result */
4685 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4686 range = sdsnewlen((char*)o->ptr+start,rangelen);
4687 addReplySds(c,range);
4688 addReply(c,shared.crlf);
4689 decrRefCount(o);
4690 }
4691
4692 /* ========================= Type agnostic commands ========================= */
4693
4694 static void delCommand(redisClient *c) {
4695 int deleted = 0, j;
4696
4697 for (j = 1; j < c->argc; j++) {
4698 if (dbDelete(c->db,c->argv[j])) {
4699 touchWatchedKey(c->db,c->argv[j]);
4700 server.dirty++;
4701 deleted++;
4702 }
4703 }
4704 addReplyLongLong(c,deleted);
4705 }
4706
4707 static void existsCommand(redisClient *c) {
4708 expireIfNeeded(c->db,c->argv[1]);
4709 if (dbExists(c->db,c->argv[1])) {
4710 addReply(c, shared.cone);
4711 } else {
4712 addReply(c, shared.czero);
4713 }
4714 }
4715
4716 static void selectCommand(redisClient *c) {
4717 int id = atoi(c->argv[1]->ptr);
4718
4719 if (selectDb(c,id) == REDIS_ERR) {
4720 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4721 } else {
4722 addReply(c,shared.ok);
4723 }
4724 }
4725
4726 static void randomkeyCommand(redisClient *c) {
4727 robj *key;
4728
4729 if ((key = dbRandomKey(c->db)) == NULL) {
4730 addReply(c,shared.nullbulk);
4731 return;
4732 }
4733
4734 addReplyBulk(c,key);
4735 decrRefCount(key);
4736 }
4737
4738 static void keysCommand(redisClient *c) {
4739 dictIterator *di;
4740 dictEntry *de;
4741 sds pattern = c->argv[1]->ptr;
4742 int plen = sdslen(pattern);
4743 unsigned long numkeys = 0;
4744 robj *lenobj = createObject(REDIS_STRING,NULL);
4745
4746 di = dictGetIterator(c->db->dict);
4747 addReply(c,lenobj);
4748 decrRefCount(lenobj);
4749 while((de = dictNext(di)) != NULL) {
4750 sds key = dictGetEntryKey(de);
4751 robj *keyobj;
4752
4753 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4754 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4755 keyobj = createStringObject(key,sdslen(key));
4756 if (expireIfNeeded(c->db,keyobj) == 0) {
4757 addReplyBulk(c,keyobj);
4758 numkeys++;
4759 }
4760 decrRefCount(keyobj);
4761 }
4762 }
4763 dictReleaseIterator(di);
4764 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4765 }
4766
4767 static void dbsizeCommand(redisClient *c) {
4768 addReplySds(c,
4769 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4770 }
4771
4772 static void lastsaveCommand(redisClient *c) {
4773 addReplySds(c,
4774 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4775 }
4776
4777 static void typeCommand(redisClient *c) {
4778 robj *o;
4779 char *type;
4780
4781 o = lookupKeyRead(c->db,c->argv[1]);
4782 if (o == NULL) {
4783 type = "+none";
4784 } else {
4785 switch(o->type) {
4786 case REDIS_STRING: type = "+string"; break;
4787 case REDIS_LIST: type = "+list"; break;
4788 case REDIS_SET: type = "+set"; break;
4789 case REDIS_ZSET: type = "+zset"; break;
4790 case REDIS_HASH: type = "+hash"; break;
4791 default: type = "+unknown"; break;
4792 }
4793 }
4794 addReplySds(c,sdsnew(type));
4795 addReply(c,shared.crlf);
4796 }
4797
4798 static void saveCommand(redisClient *c) {
4799 if (server.bgsavechildpid != -1) {
4800 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4801 return;
4802 }
4803 if (rdbSave(server.dbfilename) == REDIS_OK) {
4804 addReply(c,shared.ok);
4805 } else {
4806 addReply(c,shared.err);
4807 }
4808 }
4809
4810 static void bgsaveCommand(redisClient *c) {
4811 if (server.bgsavechildpid != -1) {
4812 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4813 return;
4814 }
4815 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4816 char *status = "+Background saving started\r\n";
4817 addReplySds(c,sdsnew(status));
4818 } else {
4819 addReply(c,shared.err);
4820 }
4821 }
4822
4823 static void shutdownCommand(redisClient *c) {
4824 if (prepareForShutdown() == REDIS_OK)
4825 exit(0);
4826 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4827 }
4828
4829 static void renameGenericCommand(redisClient *c, int nx) {
4830 robj *o;
4831
4832 /* To use the same key as src and dst is probably an error */
4833 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4834 addReply(c,shared.sameobjecterr);
4835 return;
4836 }
4837
4838 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4839 return;
4840
4841 incrRefCount(o);
4842 deleteIfVolatile(c->db,c->argv[2]);
4843 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4844 if (nx) {
4845 decrRefCount(o);
4846 addReply(c,shared.czero);
4847 return;
4848 }
4849 dbReplace(c->db,c->argv[2],o);
4850 }
4851 dbDelete(c->db,c->argv[1]);
4852 touchWatchedKey(c->db,c->argv[2]);
4853 server.dirty++;
4854 addReply(c,nx ? shared.cone : shared.ok);
4855 }
4856
4857 static void renameCommand(redisClient *c) {
4858 renameGenericCommand(c,0);
4859 }
4860
4861 static void renamenxCommand(redisClient *c) {
4862 renameGenericCommand(c,1);
4863 }
4864
4865 static void moveCommand(redisClient *c) {
4866 robj *o;
4867 redisDb *src, *dst;
4868 int srcid;
4869
4870 /* Obtain source and target DB pointers */
4871 src = c->db;
4872 srcid = c->db->id;
4873 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4874 addReply(c,shared.outofrangeerr);
4875 return;
4876 }
4877 dst = c->db;
4878 selectDb(c,srcid); /* Back to the source DB */
4879
4880 /* If the user is moving using as target the same
4881 * DB as the source DB it is probably an error. */
4882 if (src == dst) {
4883 addReply(c,shared.sameobjecterr);
4884 return;
4885 }
4886
4887 /* Check if the element exists and get a reference */
4888 o = lookupKeyWrite(c->db,c->argv[1]);
4889 if (!o) {
4890 addReply(c,shared.czero);
4891 return;
4892 }
4893
4894 /* Try to add the element to the target DB */
4895 deleteIfVolatile(dst,c->argv[1]);
4896 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4897 addReply(c,shared.czero);
4898 return;
4899 }
4900 incrRefCount(o);
4901
4902 /* OK! key moved, free the entry in the source DB */
4903 dbDelete(src,c->argv[1]);
4904 server.dirty++;
4905 addReply(c,shared.cone);
4906 }
4907
4908 /* =================================== Lists ================================ */
4909
4910
4911 /* Check the argument length to see if it requires us to convert the ziplist
4912 * to a real list. Only check raw-encoded objects because integer encoded
4913 * objects are never too long. */
4914 static void listTypeTryConversion(robj *subject, robj *value) {
4915 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4916 if (value->encoding == REDIS_ENCODING_RAW &&
4917 sdslen(value->ptr) > server.list_max_ziplist_value)
4918 listTypeConvert(subject,REDIS_ENCODING_LIST);
4919 }
4920
4921 static void listTypePush(robj *subject, robj *value, int where) {
4922 /* Check if we need to convert the ziplist */
4923 listTypeTryConversion(subject,value);
4924 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4925 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
4926 listTypeConvert(subject,REDIS_ENCODING_LIST);
4927
4928 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4929 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4930 value = getDecodedObject(value);
4931 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4932 decrRefCount(value);
4933 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4934 if (where == REDIS_HEAD) {
4935 listAddNodeHead(subject->ptr,value);
4936 } else {
4937 listAddNodeTail(subject->ptr,value);
4938 }
4939 incrRefCount(value);
4940 } else {
4941 redisPanic("Unknown list encoding");
4942 }
4943 }
4944
4945 static robj *listTypePop(robj *subject, int where) {
4946 robj *value = NULL;
4947 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4948 unsigned char *p;
4949 unsigned char *vstr;
4950 unsigned int vlen;
4951 long long vlong;
4952 int pos = (where == REDIS_HEAD) ? 0 : -1;
4953 p = ziplistIndex(subject->ptr,pos);
4954 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4955 if (vstr) {
4956 value = createStringObject((char*)vstr,vlen);
4957 } else {
4958 value = createStringObjectFromLongLong(vlong);
4959 }
4960 /* We only need to delete an element when it exists */
4961 subject->ptr = ziplistDelete(subject->ptr,&p);
4962 }
4963 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4964 list *list = subject->ptr;
4965 listNode *ln;
4966 if (where == REDIS_HEAD) {
4967 ln = listFirst(list);
4968 } else {
4969 ln = listLast(list);
4970 }
4971 if (ln != NULL) {
4972 value = listNodeValue(ln);
4973 incrRefCount(value);
4974 listDelNode(list,ln);
4975 }
4976 } else {
4977 redisPanic("Unknown list encoding");
4978 }
4979 return value;
4980 }
4981
4982 static unsigned long listTypeLength(robj *subject) {
4983 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4984 return ziplistLen(subject->ptr);
4985 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4986 return listLength((list*)subject->ptr);
4987 } else {
4988 redisPanic("Unknown list encoding");
4989 }
4990 }
4991
4992 /* Structure to hold set iteration abstraction. */
4993 typedef struct {
4994 robj *subject;
4995 unsigned char encoding;
4996 unsigned char direction; /* Iteration direction */
4997 unsigned char *zi;
4998 listNode *ln;
4999 } listTypeIterator;
5000
5001 /* Structure for an entry while iterating over a list. */
5002 typedef struct {
5003 listTypeIterator *li;
5004 unsigned char *zi; /* Entry in ziplist */
5005 listNode *ln; /* Entry in linked list */
5006 } listTypeEntry;
5007
5008 /* Initialize an iterator at the specified index. */
5009 static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5010 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
5011 li->subject = subject;
5012 li->encoding = subject->encoding;
5013 li->direction = direction;
5014 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5015 li->zi = ziplistIndex(subject->ptr,index);
5016 } else if (li->encoding == REDIS_ENCODING_LIST) {
5017 li->ln = listIndex(subject->ptr,index);
5018 } else {
5019 redisPanic("Unknown list encoding");
5020 }
5021 return li;
5022 }
5023
5024 /* Clean up the iterator. */
5025 static void listTypeReleaseIterator(listTypeIterator *li) {
5026 zfree(li);
5027 }
5028
5029 /* Stores pointer to current the entry in the provided entry structure
5030 * and advances the position of the iterator. Returns 1 when the current
5031 * entry is in fact an entry, 0 otherwise. */
5032 static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
5033 /* Protect from converting when iterating */
5034 redisAssert(li->subject->encoding == li->encoding);
5035
5036 entry->li = li;
5037 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5038 entry->zi = li->zi;
5039 if (entry->zi != NULL) {
5040 if (li->direction == REDIS_TAIL)
5041 li->zi = ziplistNext(li->subject->ptr,li->zi);
5042 else
5043 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5044 return 1;
5045 }
5046 } else if (li->encoding == REDIS_ENCODING_LIST) {
5047 entry->ln = li->ln;
5048 if (entry->ln != NULL) {
5049 if (li->direction == REDIS_TAIL)
5050 li->ln = li->ln->next;
5051 else
5052 li->ln = li->ln->prev;
5053 return 1;
5054 }
5055 } else {
5056 redisPanic("Unknown list encoding");
5057 }
5058 return 0;
5059 }
5060
5061 /* Return entry or NULL at the current position of the iterator. */
5062 static robj *listTypeGet(listTypeEntry *entry) {
5063 listTypeIterator *li = entry->li;
5064 robj *value = NULL;
5065 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5066 unsigned char *vstr;
5067 unsigned int vlen;
5068 long long vlong;
5069 redisAssert(entry->zi != NULL);
5070 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5071 if (vstr) {
5072 value = createStringObject((char*)vstr,vlen);
5073 } else {
5074 value = createStringObjectFromLongLong(vlong);
5075 }
5076 }
5077 } else if (li->encoding == REDIS_ENCODING_LIST) {
5078 redisAssert(entry->ln != NULL);
5079 value = listNodeValue(entry->ln);
5080 incrRefCount(value);
5081 } else {
5082 redisPanic("Unknown list encoding");
5083 }
5084 return value;
5085 }
5086
5087 /* Compare the given object with the entry at the current position. */
5088 static int listTypeEqual(listTypeEntry *entry, robj *o) {
5089 listTypeIterator *li = entry->li;
5090 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5091 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5092 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5093 } else if (li->encoding == REDIS_ENCODING_LIST) {
5094 return equalStringObjects(o,listNodeValue(entry->ln));
5095 } else {
5096 redisPanic("Unknown list encoding");
5097 }
5098 }
5099
5100 /* Delete the element pointed to. */
5101 static void listTypeDelete(listTypeEntry *entry) {
5102 listTypeIterator *li = entry->li;
5103 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5104 unsigned char *p = entry->zi;
5105 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5106
5107 /* Update position of the iterator depending on the direction */
5108 if (li->direction == REDIS_TAIL)
5109 li->zi = p;
5110 else
5111 li->zi = ziplistPrev(li->subject->ptr,p);
5112 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5113 listNode *next;
5114 if (li->direction == REDIS_TAIL)
5115 next = entry->ln->next;
5116 else
5117 next = entry->ln->prev;
5118 listDelNode(li->subject->ptr,entry->ln);
5119 li->ln = next;
5120 } else {
5121 redisPanic("Unknown list encoding");
5122 }
5123 }
5124
5125 static void listTypeConvert(robj *subject, int enc) {
5126 listTypeIterator *li;
5127 listTypeEntry entry;
5128 redisAssert(subject->type == REDIS_LIST);
5129
5130 if (enc == REDIS_ENCODING_LIST) {
5131 list *l = listCreate();
5132 listSetFreeMethod(l,decrRefCount);
5133
5134 /* listTypeGet returns a robj with incremented refcount */
5135 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5136 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5137 listTypeReleaseIterator(li);
5138
5139 subject->encoding = REDIS_ENCODING_LIST;
5140 zfree(subject->ptr);
5141 subject->ptr = l;
5142 } else {
5143 redisPanic("Unsupported list conversion");
5144 }
5145 }
5146
5147 static void pushGenericCommand(redisClient *c, int where) {
5148 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5149 if (lobj == NULL) {
5150 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5151 addReply(c,shared.cone);
5152 return;
5153 }
5154 lobj = createZiplistObject();
5155 dbAdd(c->db,c->argv[1],lobj);
5156 } else {
5157 if (lobj->type != REDIS_LIST) {
5158 addReply(c,shared.wrongtypeerr);
5159 return;
5160 }
5161 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5162 addReply(c,shared.cone);
5163 return;
5164 }
5165 }
5166 listTypePush(lobj,c->argv[2],where);
5167 addReplyLongLong(c,listTypeLength(lobj));
5168 server.dirty++;
5169 }
5170
5171 static void lpushCommand(redisClient *c) {
5172 pushGenericCommand(c,REDIS_HEAD);
5173 }
5174
5175 static void rpushCommand(redisClient *c) {
5176 pushGenericCommand(c,REDIS_TAIL);
5177 }
5178
5179 static void llenCommand(redisClient *c) {
5180 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5181 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5182 addReplyUlong(c,listTypeLength(o));
5183 }
5184
5185 static void lindexCommand(redisClient *c) {
5186 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5187 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5188 int index = atoi(c->argv[2]->ptr);
5189 robj *value = NULL;
5190
5191 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5192 unsigned char *p;
5193 unsigned char *vstr;
5194 unsigned int vlen;
5195 long long vlong;
5196 p = ziplistIndex(o->ptr,index);
5197 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5198 if (vstr) {
5199 value = createStringObject((char*)vstr,vlen);
5200 } else {
5201 value = createStringObjectFromLongLong(vlong);
5202 }
5203 addReplyBulk(c,value);
5204 decrRefCount(value);
5205 } else {
5206 addReply(c,shared.nullbulk);
5207 }
5208 } else if (o->encoding == REDIS_ENCODING_LIST) {
5209 listNode *ln = listIndex(o->ptr,index);
5210 if (ln != NULL) {
5211 value = listNodeValue(ln);
5212 addReplyBulk(c,value);
5213 } else {
5214 addReply(c,shared.nullbulk);
5215 }
5216 } else {
5217 redisPanic("Unknown list encoding");
5218 }
5219 }
5220
5221 static void lsetCommand(redisClient *c) {
5222 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5223 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5224 int index = atoi(c->argv[2]->ptr);
5225 robj *value = c->argv[3];
5226
5227 listTypeTryConversion(o,value);
5228 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5229 unsigned char *p, *zl = o->ptr;
5230 p = ziplistIndex(zl,index);
5231 if (p == NULL) {
5232 addReply(c,shared.outofrangeerr);
5233 } else {
5234 o->ptr = ziplistDelete(o->ptr,&p);
5235 value = getDecodedObject(value);
5236 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5237 decrRefCount(value);
5238 addReply(c,shared.ok);
5239 server.dirty++;
5240 }
5241 } else if (o->encoding == REDIS_ENCODING_LIST) {
5242 listNode *ln = listIndex(o->ptr,index);
5243 if (ln == NULL) {
5244 addReply(c,shared.outofrangeerr);
5245 } else {
5246 decrRefCount((robj*)listNodeValue(ln));
5247 listNodeValue(ln) = value;
5248 incrRefCount(value);
5249 addReply(c,shared.ok);
5250 server.dirty++;
5251 }
5252 } else {
5253 redisPanic("Unknown list encoding");
5254 }
5255 }
5256
5257 static void popGenericCommand(redisClient *c, int where) {
5258 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5259 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5260
5261 robj *value = listTypePop(o,where);
5262 if (value == NULL) {
5263 addReply(c,shared.nullbulk);
5264 } else {
5265 addReplyBulk(c,value);
5266 decrRefCount(value);
5267 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5268 server.dirty++;
5269 }
5270 }
5271
5272 static void lpopCommand(redisClient *c) {
5273 popGenericCommand(c,REDIS_HEAD);
5274 }
5275
5276 static void rpopCommand(redisClient *c) {
5277 popGenericCommand(c,REDIS_TAIL);
5278 }
5279
5280 static void lrangeCommand(redisClient *c) {
5281 robj *o, *value;
5282 int start = atoi(c->argv[2]->ptr);
5283 int end = atoi(c->argv[3]->ptr);
5284 int llen;
5285 int rangelen, j;
5286 listTypeEntry entry;
5287
5288 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5289 || checkType(c,o,REDIS_LIST)) return;
5290 llen = listTypeLength(o);
5291
5292 /* convert negative indexes */
5293 if (start < 0) start = llen+start;
5294 if (end < 0) end = llen+end;
5295 if (start < 0) start = 0;
5296 if (end < 0) end = 0;
5297
5298 /* indexes sanity checks */
5299 if (start > end || start >= llen) {
5300 /* Out of range start or start > end result in empty list */
5301 addReply(c,shared.emptymultibulk);
5302 return;
5303 }
5304 if (end >= llen) end = llen-1;
5305 rangelen = (end-start)+1;
5306
5307 /* Return the result in form of a multi-bulk reply */
5308 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5309 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
5310 for (j = 0; j < rangelen; j++) {
5311 redisAssert(listTypeNext(li,&entry));
5312 value = listTypeGet(&entry);
5313 addReplyBulk(c,value);
5314 decrRefCount(value);
5315 }
5316 listTypeReleaseIterator(li);
5317 }
5318
5319 static void ltrimCommand(redisClient *c) {
5320 robj *o;
5321 int start = atoi(c->argv[2]->ptr);
5322 int end = atoi(c->argv[3]->ptr);
5323 int llen;
5324 int j, ltrim, rtrim;
5325 list *list;
5326 listNode *ln;
5327
5328 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5329 checkType(c,o,REDIS_LIST)) return;
5330 llen = listTypeLength(o);
5331
5332 /* convert negative indexes */
5333 if (start < 0) start = llen+start;
5334 if (end < 0) end = llen+end;
5335 if (start < 0) start = 0;
5336 if (end < 0) end = 0;
5337
5338 /* indexes sanity checks */
5339 if (start > end || start >= llen) {
5340 /* Out of range start or start > end result in empty list */
5341 ltrim = llen;
5342 rtrim = 0;
5343 } else {
5344 if (end >= llen) end = llen-1;
5345 ltrim = start;
5346 rtrim = llen-end-1;
5347 }
5348
5349 /* Remove list elements to perform the trim */
5350 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5351 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5352 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5353 } else if (o->encoding == REDIS_ENCODING_LIST) {
5354 list = o->ptr;
5355 for (j = 0; j < ltrim; j++) {
5356 ln = listFirst(list);
5357 listDelNode(list,ln);
5358 }
5359 for (j = 0; j < rtrim; j++) {
5360 ln = listLast(list);
5361 listDelNode(list,ln);
5362 }
5363 } else {
5364 redisPanic("Unknown list encoding");
5365 }
5366 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5367 server.dirty++;
5368 addReply(c,shared.ok);
5369 }
5370
5371 static void lremCommand(redisClient *c) {
5372 robj *subject, *obj = c->argv[3];
5373 int toremove = atoi(c->argv[2]->ptr);
5374 int removed = 0;
5375 listTypeEntry entry;
5376
5377 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5378 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5379
5380 /* Make sure obj is raw when we're dealing with a ziplist */
5381 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5382 obj = getDecodedObject(obj);
5383
5384 listTypeIterator *li;
5385 if (toremove < 0) {
5386 toremove = -toremove;
5387 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
5388 } else {
5389 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5390 }
5391
5392 while (listTypeNext(li,&entry)) {
5393 if (listTypeEqual(&entry,obj)) {
5394 listTypeDelete(&entry);
5395 server.dirty++;
5396 removed++;
5397 if (toremove && removed == toremove) break;
5398 }
5399 }
5400 listTypeReleaseIterator(li);
5401
5402 /* Clean up raw encoded object */
5403 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5404 decrRefCount(obj);
5405
5406 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5407 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5408 }
5409
5410 /* This is the semantic of this command:
5411 * RPOPLPUSH srclist dstlist:
5412 * IF LLEN(srclist) > 0
5413 * element = RPOP srclist
5414 * LPUSH dstlist element
5415 * RETURN element
5416 * ELSE
5417 * RETURN nil
5418 * END
5419 * END
5420 *
5421 * The idea is to be able to get an element from a list in a reliable way
5422 * since the element is not just returned but pushed against another list
5423 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5424 */
5425 static void rpoplpushcommand(redisClient *c) {
5426 robj *sobj, *value;
5427 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5428 checkType(c,sobj,REDIS_LIST)) return;
5429
5430 if (listTypeLength(sobj) == 0) {
5431 addReply(c,shared.nullbulk);
5432 } else {
5433 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5434 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5435 value = listTypePop(sobj,REDIS_TAIL);
5436
5437 /* Add the element to the target list (unless it's directly
5438 * passed to some BLPOP-ing client */
5439 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5440 /* Create the list if the key does not exist */
5441 if (!dobj) {
5442 dobj = createZiplistObject();
5443 dbAdd(c->db,c->argv[2],dobj);
5444 }
5445 listTypePush(dobj,value,REDIS_HEAD);
5446 }
5447
5448 /* Send the element to the client as reply as well */
5449 addReplyBulk(c,value);
5450
5451 /* listTypePop returns an object with its refcount incremented */
5452 decrRefCount(value);
5453
5454 /* Delete the source list when it is empty */
5455 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5456 server.dirty++;
5457 }
5458 }
5459
5460 /* ==================================== Sets ================================ */
5461
5462 static int setTypeAdd(robj *subject, robj *value) {
5463 if (subject->encoding == REDIS_ENCODING_HT) {
5464 if (dictAdd(subject->ptr,value,NULL) == DICT_OK) {
5465 incrRefCount(value);
5466 return 1;
5467 }
5468 } else {
5469 redisPanic("Unknown set encoding");
5470 }
5471 return 0;
5472 }
5473
5474 static int setTypeRemove(robj *subject, robj *value) {
5475 if (subject->encoding == REDIS_ENCODING_HT) {
5476 if (dictDelete(subject->ptr,value) == DICT_OK) {
5477 if (htNeedsResize(subject->ptr)) dictResize(subject->ptr);
5478 return 1;
5479 }
5480 } else {
5481 redisPanic("Unknown set encoding");
5482 }
5483 return 0;
5484 }
5485
5486 static int setTypeIsMember(robj *subject, robj *value) {
5487 if (subject->encoding == REDIS_ENCODING_HT) {
5488 return dictFind((dict*)subject->ptr,value) != NULL;
5489 } else {
5490 redisPanic("Unknown set encoding");
5491 }
5492 }
5493
5494 /* Structure to hold set iteration abstraction. */
5495 typedef struct {
5496 int encoding;
5497 dictIterator *di;
5498 } setIterator;
5499
5500 static setIterator *setTypeInitIterator(robj *subject) {
5501 setIterator *si = zmalloc(sizeof(setIterator));
5502 si->encoding = subject->encoding;
5503 if (si->encoding == REDIS_ENCODING_HT) {
5504 si->di = dictGetIterator(subject->ptr);
5505 } else {
5506 redisPanic("Unknown set encoding");
5507 }
5508 return si;
5509 }
5510
5511 static void setTypeReleaseIterator(setIterator *si) {
5512 if (si->encoding == REDIS_ENCODING_HT)
5513 dictReleaseIterator(si->di);
5514 zfree(si);
5515 }
5516
5517 /* Move to the next entry in the set. Returns the object at the current
5518 * position, or NULL when the end is reached. This object will have its
5519 * refcount incremented, so the caller needs to take care of this. */
5520 static robj *setTypeNext(setIterator *si) {
5521 robj *ret = NULL;
5522 if (si->encoding == REDIS_ENCODING_HT) {
5523 dictEntry *de = dictNext(si->di);
5524 if (de != NULL) {
5525 ret = dictGetEntryKey(de);
5526 incrRefCount(ret);
5527 }
5528 }
5529 return ret;
5530 }
5531
5532
5533 /* Return random element from set. The returned object will always have
5534 * an incremented refcount. */
5535 robj *setTypeRandomElement(robj *subject) {
5536 robj *ret = NULL;
5537 if (subject->encoding == REDIS_ENCODING_HT) {
5538 dictEntry *de = dictGetRandomKey(subject->ptr);
5539 ret = dictGetEntryKey(de);
5540 incrRefCount(ret);
5541 } else {
5542 redisPanic("Unknown set encoding");
5543 }
5544 return ret;
5545 }
5546
5547 static unsigned long setTypeSize(robj *subject) {
5548 if (subject->encoding == REDIS_ENCODING_HT) {
5549 return dictSize((dict*)subject->ptr);
5550 } else {
5551 redisPanic("Unknown set encoding");
5552 }
5553 }
5554
5555 static void saddCommand(redisClient *c) {
5556 robj *set;
5557
5558 set = lookupKeyWrite(c->db,c->argv[1]);
5559 if (set == NULL) {
5560 set = createSetObject();
5561 dbAdd(c->db,c->argv[1],set);
5562 } else {
5563 if (set->type != REDIS_SET) {
5564 addReply(c,shared.wrongtypeerr);
5565 return;
5566 }
5567 }
5568 if (setTypeAdd(set,c->argv[2])) {
5569 server.dirty++;
5570 addReply(c,shared.cone);
5571 } else {
5572 addReply(c,shared.czero);
5573 }
5574 }
5575
5576 static void sremCommand(redisClient *c) {
5577 robj *set;
5578
5579 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5580 checkType(c,set,REDIS_SET)) return;
5581
5582 if (setTypeRemove(set,c->argv[2])) {
5583 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
5584 server.dirty++;
5585 addReply(c,shared.cone);
5586 } else {
5587 addReply(c,shared.czero);
5588 }
5589 }
5590
5591 static void smoveCommand(redisClient *c) {
5592 robj *srcset, *dstset;
5593
5594 srcset = lookupKeyWrite(c->db,c->argv[1]);
5595 dstset = lookupKeyWrite(c->db,c->argv[2]);
5596
5597 /* If the source key does not exist return 0, if it's of the wrong type
5598 * raise an error */
5599 if (srcset == NULL || srcset->type != REDIS_SET) {
5600 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5601 return;
5602 }
5603 /* Error if the destination key is not a set as well */
5604 if (dstset && dstset->type != REDIS_SET) {
5605 addReply(c,shared.wrongtypeerr);
5606 return;
5607 }
5608 /* Remove the element from the source set */
5609 if (!setTypeRemove(srcset,c->argv[3])) {
5610 /* Key not found in the src set! return zero */
5611 addReply(c,shared.czero);
5612 return;
5613 }
5614 if (setTypeSize(srcset) == 0 && srcset != dstset)
5615 dbDelete(c->db,c->argv[1]);
5616 server.dirty++;
5617 /* Add the element to the destination set */
5618 if (!dstset) {
5619 dstset = createSetObject();
5620 dbAdd(c->db,c->argv[2],dstset);
5621 }
5622 setTypeAdd(dstset,c->argv[3]);
5623 addReply(c,shared.cone);
5624 }
5625
5626 static void sismemberCommand(redisClient *c) {
5627 robj *set;
5628
5629 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5630 checkType(c,set,REDIS_SET)) return;
5631
5632 if (setTypeIsMember(set,c->argv[2]))
5633 addReply(c,shared.cone);
5634 else
5635 addReply(c,shared.czero);
5636 }
5637
5638 static void scardCommand(redisClient *c) {
5639 robj *o;
5640
5641 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5642 checkType(c,o,REDIS_SET)) return;
5643
5644 addReplyUlong(c,setTypeSize(o));
5645 }
5646
5647 static void spopCommand(redisClient *c) {
5648 robj *set, *ele;
5649
5650 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5651 checkType(c,set,REDIS_SET)) return;
5652
5653 ele = setTypeRandomElement(set);
5654 if (ele == NULL) {
5655 addReply(c,shared.nullbulk);
5656 } else {
5657 setTypeRemove(set,ele);
5658 addReplyBulk(c,ele);
5659 decrRefCount(ele);
5660 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
5661 server.dirty++;
5662 }
5663 }
5664
5665 static void srandmemberCommand(redisClient *c) {
5666 robj *set, *ele;
5667
5668 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5669 checkType(c,set,REDIS_SET)) return;
5670
5671 ele = setTypeRandomElement(set);
5672 if (ele == NULL) {
5673 addReply(c,shared.nullbulk);
5674 } else {
5675 addReplyBulk(c,ele);
5676 decrRefCount(ele);
5677 }
5678 }
5679
5680 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5681 return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2);
5682 }
5683
5684 static void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) {
5685 robj **sets = zmalloc(sizeof(robj*)*setnum);
5686 setIterator *si;
5687 robj *ele, *lenobj = NULL, *dstset = NULL;
5688 unsigned long j, cardinality = 0;
5689
5690 for (j = 0; j < setnum; j++) {
5691 robj *setobj = dstkey ?
5692 lookupKeyWrite(c->db,setkeys[j]) :
5693 lookupKeyRead(c->db,setkeys[j]);
5694 if (!setobj) {
5695 zfree(sets);
5696 if (dstkey) {
5697 if (dbDelete(c->db,dstkey))
5698 server.dirty++;
5699 addReply(c,shared.czero);
5700 } else {
5701 addReply(c,shared.emptymultibulk);
5702 }
5703 return;
5704 }
5705 if (checkType(c,setobj,REDIS_SET)) {
5706 zfree(sets);
5707 return;
5708 }
5709 sets[j] = setobj;
5710 }
5711 /* Sort sets from the smallest to largest, this will improve our
5712 * algorithm's performace */
5713 qsort(sets,setnum,sizeof(robj*),qsortCompareSetsByCardinality);
5714
5715 /* The first thing we should output is the total number of elements...
5716 * since this is a multi-bulk write, but at this stage we don't know
5717 * the intersection set size, so we use a trick, append an empty object
5718 * to the output list and save the pointer to later modify it with the
5719 * right length */
5720 if (!dstkey) {
5721 lenobj = createObject(REDIS_STRING,NULL);
5722 addReply(c,lenobj);
5723 decrRefCount(lenobj);
5724 } else {
5725 /* If we have a target key where to store the resulting set
5726 * create this key with an empty set inside */
5727 dstset = createSetObject();
5728 }
5729
5730 /* Iterate all the elements of the first (smallest) set, and test
5731 * the element against all the other sets, if at least one set does
5732 * not include the element it is discarded */
5733 si = setTypeInitIterator(sets[0]);
5734 while((ele = setTypeNext(si)) != NULL) {
5735 for (j = 1; j < setnum; j++)
5736 if (!setTypeIsMember(sets[j],ele)) break;
5737
5738 /* Only take action when all sets contain the member */
5739 if (j == setnum) {
5740 if (!dstkey) {
5741 addReplyBulk(c,ele);
5742 cardinality++;
5743 } else {
5744 setTypeAdd(dstset,ele);
5745 }
5746 }
5747 decrRefCount(ele);
5748 }
5749 setTypeReleaseIterator(si);
5750
5751 if (dstkey) {
5752 /* Store the resulting set into the target, if the intersection
5753 * is not an empty set. */
5754 dbDelete(c->db,dstkey);
5755 if (setTypeSize(dstset) > 0) {
5756 dbAdd(c->db,dstkey,dstset);
5757 addReplyLongLong(c,setTypeSize(dstset));
5758 } else {
5759 decrRefCount(dstset);
5760 addReply(c,shared.czero);
5761 }
5762 server.dirty++;
5763 } else {
5764 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5765 }
5766 zfree(sets);
5767 }
5768
5769 static void sinterCommand(redisClient *c) {
5770 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5771 }
5772
5773 static void sinterstoreCommand(redisClient *c) {
5774 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5775 }
5776
/* Operation selectors passed as 'op' to sunionDiffGenericCommand() (UNION,
 * DIFF) and zunionInterGenericCommand() (UNION, INTER). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF  1
#define REDIS_OP_INTER 2
5780
5781 static void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) {
5782 robj **sets = zmalloc(sizeof(robj*)*setnum);
5783 setIterator *si;
5784 robj *ele, *dstset = NULL;
5785 int j, cardinality = 0;
5786
5787 for (j = 0; j < setnum; j++) {
5788 robj *setobj = dstkey ?
5789 lookupKeyWrite(c->db,setkeys[j]) :
5790 lookupKeyRead(c->db,setkeys[j]);
5791 if (!setobj) {
5792 sets[j] = NULL;
5793 continue;
5794 }
5795 if (checkType(c,setobj,REDIS_SET)) {
5796 zfree(sets);
5797 return;
5798 }
5799 sets[j] = setobj;
5800 }
5801
5802 /* We need a temp set object to store our union. If the dstkey
5803 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5804 * this set object will be the resulting object to set into the target key*/
5805 dstset = createSetObject();
5806
5807 /* Iterate all the elements of all the sets, add every element a single
5808 * time to the result set */
5809 for (j = 0; j < setnum; j++) {
5810 if (op == REDIS_OP_DIFF && j == 0 && !sets[j]) break; /* result set is empty */
5811 if (!sets[j]) continue; /* non existing keys are like empty sets */
5812
5813 si = setTypeInitIterator(sets[j]);
5814 while((ele = setTypeNext(si)) != NULL) {
5815 if (op == REDIS_OP_UNION || j == 0) {
5816 if (setTypeAdd(dstset,ele)) {
5817 cardinality++;
5818 }
5819 } else if (op == REDIS_OP_DIFF) {
5820 if (setTypeRemove(dstset,ele)) {
5821 cardinality--;
5822 }
5823 }
5824 decrRefCount(ele);
5825 }
5826 setTypeReleaseIterator(si);
5827
5828 /* Exit when result set is empty. */
5829 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5830 }
5831
5832 /* Output the content of the resulting set, if not in STORE mode */
5833 if (!dstkey) {
5834 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5835 si = setTypeInitIterator(dstset);
5836 while((ele = setTypeNext(si)) != NULL) {
5837 addReplyBulk(c,ele);
5838 decrRefCount(ele);
5839 }
5840 setTypeReleaseIterator(si);
5841 decrRefCount(dstset);
5842 } else {
5843 /* If we have a target key where to store the resulting set
5844 * create this key with the result set inside */
5845 dbDelete(c->db,dstkey);
5846 if (setTypeSize(dstset) > 0) {
5847 dbAdd(c->db,dstkey,dstset);
5848 addReplyLongLong(c,setTypeSize(dstset));
5849 } else {
5850 decrRefCount(dstset);
5851 addReply(c,shared.czero);
5852 }
5853 server.dirty++;
5854 }
5855 zfree(sets);
5856 }
5857
5858 static void sunionCommand(redisClient *c) {
5859 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5860 }
5861
5862 static void sunionstoreCommand(redisClient *c) {
5863 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5864 }
5865
5866 static void sdiffCommand(redisClient *c) {
5867 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5868 }
5869
5870 static void sdiffstoreCommand(redisClient *c) {
5871 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5872 }
5873
5874 /* ==================================== ZSets =============================== */
5875
5876 /* ZSETs are ordered sets using two data structures to hold the same elements
5877 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5878 * data structure.
5879 *
5880 * The elements are added to a hash table mapping Redis objects to scores.
5881 * At the same time the elements are added to a skip list mapping scores
5882 * to Redis objects (so objects are sorted by scores in this "view"). */
5883
5884 /* This skiplist implementation is almost a C translation of the original
5885 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5886 * Alternative to Balanced Trees", modified in three ways:
5887 * a) this implementation allows for repeated values.
5888 * b) the comparison is not just by key (our 'score') but by satellite data.
5889 * c) there is a back pointer, so it's a doubly linked list with the back
5890 * pointers being only at "level 1". This allows traversing the list
5891 * from tail to head, useful for ZREVRANGE. */
5892
5893 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5894 zskiplistNode *zn = zmalloc(sizeof(*zn));
5895
5896 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5897 if (level > 1)
5898 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5899 else
5900 zn->span = NULL;
5901 zn->score = score;
5902 zn->obj = obj;
5903 return zn;
5904 }
5905
5906 static zskiplist *zslCreate(void) {
5907 int j;
5908 zskiplist *zsl;
5909
5910 zsl = zmalloc(sizeof(*zsl));
5911 zsl->level = 1;
5912 zsl->length = 0;
5913 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5914 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5915 zsl->header->forward[j] = NULL;
5916
5917 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5918 if (j < ZSKIPLIST_MAXLEVEL-1)
5919 zsl->header->span[j] = 0;
5920 }
5921 zsl->header->backward = NULL;
5922 zsl->tail = NULL;
5923 return zsl;
5924 }
5925
5926 static void zslFreeNode(zskiplistNode *node) {
5927 decrRefCount(node->obj);
5928 zfree(node->forward);
5929 zfree(node->span);
5930 zfree(node);
5931 }
5932
5933 static void zslFree(zskiplist *zsl) {
5934 zskiplistNode *node = zsl->header->forward[0], *next;
5935
5936 zfree(zsl->header->forward);
5937 zfree(zsl->header->span);
5938 zfree(zsl->header);
5939 while(node) {
5940 next = node->forward[0];
5941 zslFreeNode(node);
5942 node = next;
5943 }
5944 zfree(zsl);
5945 }
5946
5947 static int zslRandomLevel(void) {
5948 int level = 1;
5949 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5950 level += 1;
5951 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5952 }
5953
5954 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5955 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5956 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5957 int i, level;
5958
5959 x = zsl->header;
5960 for (i = zsl->level-1; i >= 0; i--) {
5961 /* store rank that is crossed to reach the insert position */
5962 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5963
5964 while (x->forward[i] &&
5965 (x->forward[i]->score < score ||
5966 (x->forward[i]->score == score &&
5967 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5968 rank[i] += i > 0 ? x->span[i-1] : 1;
5969 x = x->forward[i];
5970 }
5971 update[i] = x;
5972 }
5973 /* we assume the key is not already inside, since we allow duplicated
5974 * scores, and the re-insertion of score and redis object should never
5975 * happpen since the caller of zslInsert() should test in the hash table
5976 * if the element is already inside or not. */
5977 level = zslRandomLevel();
5978 if (level > zsl->level) {
5979 for (i = zsl->level; i < level; i++) {
5980 rank[i] = 0;
5981 update[i] = zsl->header;
5982 update[i]->span[i-1] = zsl->length;
5983 }
5984 zsl->level = level;
5985 }
5986 x = zslCreateNode(level,score,obj);
5987 for (i = 0; i < level; i++) {
5988 x->forward[i] = update[i]->forward[i];
5989 update[i]->forward[i] = x;
5990
5991 /* update span covered by update[i] as x is inserted here */
5992 if (i > 0) {
5993 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5994 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5995 }
5996 }
5997
5998 /* increment span for untouched levels */
5999 for (i = level; i < zsl->level; i++) {
6000 update[i]->span[i-1]++;
6001 }
6002
6003 x->backward = (update[0] == zsl->header) ? NULL : update[0];
6004 if (x->forward[0])
6005 x->forward[0]->backward = x;
6006 else
6007 zsl->tail = x;
6008 zsl->length++;
6009 }
6010
6011 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6012 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6013 int i;
6014 for (i = 0; i < zsl->level; i++) {
6015 if (update[i]->forward[i] == x) {
6016 if (i > 0) {
6017 update[i]->span[i-1] += x->span[i-1] - 1;
6018 }
6019 update[i]->forward[i] = x->forward[i];
6020 } else {
6021 /* invariant: i > 0, because update[0]->forward[0]
6022 * is always equal to x */
6023 update[i]->span[i-1] -= 1;
6024 }
6025 }
6026 if (x->forward[0]) {
6027 x->forward[0]->backward = x->backward;
6028 } else {
6029 zsl->tail = x->backward;
6030 }
6031 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6032 zsl->level--;
6033 zsl->length--;
6034 }
6035
6036 /* Delete an element with matching score/object from the skiplist. */
6037 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
6038 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6039 int i;
6040
6041 x = zsl->header;
6042 for (i = zsl->level-1; i >= 0; i--) {
6043 while (x->forward[i] &&
6044 (x->forward[i]->score < score ||
6045 (x->forward[i]->score == score &&
6046 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6047 x = x->forward[i];
6048 update[i] = x;
6049 }
6050 /* We may have multiple elements with the same score, what we need
6051 * is to find the element with both the right score and object. */
6052 x = x->forward[0];
6053 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
6054 zslDeleteNode(zsl, x, update);
6055 zslFreeNode(x);
6056 return 1;
6057 } else {
6058 return 0; /* not found */
6059 }
6060 return 0; /* not found */
6061 }
6062
6063 /* Delete all the elements with score between min and max from the skiplist.
6064 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6065 * Note that this function takes the reference to the hash table view of the
6066 * sorted set, in order to remove the elements from the hash table too. */
6067 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
6068 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6069 unsigned long removed = 0;
6070 int i;
6071
6072 x = zsl->header;
6073 for (i = zsl->level-1; i >= 0; i--) {
6074 while (x->forward[i] && x->forward[i]->score < min)
6075 x = x->forward[i];
6076 update[i] = x;
6077 }
6078 /* We may have multiple elements with the same score, what we need
6079 * is to find the element with both the right score and object. */
6080 x = x->forward[0];
6081 while (x && x->score <= max) {
6082 zskiplistNode *next = x->forward[0];
6083 zslDeleteNode(zsl, x, update);
6084 dictDelete(dict,x->obj);
6085 zslFreeNode(x);
6086 removed++;
6087 x = next;
6088 }
6089 return removed; /* not found */
6090 }
6091
6092 /* Delete all the elements with rank between start and end from the skiplist.
6093 * Start and end are inclusive. Note that start and end need to be 1-based */
6094 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6095 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6096 unsigned long traversed = 0, removed = 0;
6097 int i;
6098
6099 x = zsl->header;
6100 for (i = zsl->level-1; i >= 0; i--) {
6101 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6102 traversed += i > 0 ? x->span[i-1] : 1;
6103 x = x->forward[i];
6104 }
6105 update[i] = x;
6106 }
6107
6108 traversed++;
6109 x = x->forward[0];
6110 while (x && traversed <= end) {
6111 zskiplistNode *next = x->forward[0];
6112 zslDeleteNode(zsl, x, update);
6113 dictDelete(dict,x->obj);
6114 zslFreeNode(x);
6115 removed++;
6116 traversed++;
6117 x = next;
6118 }
6119 return removed;
6120 }
6121
6122 /* Find the first node having a score equal or greater than the specified one.
6123 * Returns NULL if there is no match. */
6124 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6125 zskiplistNode *x;
6126 int i;
6127
6128 x = zsl->header;
6129 for (i = zsl->level-1; i >= 0; i--) {
6130 while (x->forward[i] && x->forward[i]->score < score)
6131 x = x->forward[i];
6132 }
6133 /* We may have multiple elements with the same score, what we need
6134 * is to find the element with both the right score and object. */
6135 return x->forward[0];
6136 }
6137
6138 /* Find the rank for an element by both score and key.
6139 * Returns 0 when the element cannot be found, rank otherwise.
6140 * Note that the rank is 1-based due to the span of zsl->header to the
6141 * first element. */
6142 static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
6143 zskiplistNode *x;
6144 unsigned long rank = 0;
6145 int i;
6146
6147 x = zsl->header;
6148 for (i = zsl->level-1; i >= 0; i--) {
6149 while (x->forward[i] &&
6150 (x->forward[i]->score < score ||
6151 (x->forward[i]->score == score &&
6152 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6153 rank += i > 0 ? x->span[i-1] : 1;
6154 x = x->forward[i];
6155 }
6156
6157 /* x might be equal to zsl->header, so test if obj is non-NULL */
6158 if (x->obj && equalStringObjects(x->obj,o)) {
6159 return rank;
6160 }
6161 }
6162 return 0;
6163 }
6164
6165 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6166 zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
6167 zskiplistNode *x;
6168 unsigned long traversed = 0;
6169 int i;
6170
6171 x = zsl->header;
6172 for (i = zsl->level-1; i >= 0; i--) {
6173 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6174 {
6175 traversed += i > 0 ? x->span[i-1] : 1;
6176 x = x->forward[i];
6177 }
6178 if (traversed == rank) {
6179 return x;
6180 }
6181 }
6182 return NULL;
6183 }
6184
6185 /* The actual Z-commands implementations */
6186
6187 /* This generic command implements both ZADD and ZINCRBY.
6188 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6189 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6190 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6191 robj *zsetobj;
6192 zset *zs;
6193 double *score;
6194
6195 if (isnan(scoreval)) {
6196 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6197 return;
6198 }
6199
6200 zsetobj = lookupKeyWrite(c->db,key);
6201 if (zsetobj == NULL) {
6202 zsetobj = createZsetObject();
6203 dbAdd(c->db,key,zsetobj);
6204 } else {
6205 if (zsetobj->type != REDIS_ZSET) {
6206 addReply(c,shared.wrongtypeerr);
6207 return;
6208 }
6209 }
6210 zs = zsetobj->ptr;
6211
6212 /* Ok now since we implement both ZADD and ZINCRBY here the code
6213 * needs to handle the two different conditions. It's all about setting
6214 * '*score', that is, the new score to set, to the right value. */
6215 score = zmalloc(sizeof(double));
6216 if (doincrement) {
6217 dictEntry *de;
6218
6219 /* Read the old score. If the element was not present starts from 0 */
6220 de = dictFind(zs->dict,ele);
6221 if (de) {
6222 double *oldscore = dictGetEntryVal(de);
6223 *score = *oldscore + scoreval;
6224 } else {
6225 *score = scoreval;
6226 }
6227 if (isnan(*score)) {
6228 addReplySds(c,
6229 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6230 zfree(score);
6231 /* Note that we don't need to check if the zset may be empty and
6232 * should be removed here, as we can only obtain Nan as score if
6233 * there was already an element in the sorted set. */
6234 return;
6235 }
6236 } else {
6237 *score = scoreval;
6238 }
6239
6240 /* What follows is a simple remove and re-insert operation that is common
6241 * to both ZADD and ZINCRBY... */
6242 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6243 /* case 1: New element */
6244 incrRefCount(ele); /* added to hash */
6245 zslInsert(zs->zsl,*score,ele);
6246 incrRefCount(ele); /* added to skiplist */
6247 server.dirty++;
6248 if (doincrement)
6249 addReplyDouble(c,*score);
6250 else
6251 addReply(c,shared.cone);
6252 } else {
6253 dictEntry *de;
6254 double *oldscore;
6255
6256 /* case 2: Score update operation */
6257 de = dictFind(zs->dict,ele);
6258 redisAssert(de != NULL);
6259 oldscore = dictGetEntryVal(de);
6260 if (*score != *oldscore) {
6261 int deleted;
6262
6263 /* Remove and insert the element in the skip list with new score */
6264 deleted = zslDelete(zs->zsl,*oldscore,ele);
6265 redisAssert(deleted != 0);
6266 zslInsert(zs->zsl,*score,ele);
6267 incrRefCount(ele);
6268 /* Update the score in the hash table */
6269 dictReplace(zs->dict,ele,score);
6270 server.dirty++;
6271 } else {
6272 zfree(score);
6273 }
6274 if (doincrement)
6275 addReplyDouble(c,*score);
6276 else
6277 addReply(c,shared.czero);
6278 }
6279 }
6280
6281 static void zaddCommand(redisClient *c) {
6282 double scoreval;
6283
6284 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6285 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6286 }
6287
6288 static void zincrbyCommand(redisClient *c) {
6289 double scoreval;
6290
6291 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6292 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6293 }
6294
6295 static void zremCommand(redisClient *c) {
6296 robj *zsetobj;
6297 zset *zs;
6298 dictEntry *de;
6299 double *oldscore;
6300 int deleted;
6301
6302 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6303 checkType(c,zsetobj,REDIS_ZSET)) return;
6304
6305 zs = zsetobj->ptr;
6306 de = dictFind(zs->dict,c->argv[2]);
6307 if (de == NULL) {
6308 addReply(c,shared.czero);
6309 return;
6310 }
6311 /* Delete from the skiplist */
6312 oldscore = dictGetEntryVal(de);
6313 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6314 redisAssert(deleted != 0);
6315
6316 /* Delete from the hash table */
6317 dictDelete(zs->dict,c->argv[2]);
6318 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6319 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6320 server.dirty++;
6321 addReply(c,shared.cone);
6322 }
6323
6324 static void zremrangebyscoreCommand(redisClient *c) {
6325 double min;
6326 double max;
6327 long deleted;
6328 robj *zsetobj;
6329 zset *zs;
6330
6331 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6332 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6333
6334 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6335 checkType(c,zsetobj,REDIS_ZSET)) return;
6336
6337 zs = zsetobj->ptr;
6338 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6339 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6340 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6341 server.dirty += deleted;
6342 addReplyLongLong(c,deleted);
6343 }
6344
6345 static void zremrangebyrankCommand(redisClient *c) {
6346 long start;
6347 long end;
6348 int llen;
6349 long deleted;
6350 robj *zsetobj;
6351 zset *zs;
6352
6353 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6354 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6355
6356 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6357 checkType(c,zsetobj,REDIS_ZSET)) return;
6358 zs = zsetobj->ptr;
6359 llen = zs->zsl->length;
6360
6361 /* convert negative indexes */
6362 if (start < 0) start = llen+start;
6363 if (end < 0) end = llen+end;
6364 if (start < 0) start = 0;
6365 if (end < 0) end = 0;
6366
6367 /* indexes sanity checks */
6368 if (start > end || start >= llen) {
6369 addReply(c,shared.czero);
6370 return;
6371 }
6372 if (end >= llen) end = llen-1;
6373
6374 /* increment start and end because zsl*Rank functions
6375 * use 1-based rank */
6376 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6377 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6378 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6379 server.dirty += deleted;
6380 addReplyLongLong(c, deleted);
6381 }
6382
6383 typedef struct {
6384 dict *dict;
6385 double weight;
6386 } zsetopsrc;
6387
6388 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6389 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6390 unsigned long size1, size2;
6391 size1 = d1->dict ? dictSize(d1->dict) : 0;
6392 size2 = d2->dict ? dictSize(d2->dict) : 0;
6393 return size1 - size2;
6394 }
6395
/* How zunionInterAggregate() combines the scores of an element present in
 * several inputs: add them up, keep the smallest, or keep the largest. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Score carried by dict entry '_e': the double its value points to, or 1.0
 * when the value is NULL (setTypeAdd() stores plain set members with a
 * NULL value). Note that '_e' is evaluated twice. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
6400
6401 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6402 if (aggregate == REDIS_AGGR_SUM) {
6403 *target = *target + val;
6404 } else if (aggregate == REDIS_AGGR_MIN) {
6405 *target = val < *target ? val : *target;
6406 } else if (aggregate == REDIS_AGGR_MAX) {
6407 *target = val > *target ? val : *target;
6408 } else {
6409 /* safety net */
6410 redisPanic("Unknown ZUNION/INTER aggregate type");
6411 }
6412 }
6413
6414 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6415 int i, j, setnum;
6416 int aggregate = REDIS_AGGR_SUM;
6417 zsetopsrc *src;
6418 robj *dstobj;
6419 zset *dstzset;
6420 dictIterator *di;
6421 dictEntry *de;
6422
6423 /* expect setnum input keys to be given */
6424 setnum = atoi(c->argv[2]->ptr);
6425 if (setnum < 1) {
6426 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6427 return;
6428 }
6429
6430 /* test if the expected number of keys would overflow */
6431 if (3+setnum > c->argc) {
6432 addReply(c,shared.syntaxerr);
6433 return;
6434 }
6435
6436 /* read keys to be used for input */
6437 src = zmalloc(sizeof(zsetopsrc) * setnum);
6438 for (i = 0, j = 3; i < setnum; i++, j++) {
6439 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6440 if (!obj) {
6441 src[i].dict = NULL;
6442 } else {
6443 if (obj->type == REDIS_ZSET) {
6444 src[i].dict = ((zset*)obj->ptr)->dict;
6445 } else if (obj->type == REDIS_SET) {
6446 src[i].dict = (obj->ptr);
6447 } else {
6448 zfree(src);
6449 addReply(c,shared.wrongtypeerr);
6450 return;
6451 }
6452 }
6453
6454 /* default all weights to 1 */
6455 src[i].weight = 1.0;
6456 }
6457
6458 /* parse optional extra arguments */
6459 if (j < c->argc) {
6460 int remaining = c->argc - j;
6461
6462 while (remaining) {
6463 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6464 j++; remaining--;
6465 for (i = 0; i < setnum; i++, j++, remaining--) {
6466 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6467 return;
6468 }
6469 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6470 j++; remaining--;
6471 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6472 aggregate = REDIS_AGGR_SUM;
6473 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6474 aggregate = REDIS_AGGR_MIN;
6475 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6476 aggregate = REDIS_AGGR_MAX;
6477 } else {
6478 zfree(src);
6479 addReply(c,shared.syntaxerr);
6480 return;
6481 }
6482 j++; remaining--;
6483 } else {
6484 zfree(src);
6485 addReply(c,shared.syntaxerr);
6486 return;
6487 }
6488 }
6489 }
6490
6491 /* sort sets from the smallest to largest, this will improve our
6492 * algorithm's performance */
6493 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6494
6495 dstobj = createZsetObject();
6496 dstzset = dstobj->ptr;
6497
6498 if (op == REDIS_OP_INTER) {
6499 /* skip going over all entries if the smallest zset is NULL or empty */
6500 if (src[0].dict && dictSize(src[0].dict) > 0) {
6501 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6502 * from small to large, all src[i > 0].dict are non-empty too */
6503 di = dictGetIterator(src[0].dict);
6504 while((de = dictNext(di)) != NULL) {
6505 double *score = zmalloc(sizeof(double)), value;
6506 *score = src[0].weight * zunionInterDictValue(de);
6507
6508 for (j = 1; j < setnum; j++) {
6509 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6510 if (other) {
6511 value = src[j].weight * zunionInterDictValue(other);
6512 zunionInterAggregate(score, value, aggregate);
6513 } else {
6514 break;
6515 }
6516 }
6517
6518 /* skip entry when not present in every source dict */
6519 if (j != setnum) {
6520 zfree(score);
6521 } else {
6522 robj *o = dictGetEntryKey(de);
6523 dictAdd(dstzset->dict,o,score);
6524 incrRefCount(o); /* added to dictionary */
6525 zslInsert(dstzset->zsl,*score,o);
6526 incrRefCount(o); /* added to skiplist */
6527 }
6528 }
6529 dictReleaseIterator(di);
6530 }
6531 } else if (op == REDIS_OP_UNION) {
6532 for (i = 0; i < setnum; i++) {
6533 if (!src[i].dict) continue;
6534
6535 di = dictGetIterator(src[i].dict);
6536 while((de = dictNext(di)) != NULL) {
6537 /* skip key when already processed */
6538 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6539
6540 double *score = zmalloc(sizeof(double)), value;
6541 *score = src[i].weight * zunionInterDictValue(de);
6542
6543 /* because the zsets are sorted by size, its only possible
6544 * for sets at larger indices to hold this entry */
6545 for (j = (i+1); j < setnum; j++) {
6546 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6547 if (other) {
6548 value = src[j].weight * zunionInterDictValue(other);
6549 zunionInterAggregate(score, value, aggregate);
6550 }
6551 }
6552
6553 robj *o = dictGetEntryKey(de);
6554 dictAdd(dstzset->dict,o,score);
6555 incrRefCount(o); /* added to dictionary */
6556 zslInsert(dstzset->zsl,*score,o);
6557 incrRefCount(o); /* added to skiplist */
6558 }
6559 dictReleaseIterator(di);
6560 }
6561 } else {
6562 /* unknown operator */
6563 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6564 }
6565
6566 dbDelete(c->db,dstkey);
6567 if (dstzset->zsl->length) {
6568 dbAdd(c->db,dstkey,dstobj);
6569 addReplyLongLong(c, dstzset->zsl->length);
6570 server.dirty++;
6571 } else {
6572 decrRefCount(dstobj);
6573 addReply(c, shared.czero);
6574 }
6575 zfree(src);
6576 }
6577
6578 static void zunionstoreCommand(redisClient *c) {
6579 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6580 }
6581
6582 static void zinterstoreCommand(redisClient *c) {
6583 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6584 }
6585
6586 static void zrangeGenericCommand(redisClient *c, int reverse) {
6587 robj *o;
6588 long start;
6589 long end;
6590 int withscores = 0;
6591 int llen;
6592 int rangelen, j;
6593 zset *zsetobj;
6594 zskiplist *zsl;
6595 zskiplistNode *ln;
6596 robj *ele;
6597
6598 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6599 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6600
6601 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6602 withscores = 1;
6603 } else if (c->argc >= 5) {
6604 addReply(c,shared.syntaxerr);
6605 return;
6606 }
6607
6608 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6609 || checkType(c,o,REDIS_ZSET)) return;
6610 zsetobj = o->ptr;
6611 zsl = zsetobj->zsl;
6612 llen = zsl->length;
6613
6614 /* convert negative indexes */
6615 if (start < 0) start = llen+start;
6616 if (end < 0) end = llen+end;
6617 if (start < 0) start = 0;
6618 if (end < 0) end = 0;
6619
6620 /* indexes sanity checks */
6621 if (start > end || start >= llen) {
6622 /* Out of range start or start > end result in empty list */
6623 addReply(c,shared.emptymultibulk);
6624 return;
6625 }
6626 if (end >= llen) end = llen-1;
6627 rangelen = (end-start)+1;
6628
6629 /* check if starting point is trivial, before searching
6630 * the element in log(N) time */
6631 if (reverse) {
6632 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
6633 } else {
6634 ln = start == 0 ?
6635 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
6636 }
6637
6638 /* Return the result in form of a multi-bulk reply */
6639 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6640 withscores ? (rangelen*2) : rangelen));
6641 for (j = 0; j < rangelen; j++) {
6642 ele = ln->obj;
6643 addReplyBulk(c,ele);
6644 if (withscores)
6645 addReplyDouble(c,ln->score);
6646 ln = reverse ? ln->backward : ln->forward[0];
6647 }
6648 }
6649
6650 static void zrangeCommand(redisClient *c) {
6651 zrangeGenericCommand(c,0);
6652 }
6653
6654 static void zrevrangeCommand(redisClient *c) {
6655 zrangeGenericCommand(c,1);
6656 }
6657
6658 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6659 * If justcount is non-zero, just the count is returned. */
6660 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6661 robj *o;
6662 double min, max;
6663 int minex = 0, maxex = 0; /* are min or max exclusive? */
6664 int offset = 0, limit = -1;
6665 int withscores = 0;
6666 int badsyntax = 0;
6667
6668 /* Parse the min-max interval. If one of the values is prefixed
6669 * by the "(" character, it's considered "open". For instance
6670 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6671 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6672 if (((char*)c->argv[2]->ptr)[0] == '(') {
6673 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6674 minex = 1;
6675 } else {
6676 min = strtod(c->argv[2]->ptr,NULL);
6677 }
6678 if (((char*)c->argv[3]->ptr)[0] == '(') {
6679 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6680 maxex = 1;
6681 } else {
6682 max = strtod(c->argv[3]->ptr,NULL);
6683 }
6684
6685 /* Parse "WITHSCORES": note that if the command was called with
6686 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6687 * enter the following paths to parse WITHSCORES and LIMIT. */
6688 if (c->argc == 5 || c->argc == 8) {
6689 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6690 withscores = 1;
6691 else
6692 badsyntax = 1;
6693 }
6694 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6695 badsyntax = 1;
6696 if (badsyntax) {
6697 addReplySds(c,
6698 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6699 return;
6700 }
6701
6702 /* Parse "LIMIT" */
6703 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6704 addReply(c,shared.syntaxerr);
6705 return;
6706 } else if (c->argc == (7 + withscores)) {
6707 offset = atoi(c->argv[5]->ptr);
6708 limit = atoi(c->argv[6]->ptr);
6709 if (offset < 0) offset = 0;
6710 }
6711
6712 /* Ok, lookup the key and get the range */
6713 o = lookupKeyRead(c->db,c->argv[1]);
6714 if (o == NULL) {
6715 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6716 } else {
6717 if (o->type != REDIS_ZSET) {
6718 addReply(c,shared.wrongtypeerr);
6719 } else {
6720 zset *zsetobj = o->ptr;
6721 zskiplist *zsl = zsetobj->zsl;
6722 zskiplistNode *ln;
6723 robj *ele, *lenobj = NULL;
6724 unsigned long rangelen = 0;
6725
6726 /* Get the first node with the score >= min, or with
6727 * score > min if 'minex' is true. */
6728 ln = zslFirstWithScore(zsl,min);
6729 while (minex && ln && ln->score == min) ln = ln->forward[0];
6730
6731 if (ln == NULL) {
6732 /* No element matching the speciifed interval */
6733 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6734 return;
6735 }
6736
6737 /* We don't know in advance how many matching elements there
6738 * are in the list, so we push this object that will represent
6739 * the multi-bulk length in the output buffer, and will "fix"
6740 * it later */
6741 if (!justcount) {
6742 lenobj = createObject(REDIS_STRING,NULL);
6743 addReply(c,lenobj);
6744 decrRefCount(lenobj);
6745 }
6746
6747 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6748 if (offset) {
6749 offset--;
6750 ln = ln->forward[0];
6751 continue;
6752 }
6753 if (limit == 0) break;
6754 if (!justcount) {
6755 ele = ln->obj;
6756 addReplyBulk(c,ele);
6757 if (withscores)
6758 addReplyDouble(c,ln->score);
6759 }
6760 ln = ln->forward[0];
6761 rangelen++;
6762 if (limit > 0) limit--;
6763 }
6764 if (justcount) {
6765 addReplyLongLong(c,(long)rangelen);
6766 } else {
6767 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6768 withscores ? (rangelen*2) : rangelen);
6769 }
6770 }
6771 }
6772 }
6773
6774 static void zrangebyscoreCommand(redisClient *c) {
6775 genericZrangebyscoreCommand(c,0);
6776 }
6777
6778 static void zcountCommand(redisClient *c) {
6779 genericZrangebyscoreCommand(c,1);
6780 }
6781
6782 static void zcardCommand(redisClient *c) {
6783 robj *o;
6784 zset *zs;
6785
6786 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6787 checkType(c,o,REDIS_ZSET)) return;
6788
6789 zs = o->ptr;
6790 addReplyUlong(c,zs->zsl->length);
6791 }
6792
6793 static void zscoreCommand(redisClient *c) {
6794 robj *o;
6795 zset *zs;
6796 dictEntry *de;
6797
6798 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6799 checkType(c,o,REDIS_ZSET)) return;
6800
6801 zs = o->ptr;
6802 de = dictFind(zs->dict,c->argv[2]);
6803 if (!de) {
6804 addReply(c,shared.nullbulk);
6805 } else {
6806 double *score = dictGetEntryVal(de);
6807
6808 addReplyDouble(c,*score);
6809 }
6810 }
6811
6812 static void zrankGenericCommand(redisClient *c, int reverse) {
6813 robj *o;
6814 zset *zs;
6815 zskiplist *zsl;
6816 dictEntry *de;
6817 unsigned long rank;
6818 double *score;
6819
6820 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6821 checkType(c,o,REDIS_ZSET)) return;
6822
6823 zs = o->ptr;
6824 zsl = zs->zsl;
6825 de = dictFind(zs->dict,c->argv[2]);
6826 if (!de) {
6827 addReply(c,shared.nullbulk);
6828 return;
6829 }
6830
6831 score = dictGetEntryVal(de);
6832 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
6833 if (rank) {
6834 if (reverse) {
6835 addReplyLongLong(c, zsl->length - rank);
6836 } else {
6837 addReplyLongLong(c, rank-1);
6838 }
6839 } else {
6840 addReply(c,shared.nullbulk);
6841 }
6842 }
6843
6844 static void zrankCommand(redisClient *c) {
6845 zrankGenericCommand(c, 0);
6846 }
6847
6848 static void zrevrankCommand(redisClient *c) {
6849 zrankGenericCommand(c, 1);
6850 }
6851
6852 /* ========================= Hashes utility functions ======================= */
6853 #define REDIS_HASH_KEY 1
6854 #define REDIS_HASH_VALUE 2
6855
6856 /* Check the length of a number of objects to see if we need to convert a
6857 * zipmap to a real hash. Note that we only check string encoded objects
6858 * as their string length can be queried in constant time. */
6859 static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
6860 int i;
6861 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6862
6863 for (i = start; i <= end; i++) {
6864 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6865 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6866 {
6867 convertToRealHash(subject);
6868 return;
6869 }
6870 }
6871 }
6872
6873 /* Encode given objects in-place when the hash uses a dict. */
6874 static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6875 if (subject->encoding == REDIS_ENCODING_HT) {
6876 if (o1) *o1 = tryObjectEncoding(*o1);
6877 if (o2) *o2 = tryObjectEncoding(*o2);
6878 }
6879 }
6880
6881 /* Get the value from a hash identified by key. Returns either a string
6882 * object or NULL if the value cannot be found. The refcount of the object
6883 * is always increased by 1 when the value was found. */
6884 static robj *hashTypeGet(robj *o, robj *key) {
6885 robj *value = NULL;
6886 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6887 unsigned char *v;
6888 unsigned int vlen;
6889 key = getDecodedObject(key);
6890 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6891 value = createStringObject((char*)v,vlen);
6892 }
6893 decrRefCount(key);
6894 } else {
6895 dictEntry *de = dictFind(o->ptr,key);
6896 if (de != NULL) {
6897 value = dictGetEntryVal(de);
6898 incrRefCount(value);
6899 }
6900 }
6901 return value;
6902 }
6903
6904 /* Test if the key exists in the given hash. Returns 1 if the key
6905 * exists and 0 when it doesn't. */
6906 static int hashTypeExists(robj *o, robj *key) {
6907 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6908 key = getDecodedObject(key);
6909 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6910 decrRefCount(key);
6911 return 1;
6912 }
6913 decrRefCount(key);
6914 } else {
6915 if (dictFind(o->ptr,key) != NULL) {
6916 return 1;
6917 }
6918 }
6919 return 0;
6920 }
6921
6922 /* Add an element, discard the old if the key already exists.
6923 * Return 0 on insert and 1 on update. */
6924 static int hashTypeSet(robj *o, robj *key, robj *value) {
6925 int update = 0;
6926 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6927 key = getDecodedObject(key);
6928 value = getDecodedObject(value);
6929 o->ptr = zipmapSet(o->ptr,
6930 key->ptr,sdslen(key->ptr),
6931 value->ptr,sdslen(value->ptr), &update);
6932 decrRefCount(key);
6933 decrRefCount(value);
6934
6935 /* Check if the zipmap needs to be upgraded to a real hash table */
6936 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6937 convertToRealHash(o);
6938 } else {
6939 if (dictReplace(o->ptr,key,value)) {
6940 /* Insert */
6941 incrRefCount(key);
6942 } else {
6943 /* Update */
6944 update = 1;
6945 }
6946 incrRefCount(value);
6947 }
6948 return update;
6949 }
6950
6951 /* Delete an element from a hash.
6952 * Return 1 on deleted and 0 on not found. */
6953 static int hashTypeDelete(robj *o, robj *key) {
6954 int deleted = 0;
6955 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6956 key = getDecodedObject(key);
6957 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6958 decrRefCount(key);
6959 } else {
6960 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6961 /* Always check if the dictionary needs a resize after a delete. */
6962 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6963 }
6964 return deleted;
6965 }
6966
6967 /* Return the number of elements in a hash. */
6968 static unsigned long hashTypeLength(robj *o) {
6969 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6970 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6971 }
6972
6973 /* Structure to hold hash iteration abstration. Note that iteration over
6974 * hashes involves both fields and values. Because it is possible that
6975 * not both are required, store pointers in the iterator to avoid
6976 * unnecessary memory allocation for fields/values. */
6977 typedef struct {
6978 int encoding;
6979 unsigned char *zi;
6980 unsigned char *zk, *zv;
6981 unsigned int zklen, zvlen;
6982
6983 dictIterator *di;
6984 dictEntry *de;
6985 } hashTypeIterator;
6986
6987 static hashTypeIterator *hashTypeInitIterator(robj *subject) {
6988 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
6989 hi->encoding = subject->encoding;
6990 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6991 hi->zi = zipmapRewind(subject->ptr);
6992 } else if (hi->encoding == REDIS_ENCODING_HT) {
6993 hi->di = dictGetIterator(subject->ptr);
6994 } else {
6995 redisAssert(NULL);
6996 }
6997 return hi;
6998 }
6999
7000 static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7001 if (hi->encoding == REDIS_ENCODING_HT) {
7002 dictReleaseIterator(hi->di);
7003 }
7004 zfree(hi);
7005 }
7006
7007 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7008 * could be found and REDIS_ERR when the iterator reaches the end. */
7009 static int hashTypeNext(hashTypeIterator *hi) {
7010 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7011 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7012 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7013 } else {
7014 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7015 }
7016 return REDIS_OK;
7017 }
7018
7019 /* Get key or value object at current iteration position.
7020 * This increases the refcount of the field object by 1. */
7021 static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7022 robj *o;
7023 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7024 if (what & REDIS_HASH_KEY) {
7025 o = createStringObject((char*)hi->zk,hi->zklen);
7026 } else {
7027 o = createStringObject((char*)hi->zv,hi->zvlen);
7028 }
7029 } else {
7030 if (what & REDIS_HASH_KEY) {
7031 o = dictGetEntryKey(hi->de);
7032 } else {
7033 o = dictGetEntryVal(hi->de);
7034 }
7035 incrRefCount(o);
7036 }
7037 return o;
7038 }
7039
7040 static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7041 robj *o = lookupKeyWrite(c->db,key);
7042 if (o == NULL) {
7043 o = createHashObject();
7044 dbAdd(c->db,key,o);
7045 } else {
7046 if (o->type != REDIS_HASH) {
7047 addReply(c,shared.wrongtypeerr);
7048 return NULL;
7049 }
7050 }
7051 return o;
7052 }
7053
7054 /* ============================= Hash commands ============================== */
7055 static void hsetCommand(redisClient *c) {
7056 int update;
7057 robj *o;
7058
7059 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7060 hashTypeTryConversion(o,c->argv,2,3);
7061 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7062 update = hashTypeSet(o,c->argv[2],c->argv[3]);
7063 addReply(c, update ? shared.czero : shared.cone);
7064 server.dirty++;
7065 }
7066
7067 static void hsetnxCommand(redisClient *c) {
7068 robj *o;
7069 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7070 hashTypeTryConversion(o,c->argv,2,3);
7071
7072 if (hashTypeExists(o, c->argv[2])) {
7073 addReply(c, shared.czero);
7074 } else {
7075 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7076 hashTypeSet(o,c->argv[2],c->argv[3]);
7077 addReply(c, shared.cone);
7078 server.dirty++;
7079 }
7080 }
7081
7082 static void hmsetCommand(redisClient *c) {
7083 int i;
7084 robj *o;
7085
7086 if ((c->argc % 2) == 1) {
7087 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7088 return;
7089 }
7090
7091 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7092 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7093 for (i = 2; i < c->argc; i += 2) {
7094 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7095 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7096 }
7097 addReply(c, shared.ok);
7098 server.dirty++;
7099 }
7100
7101 static void hincrbyCommand(redisClient *c) {
7102 long long value, incr;
7103 robj *o, *current, *new;
7104
7105 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7106 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7107 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
7108 if (getLongLongFromObjectOrReply(c,current,&value,
7109 "hash value is not an integer") != REDIS_OK) {
7110 decrRefCount(current);
7111 return;
7112 }
7113 decrRefCount(current);
7114 } else {
7115 value = 0;
7116 }
7117
7118 value += incr;
7119 new = createStringObjectFromLongLong(value);
7120 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7121 hashTypeSet(o,c->argv[2],new);
7122 decrRefCount(new);
7123 addReplyLongLong(c,value);
7124 server.dirty++;
7125 }
7126
7127 static void hgetCommand(redisClient *c) {
7128 robj *o, *value;
7129 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7130 checkType(c,o,REDIS_HASH)) return;
7131
7132 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7133 addReplyBulk(c,value);
7134 decrRefCount(value);
7135 } else {
7136 addReply(c,shared.nullbulk);
7137 }
7138 }
7139
7140 static void hmgetCommand(redisClient *c) {
7141 int i;
7142 robj *o, *value;
7143 o = lookupKeyRead(c->db,c->argv[1]);
7144 if (o != NULL && o->type != REDIS_HASH) {
7145 addReply(c,shared.wrongtypeerr);
7146 }
7147
7148 /* Note the check for o != NULL happens inside the loop. This is
7149 * done because objects that cannot be found are considered to be
7150 * an empty hash. The reply should then be a series of NULLs. */
7151 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7152 for (i = 2; i < c->argc; i++) {
7153 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7154 addReplyBulk(c,value);
7155 decrRefCount(value);
7156 } else {
7157 addReply(c,shared.nullbulk);
7158 }
7159 }
7160 }
7161
7162 static void hdelCommand(redisClient *c) {
7163 robj *o;
7164 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7165 checkType(c,o,REDIS_HASH)) return;
7166
7167 if (hashTypeDelete(o,c->argv[2])) {
7168 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7169 addReply(c,shared.cone);
7170 server.dirty++;
7171 } else {
7172 addReply(c,shared.czero);
7173 }
7174 }
7175
7176 static void hlenCommand(redisClient *c) {
7177 robj *o;
7178 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7179 checkType(c,o,REDIS_HASH)) return;
7180
7181 addReplyUlong(c,hashTypeLength(o));
7182 }
7183
7184 static void genericHgetallCommand(redisClient *c, int flags) {
7185 robj *o, *lenobj, *obj;
7186 unsigned long count = 0;
7187 hashTypeIterator *hi;
7188
7189 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7190 || checkType(c,o,REDIS_HASH)) return;
7191
7192 lenobj = createObject(REDIS_STRING,NULL);
7193 addReply(c,lenobj);
7194 decrRefCount(lenobj);
7195
7196 hi = hashTypeInitIterator(o);
7197 while (hashTypeNext(hi) != REDIS_ERR) {
7198 if (flags & REDIS_HASH_KEY) {
7199 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7200 addReplyBulk(c,obj);
7201 decrRefCount(obj);
7202 count++;
7203 }
7204 if (flags & REDIS_HASH_VALUE) {
7205 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7206 addReplyBulk(c,obj);
7207 decrRefCount(obj);
7208 count++;
7209 }
7210 }
7211 hashTypeReleaseIterator(hi);
7212
7213 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7214 }
7215
7216 static void hkeysCommand(redisClient *c) {
7217 genericHgetallCommand(c,REDIS_HASH_KEY);
7218 }
7219
7220 static void hvalsCommand(redisClient *c) {
7221 genericHgetallCommand(c,REDIS_HASH_VALUE);
7222 }
7223
7224 static void hgetallCommand(redisClient *c) {
7225 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7226 }
7227
7228 static void hexistsCommand(redisClient *c) {
7229 robj *o;
7230 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7231 checkType(c,o,REDIS_HASH)) return;
7232
7233 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
7234 }
7235
7236 static void convertToRealHash(robj *o) {
7237 unsigned char *key, *val, *p, *zm = o->ptr;
7238 unsigned int klen, vlen;
7239 dict *dict = dictCreate(&hashDictType,NULL);
7240
7241 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7242 p = zipmapRewind(zm);
7243 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7244 robj *keyobj, *valobj;
7245
7246 keyobj = createStringObject((char*)key,klen);
7247 valobj = createStringObject((char*)val,vlen);
7248 keyobj = tryObjectEncoding(keyobj);
7249 valobj = tryObjectEncoding(valobj);
7250 dictAdd(dict,keyobj,valobj);
7251 }
7252 o->encoding = REDIS_ENCODING_HT;
7253 o->ptr = dict;
7254 zfree(zm);
7255 }
7256
7257 /* ========================= Non type-specific commands ==================== */
7258
7259 static void flushdbCommand(redisClient *c) {
7260 server.dirty += dictSize(c->db->dict);
7261 touchWatchedKeysOnFlush(c->db->id);
7262 dictEmpty(c->db->dict);
7263 dictEmpty(c->db->expires);
7264 addReply(c,shared.ok);
7265 }
7266
7267 static void flushallCommand(redisClient *c) {
7268 touchWatchedKeysOnFlush(-1);
7269 server.dirty += emptyDb();
7270 addReply(c,shared.ok);
7271 if (server.bgsavechildpid != -1) {
7272 kill(server.bgsavechildpid,SIGKILL);
7273 rdbRemoveTempFile(server.bgsavechildpid);
7274 }
7275 rdbSave(server.dbfilename);
7276 server.dirty++;
7277 }
7278
7279 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7280 redisSortOperation *so = zmalloc(sizeof(*so));
7281 so->type = type;
7282 so->pattern = pattern;
7283 return so;
7284 }
7285
7286 /* Return the value associated to the key with a name obtained
7287 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7288 * The returned object will always have its refcount increased by 1
7289 * when it is non-NULL. */
7290 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7291 char *p, *f;
7292 sds spat, ssub;
7293 robj keyobj, fieldobj, *o;
7294 int prefixlen, sublen, postfixlen, fieldlen;
7295 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7296 struct {
7297 long len;
7298 long free;
7299 char buf[REDIS_SORTKEY_MAX+1];
7300 } keyname, fieldname;
7301
7302 /* If the pattern is "#" return the substitution object itself in order
7303 * to implement the "SORT ... GET #" feature. */
7304 spat = pattern->ptr;
7305 if (spat[0] == '#' && spat[1] == '\0') {
7306 incrRefCount(subst);
7307 return subst;
7308 }
7309
7310 /* The substitution object may be specially encoded. If so we create
7311 * a decoded object on the fly. Otherwise getDecodedObject will just
7312 * increment the ref count, that we'll decrement later. */
7313 subst = getDecodedObject(subst);
7314
7315 ssub = subst->ptr;
7316 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7317 p = strchr(spat,'*');
7318 if (!p) {
7319 decrRefCount(subst);
7320 return NULL;
7321 }
7322
7323 /* Find out if we're dealing with a hash dereference. */
7324 if ((f = strstr(p+1, "->")) != NULL) {
7325 fieldlen = sdslen(spat)-(f-spat);
7326 /* this also copies \0 character */
7327 memcpy(fieldname.buf,f+2,fieldlen-1);
7328 fieldname.len = fieldlen-2;
7329 } else {
7330 fieldlen = 0;
7331 }
7332
7333 prefixlen = p-spat;
7334 sublen = sdslen(ssub);
7335 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7336 memcpy(keyname.buf,spat,prefixlen);
7337 memcpy(keyname.buf+prefixlen,ssub,sublen);
7338 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7339 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7340 keyname.len = prefixlen+sublen+postfixlen;
7341 decrRefCount(subst);
7342
7343 /* Lookup substituted key */
7344 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7345 o = lookupKeyRead(db,&keyobj);
7346 if (o == NULL) return NULL;
7347
7348 if (fieldlen > 0) {
7349 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7350
7351 /* Retrieve value from hash by the field name. This operation
7352 * already increases the refcount of the returned object. */
7353 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7354 o = hashTypeGet(o, &fieldobj);
7355 } else {
7356 if (o->type != REDIS_STRING) return NULL;
7357
7358 /* Every object that this function returns needs to have its refcount
7359 * increased. sortCommand decreases it again. */
7360 incrRefCount(o);
7361 }
7362
7363 return o;
7364 }
7365
7366 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7367 * the additional parameter is not standard but a BSD-specific we have to
7368 * pass sorting parameters via the global 'server' structure */
7369 static int sortCompare(const void *s1, const void *s2) {
7370 const redisSortObject *so1 = s1, *so2 = s2;
7371 int cmp;
7372
7373 if (!server.sort_alpha) {
7374 /* Numeric sorting. Here it's trivial as we precomputed scores */
7375 if (so1->u.score > so2->u.score) {
7376 cmp = 1;
7377 } else if (so1->u.score < so2->u.score) {
7378 cmp = -1;
7379 } else {
7380 cmp = 0;
7381 }
7382 } else {
7383 /* Alphanumeric sorting */
7384 if (server.sort_bypattern) {
7385 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7386 /* At least one compare object is NULL */
7387 if (so1->u.cmpobj == so2->u.cmpobj)
7388 cmp = 0;
7389 else if (so1->u.cmpobj == NULL)
7390 cmp = -1;
7391 else
7392 cmp = 1;
7393 } else {
7394 /* We have both the objects, use strcoll */
7395 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7396 }
7397 } else {
7398 /* Compare elements directly. */
7399 cmp = compareStringObjects(so1->obj,so2->obj);
7400 }
7401 }
7402 return server.sort_desc ? -cmp : cmp;
7403 }
7404
7405 /* The SORT command is the most complex command in Redis. Warning: this code
7406 * is optimized for speed and a bit less for readability */
7407 static void sortCommand(redisClient *c) {
7408 list *operations;
7409 unsigned int outputlen = 0;
7410 int desc = 0, alpha = 0;
7411 int limit_start = 0, limit_count = -1, start, end;
7412 int j, dontsort = 0, vectorlen;
7413 int getop = 0; /* GET operation counter */
7414 robj *sortval, *sortby = NULL, *storekey = NULL;
7415 redisSortObject *vector; /* Resulting vector to sort */
7416
7417 /* Lookup the key to sort. It must be of the right types */
7418 sortval = lookupKeyRead(c->db,c->argv[1]);
7419 if (sortval == NULL) {
7420 addReply(c,shared.emptymultibulk);
7421 return;
7422 }
7423 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7424 sortval->type != REDIS_ZSET)
7425 {
7426 addReply(c,shared.wrongtypeerr);
7427 return;
7428 }
7429
7430 /* Create a list of operations to perform for every sorted element.
7431 * Operations can be GET/DEL/INCR/DECR */
7432 operations = listCreate();
7433 listSetFreeMethod(operations,zfree);
7434 j = 2;
7435
7436 /* Now we need to protect sortval incrementing its count, in the future
7437 * SORT may have options able to overwrite/delete keys during the sorting
7438 * and the sorted key itself may get destroyed */
7439 incrRefCount(sortval);
7440
7441 /* The SORT command has an SQL-alike syntax, parse it */
7442 while(j < c->argc) {
7443 int leftargs = c->argc-j-1;
7444 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7445 desc = 0;
7446 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7447 desc = 1;
7448 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7449 alpha = 1;
7450 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7451 limit_start = atoi(c->argv[j+1]->ptr);
7452 limit_count = atoi(c->argv[j+2]->ptr);
7453 j+=2;
7454 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7455 storekey = c->argv[j+1];
7456 j++;
7457 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7458 sortby = c->argv[j+1];
7459 /* If the BY pattern does not contain '*', i.e. it is constant,
7460 * we don't need to sort nor to lookup the weight keys. */
7461 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7462 j++;
7463 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7464 listAddNodeTail(operations,createSortOperation(
7465 REDIS_SORT_GET,c->argv[j+1]));
7466 getop++;
7467 j++;
7468 } else {
7469 decrRefCount(sortval);
7470 listRelease(operations);
7471 addReply(c,shared.syntaxerr);
7472 return;
7473 }
7474 j++;
7475 }
7476
7477 /* Load the sorting vector with all the objects to sort */
7478 switch(sortval->type) {
7479 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
7480 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7481 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7482 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7483 }
7484 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7485 j = 0;
7486
7487 if (sortval->type == REDIS_LIST) {
7488 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7489 listTypeEntry entry;
7490 while(listTypeNext(li,&entry)) {
7491 vector[j].obj = listTypeGet(&entry);
7492 vector[j].u.score = 0;
7493 vector[j].u.cmpobj = NULL;
7494 j++;
7495 }
7496 listTypeReleaseIterator(li);
7497 } else {
7498 dict *set;
7499 dictIterator *di;
7500 dictEntry *setele;
7501
7502 if (sortval->type == REDIS_SET) {
7503 set = sortval->ptr;
7504 } else {
7505 zset *zs = sortval->ptr;
7506 set = zs->dict;
7507 }
7508
7509 di = dictGetIterator(set);
7510 while((setele = dictNext(di)) != NULL) {
7511 vector[j].obj = dictGetEntryKey(setele);
7512 vector[j].u.score = 0;
7513 vector[j].u.cmpobj = NULL;
7514 j++;
7515 }
7516 dictReleaseIterator(di);
7517 }
7518 redisAssert(j == vectorlen);
7519
7520 /* Now it's time to load the right scores in the sorting vector */
7521 if (dontsort == 0) {
7522 for (j = 0; j < vectorlen; j++) {
7523 robj *byval;
7524 if (sortby) {
7525 /* lookup value to sort by */
7526 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7527 if (!byval) continue;
7528 } else {
7529 /* use object itself to sort by */
7530 byval = vector[j].obj;
7531 }
7532
7533 if (alpha) {
7534 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7535 } else {
7536 if (byval->encoding == REDIS_ENCODING_RAW) {
7537 vector[j].u.score = strtod(byval->ptr,NULL);
7538 } else if (byval->encoding == REDIS_ENCODING_INT) {
7539 /* Don't need to decode the object if it's
7540 * integer-encoded (the only encoding supported) so
7541 * far. We can just cast it */
7542 vector[j].u.score = (long)byval->ptr;
7543 } else {
7544 redisAssert(1 != 1);
7545 }
7546 }
7547
7548 /* when the object was retrieved using lookupKeyByPattern,
7549 * its refcount needs to be decreased. */
7550 if (sortby) {
7551 decrRefCount(byval);
7552 }
7553 }
7554 }
7555
7556 /* We are ready to sort the vector... perform a bit of sanity check
7557 * on the LIMIT option too. We'll use a partial version of quicksort. */
7558 start = (limit_start < 0) ? 0 : limit_start;
7559 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7560 if (start >= vectorlen) {
7561 start = vectorlen-1;
7562 end = vectorlen-2;
7563 }
7564 if (end >= vectorlen) end = vectorlen-1;
7565
7566 if (dontsort == 0) {
7567 server.sort_desc = desc;
7568 server.sort_alpha = alpha;
7569 server.sort_bypattern = sortby ? 1 : 0;
7570 if (sortby && (start != 0 || end != vectorlen-1))
7571 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7572 else
7573 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7574 }
7575
7576 /* Send command output to the output buffer, performing the specified
7577 * GET/DEL/INCR/DECR operations if any. */
7578 outputlen = getop ? getop*(end-start+1) : end-start+1;
7579 if (storekey == NULL) {
7580 /* STORE option not specified, sent the sorting result to client */
7581 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7582 for (j = start; j <= end; j++) {
7583 listNode *ln;
7584 listIter li;
7585
7586 if (!getop) addReplyBulk(c,vector[j].obj);
7587 listRewind(operations,&li);
7588 while((ln = listNext(&li))) {
7589 redisSortOperation *sop = ln->value;
7590 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7591 vector[j].obj);
7592
7593 if (sop->type == REDIS_SORT_GET) {
7594 if (!val) {
7595 addReply(c,shared.nullbulk);
7596 } else {
7597 addReplyBulk(c,val);
7598 decrRefCount(val);
7599 }
7600 } else {
7601 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7602 }
7603 }
7604 }
7605 } else {
7606 robj *sobj = createZiplistObject();
7607
7608 /* STORE option specified, set the sorting result as a List object */
7609 for (j = start; j <= end; j++) {
7610 listNode *ln;
7611 listIter li;
7612
7613 if (!getop) {
7614 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
7615 } else {
7616 listRewind(operations,&li);
7617 while((ln = listNext(&li))) {
7618 redisSortOperation *sop = ln->value;
7619 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7620 vector[j].obj);
7621
7622 if (sop->type == REDIS_SORT_GET) {
7623 if (!val) val = createStringObject("",0);
7624
7625 /* listTypePush does an incrRefCount, so we should take care
7626 * care of the incremented refcount caused by either
7627 * lookupKeyByPattern or createStringObject("",0) */
7628 listTypePush(sobj,val,REDIS_TAIL);
7629 decrRefCount(val);
7630 } else {
7631 /* always fails */
7632 redisAssert(sop->type == REDIS_SORT_GET);
7633 }
7634 }
7635 }
7636 }
7637 dbReplace(c->db,storekey,sobj);
7638 /* Note: we add 1 because the DB is dirty anyway since even if the
7639 * SORT result is empty a new key is set and maybe the old content
7640 * replaced. */
7641 server.dirty += 1+outputlen;
7642 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7643 }
7644
7645 /* Cleanup */
7646 if (sortval->type == REDIS_LIST)
7647 for (j = 0; j < vectorlen; j++)
7648 decrRefCount(vector[j].obj);
7649 decrRefCount(sortval);
7650 listRelease(operations);
7651 for (j = 0; j < vectorlen; j++) {
7652 if (alpha && vector[j].u.cmpobj)
7653 decrRefCount(vector[j].u.cmpobj);
7654 }
7655 zfree(vector);
7656 }
7657
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are scaled by powers of 1024 and printed with two decimals and
 * a K/M/G/T/P suffix.
 *
 * The function always writes a NUL terminated string into 's', whatever
 * the value of 'n' is. No length check is performed on 's': the caller
 * must provide enough room (the longest output is the 20 digits of the
 * biggest unsigned long long, plus the suffix and the terminator). */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the suffixes we know about: fall back to plain
         * bytes so that the output buffer is never left untouched. */
        sprintf(s,"%lluB",n);
    }
}
7678
7679 /* Create the string returned by the INFO command. This is decoupled
7680 * by the INFO command itself as we need to report the same information
7681 * on memory corruption problems. */
7682 static sds genRedisInfoString(void) {
7683 sds info;
7684 time_t uptime = time(NULL)-server.stat_starttime;
7685 int j;
7686 char hmem[64];
7687
7688 bytesToHuman(hmem,zmalloc_used_memory());
7689 info = sdscatprintf(sdsempty(),
7690 "redis_version:%s\r\n"
7691 "redis_git_sha1:%s\r\n"
7692 "redis_git_dirty:%d\r\n"
7693 "arch_bits:%s\r\n"
7694 "multiplexing_api:%s\r\n"
7695 "process_id:%ld\r\n"
7696 "uptime_in_seconds:%ld\r\n"
7697 "uptime_in_days:%ld\r\n"
7698 "connected_clients:%d\r\n"
7699 "connected_slaves:%d\r\n"
7700 "blocked_clients:%d\r\n"
7701 "used_memory:%zu\r\n"
7702 "used_memory_human:%s\r\n"
7703 "changes_since_last_save:%lld\r\n"
7704 "bgsave_in_progress:%d\r\n"
7705 "last_save_time:%ld\r\n"
7706 "bgrewriteaof_in_progress:%d\r\n"
7707 "total_connections_received:%lld\r\n"
7708 "total_commands_processed:%lld\r\n"
7709 "expired_keys:%lld\r\n"
7710 "hash_max_zipmap_entries:%zu\r\n"
7711 "hash_max_zipmap_value:%zu\r\n"
7712 "pubsub_channels:%ld\r\n"
7713 "pubsub_patterns:%u\r\n"
7714 "vm_enabled:%d\r\n"
7715 "role:%s\r\n"
7716 ,REDIS_VERSION,
7717 REDIS_GIT_SHA1,
7718 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7719 (sizeof(long) == 8) ? "64" : "32",
7720 aeGetApiName(),
7721 (long) getpid(),
7722 uptime,
7723 uptime/(3600*24),
7724 listLength(server.clients)-listLength(server.slaves),
7725 listLength(server.slaves),
7726 server.blpop_blocked_clients,
7727 zmalloc_used_memory(),
7728 hmem,
7729 server.dirty,
7730 server.bgsavechildpid != -1,
7731 server.lastsave,
7732 server.bgrewritechildpid != -1,
7733 server.stat_numconnections,
7734 server.stat_numcommands,
7735 server.stat_expiredkeys,
7736 server.hash_max_zipmap_entries,
7737 server.hash_max_zipmap_value,
7738 dictSize(server.pubsub_channels),
7739 listLength(server.pubsub_patterns),
7740 server.vm_enabled != 0,
7741 server.masterhost == NULL ? "master" : "slave"
7742 );
7743 if (server.masterhost) {
7744 info = sdscatprintf(info,
7745 "master_host:%s\r\n"
7746 "master_port:%d\r\n"
7747 "master_link_status:%s\r\n"
7748 "master_last_io_seconds_ago:%d\r\n"
7749 ,server.masterhost,
7750 server.masterport,
7751 (server.replstate == REDIS_REPL_CONNECTED) ?
7752 "up" : "down",
7753 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7754 );
7755 }
7756 if (server.vm_enabled) {
7757 lockThreadedIO();
7758 info = sdscatprintf(info,
7759 "vm_conf_max_memory:%llu\r\n"
7760 "vm_conf_page_size:%llu\r\n"
7761 "vm_conf_pages:%llu\r\n"
7762 "vm_stats_used_pages:%llu\r\n"
7763 "vm_stats_swapped_objects:%llu\r\n"
7764 "vm_stats_swappin_count:%llu\r\n"
7765 "vm_stats_swappout_count:%llu\r\n"
7766 "vm_stats_io_newjobs_len:%lu\r\n"
7767 "vm_stats_io_processing_len:%lu\r\n"
7768 "vm_stats_io_processed_len:%lu\r\n"
7769 "vm_stats_io_active_threads:%lu\r\n"
7770 "vm_stats_blocked_clients:%lu\r\n"
7771 ,(unsigned long long) server.vm_max_memory,
7772 (unsigned long long) server.vm_page_size,
7773 (unsigned long long) server.vm_pages,
7774 (unsigned long long) server.vm_stats_used_pages,
7775 (unsigned long long) server.vm_stats_swapped_objects,
7776 (unsigned long long) server.vm_stats_swapins,
7777 (unsigned long long) server.vm_stats_swapouts,
7778 (unsigned long) listLength(server.io_newjobs),
7779 (unsigned long) listLength(server.io_processing),
7780 (unsigned long) listLength(server.io_processed),
7781 (unsigned long) server.io_active_threads,
7782 (unsigned long) server.vm_blocked_clients
7783 );
7784 unlockThreadedIO();
7785 }
7786 for (j = 0; j < server.dbnum; j++) {
7787 long long keys, vkeys;
7788
7789 keys = dictSize(server.db[j].dict);
7790 vkeys = dictSize(server.db[j].expires);
7791 if (keys || vkeys) {
7792 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7793 j, keys, vkeys);
7794 }
7795 }
7796 return info;
7797 }
7798
7799 static void infoCommand(redisClient *c) {
7800 sds info = genRedisInfoString();
7801 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7802 (unsigned long)sdslen(info)));
7803 addReplySds(c,info);
7804 addReply(c,shared.crlf);
7805 }
7806
7807 static void monitorCommand(redisClient *c) {
7808 /* ignore MONITOR if aleady slave or in monitor mode */
7809 if (c->flags & REDIS_SLAVE) return;
7810
7811 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7812 c->slaveseldb = 0;
7813 listAddNodeTail(server.monitors,c);
7814 addReply(c,shared.ok);
7815 }
7816
7817 /* ================================= Expire ================================= */
7818 static int removeExpire(redisDb *db, robj *key) {
7819 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7820 return 1;
7821 } else {
7822 return 0;
7823 }
7824 }
7825
7826 static int setExpire(redisDb *db, robj *key, time_t when) {
7827 sds copy = sdsdup(key->ptr);
7828 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7829 sdsfree(copy);
7830 return 0;
7831 } else {
7832 return 1;
7833 }
7834 }
7835
7836 /* Return the expire time of the specified key, or -1 if no expire
7837 * is associated with this key (i.e. the key is non volatile) */
7838 static time_t getExpire(redisDb *db, robj *key) {
7839 dictEntry *de;
7840
7841 /* No expire? return ASAP */
7842 if (dictSize(db->expires) == 0 ||
7843 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7844
7845 return (time_t) dictGetEntryVal(de);
7846 }
7847
7848 static int expireIfNeeded(redisDb *db, robj *key) {
7849 time_t when;
7850 dictEntry *de;
7851
7852 /* No expire? return ASAP */
7853 if (dictSize(db->expires) == 0 ||
7854 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7855
7856 /* Lookup the expire */
7857 when = (time_t) dictGetEntryVal(de);
7858 if (time(NULL) <= when) return 0;
7859
7860 /* Delete the key */
7861 dbDelete(db,key);
7862 server.stat_expiredkeys++;
7863 return 1;
7864 }
7865
7866 static int deleteIfVolatile(redisDb *db, robj *key) {
7867 dictEntry *de;
7868
7869 /* No expire? return ASAP */
7870 if (dictSize(db->expires) == 0 ||
7871 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7872
7873 /* Delete the key */
7874 server.dirty++;
7875 server.stat_expiredkeys++;
7876 dictDelete(db->expires,key->ptr);
7877 return dictDelete(db->dict,key->ptr) == DICT_OK;
7878 }
7879
7880 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7881 dictEntry *de;
7882 time_t seconds;
7883
7884 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7885
7886 seconds -= offset;
7887
7888 de = dictFind(c->db->dict,key->ptr);
7889 if (de == NULL) {
7890 addReply(c,shared.czero);
7891 return;
7892 }
7893 if (seconds <= 0) {
7894 if (dbDelete(c->db,key)) server.dirty++;
7895 addReply(c, shared.cone);
7896 return;
7897 } else {
7898 time_t when = time(NULL)+seconds;
7899 if (setExpire(c->db,key,when)) {
7900 addReply(c,shared.cone);
7901 server.dirty++;
7902 } else {
7903 addReply(c,shared.czero);
7904 }
7905 return;
7906 }
7907 }
7908
7909 static void expireCommand(redisClient *c) {
7910 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7911 }
7912
7913 static void expireatCommand(redisClient *c) {
7914 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7915 }
7916
7917 static void ttlCommand(redisClient *c) {
7918 time_t expire;
7919 int ttl = -1;
7920
7921 expire = getExpire(c->db,c->argv[1]);
7922 if (expire != -1) {
7923 ttl = (int) (expire-time(NULL));
7924 if (ttl < 0) ttl = -1;
7925 }
7926 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7927 }
7928
7929 /* ================================ MULTI/EXEC ============================== */
7930
7931 /* Client state initialization for MULTI/EXEC */
7932 static void initClientMultiState(redisClient *c) {
7933 c->mstate.commands = NULL;
7934 c->mstate.count = 0;
7935 }
7936
7937 /* Release all the resources associated with MULTI/EXEC state */
7938 static void freeClientMultiState(redisClient *c) {
7939 int j;
7940
7941 for (j = 0; j < c->mstate.count; j++) {
7942 int i;
7943 multiCmd *mc = c->mstate.commands+j;
7944
7945 for (i = 0; i < mc->argc; i++)
7946 decrRefCount(mc->argv[i]);
7947 zfree(mc->argv);
7948 }
7949 zfree(c->mstate.commands);
7950 }
7951
7952 /* Add a new command into the MULTI commands queue */
7953 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7954 multiCmd *mc;
7955 int j;
7956
7957 c->mstate.commands = zrealloc(c->mstate.commands,
7958 sizeof(multiCmd)*(c->mstate.count+1));
7959 mc = c->mstate.commands+c->mstate.count;
7960 mc->cmd = cmd;
7961 mc->argc = c->argc;
7962 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7963 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7964 for (j = 0; j < c->argc; j++)
7965 incrRefCount(mc->argv[j]);
7966 c->mstate.count++;
7967 }
7968
7969 static void multiCommand(redisClient *c) {
7970 if (c->flags & REDIS_MULTI) {
7971 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7972 return;
7973 }
7974 c->flags |= REDIS_MULTI;
7975 addReply(c,shared.ok);
7976 }
7977
7978 static void discardCommand(redisClient *c) {
7979 if (!(c->flags & REDIS_MULTI)) {
7980 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7981 return;
7982 }
7983
7984 freeClientMultiState(c);
7985 initClientMultiState(c);
7986 c->flags &= (~REDIS_MULTI);
7987 unwatchAllKeys(c);
7988 addReply(c,shared.ok);
7989 }
7990
7991 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7992 * implememntation for more information. */
7993 static void execCommandReplicateMulti(redisClient *c) {
7994 struct redisCommand *cmd;
7995 robj *multistring = createStringObject("MULTI",5);
7996
7997 cmd = lookupCommand("multi");
7998 if (server.appendonly)
7999 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8000 if (listLength(server.slaves))
8001 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8002 decrRefCount(multistring);
8003 }
8004
8005 static void execCommand(redisClient *c) {
8006 int j;
8007 robj **orig_argv;
8008 int orig_argc;
8009
8010 if (!(c->flags & REDIS_MULTI)) {
8011 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8012 return;
8013 }
8014
8015 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8016 * A failed EXEC will return a multi bulk nil object. */
8017 if (c->flags & REDIS_DIRTY_CAS) {
8018 freeClientMultiState(c);
8019 initClientMultiState(c);
8020 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8021 unwatchAllKeys(c);
8022 addReply(c,shared.nullmultibulk);
8023 return;
8024 }
8025
8026 /* Replicate a MULTI request now that we are sure the block is executed.
8027 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8028 * both the AOF and the replication link will have the same consistency
8029 * and atomicity guarantees. */
8030 execCommandReplicateMulti(c);
8031
8032 /* Exec all the queued commands */
8033 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8034 orig_argv = c->argv;
8035 orig_argc = c->argc;
8036 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8037 for (j = 0; j < c->mstate.count; j++) {
8038 c->argc = c->mstate.commands[j].argc;
8039 c->argv = c->mstate.commands[j].argv;
8040 call(c,c->mstate.commands[j].cmd);
8041 }
8042 c->argv = orig_argv;
8043 c->argc = orig_argc;
8044 freeClientMultiState(c);
8045 initClientMultiState(c);
8046 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8047 /* Make sure the EXEC command is always replicated / AOF, since we
8048 * always send the MULTI command (we can't know beforehand if the
8049 * next operations will contain at least a modification to the DB). */
8050 server.dirty++;
8051 }
8052
8053 /* =========================== Blocking Operations ========================= */
8054
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already
 * be used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8083
8084 /* Set a client in blocking mode for the specified key, with the specified
8085 * timeout */
8086 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
8087 dictEntry *de;
8088 list *l;
8089 int j;
8090
8091 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8092 c->blocking_keys_num = numkeys;
8093 c->blockingto = timeout;
8094 for (j = 0; j < numkeys; j++) {
8095 /* Add the key in the client structure, to map clients -> keys */
8096 c->blocking_keys[j] = keys[j];
8097 incrRefCount(keys[j]);
8098
8099 /* And in the other "side", to map keys -> clients */
8100 de = dictFind(c->db->blocking_keys,keys[j]);
8101 if (de == NULL) {
8102 int retval;
8103
8104 /* For every key we take a list of clients blocked for it */
8105 l = listCreate();
8106 retval = dictAdd(c->db->blocking_keys,keys[j],l);
8107 incrRefCount(keys[j]);
8108 assert(retval == DICT_OK);
8109 } else {
8110 l = dictGetEntryVal(de);
8111 }
8112 listAddNodeTail(l,c);
8113 }
8114 /* Mark the client as a blocked client */
8115 c->flags |= REDIS_BLOCKED;
8116 server.blpop_blocked_clients++;
8117 }
8118
8119 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8120 static void unblockClientWaitingData(redisClient *c) {
8121 dictEntry *de;
8122 list *l;
8123 int j;
8124
8125 assert(c->blocking_keys != NULL);
8126 /* The client may wait for multiple keys, so unblock it for every key. */
8127 for (j = 0; j < c->blocking_keys_num; j++) {
8128 /* Remove this client from the list of clients waiting for this key. */
8129 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
8130 assert(de != NULL);
8131 l = dictGetEntryVal(de);
8132 listDelNode(l,listSearchKey(l,c));
8133 /* If the list is empty we need to remove it to avoid wasting memory */
8134 if (listLength(l) == 0)
8135 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8136 decrRefCount(c->blocking_keys[j]);
8137 }
8138 /* Cleanup the client structure */
8139 zfree(c->blocking_keys);
8140 c->blocking_keys = NULL;
8141 c->flags &= (~REDIS_BLOCKED);
8142 server.blpop_blocked_clients--;
8143 /* We want to process data if there is some command waiting
8144 * in the input buffer. Note that this is safe even if
8145 * unblockClientWaitingData() gets called from freeClient() because
8146 * freeClient() will be smart enough to call this function
8147 * *after* c->querybuf was set to NULL. */
8148 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8149 }
8150
8151 /* This should be called from any function PUSHing into lists.
8152 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8153 * 'ele' is the element pushed.
8154 *
8155 * If the function returns 0 there was no client waiting for a list push
8156 * against this key.
8157 *
8158 * If the function returns 1 there was a client waiting for a list push
8159 * against this key, the element was passed to this client thus it's not
8160 * needed to actually add it to the list and the caller should return asap. */
8161 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8162 struct dictEntry *de;
8163 redisClient *receiver;
8164 list *l;
8165 listNode *ln;
8166
8167 de = dictFind(c->db->blocking_keys,key);
8168 if (de == NULL) return 0;
8169 l = dictGetEntryVal(de);
8170 ln = listFirst(l);
8171 assert(ln != NULL);
8172 receiver = ln->value;
8173
8174 addReplySds(receiver,sdsnew("*2\r\n"));
8175 addReplyBulk(receiver,key);
8176 addReplyBulk(receiver,ele);
8177 unblockClientWaitingData(receiver);
8178 return 1;
8179 }
8180
8181 /* Blocking RPOP/LPOP */
8182 static void blockingPopGenericCommand(redisClient *c, int where) {
8183 robj *o;
8184 time_t timeout;
8185 int j;
8186
8187 for (j = 1; j < c->argc-1; j++) {
8188 o = lookupKeyWrite(c->db,c->argv[j]);
8189 if (o != NULL) {
8190 if (o->type != REDIS_LIST) {
8191 addReply(c,shared.wrongtypeerr);
8192 return;
8193 } else {
8194 list *list = o->ptr;
8195 if (listLength(list) != 0) {
8196 /* If the list contains elements fall back to the usual
8197 * non-blocking POP operation */
8198 robj *argv[2], **orig_argv;
8199 int orig_argc;
8200
8201 /* We need to alter the command arguments before to call
8202 * popGenericCommand() as the command takes a single key. */
8203 orig_argv = c->argv;
8204 orig_argc = c->argc;
8205 argv[1] = c->argv[j];
8206 c->argv = argv;
8207 c->argc = 2;
8208
8209 /* Also the return value is different, we need to output
8210 * the multi bulk reply header and the key name. The
8211 * "real" command will add the last element (the value)
8212 * for us. If this souds like an hack to you it's just
8213 * because it is... */
8214 addReplySds(c,sdsnew("*2\r\n"));
8215 addReplyBulk(c,argv[1]);
8216 popGenericCommand(c,where);
8217
8218 /* Fix the client structure with the original stuff */
8219 c->argv = orig_argv;
8220 c->argc = orig_argc;
8221 return;
8222 }
8223 }
8224 }
8225 }
8226 /* If the list is empty or the key does not exists we must block */
8227 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8228 if (timeout > 0) timeout += time(NULL);
8229 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8230 }
8231
8232 static void blpopCommand(redisClient *c) {
8233 blockingPopGenericCommand(c,REDIS_HEAD);
8234 }
8235
8236 static void brpopCommand(redisClient *c) {
8237 blockingPopGenericCommand(c,REDIS_TAIL);
8238 }
8239
8240 /* =============================== Replication ============================= */
8241
8242 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8243 ssize_t nwritten, ret = size;
8244 time_t start = time(NULL);
8245
8246 timeout++;
8247 while(size) {
8248 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8249 nwritten = write(fd,ptr,size);
8250 if (nwritten == -1) return -1;
8251 ptr += nwritten;
8252 size -= nwritten;
8253 }
8254 if ((time(NULL)-start) > timeout) {
8255 errno = ETIMEDOUT;
8256 return -1;
8257 }
8258 }
8259 return ret;
8260 }
8261
8262 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8263 ssize_t nread, totread = 0;
8264 time_t start = time(NULL);
8265
8266 timeout++;
8267 while(size) {
8268 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8269 nread = read(fd,ptr,size);
8270 if (nread == -1) return -1;
8271 ptr += nread;
8272 size -= nread;
8273 totread += nread;
8274 }
8275 if ((time(NULL)-start) > timeout) {
8276 errno = ETIMEDOUT;
8277 return -1;
8278 }
8279 }
8280 return totread;
8281 }
8282
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one
 * character at a time using syncRead() with the given 'timeout'.
 *
 * The reading stops at the first '\n'. The newline is not stored, and a
 * '\r' immediately before it is replaced by the string terminator. The
 * reading also stops once 'size'-1 characters are stored, so the buffer
 * is never written past its end.
 *
 * Return the number of characters stored before the newline (a stripped
 * '\r' is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the string terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this the loop
         * would keep storing characters past the end of the buffer. */
        size--;
    }
    return nread;
}
8303
8304 static void syncCommand(redisClient *c) {
8305 /* ignore SYNC if aleady slave or in monitor mode */
8306 if (c->flags & REDIS_SLAVE) return;
8307
8308 /* SYNC can't be issued when the server has pending data to send to
8309 * the client about already issued commands. We need a fresh reply
8310 * buffer registering the differences between the BGSAVE and the current
8311 * dataset, so that we can copy to other slaves if needed. */
8312 if (listLength(c->reply) != 0) {
8313 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8314 return;
8315 }
8316
8317 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8318 /* Here we need to check if there is a background saving operation
8319 * in progress, or if it is required to start one */
8320 if (server.bgsavechildpid != -1) {
8321 /* Ok a background save is in progress. Let's check if it is a good
8322 * one for replication, i.e. if there is another slave that is
8323 * registering differences since the server forked to save */
8324 redisClient *slave;
8325 listNode *ln;
8326 listIter li;
8327
8328 listRewind(server.slaves,&li);
8329 while((ln = listNext(&li))) {
8330 slave = ln->value;
8331 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8332 }
8333 if (ln) {
8334 /* Perfect, the server is already registering differences for
8335 * another slave. Set the right state, and copy the buffer. */
8336 listRelease(c->reply);
8337 c->reply = listDup(slave->reply);
8338 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8339 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8340 } else {
8341 /* No way, we need to wait for the next BGSAVE in order to
8342 * register differences */
8343 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8344 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8345 }
8346 } else {
8347 /* Ok we don't have a BGSAVE in progress, let's start one */
8348 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8349 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8350 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8351 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8352 return;
8353 }
8354 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8355 }
8356 c->repldbfd = -1;
8357 c->flags |= REDIS_SLAVE;
8358 c->slaveseldb = 0;
8359 listAddNodeTail(server.slaves,c);
8360 return;
8361 }
8362
8363 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8364 redisClient *slave = privdata;
8365 REDIS_NOTUSED(el);
8366 REDIS_NOTUSED(mask);
8367 char buf[REDIS_IOBUF_LEN];
8368 ssize_t nwritten, buflen;
8369
8370 if (slave->repldboff == 0) {
8371 /* Write the bulk write count before to transfer the DB. In theory here
8372 * we don't know how much room there is in the output buffer of the
8373 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8374 * operations) will never be smaller than the few bytes we need. */
8375 sds bulkcount;
8376
8377 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8378 slave->repldbsize);
8379 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8380 {
8381 sdsfree(bulkcount);
8382 freeClient(slave);
8383 return;
8384 }
8385 sdsfree(bulkcount);
8386 }
8387 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8388 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8389 if (buflen <= 0) {
8390 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8391 (buflen == 0) ? "premature EOF" : strerror(errno));
8392 freeClient(slave);
8393 return;
8394 }
8395 if ((nwritten = write(fd,buf,buflen)) == -1) {
8396 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8397 strerror(errno));
8398 freeClient(slave);
8399 return;
8400 }
8401 slave->repldboff += nwritten;
8402 if (slave->repldboff == slave->repldbsize) {
8403 close(slave->repldbfd);
8404 slave->repldbfd = -1;
8405 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8406 slave->replstate = REDIS_REPL_ONLINE;
8407 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8408 sendReplyToClient, slave) == AE_ERR) {
8409 freeClient(slave);
8410 return;
8411 }
8412 addReplySds(slave,sdsempty());
8413 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8414 }
8415 }
8416
8417 /* This function is called at the end of every backgrond saving.
8418 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8419 * otherwise REDIS_ERR is passed to the function.
8420 *
8421 * The goal of this function is to handle slaves waiting for a successful
8422 * background saving in order to perform non-blocking synchronization. */
8423 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8424 listNode *ln;
8425 int startbgsave = 0;
8426 listIter li;
8427
8428 listRewind(server.slaves,&li);
8429 while((ln = listNext(&li))) {
8430 redisClient *slave = ln->value;
8431
8432 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8433 startbgsave = 1;
8434 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8435 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8436 struct redis_stat buf;
8437
8438 if (bgsaveerr != REDIS_OK) {
8439 freeClient(slave);
8440 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8441 continue;
8442 }
8443 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8444 redis_fstat(slave->repldbfd,&buf) == -1) {
8445 freeClient(slave);
8446 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8447 continue;
8448 }
8449 slave->repldboff = 0;
8450 slave->repldbsize = buf.st_size;
8451 slave->replstate = REDIS_REPL_SEND_BULK;
8452 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8453 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8454 freeClient(slave);
8455 continue;
8456 }
8457 }
8458 }
8459 if (startbgsave) {
8460 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8461 listIter li;
8462
8463 listRewind(server.slaves,&li);
8464 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8465 while((ln = listNext(&li))) {
8466 redisClient *slave = ln->value;
8467
8468 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8469 freeClient(slave);
8470 }
8471 }
8472 }
8473 }
8474
8475 static int syncWithMaster(void) {
8476 char buf[1024], tmpfile[256], authcmd[1024];
8477 long dumpsize;
8478 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8479 int dfd, maxtries = 5;
8480
8481 if (fd == -1) {
8482 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8483 strerror(errno));
8484 return REDIS_ERR;
8485 }
8486
8487 /* AUTH with the master if required. */
8488 if(server.masterauth) {
8489 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8490 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8491 close(fd);
8492 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8493 strerror(errno));
8494 return REDIS_ERR;
8495 }
8496 /* Read the AUTH result. */
8497 if (syncReadLine(fd,buf,1024,3600) == -1) {
8498 close(fd);
8499 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8500 strerror(errno));
8501 return REDIS_ERR;
8502 }
8503 if (buf[0] != '+') {
8504 close(fd);
8505 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8506 return REDIS_ERR;
8507 }
8508 }
8509
8510 /* Issue the SYNC command */
8511 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8512 close(fd);
8513 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8514 strerror(errno));
8515 return REDIS_ERR;
8516 }
8517 /* Read the bulk write count */
8518 if (syncReadLine(fd,buf,1024,3600) == -1) {
8519 close(fd);
8520 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8521 strerror(errno));
8522 return REDIS_ERR;
8523 }
8524 if (buf[0] != '$') {
8525 close(fd);
8526 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8527 return REDIS_ERR;
8528 }
8529 dumpsize = strtol(buf+1,NULL,10);
8530 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8531 /* Read the bulk write data on a temp file */
8532 while(maxtries--) {
8533 snprintf(tmpfile,256,
8534 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8535 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8536 if (dfd != -1) break;
8537 sleep(1);
8538 }
8539 if (dfd == -1) {
8540 close(fd);
8541 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8542 return REDIS_ERR;
8543 }
8544 while(dumpsize) {
8545 int nread, nwritten;
8546
8547 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8548 if (nread == -1) {
8549 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8550 strerror(errno));
8551 close(fd);
8552 close(dfd);
8553 return REDIS_ERR;
8554 }
8555 nwritten = write(dfd,buf,nread);
8556 if (nwritten == -1) {
8557 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8558 close(fd);
8559 close(dfd);
8560 return REDIS_ERR;
8561 }
8562 dumpsize -= nread;
8563 }
8564 close(dfd);
8565 if (rename(tmpfile,server.dbfilename) == -1) {
8566 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8567 unlink(tmpfile);
8568 close(fd);
8569 return REDIS_ERR;
8570 }
8571 emptyDb();
8572 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8573 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8574 close(fd);
8575 return REDIS_ERR;
8576 }
8577 server.master = createClient(fd);
8578 server.master->flags |= REDIS_MASTER;
8579 server.master->authenticated = 1;
8580 server.replstate = REDIS_REPL_CONNECTED;
8581 return REDIS_OK;
8582 }
8583
8584 static void slaveofCommand(redisClient *c) {
8585 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8586 !strcasecmp(c->argv[2]->ptr,"one")) {
8587 if (server.masterhost) {
8588 sdsfree(server.masterhost);
8589 server.masterhost = NULL;
8590 if (server.master) freeClient(server.master);
8591 server.replstate = REDIS_REPL_NONE;
8592 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8593 }
8594 } else {
8595 sdsfree(server.masterhost);
8596 server.masterhost = sdsdup(c->argv[1]->ptr);
8597 server.masterport = atoi(c->argv[2]->ptr);
8598 if (server.master) freeClient(server.master);
8599 server.replstate = REDIS_REPL_CONNECT;
8600 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8601 server.masterhost, server.masterport);
8602 }
8603 addReply(c,shared.ok);
8604 }
8605
8606 /* ============================ Maxmemory directive ======================== */
8607
8608 /* Try to free one object form the pre-allocated objects free list.
8609 * This is useful under low mem conditions as by default we take 1 million
8610 * free objects allocated. On success REDIS_OK is returned, otherwise
8611 * REDIS_ERR. */
8612 static int tryFreeOneObjectFromFreelist(void) {
8613 robj *o;
8614
8615 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8616 if (listLength(server.objfreelist)) {
8617 listNode *head = listFirst(server.objfreelist);
8618 o = listNodeValue(head);
8619 listDelNode(server.objfreelist,head);
8620 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8621 zfree(o);
8622 return REDIS_OK;
8623 } else {
8624 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8625 return REDIS_ERR;
8626 }
8627 }
8628
8629 /* This function gets called when 'maxmemory' is set on the config file to limit
8630 * the max memory used by the server, and we are out of memory.
8631 * This function will try to, in order:
8632 *
8633 * - Free objects from the free list
8634 * - Try to remove keys with an EXPIRE set
8635 *
8636 * It is not possible to free enough memory to reach used-memory < maxmemory
8637 * the server will start refusing commands that will enlarge even more the
8638 * memory usage.
8639 */
8640 static void freeMemoryIfNeeded(void) {
8641 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8642 int j, k, freed = 0;
8643
8644 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8645 for (j = 0; j < server.dbnum; j++) {
8646 int minttl = -1;
8647 robj *minkey = NULL;
8648 struct dictEntry *de;
8649
8650 if (dictSize(server.db[j].expires)) {
8651 freed = 1;
8652 /* From a sample of three keys drop the one nearest to
8653 * the natural expire */
8654 for (k = 0; k < 3; k++) {
8655 time_t t;
8656
8657 de = dictGetRandomKey(server.db[j].expires);
8658 t = (time_t) dictGetEntryVal(de);
8659 if (minttl == -1 || t < minttl) {
8660 minkey = dictGetEntryKey(de);
8661 minttl = t;
8662 }
8663 }
8664 dbDelete(server.db+j,minkey);
8665 }
8666 }
8667 if (!freed) return; /* nothing to free... */
8668 }
8669 }
8670
8671 /* ============================== Append Only file ========================== */
8672
8673 /* Called when the user switches from "appendonly yes" to "appendonly no"
8674 * at runtime using the CONFIG command. */
8675 static void stopAppendOnly(void) {
8676 flushAppendOnlyFile();
8677 aof_fsync(server.appendfd);
8678 close(server.appendfd);
8679
8680 server.appendfd = -1;
8681 server.appendseldb = -1;
8682 server.appendonly = 0;
8683 /* rewrite operation in progress? kill it, wait child exit */
8684 if (server.bgsavechildpid != -1) {
8685 int statloc;
8686
8687 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8688 wait3(&statloc,0,NULL);
8689 /* reset the buffer accumulating changes while the child saves */
8690 sdsfree(server.bgrewritebuf);
8691 server.bgrewritebuf = sdsempty();
8692 server.bgsavechildpid = -1;
8693 }
8694 }
8695
8696 /* Called when the user switches from "appendonly no" to "appendonly yes"
8697 * at runtime using the CONFIG command. */
8698 static int startAppendOnly(void) {
8699 server.appendonly = 1;
8700 server.lastfsync = time(NULL);
8701 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8702 if (server.appendfd == -1) {
8703 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8704 return REDIS_ERR;
8705 }
8706 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8707 server.appendonly = 0;
8708 close(server.appendfd);
8709 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8710 return REDIS_ERR;
8711 }
8712 return REDIS_OK;
8713 }
8714
8715 /* Write the append only file buffer on disk.
8716 *
8717 * Since we are required to write the AOF before replying to the client,
8718 * and the only way the client socket can get a write is entering when the
8719 * the event loop, we accumulate all the AOF writes in a memory
8720 * buffer and write it on disk using this function just before entering
8721 * the event loop again. */
8722 static void flushAppendOnlyFile(void) {
8723 time_t now;
8724 ssize_t nwritten;
8725
8726 if (sdslen(server.aofbuf) == 0) return;
8727
8728 /* We want to perform a single write. This should be guaranteed atomic
8729 * at least if the filesystem we are writing is a real physical one.
8730 * While this will save us against the server being killed I don't think
8731 * there is much to do about the whole server stopping for power problems
8732 * or alike */
8733 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8734 if (nwritten != (signed)sdslen(server.aofbuf)) {
8735 /* Ooops, we are in troubles. The best thing to do for now is
8736 * aborting instead of giving the illusion that everything is
8737 * working as expected. */
8738 if (nwritten == -1) {
8739 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8740 } else {
8741 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8742 }
8743 exit(1);
8744 }
8745 sdsfree(server.aofbuf);
8746 server.aofbuf = sdsempty();
8747
8748 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8749 * childs performing heavy I/O on disk. */
8750 if (server.no_appendfsync_on_rewrite &&
8751 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8752 return;
8753 /* Fsync if needed */
8754 now = time(NULL);
8755 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8756 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8757 now-server.lastfsync > 1))
8758 {
8759 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8760 * flushing metadata. */
8761 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8762 server.lastfsync = now;
8763 }
8764 }
8765
8766 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8767 int j;
8768 buf = sdscatprintf(buf,"*%d\r\n",argc);
8769 for (j = 0; j < argc; j++) {
8770 robj *o = getDecodedObject(argv[j]);
8771 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8772 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8773 buf = sdscatlen(buf,"\r\n",2);
8774 decrRefCount(o);
8775 }
8776 return buf;
8777 }
8778
8779 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8780 int argc = 3;
8781 long when;
8782 robj *argv[3];
8783
8784 /* Make sure we can use strtol */
8785 seconds = getDecodedObject(seconds);
8786 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8787 decrRefCount(seconds);
8788
8789 argv[0] = createStringObject("EXPIREAT",8);
8790 argv[1] = key;
8791 argv[2] = createObject(REDIS_STRING,
8792 sdscatprintf(sdsempty(),"%ld",when));
8793 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8794 decrRefCount(argv[0]);
8795 decrRefCount(argv[2]);
8796 return buf;
8797 }
8798
8799 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8800 sds buf = sdsempty();
8801 robj *tmpargv[3];
8802
8803 /* The DB this command was targetting is not the same as the last command
8804 * we appendend. To issue a SELECT command is needed. */
8805 if (dictid != server.appendseldb) {
8806 char seldb[64];
8807
8808 snprintf(seldb,sizeof(seldb),"%d",dictid);
8809 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8810 (unsigned long)strlen(seldb),seldb);
8811 server.appendseldb = dictid;
8812 }
8813
8814 if (cmd->proc == expireCommand) {
8815 /* Translate EXPIRE into EXPIREAT */
8816 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8817 } else if (cmd->proc == setexCommand) {
8818 /* Translate SETEX to SET and EXPIREAT */
8819 tmpargv[0] = createStringObject("SET",3);
8820 tmpargv[1] = argv[1];
8821 tmpargv[2] = argv[3];
8822 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8823 decrRefCount(tmpargv[0]);
8824 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8825 } else {
8826 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8827 }
8828
8829 /* Append to the AOF buffer. This will be flushed on disk just before
8830 * of re-entering the event loop, so before the client will get a
8831 * positive reply about the operation performed. */
8832 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8833
8834 /* If a background append only file rewriting is in progress we want to
8835 * accumulate the differences between the child DB and the current one
8836 * in a buffer, so that when the child process will do its work we
8837 * can append the differences to the new append only file. */
8838 if (server.bgrewritechildpid != -1)
8839 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8840
8841 sdsfree(buf);
8842 }
8843
8844 /* In Redis commands are always executed in the context of a client, so in
8845 * order to load the append only file we need to create a fake client. */
8846 static struct redisClient *createFakeClient(void) {
8847 struct redisClient *c = zmalloc(sizeof(*c));
8848
8849 selectDb(c,0);
8850 c->fd = -1;
8851 c->querybuf = sdsempty();
8852 c->argc = 0;
8853 c->argv = NULL;
8854 c->flags = 0;
8855 /* We set the fake client as a slave waiting for the synchronization
8856 * so that Redis will not try to send replies to this client. */
8857 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8858 c->reply = listCreate();
8859 listSetFreeMethod(c->reply,decrRefCount);
8860 listSetDupMethod(c->reply,dupClientReplyValue);
8861 initClientMultiState(c);
8862 return c;
8863 }
8864
8865 static void freeFakeClient(struct redisClient *c) {
8866 sdsfree(c->querybuf);
8867 listRelease(c->reply);
8868 freeClientMultiState(c);
8869 zfree(c);
8870 }
8871
8872 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8873 * error (the append only file is zero-length) REDIS_ERR is returned. On
8874 * fatal error an error message is logged and the program exists. */
8875 int loadAppendOnlyFile(char *filename) {
8876 struct redisClient *fakeClient;
8877 FILE *fp = fopen(filename,"r");
8878 struct redis_stat sb;
8879 int appendonly = server.appendonly;
8880
8881 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8882 return REDIS_ERR;
8883
8884 if (fp == NULL) {
8885 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8886 exit(1);
8887 }
8888
8889 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8890 * to the same file we're about to read. */
8891 server.appendonly = 0;
8892
8893 fakeClient = createFakeClient();
8894 while(1) {
8895 int argc, j;
8896 unsigned long len;
8897 robj **argv;
8898 char buf[128];
8899 sds argsds;
8900 struct redisCommand *cmd;
8901 int force_swapout;
8902
8903 if (fgets(buf,sizeof(buf),fp) == NULL) {
8904 if (feof(fp))
8905 break;
8906 else
8907 goto readerr;
8908 }
8909 if (buf[0] != '*') goto fmterr;
8910 argc = atoi(buf+1);
8911 argv = zmalloc(sizeof(robj*)*argc);
8912 for (j = 0; j < argc; j++) {
8913 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8914 if (buf[0] != '$') goto fmterr;
8915 len = strtol(buf+1,NULL,10);
8916 argsds = sdsnewlen(NULL,len);
8917 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8918 argv[j] = createObject(REDIS_STRING,argsds);
8919 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8920 }
8921
8922 /* Command lookup */
8923 cmd = lookupCommand(argv[0]->ptr);
8924 if (!cmd) {
8925 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8926 exit(1);
8927 }
8928 /* Try object encoding */
8929 if (cmd->flags & REDIS_CMD_BULK)
8930 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8931 /* Run the command in the context of a fake client */
8932 fakeClient->argc = argc;
8933 fakeClient->argv = argv;
8934 cmd->proc(fakeClient);
8935 /* Discard the reply objects list from the fake client */
8936 while(listLength(fakeClient->reply))
8937 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8938 /* Clean up, ready for the next command */
8939 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8940 zfree(argv);
8941 /* Handle swapping while loading big datasets when VM is on */
8942 force_swapout = 0;
8943 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8944 force_swapout = 1;
8945
8946 if (server.vm_enabled && force_swapout) {
8947 while (zmalloc_used_memory() > server.vm_max_memory) {
8948 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8949 }
8950 }
8951 }
8952
8953 /* This point can only be reached when EOF is reached without errors.
8954 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8955 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8956
8957 fclose(fp);
8958 freeFakeClient(fakeClient);
8959 server.appendonly = appendonly;
8960 return REDIS_OK;
8961
8962 readerr:
8963 if (feof(fp)) {
8964 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8965 } else {
8966 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8967 }
8968 exit(1);
8969 fmterr:
8970 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8971 exit(1);
8972 }
8973
/* Write the 'len' bytes at 's' to 'fp' as a binary-safe bulk string:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen = 0;

    /* Build the "$<count>\r\n" prefix. */
    header[hlen++] = '$';
    hlen += ll2string(header+hlen,sizeof(header)-hlen,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is legal: nothing to write in that case. */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) == 0 ? 0 : 1;
}
8988
/* Write the double 'd', formatted with "%.17g", to 'fp' in bulk format:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen, hlen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)(plen-2));
    hlen = strlen(header);

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    return fwrite(payload,plen,1,fp) == 0 ? 0 : 1;
}
8999
/* Write the integer 'l' to 'fp' in bulk format:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    /* Header and payload are assembled first and written in one shot. */
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return fwrite(out,outlen,1,fp) == 0 ? 0 : 1;
}
9009
9010 /* Delegate writing an object to writing a bulk string or bulk long long. */
9011 static int fwriteBulkObject(FILE *fp, robj *obj) {
9012 /* Avoid using getDecodedObject to help copy-on-write (we are often
9013 * in a child process when this function is called). */
9014 if (obj->encoding == REDIS_ENCODING_INT) {
9015 return fwriteBulkLongLong(fp,(long)obj->ptr);
9016 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9017 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9018 } else {
9019 redisPanic("Unknown string encoding");
9020 }
9021 }
9022
9023 /* Write a sequence of commands able to fully rebuild the dataset into
9024 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9025 static int rewriteAppendOnlyFile(char *filename) {
9026 dictIterator *di = NULL;
9027 dictEntry *de;
9028 FILE *fp;
9029 char tmpfile[256];
9030 int j;
9031 time_t now = time(NULL);
9032
9033 /* Note that we have to use a different temp name here compared to the
9034 * one used by rewriteAppendOnlyFileBackground() function. */
9035 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9036 fp = fopen(tmpfile,"w");
9037 if (!fp) {
9038 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9039 return REDIS_ERR;
9040 }
9041 for (j = 0; j < server.dbnum; j++) {
9042 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9043 redisDb *db = server.db+j;
9044 dict *d = db->dict;
9045 if (dictSize(d) == 0) continue;
9046 di = dictGetIterator(d);
9047 if (!di) {
9048 fclose(fp);
9049 return REDIS_ERR;
9050 }
9051
9052 /* SELECT the new DB */
9053 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9054 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9055
9056 /* Iterate this DB writing every entry */
9057 while((de = dictNext(di)) != NULL) {
9058 sds keystr = dictGetEntryKey(de);
9059 robj key, *o;
9060 time_t expiretime;
9061 int swapped;
9062
9063 keystr = dictGetEntryKey(de);
9064 o = dictGetEntryVal(de);
9065 initStaticStringObject(key,keystr);
9066 /* If the value for this key is swapped, load a preview in memory.
9067 * We use a "swapped" flag to remember if we need to free the
9068 * value object instead to just increment the ref count anyway
9069 * in order to avoid copy-on-write of pages if we are forked() */
9070 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9071 o->storage == REDIS_VM_SWAPPING) {
9072 swapped = 0;
9073 } else {
9074 o = vmPreviewObject(o);
9075 swapped = 1;
9076 }
9077 expiretime = getExpire(db,&key);
9078
9079 /* Save the key and associated value */
9080 if (o->type == REDIS_STRING) {
9081 /* Emit a SET command */
9082 char cmd[]="*3\r\n$3\r\nSET\r\n";
9083 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9084 /* Key and value */
9085 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9086 if (fwriteBulkObject(fp,o) == 0) goto werr;
9087 } else if (o->type == REDIS_LIST) {
9088 /* Emit the RPUSHes needed to rebuild the list */
9089 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9090 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9091 unsigned char *zl = o->ptr;
9092 unsigned char *p = ziplistIndex(zl,0);
9093 unsigned char *vstr;
9094 unsigned int vlen;
9095 long long vlong;
9096
9097 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9098 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9099 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9100 if (vstr) {
9101 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9102 goto werr;
9103 } else {
9104 if (fwriteBulkLongLong(fp,vlong) == 0)
9105 goto werr;
9106 }
9107 p = ziplistNext(zl,p);
9108 }
9109 } else if (o->encoding == REDIS_ENCODING_LIST) {
9110 list *list = o->ptr;
9111 listNode *ln;
9112 listIter li;
9113
9114 listRewind(list,&li);
9115 while((ln = listNext(&li))) {
9116 robj *eleobj = listNodeValue(ln);
9117
9118 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9119 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9120 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9121 }
9122 } else {
9123 redisPanic("Unknown list encoding");
9124 }
9125 } else if (o->type == REDIS_SET) {
9126 /* Emit the SADDs needed to rebuild the set */
9127 dict *set = o->ptr;
9128 dictIterator *di = dictGetIterator(set);
9129 dictEntry *de;
9130
9131 while((de = dictNext(di)) != NULL) {
9132 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9133 robj *eleobj = dictGetEntryKey(de);
9134
9135 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9136 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9137 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9138 }
9139 dictReleaseIterator(di);
9140 } else if (o->type == REDIS_ZSET) {
9141 /* Emit the ZADDs needed to rebuild the sorted set */
9142 zset *zs = o->ptr;
9143 dictIterator *di = dictGetIterator(zs->dict);
9144 dictEntry *de;
9145
9146 while((de = dictNext(di)) != NULL) {
9147 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9148 robj *eleobj = dictGetEntryKey(de);
9149 double *score = dictGetEntryVal(de);
9150
9151 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9152 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9153 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9154 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9155 }
9156 dictReleaseIterator(di);
9157 } else if (o->type == REDIS_HASH) {
9158 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9159
9160 /* Emit the HSETs needed to rebuild the hash */
9161 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9162 unsigned char *p = zipmapRewind(o->ptr);
9163 unsigned char *field, *val;
9164 unsigned int flen, vlen;
9165
9166 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9167 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9168 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9169 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9170 return -1;
9171 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9172 return -1;
9173 }
9174 } else {
9175 dictIterator *di = dictGetIterator(o->ptr);
9176 dictEntry *de;
9177
9178 while((de = dictNext(di)) != NULL) {
9179 robj *field = dictGetEntryKey(de);
9180 robj *val = dictGetEntryVal(de);
9181
9182 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9183 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9184 if (fwriteBulkObject(fp,field) == -1) return -1;
9185 if (fwriteBulkObject(fp,val) == -1) return -1;
9186 }
9187 dictReleaseIterator(di);
9188 }
9189 } else {
9190 redisPanic("Unknown object type");
9191 }
9192 /* Save the expire time */
9193 if (expiretime != -1) {
9194 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9195 /* If this key is already expired skip it */
9196 if (expiretime < now) continue;
9197 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9198 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9199 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9200 }
9201 if (swapped) decrRefCount(o);
9202 }
9203 dictReleaseIterator(di);
9204 }
9205
9206 /* Make sure data will not remain on the OS's output buffers */
9207 fflush(fp);
9208 aof_fsync(fileno(fp));
9209 fclose(fp);
9210
9211 /* Use RENAME to make sure the DB file is changed atomically only
9212 * if the generate DB file is ok. */
9213 if (rename(tmpfile,filename) == -1) {
9214 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9215 unlink(tmpfile);
9216 return REDIS_ERR;
9217 }
9218 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9219 return REDIS_OK;
9220
9221 werr:
9222 fclose(fp);
9223 unlink(tmpfile);
9224 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9225 if (di) dictReleaseIterator(di);
9226 return REDIS_ERR;
9227 }
9228
9229 /* This is how rewriting of the append only file in background works:
9230 *
9231 * 1) The user calls BGREWRITEAOF
9232 * 2) Redis calls this function, that forks():
9233 * 2a) the child rewrite the append only file in a temp file.
9234 * 2b) the parent accumulates differences in server.bgrewritebuf.
9235 * 3) When the child finished '2a' exists.
9236 * 4) The parent will trap the exit code, if it's OK, will append the
9237 * data accumulated into server.bgrewritebuf into the temp file, and
9238 * finally will rename(2) the temp file in the actual file name.
9239 * The the new file is reopened as the new append only file. Profit!
9240 */
9241 static int rewriteAppendOnlyFileBackground(void) {
9242 pid_t childpid;
9243
9244 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9245 if (server.vm_enabled) waitEmptyIOJobsQueue();
9246 if ((childpid = fork()) == 0) {
9247 /* Child */
9248 char tmpfile[256];
9249
9250 if (server.vm_enabled) vmReopenSwapFile();
9251 close(server.fd);
9252 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9253 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9254 _exit(0);
9255 } else {
9256 _exit(1);
9257 }
9258 } else {
9259 /* Parent */
9260 if (childpid == -1) {
9261 redisLog(REDIS_WARNING,
9262 "Can't rewrite append only file in background: fork: %s",
9263 strerror(errno));
9264 return REDIS_ERR;
9265 }
9266 redisLog(REDIS_NOTICE,
9267 "Background append only file rewriting started by pid %d",childpid);
9268 server.bgrewritechildpid = childpid;
9269 updateDictResizePolicy();
9270 /* We set appendseldb to -1 in order to force the next call to the
9271 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9272 * accumulated by the parent into server.bgrewritebuf will start
9273 * with a SELECT statement and it will be safe to merge. */
9274 server.appendseldb = -1;
9275 return REDIS_OK;
9276 }
9277 return REDIS_OK; /* unreached */
9278 }
9279
9280 static void bgrewriteaofCommand(redisClient *c) {
9281 if (server.bgrewritechildpid != -1) {
9282 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9283 return;
9284 }
9285 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9286 char *status = "+Background append only file rewriting started\r\n";
9287 addReplySds(c,sdsnew(status));
9288 } else {
9289 addReply(c,shared.err);
9290 }
9291 }
9292
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the current
 * directory. The result of unlink(2) is ignored, so calling this for a
 * file that does not exist is harmless. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
9299
9300 /* Virtual Memory is composed mainly of two subsystems:
9301 * - Blocking Virtual Memory
9302 * - Threaded Virtual Memory I/O
9303 * The two parts are not fully decoupled, but functions are split among two
9304 * different sections of the source code (delimited by comments) in order to
9305 * make more clear what functionality is about the blocking VM and what about
9306 * the threaded (not blocking) VM.
9307 *
9308 * Redis VM design:
9309 *
9310 * Redis VM is a blocking VM (one that blocks reading swapped values from
9311 * disk into memory when a value swapped out is needed in memory) that is made
9312 * unblocking by trying to examine the command argument vector in order to
9313 * load in background values that will likely be needed in order to exec
9314 * the command. The command is executed only once all the relevant keys
9315 * are loaded into memory.
9316 *
9317 * This basically is almost as simple as a blocking VM, but almost as parallel
9318 * as a fully non-blocking VM.
9319 */
9320
9321 /* =================== Virtual Memory - Blocking Side ====================== */
9322
9323 /* Create a VM pointer object. This kind of objects are used in place of
9324 * values in the key -> value hash table, for swapped out objects. */
9325 static vmpointer *createVmPointer(int vtype) {
9326 vmpointer *vp = zmalloc(sizeof(vmpointer));
9327
9328 vp->type = REDIS_VMPOINTER;
9329 vp->storage = REDIS_VM_SWAPPED;
9330 vp->vtype = vtype;
9331 return vp;
9332 }
9333
9334 static void vmInit(void) {
9335 off_t totsize;
9336 int pipefds[2];
9337 size_t stacksize;
9338 struct flock fl;
9339
9340 if (server.vm_max_threads != 0)
9341 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9342
9343 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9344 /* Try to open the old swap file, otherwise create it */
9345 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9346 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9347 }
9348 if (server.vm_fp == NULL) {
9349 redisLog(REDIS_WARNING,
9350 "Can't open the swap file: %s. Exiting.",
9351 strerror(errno));
9352 exit(1);
9353 }
9354 server.vm_fd = fileno(server.vm_fp);
9355 /* Lock the swap file for writing, this is useful in order to avoid
9356 * another instance to use the same swap file for a config error. */
9357 fl.l_type = F_WRLCK;
9358 fl.l_whence = SEEK_SET;
9359 fl.l_start = fl.l_len = 0;
9360 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9361 redisLog(REDIS_WARNING,
9362 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9363 exit(1);
9364 }
9365 /* Initialize */
9366 server.vm_next_page = 0;
9367 server.vm_near_pages = 0;
9368 server.vm_stats_used_pages = 0;
9369 server.vm_stats_swapped_objects = 0;
9370 server.vm_stats_swapouts = 0;
9371 server.vm_stats_swapins = 0;
9372 totsize = server.vm_pages*server.vm_page_size;
9373 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9374 if (ftruncate(server.vm_fd,totsize) == -1) {
9375 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9376 strerror(errno));
9377 exit(1);
9378 } else {
9379 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9380 }
9381 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9382 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9383 (long long) (server.vm_pages+7)/8, server.vm_pages);
9384 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9385
9386 /* Initialize threaded I/O (used by Virtual Memory) */
9387 server.io_newjobs = listCreate();
9388 server.io_processing = listCreate();
9389 server.io_processed = listCreate();
9390 server.io_ready_clients = listCreate();
9391 pthread_mutex_init(&server.io_mutex,NULL);
9392 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9393 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9394 server.io_active_threads = 0;
9395 if (pipe(pipefds) == -1) {
9396 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9397 ,strerror(errno));
9398 exit(1);
9399 }
9400 server.io_ready_pipe_read = pipefds[0];
9401 server.io_ready_pipe_write = pipefds[1];
9402 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9403 /* LZF requires a lot of stack */
9404 pthread_attr_init(&server.io_threads_attr);
9405 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9406 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9407 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9408 /* Listen for events in the threaded I/O pipe */
9409 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9410 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9411 oom("creating file event");
9412 }
9413
9414 /* Mark the page as used */
9415 static void vmMarkPageUsed(off_t page) {
9416 off_t byte = page/8;
9417 int bit = page&7;
9418 redisAssert(vmFreePage(page) == 1);
9419 server.vm_bitmap[byte] |= 1<<bit;
9420 }
9421
9422 /* Mark N contiguous pages as used, with 'page' being the first. */
9423 static void vmMarkPagesUsed(off_t page, off_t count) {
9424 off_t j;
9425
9426 for (j = 0; j < count; j++)
9427 vmMarkPageUsed(page+j);
9428 server.vm_stats_used_pages += count;
9429 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9430 (long long)count, (long long)page);
9431 }
9432
9433 /* Mark the page as free */
9434 static void vmMarkPageFree(off_t page) {
9435 off_t byte = page/8;
9436 int bit = page&7;
9437 redisAssert(vmFreePage(page) == 0);
9438 server.vm_bitmap[byte] &= ~(1<<bit);
9439 }
9440
9441 /* Mark N contiguous pages as free, with 'page' being the first. */
9442 static void vmMarkPagesFree(off_t page, off_t count) {
9443 off_t j;
9444
9445 for (j = 0; j < count; j++)
9446 vmMarkPageFree(page+j);
9447 server.vm_stats_used_pages -= count;
9448 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9449 (long long)count, (long long)page);
9450 }
9451
9452 /* Test if the page is free */
9453 static int vmFreePage(off_t page) {
9454 off_t byte = page/8;
9455 int bit = page&7;
9456 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9457 }
9458
9459 /* Find N contiguous free pages storing the first page of the cluster in *first.
9460 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9461 * REDIS_ERR is returned.
9462 *
9463 * This function uses a simple algorithm: we try to allocate
9464 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9465 * again from the start of the swap file searching for free spaces.
9466 *
9467 * If it looks pretty clear that there are no free pages near our offset
9468 * we try to find less populated places doing a forward jump of
9469 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9470 * without hurry, and then we jump again and so forth...
9471 *
9472 * This function can be improved using a free list to avoid to guess
9473 * too much, since we could collect data about freed pages.
9474 *
9475 * note: I implemented this function just after watching an episode of
9476 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9477 */
9478 static int vmFindContiguousPages(off_t *first, off_t n) {
9479 off_t base, offset = 0, since_jump = 0, numfree = 0;
9480
9481 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9482 server.vm_near_pages = 0;
9483 server.vm_next_page = 0;
9484 }
9485 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9486 base = server.vm_next_page;
9487
9488 while(offset < server.vm_pages) {
9489 off_t this = base+offset;
9490
9491 /* If we overflow, restart from page zero */
9492 if (this >= server.vm_pages) {
9493 this -= server.vm_pages;
9494 if (this == 0) {
9495 /* Just overflowed, what we found on tail is no longer
9496 * interesting, as it's no longer contiguous. */
9497 numfree = 0;
9498 }
9499 }
9500 if (vmFreePage(this)) {
9501 /* This is a free page */
9502 numfree++;
9503 /* Already got N free pages? Return to the caller, with success */
9504 if (numfree == n) {
9505 *first = this-(n-1);
9506 server.vm_next_page = this+1;
9507 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9508 return REDIS_OK;
9509 }
9510 } else {
9511 /* The current one is not a free page */
9512 numfree = 0;
9513 }
9514
9515 /* Fast-forward if the current page is not free and we already
9516 * searched enough near this place. */
9517 since_jump++;
9518 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9519 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9520 since_jump = 0;
9521 /* Note that even if we rewind after the jump, we are don't need
9522 * to make sure numfree is set to zero as we only jump *if* it
9523 * is set to zero. */
9524 } else {
9525 /* Otherwise just check the next page */
9526 offset++;
9527 }
9528 }
9529 return REDIS_ERR;
9530 }
9531
9532 /* Write the specified object at the specified page of the swap file */
9533 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9534 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9535 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9536 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9537 redisLog(REDIS_WARNING,
9538 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9539 strerror(errno));
9540 return REDIS_ERR;
9541 }
9542 rdbSaveObject(server.vm_fp,o);
9543 fflush(server.vm_fp);
9544 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9545 return REDIS_OK;
9546 }
9547
9548 /* Transfers the 'val' object to disk. Store all the information
9549 * a 'vmpointer' object containing all the information needed to load the
9550 * object back later is returned.
9551 *
9552 * If we can't find enough contiguous empty pages to swap the object on disk
9553 * NULL is returned. */
9554 static vmpointer *vmSwapObjectBlocking(robj *val) {
9555 off_t pages = rdbSavedObjectPages(val,NULL);
9556 off_t page;
9557 vmpointer *vp;
9558
9559 assert(val->storage == REDIS_VM_MEMORY);
9560 assert(val->refcount == 1);
9561 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9562 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9563
9564 vp = createVmPointer(val->type);
9565 vp->page = page;
9566 vp->usedpages = pages;
9567 decrRefCount(val); /* Deallocate the object from memory. */
9568 vmMarkPagesUsed(page,pages);
9569 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9570 (void*) val,
9571 (unsigned long long) page, (unsigned long long) pages);
9572 server.vm_stats_swapped_objects++;
9573 server.vm_stats_swapouts++;
9574 return vp;
9575 }
9576
9577 static robj *vmReadObjectFromSwap(off_t page, int type) {
9578 robj *o;
9579
9580 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9581 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9582 redisLog(REDIS_WARNING,
9583 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9584 strerror(errno));
9585 _exit(1);
9586 }
9587 o = rdbLoadObject(type,server.vm_fp);
9588 if (o == NULL) {
9589 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9590 _exit(1);
9591 }
9592 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9593 return o;
9594 }
9595
9596 /* Load the specified object from swap to memory.
9597 * The newly allocated object is returned.
9598 *
9599 * If preview is true the unserialized object is returned to the caller but
9600 * the pages are not marked as freed, nor the vp object is freed. */
9601 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9602 robj *val;
9603
9604 redisAssert(vp->type == REDIS_VMPOINTER &&
9605 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9606 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9607 if (!preview) {
9608 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9609 vmMarkPagesFree(vp->page,vp->usedpages);
9610 zfree(vp);
9611 server.vm_stats_swapped_objects--;
9612 } else {
9613 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9614 }
9615 server.vm_stats_swapins++;
9616 return val;
9617 }
9618
9619 /* Plain object loading, from swap to memory.
9620 *
9621 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9622 * The return value is the loaded object. */
9623 static robj *vmLoadObject(robj *o) {
9624 /* If we are loading the object in background, stop it, we
9625 * need to load this object synchronously ASAP. */
9626 if (o->storage == REDIS_VM_LOADING)
9627 vmCancelThreadedIOJob(o);
9628 return vmGenericLoadObject((vmpointer*)o,0);
9629 }
9630
9631 /* Just load the value on disk, without to modify the key.
9632 * This is useful when we want to perform some operation on the value
9633 * without to really bring it from swap to memory, like while saving the
9634 * dataset or rewriting the append only log. */
9635 static robj *vmPreviewObject(robj *o) {
9636 return vmGenericLoadObject((vmpointer*)o,1);
9637 }
9638
9639 /* How a good candidate is this object for swapping?
9640 * The better candidate it is, the greater the returned value.
9641 *
9642 * Currently we try to perform a fast estimation of the object size in
9643 * memory, and combine it with aging informations.
9644 *
9645 * Basically swappability = idle-time * log(estimated size)
9646 *
9647 * Bigger objects are preferred over smaller objects, but not
9648 * proportionally, this is why we use the logarithm. This algorithm is
9649 * just a first try and will probably be tuned later. */
9650 static double computeObjectSwappability(robj *o) {
9651 /* actual age can be >= minage, but not < minage. As we use wrapping
9652 * 21 bit clocks with minutes resolution for the LRU. */
9653 time_t minage = abs(server.lruclock - o->lru);
9654 long asize = 0;
9655 list *l;
9656 dict *d;
9657 struct dictEntry *de;
9658 int z;
9659
9660 if (minage <= 0) return 0;
9661 switch(o->type) {
9662 case REDIS_STRING:
9663 if (o->encoding != REDIS_ENCODING_RAW) {
9664 asize = sizeof(*o);
9665 } else {
9666 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9667 }
9668 break;
9669 case REDIS_LIST:
9670 l = o->ptr;
9671 listNode *ln = listFirst(l);
9672
9673 asize = sizeof(list);
9674 if (ln) {
9675 robj *ele = ln->value;
9676 long elesize;
9677
9678 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9679 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9680 asize += (sizeof(listNode)+elesize)*listLength(l);
9681 }
9682 break;
9683 case REDIS_SET:
9684 case REDIS_ZSET:
9685 z = (o->type == REDIS_ZSET);
9686 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9687
9688 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9689 if (z) asize += sizeof(zset)-sizeof(dict);
9690 if (dictSize(d)) {
9691 long elesize;
9692 robj *ele;
9693
9694 de = dictGetRandomKey(d);
9695 ele = dictGetEntryKey(de);
9696 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9697 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9698 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9699 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9700 }
9701 break;
9702 case REDIS_HASH:
9703 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9704 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9705 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9706 unsigned int klen, vlen;
9707 unsigned char *key, *val;
9708
9709 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9710 klen = 0;
9711 vlen = 0;
9712 }
9713 asize = len*(klen+vlen+3);
9714 } else if (o->encoding == REDIS_ENCODING_HT) {
9715 d = o->ptr;
9716 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9717 if (dictSize(d)) {
9718 long elesize;
9719 robj *ele;
9720
9721 de = dictGetRandomKey(d);
9722 ele = dictGetEntryKey(de);
9723 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9724 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9725 ele = dictGetEntryVal(de);
9726 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9727 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9728 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9729 }
9730 }
9731 break;
9732 }
9733 return (double)minage*log(1+asize);
9734 }
9735
9736 /* Try to swap an object that's a good candidate for swapping.
9737 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9738 * to swap any object at all.
9739 *
9740 * If 'usethreaded' is true, Redis will try to swap the object in background
9741 * using I/O threads. */
9742 static int vmSwapOneObject(int usethreads) {
9743 int j, i;
9744 struct dictEntry *best = NULL;
9745 double best_swappability = 0;
9746 redisDb *best_db = NULL;
9747 robj *val;
9748 sds key;
9749
9750 for (j = 0; j < server.dbnum; j++) {
9751 redisDb *db = server.db+j;
9752 /* Why maxtries is set to 100?
9753 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9754 * are swappable objects */
9755 int maxtries = 100;
9756
9757 if (dictSize(db->dict) == 0) continue;
9758 for (i = 0; i < 5; i++) {
9759 dictEntry *de;
9760 double swappability;
9761
9762 if (maxtries) maxtries--;
9763 de = dictGetRandomKey(db->dict);
9764 val = dictGetEntryVal(de);
9765 /* Only swap objects that are currently in memory.
9766 *
9767 * Also don't swap shared objects: not a good idea in general and
9768 * we need to ensure that the main thread does not touch the
9769 * object while the I/O thread is using it, but we can't
9770 * control other keys without adding additional mutex. */
9771 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9772 if (maxtries) i--; /* don't count this try */
9773 continue;
9774 }
9775 swappability = computeObjectSwappability(val);
9776 if (!best || swappability > best_swappability) {
9777 best = de;
9778 best_swappability = swappability;
9779 best_db = db;
9780 }
9781 }
9782 }
9783 if (best == NULL) return REDIS_ERR;
9784 key = dictGetEntryKey(best);
9785 val = dictGetEntryVal(best);
9786
9787 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9788 key, best_swappability);
9789
9790 /* Swap it */
9791 if (usethreads) {
9792 robj *keyobj = createStringObject(key,sdslen(key));
9793 vmSwapObjectThreaded(keyobj,val,best_db);
9794 decrRefCount(keyobj);
9795 return REDIS_OK;
9796 } else {
9797 vmpointer *vp;
9798
9799 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9800 dictGetEntryVal(best) = vp;
9801 return REDIS_OK;
9802 } else {
9803 return REDIS_ERR;
9804 }
9805 }
9806 }
9807
/* Swap out one object synchronously. See vmSwapOneObject() for the meaning
 * of the return value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9811
/* Swap out one object in background using the I/O threads. See
 * vmSwapOneObject() for the meaning of the return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9815
9816 /* Return true if it's safe to swap out objects in a given moment.
9817 * Basically we don't want to swap objects out while there is a BGSAVE
9818 * or a BGAEOREWRITE running in backgroud. */
9819 static int vmCanSwapOut(void) {
9820 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9821 }
9822
9823 /* =================== Virtual Memory - Threaded I/O ======================= */
9824
9825 static void freeIOJob(iojob *j) {
9826 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9827 j->type == REDIS_IOJOB_DO_SWAP ||
9828 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9829 {
9830 /* we fix the storage type, otherwise decrRefCount() will try to
9831 * kill the I/O thread Job (that does no longer exists). */
9832 if (j->val->storage == REDIS_VM_SWAPPING)
9833 j->val->storage = REDIS_VM_MEMORY;
9834 decrRefCount(j->val);
9835 }
9836 decrRefCount(j->key);
9837 zfree(j);
9838 }
9839
9840 /* Every time a thread finished a Job, it writes a byte into the write side
9841 * of an unix pipe in order to "awake" the main thread, and this function
9842 * is called. */
9843 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9844 int mask)
9845 {
9846 char buf[1];
9847 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9848 REDIS_NOTUSED(el);
9849 REDIS_NOTUSED(mask);
9850 REDIS_NOTUSED(privdata);
9851
9852 /* For every byte we read in the read side of the pipe, there is one
9853 * I/O job completed to process. */
9854 while((retval = read(fd,buf,1)) == 1) {
9855 iojob *j;
9856 listNode *ln;
9857 struct dictEntry *de;
9858
9859 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9860
9861 /* Get the processed element (the oldest one) */
9862 lockThreadedIO();
9863 assert(listLength(server.io_processed) != 0);
9864 if (toprocess == -1) {
9865 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9866 if (toprocess <= 0) toprocess = 1;
9867 }
9868 ln = listFirst(server.io_processed);
9869 j = ln->value;
9870 listDelNode(server.io_processed,ln);
9871 unlockThreadedIO();
9872 /* If this job is marked as canceled, just ignore it */
9873 if (j->canceled) {
9874 freeIOJob(j);
9875 continue;
9876 }
9877 /* Post process it in the main thread, as there are things we
9878 * can do just here to avoid race conditions and/or invasive locks */
9879 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
9880 de = dictFind(j->db->dict,j->key->ptr);
9881 redisAssert(de != NULL);
9882 if (j->type == REDIS_IOJOB_LOAD) {
9883 redisDb *db;
9884 vmpointer *vp = dictGetEntryVal(de);
9885
9886 /* Key loaded, bring it at home */
9887 vmMarkPagesFree(vp->page,vp->usedpages);
9888 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9889 (unsigned char*) j->key->ptr);
9890 server.vm_stats_swapped_objects--;
9891 server.vm_stats_swapins++;
9892 dictGetEntryVal(de) = j->val;
9893 incrRefCount(j->val);
9894 db = j->db;
9895 /* Handle clients waiting for this key to be loaded. */
9896 handleClientsBlockedOnSwappedKey(db,j->key);
9897 freeIOJob(j);
9898 zfree(vp);
9899 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9900 /* Now we know the amount of pages required to swap this object.
9901 * Let's find some space for it, and queue this task again
9902 * rebranded as REDIS_IOJOB_DO_SWAP. */
9903 if (!vmCanSwapOut() ||
9904 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9905 {
9906 /* Ooops... no space or we can't swap as there is
9907 * a fork()ed Redis trying to save stuff on disk. */
9908 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
9909 freeIOJob(j);
9910 } else {
9911 /* Note that we need to mark this pages as used now,
9912 * if the job will be canceled, we'll mark them as freed
9913 * again. */
9914 vmMarkPagesUsed(j->page,j->pages);
9915 j->type = REDIS_IOJOB_DO_SWAP;
9916 lockThreadedIO();
9917 queueIOJob(j);
9918 unlockThreadedIO();
9919 }
9920 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9921 vmpointer *vp;
9922
9923 /* Key swapped. We can finally free some memory. */
9924 if (j->val->storage != REDIS_VM_SWAPPING) {
9925 vmpointer *vp = (vmpointer*) j->id;
9926 printf("storage: %d\n",vp->storage);
9927 printf("key->name: %s\n",(char*)j->key->ptr);
9928 printf("val: %p\n",(void*)j->val);
9929 printf("val->type: %d\n",j->val->type);
9930 printf("val->ptr: %s\n",(char*)j->val->ptr);
9931 }
9932 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9933 vp = createVmPointer(j->val->type);
9934 vp->page = j->page;
9935 vp->usedpages = j->pages;
9936 dictGetEntryVal(de) = vp;
9937 /* Fix the storage otherwise decrRefCount will attempt to
9938 * remove the associated I/O job */
9939 j->val->storage = REDIS_VM_MEMORY;
9940 decrRefCount(j->val);
9941 redisLog(REDIS_DEBUG,
9942 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9943 (unsigned char*) j->key->ptr,
9944 (unsigned long long) j->page, (unsigned long long) j->pages);
9945 server.vm_stats_swapped_objects++;
9946 server.vm_stats_swapouts++;
9947 freeIOJob(j);
9948 /* Put a few more swap requests in queue if we are still
9949 * out of memory */
9950 if (trytoswap && vmCanSwapOut() &&
9951 zmalloc_used_memory() > server.vm_max_memory)
9952 {
9953 int more = 1;
9954 while(more) {
9955 lockThreadedIO();
9956 more = listLength(server.io_newjobs) <
9957 (unsigned) server.vm_max_threads;
9958 unlockThreadedIO();
9959 /* Don't waste CPU time if swappable objects are rare. */
9960 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9961 trytoswap = 0;
9962 break;
9963 }
9964 }
9965 }
9966 }
9967 processed++;
9968 if (processed == toprocess) return;
9969 }
9970 if (retval < 0 && errno != EAGAIN) {
9971 redisLog(REDIS_WARNING,
9972 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9973 strerror(errno));
9974 }
9975 }
9976
9977 static void lockThreadedIO(void) {
9978 pthread_mutex_lock(&server.io_mutex);
9979 }
9980
9981 static void unlockThreadedIO(void) {
9982 pthread_mutex_unlock(&server.io_mutex);
9983 }
9984
9985 /* Remove the specified object from the threaded I/O queue if still not
9986 * processed, otherwise make sure to flag it as canceled. */
9987 static void vmCancelThreadedIOJob(robj *o) {
9988 list *lists[3] = {
9989 server.io_newjobs, /* 0 */
9990 server.io_processing, /* 1 */
9991 server.io_processed /* 2 */
9992 };
9993 int i;
9994
9995 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9996 again:
9997 lockThreadedIO();
9998 /* Search for a matching object in one of the queues */
9999 for (i = 0; i < 3; i++) {
10000 listNode *ln;
10001 listIter li;
10002
10003 listRewind(lists[i],&li);
10004 while ((ln = listNext(&li)) != NULL) {
10005 iojob *job = ln->value;
10006
10007 if (job->canceled) continue; /* Skip this, already canceled. */
10008 if (job->id == o) {
10009 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10010 (void*)job, (char*)job->key->ptr, job->type, i);
10011 /* Mark the pages as free since the swap didn't happened
10012 * or happened but is now discarded. */
10013 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
10014 vmMarkPagesFree(job->page,job->pages);
10015 /* Cancel the job. It depends on the list the job is
10016 * living in. */
10017 switch(i) {
10018 case 0: /* io_newjobs */
10019 /* If the job was yet not processed the best thing to do
10020 * is to remove it from the queue at all */
10021 freeIOJob(job);
10022 listDelNode(lists[i],ln);
10023 break;
10024 case 1: /* io_processing */
10025 /* Oh Shi- the thread is messing with the Job:
10026 *
10027 * Probably it's accessing the object if this is a
10028 * PREPARE_SWAP or DO_SWAP job.
10029 * If it's a LOAD job it may be reading from disk and
10030 * if we don't wait for the job to terminate before to
10031 * cancel it, maybe in a few microseconds data can be
10032 * corrupted in this pages. So the short story is:
10033 *
10034 * Better to wait for the job to move into the
10035 * next queue (processed)... */
10036
10037 /* We try again and again until the job is completed. */
10038 unlockThreadedIO();
10039 /* But let's wait some time for the I/O thread
10040 * to finish with this job. After all this condition
10041 * should be very rare. */
10042 usleep(1);
10043 goto again;
10044 case 2: /* io_processed */
10045 /* The job was already processed, that's easy...
10046 * just mark it as canceled so that we'll ignore it
10047 * when processing completed jobs. */
10048 job->canceled = 1;
10049 break;
10050 }
10051 /* Finally we have to adjust the storage type of the object
10052 * in order to "UNDO" the operaiton. */
10053 if (o->storage == REDIS_VM_LOADING)
10054 o->storage = REDIS_VM_SWAPPED;
10055 else if (o->storage == REDIS_VM_SWAPPING)
10056 o->storage = REDIS_VM_MEMORY;
10057 unlockThreadedIO();
10058 redisLog(REDIS_DEBUG,"*** DONE");
10059 return;
10060 }
10061 }
10062 }
10063 unlockThreadedIO();
10064 printf("Not found: %p\n", (void*)o);
10065 redisAssert(1 != 1); /* We should never reach this */
10066 }
10067
10068 static void *IOThreadEntryPoint(void *arg) {
10069 iojob *j;
10070 listNode *ln;
10071 REDIS_NOTUSED(arg);
10072
10073 pthread_detach(pthread_self());
10074 while(1) {
10075 /* Get a new job to process */
10076 lockThreadedIO();
10077 if (listLength(server.io_newjobs) == 0) {
10078 /* No new jobs in queue, exit. */
10079 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10080 (long) pthread_self());
10081 server.io_active_threads--;
10082 unlockThreadedIO();
10083 return NULL;
10084 }
10085 ln = listFirst(server.io_newjobs);
10086 j = ln->value;
10087 listDelNode(server.io_newjobs,ln);
10088 /* Add the job in the processing queue */
10089 j->thread = pthread_self();
10090 listAddNodeTail(server.io_processing,j);
10091 ln = listLast(server.io_processing); /* We use ln later to remove it */
10092 unlockThreadedIO();
10093 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10094 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
10095
10096 /* Process the Job */
10097 if (j->type == REDIS_IOJOB_LOAD) {
10098 vmpointer *vp = (vmpointer*)j->id;
10099 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
10100 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10101 FILE *fp = fopen("/dev/null","w+");
10102 j->pages = rdbSavedObjectPages(j->val,fp);
10103 fclose(fp);
10104 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10105 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10106 j->canceled = 1;
10107 }
10108
10109 /* Done: insert the job into the processed queue */
10110 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10111 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
10112 lockThreadedIO();
10113 listDelNode(server.io_processing,ln);
10114 listAddNodeTail(server.io_processed,j);
10115 unlockThreadedIO();
10116
10117 /* Signal the main thread there is new stuff to process */
10118 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10119 }
10120 return NULL; /* never reached */
10121 }
10122
10123 static void spawnIOThread(void) {
10124 pthread_t thread;
10125 sigset_t mask, omask;
10126 int err;
10127
10128 sigemptyset(&mask);
10129 sigaddset(&mask,SIGCHLD);
10130 sigaddset(&mask,SIGHUP);
10131 sigaddset(&mask,SIGPIPE);
10132 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10133 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10134 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10135 strerror(err));
10136 usleep(1000000);
10137 }
10138 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10139 server.io_active_threads++;
10140 }
10141
10142 /* We need to wait for the last thread to exit before we are able to
10143 * fork() in order to BGSAVE or BGREWRITEAOF. */
10144 static void waitEmptyIOJobsQueue(void) {
10145 while(1) {
10146 int io_processed_len;
10147
10148 lockThreadedIO();
10149 if (listLength(server.io_newjobs) == 0 &&
10150 listLength(server.io_processing) == 0 &&
10151 server.io_active_threads == 0)
10152 {
10153 unlockThreadedIO();
10154 return;
10155 }
10156 /* While waiting for empty jobs queue condition we post-process some
10157 * finshed job, as I/O threads may be hanging trying to write against
10158 * the io_ready_pipe_write FD but there are so much pending jobs that
10159 * it's blocking. */
10160 io_processed_len = listLength(server.io_processed);
10161 unlockThreadedIO();
10162 if (io_processed_len) {
10163 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10164 usleep(1000); /* 1 millisecond */
10165 } else {
10166 usleep(10000); /* 10 milliseconds */
10167 }
10168 }
10169 }
10170
10171 static void vmReopenSwapFile(void) {
10172 /* Note: we don't close the old one as we are in the child process
10173 * and don't want to mess at all with the original file object. */
10174 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10175 if (server.vm_fp == NULL) {
10176 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10177 server.vm_swap_file);
10178 _exit(1);
10179 }
10180 server.vm_fd = fileno(server.vm_fp);
10181 }
10182
10183 /* This function must be called while with threaded IO locked */
10184 static void queueIOJob(iojob *j) {
10185 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10186 (void*)j, j->type, (char*)j->key->ptr);
10187 listAddNodeTail(server.io_newjobs,j);
10188 if (server.io_active_threads < server.vm_max_threads)
10189 spawnIOThread();
10190 }
10191
10192 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10193 iojob *j;
10194
10195 j = zmalloc(sizeof(*j));
10196 j->type = REDIS_IOJOB_PREPARE_SWAP;
10197 j->db = db;
10198 j->key = key;
10199 incrRefCount(key);
10200 j->id = j->val = val;
10201 incrRefCount(val);
10202 j->canceled = 0;
10203 j->thread = (pthread_t) -1;
10204 val->storage = REDIS_VM_SWAPPING;
10205
10206 lockThreadedIO();
10207 queueIOJob(j);
10208 unlockThreadedIO();
10209 return REDIS_OK;
10210 }
10211
10212 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10213
10214 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10215 * If there is not already a job loading the key, it is craeted.
10216 * The key is added to the io_keys list in the client structure, and also
10217 * in the hash table mapping swapped keys to waiting clients, that is,
10218 * server.io_waited_keys. */
10219 static int waitForSwappedKey(redisClient *c, robj *key) {
10220 struct dictEntry *de;
10221 robj *o;
10222 list *l;
10223
10224 /* If the key does not exist or is already in RAM we don't need to
10225 * block the client at all. */
10226 de = dictFind(c->db->dict,key->ptr);
10227 if (de == NULL) return 0;
10228 o = dictGetEntryVal(de);
10229 if (o->storage == REDIS_VM_MEMORY) {
10230 return 0;
10231 } else if (o->storage == REDIS_VM_SWAPPING) {
10232 /* We were swapping the key, undo it! */
10233 vmCancelThreadedIOJob(o);
10234 return 0;
10235 }
10236
10237 /* OK: the key is either swapped, or being loaded just now. */
10238
10239 /* Add the key to the list of keys this client is waiting for.
10240 * This maps clients to keys they are waiting for. */
10241 listAddNodeTail(c->io_keys,key);
10242 incrRefCount(key);
10243
10244 /* Add the client to the swapped keys => clients waiting map. */
10245 de = dictFind(c->db->io_keys,key);
10246 if (de == NULL) {
10247 int retval;
10248
10249 /* For every key we take a list of clients blocked for it */
10250 l = listCreate();
10251 retval = dictAdd(c->db->io_keys,key,l);
10252 incrRefCount(key);
10253 assert(retval == DICT_OK);
10254 } else {
10255 l = dictGetEntryVal(de);
10256 }
10257 listAddNodeTail(l,c);
10258
10259 /* Are we already loading the key from disk? If not create a job */
10260 if (o->storage == REDIS_VM_SWAPPED) {
10261 iojob *j;
10262 vmpointer *vp = (vmpointer*)o;
10263
10264 o->storage = REDIS_VM_LOADING;
10265 j = zmalloc(sizeof(*j));
10266 j->type = REDIS_IOJOB_LOAD;
10267 j->db = c->db;
10268 j->id = (robj*)vp;
10269 j->key = key;
10270 incrRefCount(key);
10271 j->page = vp->page;
10272 j->val = NULL;
10273 j->canceled = 0;
10274 j->thread = (pthread_t) -1;
10275 lockThreadedIO();
10276 queueIOJob(j);
10277 unlockThreadedIO();
10278 }
10279 return 1;
10280 }
10281
10282 /* Preload keys for any command with first, last and step values for
10283 * the command keys prototype, as defined in the command table. */
10284 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10285 int j, last;
10286 if (cmd->vm_firstkey == 0) return;
10287 last = cmd->vm_lastkey;
10288 if (last < 0) last = argc+last;
10289 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10290 redisAssert(j < argc);
10291 waitForSwappedKey(c,argv[j]);
10292 }
10293 }
10294
10295 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10296 * Note that the number of keys to preload is user-defined, so we need to
10297 * apply a sanity check against argc. */
10298 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10299 int i, num;
10300 REDIS_NOTUSED(cmd);
10301
10302 num = atoi(argv[2]->ptr);
10303 if (num > (argc-3)) return;
10304 for (i = 0; i < num; i++) {
10305 waitForSwappedKey(c,argv[3+i]);
10306 }
10307 }
10308
10309 /* Preload keys needed to execute the entire MULTI/EXEC block.
10310 *
10311 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10312 * and will block the client when any command requires a swapped out value. */
10313 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10314 int i, margc;
10315 struct redisCommand *mcmd;
10316 robj **margv;
10317 REDIS_NOTUSED(cmd);
10318 REDIS_NOTUSED(argc);
10319 REDIS_NOTUSED(argv);
10320
10321 if (!(c->flags & REDIS_MULTI)) return;
10322 for (i = 0; i < c->mstate.count; i++) {
10323 mcmd = c->mstate.commands[i].cmd;
10324 margc = c->mstate.commands[i].argc;
10325 margv = c->mstate.commands[i].argv;
10326
10327 if (mcmd->vm_preload_proc != NULL) {
10328 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10329 } else {
10330 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10331 }
10332 }
10333 }
10334
10335 /* Is this client attempting to run a command against swapped keys?
10336 * If so, block it ASAP, load the keys in background, then resume it.
10337 *
10338 * The important idea about this function is that it can fail! If keys will
10339 * still be swapped when the client is resumed, this key lookups will
10340 * just block loading keys from disk. In practical terms this should only
10341 * happen with SORT BY command or if there is a bug in this function.
10342 *
10343 * Return 1 if the client is marked as blocked, 0 if the client can
10344 * continue as the keys it is going to access appear to be in memory. */
10345 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10346 if (cmd->vm_preload_proc != NULL) {
10347 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10348 } else {
10349 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10350 }
10351
10352 /* If the client was blocked for at least one key, mark it as blocked. */
10353 if (listLength(c->io_keys)) {
10354 c->flags |= REDIS_IO_WAIT;
10355 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10356 server.vm_blocked_clients++;
10357 return 1;
10358 } else {
10359 return 0;
10360 }
10361 }
10362
10363 /* Remove the 'key' from the list of blocked keys for a given client.
10364 *
10365 * The function returns 1 when there are no longer blocking keys after
10366 * the current one was removed (and the client can be unblocked). */
10367 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10368 list *l;
10369 listNode *ln;
10370 listIter li;
10371 struct dictEntry *de;
10372
10373 /* Remove the key from the list of keys this client is waiting for. */
10374 listRewind(c->io_keys,&li);
10375 while ((ln = listNext(&li)) != NULL) {
10376 if (equalStringObjects(ln->value,key)) {
10377 listDelNode(c->io_keys,ln);
10378 break;
10379 }
10380 }
10381 assert(ln != NULL);
10382
10383 /* Remove the client form the key => waiting clients map. */
10384 de = dictFind(c->db->io_keys,key);
10385 assert(de != NULL);
10386 l = dictGetEntryVal(de);
10387 ln = listSearchKey(l,c);
10388 assert(ln != NULL);
10389 listDelNode(l,ln);
10390 if (listLength(l) == 0)
10391 dictDelete(c->db->io_keys,key);
10392
10393 return listLength(c->io_keys) == 0;
10394 }
10395
10396 /* Every time we now a key was loaded back in memory, we handle clients
10397 * waiting for this key if any. */
10398 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10399 struct dictEntry *de;
10400 list *l;
10401 listNode *ln;
10402 int len;
10403
10404 de = dictFind(db->io_keys,key);
10405 if (!de) return;
10406
10407 l = dictGetEntryVal(de);
10408 len = listLength(l);
10409 /* Note: we can't use something like while(listLength(l)) as the list
10410 * can be freed by the calling function when we remove the last element. */
10411 while (len--) {
10412 ln = listFirst(l);
10413 redisClient *c = ln->value;
10414
10415 if (dontWaitForSwappedKey(c,key)) {
10416 /* Put the client in the list of clients ready to go as we
10417 * loaded all the keys about it. */
10418 listAddNodeTail(server.io_ready_clients,c);
10419 }
10420 }
10421 }
10422
10423 /* =========================== Remote Configuration ========================= */
10424
10425 static void configSetCommand(redisClient *c) {
10426 robj *o = getDecodedObject(c->argv[3]);
10427 long long ll;
10428
10429 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10430 zfree(server.dbfilename);
10431 server.dbfilename = zstrdup(o->ptr);
10432 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10433 zfree(server.requirepass);
10434 server.requirepass = zstrdup(o->ptr);
10435 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10436 zfree(server.masterauth);
10437 server.masterauth = zstrdup(o->ptr);
10438 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10439 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10440 ll < 0) goto badfmt;
10441 server.maxmemory = ll;
10442 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10443 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10444 ll < 0 || ll > LONG_MAX) goto badfmt;
10445 server.maxidletime = ll;
10446 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10447 if (!strcasecmp(o->ptr,"no")) {
10448 server.appendfsync = APPENDFSYNC_NO;
10449 } else if (!strcasecmp(o->ptr,"everysec")) {
10450 server.appendfsync = APPENDFSYNC_EVERYSEC;
10451 } else if (!strcasecmp(o->ptr,"always")) {
10452 server.appendfsync = APPENDFSYNC_ALWAYS;
10453 } else {
10454 goto badfmt;
10455 }
10456 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10457 int yn = yesnotoi(o->ptr);
10458
10459 if (yn == -1) goto badfmt;
10460 server.no_appendfsync_on_rewrite = yn;
10461 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10462 int old = server.appendonly;
10463 int new = yesnotoi(o->ptr);
10464
10465 if (new == -1) goto badfmt;
10466 if (old != new) {
10467 if (new == 0) {
10468 stopAppendOnly();
10469 } else {
10470 if (startAppendOnly() == REDIS_ERR) {
10471 addReplySds(c,sdscatprintf(sdsempty(),
10472 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10473 decrRefCount(o);
10474 return;
10475 }
10476 }
10477 }
10478 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10479 int vlen, j;
10480 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10481
10482 /* Perform sanity check before setting the new config:
10483 * - Even number of args
10484 * - Seconds >= 1, changes >= 0 */
10485 if (vlen & 1) {
10486 sdsfreesplitres(v,vlen);
10487 goto badfmt;
10488 }
10489 for (j = 0; j < vlen; j++) {
10490 char *eptr;
10491 long val;
10492
10493 val = strtoll(v[j], &eptr, 10);
10494 if (eptr[0] != '\0' ||
10495 ((j & 1) == 0 && val < 1) ||
10496 ((j & 1) == 1 && val < 0)) {
10497 sdsfreesplitres(v,vlen);
10498 goto badfmt;
10499 }
10500 }
10501 /* Finally set the new config */
10502 resetServerSaveParams();
10503 for (j = 0; j < vlen; j += 2) {
10504 time_t seconds;
10505 int changes;
10506
10507 seconds = strtoll(v[j],NULL,10);
10508 changes = strtoll(v[j+1],NULL,10);
10509 appendServerSaveParams(seconds, changes);
10510 }
10511 sdsfreesplitres(v,vlen);
10512 } else {
10513 addReplySds(c,sdscatprintf(sdsempty(),
10514 "-ERR not supported CONFIG parameter %s\r\n",
10515 (char*)c->argv[2]->ptr));
10516 decrRefCount(o);
10517 return;
10518 }
10519 decrRefCount(o);
10520 addReply(c,shared.ok);
10521 return;
10522
10523 badfmt: /* Bad format errors */
10524 addReplySds(c,sdscatprintf(sdsempty(),
10525 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10526 (char*)o->ptr,
10527 (char*)c->argv[2]->ptr));
10528 decrRefCount(o);
10529 }
10530
10531 static void configGetCommand(redisClient *c) {
10532 robj *o = getDecodedObject(c->argv[2]);
10533 robj *lenobj = createObject(REDIS_STRING,NULL);
10534 char *pattern = o->ptr;
10535 int matches = 0;
10536
10537 addReply(c,lenobj);
10538 decrRefCount(lenobj);
10539
10540 if (stringmatch(pattern,"dbfilename",0)) {
10541 addReplyBulkCString(c,"dbfilename");
10542 addReplyBulkCString(c,server.dbfilename);
10543 matches++;
10544 }
10545 if (stringmatch(pattern,"requirepass",0)) {
10546 addReplyBulkCString(c,"requirepass");
10547 addReplyBulkCString(c,server.requirepass);
10548 matches++;
10549 }
10550 if (stringmatch(pattern,"masterauth",0)) {
10551 addReplyBulkCString(c,"masterauth");
10552 addReplyBulkCString(c,server.masterauth);
10553 matches++;
10554 }
10555 if (stringmatch(pattern,"maxmemory",0)) {
10556 char buf[128];
10557
10558 ll2string(buf,128,server.maxmemory);
10559 addReplyBulkCString(c,"maxmemory");
10560 addReplyBulkCString(c,buf);
10561 matches++;
10562 }
10563 if (stringmatch(pattern,"timeout",0)) {
10564 char buf[128];
10565
10566 ll2string(buf,128,server.maxidletime);
10567 addReplyBulkCString(c,"timeout");
10568 addReplyBulkCString(c,buf);
10569 matches++;
10570 }
10571 if (stringmatch(pattern,"appendonly",0)) {
10572 addReplyBulkCString(c,"appendonly");
10573 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10574 matches++;
10575 }
10576 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10577 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10578 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10579 matches++;
10580 }
10581 if (stringmatch(pattern,"appendfsync",0)) {
10582 char *policy;
10583
10584 switch(server.appendfsync) {
10585 case APPENDFSYNC_NO: policy = "no"; break;
10586 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10587 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10588 default: policy = "unknown"; break; /* too harmless to panic */
10589 }
10590 addReplyBulkCString(c,"appendfsync");
10591 addReplyBulkCString(c,policy);
10592 matches++;
10593 }
10594 if (stringmatch(pattern,"save",0)) {
10595 sds buf = sdsempty();
10596 int j;
10597
10598 for (j = 0; j < server.saveparamslen; j++) {
10599 buf = sdscatprintf(buf,"%ld %d",
10600 server.saveparams[j].seconds,
10601 server.saveparams[j].changes);
10602 if (j != server.saveparamslen-1)
10603 buf = sdscatlen(buf," ",1);
10604 }
10605 addReplyBulkCString(c,"save");
10606 addReplyBulkCString(c,buf);
10607 sdsfree(buf);
10608 matches++;
10609 }
10610 decrRefCount(o);
10611 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10612 }
10613
10614 static void configCommand(redisClient *c) {
10615 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10616 if (c->argc != 4) goto badarity;
10617 configSetCommand(c);
10618 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10619 if (c->argc != 3) goto badarity;
10620 configGetCommand(c);
10621 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10622 if (c->argc != 2) goto badarity;
10623 server.stat_numcommands = 0;
10624 server.stat_numconnections = 0;
10625 server.stat_expiredkeys = 0;
10626 server.stat_starttime = time(NULL);
10627 addReply(c,shared.ok);
10628 } else {
10629 addReplySds(c,sdscatprintf(sdsempty(),
10630 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10631 }
10632 return;
10633
10634 badarity:
10635 addReplySds(c,sdscatprintf(sdsempty(),
10636 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10637 (char*) c->argv[1]->ptr));
10638 }
10639
10640 /* =========================== Pubsub implementation ======================== */
10641
10642 static void freePubsubPattern(void *p) {
10643 pubsubPattern *pat = p;
10644
10645 decrRefCount(pat->pattern);
10646 zfree(pat);
10647 }
10648
10649 static int listMatchPubsubPattern(void *a, void *b) {
10650 pubsubPattern *pa = a, *pb = b;
10651
10652 return (pa->client == pb->client) &&
10653 (equalStringObjects(pa->pattern,pb->pattern));
10654 }
10655
10656 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10657 * 0 if the client was already subscribed to that channel. */
10658 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10659 struct dictEntry *de;
10660 list *clients = NULL;
10661 int retval = 0;
10662
10663 /* Add the channel to the client -> channels hash table */
10664 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10665 retval = 1;
10666 incrRefCount(channel);
10667 /* Add the client to the channel -> list of clients hash table */
10668 de = dictFind(server.pubsub_channels,channel);
10669 if (de == NULL) {
10670 clients = listCreate();
10671 dictAdd(server.pubsub_channels,channel,clients);
10672 incrRefCount(channel);
10673 } else {
10674 clients = dictGetEntryVal(de);
10675 }
10676 listAddNodeTail(clients,c);
10677 }
10678 /* Notify the client */
10679 addReply(c,shared.mbulk3);
10680 addReply(c,shared.subscribebulk);
10681 addReplyBulk(c,channel);
10682 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10683 return retval;
10684 }
10685
10686 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10687 * 0 if the client was not subscribed to the specified channel. */
10688 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10689 struct dictEntry *de;
10690 list *clients;
10691 listNode *ln;
10692 int retval = 0;
10693
10694 /* Remove the channel from the client -> channels hash table */
10695 incrRefCount(channel); /* channel may be just a pointer to the same object
10696 we have in the hash tables. Protect it... */
10697 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10698 retval = 1;
10699 /* Remove the client from the channel -> clients list hash table */
10700 de = dictFind(server.pubsub_channels,channel);
10701 assert(de != NULL);
10702 clients = dictGetEntryVal(de);
10703 ln = listSearchKey(clients,c);
10704 assert(ln != NULL);
10705 listDelNode(clients,ln);
10706 if (listLength(clients) == 0) {
10707 /* Free the list and associated hash entry at all if this was
10708 * the latest client, so that it will be possible to abuse
10709 * Redis PUBSUB creating millions of channels. */
10710 dictDelete(server.pubsub_channels,channel);
10711 }
10712 }
10713 /* Notify the client */
10714 if (notify) {
10715 addReply(c,shared.mbulk3);
10716 addReply(c,shared.unsubscribebulk);
10717 addReplyBulk(c,channel);
10718 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10719 listLength(c->pubsub_patterns));
10720
10721 }
10722 decrRefCount(channel); /* it is finally safe to release it */
10723 return retval;
10724 }
10725
10726 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10727 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10728 int retval = 0;
10729
10730 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10731 retval = 1;
10732 pubsubPattern *pat;
10733 listAddNodeTail(c->pubsub_patterns,pattern);
10734 incrRefCount(pattern);
10735 pat = zmalloc(sizeof(*pat));
10736 pat->pattern = getDecodedObject(pattern);
10737 pat->client = c;
10738 listAddNodeTail(server.pubsub_patterns,pat);
10739 }
10740 /* Notify the client */
10741 addReply(c,shared.mbulk3);
10742 addReply(c,shared.psubscribebulk);
10743 addReplyBulk(c,pattern);
10744 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10745 return retval;
10746 }
10747
10748 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10749 * 0 if the client was not subscribed to the specified channel. */
10750 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10751 listNode *ln;
10752 pubsubPattern pat;
10753 int retval = 0;
10754
10755 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10756 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10757 retval = 1;
10758 listDelNode(c->pubsub_patterns,ln);
10759 pat.client = c;
10760 pat.pattern = pattern;
10761 ln = listSearchKey(server.pubsub_patterns,&pat);
10762 listDelNode(server.pubsub_patterns,ln);
10763 }
10764 /* Notify the client */
10765 if (notify) {
10766 addReply(c,shared.mbulk3);
10767 addReply(c,shared.punsubscribebulk);
10768 addReplyBulk(c,pattern);
10769 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10770 listLength(c->pubsub_patterns));
10771 }
10772 decrRefCount(pattern);
10773 return retval;
10774 }
10775
10776 /* Unsubscribe from all the channels. Return the number of channels the
10777 * client was subscribed from. */
10778 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10779 dictIterator *di = dictGetIterator(c->pubsub_channels);
10780 dictEntry *de;
10781 int count = 0;
10782
10783 while((de = dictNext(di)) != NULL) {
10784 robj *channel = dictGetEntryKey(de);
10785
10786 count += pubsubUnsubscribeChannel(c,channel,notify);
10787 }
10788 dictReleaseIterator(di);
10789 return count;
10790 }
10791
10792 /* Unsubscribe from all the patterns. Return the number of patterns the
10793 * client was subscribed from. */
10794 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10795 listNode *ln;
10796 listIter li;
10797 int count = 0;
10798
10799 listRewind(c->pubsub_patterns,&li);
10800 while ((ln = listNext(&li)) != NULL) {
10801 robj *pattern = ln->value;
10802
10803 count += pubsubUnsubscribePattern(c,pattern,notify);
10804 }
10805 return count;
10806 }
10807
10808 /* Publish a message */
10809 static int pubsubPublishMessage(robj *channel, robj *message) {
10810 int receivers = 0;
10811 struct dictEntry *de;
10812 listNode *ln;
10813 listIter li;
10814
10815 /* Send to clients listening for that channel */
10816 de = dictFind(server.pubsub_channels,channel);
10817 if (de) {
10818 list *list = dictGetEntryVal(de);
10819 listNode *ln;
10820 listIter li;
10821
10822 listRewind(list,&li);
10823 while ((ln = listNext(&li)) != NULL) {
10824 redisClient *c = ln->value;
10825
10826 addReply(c,shared.mbulk3);
10827 addReply(c,shared.messagebulk);
10828 addReplyBulk(c,channel);
10829 addReplyBulk(c,message);
10830 receivers++;
10831 }
10832 }
10833 /* Send to clients listening to matching channels */
10834 if (listLength(server.pubsub_patterns)) {
10835 listRewind(server.pubsub_patterns,&li);
10836 channel = getDecodedObject(channel);
10837 while ((ln = listNext(&li)) != NULL) {
10838 pubsubPattern *pat = ln->value;
10839
10840 if (stringmatchlen((char*)pat->pattern->ptr,
10841 sdslen(pat->pattern->ptr),
10842 (char*)channel->ptr,
10843 sdslen(channel->ptr),0)) {
10844 addReply(pat->client,shared.mbulk4);
10845 addReply(pat->client,shared.pmessagebulk);
10846 addReplyBulk(pat->client,pat->pattern);
10847 addReplyBulk(pat->client,channel);
10848 addReplyBulk(pat->client,message);
10849 receivers++;
10850 }
10851 }
10852 decrRefCount(channel);
10853 }
10854 return receivers;
10855 }
10856
10857 static void subscribeCommand(redisClient *c) {
10858 int j;
10859
10860 for (j = 1; j < c->argc; j++)
10861 pubsubSubscribeChannel(c,c->argv[j]);
10862 }
10863
10864 static void unsubscribeCommand(redisClient *c) {
10865 if (c->argc == 1) {
10866 pubsubUnsubscribeAllChannels(c,1);
10867 return;
10868 } else {
10869 int j;
10870
10871 for (j = 1; j < c->argc; j++)
10872 pubsubUnsubscribeChannel(c,c->argv[j],1);
10873 }
10874 }
10875
10876 static void psubscribeCommand(redisClient *c) {
10877 int j;
10878
10879 for (j = 1; j < c->argc; j++)
10880 pubsubSubscribePattern(c,c->argv[j]);
10881 }
10882
10883 static void punsubscribeCommand(redisClient *c) {
10884 if (c->argc == 1) {
10885 pubsubUnsubscribeAllPatterns(c,1);
10886 return;
10887 } else {
10888 int j;
10889
10890 for (j = 1; j < c->argc; j++)
10891 pubsubUnsubscribePattern(c,c->argv[j],1);
10892 }
10893 }
10894
10895 static void publishCommand(redisClient *c) {
10896 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10897 addReplyLongLong(c,receivers);
10898 }
10899
10900 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10901 *
10902 * The implementation uses a per-DB hash table mapping keys to list of clients
10903 * WATCHing those keys, so that given a key that is going to be modified
10904 * we can mark all the associated clients as dirty.
10905 *
10906 * Also every client contains a list of WATCHed keys so that's possible to
10907 * un-watch such keys when the client is freed or when UNWATCH is called. */
10908
/* Element type of the client->watched_keys list. A watchedKey structure is
 * needed because in order to identify a key in Redis we need both the key
 * name and the DB. */
typedef struct watchedKey {
    robj *key;      /* Name of the watched key */
    redisDb *db;    /* Database the key was watched in */
} watchedKey;
10916
10917 /* Watch for the specified key */
10918 static void watchForKey(redisClient *c, robj *key) {
10919 list *clients = NULL;
10920 listIter li;
10921 listNode *ln;
10922 watchedKey *wk;
10923
10924 /* Check if we are already watching for this key */
10925 listRewind(c->watched_keys,&li);
10926 while((ln = listNext(&li))) {
10927 wk = listNodeValue(ln);
10928 if (wk->db == c->db && equalStringObjects(key,wk->key))
10929 return; /* Key already watched */
10930 }
10931 /* This key is not already watched in this DB. Let's add it */
10932 clients = dictFetchValue(c->db->watched_keys,key);
10933 if (!clients) {
10934 clients = listCreate();
10935 dictAdd(c->db->watched_keys,key,clients);
10936 incrRefCount(key);
10937 }
10938 listAddNodeTail(clients,c);
10939 /* Add the new key to the lits of keys watched by this client */
10940 wk = zmalloc(sizeof(*wk));
10941 wk->key = key;
10942 wk->db = c->db;
10943 incrRefCount(key);
10944 listAddNodeTail(c->watched_keys,wk);
10945 }
10946
10947 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10948 * flag is up to the caller. */
10949 static void unwatchAllKeys(redisClient *c) {
10950 listIter li;
10951 listNode *ln;
10952
10953 if (listLength(c->watched_keys) == 0) return;
10954 listRewind(c->watched_keys,&li);
10955 while((ln = listNext(&li))) {
10956 list *clients;
10957 watchedKey *wk;
10958
10959 /* Lookup the watched key -> clients list and remove the client
10960 * from the list */
10961 wk = listNodeValue(ln);
10962 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10963 assert(clients != NULL);
10964 listDelNode(clients,listSearchKey(clients,c));
10965 /* Kill the entry at all if this was the only client */
10966 if (listLength(clients) == 0)
10967 dictDelete(wk->db->watched_keys, wk->key);
10968 /* Remove this watched key from the client->watched list */
10969 listDelNode(c->watched_keys,ln);
10970 decrRefCount(wk->key);
10971 zfree(wk);
10972 }
10973 }
10974
10975 /* "Touch" a key, so that if this key is being WATCHed by some client the
10976 * next EXEC will fail. */
10977 static void touchWatchedKey(redisDb *db, robj *key) {
10978 list *clients;
10979 listIter li;
10980 listNode *ln;
10981
10982 if (dictSize(db->watched_keys) == 0) return;
10983 clients = dictFetchValue(db->watched_keys, key);
10984 if (!clients) return;
10985
10986 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10987 /* Check if we are already watching for this key */
10988 listRewind(clients,&li);
10989 while((ln = listNext(&li))) {
10990 redisClient *c = listNodeValue(ln);
10991
10992 c->flags |= REDIS_DIRTY_CAS;
10993 }
10994 }
10995
10996 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10997 * flush but will be deleted as effect of the flushing operation should
10998 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10999 * a FLUSHALL operation (all the DBs flushed). */
11000 static void touchWatchedKeysOnFlush(int dbid) {
11001 listIter li1, li2;
11002 listNode *ln;
11003
11004 /* For every client, check all the waited keys */
11005 listRewind(server.clients,&li1);
11006 while((ln = listNext(&li1))) {
11007 redisClient *c = listNodeValue(ln);
11008 listRewind(c->watched_keys,&li2);
11009 while((ln = listNext(&li2))) {
11010 watchedKey *wk = listNodeValue(ln);
11011
11012 /* For every watched key matching the specified DB, if the
11013 * key exists, mark the client as dirty, as the key will be
11014 * removed. */
11015 if (dbid == -1 || wk->db->id == dbid) {
11016 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
11017 c->flags |= REDIS_DIRTY_CAS;
11018 }
11019 }
11020 }
11021 }
11022
11023 static void watchCommand(redisClient *c) {
11024 int j;
11025
11026 if (c->flags & REDIS_MULTI) {
11027 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11028 return;
11029 }
11030 for (j = 1; j < c->argc; j++)
11031 watchForKey(c,c->argv[j]);
11032 addReply(c,shared.ok);
11033 }
11034
11035 static void unwatchCommand(redisClient *c) {
11036 unwatchAllKeys(c);
11037 c->flags &= (~REDIS_DIRTY_CAS);
11038 addReply(c,shared.ok);
11039 }
11040
11041 /* ================================= Debugging ============================== */
11042
11043 /* Compute the sha1 of string at 's' with 'len' bytes long.
11044 * The SHA1 is then xored againt the string pointed by digest.
11045 * Since xor is commutative, this operation is used in order to
11046 * "add" digests relative to unordered elements.
11047 *
11048 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11049 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11050 SHA1_CTX ctx;
11051 unsigned char hash[20], *s = ptr;
11052 int j;
11053
11054 SHA1Init(&ctx);
11055 SHA1Update(&ctx,s,len);
11056 SHA1Final(hash,&ctx);
11057
11058 for (j = 0; j < 20; j++)
11059 digest[j] ^= hash[j];
11060 }
11061
11062 static void xorObjectDigest(unsigned char *digest, robj *o) {
11063 o = getDecodedObject(o);
11064 xorDigest(digest,o->ptr,sdslen(o->ptr));
11065 decrRefCount(o);
11066 }
11067
11068 /* This function instead of just computing the SHA1 and xoring it
11069 * against diget, also perform the digest of "digest" itself and
11070 * replace the old value with the new one.
11071 *
11072 * So the final digest will be:
11073 *
11074 * digest = SHA1(digest xor SHA1(data))
11075 *
11076 * This function is used every time we want to preserve the order so
11077 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11078 *
11079 * Also note that mixdigest("foo") followed by mixdigest("bar")
11080 * will lead to a different digest compared to "fo", "obar".
11081 */
11082 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11083 SHA1_CTX ctx;
11084 char *s = ptr;
11085
11086 xorDigest(digest,s,len);
11087 SHA1Init(&ctx);
11088 SHA1Update(&ctx,digest,20);
11089 SHA1Final(digest,&ctx);
11090 }
11091
11092 static void mixObjectDigest(unsigned char *digest, robj *o) {
11093 o = getDecodedObject(o);
11094 mixDigest(digest,o->ptr,sdslen(o->ptr));
11095 decrRefCount(o);
11096 }
11097
11098 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11099 * are not ordered, we use a trick: every aggregate digest is the xor
11100 * of the digests of their elements. This way the order will not change
11101 * the result. For list instead we use a feedback entering the output digest
11102 * as input in order to ensure that a different ordered list will result in
11103 * a different digest. */
11104 static void computeDatasetDigest(unsigned char *final) {
11105 unsigned char digest[20];
11106 char buf[128];
11107 dictIterator *di = NULL;
11108 dictEntry *de;
11109 int j;
11110 uint32_t aux;
11111
11112 memset(final,0,20); /* Start with a clean result */
11113
11114 for (j = 0; j < server.dbnum; j++) {
11115 redisDb *db = server.db+j;
11116
11117 if (dictSize(db->dict) == 0) continue;
11118 di = dictGetIterator(db->dict);
11119
11120 /* hash the DB id, so the same dataset moved in a different
11121 * DB will lead to a different digest */
11122 aux = htonl(j);
11123 mixDigest(final,&aux,sizeof(aux));
11124
11125 /* Iterate this DB writing every entry */
11126 while((de = dictNext(di)) != NULL) {
11127 sds key;
11128 robj *keyobj, *o;
11129 time_t expiretime;
11130
11131 memset(digest,0,20); /* This key-val digest */
11132 key = dictGetEntryKey(de);
11133 keyobj = createStringObject(key,sdslen(key));
11134
11135 mixDigest(digest,key,sdslen(key));
11136
11137 /* Make sure the key is loaded if VM is active */
11138 o = lookupKeyRead(db,keyobj);
11139
11140 aux = htonl(o->type);
11141 mixDigest(digest,&aux,sizeof(aux));
11142 expiretime = getExpire(db,keyobj);
11143
11144 /* Save the key and associated value */
11145 if (o->type == REDIS_STRING) {
11146 mixObjectDigest(digest,o);
11147 } else if (o->type == REDIS_LIST) {
11148 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11149 listTypeEntry entry;
11150 while(listTypeNext(li,&entry)) {
11151 robj *eleobj = listTypeGet(&entry);
11152 mixObjectDigest(digest,eleobj);
11153 decrRefCount(eleobj);
11154 }
11155 listTypeReleaseIterator(li);
11156 } else if (o->type == REDIS_SET) {
11157 dict *set = o->ptr;
11158 dictIterator *di = dictGetIterator(set);
11159 dictEntry *de;
11160
11161 while((de = dictNext(di)) != NULL) {
11162 robj *eleobj = dictGetEntryKey(de);
11163
11164 xorObjectDigest(digest,eleobj);
11165 }
11166 dictReleaseIterator(di);
11167 } else if (o->type == REDIS_ZSET) {
11168 zset *zs = o->ptr;
11169 dictIterator *di = dictGetIterator(zs->dict);
11170 dictEntry *de;
11171
11172 while((de = dictNext(di)) != NULL) {
11173 robj *eleobj = dictGetEntryKey(de);
11174 double *score = dictGetEntryVal(de);
11175 unsigned char eledigest[20];
11176
11177 snprintf(buf,sizeof(buf),"%.17g",*score);
11178 memset(eledigest,0,20);
11179 mixObjectDigest(eledigest,eleobj);
11180 mixDigest(eledigest,buf,strlen(buf));
11181 xorDigest(digest,eledigest,20);
11182 }
11183 dictReleaseIterator(di);
11184 } else if (o->type == REDIS_HASH) {
11185 hashTypeIterator *hi;
11186 robj *obj;
11187
11188 hi = hashTypeInitIterator(o);
11189 while (hashTypeNext(hi) != REDIS_ERR) {
11190 unsigned char eledigest[20];
11191
11192 memset(eledigest,0,20);
11193 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
11194 mixObjectDigest(eledigest,obj);
11195 decrRefCount(obj);
11196 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
11197 mixObjectDigest(eledigest,obj);
11198 decrRefCount(obj);
11199 xorDigest(digest,eledigest,20);
11200 }
11201 hashTypeReleaseIterator(hi);
11202 } else {
11203 redisPanic("Unknown object type");
11204 }
11205 /* If the key has an expire, add it to the mix */
11206 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11207 /* We can finally xor the key-val digest to the final digest */
11208 xorDigest(final,digest,20);
11209 decrRefCount(keyobj);
11210 }
11211 dictReleaseIterator(di);
11212 }
11213 }
11214
11215 static void debugCommand(redisClient *c) {
11216 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11217 *((char*)-1) = 'x';
11218 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11219 if (rdbSave(server.dbfilename) != REDIS_OK) {
11220 addReply(c,shared.err);
11221 return;
11222 }
11223 emptyDb();
11224 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11225 addReply(c,shared.err);
11226 return;
11227 }
11228 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11229 addReply(c,shared.ok);
11230 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11231 emptyDb();
11232 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11233 addReply(c,shared.err);
11234 return;
11235 }
11236 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11237 addReply(c,shared.ok);
11238 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11239 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11240 robj *val;
11241
11242 if (!de) {
11243 addReply(c,shared.nokeyerr);
11244 return;
11245 }
11246 val = dictGetEntryVal(de);
11247 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11248 val->storage == REDIS_VM_SWAPPING)) {
11249 char *strenc;
11250 char buf[128];
11251
11252 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11253 strenc = strencoding[val->encoding];
11254 } else {
11255 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11256 strenc = buf;
11257 }
11258 addReplySds(c,sdscatprintf(sdsempty(),
11259 "+Value at:%p refcount:%d "
11260 "encoding:%s serializedlength:%lld\r\n",
11261 (void*)val, val->refcount,
11262 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11263 } else {
11264 vmpointer *vp = (vmpointer*) val;
11265 addReplySds(c,sdscatprintf(sdsempty(),
11266 "+Value swapped at: page %llu "
11267 "using %llu pages\r\n",
11268 (unsigned long long) vp->page,
11269 (unsigned long long) vp->usedpages));
11270 }
11271 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11272 lookupKeyRead(c->db,c->argv[2]);
11273 addReply(c,shared.ok);
11274 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11275 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11276 robj *val;
11277 vmpointer *vp;
11278
11279 if (!server.vm_enabled) {
11280 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11281 return;
11282 }
11283 if (!de) {
11284 addReply(c,shared.nokeyerr);
11285 return;
11286 }
11287 val = dictGetEntryVal(de);
11288 /* Swap it */
11289 if (val->storage != REDIS_VM_MEMORY) {
11290 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11291 } else if (val->refcount != 1) {
11292 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11293 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11294 dictGetEntryVal(de) = vp;
11295 addReply(c,shared.ok);
11296 } else {
11297 addReply(c,shared.err);
11298 }
11299 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11300 long keys, j;
11301 robj *key, *val;
11302 char buf[128];
11303
11304 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11305 return;
11306 for (j = 0; j < keys; j++) {
11307 snprintf(buf,sizeof(buf),"key:%lu",j);
11308 key = createStringObject(buf,strlen(buf));
11309 if (lookupKeyRead(c->db,key) != NULL) {
11310 decrRefCount(key);
11311 continue;
11312 }
11313 snprintf(buf,sizeof(buf),"value:%lu",j);
11314 val = createStringObject(buf,strlen(buf));
11315 dbAdd(c->db,key,val);
11316 decrRefCount(key);
11317 }
11318 addReply(c,shared.ok);
11319 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11320 unsigned char digest[20];
11321 sds d = sdsnew("+");
11322 int j;
11323
11324 computeDatasetDigest(digest);
11325 for (j = 0; j < 20; j++)
11326 d = sdscatprintf(d, "%02x",digest[j]);
11327
11328 d = sdscatlen(d,"\r\n",2);
11329 addReplySds(c,d);
11330 } else {
11331 addReplySds(c,sdsnew(
11332 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11333 }
11334 }
11335
11336 static void _redisAssert(char *estr, char *file, int line) {
11337 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11338 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11339 #ifdef HAVE_BACKTRACE
11340 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11341 *((char*)-1) = 'x';
11342 #endif
11343 }
11344
11345 static void _redisPanic(char *msg, char *file, int line) {
11346 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11347 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11348 #ifdef HAVE_BACKTRACE
11349 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11350 *((char*)-1) = 'x';
11351 #endif
11352 }
11353
11354 /* =================================== Main! ================================ */
11355
11356 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi().
 *
 * Returns -1 if the file cannot be opened or no line can be read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;
    if (fgets(line,sizeof(line),procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
11370
11371 void linuxOvercommitMemoryWarning(void) {
11372 if (linuxOvercommitMemoryValue() == 0) {
11373 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11374 }
11375 }
11376 #endif /* __linux__ */
11377
11378 static void daemonize(void) {
11379 int fd;
11380 FILE *fp;
11381
11382 if (fork() != 0) exit(0); /* parent exits */
11383 setsid(); /* create a new session */
11384
11385 /* Every output goes to /dev/null. If Redis is daemonized but
11386 * the 'logfile' is set to 'stdout' in the configuration file
11387 * it will not log at all. */
11388 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11389 dup2(fd, STDIN_FILENO);
11390 dup2(fd, STDOUT_FILENO);
11391 dup2(fd, STDERR_FILENO);
11392 if (fd > STDERR_FILENO) close(fd);
11393 }
11394 /* Try to write the pid file */
11395 fp = fopen(server.pidfile,"w");
11396 if (fp) {
11397 fprintf(fp,"%d\n",getpid());
11398 fclose(fp);
11399 }
11400 }
11401
11402 static void version() {
11403 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11404 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11405 exit(0);
11406 }
11407
/* Print the command line synopsis on stderr and terminate the process
 * with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11413
11414 int main(int argc, char **argv) {
11415 time_t start;
11416
11417 initServerConfig();
11418 sortCommandTable();
11419 if (argc == 2) {
11420 if (strcmp(argv[1], "-v") == 0 ||
11421 strcmp(argv[1], "--version") == 0) version();
11422 if (strcmp(argv[1], "--help") == 0) usage();
11423 resetServerSaveParams();
11424 loadServerConfig(argv[1]);
11425 } else if ((argc > 2)) {
11426 usage();
11427 } else {
11428 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11429 }
11430 if (server.daemonize) daemonize();
11431 initServer();
11432 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11433 #ifdef __linux__
11434 linuxOvercommitMemoryWarning();
11435 #endif
11436 start = time(NULL);
11437 if (server.appendonly) {
11438 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11439 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11440 } else {
11441 if (rdbLoad(server.dbfilename) == REDIS_OK)
11442 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11443 }
11444 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11445 aeSetBeforeSleepProc(server.el,beforeSleep);
11446 aeMain(server.el);
11447 aeDeleteEventLoop(server.el);
11448 return 0;
11449 }
11450
11451 /* ============================= Backtrace support ========================= */
11452
11453 #ifdef HAVE_BACKTRACE
11454 static char *findFuncName(void *pointer, unsigned long *offset);
11455
11456 static void *getMcontextEip(ucontext_t *uc) {
11457 #if defined(__FreeBSD__)
11458 return (void*) uc->uc_mcontext.mc_eip;
11459 #elif defined(__dietlibc__)
11460 return (void*) uc->uc_mcontext.eip;
11461 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11462 #if __x86_64__
11463 return (void*) uc->uc_mcontext->__ss.__rip;
11464 #else
11465 return (void*) uc->uc_mcontext->__ss.__eip;
11466 #endif
11467 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11468 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11469 return (void*) uc->uc_mcontext->__ss.__rip;
11470 #else
11471 return (void*) uc->uc_mcontext->__ss.__eip;
11472 #endif
11473 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11474 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11475 #elif defined(__ia64__) /* Linux IA64 */
11476 return (void*) uc->uc_mcontext.sc_ip;
11477 #else
11478 return NULL;
11479 #endif
11480 }
11481
11482 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11483 void *trace[100];
11484 char **messages = NULL;
11485 int i, trace_size = 0;
11486 unsigned long offset=0;
11487 ucontext_t *uc = (ucontext_t*) secret;
11488 sds infostring;
11489 REDIS_NOTUSED(info);
11490
11491 redisLog(REDIS_WARNING,
11492 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11493 infostring = genRedisInfoString();
11494 redisLog(REDIS_WARNING, "%s",infostring);
11495 /* It's not safe to sdsfree() the returned string under memory
11496 * corruption conditions. Let it leak as we are going to abort */
11497
11498 trace_size = backtrace(trace, 100);
11499 /* overwrite sigaction with caller's address */
11500 if (getMcontextEip(uc) != NULL) {
11501 trace[1] = getMcontextEip(uc);
11502 }
11503 messages = backtrace_symbols(trace, trace_size);
11504
11505 for (i=1; i<trace_size; ++i) {
11506 char *fn = findFuncName(trace[i], &offset), *p;
11507
11508 p = strchr(messages[i],'+');
11509 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11510 redisLog(REDIS_WARNING,"%s", messages[i]);
11511 } else {
11512 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11513 }
11514 }
11515 /* free(messages); Don't call free() with possibly corrupted memory. */
11516 _exit(0);
11517 }
11518
11519 static void sigtermHandler(int sig) {
11520 REDIS_NOTUSED(sig);
11521
11522 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11523 server.shutdown_asap = 1;
11524 }
11525
11526 static void setupSigSegvAction(void) {
11527 struct sigaction act;
11528
11529 sigemptyset (&act.sa_mask);
11530 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11531 * is used. Otherwise, sa_handler is used */
11532 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11533 act.sa_sigaction = segvHandler;
11534 sigaction (SIGSEGV, &act, NULL);
11535 sigaction (SIGBUS, &act, NULL);
11536 sigaction (SIGFPE, &act, NULL);
11537 sigaction (SIGILL, &act, NULL);
11538 sigaction (SIGBUS, &act, NULL);
11539
11540 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11541 act.sa_handler = sigtermHandler;
11542 sigaction (SIGTERM, &act, NULL);
11543 return;
11544 }
11545
11546 #include "staticsymbols.h"
11547 /* This function try to convert a pointer into a function name. It's used in
11548 * oreder to provide a backtrace under segmentation fault that's able to
11549 * display functions declared as static (otherwise the backtrace is useless). */
11550 static char *findFuncName(void *pointer, unsigned long *offset){
11551 int i, ret = -1;
11552 unsigned long off, minoff = 0;
11553
11554 /* Try to match against the Symbol with the smallest offset */
11555 for (i=0; symsTable[i].pointer; i++) {
11556 unsigned long lp = (unsigned long) pointer;
11557
11558 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11559 off=lp-symsTable[i].pointer;
11560 if (ret < 0 || off < minoff) {
11561 minoff=off;
11562 ret=i;
11563 }
11564 }
11565 }
11566 if (ret == -1) return NULL;
11567 *offset = minoff;
11568 return symsTable[ret].name;
11569 }
11570 #else /* HAVE_BACKTRACE */
11571 static void setupSigSegvAction(void) {
11572 }
11573 #endif /* HAVE_BACKTRACE */
11574
11575
11576
11577 /* The End */
11578
11579
11580