]> git.saurik.com Git - redis.git/blob - redis.c
0d6648eeb82b389fd6e14ee87977d35e5603a8a1
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "intset.h" /* Compact integer set structure */
80 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
81 #include "release.h" /* Release and/or git repository information */
82
83 /* Error codes */
84 #define REDIS_OK 0
85 #define REDIS_ERR -1
86
87 /* Static server configuration */
88 #define REDIS_SERVERPORT 6379 /* TCP port */
89 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
90 #define REDIS_IOBUF_LEN 1024
91 #define REDIS_LOADBUF_LEN 1024
92 #define REDIS_STATIC_ARGS 8
93 #define REDIS_DEFAULT_DBNUM 16
94 #define REDIS_CONFIGLINE_MAX 1024
95 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
96 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
97 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
98 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
99 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
100
101 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
102 #define REDIS_WRITEV_THRESHOLD 3
103 /* Max number of iovecs used for each writev call */
104 #define REDIS_WRITEV_IOVEC_COUNT 256
105
106 /* Hash table parameters */
107 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
108
109 /* Command flags */
110 #define REDIS_CMD_BULK 1 /* Bulk write command */
111 #define REDIS_CMD_INLINE 2 /* Inline command */
112 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
113 this flag will return an error when the 'maxmemory' option is set in the
114 config file and the server is using more than maxmemory bytes of memory.
115 In short these commands are denied on low memory conditions. */
116 #define REDIS_CMD_DENYOOM 4
117 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
118
119 /* Object types */
120 #define REDIS_STRING 0
121 #define REDIS_LIST 1
122 #define REDIS_SET 2
123 #define REDIS_ZSET 3
124 #define REDIS_HASH 4
125 #define REDIS_VMPOINTER 8
126
127 /* Objects encoding. Some kind of objects like Strings and Hashes can be
128 * internally represented in multiple ways. The 'encoding' field of the object
129 * is set to one of these values for this object. */
130 #define REDIS_ENCODING_RAW 0 /* Raw representation */
131 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
132 #define REDIS_ENCODING_HT 2 /* Encoded as hash table */
133 #define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
134 #define REDIS_ENCODING_LIST 4 /* Encoded as list ("list" in strencoding), not zipmap */
135 #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
136 #define REDIS_ENCODING_INTSET 6 /* Encoded as intset */
137
138 static char* strencoding[] = { /* Encoding names, listed in REDIS_ENCODING_* value order (0..6) */
139 "raw", "int", "hashtable", "zipmap", "list", "ziplist", "intset"
140 };
141
142 /* Object types only used for dumping to disk */
143 #define REDIS_EXPIRETIME 253
144 #define REDIS_SELECTDB 254
145 #define REDIS_EOF 255
146
147 /* Defines related to the dump file format. To store 32 bits lengths for short
148 * keys requires a lot of space, so we check the most significant 2 bits of
149 * the first byte to interpret the length:
150 *
151 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
152 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
153 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
154 * 11|000000 this means: specially encoded object will follow. The six bits
155 * number specify the kind of object that follows.
156 * See the REDIS_RDB_ENC_* defines.
157 *
158 * Lengths up to 63 are stored using a single byte, most DB keys, and many
159 * values, will fit inside. */
160 #define REDIS_RDB_6BITLEN 0
161 #define REDIS_RDB_14BITLEN 1
162 #define REDIS_RDB_32BITLEN 2
163 #define REDIS_RDB_ENCVAL 3
164 #define REDIS_RDB_LENERR UINT_MAX
165
166 /* When a length of a string object stored on disk has the first two bits
167 * set, the remaining six bits specify a special encoding for the object
168 * according to the following defines: */
169 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
170 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
171 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
172 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
173
174 /* Virtual memory object->where field. */
175 #define REDIS_VM_MEMORY 0 /* The object is on memory */
176 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
177 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
178 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
179
180 /* Virtual memory static configuration stuff.
181 * Check vmFindContiguousPages() to know more about these magic numbers. */
182 #define REDIS_VM_MAX_NEAR_PAGES 65536
183 #define REDIS_VM_MAX_RANDOM_JUMP 4096
184 #define REDIS_VM_MAX_THREADS 32
185 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
186 /* The following is the *percentage* of completed I/O jobs to process when the
187 * handler is called. While Virtual Memory I/O operations are performed by
188 * threads, these operations must be processed by the main thread when completed
189 * in order to take effect. */
190 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
191
192 /* Client flags */
193 #define REDIS_SLAVE 1 /* This client is a slave server */
194 #define REDIS_MASTER 2 /* This client is a master server */
195 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
196 #define REDIS_MULTI 8 /* This client is in a MULTI context */
197 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
198 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
199 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
200
201 /* Slave replication state - slave side */
202 #define REDIS_REPL_NONE 0 /* No active replication */
203 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
204 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
205
206 /* Slave replication state - from the point of view of master
207 * Note that in SEND_BULK and ONLINE state the slave receives new updates
208 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
209 * to start the next background saving in order to send updates to it. */
210 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
211 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
212 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
213 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
214
215 /* List related stuff */
216 #define REDIS_HEAD 0
217 #define REDIS_TAIL 1
218
219 /* Sort operations */
220 #define REDIS_SORT_GET 0
221 #define REDIS_SORT_ASC 1
222 #define REDIS_SORT_DESC 2
223 #define REDIS_SORTKEY_MAX 1024
224
225 /* Log levels */
226 #define REDIS_DEBUG 0
227 #define REDIS_VERBOSE 1
228 #define REDIS_NOTICE 2
229 #define REDIS_WARNING 3
230
231 /* Anti-warning macro... */
232 #define REDIS_NOTUSED(V) ((void) V)
233
234 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
235 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
236
237 /* Append only defines */
238 #define APPENDFSYNC_NO 0
239 #define APPENDFSYNC_ALWAYS 1
240 #define APPENDFSYNC_EVERYSEC 2
241
242 /* Zip structure related defaults */
243 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
244 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
245 #define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
246 #define REDIS_LIST_MAX_ZIPLIST_VALUE 32
247 #define REDIS_SET_MAX_INTSET_ENTRIES 4096
248
249 /* We can print the stacktrace, so our assert is defined this way: both macros report through their helper and then _exit(1). Each expands to ONE parenthesized expression, so it stays intact as an operand, a function argument or an unbraced if/else body. Note that #_e stringifies the argument text as written. */
250 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
251 #define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
252 static void _redisAssert(char *estr, char *file, int line); /* estr: stringified failed expression; file/line: call site */
253 static void _redisPanic(char *msg, char *file, int line); /* msg: stringified panic argument; file/line: call site */
254
255 /*================================= Data types ============================== */
256
257 /* A redis object, that is a type able to hold a string / list / set */
258
259 /* The actual Redis Object. The four bitfields below add up to 32 bits (4+2+4+22). */
260 typedef struct redisObject {
261 unsigned type:4; /* object type, e.g. REDIS_STRING (see the "Object types" defines) */
262 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
263 unsigned encoding:4; /* one of the REDIS_ENCODING_* values */
264 unsigned lru:22; /* lru time (relative to server.lruclock) */
265 int refcount; /* reference count; initStaticStringObject() sets it to 1 */
266 void *ptr; /* payload pointer; presumably its meaning depends on type/encoding -- confirm */
267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. NOTE(review): no field is declared after 'ptr' here, so this paragraph looks stale -- confirm. */
271 } robj;
272
273 /* The VM pointer structure - identifies an object in the swap file.
274 *
275 * This object is stored in place of the value
276 * object in the main key->value hash table representing a database.
277 * Note that the first fields (type, storage) are the same as the redisObject
278 * structure so that vmPointer structures can be accessed even when cast
279 * to redisObject structures.
280 *
281 * This is useful as we don't know if a value object is or not on disk, but we
282 * are always able to read obj->storage to check this. For vmPointer
283 * structures "type" is set to REDIS_VMPOINTER (even if without this field
284 * is still possible to check the kind of object from the value of 'storage').*/
285 typedef struct vmPointer {
286 unsigned type:4; /* same position and width as redisObject.type */
287 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
288 unsigned notused:26; /* fills the bitfield group up to 32 bits (4+2+26) */
289 unsigned int vtype; /* type of the object stored in the swap file */
290 off_t page; /* the page at which the object is stored on disk */
291 off_t usedpages; /* number of pages used on disk */
292 } vmpointer;
293
294 /* Macro used to initialize a Redis object allocated on the stack: _var is the robj lvalue to fill, _ptr the value stored in its ptr field. Sets refcount=1, type=REDIS_STRING, encoding=REDIS_ENCODING_RAW and storage=REDIS_VM_MEMORY; the lru field is left untouched.
295 * Note that this macro is taken near the structure definition to make sure
296 * we'll update it when the structure is changed, to avoid bugs like
297 * bug #85 introduced exactly in this way. The expansion has no trailing ';' (the caller writes it) so the macro acts as a single statement, e.g. in an unbraced if/else; arguments are parenthesized so expression arguments expand safely. */
298 #define initStaticStringObject(_var,_ptr) do { \
299 (_var).refcount = 1; \
300 (_var).type = REDIS_STRING; \
301 (_var).encoding = REDIS_ENCODING_RAW; \
302 (_var).ptr = (_ptr); \
303 (_var).storage = REDIS_VM_MEMORY; \
304 } while(0)
305
306 typedef struct redisDb { /* One database: the keyspace plus the auxiliary per-key dictionaries below */
307 dict *dict; /* The keyspace for this DB */
308 dict *expires; /* Timeout of keys with a timeout set */
309 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
310 dict *io_keys; /* Keys with clients waiting for VM I/O */
311 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
312 int id; /* presumably the index of this database -- confirm */
313 } redisDb;
314
315 /* Client MULTI/EXEC state */
316 typedef struct multiCmd { /* One entry of multiState.commands: argument vector, argument count and command-table entry */
317 robj **argv; /* argument objects */
318 int argc; /* number of entries in argv */
319 struct redisCommand *cmd; /* the redisCommand this entry refers to */
320 } multiCmd;
321
322 typedef struct multiState {
323 multiCmd *commands; /* Array of MULTI commands */
324 int count; /* Total number of MULTI commands */
325 } multiState;
326
327 /* With multiplexing we need to take per-client state.
328 * Clients are taken in a linked list. */
329 typedef struct redisClient {
330 int fd; /* presumably the client socket descriptor -- confirm */
331 redisDb *db; /* presumably the currently selected database -- confirm */
332 int dictid;
333 sds querybuf; /* presumably the buffer of not yet parsed input -- confirm */
334 robj **argv, **mbargv;
335 int argc, mbargc;
336 int bulklen; /* bulk read len. -1 if not in bulk read mode */
337 int multibulk; /* multi bulk command format active */
338 list *reply;
339 int sentlen;
340 time_t lastinteraction; /* time of the last interaction, used for timeout */
341 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
342 int slaveseldb; /* slave selected db, if this client is a slave */
343 int authenticated; /* when requirepass is non-NULL */
344 int replstate; /* replication state if this is a slave */
345 int repldbfd; /* replication DB file descriptor */
346 long repldboff; /* replication DB file offset. NOTE(review): long here vs off_t for repldbsize below -- confirm the widths agree on all targets */
347 off_t repldbsize; /* replication DB file size */
348 multiState mstate; /* MULTI/EXEC state */
349 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
350 * operation such as BLPOP. Otherwise NULL. */
351 int blocking_keys_num; /* Number of blocking keys */
352 time_t blockingto; /* Blocking operation timeout. If UNIX current time
353 * is >= blockingto then the operation timed out. */
354 list *io_keys; /* Keys this client is waiting to be loaded from the
355 * swap file in order to continue. */
356 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
357 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
358 list *pubsub_patterns; /* patterns a client is interested in (original comment says SUBSCRIBE; presumably PSUBSCRIBE -- confirm) */
359 } redisClient;
360
361 struct saveparam { /* Element type of server.saveparams; presumably one "save after <seconds> if at least <changes> changes" rule -- confirm */
362 time_t seconds;
363 int changes;
364 };
365
366 /* Global server state structure */
367 struct redisServer {
368 int port;
369 int fd;
370 redisDb *db; /* presumably an array of dbnum databases -- confirm */
371 long long dirty; /* changes to DB from the last save */
372 list *clients;
373 list *slaves, *monitors;
374 char neterr[ANET_ERR_LEN];
375 aeEventLoop *el;
376 int cronloops; /* number of times the cron function run */
377 list *objfreelist; /* A list of freed objects to avoid malloc() */
378 time_t lastsave; /* Unix time of the last successful save */
379 /* Fields used only for stats */
380 time_t stat_starttime; /* server start time */
381 long long stat_numcommands; /* number of processed commands */
382 long long stat_numconnections; /* number of connections received */
383 long long stat_expiredkeys; /* number of expired keys */
384 /* Configuration */
385 int verbosity; /* presumably one of the REDIS_DEBUG..REDIS_WARNING log levels -- confirm */
386 int glueoutputbuf;
387 int maxidletime;
388 int dbnum;
389 int daemonize;
390 int appendonly;
391 int appendfsync; /* presumably one of the APPENDFSYNC_* values -- confirm */
392 int no_appendfsync_on_rewrite;
393 int shutdown_asap;
394 time_t lastfsync;
395 int appendfd;
396 int appendseldb;
397 char *pidfile;
398 pid_t bgsavechildpid;
399 pid_t bgrewritechildpid;
400 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
401 sds aofbuf; /* AOF buffer, written before entering the event loop */
402 struct saveparam *saveparams;
403 int saveparamslen;
404 char *logfile;
405 char *bindaddr;
406 char *dbfilename;
407 char *appendfilename;
408 char *requirepass;
409 int rdbcompression;
410 int activerehashing;
411 /* Replication related */
412 int isslave;
413 char *masterauth;
414 char *masterhost;
415 int masterport;
416 redisClient *master; /* client that is master for this slave */
417 int replstate;
418 unsigned int maxclients;
419 unsigned long long maxmemory;
420 unsigned int blpop_blocked_clients;
421 unsigned int vm_blocked_clients;
422 /* Sort parameters - qsort_r() is only available under BSD so we
423 * have to take this state global, in order to pass it to sortCompare() */
424 int sort_desc;
425 int sort_alpha;
426 int sort_bypattern;
427 /* Virtual memory configuration */
428 int vm_enabled;
429 char *vm_swap_file;
430 off_t vm_page_size;
431 off_t vm_pages;
432 unsigned long long vm_max_memory;
433 /* Zip structure config */
434 size_t hash_max_zipmap_entries;
435 size_t hash_max_zipmap_value;
436 size_t list_max_ziplist_entries;
437 size_t list_max_ziplist_value;
438 size_t set_max_intset_entries;
439 /* Virtual memory state */
440 FILE *vm_fp;
441 int vm_fd;
442 off_t vm_next_page; /* Next probably empty page */
443 off_t vm_near_pages; /* Number of pages allocated sequentially */
444 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
445 time_t unixtime; /* Unix time sampled every second. */
446 /* Virtual memory I/O threads stuff */
447 /* An I/O thread processes an element taken from the io_jobs queue and
448 * puts the result of the operation in the io_done list. While the
449 * job is being processed, it's put on io_processing queue. */
450 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
451 list *io_processing; /* List of VM I/O jobs being processed */
452 list *io_processed; /* List of VM I/O jobs already processed */
453 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
454 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job (names in this comment are stale; the lists declared above are io_newjobs/io_processing/io_processed) */
455 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
456 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
457 pthread_attr_t io_threads_attr; /* attributes for threads creation */
458 int io_active_threads; /* Number of running I/O threads */
459 int vm_max_threads; /* Max number of I/O threads running at the same time */
460 /* Our main thread is blocked on the event loop, looking for sockets ready
461 * to be read or written, so when a threaded I/O operation is ready to be
462 * processed by the main thread, the I/O thread will use a unix pipe to
463 * awake the main thread. The followings are the two pipe FDs. */
464 int io_ready_pipe_read;
465 int io_ready_pipe_write;
466 /* Virtual memory stats */
467 unsigned long long vm_stats_used_pages;
468 unsigned long long vm_stats_swapped_objects;
469 unsigned long long vm_stats_swapouts;
470 unsigned long long vm_stats_swapins;
471 /* Pubsub */
472 dict *pubsub_channels; /* Map channels to list of subscribed clients */
473 list *pubsub_patterns; /* A list of pubsub_patterns */
474 /* Misc */
475 FILE *devnull;
476 unsigned lruclock:22; /* clock incrementing every minute, for LRU (same 22-bit width as robj.lru) */
477 unsigned lruclock_padding:10; /* fills the bitfield group up to 32 bits (22+10) */
478 };
479
480 typedef struct pubsubPattern { /* Pairs a client with one pattern object; presumably the element type of the pubsub_patterns lists -- confirm */
481 redisClient *client;
482 robj *pattern;
483 } pubsubPattern;
484
485 typedef void redisCommandProc(redisClient *c); /* signature shared by all the *Command(redisClient *c) functions declared above */
486 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); /* signature of the vm_preload_proc hook below */
487 struct redisCommand {
488 char *name;
489 redisCommandProc *proc; /* function called with the client; presumably the command implementation -- confirm */
490 int arity;
491 int flags; /* presumably an OR of the REDIS_CMD_* flags -- confirm */
492 /* Use a function to determine which keys need to be loaded
493 * in the background prior to executing this command. Takes precedence
494 * over vm_firstkey and others, ignored when NULL */
495 redisVmPreloadProc *vm_preload_proc;
496 /* What keys should be loaded in background when calling this command? */
497 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
498 int vm_lastkey; /* The last argument that's a key */
499 int vm_keystep; /* The step between first and last key */
500 };
501
502 struct redisFunctionSym { /* Pairs a name with an address stored as an integer; presumably used to symbolize stack traces -- confirm */
503 char *name;
504 unsigned long pointer;
505 };
506
507 typedef struct _redisSortObject { /* One element to sort plus its sort key, held as either a double or an object */
508 robj *obj;
509 union { /* only one member is meaningful at a time; which one presumably depends on the sort mode -- confirm */
510 double score;
511 robj *cmpobj;
512 } u;
513 } redisSortObject;
514
515 typedef struct _redisSortOperation {
516 int type; /* presumably one of the REDIS_SORT_* values -- confirm */
517 robj *pattern;
518 } redisSortOperation;
519
520 /* ZSETs use a specialized version of Skiplists */
521
522 typedef struct zskiplistNode {
523 struct zskiplistNode **forward; /* array of node pointers; presumably one per level -- confirm */
524 struct zskiplistNode *backward; /* single pointer to another node; presumably the previous one -- confirm */
525 unsigned int *span; /* array of unsigned ints; presumably per-level distances used for rank -- confirm */
526 double score;
527 robj *obj;
528 } zskiplistNode;
529
530 typedef struct zskiplist {
531 struct zskiplistNode *header, *tail;
532 unsigned long length;
533 int level; /* presumably bounded by ZSKIPLIST_MAXLEVEL -- confirm */
534 } zskiplist;
535
536 typedef struct zset { /* A sorted set holds both a dict and a skiplist */
537 dict *dict;
538 zskiplist *zsl;
539 } zset;
540
541 /* Our shared "common" objects */
542
543 #define REDIS_SHARED_INTEGERS 10000 /* length of the shared.integers[] array below */
544 struct sharedObjectsStruct { /* robj pointers grouped in one global instance named 'shared' */
545 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
546 *colon, *nullbulk, *nullmultibulk, *queued,
547 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
548 *outofrangeerr, *plus,
549 *select0, *select1, *select2, *select3, *select4,
550 *select5, *select6, *select7, *select8, *select9,
551 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
552 *mbulk4, *psubscribebulk, *punsubscribebulk,
553 *integers[REDIS_SHARED_INTEGERS];
554 } shared;
555
556 /* Global vars that are actually used as constants. The following double
557 * values are used for double on-disk serialization, and are initialized
558 * at runtime to avoid strange compiler optimizations. */
559
560 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
561
562 /* VM threaded I/O request message */
563 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
564 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
565 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
566 typedef struct iojob {
567 int type; /* Request type, REDIS_IOJOB_* */
568 redisDb *db;/* Redis database */
569 robj *key; /* This I/O request is about swapping this key */
570 robj *id; /* Unique identifier of this job:
571 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
572 vmpointer object for REDIS_IOJOB_LOAD. */
573 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
574 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
575 off_t page; /* Swap page where to read/write the object */
576 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
577 int canceled; /* True if this command was canceled by blocking side of VM */
578 pthread_t thread; /* ID of the thread processing this entry */
579 } iojob;
580
581 /*================================ Prototypes =============================== */
582
583 static void freeStringObject(robj *o);
584 static void freeListObject(robj *o);
585 static void freeSetObject(robj *o);
586 static void decrRefCount(void *o);
587 static robj *createObject(int type, void *ptr);
588 static void freeClient(redisClient *c);
589 static int rdbLoad(char *filename);
590 static void addReply(redisClient *c, robj *obj);
591 static void addReplySds(redisClient *c, sds s);
592 static void incrRefCount(robj *o);
593 static int rdbSaveBackground(char *filename);
594 static robj *createStringObject(char *ptr, size_t len);
595 static robj *dupStringObject(robj *o);
596 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
597 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
598 static void flushAppendOnlyFile(void);
599 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
600 static int syncWithMaster(void);
601 static robj *tryObjectEncoding(robj *o);
602 static robj *getDecodedObject(robj *o);
603 static int removeExpire(redisDb *db, robj *key);
604 static int expireIfNeeded(redisDb *db, robj *key);
605 static int deleteIfVolatile(redisDb *db, robj *key);
606 static int dbDelete(redisDb *db, robj *key);
607 static time_t getExpire(redisDb *db, robj *key);
608 static int setExpire(redisDb *db, robj *key, time_t when);
609 static void updateSlavesWaitingBgsave(int bgsaveerr);
610 static void freeMemoryIfNeeded(void);
611 static int processCommand(redisClient *c);
612 static void setupSigSegvAction(void);
613 static void rdbRemoveTempFile(pid_t childpid);
614 static void aofRemoveTempFile(pid_t childpid);
615 static size_t stringObjectLen(robj *o);
616 static void processInputBuffer(redisClient *c);
617 static zskiplist *zslCreate(void);
618 static void zslFree(zskiplist *zsl);
619 static void zslInsert(zskiplist *zsl, double score, robj *obj);
620 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
621 static void initClientMultiState(redisClient *c);
622 static void freeClientMultiState(redisClient *c);
623 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
624 static void unblockClientWaitingData(redisClient *c);
625 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
626 static void vmInit(void);
627 static void vmMarkPagesFree(off_t page, off_t count);
628 static robj *vmLoadObject(robj *o);
629 static robj *vmPreviewObject(robj *o);
630 static int vmSwapOneObjectBlocking(void);
631 static int vmSwapOneObjectThreaded(void);
632 static int vmCanSwapOut(void);
633 static int tryFreeOneObjectFromFreelist(void);
634 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
635 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
636 static void vmCancelThreadedIOJob(robj *o);
637 static void lockThreadedIO(void);
638 static void unlockThreadedIO(void);
639 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
640 static void freeIOJob(iojob *j);
641 static void queueIOJob(iojob *j);
642 static int vmWriteObjectOnSwap(robj *o, off_t page);
643 static robj *vmReadObjectFromSwap(off_t page, int type);
644 static void waitEmptyIOJobsQueue(void);
645 static void vmReopenSwapFile(void);
646 static int vmFreePage(off_t page);
647 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
648 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
649 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
650 static int dontWaitForSwappedKey(redisClient *c, robj *key);
651 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
652 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
653 static struct redisCommand *lookupCommand(char *name);
654 static void call(redisClient *c, struct redisCommand *cmd);
655 static void resetClient(redisClient *c);
656 static void convertToRealHash(robj *o);
657 static void listTypeConvert(robj *o, int enc);
658 static void setTypeConvert(robj *o, int enc);
659 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
660 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
661 static void freePubsubPattern(void *p);
662 static int listMatchPubsubPattern(void *a, void *b);
663 static int compareStringObjects(robj *a, robj *b);
664 static int equalStringObjects(robj *a, robj *b);
665 static void usage(void); /* explicit (void): a real prototype, so a call passing arguments is rejected at compile time */
666 static int rewriteAppendOnlyFileBackground(void);
667 static vmpointer *vmSwapObjectBlocking(robj *val);
668 static int prepareForShutdown(void); /* explicit (void): a real prototype, so a call passing arguments is rejected at compile time */
669 static void touchWatchedKey(redisDb *db, robj *key);
670 static void touchWatchedKeysOnFlush(int dbid);
671 static void unwatchAllKeys(redisClient *c);
672
673 static void authCommand(redisClient *c);
674 static void pingCommand(redisClient *c);
675 static void echoCommand(redisClient *c);
676 static void setCommand(redisClient *c);
677 static void setnxCommand(redisClient *c);
678 static void setexCommand(redisClient *c);
679 static void getCommand(redisClient *c);
680 static void delCommand(redisClient *c);
681 static void existsCommand(redisClient *c);
682 static void incrCommand(redisClient *c);
683 static void decrCommand(redisClient *c);
684 static void incrbyCommand(redisClient *c);
685 static void decrbyCommand(redisClient *c);
686 static void selectCommand(redisClient *c);
687 static void randomkeyCommand(redisClient *c);
688 static void keysCommand(redisClient *c);
689 static void dbsizeCommand(redisClient *c);
690 static void lastsaveCommand(redisClient *c);
691 static void saveCommand(redisClient *c);
692 static void bgsaveCommand(redisClient *c);
693 static void bgrewriteaofCommand(redisClient *c);
694 static void shutdownCommand(redisClient *c);
695 static void moveCommand(redisClient *c);
696 static void renameCommand(redisClient *c);
697 static void renamenxCommand(redisClient *c);
/* Command handler prototypes (this list continues from earlier in the file).
 * Every handler receives a redisClient pointer and returns nothing; the
 * handlers declared here are bound to command names in readonlyCommandTable
 * further down. */
698 static void lpushCommand(redisClient *c);
699 static void rpushCommand(redisClient *c);
700 static void lpopCommand(redisClient *c);
701 static void rpopCommand(redisClient *c);
702 static void llenCommand(redisClient *c);
703 static void lindexCommand(redisClient *c);
704 static void lrangeCommand(redisClient *c);
705 static void ltrimCommand(redisClient *c);
706 static void typeCommand(redisClient *c);
707 static void lsetCommand(redisClient *c);
708 static void saddCommand(redisClient *c);
709 static void sremCommand(redisClient *c);
710 static void smoveCommand(redisClient *c);
711 static void sismemberCommand(redisClient *c);
712 static void scardCommand(redisClient *c);
713 static void spopCommand(redisClient *c);
714 static void srandmemberCommand(redisClient *c);
715 static void sinterCommand(redisClient *c);
716 static void sinterstoreCommand(redisClient *c);
717 static void sunionCommand(redisClient *c);
718 static void sunionstoreCommand(redisClient *c);
719 static void sdiffCommand(redisClient *c);
720 static void sdiffstoreCommand(redisClient *c);
721 static void syncCommand(redisClient *c);
722 static void flushdbCommand(redisClient *c);
723 static void flushallCommand(redisClient *c);
724 static void sortCommand(redisClient *c);
725 static void lremCommand(redisClient *c);
726 static void rpoplpushcommand(redisClient *c); /* NOTE: lower-case "command", unlike its siblings; the command table uses the same spelling */
727 static void infoCommand(redisClient *c);
728 static void mgetCommand(redisClient *c);
729 static void monitorCommand(redisClient *c);
730 static void expireCommand(redisClient *c);
731 static void expireatCommand(redisClient *c);
732 static void getsetCommand(redisClient *c);
733 static void ttlCommand(redisClient *c);
734 static void slaveofCommand(redisClient *c);
735 static void debugCommand(redisClient *c);
736 static void msetCommand(redisClient *c);
737 static void msetnxCommand(redisClient *c);
738 static void zaddCommand(redisClient *c);
739 static void zincrbyCommand(redisClient *c);
740 static void zrangeCommand(redisClient *c);
741 static void zrangebyscoreCommand(redisClient *c);
742 static void zcountCommand(redisClient *c);
743 static void zrevrangeCommand(redisClient *c);
744 static void zcardCommand(redisClient *c);
745 static void zremCommand(redisClient *c);
746 static void zscoreCommand(redisClient *c);
747 static void zremrangebyscoreCommand(redisClient *c);
748 static void multiCommand(redisClient *c);
749 static void execCommand(redisClient *c);
750 static void discardCommand(redisClient *c);
751 static void blpopCommand(redisClient *c);
752 static void brpopCommand(redisClient *c);
753 static void appendCommand(redisClient *c);
754 static void substrCommand(redisClient *c);
755 static void zrankCommand(redisClient *c);
756 static void zrevrankCommand(redisClient *c);
757 static void hsetCommand(redisClient *c);
758 static void hsetnxCommand(redisClient *c);
759 static void hgetCommand(redisClient *c);
760 static void hmsetCommand(redisClient *c);
761 static void hmgetCommand(redisClient *c);
762 static void hdelCommand(redisClient *c);
763 static void hlenCommand(redisClient *c);
764 static void zremrangebyrankCommand(redisClient *c);
765 static void zunionstoreCommand(redisClient *c);
766 static void zinterstoreCommand(redisClient *c);
767 static void hkeysCommand(redisClient *c);
768 static void hvalsCommand(redisClient *c);
769 static void hgetallCommand(redisClient *c);
770 static void hexistsCommand(redisClient *c);
771 static void configCommand(redisClient *c);
772 static void hincrbyCommand(redisClient *c);
773 static void subscribeCommand(redisClient *c);
774 static void unsubscribeCommand(redisClient *c);
775 static void psubscribeCommand(redisClient *c);
776 static void punsubscribeCommand(redisClient *c);
777 static void publishCommand(redisClient *c);
778 static void watchCommand(redisClient *c);
779 static void unwatchCommand(redisClient *c);
780
781 /*================================= Globals ================================= */
782
783 /* Global vars */
784 static struct redisServer server; /* server global state */
785 static struct redisCommand *commandTable;
/* Static table binding every command name to its handler.
 *
 * Each row holds, in order: the command name, the handler function, an
 * integer that looks like the argument count (negative values presumably
 * mean "at least N" -- TODO confirm against the dispatcher), a bit mask of
 * REDIS_CMD_* flags, an optional callback (non-NULL only for zunionstore,
 * zinterstore and exec) and three integers that look like first key / last
 * key / key step argument positions -- NOTE(review): confirm the exact
 * field meaning against the struct redisCommand definition, which is not
 * part of this section of the file.
 *
 * "smembers" deliberately reuses sinterCommand with exactly two arguments. */
786 static struct redisCommand readonlyCommandTable[] = {
787 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
789 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
790 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
791 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
798 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
810 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
811 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
814 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
819 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
820 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
821 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
822 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
823 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
824 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
827 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
828 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
831 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
832 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
838 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
839 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
842 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
843 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
844 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
845 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
846 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
847 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
851 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
852 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
854 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
855 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
856 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
857 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
860 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
861 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
862 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
869 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
870 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
875 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
876 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
877 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
882 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
883 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
885 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
886 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
887 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
888 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
889 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
890 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
891 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
892 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
893 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
894 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
895 };
896
897 /*============================ Utility functions ============================ */
898
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported pattern syntax:
 *   *        any sequence of characters, including the empty one
 *   ?        exactly one character
 *   [abc]    one character out of the listed set
 *   [^abc]   one character NOT in the listed set
 *   [a-z]    one character inside the inclusive range (bounds may be given
 *            in either order)
 *   \x       the character x taken literally (also valid inside [...])
 *
 * 'pattern' and 'string' are only read inside their first patternLen /
 * stringLen bytes, so neither buffer has to be nul terminated. When 'nocase'
 * is non zero the comparison is case insensitive (via tolower()).
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*'. pattern[1] is only inspected while it is
             * still part of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches whatever is left */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back one byte so that the
                     * common advance after the switch ends the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one character of the string to compare with. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing '*' may remain in the pattern. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1021
/* Nul-terminated convenience front end for stringmatchlen(): both lengths
 * are taken with strlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
1025
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1Gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted form is an optional '-', a run of decimal digits and an
 * optional unit, matched case insensitively:
 *
 *   (none), "b"  -> 1
 *   "k"          -> 1000            "kb" -> 1024
 *   "m"          -> 1000*1000       "mb" -> 1024*1024
 *   "g"          -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * With an unknown unit the numeric prefix is still returned, with a
 * multiplier of 1. If the numeric prefix does not fit the internal buffer
 * (128 bytes including the terminator) LLONG_MAX is returned. */
static long long memtoll(const char *p, int *err) {
    static const struct {
        const char *suffix;
        long mul;
    } units[] = {
        {"b",  1},
        {"k",  1000},
        {"kb", 1024},
        {"m",  1000*1000},
        {"mb", 1024*1024},
        {"g",  1000L*1000*1000},
        {"gb", 1024L*1024*1024}
    };
    const char *u;
    char buf[128];
    long mul = 1; /* unit multiplier */
    long long val;
    unsigned int digits;
    size_t j;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is required because
     * passing a negative char to isdigit() is undefined. */
    u = p;
    if (*u == '-') u++;
    while(isdigit((unsigned char)*u)) u++;

    /* Whatever follows the digits must be empty or a known unit. */
    if (*u != '\0') {
        int known = 0;

        for (j = 0; j < sizeof(units)/sizeof(units[0]); j++) {
            if (!strcasecmp(u,units[j].suffix)) {
                mul = units[j].mul;
                known = 1;
                break;
            }
        }
        if (!known && err) *err = 1;
    }

    /* Copy the numeric part alone so that strtoll() never sees the unit. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    return val*mul;
}
1072
/* Convert a long long into its decimal string representation.
 *
 * At most len-1 characters are stored into 's', followed by a nul term.
 * When the buffer is too small the output is truncated and keeps the most
 * significant part of the number (sign included).
 *
 * Returns the number of characters stored, not counting the nul term, or 0
 * (leaving 's' untouched) when len is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in unsigned arithmetic: -value would be a
     * signed overflow when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL - (unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1096
1097 static void redisLog(int level, const char *fmt, ...) {
1098 va_list ap;
1099 FILE *fp;
1100
1101 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1102 if (!fp) return;
1103
1104 va_start(ap, fmt);
1105 if (level >= server.verbosity) {
1106 char *c = ".-*#";
1107 char buf[64];
1108 time_t now;
1109
1110 now = time(NULL);
1111 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1112 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1113 vfprintf(fp, fmt, ap);
1114 fprintf(fp,"\n");
1115 fflush(fp);
1116 }
1117 va_end(ap);
1118
1119 if (server.logfile) fclose(fp);
1120 }
1121
1122 /*====================== Hash table type implementation ==================== */
1123
1124 /* This is a hash table type that uses the SDS dynamic strings library as
1125 * keys and redis objects as values (objects can hold SDS strings,
1126 * lists, sets). */
1127
/* Dict destructor callback that simply hands the stored pointer to zfree().
 * The privdata argument is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1133
1134 static void dictListDestructor(void *privdata, void *val)
1135 {
1136 DICT_NOTUSED(privdata);
1137 listRelease((list*)val);
1138 }
1139
1140 static int dictSdsKeyCompare(void *privdata, const void *key1,
1141 const void *key2)
1142 {
1143 int l1,l2;
1144 DICT_NOTUSED(privdata);
1145
1146 l1 = sdslen((sds)key1);
1147 l2 = sdslen((sds)key2);
1148 if (l1 != l2) return 0;
1149 return memcmp(key1, key2, l1) == 0;
1150 }
1151
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since the values of swapped
 * out keys are set to NULL. The privdata argument is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1159
/* Dict destructor callback for sds strings: the string is released with
 * sdsfree(). The privdata argument is ignored. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1166
1167 static int dictObjKeyCompare(void *privdata, const void *key1,
1168 const void *key2)
1169 {
1170 const robj *o1 = key1, *o2 = key2;
1171 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1172 }
1173
1174 static unsigned int dictObjHash(const void *key) {
1175 const robj *o = key;
1176 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177 }
1178
/* Hash callback for sds keys: the hash is computed over the sdslen() bytes
 * of the key. */
static unsigned int dictSdsHash(const void *key) {
    unsigned char *bytes = (unsigned char*)key;
    int len = sdslen((char*)key);

    return dictGenHashFunction(bytes, len);
}
1182
1183 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1184 const void *key2)
1185 {
1186 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1187 int cmp;
1188
1189 if (o1->encoding == REDIS_ENCODING_INT &&
1190 o2->encoding == REDIS_ENCODING_INT)
1191 return o1->ptr == o2->ptr;
1192
1193 o1 = getDecodedObject(o1);
1194 o2 = getDecodedObject(o2);
1195 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1196 decrRefCount(o1);
1197 decrRefCount(o2);
1198 return cmp;
1199 }
1200
1201 static unsigned int dictEncObjHash(const void *key) {
1202 robj *o = (robj*) key;
1203
1204 if (o->encoding == REDIS_ENCODING_RAW) {
1205 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1206 } else {
1207 if (o->encoding == REDIS_ENCODING_INT) {
1208 char buf[32];
1209 int len;
1210
1211 len = ll2string(buf,32,(long)o->ptr);
1212 return dictGenHashFunction((unsigned char*)buf, len);
1213 } else {
1214 unsigned int hash;
1215
1216 o = getDecodedObject(o);
1217 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1218 decrRefCount(o);
1219 return hash;
1220 }
1221 }
1222 }
1223
1224 /* Sets type: keys are redis objects hashed and compared with the
 * encoding-aware callbacks and released through decrRefCount(); the table
 * owns no values (no val destructor). */
1225 static dictType setDictType = {
1226 dictEncObjHash, /* hash function */
1227 NULL, /* key dup */
1228 NULL, /* val dup */
1229 dictEncObjKeyCompare, /* key compare */
1230 dictRedisObjectDestructor, /* key destructor */
1231 NULL /* val destructor */
1232 };
1233
1234 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects handled by the encoding-aware callbacks; values are
 * released with zfree() through dictVanillaFree. */
1235 static dictType zsetDictType = {
1236 dictEncObjHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictEncObjKeyCompare, /* key compare */
1240 dictRedisObjectDestructor, /* key destructor */
1241 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1242 };
1243
1244 /* Db->dict, keys are sds strings, vals are Redis objects.
 * Keys are released with sdsfree(); values lose one reference when removed
 * (a NULL value, used for swapped out keys, is tolerated by the val
 * destructor). */
1245 static dictType dbDictType = {
1246 dictSdsHash, /* hash function */
1247 NULL, /* key dup */
1248 NULL, /* val dup */
1249 dictSdsKeyCompare, /* key compare */
1250 dictSdsDestructor, /* key destructor */
1251 dictRedisObjectDestructor /* val destructor */
1252 };
1253
1254 /* Db->expires: keys are sds strings released with sdsfree(). There is no
 * val destructor: serverCron() reads the value back with a plain cast to
 * time_t, so the expire time appears to be stored in the value pointer
 * itself and there is nothing to free. */
1255 static dictType keyptrDictType = {
1256 dictSdsHash, /* hash function */
1257 NULL, /* key dup */
1258 NULL, /* val dup */
1259 dictSdsKeyCompare, /* key compare */
1260 dictSdsDestructor, /* key destructor */
1261 NULL /* val destructor */
1262 };
1263
1264 /* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both keys and values are redis objects released through decrRefCount(). */
1265 static dictType hashDictType = {
1266 dictEncObjHash, /* hash function */
1267 NULL, /* key dup */
1268 NULL, /* val dup */
1269 dictEncObjKeyCompare, /* key compare */
1270 dictRedisObjectDestructor, /* key destructor */
1271 dictRedisObjectDestructor /* val destructor */
1272 };
1273
1274 /* Keylist hash table type has unencoded redis objects as keys and
1275 * lists as values. It's used for blocking operations (BLPOP) and to
1276 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Removing an entry drops one reference to the key and releases the whole
 * list with listRelease(). */
1277 static dictType keylistDictType = {
1278 dictObjHash, /* hash function */
1279 NULL, /* key dup */
1280 NULL, /* val dup */
1281 dictObjKeyCompare, /* key compare */
1282 dictRedisObjectDestructor, /* key destructor */
1283 dictListDestructor /* val destructor */
1284 };
1285
/* Forward declaration; the definition is not in this part of the file. */
1286 static void version();
1287
1288 /* ========================= Random utility functions ======================= */
1289
1290 /* Redis generally does not try to recover from out of memory conditions
1291 * when allocating objects or strings, it is not clear if it will be possible
1292 * to report this condition to the client since the networking layer itself
1293 * is based on heap allocation for send buffers, so we simply abort.
1294 * At least the code will be simpler to read... */
1295 static void oom(const char *msg) {
1296 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1297 sleep(1);
1298 abort();
1299 }
1300
1301 /* ====================== Redis server networking stuff ===================== */
1302 static void closeTimedoutClients(void) {
1303 redisClient *c;
1304 listNode *ln;
1305 time_t now = time(NULL);
1306 listIter li;
1307
1308 listRewind(server.clients,&li);
1309 while ((ln = listNext(&li)) != NULL) {
1310 c = listNodeValue(ln);
1311 if (server.maxidletime &&
1312 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1313 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1314 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1315 listLength(c->pubsub_patterns) == 0 &&
1316 (now - c->lastinteraction > server.maxidletime))
1317 {
1318 redisLog(REDIS_VERBOSE,"Closing idle client");
1319 freeClient(c);
1320 } else if (c->flags & REDIS_BLOCKED) {
1321 if (c->blockingto != 0 && c->blockingto < now) {
1322 addReply(c,shared.nullmultibulk);
1323 unblockClientWaitingData(c);
1324 }
1325 }
1326 }
1327 }
1328
1329 static int htNeedsResize(dict *dict) {
1330 long long size, used;
1331
1332 size = dictSlots(dict);
1333 used = dictSize(dict);
1334 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1335 (used*100/size < REDIS_HT_MINFILL));
1336 }
1337
1338 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1339 * we resize the hash table to save memory */
1340 static void tryResizeHashTables(void) {
1341 int j;
1342
1343 for (j = 0; j < server.dbnum; j++) {
1344 if (htNeedsResize(server.db[j].dict))
1345 dictResize(server.db[j].dict);
1346 if (htNeedsResize(server.db[j].expires))
1347 dictResize(server.db[j].expires);
1348 }
1349 }
1350
1351 /* Our hash table implementation performs rehashing incrementally while
1352 * we write/read from the hash table. Still if the server is idle, the hash
1353 * table will use two tables for a long time. So we try to use 1 millisecond
1354 * of CPU time at every serverCron() loop in order to rehash some key. */
1355 static void incrementallyRehash(void) {
1356 int j;
1357
1358 for (j = 0; j < server.dbnum; j++) {
1359 if (dictIsRehashing(server.db[j].dict)) {
1360 dictRehashMilliseconds(server.db[j].dict,1);
1361 break; /* already used our millisecond for this loop... */
1362 }
1363 }
1364 }
1365
1366 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1367 void backgroundSaveDoneHandler(int statloc) {
1368 int exitcode = WEXITSTATUS(statloc);
1369 int bysignal = WIFSIGNALED(statloc);
1370
1371 if (!bysignal && exitcode == 0) {
1372 redisLog(REDIS_NOTICE,
1373 "Background saving terminated with success");
1374 server.dirty = 0;
1375 server.lastsave = time(NULL);
1376 } else if (!bysignal && exitcode != 0) {
1377 redisLog(REDIS_WARNING, "Background saving error");
1378 } else {
1379 redisLog(REDIS_WARNING,
1380 "Background saving terminated by signal %d", WTERMSIG(statloc));
1381 rdbRemoveTempFile(server.bgsavechildpid);
1382 }
1383 server.bgsavechildpid = -1;
1384 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1385 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1386 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1387 }
1388
1389 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1390 * Handle this. */
1391 void backgroundRewriteDoneHandler(int statloc) {
1392 int exitcode = WEXITSTATUS(statloc);
1393 int bysignal = WIFSIGNALED(statloc);
1394
1395 if (!bysignal && exitcode == 0) {
1396 int fd;
1397 char tmpfile[256];
1398
1399 redisLog(REDIS_NOTICE,
1400 "Background append only file rewriting terminated with success");
1401 /* Now it's time to flush the differences accumulated by the parent */
1402 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1403 fd = open(tmpfile,O_WRONLY|O_APPEND);
1404 if (fd == -1) {
1405 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1406 goto cleanup;
1407 }
1408 /* Flush our data... */
1409 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1410 (signed) sdslen(server.bgrewritebuf)) {
1411 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1412 close(fd);
1413 goto cleanup;
1414 }
1415 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1416 /* Now our work is to rename the temp file into the stable file. And
1417 * switch the file descriptor used by the server for append only. */
1418 if (rename(tmpfile,server.appendfilename) == -1) {
1419 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1420 close(fd);
1421 goto cleanup;
1422 }
1423 /* Mission completed... almost */
1424 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1425 if (server.appendfd != -1) {
1426 /* If append only is actually enabled... */
1427 close(server.appendfd);
1428 server.appendfd = fd;
1429 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1430 server.appendseldb = -1; /* Make sure it will issue SELECT */
1431 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1432 } else {
1433 /* If append only is disabled we just generate a dump in this
1434 * format. Why not? */
1435 close(fd);
1436 }
1437 } else if (!bysignal && exitcode != 0) {
1438 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1439 } else {
1440 redisLog(REDIS_WARNING,
1441 "Background append only file rewriting terminated by signal %d",
1442 WTERMSIG(statloc));
1443 }
1444 cleanup:
1445 sdsfree(server.bgrewritebuf);
1446 server.bgrewritebuf = sdsempty();
1447 aofRemoveTempFile(server.bgrewritechildpid);
1448 server.bgrewritechildpid = -1;
1449 }
1450
1451 /* This function is called once a background process of some kind terminates,
1452 * as we want to avoid resizing the hash tables when there is a child in order
1453 * to play well with copy-on-write (otherwise when a resize happens lots of
1454 * memory pages are copied). The goal of this function is to update the ability
1455 * for dict.c to resize the hash tables accordingly to the fact we have o not
1456 * running childs. */
1457 static void updateDictResizePolicy(void) {
1458 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1459 dictEnableResize();
1460 else
1461 dictDisableResize();
1462 }
1463
/* Periodic housekeeping callback. In order, every call:
 *
 *  - refreshes the cached unix time and the LRU clock;
 *  - completes a shutdown requested via server.shutdown_asap;
 *  - every 50 calls logs statistics about non-empty databases and clients;
 *  - when no child is running, shrinks sparse hash tables (every 10 calls)
 *    and performs a step of incremental rehashing if enabled;
 *  - closes timed out clients;
 *  - reaps a terminated background child or, if there is none, starts a
 *    background save when one of the configured save points is reached;
 *  - expires a sample of volatile keys in every database;
 *  - swaps objects out while over the VM memory limit;
 *  - tries to connect to the master when replication requires it.
 *
 * The three arguments are unused. Always returns 100. */
1464 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1465 int j, loops = server.cronloops++;
1466 REDIS_NOTUSED(eventLoop);
1467 REDIS_NOTUSED(id);
1468 REDIS_NOTUSED(clientData);
1469
1470 /* We take a cached value of the unix time in the global state because
1471 * with virtual memory and aging we need to store the current time
1472 * in objects at every object access, and accuracy is not needed.
1473 * To access a global var is faster than calling time(NULL) */
1474 server.unixtime = time(NULL);
1475 /* We have just 21 bits per object for LRU information.
1476 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1477 *
1478 * When we need to select what object to swap, we compute the minimum
1479 * time distance between the current lruclock and the object last access
1480 * lruclock info. Even if clocks will wrap on overflow, there is
1481 * the interesting property that we are sure that at least
1482 * ABS(A-B) minutes passed between current time and timestamp B.
1483 *
1484 * This is not precise but we don't need at all precision, but just
1485 * something statistically reasonable.
1486 */
1487 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1488
1489 /* We received a SIGTERM, shutting down here in a safe way, as it is
1490 * not ok doing so inside the signal handler. */
1491 if (server.shutdown_asap) {
1492 if (prepareForShutdown() == REDIS_OK) exit(0);
1493 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1494 }
1495
1496 /* Show some info about non-empty databases */
1497 for (j = 0; j < server.dbnum; j++) {
1498 long long size, used, vkeys;
1499
1500 size = dictSlots(server.db[j].dict);
1501 used = dictSize(server.db[j].dict);
1502 vkeys = dictSize(server.db[j].expires);
1503 if (!(loops % 50) && (used || vkeys)) {
1504 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1505 /* dictPrintStats(server.dict); */
1506 }
1507 }
1508
1509 /* We don't want to resize the hash tables while a background saving
1510 * is in progress: the saving child is created using fork() that is
1511 * implemented with a copy-on-write semantic in most modern systems, so
1512 * if we resize the HT while there is the saving child at work actually
1513 * a lot of memory movements in the parent will cause a lot of pages
1514 * copied. */
1515 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1516 if (!(loops % 10)) tryResizeHashTables();
1517 if (server.activerehashing) incrementallyRehash();
1518 }
1519
1520 /* Show information about connected clients */
1521 if (!(loops % 50)) {
1522 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1523 listLength(server.clients)-listLength(server.slaves),
1524 listLength(server.slaves),
1525 zmalloc_used_memory());
1526 }
1527
1528 /* Close connections of timedout clients. The idle check runs every 100
 * calls, but clients blocked in a blocking pop are checked at every call. */
1529 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1530 closeTimedoutClients();
1531
1532 /* Check if a background saving or AOF rewrite in progress terminated */
1533 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1534 int statloc;
1535 pid_t pid;
1536
/* NOTE(review): wait3() returns -1 on error. That value is != 0 and is
 * not the BGSAVE child pid, so it would reach the rewrite handler with
 * statloc possibly unset -- confirm whether this can happen in practice.
 * Likewise any pid different from the BGSAVE child is assumed to be the
 * rewrite child. */
1537 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1538 if (pid == server.bgsavechildpid) {
1539 backgroundSaveDoneHandler(statloc);
1540 } else {
1541 backgroundRewriteDoneHandler(statloc);
1542 }
1543 updateDictResizePolicy();
1544 }
1545 } else {
1546 /* If there is not a background saving in progress check if
1547 * we have to save now */
1548 time_t now = time(NULL);
1549 for (j = 0; j < server.saveparamslen; j++) {
1550 struct saveparam *sp = server.saveparams+j;
1551
1552 if (server.dirty >= sp->changes &&
1553 now-server.lastsave > sp->seconds) {
1554 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1555 sp->changes, sp->seconds);
1556 rdbSaveBackground(server.dbfilename);
1557 break;
1558 }
1559 }
1560 }
1561
1562 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1563 * will use few CPU cycles if there are few expiring keys, otherwise
1564 * it will get more aggressive to avoid that too much memory is used by
1565 * keys that can be removed from the keyspace. */
1566 for (j = 0; j < server.dbnum; j++) {
1567 int expired;
1568 redisDb *db = server.db+j;
1569
1570 /* Continue to expire if at the end of the cycle more than 25%
1571 * of the keys were expired. */
1572 do {
1573 long num = dictSize(db->expires);
1574 time_t now = time(NULL);
1575
1576 expired = 0;
1577 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1578 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1579 while (num--) {
1580 dictEntry *de;
1581 time_t t;
1582
1583 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1584 t = (time_t) dictGetEntryVal(de); /* the expire time is stored in the value pointer itself */
1585 if (now > t) {
1586 sds key = dictGetEntryKey(de);
1587 robj *keyobj = createStringObject(key,sdslen(key));
1588
1589 dbDelete(db,keyobj);
1590 decrRefCount(keyobj);
1591 expired++;
1592 server.stat_expiredkeys++;
1593 }
1594 }
1595 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1596 }
1597
1598 /* Swap a few keys on disk if we are over the memory limit and VM
1599 * is enabled. Try to free objects from the free list first. */
1600 if (vmCanSwapOut()) {
1601 while (server.vm_enabled && zmalloc_used_memory() >
1602 server.vm_max_memory)
1603 {
1604 int retval;
1605
1606 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1607 retval = (server.vm_max_threads == 0) ?
1608 vmSwapOneObjectBlocking() :
1609 vmSwapOneObjectThreaded();
1610 if (retval == REDIS_ERR && !(loops % 300) &&
1611 zmalloc_used_memory() >
1612 (server.vm_max_memory+server.vm_max_memory/10))
1613 {
1614 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1615 }
1616 /* Note that when using threaded I/O we free just one object,
1617 * because anyway when the I/O thread in charge to swap this
1618 * object out will finish, the handler of completed jobs
1619 * will try to swap more objects if we are still out of memory. */
1620 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1621 }
1622 }
1623
1624 /* Check if we should connect to a MASTER */
1625 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1626 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1627 if (syncWithMaster() == REDIS_OK) {
1628 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1629 if (server.appendonly) rewriteAppendOnlyFileBackground();
1630 }
1631 }
1632 return 100; /* presumably the delay in milliseconds before the next call -- TODO confirm against the ae time event API */
1633 }
1634
1635 /* This function gets called every time Redis is entering the
1636 * main loop of the event driven library, that is, before to sleep
1637 * for ready file descriptors. */
1638 static void beforeSleep(struct aeEventLoop *eventLoop) {
1639 REDIS_NOTUSED(eventLoop);
1640
1641 /* Awake clients that got all the swapped keys they requested */
1642 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1643 listIter li;
1644 listNode *ln;
1645
1646 listRewind(server.io_ready_clients,&li);
1647 while((ln = listNext(&li))) {
1648 redisClient *c = ln->value;
1649 struct redisCommand *cmd;
1650
1651 /* Resume the client. */
1652 listDelNode(server.io_ready_clients,ln);
1653 c->flags &= (~REDIS_IO_WAIT);
1654 server.vm_blocked_clients--;
1655 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1656 readQueryFromClient, c);
1657 cmd = lookupCommand(c->argv[0]->ptr);
1658 assert(cmd != NULL);
1659 call(c,cmd);
1660 resetClient(c);
1661 /* There may be more data to process in the input buffer. */
1662 if (c->querybuf && sdslen(c->querybuf) > 0)
1663 processInputBuffer(c);
1664 }
1665 }
1666 /* Write the AOF buffer on disk */
1667 flushAppendOnlyFile();
1668 }
1669
1670 static void createSharedObjects(void) {
1671 int j;
1672
1673 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1674 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1675 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1676 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1677 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1678 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1679 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1680 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1681 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1682 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1683 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1684 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1685 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1686 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1687 "-ERR no such key\r\n"));
1688 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR syntax error\r\n"));
1690 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR source and destination objects are the same\r\n"));
1692 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR index out of range\r\n"));
1694 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1695 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1696 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1697 shared.select0 = createStringObject("select 0\r\n",10);
1698 shared.select1 = createStringObject("select 1\r\n",10);
1699 shared.select2 = createStringObject("select 2\r\n",10);
1700 shared.select3 = createStringObject("select 3\r\n",10);
1701 shared.select4 = createStringObject("select 4\r\n",10);
1702 shared.select5 = createStringObject("select 5\r\n",10);
1703 shared.select6 = createStringObject("select 6\r\n",10);
1704 shared.select7 = createStringObject("select 7\r\n",10);
1705 shared.select8 = createStringObject("select 8\r\n",10);
1706 shared.select9 = createStringObject("select 9\r\n",10);
1707 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1708 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1709 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1710 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1711 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1712 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1713 shared.mbulk3 = createStringObject("*3\r\n",4);
1714 shared.mbulk4 = createStringObject("*4\r\n",4);
1715 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1716 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1717 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1718 }
1719 }
1720
1721 static void appendServerSaveParams(time_t seconds, int changes) {
1722 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1723 server.saveparams[server.saveparamslen].seconds = seconds;
1724 server.saveparams[server.saveparamslen].changes = changes;
1725 server.saveparamslen++;
1726 }
1727
1728 static void resetServerSaveParams() {
1729 zfree(server.saveparams);
1730 server.saveparams = NULL;
1731 server.saveparamslen = 0;
1732 }
1733
1734 static void initServerConfig() {
1735 server.dbnum = REDIS_DEFAULT_DBNUM;
1736 server.port = REDIS_SERVERPORT;
1737 server.verbosity = REDIS_VERBOSE;
1738 server.maxidletime = REDIS_MAXIDLETIME;
1739 server.saveparams = NULL;
1740 server.logfile = NULL; /* NULL = log on standard output */
1741 server.bindaddr = NULL;
1742 server.glueoutputbuf = 1;
1743 server.daemonize = 0;
1744 server.appendonly = 0;
1745 server.appendfsync = APPENDFSYNC_EVERYSEC;
1746 server.no_appendfsync_on_rewrite = 0;
1747 server.lastfsync = time(NULL);
1748 server.appendfd = -1;
1749 server.appendseldb = -1; /* Make sure the first time will not match */
1750 server.pidfile = zstrdup("/var/run/redis.pid");
1751 server.dbfilename = zstrdup("dump.rdb");
1752 server.appendfilename = zstrdup("appendonly.aof");
1753 server.requirepass = NULL;
1754 server.rdbcompression = 1;
1755 server.activerehashing = 1;
1756 server.maxclients = 0;
1757 server.blpop_blocked_clients = 0;
1758 server.maxmemory = 0;
1759 server.vm_enabled = 0;
1760 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1761 server.vm_page_size = 256; /* 256 bytes per page */
1762 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1763 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1764 server.vm_max_threads = 4;
1765 server.vm_blocked_clients = 0;
1766 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1767 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1768 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1769 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
1770 server.set_max_intset_entries = REDIS_SET_MAX_INTSET_ENTRIES;
1771 server.shutdown_asap = 0;
1772
1773 resetServerSaveParams();
1774
1775 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1776 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1777 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1778 /* Replication related */
1779 server.isslave = 0;
1780 server.masterauth = NULL;
1781 server.masterhost = NULL;
1782 server.masterport = 6379;
1783 server.master = NULL;
1784 server.replstate = REDIS_REPL_NONE;
1785
1786 /* Double constants initialization */
1787 R_Zero = 0.0;
1788 R_PosInf = 1.0/R_Zero;
1789 R_NegInf = -1.0/R_Zero;
1790 R_Nan = R_Zero/R_Zero;
1791 }
1792
1793 static void initServer() {
1794 int j;
1795
1796 signal(SIGHUP, SIG_IGN);
1797 signal(SIGPIPE, SIG_IGN);
1798 setupSigSegvAction();
1799
1800 server.devnull = fopen("/dev/null","w");
1801 if (server.devnull == NULL) {
1802 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1803 exit(1);
1804 }
1805 server.clients = listCreate();
1806 server.slaves = listCreate();
1807 server.monitors = listCreate();
1808 server.objfreelist = listCreate();
1809 createSharedObjects();
1810 server.el = aeCreateEventLoop();
1811 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1812 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1813 if (server.fd == -1) {
1814 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1815 exit(1);
1816 }
1817 for (j = 0; j < server.dbnum; j++) {
1818 server.db[j].dict = dictCreate(&dbDictType,NULL);
1819 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1820 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1821 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1822 if (server.vm_enabled)
1823 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1824 server.db[j].id = j;
1825 }
1826 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1827 server.pubsub_patterns = listCreate();
1828 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1829 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1830 server.cronloops = 0;
1831 server.bgsavechildpid = -1;
1832 server.bgrewritechildpid = -1;
1833 server.bgrewritebuf = sdsempty();
1834 server.aofbuf = sdsempty();
1835 server.lastsave = time(NULL);
1836 server.dirty = 0;
1837 server.stat_numcommands = 0;
1838 server.stat_numconnections = 0;
1839 server.stat_expiredkeys = 0;
1840 server.stat_starttime = time(NULL);
1841 server.unixtime = time(NULL);
1842 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1843 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1844 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1845
1846 if (server.appendonly) {
1847 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1848 if (server.appendfd == -1) {
1849 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1850 strerror(errno));
1851 exit(1);
1852 }
1853 }
1854
1855 if (server.vm_enabled) vmInit();
1856 }
1857
1858 /* Empty the whole database */
1859 static long long emptyDb() {
1860 int j;
1861 long long removed = 0;
1862
1863 for (j = 0; j < server.dbnum; j++) {
1864 removed += dictSize(server.db[j].dict);
1865 dictEmpty(server.db[j].dict);
1866 dictEmpty(server.db[j].expires);
1867 }
1868 return removed;
1869 }
1870
/* Translate a "yes"/"no" string (compared case-insensitively) into a
 * boolean: returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1876
1877 /* I agree, this is a very rudimental way to load a configuration...
1878 will improve later if the config gets more complex */
1879 static void loadServerConfig(char *filename) {
1880 FILE *fp;
1881 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1882 int linenum = 0;
1883 sds line = NULL;
1884
1885 if (filename[0] == '-' && filename[1] == '\0')
1886 fp = stdin;
1887 else {
1888 if ((fp = fopen(filename,"r")) == NULL) {
1889 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1890 exit(1);
1891 }
1892 }
1893
1894 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1895 sds *argv;
1896 int argc, j;
1897
1898 linenum++;
1899 line = sdsnew(buf);
1900 line = sdstrim(line," \t\r\n");
1901
1902 /* Skip comments and blank lines*/
1903 if (line[0] == '#' || line[0] == '\0') {
1904 sdsfree(line);
1905 continue;
1906 }
1907
1908 /* Split into arguments */
1909 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1910 sdstolower(argv[0]);
1911
1912 /* Execute config directives */
1913 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1914 server.maxidletime = atoi(argv[1]);
1915 if (server.maxidletime < 0) {
1916 err = "Invalid timeout value"; goto loaderr;
1917 }
1918 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1919 server.port = atoi(argv[1]);
1920 if (server.port < 1 || server.port > 65535) {
1921 err = "Invalid port"; goto loaderr;
1922 }
1923 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1924 server.bindaddr = zstrdup(argv[1]);
1925 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1926 int seconds = atoi(argv[1]);
1927 int changes = atoi(argv[2]);
1928 if (seconds < 1 || changes < 0) {
1929 err = "Invalid save parameters"; goto loaderr;
1930 }
1931 appendServerSaveParams(seconds,changes);
1932 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1933 if (chdir(argv[1]) == -1) {
1934 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1935 argv[1], strerror(errno));
1936 exit(1);
1937 }
1938 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1939 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1940 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1941 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1942 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1943 else {
1944 err = "Invalid log level. Must be one of debug, notice, warning";
1945 goto loaderr;
1946 }
1947 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1948 FILE *logfp;
1949
1950 server.logfile = zstrdup(argv[1]);
1951 if (!strcasecmp(server.logfile,"stdout")) {
1952 zfree(server.logfile);
1953 server.logfile = NULL;
1954 }
1955 if (server.logfile) {
1956 /* Test if we are able to open the file. The server will not
1957 * be able to abort just for this problem later... */
1958 logfp = fopen(server.logfile,"a");
1959 if (logfp == NULL) {
1960 err = sdscatprintf(sdsempty(),
1961 "Can't open the log file: %s", strerror(errno));
1962 goto loaderr;
1963 }
1964 fclose(logfp);
1965 }
1966 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1967 server.dbnum = atoi(argv[1]);
1968 if (server.dbnum < 1) {
1969 err = "Invalid number of databases"; goto loaderr;
1970 }
1971 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1972 loadServerConfig(argv[1]);
1973 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1974 server.maxclients = atoi(argv[1]);
1975 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1976 server.maxmemory = memtoll(argv[1],NULL);
1977 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1978 server.masterhost = sdsnew(argv[1]);
1979 server.masterport = atoi(argv[2]);
1980 server.replstate = REDIS_REPL_CONNECT;
1981 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1982 server.masterauth = zstrdup(argv[1]);
1983 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1984 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1985 err = "argument must be 'yes' or 'no'"; goto loaderr;
1986 }
1987 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1988 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1989 err = "argument must be 'yes' or 'no'"; goto loaderr;
1990 }
1991 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1992 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1993 err = "argument must be 'yes' or 'no'"; goto loaderr;
1994 }
1995 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1996 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1997 err = "argument must be 'yes' or 'no'"; goto loaderr;
1998 }
1999 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2000 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2001 err = "argument must be 'yes' or 'no'"; goto loaderr;
2002 }
2003 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2004 zfree(server.appendfilename);
2005 server.appendfilename = zstrdup(argv[1]);
2006 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2007 && argc == 2) {
2008 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2009 err = "argument must be 'yes' or 'no'"; goto loaderr;
2010 }
2011 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
2012 if (!strcasecmp(argv[1],"no")) {
2013 server.appendfsync = APPENDFSYNC_NO;
2014 } else if (!strcasecmp(argv[1],"always")) {
2015 server.appendfsync = APPENDFSYNC_ALWAYS;
2016 } else if (!strcasecmp(argv[1],"everysec")) {
2017 server.appendfsync = APPENDFSYNC_EVERYSEC;
2018 } else {
2019 err = "argument must be 'no', 'always' or 'everysec'";
2020 goto loaderr;
2021 }
2022 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2023 server.requirepass = zstrdup(argv[1]);
2024 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2025 zfree(server.pidfile);
2026 server.pidfile = zstrdup(argv[1]);
2027 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2028 zfree(server.dbfilename);
2029 server.dbfilename = zstrdup(argv[1]);
2030 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2031 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2032 err = "argument must be 'yes' or 'no'"; goto loaderr;
2033 }
2034 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2035 zfree(server.vm_swap_file);
2036 server.vm_swap_file = zstrdup(argv[1]);
2037 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2038 server.vm_max_memory = memtoll(argv[1],NULL);
2039 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2040 server.vm_page_size = memtoll(argv[1], NULL);
2041 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2042 server.vm_pages = memtoll(argv[1], NULL);
2043 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2044 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2045 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2046 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2047 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2048 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2049 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2050 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2051 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2052 server.list_max_ziplist_value = memtoll(argv[1], NULL);
2053 } else if (!strcasecmp(argv[0],"set-max-intset-entries") && argc == 2){
2054 server.set_max_intset_entries = memtoll(argv[1], NULL);
2055 } else {
2056 err = "Bad directive or wrong number of arguments"; goto loaderr;
2057 }
2058 for (j = 0; j < argc; j++)
2059 sdsfree(argv[j]);
2060 zfree(argv);
2061 sdsfree(line);
2062 }
2063 if (fp != stdin) fclose(fp);
2064 return;
2065
2066 loaderr:
2067 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2068 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2069 fprintf(stderr, ">>> '%s'\n", line);
2070 fprintf(stderr, "%s\n", err);
2071 exit(1);
2072 }
2073
2074 static void freeClientArgv(redisClient *c) {
2075 int j;
2076
2077 for (j = 0; j < c->argc; j++)
2078 decrRefCount(c->argv[j]);
2079 for (j = 0; j < c->mbargc; j++)
2080 decrRefCount(c->mbargv[j]);
2081 c->argc = 0;
2082 c->mbargc = 0;
2083 }
2084
/* Destroy the client 'c': release every resource it owns, unlink it from
 * all the server-wide lists it may belong to, close its socket and free
 * the structure itself. After this call 'c' must not be used anymore.
 *
 * The order of the steps matters, see the comments below. */
static void freeClient(redisClient *c) {
    listNode *ln;

    /* Note that if the client we are freeing is blocked into a blocking
     * call, we have to set querybuf to NULL *before* to call
     * unblockClientWaitingData() to avoid processInputBuffer() will get
     * called. Also it is important to remove the file events after
     * this, because this call adds the READABLE event. */
    sdsfree(c->querybuf);
    c->querybuf = NULL;
    if (c->flags & REDIS_BLOCKED)
        unblockClientWaitingData(c);

    /* UNWATCH all the keys */
    unwatchAllKeys(c);
    listRelease(c->watched_keys);
    /* Unsubscribe from all the pubsub channels */
    pubsubUnsubscribeAllChannels(c,0);
    pubsubUnsubscribeAllPatterns(c,0);
    dictRelease(c->pubsub_channels);
    listRelease(c->pubsub_patterns);
    /* Obvious cleanup */
    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
    aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
    listRelease(c->reply);
    freeClientArgv(c);
    close(c->fd);
    /* Remove from the list of clients */
    ln = listSearchKey(server.clients,c);
    redisAssert(ln != NULL);
    listDelNode(server.clients,ln);
    /* Remove from the list of clients that are now ready to be restarted
     * after waiting for swapped keys */
    if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
        ln = listSearchKey(server.io_ready_clients,c);
        if (ln) {
            listDelNode(server.io_ready_clients,ln);
            server.vm_blocked_clients--;
        }
    }
    /* Remove from the list of clients waiting for swapped keys */
    while (server.vm_enabled && listLength(c->io_keys)) {
        ln = listFirst(c->io_keys);
        dontWaitForSwappedKey(c,ln->value);
    }
    listRelease(c->io_keys);
    /* Master/slave cleanup: a client flagged as slave lives either in the
     * monitors list or in the slaves list, depending on REDIS_MONITOR. */
    if (c->flags & REDIS_SLAVE) {
        if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
            close(c->repldbfd);
        list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
        ln = listSearchKey(l,c);
        redisAssert(ln != NULL);
        listDelNode(l,ln);
    }
    /* If this client was our master, forget it and go back to the
     * CONNECT replication state. */
    if (c->flags & REDIS_MASTER) {
        server.master = NULL;
        server.replstate = REDIS_REPL_CONNECT;
    }
    /* Release memory */
    zfree(c->argv);
    zfree(c->mbargv);
    freeClientMultiState(c);
    zfree(c);
}
2150
2151 #define GLUEREPLY_UP_TO (1024)
2152 static void glueReplyBuffersIfNeeded(redisClient *c) {
2153 int copylen = 0;
2154 char buf[GLUEREPLY_UP_TO];
2155 listNode *ln;
2156 listIter li;
2157 robj *o;
2158
2159 listRewind(c->reply,&li);
2160 while((ln = listNext(&li))) {
2161 int objlen;
2162
2163 o = ln->value;
2164 objlen = sdslen(o->ptr);
2165 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2166 memcpy(buf+copylen,o->ptr,objlen);
2167 copylen += objlen;
2168 listDelNode(c->reply,ln);
2169 } else {
2170 if (copylen == 0) return;
2171 break;
2172 }
2173 }
2174 /* Now the output buffer is empty, add the new single element */
2175 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2176 listAddNodeHead(c->reply,o);
2177 }
2178
/* Writable event handler: send the pending reply objects of the client
 * passed as 'privdata' to the socket 'fd'.
 *
 * Objects are written one at a time from the head of c->reply, with
 * c->sentlen tracking how many bytes of the head object were already
 * sent. When the reply list becomes empty the writable event is removed.
 * On a write error other than EAGAIN the client is freed. */
static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
    redisClient *c = privdata;
    int nwritten = 0, totwritten = 0, objlen;
    robj *o;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);

    /* Use writev() if we have enough buffers to send */
    if (!server.glueoutputbuf &&
        listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
        !(c->flags & REDIS_MASTER))
    {
        sendReplyToClientWritev(el, fd, privdata, mask);
        return;
    }

    while(listLength(c->reply)) {
        if (server.glueoutputbuf && listLength(c->reply) > 1)
            glueReplyBuffersIfNeeded(c);

        o = listNodeValue(listFirst(c->reply));
        objlen = sdslen(o->ptr);

        /* Empty objects carry nothing to send: just drop them. */
        if (objlen == 0) {
            listDelNode(c->reply,listFirst(c->reply));
            continue;
        }

        if (c->flags & REDIS_MASTER) {
            /* Don't reply to a master: pretend the whole object was
             * written so that it gets discarded below. */
            nwritten = objlen - c->sentlen;
        } else {
            nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
            if (nwritten <= 0) break;
        }
        c->sentlen += nwritten;
        totwritten += nwritten;
        /* If we fully sent the object on head go to the next one */
        if (c->sentlen == objlen) {
            listDelNode(c->reply,listFirst(c->reply));
            c->sentlen = 0;
        }
        /* Note that we avoid to send more than REDIS_MAX_WRITE_PER_EVENT
         * bytes, in a single threaded server it's a good idea to serve
         * other clients as well, even if a very large request comes from
         * super fast link that is always able to accept data (in real world
         * scenario think about 'KEYS *' against the loopback interface) */
        if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
    }
    /* nwritten is -1 only when the last write() failed: EAGAIN just means
     * the socket cannot accept more data now, any other error is fatal
     * for this client. */
    if (nwritten == -1) {
        if (errno == EAGAIN) {
            nwritten = 0;
        } else {
            redisLog(REDIS_VERBOSE,
                "Error writing to client: %s", strerror(errno));
            freeClient(c);
            return;
        }
    }
    if (totwritten > 0) c->lastinteraction = time(NULL);
    if (listLength(c->reply) == 0) {
        c->sentlen = 0;
        aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
    }
}
2244
2245 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2246 {
2247 redisClient *c = privdata;
2248 int nwritten = 0, totwritten = 0, objlen, willwrite;
2249 robj *o;
2250 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2251 int offset, ion = 0;
2252 REDIS_NOTUSED(el);
2253 REDIS_NOTUSED(mask);
2254
2255 listNode *node;
2256 while (listLength(c->reply)) {
2257 offset = c->sentlen;
2258 ion = 0;
2259 willwrite = 0;
2260
2261 /* fill-in the iov[] array */
2262 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2263 o = listNodeValue(node);
2264 objlen = sdslen(o->ptr);
2265
2266 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2267 break;
2268
2269 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2270 break; /* no more iovecs */
2271
2272 iov[ion].iov_base = ((char*)o->ptr) + offset;
2273 iov[ion].iov_len = objlen - offset;
2274 willwrite += objlen - offset;
2275 offset = 0; /* just for the first item */
2276 ion++;
2277 }
2278
2279 if(willwrite == 0)
2280 break;
2281
2282 /* write all collected blocks at once */
2283 if((nwritten = writev(fd, iov, ion)) < 0) {
2284 if (errno != EAGAIN) {
2285 redisLog(REDIS_VERBOSE,
2286 "Error writing to client: %s", strerror(errno));
2287 freeClient(c);
2288 return;
2289 }
2290 break;
2291 }
2292
2293 totwritten += nwritten;
2294 offset = c->sentlen;
2295
2296 /* remove written robjs from c->reply */
2297 while (nwritten && listLength(c->reply)) {
2298 o = listNodeValue(listFirst(c->reply));
2299 objlen = sdslen(o->ptr);
2300
2301 if(nwritten >= objlen - offset) {
2302 listDelNode(c->reply, listFirst(c->reply));
2303 nwritten -= objlen - offset;
2304 c->sentlen = 0;
2305 } else {
2306 /* partial write */
2307 c->sentlen += nwritten;
2308 break;
2309 }
2310 offset = 0;
2311 }
2312 }
2313
2314 if (totwritten > 0)
2315 c->lastinteraction = time(NULL);
2316
2317 if (listLength(c->reply) == 0) {
2318 c->sentlen = 0;
2319 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2320 }
2321 }
2322
2323 static int qsortRedisCommands(const void *r1, const void *r2) {
2324 return strcasecmp(
2325 ((struct redisCommand*)r1)->name,
2326 ((struct redisCommand*)r2)->name);
2327 }
2328
2329 static void sortCommandTable() {
2330 /* Copy and sort the read-only version of the command table */
2331 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2332 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2333 qsort(commandTable,
2334 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2335 sizeof(struct redisCommand),qsortRedisCommands);
2336 }
2337
2338 static struct redisCommand *lookupCommand(char *name) {
2339 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2340 return bsearch(
2341 &tmp,
2342 commandTable,
2343 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2344 sizeof(struct redisCommand),
2345 qsortRedisCommands);
2346 }
2347
2348 /* resetClient prepare the client to process the next command */
2349 static void resetClient(redisClient *c) {
2350 freeClientArgv(c);
2351 c->bulklen = -1;
2352 c->multibulk = 0;
2353 }
2354
2355 /* Call() is the core of Redis execution of a command */
2356 static void call(redisClient *c, struct redisCommand *cmd) {
2357 long long dirty;
2358
2359 dirty = server.dirty;
2360 cmd->proc(c);
2361 dirty = server.dirty-dirty;
2362
2363 if (server.appendonly && dirty)
2364 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2365 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2366 listLength(server.slaves))
2367 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2368 if (listLength(server.monitors))
2369 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2370 server.stat_numcommands++;
2371 }
2372
/* If this function gets called we already read a whole
 * command, arguments are in the client argv/argc fields.
 * processCommand() executes the command or prepares the
 * server for a bulk read from the client.
 *
 * If 1 is returned the client is still alive and valid and
 * other operations can be performed by the caller. Otherwise
 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2381 static int processCommand(redisClient *c) {
2382 struct redisCommand *cmd;
2383
2384 /* Free some memory if needed (maxmemory setting) */
2385 if (server.maxmemory) freeMemoryIfNeeded();
2386
2387 /* Handle the multi bulk command type. This is an alternative protocol
2388 * supported by Redis in order to receive commands that are composed of
2389 * multiple binary-safe "bulk" arguments. The latency of processing is
2390 * a bit higher but this allows things like multi-sets, so if this
2391 * protocol is used only for MSET and similar commands this is a big win. */
2392 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2393 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2394 if (c->multibulk <= 0) {
2395 resetClient(c);
2396 return 1;
2397 } else {
2398 decrRefCount(c->argv[c->argc-1]);
2399 c->argc--;
2400 return 1;
2401 }
2402 } else if (c->multibulk) {
2403 if (c->bulklen == -1) {
2404 if (((char*)c->argv[0]->ptr)[0] != '$') {
2405 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2406 resetClient(c);
2407 return 1;
2408 } else {
2409 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2410 decrRefCount(c->argv[0]);
2411 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2412 c->argc--;
2413 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2414 resetClient(c);
2415 return 1;
2416 }
2417 c->argc--;
2418 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2419 return 1;
2420 }
2421 } else {
2422 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2423 c->mbargv[c->mbargc] = c->argv[0];
2424 c->mbargc++;
2425 c->argc--;
2426 c->multibulk--;
2427 if (c->multibulk == 0) {
2428 robj **auxargv;
2429 int auxargc;
2430
2431 /* Here we need to swap the multi-bulk argc/argv with the
2432 * normal argc/argv of the client structure. */
2433 auxargv = c->argv;
2434 c->argv = c->mbargv;
2435 c->mbargv = auxargv;
2436
2437 auxargc = c->argc;
2438 c->argc = c->mbargc;
2439 c->mbargc = auxargc;
2440
2441 /* We need to set bulklen to something different than -1
2442 * in order for the code below to process the command without
2443 * to try to read the last argument of a bulk command as
2444 * a special argument. */
2445 c->bulklen = 0;
2446 /* continue below and process the command */
2447 } else {
2448 c->bulklen = -1;
2449 return 1;
2450 }
2451 }
2452 }
2453 /* -- end of multi bulk commands processing -- */
2454
2455 /* The QUIT command is handled as a special case. Normal command
2456 * procs are unable to close the client connection safely */
2457 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2458 freeClient(c);
2459 return 0;
2460 }
2461
2462 /* Now lookup the command and check ASAP about trivial error conditions
2463 * such wrong arity, bad command name and so forth. */
2464 cmd = lookupCommand(c->argv[0]->ptr);
2465 if (!cmd) {
2466 addReplySds(c,
2467 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2468 (char*)c->argv[0]->ptr));
2469 resetClient(c);
2470 return 1;
2471 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2472 (c->argc < -cmd->arity)) {
2473 addReplySds(c,
2474 sdscatprintf(sdsempty(),
2475 "-ERR wrong number of arguments for '%s' command\r\n",
2476 cmd->name));
2477 resetClient(c);
2478 return 1;
2479 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2480 /* This is a bulk command, we have to read the last argument yet. */
2481 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2482
2483 decrRefCount(c->argv[c->argc-1]);
2484 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2485 c->argc--;
2486 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2487 resetClient(c);
2488 return 1;
2489 }
2490 c->argc--;
2491 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2492 /* It is possible that the bulk read is already in the
2493 * buffer. Check this condition and handle it accordingly.
2494 * This is just a fast path, alternative to call processInputBuffer().
2495 * It's a good idea since the code is small and this condition
2496 * happens most of the times. */
2497 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2498 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2499 c->argc++;
2500 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2501 } else {
2502 /* Otherwise return... there is to read the last argument
2503 * from the socket. */
2504 return 1;
2505 }
2506 }
2507 /* Let's try to encode the bulk object to save space. */
2508 if (cmd->flags & REDIS_CMD_BULK)
2509 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2510
2511 /* Check if the user is authenticated */
2512 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2513 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2514 resetClient(c);
2515 return 1;
2516 }
2517
2518 /* Handle the maxmemory directive */
2519 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2520 zmalloc_used_memory() > server.maxmemory)
2521 {
2522 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2523 resetClient(c);
2524 return 1;
2525 }
2526
2527 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2528 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2529 &&
2530 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2531 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2532 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2533 resetClient(c);
2534 return 1;
2535 }
2536
2537 /* Exec the command */
2538 if (c->flags & REDIS_MULTI &&
2539 cmd->proc != execCommand && cmd->proc != discardCommand &&
2540 cmd->proc != multiCommand && cmd->proc != watchCommand)
2541 {
2542 queueMultiCommand(c,cmd);
2543 addReply(c,shared.queued);
2544 } else {
2545 if (server.vm_enabled && server.vm_max_threads > 0 &&
2546 blockClientOnSwappedKeys(c,cmd)) return 1;
2547 call(c,cmd);
2548 }
2549
2550 /* Prepare the client for the next command */
2551 resetClient(c);
2552 return 1;
2553 }
2554
2555 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2556 listNode *ln;
2557 listIter li;
2558 int outc = 0, j;
2559 robj **outv;
2560 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2561 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2562 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2563 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2564 robj *lenobj;
2565
2566 if (argc <= REDIS_STATIC_ARGS) {
2567 outv = static_outv;
2568 } else {
2569 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2570 }
2571
2572 lenobj = createObject(REDIS_STRING,
2573 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2574 lenobj->refcount = 0;
2575 outv[outc++] = lenobj;
2576 for (j = 0; j < argc; j++) {
2577 lenobj = createObject(REDIS_STRING,
2578 sdscatprintf(sdsempty(),"$%lu\r\n",
2579 (unsigned long) stringObjectLen(argv[j])));
2580 lenobj->refcount = 0;
2581 outv[outc++] = lenobj;
2582 outv[outc++] = argv[j];
2583 outv[outc++] = shared.crlf;
2584 }
2585
2586 /* Increment all the refcounts at start and decrement at end in order to
2587 * be sure to free objects if there is no slave in a replication state
2588 * able to be feed with commands */
2589 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2590 listRewind(slaves,&li);
2591 while((ln = listNext(&li))) {
2592 redisClient *slave = ln->value;
2593
2594 /* Don't feed slaves that are still waiting for BGSAVE to start */
2595 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2596
2597 /* Feed all the other slaves, MONITORs and so on */
2598 if (slave->slaveseldb != dictid) {
2599 robj *selectcmd;
2600
2601 switch(dictid) {
2602 case 0: selectcmd = shared.select0; break;
2603 case 1: selectcmd = shared.select1; break;
2604 case 2: selectcmd = shared.select2; break;
2605 case 3: selectcmd = shared.select3; break;
2606 case 4: selectcmd = shared.select4; break;
2607 case 5: selectcmd = shared.select5; break;
2608 case 6: selectcmd = shared.select6; break;
2609 case 7: selectcmd = shared.select7; break;
2610 case 8: selectcmd = shared.select8; break;
2611 case 9: selectcmd = shared.select9; break;
2612 default:
2613 selectcmd = createObject(REDIS_STRING,
2614 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2615 selectcmd->refcount = 0;
2616 break;
2617 }
2618 addReply(slave,selectcmd);
2619 slave->slaveseldb = dictid;
2620 }
2621 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2622 }
2623 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2624 if (outv != static_outv) zfree(outv);
2625 }
2626
2627 static sds sdscatrepr(sds s, char *p, size_t len) {
2628 s = sdscatlen(s,"\"",1);
2629 while(len--) {
2630 switch(*p) {
2631 case '\\':
2632 case '"':
2633 s = sdscatprintf(s,"\\%c",*p);
2634 break;
2635 case '\n': s = sdscatlen(s,"\\n",1); break;
2636 case '\r': s = sdscatlen(s,"\\r",1); break;
2637 case '\t': s = sdscatlen(s,"\\t",1); break;
2638 case '\a': s = sdscatlen(s,"\\a",1); break;
2639 case '\b': s = sdscatlen(s,"\\b",1); break;
2640 default:
2641 if (isprint(*p))
2642 s = sdscatprintf(s,"%c",*p);
2643 else
2644 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2645 break;
2646 }
2647 p++;
2648 }
2649 return sdscatlen(s,"\"",1);
2650 }
2651
2652 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2653 listNode *ln;
2654 listIter li;
2655 int j;
2656 sds cmdrepr = sdsnew("+");
2657 robj *cmdobj;
2658 struct timeval tv;
2659
2660 gettimeofday(&tv,NULL);
2661 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2662 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2663
2664 for (j = 0; j < argc; j++) {
2665 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2666 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2667 } else {
2668 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2669 sdslen(argv[j]->ptr));
2670 }
2671 if (j != argc-1)
2672 cmdrepr = sdscatlen(cmdrepr," ",1);
2673 }
2674 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2675 cmdobj = createObject(REDIS_STRING,cmdrepr);
2676
2677 listRewind(monitors,&li);
2678 while((ln = listNext(&li))) {
2679 redisClient *monitor = ln->value;
2680 addReply(monitor,cmdobj);
2681 }
2682 decrRefCount(cmdobj);
2683 }
2684
2685 static void processInputBuffer(redisClient *c) {
2686 again:
2687 /* Before to process the input buffer, make sure the client is not
2688 * waitig for a blocking operation such as BLPOP. Note that the first
2689 * iteration the client is never blocked, otherwise the processInputBuffer
2690 * would not be called at all, but after the execution of the first commands
2691 * in the input buffer the client may be blocked, and the "goto again"
2692 * will try to reiterate. The following line will make it return asap. */
2693 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2694 if (c->bulklen == -1) {
2695 /* Read the first line of the query */
2696 char *p = strchr(c->querybuf,'\n');
2697 size_t querylen;
2698
2699 if (p) {
2700 sds query, *argv;
2701 int argc, j;
2702
2703 query = c->querybuf;
2704 c->querybuf = sdsempty();
2705 querylen = 1+(p-(query));
2706 if (sdslen(query) > querylen) {
2707 /* leave data after the first line of the query in the buffer */
2708 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2709 }
2710 *p = '\0'; /* remove "\n" */
2711 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2712 sdsupdatelen(query);
2713
2714 /* Now we can split the query in arguments */
2715 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2716 sdsfree(query);
2717
2718 if (c->argv) zfree(c->argv);
2719 c->argv = zmalloc(sizeof(robj*)*argc);
2720
2721 for (j = 0; j < argc; j++) {
2722 if (sdslen(argv[j])) {
2723 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2724 c->argc++;
2725 } else {
2726 sdsfree(argv[j]);
2727 }
2728 }
2729 zfree(argv);
2730 if (c->argc) {
2731 /* Execute the command. If the client is still valid
2732 * after processCommand() return and there is something
2733 * on the query buffer try to process the next command. */
2734 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2735 } else {
2736 /* Nothing to process, argc == 0. Just process the query
2737 * buffer if it's not empty or return to the caller */
2738 if (sdslen(c->querybuf)) goto again;
2739 }
2740 return;
2741 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2742 redisLog(REDIS_VERBOSE, "Client protocol error");
2743 freeClient(c);
2744 return;
2745 }
2746 } else {
2747 /* Bulk read handling. Note that if we are at this point
2748 the client already sent a command terminated with a newline,
2749 we are reading the bulk data that is actually the last
2750 argument of the command. */
2751 int qbl = sdslen(c->querybuf);
2752
2753 if (c->bulklen <= qbl) {
2754 /* Copy everything but the final CRLF as final argument */
2755 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2756 c->argc++;
2757 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2758 /* Process the command. If the client is still valid after
2759 * the processing and there is more data in the buffer
2760 * try to parse it. */
2761 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2762 return;
2763 }
2764 }
2765 }
2766
2767 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2768 redisClient *c = (redisClient*) privdata;
2769 char buf[REDIS_IOBUF_LEN];
2770 int nread;
2771 REDIS_NOTUSED(el);
2772 REDIS_NOTUSED(mask);
2773
2774 nread = read(fd, buf, REDIS_IOBUF_LEN);
2775 if (nread == -1) {
2776 if (errno == EAGAIN) {
2777 nread = 0;
2778 } else {
2779 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2780 freeClient(c);
2781 return;
2782 }
2783 } else if (nread == 0) {
2784 redisLog(REDIS_VERBOSE, "Client closed connection");
2785 freeClient(c);
2786 return;
2787 }
2788 if (nread) {
2789 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2790 c->lastinteraction = time(NULL);
2791 } else {
2792 return;
2793 }
2794 processInputBuffer(c);
2795 }
2796
2797 static int selectDb(redisClient *c, int id) {
2798 if (id < 0 || id >= server.dbnum)
2799 return REDIS_ERR;
2800 c->db = &server.db[id];
2801 return REDIS_OK;
2802 }
2803
2804 static void *dupClientReplyValue(void *o) {
2805 incrRefCount((robj*)o);
2806 return o;
2807 }
2808
2809 static int listMatchObjects(void *a, void *b) {
2810 return equalStringObjects(a,b);
2811 }
2812
2813 static redisClient *createClient(int fd) {
2814 redisClient *c = zmalloc(sizeof(*c));
2815
2816 anetNonBlock(NULL,fd);
2817 anetTcpNoDelay(NULL,fd);
2818 if (!c) return NULL;
2819 selectDb(c,0);
2820 c->fd = fd;
2821 c->querybuf = sdsempty();
2822 c->argc = 0;
2823 c->argv = NULL;
2824 c->bulklen = -1;
2825 c->multibulk = 0;
2826 c->mbargc = 0;
2827 c->mbargv = NULL;
2828 c->sentlen = 0;
2829 c->flags = 0;
2830 c->lastinteraction = time(NULL);
2831 c->authenticated = 0;
2832 c->replstate = REDIS_REPL_NONE;
2833 c->reply = listCreate();
2834 listSetFreeMethod(c->reply,decrRefCount);
2835 listSetDupMethod(c->reply,dupClientReplyValue);
2836 c->blocking_keys = NULL;
2837 c->blocking_keys_num = 0;
2838 c->io_keys = listCreate();
2839 c->watched_keys = listCreate();
2840 listSetFreeMethod(c->io_keys,decrRefCount);
2841 c->pubsub_channels = dictCreate(&setDictType,NULL);
2842 c->pubsub_patterns = listCreate();
2843 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2844 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2845 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2846 readQueryFromClient, c) == AE_ERR) {
2847 freeClient(c);
2848 return NULL;
2849 }
2850 listAddNodeTail(server.clients,c);
2851 initClientMultiState(c);
2852 return c;
2853 }
2854
2855 static void addReply(redisClient *c, robj *obj) {
2856 if (listLength(c->reply) == 0 &&
2857 (c->replstate == REDIS_REPL_NONE ||
2858 c->replstate == REDIS_REPL_ONLINE) &&
2859 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2860 sendReplyToClient, c) == AE_ERR) return;
2861
2862 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2863 obj = dupStringObject(obj);
2864 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2865 }
2866 listAddNodeTail(c->reply,getDecodedObject(obj));
2867 }
2868
2869 static void addReplySds(redisClient *c, sds s) {
2870 robj *o = createObject(REDIS_STRING,s);
2871 addReply(c,o);
2872 decrRefCount(o);
2873 }
2874
2875 static void addReplyDouble(redisClient *c, double d) {
2876 char buf[128];
2877
2878 snprintf(buf,sizeof(buf),"%.17g",d);
2879 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2880 (unsigned long) strlen(buf),buf));
2881 }
2882
2883 static void addReplyLongLong(redisClient *c, long long ll) {
2884 char buf[128];
2885 size_t len;
2886
2887 if (ll == 0) {
2888 addReply(c,shared.czero);
2889 return;
2890 } else if (ll == 1) {
2891 addReply(c,shared.cone);
2892 return;
2893 }
2894 buf[0] = ':';
2895 len = ll2string(buf+1,sizeof(buf)-1,ll);
2896 buf[len+1] = '\r';
2897 buf[len+2] = '\n';
2898 addReplySds(c,sdsnewlen(buf,len+3));
2899 }
2900
2901 static void addReplyUlong(redisClient *c, unsigned long ul) {
2902 char buf[128];
2903 size_t len;
2904
2905 if (ul == 0) {
2906 addReply(c,shared.czero);
2907 return;
2908 } else if (ul == 1) {
2909 addReply(c,shared.cone);
2910 return;
2911 }
2912 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2913 addReplySds(c,sdsnewlen(buf,len));
2914 }
2915
2916 static void addReplyBulkLen(redisClient *c, robj *obj) {
2917 size_t len, intlen;
2918 char buf[128];
2919
2920 if (obj->encoding == REDIS_ENCODING_RAW) {
2921 len = sdslen(obj->ptr);
2922 } else {
2923 long n = (long)obj->ptr;
2924
2925 /* Compute how many bytes will take this integer as a radix 10 string */
2926 len = 1;
2927 if (n < 0) {
2928 len++;
2929 n = -n;
2930 }
2931 while((n = n/10) != 0) {
2932 len++;
2933 }
2934 }
2935 buf[0] = '$';
2936 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2937 buf[intlen+1] = '\r';
2938 buf[intlen+2] = '\n';
2939 addReplySds(c,sdsnewlen(buf,intlen+3));
2940 }
2941
2942 static void addReplyBulk(redisClient *c, robj *obj) {
2943 addReplyBulkLen(c,obj);
2944 addReply(c,obj);
2945 addReply(c,shared.crlf);
2946 }
2947
2948 static void addReplyBulkSds(redisClient *c, sds s) {
2949 robj *o = createStringObject(s, sdslen(s));
2950 addReplyBulk(c,o);
2951 decrRefCount(o);
2952 }
2953
2954 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2955 static void addReplyBulkCString(redisClient *c, char *s) {
2956 if (s == NULL) {
2957 addReply(c,shared.nullbulk);
2958 } else {
2959 robj *o = createStringObject(s,strlen(s));
2960 addReplyBulk(c,o);
2961 decrRefCount(o);
2962 }
2963 }
2964
2965 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2966 int cport, cfd;
2967 char cip[128];
2968 redisClient *c;
2969 REDIS_NOTUSED(el);
2970 REDIS_NOTUSED(mask);
2971 REDIS_NOTUSED(privdata);
2972
2973 cfd = anetAccept(server.neterr, fd, cip, &cport);
2974 if (cfd == AE_ERR) {
2975 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2976 return;
2977 }
2978 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2979 if ((c = createClient(cfd)) == NULL) {
2980 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2981 close(cfd); /* May be already closed, just ingore errors */
2982 return;
2983 }
2984 /* If maxclient directive is set and this is one client more... close the
2985 * connection. Note that we create the client instead to check before
2986 * for this condition, since now the socket is already set in nonblocking
2987 * mode and we can send an error for free using the Kernel I/O */
2988 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2989 char *err = "-ERR max number of clients reached\r\n";
2990
2991 /* That's a best effort error message, don't check write errors */
2992 if (write(c->fd,err,strlen(err)) == -1) {
2993 /* Nothing to do, Just to avoid the warning... */
2994 }
2995 freeClient(c);
2996 return;
2997 }
2998 server.stat_numconnections++;
2999 }
3000
3001 /* ======================= Redis objects implementation ===================== */
3002
3003 static robj *createObject(int type, void *ptr) {
3004 robj *o;
3005
3006 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3007 if (listLength(server.objfreelist)) {
3008 listNode *head = listFirst(server.objfreelist);
3009 o = listNodeValue(head);
3010 listDelNode(server.objfreelist,head);
3011 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3012 } else {
3013 if (server.vm_enabled)
3014 pthread_mutex_unlock(&server.obj_freelist_mutex);
3015 o = zmalloc(sizeof(*o));
3016 }
3017 o->type = type;
3018 o->encoding = REDIS_ENCODING_RAW;
3019 o->ptr = ptr;
3020 o->refcount = 1;
3021 if (server.vm_enabled) {
3022 /* Note that this code may run in the context of an I/O thread
3023 * and accessing server.lruclock in theory is an error
3024 * (no locks). But in practice this is safe, and even if we read
3025 * garbage Redis will not fail. */
3026 o->lru = server.lruclock;
3027 o->storage = REDIS_VM_MEMORY;
3028 }
3029 return o;
3030 }
3031
3032 static robj *createStringObject(char *ptr, size_t len) {
3033 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3034 }
3035
3036 static robj *createStringObjectFromLongLong(long long value) {
3037 robj *o;
3038 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3039 incrRefCount(shared.integers[value]);
3040 o = shared.integers[value];
3041 } else {
3042 if (value >= LONG_MIN && value <= LONG_MAX) {
3043 o = createObject(REDIS_STRING, NULL);
3044 o->encoding = REDIS_ENCODING_INT;
3045 o->ptr = (void*)((long)value);
3046 } else {
3047 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3048 }
3049 }
3050 return o;
3051 }
3052
3053 static robj *dupStringObject(robj *o) {
3054 assert(o->encoding == REDIS_ENCODING_RAW);
3055 return createStringObject(o->ptr,sdslen(o->ptr));
3056 }
3057
3058 static robj *createListObject(void) {
3059 list *l = listCreate();
3060 robj *o = createObject(REDIS_LIST,l);
3061 listSetFreeMethod(l,decrRefCount);
3062 o->encoding = REDIS_ENCODING_LIST;
3063 return o;
3064 }
3065
3066 static robj *createZiplistObject(void) {
3067 unsigned char *zl = ziplistNew();
3068 robj *o = createObject(REDIS_LIST,zl);
3069 o->encoding = REDIS_ENCODING_ZIPLIST;
3070 return o;
3071 }
3072
3073 static robj *createSetObject(void) {
3074 dict *d = dictCreate(&setDictType,NULL);
3075 robj *o = createObject(REDIS_SET,d);
3076 o->encoding = REDIS_ENCODING_HT;
3077 return o;
3078 }
3079
3080 static robj *createIntsetObject(void) {
3081 intset *is = intsetNew();
3082 robj *o = createObject(REDIS_SET,is);
3083 o->encoding = REDIS_ENCODING_INTSET;
3084 return o;
3085 }
3086
3087 static robj *createHashObject(void) {
3088 /* All the Hashes start as zipmaps. Will be automatically converted
3089 * into hash tables if there are enough elements or big elements
3090 * inside. */
3091 unsigned char *zm = zipmapNew();
3092 robj *o = createObject(REDIS_HASH,zm);
3093 o->encoding = REDIS_ENCODING_ZIPMAP;
3094 return o;
3095 }
3096
3097 static robj *createZsetObject(void) {
3098 zset *zs = zmalloc(sizeof(*zs));
3099
3100 zs->dict = dictCreate(&zsetDictType,NULL);
3101 zs->zsl = zslCreate();
3102 return createObject(REDIS_ZSET,zs);
3103 }
3104
3105 static void freeStringObject(robj *o) {
3106 if (o->encoding == REDIS_ENCODING_RAW) {
3107 sdsfree(o->ptr);
3108 }
3109 }
3110
3111 static void freeListObject(robj *o) {
3112 switch (o->encoding) {
3113 case REDIS_ENCODING_LIST:
3114 listRelease((list*) o->ptr);
3115 break;
3116 case REDIS_ENCODING_ZIPLIST:
3117 zfree(o->ptr);
3118 break;
3119 default:
3120 redisPanic("Unknown list encoding type");
3121 }
3122 }
3123
3124 static void freeSetObject(robj *o) {
3125 switch (o->encoding) {
3126 case REDIS_ENCODING_HT:
3127 dictRelease((dict*) o->ptr);
3128 break;
3129 case REDIS_ENCODING_INTSET:
3130 zfree(o->ptr);
3131 break;
3132 default:
3133 redisPanic("Unknown set encoding type");
3134 }
3135 }
3136
3137 static void freeZsetObject(robj *o) {
3138 zset *zs = o->ptr;
3139
3140 dictRelease(zs->dict);
3141 zslFree(zs->zsl);
3142 zfree(zs);
3143 }
3144
3145 static void freeHashObject(robj *o) {
3146 switch (o->encoding) {
3147 case REDIS_ENCODING_HT:
3148 dictRelease((dict*) o->ptr);
3149 break;
3150 case REDIS_ENCODING_ZIPMAP:
3151 zfree(o->ptr);
3152 break;
3153 default:
3154 redisPanic("Unknown hash encoding type");
3155 break;
3156 }
3157 }
3158
3159 static void incrRefCount(robj *o) {
3160 o->refcount++;
3161 }
3162
3163 static void decrRefCount(void *obj) {
3164 robj *o = obj;
3165
3166 /* Object is a swapped out value, or in the process of being loaded. */
3167 if (server.vm_enabled &&
3168 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3169 {
3170 vmpointer *vp = obj;
3171 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3172 vmMarkPagesFree(vp->page,vp->usedpages);
3173 server.vm_stats_swapped_objects--;
3174 zfree(vp);
3175 return;
3176 }
3177
3178 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3179 /* Object is in memory, or in the process of being swapped out.
3180 *
3181 * If the object is being swapped out, abort the operation on
3182 * decrRefCount even if the refcount does not drop to 0: the object
3183 * is referenced at least two times, as value of the key AND as
3184 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3185 * done but the relevant key was removed in the meantime, the
3186 * complete jobs handler will not find the key about the job and the
3187 * assert will fail. */
3188 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3189 vmCancelThreadedIOJob(o);
3190 if (--(o->refcount) == 0) {
3191 switch(o->type) {
3192 case REDIS_STRING: freeStringObject(o); break;
3193 case REDIS_LIST: freeListObject(o); break;
3194 case REDIS_SET: freeSetObject(o); break;
3195 case REDIS_ZSET: freeZsetObject(o); break;
3196 case REDIS_HASH: freeHashObject(o); break;
3197 default: redisPanic("Unknown object type"); break;
3198 }
3199 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3200 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3201 !listAddNodeHead(server.objfreelist,o))
3202 zfree(o);
3203 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3204 }
3205 }
3206
3207 static int checkType(redisClient *c, robj *o, int type) {
3208 if (o->type != type) {
3209 addReply(c,shared.wrongtypeerr);
3210 return 1;
3211 }
3212 return 0;
3213 }
3214
3215 /* Check if the nul-terminated string 's' can be represented by a long
3216 * (that is, is a number that fits into long without any other space or
3217 * character before or after the digits).
3218 *
3219 * If so, the function returns REDIS_OK and *longval is set to the value
3220 * of the number. Otherwise REDIS_ERR is returned */
3221 static int isStringRepresentableAsLong(sds s, long *longval) {
3222 char buf[32], *endptr;
3223 long value;
3224 int slen;
3225
3226 value = strtol(s, &endptr, 10);
3227 if (endptr[0] != '\0') return REDIS_ERR;
3228 slen = ll2string(buf,32,value);
3229
3230 /* If the number converted back into a string is not identical
3231 * then it's not possible to encode the string as integer */
3232 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3233 if (longval) *longval = value;
3234 return REDIS_OK;
3235 }
3236
3237 /* Try to encode a string object in order to save space */
3238 static robj *tryObjectEncoding(robj *o) {
3239 long value;
3240 sds s = o->ptr;
3241
3242 if (o->encoding != REDIS_ENCODING_RAW)
3243 return o; /* Already encoded */
3244
3245 /* It's not safe to encode shared objects: shared objects can be shared
3246 * everywhere in the "object space" of Redis. Encoded objects can only
3247 * appear as "values" (and not, for instance, as keys) */
3248 if (o->refcount > 1) return o;
3249
3250 /* Currently we try to encode only strings */
3251 redisAssert(o->type == REDIS_STRING);
3252
3253 /* Check if we can represent this string as a long integer */
3254 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3255
3256 /* Ok, this object can be encoded */
3257 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3258 decrRefCount(o);
3259 incrRefCount(shared.integers[value]);
3260 return shared.integers[value];
3261 } else {
3262 o->encoding = REDIS_ENCODING_INT;
3263 sdsfree(o->ptr);
3264 o->ptr = (void*) value;
3265 return o;
3266 }
3267 }
3268
3269 /* Get a decoded version of an encoded object (returned as a new object).
3270 * If the object is already raw-encoded just increment the ref count. */
3271 static robj *getDecodedObject(robj *o) {
3272 robj *dec;
3273
3274 if (o->encoding == REDIS_ENCODING_RAW) {
3275 incrRefCount(o);
3276 return o;
3277 }
3278 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3279 char buf[32];
3280
3281 ll2string(buf,32,(long)o->ptr);
3282 dec = createStringObject(buf,strlen(buf));
3283 return dec;
3284 } else {
3285 redisPanic("Unknown encoding type");
3286 }
3287 }
3288
3289 /* Compare two string objects via strcmp() or alike.
3290 * Note that the objects may be integer-encoded. In such a case we
3291 * use ll2string() to get a string representation of the numbers on the stack
3292 * and compare the strings, it's much faster than calling getDecodedObject().
3293 *
3294 * Important note: if objects are not integer encoded, but binary-safe strings,
3295 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3296 * binary safe. */
3297 static int compareStringObjects(robj *a, robj *b) {
3298 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3299 char bufa[128], bufb[128], *astr, *bstr;
3300 int bothsds = 1;
3301
3302 if (a == b) return 0;
3303 if (a->encoding != REDIS_ENCODING_RAW) {
3304 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3305 astr = bufa;
3306 bothsds = 0;
3307 } else {
3308 astr = a->ptr;
3309 }
3310 if (b->encoding != REDIS_ENCODING_RAW) {
3311 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3312 bstr = bufb;
3313 bothsds = 0;
3314 } else {
3315 bstr = b->ptr;
3316 }
3317 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3318 }
3319
3320 /* Equal string objects return 1 if the two objects are the same from the
3321 * point of view of a string comparison, otherwise 0 is returned. Note that
3322 * this function is faster then checking for (compareStringObject(a,b) == 0)
3323 * because it can perform some more optimization. */
3324 static int equalStringObjects(robj *a, robj *b) {
3325 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3326 return a->ptr == b->ptr;
3327 } else {
3328 return compareStringObjects(a,b) == 0;
3329 }
3330 }
3331
3332 static size_t stringObjectLen(robj *o) {
3333 redisAssert(o->type == REDIS_STRING);
3334 if (o->encoding == REDIS_ENCODING_RAW) {
3335 return sdslen(o->ptr);
3336 } else {
3337 char buf[32];
3338
3339 return ll2string(buf,32,(long)o->ptr);
3340 }
3341 }
3342
3343 static int getDoubleFromObject(robj *o, double *target) {
3344 double value;
3345 char *eptr;
3346
3347 if (o == NULL) {
3348 value = 0;
3349 } else {
3350 redisAssert(o->type == REDIS_STRING);
3351 if (o->encoding == REDIS_ENCODING_RAW) {
3352 value = strtod(o->ptr, &eptr);
3353 if (eptr[0] != '\0') return REDIS_ERR;
3354 } else if (o->encoding == REDIS_ENCODING_INT) {
3355 value = (long)o->ptr;
3356 } else {
3357 redisPanic("Unknown string encoding");
3358 }
3359 }
3360
3361 *target = value;
3362 return REDIS_OK;
3363 }
3364
3365 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3366 double value;
3367 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3368 if (msg != NULL) {
3369 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3370 } else {
3371 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3372 }
3373 return REDIS_ERR;
3374 }
3375
3376 *target = value;
3377 return REDIS_OK;
3378 }
3379
3380 static int getLongLongFromObject(robj *o, long long *target) {
3381 long long value;
3382 char *eptr;
3383
3384 if (o == NULL) {
3385 value = 0;
3386 } else {
3387 redisAssert(o->type == REDIS_STRING);
3388 if (o->encoding == REDIS_ENCODING_RAW) {
3389 value = strtoll(o->ptr, &eptr, 10);
3390 if (eptr[0] != '\0') return REDIS_ERR;
3391 } else if (o->encoding == REDIS_ENCODING_INT) {
3392 value = (long)o->ptr;
3393 } else {
3394 redisPanic("Unknown string encoding");
3395 }
3396 }
3397
3398 if (target) *target = value;
3399 return REDIS_OK;
3400 }
3401
3402 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3403 long long value;
3404 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3405 if (msg != NULL) {
3406 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3407 } else {
3408 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3409 }
3410 return REDIS_ERR;
3411 }
3412
3413 *target = value;
3414 return REDIS_OK;
3415 }
3416
3417 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3418 long long value;
3419
3420 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3421 if (value < LONG_MIN || value > LONG_MAX) {
3422 if (msg != NULL) {
3423 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3424 } else {
3425 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3426 }
3427 return REDIS_ERR;
3428 }
3429
3430 *target = value;
3431 return REDIS_OK;
3432 }
3433
3434 /* =========================== Keyspace access API ========================== */
3435
3436 static robj *lookupKey(redisDb *db, robj *key) {
3437 dictEntry *de = dictFind(db->dict,key->ptr);
3438 if (de) {
3439 robj *val = dictGetEntryVal(de);
3440
3441 if (server.vm_enabled) {
3442 if (val->storage == REDIS_VM_MEMORY ||
3443 val->storage == REDIS_VM_SWAPPING)
3444 {
3445 /* If we were swapping the object out, cancel the operation */
3446 if (val->storage == REDIS_VM_SWAPPING)
3447 vmCancelThreadedIOJob(val);
3448 /* Update the access time for the aging algorithm. */
3449 val->lru = server.lruclock;
3450 } else {
3451 int notify = (val->storage == REDIS_VM_LOADING);
3452
3453 /* Our value was swapped on disk. Bring it at home. */
3454 redisAssert(val->type == REDIS_VMPOINTER);
3455 val = vmLoadObject(val);
3456 dictGetEntryVal(de) = val;
3457
3458 /* Clients blocked by the VM subsystem may be waiting for
3459 * this key... */
3460 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3461 }
3462 }
3463 return val;
3464 } else {
3465 return NULL;
3466 }
3467 }
3468
3469 static robj *lookupKeyRead(redisDb *db, robj *key) {
3470 expireIfNeeded(db,key);
3471 return lookupKey(db,key);
3472 }
3473
3474 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3475 deleteIfVolatile(db,key);
3476 touchWatchedKey(db,key);
3477 return lookupKey(db,key);
3478 }
3479
3480 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3481 robj *o = lookupKeyRead(c->db, key);
3482 if (!o) addReply(c,reply);
3483 return o;
3484 }
3485
3486 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3487 robj *o = lookupKeyWrite(c->db, key);
3488 if (!o) addReply(c,reply);
3489 return o;
3490 }
3491
3492 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3493 * otherwise REDIS_OK is returned, and the caller should increment the
3494 * refcount of 'val'. */
3495 static int dbAdd(redisDb *db, robj *key, robj *val) {
3496 /* Perform a lookup before adding the key, as we need to copy the
3497 * key value. */
3498 if (dictFind(db->dict, key->ptr) != NULL) {
3499 return REDIS_ERR;
3500 } else {
3501 sds copy = sdsdup(key->ptr);
3502 dictAdd(db->dict, copy, val);
3503 return REDIS_OK;
3504 }
3505 }
3506
3507 /* If the key does not exist, this is just like dbAdd(). Otherwise
3508 * the value associated to the key is replaced with the new one.
3509 *
3510 * On update (key already existed) 0 is returned. Otherwise 1. */
3511 static int dbReplace(redisDb *db, robj *key, robj *val) {
3512 if (dictFind(db->dict,key->ptr) == NULL) {
3513 sds copy = sdsdup(key->ptr);
3514 dictAdd(db->dict, copy, val);
3515 return 1;
3516 } else {
3517 dictReplace(db->dict, key->ptr, val);
3518 return 0;
3519 }
3520 }
3521
3522 static int dbExists(redisDb *db, robj *key) {
3523 return dictFind(db->dict,key->ptr) != NULL;
3524 }
3525
3526 /* Return a random key, in form of a Redis object.
3527 * If there are no keys, NULL is returned.
3528 *
3529 * The function makes sure to return keys not already expired. */
3530 static robj *dbRandomKey(redisDb *db) {
3531 struct dictEntry *de;
3532
3533 while(1) {
3534 sds key;
3535 robj *keyobj;
3536
3537 de = dictGetRandomKey(db->dict);
3538 if (de == NULL) return NULL;
3539
3540 key = dictGetEntryKey(de);
3541 keyobj = createStringObject(key,sdslen(key));
3542 if (dictFind(db->expires,key)) {
3543 if (expireIfNeeded(db,keyobj)) {
3544 decrRefCount(keyobj);
3545 continue; /* search for another key. This expired. */
3546 }
3547 }
3548 return keyobj;
3549 }
3550 }
3551
3552 /* Delete a key, value, and associated expiration entry if any, from the DB */
3553 static int dbDelete(redisDb *db, robj *key) {
3554 int retval;
3555
3556 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3557 retval = dictDelete(db->dict,key->ptr);
3558
3559 return retval == DICT_OK;
3560 }
3561
3562 /*============================ RDB saving/loading =========================== */
3563
/* Write the single byte 'type' to 'fp'.
 * Return 0 on success, -1 if the byte was not written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3568
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Return 0 on success, -1 if the value was not written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,sizeof(seconds),1,fp) == 0) ? -1 : 0;
}
3574
3575 /* check rdbLoadLen() comments for more info */
3576 static int rdbSaveLen(FILE *fp, uint32_t len) {
3577 unsigned char buf[2];
3578
3579 if (len < (1<<6)) {
3580 /* Save a 6 bit len */
3581 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3582 if (fwrite(buf,1,1,fp) == 0) return -1;
3583 } else if (len < (1<<14)) {
3584 /* Save a 14 bit len */
3585 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3586 buf[1] = len&0xFF;
3587 if (fwrite(buf,2,1,fp) == 0) return -1;
3588 } else {
3589 /* Save a 32 bit len */
3590 buf[0] = (REDIS_RDB_32BITLEN<<6);
3591 if (fwrite(buf,1,1,fp) == 0) return -1;
3592 len = htonl(len);
3593 if (fwrite(&len,4,1,fp) == 0) return -1;
3594 }
3595 return 0;
3596 }
3597
3598 /* Encode 'value' as an integer if possible (if integer will fit the
3599 * supported range). If the function sucessful encoded the integer
3600 * then the (up to 5 bytes) encoded representation is written in the
3601 * string pointed by 'enc' and the length is returned. Otherwise
3602 * 0 is returned. */
3603 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3604 /* Finally check if it fits in our ranges */
3605 if (value >= -(1<<7) && value <= (1<<7)-1) {
3606 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3607 enc[1] = value&0xFF;
3608 return 2;
3609 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3610 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3611 enc[1] = value&0xFF;
3612 enc[2] = (value>>8)&0xFF;
3613 return 3;
3614 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3615 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3616 enc[1] = value&0xFF;
3617 enc[2] = (value>>8)&0xFF;
3618 enc[3] = (value>>16)&0xFF;
3619 enc[4] = (value>>24)&0xFF;
3620 return 5;
3621 } else {
3622 return 0;
3623 }
3624 }
3625
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be NUL terminated
 * (callers pass raw buffers together with their length). On success the
 * encoded form is stored in 'enc' and its length is returned, otherwise 0
 * is returned and the string must be saved in a different way. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, digits[32], buf[32];

    /* strtoll() needs a NUL terminated string, so work on a private
     * terminated copy instead of reading past the end of 's'. An empty
     * string, or one that does not fit the local buffer, can never be
     * the canonical representation of a long long. */
    if (len == 0 || len >= sizeof(digits)) return 0;
    memcpy(digits,s,len);
    digits[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(digits, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3644
3645 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3646 size_t comprlen, outlen;
3647 unsigned char byte;
3648 void *out;
3649
3650 /* We require at least four bytes compression for this to be worth it */
3651 if (len <= 4) return 0;
3652 outlen = len-4;
3653 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3654 comprlen = lzf_compress(s, len, out, outlen);
3655 if (comprlen == 0) {
3656 zfree(out);
3657 return 0;
3658 }
3659 /* Data compressed! Let's save it on disk */
3660 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3661 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3662 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3663 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3664 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3665 zfree(out);
3666 return comprlen;
3667
3668 writeerr:
3669 zfree(out);
3670 return -1;
3671 }
3672
3673 /* Save a string objet as [len][data] on disk. If the object is a string
3674 * representation of an integer value we try to safe it in a special form */
3675 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3676 int enclen;
3677
3678 /* Try integer encoding */
3679 if (len <= 11) {
3680 unsigned char buf[5];
3681 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3682 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3683 return 0;
3684 }
3685 }
3686
3687 /* Try LZF compression - under 20 bytes it's unable to compress even
3688 * aaaaaaaaaaaaaaaaaa so skip it */
3689 if (server.rdbcompression && len > 20) {
3690 int retval;
3691
3692 retval = rdbSaveLzfStringObject(fp,s,len);
3693 if (retval == -1) return -1;
3694 if (retval > 0) return 0;
3695 /* retval == 0 means data can't be compressed, save the old way */
3696 }
3697
3698 /* Store verbatim */
3699 if (rdbSaveLen(fp,len) == -1) return -1;
3700 if (len && fwrite(s,len,1,fp) == 0) return -1;
3701 return 0;
3702 }
3703
/* Save a long long value as a string: the compact integer encoding is used
 * when rdbEncodeInteger() supports the value, otherwise the decimal
 * representation is saved as a length prefixed string.
 * Return 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char out[32];
    int n = rdbEncodeInteger(value,out);

    if (n <= 0) {
        /* Not encodable: fall back to [len][decimal digits]. */
        n = ll2string((char*)out,32,value);
        redisAssert(n < 32);
        if (rdbSaveLen(fp,n) == -1) return -1;
    }
    if (fwrite(out,n,1,fp) == 0) return -1;
    return 0;
}
3719
3720 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3721 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3722 /* Avoid to decode the object, then encode it again, if the
3723 * object is alrady integer encoded. */
3724 if (obj->encoding == REDIS_ENCODING_INT) {
3725 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3726 } else {
3727 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3728 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3729 }
3730 }
3731
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Return 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3774
3775 /* Save a Redis object. */
3776 static int rdbSaveObject(FILE *fp, robj *o) {
3777 if (o->type == REDIS_STRING) {
3778 /* Save a string value */
3779 if (rdbSaveStringObject(fp,o) == -1) return -1;
3780 } else if (o->type == REDIS_LIST) {
3781 /* Save a list value */
3782 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3783 unsigned char *p;
3784 unsigned char *vstr;
3785 unsigned int vlen;
3786 long long vlong;
3787
3788 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3789 p = ziplistIndex(o->ptr,0);
3790 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3791 if (vstr) {
3792 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3793 return -1;
3794 } else {
3795 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3796 return -1;
3797 }
3798 p = ziplistNext(o->ptr,p);
3799 }
3800 } else if (o->encoding == REDIS_ENCODING_LIST) {
3801 list *list = o->ptr;
3802 listIter li;
3803 listNode *ln;
3804
3805 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3806 listRewind(list,&li);
3807 while((ln = listNext(&li))) {
3808 robj *eleobj = listNodeValue(ln);
3809 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3810 }
3811 } else {
3812 redisPanic("Unknown list encoding");
3813 }
3814 } else if (o->type == REDIS_SET) {
3815 /* Save a set value */
3816 if (o->encoding == REDIS_ENCODING_HT) {
3817 dict *set = o->ptr;
3818 dictIterator *di = dictGetIterator(set);
3819 dictEntry *de;
3820
3821 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3822 while((de = dictNext(di)) != NULL) {
3823 robj *eleobj = dictGetEntryKey(de);
3824 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3825 }
3826 dictReleaseIterator(di);
3827 } else if (o->encoding == REDIS_ENCODING_INTSET) {
3828 intset *is = o->ptr;
3829 long long llval;
3830 int i = 0;
3831
3832 if (rdbSaveLen(fp,intsetLen(is)) == -1) return -1;
3833 while(intsetGet(is,i++,&llval)) {
3834 if (rdbSaveLongLongAsStringObject(fp,llval) == -1) return -1;
3835 }
3836 } else {
3837 redisPanic("Unknown set encoding");
3838 }
3839 } else if (o->type == REDIS_ZSET) {
3840 /* Save a set value */
3841 zset *zs = o->ptr;
3842 dictIterator *di = dictGetIterator(zs->dict);
3843 dictEntry *de;
3844
3845 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3846 while((de = dictNext(di)) != NULL) {
3847 robj *eleobj = dictGetEntryKey(de);
3848 double *score = dictGetEntryVal(de);
3849
3850 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3851 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3852 }
3853 dictReleaseIterator(di);
3854 } else if (o->type == REDIS_HASH) {
3855 /* Save a hash value */
3856 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3857 unsigned char *p = zipmapRewind(o->ptr);
3858 unsigned int count = zipmapLen(o->ptr);
3859 unsigned char *key, *val;
3860 unsigned int klen, vlen;
3861
3862 if (rdbSaveLen(fp,count) == -1) return -1;
3863 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3864 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3865 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3866 }
3867 } else {
3868 dictIterator *di = dictGetIterator(o->ptr);
3869 dictEntry *de;
3870
3871 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3872 while((de = dictNext(di)) != NULL) {
3873 robj *key = dictGetEntryKey(de);
3874 robj *val = dictGetEntryVal(de);
3875
3876 if (rdbSaveStringObject(fp,key) == -1) return -1;
3877 if (rdbSaveStringObject(fp,val) == -1) return -1;
3878 }
3879 dictReleaseIterator(di);
3880 }
3881 } else {
3882 redisPanic("Unknown object type");
3883 }
3884 return 0;
3885 }
3886
3887 /* Return the length the object will have on disk if saved with
3888 * the rdbSaveObject() function. Currently we use a trick to get
3889 * this length with very little changes to the code. In the future
3890 * we could switch to a faster solution. */
3891 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3892 if (fp == NULL) fp = server.devnull;
3893 rewind(fp);
3894 assert(rdbSaveObject(fp,o) != 1);
3895 return ftello(fp);
3896 }
3897
3898 /* Return the number of pages required to save this object in the swap file */
3899 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3900 off_t bytes = rdbSavedObjectLen(o,fp);
3901
3902 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3903 }
3904
3905 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3906 static int rdbSave(char *filename) {
3907 dictIterator *di = NULL;
3908 dictEntry *de;
3909 FILE *fp;
3910 char tmpfile[256];
3911 int j;
3912 time_t now = time(NULL);
3913
3914 /* Wait for I/O therads to terminate, just in case this is a
3915 * foreground-saving, to avoid seeking the swap file descriptor at the
3916 * same time. */
3917 if (server.vm_enabled)
3918 waitEmptyIOJobsQueue();
3919
3920 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3921 fp = fopen(tmpfile,"w");
3922 if (!fp) {
3923 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3924 return REDIS_ERR;
3925 }
3926 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3927 for (j = 0; j < server.dbnum; j++) {
3928 redisDb *db = server.db+j;
3929 dict *d = db->dict;
3930 if (dictSize(d) == 0) continue;
3931 di = dictGetIterator(d);
3932 if (!di) {
3933 fclose(fp);
3934 return REDIS_ERR;
3935 }
3936
3937 /* Write the SELECT DB opcode */
3938 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3939 if (rdbSaveLen(fp,j) == -1) goto werr;
3940
3941 /* Iterate this DB writing every entry */
3942 while((de = dictNext(di)) != NULL) {
3943 sds keystr = dictGetEntryKey(de);
3944 robj key, *o = dictGetEntryVal(de);
3945 time_t expiretime;
3946
3947 initStaticStringObject(key,keystr);
3948 expiretime = getExpire(db,&key);
3949
3950 /* Save the expire time */
3951 if (expiretime != -1) {
3952 /* If this key is already expired skip it */
3953 if (expiretime < now) continue;
3954 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3955 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3956 }
3957 /* Save the key and associated value. This requires special
3958 * handling if the value is swapped out. */
3959 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3960 o->storage == REDIS_VM_SWAPPING) {
3961 /* Save type, key, value */
3962 if (rdbSaveType(fp,o->type) == -1) goto werr;
3963 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3964 if (rdbSaveObject(fp,o) == -1) goto werr;
3965 } else {
3966 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3967 robj *po;
3968 /* Get a preview of the object in memory */
3969 po = vmPreviewObject(o);
3970 /* Save type, key, value */
3971 if (rdbSaveType(fp,po->type) == -1) goto werr;
3972 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3973 if (rdbSaveObject(fp,po) == -1) goto werr;
3974 /* Remove the loaded object from memory */
3975 decrRefCount(po);
3976 }
3977 }
3978 dictReleaseIterator(di);
3979 }
3980 /* EOF opcode */
3981 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3982
3983 /* Make sure data will not remain on the OS's output buffers */
3984 fflush(fp);
3985 fsync(fileno(fp));
3986 fclose(fp);
3987
3988 /* Use RENAME to make sure the DB file is changed atomically only
3989 * if the generate DB file is ok. */
3990 if (rename(tmpfile,filename) == -1) {
3991 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3992 unlink(tmpfile);
3993 return REDIS_ERR;
3994 }
3995 redisLog(REDIS_NOTICE,"DB saved on disk");
3996 server.dirty = 0;
3997 server.lastsave = time(NULL);
3998 return REDIS_OK;
3999
4000 werr:
4001 fclose(fp);
4002 unlink(tmpfile);
4003 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
4004 if (di) dictReleaseIterator(di);
4005 return REDIS_ERR;
4006 }
4007
4008 static int rdbSaveBackground(char *filename) {
4009 pid_t childpid;
4010
4011 if (server.bgsavechildpid != -1) return REDIS_ERR;
4012 if (server.vm_enabled) waitEmptyIOJobsQueue();
4013 if ((childpid = fork()) == 0) {
4014 /* Child */
4015 if (server.vm_enabled) vmReopenSwapFile();
4016 close(server.fd);
4017 if (rdbSave(filename) == REDIS_OK) {
4018 _exit(0);
4019 } else {
4020 _exit(1);
4021 }
4022 } else {
4023 /* Parent */
4024 if (childpid == -1) {
4025 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
4026 strerror(errno));
4027 return REDIS_ERR;
4028 }
4029 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
4030 server.bgsavechildpid = childpid;
4031 updateDictResizePolicy();
4032 return REDIS_OK;
4033 }
4034 return REDIS_OK; /* unreached */
4035 }
4036
/* Remove the file "temp-<childpid>.rdb", that is the name rdbSave() uses
 * for its temporary file when running in the process 'childpid'. Errors
 * are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4043
/* Read a single byte from 'fp' and return it as a non negative integer.
 * Return -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
4049
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the byte
 * order of the host. Return -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    if (fread(&stored,sizeof(stored),1,fp) != 1) return -1;
    return (time_t) stored;
}
4055
4056 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4057 * of this file for a description of how this are stored on disk.
4058 *
4059 * isencoded is set to 1 if the readed length is not actually a length but
4060 * an "encoding type", check the above comments for more info */
4061 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4062 unsigned char buf[2];
4063 uint32_t len;
4064 int type;
4065
4066 if (isencoded) *isencoded = 0;
4067 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4068 type = (buf[0]&0xC0)>>6;
4069 if (type == REDIS_RDB_6BITLEN) {
4070 /* Read a 6 bit len */
4071 return buf[0]&0x3F;
4072 } else if (type == REDIS_RDB_ENCVAL) {
4073 /* Read a 6 bit len encoding type */
4074 if (isencoded) *isencoded = 1;
4075 return buf[0]&0x3F;
4076 } else if (type == REDIS_RDB_14BITLEN) {
4077 /* Read a 14 bit len */
4078 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4079 return ((buf[0]&0x3F)<<8)|buf[1];
4080 } else {
4081 /* Read a 32 bit len */
4082 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4083 return ntohl(len);
4084 }
4085 }
4086
4087 /* Load an integer-encoded object from file 'fp', with the specified
4088 * encoding type 'enctype'. If encode is true the function may return
4089 * an integer-encoded object as reply, otherwise the returned object
4090 * will always be encoded as a raw string. */
4091 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4092 unsigned char enc[4];
4093 long long val;
4094
4095 if (enctype == REDIS_RDB_ENC_INT8) {
4096 if (fread(enc,1,1,fp) == 0) return NULL;
4097 val = (signed char)enc[0];
4098 } else if (enctype == REDIS_RDB_ENC_INT16) {
4099 uint16_t v;
4100 if (fread(enc,2,1,fp) == 0) return NULL;
4101 v = enc[0]|(enc[1]<<8);
4102 val = (int16_t)v;
4103 } else if (enctype == REDIS_RDB_ENC_INT32) {
4104 uint32_t v;
4105 if (fread(enc,4,1,fp) == 0) return NULL;
4106 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4107 val = (int32_t)v;
4108 } else {
4109 val = 0; /* anti-warning */
4110 redisPanic("Unknown RDB integer encoding type");
4111 }
4112 if (encode)
4113 return createStringObjectFromLongLong(val);
4114 else
4115 return createObject(REDIS_STRING,sdsfromlonglong(val));
4116 }
4117
4118 static robj *rdbLoadLzfStringObject(FILE*fp) {
4119 unsigned int len, clen;
4120 unsigned char *c = NULL;
4121 sds val = NULL;
4122
4123 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4124 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4125 if ((c = zmalloc(clen)) == NULL) goto err;
4126 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4127 if (fread(c,clen,1,fp) == 0) goto err;
4128 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4129 zfree(c);
4130 return createObject(REDIS_STRING,val);
4131 err:
4132 zfree(c);
4133 sdsfree(val);
4134 return NULL;
4135 }
4136
4137 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4138 int isencoded;
4139 uint32_t len;
4140 sds val;
4141
4142 len = rdbLoadLen(fp,&isencoded);
4143 if (isencoded) {
4144 switch(len) {
4145 case REDIS_RDB_ENC_INT8:
4146 case REDIS_RDB_ENC_INT16:
4147 case REDIS_RDB_ENC_INT32:
4148 return rdbLoadIntegerObject(fp,len,encode);
4149 case REDIS_RDB_ENC_LZF:
4150 return rdbLoadLzfStringObject(fp);
4151 default:
4152 redisPanic("Unknown RDB encoding type");
4153 }
4154 }
4155
4156 if (len == REDIS_RDB_LENERR) return NULL;
4157 val = sdsnewlen(NULL,len);
4158 if (len && fread(val,len,1,fp) == 0) {
4159 sdsfree(val);
4160 return NULL;
4161 }
4162 return createObject(REDIS_STRING,val);
4163 }
4164
4165 static robj *rdbLoadStringObject(FILE *fp) {
4166 return rdbGenericLoadStringObject(fp,0);
4167 }
4168
4169 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4170 return rdbGenericLoadStringObject(fp,1);
4171 }
4172
4173 /* For information about double serialization check rdbSaveDoubleValue() */
4174 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4175 char buf[128];
4176 unsigned char len;
4177
4178 if (fread(&len,1,1,fp) == 0) return -1;
4179 switch(len) {
4180 case 255: *val = R_NegInf; return 0;
4181 case 254: *val = R_PosInf; return 0;
4182 case 253: *val = R_Nan; return 0;
4183 default:
4184 if (fread(buf,len,1,fp) == 0) return -1;
4185 buf[len] = '\0';
4186 sscanf(buf, "%lg", val);
4187 return 0;
4188 }
4189 }
4190
4191 /* Load a Redis object of the specified type from the specified file.
4192 * On success a newly allocated object is returned, otherwise NULL. */
4193 static robj *rdbLoadObject(int type, FILE *fp) {
4194 robj *o, *ele, *dec;
4195 size_t len;
4196
4197 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4198 if (type == REDIS_STRING) {
4199 /* Read string value */
4200 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4201 o = tryObjectEncoding(o);
4202 } else if (type == REDIS_LIST) {
4203 /* Read list value */
4204 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4205
4206 /* Use a real list when there are too many entries */
4207 if (len > server.list_max_ziplist_entries) {
4208 o = createListObject();
4209 } else {
4210 o = createZiplistObject();
4211 }
4212
4213 /* Load every single element of the list */
4214 while(len--) {
4215 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4216
4217 /* If we are using a ziplist and the value is too big, convert
4218 * the object to a real list. */
4219 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4220 ele->encoding == REDIS_ENCODING_RAW &&
4221 sdslen(ele->ptr) > server.list_max_ziplist_value)
4222 listTypeConvert(o,REDIS_ENCODING_LIST);
4223
4224 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4225 dec = getDecodedObject(ele);
4226 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4227 decrRefCount(dec);
4228 decrRefCount(ele);
4229 } else {
4230 ele = tryObjectEncoding(ele);
4231 listAddNodeTail(o->ptr,ele);
4232 }
4233 }
4234 } else if (type == REDIS_SET) {
4235 /* Read list/set value */
4236 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4237
4238 /* Use a regular set when there are too many entries. */
4239 if (len > server.set_max_intset_entries) {
4240 o = createSetObject();
4241 /* It's faster to expand the dict to the right size asap in order
4242 * to avoid rehashing */
4243 if (len > DICT_HT_INITIAL_SIZE)
4244 dictExpand(o->ptr,len);
4245 } else {
4246 o = createIntsetObject();
4247 }
4248
4249 /* Load every single element of the list/set */
4250 while(len--) {
4251 long long llval;
4252 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4253 ele = tryObjectEncoding(ele);
4254
4255 if (o->encoding == REDIS_ENCODING_INTSET) {
4256 /* Fetch integer value from element */
4257 if (getLongLongFromObject(ele,&llval) == REDIS_OK) {
4258 o->ptr = intsetAdd(o->ptr,llval,NULL);
4259 } else {
4260 setTypeConvert(o,REDIS_ENCODING_HT);
4261 }
4262 }
4263
4264 /* This will also be called when the set was just converted
4265 * to regular hashtable encoded set */
4266 if (o->encoding == REDIS_ENCODING_HT) {
4267 dictAdd((dict*)o->ptr,ele,NULL);
4268 }
4269 }
4270 } else if (type == REDIS_ZSET) {
4271 /* Read list/set value */
4272 size_t zsetlen;
4273 zset *zs;
4274
4275 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4276 o = createZsetObject();
4277 zs = o->ptr;
4278 /* Load every single element of the list/set */
4279 while(zsetlen--) {
4280 robj *ele;
4281 double *score = zmalloc(sizeof(double));
4282
4283 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4284 ele = tryObjectEncoding(ele);
4285 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4286 dictAdd(zs->dict,ele,score);
4287 zslInsert(zs->zsl,*score,ele);
4288 incrRefCount(ele); /* added to skiplist */
4289 }
4290 } else if (type == REDIS_HASH) {
4291 size_t hashlen;
4292
4293 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4294 o = createHashObject();
4295 /* Too many entries? Use an hash table. */
4296 if (hashlen > server.hash_max_zipmap_entries)
4297 convertToRealHash(o);
4298 /* Load every key/value, then set it into the zipmap or hash
4299 * table, as needed. */
4300 while(hashlen--) {
4301 robj *key, *val;
4302
4303 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4304 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4305 /* If we are using a zipmap and there are too big values
4306 * the object is converted to real hash table encoding. */
4307 if (o->encoding != REDIS_ENCODING_HT &&
4308 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4309 sdslen(val->ptr) > server.hash_max_zipmap_value))
4310 {
4311 convertToRealHash(o);
4312 }
4313
4314 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4315 unsigned char *zm = o->ptr;
4316
4317 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4318 val->ptr,sdslen(val->ptr),NULL);
4319 o->ptr = zm;
4320 decrRefCount(key);
4321 decrRefCount(val);
4322 } else {
4323 key = tryObjectEncoding(key);
4324 val = tryObjectEncoding(val);
4325 dictAdd((dict*)o->ptr,key,val);
4326 }
4327 }
4328 } else {
4329 redisPanic("Unknown object type");
4330 }
4331 return o;
4332 }
4333
4334 static int rdbLoad(char *filename) {
4335 FILE *fp;
4336 uint32_t dbid;
4337 int type, retval, rdbver;
4338 int swap_all_values = 0;
4339 redisDb *db = server.db+0;
4340 char buf[1024];
4341 time_t expiretime, now = time(NULL);
4342
4343 fp = fopen(filename,"r");
4344 if (!fp) return REDIS_ERR;
4345 if (fread(buf,9,1,fp) == 0) goto eoferr;
4346 buf[9] = '\0';
4347 if (memcmp(buf,"REDIS",5) != 0) {
4348 fclose(fp);
4349 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4350 return REDIS_ERR;
4351 }
4352 rdbver = atoi(buf+5);
4353 if (rdbver != 1) {
4354 fclose(fp);
4355 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4356 return REDIS_ERR;
4357 }
4358 while(1) {
4359 robj *key, *val;
4360 int force_swapout;
4361
4362 expiretime = -1;
4363 /* Read type. */
4364 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4365 if (type == REDIS_EXPIRETIME) {
4366 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4367 /* We read the time so we need to read the object type again */
4368 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4369 }
4370 if (type == REDIS_EOF) break;
4371 /* Handle SELECT DB opcode as a special case */
4372 if (type == REDIS_SELECTDB) {
4373 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4374 goto eoferr;
4375 if (dbid >= (unsigned)server.dbnum) {
4376 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4377 exit(1);
4378 }
4379 db = server.db+dbid;
4380 continue;
4381 }
4382 /* Read key */
4383 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4384 /* Read value */
4385 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4386 /* Check if the key already expired */
4387 if (expiretime != -1 && expiretime < now) {
4388 decrRefCount(key);
4389 decrRefCount(val);
4390 continue;
4391 }
4392 /* Add the new object in the hash table */
4393 retval = dbAdd(db,key,val);
4394 if (retval == REDIS_ERR) {
4395 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4396 exit(1);
4397 }
4398 /* Set the expire time if needed */
4399 if (expiretime != -1) setExpire(db,key,expiretime);
4400
4401 /* Handle swapping while loading big datasets when VM is on */
4402
4403 /* If we detecter we are hopeless about fitting something in memory
4404 * we just swap every new key on disk. Directly...
4405 * Note that's important to check for this condition before resorting
4406 * to random sampling, otherwise we may try to swap already
4407 * swapped keys. */
4408 if (swap_all_values) {
4409 dictEntry *de = dictFind(db->dict,key->ptr);
4410
4411 /* de may be NULL since the key already expired */
4412 if (de) {
4413 vmpointer *vp;
4414 val = dictGetEntryVal(de);
4415
4416 if (val->refcount == 1 &&
4417 (vp = vmSwapObjectBlocking(val)) != NULL)
4418 dictGetEntryVal(de) = vp;
4419 }
4420 decrRefCount(key);
4421 continue;
4422 }
4423 decrRefCount(key);
4424
4425 /* Flush data on disk once 32 MB of additional RAM are used... */
4426 force_swapout = 0;
4427 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4428 force_swapout = 1;
4429
4430 /* If we have still some hope of having some value fitting memory
4431 * then we try random sampling. */
4432 if (!swap_all_values && server.vm_enabled && force_swapout) {
4433 while (zmalloc_used_memory() > server.vm_max_memory) {
4434 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4435 }
4436 if (zmalloc_used_memory() > server.vm_max_memory)
4437 swap_all_values = 1; /* We are already using too much mem */
4438 }
4439 }
4440 fclose(fp);
4441 return REDIS_OK;
4442
4443 eoferr: /* unexpected end of file is handled here with a fatal exit */
4444 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4445 exit(1);
4446 return REDIS_ERR; /* Just to avoid warning */
4447 }
4448
4449 /*================================== Shutdown =============================== */
4450 static int prepareForShutdown() {
4451 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4452 /* Kill the saving child if there is a background saving in progress.
4453 We want to avoid race conditions, for instance our saving child may
4454 overwrite the synchronous saving did by SHUTDOWN. */
4455 if (server.bgsavechildpid != -1) {
4456 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4457 kill(server.bgsavechildpid,SIGKILL);
4458 rdbRemoveTempFile(server.bgsavechildpid);
4459 }
4460 if (server.appendonly) {
4461 /* Append only file: fsync() the AOF and exit */
4462 aof_fsync(server.appendfd);
4463 if (server.vm_enabled) unlink(server.vm_swap_file);
4464 } else {
4465 /* Snapshotting. Perform a SYNC SAVE and exit */
4466 if (rdbSave(server.dbfilename) == REDIS_OK) {
4467 if (server.daemonize)
4468 unlink(server.pidfile);
4469 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4470 } else {
4471 /* Ooops.. error saving! The best we can do is to continue
4472 * operating. Note that if there was a background saving process,
4473 * in the next cron() Redis will be notified that the background
4474 * saving aborted, handling special stuff like slaves pending for
4475 * synchronization... */
4476 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4477 return REDIS_ERR;
4478 }
4479 }
4480 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4481 return REDIS_OK;
4482 }
4483
4484 /*================================== Commands =============================== */
4485
4486 static void authCommand(redisClient *c) {
4487 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4488 c->authenticated = 1;
4489 addReply(c,shared.ok);
4490 } else {
4491 c->authenticated = 0;
4492 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4493 }
4494 }
4495
4496 static void pingCommand(redisClient *c) {
4497 addReply(c,shared.pong);
4498 }
4499
4500 static void echoCommand(redisClient *c) {
4501 addReplyBulk(c,c->argv[1]);
4502 }
4503
4504 /*=================================== Strings =============================== */
4505
4506 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4507 int retval;
4508 long seconds = 0; /* initialized to avoid an harmness warning */
4509
4510 if (expire) {
4511 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4512 return;
4513 if (seconds <= 0) {
4514 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4515 return;
4516 }
4517 }
4518
4519 touchWatchedKey(c->db,key);
4520 if (nx) deleteIfVolatile(c->db,key);
4521 retval = dbAdd(c->db,key,val);
4522 if (retval == REDIS_ERR) {
4523 if (!nx) {
4524 dbReplace(c->db,key,val);
4525 incrRefCount(val);
4526 } else {
4527 addReply(c,shared.czero);
4528 return;
4529 }
4530 } else {
4531 incrRefCount(val);
4532 }
4533 server.dirty++;
4534 removeExpire(c->db,key);
4535 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4536 addReply(c, nx ? shared.cone : shared.ok);
4537 }
4538
4539 static void setCommand(redisClient *c) {
4540 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4541 }
4542
4543 static void setnxCommand(redisClient *c) {
4544 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4545 }
4546
4547 static void setexCommand(redisClient *c) {
4548 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4549 }
4550
4551 static int getGenericCommand(redisClient *c) {
4552 robj *o;
4553
4554 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4555 return REDIS_OK;
4556
4557 if (o->type != REDIS_STRING) {
4558 addReply(c,shared.wrongtypeerr);
4559 return REDIS_ERR;
4560 } else {
4561 addReplyBulk(c,o);
4562 return REDIS_OK;
4563 }
4564 }
4565
4566 static void getCommand(redisClient *c) {
4567 getGenericCommand(c);
4568 }
4569
4570 static void getsetCommand(redisClient *c) {
4571 if (getGenericCommand(c) == REDIS_ERR) return;
4572 dbReplace(c->db,c->argv[1],c->argv[2]);
4573 incrRefCount(c->argv[2]);
4574 server.dirty++;
4575 removeExpire(c->db,c->argv[1]);
4576 }
4577
4578 static void mgetCommand(redisClient *c) {
4579 int j;
4580
4581 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4582 for (j = 1; j < c->argc; j++) {
4583 robj *o = lookupKeyRead(c->db,c->argv[j]);
4584 if (o == NULL) {
4585 addReply(c,shared.nullbulk);
4586 } else {
4587 if (o->type != REDIS_STRING) {
4588 addReply(c,shared.nullbulk);
4589 } else {
4590 addReplyBulk(c,o);
4591 }
4592 }
4593 }
4594 }
4595
4596 static void msetGenericCommand(redisClient *c, int nx) {
4597 int j, busykeys = 0;
4598
4599 if ((c->argc % 2) == 0) {
4600 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4601 return;
4602 }
4603 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4604 * set nothing at all if at least one already key exists. */
4605 if (nx) {
4606 for (j = 1; j < c->argc; j += 2) {
4607 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4608 busykeys++;
4609 }
4610 }
4611 }
4612 if (busykeys) {
4613 addReply(c, shared.czero);
4614 return;
4615 }
4616
4617 for (j = 1; j < c->argc; j += 2) {
4618 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4619 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4620 incrRefCount(c->argv[j+1]);
4621 removeExpire(c->db,c->argv[j]);
4622 }
4623 server.dirty += (c->argc-1)/2;
4624 addReply(c, nx ? shared.cone : shared.ok);
4625 }
4626
4627 static void msetCommand(redisClient *c) {
4628 msetGenericCommand(c,0);
4629 }
4630
4631 static void msetnxCommand(redisClient *c) {
4632 msetGenericCommand(c,1);
4633 }
4634
4635 static void incrDecrCommand(redisClient *c, long long incr) {
4636 long long value;
4637 robj *o;
4638
4639 o = lookupKeyWrite(c->db,c->argv[1]);
4640 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4641 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4642
4643 value += incr;
4644 o = createStringObjectFromLongLong(value);
4645 dbReplace(c->db,c->argv[1],o);
4646 server.dirty++;
4647 addReply(c,shared.colon);
4648 addReply(c,o);
4649 addReply(c,shared.crlf);
4650 }
4651
4652 static void incrCommand(redisClient *c) {
4653 incrDecrCommand(c,1);
4654 }
4655
4656 static void decrCommand(redisClient *c) {
4657 incrDecrCommand(c,-1);
4658 }
4659
4660 static void incrbyCommand(redisClient *c) {
4661 long long incr;
4662
4663 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4664 incrDecrCommand(c,incr);
4665 }
4666
4667 static void decrbyCommand(redisClient *c) {
4668 long long incr;
4669
4670 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4671 incrDecrCommand(c,-incr);
4672 }
4673
4674 static void appendCommand(redisClient *c) {
4675 int retval;
4676 size_t totlen;
4677 robj *o;
4678
4679 o = lookupKeyWrite(c->db,c->argv[1]);
4680 if (o == NULL) {
4681 /* Create the key */
4682 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4683 incrRefCount(c->argv[2]);
4684 totlen = stringObjectLen(c->argv[2]);
4685 } else {
4686 if (o->type != REDIS_STRING) {
4687 addReply(c,shared.wrongtypeerr);
4688 return;
4689 }
4690 /* If the object is specially encoded or shared we have to make
4691 * a copy */
4692 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4693 robj *decoded = getDecodedObject(o);
4694
4695 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4696 decrRefCount(decoded);
4697 dbReplace(c->db,c->argv[1],o);
4698 }
4699 /* APPEND! */
4700 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4701 o->ptr = sdscatlen(o->ptr,
4702 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4703 } else {
4704 o->ptr = sdscatprintf(o->ptr, "%ld",
4705 (unsigned long) c->argv[2]->ptr);
4706 }
4707 totlen = sdslen(o->ptr);
4708 }
4709 server.dirty++;
4710 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4711 }
4712
4713 static void substrCommand(redisClient *c) {
4714 robj *o;
4715 long start = atoi(c->argv[2]->ptr);
4716 long end = atoi(c->argv[3]->ptr);
4717 size_t rangelen, strlen;
4718 sds range;
4719
4720 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4721 checkType(c,o,REDIS_STRING)) return;
4722
4723 o = getDecodedObject(o);
4724 strlen = sdslen(o->ptr);
4725
4726 /* convert negative indexes */
4727 if (start < 0) start = strlen+start;
4728 if (end < 0) end = strlen+end;
4729 if (start < 0) start = 0;
4730 if (end < 0) end = 0;
4731
4732 /* indexes sanity checks */
4733 if (start > end || (size_t)start >= strlen) {
4734 /* Out of range start or start > end result in null reply */
4735 addReply(c,shared.nullbulk);
4736 decrRefCount(o);
4737 return;
4738 }
4739 if ((size_t)end >= strlen) end = strlen-1;
4740 rangelen = (end-start)+1;
4741
4742 /* Return the result */
4743 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4744 range = sdsnewlen((char*)o->ptr+start,rangelen);
4745 addReplySds(c,range);
4746 addReply(c,shared.crlf);
4747 decrRefCount(o);
4748 }
4749
4750 /* ========================= Type agnostic commands ========================= */
4751
4752 static void delCommand(redisClient *c) {
4753 int deleted = 0, j;
4754
4755 for (j = 1; j < c->argc; j++) {
4756 if (dbDelete(c->db,c->argv[j])) {
4757 touchWatchedKey(c->db,c->argv[j]);
4758 server.dirty++;
4759 deleted++;
4760 }
4761 }
4762 addReplyLongLong(c,deleted);
4763 }
4764
4765 static void existsCommand(redisClient *c) {
4766 expireIfNeeded(c->db,c->argv[1]);
4767 if (dbExists(c->db,c->argv[1])) {
4768 addReply(c, shared.cone);
4769 } else {
4770 addReply(c, shared.czero);
4771 }
4772 }
4773
4774 static void selectCommand(redisClient *c) {
4775 int id = atoi(c->argv[1]->ptr);
4776
4777 if (selectDb(c,id) == REDIS_ERR) {
4778 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4779 } else {
4780 addReply(c,shared.ok);
4781 }
4782 }
4783
4784 static void randomkeyCommand(redisClient *c) {
4785 robj *key;
4786
4787 if ((key = dbRandomKey(c->db)) == NULL) {
4788 addReply(c,shared.nullbulk);
4789 return;
4790 }
4791
4792 addReplyBulk(c,key);
4793 decrRefCount(key);
4794 }
4795
4796 static void keysCommand(redisClient *c) {
4797 dictIterator *di;
4798 dictEntry *de;
4799 sds pattern = c->argv[1]->ptr;
4800 int plen = sdslen(pattern);
4801 unsigned long numkeys = 0;
4802 robj *lenobj = createObject(REDIS_STRING,NULL);
4803
4804 di = dictGetIterator(c->db->dict);
4805 addReply(c,lenobj);
4806 decrRefCount(lenobj);
4807 while((de = dictNext(di)) != NULL) {
4808 sds key = dictGetEntryKey(de);
4809 robj *keyobj;
4810
4811 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4812 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4813 keyobj = createStringObject(key,sdslen(key));
4814 if (expireIfNeeded(c->db,keyobj) == 0) {
4815 addReplyBulk(c,keyobj);
4816 numkeys++;
4817 }
4818 decrRefCount(keyobj);
4819 }
4820 }
4821 dictReleaseIterator(di);
4822 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4823 }
4824
4825 static void dbsizeCommand(redisClient *c) {
4826 addReplySds(c,
4827 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4828 }
4829
4830 static void lastsaveCommand(redisClient *c) {
4831 addReplySds(c,
4832 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4833 }
4834
4835 static void typeCommand(redisClient *c) {
4836 robj *o;
4837 char *type;
4838
4839 o = lookupKeyRead(c->db,c->argv[1]);
4840 if (o == NULL) {
4841 type = "+none";
4842 } else {
4843 switch(o->type) {
4844 case REDIS_STRING: type = "+string"; break;
4845 case REDIS_LIST: type = "+list"; break;
4846 case REDIS_SET: type = "+set"; break;
4847 case REDIS_ZSET: type = "+zset"; break;
4848 case REDIS_HASH: type = "+hash"; break;
4849 default: type = "+unknown"; break;
4850 }
4851 }
4852 addReplySds(c,sdsnew(type));
4853 addReply(c,shared.crlf);
4854 }
4855
4856 static void saveCommand(redisClient *c) {
4857 if (server.bgsavechildpid != -1) {
4858 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4859 return;
4860 }
4861 if (rdbSave(server.dbfilename) == REDIS_OK) {
4862 addReply(c,shared.ok);
4863 } else {
4864 addReply(c,shared.err);
4865 }
4866 }
4867
4868 static void bgsaveCommand(redisClient *c) {
4869 if (server.bgsavechildpid != -1) {
4870 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4871 return;
4872 }
4873 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4874 char *status = "+Background saving started\r\n";
4875 addReplySds(c,sdsnew(status));
4876 } else {
4877 addReply(c,shared.err);
4878 }
4879 }
4880
4881 static void shutdownCommand(redisClient *c) {
4882 if (prepareForShutdown() == REDIS_OK)
4883 exit(0);
4884 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4885 }
4886
4887 static void renameGenericCommand(redisClient *c, int nx) {
4888 robj *o;
4889
4890 /* To use the same key as src and dst is probably an error */
4891 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4892 addReply(c,shared.sameobjecterr);
4893 return;
4894 }
4895
4896 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4897 return;
4898
4899 incrRefCount(o);
4900 deleteIfVolatile(c->db,c->argv[2]);
4901 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4902 if (nx) {
4903 decrRefCount(o);
4904 addReply(c,shared.czero);
4905 return;
4906 }
4907 dbReplace(c->db,c->argv[2],o);
4908 }
4909 dbDelete(c->db,c->argv[1]);
4910 touchWatchedKey(c->db,c->argv[2]);
4911 server.dirty++;
4912 addReply(c,nx ? shared.cone : shared.ok);
4913 }
4914
4915 static void renameCommand(redisClient *c) {
4916 renameGenericCommand(c,0);
4917 }
4918
4919 static void renamenxCommand(redisClient *c) {
4920 renameGenericCommand(c,1);
4921 }
4922
4923 static void moveCommand(redisClient *c) {
4924 robj *o;
4925 redisDb *src, *dst;
4926 int srcid;
4927
4928 /* Obtain source and target DB pointers */
4929 src = c->db;
4930 srcid = c->db->id;
4931 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4932 addReply(c,shared.outofrangeerr);
4933 return;
4934 }
4935 dst = c->db;
4936 selectDb(c,srcid); /* Back to the source DB */
4937
4938 /* If the user is moving using as target the same
4939 * DB as the source DB it is probably an error. */
4940 if (src == dst) {
4941 addReply(c,shared.sameobjecterr);
4942 return;
4943 }
4944
4945 /* Check if the element exists and get a reference */
4946 o = lookupKeyWrite(c->db,c->argv[1]);
4947 if (!o) {
4948 addReply(c,shared.czero);
4949 return;
4950 }
4951
4952 /* Try to add the element to the target DB */
4953 deleteIfVolatile(dst,c->argv[1]);
4954 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4955 addReply(c,shared.czero);
4956 return;
4957 }
4958 incrRefCount(o);
4959
4960 /* OK! key moved, free the entry in the source DB */
4961 dbDelete(src,c->argv[1]);
4962 server.dirty++;
4963 addReply(c,shared.cone);
4964 }
4965
4966 /* =================================== Lists ================================ */
4967
4968
4969 /* Check the argument length to see if it requires us to convert the ziplist
4970 * to a real list. Only check raw-encoded objects because integer encoded
4971 * objects are never too long. */
4972 static void listTypeTryConversion(robj *subject, robj *value) {
4973 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4974 if (value->encoding == REDIS_ENCODING_RAW &&
4975 sdslen(value->ptr) > server.list_max_ziplist_value)
4976 listTypeConvert(subject,REDIS_ENCODING_LIST);
4977 }
4978
4979 static void listTypePush(robj *subject, robj *value, int where) {
4980 /* Check if we need to convert the ziplist */
4981 listTypeTryConversion(subject,value);
4982 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4983 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
4984 listTypeConvert(subject,REDIS_ENCODING_LIST);
4985
4986 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4987 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4988 value = getDecodedObject(value);
4989 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4990 decrRefCount(value);
4991 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4992 if (where == REDIS_HEAD) {
4993 listAddNodeHead(subject->ptr,value);
4994 } else {
4995 listAddNodeTail(subject->ptr,value);
4996 }
4997 incrRefCount(value);
4998 } else {
4999 redisPanic("Unknown list encoding");
5000 }
5001 }
5002
5003 static robj *listTypePop(robj *subject, int where) {
5004 robj *value = NULL;
5005 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
5006 unsigned char *p;
5007 unsigned char *vstr;
5008 unsigned int vlen;
5009 long long vlong;
5010 int pos = (where == REDIS_HEAD) ? 0 : -1;
5011 p = ziplistIndex(subject->ptr,pos);
5012 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5013 if (vstr) {
5014 value = createStringObject((char*)vstr,vlen);
5015 } else {
5016 value = createStringObjectFromLongLong(vlong);
5017 }
5018 /* We only need to delete an element when it exists */
5019 subject->ptr = ziplistDelete(subject->ptr,&p);
5020 }
5021 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5022 list *list = subject->ptr;
5023 listNode *ln;
5024 if (where == REDIS_HEAD) {
5025 ln = listFirst(list);
5026 } else {
5027 ln = listLast(list);
5028 }
5029 if (ln != NULL) {
5030 value = listNodeValue(ln);
5031 incrRefCount(value);
5032 listDelNode(list,ln);
5033 }
5034 } else {
5035 redisPanic("Unknown list encoding");
5036 }
5037 return value;
5038 }
5039
5040 static unsigned long listTypeLength(robj *subject) {
5041 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
5042 return ziplistLen(subject->ptr);
5043 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5044 return listLength((list*)subject->ptr);
5045 } else {
5046 redisPanic("Unknown list encoding");
5047 }
5048 }
5049
5050 /* Structure to hold set iteration abstraction. */
5051 typedef struct {
5052 robj *subject;
5053 unsigned char encoding;
5054 unsigned char direction; /* Iteration direction */
5055 unsigned char *zi;
5056 listNode *ln;
5057 } listTypeIterator;
5058
5059 /* Structure for an entry while iterating over a list. */
5060 typedef struct {
5061 listTypeIterator *li;
5062 unsigned char *zi; /* Entry in ziplist */
5063 listNode *ln; /* Entry in linked list */
5064 } listTypeEntry;
5065
5066 /* Initialize an iterator at the specified index. */
5067 static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5068 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
5069 li->subject = subject;
5070 li->encoding = subject->encoding;
5071 li->direction = direction;
5072 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5073 li->zi = ziplistIndex(subject->ptr,index);
5074 } else if (li->encoding == REDIS_ENCODING_LIST) {
5075 li->ln = listIndex(subject->ptr,index);
5076 } else {
5077 redisPanic("Unknown list encoding");
5078 }
5079 return li;
5080 }
5081
5082 /* Clean up the iterator. */
5083 static void listTypeReleaseIterator(listTypeIterator *li) {
5084 zfree(li);
5085 }
5086
5087 /* Stores pointer to current the entry in the provided entry structure
5088 * and advances the position of the iterator. Returns 1 when the current
5089 * entry is in fact an entry, 0 otherwise. */
5090 static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
5091 /* Protect from converting when iterating */
5092 redisAssert(li->subject->encoding == li->encoding);
5093
5094 entry->li = li;
5095 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5096 entry->zi = li->zi;
5097 if (entry->zi != NULL) {
5098 if (li->direction == REDIS_TAIL)
5099 li->zi = ziplistNext(li->subject->ptr,li->zi);
5100 else
5101 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5102 return 1;
5103 }
5104 } else if (li->encoding == REDIS_ENCODING_LIST) {
5105 entry->ln = li->ln;
5106 if (entry->ln != NULL) {
5107 if (li->direction == REDIS_TAIL)
5108 li->ln = li->ln->next;
5109 else
5110 li->ln = li->ln->prev;
5111 return 1;
5112 }
5113 } else {
5114 redisPanic("Unknown list encoding");
5115 }
5116 return 0;
5117 }
5118
5119 /* Return entry or NULL at the current position of the iterator. */
5120 static robj *listTypeGet(listTypeEntry *entry) {
5121 listTypeIterator *li = entry->li;
5122 robj *value = NULL;
5123 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5124 unsigned char *vstr;
5125 unsigned int vlen;
5126 long long vlong;
5127 redisAssert(entry->zi != NULL);
5128 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5129 if (vstr) {
5130 value = createStringObject((char*)vstr,vlen);
5131 } else {
5132 value = createStringObjectFromLongLong(vlong);
5133 }
5134 }
5135 } else if (li->encoding == REDIS_ENCODING_LIST) {
5136 redisAssert(entry->ln != NULL);
5137 value = listNodeValue(entry->ln);
5138 incrRefCount(value);
5139 } else {
5140 redisPanic("Unknown list encoding");
5141 }
5142 return value;
5143 }
5144
5145 /* Compare the given object with the entry at the current position. */
5146 static int listTypeEqual(listTypeEntry *entry, robj *o) {
5147 listTypeIterator *li = entry->li;
5148 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5149 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5150 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5151 } else if (li->encoding == REDIS_ENCODING_LIST) {
5152 return equalStringObjects(o,listNodeValue(entry->ln));
5153 } else {
5154 redisPanic("Unknown list encoding");
5155 }
5156 }
5157
5158 /* Delete the element pointed to. */
5159 static void listTypeDelete(listTypeEntry *entry) {
5160 listTypeIterator *li = entry->li;
5161 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5162 unsigned char *p = entry->zi;
5163 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5164
5165 /* Update position of the iterator depending on the direction */
5166 if (li->direction == REDIS_TAIL)
5167 li->zi = p;
5168 else
5169 li->zi = ziplistPrev(li->subject->ptr,p);
5170 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5171 listNode *next;
5172 if (li->direction == REDIS_TAIL)
5173 next = entry->ln->next;
5174 else
5175 next = entry->ln->prev;
5176 listDelNode(li->subject->ptr,entry->ln);
5177 li->ln = next;
5178 } else {
5179 redisPanic("Unknown list encoding");
5180 }
5181 }
5182
5183 static void listTypeConvert(robj *subject, int enc) {
5184 listTypeIterator *li;
5185 listTypeEntry entry;
5186 redisAssert(subject->type == REDIS_LIST);
5187
5188 if (enc == REDIS_ENCODING_LIST) {
5189 list *l = listCreate();
5190 listSetFreeMethod(l,decrRefCount);
5191
5192 /* listTypeGet returns a robj with incremented refcount */
5193 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5194 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5195 listTypeReleaseIterator(li);
5196
5197 subject->encoding = REDIS_ENCODING_LIST;
5198 zfree(subject->ptr);
5199 subject->ptr = l;
5200 } else {
5201 redisPanic("Unsupported list conversion");
5202 }
5203 }
5204
5205 static void pushGenericCommand(redisClient *c, int where) {
5206 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5207 if (lobj == NULL) {
5208 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5209 addReply(c,shared.cone);
5210 return;
5211 }
5212 lobj = createZiplistObject();
5213 dbAdd(c->db,c->argv[1],lobj);
5214 } else {
5215 if (lobj->type != REDIS_LIST) {
5216 addReply(c,shared.wrongtypeerr);
5217 return;
5218 }
5219 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5220 addReply(c,shared.cone);
5221 return;
5222 }
5223 }
5224 listTypePush(lobj,c->argv[2],where);
5225 addReplyLongLong(c,listTypeLength(lobj));
5226 server.dirty++;
5227 }
5228
5229 static void lpushCommand(redisClient *c) {
5230 pushGenericCommand(c,REDIS_HEAD);
5231 }
5232
5233 static void rpushCommand(redisClient *c) {
5234 pushGenericCommand(c,REDIS_TAIL);
5235 }
5236
5237 static void llenCommand(redisClient *c) {
5238 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5239 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5240 addReplyUlong(c,listTypeLength(o));
5241 }
5242
5243 static void lindexCommand(redisClient *c) {
5244 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5245 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5246 int index = atoi(c->argv[2]->ptr);
5247 robj *value = NULL;
5248
5249 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5250 unsigned char *p;
5251 unsigned char *vstr;
5252 unsigned int vlen;
5253 long long vlong;
5254 p = ziplistIndex(o->ptr,index);
5255 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5256 if (vstr) {
5257 value = createStringObject((char*)vstr,vlen);
5258 } else {
5259 value = createStringObjectFromLongLong(vlong);
5260 }
5261 addReplyBulk(c,value);
5262 decrRefCount(value);
5263 } else {
5264 addReply(c,shared.nullbulk);
5265 }
5266 } else if (o->encoding == REDIS_ENCODING_LIST) {
5267 listNode *ln = listIndex(o->ptr,index);
5268 if (ln != NULL) {
5269 value = listNodeValue(ln);
5270 addReplyBulk(c,value);
5271 } else {
5272 addReply(c,shared.nullbulk);
5273 }
5274 } else {
5275 redisPanic("Unknown list encoding");
5276 }
5277 }
5278
5279 static void lsetCommand(redisClient *c) {
5280 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5281 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5282 int index = atoi(c->argv[2]->ptr);
5283 robj *value = c->argv[3];
5284
5285 listTypeTryConversion(o,value);
5286 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5287 unsigned char *p, *zl = o->ptr;
5288 p = ziplistIndex(zl,index);
5289 if (p == NULL) {
5290 addReply(c,shared.outofrangeerr);
5291 } else {
5292 o->ptr = ziplistDelete(o->ptr,&p);
5293 value = getDecodedObject(value);
5294 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5295 decrRefCount(value);
5296 addReply(c,shared.ok);
5297 server.dirty++;
5298 }
5299 } else if (o->encoding == REDIS_ENCODING_LIST) {
5300 listNode *ln = listIndex(o->ptr,index);
5301 if (ln == NULL) {
5302 addReply(c,shared.outofrangeerr);
5303 } else {
5304 decrRefCount((robj*)listNodeValue(ln));
5305 listNodeValue(ln) = value;
5306 incrRefCount(value);
5307 addReply(c,shared.ok);
5308 server.dirty++;
5309 }
5310 } else {
5311 redisPanic("Unknown list encoding");
5312 }
5313 }
5314
5315 static void popGenericCommand(redisClient *c, int where) {
5316 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5317 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5318
5319 robj *value = listTypePop(o,where);
5320 if (value == NULL) {
5321 addReply(c,shared.nullbulk);
5322 } else {
5323 addReplyBulk(c,value);
5324 decrRefCount(value);
5325 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5326 server.dirty++;
5327 }
5328 }
5329
5330 static void lpopCommand(redisClient *c) {
5331 popGenericCommand(c,REDIS_HEAD);
5332 }
5333
5334 static void rpopCommand(redisClient *c) {
5335 popGenericCommand(c,REDIS_TAIL);
5336 }
5337
5338 static void lrangeCommand(redisClient *c) {
5339 robj *o, *value;
5340 int start = atoi(c->argv[2]->ptr);
5341 int end = atoi(c->argv[3]->ptr);
5342 int llen;
5343 int rangelen, j;
5344 listTypeEntry entry;
5345
5346 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5347 || checkType(c,o,REDIS_LIST)) return;
5348 llen = listTypeLength(o);
5349
5350 /* convert negative indexes */
5351 if (start < 0) start = llen+start;
5352 if (end < 0) end = llen+end;
5353 if (start < 0) start = 0;
5354 if (end < 0) end = 0;
5355
5356 /* indexes sanity checks */
5357 if (start > end || start >= llen) {
5358 /* Out of range start or start > end result in empty list */
5359 addReply(c,shared.emptymultibulk);
5360 return;
5361 }
5362 if (end >= llen) end = llen-1;
5363 rangelen = (end-start)+1;
5364
5365 /* Return the result in form of a multi-bulk reply */
5366 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5367 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
5368 for (j = 0; j < rangelen; j++) {
5369 redisAssert(listTypeNext(li,&entry));
5370 value = listTypeGet(&entry);
5371 addReplyBulk(c,value);
5372 decrRefCount(value);
5373 }
5374 listTypeReleaseIterator(li);
5375 }
5376
5377 static void ltrimCommand(redisClient *c) {
5378 robj *o;
5379 int start = atoi(c->argv[2]->ptr);
5380 int end = atoi(c->argv[3]->ptr);
5381 int llen;
5382 int j, ltrim, rtrim;
5383 list *list;
5384 listNode *ln;
5385
5386 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5387 checkType(c,o,REDIS_LIST)) return;
5388 llen = listTypeLength(o);
5389
5390 /* convert negative indexes */
5391 if (start < 0) start = llen+start;
5392 if (end < 0) end = llen+end;
5393 if (start < 0) start = 0;
5394 if (end < 0) end = 0;
5395
5396 /* indexes sanity checks */
5397 if (start > end || start >= llen) {
5398 /* Out of range start or start > end result in empty list */
5399 ltrim = llen;
5400 rtrim = 0;
5401 } else {
5402 if (end >= llen) end = llen-1;
5403 ltrim = start;
5404 rtrim = llen-end-1;
5405 }
5406
5407 /* Remove list elements to perform the trim */
5408 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5409 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5410 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5411 } else if (o->encoding == REDIS_ENCODING_LIST) {
5412 list = o->ptr;
5413 for (j = 0; j < ltrim; j++) {
5414 ln = listFirst(list);
5415 listDelNode(list,ln);
5416 }
5417 for (j = 0; j < rtrim; j++) {
5418 ln = listLast(list);
5419 listDelNode(list,ln);
5420 }
5421 } else {
5422 redisPanic("Unknown list encoding");
5423 }
5424 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5425 server.dirty++;
5426 addReply(c,shared.ok);
5427 }
5428
5429 static void lremCommand(redisClient *c) {
5430 robj *subject, *obj = c->argv[3];
5431 int toremove = atoi(c->argv[2]->ptr);
5432 int removed = 0;
5433 listTypeEntry entry;
5434
5435 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5436 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5437
5438 /* Make sure obj is raw when we're dealing with a ziplist */
5439 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5440 obj = getDecodedObject(obj);
5441
5442 listTypeIterator *li;
5443 if (toremove < 0) {
5444 toremove = -toremove;
5445 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
5446 } else {
5447 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5448 }
5449
5450 while (listTypeNext(li,&entry)) {
5451 if (listTypeEqual(&entry,obj)) {
5452 listTypeDelete(&entry);
5453 server.dirty++;
5454 removed++;
5455 if (toremove && removed == toremove) break;
5456 }
5457 }
5458 listTypeReleaseIterator(li);
5459
5460 /* Clean up raw encoded object */
5461 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5462 decrRefCount(obj);
5463
5464 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5465 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5466 }
5467
5468 /* This is the semantic of this command:
5469 * RPOPLPUSH srclist dstlist:
5470 * IF LLEN(srclist) > 0
5471 * element = RPOP srclist
5472 * LPUSH dstlist element
5473 * RETURN element
5474 * ELSE
5475 * RETURN nil
5476 * END
5477 * END
5478 *
5479 * The idea is to be able to get an element from a list in a reliable way
5480 * since the element is not just returned but pushed against another list
5481 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5482 */
5483 static void rpoplpushcommand(redisClient *c) {
5484 robj *sobj, *value;
5485 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5486 checkType(c,sobj,REDIS_LIST)) return;
5487
5488 if (listTypeLength(sobj) == 0) {
5489 addReply(c,shared.nullbulk);
5490 } else {
5491 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5492 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5493 value = listTypePop(sobj,REDIS_TAIL);
5494
5495 /* Add the element to the target list (unless it's directly
5496 * passed to some BLPOP-ing client */
5497 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5498 /* Create the list if the key does not exist */
5499 if (!dobj) {
5500 dobj = createZiplistObject();
5501 dbAdd(c->db,c->argv[2],dobj);
5502 }
5503 listTypePush(dobj,value,REDIS_HEAD);
5504 }
5505
5506 /* Send the element to the client as reply as well */
5507 addReplyBulk(c,value);
5508
5509 /* listTypePop returns an object with its refcount incremented */
5510 decrRefCount(value);
5511
5512 /* Delete the source list when it is empty */
5513 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5514 server.dirty++;
5515 }
5516 }
5517
5518 /* ==================================== Sets ================================ */
5519
5520 /* Factory method to return a set that *can* hold "value". When the object has
5521 * an integer-encodable value, an intset will be returned. Otherwise a regular
5522 * hash table. */
5523 static robj *setTypeCreate(robj *value) {
5524 if (getLongLongFromObject(value,NULL) == REDIS_OK)
5525 return createIntsetObject();
5526 return createSetObject();
5527 }
5528
5529 static int setTypeAdd(robj *subject, robj *value) {
5530 long long llval;
5531 if (subject->encoding == REDIS_ENCODING_HT) {
5532 if (dictAdd(subject->ptr,value,NULL) == DICT_OK) {
5533 incrRefCount(value);
5534 return 1;
5535 }
5536 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5537 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5538 uint8_t success = 0;
5539 subject->ptr = intsetAdd(subject->ptr,llval,&success);
5540 if (success) {
5541 /* Convert to regular set when the intset contains
5542 * too many entries. */
5543 if (intsetLen(subject->ptr) > server.set_max_intset_entries)
5544 setTypeConvert(subject,REDIS_ENCODING_HT);
5545 return 1;
5546 }
5547 } else {
5548 /* Failed to get integer from object, convert to regular set. */
5549 setTypeConvert(subject,REDIS_ENCODING_HT);
5550
5551 /* The set *was* an intset and this value is not integer
5552 * encodable, so dictAdd should always work. */
5553 redisAssert(dictAdd(subject->ptr,value,NULL) == DICT_OK);
5554 incrRefCount(value);
5555 return 1;
5556 }
5557 } else {
5558 redisPanic("Unknown set encoding");
5559 }
5560 return 0;
5561 }
5562
5563 static int setTypeRemove(robj *subject, robj *value) {
5564 long long llval;
5565 if (subject->encoding == REDIS_ENCODING_HT) {
5566 if (dictDelete(subject->ptr,value) == DICT_OK) {
5567 if (htNeedsResize(subject->ptr)) dictResize(subject->ptr);
5568 return 1;
5569 }
5570 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5571 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5572 uint8_t success;
5573 subject->ptr = intsetRemove(subject->ptr,llval,&success);
5574 if (success) return 1;
5575 }
5576 } else {
5577 redisPanic("Unknown set encoding");
5578 }
5579 return 0;
5580 }
5581
5582 static int setTypeIsMember(robj *subject, robj *value) {
5583 long long llval;
5584 if (subject->encoding == REDIS_ENCODING_HT) {
5585 return dictFind((dict*)subject->ptr,value) != NULL;
5586 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5587 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5588 return intsetFind((intset*)subject->ptr,llval);
5589 }
5590 } else {
5591 redisPanic("Unknown set encoding");
5592 }
5593 return 0;
5594 }
5595
5596 /* Structure to hold set iteration abstraction. */
5597 typedef struct {
5598 robj *subject;
5599 int encoding;
5600 int ii; /* intset iterator */
5601 dictIterator *di;
5602 } setIterator;
5603
5604 static setIterator *setTypeInitIterator(robj *subject) {
5605 setIterator *si = zmalloc(sizeof(setIterator));
5606 si->subject = subject;
5607 si->encoding = subject->encoding;
5608 if (si->encoding == REDIS_ENCODING_HT) {
5609 si->di = dictGetIterator(subject->ptr);
5610 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5611 si->ii = 0;
5612 } else {
5613 redisPanic("Unknown set encoding");
5614 }
5615 return si;
5616 }
5617
5618 static void setTypeReleaseIterator(setIterator *si) {
5619 if (si->encoding == REDIS_ENCODING_HT)
5620 dictReleaseIterator(si->di);
5621 zfree(si);
5622 }
5623
5624 /* Move to the next entry in the set. Returns the object at the current
5625 * position, or NULL when the end is reached. This object will have its
5626 * refcount incremented, so the caller needs to take care of this. */
5627 static robj *setTypeNext(setIterator *si) {
5628 robj *ret = NULL;
5629 if (si->encoding == REDIS_ENCODING_HT) {
5630 dictEntry *de = dictNext(si->di);
5631 if (de != NULL) {
5632 ret = dictGetEntryKey(de);
5633 incrRefCount(ret);
5634 }
5635 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5636 long long llval;
5637 if (intsetGet(si->subject->ptr,si->ii++,&llval))
5638 ret = createStringObjectFromLongLong(llval);
5639 }
5640 return ret;
5641 }
5642
5643
5644 /* Return random element from set. The returned object will always have
5645 * an incremented refcount. */
5646 robj *setTypeRandomElement(robj *subject) {
5647 robj *ret = NULL;
5648 if (subject->encoding == REDIS_ENCODING_HT) {
5649 dictEntry *de = dictGetRandomKey(subject->ptr);
5650 ret = dictGetEntryKey(de);
5651 incrRefCount(ret);
5652 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5653 long long llval = intsetRandom(subject->ptr);
5654 ret = createStringObjectFromLongLong(llval);
5655 } else {
5656 redisPanic("Unknown set encoding");
5657 }
5658 return ret;
5659 }
5660
5661 static unsigned long setTypeSize(robj *subject) {
5662 if (subject->encoding == REDIS_ENCODING_HT) {
5663 return dictSize((dict*)subject->ptr);
5664 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5665 return intsetLen((intset*)subject->ptr);
5666 } else {
5667 redisPanic("Unknown set encoding");
5668 }
5669 }
5670
5671 static void setTypeConvert(robj *subject, int enc) {
5672 setIterator *si;
5673 robj *element;
5674 redisAssert(subject->type == REDIS_SET);
5675
5676 if (enc == REDIS_ENCODING_HT) {
5677 dict *d = dictCreate(&setDictType,NULL);
5678
5679 /* setTypeGet returns a robj with incremented refcount */
5680 si = setTypeInitIterator(subject);
5681 while ((element = setTypeNext(si)) != NULL)
5682 redisAssert(dictAdd(d,element,NULL) == DICT_OK);
5683 setTypeReleaseIterator(si);
5684
5685 subject->encoding = REDIS_ENCODING_HT;
5686 zfree(subject->ptr);
5687 subject->ptr = d;
5688 } else {
5689 redisPanic("Unsupported set conversion");
5690 }
5691 }
5692
5693 static void saddCommand(redisClient *c) {
5694 robj *set;
5695
5696 set = lookupKeyWrite(c->db,c->argv[1]);
5697 if (set == NULL) {
5698 set = setTypeCreate(c->argv[2]);
5699 dbAdd(c->db,c->argv[1],set);
5700 } else {
5701 if (set->type != REDIS_SET) {
5702 addReply(c,shared.wrongtypeerr);
5703 return;
5704 }
5705 }
5706 if (setTypeAdd(set,c->argv[2])) {
5707 server.dirty++;
5708 addReply(c,shared.cone);
5709 } else {
5710 addReply(c,shared.czero);
5711 }
5712 }
5713
5714 static void sremCommand(redisClient *c) {
5715 robj *set;
5716
5717 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5718 checkType(c,set,REDIS_SET)) return;
5719
5720 if (setTypeRemove(set,c->argv[2])) {
5721 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
5722 server.dirty++;
5723 addReply(c,shared.cone);
5724 } else {
5725 addReply(c,shared.czero);
5726 }
5727 }
5728
5729 static void smoveCommand(redisClient *c) {
5730 robj *srcset, *dstset, *ele;
5731 srcset = lookupKeyWrite(c->db,c->argv[1]);
5732 dstset = lookupKeyWrite(c->db,c->argv[2]);
5733 ele = c->argv[3];
5734
5735 /* If the source key does not exist return 0 */
5736 if (srcset == NULL) {
5737 addReply(c,shared.czero);
5738 return;
5739 }
5740
5741 /* If the source key has the wrong type, or the destination key
5742 * is set and has the wrong type, return with an error. */
5743 if (checkType(c,srcset,REDIS_SET) ||
5744 (dstset && checkType(c,dstset,REDIS_SET))) return;
5745
5746 /* If srcset and dstset are equal, SMOVE is a no-op */
5747 if (srcset == dstset) {
5748 addReply(c,shared.cone);
5749 return;
5750 }
5751
5752 /* If the element cannot be removed from the src set, return 0. */
5753 if (!setTypeRemove(srcset,ele)) {
5754 addReply(c,shared.czero);
5755 return;
5756 }
5757
5758 /* Remove the src set from the database when empty */
5759 if (setTypeSize(srcset) == 0) dbDelete(c->db,c->argv[1]);
5760 server.dirty++;
5761
5762 /* Create the destination set when it doesn't exist */
5763 if (!dstset) {
5764 dstset = setTypeCreate(ele);
5765 dbAdd(c->db,c->argv[2],dstset);
5766 }
5767
5768 /* An extra key has changed when ele was successfully added to dstset */
5769 if (setTypeAdd(dstset,ele)) server.dirty++;
5770 addReply(c,shared.cone);
5771 }
5772
5773 static void sismemberCommand(redisClient *c) {
5774 robj *set;
5775
5776 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5777 checkType(c,set,REDIS_SET)) return;
5778
5779 if (setTypeIsMember(set,c->argv[2]))
5780 addReply(c,shared.cone);
5781 else
5782 addReply(c,shared.czero);
5783 }
5784
5785 static void scardCommand(redisClient *c) {
5786 robj *o;
5787
5788 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5789 checkType(c,o,REDIS_SET)) return;
5790
5791 addReplyUlong(c,setTypeSize(o));
5792 }
5793
5794 static void spopCommand(redisClient *c) {
5795 robj *set, *ele;
5796
5797 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5798 checkType(c,set,REDIS_SET)) return;
5799
5800 ele = setTypeRandomElement(set);
5801 if (ele == NULL) {
5802 addReply(c,shared.nullbulk);
5803 } else {
5804 setTypeRemove(set,ele);
5805 addReplyBulk(c,ele);
5806 decrRefCount(ele);
5807 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
5808 server.dirty++;
5809 }
5810 }
5811
5812 static void srandmemberCommand(redisClient *c) {
5813 robj *set, *ele;
5814
5815 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5816 checkType(c,set,REDIS_SET)) return;
5817
5818 ele = setTypeRandomElement(set);
5819 if (ele == NULL) {
5820 addReply(c,shared.nullbulk);
5821 } else {
5822 addReplyBulk(c,ele);
5823 decrRefCount(ele);
5824 }
5825 }
5826
5827 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5828 return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2);
5829 }
5830
5831 static void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) {
5832 robj **sets = zmalloc(sizeof(robj*)*setnum);
5833 setIterator *si;
5834 robj *ele, *lenobj = NULL, *dstset = NULL;
5835 unsigned long j, cardinality = 0;
5836
5837 for (j = 0; j < setnum; j++) {
5838 robj *setobj = dstkey ?
5839 lookupKeyWrite(c->db,setkeys[j]) :
5840 lookupKeyRead(c->db,setkeys[j]);
5841 if (!setobj) {
5842 zfree(sets);
5843 if (dstkey) {
5844 if (dbDelete(c->db,dstkey))
5845 server.dirty++;
5846 addReply(c,shared.czero);
5847 } else {
5848 addReply(c,shared.emptymultibulk);
5849 }
5850 return;
5851 }
5852 if (checkType(c,setobj,REDIS_SET)) {
5853 zfree(sets);
5854 return;
5855 }
5856 sets[j] = setobj;
5857 }
5858 /* Sort sets from the smallest to largest, this will improve our
5859 * algorithm's performace */
5860 qsort(sets,setnum,sizeof(robj*),qsortCompareSetsByCardinality);
5861
5862 /* The first thing we should output is the total number of elements...
5863 * since this is a multi-bulk write, but at this stage we don't know
5864 * the intersection set size, so we use a trick, append an empty object
5865 * to the output list and save the pointer to later modify it with the
5866 * right length */
5867 if (!dstkey) {
5868 lenobj = createObject(REDIS_STRING,NULL);
5869 addReply(c,lenobj);
5870 decrRefCount(lenobj);
5871 } else {
5872 /* If we have a target key where to store the resulting set
5873 * create this key with an empty set inside */
5874 dstset = createIntsetObject();
5875 }
5876
5877 /* Iterate all the elements of the first (smallest) set, and test
5878 * the element against all the other sets, if at least one set does
5879 * not include the element it is discarded */
5880 si = setTypeInitIterator(sets[0]);
5881 while((ele = setTypeNext(si)) != NULL) {
5882 for (j = 1; j < setnum; j++)
5883 if (!setTypeIsMember(sets[j],ele)) break;
5884
5885 /* Only take action when all sets contain the member */
5886 if (j == setnum) {
5887 if (!dstkey) {
5888 addReplyBulk(c,ele);
5889 cardinality++;
5890 } else {
5891 setTypeAdd(dstset,ele);
5892 }
5893 }
5894 decrRefCount(ele);
5895 }
5896 setTypeReleaseIterator(si);
5897
5898 if (dstkey) {
5899 /* Store the resulting set into the target, if the intersection
5900 * is not an empty set. */
5901 dbDelete(c->db,dstkey);
5902 if (setTypeSize(dstset) > 0) {
5903 dbAdd(c->db,dstkey,dstset);
5904 addReplyLongLong(c,setTypeSize(dstset));
5905 } else {
5906 decrRefCount(dstset);
5907 addReply(c,shared.czero);
5908 }
5909 server.dirty++;
5910 } else {
5911 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5912 }
5913 zfree(sets);
5914 }
5915
5916 static void sinterCommand(redisClient *c) {
5917 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5918 }
5919
5920 static void sinterstoreCommand(redisClient *c) {
5921 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5922 }
5923
/* Set operation selectors, passed as the 'op' argument of the generic
 * set / sorted set operation commands. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5927
5928 static void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) {
5929 robj **sets = zmalloc(sizeof(robj*)*setnum);
5930 setIterator *si;
5931 robj *ele, *dstset = NULL;
5932 int j, cardinality = 0;
5933
5934 for (j = 0; j < setnum; j++) {
5935 robj *setobj = dstkey ?
5936 lookupKeyWrite(c->db,setkeys[j]) :
5937 lookupKeyRead(c->db,setkeys[j]);
5938 if (!setobj) {
5939 sets[j] = NULL;
5940 continue;
5941 }
5942 if (checkType(c,setobj,REDIS_SET)) {
5943 zfree(sets);
5944 return;
5945 }
5946 sets[j] = setobj;
5947 }
5948
5949 /* We need a temp set object to store our union. If the dstkey
5950 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5951 * this set object will be the resulting object to set into the target key*/
5952 dstset = createIntsetObject();
5953
5954 /* Iterate all the elements of all the sets, add every element a single
5955 * time to the result set */
5956 for (j = 0; j < setnum; j++) {
5957 if (op == REDIS_OP_DIFF && j == 0 && !sets[j]) break; /* result set is empty */
5958 if (!sets[j]) continue; /* non existing keys are like empty sets */
5959
5960 si = setTypeInitIterator(sets[j]);
5961 while((ele = setTypeNext(si)) != NULL) {
5962 if (op == REDIS_OP_UNION || j == 0) {
5963 if (setTypeAdd(dstset,ele)) {
5964 cardinality++;
5965 }
5966 } else if (op == REDIS_OP_DIFF) {
5967 if (setTypeRemove(dstset,ele)) {
5968 cardinality--;
5969 }
5970 }
5971 decrRefCount(ele);
5972 }
5973 setTypeReleaseIterator(si);
5974
5975 /* Exit when result set is empty. */
5976 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5977 }
5978
5979 /* Output the content of the resulting set, if not in STORE mode */
5980 if (!dstkey) {
5981 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5982 si = setTypeInitIterator(dstset);
5983 while((ele = setTypeNext(si)) != NULL) {
5984 addReplyBulk(c,ele);
5985 decrRefCount(ele);
5986 }
5987 setTypeReleaseIterator(si);
5988 decrRefCount(dstset);
5989 } else {
5990 /* If we have a target key where to store the resulting set
5991 * create this key with the result set inside */
5992 dbDelete(c->db,dstkey);
5993 if (setTypeSize(dstset) > 0) {
5994 dbAdd(c->db,dstkey,dstset);
5995 addReplyLongLong(c,setTypeSize(dstset));
5996 } else {
5997 decrRefCount(dstset);
5998 addReply(c,shared.czero);
5999 }
6000 server.dirty++;
6001 }
6002 zfree(sets);
6003 }
6004
6005 static void sunionCommand(redisClient *c) {
6006 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
6007 }
6008
6009 static void sunionstoreCommand(redisClient *c) {
6010 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
6011 }
6012
6013 static void sdiffCommand(redisClient *c) {
6014 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
6015 }
6016
6017 static void sdiffstoreCommand(redisClient *c) {
6018 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
6019 }
6020
6021 /* ==================================== ZSets =============================== */
6022
6023 /* ZSETs are ordered sets using two data structures to hold the same elements
6024 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
6025 * data structure.
6026 *
6027 * The elements are added to a hash table mapping Redis objects to scores.
6028 * At the same time the elements are added to a skip list mapping scores
6029 * to Redis objects (so objects are sorted by scores in this "view"). */
6030
6031 /* This skiplist implementation is almost a C translation of the original
6032 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
6033 * Alternative to Balanced Trees", modified in three ways:
6034 * a) this implementation allows for repeated values.
6035 * b) the comparison is not just by key (our 'score') but by satellite data.
6036 * c) there is a back pointer, so it's a doubly linked list with the back
6037 * pointers being only at "level 1". This allows traversing the list
6038 * from tail to head, useful for ZREVRANGE. */
6039
6040 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
6041 zskiplistNode *zn = zmalloc(sizeof(*zn));
6042
6043 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
6044 if (level > 1)
6045 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6046 else
6047 zn->span = NULL;
6048 zn->score = score;
6049 zn->obj = obj;
6050 return zn;
6051 }
6052
6053 static zskiplist *zslCreate(void) {
6054 int j;
6055 zskiplist *zsl;
6056
6057 zsl = zmalloc(sizeof(*zsl));
6058 zsl->level = 1;
6059 zsl->length = 0;
6060 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
6061 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6062 zsl->header->forward[j] = NULL;
6063
6064 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
6065 if (j < ZSKIPLIST_MAXLEVEL-1)
6066 zsl->header->span[j] = 0;
6067 }
6068 zsl->header->backward = NULL;
6069 zsl->tail = NULL;
6070 return zsl;
6071 }
6072
6073 static void zslFreeNode(zskiplistNode *node) {
6074 decrRefCount(node->obj);
6075 zfree(node->forward);
6076 zfree(node->span);
6077 zfree(node);
6078 }
6079
6080 static void zslFree(zskiplist *zsl) {
6081 zskiplistNode *node = zsl->header->forward[0], *next;
6082
6083 zfree(zsl->header->forward);
6084 zfree(zsl->header->span);
6085 zfree(zsl->header);
6086 while(node) {
6087 next = node->forward[0];
6088 zslFreeNode(node);
6089 node = next;
6090 }
6091 zfree(zsl);
6092 }
6093
6094 static int zslRandomLevel(void) {
6095 int level = 1;
6096 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
6097 level += 1;
6098 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6099 }
6100
6101 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
6102 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6103 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6104 int i, level;
6105
6106 x = zsl->header;
6107 for (i = zsl->level-1; i >= 0; i--) {
6108 /* store rank that is crossed to reach the insert position */
6109 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
6110
6111 while (x->forward[i] &&
6112 (x->forward[i]->score < score ||
6113 (x->forward[i]->score == score &&
6114 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
6115 rank[i] += i > 0 ? x->span[i-1] : 1;
6116 x = x->forward[i];
6117 }
6118 update[i] = x;
6119 }
6120 /* we assume the key is not already inside, since we allow duplicated
6121 * scores, and the re-insertion of score and redis object should never
6122 * happpen since the caller of zslInsert() should test in the hash table
6123 * if the element is already inside or not. */
6124 level = zslRandomLevel();
6125 if (level > zsl->level) {
6126 for (i = zsl->level; i < level; i++) {
6127 rank[i] = 0;
6128 update[i] = zsl->header;
6129 update[i]->span[i-1] = zsl->length;
6130 }
6131 zsl->level = level;
6132 }
6133 x = zslCreateNode(level,score,obj);
6134 for (i = 0; i < level; i++) {
6135 x->forward[i] = update[i]->forward[i];
6136 update[i]->forward[i] = x;
6137
6138 /* update span covered by update[i] as x is inserted here */
6139 if (i > 0) {
6140 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6141 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6142 }
6143 }
6144
6145 /* increment span for untouched levels */
6146 for (i = level; i < zsl->level; i++) {
6147 update[i]->span[i-1]++;
6148 }
6149
6150 x->backward = (update[0] == zsl->header) ? NULL : update[0];
6151 if (x->forward[0])
6152 x->forward[0]->backward = x;
6153 else
6154 zsl->tail = x;
6155 zsl->length++;
6156 }
6157
6158 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6159 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6160 int i;
6161 for (i = 0; i < zsl->level; i++) {
6162 if (update[i]->forward[i] == x) {
6163 if (i > 0) {
6164 update[i]->span[i-1] += x->span[i-1] - 1;
6165 }
6166 update[i]->forward[i] = x->forward[i];
6167 } else {
6168 /* invariant: i > 0, because update[0]->forward[0]
6169 * is always equal to x */
6170 update[i]->span[i-1] -= 1;
6171 }
6172 }
6173 if (x->forward[0]) {
6174 x->forward[0]->backward = x->backward;
6175 } else {
6176 zsl->tail = x->backward;
6177 }
6178 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6179 zsl->level--;
6180 zsl->length--;
6181 }
6182
6183 /* Delete an element with matching score/object from the skiplist. */
6184 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
6185 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6186 int i;
6187
6188 x = zsl->header;
6189 for (i = zsl->level-1; i >= 0; i--) {
6190 while (x->forward[i] &&
6191 (x->forward[i]->score < score ||
6192 (x->forward[i]->score == score &&
6193 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6194 x = x->forward[i];
6195 update[i] = x;
6196 }
6197 /* We may have multiple elements with the same score, what we need
6198 * is to find the element with both the right score and object. */
6199 x = x->forward[0];
6200 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
6201 zslDeleteNode(zsl, x, update);
6202 zslFreeNode(x);
6203 return 1;
6204 } else {
6205 return 0; /* not found */
6206 }
6207 return 0; /* not found */
6208 }
6209
6210 /* Delete all the elements with score between min and max from the skiplist.
6211 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6212 * Note that this function takes the reference to the hash table view of the
6213 * sorted set, in order to remove the elements from the hash table too. */
6214 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
6215 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6216 unsigned long removed = 0;
6217 int i;
6218
6219 x = zsl->header;
6220 for (i = zsl->level-1; i >= 0; i--) {
6221 while (x->forward[i] && x->forward[i]->score < min)
6222 x = x->forward[i];
6223 update[i] = x;
6224 }
6225 /* We may have multiple elements with the same score, what we need
6226 * is to find the element with both the right score and object. */
6227 x = x->forward[0];
6228 while (x && x->score <= max) {
6229 zskiplistNode *next = x->forward[0];
6230 zslDeleteNode(zsl, x, update);
6231 dictDelete(dict,x->obj);
6232 zslFreeNode(x);
6233 removed++;
6234 x = next;
6235 }
6236 return removed; /* not found */
6237 }
6238
6239 /* Delete all the elements with rank between start and end from the skiplist.
6240 * Start and end are inclusive. Note that start and end need to be 1-based */
6241 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6242 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6243 unsigned long traversed = 0, removed = 0;
6244 int i;
6245
6246 x = zsl->header;
6247 for (i = zsl->level-1; i >= 0; i--) {
6248 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6249 traversed += i > 0 ? x->span[i-1] : 1;
6250 x = x->forward[i];
6251 }
6252 update[i] = x;
6253 }
6254
6255 traversed++;
6256 x = x->forward[0];
6257 while (x && traversed <= end) {
6258 zskiplistNode *next = x->forward[0];
6259 zslDeleteNode(zsl, x, update);
6260 dictDelete(dict,x->obj);
6261 zslFreeNode(x);
6262 removed++;
6263 traversed++;
6264 x = next;
6265 }
6266 return removed;
6267 }
6268
6269 /* Find the first node having a score equal or greater than the specified one.
6270 * Returns NULL if there is no match. */
6271 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6272 zskiplistNode *x;
6273 int i;
6274
6275 x = zsl->header;
6276 for (i = zsl->level-1; i >= 0; i--) {
6277 while (x->forward[i] && x->forward[i]->score < score)
6278 x = x->forward[i];
6279 }
6280 /* We may have multiple elements with the same score, what we need
6281 * is to find the element with both the right score and object. */
6282 return x->forward[0];
6283 }
6284
6285 /* Find the rank for an element by both score and key.
6286 * Returns 0 when the element cannot be found, rank otherwise.
6287 * Note that the rank is 1-based due to the span of zsl->header to the
6288 * first element. */
6289 static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
6290 zskiplistNode *x;
6291 unsigned long rank = 0;
6292 int i;
6293
6294 x = zsl->header;
6295 for (i = zsl->level-1; i >= 0; i--) {
6296 while (x->forward[i] &&
6297 (x->forward[i]->score < score ||
6298 (x->forward[i]->score == score &&
6299 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6300 rank += i > 0 ? x->span[i-1] : 1;
6301 x = x->forward[i];
6302 }
6303
6304 /* x might be equal to zsl->header, so test if obj is non-NULL */
6305 if (x->obj && equalStringObjects(x->obj,o)) {
6306 return rank;
6307 }
6308 }
6309 return 0;
6310 }
6311
6312 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6313 zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
6314 zskiplistNode *x;
6315 unsigned long traversed = 0;
6316 int i;
6317
6318 x = zsl->header;
6319 for (i = zsl->level-1; i >= 0; i--) {
6320 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6321 {
6322 traversed += i > 0 ? x->span[i-1] : 1;
6323 x = x->forward[i];
6324 }
6325 if (traversed == rank) {
6326 return x;
6327 }
6328 }
6329 return NULL;
6330 }
6331
6332 /* The actual Z-commands implementations */
6333
6334 /* This generic command implements both ZADD and ZINCRBY.
6335 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6336 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6337 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6338 robj *zsetobj;
6339 zset *zs;
6340 double *score;
6341
6342 if (isnan(scoreval)) {
6343 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6344 return;
6345 }
6346
6347 zsetobj = lookupKeyWrite(c->db,key);
6348 if (zsetobj == NULL) {
6349 zsetobj = createZsetObject();
6350 dbAdd(c->db,key,zsetobj);
6351 } else {
6352 if (zsetobj->type != REDIS_ZSET) {
6353 addReply(c,shared.wrongtypeerr);
6354 return;
6355 }
6356 }
6357 zs = zsetobj->ptr;
6358
6359 /* Ok now since we implement both ZADD and ZINCRBY here the code
6360 * needs to handle the two different conditions. It's all about setting
6361 * '*score', that is, the new score to set, to the right value. */
6362 score = zmalloc(sizeof(double));
6363 if (doincrement) {
6364 dictEntry *de;
6365
6366 /* Read the old score. If the element was not present starts from 0 */
6367 de = dictFind(zs->dict,ele);
6368 if (de) {
6369 double *oldscore = dictGetEntryVal(de);
6370 *score = *oldscore + scoreval;
6371 } else {
6372 *score = scoreval;
6373 }
6374 if (isnan(*score)) {
6375 addReplySds(c,
6376 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6377 zfree(score);
6378 /* Note that we don't need to check if the zset may be empty and
6379 * should be removed here, as we can only obtain Nan as score if
6380 * there was already an element in the sorted set. */
6381 return;
6382 }
6383 } else {
6384 *score = scoreval;
6385 }
6386
6387 /* What follows is a simple remove and re-insert operation that is common
6388 * to both ZADD and ZINCRBY... */
6389 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6390 /* case 1: New element */
6391 incrRefCount(ele); /* added to hash */
6392 zslInsert(zs->zsl,*score,ele);
6393 incrRefCount(ele); /* added to skiplist */
6394 server.dirty++;
6395 if (doincrement)
6396 addReplyDouble(c,*score);
6397 else
6398 addReply(c,shared.cone);
6399 } else {
6400 dictEntry *de;
6401 double *oldscore;
6402
6403 /* case 2: Score update operation */
6404 de = dictFind(zs->dict,ele);
6405 redisAssert(de != NULL);
6406 oldscore = dictGetEntryVal(de);
6407 if (*score != *oldscore) {
6408 int deleted;
6409
6410 /* Remove and insert the element in the skip list with new score */
6411 deleted = zslDelete(zs->zsl,*oldscore,ele);
6412 redisAssert(deleted != 0);
6413 zslInsert(zs->zsl,*score,ele);
6414 incrRefCount(ele);
6415 /* Update the score in the hash table */
6416 dictReplace(zs->dict,ele,score);
6417 server.dirty++;
6418 } else {
6419 zfree(score);
6420 }
6421 if (doincrement)
6422 addReplyDouble(c,*score);
6423 else
6424 addReply(c,shared.czero);
6425 }
6426 }
6427
6428 static void zaddCommand(redisClient *c) {
6429 double scoreval;
6430
6431 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6432 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6433 }
6434
6435 static void zincrbyCommand(redisClient *c) {
6436 double scoreval;
6437
6438 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6439 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6440 }
6441
6442 static void zremCommand(redisClient *c) {
6443 robj *zsetobj;
6444 zset *zs;
6445 dictEntry *de;
6446 double *oldscore;
6447 int deleted;
6448
6449 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6450 checkType(c,zsetobj,REDIS_ZSET)) return;
6451
6452 zs = zsetobj->ptr;
6453 de = dictFind(zs->dict,c->argv[2]);
6454 if (de == NULL) {
6455 addReply(c,shared.czero);
6456 return;
6457 }
6458 /* Delete from the skiplist */
6459 oldscore = dictGetEntryVal(de);
6460 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6461 redisAssert(deleted != 0);
6462
6463 /* Delete from the hash table */
6464 dictDelete(zs->dict,c->argv[2]);
6465 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6466 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6467 server.dirty++;
6468 addReply(c,shared.cone);
6469 }
6470
6471 static void zremrangebyscoreCommand(redisClient *c) {
6472 double min;
6473 double max;
6474 long deleted;
6475 robj *zsetobj;
6476 zset *zs;
6477
6478 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6479 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6480
6481 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6482 checkType(c,zsetobj,REDIS_ZSET)) return;
6483
6484 zs = zsetobj->ptr;
6485 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6486 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6487 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6488 server.dirty += deleted;
6489 addReplyLongLong(c,deleted);
6490 }
6491
6492 static void zremrangebyrankCommand(redisClient *c) {
6493 long start;
6494 long end;
6495 int llen;
6496 long deleted;
6497 robj *zsetobj;
6498 zset *zs;
6499
6500 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6501 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6502
6503 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6504 checkType(c,zsetobj,REDIS_ZSET)) return;
6505 zs = zsetobj->ptr;
6506 llen = zs->zsl->length;
6507
6508 /* convert negative indexes */
6509 if (start < 0) start = llen+start;
6510 if (end < 0) end = llen+end;
6511 if (start < 0) start = 0;
6512 if (end < 0) end = 0;
6513
6514 /* indexes sanity checks */
6515 if (start > end || start >= llen) {
6516 addReply(c,shared.czero);
6517 return;
6518 }
6519 if (end >= llen) end = llen-1;
6520
6521 /* increment start and end because zsl*Rank functions
6522 * use 1-based rank */
6523 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6524 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6525 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6526 server.dirty += deleted;
6527 addReplyLongLong(c, deleted);
6528 }
6529
/* One input key of a ZUNIONSTORE/ZINTERSTORE operation. */
typedef struct {
    dict *dict;     /* member dictionary of the source key, NULL when the key does not exist */
    double weight;  /* factor every score read from this source is multiplied by */
} zsetopsrc;
6534
6535 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6536 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6537 unsigned long size1, size2;
6538 size1 = d1->dict ? dictSize(d1->dict) : 0;
6539 size2 = d2->dict ? dictSize(d2->dict) : 0;
6540 return size1 - size2;
6541 }
6542
/* How scores of the same member found in several sources are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in dict entry _e. An entry whose value is NULL counts as 1.0. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
6547
6548 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6549 if (aggregate == REDIS_AGGR_SUM) {
6550 *target = *target + val;
6551 } else if (aggregate == REDIS_AGGR_MIN) {
6552 *target = val < *target ? val : *target;
6553 } else if (aggregate == REDIS_AGGR_MAX) {
6554 *target = val > *target ? val : *target;
6555 } else {
6556 /* safety net */
6557 redisPanic("Unknown ZUNION/INTER aggregate type");
6558 }
6559 }
6560
6561 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6562 int i, j, setnum;
6563 int aggregate = REDIS_AGGR_SUM;
6564 zsetopsrc *src;
6565 robj *dstobj;
6566 zset *dstzset;
6567 dictIterator *di;
6568 dictEntry *de;
6569
6570 /* expect setnum input keys to be given */
6571 setnum = atoi(c->argv[2]->ptr);
6572 if (setnum < 1) {
6573 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6574 return;
6575 }
6576
6577 /* test if the expected number of keys would overflow */
6578 if (3+setnum > c->argc) {
6579 addReply(c,shared.syntaxerr);
6580 return;
6581 }
6582
6583 /* read keys to be used for input */
6584 src = zmalloc(sizeof(zsetopsrc) * setnum);
6585 for (i = 0, j = 3; i < setnum; i++, j++) {
6586 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6587 if (!obj) {
6588 src[i].dict = NULL;
6589 } else {
6590 if (obj->type == REDIS_ZSET) {
6591 src[i].dict = ((zset*)obj->ptr)->dict;
6592 } else if (obj->type == REDIS_SET) {
6593 src[i].dict = (obj->ptr);
6594 } else {
6595 zfree(src);
6596 addReply(c,shared.wrongtypeerr);
6597 return;
6598 }
6599 }
6600
6601 /* default all weights to 1 */
6602 src[i].weight = 1.0;
6603 }
6604
6605 /* parse optional extra arguments */
6606 if (j < c->argc) {
6607 int remaining = c->argc - j;
6608
6609 while (remaining) {
6610 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6611 j++; remaining--;
6612 for (i = 0; i < setnum; i++, j++, remaining--) {
6613 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6614 return;
6615 }
6616 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6617 j++; remaining--;
6618 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6619 aggregate = REDIS_AGGR_SUM;
6620 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6621 aggregate = REDIS_AGGR_MIN;
6622 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6623 aggregate = REDIS_AGGR_MAX;
6624 } else {
6625 zfree(src);
6626 addReply(c,shared.syntaxerr);
6627 return;
6628 }
6629 j++; remaining--;
6630 } else {
6631 zfree(src);
6632 addReply(c,shared.syntaxerr);
6633 return;
6634 }
6635 }
6636 }
6637
6638 /* sort sets from the smallest to largest, this will improve our
6639 * algorithm's performance */
6640 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6641
6642 dstobj = createZsetObject();
6643 dstzset = dstobj->ptr;
6644
6645 if (op == REDIS_OP_INTER) {
6646 /* skip going over all entries if the smallest zset is NULL or empty */
6647 if (src[0].dict && dictSize(src[0].dict) > 0) {
6648 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6649 * from small to large, all src[i > 0].dict are non-empty too */
6650 di = dictGetIterator(src[0].dict);
6651 while((de = dictNext(di)) != NULL) {
6652 double *score = zmalloc(sizeof(double)), value;
6653 *score = src[0].weight * zunionInterDictValue(de);
6654
6655 for (j = 1; j < setnum; j++) {
6656 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6657 if (other) {
6658 value = src[j].weight * zunionInterDictValue(other);
6659 zunionInterAggregate(score, value, aggregate);
6660 } else {
6661 break;
6662 }
6663 }
6664
6665 /* skip entry when not present in every source dict */
6666 if (j != setnum) {
6667 zfree(score);
6668 } else {
6669 robj *o = dictGetEntryKey(de);
6670 dictAdd(dstzset->dict,o,score);
6671 incrRefCount(o); /* added to dictionary */
6672 zslInsert(dstzset->zsl,*score,o);
6673 incrRefCount(o); /* added to skiplist */
6674 }
6675 }
6676 dictReleaseIterator(di);
6677 }
6678 } else if (op == REDIS_OP_UNION) {
6679 for (i = 0; i < setnum; i++) {
6680 if (!src[i].dict) continue;
6681
6682 di = dictGetIterator(src[i].dict);
6683 while((de = dictNext(di)) != NULL) {
6684 /* skip key when already processed */
6685 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6686
6687 double *score = zmalloc(sizeof(double)), value;
6688 *score = src[i].weight * zunionInterDictValue(de);
6689
6690 /* because the zsets are sorted by size, its only possible
6691 * for sets at larger indices to hold this entry */
6692 for (j = (i+1); j < setnum; j++) {
6693 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6694 if (other) {
6695 value = src[j].weight * zunionInterDictValue(other);
6696 zunionInterAggregate(score, value, aggregate);
6697 }
6698 }
6699
6700 robj *o = dictGetEntryKey(de);
6701 dictAdd(dstzset->dict,o,score);
6702 incrRefCount(o); /* added to dictionary */
6703 zslInsert(dstzset->zsl,*score,o);
6704 incrRefCount(o); /* added to skiplist */
6705 }
6706 dictReleaseIterator(di);
6707 }
6708 } else {
6709 /* unknown operator */
6710 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6711 }
6712
6713 dbDelete(c->db,dstkey);
6714 if (dstzset->zsl->length) {
6715 dbAdd(c->db,dstkey,dstobj);
6716 addReplyLongLong(c, dstzset->zsl->length);
6717 server.dirty++;
6718 } else {
6719 decrRefCount(dstobj);
6720 addReply(c, shared.czero);
6721 }
6722 zfree(src);
6723 }
6724
6725 static void zunionstoreCommand(redisClient *c) {
6726 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6727 }
6728
6729 static void zinterstoreCommand(redisClient *c) {
6730 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6731 }
6732
6733 static void zrangeGenericCommand(redisClient *c, int reverse) {
6734 robj *o;
6735 long start;
6736 long end;
6737 int withscores = 0;
6738 int llen;
6739 int rangelen, j;
6740 zset *zsetobj;
6741 zskiplist *zsl;
6742 zskiplistNode *ln;
6743 robj *ele;
6744
6745 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6746 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6747
6748 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6749 withscores = 1;
6750 } else if (c->argc >= 5) {
6751 addReply(c,shared.syntaxerr);
6752 return;
6753 }
6754
6755 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6756 || checkType(c,o,REDIS_ZSET)) return;
6757 zsetobj = o->ptr;
6758 zsl = zsetobj->zsl;
6759 llen = zsl->length;
6760
6761 /* convert negative indexes */
6762 if (start < 0) start = llen+start;
6763 if (end < 0) end = llen+end;
6764 if (start < 0) start = 0;
6765 if (end < 0) end = 0;
6766
6767 /* indexes sanity checks */
6768 if (start > end || start >= llen) {
6769 /* Out of range start or start > end result in empty list */
6770 addReply(c,shared.emptymultibulk);
6771 return;
6772 }
6773 if (end >= llen) end = llen-1;
6774 rangelen = (end-start)+1;
6775
6776 /* check if starting point is trivial, before searching
6777 * the element in log(N) time */
6778 if (reverse) {
6779 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
6780 } else {
6781 ln = start == 0 ?
6782 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
6783 }
6784
6785 /* Return the result in form of a multi-bulk reply */
6786 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6787 withscores ? (rangelen*2) : rangelen));
6788 for (j = 0; j < rangelen; j++) {
6789 ele = ln->obj;
6790 addReplyBulk(c,ele);
6791 if (withscores)
6792 addReplyDouble(c,ln->score);
6793 ln = reverse ? ln->backward : ln->forward[0];
6794 }
6795 }
6796
6797 static void zrangeCommand(redisClient *c) {
6798 zrangeGenericCommand(c,0);
6799 }
6800
6801 static void zrevrangeCommand(redisClient *c) {
6802 zrangeGenericCommand(c,1);
6803 }
6804
6805 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6806 * If justcount is non-zero, just the count is returned. */
6807 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6808 robj *o;
6809 double min, max;
6810 int minex = 0, maxex = 0; /* are min or max exclusive? */
6811 int offset = 0, limit = -1;
6812 int withscores = 0;
6813 int badsyntax = 0;
6814
6815 /* Parse the min-max interval. If one of the values is prefixed
6816 * by the "(" character, it's considered "open". For instance
6817 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6818 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6819 if (((char*)c->argv[2]->ptr)[0] == '(') {
6820 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6821 minex = 1;
6822 } else {
6823 min = strtod(c->argv[2]->ptr,NULL);
6824 }
6825 if (((char*)c->argv[3]->ptr)[0] == '(') {
6826 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6827 maxex = 1;
6828 } else {
6829 max = strtod(c->argv[3]->ptr,NULL);
6830 }
6831
6832 /* Parse "WITHSCORES": note that if the command was called with
6833 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6834 * enter the following paths to parse WITHSCORES and LIMIT. */
6835 if (c->argc == 5 || c->argc == 8) {
6836 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6837 withscores = 1;
6838 else
6839 badsyntax = 1;
6840 }
6841 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6842 badsyntax = 1;
6843 if (badsyntax) {
6844 addReplySds(c,
6845 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6846 return;
6847 }
6848
6849 /* Parse "LIMIT" */
6850 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6851 addReply(c,shared.syntaxerr);
6852 return;
6853 } else if (c->argc == (7 + withscores)) {
6854 offset = atoi(c->argv[5]->ptr);
6855 limit = atoi(c->argv[6]->ptr);
6856 if (offset < 0) offset = 0;
6857 }
6858
6859 /* Ok, lookup the key and get the range */
6860 o = lookupKeyRead(c->db,c->argv[1]);
6861 if (o == NULL) {
6862 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6863 } else {
6864 if (o->type != REDIS_ZSET) {
6865 addReply(c,shared.wrongtypeerr);
6866 } else {
6867 zset *zsetobj = o->ptr;
6868 zskiplist *zsl = zsetobj->zsl;
6869 zskiplistNode *ln;
6870 robj *ele, *lenobj = NULL;
6871 unsigned long rangelen = 0;
6872
6873 /* Get the first node with the score >= min, or with
6874 * score > min if 'minex' is true. */
6875 ln = zslFirstWithScore(zsl,min);
6876 while (minex && ln && ln->score == min) ln = ln->forward[0];
6877
6878 if (ln == NULL) {
6879 /* No element matching the speciifed interval */
6880 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6881 return;
6882 }
6883
6884 /* We don't know in advance how many matching elements there
6885 * are in the list, so we push this object that will represent
6886 * the multi-bulk length in the output buffer, and will "fix"
6887 * it later */
6888 if (!justcount) {
6889 lenobj = createObject(REDIS_STRING,NULL);
6890 addReply(c,lenobj);
6891 decrRefCount(lenobj);
6892 }
6893
6894 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6895 if (offset) {
6896 offset--;
6897 ln = ln->forward[0];
6898 continue;
6899 }
6900 if (limit == 0) break;
6901 if (!justcount) {
6902 ele = ln->obj;
6903 addReplyBulk(c,ele);
6904 if (withscores)
6905 addReplyDouble(c,ln->score);
6906 }
6907 ln = ln->forward[0];
6908 rangelen++;
6909 if (limit > 0) limit--;
6910 }
6911 if (justcount) {
6912 addReplyLongLong(c,(long)rangelen);
6913 } else {
6914 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6915 withscores ? (rangelen*2) : rangelen);
6916 }
6917 }
6918 }
6919 }
6920
6921 static void zrangebyscoreCommand(redisClient *c) {
6922 genericZrangebyscoreCommand(c,0);
6923 }
6924
6925 static void zcountCommand(redisClient *c) {
6926 genericZrangebyscoreCommand(c,1);
6927 }
6928
6929 static void zcardCommand(redisClient *c) {
6930 robj *o;
6931 zset *zs;
6932
6933 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6934 checkType(c,o,REDIS_ZSET)) return;
6935
6936 zs = o->ptr;
6937 addReplyUlong(c,zs->zsl->length);
6938 }
6939
6940 static void zscoreCommand(redisClient *c) {
6941 robj *o;
6942 zset *zs;
6943 dictEntry *de;
6944
6945 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6946 checkType(c,o,REDIS_ZSET)) return;
6947
6948 zs = o->ptr;
6949 de = dictFind(zs->dict,c->argv[2]);
6950 if (!de) {
6951 addReply(c,shared.nullbulk);
6952 } else {
6953 double *score = dictGetEntryVal(de);
6954
6955 addReplyDouble(c,*score);
6956 }
6957 }
6958
6959 static void zrankGenericCommand(redisClient *c, int reverse) {
6960 robj *o;
6961 zset *zs;
6962 zskiplist *zsl;
6963 dictEntry *de;
6964 unsigned long rank;
6965 double *score;
6966
6967 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6968 checkType(c,o,REDIS_ZSET)) return;
6969
6970 zs = o->ptr;
6971 zsl = zs->zsl;
6972 de = dictFind(zs->dict,c->argv[2]);
6973 if (!de) {
6974 addReply(c,shared.nullbulk);
6975 return;
6976 }
6977
6978 score = dictGetEntryVal(de);
6979 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
6980 if (rank) {
6981 if (reverse) {
6982 addReplyLongLong(c, zsl->length - rank);
6983 } else {
6984 addReplyLongLong(c, rank-1);
6985 }
6986 } else {
6987 addReply(c,shared.nullbulk);
6988 }
6989 }
6990
6991 static void zrankCommand(redisClient *c) {
6992 zrankGenericCommand(c, 0);
6993 }
6994
6995 static void zrevrankCommand(redisClient *c) {
6996 zrankGenericCommand(c, 1);
6997 }
6998
6999 /* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which half of a hash entry is wanted: the field name,
 * the value, or both OR-ed together. Users test them with '&'. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
7002
7003 /* Check the length of a number of objects to see if we need to convert a
7004 * zipmap to a real hash. Note that we only check string encoded objects
7005 * as their string length can be queried in constant time. */
7006 static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
7007 int i;
7008 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
7009
7010 for (i = start; i <= end; i++) {
7011 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
7012 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
7013 {
7014 convertToRealHash(subject);
7015 return;
7016 }
7017 }
7018 }
7019
7020 /* Encode given objects in-place when the hash uses a dict. */
7021 static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
7022 if (subject->encoding == REDIS_ENCODING_HT) {
7023 if (o1) *o1 = tryObjectEncoding(*o1);
7024 if (o2) *o2 = tryObjectEncoding(*o2);
7025 }
7026 }
7027
7028 /* Get the value from a hash identified by key. Returns either a string
7029 * object or NULL if the value cannot be found. The refcount of the object
7030 * is always increased by 1 when the value was found. */
7031 static robj *hashTypeGet(robj *o, robj *key) {
7032 robj *value = NULL;
7033 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7034 unsigned char *v;
7035 unsigned int vlen;
7036 key = getDecodedObject(key);
7037 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
7038 value = createStringObject((char*)v,vlen);
7039 }
7040 decrRefCount(key);
7041 } else {
7042 dictEntry *de = dictFind(o->ptr,key);
7043 if (de != NULL) {
7044 value = dictGetEntryVal(de);
7045 incrRefCount(value);
7046 }
7047 }
7048 return value;
7049 }
7050
7051 /* Test if the key exists in the given hash. Returns 1 if the key
7052 * exists and 0 when it doesn't. */
7053 static int hashTypeExists(robj *o, robj *key) {
7054 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7055 key = getDecodedObject(key);
7056 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
7057 decrRefCount(key);
7058 return 1;
7059 }
7060 decrRefCount(key);
7061 } else {
7062 if (dictFind(o->ptr,key) != NULL) {
7063 return 1;
7064 }
7065 }
7066 return 0;
7067 }
7068
7069 /* Add an element, discard the old if the key already exists.
7070 * Return 0 on insert and 1 on update. */
7071 static int hashTypeSet(robj *o, robj *key, robj *value) {
7072 int update = 0;
7073 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7074 key = getDecodedObject(key);
7075 value = getDecodedObject(value);
7076 o->ptr = zipmapSet(o->ptr,
7077 key->ptr,sdslen(key->ptr),
7078 value->ptr,sdslen(value->ptr), &update);
7079 decrRefCount(key);
7080 decrRefCount(value);
7081
7082 /* Check if the zipmap needs to be upgraded to a real hash table */
7083 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
7084 convertToRealHash(o);
7085 } else {
7086 if (dictReplace(o->ptr,key,value)) {
7087 /* Insert */
7088 incrRefCount(key);
7089 } else {
7090 /* Update */
7091 update = 1;
7092 }
7093 incrRefCount(value);
7094 }
7095 return update;
7096 }
7097
7098 /* Delete an element from a hash.
7099 * Return 1 on deleted and 0 on not found. */
7100 static int hashTypeDelete(robj *o, robj *key) {
7101 int deleted = 0;
7102 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7103 key = getDecodedObject(key);
7104 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
7105 decrRefCount(key);
7106 } else {
7107 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
7108 /* Always check if the dictionary needs a resize after a delete. */
7109 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
7110 }
7111 return deleted;
7112 }
7113
7114 /* Return the number of elements in a hash. */
7115 static unsigned long hashTypeLength(robj *o) {
7116 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7117 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7118 }
7119
/* Structure to hold the hash iteration abstraction. Note that iteration
 * over hashes involves both fields and values. Because it is possible that
 * not both are required, store pointers in the iterator to avoid
 * unnecessary memory allocation for fields/values. */
typedef struct {
    int encoding;               /* encoding of the hash when the iterator was created */

    /* State used for zipmap-encoded hashes. */
    unsigned char *zi;          /* zipmap cursor, advanced by zipmapNext() */
    unsigned char *zk, *zv;     /* current field / value bytes, filled in by zipmapNext() */
    unsigned int zklen, zvlen;  /* lengths of zk / zv */

    /* State used for dict-encoded hashes. */
    dictIterator *di;           /* underlying dict iterator */
    dictEntry *de;              /* entry returned by the last dictNext() */
} hashTypeIterator;
7133
7134 static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7135 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7136 hi->encoding = subject->encoding;
7137 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7138 hi->zi = zipmapRewind(subject->ptr);
7139 } else if (hi->encoding == REDIS_ENCODING_HT) {
7140 hi->di = dictGetIterator(subject->ptr);
7141 } else {
7142 redisAssert(NULL);
7143 }
7144 return hi;
7145 }
7146
7147 static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7148 if (hi->encoding == REDIS_ENCODING_HT) {
7149 dictReleaseIterator(hi->di);
7150 }
7151 zfree(hi);
7152 }
7153
7154 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7155 * could be found and REDIS_ERR when the iterator reaches the end. */
7156 static int hashTypeNext(hashTypeIterator *hi) {
7157 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7158 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7159 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7160 } else {
7161 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7162 }
7163 return REDIS_OK;
7164 }
7165
7166 /* Get key or value object at current iteration position.
7167 * This increases the refcount of the field object by 1. */
7168 static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7169 robj *o;
7170 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7171 if (what & REDIS_HASH_KEY) {
7172 o = createStringObject((char*)hi->zk,hi->zklen);
7173 } else {
7174 o = createStringObject((char*)hi->zv,hi->zvlen);
7175 }
7176 } else {
7177 if (what & REDIS_HASH_KEY) {
7178 o = dictGetEntryKey(hi->de);
7179 } else {
7180 o = dictGetEntryVal(hi->de);
7181 }
7182 incrRefCount(o);
7183 }
7184 return o;
7185 }
7186
7187 static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7188 robj *o = lookupKeyWrite(c->db,key);
7189 if (o == NULL) {
7190 o = createHashObject();
7191 dbAdd(c->db,key,o);
7192 } else {
7193 if (o->type != REDIS_HASH) {
7194 addReply(c,shared.wrongtypeerr);
7195 return NULL;
7196 }
7197 }
7198 return o;
7199 }
7200
7201 /* ============================= Hash commands ============================== */
7202 static void hsetCommand(redisClient *c) {
7203 int update;
7204 robj *o;
7205
7206 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7207 hashTypeTryConversion(o,c->argv,2,3);
7208 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7209 update = hashTypeSet(o,c->argv[2],c->argv[3]);
7210 addReply(c, update ? shared.czero : shared.cone);
7211 server.dirty++;
7212 }
7213
7214 static void hsetnxCommand(redisClient *c) {
7215 robj *o;
7216 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7217 hashTypeTryConversion(o,c->argv,2,3);
7218
7219 if (hashTypeExists(o, c->argv[2])) {
7220 addReply(c, shared.czero);
7221 } else {
7222 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7223 hashTypeSet(o,c->argv[2],c->argv[3]);
7224 addReply(c, shared.cone);
7225 server.dirty++;
7226 }
7227 }
7228
7229 static void hmsetCommand(redisClient *c) {
7230 int i;
7231 robj *o;
7232
7233 if ((c->argc % 2) == 1) {
7234 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7235 return;
7236 }
7237
7238 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7239 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7240 for (i = 2; i < c->argc; i += 2) {
7241 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7242 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7243 }
7244 addReply(c, shared.ok);
7245 server.dirty++;
7246 }
7247
7248 static void hincrbyCommand(redisClient *c) {
7249 long long value, incr;
7250 robj *o, *current, *new;
7251
7252 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7253 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7254 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
7255 if (getLongLongFromObjectOrReply(c,current,&value,
7256 "hash value is not an integer") != REDIS_OK) {
7257 decrRefCount(current);
7258 return;
7259 }
7260 decrRefCount(current);
7261 } else {
7262 value = 0;
7263 }
7264
7265 value += incr;
7266 new = createStringObjectFromLongLong(value);
7267 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7268 hashTypeSet(o,c->argv[2],new);
7269 decrRefCount(new);
7270 addReplyLongLong(c,value);
7271 server.dirty++;
7272 }
7273
7274 static void hgetCommand(redisClient *c) {
7275 robj *o, *value;
7276 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7277 checkType(c,o,REDIS_HASH)) return;
7278
7279 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7280 addReplyBulk(c,value);
7281 decrRefCount(value);
7282 } else {
7283 addReply(c,shared.nullbulk);
7284 }
7285 }
7286
7287 static void hmgetCommand(redisClient *c) {
7288 int i;
7289 robj *o, *value;
7290 o = lookupKeyRead(c->db,c->argv[1]);
7291 if (o != NULL && o->type != REDIS_HASH) {
7292 addReply(c,shared.wrongtypeerr);
7293 }
7294
7295 /* Note the check for o != NULL happens inside the loop. This is
7296 * done because objects that cannot be found are considered to be
7297 * an empty hash. The reply should then be a series of NULLs. */
7298 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7299 for (i = 2; i < c->argc; i++) {
7300 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7301 addReplyBulk(c,value);
7302 decrRefCount(value);
7303 } else {
7304 addReply(c,shared.nullbulk);
7305 }
7306 }
7307 }
7308
7309 static void hdelCommand(redisClient *c) {
7310 robj *o;
7311 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7312 checkType(c,o,REDIS_HASH)) return;
7313
7314 if (hashTypeDelete(o,c->argv[2])) {
7315 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7316 addReply(c,shared.cone);
7317 server.dirty++;
7318 } else {
7319 addReply(c,shared.czero);
7320 }
7321 }
7322
7323 static void hlenCommand(redisClient *c) {
7324 robj *o;
7325 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7326 checkType(c,o,REDIS_HASH)) return;
7327
7328 addReplyUlong(c,hashTypeLength(o));
7329 }
7330
7331 static void genericHgetallCommand(redisClient *c, int flags) {
7332 robj *o, *lenobj, *obj;
7333 unsigned long count = 0;
7334 hashTypeIterator *hi;
7335
7336 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7337 || checkType(c,o,REDIS_HASH)) return;
7338
7339 lenobj = createObject(REDIS_STRING,NULL);
7340 addReply(c,lenobj);
7341 decrRefCount(lenobj);
7342
7343 hi = hashTypeInitIterator(o);
7344 while (hashTypeNext(hi) != REDIS_ERR) {
7345 if (flags & REDIS_HASH_KEY) {
7346 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7347 addReplyBulk(c,obj);
7348 decrRefCount(obj);
7349 count++;
7350 }
7351 if (flags & REDIS_HASH_VALUE) {
7352 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7353 addReplyBulk(c,obj);
7354 decrRefCount(obj);
7355 count++;
7356 }
7357 }
7358 hashTypeReleaseIterator(hi);
7359
7360 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7361 }
7362
7363 static void hkeysCommand(redisClient *c) {
7364 genericHgetallCommand(c,REDIS_HASH_KEY);
7365 }
7366
7367 static void hvalsCommand(redisClient *c) {
7368 genericHgetallCommand(c,REDIS_HASH_VALUE);
7369 }
7370
7371 static void hgetallCommand(redisClient *c) {
7372 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7373 }
7374
7375 static void hexistsCommand(redisClient *c) {
7376 robj *o;
7377 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7378 checkType(c,o,REDIS_HASH)) return;
7379
7380 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
7381 }
7382
7383 static void convertToRealHash(robj *o) {
7384 unsigned char *key, *val, *p, *zm = o->ptr;
7385 unsigned int klen, vlen;
7386 dict *dict = dictCreate(&hashDictType,NULL);
7387
7388 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7389 p = zipmapRewind(zm);
7390 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7391 robj *keyobj, *valobj;
7392
7393 keyobj = createStringObject((char*)key,klen);
7394 valobj = createStringObject((char*)val,vlen);
7395 keyobj = tryObjectEncoding(keyobj);
7396 valobj = tryObjectEncoding(valobj);
7397 dictAdd(dict,keyobj,valobj);
7398 }
7399 o->encoding = REDIS_ENCODING_HT;
7400 o->ptr = dict;
7401 zfree(zm);
7402 }
7403
7404 /* ========================= Non type-specific commands ==================== */
7405
7406 static void flushdbCommand(redisClient *c) {
7407 server.dirty += dictSize(c->db->dict);
7408 touchWatchedKeysOnFlush(c->db->id);
7409 dictEmpty(c->db->dict);
7410 dictEmpty(c->db->expires);
7411 addReply(c,shared.ok);
7412 }
7413
/* FLUSHALL: empty every database, reply OK, then write a fresh dump with
 * rdbSave(). The statement order below is deliberate and left untouched. */
static void flushallCommand(redisClient *c) {
    /* -1 is passed instead of a database id; presumably "all databases",
     * confirm against touchWatchedKeysOnFlush(). */
    touchWatchedKeysOnFlush(-1);
    /* emptyDb()'s return value is added to the dirty counter. */
    server.dirty += emptyDb();
    addReply(c,shared.ok);
    /* A background save child is running (pid != -1): kill it and remove
     * its temp file before saving synchronously. */
    if (server.bgsavechildpid != -1) {
        kill(server.bgsavechildpid,SIGKILL);
        rdbRemoveTempFile(server.bgsavechildpid);
    }
    rdbSave(server.dbfilename);
    /* NOTE(review): presumably rdbSave() resets server.dirty, hence the
     * extra increment afterwards; confirm. */
    server.dirty++;
}
7425
7426 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7427 redisSortOperation *so = zmalloc(sizeof(*so));
7428 so->type = type;
7429 so->pattern = pattern;
7430 return so;
7431 }
7432
7433 /* Return the value associated to the key with a name obtained
7434 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7435 * The returned object will always have its refcount increased by 1
7436 * when it is non-NULL. */
7437 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7438 char *p, *f;
7439 sds spat, ssub;
7440 robj keyobj, fieldobj, *o;
7441 int prefixlen, sublen, postfixlen, fieldlen;
7442 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7443 struct {
7444 long len;
7445 long free;
7446 char buf[REDIS_SORTKEY_MAX+1];
7447 } keyname, fieldname;
7448
7449 /* If the pattern is "#" return the substitution object itself in order
7450 * to implement the "SORT ... GET #" feature. */
7451 spat = pattern->ptr;
7452 if (spat[0] == '#' && spat[1] == '\0') {
7453 incrRefCount(subst);
7454 return subst;
7455 }
7456
7457 /* The substitution object may be specially encoded. If so we create
7458 * a decoded object on the fly. Otherwise getDecodedObject will just
7459 * increment the ref count, that we'll decrement later. */
7460 subst = getDecodedObject(subst);
7461
7462 ssub = subst->ptr;
7463 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7464 p = strchr(spat,'*');
7465 if (!p) {
7466 decrRefCount(subst);
7467 return NULL;
7468 }
7469
7470 /* Find out if we're dealing with a hash dereference. */
7471 if ((f = strstr(p+1, "->")) != NULL) {
7472 fieldlen = sdslen(spat)-(f-spat);
7473 /* this also copies \0 character */
7474 memcpy(fieldname.buf,f+2,fieldlen-1);
7475 fieldname.len = fieldlen-2;
7476 } else {
7477 fieldlen = 0;
7478 }
7479
7480 prefixlen = p-spat;
7481 sublen = sdslen(ssub);
7482 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7483 memcpy(keyname.buf,spat,prefixlen);
7484 memcpy(keyname.buf+prefixlen,ssub,sublen);
7485 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7486 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7487 keyname.len = prefixlen+sublen+postfixlen;
7488 decrRefCount(subst);
7489
7490 /* Lookup substituted key */
7491 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7492 o = lookupKeyRead(db,&keyobj);
7493 if (o == NULL) return NULL;
7494
7495 if (fieldlen > 0) {
7496 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7497
7498 /* Retrieve value from hash by the field name. This operation
7499 * already increases the refcount of the returned object. */
7500 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7501 o = hashTypeGet(o, &fieldobj);
7502 } else {
7503 if (o->type != REDIS_STRING) return NULL;
7504
7505 /* Every object that this function returns needs to have its refcount
7506 * increased. sortCommand decreases it again. */
7507 incrRefCount(o);
7508 }
7509
7510 return o;
7511 }
7512
7513 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7514 * the additional parameter is not standard but a BSD-specific we have to
7515 * pass sorting parameters via the global 'server' structure */
7516 static int sortCompare(const void *s1, const void *s2) {
7517 const redisSortObject *so1 = s1, *so2 = s2;
7518 int cmp;
7519
7520 if (!server.sort_alpha) {
7521 /* Numeric sorting. Here it's trivial as we precomputed scores */
7522 if (so1->u.score > so2->u.score) {
7523 cmp = 1;
7524 } else if (so1->u.score < so2->u.score) {
7525 cmp = -1;
7526 } else {
7527 cmp = 0;
7528 }
7529 } else {
7530 /* Alphanumeric sorting */
7531 if (server.sort_bypattern) {
7532 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7533 /* At least one compare object is NULL */
7534 if (so1->u.cmpobj == so2->u.cmpobj)
7535 cmp = 0;
7536 else if (so1->u.cmpobj == NULL)
7537 cmp = -1;
7538 else
7539 cmp = 1;
7540 } else {
7541 /* We have both the objects, use strcoll */
7542 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7543 }
7544 } else {
7545 /* Compare elements directly. */
7546 cmp = compareStringObjects(so1->obj,so2->obj);
7547 }
7548 }
7549 return server.sort_desc ? -cmp : cmp;
7550 }
7551
7552 /* The SORT command is the most complex command in Redis. Warning: this code
7553 * is optimized for speed and a bit less for readability */
7554 static void sortCommand(redisClient *c) {
7555 list *operations;
7556 unsigned int outputlen = 0;
7557 int desc = 0, alpha = 0;
7558 int limit_start = 0, limit_count = -1, start, end;
7559 int j, dontsort = 0, vectorlen;
7560 int getop = 0; /* GET operation counter */
7561 robj *sortval, *sortby = NULL, *storekey = NULL;
7562 redisSortObject *vector; /* Resulting vector to sort */
7563
7564 /* Lookup the key to sort. It must be of the right types */
7565 sortval = lookupKeyRead(c->db,c->argv[1]);
7566 if (sortval == NULL) {
7567 addReply(c,shared.emptymultibulk);
7568 return;
7569 }
7570 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7571 sortval->type != REDIS_ZSET)
7572 {
7573 addReply(c,shared.wrongtypeerr);
7574 return;
7575 }
7576
7577 /* Create a list of operations to perform for every sorted element.
7578 * Operations can be GET/DEL/INCR/DECR */
7579 operations = listCreate();
7580 listSetFreeMethod(operations,zfree);
7581 j = 2;
7582
7583 /* Now we need to protect sortval incrementing its count, in the future
7584 * SORT may have options able to overwrite/delete keys during the sorting
7585 * and the sorted key itself may get destroied */
7586 incrRefCount(sortval);
7587
7588 /* The SORT command has an SQL-alike syntax, parse it */
7589 while(j < c->argc) {
7590 int leftargs = c->argc-j-1;
7591 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7592 desc = 0;
7593 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7594 desc = 1;
7595 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7596 alpha = 1;
7597 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7598 limit_start = atoi(c->argv[j+1]->ptr);
7599 limit_count = atoi(c->argv[j+2]->ptr);
7600 j+=2;
7601 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7602 storekey = c->argv[j+1];
7603 j++;
7604 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7605 sortby = c->argv[j+1];
7606 /* If the BY pattern does not contain '*', i.e. it is constant,
7607 * we don't need to sort nor to lookup the weight keys. */
7608 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7609 j++;
7610 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7611 listAddNodeTail(operations,createSortOperation(
7612 REDIS_SORT_GET,c->argv[j+1]));
7613 getop++;
7614 j++;
7615 } else {
7616 decrRefCount(sortval);
7617 listRelease(operations);
7618 addReply(c,shared.syntaxerr);
7619 return;
7620 }
7621 j++;
7622 }
7623
7624 /* Load the sorting vector with all the objects to sort */
7625 switch(sortval->type) {
7626 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
7627 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7628 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7629 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7630 }
7631 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7632 j = 0;
7633
7634 if (sortval->type == REDIS_LIST) {
7635 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7636 listTypeEntry entry;
7637 while(listTypeNext(li,&entry)) {
7638 vector[j].obj = listTypeGet(&entry);
7639 vector[j].u.score = 0;
7640 vector[j].u.cmpobj = NULL;
7641 j++;
7642 }
7643 listTypeReleaseIterator(li);
7644 } else {
7645 dict *set;
7646 dictIterator *di;
7647 dictEntry *setele;
7648
7649 if (sortval->type == REDIS_SET) {
7650 set = sortval->ptr;
7651 } else {
7652 zset *zs = sortval->ptr;
7653 set = zs->dict;
7654 }
7655
7656 di = dictGetIterator(set);
7657 while((setele = dictNext(di)) != NULL) {
7658 vector[j].obj = dictGetEntryKey(setele);
7659 vector[j].u.score = 0;
7660 vector[j].u.cmpobj = NULL;
7661 j++;
7662 }
7663 dictReleaseIterator(di);
7664 }
7665 redisAssert(j == vectorlen);
7666
7667 /* Now it's time to load the right scores in the sorting vector */
7668 if (dontsort == 0) {
7669 for (j = 0; j < vectorlen; j++) {
7670 robj *byval;
7671 if (sortby) {
7672 /* lookup value to sort by */
7673 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7674 if (!byval) continue;
7675 } else {
7676 /* use object itself to sort by */
7677 byval = vector[j].obj;
7678 }
7679
7680 if (alpha) {
7681 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7682 } else {
7683 if (byval->encoding == REDIS_ENCODING_RAW) {
7684 vector[j].u.score = strtod(byval->ptr,NULL);
7685 } else if (byval->encoding == REDIS_ENCODING_INT) {
7686 /* Don't need to decode the object if it's
7687 * integer-encoded (the only encoding supported) so
7688 * far. We can just cast it */
7689 vector[j].u.score = (long)byval->ptr;
7690 } else {
7691 redisAssert(1 != 1);
7692 }
7693 }
7694
7695 /* when the object was retrieved using lookupKeyByPattern,
7696 * its refcount needs to be decreased. */
7697 if (sortby) {
7698 decrRefCount(byval);
7699 }
7700 }
7701 }
7702
7703 /* We are ready to sort the vector... perform a bit of sanity check
7704 * on the LIMIT option too. We'll use a partial version of quicksort. */
7705 start = (limit_start < 0) ? 0 : limit_start;
7706 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7707 if (start >= vectorlen) {
7708 start = vectorlen-1;
7709 end = vectorlen-2;
7710 }
7711 if (end >= vectorlen) end = vectorlen-1;
7712
7713 if (dontsort == 0) {
7714 server.sort_desc = desc;
7715 server.sort_alpha = alpha;
7716 server.sort_bypattern = sortby ? 1 : 0;
7717 if (sortby && (start != 0 || end != vectorlen-1))
7718 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7719 else
7720 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7721 }
7722
7723 /* Send command output to the output buffer, performing the specified
7724 * GET/DEL/INCR/DECR operations if any. */
7725 outputlen = getop ? getop*(end-start+1) : end-start+1;
7726 if (storekey == NULL) {
7727 /* STORE option not specified, sent the sorting result to client */
7728 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7729 for (j = start; j <= end; j++) {
7730 listNode *ln;
7731 listIter li;
7732
7733 if (!getop) addReplyBulk(c,vector[j].obj);
7734 listRewind(operations,&li);
7735 while((ln = listNext(&li))) {
7736 redisSortOperation *sop = ln->value;
7737 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7738 vector[j].obj);
7739
7740 if (sop->type == REDIS_SORT_GET) {
7741 if (!val) {
7742 addReply(c,shared.nullbulk);
7743 } else {
7744 addReplyBulk(c,val);
7745 decrRefCount(val);
7746 }
7747 } else {
7748 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7749 }
7750 }
7751 }
7752 } else {
7753 robj *sobj = createZiplistObject();
7754
7755 /* STORE option specified, set the sorting result as a List object */
7756 for (j = start; j <= end; j++) {
7757 listNode *ln;
7758 listIter li;
7759
7760 if (!getop) {
7761 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
7762 } else {
7763 listRewind(operations,&li);
7764 while((ln = listNext(&li))) {
7765 redisSortOperation *sop = ln->value;
7766 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7767 vector[j].obj);
7768
7769 if (sop->type == REDIS_SORT_GET) {
7770 if (!val) val = createStringObject("",0);
7771
7772 /* listTypePush does an incrRefCount, so we should take care
7773 * care of the incremented refcount caused by either
7774 * lookupKeyByPattern or createStringObject("",0) */
7775 listTypePush(sobj,val,REDIS_TAIL);
7776 decrRefCount(val);
7777 } else {
7778 /* always fails */
7779 redisAssert(sop->type == REDIS_SORT_GET);
7780 }
7781 }
7782 }
7783 }
7784 dbReplace(c->db,storekey,sobj);
7785 /* Note: we add 1 because the DB is dirty anyway since even if the
7786 * SORT result is empty a new key is set and maybe the old content
7787 * replaced. */
7788 server.dirty += 1+outputlen;
7789 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7790 }
7791
7792 /* Cleanup */
7793 if (sortval->type == REDIS_LIST)
7794 for (j = 0; j < vectorlen; j++)
7795 decrRefCount(vector[j].obj);
7796 decrRefCount(sortval);
7797 listRelease(operations);
7798 for (j = 0; j < vectorlen; j++) {
7799 if (alpha && vector[j].u.cmpobj)
7800 decrRefCount(vector[j].u.cmpobj);
7801 }
7802 zfree(vector);
7803 }
7804
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the formatted value plus the
 * terminating NUL (the longest output is a 20 digit byte count followed by
 * 'B'). The buffer is always written, whatever the magnitude of 'n':
 * values of 1 TiB and above are reported in T or P units, and anything
 * beyond that falls back to a plain byte count. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the units above: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
7825
7826 /* Create the string returned by the INFO command. This is decoupled
7827 * by the INFO command itself as we need to report the same information
7828 * on memory corruption problems. */
7829 static sds genRedisInfoString(void) {
7830 sds info;
7831 time_t uptime = time(NULL)-server.stat_starttime;
7832 int j;
7833 char hmem[64];
7834
7835 bytesToHuman(hmem,zmalloc_used_memory());
7836 info = sdscatprintf(sdsempty(),
7837 "redis_version:%s\r\n"
7838 "redis_git_sha1:%s\r\n"
7839 "redis_git_dirty:%d\r\n"
7840 "arch_bits:%s\r\n"
7841 "multiplexing_api:%s\r\n"
7842 "process_id:%ld\r\n"
7843 "uptime_in_seconds:%ld\r\n"
7844 "uptime_in_days:%ld\r\n"
7845 "connected_clients:%d\r\n"
7846 "connected_slaves:%d\r\n"
7847 "blocked_clients:%d\r\n"
7848 "used_memory:%zu\r\n"
7849 "used_memory_human:%s\r\n"
7850 "changes_since_last_save:%lld\r\n"
7851 "bgsave_in_progress:%d\r\n"
7852 "last_save_time:%ld\r\n"
7853 "bgrewriteaof_in_progress:%d\r\n"
7854 "total_connections_received:%lld\r\n"
7855 "total_commands_processed:%lld\r\n"
7856 "expired_keys:%lld\r\n"
7857 "hash_max_zipmap_entries:%zu\r\n"
7858 "hash_max_zipmap_value:%zu\r\n"
7859 "pubsub_channels:%ld\r\n"
7860 "pubsub_patterns:%u\r\n"
7861 "vm_enabled:%d\r\n"
7862 "role:%s\r\n"
7863 ,REDIS_VERSION,
7864 REDIS_GIT_SHA1,
7865 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7866 (sizeof(long) == 8) ? "64" : "32",
7867 aeGetApiName(),
7868 (long) getpid(),
7869 uptime,
7870 uptime/(3600*24),
7871 listLength(server.clients)-listLength(server.slaves),
7872 listLength(server.slaves),
7873 server.blpop_blocked_clients,
7874 zmalloc_used_memory(),
7875 hmem,
7876 server.dirty,
7877 server.bgsavechildpid != -1,
7878 server.lastsave,
7879 server.bgrewritechildpid != -1,
7880 server.stat_numconnections,
7881 server.stat_numcommands,
7882 server.stat_expiredkeys,
7883 server.hash_max_zipmap_entries,
7884 server.hash_max_zipmap_value,
7885 dictSize(server.pubsub_channels),
7886 listLength(server.pubsub_patterns),
7887 server.vm_enabled != 0,
7888 server.masterhost == NULL ? "master" : "slave"
7889 );
7890 if (server.masterhost) {
7891 info = sdscatprintf(info,
7892 "master_host:%s\r\n"
7893 "master_port:%d\r\n"
7894 "master_link_status:%s\r\n"
7895 "master_last_io_seconds_ago:%d\r\n"
7896 ,server.masterhost,
7897 server.masterport,
7898 (server.replstate == REDIS_REPL_CONNECTED) ?
7899 "up" : "down",
7900 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7901 );
7902 }
7903 if (server.vm_enabled) {
7904 lockThreadedIO();
7905 info = sdscatprintf(info,
7906 "vm_conf_max_memory:%llu\r\n"
7907 "vm_conf_page_size:%llu\r\n"
7908 "vm_conf_pages:%llu\r\n"
7909 "vm_stats_used_pages:%llu\r\n"
7910 "vm_stats_swapped_objects:%llu\r\n"
7911 "vm_stats_swappin_count:%llu\r\n"
7912 "vm_stats_swappout_count:%llu\r\n"
7913 "vm_stats_io_newjobs_len:%lu\r\n"
7914 "vm_stats_io_processing_len:%lu\r\n"
7915 "vm_stats_io_processed_len:%lu\r\n"
7916 "vm_stats_io_active_threads:%lu\r\n"
7917 "vm_stats_blocked_clients:%lu\r\n"
7918 ,(unsigned long long) server.vm_max_memory,
7919 (unsigned long long) server.vm_page_size,
7920 (unsigned long long) server.vm_pages,
7921 (unsigned long long) server.vm_stats_used_pages,
7922 (unsigned long long) server.vm_stats_swapped_objects,
7923 (unsigned long long) server.vm_stats_swapins,
7924 (unsigned long long) server.vm_stats_swapouts,
7925 (unsigned long) listLength(server.io_newjobs),
7926 (unsigned long) listLength(server.io_processing),
7927 (unsigned long) listLength(server.io_processed),
7928 (unsigned long) server.io_active_threads,
7929 (unsigned long) server.vm_blocked_clients
7930 );
7931 unlockThreadedIO();
7932 }
7933 for (j = 0; j < server.dbnum; j++) {
7934 long long keys, vkeys;
7935
7936 keys = dictSize(server.db[j].dict);
7937 vkeys = dictSize(server.db[j].expires);
7938 if (keys || vkeys) {
7939 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7940 j, keys, vkeys);
7941 }
7942 }
7943 return info;
7944 }
7945
7946 static void infoCommand(redisClient *c) {
7947 sds info = genRedisInfoString();
7948 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7949 (unsigned long)sdslen(info)));
7950 addReplySds(c,info);
7951 addReply(c,shared.crlf);
7952 }
7953
7954 static void monitorCommand(redisClient *c) {
7955 /* ignore MONITOR if aleady slave or in monitor mode */
7956 if (c->flags & REDIS_SLAVE) return;
7957
7958 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7959 c->slaveseldb = 0;
7960 listAddNodeTail(server.monitors,c);
7961 addReply(c,shared.ok);
7962 }
7963
7964 /* ================================= Expire ================================= */
7965 static int removeExpire(redisDb *db, robj *key) {
7966 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7967 return 1;
7968 } else {
7969 return 0;
7970 }
7971 }
7972
7973 static int setExpire(redisDb *db, robj *key, time_t when) {
7974 sds copy = sdsdup(key->ptr);
7975 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7976 sdsfree(copy);
7977 return 0;
7978 } else {
7979 return 1;
7980 }
7981 }
7982
7983 /* Return the expire time of the specified key, or -1 if no expire
7984 * is associated with this key (i.e. the key is non volatile) */
7985 static time_t getExpire(redisDb *db, robj *key) {
7986 dictEntry *de;
7987
7988 /* No expire? return ASAP */
7989 if (dictSize(db->expires) == 0 ||
7990 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7991
7992 return (time_t) dictGetEntryVal(de);
7993 }
7994
7995 static int expireIfNeeded(redisDb *db, robj *key) {
7996 time_t when;
7997 dictEntry *de;
7998
7999 /* No expire? return ASAP */
8000 if (dictSize(db->expires) == 0 ||
8001 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
8002
8003 /* Lookup the expire */
8004 when = (time_t) dictGetEntryVal(de);
8005 if (time(NULL) <= when) return 0;
8006
8007 /* Delete the key */
8008 dbDelete(db,key);
8009 server.stat_expiredkeys++;
8010 return 1;
8011 }
8012
8013 static int deleteIfVolatile(redisDb *db, robj *key) {
8014 dictEntry *de;
8015
8016 /* No expire? return ASAP */
8017 if (dictSize(db->expires) == 0 ||
8018 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
8019
8020 /* Delete the key */
8021 server.dirty++;
8022 server.stat_expiredkeys++;
8023 dictDelete(db->expires,key->ptr);
8024 return dictDelete(db->dict,key->ptr) == DICT_OK;
8025 }
8026
8027 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
8028 dictEntry *de;
8029 time_t seconds;
8030
8031 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
8032
8033 seconds -= offset;
8034
8035 de = dictFind(c->db->dict,key->ptr);
8036 if (de == NULL) {
8037 addReply(c,shared.czero);
8038 return;
8039 }
8040 if (seconds <= 0) {
8041 if (dbDelete(c->db,key)) server.dirty++;
8042 addReply(c, shared.cone);
8043 return;
8044 } else {
8045 time_t when = time(NULL)+seconds;
8046 if (setExpire(c->db,key,when)) {
8047 addReply(c,shared.cone);
8048 server.dirty++;
8049 } else {
8050 addReply(c,shared.czero);
8051 }
8052 return;
8053 }
8054 }
8055
8056 static void expireCommand(redisClient *c) {
8057 expireGenericCommand(c,c->argv[1],c->argv[2],0);
8058 }
8059
8060 static void expireatCommand(redisClient *c) {
8061 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
8062 }
8063
8064 static void ttlCommand(redisClient *c) {
8065 time_t expire;
8066 int ttl = -1;
8067
8068 expire = getExpire(c->db,c->argv[1]);
8069 if (expire != -1) {
8070 ttl = (int) (expire-time(NULL));
8071 if (ttl < 0) ttl = -1;
8072 }
8073 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
8074 }
8075
8076 /* ================================ MULTI/EXEC ============================== */
8077
8078 /* Client state initialization for MULTI/EXEC */
8079 static void initClientMultiState(redisClient *c) {
8080 c->mstate.commands = NULL;
8081 c->mstate.count = 0;
8082 }
8083
8084 /* Release all the resources associated with MULTI/EXEC state */
8085 static void freeClientMultiState(redisClient *c) {
8086 int j;
8087
8088 for (j = 0; j < c->mstate.count; j++) {
8089 int i;
8090 multiCmd *mc = c->mstate.commands+j;
8091
8092 for (i = 0; i < mc->argc; i++)
8093 decrRefCount(mc->argv[i]);
8094 zfree(mc->argv);
8095 }
8096 zfree(c->mstate.commands);
8097 }
8098
8099 /* Add a new command into the MULTI commands queue */
8100 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
8101 multiCmd *mc;
8102 int j;
8103
8104 c->mstate.commands = zrealloc(c->mstate.commands,
8105 sizeof(multiCmd)*(c->mstate.count+1));
8106 mc = c->mstate.commands+c->mstate.count;
8107 mc->cmd = cmd;
8108 mc->argc = c->argc;
8109 mc->argv = zmalloc(sizeof(robj*)*c->argc);
8110 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8111 for (j = 0; j < c->argc; j++)
8112 incrRefCount(mc->argv[j]);
8113 c->mstate.count++;
8114 }
8115
8116 static void multiCommand(redisClient *c) {
8117 if (c->flags & REDIS_MULTI) {
8118 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8119 return;
8120 }
8121 c->flags |= REDIS_MULTI;
8122 addReply(c,shared.ok);
8123 }
8124
8125 static void discardCommand(redisClient *c) {
8126 if (!(c->flags & REDIS_MULTI)) {
8127 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8128 return;
8129 }
8130
8131 freeClientMultiState(c);
8132 initClientMultiState(c);
8133 c->flags &= (~REDIS_MULTI);
8134 unwatchAllKeys(c);
8135 addReply(c,shared.ok);
8136 }
8137
8138 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8139 * implememntation for more information. */
8140 static void execCommandReplicateMulti(redisClient *c) {
8141 struct redisCommand *cmd;
8142 robj *multistring = createStringObject("MULTI",5);
8143
8144 cmd = lookupCommand("multi");
8145 if (server.appendonly)
8146 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8147 if (listLength(server.slaves))
8148 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8149 decrRefCount(multistring);
8150 }
8151
8152 static void execCommand(redisClient *c) {
8153 int j;
8154 robj **orig_argv;
8155 int orig_argc;
8156
8157 if (!(c->flags & REDIS_MULTI)) {
8158 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8159 return;
8160 }
8161
8162 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8163 * A failed EXEC will return a multi bulk nil object. */
8164 if (c->flags & REDIS_DIRTY_CAS) {
8165 freeClientMultiState(c);
8166 initClientMultiState(c);
8167 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8168 unwatchAllKeys(c);
8169 addReply(c,shared.nullmultibulk);
8170 return;
8171 }
8172
8173 /* Replicate a MULTI request now that we are sure the block is executed.
8174 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8175 * both the AOF and the replication link will have the same consistency
8176 * and atomicity guarantees. */
8177 execCommandReplicateMulti(c);
8178
8179 /* Exec all the queued commands */
8180 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8181 orig_argv = c->argv;
8182 orig_argc = c->argc;
8183 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8184 for (j = 0; j < c->mstate.count; j++) {
8185 c->argc = c->mstate.commands[j].argc;
8186 c->argv = c->mstate.commands[j].argv;
8187 call(c,c->mstate.commands[j].cmd);
8188 }
8189 c->argv = orig_argv;
8190 c->argc = orig_argc;
8191 freeClientMultiState(c);
8192 initClientMultiState(c);
8193 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8194 /* Make sure the EXEC command is always replicated / AOF, since we
8195 * always send the MULTI command (we can't know beforehand if the
8196 * next operations will contain at least a modification to the DB). */
8197 server.dirty++;
8198 }
8199
8200 /* =========================== Blocking Operations ========================= */
8201
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8230
8231 /* Set a client in blocking mode for the specified key, with the specified
8232 * timeout */
8233 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
8234 dictEntry *de;
8235 list *l;
8236 int j;
8237
8238 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8239 c->blocking_keys_num = numkeys;
8240 c->blockingto = timeout;
8241 for (j = 0; j < numkeys; j++) {
8242 /* Add the key in the client structure, to map clients -> keys */
8243 c->blocking_keys[j] = keys[j];
8244 incrRefCount(keys[j]);
8245
8246 /* And in the other "side", to map keys -> clients */
8247 de = dictFind(c->db->blocking_keys,keys[j]);
8248 if (de == NULL) {
8249 int retval;
8250
8251 /* For every key we take a list of clients blocked for it */
8252 l = listCreate();
8253 retval = dictAdd(c->db->blocking_keys,keys[j],l);
8254 incrRefCount(keys[j]);
8255 assert(retval == DICT_OK);
8256 } else {
8257 l = dictGetEntryVal(de);
8258 }
8259 listAddNodeTail(l,c);
8260 }
8261 /* Mark the client as a blocked client */
8262 c->flags |= REDIS_BLOCKED;
8263 server.blpop_blocked_clients++;
8264 }
8265
8266 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8267 static void unblockClientWaitingData(redisClient *c) {
8268 dictEntry *de;
8269 list *l;
8270 int j;
8271
8272 assert(c->blocking_keys != NULL);
8273 /* The client may wait for multiple keys, so unblock it for every key. */
8274 for (j = 0; j < c->blocking_keys_num; j++) {
8275 /* Remove this client from the list of clients waiting for this key. */
8276 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
8277 assert(de != NULL);
8278 l = dictGetEntryVal(de);
8279 listDelNode(l,listSearchKey(l,c));
8280 /* If the list is empty we need to remove it to avoid wasting memory */
8281 if (listLength(l) == 0)
8282 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8283 decrRefCount(c->blocking_keys[j]);
8284 }
8285 /* Cleanup the client structure */
8286 zfree(c->blocking_keys);
8287 c->blocking_keys = NULL;
8288 c->flags &= (~REDIS_BLOCKED);
8289 server.blpop_blocked_clients--;
8290 /* We want to process data if there is some command waiting
8291 * in the input buffer. Note that this is safe even if
8292 * unblockClientWaitingData() gets called from freeClient() because
8293 * freeClient() will be smart enough to call this function
8294 * *after* c->querybuf was set to NULL. */
8295 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8296 }
8297
8298 /* This should be called from any function PUSHing into lists.
8299 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8300 * 'ele' is the element pushed.
8301 *
8302 * If the function returns 0 there was no client waiting for a list push
8303 * against this key.
8304 *
8305 * If the function returns 1 there was a client waiting for a list push
8306 * against this key, the element was passed to this client thus it's not
8307 * needed to actually add it to the list and the caller should return asap. */
8308 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8309 struct dictEntry *de;
8310 redisClient *receiver;
8311 list *l;
8312 listNode *ln;
8313
8314 de = dictFind(c->db->blocking_keys,key);
8315 if (de == NULL) return 0;
8316 l = dictGetEntryVal(de);
8317 ln = listFirst(l);
8318 assert(ln != NULL);
8319 receiver = ln->value;
8320
8321 addReplySds(receiver,sdsnew("*2\r\n"));
8322 addReplyBulk(receiver,key);
8323 addReplyBulk(receiver,ele);
8324 unblockClientWaitingData(receiver);
8325 return 1;
8326 }
8327
8328 /* Blocking RPOP/LPOP */
8329 static void blockingPopGenericCommand(redisClient *c, int where) {
8330 robj *o;
8331 time_t timeout;
8332 int j;
8333
8334 for (j = 1; j < c->argc-1; j++) {
8335 o = lookupKeyWrite(c->db,c->argv[j]);
8336 if (o != NULL) {
8337 if (o->type != REDIS_LIST) {
8338 addReply(c,shared.wrongtypeerr);
8339 return;
8340 } else {
8341 list *list = o->ptr;
8342 if (listLength(list) != 0) {
8343 /* If the list contains elements fall back to the usual
8344 * non-blocking POP operation */
8345 robj *argv[2], **orig_argv;
8346 int orig_argc;
8347
8348 /* We need to alter the command arguments before to call
8349 * popGenericCommand() as the command takes a single key. */
8350 orig_argv = c->argv;
8351 orig_argc = c->argc;
8352 argv[1] = c->argv[j];
8353 c->argv = argv;
8354 c->argc = 2;
8355
8356 /* Also the return value is different, we need to output
8357 * the multi bulk reply header and the key name. The
8358 * "real" command will add the last element (the value)
8359 * for us. If this souds like an hack to you it's just
8360 * because it is... */
8361 addReplySds(c,sdsnew("*2\r\n"));
8362 addReplyBulk(c,argv[1]);
8363 popGenericCommand(c,where);
8364
8365 /* Fix the client structure with the original stuff */
8366 c->argv = orig_argv;
8367 c->argc = orig_argc;
8368 return;
8369 }
8370 }
8371 }
8372 }
8373 /* If the list is empty or the key does not exists we must block */
8374 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8375 if (timeout > 0) timeout += time(NULL);
8376 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8377 }
8378
8379 static void blpopCommand(redisClient *c) {
8380 blockingPopGenericCommand(c,REDIS_HEAD);
8381 }
8382
8383 static void brpopCommand(redisClient *c) {
8384 blockingPopGenericCommand(c,REDIS_TAIL);
8385 }
8386
8387 /* =============================== Replication ============================= */
8388
8389 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8390 ssize_t nwritten, ret = size;
8391 time_t start = time(NULL);
8392
8393 timeout++;
8394 while(size) {
8395 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8396 nwritten = write(fd,ptr,size);
8397 if (nwritten == -1) return -1;
8398 ptr += nwritten;
8399 size -= nwritten;
8400 }
8401 if ((time(NULL)-start) > timeout) {
8402 errno = ETIMEDOUT;
8403 return -1;
8404 }
8405 }
8406 return ret;
8407 }
8408
8409 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8410 ssize_t nread, totread = 0;
8411 time_t start = time(NULL);
8412
8413 timeout++;
8414 while(size) {
8415 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8416 nread = read(fd,ptr,size);
8417 if (nread == -1) return -1;
8418 ptr += nread;
8419 size -= nread;
8420 totread += nread;
8421 }
8422 if ((time(NULL)-start) > timeout) {
8423 errno = ETIMEDOUT;
8424 return -1;
8425 }
8426 }
8427 return totread;
8428 }
8429
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at a
 * time via syncRead() (each byte is read with the given 'timeout').
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is replaced by a NUL. Reading also stops once size-1 bytes
 * have been stored, so the buffer is never overrun and the result is
 * always NUL terminated.
 *
 * Returns the number of bytes stored before the newline (a trailing '\r'
 * is included in the count), or -1 if syncRead() fails. With size <= 0
 * nothing is read or written and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';

    size--; /* Reserve room for the terminating NUL. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of buffer space was consumed. */
        size--;
    }
    return nread;
}
8450
8451 static void syncCommand(redisClient *c) {
8452 /* ignore SYNC if aleady slave or in monitor mode */
8453 if (c->flags & REDIS_SLAVE) return;
8454
8455 /* SYNC can't be issued when the server has pending data to send to
8456 * the client about already issued commands. We need a fresh reply
8457 * buffer registering the differences between the BGSAVE and the current
8458 * dataset, so that we can copy to other slaves if needed. */
8459 if (listLength(c->reply) != 0) {
8460 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8461 return;
8462 }
8463
8464 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8465 /* Here we need to check if there is a background saving operation
8466 * in progress, or if it is required to start one */
8467 if (server.bgsavechildpid != -1) {
8468 /* Ok a background save is in progress. Let's check if it is a good
8469 * one for replication, i.e. if there is another slave that is
8470 * registering differences since the server forked to save */
8471 redisClient *slave;
8472 listNode *ln;
8473 listIter li;
8474
8475 listRewind(server.slaves,&li);
8476 while((ln = listNext(&li))) {
8477 slave = ln->value;
8478 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8479 }
8480 if (ln) {
8481 /* Perfect, the server is already registering differences for
8482 * another slave. Set the right state, and copy the buffer. */
8483 listRelease(c->reply);
8484 c->reply = listDup(slave->reply);
8485 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8486 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8487 } else {
8488 /* No way, we need to wait for the next BGSAVE in order to
8489 * register differences */
8490 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8491 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8492 }
8493 } else {
8494 /* Ok we don't have a BGSAVE in progress, let's start one */
8495 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8496 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8497 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8498 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8499 return;
8500 }
8501 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8502 }
8503 c->repldbfd = -1;
8504 c->flags |= REDIS_SLAVE;
8505 c->slaveseldb = 0;
8506 listAddNodeTail(server.slaves,c);
8507 return;
8508 }
8509
8510 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8511 redisClient *slave = privdata;
8512 REDIS_NOTUSED(el);
8513 REDIS_NOTUSED(mask);
8514 char buf[REDIS_IOBUF_LEN];
8515 ssize_t nwritten, buflen;
8516
8517 if (slave->repldboff == 0) {
8518 /* Write the bulk write count before to transfer the DB. In theory here
8519 * we don't know how much room there is in the output buffer of the
8520 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8521 * operations) will never be smaller than the few bytes we need. */
8522 sds bulkcount;
8523
8524 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8525 slave->repldbsize);
8526 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8527 {
8528 sdsfree(bulkcount);
8529 freeClient(slave);
8530 return;
8531 }
8532 sdsfree(bulkcount);
8533 }
8534 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8535 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8536 if (buflen <= 0) {
8537 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8538 (buflen == 0) ? "premature EOF" : strerror(errno));
8539 freeClient(slave);
8540 return;
8541 }
8542 if ((nwritten = write(fd,buf,buflen)) == -1) {
8543 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8544 strerror(errno));
8545 freeClient(slave);
8546 return;
8547 }
8548 slave->repldboff += nwritten;
8549 if (slave->repldboff == slave->repldbsize) {
8550 close(slave->repldbfd);
8551 slave->repldbfd = -1;
8552 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8553 slave->replstate = REDIS_REPL_ONLINE;
8554 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8555 sendReplyToClient, slave) == AE_ERR) {
8556 freeClient(slave);
8557 return;
8558 }
8559 addReplySds(slave,sdsempty());
8560 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8561 }
8562 }
8563
8564 /* This function is called at the end of every backgrond saving.
8565 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8566 * otherwise REDIS_ERR is passed to the function.
8567 *
8568 * The goal of this function is to handle slaves waiting for a successful
8569 * background saving in order to perform non-blocking synchronization. */
8570 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8571 listNode *ln;
8572 int startbgsave = 0;
8573 listIter li;
8574
8575 listRewind(server.slaves,&li);
8576 while((ln = listNext(&li))) {
8577 redisClient *slave = ln->value;
8578
8579 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8580 startbgsave = 1;
8581 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8582 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8583 struct redis_stat buf;
8584
8585 if (bgsaveerr != REDIS_OK) {
8586 freeClient(slave);
8587 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8588 continue;
8589 }
8590 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8591 redis_fstat(slave->repldbfd,&buf) == -1) {
8592 freeClient(slave);
8593 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8594 continue;
8595 }
8596 slave->repldboff = 0;
8597 slave->repldbsize = buf.st_size;
8598 slave->replstate = REDIS_REPL_SEND_BULK;
8599 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8600 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8601 freeClient(slave);
8602 continue;
8603 }
8604 }
8605 }
8606 if (startbgsave) {
8607 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8608 listIter li;
8609
8610 listRewind(server.slaves,&li);
8611 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8612 while((ln = listNext(&li))) {
8613 redisClient *slave = ln->value;
8614
8615 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8616 freeClient(slave);
8617 }
8618 }
8619 }
8620 }
8621
8622 static int syncWithMaster(void) {
8623 char buf[1024], tmpfile[256], authcmd[1024];
8624 long dumpsize;
8625 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8626 int dfd, maxtries = 5;
8627
8628 if (fd == -1) {
8629 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8630 strerror(errno));
8631 return REDIS_ERR;
8632 }
8633
8634 /* AUTH with the master if required. */
8635 if(server.masterauth) {
8636 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8637 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8638 close(fd);
8639 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8640 strerror(errno));
8641 return REDIS_ERR;
8642 }
8643 /* Read the AUTH result. */
8644 if (syncReadLine(fd,buf,1024,3600) == -1) {
8645 close(fd);
8646 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8647 strerror(errno));
8648 return REDIS_ERR;
8649 }
8650 if (buf[0] != '+') {
8651 close(fd);
8652 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8653 return REDIS_ERR;
8654 }
8655 }
8656
8657 /* Issue the SYNC command */
8658 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8659 close(fd);
8660 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8661 strerror(errno));
8662 return REDIS_ERR;
8663 }
8664 /* Read the bulk write count */
8665 if (syncReadLine(fd,buf,1024,3600) == -1) {
8666 close(fd);
8667 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8668 strerror(errno));
8669 return REDIS_ERR;
8670 }
8671 if (buf[0] != '$') {
8672 close(fd);
8673 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8674 return REDIS_ERR;
8675 }
8676 dumpsize = strtol(buf+1,NULL,10);
8677 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8678 /* Read the bulk write data on a temp file */
8679 while(maxtries--) {
8680 snprintf(tmpfile,256,
8681 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8682 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8683 if (dfd != -1) break;
8684 sleep(1);
8685 }
8686 if (dfd == -1) {
8687 close(fd);
8688 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8689 return REDIS_ERR;
8690 }
8691 while(dumpsize) {
8692 int nread, nwritten;
8693
8694 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8695 if (nread == -1) {
8696 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8697 strerror(errno));
8698 close(fd);
8699 close(dfd);
8700 return REDIS_ERR;
8701 }
8702 nwritten = write(dfd,buf,nread);
8703 if (nwritten == -1) {
8704 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8705 close(fd);
8706 close(dfd);
8707 return REDIS_ERR;
8708 }
8709 dumpsize -= nread;
8710 }
8711 close(dfd);
8712 if (rename(tmpfile,server.dbfilename) == -1) {
8713 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8714 unlink(tmpfile);
8715 close(fd);
8716 return REDIS_ERR;
8717 }
8718 emptyDb();
8719 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8720 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8721 close(fd);
8722 return REDIS_ERR;
8723 }
8724 server.master = createClient(fd);
8725 server.master->flags |= REDIS_MASTER;
8726 server.master->authenticated = 1;
8727 server.replstate = REDIS_REPL_CONNECTED;
8728 return REDIS_OK;
8729 }
8730
8731 static void slaveofCommand(redisClient *c) {
8732 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8733 !strcasecmp(c->argv[2]->ptr,"one")) {
8734 if (server.masterhost) {
8735 sdsfree(server.masterhost);
8736 server.masterhost = NULL;
8737 if (server.master) freeClient(server.master);
8738 server.replstate = REDIS_REPL_NONE;
8739 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8740 }
8741 } else {
8742 sdsfree(server.masterhost);
8743 server.masterhost = sdsdup(c->argv[1]->ptr);
8744 server.masterport = atoi(c->argv[2]->ptr);
8745 if (server.master) freeClient(server.master);
8746 server.replstate = REDIS_REPL_CONNECT;
8747 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8748 server.masterhost, server.masterport);
8749 }
8750 addReply(c,shared.ok);
8751 }
8752
8753 /* ============================ Maxmemory directive ======================== */
8754
8755 /* Try to free one object form the pre-allocated objects free list.
8756 * This is useful under low mem conditions as by default we take 1 million
8757 * free objects allocated. On success REDIS_OK is returned, otherwise
8758 * REDIS_ERR. */
8759 static int tryFreeOneObjectFromFreelist(void) {
8760 robj *o;
8761
8762 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8763 if (listLength(server.objfreelist)) {
8764 listNode *head = listFirst(server.objfreelist);
8765 o = listNodeValue(head);
8766 listDelNode(server.objfreelist,head);
8767 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8768 zfree(o);
8769 return REDIS_OK;
8770 } else {
8771 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8772 return REDIS_ERR;
8773 }
8774 }
8775
8776 /* This function gets called when 'maxmemory' is set on the config file to limit
8777 * the max memory used by the server, and we are out of memory.
8778 * This function will try to, in order:
8779 *
8780 * - Free objects from the free list
8781 * - Try to remove keys with an EXPIRE set
8782 *
8783 * It is not possible to free enough memory to reach used-memory < maxmemory
8784 * the server will start refusing commands that will enlarge even more the
8785 * memory usage.
8786 */
8787 static void freeMemoryIfNeeded(void) {
8788 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8789 int j, k, freed = 0;
8790
8791 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8792 for (j = 0; j < server.dbnum; j++) {
8793 int minttl = -1;
8794 robj *minkey = NULL;
8795 struct dictEntry *de;
8796
8797 if (dictSize(server.db[j].expires)) {
8798 freed = 1;
8799 /* From a sample of three keys drop the one nearest to
8800 * the natural expire */
8801 for (k = 0; k < 3; k++) {
8802 time_t t;
8803
8804 de = dictGetRandomKey(server.db[j].expires);
8805 t = (time_t) dictGetEntryVal(de);
8806 if (minttl == -1 || t < minttl) {
8807 minkey = dictGetEntryKey(de);
8808 minttl = t;
8809 }
8810 }
8811 dbDelete(server.db+j,minkey);
8812 }
8813 }
8814 if (!freed) return; /* nothing to free... */
8815 }
8816 }
8817
8818 /* ============================== Append Only file ========================== */
8819
8820 /* Called when the user switches from "appendonly yes" to "appendonly no"
8821 * at runtime using the CONFIG command. */
8822 static void stopAppendOnly(void) {
8823 flushAppendOnlyFile();
8824 aof_fsync(server.appendfd);
8825 close(server.appendfd);
8826
8827 server.appendfd = -1;
8828 server.appendseldb = -1;
8829 server.appendonly = 0;
8830 /* rewrite operation in progress? kill it, wait child exit */
8831 if (server.bgsavechildpid != -1) {
8832 int statloc;
8833
8834 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8835 wait3(&statloc,0,NULL);
8836 /* reset the buffer accumulating changes while the child saves */
8837 sdsfree(server.bgrewritebuf);
8838 server.bgrewritebuf = sdsempty();
8839 server.bgsavechildpid = -1;
8840 }
8841 }
8842
8843 /* Called when the user switches from "appendonly no" to "appendonly yes"
8844 * at runtime using the CONFIG command. */
8845 static int startAppendOnly(void) {
8846 server.appendonly = 1;
8847 server.lastfsync = time(NULL);
8848 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8849 if (server.appendfd == -1) {
8850 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8851 return REDIS_ERR;
8852 }
8853 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8854 server.appendonly = 0;
8855 close(server.appendfd);
8856 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8857 return REDIS_ERR;
8858 }
8859 return REDIS_OK;
8860 }
8861
8862 /* Write the append only file buffer on disk.
8863 *
8864 * Since we are required to write the AOF before replying to the client,
8865 * and the only way the client socket can get a write is entering when the
8866 * the event loop, we accumulate all the AOF writes in a memory
8867 * buffer and write it on disk using this function just before entering
8868 * the event loop again. */
8869 static void flushAppendOnlyFile(void) {
8870 time_t now;
8871 ssize_t nwritten;
8872
8873 if (sdslen(server.aofbuf) == 0) return;
8874
8875 /* We want to perform a single write. This should be guaranteed atomic
8876 * at least if the filesystem we are writing is a real physical one.
8877 * While this will save us against the server being killed I don't think
8878 * there is much to do about the whole server stopping for power problems
8879 * or alike */
8880 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8881 if (nwritten != (signed)sdslen(server.aofbuf)) {
8882 /* Ooops, we are in troubles. The best thing to do for now is
8883 * aborting instead of giving the illusion that everything is
8884 * working as expected. */
8885 if (nwritten == -1) {
8886 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8887 } else {
8888 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8889 }
8890 exit(1);
8891 }
8892 sdsfree(server.aofbuf);
8893 server.aofbuf = sdsempty();
8894
8895 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8896 * childs performing heavy I/O on disk. */
8897 if (server.no_appendfsync_on_rewrite &&
8898 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8899 return;
8900 /* Fsync if needed */
8901 now = time(NULL);
8902 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8903 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8904 now-server.lastfsync > 1))
8905 {
8906 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8907 * flushing metadata. */
8908 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8909 server.lastfsync = now;
8910 }
8911 }
8912
8913 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8914 int j;
8915 buf = sdscatprintf(buf,"*%d\r\n",argc);
8916 for (j = 0; j < argc; j++) {
8917 robj *o = getDecodedObject(argv[j]);
8918 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8919 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8920 buf = sdscatlen(buf,"\r\n",2);
8921 decrRefCount(o);
8922 }
8923 return buf;
8924 }
8925
8926 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8927 int argc = 3;
8928 long when;
8929 robj *argv[3];
8930
8931 /* Make sure we can use strtol */
8932 seconds = getDecodedObject(seconds);
8933 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8934 decrRefCount(seconds);
8935
8936 argv[0] = createStringObject("EXPIREAT",8);
8937 argv[1] = key;
8938 argv[2] = createObject(REDIS_STRING,
8939 sdscatprintf(sdsempty(),"%ld",when));
8940 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8941 decrRefCount(argv[0]);
8942 decrRefCount(argv[2]);
8943 return buf;
8944 }
8945
8946 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8947 sds buf = sdsempty();
8948 robj *tmpargv[3];
8949
8950 /* The DB this command was targetting is not the same as the last command
8951 * we appendend. To issue a SELECT command is needed. */
8952 if (dictid != server.appendseldb) {
8953 char seldb[64];
8954
8955 snprintf(seldb,sizeof(seldb),"%d",dictid);
8956 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8957 (unsigned long)strlen(seldb),seldb);
8958 server.appendseldb = dictid;
8959 }
8960
8961 if (cmd->proc == expireCommand) {
8962 /* Translate EXPIRE into EXPIREAT */
8963 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8964 } else if (cmd->proc == setexCommand) {
8965 /* Translate SETEX to SET and EXPIREAT */
8966 tmpargv[0] = createStringObject("SET",3);
8967 tmpargv[1] = argv[1];
8968 tmpargv[2] = argv[3];
8969 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8970 decrRefCount(tmpargv[0]);
8971 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8972 } else {
8973 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8974 }
8975
8976 /* Append to the AOF buffer. This will be flushed on disk just before
8977 * of re-entering the event loop, so before the client will get a
8978 * positive reply about the operation performed. */
8979 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8980
8981 /* If a background append only file rewriting is in progress we want to
8982 * accumulate the differences between the child DB and the current one
8983 * in a buffer, so that when the child process will do its work we
8984 * can append the differences to the new append only file. */
8985 if (server.bgrewritechildpid != -1)
8986 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8987
8988 sdsfree(buf);
8989 }
8990
8991 /* In Redis commands are always executed in the context of a client, so in
8992 * order to load the append only file we need to create a fake client. */
8993 static struct redisClient *createFakeClient(void) {
8994 struct redisClient *c = zmalloc(sizeof(*c));
8995
8996 selectDb(c,0);
8997 c->fd = -1;
8998 c->querybuf = sdsempty();
8999 c->argc = 0;
9000 c->argv = NULL;
9001 c->flags = 0;
9002 /* We set the fake client as a slave waiting for the synchronization
9003 * so that Redis will not try to send replies to this client. */
9004 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
9005 c->reply = listCreate();
9006 listSetFreeMethod(c->reply,decrRefCount);
9007 listSetDupMethod(c->reply,dupClientReplyValue);
9008 initClientMultiState(c);
9009 return c;
9010 }
9011
9012 static void freeFakeClient(struct redisClient *c) {
9013 sdsfree(c->querybuf);
9014 listRelease(c->reply);
9015 freeClientMultiState(c);
9016 zfree(c);
9017 }
9018
9019 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
9020 * error (the append only file is zero-length) REDIS_ERR is returned. On
9021 * fatal error an error message is logged and the program exists. */
9022 int loadAppendOnlyFile(char *filename) {
9023 struct redisClient *fakeClient;
9024 FILE *fp = fopen(filename,"r");
9025 struct redis_stat sb;
9026 int appendonly = server.appendonly;
9027
9028 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
9029 return REDIS_ERR;
9030
9031 if (fp == NULL) {
9032 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
9033 exit(1);
9034 }
9035
9036 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
9037 * to the same file we're about to read. */
9038 server.appendonly = 0;
9039
9040 fakeClient = createFakeClient();
9041 while(1) {
9042 int argc, j;
9043 unsigned long len;
9044 robj **argv;
9045 char buf[128];
9046 sds argsds;
9047 struct redisCommand *cmd;
9048 int force_swapout;
9049
9050 if (fgets(buf,sizeof(buf),fp) == NULL) {
9051 if (feof(fp))
9052 break;
9053 else
9054 goto readerr;
9055 }
9056 if (buf[0] != '*') goto fmterr;
9057 argc = atoi(buf+1);
9058 argv = zmalloc(sizeof(robj*)*argc);
9059 for (j = 0; j < argc; j++) {
9060 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
9061 if (buf[0] != '$') goto fmterr;
9062 len = strtol(buf+1,NULL,10);
9063 argsds = sdsnewlen(NULL,len);
9064 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
9065 argv[j] = createObject(REDIS_STRING,argsds);
9066 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
9067 }
9068
9069 /* Command lookup */
9070 cmd = lookupCommand(argv[0]->ptr);
9071 if (!cmd) {
9072 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
9073 exit(1);
9074 }
9075 /* Try object encoding */
9076 if (cmd->flags & REDIS_CMD_BULK)
9077 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
9078 /* Run the command in the context of a fake client */
9079 fakeClient->argc = argc;
9080 fakeClient->argv = argv;
9081 cmd->proc(fakeClient);
9082 /* Discard the reply objects list from the fake client */
9083 while(listLength(fakeClient->reply))
9084 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
9085 /* Clean up, ready for the next command */
9086 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
9087 zfree(argv);
9088 /* Handle swapping while loading big datasets when VM is on */
9089 force_swapout = 0;
9090 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
9091 force_swapout = 1;
9092
9093 if (server.vm_enabled && force_swapout) {
9094 while (zmalloc_used_memory() > server.vm_max_memory) {
9095 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
9096 }
9097 }
9098 }
9099
9100 /* This point can only be reached when EOF is reached without errors.
9101 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
9102 if (fakeClient->flags & REDIS_MULTI) goto readerr;
9103
9104 fclose(fp);
9105 freeFakeClient(fakeClient);
9106 server.appendonly = appendonly;
9107 return REDIS_OK;
9108
9109 readerr:
9110 if (feof(fp)) {
9111 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9112 } else {
9113 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9114 }
9115 exit(1);
9116 fmterr:
9117 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9118 exit(1);
9119 }
9120
/* Write binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen = 0;

    /* Build the "$<count>\r\n" header. */
    header[hlen++] = '$';
    hlen += ll2string(header+hlen,sizeof(header)-hlen,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';

    /* The payload write is skipped for empty strings. */
    return fwrite(header,hlen,1,fp) != 0 &&
           (len == 0 || fwrite(s,len,1,fp) != 0) &&
           fwrite("\r\n",2,1,fp) != 0;
}
9135
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 * The payload is the value formatted with "%.17g".
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    /* The advertised count does not include the trailing CRLF. */
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
9146
/* Write a long long value in bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if the write failed. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    /* Header and payload are assembled and written in a single call. */
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return fwrite(out,outlen,1,fp) != 0;
}
9156
9157 /* Delegate writing an object to writing a bulk string or bulk long long. */
9158 static int fwriteBulkObject(FILE *fp, robj *obj) {
9159 /* Avoid using getDecodedObject to help copy-on-write (we are often
9160 * in a child process when this function is called). */
9161 if (obj->encoding == REDIS_ENCODING_INT) {
9162 return fwriteBulkLongLong(fp,(long)obj->ptr);
9163 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9164 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9165 } else {
9166 redisPanic("Unknown string encoding");
9167 }
9168 }
9169
9170 /* Write a sequence of commands able to fully rebuild the dataset into
9171 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9172 static int rewriteAppendOnlyFile(char *filename) {
9173 dictIterator *di = NULL;
9174 dictEntry *de;
9175 FILE *fp;
9176 char tmpfile[256];
9177 int j;
9178 time_t now = time(NULL);
9179
9180 /* Note that we have to use a different temp name here compared to the
9181 * one used by rewriteAppendOnlyFileBackground() function. */
9182 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9183 fp = fopen(tmpfile,"w");
9184 if (!fp) {
9185 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9186 return REDIS_ERR;
9187 }
9188 for (j = 0; j < server.dbnum; j++) {
9189 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9190 redisDb *db = server.db+j;
9191 dict *d = db->dict;
9192 if (dictSize(d) == 0) continue;
9193 di = dictGetIterator(d);
9194 if (!di) {
9195 fclose(fp);
9196 return REDIS_ERR;
9197 }
9198
9199 /* SELECT the new DB */
9200 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9201 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9202
9203 /* Iterate this DB writing every entry */
9204 while((de = dictNext(di)) != NULL) {
9205 sds keystr = dictGetEntryKey(de);
9206 robj key, *o;
9207 time_t expiretime;
9208 int swapped;
9209
9210 keystr = dictGetEntryKey(de);
9211 o = dictGetEntryVal(de);
9212 initStaticStringObject(key,keystr);
9213 /* If the value for this key is swapped, load a preview in memory.
9214 * We use a "swapped" flag to remember if we need to free the
9215 * value object instead to just increment the ref count anyway
9216 * in order to avoid copy-on-write of pages if we are forked() */
9217 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9218 o->storage == REDIS_VM_SWAPPING) {
9219 swapped = 0;
9220 } else {
9221 o = vmPreviewObject(o);
9222 swapped = 1;
9223 }
9224 expiretime = getExpire(db,&key);
9225
9226 /* Save the key and associated value */
9227 if (o->type == REDIS_STRING) {
9228 /* Emit a SET command */
9229 char cmd[]="*3\r\n$3\r\nSET\r\n";
9230 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9231 /* Key and value */
9232 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9233 if (fwriteBulkObject(fp,o) == 0) goto werr;
9234 } else if (o->type == REDIS_LIST) {
9235 /* Emit the RPUSHes needed to rebuild the list */
9236 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9237 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9238 unsigned char *zl = o->ptr;
9239 unsigned char *p = ziplistIndex(zl,0);
9240 unsigned char *vstr;
9241 unsigned int vlen;
9242 long long vlong;
9243
9244 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9245 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9246 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9247 if (vstr) {
9248 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9249 goto werr;
9250 } else {
9251 if (fwriteBulkLongLong(fp,vlong) == 0)
9252 goto werr;
9253 }
9254 p = ziplistNext(zl,p);
9255 }
9256 } else if (o->encoding == REDIS_ENCODING_LIST) {
9257 list *list = o->ptr;
9258 listNode *ln;
9259 listIter li;
9260
9261 listRewind(list,&li);
9262 while((ln = listNext(&li))) {
9263 robj *eleobj = listNodeValue(ln);
9264
9265 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9266 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9267 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9268 }
9269 } else {
9270 redisPanic("Unknown list encoding");
9271 }
9272 } else if (o->type == REDIS_SET) {
9273 /* Emit the SADDs needed to rebuild the set */
9274 dict *set = o->ptr;
9275 dictIterator *di = dictGetIterator(set);
9276 dictEntry *de;
9277
9278 while((de = dictNext(di)) != NULL) {
9279 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9280 robj *eleobj = dictGetEntryKey(de);
9281
9282 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9283 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9284 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9285 }
9286 dictReleaseIterator(di);
9287 } else if (o->type == REDIS_ZSET) {
9288 /* Emit the ZADDs needed to rebuild the sorted set */
9289 zset *zs = o->ptr;
9290 dictIterator *di = dictGetIterator(zs->dict);
9291 dictEntry *de;
9292
9293 while((de = dictNext(di)) != NULL) {
9294 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9295 robj *eleobj = dictGetEntryKey(de);
9296 double *score = dictGetEntryVal(de);
9297
9298 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9299 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9300 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9301 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9302 }
9303 dictReleaseIterator(di);
9304 } else if (o->type == REDIS_HASH) {
9305 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9306
9307 /* Emit the HSETs needed to rebuild the hash */
9308 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9309 unsigned char *p = zipmapRewind(o->ptr);
9310 unsigned char *field, *val;
9311 unsigned int flen, vlen;
9312
9313 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9314 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9315 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9316 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9317 return -1;
9318 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9319 return -1;
9320 }
9321 } else {
9322 dictIterator *di = dictGetIterator(o->ptr);
9323 dictEntry *de;
9324
9325 while((de = dictNext(di)) != NULL) {
9326 robj *field = dictGetEntryKey(de);
9327 robj *val = dictGetEntryVal(de);
9328
9329 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9330 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9331 if (fwriteBulkObject(fp,field) == -1) return -1;
9332 if (fwriteBulkObject(fp,val) == -1) return -1;
9333 }
9334 dictReleaseIterator(di);
9335 }
9336 } else {
9337 redisPanic("Unknown object type");
9338 }
9339 /* Save the expire time */
9340 if (expiretime != -1) {
9341 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9342 /* If this key is already expired skip it */
9343 if (expiretime < now) continue;
9344 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9345 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9346 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9347 }
9348 if (swapped) decrRefCount(o);
9349 }
9350 dictReleaseIterator(di);
9351 }
9352
9353 /* Make sure data will not remain on the OS's output buffers */
9354 fflush(fp);
9355 aof_fsync(fileno(fp));
9356 fclose(fp);
9357
9358 /* Use RENAME to make sure the DB file is changed atomically only
9359 * if the generate DB file is ok. */
9360 if (rename(tmpfile,filename) == -1) {
9361 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9362 unlink(tmpfile);
9363 return REDIS_ERR;
9364 }
9365 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9366 return REDIS_OK;
9367
9368 werr:
9369 fclose(fp);
9370 unlink(tmpfile);
9371 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9372 if (di) dictReleaseIterator(di);
9373 return REDIS_ERR;
9374 }
9375
9376 /* This is how rewriting of the append only file in background works:
9377 *
9378 * 1) The user calls BGREWRITEAOF
9379 * 2) Redis calls this function, that forks():
9380 * 2a) the child rewrite the append only file in a temp file.
9381 * 2b) the parent accumulates differences in server.bgrewritebuf.
9382 * 3) When the child finished '2a' exists.
9383 * 4) The parent will trap the exit code, if it's OK, will append the
9384 * data accumulated into server.bgrewritebuf into the temp file, and
9385 * finally will rename(2) the temp file in the actual file name.
9386 * The the new file is reopened as the new append only file. Profit!
9387 */
9388 static int rewriteAppendOnlyFileBackground(void) {
9389 pid_t childpid;
9390
9391 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9392 if (server.vm_enabled) waitEmptyIOJobsQueue();
9393 if ((childpid = fork()) == 0) {
9394 /* Child */
9395 char tmpfile[256];
9396
9397 if (server.vm_enabled) vmReopenSwapFile();
9398 close(server.fd);
9399 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9400 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9401 _exit(0);
9402 } else {
9403 _exit(1);
9404 }
9405 } else {
9406 /* Parent */
9407 if (childpid == -1) {
9408 redisLog(REDIS_WARNING,
9409 "Can't rewrite append only file in background: fork: %s",
9410 strerror(errno));
9411 return REDIS_ERR;
9412 }
9413 redisLog(REDIS_NOTICE,
9414 "Background append only file rewriting started by pid %d",childpid);
9415 server.bgrewritechildpid = childpid;
9416 updateDictResizePolicy();
9417 /* We set appendseldb to -1 in order to force the next call to the
9418 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9419 * accumulated by the parent into server.bgrewritebuf will start
9420 * with a SELECT statement and it will be safe to merge. */
9421 server.appendseldb = -1;
9422 return REDIS_OK;
9423 }
9424 return REDIS_OK; /* unreached */
9425 }
9426
9427 static void bgrewriteaofCommand(redisClient *c) {
9428 if (server.bgrewritechildpid != -1) {
9429 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9430 return;
9431 }
9432 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9433 char *status = "+Background append only file rewriting started\r\n";
9434 addReplySds(c,sdsnew(status));
9435 } else {
9436 addReply(c,shared.err);
9437 }
9438 }
9439
/* Remove the temp file a background AOF rewrite child with pid 'childpid'
 * writes to. The result of unlink(2) is ignored, so a missing file is
 * not an error. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
9446
9447 /* Virtual Memory is composed mainly of two subsystems:
9448 * - Blocking Virtual Memory
9449 * - Threaded Virtual Memory I/O
9450 * The two parts are not fully decoupled, but functions are split among two
9451 * different sections of the source code (delimited by comments) in order to
9452 * make more clear what functionality is about the blocking VM and what about
9453 * the threaded (not blocking) VM.
9454 *
9455 * Redis VM design:
9456 *
9457 * Redis VM is a blocking VM (one that blocks reading swapped values from
9458 * disk into memory when a value swapped out is needed in memory) that is made
9459 * unblocking by trying to examine the command argument vector in order to
9460 * load in background values that will likely be needed in order to exec
9461 * the command. The command is executed only once all the relevant keys
9462 * are loaded into memory.
9463 *
9464 * This is basically almost as simple as a blocking VM, but almost as parallel
9465 * as a fully non-blocking VM.
9466 */
9467
9468 /* =================== Virtual Memory - Blocking Side ====================== */
9469
9470 /* Create a VM pointer object. This kind of objects are used in place of
9471 * values in the key -> value hash table, for swapped out objects. */
9472 static vmpointer *createVmPointer(int vtype) {
9473 vmpointer *vp = zmalloc(sizeof(vmpointer));
9474
9475 vp->type = REDIS_VMPOINTER;
9476 vp->storage = REDIS_VM_SWAPPED;
9477 vp->vtype = vtype;
9478 return vp;
9479 }
9480
9481 static void vmInit(void) {
9482 off_t totsize;
9483 int pipefds[2];
9484 size_t stacksize;
9485 struct flock fl;
9486
9487 if (server.vm_max_threads != 0)
9488 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9489
9490 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9491 /* Try to open the old swap file, otherwise create it */
9492 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9493 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9494 }
9495 if (server.vm_fp == NULL) {
9496 redisLog(REDIS_WARNING,
9497 "Can't open the swap file: %s. Exiting.",
9498 strerror(errno));
9499 exit(1);
9500 }
9501 server.vm_fd = fileno(server.vm_fp);
9502 /* Lock the swap file for writing, this is useful in order to avoid
9503 * another instance to use the same swap file for a config error. */
9504 fl.l_type = F_WRLCK;
9505 fl.l_whence = SEEK_SET;
9506 fl.l_start = fl.l_len = 0;
9507 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9508 redisLog(REDIS_WARNING,
9509 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9510 exit(1);
9511 }
9512 /* Initialize */
9513 server.vm_next_page = 0;
9514 server.vm_near_pages = 0;
9515 server.vm_stats_used_pages = 0;
9516 server.vm_stats_swapped_objects = 0;
9517 server.vm_stats_swapouts = 0;
9518 server.vm_stats_swapins = 0;
9519 totsize = server.vm_pages*server.vm_page_size;
9520 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9521 if (ftruncate(server.vm_fd,totsize) == -1) {
9522 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9523 strerror(errno));
9524 exit(1);
9525 } else {
9526 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9527 }
9528 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9529 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9530 (long long) (server.vm_pages+7)/8, server.vm_pages);
9531 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9532
9533 /* Initialize threaded I/O (used by Virtual Memory) */
9534 server.io_newjobs = listCreate();
9535 server.io_processing = listCreate();
9536 server.io_processed = listCreate();
9537 server.io_ready_clients = listCreate();
9538 pthread_mutex_init(&server.io_mutex,NULL);
9539 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9540 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9541 server.io_active_threads = 0;
9542 if (pipe(pipefds) == -1) {
9543 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9544 ,strerror(errno));
9545 exit(1);
9546 }
9547 server.io_ready_pipe_read = pipefds[0];
9548 server.io_ready_pipe_write = pipefds[1];
9549 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9550 /* LZF requires a lot of stack */
9551 pthread_attr_init(&server.io_threads_attr);
9552 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9553 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9554 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9555 /* Listen for events in the threaded I/O pipe */
9556 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9557 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9558 oom("creating file event");
9559 }
9560
9561 /* Mark the page as used */
9562 static void vmMarkPageUsed(off_t page) {
9563 off_t byte = page/8;
9564 int bit = page&7;
9565 redisAssert(vmFreePage(page) == 1);
9566 server.vm_bitmap[byte] |= 1<<bit;
9567 }
9568
9569 /* Mark N contiguous pages as used, with 'page' being the first. */
9570 static void vmMarkPagesUsed(off_t page, off_t count) {
9571 off_t j;
9572
9573 for (j = 0; j < count; j++)
9574 vmMarkPageUsed(page+j);
9575 server.vm_stats_used_pages += count;
9576 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9577 (long long)count, (long long)page);
9578 }
9579
9580 /* Mark the page as free */
9581 static void vmMarkPageFree(off_t page) {
9582 off_t byte = page/8;
9583 int bit = page&7;
9584 redisAssert(vmFreePage(page) == 0);
9585 server.vm_bitmap[byte] &= ~(1<<bit);
9586 }
9587
9588 /* Mark N contiguous pages as free, with 'page' being the first. */
9589 static void vmMarkPagesFree(off_t page, off_t count) {
9590 off_t j;
9591
9592 for (j = 0; j < count; j++)
9593 vmMarkPageFree(page+j);
9594 server.vm_stats_used_pages -= count;
9595 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9596 (long long)count, (long long)page);
9597 }
9598
9599 /* Test if the page is free */
9600 static int vmFreePage(off_t page) {
9601 off_t byte = page/8;
9602 int bit = page&7;
9603 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9604 }
9605
9606 /* Find N contiguous free pages storing the first page of the cluster in *first.
9607 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9608 * REDIS_ERR is returned.
9609 *
9610 * This function uses a simple algorithm: we try to allocate
9611 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9612 * again from the start of the swap file searching for free spaces.
9613 *
9614 * If it looks pretty clear that there are no free pages near our offset
9615 * we try to find less populated places doing a forward jump of
9616 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9617 * without hurry, and then we jump again and so forth...
9618 *
9619 * This function can be improved using a free list to avoid to guess
9620 * too much, since we could collect data about freed pages.
9621 *
9622 * note: I implemented this function just after watching an episode of
9623 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9624 */
9625 static int vmFindContiguousPages(off_t *first, off_t n) {
9626 off_t base, offset = 0, since_jump = 0, numfree = 0;
9627
9628 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9629 server.vm_near_pages = 0;
9630 server.vm_next_page = 0;
9631 }
9632 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9633 base = server.vm_next_page;
9634
9635 while(offset < server.vm_pages) {
9636 off_t this = base+offset;
9637
9638 /* If we overflow, restart from page zero */
9639 if (this >= server.vm_pages) {
9640 this -= server.vm_pages;
9641 if (this == 0) {
9642 /* Just overflowed, what we found on tail is no longer
9643 * interesting, as it's no longer contiguous. */
9644 numfree = 0;
9645 }
9646 }
9647 if (vmFreePage(this)) {
9648 /* This is a free page */
9649 numfree++;
9650 /* Already got N free pages? Return to the caller, with success */
9651 if (numfree == n) {
9652 *first = this-(n-1);
9653 server.vm_next_page = this+1;
9654 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9655 return REDIS_OK;
9656 }
9657 } else {
9658 /* The current one is not a free page */
9659 numfree = 0;
9660 }
9661
9662 /* Fast-forward if the current page is not free and we already
9663 * searched enough near this place. */
9664 since_jump++;
9665 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9666 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9667 since_jump = 0;
9668 /* Note that even if we rewind after the jump, we are don't need
9669 * to make sure numfree is set to zero as we only jump *if* it
9670 * is set to zero. */
9671 } else {
9672 /* Otherwise just check the next page */
9673 offset++;
9674 }
9675 }
9676 return REDIS_ERR;
9677 }
9678
9679 /* Write the specified object at the specified page of the swap file */
9680 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9681 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9682 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9683 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9684 redisLog(REDIS_WARNING,
9685 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9686 strerror(errno));
9687 return REDIS_ERR;
9688 }
9689 rdbSaveObject(server.vm_fp,o);
9690 fflush(server.vm_fp);
9691 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9692 return REDIS_OK;
9693 }
9694
9695 /* Transfers the 'val' object to disk. Store all the information
9696 * a 'vmpointer' object containing all the information needed to load the
9697 * object back later is returned.
9698 *
9699 * If we can't find enough contiguous empty pages to swap the object on disk
9700 * NULL is returned. */
9701 static vmpointer *vmSwapObjectBlocking(robj *val) {
9702 off_t pages = rdbSavedObjectPages(val,NULL);
9703 off_t page;
9704 vmpointer *vp;
9705
9706 assert(val->storage == REDIS_VM_MEMORY);
9707 assert(val->refcount == 1);
9708 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9709 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9710
9711 vp = createVmPointer(val->type);
9712 vp->page = page;
9713 vp->usedpages = pages;
9714 decrRefCount(val); /* Deallocate the object from memory. */
9715 vmMarkPagesUsed(page,pages);
9716 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9717 (void*) val,
9718 (unsigned long long) page, (unsigned long long) pages);
9719 server.vm_stats_swapped_objects++;
9720 server.vm_stats_swapouts++;
9721 return vp;
9722 }
9723
9724 static robj *vmReadObjectFromSwap(off_t page, int type) {
9725 robj *o;
9726
9727 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9728 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9729 redisLog(REDIS_WARNING,
9730 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9731 strerror(errno));
9732 _exit(1);
9733 }
9734 o = rdbLoadObject(type,server.vm_fp);
9735 if (o == NULL) {
9736 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9737 _exit(1);
9738 }
9739 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9740 return o;
9741 }
9742
9743 /* Load the specified object from swap to memory.
9744 * The newly allocated object is returned.
9745 *
9746 * If preview is true the unserialized object is returned to the caller but
9747 * the pages are not marked as freed, nor the vp object is freed. */
9748 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9749 robj *val;
9750
9751 redisAssert(vp->type == REDIS_VMPOINTER &&
9752 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9753 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9754 if (!preview) {
9755 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9756 vmMarkPagesFree(vp->page,vp->usedpages);
9757 zfree(vp);
9758 server.vm_stats_swapped_objects--;
9759 } else {
9760 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9761 }
9762 server.vm_stats_swapins++;
9763 return val;
9764 }
9765
9766 /* Plain object loading, from swap to memory.
9767 *
9768 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9769 * The return value is the loaded object. */
9770 static robj *vmLoadObject(robj *o) {
9771 /* If we are loading the object in background, stop it, we
9772 * need to load this object synchronously ASAP. */
9773 if (o->storage == REDIS_VM_LOADING)
9774 vmCancelThreadedIOJob(o);
9775 return vmGenericLoadObject((vmpointer*)o,0);
9776 }
9777
9778 /* Just load the value on disk, without to modify the key.
9779 * This is useful when we want to perform some operation on the value
9780 * without to really bring it from swap to memory, like while saving the
9781 * dataset or rewriting the append only log. */
9782 static robj *vmPreviewObject(robj *o) {
9783 return vmGenericLoadObject((vmpointer*)o,1);
9784 }
9785
9786 /* How a good candidate is this object for swapping?
9787 * The better candidate it is, the greater the returned value.
9788 *
9789 * Currently we try to perform a fast estimation of the object size in
9790 * memory, and combine it with aging informations.
9791 *
9792 * Basically swappability = idle-time * log(estimated size)
9793 *
9794 * Bigger objects are preferred over smaller objects, but not
9795 * proportionally, this is why we use the logarithm. This algorithm is
9796 * just a first try and will probably be tuned later. */
9797 static double computeObjectSwappability(robj *o) {
9798 /* actual age can be >= minage, but not < minage. As we use wrapping
9799 * 21 bit clocks with minutes resolution for the LRU. */
9800 time_t minage = abs(server.lruclock - o->lru);
9801 long asize = 0;
9802 list *l;
9803 dict *d;
9804 struct dictEntry *de;
9805 int z;
9806
9807 if (minage <= 0) return 0;
9808 switch(o->type) {
9809 case REDIS_STRING:
9810 if (o->encoding != REDIS_ENCODING_RAW) {
9811 asize = sizeof(*o);
9812 } else {
9813 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9814 }
9815 break;
9816 case REDIS_LIST:
9817 l = o->ptr;
9818 listNode *ln = listFirst(l);
9819
9820 asize = sizeof(list);
9821 if (ln) {
9822 robj *ele = ln->value;
9823 long elesize;
9824
9825 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9826 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9827 asize += (sizeof(listNode)+elesize)*listLength(l);
9828 }
9829 break;
9830 case REDIS_SET:
9831 case REDIS_ZSET:
9832 z = (o->type == REDIS_ZSET);
9833 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9834
9835 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9836 if (z) asize += sizeof(zset)-sizeof(dict);
9837 if (dictSize(d)) {
9838 long elesize;
9839 robj *ele;
9840
9841 de = dictGetRandomKey(d);
9842 ele = dictGetEntryKey(de);
9843 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9844 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9845 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9846 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9847 }
9848 break;
9849 case REDIS_HASH:
9850 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9851 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9852 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9853 unsigned int klen, vlen;
9854 unsigned char *key, *val;
9855
9856 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9857 klen = 0;
9858 vlen = 0;
9859 }
9860 asize = len*(klen+vlen+3);
9861 } else if (o->encoding == REDIS_ENCODING_HT) {
9862 d = o->ptr;
9863 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9864 if (dictSize(d)) {
9865 long elesize;
9866 robj *ele;
9867
9868 de = dictGetRandomKey(d);
9869 ele = dictGetEntryKey(de);
9870 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9871 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9872 ele = dictGetEntryVal(de);
9873 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9874 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9875 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9876 }
9877 }
9878 break;
9879 }
9880 return (double)minage*log(1+asize);
9881 }
9882
9883 /* Try to swap an object that's a good candidate for swapping.
9884 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9885 * to swap any object at all.
9886 *
9887 * If 'usethreaded' is true, Redis will try to swap the object in background
9888 * using I/O threads. */
9889 static int vmSwapOneObject(int usethreads) {
9890 int j, i;
9891 struct dictEntry *best = NULL;
9892 double best_swappability = 0;
9893 redisDb *best_db = NULL;
9894 robj *val;
9895 sds key;
9896
9897 for (j = 0; j < server.dbnum; j++) {
9898 redisDb *db = server.db+j;
9899 /* Why maxtries is set to 100?
9900 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9901 * are swappable objects */
9902 int maxtries = 100;
9903
9904 if (dictSize(db->dict) == 0) continue;
9905 for (i = 0; i < 5; i++) {
9906 dictEntry *de;
9907 double swappability;
9908
9909 if (maxtries) maxtries--;
9910 de = dictGetRandomKey(db->dict);
9911 val = dictGetEntryVal(de);
9912 /* Only swap objects that are currently in memory.
9913 *
9914 * Also don't swap shared objects: not a good idea in general and
9915 * we need to ensure that the main thread does not touch the
9916 * object while the I/O thread is using it, but we can't
9917 * control other keys without adding additional mutex. */
9918 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9919 if (maxtries) i--; /* don't count this try */
9920 continue;
9921 }
9922 swappability = computeObjectSwappability(val);
9923 if (!best || swappability > best_swappability) {
9924 best = de;
9925 best_swappability = swappability;
9926 best_db = db;
9927 }
9928 }
9929 }
9930 if (best == NULL) return REDIS_ERR;
9931 key = dictGetEntryKey(best);
9932 val = dictGetEntryVal(best);
9933
9934 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9935 key, best_swappability);
9936
9937 /* Swap it */
9938 if (usethreads) {
9939 robj *keyobj = createStringObject(key,sdslen(key));
9940 vmSwapObjectThreaded(keyobj,val,best_db);
9941 decrRefCount(keyobj);
9942 return REDIS_OK;
9943 } else {
9944 vmpointer *vp;
9945
9946 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9947 dictGetEntryVal(best) = vp;
9948 return REDIS_OK;
9949 } else {
9950 return REDIS_ERR;
9951 }
9952 }
9953 }
9954
/* Swap one object out synchronously. See vmSwapOneObject() for the return
 * value. Declared with (void): an empty parameter list is not a prototype
 * before C23. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9958
/* Queue one object for swapping by the I/O threads. See vmSwapOneObject()
 * for the return value. Declared with (void): an empty parameter list is
 * not a prototype before C23. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9962
9963 /* Return true if it's safe to swap out objects in a given moment.
9964 * Basically we don't want to swap objects out while there is a BGSAVE
9965 * or a BGAEOREWRITE running in backgroud. */
9966 static int vmCanSwapOut(void) {
9967 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9968 }
9969
9970 /* =================== Virtual Memory - Threaded I/O ======================= */
9971
9972 static void freeIOJob(iojob *j) {
9973 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9974 j->type == REDIS_IOJOB_DO_SWAP ||
9975 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9976 {
9977 /* we fix the storage type, otherwise decrRefCount() will try to
9978 * kill the I/O thread Job (that does no longer exists). */
9979 if (j->val->storage == REDIS_VM_SWAPPING)
9980 j->val->storage = REDIS_VM_MEMORY;
9981 decrRefCount(j->val);
9982 }
9983 decrRefCount(j->key);
9984 zfree(j);
9985 }
9986
/* File event handler for the read side of the threaded I/O pipe.
 *
 * Every time a thread finishes a job, it writes a byte into the write side
 * of a unix pipe in order to "awake" the main thread, and this function
 * is called. For each byte read, one job is taken from the head of
 * server.io_processed and post-processed according to its type:
 *
 * LOAD:         the loaded value replaces the VM pointer in the dictionary,
 *               the swap pages are freed and clients blocked on the key
 *               are handled.
 * PREPARE_SWAP: pages are reserved and the job is queued again as DO_SWAP,
 *               or the swap is undone if swapping is not allowed or there
 *               is no room.
 * DO_SWAP:      a VM pointer replaces the value in the dictionary and the
 *               in-memory value is released.
 *
 * Canceled jobs are just freed. At most a percentage
 * (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs found in the processed
 * list at the first iteration is handled per call, with a minimum of one.
 *
 * 'el', 'privdata' and 'mask' are not used. */
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
            int mask)
{
    char buf[1];
    int retval, processed = 0, toprocess = -1, trytoswap = 1;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    /* For every byte we read in the read side of the pipe, there is one
     * I/O job completed to process. */
    while((retval = read(fd,buf,1)) == 1) {
        iojob *j;
        listNode *ln;
        struct dictEntry *de;

        redisLog(REDIS_DEBUG,"Processing I/O completed job");

        /* Get the processed element (the oldest one) */
        lockThreadedIO();
        assert(listLength(server.io_processed) != 0);
        /* The per-call quota is computed only once, at the first job. */
        if (toprocess == -1) {
            toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
            if (toprocess <= 0) toprocess = 1;
        }
        ln = listFirst(server.io_processed);
        j = ln->value;
        listDelNode(server.io_processed,ln);
        unlockThreadedIO();
        /* If this job is marked as canceled, just ignore it.
         * Note that canceled jobs do not count against the quota. */
        if (j->canceled) {
            freeIOJob(j);
            continue;
        }
        /* Post process it in the main thread, as there are things we
         * can do just here to avoid race conditions and/or invasive locks */
        redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
        de = dictFind(j->db->dict,j->key->ptr);
        redisAssert(de != NULL);
        if (j->type == REDIS_IOJOB_LOAD) {
            redisDb *db;
            vmpointer *vp = dictGetEntryVal(de);

            /* Key loaded, bring it at home */
            vmMarkPagesFree(vp->page,vp->usedpages);
            redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
                (unsigned char*) j->key->ptr);
            server.vm_stats_swapped_objects--;
            server.vm_stats_swapins++;
            dictGetEntryVal(de) = j->val;
            /* The dictionary now holds its own reference: freeIOJob() below
             * drops the one owned by the job. */
            incrRefCount(j->val);
            db = j->db;
            /* Handle clients waiting for this key to be loaded. */
            handleClientsBlockedOnSwappedKey(db,j->key);
            freeIOJob(j);
            zfree(vp);
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
            /* Now we know the amount of pages required to swap this object.
             * Let's find some space for it, and queue this task again
             * rebranded as REDIS_IOJOB_DO_SWAP. */
            if (!vmCanSwapOut() ||
                vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
            {
                /* Ooops... no space or we can't swap as there is
                 * a fork()ed Redis trying to save stuff on disk. */
                j->val->storage = REDIS_VM_MEMORY; /* undo operation */
                freeIOJob(j);
            } else {
                /* Note that we need to mark these pages as used now,
                 * if the job will be canceled, we'll mark them as freed
                 * again. */
                vmMarkPagesUsed(j->page,j->pages);
                j->type = REDIS_IOJOB_DO_SWAP;
                lockThreadedIO();
                queueIOJob(j);
                unlockThreadedIO();
            }
        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
            vmpointer *vp;

            /* Key swapped. We can finally free some memory. */
            if (j->val->storage != REDIS_VM_SWAPPING) {
                /* Unexpected state: dump some debugging information on
                 * standard output before the assertion below fires. Note
                 * that this 'vp' shadows the outer one. */
                vmpointer *vp = (vmpointer*) j->id;
                printf("storage: %d\n",vp->storage);
                printf("key->name: %s\n",(char*)j->key->ptr);
                printf("val: %p\n",(void*)j->val);
                printf("val->type: %d\n",j->val->type);
                printf("val->ptr: %s\n",(char*)j->val->ptr);
            }
            redisAssert(j->val->storage == REDIS_VM_SWAPPING);
            vp = createVmPointer(j->val->type);
            vp->page = j->page;
            vp->usedpages = j->pages;
            dictGetEntryVal(de) = vp;
            /* Fix the storage otherwise decrRefCount will attempt to
             * remove the associated I/O job */
            j->val->storage = REDIS_VM_MEMORY;
            decrRefCount(j->val);
            redisLog(REDIS_DEBUG,
                "VM: object %s swapped out at %lld (%lld pages) (threaded)",
                (unsigned char*) j->key->ptr,
                (unsigned long long) j->page, (unsigned long long) j->pages);
            server.vm_stats_swapped_objects++;
            server.vm_stats_swapouts++;
            freeIOJob(j);
            /* Put a few more swap requests in queue if we are still
             * out of memory */
            if (trytoswap && vmCanSwapOut() &&
                zmalloc_used_memory() > server.vm_max_memory)
            {
                int more = 1;
                while(more) {
                    /* Keep queueing while there are fewer new jobs than
                     * the configured number of I/O threads. */
                    lockThreadedIO();
                    more = listLength(server.io_newjobs) <
                            (unsigned) server.vm_max_threads;
                    unlockThreadedIO();
                    /* Don't waste CPU time if swappable objects are rare. */
                    if (vmSwapOneObjectThreaded() == REDIS_ERR) {
                        trytoswap = 0;
                        break;
                    }
                }
            }
        }
        processed++;
        /* Quota reached: return to the caller, the remaining bytes stay in
         * the pipe. */
        if (processed == toprocess) return;
    }
    /* The loop ends when read(2) does not return one byte: EAGAIN is not
     * reported, any other error is logged. */
    if (retval < 0 && errno != EAGAIN) {
        redisLog(REDIS_WARNING,
            "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
            strerror(errno));
    }
}
10123
10124 static void lockThreadedIO(void) {
10125 pthread_mutex_lock(&server.io_mutex);
10126 }
10127
10128 static void unlockThreadedIO(void) {
10129 pthread_mutex_unlock(&server.io_mutex);
10130 }
10131
/* Cancel the threaded I/O job associated with the object 'o'.
 *
 * The three job queues are searched for a non-canceled job whose id is 'o'.
 * What happens next depends on the queue the job is found in:
 *
 * io_newjobs:    the job is freed and removed from the queue.
 * io_processing: an I/O thread is working on it, so the lock is released
 *                and the whole search is retried after a short sleep,
 *                until the job moves to the processed queue.
 * io_processed:  the job is flagged as canceled and will be discarded when
 *                completed jobs are processed.
 *
 * Finally the storage type of 'o' is reverted (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY). 'o' must be in the LOADING or SWAPPING state, and a
 * matching job must exist: not finding one is a fatal error. */
static void vmCancelThreadedIOJob(robj *o) {
    list *lists[3] = {
        server.io_newjobs,      /* 0 */
        server.io_processing,   /* 1 */
        server.io_processed     /* 2 */
    };
    int i;

    assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
again:
    lockThreadedIO();
    /* Search for a matching object in one of the queues */
    for (i = 0; i < 3; i++) {
        listNode *ln;
        listIter li;

        listRewind(lists[i],&li);
        while ((ln = listNext(&li)) != NULL) {
            iojob *job = ln->value;

            if (job->canceled) continue; /* Skip this, already canceled. */
            if (job->id == o) {
                redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
                    (void*)job, (char*)job->key->ptr, job->type, i);
                /* Mark the pages as free since the swap didn't happen
                 * or happened but is now discarded. This is skipped for
                 * jobs in the processing queue, as we are going to retry. */
                if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
                    vmMarkPagesFree(job->page,job->pages);
                /* Cancel the job. It depends on the list the job is
                 * living in. */
                switch(i) {
                case 0: /* io_newjobs */
                    /* If the job was not yet processed the best thing to do
                     * is to remove it from the queue altogether */
                    freeIOJob(job);
                    listDelNode(lists[i],ln);
                    break;
                case 1: /* io_processing */
                    /* Oh Shi- the thread is messing with the Job:
                     *
                     * Probably it's accessing the object if this is a
                     * PREPARE_SWAP or DO_SWAP job.
                     * If it's a LOAD job it may be reading from disk and
                     * if we don't wait for the job to terminate before to
                     * cancel it, maybe in a few microseconds data can be
                     * corrupted in these pages. So the short story is:
                     *
                     * Better to wait for the job to move into the
                     * next queue (processed)... */

                    /* We try again and again until the job is completed. */
                    unlockThreadedIO();
                    /* But let's wait some time for the I/O thread
                     * to finish with this job. After all this condition
                     * should be very rare. */
                    usleep(1);
                    goto again;
                case 2: /* io_processed */
                    /* The job was already processed, that's easy...
                     * just mark it as canceled so that we'll ignore it
                     * when processing completed jobs. */
                    job->canceled = 1;
                    break;
                }
                /* Finally we have to adjust the storage type of the object
                 * in order to "UNDO" the operation. */
                if (o->storage == REDIS_VM_LOADING)
                    o->storage = REDIS_VM_SWAPPED;
                else if (o->storage == REDIS_VM_SWAPPING)
                    o->storage = REDIS_VM_MEMORY;
                unlockThreadedIO();
                redisLog(REDIS_DEBUG,"*** DONE");
                return;
            }
        }
    }
    unlockThreadedIO();
    /* No job found for this object: print the pointer on standard output
     * and fail the assertion. */
    printf("Not found: %p\n", (void*)o);
    redisAssert(1 != 1); /* We should never reach this */
}
10214
10215 static void *IOThreadEntryPoint(void *arg) {
10216 iojob *j;
10217 listNode *ln;
10218 REDIS_NOTUSED(arg);
10219
10220 pthread_detach(pthread_self());
10221 while(1) {
10222 /* Get a new job to process */
10223 lockThreadedIO();
10224 if (listLength(server.io_newjobs) == 0) {
10225 /* No new jobs in queue, exit. */
10226 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10227 (long) pthread_self());
10228 server.io_active_threads--;
10229 unlockThreadedIO();
10230 return NULL;
10231 }
10232 ln = listFirst(server.io_newjobs);
10233 j = ln->value;
10234 listDelNode(server.io_newjobs,ln);
10235 /* Add the job in the processing queue */
10236 j->thread = pthread_self();
10237 listAddNodeTail(server.io_processing,j);
10238 ln = listLast(server.io_processing); /* We use ln later to remove it */
10239 unlockThreadedIO();
10240 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10241 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
10242
10243 /* Process the Job */
10244 if (j->type == REDIS_IOJOB_LOAD) {
10245 vmpointer *vp = (vmpointer*)j->id;
10246 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
10247 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10248 FILE *fp = fopen("/dev/null","w+");
10249 j->pages = rdbSavedObjectPages(j->val,fp);
10250 fclose(fp);
10251 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10252 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10253 j->canceled = 1;
10254 }
10255
10256 /* Done: insert the job into the processed queue */
10257 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10258 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
10259 lockThreadedIO();
10260 listDelNode(server.io_processing,ln);
10261 listAddNodeTail(server.io_processed,j);
10262 unlockThreadedIO();
10263
10264 /* Signal the main thread there is new stuff to process */
10265 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10266 }
10267 return NULL; /* never reached */
10268 }
10269
10270 static void spawnIOThread(void) {
10271 pthread_t thread;
10272 sigset_t mask, omask;
10273 int err;
10274
10275 sigemptyset(&mask);
10276 sigaddset(&mask,SIGCHLD);
10277 sigaddset(&mask,SIGHUP);
10278 sigaddset(&mask,SIGPIPE);
10279 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10280 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10281 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10282 strerror(err));
10283 usleep(1000000);
10284 }
10285 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10286 server.io_active_threads++;
10287 }
10288
10289 /* We need to wait for the last thread to exit before we are able to
10290 * fork() in order to BGSAVE or BGREWRITEAOF. */
10291 static void waitEmptyIOJobsQueue(void) {
10292 while(1) {
10293 int io_processed_len;
10294
10295 lockThreadedIO();
10296 if (listLength(server.io_newjobs) == 0 &&
10297 listLength(server.io_processing) == 0 &&
10298 server.io_active_threads == 0)
10299 {
10300 unlockThreadedIO();
10301 return;
10302 }
10303 /* While waiting for empty jobs queue condition we post-process some
10304 * finshed job, as I/O threads may be hanging trying to write against
10305 * the io_ready_pipe_write FD but there are so much pending jobs that
10306 * it's blocking. */
10307 io_processed_len = listLength(server.io_processed);
10308 unlockThreadedIO();
10309 if (io_processed_len) {
10310 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10311 usleep(1000); /* 1 millisecond */
10312 } else {
10313 usleep(10000); /* 10 milliseconds */
10314 }
10315 }
10316 }
10317
10318 static void vmReopenSwapFile(void) {
10319 /* Note: we don't close the old one as we are in the child process
10320 * and don't want to mess at all with the original file object. */
10321 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10322 if (server.vm_fp == NULL) {
10323 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10324 server.vm_swap_file);
10325 _exit(1);
10326 }
10327 server.vm_fd = fileno(server.vm_fp);
10328 }
10329
10330 /* This function must be called while with threaded IO locked */
10331 static void queueIOJob(iojob *j) {
10332 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10333 (void*)j, j->type, (char*)j->key->ptr);
10334 listAddNodeTail(server.io_newjobs,j);
10335 if (server.io_active_threads < server.vm_max_threads)
10336 spawnIOThread();
10337 }
10338
10339 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10340 iojob *j;
10341
10342 j = zmalloc(sizeof(*j));
10343 j->type = REDIS_IOJOB_PREPARE_SWAP;
10344 j->db = db;
10345 j->key = key;
10346 incrRefCount(key);
10347 j->id = j->val = val;
10348 incrRefCount(val);
10349 j->canceled = 0;
10350 j->thread = (pthread_t) -1;
10351 val->storage = REDIS_VM_SWAPPING;
10352
10353 lockThreadedIO();
10354 queueIOJob(j);
10355 unlockThreadedIO();
10356 return REDIS_OK;
10357 }
10358
10359 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10360
10361 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10362 * If there is not already a job loading the key, it is craeted.
10363 * The key is added to the io_keys list in the client structure, and also
10364 * in the hash table mapping swapped keys to waiting clients, that is,
10365 * server.io_waited_keys. */
10366 static int waitForSwappedKey(redisClient *c, robj *key) {
10367 struct dictEntry *de;
10368 robj *o;
10369 list *l;
10370
10371 /* If the key does not exist or is already in RAM we don't need to
10372 * block the client at all. */
10373 de = dictFind(c->db->dict,key->ptr);
10374 if (de == NULL) return 0;
10375 o = dictGetEntryVal(de);
10376 if (o->storage == REDIS_VM_MEMORY) {
10377 return 0;
10378 } else if (o->storage == REDIS_VM_SWAPPING) {
10379 /* We were swapping the key, undo it! */
10380 vmCancelThreadedIOJob(o);
10381 return 0;
10382 }
10383
10384 /* OK: the key is either swapped, or being loaded just now. */
10385
10386 /* Add the key to the list of keys this client is waiting for.
10387 * This maps clients to keys they are waiting for. */
10388 listAddNodeTail(c->io_keys,key);
10389 incrRefCount(key);
10390
10391 /* Add the client to the swapped keys => clients waiting map. */
10392 de = dictFind(c->db->io_keys,key);
10393 if (de == NULL) {
10394 int retval;
10395
10396 /* For every key we take a list of clients blocked for it */
10397 l = listCreate();
10398 retval = dictAdd(c->db->io_keys,key,l);
10399 incrRefCount(key);
10400 assert(retval == DICT_OK);
10401 } else {
10402 l = dictGetEntryVal(de);
10403 }
10404 listAddNodeTail(l,c);
10405
10406 /* Are we already loading the key from disk? If not create a job */
10407 if (o->storage == REDIS_VM_SWAPPED) {
10408 iojob *j;
10409 vmpointer *vp = (vmpointer*)o;
10410
10411 o->storage = REDIS_VM_LOADING;
10412 j = zmalloc(sizeof(*j));
10413 j->type = REDIS_IOJOB_LOAD;
10414 j->db = c->db;
10415 j->id = (robj*)vp;
10416 j->key = key;
10417 incrRefCount(key);
10418 j->page = vp->page;
10419 j->val = NULL;
10420 j->canceled = 0;
10421 j->thread = (pthread_t) -1;
10422 lockThreadedIO();
10423 queueIOJob(j);
10424 unlockThreadedIO();
10425 }
10426 return 1;
10427 }
10428
10429 /* Preload keys for any command with first, last and step values for
10430 * the command keys prototype, as defined in the command table. */
10431 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10432 int j, last;
10433 if (cmd->vm_firstkey == 0) return;
10434 last = cmd->vm_lastkey;
10435 if (last < 0) last = argc+last;
10436 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10437 redisAssert(j < argc);
10438 waitForSwappedKey(c,argv[j]);
10439 }
10440 }
10441
10442 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10443 * Note that the number of keys to preload is user-defined, so we need to
10444 * apply a sanity check against argc. */
10445 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10446 int i, num;
10447 REDIS_NOTUSED(cmd);
10448
10449 num = atoi(argv[2]->ptr);
10450 if (num > (argc-3)) return;
10451 for (i = 0; i < num; i++) {
10452 waitForSwappedKey(c,argv[3+i]);
10453 }
10454 }
10455
10456 /* Preload keys needed to execute the entire MULTI/EXEC block.
10457 *
10458 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10459 * and will block the client when any command requires a swapped out value. */
10460 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10461 int i, margc;
10462 struct redisCommand *mcmd;
10463 robj **margv;
10464 REDIS_NOTUSED(cmd);
10465 REDIS_NOTUSED(argc);
10466 REDIS_NOTUSED(argv);
10467
10468 if (!(c->flags & REDIS_MULTI)) return;
10469 for (i = 0; i < c->mstate.count; i++) {
10470 mcmd = c->mstate.commands[i].cmd;
10471 margc = c->mstate.commands[i].argc;
10472 margv = c->mstate.commands[i].argv;
10473
10474 if (mcmd->vm_preload_proc != NULL) {
10475 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10476 } else {
10477 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10478 }
10479 }
10480 }
10481
10482 /* Is this client attempting to run a command against swapped keys?
10483 * If so, block it ASAP, load the keys in background, then resume it.
10484 *
10485 * The important idea about this function is that it can fail! If keys will
10486 * still be swapped when the client is resumed, this key lookups will
10487 * just block loading keys from disk. In practical terms this should only
10488 * happen with SORT BY command or if there is a bug in this function.
10489 *
10490 * Return 1 if the client is marked as blocked, 0 if the client can
10491 * continue as the keys it is going to access appear to be in memory. */
10492 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10493 if (cmd->vm_preload_proc != NULL) {
10494 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10495 } else {
10496 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10497 }
10498
10499 /* If the client was blocked for at least one key, mark it as blocked. */
10500 if (listLength(c->io_keys)) {
10501 c->flags |= REDIS_IO_WAIT;
10502 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10503 server.vm_blocked_clients++;
10504 return 1;
10505 } else {
10506 return 0;
10507 }
10508 }
10509
10510 /* Remove the 'key' from the list of blocked keys for a given client.
10511 *
10512 * The function returns 1 when there are no longer blocking keys after
10513 * the current one was removed (and the client can be unblocked). */
10514 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10515 list *l;
10516 listNode *ln;
10517 listIter li;
10518 struct dictEntry *de;
10519
10520 /* Remove the key from the list of keys this client is waiting for. */
10521 listRewind(c->io_keys,&li);
10522 while ((ln = listNext(&li)) != NULL) {
10523 if (equalStringObjects(ln->value,key)) {
10524 listDelNode(c->io_keys,ln);
10525 break;
10526 }
10527 }
10528 assert(ln != NULL);
10529
10530 /* Remove the client form the key => waiting clients map. */
10531 de = dictFind(c->db->io_keys,key);
10532 assert(de != NULL);
10533 l = dictGetEntryVal(de);
10534 ln = listSearchKey(l,c);
10535 assert(ln != NULL);
10536 listDelNode(l,ln);
10537 if (listLength(l) == 0)
10538 dictDelete(c->db->io_keys,key);
10539
10540 return listLength(c->io_keys) == 0;
10541 }
10542
10543 /* Every time we now a key was loaded back in memory, we handle clients
10544 * waiting for this key if any. */
10545 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10546 struct dictEntry *de;
10547 list *l;
10548 listNode *ln;
10549 int len;
10550
10551 de = dictFind(db->io_keys,key);
10552 if (!de) return;
10553
10554 l = dictGetEntryVal(de);
10555 len = listLength(l);
10556 /* Note: we can't use something like while(listLength(l)) as the list
10557 * can be freed by the calling function when we remove the last element. */
10558 while (len--) {
10559 ln = listFirst(l);
10560 redisClient *c = ln->value;
10561
10562 if (dontWaitForSwappedKey(c,key)) {
10563 /* Put the client in the list of clients ready to go as we
10564 * loaded all the keys about it. */
10565 listAddNodeTail(server.io_ready_clients,c);
10566 }
10567 }
10568 }
10569
10570 /* =========================== Remote Configuration ========================= */
10571
10572 static void configSetCommand(redisClient *c) {
10573 robj *o = getDecodedObject(c->argv[3]);
10574 long long ll;
10575
10576 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10577 zfree(server.dbfilename);
10578 server.dbfilename = zstrdup(o->ptr);
10579 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10580 zfree(server.requirepass);
10581 server.requirepass = zstrdup(o->ptr);
10582 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10583 zfree(server.masterauth);
10584 server.masterauth = zstrdup(o->ptr);
10585 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10586 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10587 ll < 0) goto badfmt;
10588 server.maxmemory = ll;
10589 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10590 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10591 ll < 0 || ll > LONG_MAX) goto badfmt;
10592 server.maxidletime = ll;
10593 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10594 if (!strcasecmp(o->ptr,"no")) {
10595 server.appendfsync = APPENDFSYNC_NO;
10596 } else if (!strcasecmp(o->ptr,"everysec")) {
10597 server.appendfsync = APPENDFSYNC_EVERYSEC;
10598 } else if (!strcasecmp(o->ptr,"always")) {
10599 server.appendfsync = APPENDFSYNC_ALWAYS;
10600 } else {
10601 goto badfmt;
10602 }
10603 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10604 int yn = yesnotoi(o->ptr);
10605
10606 if (yn == -1) goto badfmt;
10607 server.no_appendfsync_on_rewrite = yn;
10608 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10609 int old = server.appendonly;
10610 int new = yesnotoi(o->ptr);
10611
10612 if (new == -1) goto badfmt;
10613 if (old != new) {
10614 if (new == 0) {
10615 stopAppendOnly();
10616 } else {
10617 if (startAppendOnly() == REDIS_ERR) {
10618 addReplySds(c,sdscatprintf(sdsempty(),
10619 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10620 decrRefCount(o);
10621 return;
10622 }
10623 }
10624 }
10625 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10626 int vlen, j;
10627 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10628
10629 /* Perform sanity check before setting the new config:
10630 * - Even number of args
10631 * - Seconds >= 1, changes >= 0 */
10632 if (vlen & 1) {
10633 sdsfreesplitres(v,vlen);
10634 goto badfmt;
10635 }
10636 for (j = 0; j < vlen; j++) {
10637 char *eptr;
10638 long val;
10639
10640 val = strtoll(v[j], &eptr, 10);
10641 if (eptr[0] != '\0' ||
10642 ((j & 1) == 0 && val < 1) ||
10643 ((j & 1) == 1 && val < 0)) {
10644 sdsfreesplitres(v,vlen);
10645 goto badfmt;
10646 }
10647 }
10648 /* Finally set the new config */
10649 resetServerSaveParams();
10650 for (j = 0; j < vlen; j += 2) {
10651 time_t seconds;
10652 int changes;
10653
10654 seconds = strtoll(v[j],NULL,10);
10655 changes = strtoll(v[j+1],NULL,10);
10656 appendServerSaveParams(seconds, changes);
10657 }
10658 sdsfreesplitres(v,vlen);
10659 } else {
10660 addReplySds(c,sdscatprintf(sdsempty(),
10661 "-ERR not supported CONFIG parameter %s\r\n",
10662 (char*)c->argv[2]->ptr));
10663 decrRefCount(o);
10664 return;
10665 }
10666 decrRefCount(o);
10667 addReply(c,shared.ok);
10668 return;
10669
10670 badfmt: /* Bad format errors */
10671 addReplySds(c,sdscatprintf(sdsempty(),
10672 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10673 (char*)o->ptr,
10674 (char*)c->argv[2]->ptr));
10675 decrRefCount(o);
10676 }
10677
10678 static void configGetCommand(redisClient *c) {
10679 robj *o = getDecodedObject(c->argv[2]);
10680 robj *lenobj = createObject(REDIS_STRING,NULL);
10681 char *pattern = o->ptr;
10682 int matches = 0;
10683
10684 addReply(c,lenobj);
10685 decrRefCount(lenobj);
10686
10687 if (stringmatch(pattern,"dbfilename",0)) {
10688 addReplyBulkCString(c,"dbfilename");
10689 addReplyBulkCString(c,server.dbfilename);
10690 matches++;
10691 }
10692 if (stringmatch(pattern,"requirepass",0)) {
10693 addReplyBulkCString(c,"requirepass");
10694 addReplyBulkCString(c,server.requirepass);
10695 matches++;
10696 }
10697 if (stringmatch(pattern,"masterauth",0)) {
10698 addReplyBulkCString(c,"masterauth");
10699 addReplyBulkCString(c,server.masterauth);
10700 matches++;
10701 }
10702 if (stringmatch(pattern,"maxmemory",0)) {
10703 char buf[128];
10704
10705 ll2string(buf,128,server.maxmemory);
10706 addReplyBulkCString(c,"maxmemory");
10707 addReplyBulkCString(c,buf);
10708 matches++;
10709 }
10710 if (stringmatch(pattern,"timeout",0)) {
10711 char buf[128];
10712
10713 ll2string(buf,128,server.maxidletime);
10714 addReplyBulkCString(c,"timeout");
10715 addReplyBulkCString(c,buf);
10716 matches++;
10717 }
10718 if (stringmatch(pattern,"appendonly",0)) {
10719 addReplyBulkCString(c,"appendonly");
10720 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10721 matches++;
10722 }
10723 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10724 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10725 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10726 matches++;
10727 }
10728 if (stringmatch(pattern,"appendfsync",0)) {
10729 char *policy;
10730
10731 switch(server.appendfsync) {
10732 case APPENDFSYNC_NO: policy = "no"; break;
10733 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10734 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10735 default: policy = "unknown"; break; /* too harmless to panic */
10736 }
10737 addReplyBulkCString(c,"appendfsync");
10738 addReplyBulkCString(c,policy);
10739 matches++;
10740 }
10741 if (stringmatch(pattern,"save",0)) {
10742 sds buf = sdsempty();
10743 int j;
10744
10745 for (j = 0; j < server.saveparamslen; j++) {
10746 buf = sdscatprintf(buf,"%ld %d",
10747 server.saveparams[j].seconds,
10748 server.saveparams[j].changes);
10749 if (j != server.saveparamslen-1)
10750 buf = sdscatlen(buf," ",1);
10751 }
10752 addReplyBulkCString(c,"save");
10753 addReplyBulkCString(c,buf);
10754 sdsfree(buf);
10755 matches++;
10756 }
10757 decrRefCount(o);
10758 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10759 }
10760
10761 static void configCommand(redisClient *c) {
10762 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10763 if (c->argc != 4) goto badarity;
10764 configSetCommand(c);
10765 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10766 if (c->argc != 3) goto badarity;
10767 configGetCommand(c);
10768 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10769 if (c->argc != 2) goto badarity;
10770 server.stat_numcommands = 0;
10771 server.stat_numconnections = 0;
10772 server.stat_expiredkeys = 0;
10773 server.stat_starttime = time(NULL);
10774 addReply(c,shared.ok);
10775 } else {
10776 addReplySds(c,sdscatprintf(sdsempty(),
10777 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10778 }
10779 return;
10780
10781 badarity:
10782 addReplySds(c,sdscatprintf(sdsempty(),
10783 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10784 (char*) c->argv[1]->ptr));
10785 }
10786
10787 /* =========================== Pubsub implementation ======================== */
10788
10789 static void freePubsubPattern(void *p) {
10790 pubsubPattern *pat = p;
10791
10792 decrRefCount(pat->pattern);
10793 zfree(pat);
10794 }
10795
10796 static int listMatchPubsubPattern(void *a, void *b) {
10797 pubsubPattern *pa = a, *pb = b;
10798
10799 return (pa->client == pb->client) &&
10800 (equalStringObjects(pa->pattern,pb->pattern));
10801 }
10802
10803 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10804 * 0 if the client was already subscribed to that channel. */
10805 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10806 struct dictEntry *de;
10807 list *clients = NULL;
10808 int retval = 0;
10809
10810 /* Add the channel to the client -> channels hash table */
10811 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10812 retval = 1;
10813 incrRefCount(channel);
10814 /* Add the client to the channel -> list of clients hash table */
10815 de = dictFind(server.pubsub_channels,channel);
10816 if (de == NULL) {
10817 clients = listCreate();
10818 dictAdd(server.pubsub_channels,channel,clients);
10819 incrRefCount(channel);
10820 } else {
10821 clients = dictGetEntryVal(de);
10822 }
10823 listAddNodeTail(clients,c);
10824 }
10825 /* Notify the client */
10826 addReply(c,shared.mbulk3);
10827 addReply(c,shared.subscribebulk);
10828 addReplyBulk(c,channel);
10829 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10830 return retval;
10831 }
10832
10833 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10834 * 0 if the client was not subscribed to the specified channel. */
10835 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10836 struct dictEntry *de;
10837 list *clients;
10838 listNode *ln;
10839 int retval = 0;
10840
10841 /* Remove the channel from the client -> channels hash table */
10842 incrRefCount(channel); /* channel may be just a pointer to the same object
10843 we have in the hash tables. Protect it... */
10844 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10845 retval = 1;
10846 /* Remove the client from the channel -> clients list hash table */
10847 de = dictFind(server.pubsub_channels,channel);
10848 assert(de != NULL);
10849 clients = dictGetEntryVal(de);
10850 ln = listSearchKey(clients,c);
10851 assert(ln != NULL);
10852 listDelNode(clients,ln);
10853 if (listLength(clients) == 0) {
10854 /* Free the list and associated hash entry at all if this was
10855 * the latest client, so that it will be possible to abuse
10856 * Redis PUBSUB creating millions of channels. */
10857 dictDelete(server.pubsub_channels,channel);
10858 }
10859 }
10860 /* Notify the client */
10861 if (notify) {
10862 addReply(c,shared.mbulk3);
10863 addReply(c,shared.unsubscribebulk);
10864 addReplyBulk(c,channel);
10865 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10866 listLength(c->pubsub_patterns));
10867
10868 }
10869 decrRefCount(channel); /* it is finally safe to release it */
10870 return retval;
10871 }
10872
10873 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10874 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10875 int retval = 0;
10876
10877 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10878 retval = 1;
10879 pubsubPattern *pat;
10880 listAddNodeTail(c->pubsub_patterns,pattern);
10881 incrRefCount(pattern);
10882 pat = zmalloc(sizeof(*pat));
10883 pat->pattern = getDecodedObject(pattern);
10884 pat->client = c;
10885 listAddNodeTail(server.pubsub_patterns,pat);
10886 }
10887 /* Notify the client */
10888 addReply(c,shared.mbulk3);
10889 addReply(c,shared.psubscribebulk);
10890 addReplyBulk(c,pattern);
10891 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10892 return retval;
10893 }
10894
10895 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10896 * 0 if the client was not subscribed to the specified channel. */
10897 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10898 listNode *ln;
10899 pubsubPattern pat;
10900 int retval = 0;
10901
10902 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10903 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10904 retval = 1;
10905 listDelNode(c->pubsub_patterns,ln);
10906 pat.client = c;
10907 pat.pattern = pattern;
10908 ln = listSearchKey(server.pubsub_patterns,&pat);
10909 listDelNode(server.pubsub_patterns,ln);
10910 }
10911 /* Notify the client */
10912 if (notify) {
10913 addReply(c,shared.mbulk3);
10914 addReply(c,shared.punsubscribebulk);
10915 addReplyBulk(c,pattern);
10916 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10917 listLength(c->pubsub_patterns));
10918 }
10919 decrRefCount(pattern);
10920 return retval;
10921 }
10922
10923 /* Unsubscribe from all the channels. Return the number of channels the
10924 * client was subscribed from. */
10925 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10926 dictIterator *di = dictGetIterator(c->pubsub_channels);
10927 dictEntry *de;
10928 int count = 0;
10929
10930 while((de = dictNext(di)) != NULL) {
10931 robj *channel = dictGetEntryKey(de);
10932
10933 count += pubsubUnsubscribeChannel(c,channel,notify);
10934 }
10935 dictReleaseIterator(di);
10936 return count;
10937 }
10938
10939 /* Unsubscribe from all the patterns. Return the number of patterns the
10940 * client was subscribed from. */
10941 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10942 listNode *ln;
10943 listIter li;
10944 int count = 0;
10945
10946 listRewind(c->pubsub_patterns,&li);
10947 while ((ln = listNext(&li)) != NULL) {
10948 robj *pattern = ln->value;
10949
10950 count += pubsubUnsubscribePattern(c,pattern,notify);
10951 }
10952 return count;
10953 }
10954
10955 /* Publish a message */
10956 static int pubsubPublishMessage(robj *channel, robj *message) {
10957 int receivers = 0;
10958 struct dictEntry *de;
10959 listNode *ln;
10960 listIter li;
10961
10962 /* Send to clients listening for that channel */
10963 de = dictFind(server.pubsub_channels,channel);
10964 if (de) {
10965 list *list = dictGetEntryVal(de);
10966 listNode *ln;
10967 listIter li;
10968
10969 listRewind(list,&li);
10970 while ((ln = listNext(&li)) != NULL) {
10971 redisClient *c = ln->value;
10972
10973 addReply(c,shared.mbulk3);
10974 addReply(c,shared.messagebulk);
10975 addReplyBulk(c,channel);
10976 addReplyBulk(c,message);
10977 receivers++;
10978 }
10979 }
10980 /* Send to clients listening to matching channels */
10981 if (listLength(server.pubsub_patterns)) {
10982 listRewind(server.pubsub_patterns,&li);
10983 channel = getDecodedObject(channel);
10984 while ((ln = listNext(&li)) != NULL) {
10985 pubsubPattern *pat = ln->value;
10986
10987 if (stringmatchlen((char*)pat->pattern->ptr,
10988 sdslen(pat->pattern->ptr),
10989 (char*)channel->ptr,
10990 sdslen(channel->ptr),0)) {
10991 addReply(pat->client,shared.mbulk4);
10992 addReply(pat->client,shared.pmessagebulk);
10993 addReplyBulk(pat->client,pat->pattern);
10994 addReplyBulk(pat->client,channel);
10995 addReplyBulk(pat->client,message);
10996 receivers++;
10997 }
10998 }
10999 decrRefCount(channel);
11000 }
11001 return receivers;
11002 }
11003
11004 static void subscribeCommand(redisClient *c) {
11005 int j;
11006
11007 for (j = 1; j < c->argc; j++)
11008 pubsubSubscribeChannel(c,c->argv[j]);
11009 }
11010
11011 static void unsubscribeCommand(redisClient *c) {
11012 if (c->argc == 1) {
11013 pubsubUnsubscribeAllChannels(c,1);
11014 return;
11015 } else {
11016 int j;
11017
11018 for (j = 1; j < c->argc; j++)
11019 pubsubUnsubscribeChannel(c,c->argv[j],1);
11020 }
11021 }
11022
11023 static void psubscribeCommand(redisClient *c) {
11024 int j;
11025
11026 for (j = 1; j < c->argc; j++)
11027 pubsubSubscribePattern(c,c->argv[j]);
11028 }
11029
11030 static void punsubscribeCommand(redisClient *c) {
11031 if (c->argc == 1) {
11032 pubsubUnsubscribeAllPatterns(c,1);
11033 return;
11034 } else {
11035 int j;
11036
11037 for (j = 1; j < c->argc; j++)
11038 pubsubUnsubscribePattern(c,c->argv[j],1);
11039 }
11040 }
11041
11042 static void publishCommand(redisClient *c) {
11043 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
11044 addReplyLongLong(c,receivers);
11045 }
11046
11047 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
11048 *
11049 * The implementation uses a per-DB hash table mapping keys to list of clients
11050 * WATCHing those keys, so that given a key that is going to be modified
11051 * we can mark all the associated clients as dirty.
11052 *
11053 * Also every client contains a list of WATCHed keys so that's possible to
11054 * un-watch such keys when the client is freed or when UNWATCH is called. */
11055
11056 /* In the client->watched_keys list we need to use watchedKey structures
11057 * as in order to identify a key in Redis we need both the key name and the
11058 * DB */
11059 typedef struct watchedKey {
11060 robj *key;
11061 redisDb *db;
11062 } watchedKey;
11063
11064 /* Watch for the specified key */
11065 static void watchForKey(redisClient *c, robj *key) {
11066 list *clients = NULL;
11067 listIter li;
11068 listNode *ln;
11069 watchedKey *wk;
11070
11071 /* Check if we are already watching for this key */
11072 listRewind(c->watched_keys,&li);
11073 while((ln = listNext(&li))) {
11074 wk = listNodeValue(ln);
11075 if (wk->db == c->db && equalStringObjects(key,wk->key))
11076 return; /* Key already watched */
11077 }
11078 /* This key is not already watched in this DB. Let's add it */
11079 clients = dictFetchValue(c->db->watched_keys,key);
11080 if (!clients) {
11081 clients = listCreate();
11082 dictAdd(c->db->watched_keys,key,clients);
11083 incrRefCount(key);
11084 }
11085 listAddNodeTail(clients,c);
11086 /* Add the new key to the lits of keys watched by this client */
11087 wk = zmalloc(sizeof(*wk));
11088 wk->key = key;
11089 wk->db = c->db;
11090 incrRefCount(key);
11091 listAddNodeTail(c->watched_keys,wk);
11092 }
11093
11094 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
11095 * flag is up to the caller. */
11096 static void unwatchAllKeys(redisClient *c) {
11097 listIter li;
11098 listNode *ln;
11099
11100 if (listLength(c->watched_keys) == 0) return;
11101 listRewind(c->watched_keys,&li);
11102 while((ln = listNext(&li))) {
11103 list *clients;
11104 watchedKey *wk;
11105
11106 /* Lookup the watched key -> clients list and remove the client
11107 * from the list */
11108 wk = listNodeValue(ln);
11109 clients = dictFetchValue(wk->db->watched_keys, wk->key);
11110 assert(clients != NULL);
11111 listDelNode(clients,listSearchKey(clients,c));
11112 /* Kill the entry at all if this was the only client */
11113 if (listLength(clients) == 0)
11114 dictDelete(wk->db->watched_keys, wk->key);
11115 /* Remove this watched key from the client->watched list */
11116 listDelNode(c->watched_keys,ln);
11117 decrRefCount(wk->key);
11118 zfree(wk);
11119 }
11120 }
11121
11122 /* "Touch" a key, so that if this key is being WATCHed by some client the
11123 * next EXEC will fail. */
11124 static void touchWatchedKey(redisDb *db, robj *key) {
11125 list *clients;
11126 listIter li;
11127 listNode *ln;
11128
11129 if (dictSize(db->watched_keys) == 0) return;
11130 clients = dictFetchValue(db->watched_keys, key);
11131 if (!clients) return;
11132
11133 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11134 /* Check if we are already watching for this key */
11135 listRewind(clients,&li);
11136 while((ln = listNext(&li))) {
11137 redisClient *c = listNodeValue(ln);
11138
11139 c->flags |= REDIS_DIRTY_CAS;
11140 }
11141 }
11142
11143 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11144 * flush but will be deleted as effect of the flushing operation should
11145 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11146 * a FLUSHALL operation (all the DBs flushed). */
11147 static void touchWatchedKeysOnFlush(int dbid) {
11148 listIter li1, li2;
11149 listNode *ln;
11150
11151 /* For every client, check all the waited keys */
11152 listRewind(server.clients,&li1);
11153 while((ln = listNext(&li1))) {
11154 redisClient *c = listNodeValue(ln);
11155 listRewind(c->watched_keys,&li2);
11156 while((ln = listNext(&li2))) {
11157 watchedKey *wk = listNodeValue(ln);
11158
11159 /* For every watched key matching the specified DB, if the
11160 * key exists, mark the client as dirty, as the key will be
11161 * removed. */
11162 if (dbid == -1 || wk->db->id == dbid) {
11163 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
11164 c->flags |= REDIS_DIRTY_CAS;
11165 }
11166 }
11167 }
11168 }
11169
11170 static void watchCommand(redisClient *c) {
11171 int j;
11172
11173 if (c->flags & REDIS_MULTI) {
11174 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11175 return;
11176 }
11177 for (j = 1; j < c->argc; j++)
11178 watchForKey(c,c->argv[j]);
11179 addReply(c,shared.ok);
11180 }
11181
11182 static void unwatchCommand(redisClient *c) {
11183 unwatchAllKeys(c);
11184 c->flags &= (~REDIS_DIRTY_CAS);
11185 addReply(c,shared.ok);
11186 }
11187
11188 /* ================================= Debugging ============================== */
11189
11190 /* Compute the sha1 of string at 's' with 'len' bytes long.
11191 * The SHA1 is then xored againt the string pointed by digest.
11192 * Since xor is commutative, this operation is used in order to
11193 * "add" digests relative to unordered elements.
11194 *
11195 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11196 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11197 SHA1_CTX ctx;
11198 unsigned char hash[20], *s = ptr;
11199 int j;
11200
11201 SHA1Init(&ctx);
11202 SHA1Update(&ctx,s,len);
11203 SHA1Final(hash,&ctx);
11204
11205 for (j = 0; j < 20; j++)
11206 digest[j] ^= hash[j];
11207 }
11208
11209 static void xorObjectDigest(unsigned char *digest, robj *o) {
11210 o = getDecodedObject(o);
11211 xorDigest(digest,o->ptr,sdslen(o->ptr));
11212 decrRefCount(o);
11213 }
11214
11215 /* This function instead of just computing the SHA1 and xoring it
11216 * against diget, also perform the digest of "digest" itself and
11217 * replace the old value with the new one.
11218 *
11219 * So the final digest will be:
11220 *
11221 * digest = SHA1(digest xor SHA1(data))
11222 *
11223 * This function is used every time we want to preserve the order so
11224 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11225 *
11226 * Also note that mixdigest("foo") followed by mixdigest("bar")
11227 * will lead to a different digest compared to "fo", "obar".
11228 */
11229 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11230 SHA1_CTX ctx;
11231 char *s = ptr;
11232
11233 xorDigest(digest,s,len);
11234 SHA1Init(&ctx);
11235 SHA1Update(&ctx,digest,20);
11236 SHA1Final(digest,&ctx);
11237 }
11238
11239 static void mixObjectDigest(unsigned char *digest, robj *o) {
11240 o = getDecodedObject(o);
11241 mixDigest(digest,o->ptr,sdslen(o->ptr));
11242 decrRefCount(o);
11243 }
11244
11245 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11246 * are not ordered, we use a trick: every aggregate digest is the xor
11247 * of the digests of their elements. This way the order will not change
11248 * the result. For list instead we use a feedback entering the output digest
11249 * as input in order to ensure that a different ordered list will result in
11250 * a different digest. */
11251 static void computeDatasetDigest(unsigned char *final) {
11252 unsigned char digest[20];
11253 char buf[128];
11254 dictIterator *di = NULL;
11255 dictEntry *de;
11256 int j;
11257 uint32_t aux;
11258
11259 memset(final,0,20); /* Start with a clean result */
11260
11261 for (j = 0; j < server.dbnum; j++) {
11262 redisDb *db = server.db+j;
11263
11264 if (dictSize(db->dict) == 0) continue;
11265 di = dictGetIterator(db->dict);
11266
11267 /* hash the DB id, so the same dataset moved in a different
11268 * DB will lead to a different digest */
11269 aux = htonl(j);
11270 mixDigest(final,&aux,sizeof(aux));
11271
11272 /* Iterate this DB writing every entry */
11273 while((de = dictNext(di)) != NULL) {
11274 sds key;
11275 robj *keyobj, *o;
11276 time_t expiretime;
11277
11278 memset(digest,0,20); /* This key-val digest */
11279 key = dictGetEntryKey(de);
11280 keyobj = createStringObject(key,sdslen(key));
11281
11282 mixDigest(digest,key,sdslen(key));
11283
11284 /* Make sure the key is loaded if VM is active */
11285 o = lookupKeyRead(db,keyobj);
11286
11287 aux = htonl(o->type);
11288 mixDigest(digest,&aux,sizeof(aux));
11289 expiretime = getExpire(db,keyobj);
11290
11291 /* Save the key and associated value */
11292 if (o->type == REDIS_STRING) {
11293 mixObjectDigest(digest,o);
11294 } else if (o->type == REDIS_LIST) {
11295 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11296 listTypeEntry entry;
11297 while(listTypeNext(li,&entry)) {
11298 robj *eleobj = listTypeGet(&entry);
11299 mixObjectDigest(digest,eleobj);
11300 decrRefCount(eleobj);
11301 }
11302 listTypeReleaseIterator(li);
11303 } else if (o->type == REDIS_SET) {
11304 dict *set = o->ptr;
11305 dictIterator *di = dictGetIterator(set);
11306 dictEntry *de;
11307
11308 while((de = dictNext(di)) != NULL) {
11309 robj *eleobj = dictGetEntryKey(de);
11310
11311 xorObjectDigest(digest,eleobj);
11312 }
11313 dictReleaseIterator(di);
11314 } else if (o->type == REDIS_ZSET) {
11315 zset *zs = o->ptr;
11316 dictIterator *di = dictGetIterator(zs->dict);
11317 dictEntry *de;
11318
11319 while((de = dictNext(di)) != NULL) {
11320 robj *eleobj = dictGetEntryKey(de);
11321 double *score = dictGetEntryVal(de);
11322 unsigned char eledigest[20];
11323
11324 snprintf(buf,sizeof(buf),"%.17g",*score);
11325 memset(eledigest,0,20);
11326 mixObjectDigest(eledigest,eleobj);
11327 mixDigest(eledigest,buf,strlen(buf));
11328 xorDigest(digest,eledigest,20);
11329 }
11330 dictReleaseIterator(di);
11331 } else if (o->type == REDIS_HASH) {
11332 hashTypeIterator *hi;
11333 robj *obj;
11334
11335 hi = hashTypeInitIterator(o);
11336 while (hashTypeNext(hi) != REDIS_ERR) {
11337 unsigned char eledigest[20];
11338
11339 memset(eledigest,0,20);
11340 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
11341 mixObjectDigest(eledigest,obj);
11342 decrRefCount(obj);
11343 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
11344 mixObjectDigest(eledigest,obj);
11345 decrRefCount(obj);
11346 xorDigest(digest,eledigest,20);
11347 }
11348 hashTypeReleaseIterator(hi);
11349 } else {
11350 redisPanic("Unknown object type");
11351 }
11352 /* If the key has an expire, add it to the mix */
11353 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11354 /* We can finally xor the key-val digest to the final digest */
11355 xorDigest(final,digest,20);
11356 decrRefCount(keyobj);
11357 }
11358 dictReleaseIterator(di);
11359 }
11360 }
11361
11362 static void debugCommand(redisClient *c) {
11363 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11364 *((char*)-1) = 'x';
11365 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11366 if (rdbSave(server.dbfilename) != REDIS_OK) {
11367 addReply(c,shared.err);
11368 return;
11369 }
11370 emptyDb();
11371 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11372 addReply(c,shared.err);
11373 return;
11374 }
11375 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11376 addReply(c,shared.ok);
11377 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11378 emptyDb();
11379 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11380 addReply(c,shared.err);
11381 return;
11382 }
11383 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11384 addReply(c,shared.ok);
11385 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11386 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11387 robj *val;
11388
11389 if (!de) {
11390 addReply(c,shared.nokeyerr);
11391 return;
11392 }
11393 val = dictGetEntryVal(de);
11394 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11395 val->storage == REDIS_VM_SWAPPING)) {
11396 char *strenc;
11397 char buf[128];
11398
11399 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11400 strenc = strencoding[val->encoding];
11401 } else {
11402 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11403 strenc = buf;
11404 }
11405 addReplySds(c,sdscatprintf(sdsempty(),
11406 "+Value at:%p refcount:%d "
11407 "encoding:%s serializedlength:%lld\r\n",
11408 (void*)val, val->refcount,
11409 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11410 } else {
11411 vmpointer *vp = (vmpointer*) val;
11412 addReplySds(c,sdscatprintf(sdsempty(),
11413 "+Value swapped at: page %llu "
11414 "using %llu pages\r\n",
11415 (unsigned long long) vp->page,
11416 (unsigned long long) vp->usedpages));
11417 }
11418 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11419 lookupKeyRead(c->db,c->argv[2]);
11420 addReply(c,shared.ok);
11421 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11422 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11423 robj *val;
11424 vmpointer *vp;
11425
11426 if (!server.vm_enabled) {
11427 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11428 return;
11429 }
11430 if (!de) {
11431 addReply(c,shared.nokeyerr);
11432 return;
11433 }
11434 val = dictGetEntryVal(de);
11435 /* Swap it */
11436 if (val->storage != REDIS_VM_MEMORY) {
11437 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11438 } else if (val->refcount != 1) {
11439 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11440 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11441 dictGetEntryVal(de) = vp;
11442 addReply(c,shared.ok);
11443 } else {
11444 addReply(c,shared.err);
11445 }
11446 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11447 long keys, j;
11448 robj *key, *val;
11449 char buf[128];
11450
11451 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11452 return;
11453 for (j = 0; j < keys; j++) {
11454 snprintf(buf,sizeof(buf),"key:%lu",j);
11455 key = createStringObject(buf,strlen(buf));
11456 if (lookupKeyRead(c->db,key) != NULL) {
11457 decrRefCount(key);
11458 continue;
11459 }
11460 snprintf(buf,sizeof(buf),"value:%lu",j);
11461 val = createStringObject(buf,strlen(buf));
11462 dbAdd(c->db,key,val);
11463 decrRefCount(key);
11464 }
11465 addReply(c,shared.ok);
11466 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11467 unsigned char digest[20];
11468 sds d = sdsnew("+");
11469 int j;
11470
11471 computeDatasetDigest(digest);
11472 for (j = 0; j < 20; j++)
11473 d = sdscatprintf(d, "%02x",digest[j]);
11474
11475 d = sdscatlen(d,"\r\n",2);
11476 addReplySds(c,d);
11477 } else {
11478 addReplySds(c,sdsnew(
11479 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11480 }
11481 }
11482
11483 static void _redisAssert(char *estr, char *file, int line) {
11484 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11485 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11486 #ifdef HAVE_BACKTRACE
11487 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11488 *((char*)-1) = 'x';
11489 #endif
11490 }
11491
11492 static void _redisPanic(char *msg, char *file, int line) {
11493 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11494 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11495 #ifdef HAVE_BACKTRACE
11496 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11497 *((char*)-1) = 'x';
11498 #endif
11499 }
11500
11501 /* =================================== Main! ================================ */
11502
11503 #ifdef __linux__
/* Return the number stored in /proc/sys/vm/overcommit_memory, or -1 if the
 * file can't be opened, can't be read, or does not start with a number.
 *
 * strtol() is used instead of atoi() because atoi() returns 0 for text
 * that is not a number, and 0 is exactly the value the caller warns
 * about: unparsable content is now reported as -1 (unknown). */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* No digits were found */
    return (int)value;
}
11517
11518 void linuxOvercommitMemoryWarning(void) {
11519 if (linuxOvercommitMemoryValue() == 0) {
11520 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11521 }
11522 }
11523 #endif /* __linux__ */
11524
11525 static void daemonize(void) {
11526 int fd;
11527 FILE *fp;
11528
11529 if (fork() != 0) exit(0); /* parent exits */
11530 setsid(); /* create a new session */
11531
11532 /* Every output goes to /dev/null. If Redis is daemonized but
11533 * the 'logfile' is set to 'stdout' in the configuration file
11534 * it will not log at all. */
11535 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11536 dup2(fd, STDIN_FILENO);
11537 dup2(fd, STDOUT_FILENO);
11538 dup2(fd, STDERR_FILENO);
11539 if (fd > STDERR_FILENO) close(fd);
11540 }
11541 /* Try to write the pid file */
11542 fp = fopen(server.pidfile,"w");
11543 if (fp) {
11544 fprintf(fp,"%d\n",getpid());
11545 fclose(fp);
11546 }
11547 }
11548
11549 static void version() {
11550 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11551 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11552 exit(0);
11553 }
11554
/* Print the command line help on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
11560
11561 int main(int argc, char **argv) {
11562 time_t start;
11563
11564 initServerConfig();
11565 sortCommandTable();
11566 if (argc == 2) {
11567 if (strcmp(argv[1], "-v") == 0 ||
11568 strcmp(argv[1], "--version") == 0) version();
11569 if (strcmp(argv[1], "--help") == 0) usage();
11570 resetServerSaveParams();
11571 loadServerConfig(argv[1]);
11572 } else if ((argc > 2)) {
11573 usage();
11574 } else {
11575 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11576 }
11577 if (server.daemonize) daemonize();
11578 initServer();
11579 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11580 #ifdef __linux__
11581 linuxOvercommitMemoryWarning();
11582 #endif
11583 start = time(NULL);
11584 if (server.appendonly) {
11585 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11586 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11587 } else {
11588 if (rdbLoad(server.dbfilename) == REDIS_OK)
11589 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11590 }
11591 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11592 aeSetBeforeSleepProc(server.el,beforeSleep);
11593 aeMain(server.el);
11594 aeDeleteEventLoop(server.el);
11595 return 0;
11596 }
11597
11598 /* ============================= Backtrace support ========================= */
11599
11600 #ifdef HAVE_BACKTRACE
11601 static char *findFuncName(void *pointer, unsigned long *offset);
11602
11603 static void *getMcontextEip(ucontext_t *uc) {
11604 #if defined(__FreeBSD__)
11605 return (void*) uc->uc_mcontext.mc_eip;
11606 #elif defined(__dietlibc__)
11607 return (void*) uc->uc_mcontext.eip;
11608 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11609 #if __x86_64__
11610 return (void*) uc->uc_mcontext->__ss.__rip;
11611 #else
11612 return (void*) uc->uc_mcontext->__ss.__eip;
11613 #endif
11614 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11615 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11616 return (void*) uc->uc_mcontext->__ss.__rip;
11617 #else
11618 return (void*) uc->uc_mcontext->__ss.__eip;
11619 #endif
11620 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11621 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11622 #elif defined(__ia64__) /* Linux IA64 */
11623 return (void*) uc->uc_mcontext.sc_ip;
11624 #else
11625 return NULL;
11626 #endif
11627 }
11628
11629 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11630 void *trace[100];
11631 char **messages = NULL;
11632 int i, trace_size = 0;
11633 unsigned long offset=0;
11634 ucontext_t *uc = (ucontext_t*) secret;
11635 sds infostring;
11636 REDIS_NOTUSED(info);
11637
11638 redisLog(REDIS_WARNING,
11639 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11640 infostring = genRedisInfoString();
11641 redisLog(REDIS_WARNING, "%s",infostring);
11642 /* It's not safe to sdsfree() the returned string under memory
11643 * corruption conditions. Let it leak as we are going to abort */
11644
11645 trace_size = backtrace(trace, 100);
11646 /* overwrite sigaction with caller's address */
11647 if (getMcontextEip(uc) != NULL) {
11648 trace[1] = getMcontextEip(uc);
11649 }
11650 messages = backtrace_symbols(trace, trace_size);
11651
11652 for (i=1; i<trace_size; ++i) {
11653 char *fn = findFuncName(trace[i], &offset), *p;
11654
11655 p = strchr(messages[i],'+');
11656 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11657 redisLog(REDIS_WARNING,"%s", messages[i]);
11658 } else {
11659 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11660 }
11661 }
11662 /* free(messages); Don't call free() with possibly corrupted memory. */
11663 _exit(0);
11664 }
11665
11666 static void sigtermHandler(int sig) {
11667 REDIS_NOTUSED(sig);
11668
11669 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11670 server.shutdown_asap = 1;
11671 }
11672
11673 static void setupSigSegvAction(void) {
11674 struct sigaction act;
11675
11676 sigemptyset (&act.sa_mask);
11677 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11678 * is used. Otherwise, sa_handler is used */
11679 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11680 act.sa_sigaction = segvHandler;
11681 sigaction (SIGSEGV, &act, NULL);
11682 sigaction (SIGBUS, &act, NULL);
11683 sigaction (SIGFPE, &act, NULL);
11684 sigaction (SIGILL, &act, NULL);
11685 sigaction (SIGBUS, &act, NULL);
11686
11687 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11688 act.sa_handler = sigtermHandler;
11689 sigaction (SIGTERM, &act, NULL);
11690 return;
11691 }
11692
11693 #include "staticsymbols.h"
11694 /* This function try to convert a pointer into a function name. It's used in
11695 * oreder to provide a backtrace under segmentation fault that's able to
11696 * display functions declared as static (otherwise the backtrace is useless). */
11697 static char *findFuncName(void *pointer, unsigned long *offset){
11698 int i, ret = -1;
11699 unsigned long off, minoff = 0;
11700
11701 /* Try to match against the Symbol with the smallest offset */
11702 for (i=0; symsTable[i].pointer; i++) {
11703 unsigned long lp = (unsigned long) pointer;
11704
11705 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11706 off=lp-symsTable[i].pointer;
11707 if (ret < 0 || off < minoff) {
11708 minoff=off;
11709 ret=i;
11710 }
11711 }
11712 }
11713 if (ret == -1) return NULL;
11714 *offset = minoff;
11715 return symsTable[ret].name;
11716 }
11717 #else /* HAVE_BACKTRACE */
/* Version used when HAVE_BACKTRACE is not defined: no handler is installed.
 * NOTE(review): this also means that sigtermHandler() is not installed for
 * SIGTERM in this configuration -- confirm this is intended. */
static void setupSigSegvAction(void)
{
    /* Nothing to do */
}
11720 #endif /* HAVE_BACKTRACE */
11721
11722
11723
11724 /* The End */
11725
11726
11727