]> git.saurik.com Git - redis.git/blob - redis.c
8f5e8f47f43bce8ec5502f479a6b147171b9cf69
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
80 #include "release.h" /* Release and/or git repository information */
81
82 /* Error codes: generic success / failure return values */
83 #define REDIS_OK 0
84 #define REDIS_ERR -1
85
86 /* Static server configuration: compile-time defaults and limits */
87 #define REDIS_SERVERPORT 6379 /* TCP port */
88 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (evaluates to 300; presumably seconds) */
89 #define REDIS_IOBUF_LEN 1024
90 #define REDIS_LOADBUF_LEN 1024
91 #define REDIS_STATIC_ARGS 8
92 #define REDIS_DEFAULT_DBNUM 16 /* presumably the default for server.dbnum -- confirm */
93 #define REDIS_CONFIGLINE_MAX 1024
94 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
95 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
96 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
97 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) /* evaluates to 65536 */
98 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MB) */
99
100 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
101 #define REDIS_WRITEV_THRESHOLD 3
102 /* Max number of iovecs used for each writev call */
103 #define REDIS_WRITEV_IOVEC_COUNT 256
104
105 /* Hash table parameters */
106 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
107
108 /* Command flags: distinct bits (1, 2, 4, 8), so they can be OR-ed together */
109 #define REDIS_CMD_BULK 1 /* Bulk write command */
110 #define REDIS_CMD_INLINE 2 /* Inline command */
111 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
112 this flag will return an error when the 'maxmemory' option is set in the
113 config file and the server is using more than maxmemory bytes of memory.
114 In short these commands are denied on low memory conditions. */
115 #define REDIS_CMD_DENYOOM 4
116 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
117
118 /* Object types: values stored in the 4-bit 'type' field of an object */
119 #define REDIS_STRING 0
120 #define REDIS_LIST 1
121 #define REDIS_SET 2
122 #define REDIS_ZSET 3
123 #define REDIS_HASH 4
124 #define REDIS_VMPOINTER 8 /* 'type' value used by vmPointer structures */
125
126 /* Objects encoding. Some kind of objects like Strings and Hashes can be
127 * internally represented in multiple ways. The 'encoding' field of the object
128 * is set to one of these values for this object. */
129 #define REDIS_ENCODING_RAW 0 /* Raw representation */
130 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
131 #define REDIS_ENCODING_HT 2 /* Encoded as hash table */
132 #define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
133 #define REDIS_ENCODING_LIST 4 /* Encoded as list (NOTE(review): this comment used to say "zipmap", apparently a copy-paste slip; presumably a linked list -- confirm) */
134 #define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
135
136 static char* strencoding[] = { /* encoding names, indexed by REDIS_ENCODING_* value (0..5) */
137 "raw", "int", "hashtable", "zipmap", "list", "ziplist"
138 };
139
140 /* Object types only used for dumping to disk */
141 #define REDIS_EXPIRETIME 253
142 #define REDIS_SELECTDB 254
143 #define REDIS_EOF 255
144
145 /* Defines related to the dump file format. To store 32 bits lengths for short
146 * keys requires a lot of space, so we check the most significant 2 bits of
147 * the first byte to interpret the length:
148 *
149 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
150 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
151 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
152 * 11|000000 this means: specially encoded object will follow. The six bits
153 * number specify the kind of object that follows.
154 * See the REDIS_RDB_ENC_* defines.
155 *
156 * Lengths up to 63 are stored using a single byte, most DB keys, and many
157 * values, will fit inside. */
158 #define REDIS_RDB_6BITLEN 0 /* MSB pair 00 */
159 #define REDIS_RDB_14BITLEN 1 /* MSB pair 01 */
160 #define REDIS_RDB_32BITLEN 2 /* MSB pair 10 */
161 #define REDIS_RDB_ENCVAL 3 /* MSB pair 11: specially encoded object */
162 #define REDIS_RDB_LENERR UINT_MAX /* presumably the value reported when a length cannot be read -- confirm */
163
164 /* When a length of a string object stored on disk has the first two bits
165 * set, the remaining six bits specify a special encoding for the object
166 * accordingly to the following defines: */
167 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
168 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
169 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
170 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
171
172 /* Virtual memory: values of the object->storage field. */
173 #define REDIS_VM_MEMORY 0 /* The object is on memory */
174 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
175 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
176 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
177
178 /* Virtual memory static configuration stuff.
179 * Check vmFindContiguousPages() to know more about these magic numbers. */
180 #define REDIS_VM_MAX_NEAR_PAGES 65536
181 #define REDIS_VM_MAX_RANDOM_JUMP 4096
182 #define REDIS_VM_MAX_THREADS 32
183 #define REDIS_THREAD_STACK_SIZE (1024*1024*4) /* evaluates to 4194304 (4 MB if the unit is bytes) */
184 /* The following is the *percentage* of completed I/O jobs to process when the
185 * handler is called. While Virtual Memory I/O operations are performed by
186 * threads, these operations must be processed by the main thread when completed
187 * in order to take effect. */
188 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
189
190 /* Client flags: distinct bits OR-ed into redisClient.flags */
191 #define REDIS_SLAVE 1 /* This client is a slave server */
192 #define REDIS_MASTER 2 /* This client is a master server */
193 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
194 #define REDIS_MULTI 8 /* This client is in a MULTI context */
195 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
196 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
197 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
198
199 /* Slave replication state - slave side */
200 #define REDIS_REPL_NONE 0 /* No active replication */
201 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
202 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
203
204 /* Slave replication state - from the point of view of master.
205 * Note that in SEND_BULK and ONLINE state the slave receives new updates
206 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
207 * to start the next background saving in order to send updates to it.
 * The values continue the numbering of the slave-side states above (3..6). */
208 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
209 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
210 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
211 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
212
213 /* List related stuff: which end of a list an operation refers to */
214 #define REDIS_HEAD 0
215 #define REDIS_TAIL 1
216
217 /* Sort operations */
218 #define REDIS_SORT_GET 0
219 #define REDIS_SORT_ASC 1
220 #define REDIS_SORT_DESC 2
221 #define REDIS_SORTKEY_MAX 1024
222
223 /* Log levels, in increasing order of severity (0..3) */
224 #define REDIS_DEBUG 0
225 #define REDIS_VERBOSE 1
226 #define REDIS_NOTICE 2
227 #define REDIS_WARNING 3
228
/* Anti-warning macro: evaluates V once and discards the result, so that an
 * otherwise unused parameter or variable does not trigger a compiler warning.
 * The argument is parenthesized so that any expression (not just a plain
 * identifier) can be passed without precedence surprises. */
#define REDIS_NOTUSED(V) ((void)(V))
231
232 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
233 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
234
235 /* Append only defines: values for server.appendfsync */
236 #define APPENDFSYNC_NO 0
237 #define APPENDFSYNC_ALWAYS 1
238 #define APPENDFSYNC_EVERYSEC 2
239
240 /* Zip structure related defaults (see the "Zip structure config" fields of struct redisServer) */
241 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
242 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
243 #define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
244 #define REDIS_LIST_MAX_ZIPLIST_VALUE 32
245
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): if the expression is false, calls _redisAssert() with the
 * stringized expression, file and line, then terminates with _exit(1).
 * redisPanic(_e): unconditionally calls _redisPanic() and then _exit(1).
 *
 * Both macros expand to a single fully parenthesized expression, so they are
 * safe inside larger expressions (e.g. as an operand of ?:). Note that #_e
 * stringizes the argument: a string-literal message is passed to the helper
 * with its surrounding double quotes. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
251
252 /*================================= Data types ============================== */
253
254 /* A redis object, that is a type able to hold a string / list / set */
255
256 /* The actual Redis Object. The four bit-fields add up to 32 bits (4+2+4+22). */
257 typedef struct redisObject {
258 unsigned type:4; /* object type, e.g. REDIS_STRING */
259 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
260 unsigned encoding:4; /* internal representation, e.g. REDIS_ENCODING_RAW */
261 unsigned lru:22; /* lru time (relative to server.lruclock) */
262 int refcount; /* reference count */
263 void *ptr; /* payload pointer; presumably interpreted according to type/encoding */
264 /* NOTE(review): stale comment? No VM fields are declared in this struct.
265 * Original text: "VM fields are only allocated if VM is active, otherwise the
266 * object allocation function will just allocate sizeof(redisObjct) minus
267 * sizeof(redisObjectVM), so using Redis without VM active will not have any overhead." */
268 } robj;
269
270 /* The VM pointer structure - identifies an object in the swap file.
271 *
272 * This object is stored in place of the value
273 * object in the main key->value hash table representing a database.
274 * Note that the first fields (type, storage) are the same as the redisObject
275 * structure so that vmPointer structures can be accessed even when cast
276 * as redisObject structures.
277 *
278 * This is useful as we don't know if a value object is or not on disk, but we
279 * are always able to read obj->storage to check this. For vmPointer
280 * structures "type" is set to REDIS_VMPOINTER (even if without this field
281 * it is still possible to check the kind of object from the value of 'storage').*/
282 typedef struct vmPointer {
283 unsigned type:4;
284 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
285 unsigned notused:26; /* filler: 4+2+26 = 32 bits */
286 unsigned int vtype; /* type of the object stored in the swap file */
287 off_t page; /* the page at which the object is stored on disk */
288 off_t usedpages; /* number of pages used on disk */
289 } vmpointer;
290
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it), _ptr is the value stored
 * in its 'ptr' field. The object becomes a raw-encoded, in-memory string with
 * refcount 1; the 'lru' field is left untouched.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, which keeps "if (c) initStaticStringObject(o,p); else ..." valid.
 * Arguments are parenthesized so that expressions can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
302
303 typedef struct redisDb { /* Per-database state: the keyspace plus its auxiliary dictionaries */
304 dict *dict; /* The keyspace for this DB */
305 dict *expires; /* Timeout of keys with a timeout set */
306 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
307 dict *io_keys; /* Keys with clients waiting for VM I/O */
308 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
309 int id; /* presumably the database number -- confirm */
310 } redisDb;
311
312 /* Client MULTI/EXEC state: one queued command (element of multiState.commands) */
313 typedef struct multiCmd {
314 robj **argv; /* argument vector of the queued command */
315 int argc; /* presumably the number of entries in argv */
316 struct redisCommand *cmd; /* command descriptor */
317 } multiCmd;
318
319 typedef struct multiState { /* Commands queued by a client (redisClient.mstate) */
320 multiCmd *commands; /* Array of MULTI commands */
321 int count; /* Total number of MULTI commands */
322 } multiState;
323
324 /* With multiplexing we need to take per-client state.
325 * Clients are taken in a linked list. */
326 typedef struct redisClient {
327 int fd;
328 redisDb *db;
329 int dictid;
330 sds querybuf;
331 robj **argv, **mbargv;
332 int argc, mbargc;
333 int bulklen; /* bulk read len. -1 if not in bulk read mode */
334 int multibulk; /* multi bulk command format active */
335 list *reply;
336 int sentlen;
337 time_t lastinteraction; /* time of the last interaction, used for timeout */
338 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
339 int slaveseldb; /* slave selected db, if this client is a slave */
340 int authenticated; /* when requirepass is non-NULL */
341 int replstate; /* replication state if this is a slave */
342 int repldbfd; /* replication DB file descriptor */
343 long repldboff; /* replication DB file offset */
344 off_t repldbsize; /* replication DB file size */
345 multiState mstate; /* MULTI/EXEC state */
346 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
347 * operation such as BLPOP. Otherwise NULL. */
348 int blocking_keys_num; /* Number of blocking keys */
349 time_t blockingto; /* Blocking operation timeout. If UNIX current time
350 * is >= blockingto then the operation timed out. */
351 list *io_keys; /* Keys this client is waiting to be loaded from the
352 * swap file in order to continue. */
353 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
354 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
355 list *pubsub_patterns; /* patterns a client is interested in (presumably PSUBSCRIBE; this comment originally said SUBSCRIBE) */
356 } redisClient;
357
358 struct saveparam { /* One element of server.saveparams: a (seconds, changes) pair */
359 time_t seconds; /* presumably a time interval in seconds -- confirm against users */
360 int changes; /* presumably a number of DB changes -- confirm against users */
361 };
362
363 /* Global server state structure: configuration, runtime state and stats */
364 struct redisServer {
365 int port;
366 int fd;
367 redisDb *db;
368 long long dirty; /* changes to DB from the last save */
369 list *clients;
370 list *slaves, *monitors;
371 char neterr[ANET_ERR_LEN];
372 aeEventLoop *el;
373 int cronloops; /* number of times the cron function run */
374 list *objfreelist; /* A list of freed objects to avoid malloc() */
375 time_t lastsave; /* Unix time of last successful save */
376 /* Fields used only for stats */
377 time_t stat_starttime; /* server start time */
378 long long stat_numcommands; /* number of processed commands */
379 long long stat_numconnections; /* number of connections received */
380 long long stat_expiredkeys; /* number of expired keys */
381 /* Configuration */
382 int verbosity;
383 int glueoutputbuf;
384 int maxidletime;
385 int dbnum;
386 int daemonize;
387 int appendonly;
388 int appendfsync; /* one of the APPENDFSYNC_* values */
389 int no_appendfsync_on_rewrite;
390 int shutdown_asap;
391 time_t lastfsync;
392 int appendfd;
393 int appendseldb;
394 char *pidfile;
395 pid_t bgsavechildpid;
396 pid_t bgrewritechildpid;
397 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
398 sds aofbuf; /* AOF buffer, written before entering the event loop */
399 struct saveparam *saveparams;
400 int saveparamslen;
401 char *logfile;
402 char *bindaddr;
403 char *dbfilename;
404 char *appendfilename;
405 char *requirepass;
406 int rdbcompression;
407 int activerehashing;
408 /* Replication related */
409 int isslave;
410 char *masterauth;
411 char *masterhost;
412 int masterport;
413 redisClient *master; /* client that is master for this slave */
414 int replstate;
415 unsigned int maxclients;
416 unsigned long long maxmemory;
417 unsigned int blpop_blocked_clients;
418 unsigned int vm_blocked_clients;
419 /* Sort parameters - qsort_r() is only available under BSD so we
420 * have to take this state global, in order to pass it to sortCompare() */
421 int sort_desc;
422 int sort_alpha;
423 int sort_bypattern;
424 /* Virtual memory configuration */
425 int vm_enabled;
426 char *vm_swap_file;
427 off_t vm_page_size;
428 off_t vm_pages;
429 unsigned long long vm_max_memory;
430 /* Zip structure config */
431 size_t hash_max_zipmap_entries;
432 size_t hash_max_zipmap_value;
433 size_t list_max_ziplist_entries;
434 size_t list_max_ziplist_value;
435 /* Virtual memory state */
436 FILE *vm_fp;
437 int vm_fd;
438 off_t vm_next_page; /* Next probably empty page */
439 off_t vm_near_pages; /* Number of pages allocated sequentially */
440 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
441 time_t unixtime; /* Unix time sampled every second. */
442 /* Virtual memory I/O threads stuff */
443 /* An I/O thread processes an element taken from the io_jobs queue and
444 * puts the result of the operation in the io_done list. While the
445 * job is being processed, it's put on io_processing queue.
 * NOTE(review): "io_jobs" / "io_done" do not match the field names below
 * (io_newjobs / io_processed); the comment looks outdated -- confirm. */
446 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
447 list *io_processing; /* List of VM I/O jobs being processed */
448 list *io_processed; /* List of VM I/O jobs already processed */
449 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
450 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job (NOTE(review): presumably the io_* lists above; names here look outdated) */
451 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
452 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
453 pthread_attr_t io_threads_attr; /* attributes for threads creation */
454 int io_active_threads; /* Number of running I/O threads */
455 int vm_max_threads; /* Max number of I/O threads running at the same time */
456 /* Our main thread is blocked on the event loop, looking for sockets ready
457 * to be read or written, so when a threaded I/O operation is ready to be
458 * processed by the main thread, the I/O thread will use a unix pipe to
459 * awake the main thread. The followings are the two pipe FDs. */
460 int io_ready_pipe_read;
461 int io_ready_pipe_write;
462 /* Virtual memory stats */
463 unsigned long long vm_stats_used_pages;
464 unsigned long long vm_stats_swapped_objects;
465 unsigned long long vm_stats_swapouts;
466 unsigned long long vm_stats_swapins;
467 /* Pubsub */
468 dict *pubsub_channels; /* Map channels to list of subscribed clients */
469 list *pubsub_patterns; /* A list of pubsub_patterns */
470 /* Misc */
471 FILE *devnull;
472 unsigned lruclock:22; /* clock incrementing every minute, for LRU; same width as redisObject.lru */
473 unsigned lruclock_padding:10; /* 22+10 = 32 bits */
474 };
475
476 typedef struct pubsubPattern { /* Pairs a client with one pattern; see the pubsub_patterns lists */
477 redisClient *client;
478 robj *pattern;
479 } pubsubPattern;
480
481 typedef void redisCommandProc(redisClient *c); /* signature shared by all *Command functions */
482 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
483 struct redisCommand { /* Descriptor of one command: name, implementation and metadata */
484 char *name;
485 redisCommandProc *proc;
486 int arity;
487 int flags; /* presumably a combination of the REDIS_CMD_* flags -- confirm */
488 /* Use a function to determine which keys need to be loaded
489 * in the background prior to executing this command. Takes precedence
490 * over vm_firstkey and others, ignored when NULL */
491 redisVmPreloadProc *vm_preload_proc;
492 /* What keys should be loaded in background when calling this command? */
493 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
494 int vm_lastkey; /* The last argument that's a key */
495 int vm_keystep; /* The step between first and last key */
496 };
497
498 struct redisFunctionSym { /* Associates a name with an address stored as an integer */
499 char *name;
500 unsigned long pointer; /* presumably a function address, given the struct name -- confirm */
501 };
502
503 typedef struct _redisSortObject { /* An object plus the key it is compared by */
504 robj *obj;
505 union { /* only one member is meaningful at a time */
506 double score; /* numeric comparison key */
507 robj *cmpobj; /* object comparison key */
508 } u;
509 } redisSortObject;
510
511 typedef struct _redisSortOperation {
512 int type; /* presumably one of the REDIS_SORT_* values -- confirm */
513 robj *pattern;
514 } redisSortOperation;
515
516 /* ZSETs use a specialized version of Skiplists */
517
518 typedef struct zskiplistNode { /* One skiplist node: a (score, obj) pair plus its links */
519 struct zskiplistNode **forward; /* forward pointers; presumably one per level -- confirm */
520 struct zskiplistNode *backward;
521 unsigned int *span;
522 double score;
523 robj *obj;
524 } zskiplistNode;
525
526 typedef struct zskiplist { /* Skiplist handle: end nodes, length and level */
527 struct zskiplistNode *header, *tail;
528 unsigned long length;
529 int level;
530 } zskiplist;
531
532 typedef struct zset { /* A sorted set holds both a dict and a skiplist */
533 dict *dict;
534 zskiplist *zsl;
535 } zset;
536
537 /* Our shared "common" objects, collected in the single global 'shared' */
538
539 #define REDIS_SHARED_INTEGERS 10000 /* number of entries in shared.integers[] */
540 struct sharedObjectsStruct {
541 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space,
542 *colon, *nullbulk, *nullmultibulk, *queued,
543 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
544 *outofrangeerr, *plus,
545 *select0, *select1, *select2, *select3, *select4,
546 *select5, *select6, *select7, *select8, *select9,
547 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
548 *mbulk4, *psubscribebulk, *punsubscribebulk,
549 *integers[REDIS_SHARED_INTEGERS];
550 } shared; /* note: this also defines the (non-static) variable 'shared' */
551
552 /* Global vars that are actually used as constants. The following double
553 * values are used for double on-disk serialization, and are initialized
554 * at runtime to avoid strange compiler optimizations. */
555
556 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
557
558 /* VM threaded I/O request message */
559 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
560 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
561 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
562 typedef struct iojob {
563 int type; /* Request type, REDIS_IOJOB_* */
564 redisDb *db;/* Redis database */
565 robj *key; /* This I/O request is about swapping this key */
566 robj *id; /* Unique identifier of this job:
567 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
568 vmpointer object for REDIS_IOJOB_LOAD. */
569 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
570 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
571 off_t page; /* Swap page where to read/write the object */
572 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
573 int canceled; /* True if this command was canceled by blocking side of VM */
574 pthread_t thread; /* ID of the thread processing this entry */
575 } iojob;
576
577 /*================================ Prototypes =============================== */
578 char *redisGitSHA1(void); /* non-static: presumably provided by the release module (see release.h) */
579 char *redisGitDirty(void);
580
581 static void freeStringObject(robj *o); /* from here on: file-local (static) helpers, declared ahead of use */
582 static void freeListObject(robj *o);
583 static void freeSetObject(robj *o);
584 static void decrRefCount(void *o); /* takes void* rather than robj* */
585 static robj *createObject(int type, void *ptr);
586 static void freeClient(redisClient *c);
587 static int rdbLoad(char *filename);
588 static void addReply(redisClient *c, robj *obj);
589 static void addReplySds(redisClient *c, sds s);
590 static void incrRefCount(robj *o);
591 static int rdbSaveBackground(char *filename);
592 static robj *createStringObject(char *ptr, size_t len);
593 static robj *dupStringObject(robj *o);
594 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
595 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
596 static void flushAppendOnlyFile(void);
597 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
598 static int syncWithMaster(void);
599 static robj *tryObjectEncoding(robj *o);
600 static robj *getDecodedObject(robj *o);
601 static int removeExpire(redisDb *db, robj *key);
602 static int expireIfNeeded(redisDb *db, robj *key);
603 static int deleteIfVolatile(redisDb *db, robj *key);
604 static int dbDelete(redisDb *db, robj *key);
605 static time_t getExpire(redisDb *db, robj *key);
606 static int setExpire(redisDb *db, robj *key, time_t when);
607 static void updateSlavesWaitingBgsave(int bgsaveerr);
608 static void freeMemoryIfNeeded(void);
609 static int processCommand(redisClient *c);
610 static void setupSigSegvAction(void);
611 static void rdbRemoveTempFile(pid_t childpid);
612 static void aofRemoveTempFile(pid_t childpid);
613 static size_t stringObjectLen(robj *o);
614 static void processInputBuffer(redisClient *c);
615 static zskiplist *zslCreate(void);
616 static void zslFree(zskiplist *zsl);
617 static void zslInsert(zskiplist *zsl, double score, robj *obj);
618 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
619 static void initClientMultiState(redisClient *c);
620 static void freeClientMultiState(redisClient *c);
621 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
622 static void unblockClientWaitingData(redisClient *c);
623 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
624 static void vmInit(void);
625 static void vmMarkPagesFree(off_t page, off_t count);
626 static robj *vmLoadObject(robj *o);
627 static robj *vmPreviewObject(robj *o);
628 static int vmSwapOneObjectBlocking(void);
629 static int vmSwapOneObjectThreaded(void);
630 static int vmCanSwapOut(void);
631 static int tryFreeOneObjectFromFreelist(void);
632 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
633 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
634 static void vmCancelThreadedIOJob(robj *o);
635 static void lockThreadedIO(void);
636 static void unlockThreadedIO(void);
637 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
638 static void freeIOJob(iojob *j);
639 static void queueIOJob(iojob *j);
640 static int vmWriteObjectOnSwap(robj *o, off_t page);
641 static robj *vmReadObjectFromSwap(off_t page, int type);
642 static void waitEmptyIOJobsQueue(void);
643 static void vmReopenSwapFile(void);
644 static int vmFreePage(off_t page);
645 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); /* same signature as redisVmPreloadProc */
646 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); /* same signature as redisVmPreloadProc */
647 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
648 static int dontWaitForSwappedKey(redisClient *c, robj *key);
649 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
650 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
651 static struct redisCommand *lookupCommand(char *name);
652 static void call(redisClient *c, struct redisCommand *cmd);
653 static void resetClient(redisClient *c);
654 static void convertToRealHash(robj *o);
655 static void listTypeConvert(robj *o, int enc);
656 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
657 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
658 static void freePubsubPattern(void *p);
659 static int listMatchPubsubPattern(void *a, void *b);
660 static int compareStringObjects(robj *a, robj *b);
661 static int equalStringObjects(robj *a, robj *b);
/* Declared with an explicit (void) parameter list: in C an empty "()" is not
 * a prototype, so calls with stray arguments would go unchecked. */
static void usage(void);
663 static int rewriteAppendOnlyFileBackground(void); /* file-local helper, declared ahead of use */
664 static vmpointer *vmSwapObjectBlocking(robj *val); /* returns a vmpointer for the given value object */
/* Declared with an explicit (void) parameter list: in C an empty "()" is not
 * a prototype, so calls with stray arguments would go unchecked. */
static int prepareForShutdown(void);
666 static void touchWatchedKey(redisDb *db, robj *key); /* helpers for WATCHed keys (MULTI/EXEC CAS), declared ahead of use */
667 static void touchWatchedKeysOnFlush(int dbid);
668 static void unwatchAllKeys(redisClient *c);
669 /* Command implementations: every function below has the redisCommandProc signature, void f(redisClient *c). */
670 static void authCommand(redisClient *c);
671 static void pingCommand(redisClient *c);
672 static void echoCommand(redisClient *c);
673 static void setCommand(redisClient *c);
674 static void setnxCommand(redisClient *c);
675 static void setexCommand(redisClient *c);
676 static void getCommand(redisClient *c);
677 static void delCommand(redisClient *c);
678 static void existsCommand(redisClient *c);
679 static void incrCommand(redisClient *c);
680 static void decrCommand(redisClient *c);
681 static void incrbyCommand(redisClient *c);
682 static void decrbyCommand(redisClient *c);
683 static void selectCommand(redisClient *c);
684 static void randomkeyCommand(redisClient *c);
685 static void keysCommand(redisClient *c);
686 static void dbsizeCommand(redisClient *c);
687 static void lastsaveCommand(redisClient *c);
688 static void saveCommand(redisClient *c);
689 static void bgsaveCommand(redisClient *c);
690 static void bgrewriteaofCommand(redisClient *c);
691 static void shutdownCommand(redisClient *c);
692 static void moveCommand(redisClient *c);
693 static void renameCommand(redisClient *c);
694 static void renamenxCommand(redisClient *c);
695 static void lpushCommand(redisClient *c);
696 static void rpushCommand(redisClient *c);
697 static void lpushxCommand(redisClient *c);
698 static void rpushxCommand(redisClient *c);
699 static void linsertCommand(redisClient *c);
700 static void lpopCommand(redisClient *c);
701 static void rpopCommand(redisClient *c);
702 static void llenCommand(redisClient *c);
703 static void lindexCommand(redisClient *c);
704 static void lrangeCommand(redisClient *c);
705 static void ltrimCommand(redisClient *c);
706 static void typeCommand(redisClient *c);
707 static void lsetCommand(redisClient *c);
708 static void saddCommand(redisClient *c);
709 static void sremCommand(redisClient *c);
710 static void smoveCommand(redisClient *c);
711 static void sismemberCommand(redisClient *c);
712 static void scardCommand(redisClient *c);
713 static void spopCommand(redisClient *c);
714 static void srandmemberCommand(redisClient *c);
715 static void sinterCommand(redisClient *c);
716 static void sinterstoreCommand(redisClient *c);
717 static void sunionCommand(redisClient *c);
718 static void sunionstoreCommand(redisClient *c);
719 static void sdiffCommand(redisClient *c);
720 static void sdiffstoreCommand(redisClient *c);
721 static void syncCommand(redisClient *c);
722 static void flushdbCommand(redisClient *c);
723 static void flushallCommand(redisClient *c);
724 static void sortCommand(redisClient *c);
725 static void lremCommand(redisClient *c);
726 static void rpoplpushcommand(redisClient *c); /* NOTE(review): lowercase "command" breaks the xxxCommand naming used by every other entry; renaming requires touching the definition and users */
727 static void infoCommand(redisClient *c);
728 static void mgetCommand(redisClient *c);
729 static void monitorCommand(redisClient *c);
730 static void expireCommand(redisClient *c);
731 static void expireatCommand(redisClient *c);
732 static void getsetCommand(redisClient *c);
733 static void ttlCommand(redisClient *c);
734 static void slaveofCommand(redisClient *c);
735 static void debugCommand(redisClient *c);
736 static void msetCommand(redisClient *c);
737 static void msetnxCommand(redisClient *c);
738 static void zaddCommand(redisClient *c);
739 static void zincrbyCommand(redisClient *c);
740 static void zrangeCommand(redisClient *c);
741 static void zrangebyscoreCommand(redisClient *c);
742 static void zcountCommand(redisClient *c);
743 static void zrevrangeCommand(redisClient *c);
744 static void zcardCommand(redisClient *c);
745 static void zremCommand(redisClient *c);
746 static void zscoreCommand(redisClient *c);
747 static void zremrangebyscoreCommand(redisClient *c);
748 static void multiCommand(redisClient *c);
749 static void execCommand(redisClient *c);
750 static void discardCommand(redisClient *c);
751 static void blpopCommand(redisClient *c);
752 static void brpopCommand(redisClient *c);
753 static void appendCommand(redisClient *c);
754 static void substrCommand(redisClient *c);
755 static void zrankCommand(redisClient *c);
756 static void zrevrankCommand(redisClient *c);
757 static void hsetCommand(redisClient *c);
758 static void hsetnxCommand(redisClient *c);
759 static void hgetCommand(redisClient *c);
760 static void hmsetCommand(redisClient *c);
761 static void hmgetCommand(redisClient *c);
762 static void hdelCommand(redisClient *c);
763 static void hlenCommand(redisClient *c);
764 static void zremrangebyrankCommand(redisClient *c);
765 static void zunionstoreCommand(redisClient *c);
766 static void zinterstoreCommand(redisClient *c);
767 static void hkeysCommand(redisClient *c);
768 static void hvalsCommand(redisClient *c);
769 static void hgetallCommand(redisClient *c);
770 static void hexistsCommand(redisClient *c);
771 static void configCommand(redisClient *c);
772 static void hincrbyCommand(redisClient *c);
773 static void subscribeCommand(redisClient *c);
774 static void unsubscribeCommand(redisClient *c);
775 static void psubscribeCommand(redisClient *c);
776 static void punsubscribeCommand(redisClient *c);
777 static void publishCommand(redisClient *c);
778 static void watchCommand(redisClient *c);
779 static void unwatchCommand(redisClient *c);
780
781 /*================================= Globals ================================= */
782
/* Global vars */
static struct redisServer server; /* server global state */
/* Pointer to a command table. It is only declared here; nothing in this
 * section assigns it. NOTE(review): presumably pointed at (a copy of)
 * readonlyCommandTable during startup -- confirm where it is initialized. */
static struct redisCommand *commandTable;
/* Static table with one entry per supported command.
 *
 * NOTE(review): struct redisCommand is defined outside this section, so the
 * column meanings below are inferred from the values and must be confirmed
 * against the struct definition:
 *   1. command name as typed by the client
 *   2. handler function implementing the command
 *   3. argument count (negative values presumably mean "at least N")
 *   4. REDIS_CMD_* flags
 *   5. optional hook (NULL for most commands; only zunionstore, zinterstore
 *      and exec supply a *BlockClientOnSwappedKeys function)
 *   6-8. three integers, presumably first key / last key / key step.
 *
 * Things worth knowing when reading the table:
 *   - "smembers" is served by sinterCommand.
 *   - The rpoplpush handler is spelled rpoplpushcommand (lowercase 'c'),
 *     matching its prototype above. */
static struct redisCommand readonlyCommandTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
};
899
900 /*============================ Utility functions ============================ */
901
/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Neither buffer needs to be NUL terminated: no byte
 * outside the given lengths is ever read.
 *
 * Supported syntax:
 *   *        any sequence of bytes (including the empty one)
 *   ?        exactly one byte
 *   [abc]    one byte out of the listed set
 *   [^abc]   one byte NOT in the listed set
 *   [a-z]    one byte in the inclusive range (bounds may be given reversed)
 *   \x       the byte x taken literally, also inside [...]
 *
 * A class with no closing ']' is accepted and ends where the pattern ends.
 * If 'nocase' is non zero, literals and ranges are compared case
 * insensitively (escaped bytes inside a class stay case sensitive).
 *
 * Returns 1 on match, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse consecutive stars, looking ahead only while the
             * next byte is still part of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte, so it can't match an
             * empty subject (this also keeps stringLen from going
             * negative below). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back one byte so that the
                     * shared advance after the switch lands exactly on
                     * the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    /* Bytes are handled as unsigned values: ranges are
                     * ordered by byte value and tolower() always gets a
                     * valid argument. */
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: trailing stars still match nothing. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1024
/* Glob-style match for NUL terminated inputs: measures both strings with
 * strlen() and hands them to stringmatchlen(). Returns 1 on match, 0
 * otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
1028
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' sign followed by decimal digits and an
 * optional, case insensitive unit:
 *   (none) or "b"  -> 1
 *   "k"  -> 1000           "kb" -> 1024
 *   "m"  -> 1000*1000      "mb" -> 1024*1024
 *   "g"  -> 1000^3         "gb" -> 1024^3
 *
 * If 'err' is not NULL it is set to 0 on success and to 1 on error:
 *   - unknown unit: the number is still returned, multiplied by 1;
 *   - more than 127 sign+digit characters: LLONG_MAX is returned;
 *   - the value does not fit in a long long: the result saturates to
 *     LLONG_MAX or LLONG_MIN instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() needs a value representable as unsigned char. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000LL*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024LL*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy sign and digits (without the unit) so strtoll() can parse them. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value for us. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check the product before computing it, as
     * signed overflow is undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1075
/* Convert a long long into a NUL terminated decimal string stored in 's',
 * a buffer of 'len' bytes.
 *
 * Returns the number of characters written, not counting the terminator.
 * If the buffer is too small the output is truncated to the first len-1
 * characters of the number and that shorter length is returned. With
 * len == 0 nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value would overflow for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1099
1100 static void redisLog(int level, const char *fmt, ...) {
1101 va_list ap;
1102 FILE *fp;
1103
1104 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1105 if (!fp) return;
1106
1107 va_start(ap, fmt);
1108 if (level >= server.verbosity) {
1109 char *c = ".-*#";
1110 char buf[64];
1111 time_t now;
1112
1113 now = time(NULL);
1114 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1115 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1116 vfprintf(fp, fmt, ap);
1117 fprintf(fp,"\n");
1118 fflush(fp);
1119 }
1120 va_end(ap);
1121
1122 if (server.logfile) fclose(fp);
1123 }
1124
1125 /*====================== Hash table type implementation ==================== */
1126
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and Redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1130
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1136
1137 static void dictListDestructor(void *privdata, void *val)
1138 {
1139 DICT_NOTUSED(privdata);
1140 listRelease((list*)val);
1141 }
1142
1143 static int dictSdsKeyCompare(void *privdata, const void *key1,
1144 const void *key2)
1145 {
1146 int l1,l2;
1147 DICT_NOTUSED(privdata);
1148
1149 l1 = sdslen((sds)key1);
1150 l2 = sdslen((sds)key2);
1151 if (l1 != l2) return 0;
1152 return memcmp(key1, key2, l1) == 0;
1153 }
1154
/* Dict destructor callback for Redis objects: drops one reference with
 * decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1162
1163 static void dictSdsDestructor(void *privdata, void *val)
1164 {
1165 DICT_NOTUSED(privdata);
1166
1167 sdsfree(val);
1168 }
1169
1170 static int dictObjKeyCompare(void *privdata, const void *key1,
1171 const void *key2)
1172 {
1173 const robj *o1 = key1, *o2 = key2;
1174 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1175 }
1176
1177 static unsigned int dictObjHash(const void *key) {
1178 const robj *o = key;
1179 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1180 }
1181
/* Dict hash callback for keys that are sds strings: hashes all the
 * sdslen() bytes of the key. */
static unsigned int dictSdsHash(const void *key) {
    const char *s = key;

    return dictGenHashFunction((unsigned char*)s, sdslen((char*)s));
}
1185
1186 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1187 const void *key2)
1188 {
1189 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1190 int cmp;
1191
1192 if (o1->encoding == REDIS_ENCODING_INT &&
1193 o2->encoding == REDIS_ENCODING_INT)
1194 return o1->ptr == o2->ptr;
1195
1196 o1 = getDecodedObject(o1);
1197 o2 = getDecodedObject(o2);
1198 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1199 decrRefCount(o1);
1200 decrRefCount(o2);
1201 return cmp;
1202 }
1203
1204 static unsigned int dictEncObjHash(const void *key) {
1205 robj *o = (robj*) key;
1206
1207 if (o->encoding == REDIS_ENCODING_RAW) {
1208 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1209 } else {
1210 if (o->encoding == REDIS_ENCODING_INT) {
1211 char buf[32];
1212 int len;
1213
1214 len = ll2string(buf,32,(long)o->ptr);
1215 return dictGenHashFunction((unsigned char*)buf, len);
1216 } else {
1217 unsigned int hash;
1218
1219 o = getDecodedObject(o);
1220 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1221 decrRefCount(o);
1222 return hash;
1223 }
1224 }
1225 }
1226
/* Sets type: keys are (possibly encoded) Redis objects, hashed and compared
 * with the dictEncObj* callbacks and released with decrRefCount() via
 * dictRedisObjectDestructor. No value destructor is installed. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1236
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are (possibly encoded) Redis objects; values are released with
 * zfree() through dictVanillaFree. */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1246
/* Db->dict, keys are sds strings, vals are Redis objects.
 * Keys are released with sdsfree(), values with decrRefCount() (NULL values,
 * used for swapped out keys, are skipped by the destructor). */
static dictType dbDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    dictSdsDestructor,         /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1256
/* Db->expires: keys are sds strings. No key or value destructor is
 * installed, so removing an entry frees neither.
 * NOTE(review): presumably the key sds is shared with the entry in Db->dict
 * and the value is a time stored directly in the pointer (serverCron reads
 * it with a (time_t) cast) -- confirm against the code adding expires. */
static dictType keyptrDictType = {
    dictSdsHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictSdsKeyCompare,         /* key compare */
    NULL,                      /* key destructor */
    NULL                       /* val destructor */
};
1266
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both keys and values are Redis objects released with decrRefCount(). */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1276
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Keys are released with decrRefCount(), values with listRelease(). */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1288
/* Forward declaration: version() is not defined in this section of the file. */
static void version();
1290
1291 /* ========================= Random utility functions ======================= */
1292
1293 /* Redis generally does not try to recover from out of memory conditions
1294 * when allocating objects or strings, it is not clear if it will be possible
1295 * to report this condition to the client since the networking layer itself
1296 * is based on heap allocation for send buffers, so we simply abort.
1297 * At least the code will be simpler to read... */
1298 static void oom(const char *msg) {
1299 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1300 sleep(1);
1301 abort();
1302 }
1303
1304 /* ====================== Redis server networking stuff ===================== */
1305 static void closeTimedoutClients(void) {
1306 redisClient *c;
1307 listNode *ln;
1308 time_t now = time(NULL);
1309 listIter li;
1310
1311 listRewind(server.clients,&li);
1312 while ((ln = listNext(&li)) != NULL) {
1313 c = listNodeValue(ln);
1314 if (server.maxidletime &&
1315 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1316 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1317 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1318 listLength(c->pubsub_patterns) == 0 &&
1319 (now - c->lastinteraction > server.maxidletime))
1320 {
1321 redisLog(REDIS_VERBOSE,"Closing idle client");
1322 freeClient(c);
1323 } else if (c->flags & REDIS_BLOCKED) {
1324 if (c->blockingto != 0 && c->blockingto < now) {
1325 addReply(c,shared.nullmultibulk);
1326 unblockClientWaitingData(c);
1327 }
1328 }
1329 }
1330 }
1331
1332 static int htNeedsResize(dict *dict) {
1333 long long size, used;
1334
1335 size = dictSlots(dict);
1336 used = dictSize(dict);
1337 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1338 (used*100/size < REDIS_HT_MINFILL));
1339 }
1340
1341 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1342 * we resize the hash table to save memory */
1343 static void tryResizeHashTables(void) {
1344 int j;
1345
1346 for (j = 0; j < server.dbnum; j++) {
1347 if (htNeedsResize(server.db[j].dict))
1348 dictResize(server.db[j].dict);
1349 if (htNeedsResize(server.db[j].expires))
1350 dictResize(server.db[j].expires);
1351 }
1352 }
1353
1354 /* Our hash table implementation performs rehashing incrementally while
1355 * we write/read from the hash table. Still if the server is idle, the hash
1356 * table will use two tables for a long time. So we try to use 1 millisecond
1357 * of CPU time at every serverCron() loop in order to rehash some key. */
1358 static void incrementallyRehash(void) {
1359 int j;
1360
1361 for (j = 0; j < server.dbnum; j++) {
1362 if (dictIsRehashing(server.db[j].dict)) {
1363 dictRehashMilliseconds(server.db[j].dict,1);
1364 break; /* already used our millisecond for this loop... */
1365 }
1366 }
1367 }
1368
1369 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1370 void backgroundSaveDoneHandler(int statloc) {
1371 int exitcode = WEXITSTATUS(statloc);
1372 int bysignal = WIFSIGNALED(statloc);
1373
1374 if (!bysignal && exitcode == 0) {
1375 redisLog(REDIS_NOTICE,
1376 "Background saving terminated with success");
1377 server.dirty = 0;
1378 server.lastsave = time(NULL);
1379 } else if (!bysignal && exitcode != 0) {
1380 redisLog(REDIS_WARNING, "Background saving error");
1381 } else {
1382 redisLog(REDIS_WARNING,
1383 "Background saving terminated by signal %d", WTERMSIG(statloc));
1384 rdbRemoveTempFile(server.bgsavechildpid);
1385 }
1386 server.bgsavechildpid = -1;
1387 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1388 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1389 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1390 }
1391
1392 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1393 * Handle this. */
1394 void backgroundRewriteDoneHandler(int statloc) {
1395 int exitcode = WEXITSTATUS(statloc);
1396 int bysignal = WIFSIGNALED(statloc);
1397
1398 if (!bysignal && exitcode == 0) {
1399 int fd;
1400 char tmpfile[256];
1401
1402 redisLog(REDIS_NOTICE,
1403 "Background append only file rewriting terminated with success");
1404 /* Now it's time to flush the differences accumulated by the parent */
1405 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1406 fd = open(tmpfile,O_WRONLY|O_APPEND);
1407 if (fd == -1) {
1408 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1409 goto cleanup;
1410 }
1411 /* Flush our data... */
1412 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1413 (signed) sdslen(server.bgrewritebuf)) {
1414 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1415 close(fd);
1416 goto cleanup;
1417 }
1418 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1419 /* Now our work is to rename the temp file into the stable file. And
1420 * switch the file descriptor used by the server for append only. */
1421 if (rename(tmpfile,server.appendfilename) == -1) {
1422 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1423 close(fd);
1424 goto cleanup;
1425 }
1426 /* Mission completed... almost */
1427 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1428 if (server.appendfd != -1) {
1429 /* If append only is actually enabled... */
1430 close(server.appendfd);
1431 server.appendfd = fd;
1432 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1433 server.appendseldb = -1; /* Make sure it will issue SELECT */
1434 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1435 } else {
1436 /* If append only is disabled we just generate a dump in this
1437 * format. Why not? */
1438 close(fd);
1439 }
1440 } else if (!bysignal && exitcode != 0) {
1441 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1442 } else {
1443 redisLog(REDIS_WARNING,
1444 "Background append only file rewriting terminated by signal %d",
1445 WTERMSIG(statloc));
1446 }
1447 cleanup:
1448 sdsfree(server.bgrewritebuf);
1449 server.bgrewritebuf = sdsempty();
1450 aofRemoveTempFile(server.bgrewritechildpid);
1451 server.bgrewritechildpid = -1;
1452 }
1453
1454 /* This function is called once a background process of some kind terminates,
1455 * as we want to avoid resizing the hash tables when there is a child in order
1456 * to play well with copy-on-write (otherwise when a resize happens lots of
1457 * memory pages are copied). The goal of this function is to update the ability
1458 * for dict.c to resize the hash tables accordingly to the fact we have o not
1459 * running childs. */
1460 static void updateDictResizePolicy(void) {
1461 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1462 dictEnableResize();
1463 else
1464 dictDisableResize();
1465 }
1466
1467 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1468 int j, loops = server.cronloops++;
1469 REDIS_NOTUSED(eventLoop);
1470 REDIS_NOTUSED(id);
1471 REDIS_NOTUSED(clientData);
1472
1473 /* We take a cached value of the unix time in the global state because
1474 * with virtual memory and aging there is to store the current time
1475 * in objects at every object access, and accuracy is not needed.
1476 * To access a global var is faster than calling time(NULL) */
1477 server.unixtime = time(NULL);
1478 /* We have just 21 bits per object for LRU information.
1479 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1480 *
1481 * When we need to select what object to swap, we compute the minimum
1482 * time distance between the current lruclock and the object last access
1483 * lruclock info. Even if clocks will wrap on overflow, there is
1484 * the interesting property that we are sure that at least
1485 * ABS(A-B) minutes passed between current time and timestamp B.
1486 *
1487 * This is not precise but we don't need at all precision, but just
1488 * something statistically reasonable.
1489 */
1490 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1491
1492 /* We received a SIGTERM, shutting down here in a safe way, as it is
1493 * not ok doing so inside the signal handler. */
1494 if (server.shutdown_asap) {
1495 if (prepareForShutdown() == REDIS_OK) exit(0);
1496 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1497 }
1498
1499 /* Show some info about non-empty databases */
1500 for (j = 0; j < server.dbnum; j++) {
1501 long long size, used, vkeys;
1502
1503 size = dictSlots(server.db[j].dict);
1504 used = dictSize(server.db[j].dict);
1505 vkeys = dictSize(server.db[j].expires);
1506 if (!(loops % 50) && (used || vkeys)) {
1507 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1508 /* dictPrintStats(server.dict); */
1509 }
1510 }
1511
1512 /* We don't want to resize the hash tables while a bacground saving
1513 * is in progress: the saving child is created using fork() that is
1514 * implemented with a copy-on-write semantic in most modern systems, so
1515 * if we resize the HT while there is the saving child at work actually
1516 * a lot of memory movements in the parent will cause a lot of pages
1517 * copied. */
1518 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1519 if (!(loops % 10)) tryResizeHashTables();
1520 if (server.activerehashing) incrementallyRehash();
1521 }
1522
1523 /* Show information about connected clients */
1524 if (!(loops % 50)) {
1525 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1526 listLength(server.clients)-listLength(server.slaves),
1527 listLength(server.slaves),
1528 zmalloc_used_memory());
1529 }
1530
1531 /* Close connections of timedout clients */
1532 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1533 closeTimedoutClients();
1534
1535 /* Check if a background saving or AOF rewrite in progress terminated */
1536 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1537 int statloc;
1538 pid_t pid;
1539
1540 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1541 if (pid == server.bgsavechildpid) {
1542 backgroundSaveDoneHandler(statloc);
1543 } else {
1544 backgroundRewriteDoneHandler(statloc);
1545 }
1546 updateDictResizePolicy();
1547 }
1548 } else {
1549 /* If there is not a background saving in progress check if
1550 * we have to save now */
1551 time_t now = time(NULL);
1552 for (j = 0; j < server.saveparamslen; j++) {
1553 struct saveparam *sp = server.saveparams+j;
1554
1555 if (server.dirty >= sp->changes &&
1556 now-server.lastsave > sp->seconds) {
1557 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1558 sp->changes, sp->seconds);
1559 rdbSaveBackground(server.dbfilename);
1560 break;
1561 }
1562 }
1563 }
1564
1565 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1566 * will use few CPU cycles if there are few expiring keys, otherwise
1567 * it will get more aggressive to avoid that too much memory is used by
1568 * keys that can be removed from the keyspace. */
1569 for (j = 0; j < server.dbnum; j++) {
1570 int expired;
1571 redisDb *db = server.db+j;
1572
1573 /* Continue to expire if at the end of the cycle more than 25%
1574 * of the keys were expired. */
1575 do {
1576 long num = dictSize(db->expires);
1577 time_t now = time(NULL);
1578
1579 expired = 0;
1580 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1581 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1582 while (num--) {
1583 dictEntry *de;
1584 time_t t;
1585
1586 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1587 t = (time_t) dictGetEntryVal(de);
1588 if (now > t) {
1589 sds key = dictGetEntryKey(de);
1590 robj *keyobj = createStringObject(key,sdslen(key));
1591
1592 dbDelete(db,keyobj);
1593 decrRefCount(keyobj);
1594 expired++;
1595 server.stat_expiredkeys++;
1596 }
1597 }
1598 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1599 }
1600
1601 /* Swap a few keys on disk if we are over the memory limit and VM
1602 * is enabled. Try to free objects from the free list first. */
1603 if (vmCanSwapOut()) {
1604 while (server.vm_enabled && zmalloc_used_memory() >
1605 server.vm_max_memory)
1606 {
1607 int retval;
1608
1609 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1610 retval = (server.vm_max_threads == 0) ?
1611 vmSwapOneObjectBlocking() :
1612 vmSwapOneObjectThreaded();
1613 if (retval == REDIS_ERR && !(loops % 300) &&
1614 zmalloc_used_memory() >
1615 (server.vm_max_memory+server.vm_max_memory/10))
1616 {
1617 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1618 }
1619 /* Note that when using threaded I/O we free just one object,
1620 * because anyway when the I/O thread in charge of swapping this
1621 * object out finishes, the handler of completed jobs
1622 * will try to swap more objects if we are still out of memory. */
1623 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1624 }
1625 }
1626
1627 /* Check if we should connect to a MASTER */
1628 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1629 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1630 if (syncWithMaster() == REDIS_OK) {
1631 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1632 if (server.appendonly) rewriteAppendOnlyFileBackground();
1633 }
1634 }
1635 return 100;
1636 }
1637
1638 /* This function gets called every time Redis is entering the
1639 * main loop of the event driven library, that is, before to sleep
1640 * for ready file descriptors. */
1641 static void beforeSleep(struct aeEventLoop *eventLoop) {
1642 REDIS_NOTUSED(eventLoop);
1643
1644 /* Awake clients that got all the swapped keys they requested */
1645 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1646 listIter li;
1647 listNode *ln;
1648
1649 listRewind(server.io_ready_clients,&li);
1650 while((ln = listNext(&li))) {
1651 redisClient *c = ln->value;
1652 struct redisCommand *cmd;
1653
1654 /* Resume the client. */
1655 listDelNode(server.io_ready_clients,ln);
1656 c->flags &= (~REDIS_IO_WAIT);
1657 server.vm_blocked_clients--;
1658 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1659 readQueryFromClient, c);
1660 cmd = lookupCommand(c->argv[0]->ptr);
1661 assert(cmd != NULL);
1662 call(c,cmd);
1663 resetClient(c);
1664 /* There may be more data to process in the input buffer. */
1665 if (c->querybuf && sdslen(c->querybuf) > 0)
1666 processInputBuffer(c);
1667 }
1668 }
1669 /* Write the AOF buffer on disk */
1670 flushAppendOnlyFile();
1671 }
1672
1673 static void createSharedObjects(void) {
1674 int j;
1675
1676 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1677 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1678 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1679 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1680 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1681 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1682 shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n"));
1683 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1684 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1685 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1686 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1687 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1688 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1690 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR no such key\r\n"));
1692 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR syntax error\r\n"));
1694 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1695 "-ERR source and destination objects are the same\r\n"));
1696 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1697 "-ERR index out of range\r\n"));
1698 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1699 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1700 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1701 shared.select0 = createStringObject("select 0\r\n",10);
1702 shared.select1 = createStringObject("select 1\r\n",10);
1703 shared.select2 = createStringObject("select 2\r\n",10);
1704 shared.select3 = createStringObject("select 3\r\n",10);
1705 shared.select4 = createStringObject("select 4\r\n",10);
1706 shared.select5 = createStringObject("select 5\r\n",10);
1707 shared.select6 = createStringObject("select 6\r\n",10);
1708 shared.select7 = createStringObject("select 7\r\n",10);
1709 shared.select8 = createStringObject("select 8\r\n",10);
1710 shared.select9 = createStringObject("select 9\r\n",10);
1711 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1712 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1713 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1714 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1715 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1716 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1717 shared.mbulk3 = createStringObject("*3\r\n",4);
1718 shared.mbulk4 = createStringObject("*4\r\n",4);
1719 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1720 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1721 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1722 }
1723 }
1724
1725 static void appendServerSaveParams(time_t seconds, int changes) {
1726 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1727 server.saveparams[server.saveparamslen].seconds = seconds;
1728 server.saveparams[server.saveparamslen].changes = changes;
1729 server.saveparamslen++;
1730 }
1731
1732 static void resetServerSaveParams() {
1733 zfree(server.saveparams);
1734 server.saveparams = NULL;
1735 server.saveparamslen = 0;
1736 }
1737
1738 static void initServerConfig() {
1739 server.dbnum = REDIS_DEFAULT_DBNUM;
1740 server.port = REDIS_SERVERPORT;
1741 server.verbosity = REDIS_VERBOSE;
1742 server.maxidletime = REDIS_MAXIDLETIME;
1743 server.saveparams = NULL;
1744 server.logfile = NULL; /* NULL = log on standard output */
1745 server.bindaddr = NULL;
1746 server.glueoutputbuf = 1;
1747 server.daemonize = 0;
1748 server.appendonly = 0;
1749 server.appendfsync = APPENDFSYNC_EVERYSEC;
1750 server.no_appendfsync_on_rewrite = 0;
1751 server.lastfsync = time(NULL);
1752 server.appendfd = -1;
1753 server.appendseldb = -1; /* Make sure the first time will not match */
1754 server.pidfile = zstrdup("/var/run/redis.pid");
1755 server.dbfilename = zstrdup("dump.rdb");
1756 server.appendfilename = zstrdup("appendonly.aof");
1757 server.requirepass = NULL;
1758 server.rdbcompression = 1;
1759 server.activerehashing = 1;
1760 server.maxclients = 0;
1761 server.blpop_blocked_clients = 0;
1762 server.maxmemory = 0;
1763 server.vm_enabled = 0;
1764 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1765 server.vm_page_size = 256; /* 256 bytes per page */
1766 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1767 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1768 server.vm_max_threads = 4;
1769 server.vm_blocked_clients = 0;
1770 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1771 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1772 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1773 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
1774 server.shutdown_asap = 0;
1775
1776 resetServerSaveParams();
1777
1778 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1779 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1780 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1781 /* Replication related */
1782 server.isslave = 0;
1783 server.masterauth = NULL;
1784 server.masterhost = NULL;
1785 server.masterport = 6379;
1786 server.master = NULL;
1787 server.replstate = REDIS_REPL_NONE;
1788
1789 /* Double constants initialization */
1790 R_Zero = 0.0;
1791 R_PosInf = 1.0/R_Zero;
1792 R_NegInf = -1.0/R_Zero;
1793 R_Nan = R_Zero/R_Zero;
1794 }
1795
1796 static void initServer() {
1797 int j;
1798
1799 signal(SIGHUP, SIG_IGN);
1800 signal(SIGPIPE, SIG_IGN);
1801 setupSigSegvAction();
1802
1803 server.devnull = fopen("/dev/null","w");
1804 if (server.devnull == NULL) {
1805 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1806 exit(1);
1807 }
1808 server.clients = listCreate();
1809 server.slaves = listCreate();
1810 server.monitors = listCreate();
1811 server.objfreelist = listCreate();
1812 createSharedObjects();
1813 server.el = aeCreateEventLoop();
1814 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1815 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1816 if (server.fd == -1) {
1817 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1818 exit(1);
1819 }
1820 for (j = 0; j < server.dbnum; j++) {
1821 server.db[j].dict = dictCreate(&dbDictType,NULL);
1822 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1823 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1824 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1825 if (server.vm_enabled)
1826 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1827 server.db[j].id = j;
1828 }
1829 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1830 server.pubsub_patterns = listCreate();
1831 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1832 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1833 server.cronloops = 0;
1834 server.bgsavechildpid = -1;
1835 server.bgrewritechildpid = -1;
1836 server.bgrewritebuf = sdsempty();
1837 server.aofbuf = sdsempty();
1838 server.lastsave = time(NULL);
1839 server.dirty = 0;
1840 server.stat_numcommands = 0;
1841 server.stat_numconnections = 0;
1842 server.stat_expiredkeys = 0;
1843 server.stat_starttime = time(NULL);
1844 server.unixtime = time(NULL);
1845 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1846 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1847 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1848
1849 if (server.appendonly) {
1850 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1851 if (server.appendfd == -1) {
1852 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1853 strerror(errno));
1854 exit(1);
1855 }
1856 }
1857
1858 if (server.vm_enabled) vmInit();
1859 }
1860
1861 /* Empty the whole database */
1862 static long long emptyDb() {
1863 int j;
1864 long long removed = 0;
1865
1866 for (j = 0; j < server.dbnum; j++) {
1867 removed += dictSize(server.db[j].dict);
1868 dictEmpty(server.db[j].dict);
1869 dictEmpty(server.db[j].expires);
1870 }
1871 return removed;
1872 }
1873
/* Convert a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1879
1880 /* I agree, this is a very rudimental way to load a configuration...
1881 will improve later if the config gets more complex */
1882 static void loadServerConfig(char *filename) {
1883 FILE *fp;
1884 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1885 int linenum = 0;
1886 sds line = NULL;
1887
1888 if (filename[0] == '-' && filename[1] == '\0')
1889 fp = stdin;
1890 else {
1891 if ((fp = fopen(filename,"r")) == NULL) {
1892 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1893 exit(1);
1894 }
1895 }
1896
1897 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1898 sds *argv;
1899 int argc, j;
1900
1901 linenum++;
1902 line = sdsnew(buf);
1903 line = sdstrim(line," \t\r\n");
1904
1905 /* Skip comments and blank lines*/
1906 if (line[0] == '#' || line[0] == '\0') {
1907 sdsfree(line);
1908 continue;
1909 }
1910
1911 /* Split into arguments */
1912 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1913 sdstolower(argv[0]);
1914
1915 /* Execute config directives */
1916 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1917 server.maxidletime = atoi(argv[1]);
1918 if (server.maxidletime < 0) {
1919 err = "Invalid timeout value"; goto loaderr;
1920 }
1921 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1922 server.port = atoi(argv[1]);
1923 if (server.port < 1 || server.port > 65535) {
1924 err = "Invalid port"; goto loaderr;
1925 }
1926 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1927 server.bindaddr = zstrdup(argv[1]);
1928 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1929 int seconds = atoi(argv[1]);
1930 int changes = atoi(argv[2]);
1931 if (seconds < 1 || changes < 0) {
1932 err = "Invalid save parameters"; goto loaderr;
1933 }
1934 appendServerSaveParams(seconds,changes);
1935 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1936 if (chdir(argv[1]) == -1) {
1937 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1938 argv[1], strerror(errno));
1939 exit(1);
1940 }
1941 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1942 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1943 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1944 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1945 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1946 else {
1947 err = "Invalid log level. Must be one of debug, notice, warning";
1948 goto loaderr;
1949 }
1950 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1951 FILE *logfp;
1952
1953 server.logfile = zstrdup(argv[1]);
1954 if (!strcasecmp(server.logfile,"stdout")) {
1955 zfree(server.logfile);
1956 server.logfile = NULL;
1957 }
1958 if (server.logfile) {
1959 /* Test if we are able to open the file. The server will not
1960 * be able to abort just for this problem later... */
1961 logfp = fopen(server.logfile,"a");
1962 if (logfp == NULL) {
1963 err = sdscatprintf(sdsempty(),
1964 "Can't open the log file: %s", strerror(errno));
1965 goto loaderr;
1966 }
1967 fclose(logfp);
1968 }
1969 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1970 server.dbnum = atoi(argv[1]);
1971 if (server.dbnum < 1) {
1972 err = "Invalid number of databases"; goto loaderr;
1973 }
1974 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1975 loadServerConfig(argv[1]);
1976 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1977 server.maxclients = atoi(argv[1]);
1978 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1979 server.maxmemory = memtoll(argv[1],NULL);
1980 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1981 server.masterhost = sdsnew(argv[1]);
1982 server.masterport = atoi(argv[2]);
1983 server.replstate = REDIS_REPL_CONNECT;
1984 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1985 server.masterauth = zstrdup(argv[1]);
1986 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1987 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1988 err = "argument must be 'yes' or 'no'"; goto loaderr;
1989 }
1990 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1991 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1992 err = "argument must be 'yes' or 'no'"; goto loaderr;
1993 }
1994 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1995 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1996 err = "argument must be 'yes' or 'no'"; goto loaderr;
1997 }
1998 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1999 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
2000 err = "argument must be 'yes' or 'no'"; goto loaderr;
2001 }
2002 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2003 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2004 err = "argument must be 'yes' or 'no'"; goto loaderr;
2005 }
2006 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2007 zfree(server.appendfilename);
2008 server.appendfilename = zstrdup(argv[1]);
2009 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2010 && argc == 2) {
2011 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2012 err = "argument must be 'yes' or 'no'"; goto loaderr;
2013 }
2014 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
2015 if (!strcasecmp(argv[1],"no")) {
2016 server.appendfsync = APPENDFSYNC_NO;
2017 } else if (!strcasecmp(argv[1],"always")) {
2018 server.appendfsync = APPENDFSYNC_ALWAYS;
2019 } else if (!strcasecmp(argv[1],"everysec")) {
2020 server.appendfsync = APPENDFSYNC_EVERYSEC;
2021 } else {
2022 err = "argument must be 'no', 'always' or 'everysec'";
2023 goto loaderr;
2024 }
2025 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2026 server.requirepass = zstrdup(argv[1]);
2027 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2028 zfree(server.pidfile);
2029 server.pidfile = zstrdup(argv[1]);
2030 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2031 zfree(server.dbfilename);
2032 server.dbfilename = zstrdup(argv[1]);
2033 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2034 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2035 err = "argument must be 'yes' or 'no'"; goto loaderr;
2036 }
2037 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2038 zfree(server.vm_swap_file);
2039 server.vm_swap_file = zstrdup(argv[1]);
2040 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2041 server.vm_max_memory = memtoll(argv[1],NULL);
2042 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2043 server.vm_page_size = memtoll(argv[1], NULL);
2044 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2045 server.vm_pages = memtoll(argv[1], NULL);
2046 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2047 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2048 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2049 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2050 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2051 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2052 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2053 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2054 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2055 server.list_max_ziplist_value = memtoll(argv[1], NULL);
2056 } else {
2057 err = "Bad directive or wrong number of arguments"; goto loaderr;
2058 }
2059 for (j = 0; j < argc; j++)
2060 sdsfree(argv[j]);
2061 zfree(argv);
2062 sdsfree(line);
2063 }
2064 if (fp != stdin) fclose(fp);
2065 return;
2066
2067 loaderr:
2068 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2069 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2070 fprintf(stderr, ">>> '%s'\n", line);
2071 fprintf(stderr, "%s\n", err);
2072 exit(1);
2073 }
2074
2075 static void freeClientArgv(redisClient *c) {
2076 int j;
2077
2078 for (j = 0; j < c->argc; j++)
2079 decrRefCount(c->argv[j]);
2080 for (j = 0; j < c->mbargc; j++)
2081 decrRefCount(c->mbargv[j]);
2082 c->argc = 0;
2083 c->mbargc = 0;
2084 }
2085
2086 static void freeClient(redisClient *c) {
2087 listNode *ln;
2088
2089 /* Note that if the client we are freeing is blocked into a blocking
2090 * call, we have to set querybuf to NULL *before* to call
2091 * unblockClientWaitingData() to avoid processInputBuffer() will get
2092 * called. Also it is important to remove the file events after
2093 * this, because this call adds the READABLE event. */
2094 sdsfree(c->querybuf);
2095 c->querybuf = NULL;
2096 if (c->flags & REDIS_BLOCKED)
2097 unblockClientWaitingData(c);
2098
2099 /* UNWATCH all the keys */
2100 unwatchAllKeys(c);
2101 listRelease(c->watched_keys);
2102 /* Unsubscribe from all the pubsub channels */
2103 pubsubUnsubscribeAllChannels(c,0);
2104 pubsubUnsubscribeAllPatterns(c,0);
2105 dictRelease(c->pubsub_channels);
2106 listRelease(c->pubsub_patterns);
2107 /* Obvious cleanup */
2108 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2109 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2110 listRelease(c->reply);
2111 freeClientArgv(c);
2112 close(c->fd);
2113 /* Remove from the list of clients */
2114 ln = listSearchKey(server.clients,c);
2115 redisAssert(ln != NULL);
2116 listDelNode(server.clients,ln);
2117 /* Remove from the list of clients that are now ready to be restarted
2118 * after waiting for swapped keys */
2119 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2120 ln = listSearchKey(server.io_ready_clients,c);
2121 if (ln) {
2122 listDelNode(server.io_ready_clients,ln);
2123 server.vm_blocked_clients--;
2124 }
2125 }
2126 /* Remove from the list of clients waiting for swapped keys */
2127 while (server.vm_enabled && listLength(c->io_keys)) {
2128 ln = listFirst(c->io_keys);
2129 dontWaitForSwappedKey(c,ln->value);
2130 }
2131 listRelease(c->io_keys);
2132 /* Master/slave cleanup */
2133 if (c->flags & REDIS_SLAVE) {
2134 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2135 close(c->repldbfd);
2136 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2137 ln = listSearchKey(l,c);
2138 redisAssert(ln != NULL);
2139 listDelNode(l,ln);
2140 }
2141 if (c->flags & REDIS_MASTER) {
2142 server.master = NULL;
2143 server.replstate = REDIS_REPL_CONNECT;
2144 }
2145 /* Release memory */
2146 zfree(c->argv);
2147 zfree(c->mbargv);
2148 freeClientMultiState(c);
2149 zfree(c);
2150 }
2151
2152 #define GLUEREPLY_UP_TO (1024)
2153 static void glueReplyBuffersIfNeeded(redisClient *c) {
2154 int copylen = 0;
2155 char buf[GLUEREPLY_UP_TO];
2156 listNode *ln;
2157 listIter li;
2158 robj *o;
2159
2160 listRewind(c->reply,&li);
2161 while((ln = listNext(&li))) {
2162 int objlen;
2163
2164 o = ln->value;
2165 objlen = sdslen(o->ptr);
2166 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2167 memcpy(buf+copylen,o->ptr,objlen);
2168 copylen += objlen;
2169 listDelNode(c->reply,ln);
2170 } else {
2171 if (copylen == 0) return;
2172 break;
2173 }
2174 }
2175 /* Now the output buffer is empty, add the new single element */
2176 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2177 listAddNodeHead(c->reply,o);
2178 }
2179
2180 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2181 redisClient *c = privdata;
2182 int nwritten = 0, totwritten = 0, objlen;
2183 robj *o;
2184 REDIS_NOTUSED(el);
2185 REDIS_NOTUSED(mask);
2186
2187 /* Use writev() if we have enough buffers to send */
2188 if (!server.glueoutputbuf &&
2189 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2190 !(c->flags & REDIS_MASTER))
2191 {
2192 sendReplyToClientWritev(el, fd, privdata, mask);
2193 return;
2194 }
2195
2196 while(listLength(c->reply)) {
2197 if (server.glueoutputbuf && listLength(c->reply) > 1)
2198 glueReplyBuffersIfNeeded(c);
2199
2200 o = listNodeValue(listFirst(c->reply));
2201 objlen = sdslen(o->ptr);
2202
2203 if (objlen == 0) {
2204 listDelNode(c->reply,listFirst(c->reply));
2205 continue;
2206 }
2207
2208 if (c->flags & REDIS_MASTER) {
2209 /* Don't reply to a master */
2210 nwritten = objlen - c->sentlen;
2211 } else {
2212 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2213 if (nwritten <= 0) break;
2214 }
2215 c->sentlen += nwritten;
2216 totwritten += nwritten;
2217 /* If we fully sent the object on head go to the next one */
2218 if (c->sentlen == objlen) {
2219 listDelNode(c->reply,listFirst(c->reply));
2220 c->sentlen = 0;
2221 }
2222 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2223 * bytes, in a single threaded server it's a good idea to serve
2224 * other clients as well, even if a very large request comes from
2225 * super fast link that is always able to accept data (in real world
2226 * scenario think about 'KEYS *' against the loopback interfae) */
2227 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2228 }
2229 if (nwritten == -1) {
2230 if (errno == EAGAIN) {
2231 nwritten = 0;
2232 } else {
2233 redisLog(REDIS_VERBOSE,
2234 "Error writing to client: %s", strerror(errno));
2235 freeClient(c);
2236 return;
2237 }
2238 }
2239 if (totwritten > 0) c->lastinteraction = time(NULL);
2240 if (listLength(c->reply) == 0) {
2241 c->sentlen = 0;
2242 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2243 }
2244 }
2245
2246 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2247 {
2248 redisClient *c = privdata;
2249 int nwritten = 0, totwritten = 0, objlen, willwrite;
2250 robj *o;
2251 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2252 int offset, ion = 0;
2253 REDIS_NOTUSED(el);
2254 REDIS_NOTUSED(mask);
2255
2256 listNode *node;
2257 while (listLength(c->reply)) {
2258 offset = c->sentlen;
2259 ion = 0;
2260 willwrite = 0;
2261
2262 /* fill-in the iov[] array */
2263 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2264 o = listNodeValue(node);
2265 objlen = sdslen(o->ptr);
2266
2267 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2268 break;
2269
2270 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2271 break; /* no more iovecs */
2272
2273 iov[ion].iov_base = ((char*)o->ptr) + offset;
2274 iov[ion].iov_len = objlen - offset;
2275 willwrite += objlen - offset;
2276 offset = 0; /* just for the first item */
2277 ion++;
2278 }
2279
2280 if(willwrite == 0)
2281 break;
2282
2283 /* write all collected blocks at once */
2284 if((nwritten = writev(fd, iov, ion)) < 0) {
2285 if (errno != EAGAIN) {
2286 redisLog(REDIS_VERBOSE,
2287 "Error writing to client: %s", strerror(errno));
2288 freeClient(c);
2289 return;
2290 }
2291 break;
2292 }
2293
2294 totwritten += nwritten;
2295 offset = c->sentlen;
2296
2297 /* remove written robjs from c->reply */
2298 while (nwritten && listLength(c->reply)) {
2299 o = listNodeValue(listFirst(c->reply));
2300 objlen = sdslen(o->ptr);
2301
2302 if(nwritten >= objlen - offset) {
2303 listDelNode(c->reply, listFirst(c->reply));
2304 nwritten -= objlen - offset;
2305 c->sentlen = 0;
2306 } else {
2307 /* partial write */
2308 c->sentlen += nwritten;
2309 break;
2310 }
2311 offset = 0;
2312 }
2313 }
2314
2315 if (totwritten > 0)
2316 c->lastinteraction = time(NULL);
2317
2318 if (listLength(c->reply) == 0) {
2319 c->sentlen = 0;
2320 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2321 }
2322 }
2323
2324 static int qsortRedisCommands(const void *r1, const void *r2) {
2325 return strcasecmp(
2326 ((struct redisCommand*)r1)->name,
2327 ((struct redisCommand*)r2)->name);
2328 }
2329
2330 static void sortCommandTable() {
2331 /* Copy and sort the read-only version of the command table */
2332 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2333 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2334 qsort(commandTable,
2335 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2336 sizeof(struct redisCommand),qsortRedisCommands);
2337 }
2338
2339 static struct redisCommand *lookupCommand(char *name) {
2340 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2341 return bsearch(
2342 &tmp,
2343 commandTable,
2344 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2345 sizeof(struct redisCommand),
2346 qsortRedisCommands);
2347 }
2348
2349 /* resetClient prepare the client to process the next command */
2350 static void resetClient(redisClient *c) {
2351 freeClientArgv(c);
2352 c->bulklen = -1;
2353 c->multibulk = 0;
2354 }
2355
2356 /* Call() is the core of Redis execution of a command */
2357 static void call(redisClient *c, struct redisCommand *cmd) {
2358 long long dirty;
2359
2360 dirty = server.dirty;
2361 cmd->proc(c);
2362 dirty = server.dirty-dirty;
2363
2364 if (server.appendonly && dirty)
2365 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2366 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2367 listLength(server.slaves))
2368 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2369 if (listLength(server.monitors))
2370 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2371 server.stat_numcommands++;
2372 }
2373
2374 /* If this function gets called we already read a whole
2375 * command, arguments are in the client argv/argc fields.
2376 * processCommand() executes the command or prepares the
2377 * server for a bulk read from the client.
2378 *
2379 * If 1 is returned the client is still alive and valid and
2380 * other operations can be performed by the caller. Otherwise
2381 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2382 static int processCommand(redisClient *c) {
2383 struct redisCommand *cmd;
2384
2385 /* Free some memory if needed (maxmemory setting) */
2386 if (server.maxmemory) freeMemoryIfNeeded();
2387
2388 /* Handle the multi bulk command type. This is an alternative protocol
2389 * supported by Redis in order to receive commands that are composed of
2390 * multiple binary-safe "bulk" arguments. The latency of processing is
2391 * a bit higher but this allows things like multi-sets, so if this
2392 * protocol is used only for MSET and similar commands this is a big win. */
2393 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2394 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2395 if (c->multibulk <= 0) {
2396 resetClient(c);
2397 return 1;
2398 } else {
2399 decrRefCount(c->argv[c->argc-1]);
2400 c->argc--;
2401 return 1;
2402 }
2403 } else if (c->multibulk) {
2404 if (c->bulklen == -1) {
2405 if (((char*)c->argv[0]->ptr)[0] != '$') {
2406 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2407 resetClient(c);
2408 return 1;
2409 } else {
2410 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2411 decrRefCount(c->argv[0]);
2412 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2413 c->argc--;
2414 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2415 resetClient(c);
2416 return 1;
2417 }
2418 c->argc--;
2419 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2420 return 1;
2421 }
2422 } else {
2423 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2424 c->mbargv[c->mbargc] = c->argv[0];
2425 c->mbargc++;
2426 c->argc--;
2427 c->multibulk--;
2428 if (c->multibulk == 0) {
2429 robj **auxargv;
2430 int auxargc;
2431
2432 /* Here we need to swap the multi-bulk argc/argv with the
2433 * normal argc/argv of the client structure. */
2434 auxargv = c->argv;
2435 c->argv = c->mbargv;
2436 c->mbargv = auxargv;
2437
2438 auxargc = c->argc;
2439 c->argc = c->mbargc;
2440 c->mbargc = auxargc;
2441
2442 /* We need to set bulklen to something different than -1
2443 * in order for the code below to process the command without
2444 * to try to read the last argument of a bulk command as
2445 * a special argument. */
2446 c->bulklen = 0;
2447 /* continue below and process the command */
2448 } else {
2449 c->bulklen = -1;
2450 return 1;
2451 }
2452 }
2453 }
2454 /* -- end of multi bulk commands processing -- */
2455
2456 /* The QUIT command is handled as a special case. Normal command
2457 * procs are unable to close the client connection safely */
2458 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2459 freeClient(c);
2460 return 0;
2461 }
2462
2463 /* Now lookup the command and check ASAP about trivial error conditions
2464 * such wrong arity, bad command name and so forth. */
2465 cmd = lookupCommand(c->argv[0]->ptr);
2466 if (!cmd) {
2467 addReplySds(c,
2468 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2469 (char*)c->argv[0]->ptr));
2470 resetClient(c);
2471 return 1;
2472 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2473 (c->argc < -cmd->arity)) {
2474 addReplySds(c,
2475 sdscatprintf(sdsempty(),
2476 "-ERR wrong number of arguments for '%s' command\r\n",
2477 cmd->name));
2478 resetClient(c);
2479 return 1;
2480 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2481 /* This is a bulk command, we have to read the last argument yet. */
2482 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2483
2484 decrRefCount(c->argv[c->argc-1]);
2485 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2486 c->argc--;
2487 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2488 resetClient(c);
2489 return 1;
2490 }
2491 c->argc--;
2492 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2493 /* It is possible that the bulk read is already in the
2494 * buffer. Check this condition and handle it accordingly.
2495 * This is just a fast path, alternative to call processInputBuffer().
2496 * It's a good idea since the code is small and this condition
2497 * happens most of the times. */
2498 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2499 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2500 c->argc++;
2501 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2502 } else {
2503 /* Otherwise return... there is to read the last argument
2504 * from the socket. */
2505 return 1;
2506 }
2507 }
2508 /* Let's try to encode the bulk object to save space. */
2509 if (cmd->flags & REDIS_CMD_BULK)
2510 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2511
2512 /* Check if the user is authenticated */
2513 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2514 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2515 resetClient(c);
2516 return 1;
2517 }
2518
2519 /* Handle the maxmemory directive */
2520 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2521 zmalloc_used_memory() > server.maxmemory)
2522 {
2523 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2524 resetClient(c);
2525 return 1;
2526 }
2527
2528 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2529 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2530 &&
2531 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2532 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2533 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2534 resetClient(c);
2535 return 1;
2536 }
2537
2538 /* Exec the command */
2539 if (c->flags & REDIS_MULTI &&
2540 cmd->proc != execCommand && cmd->proc != discardCommand &&
2541 cmd->proc != multiCommand && cmd->proc != watchCommand)
2542 {
2543 queueMultiCommand(c,cmd);
2544 addReply(c,shared.queued);
2545 } else {
2546 if (server.vm_enabled && server.vm_max_threads > 0 &&
2547 blockClientOnSwappedKeys(c,cmd)) return 1;
2548 call(c,cmd);
2549 }
2550
2551 /* Prepare the client for the next command */
2552 resetClient(c);
2553 return 1;
2554 }
2555
2556 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2557 listNode *ln;
2558 listIter li;
2559 int outc = 0, j;
2560 robj **outv;
2561 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2562 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2563 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2564 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2565 robj *lenobj;
2566
2567 if (argc <= REDIS_STATIC_ARGS) {
2568 outv = static_outv;
2569 } else {
2570 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2571 }
2572
2573 lenobj = createObject(REDIS_STRING,
2574 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2575 lenobj->refcount = 0;
2576 outv[outc++] = lenobj;
2577 for (j = 0; j < argc; j++) {
2578 lenobj = createObject(REDIS_STRING,
2579 sdscatprintf(sdsempty(),"$%lu\r\n",
2580 (unsigned long) stringObjectLen(argv[j])));
2581 lenobj->refcount = 0;
2582 outv[outc++] = lenobj;
2583 outv[outc++] = argv[j];
2584 outv[outc++] = shared.crlf;
2585 }
2586
2587 /* Increment all the refcounts at start and decrement at end in order to
2588 * be sure to free objects if there is no slave in a replication state
2589 * able to be feed with commands */
2590 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2591 listRewind(slaves,&li);
2592 while((ln = listNext(&li))) {
2593 redisClient *slave = ln->value;
2594
2595 /* Don't feed slaves that are still waiting for BGSAVE to start */
2596 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2597
2598 /* Feed all the other slaves, MONITORs and so on */
2599 if (slave->slaveseldb != dictid) {
2600 robj *selectcmd;
2601
2602 switch(dictid) {
2603 case 0: selectcmd = shared.select0; break;
2604 case 1: selectcmd = shared.select1; break;
2605 case 2: selectcmd = shared.select2; break;
2606 case 3: selectcmd = shared.select3; break;
2607 case 4: selectcmd = shared.select4; break;
2608 case 5: selectcmd = shared.select5; break;
2609 case 6: selectcmd = shared.select6; break;
2610 case 7: selectcmd = shared.select7; break;
2611 case 8: selectcmd = shared.select8; break;
2612 case 9: selectcmd = shared.select9; break;
2613 default:
2614 selectcmd = createObject(REDIS_STRING,
2615 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2616 selectcmd->refcount = 0;
2617 break;
2618 }
2619 addReply(slave,selectcmd);
2620 slave->slaveseldb = dictid;
2621 }
2622 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2623 }
2624 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2625 if (outv != static_outv) zfree(outv);
2626 }
2627
2628 static sds sdscatrepr(sds s, char *p, size_t len) {
2629 s = sdscatlen(s,"\"",1);
2630 while(len--) {
2631 switch(*p) {
2632 case '\\':
2633 case '"':
2634 s = sdscatprintf(s,"\\%c",*p);
2635 break;
2636 case '\n': s = sdscatlen(s,"\\n",1); break;
2637 case '\r': s = sdscatlen(s,"\\r",1); break;
2638 case '\t': s = sdscatlen(s,"\\t",1); break;
2639 case '\a': s = sdscatlen(s,"\\a",1); break;
2640 case '\b': s = sdscatlen(s,"\\b",1); break;
2641 default:
2642 if (isprint(*p))
2643 s = sdscatprintf(s,"%c",*p);
2644 else
2645 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2646 break;
2647 }
2648 p++;
2649 }
2650 return sdscatlen(s,"\"",1);
2651 }
2652
2653 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2654 listNode *ln;
2655 listIter li;
2656 int j;
2657 sds cmdrepr = sdsnew("+");
2658 robj *cmdobj;
2659 struct timeval tv;
2660
2661 gettimeofday(&tv,NULL);
2662 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2663 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2664
2665 for (j = 0; j < argc; j++) {
2666 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2667 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2668 } else {
2669 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2670 sdslen(argv[j]->ptr));
2671 }
2672 if (j != argc-1)
2673 cmdrepr = sdscatlen(cmdrepr," ",1);
2674 }
2675 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2676 cmdobj = createObject(REDIS_STRING,cmdrepr);
2677
2678 listRewind(monitors,&li);
2679 while((ln = listNext(&li))) {
2680 redisClient *monitor = ln->value;
2681 addReply(monitor,cmdobj);
2682 }
2683 decrRefCount(cmdobj);
2684 }
2685
2686 static void processInputBuffer(redisClient *c) {
2687 again:
2688 /* Before to process the input buffer, make sure the client is not
2689 * waitig for a blocking operation such as BLPOP. Note that the first
2690 * iteration the client is never blocked, otherwise the processInputBuffer
2691 * would not be called at all, but after the execution of the first commands
2692 * in the input buffer the client may be blocked, and the "goto again"
2693 * will try to reiterate. The following line will make it return asap. */
2694 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2695 if (c->bulklen == -1) {
2696 /* Read the first line of the query */
2697 char *p = strchr(c->querybuf,'\n');
2698 size_t querylen;
2699
2700 if (p) {
2701 sds query, *argv;
2702 int argc, j;
2703
2704 query = c->querybuf;
2705 c->querybuf = sdsempty();
2706 querylen = 1+(p-(query));
2707 if (sdslen(query) > querylen) {
2708 /* leave data after the first line of the query in the buffer */
2709 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2710 }
2711 *p = '\0'; /* remove "\n" */
2712 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2713 sdsupdatelen(query);
2714
2715 /* Now we can split the query in arguments */
2716 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2717 sdsfree(query);
2718
2719 if (c->argv) zfree(c->argv);
2720 c->argv = zmalloc(sizeof(robj*)*argc);
2721
2722 for (j = 0; j < argc; j++) {
2723 if (sdslen(argv[j])) {
2724 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2725 c->argc++;
2726 } else {
2727 sdsfree(argv[j]);
2728 }
2729 }
2730 zfree(argv);
2731 if (c->argc) {
2732 /* Execute the command. If the client is still valid
2733 * after processCommand() return and there is something
2734 * on the query buffer try to process the next command. */
2735 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2736 } else {
2737 /* Nothing to process, argc == 0. Just process the query
2738 * buffer if it's not empty or return to the caller */
2739 if (sdslen(c->querybuf)) goto again;
2740 }
2741 return;
2742 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2743 redisLog(REDIS_VERBOSE, "Client protocol error");
2744 freeClient(c);
2745 return;
2746 }
2747 } else {
2748 /* Bulk read handling. Note that if we are at this point
2749 the client already sent a command terminated with a newline,
2750 we are reading the bulk data that is actually the last
2751 argument of the command. */
2752 int qbl = sdslen(c->querybuf);
2753
2754 if (c->bulklen <= qbl) {
2755 /* Copy everything but the final CRLF as final argument */
2756 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2757 c->argc++;
2758 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2759 /* Process the command. If the client is still valid after
2760 * the processing and there is more data in the buffer
2761 * try to parse it. */
2762 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2763 return;
2764 }
2765 }
2766 }
2767
2768 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2769 redisClient *c = (redisClient*) privdata;
2770 char buf[REDIS_IOBUF_LEN];
2771 int nread;
2772 REDIS_NOTUSED(el);
2773 REDIS_NOTUSED(mask);
2774
2775 nread = read(fd, buf, REDIS_IOBUF_LEN);
2776 if (nread == -1) {
2777 if (errno == EAGAIN) {
2778 nread = 0;
2779 } else {
2780 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2781 freeClient(c);
2782 return;
2783 }
2784 } else if (nread == 0) {
2785 redisLog(REDIS_VERBOSE, "Client closed connection");
2786 freeClient(c);
2787 return;
2788 }
2789 if (nread) {
2790 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2791 c->lastinteraction = time(NULL);
2792 } else {
2793 return;
2794 }
2795 processInputBuffer(c);
2796 }
2797
2798 static int selectDb(redisClient *c, int id) {
2799 if (id < 0 || id >= server.dbnum)
2800 return REDIS_ERR;
2801 c->db = &server.db[id];
2802 return REDIS_OK;
2803 }
2804
2805 static void *dupClientReplyValue(void *o) {
2806 incrRefCount((robj*)o);
2807 return o;
2808 }
2809
2810 static int listMatchObjects(void *a, void *b) {
2811 return equalStringObjects(a,b);
2812 }
2813
2814 static redisClient *createClient(int fd) {
2815 redisClient *c = zmalloc(sizeof(*c));
2816
2817 anetNonBlock(NULL,fd);
2818 anetTcpNoDelay(NULL,fd);
2819 if (!c) return NULL;
2820 selectDb(c,0);
2821 c->fd = fd;
2822 c->querybuf = sdsempty();
2823 c->argc = 0;
2824 c->argv = NULL;
2825 c->bulklen = -1;
2826 c->multibulk = 0;
2827 c->mbargc = 0;
2828 c->mbargv = NULL;
2829 c->sentlen = 0;
2830 c->flags = 0;
2831 c->lastinteraction = time(NULL);
2832 c->authenticated = 0;
2833 c->replstate = REDIS_REPL_NONE;
2834 c->reply = listCreate();
2835 listSetFreeMethod(c->reply,decrRefCount);
2836 listSetDupMethod(c->reply,dupClientReplyValue);
2837 c->blocking_keys = NULL;
2838 c->blocking_keys_num = 0;
2839 c->io_keys = listCreate();
2840 c->watched_keys = listCreate();
2841 listSetFreeMethod(c->io_keys,decrRefCount);
2842 c->pubsub_channels = dictCreate(&setDictType,NULL);
2843 c->pubsub_patterns = listCreate();
2844 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2845 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2846 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2847 readQueryFromClient, c) == AE_ERR) {
2848 freeClient(c);
2849 return NULL;
2850 }
2851 listAddNodeTail(server.clients,c);
2852 initClientMultiState(c);
2853 return c;
2854 }
2855
2856 static void addReply(redisClient *c, robj *obj) {
2857 if (listLength(c->reply) == 0 &&
2858 (c->replstate == REDIS_REPL_NONE ||
2859 c->replstate == REDIS_REPL_ONLINE) &&
2860 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2861 sendReplyToClient, c) == AE_ERR) return;
2862
2863 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2864 obj = dupStringObject(obj);
2865 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2866 }
2867 listAddNodeTail(c->reply,getDecodedObject(obj));
2868 }
2869
2870 static void addReplySds(redisClient *c, sds s) {
2871 robj *o = createObject(REDIS_STRING,s);
2872 addReply(c,o);
2873 decrRefCount(o);
2874 }
2875
2876 static void addReplyDouble(redisClient *c, double d) {
2877 char buf[128];
2878
2879 snprintf(buf,sizeof(buf),"%.17g",d);
2880 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2881 (unsigned long) strlen(buf),buf));
2882 }
2883
2884 static void addReplyLongLong(redisClient *c, long long ll) {
2885 char buf[128];
2886 size_t len;
2887
2888 if (ll == 0) {
2889 addReply(c,shared.czero);
2890 return;
2891 } else if (ll == 1) {
2892 addReply(c,shared.cone);
2893 return;
2894 }
2895 buf[0] = ':';
2896 len = ll2string(buf+1,sizeof(buf)-1,ll);
2897 buf[len+1] = '\r';
2898 buf[len+2] = '\n';
2899 addReplySds(c,sdsnewlen(buf,len+3));
2900 }
2901
2902 static void addReplyUlong(redisClient *c, unsigned long ul) {
2903 char buf[128];
2904 size_t len;
2905
2906 if (ul == 0) {
2907 addReply(c,shared.czero);
2908 return;
2909 } else if (ul == 1) {
2910 addReply(c,shared.cone);
2911 return;
2912 }
2913 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2914 addReplySds(c,sdsnewlen(buf,len));
2915 }
2916
2917 static void addReplyBulkLen(redisClient *c, robj *obj) {
2918 size_t len, intlen;
2919 char buf[128];
2920
2921 if (obj->encoding == REDIS_ENCODING_RAW) {
2922 len = sdslen(obj->ptr);
2923 } else {
2924 long n = (long)obj->ptr;
2925
2926 /* Compute how many bytes will take this integer as a radix 10 string */
2927 len = 1;
2928 if (n < 0) {
2929 len++;
2930 n = -n;
2931 }
2932 while((n = n/10) != 0) {
2933 len++;
2934 }
2935 }
2936 buf[0] = '$';
2937 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2938 buf[intlen+1] = '\r';
2939 buf[intlen+2] = '\n';
2940 addReplySds(c,sdsnewlen(buf,intlen+3));
2941 }
2942
2943 static void addReplyBulk(redisClient *c, robj *obj) {
2944 addReplyBulkLen(c,obj);
2945 addReply(c,obj);
2946 addReply(c,shared.crlf);
2947 }
2948
2949 static void addReplyBulkSds(redisClient *c, sds s) {
2950 robj *o = createStringObject(s, sdslen(s));
2951 addReplyBulk(c,o);
2952 decrRefCount(o);
2953 }
2954
2955 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2956 static void addReplyBulkCString(redisClient *c, char *s) {
2957 if (s == NULL) {
2958 addReply(c,shared.nullbulk);
2959 } else {
2960 robj *o = createStringObject(s,strlen(s));
2961 addReplyBulk(c,o);
2962 decrRefCount(o);
2963 }
2964 }
2965
2966 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2967 int cport, cfd;
2968 char cip[128];
2969 redisClient *c;
2970 REDIS_NOTUSED(el);
2971 REDIS_NOTUSED(mask);
2972 REDIS_NOTUSED(privdata);
2973
2974 cfd = anetAccept(server.neterr, fd, cip, &cport);
2975 if (cfd == AE_ERR) {
2976 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2977 return;
2978 }
2979 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2980 if ((c = createClient(cfd)) == NULL) {
2981 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2982 close(cfd); /* May be already closed, just ingore errors */
2983 return;
2984 }
2985 /* If maxclient directive is set and this is one client more... close the
2986 * connection. Note that we create the client instead to check before
2987 * for this condition, since now the socket is already set in nonblocking
2988 * mode and we can send an error for free using the Kernel I/O */
2989 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2990 char *err = "-ERR max number of clients reached\r\n";
2991
2992 /* That's a best effort error message, don't check write errors */
2993 if (write(c->fd,err,strlen(err)) == -1) {
2994 /* Nothing to do, Just to avoid the warning... */
2995 }
2996 freeClient(c);
2997 return;
2998 }
2999 server.stat_numconnections++;
3000 }
3001
3002 /* ======================= Redis objects implementation ===================== */
3003
3004 static robj *createObject(int type, void *ptr) {
3005 robj *o;
3006
3007 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3008 if (listLength(server.objfreelist)) {
3009 listNode *head = listFirst(server.objfreelist);
3010 o = listNodeValue(head);
3011 listDelNode(server.objfreelist,head);
3012 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3013 } else {
3014 if (server.vm_enabled)
3015 pthread_mutex_unlock(&server.obj_freelist_mutex);
3016 o = zmalloc(sizeof(*o));
3017 }
3018 o->type = type;
3019 o->encoding = REDIS_ENCODING_RAW;
3020 o->ptr = ptr;
3021 o->refcount = 1;
3022 if (server.vm_enabled) {
3023 /* Note that this code may run in the context of an I/O thread
3024 * and accessing server.lruclock in theory is an error
3025 * (no locks). But in practice this is safe, and even if we read
3026 * garbage Redis will not fail. */
3027 o->lru = server.lruclock;
3028 o->storage = REDIS_VM_MEMORY;
3029 }
3030 return o;
3031 }
3032
3033 static robj *createStringObject(char *ptr, size_t len) {
3034 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3035 }
3036
3037 static robj *createStringObjectFromLongLong(long long value) {
3038 robj *o;
3039 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3040 incrRefCount(shared.integers[value]);
3041 o = shared.integers[value];
3042 } else {
3043 if (value >= LONG_MIN && value <= LONG_MAX) {
3044 o = createObject(REDIS_STRING, NULL);
3045 o->encoding = REDIS_ENCODING_INT;
3046 o->ptr = (void*)((long)value);
3047 } else {
3048 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3049 }
3050 }
3051 return o;
3052 }
3053
3054 static robj *dupStringObject(robj *o) {
3055 assert(o->encoding == REDIS_ENCODING_RAW);
3056 return createStringObject(o->ptr,sdslen(o->ptr));
3057 }
3058
3059 static robj *createListObject(void) {
3060 list *l = listCreate();
3061 robj *o = createObject(REDIS_LIST,l);
3062 listSetFreeMethod(l,decrRefCount);
3063 o->encoding = REDIS_ENCODING_LIST;
3064 return o;
3065 }
3066
3067 static robj *createZiplistObject(void) {
3068 unsigned char *zl = ziplistNew();
3069 robj *o = createObject(REDIS_LIST,zl);
3070 o->encoding = REDIS_ENCODING_ZIPLIST;
3071 return o;
3072 }
3073
3074 static robj *createSetObject(void) {
3075 dict *d = dictCreate(&setDictType,NULL);
3076 return createObject(REDIS_SET,d);
3077 }
3078
3079 static robj *createHashObject(void) {
3080 /* All the Hashes start as zipmaps. Will be automatically converted
3081 * into hash tables if there are enough elements or big elements
3082 * inside. */
3083 unsigned char *zm = zipmapNew();
3084 robj *o = createObject(REDIS_HASH,zm);
3085 o->encoding = REDIS_ENCODING_ZIPMAP;
3086 return o;
3087 }
3088
3089 static robj *createZsetObject(void) {
3090 zset *zs = zmalloc(sizeof(*zs));
3091
3092 zs->dict = dictCreate(&zsetDictType,NULL);
3093 zs->zsl = zslCreate();
3094 return createObject(REDIS_ZSET,zs);
3095 }
3096
3097 static void freeStringObject(robj *o) {
3098 if (o->encoding == REDIS_ENCODING_RAW) {
3099 sdsfree(o->ptr);
3100 }
3101 }
3102
3103 static void freeListObject(robj *o) {
3104 switch (o->encoding) {
3105 case REDIS_ENCODING_LIST:
3106 listRelease((list*) o->ptr);
3107 break;
3108 case REDIS_ENCODING_ZIPLIST:
3109 zfree(o->ptr);
3110 break;
3111 default:
3112 redisPanic("Unknown list encoding type");
3113 }
3114 }
3115
3116 static void freeSetObject(robj *o) {
3117 dictRelease((dict*) o->ptr);
3118 }
3119
3120 static void freeZsetObject(robj *o) {
3121 zset *zs = o->ptr;
3122
3123 dictRelease(zs->dict);
3124 zslFree(zs->zsl);
3125 zfree(zs);
3126 }
3127
3128 static void freeHashObject(robj *o) {
3129 switch (o->encoding) {
3130 case REDIS_ENCODING_HT:
3131 dictRelease((dict*) o->ptr);
3132 break;
3133 case REDIS_ENCODING_ZIPMAP:
3134 zfree(o->ptr);
3135 break;
3136 default:
3137 redisPanic("Unknown hash encoding type");
3138 break;
3139 }
3140 }
3141
3142 static void incrRefCount(robj *o) {
3143 o->refcount++;
3144 }
3145
3146 static void decrRefCount(void *obj) {
3147 robj *o = obj;
3148
3149 /* Object is a swapped out value, or in the process of being loaded. */
3150 if (server.vm_enabled &&
3151 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3152 {
3153 vmpointer *vp = obj;
3154 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3155 vmMarkPagesFree(vp->page,vp->usedpages);
3156 server.vm_stats_swapped_objects--;
3157 zfree(vp);
3158 return;
3159 }
3160
3161 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3162 /* Object is in memory, or in the process of being swapped out.
3163 *
3164 * If the object is being swapped out, abort the operation on
3165 * decrRefCount even if the refcount does not drop to 0: the object
3166 * is referenced at least two times, as value of the key AND as
3167 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3168 * done but the relevant key was removed in the meantime, the
3169 * complete jobs handler will not find the key about the job and the
3170 * assert will fail. */
3171 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3172 vmCancelThreadedIOJob(o);
3173 if (--(o->refcount) == 0) {
3174 switch(o->type) {
3175 case REDIS_STRING: freeStringObject(o); break;
3176 case REDIS_LIST: freeListObject(o); break;
3177 case REDIS_SET: freeSetObject(o); break;
3178 case REDIS_ZSET: freeZsetObject(o); break;
3179 case REDIS_HASH: freeHashObject(o); break;
3180 default: redisPanic("Unknown object type"); break;
3181 }
3182 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3183 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3184 !listAddNodeHead(server.objfreelist,o))
3185 zfree(o);
3186 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3187 }
3188 }
3189
3190 static int checkType(redisClient *c, robj *o, int type) {
3191 if (o->type != type) {
3192 addReply(c,shared.wrongtypeerr);
3193 return 1;
3194 }
3195 return 0;
3196 }
3197
3198 /* Check if the nul-terminated string 's' can be represented by a long
3199 * (that is, is a number that fits into long without any other space or
3200 * character before or after the digits).
3201 *
3202 * If so, the function returns REDIS_OK and *longval is set to the value
3203 * of the number. Otherwise REDIS_ERR is returned */
3204 static int isStringRepresentableAsLong(sds s, long *longval) {
3205 char buf[32], *endptr;
3206 long value;
3207 int slen;
3208
3209 value = strtol(s, &endptr, 10);
3210 if (endptr[0] != '\0') return REDIS_ERR;
3211 slen = ll2string(buf,32,value);
3212
3213 /* If the number converted back into a string is not identical
3214 * then it's not possible to encode the string as integer */
3215 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3216 if (longval) *longval = value;
3217 return REDIS_OK;
3218 }
3219
3220 /* Try to encode a string object in order to save space */
3221 static robj *tryObjectEncoding(robj *o) {
3222 long value;
3223 sds s = o->ptr;
3224
3225 if (o->encoding != REDIS_ENCODING_RAW)
3226 return o; /* Already encoded */
3227
3228 /* It's not safe to encode shared objects: shared objects can be shared
3229 * everywhere in the "object space" of Redis. Encoded objects can only
3230 * appear as "values" (and not, for instance, as keys) */
3231 if (o->refcount > 1) return o;
3232
3233 /* Currently we try to encode only strings */
3234 redisAssert(o->type == REDIS_STRING);
3235
3236 /* Check if we can represent this string as a long integer */
3237 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3238
3239 /* Ok, this object can be encoded */
3240 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3241 decrRefCount(o);
3242 incrRefCount(shared.integers[value]);
3243 return shared.integers[value];
3244 } else {
3245 o->encoding = REDIS_ENCODING_INT;
3246 sdsfree(o->ptr);
3247 o->ptr = (void*) value;
3248 return o;
3249 }
3250 }
3251
3252 /* Get a decoded version of an encoded object (returned as a new object).
3253 * If the object is already raw-encoded just increment the ref count. */
3254 static robj *getDecodedObject(robj *o) {
3255 robj *dec;
3256
3257 if (o->encoding == REDIS_ENCODING_RAW) {
3258 incrRefCount(o);
3259 return o;
3260 }
3261 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3262 char buf[32];
3263
3264 ll2string(buf,32,(long)o->ptr);
3265 dec = createStringObject(buf,strlen(buf));
3266 return dec;
3267 } else {
3268 redisPanic("Unknown encoding type");
3269 }
3270 }
3271
3272 /* Compare two string objects via strcmp() or alike.
3273 * Note that the objects may be integer-encoded. In such a case we
3274 * use ll2string() to get a string representation of the numbers on the stack
3275 * and compare the strings, it's much faster than calling getDecodedObject().
3276 *
3277 * Important note: if objects are not integer encoded, but binary-safe strings,
3278 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3279 * binary safe. */
3280 static int compareStringObjects(robj *a, robj *b) {
3281 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3282 char bufa[128], bufb[128], *astr, *bstr;
3283 int bothsds = 1;
3284
3285 if (a == b) return 0;
3286 if (a->encoding != REDIS_ENCODING_RAW) {
3287 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3288 astr = bufa;
3289 bothsds = 0;
3290 } else {
3291 astr = a->ptr;
3292 }
3293 if (b->encoding != REDIS_ENCODING_RAW) {
3294 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3295 bstr = bufb;
3296 bothsds = 0;
3297 } else {
3298 bstr = b->ptr;
3299 }
3300 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3301 }
3302
3303 /* Equal string objects return 1 if the two objects are the same from the
3304 * point of view of a string comparison, otherwise 0 is returned. Note that
3305 * this function is faster then checking for (compareStringObject(a,b) == 0)
3306 * because it can perform some more optimization. */
3307 static int equalStringObjects(robj *a, robj *b) {
3308 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3309 return a->ptr == b->ptr;
3310 } else {
3311 return compareStringObjects(a,b) == 0;
3312 }
3313 }
3314
3315 static size_t stringObjectLen(robj *o) {
3316 redisAssert(o->type == REDIS_STRING);
3317 if (o->encoding == REDIS_ENCODING_RAW) {
3318 return sdslen(o->ptr);
3319 } else {
3320 char buf[32];
3321
3322 return ll2string(buf,32,(long)o->ptr);
3323 }
3324 }
3325
3326 static int getDoubleFromObject(robj *o, double *target) {
3327 double value;
3328 char *eptr;
3329
3330 if (o == NULL) {
3331 value = 0;
3332 } else {
3333 redisAssert(o->type == REDIS_STRING);
3334 if (o->encoding == REDIS_ENCODING_RAW) {
3335 value = strtod(o->ptr, &eptr);
3336 if (eptr[0] != '\0') return REDIS_ERR;
3337 } else if (o->encoding == REDIS_ENCODING_INT) {
3338 value = (long)o->ptr;
3339 } else {
3340 redisPanic("Unknown string encoding");
3341 }
3342 }
3343
3344 *target = value;
3345 return REDIS_OK;
3346 }
3347
3348 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3349 double value;
3350 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3351 if (msg != NULL) {
3352 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3353 } else {
3354 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3355 }
3356 return REDIS_ERR;
3357 }
3358
3359 *target = value;
3360 return REDIS_OK;
3361 }
3362
3363 static int getLongLongFromObject(robj *o, long long *target) {
3364 long long value;
3365 char *eptr;
3366
3367 if (o == NULL) {
3368 value = 0;
3369 } else {
3370 redisAssert(o->type == REDIS_STRING);
3371 if (o->encoding == REDIS_ENCODING_RAW) {
3372 value = strtoll(o->ptr, &eptr, 10);
3373 if (eptr[0] != '\0') return REDIS_ERR;
3374 } else if (o->encoding == REDIS_ENCODING_INT) {
3375 value = (long)o->ptr;
3376 } else {
3377 redisPanic("Unknown string encoding");
3378 }
3379 }
3380
3381 *target = value;
3382 return REDIS_OK;
3383 }
3384
3385 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3386 long long value;
3387 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3388 if (msg != NULL) {
3389 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3390 } else {
3391 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3392 }
3393 return REDIS_ERR;
3394 }
3395
3396 *target = value;
3397 return REDIS_OK;
3398 }
3399
3400 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3401 long long value;
3402
3403 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3404 if (value < LONG_MIN || value > LONG_MAX) {
3405 if (msg != NULL) {
3406 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3407 } else {
3408 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3409 }
3410 return REDIS_ERR;
3411 }
3412
3413 *target = value;
3414 return REDIS_OK;
3415 }
3416
3417 /* =========================== Keyspace access API ========================== */
3418
3419 static robj *lookupKey(redisDb *db, robj *key) {
3420 dictEntry *de = dictFind(db->dict,key->ptr);
3421 if (de) {
3422 robj *val = dictGetEntryVal(de);
3423
3424 if (server.vm_enabled) {
3425 if (val->storage == REDIS_VM_MEMORY ||
3426 val->storage == REDIS_VM_SWAPPING)
3427 {
3428 /* If we were swapping the object out, cancel the operation */
3429 if (val->storage == REDIS_VM_SWAPPING)
3430 vmCancelThreadedIOJob(val);
3431 /* Update the access time for the aging algorithm. */
3432 val->lru = server.lruclock;
3433 } else {
3434 int notify = (val->storage == REDIS_VM_LOADING);
3435
3436 /* Our value was swapped on disk. Bring it at home. */
3437 redisAssert(val->type == REDIS_VMPOINTER);
3438 val = vmLoadObject(val);
3439 dictGetEntryVal(de) = val;
3440
3441 /* Clients blocked by the VM subsystem may be waiting for
3442 * this key... */
3443 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3444 }
3445 }
3446 return val;
3447 } else {
3448 return NULL;
3449 }
3450 }
3451
3452 static robj *lookupKeyRead(redisDb *db, robj *key) {
3453 expireIfNeeded(db,key);
3454 return lookupKey(db,key);
3455 }
3456
3457 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3458 deleteIfVolatile(db,key);
3459 touchWatchedKey(db,key);
3460 return lookupKey(db,key);
3461 }
3462
3463 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3464 robj *o = lookupKeyRead(c->db, key);
3465 if (!o) addReply(c,reply);
3466 return o;
3467 }
3468
3469 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3470 robj *o = lookupKeyWrite(c->db, key);
3471 if (!o) addReply(c,reply);
3472 return o;
3473 }
3474
3475 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3476 * otherwise REDIS_OK is returned, and the caller should increment the
3477 * refcount of 'val'. */
3478 static int dbAdd(redisDb *db, robj *key, robj *val) {
3479 /* Perform a lookup before adding the key, as we need to copy the
3480 * key value. */
3481 if (dictFind(db->dict, key->ptr) != NULL) {
3482 return REDIS_ERR;
3483 } else {
3484 sds copy = sdsdup(key->ptr);
3485 dictAdd(db->dict, copy, val);
3486 return REDIS_OK;
3487 }
3488 }
3489
3490 /* If the key does not exist, this is just like dbAdd(). Otherwise
3491 * the value associated to the key is replaced with the new one.
3492 *
3493 * On update (key already existed) 0 is returned. Otherwise 1. */
3494 static int dbReplace(redisDb *db, robj *key, robj *val) {
3495 if (dictFind(db->dict,key->ptr) == NULL) {
3496 sds copy = sdsdup(key->ptr);
3497 dictAdd(db->dict, copy, val);
3498 return 1;
3499 } else {
3500 dictReplace(db->dict, key->ptr, val);
3501 return 0;
3502 }
3503 }
3504
3505 static int dbExists(redisDb *db, robj *key) {
3506 return dictFind(db->dict,key->ptr) != NULL;
3507 }
3508
3509 /* Return a random key, in form of a Redis object.
3510 * If there are no keys, NULL is returned.
3511 *
3512 * The function makes sure to return keys not already expired. */
3513 static robj *dbRandomKey(redisDb *db) {
3514 struct dictEntry *de;
3515
3516 while(1) {
3517 sds key;
3518 robj *keyobj;
3519
3520 de = dictGetRandomKey(db->dict);
3521 if (de == NULL) return NULL;
3522
3523 key = dictGetEntryKey(de);
3524 keyobj = createStringObject(key,sdslen(key));
3525 if (dictFind(db->expires,key)) {
3526 if (expireIfNeeded(db,keyobj)) {
3527 decrRefCount(keyobj);
3528 continue; /* search for another key. This expired. */
3529 }
3530 }
3531 return keyobj;
3532 }
3533 }
3534
3535 /* Delete a key, value, and associated expiration entry if any, from the DB */
3536 static int dbDelete(redisDb *db, robj *key) {
3537 /* Deleting an entry from the expires dict will not free the sds of
3538 * the key, because it is shared with the main dictionary. */
3539 if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr);
3540 return dictDelete(db->dict,key->ptr) == DICT_OK;
3541 }
3542
3543 /*============================ RDB saving/loading =========================== */
3544
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if nothing could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3549
/* Write 't' to 'fp' as a 4 byte signed integer in host byte order (the
 * value is narrowed to int32_t first).
 * Returns 0 on success, -1 if nothing could be written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3555
3556 /* check rdbLoadLen() comments for more info */
3557 static int rdbSaveLen(FILE *fp, uint32_t len) {
3558 unsigned char buf[2];
3559
3560 if (len < (1<<6)) {
3561 /* Save a 6 bit len */
3562 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3563 if (fwrite(buf,1,1,fp) == 0) return -1;
3564 } else if (len < (1<<14)) {
3565 /* Save a 14 bit len */
3566 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3567 buf[1] = len&0xFF;
3568 if (fwrite(buf,2,1,fp) == 0) return -1;
3569 } else {
3570 /* Save a 32 bit len */
3571 buf[0] = (REDIS_RDB_32BITLEN<<6);
3572 if (fwrite(buf,1,1,fp) == 0) return -1;
3573 len = htonl(len);
3574 if (fwrite(&len,4,1,fp) == 0) return -1;
3575 }
3576 return 0;
3577 }
3578
3579 /* Encode 'value' as an integer if possible (if integer will fit the
3580 * supported range). If the function sucessful encoded the integer
3581 * then the (up to 5 bytes) encoded representation is written in the
3582 * string pointed by 'enc' and the length is returned. Otherwise
3583 * 0 is returned. */
3584 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3585 /* Finally check if it fits in our ranges */
3586 if (value >= -(1<<7) && value <= (1<<7)-1) {
3587 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3588 enc[1] = value&0xFF;
3589 return 2;
3590 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3591 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3592 enc[1] = value&0xFF;
3593 enc[2] = (value>>8)&0xFF;
3594 return 3;
3595 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3596 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3597 enc[1] = value&0xFF;
3598 enc[2] = (value>>8)&0xFF;
3599 enc[3] = (value>>16)&0xFF;
3600 enc[4] = (value>>24)&0xFF;
3601 return 5;
3602 } else {
3603 return 0;
3604 }
3605 }
3606
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be NUL terminated
 * (callers pass payloads that are described only by pointer and length).
 * For this reason the bytes are first copied into a local, terminated
 * buffer: strtoll() never reads past the end of the caller's data.
 *
 * On success the encoded form is stored in 'enc' and its length is
 * returned, otherwise 0 is returned and the caller must save the string
 * in some other way. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, in[32], out[32];

    /* An empty string is not a number, and anything that does not fit in
     * our buffer is longer than any representation ll2string() is asked
     * to produce below, so it could never pass the round trip check. */
    if (len == 0 || len >= sizeof(in)) return 0;
    memcpy(in,s,len);
    in[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(in, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(out,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer (this
     * rejects leading spaces, '+' signs, leading zeroes, embedded NUL
     * bytes and out of range values). */
    if (strlen(out) != len || memcmp(out,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3625
3626 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3627 size_t comprlen, outlen;
3628 unsigned char byte;
3629 void *out;
3630
3631 /* We require at least four bytes compression for this to be worth it */
3632 if (len <= 4) return 0;
3633 outlen = len-4;
3634 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3635 comprlen = lzf_compress(s, len, out, outlen);
3636 if (comprlen == 0) {
3637 zfree(out);
3638 return 0;
3639 }
3640 /* Data compressed! Let's save it on disk */
3641 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3642 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3643 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3644 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3645 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3646 zfree(out);
3647 return comprlen;
3648
3649 writeerr:
3650 zfree(out);
3651 return -1;
3652 }
3653
3654 /* Save a string objet as [len][data] on disk. If the object is a string
3655 * representation of an integer value we try to safe it in a special form */
3656 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3657 int enclen;
3658
3659 /* Try integer encoding */
3660 if (len <= 11) {
3661 unsigned char buf[5];
3662 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3663 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3664 return 0;
3665 }
3666 }
3667
3668 /* Try LZF compression - under 20 bytes it's unable to compress even
3669 * aaaaaaaaaaaaaaaaaa so skip it */
3670 if (server.rdbcompression && len > 20) {
3671 int retval;
3672
3673 retval = rdbSaveLzfStringObject(fp,s,len);
3674 if (retval == -1) return -1;
3675 if (retval > 0) return 0;
3676 /* retval == 0 means data can't be compressed, save the old way */
3677 }
3678
3679 /* Store verbatim */
3680 if (rdbSaveLen(fp,len) == -1) return -1;
3681 if (len && fwrite(s,len,1,fp) == 0) return -1;
3682 return 0;
3683 }
3684
/* Save 'value' in integer-encoded form when rdbEncodeInteger() accepts
 * it, otherwise as a length-prefixed decimal string.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int n = rdbEncodeInteger(value,buf);

    if (n <= 0) {
        /* Not encodable: write the decimal representation, prefixed by
         * its length. */
        n = ll2string((char*)buf,32,value);
        redisAssert(n < 32);
        if (rdbSaveLen(fp,n) == -1) return -1;
    }
    return (fwrite(buf,n,1,fp) == 0) ? -1 : 0;
}
3700
3701 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3702 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3703 /* Avoid to decode the object, then encode it again, if the
3704 * object is alrady integer encoded. */
3705 if (obj->encoding == REDIS_ENCODING_INT) {
3706 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3707 } else {
3708 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3709 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3710 }
3711 }
3712
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3755
3756 /* Save a Redis object. */
3757 static int rdbSaveObject(FILE *fp, robj *o) {
3758 if (o->type == REDIS_STRING) {
3759 /* Save a string value */
3760 if (rdbSaveStringObject(fp,o) == -1) return -1;
3761 } else if (o->type == REDIS_LIST) {
3762 /* Save a list value */
3763 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3764 unsigned char *p;
3765 unsigned char *vstr;
3766 unsigned int vlen;
3767 long long vlong;
3768
3769 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3770 p = ziplistIndex(o->ptr,0);
3771 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3772 if (vstr) {
3773 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3774 return -1;
3775 } else {
3776 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3777 return -1;
3778 }
3779 p = ziplistNext(o->ptr,p);
3780 }
3781 } else if (o->encoding == REDIS_ENCODING_LIST) {
3782 list *list = o->ptr;
3783 listIter li;
3784 listNode *ln;
3785
3786 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3787 listRewind(list,&li);
3788 while((ln = listNext(&li))) {
3789 robj *eleobj = listNodeValue(ln);
3790 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3791 }
3792 } else {
3793 redisPanic("Unknown list encoding");
3794 }
3795 } else if (o->type == REDIS_SET) {
3796 /* Save a set value */
3797 dict *set = o->ptr;
3798 dictIterator *di = dictGetIterator(set);
3799 dictEntry *de;
3800
3801 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3802 while((de = dictNext(di)) != NULL) {
3803 robj *eleobj = dictGetEntryKey(de);
3804
3805 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3806 }
3807 dictReleaseIterator(di);
3808 } else if (o->type == REDIS_ZSET) {
3809 /* Save a set value */
3810 zset *zs = o->ptr;
3811 dictIterator *di = dictGetIterator(zs->dict);
3812 dictEntry *de;
3813
3814 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3815 while((de = dictNext(di)) != NULL) {
3816 robj *eleobj = dictGetEntryKey(de);
3817 double *score = dictGetEntryVal(de);
3818
3819 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3820 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3821 }
3822 dictReleaseIterator(di);
3823 } else if (o->type == REDIS_HASH) {
3824 /* Save a hash value */
3825 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3826 unsigned char *p = zipmapRewind(o->ptr);
3827 unsigned int count = zipmapLen(o->ptr);
3828 unsigned char *key, *val;
3829 unsigned int klen, vlen;
3830
3831 if (rdbSaveLen(fp,count) == -1) return -1;
3832 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3833 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3834 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3835 }
3836 } else {
3837 dictIterator *di = dictGetIterator(o->ptr);
3838 dictEntry *de;
3839
3840 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3841 while((de = dictNext(di)) != NULL) {
3842 robj *key = dictGetEntryKey(de);
3843 robj *val = dictGetEntryVal(de);
3844
3845 if (rdbSaveStringObject(fp,key) == -1) return -1;
3846 if (rdbSaveStringObject(fp,val) == -1) return -1;
3847 }
3848 dictReleaseIterator(di);
3849 }
3850 } else {
3851 redisPanic("Unknown object type");
3852 }
3853 return 0;
3854 }
3855
3856 /* Return the length the object will have on disk if saved with
3857 * the rdbSaveObject() function. Currently we use a trick to get
3858 * this length with very little changes to the code. In the future
3859 * we could switch to a faster solution. */
3860 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3861 if (fp == NULL) fp = server.devnull;
3862 rewind(fp);
3863 assert(rdbSaveObject(fp,o) != 1);
3864 return ftello(fp);
3865 }
3866
3867 /* Return the number of pages required to save this object in the swap file */
3868 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3869 off_t bytes = rdbSavedObjectLen(o,fp);
3870
3871 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3872 }
3873
3874 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3875 static int rdbSave(char *filename) {
3876 dictIterator *di = NULL;
3877 dictEntry *de;
3878 FILE *fp;
3879 char tmpfile[256];
3880 int j;
3881 time_t now = time(NULL);
3882
3883 /* Wait for I/O therads to terminate, just in case this is a
3884 * foreground-saving, to avoid seeking the swap file descriptor at the
3885 * same time. */
3886 if (server.vm_enabled)
3887 waitEmptyIOJobsQueue();
3888
3889 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3890 fp = fopen(tmpfile,"w");
3891 if (!fp) {
3892 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3893 return REDIS_ERR;
3894 }
3895 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3896 for (j = 0; j < server.dbnum; j++) {
3897 redisDb *db = server.db+j;
3898 dict *d = db->dict;
3899 if (dictSize(d) == 0) continue;
3900 di = dictGetIterator(d);
3901 if (!di) {
3902 fclose(fp);
3903 return REDIS_ERR;
3904 }
3905
3906 /* Write the SELECT DB opcode */
3907 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3908 if (rdbSaveLen(fp,j) == -1) goto werr;
3909
3910 /* Iterate this DB writing every entry */
3911 while((de = dictNext(di)) != NULL) {
3912 sds keystr = dictGetEntryKey(de);
3913 robj key, *o = dictGetEntryVal(de);
3914 time_t expiretime;
3915
3916 initStaticStringObject(key,keystr);
3917 expiretime = getExpire(db,&key);
3918
3919 /* Save the expire time */
3920 if (expiretime != -1) {
3921 /* If this key is already expired skip it */
3922 if (expiretime < now) continue;
3923 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3924 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3925 }
3926 /* Save the key and associated value. This requires special
3927 * handling if the value is swapped out. */
3928 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3929 o->storage == REDIS_VM_SWAPPING) {
3930 /* Save type, key, value */
3931 if (rdbSaveType(fp,o->type) == -1) goto werr;
3932 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3933 if (rdbSaveObject(fp,o) == -1) goto werr;
3934 } else {
3935 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3936 robj *po;
3937 /* Get a preview of the object in memory */
3938 po = vmPreviewObject(o);
3939 /* Save type, key, value */
3940 if (rdbSaveType(fp,po->type) == -1) goto werr;
3941 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3942 if (rdbSaveObject(fp,po) == -1) goto werr;
3943 /* Remove the loaded object from memory */
3944 decrRefCount(po);
3945 }
3946 }
3947 dictReleaseIterator(di);
3948 }
3949 /* EOF opcode */
3950 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3951
3952 /* Make sure data will not remain on the OS's output buffers */
3953 fflush(fp);
3954 fsync(fileno(fp));
3955 fclose(fp);
3956
3957 /* Use RENAME to make sure the DB file is changed atomically only
3958 * if the generate DB file is ok. */
3959 if (rename(tmpfile,filename) == -1) {
3960 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3961 unlink(tmpfile);
3962 return REDIS_ERR;
3963 }
3964 redisLog(REDIS_NOTICE,"DB saved on disk");
3965 server.dirty = 0;
3966 server.lastsave = time(NULL);
3967 return REDIS_OK;
3968
3969 werr:
3970 fclose(fp);
3971 unlink(tmpfile);
3972 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3973 if (di) dictReleaseIterator(di);
3974 return REDIS_ERR;
3975 }
3976
3977 static int rdbSaveBackground(char *filename) {
3978 pid_t childpid;
3979
3980 if (server.bgsavechildpid != -1) return REDIS_ERR;
3981 if (server.vm_enabled) waitEmptyIOJobsQueue();
3982 if ((childpid = fork()) == 0) {
3983 /* Child */
3984 if (server.vm_enabled) vmReopenSwapFile();
3985 close(server.fd);
3986 if (rdbSave(filename) == REDIS_OK) {
3987 _exit(0);
3988 } else {
3989 _exit(1);
3990 }
3991 } else {
3992 /* Parent */
3993 if (childpid == -1) {
3994 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3995 strerror(errno));
3996 return REDIS_ERR;
3997 }
3998 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3999 server.bgsavechildpid = childpid;
4000 updateDictResizePolicy();
4001 return REDIS_OK;
4002 }
4003 return REDIS_OK; /* unreached */
4004 }
4005
/* Remove "temp-<childpid>.rdb", the temporary file name used by rdbSave()
 * when running as the process with that pid. A failing unlink() is
 * ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4012
/* Read one byte from 'fp' and return it as a value in the 0-255 range,
 * or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,1,1,fp) != 0) return byte;
    return -1;
}
4018
/* Read a 4 byte signed integer in host byte order (the format written by
 * rdbSaveTime()) and return it as time_t, or -1 on a short read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,4,1,fp) != 0) return (time_t) raw;
    return -1;
}
4024
4025 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4026 * of this file for a description of how this are stored on disk.
4027 *
4028 * isencoded is set to 1 if the readed length is not actually a length but
4029 * an "encoding type", check the above comments for more info */
4030 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4031 unsigned char buf[2];
4032 uint32_t len;
4033 int type;
4034
4035 if (isencoded) *isencoded = 0;
4036 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4037 type = (buf[0]&0xC0)>>6;
4038 if (type == REDIS_RDB_6BITLEN) {
4039 /* Read a 6 bit len */
4040 return buf[0]&0x3F;
4041 } else if (type == REDIS_RDB_ENCVAL) {
4042 /* Read a 6 bit len encoding type */
4043 if (isencoded) *isencoded = 1;
4044 return buf[0]&0x3F;
4045 } else if (type == REDIS_RDB_14BITLEN) {
4046 /* Read a 14 bit len */
4047 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4048 return ((buf[0]&0x3F)<<8)|buf[1];
4049 } else {
4050 /* Read a 32 bit len */
4051 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4052 return ntohl(len);
4053 }
4054 }
4055
4056 /* Load an integer-encoded object from file 'fp', with the specified
4057 * encoding type 'enctype'. If encode is true the function may return
4058 * an integer-encoded object as reply, otherwise the returned object
4059 * will always be encoded as a raw string. */
4060 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4061 unsigned char enc[4];
4062 long long val;
4063
4064 if (enctype == REDIS_RDB_ENC_INT8) {
4065 if (fread(enc,1,1,fp) == 0) return NULL;
4066 val = (signed char)enc[0];
4067 } else if (enctype == REDIS_RDB_ENC_INT16) {
4068 uint16_t v;
4069 if (fread(enc,2,1,fp) == 0) return NULL;
4070 v = enc[0]|(enc[1]<<8);
4071 val = (int16_t)v;
4072 } else if (enctype == REDIS_RDB_ENC_INT32) {
4073 uint32_t v;
4074 if (fread(enc,4,1,fp) == 0) return NULL;
4075 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4076 val = (int32_t)v;
4077 } else {
4078 val = 0; /* anti-warning */
4079 redisPanic("Unknown RDB integer encoding type");
4080 }
4081 if (encode)
4082 return createStringObjectFromLongLong(val);
4083 else
4084 return createObject(REDIS_STRING,sdsfromlonglong(val));
4085 }
4086
4087 static robj *rdbLoadLzfStringObject(FILE*fp) {
4088 unsigned int len, clen;
4089 unsigned char *c = NULL;
4090 sds val = NULL;
4091
4092 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4093 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4094 if ((c = zmalloc(clen)) == NULL) goto err;
4095 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4096 if (fread(c,clen,1,fp) == 0) goto err;
4097 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4098 zfree(c);
4099 return createObject(REDIS_STRING,val);
4100 err:
4101 zfree(c);
4102 sdsfree(val);
4103 return NULL;
4104 }
4105
4106 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4107 int isencoded;
4108 uint32_t len;
4109 sds val;
4110
4111 len = rdbLoadLen(fp,&isencoded);
4112 if (isencoded) {
4113 switch(len) {
4114 case REDIS_RDB_ENC_INT8:
4115 case REDIS_RDB_ENC_INT16:
4116 case REDIS_RDB_ENC_INT32:
4117 return rdbLoadIntegerObject(fp,len,encode);
4118 case REDIS_RDB_ENC_LZF:
4119 return rdbLoadLzfStringObject(fp);
4120 default:
4121 redisPanic("Unknown RDB encoding type");
4122 }
4123 }
4124
4125 if (len == REDIS_RDB_LENERR) return NULL;
4126 val = sdsnewlen(NULL,len);
4127 if (len && fread(val,len,1,fp) == 0) {
4128 sdsfree(val);
4129 return NULL;
4130 }
4131 return createObject(REDIS_STRING,val);
4132 }
4133
4134 static robj *rdbLoadStringObject(FILE *fp) {
4135 return rdbGenericLoadStringObject(fp,0);
4136 }
4137
4138 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4139 return rdbGenericLoadStringObject(fp,1);
4140 }
4141
4142 /* For information about double serialization check rdbSaveDoubleValue() */
4143 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4144 char buf[128];
4145 unsigned char len;
4146
4147 if (fread(&len,1,1,fp) == 0) return -1;
4148 switch(len) {
4149 case 255: *val = R_NegInf; return 0;
4150 case 254: *val = R_PosInf; return 0;
4151 case 253: *val = R_Nan; return 0;
4152 default:
4153 if (fread(buf,len,1,fp) == 0) return -1;
4154 buf[len] = '\0';
4155 sscanf(buf, "%lg", val);
4156 return 0;
4157 }
4158 }
4159
4160 /* Load a Redis object of the specified type from the specified file.
4161 * On success a newly allocated object is returned, otherwise NULL. */
4162 static robj *rdbLoadObject(int type, FILE *fp) {
4163 robj *o, *ele, *dec;
4164 size_t len;
4165
4166 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4167 if (type == REDIS_STRING) {
4168 /* Read string value */
4169 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4170 o = tryObjectEncoding(o);
4171 } else if (type == REDIS_LIST) {
4172 /* Read list value */
4173 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4174
4175 /* Use a real list when there are too many entries */
4176 if (len > server.list_max_ziplist_entries) {
4177 o = createListObject();
4178 } else {
4179 o = createZiplistObject();
4180 }
4181
4182 /* Load every single element of the list */
4183 while(len--) {
4184 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4185
4186 /* If we are using a ziplist and the value is too big, convert
4187 * the object to a real list. */
4188 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4189 ele->encoding == REDIS_ENCODING_RAW &&
4190 sdslen(ele->ptr) > server.list_max_ziplist_value)
4191 listTypeConvert(o,REDIS_ENCODING_LIST);
4192
4193 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4194 dec = getDecodedObject(ele);
4195 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4196 decrRefCount(dec);
4197 decrRefCount(ele);
4198 } else {
4199 ele = tryObjectEncoding(ele);
4200 listAddNodeTail(o->ptr,ele);
4201 }
4202 }
4203 } else if (type == REDIS_SET) {
4204 /* Read list/set value */
4205 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4206 o = createSetObject();
4207 /* It's faster to expand the dict to the right size asap in order
4208 * to avoid rehashing */
4209 if (len > DICT_HT_INITIAL_SIZE)
4210 dictExpand(o->ptr,len);
4211 /* Load every single element of the list/set */
4212 while(len--) {
4213 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4214 ele = tryObjectEncoding(ele);
4215 dictAdd((dict*)o->ptr,ele,NULL);
4216 }
4217 } else if (type == REDIS_ZSET) {
4218 /* Read list/set value */
4219 size_t zsetlen;
4220 zset *zs;
4221
4222 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4223 o = createZsetObject();
4224 zs = o->ptr;
4225 /* Load every single element of the list/set */
4226 while(zsetlen--) {
4227 robj *ele;
4228 double *score = zmalloc(sizeof(double));
4229
4230 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4231 ele = tryObjectEncoding(ele);
4232 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4233 dictAdd(zs->dict,ele,score);
4234 zslInsert(zs->zsl,*score,ele);
4235 incrRefCount(ele); /* added to skiplist */
4236 }
4237 } else if (type == REDIS_HASH) {
4238 size_t hashlen;
4239
4240 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4241 o = createHashObject();
4242 /* Too many entries? Use an hash table. */
4243 if (hashlen > server.hash_max_zipmap_entries)
4244 convertToRealHash(o);
4245 /* Load every key/value, then set it into the zipmap or hash
4246 * table, as needed. */
4247 while(hashlen--) {
4248 robj *key, *val;
4249
4250 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4251 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4252 /* If we are using a zipmap and there are too big values
4253 * the object is converted to real hash table encoding. */
4254 if (o->encoding != REDIS_ENCODING_HT &&
4255 ((key->encoding == REDIS_ENCODING_RAW &&
4256 sdslen(key->ptr) > server.hash_max_zipmap_value) ||
4257 (val->encoding == REDIS_ENCODING_RAW &&
4258 sdslen(val->ptr) > server.hash_max_zipmap_value)))
4259 {
4260 convertToRealHash(o);
4261 }
4262
4263 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4264 unsigned char *zm = o->ptr;
4265 robj *deckey, *decval;
4266
4267 /* We need raw string objects to add them to the zipmap */
4268 deckey = getDecodedObject(key);
4269 decval = getDecodedObject(val);
4270 zm = zipmapSet(zm,deckey->ptr,sdslen(deckey->ptr),
4271 decval->ptr,sdslen(decval->ptr),NULL);
4272 o->ptr = zm;
4273 decrRefCount(deckey);
4274 decrRefCount(decval);
4275 decrRefCount(key);
4276 decrRefCount(val);
4277 } else {
4278 key = tryObjectEncoding(key);
4279 val = tryObjectEncoding(val);
4280 dictAdd((dict*)o->ptr,key,val);
4281 }
4282 }
4283 } else {
4284 redisPanic("Unknown object type");
4285 }
4286 return o;
4287 }
4288
4289 static int rdbLoad(char *filename) {
4290 FILE *fp;
4291 uint32_t dbid;
4292 int type, retval, rdbver;
4293 int swap_all_values = 0;
4294 redisDb *db = server.db+0;
4295 char buf[1024];
4296 time_t expiretime, now = time(NULL);
4297
4298 fp = fopen(filename,"r");
4299 if (!fp) return REDIS_ERR;
4300 if (fread(buf,9,1,fp) == 0) goto eoferr;
4301 buf[9] = '\0';
4302 if (memcmp(buf,"REDIS",5) != 0) {
4303 fclose(fp);
4304 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4305 return REDIS_ERR;
4306 }
4307 rdbver = atoi(buf+5);
4308 if (rdbver != 1) {
4309 fclose(fp);
4310 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4311 return REDIS_ERR;
4312 }
4313 while(1) {
4314 robj *key, *val;
4315 int force_swapout;
4316
4317 expiretime = -1;
4318 /* Read type. */
4319 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4320 if (type == REDIS_EXPIRETIME) {
4321 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4322 /* We read the time so we need to read the object type again */
4323 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4324 }
4325 if (type == REDIS_EOF) break;
4326 /* Handle SELECT DB opcode as a special case */
4327 if (type == REDIS_SELECTDB) {
4328 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4329 goto eoferr;
4330 if (dbid >= (unsigned)server.dbnum) {
4331 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4332 exit(1);
4333 }
4334 db = server.db+dbid;
4335 continue;
4336 }
4337 /* Read key */
4338 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4339 /* Read value */
4340 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4341 /* Check if the key already expired */
4342 if (expiretime != -1 && expiretime < now) {
4343 decrRefCount(key);
4344 decrRefCount(val);
4345 continue;
4346 }
4347 /* Add the new object in the hash table */
4348 retval = dbAdd(db,key,val);
4349 if (retval == REDIS_ERR) {
4350 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4351 exit(1);
4352 }
4353 /* Set the expire time if needed */
4354 if (expiretime != -1) setExpire(db,key,expiretime);
4355
4356 /* Handle swapping while loading big datasets when VM is on */
4357
4358 /* If we detecter we are hopeless about fitting something in memory
4359 * we just swap every new key on disk. Directly...
4360 * Note that's important to check for this condition before resorting
4361 * to random sampling, otherwise we may try to swap already
4362 * swapped keys. */
4363 if (swap_all_values) {
4364 dictEntry *de = dictFind(db->dict,key->ptr);
4365
4366 /* de may be NULL since the key already expired */
4367 if (de) {
4368 vmpointer *vp;
4369 val = dictGetEntryVal(de);
4370
4371 if (val->refcount == 1 &&
4372 (vp = vmSwapObjectBlocking(val)) != NULL)
4373 dictGetEntryVal(de) = vp;
4374 }
4375 decrRefCount(key);
4376 continue;
4377 }
4378 decrRefCount(key);
4379
4380 /* Flush data on disk once 32 MB of additional RAM are used... */
4381 force_swapout = 0;
4382 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4383 force_swapout = 1;
4384
4385 /* If we have still some hope of having some value fitting memory
4386 * then we try random sampling. */
4387 if (!swap_all_values && server.vm_enabled && force_swapout) {
4388 while (zmalloc_used_memory() > server.vm_max_memory) {
4389 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4390 }
4391 if (zmalloc_used_memory() > server.vm_max_memory)
4392 swap_all_values = 1; /* We are already using too much mem */
4393 }
4394 }
4395 fclose(fp);
4396 return REDIS_OK;
4397
4398 eoferr: /* unexpected end of file is handled here with a fatal exit */
4399 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4400 exit(1);
4401 return REDIS_ERR; /* Just to avoid warning */
4402 }
4403
4404 /*================================== Shutdown =============================== */
4405 static int prepareForShutdown() {
4406 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4407 /* Kill the saving child if there is a background saving in progress.
4408 We want to avoid race conditions, for instance our saving child may
4409 overwrite the synchronous saving did by SHUTDOWN. */
4410 if (server.bgsavechildpid != -1) {
4411 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4412 kill(server.bgsavechildpid,SIGKILL);
4413 rdbRemoveTempFile(server.bgsavechildpid);
4414 }
4415 if (server.appendonly) {
4416 /* Append only file: fsync() the AOF and exit */
4417 aof_fsync(server.appendfd);
4418 if (server.vm_enabled) unlink(server.vm_swap_file);
4419 } else {
4420 /* Snapshotting. Perform a SYNC SAVE and exit */
4421 if (rdbSave(server.dbfilename) == REDIS_OK) {
4422 if (server.daemonize)
4423 unlink(server.pidfile);
4424 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4425 } else {
4426 /* Ooops.. error saving! The best we can do is to continue
4427 * operating. Note that if there was a background saving process,
4428 * in the next cron() Redis will be notified that the background
4429 * saving aborted, handling special stuff like slaves pending for
4430 * synchronization... */
4431 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4432 return REDIS_ERR;
4433 }
4434 }
4435 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4436 return REDIS_OK;
4437 }
4438
4439 /*================================== Commands =============================== */
4440
4441 static void authCommand(redisClient *c) {
4442 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4443 c->authenticated = 1;
4444 addReply(c,shared.ok);
4445 } else {
4446 c->authenticated = 0;
4447 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4448 }
4449 }
4450
4451 static void pingCommand(redisClient *c) {
4452 addReply(c,shared.pong);
4453 }
4454
4455 static void echoCommand(redisClient *c) {
4456 addReplyBulk(c,c->argv[1]);
4457 }
4458
4459 /*=================================== Strings =============================== */
4460
4461 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4462 int retval;
4463 long seconds = 0; /* initialized to avoid an harmness warning */
4464
4465 if (expire) {
4466 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4467 return;
4468 if (seconds <= 0) {
4469 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4470 return;
4471 }
4472 }
4473
4474 touchWatchedKey(c->db,key);
4475 if (nx) deleteIfVolatile(c->db,key);
4476 retval = dbAdd(c->db,key,val);
4477 if (retval == REDIS_ERR) {
4478 if (!nx) {
4479 dbReplace(c->db,key,val);
4480 incrRefCount(val);
4481 } else {
4482 addReply(c,shared.czero);
4483 return;
4484 }
4485 } else {
4486 incrRefCount(val);
4487 }
4488 server.dirty++;
4489 removeExpire(c->db,key);
4490 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4491 addReply(c, nx ? shared.cone : shared.ok);
4492 }
4493
4494 static void setCommand(redisClient *c) {
4495 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4496 }
4497
4498 static void setnxCommand(redisClient *c) {
4499 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4500 }
4501
4502 static void setexCommand(redisClient *c) {
4503 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4504 }
4505
4506 static int getGenericCommand(redisClient *c) {
4507 robj *o;
4508
4509 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4510 return REDIS_OK;
4511
4512 if (o->type != REDIS_STRING) {
4513 addReply(c,shared.wrongtypeerr);
4514 return REDIS_ERR;
4515 } else {
4516 addReplyBulk(c,o);
4517 return REDIS_OK;
4518 }
4519 }
4520
4521 static void getCommand(redisClient *c) {
4522 getGenericCommand(c);
4523 }
4524
4525 static void getsetCommand(redisClient *c) {
4526 if (getGenericCommand(c) == REDIS_ERR) return;
4527 dbReplace(c->db,c->argv[1],c->argv[2]);
4528 incrRefCount(c->argv[2]);
4529 server.dirty++;
4530 removeExpire(c->db,c->argv[1]);
4531 }
4532
4533 static void mgetCommand(redisClient *c) {
4534 int j;
4535
4536 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4537 for (j = 1; j < c->argc; j++) {
4538 robj *o = lookupKeyRead(c->db,c->argv[j]);
4539 if (o == NULL) {
4540 addReply(c,shared.nullbulk);
4541 } else {
4542 if (o->type != REDIS_STRING) {
4543 addReply(c,shared.nullbulk);
4544 } else {
4545 addReplyBulk(c,o);
4546 }
4547 }
4548 }
4549 }
4550
4551 static void msetGenericCommand(redisClient *c, int nx) {
4552 int j, busykeys = 0;
4553
4554 if ((c->argc % 2) == 0) {
4555 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4556 return;
4557 }
4558 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4559 * set nothing at all if at least one already key exists. */
4560 if (nx) {
4561 for (j = 1; j < c->argc; j += 2) {
4562 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4563 busykeys++;
4564 }
4565 }
4566 }
4567 if (busykeys) {
4568 addReply(c, shared.czero);
4569 return;
4570 }
4571
4572 for (j = 1; j < c->argc; j += 2) {
4573 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4574 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4575 incrRefCount(c->argv[j+1]);
4576 removeExpire(c->db,c->argv[j]);
4577 }
4578 server.dirty += (c->argc-1)/2;
4579 addReply(c, nx ? shared.cone : shared.ok);
4580 }
4581
4582 static void msetCommand(redisClient *c) {
4583 msetGenericCommand(c,0);
4584 }
4585
4586 static void msetnxCommand(redisClient *c) {
4587 msetGenericCommand(c,1);
4588 }
4589
4590 static void incrDecrCommand(redisClient *c, long long incr) {
4591 long long value;
4592 robj *o;
4593
4594 o = lookupKeyWrite(c->db,c->argv[1]);
4595 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4596 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4597
4598 value += incr;
4599 o = createStringObjectFromLongLong(value);
4600 dbReplace(c->db,c->argv[1],o);
4601 server.dirty++;
4602 addReply(c,shared.colon);
4603 addReply(c,o);
4604 addReply(c,shared.crlf);
4605 }
4606
4607 static void incrCommand(redisClient *c) {
4608 incrDecrCommand(c,1);
4609 }
4610
4611 static void decrCommand(redisClient *c) {
4612 incrDecrCommand(c,-1);
4613 }
4614
4615 static void incrbyCommand(redisClient *c) {
4616 long long incr;
4617
4618 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4619 incrDecrCommand(c,incr);
4620 }
4621
4622 static void decrbyCommand(redisClient *c) {
4623 long long incr;
4624
4625 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4626 incrDecrCommand(c,-incr);
4627 }
4628
4629 static void appendCommand(redisClient *c) {
4630 int retval;
4631 size_t totlen;
4632 robj *o;
4633
4634 o = lookupKeyWrite(c->db,c->argv[1]);
4635 if (o == NULL) {
4636 /* Create the key */
4637 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4638 incrRefCount(c->argv[2]);
4639 totlen = stringObjectLen(c->argv[2]);
4640 } else {
4641 if (o->type != REDIS_STRING) {
4642 addReply(c,shared.wrongtypeerr);
4643 return;
4644 }
4645 /* If the object is specially encoded or shared we have to make
4646 * a copy */
4647 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4648 robj *decoded = getDecodedObject(o);
4649
4650 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4651 decrRefCount(decoded);
4652 dbReplace(c->db,c->argv[1],o);
4653 }
4654 /* APPEND! */
4655 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4656 o->ptr = sdscatlen(o->ptr,
4657 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4658 } else {
4659 o->ptr = sdscatprintf(o->ptr, "%ld",
4660 (unsigned long) c->argv[2]->ptr);
4661 }
4662 totlen = sdslen(o->ptr);
4663 }
4664 server.dirty++;
4665 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4666 }
4667
4668 static void substrCommand(redisClient *c) {
4669 robj *o;
4670 long start = atoi(c->argv[2]->ptr);
4671 long end = atoi(c->argv[3]->ptr);
4672 size_t rangelen, strlen;
4673 sds range;
4674
4675 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4676 checkType(c,o,REDIS_STRING)) return;
4677
4678 o = getDecodedObject(o);
4679 strlen = sdslen(o->ptr);
4680
4681 /* convert negative indexes */
4682 if (start < 0) start = strlen+start;
4683 if (end < 0) end = strlen+end;
4684 if (start < 0) start = 0;
4685 if (end < 0) end = 0;
4686
4687 /* indexes sanity checks */
4688 if (start > end || (size_t)start >= strlen) {
4689 /* Out of range start or start > end result in null reply */
4690 addReply(c,shared.nullbulk);
4691 decrRefCount(o);
4692 return;
4693 }
4694 if ((size_t)end >= strlen) end = strlen-1;
4695 rangelen = (end-start)+1;
4696
4697 /* Return the result */
4698 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4699 range = sdsnewlen((char*)o->ptr+start,rangelen);
4700 addReplySds(c,range);
4701 addReply(c,shared.crlf);
4702 decrRefCount(o);
4703 }
4704
4705 /* ========================= Type agnostic commands ========================= */
4706
4707 static void delCommand(redisClient *c) {
4708 int deleted = 0, j;
4709
4710 for (j = 1; j < c->argc; j++) {
4711 if (dbDelete(c->db,c->argv[j])) {
4712 touchWatchedKey(c->db,c->argv[j]);
4713 server.dirty++;
4714 deleted++;
4715 }
4716 }
4717 addReplyLongLong(c,deleted);
4718 }
4719
4720 static void existsCommand(redisClient *c) {
4721 expireIfNeeded(c->db,c->argv[1]);
4722 if (dbExists(c->db,c->argv[1])) {
4723 addReply(c, shared.cone);
4724 } else {
4725 addReply(c, shared.czero);
4726 }
4727 }
4728
4729 static void selectCommand(redisClient *c) {
4730 int id = atoi(c->argv[1]->ptr);
4731
4732 if (selectDb(c,id) == REDIS_ERR) {
4733 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4734 } else {
4735 addReply(c,shared.ok);
4736 }
4737 }
4738
4739 static void randomkeyCommand(redisClient *c) {
4740 robj *key;
4741
4742 if ((key = dbRandomKey(c->db)) == NULL) {
4743 addReply(c,shared.nullbulk);
4744 return;
4745 }
4746
4747 addReplyBulk(c,key);
4748 decrRefCount(key);
4749 }
4750
4751 static void keysCommand(redisClient *c) {
4752 dictIterator *di;
4753 dictEntry *de;
4754 sds pattern = c->argv[1]->ptr;
4755 int plen = sdslen(pattern);
4756 unsigned long numkeys = 0;
4757 robj *lenobj = createObject(REDIS_STRING,NULL);
4758
4759 di = dictGetIterator(c->db->dict);
4760 addReply(c,lenobj);
4761 decrRefCount(lenobj);
4762 while((de = dictNext(di)) != NULL) {
4763 sds key = dictGetEntryKey(de);
4764 robj *keyobj;
4765
4766 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4767 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4768 keyobj = createStringObject(key,sdslen(key));
4769 if (expireIfNeeded(c->db,keyobj) == 0) {
4770 addReplyBulk(c,keyobj);
4771 numkeys++;
4772 }
4773 decrRefCount(keyobj);
4774 }
4775 }
4776 dictReleaseIterator(di);
4777 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4778 }
4779
4780 static void dbsizeCommand(redisClient *c) {
4781 addReplySds(c,
4782 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4783 }
4784
4785 static void lastsaveCommand(redisClient *c) {
4786 addReplySds(c,
4787 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4788 }
4789
4790 static void typeCommand(redisClient *c) {
4791 robj *o;
4792 char *type;
4793
4794 o = lookupKeyRead(c->db,c->argv[1]);
4795 if (o == NULL) {
4796 type = "+none";
4797 } else {
4798 switch(o->type) {
4799 case REDIS_STRING: type = "+string"; break;
4800 case REDIS_LIST: type = "+list"; break;
4801 case REDIS_SET: type = "+set"; break;
4802 case REDIS_ZSET: type = "+zset"; break;
4803 case REDIS_HASH: type = "+hash"; break;
4804 default: type = "+unknown"; break;
4805 }
4806 }
4807 addReplySds(c,sdsnew(type));
4808 addReply(c,shared.crlf);
4809 }
4810
4811 static void saveCommand(redisClient *c) {
4812 if (server.bgsavechildpid != -1) {
4813 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4814 return;
4815 }
4816 if (rdbSave(server.dbfilename) == REDIS_OK) {
4817 addReply(c,shared.ok);
4818 } else {
4819 addReply(c,shared.err);
4820 }
4821 }
4822
4823 static void bgsaveCommand(redisClient *c) {
4824 if (server.bgsavechildpid != -1) {
4825 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4826 return;
4827 }
4828 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4829 char *status = "+Background saving started\r\n";
4830 addReplySds(c,sdsnew(status));
4831 } else {
4832 addReply(c,shared.err);
4833 }
4834 }
4835
4836 static void shutdownCommand(redisClient *c) {
4837 if (prepareForShutdown() == REDIS_OK)
4838 exit(0);
4839 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4840 }
4841
4842 static void renameGenericCommand(redisClient *c, int nx) {
4843 robj *o;
4844
4845 /* To use the same key as src and dst is probably an error */
4846 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4847 addReply(c,shared.sameobjecterr);
4848 return;
4849 }
4850
4851 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4852 return;
4853
4854 incrRefCount(o);
4855 deleteIfVolatile(c->db,c->argv[2]);
4856 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4857 if (nx) {
4858 decrRefCount(o);
4859 addReply(c,shared.czero);
4860 return;
4861 }
4862 dbReplace(c->db,c->argv[2],o);
4863 }
4864 dbDelete(c->db,c->argv[1]);
4865 touchWatchedKey(c->db,c->argv[2]);
4866 server.dirty++;
4867 addReply(c,nx ? shared.cone : shared.ok);
4868 }
4869
4870 static void renameCommand(redisClient *c) {
4871 renameGenericCommand(c,0);
4872 }
4873
4874 static void renamenxCommand(redisClient *c) {
4875 renameGenericCommand(c,1);
4876 }
4877
4878 static void moveCommand(redisClient *c) {
4879 robj *o;
4880 redisDb *src, *dst;
4881 int srcid;
4882
4883 /* Obtain source and target DB pointers */
4884 src = c->db;
4885 srcid = c->db->id;
4886 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4887 addReply(c,shared.outofrangeerr);
4888 return;
4889 }
4890 dst = c->db;
4891 selectDb(c,srcid); /* Back to the source DB */
4892
4893 /* If the user is moving using as target the same
4894 * DB as the source DB it is probably an error. */
4895 if (src == dst) {
4896 addReply(c,shared.sameobjecterr);
4897 return;
4898 }
4899
4900 /* Check if the element exists and get a reference */
4901 o = lookupKeyWrite(c->db,c->argv[1]);
4902 if (!o) {
4903 addReply(c,shared.czero);
4904 return;
4905 }
4906
4907 /* Try to add the element to the target DB */
4908 deleteIfVolatile(dst,c->argv[1]);
4909 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4910 addReply(c,shared.czero);
4911 return;
4912 }
4913 incrRefCount(o);
4914
4915 /* OK! key moved, free the entry in the source DB */
4916 dbDelete(src,c->argv[1]);
4917 server.dirty++;
4918 addReply(c,shared.cone);
4919 }
4920
4921 /* =================================== Lists ================================ */
4922
4923
4924 /* Check the argument length to see if it requires us to convert the ziplist
4925 * to a real list. Only check raw-encoded objects because integer encoded
4926 * objects are never too long. */
4927 static void listTypeTryConversion(robj *subject, robj *value) {
4928 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4929 if (value->encoding == REDIS_ENCODING_RAW &&
4930 sdslen(value->ptr) > server.list_max_ziplist_value)
4931 listTypeConvert(subject,REDIS_ENCODING_LIST);
4932 }
4933
4934 static void listTypePush(robj *subject, robj *value, int where) {
4935 /* Check if we need to convert the ziplist */
4936 listTypeTryConversion(subject,value);
4937 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4938 ziplistLen(subject->ptr) >= server.list_max_ziplist_entries)
4939 listTypeConvert(subject,REDIS_ENCODING_LIST);
4940
4941 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4942 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4943 value = getDecodedObject(value);
4944 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4945 decrRefCount(value);
4946 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4947 if (where == REDIS_HEAD) {
4948 listAddNodeHead(subject->ptr,value);
4949 } else {
4950 listAddNodeTail(subject->ptr,value);
4951 }
4952 incrRefCount(value);
4953 } else {
4954 redisPanic("Unknown list encoding");
4955 }
4956 }
4957
4958 static robj *listTypePop(robj *subject, int where) {
4959 robj *value = NULL;
4960 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4961 unsigned char *p;
4962 unsigned char *vstr;
4963 unsigned int vlen;
4964 long long vlong;
4965 int pos = (where == REDIS_HEAD) ? 0 : -1;
4966 p = ziplistIndex(subject->ptr,pos);
4967 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4968 if (vstr) {
4969 value = createStringObject((char*)vstr,vlen);
4970 } else {
4971 value = createStringObjectFromLongLong(vlong);
4972 }
4973 /* We only need to delete an element when it exists */
4974 subject->ptr = ziplistDelete(subject->ptr,&p);
4975 }
4976 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4977 list *list = subject->ptr;
4978 listNode *ln;
4979 if (where == REDIS_HEAD) {
4980 ln = listFirst(list);
4981 } else {
4982 ln = listLast(list);
4983 }
4984 if (ln != NULL) {
4985 value = listNodeValue(ln);
4986 incrRefCount(value);
4987 listDelNode(list,ln);
4988 }
4989 } else {
4990 redisPanic("Unknown list encoding");
4991 }
4992 return value;
4993 }
4994
4995 static unsigned long listTypeLength(robj *subject) {
4996 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4997 return ziplistLen(subject->ptr);
4998 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4999 return listLength((list*)subject->ptr);
5000 } else {
5001 redisPanic("Unknown list encoding");
5002 }
5003 }
5004
5005 /* Structure to hold set iteration abstraction. */
5006 typedef struct {
5007 robj *subject;
5008 unsigned char encoding;
5009 unsigned char direction; /* Iteration direction */
5010 unsigned char *zi;
5011 listNode *ln;
5012 } listTypeIterator;
5013
5014 /* Structure for an entry while iterating over a list. */
5015 typedef struct {
5016 listTypeIterator *li;
5017 unsigned char *zi; /* Entry in ziplist */
5018 listNode *ln; /* Entry in linked list */
5019 } listTypeEntry;
5020
5021 /* Initialize an iterator at the specified index. */
5022 static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5023 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
5024 li->subject = subject;
5025 li->encoding = subject->encoding;
5026 li->direction = direction;
5027 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5028 li->zi = ziplistIndex(subject->ptr,index);
5029 } else if (li->encoding == REDIS_ENCODING_LIST) {
5030 li->ln = listIndex(subject->ptr,index);
5031 } else {
5032 redisPanic("Unknown list encoding");
5033 }
5034 return li;
5035 }
5036
5037 /* Clean up the iterator. */
5038 static void listTypeReleaseIterator(listTypeIterator *li) {
5039 zfree(li);
5040 }
5041
5042 /* Stores pointer to current the entry in the provided entry structure
5043 * and advances the position of the iterator. Returns 1 when the current
5044 * entry is in fact an entry, 0 otherwise. */
5045 static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
5046 /* Protect from converting when iterating */
5047 redisAssert(li->subject->encoding == li->encoding);
5048
5049 entry->li = li;
5050 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5051 entry->zi = li->zi;
5052 if (entry->zi != NULL) {
5053 if (li->direction == REDIS_TAIL)
5054 li->zi = ziplistNext(li->subject->ptr,li->zi);
5055 else
5056 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5057 return 1;
5058 }
5059 } else if (li->encoding == REDIS_ENCODING_LIST) {
5060 entry->ln = li->ln;
5061 if (entry->ln != NULL) {
5062 if (li->direction == REDIS_TAIL)
5063 li->ln = li->ln->next;
5064 else
5065 li->ln = li->ln->prev;
5066 return 1;
5067 }
5068 } else {
5069 redisPanic("Unknown list encoding");
5070 }
5071 return 0;
5072 }
5073
5074 /* Return entry or NULL at the current position of the iterator. */
5075 static robj *listTypeGet(listTypeEntry *entry) {
5076 listTypeIterator *li = entry->li;
5077 robj *value = NULL;
5078 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5079 unsigned char *vstr;
5080 unsigned int vlen;
5081 long long vlong;
5082 redisAssert(entry->zi != NULL);
5083 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5084 if (vstr) {
5085 value = createStringObject((char*)vstr,vlen);
5086 } else {
5087 value = createStringObjectFromLongLong(vlong);
5088 }
5089 }
5090 } else if (li->encoding == REDIS_ENCODING_LIST) {
5091 redisAssert(entry->ln != NULL);
5092 value = listNodeValue(entry->ln);
5093 incrRefCount(value);
5094 } else {
5095 redisPanic("Unknown list encoding");
5096 }
5097 return value;
5098 }
5099
5100 static void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
5101 robj *subject = entry->li->subject;
5102 if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
5103 value = getDecodedObject(value);
5104 if (where == REDIS_TAIL) {
5105 unsigned char *next = ziplistNext(subject->ptr,entry->zi);
5106
5107 /* When we insert after the current element, but the current element
5108 * is the tail of the list, we need to do a push. */
5109 if (next == NULL) {
5110 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL);
5111 } else {
5112 subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr));
5113 }
5114 } else {
5115 subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr));
5116 }
5117 decrRefCount(value);
5118 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5119 if (where == REDIS_TAIL) {
5120 listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL);
5121 } else {
5122 listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD);
5123 }
5124 incrRefCount(value);
5125 } else {
5126 redisPanic("Unknown list encoding");
5127 }
5128 }
5129
5130 /* Compare the given object with the entry at the current position. */
5131 static int listTypeEqual(listTypeEntry *entry, robj *o) {
5132 listTypeIterator *li = entry->li;
5133 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5134 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5135 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5136 } else if (li->encoding == REDIS_ENCODING_LIST) {
5137 return equalStringObjects(o,listNodeValue(entry->ln));
5138 } else {
5139 redisPanic("Unknown list encoding");
5140 }
5141 }
5142
5143 /* Delete the element pointed to. */
5144 static void listTypeDelete(listTypeEntry *entry) {
5145 listTypeIterator *li = entry->li;
5146 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5147 unsigned char *p = entry->zi;
5148 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5149
5150 /* Update position of the iterator depending on the direction */
5151 if (li->direction == REDIS_TAIL)
5152 li->zi = p;
5153 else
5154 li->zi = ziplistPrev(li->subject->ptr,p);
5155 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5156 listNode *next;
5157 if (li->direction == REDIS_TAIL)
5158 next = entry->ln->next;
5159 else
5160 next = entry->ln->prev;
5161 listDelNode(li->subject->ptr,entry->ln);
5162 li->ln = next;
5163 } else {
5164 redisPanic("Unknown list encoding");
5165 }
5166 }
5167
5168 static void listTypeConvert(robj *subject, int enc) {
5169 listTypeIterator *li;
5170 listTypeEntry entry;
5171 redisAssert(subject->type == REDIS_LIST);
5172
5173 if (enc == REDIS_ENCODING_LIST) {
5174 list *l = listCreate();
5175 listSetFreeMethod(l,decrRefCount);
5176
5177 /* listTypeGet returns a robj with incremented refcount */
5178 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5179 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5180 listTypeReleaseIterator(li);
5181
5182 subject->encoding = REDIS_ENCODING_LIST;
5183 zfree(subject->ptr);
5184 subject->ptr = l;
5185 } else {
5186 redisPanic("Unsupported list conversion");
5187 }
5188 }
5189
5190 static void pushGenericCommand(redisClient *c, int where) {
5191 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5192 if (lobj == NULL) {
5193 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5194 addReply(c,shared.cone);
5195 return;
5196 }
5197 lobj = createZiplistObject();
5198 dbAdd(c->db,c->argv[1],lobj);
5199 } else {
5200 if (lobj->type != REDIS_LIST) {
5201 addReply(c,shared.wrongtypeerr);
5202 return;
5203 }
5204 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5205 addReply(c,shared.cone);
5206 return;
5207 }
5208 }
5209 listTypePush(lobj,c->argv[2],where);
5210 addReplyLongLong(c,listTypeLength(lobj));
5211 server.dirty++;
5212 }
5213
5214 static void lpushCommand(redisClient *c) {
5215 pushGenericCommand(c,REDIS_HEAD);
5216 }
5217
5218 static void rpushCommand(redisClient *c) {
5219 pushGenericCommand(c,REDIS_TAIL);
5220 }
5221
5222 static void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) {
5223 robj *subject;
5224 listTypeIterator *iter;
5225 listTypeEntry entry;
5226 int inserted = 0;
5227
5228 if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5229 checkType(c,subject,REDIS_LIST)) return;
5230
5231 if (refval != NULL) {
5232 /* Note: we expect refval to be string-encoded because it is *not* the
5233 * last argument of the multi-bulk LINSERT. */
5234 redisAssert(refval->encoding == REDIS_ENCODING_RAW);
5235
5236 /* We're not sure if this value can be inserted yet, but we cannot
5237 * convert the list inside the iterator. We don't want to loop over
5238 * the list twice (once to see if the value can be inserted and once
5239 * to do the actual insert), so we assume this value can be inserted
5240 * and convert the ziplist to a regular list if necessary. */
5241 listTypeTryConversion(subject,val);
5242
5243 /* Seek refval from head to tail */
5244 iter = listTypeInitIterator(subject,0,REDIS_TAIL);
5245 while (listTypeNext(iter,&entry)) {
5246 if (listTypeEqual(&entry,refval)) {
5247 listTypeInsert(&entry,val,where);
5248 inserted = 1;
5249 break;
5250 }
5251 }
5252 listTypeReleaseIterator(iter);
5253
5254 if (inserted) {
5255 /* Check if the length exceeds the ziplist length threshold. */
5256 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
5257 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
5258 listTypeConvert(subject,REDIS_ENCODING_LIST);
5259 server.dirty++;
5260 } else {
5261 /* Notify client of a failed insert */
5262 addReply(c,shared.cnegone);
5263 return;
5264 }
5265 } else {
5266 listTypePush(subject,val,where);
5267 server.dirty++;
5268 }
5269
5270 addReplyUlong(c,listTypeLength(subject));
5271 }
5272
5273 static void lpushxCommand(redisClient *c) {
5274 pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD);
5275 }
5276
5277 static void rpushxCommand(redisClient *c) {
5278 pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL);
5279 }
5280
5281 static void linsertCommand(redisClient *c) {
5282 if (strcasecmp(c->argv[2]->ptr,"after") == 0) {
5283 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_TAIL);
5284 } else if (strcasecmp(c->argv[2]->ptr,"before") == 0) {
5285 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_HEAD);
5286 } else {
5287 addReply(c,shared.syntaxerr);
5288 }
5289 }
5290
5291 static void llenCommand(redisClient *c) {
5292 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5293 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5294 addReplyUlong(c,listTypeLength(o));
5295 }
5296
5297 static void lindexCommand(redisClient *c) {
5298 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5299 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5300 int index = atoi(c->argv[2]->ptr);
5301 robj *value = NULL;
5302
5303 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5304 unsigned char *p;
5305 unsigned char *vstr;
5306 unsigned int vlen;
5307 long long vlong;
5308 p = ziplistIndex(o->ptr,index);
5309 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5310 if (vstr) {
5311 value = createStringObject((char*)vstr,vlen);
5312 } else {
5313 value = createStringObjectFromLongLong(vlong);
5314 }
5315 addReplyBulk(c,value);
5316 decrRefCount(value);
5317 } else {
5318 addReply(c,shared.nullbulk);
5319 }
5320 } else if (o->encoding == REDIS_ENCODING_LIST) {
5321 listNode *ln = listIndex(o->ptr,index);
5322 if (ln != NULL) {
5323 value = listNodeValue(ln);
5324 addReplyBulk(c,value);
5325 } else {
5326 addReply(c,shared.nullbulk);
5327 }
5328 } else {
5329 redisPanic("Unknown list encoding");
5330 }
5331 }
5332
5333 static void lsetCommand(redisClient *c) {
5334 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5335 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5336 int index = atoi(c->argv[2]->ptr);
5337 robj *value = c->argv[3];
5338
5339 listTypeTryConversion(o,value);
5340 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5341 unsigned char *p, *zl = o->ptr;
5342 p = ziplistIndex(zl,index);
5343 if (p == NULL) {
5344 addReply(c,shared.outofrangeerr);
5345 } else {
5346 o->ptr = ziplistDelete(o->ptr,&p);
5347 value = getDecodedObject(value);
5348 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5349 decrRefCount(value);
5350 addReply(c,shared.ok);
5351 server.dirty++;
5352 }
5353 } else if (o->encoding == REDIS_ENCODING_LIST) {
5354 listNode *ln = listIndex(o->ptr,index);
5355 if (ln == NULL) {
5356 addReply(c,shared.outofrangeerr);
5357 } else {
5358 decrRefCount((robj*)listNodeValue(ln));
5359 listNodeValue(ln) = value;
5360 incrRefCount(value);
5361 addReply(c,shared.ok);
5362 server.dirty++;
5363 }
5364 } else {
5365 redisPanic("Unknown list encoding");
5366 }
5367 }
5368
5369 static void popGenericCommand(redisClient *c, int where) {
5370 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5371 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5372
5373 robj *value = listTypePop(o,where);
5374 if (value == NULL) {
5375 addReply(c,shared.nullbulk);
5376 } else {
5377 addReplyBulk(c,value);
5378 decrRefCount(value);
5379 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5380 server.dirty++;
5381 }
5382 }
5383
5384 static void lpopCommand(redisClient *c) {
5385 popGenericCommand(c,REDIS_HEAD);
5386 }
5387
5388 static void rpopCommand(redisClient *c) {
5389 popGenericCommand(c,REDIS_TAIL);
5390 }
5391
5392 static void lrangeCommand(redisClient *c) {
5393 robj *o, *value;
5394 int start = atoi(c->argv[2]->ptr);
5395 int end = atoi(c->argv[3]->ptr);
5396 int llen;
5397 int rangelen, j;
5398 listTypeEntry entry;
5399
5400 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5401 || checkType(c,o,REDIS_LIST)) return;
5402 llen = listTypeLength(o);
5403
5404 /* convert negative indexes */
5405 if (start < 0) start = llen+start;
5406 if (end < 0) end = llen+end;
5407 if (start < 0) start = 0;
5408 if (end < 0) end = 0;
5409
5410 /* indexes sanity checks */
5411 if (start > end || start >= llen) {
5412 /* Out of range start or start > end result in empty list */
5413 addReply(c,shared.emptymultibulk);
5414 return;
5415 }
5416 if (end >= llen) end = llen-1;
5417 rangelen = (end-start)+1;
5418
5419 /* Return the result in form of a multi-bulk reply */
5420 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5421 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
5422 for (j = 0; j < rangelen; j++) {
5423 redisAssert(listTypeNext(li,&entry));
5424 value = listTypeGet(&entry);
5425 addReplyBulk(c,value);
5426 decrRefCount(value);
5427 }
5428 listTypeReleaseIterator(li);
5429 }
5430
5431 static void ltrimCommand(redisClient *c) {
5432 robj *o;
5433 int start = atoi(c->argv[2]->ptr);
5434 int end = atoi(c->argv[3]->ptr);
5435 int llen;
5436 int j, ltrim, rtrim;
5437 list *list;
5438 listNode *ln;
5439
5440 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5441 checkType(c,o,REDIS_LIST)) return;
5442 llen = listTypeLength(o);
5443
5444 /* convert negative indexes */
5445 if (start < 0) start = llen+start;
5446 if (end < 0) end = llen+end;
5447 if (start < 0) start = 0;
5448 if (end < 0) end = 0;
5449
5450 /* indexes sanity checks */
5451 if (start > end || start >= llen) {
5452 /* Out of range start or start > end result in empty list */
5453 ltrim = llen;
5454 rtrim = 0;
5455 } else {
5456 if (end >= llen) end = llen-1;
5457 ltrim = start;
5458 rtrim = llen-end-1;
5459 }
5460
5461 /* Remove list elements to perform the trim */
5462 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5463 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5464 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5465 } else if (o->encoding == REDIS_ENCODING_LIST) {
5466 list = o->ptr;
5467 for (j = 0; j < ltrim; j++) {
5468 ln = listFirst(list);
5469 listDelNode(list,ln);
5470 }
5471 for (j = 0; j < rtrim; j++) {
5472 ln = listLast(list);
5473 listDelNode(list,ln);
5474 }
5475 } else {
5476 redisPanic("Unknown list encoding");
5477 }
5478 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
5479 server.dirty++;
5480 addReply(c,shared.ok);
5481 }
5482
5483 static void lremCommand(redisClient *c) {
5484 robj *subject, *obj = c->argv[3];
5485 int toremove = atoi(c->argv[2]->ptr);
5486 int removed = 0;
5487 listTypeEntry entry;
5488
5489 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5490 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5491
5492 /* Make sure obj is raw when we're dealing with a ziplist */
5493 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5494 obj = getDecodedObject(obj);
5495
5496 listTypeIterator *li;
5497 if (toremove < 0) {
5498 toremove = -toremove;
5499 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
5500 } else {
5501 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5502 }
5503
5504 while (listTypeNext(li,&entry)) {
5505 if (listTypeEqual(&entry,obj)) {
5506 listTypeDelete(&entry);
5507 server.dirty++;
5508 removed++;
5509 if (toremove && removed == toremove) break;
5510 }
5511 }
5512 listTypeReleaseIterator(li);
5513
5514 /* Clean up raw encoded object */
5515 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5516 decrRefCount(obj);
5517
5518 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5519 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5520 }
5521
5522 /* This is the semantic of this command:
5523 * RPOPLPUSH srclist dstlist:
5524 * IF LLEN(srclist) > 0
5525 * element = RPOP srclist
5526 * LPUSH dstlist element
5527 * RETURN element
5528 * ELSE
5529 * RETURN nil
5530 * END
5531 * END
5532 *
5533 * The idea is to be able to get an element from a list in a reliable way
5534 * since the element is not just returned but pushed against another list
5535 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5536 */
5537 static void rpoplpushcommand(redisClient *c) {
5538 robj *sobj, *value;
5539 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5540 checkType(c,sobj,REDIS_LIST)) return;
5541
5542 if (listTypeLength(sobj) == 0) {
5543 addReply(c,shared.nullbulk);
5544 } else {
5545 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5546 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5547 value = listTypePop(sobj,REDIS_TAIL);
5548
5549 /* Add the element to the target list (unless it's directly
5550 * passed to some BLPOP-ing client */
5551 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5552 /* Create the list if the key does not exist */
5553 if (!dobj) {
5554 dobj = createZiplistObject();
5555 dbAdd(c->db,c->argv[2],dobj);
5556 }
5557 listTypePush(dobj,value,REDIS_HEAD);
5558 }
5559
5560 /* Send the element to the client as reply as well */
5561 addReplyBulk(c,value);
5562
5563 /* listTypePop returns an object with its refcount incremented */
5564 decrRefCount(value);
5565
5566 /* Delete the source list when it is empty */
5567 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5568 server.dirty++;
5569 }
5570 }
5571
5572 /* ==================================== Sets ================================ */
5573
5574 static void saddCommand(redisClient *c) {
5575 robj *set;
5576
5577 set = lookupKeyWrite(c->db,c->argv[1]);
5578 if (set == NULL) {
5579 set = createSetObject();
5580 dbAdd(c->db,c->argv[1],set);
5581 } else {
5582 if (set->type != REDIS_SET) {
5583 addReply(c,shared.wrongtypeerr);
5584 return;
5585 }
5586 }
5587 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5588 incrRefCount(c->argv[2]);
5589 server.dirty++;
5590 addReply(c,shared.cone);
5591 } else {
5592 addReply(c,shared.czero);
5593 }
5594 }
5595
5596 static void sremCommand(redisClient *c) {
5597 robj *set;
5598
5599 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5600 checkType(c,set,REDIS_SET)) return;
5601
5602 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5603 server.dirty++;
5604 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5605 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5606 addReply(c,shared.cone);
5607 } else {
5608 addReply(c,shared.czero);
5609 }
5610 }
5611
5612 static void smoveCommand(redisClient *c) {
5613 robj *srcset, *dstset;
5614
5615 srcset = lookupKeyWrite(c->db,c->argv[1]);
5616 dstset = lookupKeyWrite(c->db,c->argv[2]);
5617
5618 /* If the source key does not exist return 0, if it's of the wrong type
5619 * raise an error */
5620 if (srcset == NULL || srcset->type != REDIS_SET) {
5621 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5622 return;
5623 }
5624 /* Error if the destination key is not a set as well */
5625 if (dstset && dstset->type != REDIS_SET) {
5626 addReply(c,shared.wrongtypeerr);
5627 return;
5628 }
5629 /* Remove the element from the source set */
5630 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5631 /* Key not found in the src set! return zero */
5632 addReply(c,shared.czero);
5633 return;
5634 }
5635 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5636 dbDelete(c->db,c->argv[1]);
5637 server.dirty++;
5638 /* Add the element to the destination set */
5639 if (!dstset) {
5640 dstset = createSetObject();
5641 dbAdd(c->db,c->argv[2],dstset);
5642 }
5643 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5644 incrRefCount(c->argv[3]);
5645 addReply(c,shared.cone);
5646 }
5647
5648 static void sismemberCommand(redisClient *c) {
5649 robj *set;
5650
5651 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5652 checkType(c,set,REDIS_SET)) return;
5653
5654 if (dictFind(set->ptr,c->argv[2]))
5655 addReply(c,shared.cone);
5656 else
5657 addReply(c,shared.czero);
5658 }
5659
5660 static void scardCommand(redisClient *c) {
5661 robj *o;
5662 dict *s;
5663
5664 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5665 checkType(c,o,REDIS_SET)) return;
5666
5667 s = o->ptr;
5668 addReplyUlong(c,dictSize(s));
5669 }
5670
5671 static void spopCommand(redisClient *c) {
5672 robj *set;
5673 dictEntry *de;
5674
5675 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5676 checkType(c,set,REDIS_SET)) return;
5677
5678 de = dictGetRandomKey(set->ptr);
5679 if (de == NULL) {
5680 addReply(c,shared.nullbulk);
5681 } else {
5682 robj *ele = dictGetEntryKey(de);
5683
5684 addReplyBulk(c,ele);
5685 dictDelete(set->ptr,ele);
5686 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5687 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5688 server.dirty++;
5689 }
5690 }
5691
5692 static void srandmemberCommand(redisClient *c) {
5693 robj *set;
5694 dictEntry *de;
5695
5696 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5697 checkType(c,set,REDIS_SET)) return;
5698
5699 de = dictGetRandomKey(set->ptr);
5700 if (de == NULL) {
5701 addReply(c,shared.nullbulk);
5702 } else {
5703 robj *ele = dictGetEntryKey(de);
5704
5705 addReplyBulk(c,ele);
5706 }
5707 }
5708
5709 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5710 dict **d1 = (void*) s1, **d2 = (void*) s2;
5711
5712 return dictSize(*d1)-dictSize(*d2);
5713 }
5714
5715 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5716 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5717 dictIterator *di;
5718 dictEntry *de;
5719 robj *lenobj = NULL, *dstset = NULL;
5720 unsigned long j, cardinality = 0;
5721
5722 for (j = 0; j < setsnum; j++) {
5723 robj *setobj;
5724
5725 setobj = dstkey ?
5726 lookupKeyWrite(c->db,setskeys[j]) :
5727 lookupKeyRead(c->db,setskeys[j]);
5728 if (!setobj) {
5729 zfree(dv);
5730 if (dstkey) {
5731 if (dbDelete(c->db,dstkey))
5732 server.dirty++;
5733 addReply(c,shared.czero);
5734 } else {
5735 addReply(c,shared.emptymultibulk);
5736 }
5737 return;
5738 }
5739 if (setobj->type != REDIS_SET) {
5740 zfree(dv);
5741 addReply(c,shared.wrongtypeerr);
5742 return;
5743 }
5744 dv[j] = setobj->ptr;
5745 }
5746 /* Sort sets from the smallest to largest, this will improve our
5747 * algorithm's performace */
5748 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5749
5750 /* The first thing we should output is the total number of elements...
5751 * since this is a multi-bulk write, but at this stage we don't know
5752 * the intersection set size, so we use a trick, append an empty object
5753 * to the output list and save the pointer to later modify it with the
5754 * right length */
5755 if (!dstkey) {
5756 lenobj = createObject(REDIS_STRING,NULL);
5757 addReply(c,lenobj);
5758 decrRefCount(lenobj);
5759 } else {
5760 /* If we have a target key where to store the resulting set
5761 * create this key with an empty set inside */
5762 dstset = createSetObject();
5763 }
5764
5765 /* Iterate all the elements of the first (smallest) set, and test
5766 * the element against all the other sets, if at least one set does
5767 * not include the element it is discarded */
5768 di = dictGetIterator(dv[0]);
5769
5770 while((de = dictNext(di)) != NULL) {
5771 robj *ele;
5772
5773 for (j = 1; j < setsnum; j++)
5774 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5775 if (j != setsnum)
5776 continue; /* at least one set does not contain the member */
5777 ele = dictGetEntryKey(de);
5778 if (!dstkey) {
5779 addReplyBulk(c,ele);
5780 cardinality++;
5781 } else {
5782 dictAdd(dstset->ptr,ele,NULL);
5783 incrRefCount(ele);
5784 }
5785 }
5786 dictReleaseIterator(di);
5787
5788 if (dstkey) {
5789 /* Store the resulting set into the target, if the intersection
5790 * is not an empty set. */
5791 dbDelete(c->db,dstkey);
5792 if (dictSize((dict*)dstset->ptr) > 0) {
5793 dbAdd(c->db,dstkey,dstset);
5794 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5795 } else {
5796 decrRefCount(dstset);
5797 addReply(c,shared.czero);
5798 }
5799 server.dirty++;
5800 } else {
5801 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5802 }
5803 zfree(dv);
5804 }
5805
5806 static void sinterCommand(redisClient *c) {
5807 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5808 }
5809
5810 static void sinterstoreCommand(redisClient *c) {
5811 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5812 }
5813
/* Operation selectors passed as the 'op' argument of
 * sunionDiffGenericCommand() and zunionInterGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5817
5818 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5819 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5820 dictIterator *di;
5821 dictEntry *de;
5822 robj *dstset = NULL;
5823 int j, cardinality = 0;
5824
5825 for (j = 0; j < setsnum; j++) {
5826 robj *setobj;
5827
5828 setobj = dstkey ?
5829 lookupKeyWrite(c->db,setskeys[j]) :
5830 lookupKeyRead(c->db,setskeys[j]);
5831 if (!setobj) {
5832 dv[j] = NULL;
5833 continue;
5834 }
5835 if (setobj->type != REDIS_SET) {
5836 zfree(dv);
5837 addReply(c,shared.wrongtypeerr);
5838 return;
5839 }
5840 dv[j] = setobj->ptr;
5841 }
5842
5843 /* We need a temp set object to store our union. If the dstkey
5844 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5845 * this set object will be the resulting object to set into the target key*/
5846 dstset = createSetObject();
5847
5848 /* Iterate all the elements of all the sets, add every element a single
5849 * time to the result set */
5850 for (j = 0; j < setsnum; j++) {
5851 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5852 if (!dv[j]) continue; /* non existing keys are like empty sets */
5853
5854 di = dictGetIterator(dv[j]);
5855
5856 while((de = dictNext(di)) != NULL) {
5857 robj *ele;
5858
5859 /* dictAdd will not add the same element multiple times */
5860 ele = dictGetEntryKey(de);
5861 if (op == REDIS_OP_UNION || j == 0) {
5862 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5863 incrRefCount(ele);
5864 cardinality++;
5865 }
5866 } else if (op == REDIS_OP_DIFF) {
5867 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5868 cardinality--;
5869 }
5870 }
5871 }
5872 dictReleaseIterator(di);
5873
5874 /* result set is empty? Exit asap. */
5875 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5876 }
5877
5878 /* Output the content of the resulting set, if not in STORE mode */
5879 if (!dstkey) {
5880 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5881 di = dictGetIterator(dstset->ptr);
5882 while((de = dictNext(di)) != NULL) {
5883 robj *ele;
5884
5885 ele = dictGetEntryKey(de);
5886 addReplyBulk(c,ele);
5887 }
5888 dictReleaseIterator(di);
5889 decrRefCount(dstset);
5890 } else {
5891 /* If we have a target key where to store the resulting set
5892 * create this key with the result set inside */
5893 dbDelete(c->db,dstkey);
5894 if (dictSize((dict*)dstset->ptr) > 0) {
5895 dbAdd(c->db,dstkey,dstset);
5896 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5897 } else {
5898 decrRefCount(dstset);
5899 addReply(c,shared.czero);
5900 }
5901 server.dirty++;
5902 }
5903 zfree(dv);
5904 }
5905
5906 static void sunionCommand(redisClient *c) {
5907 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5908 }
5909
5910 static void sunionstoreCommand(redisClient *c) {
5911 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5912 }
5913
5914 static void sdiffCommand(redisClient *c) {
5915 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5916 }
5917
5918 static void sdiffstoreCommand(redisClient *c) {
5919 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5920 }
5921
5922 /* ==================================== ZSets =============================== */
5923
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5931
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This makes it possible to traverse the
 * list from tail to head, useful for ZREVRANGE. */
5940
5941 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5942 zskiplistNode *zn = zmalloc(sizeof(*zn));
5943
5944 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5945 if (level > 1)
5946 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5947 else
5948 zn->span = NULL;
5949 zn->score = score;
5950 zn->obj = obj;
5951 return zn;
5952 }
5953
5954 static zskiplist *zslCreate(void) {
5955 int j;
5956 zskiplist *zsl;
5957
5958 zsl = zmalloc(sizeof(*zsl));
5959 zsl->level = 1;
5960 zsl->length = 0;
5961 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5962 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5963 zsl->header->forward[j] = NULL;
5964
5965 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5966 if (j < ZSKIPLIST_MAXLEVEL-1)
5967 zsl->header->span[j] = 0;
5968 }
5969 zsl->header->backward = NULL;
5970 zsl->tail = NULL;
5971 return zsl;
5972 }
5973
5974 static void zslFreeNode(zskiplistNode *node) {
5975 decrRefCount(node->obj);
5976 zfree(node->forward);
5977 zfree(node->span);
5978 zfree(node);
5979 }
5980
5981 static void zslFree(zskiplist *zsl) {
5982 zskiplistNode *node = zsl->header->forward[0], *next;
5983
5984 zfree(zsl->header->forward);
5985 zfree(zsl->header->span);
5986 zfree(zsl->header);
5987 while(node) {
5988 next = node->forward[0];
5989 zslFreeNode(node);
5990 node = next;
5991 }
5992 zfree(zsl);
5993 }
5994
5995 static int zslRandomLevel(void) {
5996 int level = 1;
5997 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5998 level += 1;
5999 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6000 }
6001
6002 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
6003 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6004 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6005 int i, level;
6006
6007 x = zsl->header;
6008 for (i = zsl->level-1; i >= 0; i--) {
6009 /* store rank that is crossed to reach the insert position */
6010 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
6011
6012 while (x->forward[i] &&
6013 (x->forward[i]->score < score ||
6014 (x->forward[i]->score == score &&
6015 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
6016 rank[i] += i > 0 ? x->span[i-1] : 1;
6017 x = x->forward[i];
6018 }
6019 update[i] = x;
6020 }
6021 /* we assume the key is not already inside, since we allow duplicated
6022 * scores, and the re-insertion of score and redis object should never
6023 * happpen since the caller of zslInsert() should test in the hash table
6024 * if the element is already inside or not. */
6025 level = zslRandomLevel();
6026 if (level > zsl->level) {
6027 for (i = zsl->level; i < level; i++) {
6028 rank[i] = 0;
6029 update[i] = zsl->header;
6030 update[i]->span[i-1] = zsl->length;
6031 }
6032 zsl->level = level;
6033 }
6034 x = zslCreateNode(level,score,obj);
6035 for (i = 0; i < level; i++) {
6036 x->forward[i] = update[i]->forward[i];
6037 update[i]->forward[i] = x;
6038
6039 /* update span covered by update[i] as x is inserted here */
6040 if (i > 0) {
6041 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6042 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6043 }
6044 }
6045
6046 /* increment span for untouched levels */
6047 for (i = level; i < zsl->level; i++) {
6048 update[i]->span[i-1]++;
6049 }
6050
6051 x->backward = (update[0] == zsl->header) ? NULL : update[0];
6052 if (x->forward[0])
6053 x->forward[0]->backward = x;
6054 else
6055 zsl->tail = x;
6056 zsl->length++;
6057 }
6058
6059 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6060 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6061 int i;
6062 for (i = 0; i < zsl->level; i++) {
6063 if (update[i]->forward[i] == x) {
6064 if (i > 0) {
6065 update[i]->span[i-1] += x->span[i-1] - 1;
6066 }
6067 update[i]->forward[i] = x->forward[i];
6068 } else {
6069 /* invariant: i > 0, because update[0]->forward[0]
6070 * is always equal to x */
6071 update[i]->span[i-1] -= 1;
6072 }
6073 }
6074 if (x->forward[0]) {
6075 x->forward[0]->backward = x->backward;
6076 } else {
6077 zsl->tail = x->backward;
6078 }
6079 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6080 zsl->level--;
6081 zsl->length--;
6082 }
6083
6084 /* Delete an element with matching score/object from the skiplist. */
6085 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
6086 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6087 int i;
6088
6089 x = zsl->header;
6090 for (i = zsl->level-1; i >= 0; i--) {
6091 while (x->forward[i] &&
6092 (x->forward[i]->score < score ||
6093 (x->forward[i]->score == score &&
6094 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6095 x = x->forward[i];
6096 update[i] = x;
6097 }
6098 /* We may have multiple elements with the same score, what we need
6099 * is to find the element with both the right score and object. */
6100 x = x->forward[0];
6101 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
6102 zslDeleteNode(zsl, x, update);
6103 zslFreeNode(x);
6104 return 1;
6105 } else {
6106 return 0; /* not found */
6107 }
6108 return 0; /* not found */
6109 }
6110
6111 /* Delete all the elements with score between min and max from the skiplist.
6112 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6113 * Note that this function takes the reference to the hash table view of the
6114 * sorted set, in order to remove the elements from the hash table too. */
6115 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
6116 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6117 unsigned long removed = 0;
6118 int i;
6119
6120 x = zsl->header;
6121 for (i = zsl->level-1; i >= 0; i--) {
6122 while (x->forward[i] && x->forward[i]->score < min)
6123 x = x->forward[i];
6124 update[i] = x;
6125 }
6126 /* We may have multiple elements with the same score, what we need
6127 * is to find the element with both the right score and object. */
6128 x = x->forward[0];
6129 while (x && x->score <= max) {
6130 zskiplistNode *next = x->forward[0];
6131 zslDeleteNode(zsl, x, update);
6132 dictDelete(dict,x->obj);
6133 zslFreeNode(x);
6134 removed++;
6135 x = next;
6136 }
6137 return removed; /* not found */
6138 }
6139
6140 /* Delete all the elements with rank between start and end from the skiplist.
6141 * Start and end are inclusive. Note that start and end need to be 1-based */
6142 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6143 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6144 unsigned long traversed = 0, removed = 0;
6145 int i;
6146
6147 x = zsl->header;
6148 for (i = zsl->level-1; i >= 0; i--) {
6149 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6150 traversed += i > 0 ? x->span[i-1] : 1;
6151 x = x->forward[i];
6152 }
6153 update[i] = x;
6154 }
6155
6156 traversed++;
6157 x = x->forward[0];
6158 while (x && traversed <= end) {
6159 zskiplistNode *next = x->forward[0];
6160 zslDeleteNode(zsl, x, update);
6161 dictDelete(dict,x->obj);
6162 zslFreeNode(x);
6163 removed++;
6164 traversed++;
6165 x = next;
6166 }
6167 return removed;
6168 }
6169
6170 /* Find the first node having a score equal or greater than the specified one.
6171 * Returns NULL if there is no match. */
6172 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6173 zskiplistNode *x;
6174 int i;
6175
6176 x = zsl->header;
6177 for (i = zsl->level-1; i >= 0; i--) {
6178 while (x->forward[i] && x->forward[i]->score < score)
6179 x = x->forward[i];
6180 }
6181 /* We may have multiple elements with the same score, what we need
6182 * is to find the element with both the right score and object. */
6183 return x->forward[0];
6184 }
6185
6186 /* Find the rank for an element by both score and key.
6187 * Returns 0 when the element cannot be found, rank otherwise.
6188 * Note that the rank is 1-based due to the span of zsl->header to the
6189 * first element. */
6190 static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
6191 zskiplistNode *x;
6192 unsigned long rank = 0;
6193 int i;
6194
6195 x = zsl->header;
6196 for (i = zsl->level-1; i >= 0; i--) {
6197 while (x->forward[i] &&
6198 (x->forward[i]->score < score ||
6199 (x->forward[i]->score == score &&
6200 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6201 rank += i > 0 ? x->span[i-1] : 1;
6202 x = x->forward[i];
6203 }
6204
6205 /* x might be equal to zsl->header, so test if obj is non-NULL */
6206 if (x->obj && equalStringObjects(x->obj,o)) {
6207 return rank;
6208 }
6209 }
6210 return 0;
6211 }
6212
6213 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6214 zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
6215 zskiplistNode *x;
6216 unsigned long traversed = 0;
6217 int i;
6218
6219 x = zsl->header;
6220 for (i = zsl->level-1; i >= 0; i--) {
6221 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6222 {
6223 traversed += i > 0 ? x->span[i-1] : 1;
6224 x = x->forward[i];
6225 }
6226 if (traversed == rank) {
6227 return x;
6228 }
6229 }
6230 return NULL;
6231 }
6232
6233 /* The actual Z-commands implementations */
6234
6235 /* This generic command implements both ZADD and ZINCRBY.
6236 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6237 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6238 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6239 robj *zsetobj;
6240 zset *zs;
6241 double *score;
6242
6243 if (isnan(scoreval)) {
6244 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6245 return;
6246 }
6247
6248 zsetobj = lookupKeyWrite(c->db,key);
6249 if (zsetobj == NULL) {
6250 zsetobj = createZsetObject();
6251 dbAdd(c->db,key,zsetobj);
6252 } else {
6253 if (zsetobj->type != REDIS_ZSET) {
6254 addReply(c,shared.wrongtypeerr);
6255 return;
6256 }
6257 }
6258 zs = zsetobj->ptr;
6259
6260 /* Ok now since we implement both ZADD and ZINCRBY here the code
6261 * needs to handle the two different conditions. It's all about setting
6262 * '*score', that is, the new score to set, to the right value. */
6263 score = zmalloc(sizeof(double));
6264 if (doincrement) {
6265 dictEntry *de;
6266
6267 /* Read the old score. If the element was not present starts from 0 */
6268 de = dictFind(zs->dict,ele);
6269 if (de) {
6270 double *oldscore = dictGetEntryVal(de);
6271 *score = *oldscore + scoreval;
6272 } else {
6273 *score = scoreval;
6274 }
6275 if (isnan(*score)) {
6276 addReplySds(c,
6277 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6278 zfree(score);
6279 /* Note that we don't need to check if the zset may be empty and
6280 * should be removed here, as we can only obtain Nan as score if
6281 * there was already an element in the sorted set. */
6282 return;
6283 }
6284 } else {
6285 *score = scoreval;
6286 }
6287
6288 /* What follows is a simple remove and re-insert operation that is common
6289 * to both ZADD and ZINCRBY... */
6290 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6291 /* case 1: New element */
6292 incrRefCount(ele); /* added to hash */
6293 zslInsert(zs->zsl,*score,ele);
6294 incrRefCount(ele); /* added to skiplist */
6295 server.dirty++;
6296 if (doincrement)
6297 addReplyDouble(c,*score);
6298 else
6299 addReply(c,shared.cone);
6300 } else {
6301 dictEntry *de;
6302 double *oldscore;
6303
6304 /* case 2: Score update operation */
6305 de = dictFind(zs->dict,ele);
6306 redisAssert(de != NULL);
6307 oldscore = dictGetEntryVal(de);
6308 if (*score != *oldscore) {
6309 int deleted;
6310
6311 /* Remove and insert the element in the skip list with new score */
6312 deleted = zslDelete(zs->zsl,*oldscore,ele);
6313 redisAssert(deleted != 0);
6314 zslInsert(zs->zsl,*score,ele);
6315 incrRefCount(ele);
6316 /* Update the score in the hash table */
6317 dictReplace(zs->dict,ele,score);
6318 server.dirty++;
6319 } else {
6320 zfree(score);
6321 }
6322 if (doincrement)
6323 addReplyDouble(c,*score);
6324 else
6325 addReply(c,shared.czero);
6326 }
6327 }
6328
6329 static void zaddCommand(redisClient *c) {
6330 double scoreval;
6331
6332 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6333 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6334 }
6335
6336 static void zincrbyCommand(redisClient *c) {
6337 double scoreval;
6338
6339 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6340 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6341 }
6342
6343 static void zremCommand(redisClient *c) {
6344 robj *zsetobj;
6345 zset *zs;
6346 dictEntry *de;
6347 double *oldscore;
6348 int deleted;
6349
6350 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6351 checkType(c,zsetobj,REDIS_ZSET)) return;
6352
6353 zs = zsetobj->ptr;
6354 de = dictFind(zs->dict,c->argv[2]);
6355 if (de == NULL) {
6356 addReply(c,shared.czero);
6357 return;
6358 }
6359 /* Delete from the skiplist */
6360 oldscore = dictGetEntryVal(de);
6361 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6362 redisAssert(deleted != 0);
6363
6364 /* Delete from the hash table */
6365 dictDelete(zs->dict,c->argv[2]);
6366 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6367 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6368 server.dirty++;
6369 addReply(c,shared.cone);
6370 }
6371
6372 static void zremrangebyscoreCommand(redisClient *c) {
6373 double min;
6374 double max;
6375 long deleted;
6376 robj *zsetobj;
6377 zset *zs;
6378
6379 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6380 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6381
6382 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6383 checkType(c,zsetobj,REDIS_ZSET)) return;
6384
6385 zs = zsetobj->ptr;
6386 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6387 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6388 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6389 server.dirty += deleted;
6390 addReplyLongLong(c,deleted);
6391 }
6392
6393 static void zremrangebyrankCommand(redisClient *c) {
6394 long start;
6395 long end;
6396 int llen;
6397 long deleted;
6398 robj *zsetobj;
6399 zset *zs;
6400
6401 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6402 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6403
6404 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6405 checkType(c,zsetobj,REDIS_ZSET)) return;
6406 zs = zsetobj->ptr;
6407 llen = zs->zsl->length;
6408
6409 /* convert negative indexes */
6410 if (start < 0) start = llen+start;
6411 if (end < 0) end = llen+end;
6412 if (start < 0) start = 0;
6413 if (end < 0) end = 0;
6414
6415 /* indexes sanity checks */
6416 if (start > end || start >= llen) {
6417 addReply(c,shared.czero);
6418 return;
6419 }
6420 if (end >= llen) end = llen-1;
6421
6422 /* increment start and end because zsl*Rank functions
6423 * use 1-based rank */
6424 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6425 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6426 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6427 server.dirty += deleted;
6428 addReplyLongLong(c, deleted);
6429 }
6430
/* One input of a ZUNIONSTORE/ZINTERSTORE operation. */
typedef struct {
    dict *dict;     /* dictionary of the input set or sorted set, NULL when
                     * the input key does not exist */
    double weight;  /* factor the scores of this input are multiplied by */
} zsetopsrc;
6435
6436 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6437 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6438 unsigned long size1, size2;
6439 size1 = d1->dict ? dictSize(d1->dict) : 0;
6440 size2 = d2->dict ? dictSize(d2->dict) : 0;
6441 return size1 - size2;
6442 }
6443
/* Aggregation modes accepted by zunionInterAggregate() */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dictionary entry: entries with a NULL value count as
 * having score 1.0, otherwise the value is read as a pointer to double. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
6448
6449 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6450 if (aggregate == REDIS_AGGR_SUM) {
6451 *target = *target + val;
6452 } else if (aggregate == REDIS_AGGR_MIN) {
6453 *target = val < *target ? val : *target;
6454 } else if (aggregate == REDIS_AGGR_MAX) {
6455 *target = val > *target ? val : *target;
6456 } else {
6457 /* safety net */
6458 redisPanic("Unknown ZUNION/INTER aggregate type");
6459 }
6460 }
6461
6462 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6463 int i, j, setnum;
6464 int aggregate = REDIS_AGGR_SUM;
6465 zsetopsrc *src;
6466 robj *dstobj;
6467 zset *dstzset;
6468 dictIterator *di;
6469 dictEntry *de;
6470
6471 /* expect setnum input keys to be given */
6472 setnum = atoi(c->argv[2]->ptr);
6473 if (setnum < 1) {
6474 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6475 return;
6476 }
6477
6478 /* test if the expected number of keys would overflow */
6479 if (3+setnum > c->argc) {
6480 addReply(c,shared.syntaxerr);
6481 return;
6482 }
6483
6484 /* read keys to be used for input */
6485 src = zmalloc(sizeof(zsetopsrc) * setnum);
6486 for (i = 0, j = 3; i < setnum; i++, j++) {
6487 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6488 if (!obj) {
6489 src[i].dict = NULL;
6490 } else {
6491 if (obj->type == REDIS_ZSET) {
6492 src[i].dict = ((zset*)obj->ptr)->dict;
6493 } else if (obj->type == REDIS_SET) {
6494 src[i].dict = (obj->ptr);
6495 } else {
6496 zfree(src);
6497 addReply(c,shared.wrongtypeerr);
6498 return;
6499 }
6500 }
6501
6502 /* default all weights to 1 */
6503 src[i].weight = 1.0;
6504 }
6505
6506 /* parse optional extra arguments */
6507 if (j < c->argc) {
6508 int remaining = c->argc - j;
6509
6510 while (remaining) {
6511 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6512 j++; remaining--;
6513 for (i = 0; i < setnum; i++, j++, remaining--) {
6514 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6515 return;
6516 }
6517 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6518 j++; remaining--;
6519 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6520 aggregate = REDIS_AGGR_SUM;
6521 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6522 aggregate = REDIS_AGGR_MIN;
6523 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6524 aggregate = REDIS_AGGR_MAX;
6525 } else {
6526 zfree(src);
6527 addReply(c,shared.syntaxerr);
6528 return;
6529 }
6530 j++; remaining--;
6531 } else {
6532 zfree(src);
6533 addReply(c,shared.syntaxerr);
6534 return;
6535 }
6536 }
6537 }
6538
6539 /* sort sets from the smallest to largest, this will improve our
6540 * algorithm's performance */
6541 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6542
6543 dstobj = createZsetObject();
6544 dstzset = dstobj->ptr;
6545
6546 if (op == REDIS_OP_INTER) {
6547 /* skip going over all entries if the smallest zset is NULL or empty */
6548 if (src[0].dict && dictSize(src[0].dict) > 0) {
6549 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6550 * from small to large, all src[i > 0].dict are non-empty too */
6551 di = dictGetIterator(src[0].dict);
6552 while((de = dictNext(di)) != NULL) {
6553 double *score = zmalloc(sizeof(double)), value;
6554 *score = src[0].weight * zunionInterDictValue(de);
6555
6556 for (j = 1; j < setnum; j++) {
6557 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6558 if (other) {
6559 value = src[j].weight * zunionInterDictValue(other);
6560 zunionInterAggregate(score, value, aggregate);
6561 } else {
6562 break;
6563 }
6564 }
6565
6566 /* skip entry when not present in every source dict */
6567 if (j != setnum) {
6568 zfree(score);
6569 } else {
6570 robj *o = dictGetEntryKey(de);
6571 dictAdd(dstzset->dict,o,score);
6572 incrRefCount(o); /* added to dictionary */
6573 zslInsert(dstzset->zsl,*score,o);
6574 incrRefCount(o); /* added to skiplist */
6575 }
6576 }
6577 dictReleaseIterator(di);
6578 }
6579 } else if (op == REDIS_OP_UNION) {
6580 for (i = 0; i < setnum; i++) {
6581 if (!src[i].dict) continue;
6582
6583 di = dictGetIterator(src[i].dict);
6584 while((de = dictNext(di)) != NULL) {
6585 /* skip key when already processed */
6586 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6587
6588 double *score = zmalloc(sizeof(double)), value;
6589 *score = src[i].weight * zunionInterDictValue(de);
6590
6591 /* because the zsets are sorted by size, its only possible
6592 * for sets at larger indices to hold this entry */
6593 for (j = (i+1); j < setnum; j++) {
6594 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6595 if (other) {
6596 value = src[j].weight * zunionInterDictValue(other);
6597 zunionInterAggregate(score, value, aggregate);
6598 }
6599 }
6600
6601 robj *o = dictGetEntryKey(de);
6602 dictAdd(dstzset->dict,o,score);
6603 incrRefCount(o); /* added to dictionary */
6604 zslInsert(dstzset->zsl,*score,o);
6605 incrRefCount(o); /* added to skiplist */
6606 }
6607 dictReleaseIterator(di);
6608 }
6609 } else {
6610 /* unknown operator */
6611 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6612 }
6613
6614 dbDelete(c->db,dstkey);
6615 if (dstzset->zsl->length) {
6616 dbAdd(c->db,dstkey,dstobj);
6617 addReplyLongLong(c, dstzset->zsl->length);
6618 server.dirty++;
6619 } else {
6620 decrRefCount(dstobj);
6621 addReply(c, shared.czero);
6622 }
6623 zfree(src);
6624 }
6625
6626 static void zunionstoreCommand(redisClient *c) {
6627 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6628 }
6629
6630 static void zinterstoreCommand(redisClient *c) {
6631 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6632 }
6633
6634 static void zrangeGenericCommand(redisClient *c, int reverse) {
6635 robj *o;
6636 long start;
6637 long end;
6638 int withscores = 0;
6639 int llen;
6640 int rangelen, j;
6641 zset *zsetobj;
6642 zskiplist *zsl;
6643 zskiplistNode *ln;
6644 robj *ele;
6645
6646 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6647 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6648
6649 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6650 withscores = 1;
6651 } else if (c->argc >= 5) {
6652 addReply(c,shared.syntaxerr);
6653 return;
6654 }
6655
6656 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6657 || checkType(c,o,REDIS_ZSET)) return;
6658 zsetobj = o->ptr;
6659 zsl = zsetobj->zsl;
6660 llen = zsl->length;
6661
6662 /* convert negative indexes */
6663 if (start < 0) start = llen+start;
6664 if (end < 0) end = llen+end;
6665 if (start < 0) start = 0;
6666 if (end < 0) end = 0;
6667
6668 /* indexes sanity checks */
6669 if (start > end || start >= llen) {
6670 /* Out of range start or start > end result in empty list */
6671 addReply(c,shared.emptymultibulk);
6672 return;
6673 }
6674 if (end >= llen) end = llen-1;
6675 rangelen = (end-start)+1;
6676
6677 /* check if starting point is trivial, before searching
6678 * the element in log(N) time */
6679 if (reverse) {
6680 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
6681 } else {
6682 ln = start == 0 ?
6683 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
6684 }
6685
6686 /* Return the result in form of a multi-bulk reply */
6687 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6688 withscores ? (rangelen*2) : rangelen));
6689 for (j = 0; j < rangelen; j++) {
6690 ele = ln->obj;
6691 addReplyBulk(c,ele);
6692 if (withscores)
6693 addReplyDouble(c,ln->score);
6694 ln = reverse ? ln->backward : ln->forward[0];
6695 }
6696 }
6697
6698 static void zrangeCommand(redisClient *c) {
6699 zrangeGenericCommand(c,0);
6700 }
6701
6702 static void zrevrangeCommand(redisClient *c) {
6703 zrangeGenericCommand(c,1);
6704 }
6705
6706 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6707 * If justcount is non-zero, just the count is returned. */
6708 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6709 robj *o;
6710 double min, max;
6711 int minex = 0, maxex = 0; /* are min or max exclusive? */
6712 int offset = 0, limit = -1;
6713 int withscores = 0;
6714 int badsyntax = 0;
6715
6716 /* Parse the min-max interval. If one of the values is prefixed
6717 * by the "(" character, it's considered "open". For instance
6718 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6719 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6720 if (((char*)c->argv[2]->ptr)[0] == '(') {
6721 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6722 minex = 1;
6723 } else {
6724 min = strtod(c->argv[2]->ptr,NULL);
6725 }
6726 if (((char*)c->argv[3]->ptr)[0] == '(') {
6727 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6728 maxex = 1;
6729 } else {
6730 max = strtod(c->argv[3]->ptr,NULL);
6731 }
6732
6733 /* Parse "WITHSCORES": note that if the command was called with
6734 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6735 * enter the following paths to parse WITHSCORES and LIMIT. */
6736 if (c->argc == 5 || c->argc == 8) {
6737 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6738 withscores = 1;
6739 else
6740 badsyntax = 1;
6741 }
6742 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6743 badsyntax = 1;
6744 if (badsyntax) {
6745 addReplySds(c,
6746 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6747 return;
6748 }
6749
6750 /* Parse "LIMIT" */
6751 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6752 addReply(c,shared.syntaxerr);
6753 return;
6754 } else if (c->argc == (7 + withscores)) {
6755 offset = atoi(c->argv[5]->ptr);
6756 limit = atoi(c->argv[6]->ptr);
6757 if (offset < 0) offset = 0;
6758 }
6759
6760 /* Ok, lookup the key and get the range */
6761 o = lookupKeyRead(c->db,c->argv[1]);
6762 if (o == NULL) {
6763 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6764 } else {
6765 if (o->type != REDIS_ZSET) {
6766 addReply(c,shared.wrongtypeerr);
6767 } else {
6768 zset *zsetobj = o->ptr;
6769 zskiplist *zsl = zsetobj->zsl;
6770 zskiplistNode *ln;
6771 robj *ele, *lenobj = NULL;
6772 unsigned long rangelen = 0;
6773
6774 /* Get the first node with the score >= min, or with
6775 * score > min if 'minex' is true. */
6776 ln = zslFirstWithScore(zsl,min);
6777 while (minex && ln && ln->score == min) ln = ln->forward[0];
6778
6779 if (ln == NULL) {
6780 /* No element matching the speciifed interval */
6781 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6782 return;
6783 }
6784
6785 /* We don't know in advance how many matching elements there
6786 * are in the list, so we push this object that will represent
6787 * the multi-bulk length in the output buffer, and will "fix"
6788 * it later */
6789 if (!justcount) {
6790 lenobj = createObject(REDIS_STRING,NULL);
6791 addReply(c,lenobj);
6792 decrRefCount(lenobj);
6793 }
6794
6795 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6796 if (offset) {
6797 offset--;
6798 ln = ln->forward[0];
6799 continue;
6800 }
6801 if (limit == 0) break;
6802 if (!justcount) {
6803 ele = ln->obj;
6804 addReplyBulk(c,ele);
6805 if (withscores)
6806 addReplyDouble(c,ln->score);
6807 }
6808 ln = ln->forward[0];
6809 rangelen++;
6810 if (limit > 0) limit--;
6811 }
6812 if (justcount) {
6813 addReplyLongLong(c,(long)rangelen);
6814 } else {
6815 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6816 withscores ? (rangelen*2) : rangelen);
6817 }
6818 }
6819 }
6820 }
6821
6822 static void zrangebyscoreCommand(redisClient *c) {
6823 genericZrangebyscoreCommand(c,0);
6824 }
6825
6826 static void zcountCommand(redisClient *c) {
6827 genericZrangebyscoreCommand(c,1);
6828 }
6829
6830 static void zcardCommand(redisClient *c) {
6831 robj *o;
6832 zset *zs;
6833
6834 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6835 checkType(c,o,REDIS_ZSET)) return;
6836
6837 zs = o->ptr;
6838 addReplyUlong(c,zs->zsl->length);
6839 }
6840
6841 static void zscoreCommand(redisClient *c) {
6842 robj *o;
6843 zset *zs;
6844 dictEntry *de;
6845
6846 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6847 checkType(c,o,REDIS_ZSET)) return;
6848
6849 zs = o->ptr;
6850 de = dictFind(zs->dict,c->argv[2]);
6851 if (!de) {
6852 addReply(c,shared.nullbulk);
6853 } else {
6854 double *score = dictGetEntryVal(de);
6855
6856 addReplyDouble(c,*score);
6857 }
6858 }
6859
6860 static void zrankGenericCommand(redisClient *c, int reverse) {
6861 robj *o;
6862 zset *zs;
6863 zskiplist *zsl;
6864 dictEntry *de;
6865 unsigned long rank;
6866 double *score;
6867
6868 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6869 checkType(c,o,REDIS_ZSET)) return;
6870
6871 zs = o->ptr;
6872 zsl = zs->zsl;
6873 de = dictFind(zs->dict,c->argv[2]);
6874 if (!de) {
6875 addReply(c,shared.nullbulk);
6876 return;
6877 }
6878
6879 score = dictGetEntryVal(de);
6880 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
6881 if (rank) {
6882 if (reverse) {
6883 addReplyLongLong(c, zsl->length - rank);
6884 } else {
6885 addReplyLongLong(c, rank-1);
6886 }
6887 } else {
6888 addReply(c,shared.nullbulk);
6889 }
6890 }
6891
6892 static void zrankCommand(redisClient *c) {
6893 zrankGenericCommand(c, 0);
6894 }
6895
6896 static void zrevrankCommand(redisClient *c) {
6897 zrankGenericCommand(c, 1);
6898 }
6899
6900 /* ========================= Hashes utility functions ======================= */
/* Selector flags for hash iteration: they are tested with '&' and may be
 * OR-ed together to request both the field and the value. */
#define REDIS_HASH_KEY (1<<0)
#define REDIS_HASH_VALUE (1<<1)
6903
6904 /* Check the length of a number of objects to see if we need to convert a
6905 * zipmap to a real hash. Note that we only check string encoded objects
6906 * as their string length can be queried in constant time. */
6907 static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
6908 int i;
6909 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6910
6911 for (i = start; i <= end; i++) {
6912 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6913 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6914 {
6915 convertToRealHash(subject);
6916 return;
6917 }
6918 }
6919 }
6920
6921 /* Encode given objects in-place when the hash uses a dict. */
6922 static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6923 if (subject->encoding == REDIS_ENCODING_HT) {
6924 if (o1) *o1 = tryObjectEncoding(*o1);
6925 if (o2) *o2 = tryObjectEncoding(*o2);
6926 }
6927 }
6928
6929 /* Get the value from a hash identified by key. Returns either a string
6930 * object or NULL if the value cannot be found. The refcount of the object
6931 * is always increased by 1 when the value was found. */
6932 static robj *hashTypeGet(robj *o, robj *key) {
6933 robj *value = NULL;
6934 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6935 unsigned char *v;
6936 unsigned int vlen;
6937 key = getDecodedObject(key);
6938 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6939 value = createStringObject((char*)v,vlen);
6940 }
6941 decrRefCount(key);
6942 } else {
6943 dictEntry *de = dictFind(o->ptr,key);
6944 if (de != NULL) {
6945 value = dictGetEntryVal(de);
6946 incrRefCount(value);
6947 }
6948 }
6949 return value;
6950 }
6951
6952 /* Test if the key exists in the given hash. Returns 1 if the key
6953 * exists and 0 when it doesn't. */
6954 static int hashTypeExists(robj *o, robj *key) {
6955 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6956 key = getDecodedObject(key);
6957 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6958 decrRefCount(key);
6959 return 1;
6960 }
6961 decrRefCount(key);
6962 } else {
6963 if (dictFind(o->ptr,key) != NULL) {
6964 return 1;
6965 }
6966 }
6967 return 0;
6968 }
6969
6970 /* Add an element, discard the old if the key already exists.
6971 * Return 0 on insert and 1 on update. */
6972 static int hashTypeSet(robj *o, robj *key, robj *value) {
6973 int update = 0;
6974 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6975 key = getDecodedObject(key);
6976 value = getDecodedObject(value);
6977 o->ptr = zipmapSet(o->ptr,
6978 key->ptr,sdslen(key->ptr),
6979 value->ptr,sdslen(value->ptr), &update);
6980 decrRefCount(key);
6981 decrRefCount(value);
6982
6983 /* Check if the zipmap needs to be upgraded to a real hash table */
6984 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6985 convertToRealHash(o);
6986 } else {
6987 if (dictReplace(o->ptr,key,value)) {
6988 /* Insert */
6989 incrRefCount(key);
6990 } else {
6991 /* Update */
6992 update = 1;
6993 }
6994 incrRefCount(value);
6995 }
6996 return update;
6997 }
6998
6999 /* Delete an element from a hash.
7000 * Return 1 on deleted and 0 on not found. */
7001 static int hashTypeDelete(robj *o, robj *key) {
7002 int deleted = 0;
7003 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7004 key = getDecodedObject(key);
7005 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
7006 decrRefCount(key);
7007 } else {
7008 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
7009 /* Always check if the dictionary needs a resize after a delete. */
7010 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
7011 }
7012 return deleted;
7013 }
7014
7015 /* Return the number of elements in a hash. */
7016 static unsigned long hashTypeLength(robj *o) {
7017 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7018 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7019 }
7020
7021 /* Structure to hold hash iteration abstration. Note that iteration over
7022 * hashes involves both fields and values. Because it is possible that
7023 * not both are required, store pointers in the iterator to avoid
7024 * unnecessary memory allocation for fields/values. */
7025 typedef struct {
7026 int encoding;
7027 unsigned char *zi;
7028 unsigned char *zk, *zv;
7029 unsigned int zklen, zvlen;
7030
7031 dictIterator *di;
7032 dictEntry *de;
7033 } hashTypeIterator;
7034
7035 static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7036 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7037 hi->encoding = subject->encoding;
7038 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7039 hi->zi = zipmapRewind(subject->ptr);
7040 } else if (hi->encoding == REDIS_ENCODING_HT) {
7041 hi->di = dictGetIterator(subject->ptr);
7042 } else {
7043 redisAssert(NULL);
7044 }
7045 return hi;
7046 }
7047
7048 static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7049 if (hi->encoding == REDIS_ENCODING_HT) {
7050 dictReleaseIterator(hi->di);
7051 }
7052 zfree(hi);
7053 }
7054
7055 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
7056 * could be found and REDIS_ERR when the iterator reaches the end. */
7057 static int hashTypeNext(hashTypeIterator *hi) {
7058 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7059 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7060 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7061 } else {
7062 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7063 }
7064 return REDIS_OK;
7065 }
7066
7067 /* Get key or value object at current iteration position.
7068 * This increases the refcount of the field object by 1. */
7069 static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7070 robj *o;
7071 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7072 if (what & REDIS_HASH_KEY) {
7073 o = createStringObject((char*)hi->zk,hi->zklen);
7074 } else {
7075 o = createStringObject((char*)hi->zv,hi->zvlen);
7076 }
7077 } else {
7078 if (what & REDIS_HASH_KEY) {
7079 o = dictGetEntryKey(hi->de);
7080 } else {
7081 o = dictGetEntryVal(hi->de);
7082 }
7083 incrRefCount(o);
7084 }
7085 return o;
7086 }
7087
7088 static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7089 robj *o = lookupKeyWrite(c->db,key);
7090 if (o == NULL) {
7091 o = createHashObject();
7092 dbAdd(c->db,key,o);
7093 } else {
7094 if (o->type != REDIS_HASH) {
7095 addReply(c,shared.wrongtypeerr);
7096 return NULL;
7097 }
7098 }
7099 return o;
7100 }
7101
7102 /* ============================= Hash commands ============================== */
7103 static void hsetCommand(redisClient *c) {
7104 int update;
7105 robj *o;
7106
7107 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7108 hashTypeTryConversion(o,c->argv,2,3);
7109 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7110 update = hashTypeSet(o,c->argv[2],c->argv[3]);
7111 addReply(c, update ? shared.czero : shared.cone);
7112 server.dirty++;
7113 }
7114
7115 static void hsetnxCommand(redisClient *c) {
7116 robj *o;
7117 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7118 hashTypeTryConversion(o,c->argv,2,3);
7119
7120 if (hashTypeExists(o, c->argv[2])) {
7121 addReply(c, shared.czero);
7122 } else {
7123 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7124 hashTypeSet(o,c->argv[2],c->argv[3]);
7125 addReply(c, shared.cone);
7126 server.dirty++;
7127 }
7128 }
7129
7130 static void hmsetCommand(redisClient *c) {
7131 int i;
7132 robj *o;
7133
7134 if ((c->argc % 2) == 1) {
7135 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7136 return;
7137 }
7138
7139 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7140 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7141 for (i = 2; i < c->argc; i += 2) {
7142 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7143 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7144 }
7145 addReply(c, shared.ok);
7146 server.dirty++;
7147 }
7148
7149 static void hincrbyCommand(redisClient *c) {
7150 long long value, incr;
7151 robj *o, *current, *new;
7152
7153 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7154 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7155 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
7156 if (getLongLongFromObjectOrReply(c,current,&value,
7157 "hash value is not an integer") != REDIS_OK) {
7158 decrRefCount(current);
7159 return;
7160 }
7161 decrRefCount(current);
7162 } else {
7163 value = 0;
7164 }
7165
7166 value += incr;
7167 new = createStringObjectFromLongLong(value);
7168 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7169 hashTypeSet(o,c->argv[2],new);
7170 decrRefCount(new);
7171 addReplyLongLong(c,value);
7172 server.dirty++;
7173 }
7174
7175 static void hgetCommand(redisClient *c) {
7176 robj *o, *value;
7177 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7178 checkType(c,o,REDIS_HASH)) return;
7179
7180 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7181 addReplyBulk(c,value);
7182 decrRefCount(value);
7183 } else {
7184 addReply(c,shared.nullbulk);
7185 }
7186 }
7187
7188 static void hmgetCommand(redisClient *c) {
7189 int i;
7190 robj *o, *value;
7191 o = lookupKeyRead(c->db,c->argv[1]);
7192 if (o != NULL && o->type != REDIS_HASH) {
7193 addReply(c,shared.wrongtypeerr);
7194 }
7195
7196 /* Note the check for o != NULL happens inside the loop. This is
7197 * done because objects that cannot be found are considered to be
7198 * an empty hash. The reply should then be a series of NULLs. */
7199 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7200 for (i = 2; i < c->argc; i++) {
7201 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7202 addReplyBulk(c,value);
7203 decrRefCount(value);
7204 } else {
7205 addReply(c,shared.nullbulk);
7206 }
7207 }
7208 }
7209
7210 static void hdelCommand(redisClient *c) {
7211 robj *o;
7212 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7213 checkType(c,o,REDIS_HASH)) return;
7214
7215 if (hashTypeDelete(o,c->argv[2])) {
7216 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7217 addReply(c,shared.cone);
7218 server.dirty++;
7219 } else {
7220 addReply(c,shared.czero);
7221 }
7222 }
7223
7224 static void hlenCommand(redisClient *c) {
7225 robj *o;
7226 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7227 checkType(c,o,REDIS_HASH)) return;
7228
7229 addReplyUlong(c,hashTypeLength(o));
7230 }
7231
7232 static void genericHgetallCommand(redisClient *c, int flags) {
7233 robj *o, *lenobj, *obj;
7234 unsigned long count = 0;
7235 hashTypeIterator *hi;
7236
7237 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7238 || checkType(c,o,REDIS_HASH)) return;
7239
7240 lenobj = createObject(REDIS_STRING,NULL);
7241 addReply(c,lenobj);
7242 decrRefCount(lenobj);
7243
7244 hi = hashTypeInitIterator(o);
7245 while (hashTypeNext(hi) != REDIS_ERR) {
7246 if (flags & REDIS_HASH_KEY) {
7247 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7248 addReplyBulk(c,obj);
7249 decrRefCount(obj);
7250 count++;
7251 }
7252 if (flags & REDIS_HASH_VALUE) {
7253 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7254 addReplyBulk(c,obj);
7255 decrRefCount(obj);
7256 count++;
7257 }
7258 }
7259 hashTypeReleaseIterator(hi);
7260
7261 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7262 }
7263
7264 static void hkeysCommand(redisClient *c) {
7265 genericHgetallCommand(c,REDIS_HASH_KEY);
7266 }
7267
7268 static void hvalsCommand(redisClient *c) {
7269 genericHgetallCommand(c,REDIS_HASH_VALUE);
7270 }
7271
7272 static void hgetallCommand(redisClient *c) {
7273 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7274 }
7275
7276 static void hexistsCommand(redisClient *c) {
7277 robj *o;
7278 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7279 checkType(c,o,REDIS_HASH)) return;
7280
7281 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
7282 }
7283
7284 static void convertToRealHash(robj *o) {
7285 unsigned char *key, *val, *p, *zm = o->ptr;
7286 unsigned int klen, vlen;
7287 dict *dict = dictCreate(&hashDictType,NULL);
7288
7289 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7290 p = zipmapRewind(zm);
7291 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7292 robj *keyobj, *valobj;
7293
7294 keyobj = createStringObject((char*)key,klen);
7295 valobj = createStringObject((char*)val,vlen);
7296 keyobj = tryObjectEncoding(keyobj);
7297 valobj = tryObjectEncoding(valobj);
7298 dictAdd(dict,keyobj,valobj);
7299 }
7300 o->encoding = REDIS_ENCODING_HT;
7301 o->ptr = dict;
7302 zfree(zm);
7303 }
7304
7305 /* ========================= Non type-specific commands ==================== */
7306
7307 static void flushdbCommand(redisClient *c) {
7308 server.dirty += dictSize(c->db->dict);
7309 touchWatchedKeysOnFlush(c->db->id);
7310 dictEmpty(c->db->dict);
7311 dictEmpty(c->db->expires);
7312 addReply(c,shared.ok);
7313 }
7314
7315 static void flushallCommand(redisClient *c) {
7316 touchWatchedKeysOnFlush(-1);
7317 server.dirty += emptyDb();
7318 addReply(c,shared.ok);
7319 if (server.bgsavechildpid != -1) {
7320 kill(server.bgsavechildpid,SIGKILL);
7321 rdbRemoveTempFile(server.bgsavechildpid);
7322 }
7323 rdbSave(server.dbfilename);
7324 server.dirty++;
7325 }
7326
7327 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7328 redisSortOperation *so = zmalloc(sizeof(*so));
7329 so->type = type;
7330 so->pattern = pattern;
7331 return so;
7332 }
7333
7334 /* Return the value associated to the key with a name obtained
7335 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7336 * The returned object will always have its refcount increased by 1
7337 * when it is non-NULL. */
7338 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7339 char *p, *f;
7340 sds spat, ssub;
7341 robj keyobj, fieldobj, *o;
7342 int prefixlen, sublen, postfixlen, fieldlen;
7343 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7344 struct {
7345 long len;
7346 long free;
7347 char buf[REDIS_SORTKEY_MAX+1];
7348 } keyname, fieldname;
7349
7350 /* If the pattern is "#" return the substitution object itself in order
7351 * to implement the "SORT ... GET #" feature. */
7352 spat = pattern->ptr;
7353 if (spat[0] == '#' && spat[1] == '\0') {
7354 incrRefCount(subst);
7355 return subst;
7356 }
7357
7358 /* The substitution object may be specially encoded. If so we create
7359 * a decoded object on the fly. Otherwise getDecodedObject will just
7360 * increment the ref count, that we'll decrement later. */
7361 subst = getDecodedObject(subst);
7362
7363 ssub = subst->ptr;
7364 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7365 p = strchr(spat,'*');
7366 if (!p) {
7367 decrRefCount(subst);
7368 return NULL;
7369 }
7370
7371 /* Find out if we're dealing with a hash dereference. */
7372 if ((f = strstr(p+1, "->")) != NULL) {
7373 fieldlen = sdslen(spat)-(f-spat);
7374 /* this also copies \0 character */
7375 memcpy(fieldname.buf,f+2,fieldlen-1);
7376 fieldname.len = fieldlen-2;
7377 } else {
7378 fieldlen = 0;
7379 }
7380
7381 prefixlen = p-spat;
7382 sublen = sdslen(ssub);
7383 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7384 memcpy(keyname.buf,spat,prefixlen);
7385 memcpy(keyname.buf+prefixlen,ssub,sublen);
7386 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7387 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7388 keyname.len = prefixlen+sublen+postfixlen;
7389 decrRefCount(subst);
7390
7391 /* Lookup substituted key */
7392 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7393 o = lookupKeyRead(db,&keyobj);
7394 if (o == NULL) return NULL;
7395
7396 if (fieldlen > 0) {
7397 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7398
7399 /* Retrieve value from hash by the field name. This operation
7400 * already increases the refcount of the returned object. */
7401 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7402 o = hashTypeGet(o, &fieldobj);
7403 } else {
7404 if (o->type != REDIS_STRING) return NULL;
7405
7406 /* Every object that this function returns needs to have its refcount
7407 * increased. sortCommand decreases it again. */
7408 incrRefCount(o);
7409 }
7410
7411 return o;
7412 }
7413
7414 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7415 * the additional parameter is not standard but a BSD-specific we have to
7416 * pass sorting parameters via the global 'server' structure */
7417 static int sortCompare(const void *s1, const void *s2) {
7418 const redisSortObject *so1 = s1, *so2 = s2;
7419 int cmp;
7420
7421 if (!server.sort_alpha) {
7422 /* Numeric sorting. Here it's trivial as we precomputed scores */
7423 if (so1->u.score > so2->u.score) {
7424 cmp = 1;
7425 } else if (so1->u.score < so2->u.score) {
7426 cmp = -1;
7427 } else {
7428 cmp = 0;
7429 }
7430 } else {
7431 /* Alphanumeric sorting */
7432 if (server.sort_bypattern) {
7433 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7434 /* At least one compare object is NULL */
7435 if (so1->u.cmpobj == so2->u.cmpobj)
7436 cmp = 0;
7437 else if (so1->u.cmpobj == NULL)
7438 cmp = -1;
7439 else
7440 cmp = 1;
7441 } else {
7442 /* We have both the objects, use strcoll */
7443 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7444 }
7445 } else {
7446 /* Compare elements directly. */
7447 cmp = compareStringObjects(so1->obj,so2->obj);
7448 }
7449 }
7450 return server.sort_desc ? -cmp : cmp;
7451 }
7452
7453 /* The SORT command is the most complex command in Redis. Warning: this code
7454 * is optimized for speed and a bit less for readability */
7455 static void sortCommand(redisClient *c) {
7456 list *operations;
7457 unsigned int outputlen = 0;
7458 int desc = 0, alpha = 0;
7459 int limit_start = 0, limit_count = -1, start, end;
7460 int j, dontsort = 0, vectorlen;
7461 int getop = 0; /* GET operation counter */
7462 robj *sortval, *sortby = NULL, *storekey = NULL;
7463 redisSortObject *vector; /* Resulting vector to sort */
7464
7465 /* Lookup the key to sort. It must be of the right types */
7466 sortval = lookupKeyRead(c->db,c->argv[1]);
7467 if (sortval == NULL) {
7468 addReply(c,shared.emptymultibulk);
7469 return;
7470 }
7471 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7472 sortval->type != REDIS_ZSET)
7473 {
7474 addReply(c,shared.wrongtypeerr);
7475 return;
7476 }
7477
7478 /* Create a list of operations to perform for every sorted element.
7479 * Operations can be GET/DEL/INCR/DECR */
7480 operations = listCreate();
7481 listSetFreeMethod(operations,zfree);
7482 j = 2;
7483
7484 /* Now we need to protect sortval incrementing its count, in the future
7485 * SORT may have options able to overwrite/delete keys during the sorting
7486 * and the sorted key itself may get destroied */
7487 incrRefCount(sortval);
7488
7489 /* The SORT command has an SQL-alike syntax, parse it */
7490 while(j < c->argc) {
7491 int leftargs = c->argc-j-1;
7492 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7493 desc = 0;
7494 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7495 desc = 1;
7496 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7497 alpha = 1;
7498 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7499 limit_start = atoi(c->argv[j+1]->ptr);
7500 limit_count = atoi(c->argv[j+2]->ptr);
7501 j+=2;
7502 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7503 storekey = c->argv[j+1];
7504 j++;
7505 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7506 sortby = c->argv[j+1];
7507 /* If the BY pattern does not contain '*', i.e. it is constant,
7508 * we don't need to sort nor to lookup the weight keys. */
7509 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7510 j++;
7511 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7512 listAddNodeTail(operations,createSortOperation(
7513 REDIS_SORT_GET,c->argv[j+1]));
7514 getop++;
7515 j++;
7516 } else {
7517 decrRefCount(sortval);
7518 listRelease(operations);
7519 addReply(c,shared.syntaxerr);
7520 return;
7521 }
7522 j++;
7523 }
7524
7525 /* Load the sorting vector with all the objects to sort */
7526 switch(sortval->type) {
7527 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
7528 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7529 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7530 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7531 }
7532 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7533 j = 0;
7534
7535 if (sortval->type == REDIS_LIST) {
7536 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7537 listTypeEntry entry;
7538 while(listTypeNext(li,&entry)) {
7539 vector[j].obj = listTypeGet(&entry);
7540 vector[j].u.score = 0;
7541 vector[j].u.cmpobj = NULL;
7542 j++;
7543 }
7544 listTypeReleaseIterator(li);
7545 } else {
7546 dict *set;
7547 dictIterator *di;
7548 dictEntry *setele;
7549
7550 if (sortval->type == REDIS_SET) {
7551 set = sortval->ptr;
7552 } else {
7553 zset *zs = sortval->ptr;
7554 set = zs->dict;
7555 }
7556
7557 di = dictGetIterator(set);
7558 while((setele = dictNext(di)) != NULL) {
7559 vector[j].obj = dictGetEntryKey(setele);
7560 vector[j].u.score = 0;
7561 vector[j].u.cmpobj = NULL;
7562 j++;
7563 }
7564 dictReleaseIterator(di);
7565 }
7566 redisAssert(j == vectorlen);
7567
7568 /* Now it's time to load the right scores in the sorting vector */
7569 if (dontsort == 0) {
7570 for (j = 0; j < vectorlen; j++) {
7571 robj *byval;
7572 if (sortby) {
7573 /* lookup value to sort by */
7574 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7575 if (!byval) continue;
7576 } else {
7577 /* use object itself to sort by */
7578 byval = vector[j].obj;
7579 }
7580
7581 if (alpha) {
7582 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7583 } else {
7584 if (byval->encoding == REDIS_ENCODING_RAW) {
7585 vector[j].u.score = strtod(byval->ptr,NULL);
7586 } else if (byval->encoding == REDIS_ENCODING_INT) {
7587 /* Don't need to decode the object if it's
7588 * integer-encoded (the only encoding supported) so
7589 * far. We can just cast it */
7590 vector[j].u.score = (long)byval->ptr;
7591 } else {
7592 redisAssert(1 != 1);
7593 }
7594 }
7595
7596 /* when the object was retrieved using lookupKeyByPattern,
7597 * its refcount needs to be decreased. */
7598 if (sortby) {
7599 decrRefCount(byval);
7600 }
7601 }
7602 }
7603
7604 /* We are ready to sort the vector... perform a bit of sanity check
7605 * on the LIMIT option too. We'll use a partial version of quicksort. */
7606 start = (limit_start < 0) ? 0 : limit_start;
7607 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7608 if (start >= vectorlen) {
7609 start = vectorlen-1;
7610 end = vectorlen-2;
7611 }
7612 if (end >= vectorlen) end = vectorlen-1;
7613
7614 if (dontsort == 0) {
7615 server.sort_desc = desc;
7616 server.sort_alpha = alpha;
7617 server.sort_bypattern = sortby ? 1 : 0;
7618 if (sortby && (start != 0 || end != vectorlen-1))
7619 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7620 else
7621 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7622 }
7623
7624 /* Send command output to the output buffer, performing the specified
7625 * GET/DEL/INCR/DECR operations if any. */
7626 outputlen = getop ? getop*(end-start+1) : end-start+1;
7627 if (storekey == NULL) {
7628 /* STORE option not specified, sent the sorting result to client */
7629 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7630 for (j = start; j <= end; j++) {
7631 listNode *ln;
7632 listIter li;
7633
7634 if (!getop) addReplyBulk(c,vector[j].obj);
7635 listRewind(operations,&li);
7636 while((ln = listNext(&li))) {
7637 redisSortOperation *sop = ln->value;
7638 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7639 vector[j].obj);
7640
7641 if (sop->type == REDIS_SORT_GET) {
7642 if (!val) {
7643 addReply(c,shared.nullbulk);
7644 } else {
7645 addReplyBulk(c,val);
7646 decrRefCount(val);
7647 }
7648 } else {
7649 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7650 }
7651 }
7652 }
7653 } else {
7654 robj *sobj = createZiplistObject();
7655
7656 /* STORE option specified, set the sorting result as a List object */
7657 for (j = start; j <= end; j++) {
7658 listNode *ln;
7659 listIter li;
7660
7661 if (!getop) {
7662 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
7663 } else {
7664 listRewind(operations,&li);
7665 while((ln = listNext(&li))) {
7666 redisSortOperation *sop = ln->value;
7667 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7668 vector[j].obj);
7669
7670 if (sop->type == REDIS_SORT_GET) {
7671 if (!val) val = createStringObject("",0);
7672
7673 /* listTypePush does an incrRefCount, so we should take care
7674 * care of the incremented refcount caused by either
7675 * lookupKeyByPattern or createStringObject("",0) */
7676 listTypePush(sobj,val,REDIS_TAIL);
7677 decrRefCount(val);
7678 } else {
7679 /* always fails */
7680 redisAssert(sop->type == REDIS_SORT_GET);
7681 }
7682 }
7683 }
7684 }
7685 dbReplace(c->db,storekey,sobj);
7686 /* Note: we add 1 because the DB is dirty anyway since even if the
7687 * SORT result is empty a new key is set and maybe the old content
7688 * replaced. */
7689 server.dirty += 1+outputlen;
7690 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7691 }
7692
7693 /* Cleanup */
7694 if (sortval->type == REDIS_LIST)
7695 for (j = 0; j < vectorlen; j++)
7696 decrRefCount(vector[j].obj);
7697 decrRefCount(sortval);
7698 listRelease(operations);
7699 for (j = 0; j < vectorlen; j++) {
7700 if (alpha && vector[j].u.cmpobj)
7701 decrRefCount(vector[j].u.cmpobj);
7702 }
7703 zfree(vector);
7704 }
7705
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and is always written: amounts of 1 TB or
 * more are rendered with a 'T' suffix instead of leaving the buffer
 * untouched. The longest possible output (for the largest unsigned long
 * long) is "16777216.00T", so a 64 bytes buffer is more than enough. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        /* Kilobytes */
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        /* Megabytes */
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        /* Gigabytes */
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Terabytes and above: without this branch nothing was written
         * to 's' and the caller printed an uninitialized buffer. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7726
7727 /* Create the string returned by the INFO command. This is decoupled
7728 * by the INFO command itself as we need to report the same information
7729 * on memory corruption problems. */
7730 static sds genRedisInfoString(void) {
7731 sds info;
7732 time_t uptime = time(NULL)-server.stat_starttime;
7733 int j;
7734 char hmem[64];
7735
7736 bytesToHuman(hmem,zmalloc_used_memory());
7737 info = sdscatprintf(sdsempty(),
7738 "redis_version:%s\r\n"
7739 "redis_git_sha1:%s\r\n"
7740 "redis_git_dirty:%d\r\n"
7741 "arch_bits:%s\r\n"
7742 "multiplexing_api:%s\r\n"
7743 "process_id:%ld\r\n"
7744 "uptime_in_seconds:%ld\r\n"
7745 "uptime_in_days:%ld\r\n"
7746 "connected_clients:%d\r\n"
7747 "connected_slaves:%d\r\n"
7748 "blocked_clients:%d\r\n"
7749 "used_memory:%zu\r\n"
7750 "used_memory_human:%s\r\n"
7751 "changes_since_last_save:%lld\r\n"
7752 "bgsave_in_progress:%d\r\n"
7753 "last_save_time:%ld\r\n"
7754 "bgrewriteaof_in_progress:%d\r\n"
7755 "total_connections_received:%lld\r\n"
7756 "total_commands_processed:%lld\r\n"
7757 "expired_keys:%lld\r\n"
7758 "hash_max_zipmap_entries:%zu\r\n"
7759 "hash_max_zipmap_value:%zu\r\n"
7760 "pubsub_channels:%ld\r\n"
7761 "pubsub_patterns:%u\r\n"
7762 "vm_enabled:%d\r\n"
7763 "role:%s\r\n"
7764 ,REDIS_VERSION,
7765 redisGitSHA1(),
7766 strtol(redisGitDirty(),NULL,10) > 0,
7767 (sizeof(long) == 8) ? "64" : "32",
7768 aeGetApiName(),
7769 (long) getpid(),
7770 uptime,
7771 uptime/(3600*24),
7772 listLength(server.clients)-listLength(server.slaves),
7773 listLength(server.slaves),
7774 server.blpop_blocked_clients,
7775 zmalloc_used_memory(),
7776 hmem,
7777 server.dirty,
7778 server.bgsavechildpid != -1,
7779 server.lastsave,
7780 server.bgrewritechildpid != -1,
7781 server.stat_numconnections,
7782 server.stat_numcommands,
7783 server.stat_expiredkeys,
7784 server.hash_max_zipmap_entries,
7785 server.hash_max_zipmap_value,
7786 dictSize(server.pubsub_channels),
7787 listLength(server.pubsub_patterns),
7788 server.vm_enabled != 0,
7789 server.masterhost == NULL ? "master" : "slave"
7790 );
7791 if (server.masterhost) {
7792 info = sdscatprintf(info,
7793 "master_host:%s\r\n"
7794 "master_port:%d\r\n"
7795 "master_link_status:%s\r\n"
7796 "master_last_io_seconds_ago:%d\r\n"
7797 ,server.masterhost,
7798 server.masterport,
7799 (server.replstate == REDIS_REPL_CONNECTED) ?
7800 "up" : "down",
7801 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7802 );
7803 }
7804 if (server.vm_enabled) {
7805 lockThreadedIO();
7806 info = sdscatprintf(info,
7807 "vm_conf_max_memory:%llu\r\n"
7808 "vm_conf_page_size:%llu\r\n"
7809 "vm_conf_pages:%llu\r\n"
7810 "vm_stats_used_pages:%llu\r\n"
7811 "vm_stats_swapped_objects:%llu\r\n"
7812 "vm_stats_swappin_count:%llu\r\n"
7813 "vm_stats_swappout_count:%llu\r\n"
7814 "vm_stats_io_newjobs_len:%lu\r\n"
7815 "vm_stats_io_processing_len:%lu\r\n"
7816 "vm_stats_io_processed_len:%lu\r\n"
7817 "vm_stats_io_active_threads:%lu\r\n"
7818 "vm_stats_blocked_clients:%lu\r\n"
7819 ,(unsigned long long) server.vm_max_memory,
7820 (unsigned long long) server.vm_page_size,
7821 (unsigned long long) server.vm_pages,
7822 (unsigned long long) server.vm_stats_used_pages,
7823 (unsigned long long) server.vm_stats_swapped_objects,
7824 (unsigned long long) server.vm_stats_swapins,
7825 (unsigned long long) server.vm_stats_swapouts,
7826 (unsigned long) listLength(server.io_newjobs),
7827 (unsigned long) listLength(server.io_processing),
7828 (unsigned long) listLength(server.io_processed),
7829 (unsigned long) server.io_active_threads,
7830 (unsigned long) server.vm_blocked_clients
7831 );
7832 unlockThreadedIO();
7833 }
7834 for (j = 0; j < server.dbnum; j++) {
7835 long long keys, vkeys;
7836
7837 keys = dictSize(server.db[j].dict);
7838 vkeys = dictSize(server.db[j].expires);
7839 if (keys || vkeys) {
7840 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7841 j, keys, vkeys);
7842 }
7843 }
7844 return info;
7845 }
7846
7847 static void infoCommand(redisClient *c) {
7848 sds info = genRedisInfoString();
7849 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7850 (unsigned long)sdslen(info)));
7851 addReplySds(c,info);
7852 addReply(c,shared.crlf);
7853 }
7854
7855 static void monitorCommand(redisClient *c) {
7856 /* ignore MONITOR if aleady slave or in monitor mode */
7857 if (c->flags & REDIS_SLAVE) return;
7858
7859 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7860 c->slaveseldb = 0;
7861 listAddNodeTail(server.monitors,c);
7862 addReply(c,shared.ok);
7863 }
7864
7865 /* ================================= Expire ================================= */
7866 static int removeExpire(redisDb *db, robj *key) {
7867 /* An expire may only be removed if there is a corresponding entry in the
7868 * main dict. Otherwise, the key will never be freed. */
7869 redisAssert(dictFind(db->dict,key->ptr) != NULL);
7870 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7871 return 1;
7872 } else {
7873 return 0;
7874 }
7875 }
7876
7877 static int setExpire(redisDb *db, robj *key, time_t when) {
7878 dictEntry *de;
7879
7880 /* Reuse the sds from the main dict in the expire dict */
7881 redisAssert((de = dictFind(db->dict,key->ptr)) != NULL);
7882 if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) {
7883 return 0;
7884 } else {
7885 return 1;
7886 }
7887 }
7888
7889 /* Return the expire time of the specified key, or -1 if no expire
7890 * is associated with this key (i.e. the key is non volatile) */
7891 static time_t getExpire(redisDb *db, robj *key) {
7892 dictEntry *de;
7893
7894 /* No expire? return ASAP */
7895 if (dictSize(db->expires) == 0 ||
7896 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7897
7898 /* The entry was found in the expire dict, this means it should also
7899 * be present in the main dict (safety check). */
7900 redisAssert(dictFind(db->dict,key->ptr) != NULL);
7901 return (time_t) dictGetEntryVal(de);
7902 }
7903
7904 static int expireIfNeeded(redisDb *db, robj *key) {
7905 time_t when = getExpire(db,key);
7906 if (when < 0) return 0;
7907
7908 /* Return when this key has not expired */
7909 if (time(NULL) <= when) return 0;
7910
7911 /* Delete the key */
7912 server.stat_expiredkeys++;
7913 server.dirty++;
7914 return dbDelete(db,key);
7915 }
7916
7917 static int deleteIfVolatile(redisDb *db, robj *key) {
7918 if (getExpire(db,key) < 0) return 0;
7919
7920 /* Delete the key */
7921 server.stat_expiredkeys++;
7922 server.dirty++;
7923 return dbDelete(db,key);
7924 }
7925
7926 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7927 dictEntry *de;
7928 time_t seconds;
7929
7930 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7931
7932 seconds -= offset;
7933
7934 de = dictFind(c->db->dict,key->ptr);
7935 if (de == NULL) {
7936 addReply(c,shared.czero);
7937 return;
7938 }
7939 if (seconds <= 0) {
7940 if (dbDelete(c->db,key)) server.dirty++;
7941 addReply(c, shared.cone);
7942 return;
7943 } else {
7944 time_t when = time(NULL)+seconds;
7945 if (setExpire(c->db,key,when)) {
7946 addReply(c,shared.cone);
7947 server.dirty++;
7948 } else {
7949 addReply(c,shared.czero);
7950 }
7951 return;
7952 }
7953 }
7954
7955 static void expireCommand(redisClient *c) {
7956 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7957 }
7958
7959 static void expireatCommand(redisClient *c) {
7960 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7961 }
7962
7963 static void ttlCommand(redisClient *c) {
7964 time_t expire;
7965 int ttl = -1;
7966
7967 expire = getExpire(c->db,c->argv[1]);
7968 if (expire != -1) {
7969 ttl = (int) (expire-time(NULL));
7970 if (ttl < 0) ttl = -1;
7971 }
7972 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7973 }
7974
7975 /* ================================ MULTI/EXEC ============================== */
7976
7977 /* Client state initialization for MULTI/EXEC */
7978 static void initClientMultiState(redisClient *c) {
7979 c->mstate.commands = NULL;
7980 c->mstate.count = 0;
7981 }
7982
7983 /* Release all the resources associated with MULTI/EXEC state */
7984 static void freeClientMultiState(redisClient *c) {
7985 int j;
7986
7987 for (j = 0; j < c->mstate.count; j++) {
7988 int i;
7989 multiCmd *mc = c->mstate.commands+j;
7990
7991 for (i = 0; i < mc->argc; i++)
7992 decrRefCount(mc->argv[i]);
7993 zfree(mc->argv);
7994 }
7995 zfree(c->mstate.commands);
7996 }
7997
7998 /* Add a new command into the MULTI commands queue */
7999 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
8000 multiCmd *mc;
8001 int j;
8002
8003 c->mstate.commands = zrealloc(c->mstate.commands,
8004 sizeof(multiCmd)*(c->mstate.count+1));
8005 mc = c->mstate.commands+c->mstate.count;
8006 mc->cmd = cmd;
8007 mc->argc = c->argc;
8008 mc->argv = zmalloc(sizeof(robj*)*c->argc);
8009 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8010 for (j = 0; j < c->argc; j++)
8011 incrRefCount(mc->argv[j]);
8012 c->mstate.count++;
8013 }
8014
8015 static void multiCommand(redisClient *c) {
8016 if (c->flags & REDIS_MULTI) {
8017 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8018 return;
8019 }
8020 c->flags |= REDIS_MULTI;
8021 addReply(c,shared.ok);
8022 }
8023
8024 static void discardCommand(redisClient *c) {
8025 if (!(c->flags & REDIS_MULTI)) {
8026 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8027 return;
8028 }
8029
8030 freeClientMultiState(c);
8031 initClientMultiState(c);
8032 c->flags &= (~REDIS_MULTI);
8033 unwatchAllKeys(c);
8034 addReply(c,shared.ok);
8035 }
8036
8037 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8038 * implememntation for more information. */
8039 static void execCommandReplicateMulti(redisClient *c) {
8040 struct redisCommand *cmd;
8041 robj *multistring = createStringObject("MULTI",5);
8042
8043 cmd = lookupCommand("multi");
8044 if (server.appendonly)
8045 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8046 if (listLength(server.slaves))
8047 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8048 decrRefCount(multistring);
8049 }
8050
8051 static void execCommand(redisClient *c) {
8052 int j;
8053 robj **orig_argv;
8054 int orig_argc;
8055
8056 if (!(c->flags & REDIS_MULTI)) {
8057 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8058 return;
8059 }
8060
8061 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8062 * A failed EXEC will return a multi bulk nil object. */
8063 if (c->flags & REDIS_DIRTY_CAS) {
8064 freeClientMultiState(c);
8065 initClientMultiState(c);
8066 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8067 unwatchAllKeys(c);
8068 addReply(c,shared.nullmultibulk);
8069 return;
8070 }
8071
8072 /* Replicate a MULTI request now that we are sure the block is executed.
8073 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8074 * both the AOF and the replication link will have the same consistency
8075 * and atomicity guarantees. */
8076 execCommandReplicateMulti(c);
8077
8078 /* Exec all the queued commands */
8079 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
8080 orig_argv = c->argv;
8081 orig_argc = c->argc;
8082 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8083 for (j = 0; j < c->mstate.count; j++) {
8084 c->argc = c->mstate.commands[j].argc;
8085 c->argv = c->mstate.commands[j].argv;
8086 call(c,c->mstate.commands[j].cmd);
8087 }
8088 c->argv = orig_argv;
8089 c->argc = orig_argc;
8090 freeClientMultiState(c);
8091 initClientMultiState(c);
8092 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8093 /* Make sure the EXEC command is always replicated / AOF, since we
8094 * always send the MULTI command (we can't know beforehand if the
8095 * next operations will contain at least a modification to the DB). */
8096 server.dirty++;
8097 }
8098
8099 /* =========================== Blocking Operations ========================= */
8100
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocked on these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8129
8130 /* Set a client in blocking mode for the specified key, with the specified
8131 * timeout */
8132 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
8133 dictEntry *de;
8134 list *l;
8135 int j;
8136
8137 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8138 c->blocking_keys_num = numkeys;
8139 c->blockingto = timeout;
8140 for (j = 0; j < numkeys; j++) {
8141 /* Add the key in the client structure, to map clients -> keys */
8142 c->blocking_keys[j] = keys[j];
8143 incrRefCount(keys[j]);
8144
8145 /* And in the other "side", to map keys -> clients */
8146 de = dictFind(c->db->blocking_keys,keys[j]);
8147 if (de == NULL) {
8148 int retval;
8149
8150 /* For every key we take a list of clients blocked for it */
8151 l = listCreate();
8152 retval = dictAdd(c->db->blocking_keys,keys[j],l);
8153 incrRefCount(keys[j]);
8154 assert(retval == DICT_OK);
8155 } else {
8156 l = dictGetEntryVal(de);
8157 }
8158 listAddNodeTail(l,c);
8159 }
8160 /* Mark the client as a blocked client */
8161 c->flags |= REDIS_BLOCKED;
8162 server.blpop_blocked_clients++;
8163 }
8164
8165 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
8166 static void unblockClientWaitingData(redisClient *c) {
8167 dictEntry *de;
8168 list *l;
8169 int j;
8170
8171 assert(c->blocking_keys != NULL);
8172 /* The client may wait for multiple keys, so unblock it for every key. */
8173 for (j = 0; j < c->blocking_keys_num; j++) {
8174 /* Remove this client from the list of clients waiting for this key. */
8175 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
8176 assert(de != NULL);
8177 l = dictGetEntryVal(de);
8178 listDelNode(l,listSearchKey(l,c));
8179 /* If the list is empty we need to remove it to avoid wasting memory */
8180 if (listLength(l) == 0)
8181 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8182 decrRefCount(c->blocking_keys[j]);
8183 }
8184 /* Cleanup the client structure */
8185 zfree(c->blocking_keys);
8186 c->blocking_keys = NULL;
8187 c->flags &= (~REDIS_BLOCKED);
8188 server.blpop_blocked_clients--;
8189 /* We want to process data if there is some command waiting
8190 * in the input buffer. Note that this is safe even if
8191 * unblockClientWaitingData() gets called from freeClient() because
8192 * freeClient() will be smart enough to call this function
8193 * *after* c->querybuf was set to NULL. */
8194 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8195 }
8196
8197 /* This should be called from any function PUSHing into lists.
8198 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8199 * 'ele' is the element pushed.
8200 *
8201 * If the function returns 0 there was no client waiting for a list push
8202 * against this key.
8203 *
8204 * If the function returns 1 there was a client waiting for a list push
8205 * against this key, the element was passed to this client thus it's not
8206 * needed to actually add it to the list and the caller should return asap. */
8207 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8208 struct dictEntry *de;
8209 redisClient *receiver;
8210 list *l;
8211 listNode *ln;
8212
8213 de = dictFind(c->db->blocking_keys,key);
8214 if (de == NULL) return 0;
8215 l = dictGetEntryVal(de);
8216 ln = listFirst(l);
8217 assert(ln != NULL);
8218 receiver = ln->value;
8219
8220 addReplySds(receiver,sdsnew("*2\r\n"));
8221 addReplyBulk(receiver,key);
8222 addReplyBulk(receiver,ele);
8223 unblockClientWaitingData(receiver);
8224 return 1;
8225 }
8226
8227 /* Blocking RPOP/LPOP */
8228 static void blockingPopGenericCommand(redisClient *c, int where) {
8229 robj *o;
8230 time_t timeout;
8231 int j;
8232
8233 for (j = 1; j < c->argc-1; j++) {
8234 o = lookupKeyWrite(c->db,c->argv[j]);
8235 if (o != NULL) {
8236 if (o->type != REDIS_LIST) {
8237 addReply(c,shared.wrongtypeerr);
8238 return;
8239 } else {
8240 list *list = o->ptr;
8241 if (listLength(list) != 0) {
8242 /* If the list contains elements fall back to the usual
8243 * non-blocking POP operation */
8244 robj *argv[2], **orig_argv;
8245 int orig_argc;
8246
8247 /* We need to alter the command arguments before to call
8248 * popGenericCommand() as the command takes a single key. */
8249 orig_argv = c->argv;
8250 orig_argc = c->argc;
8251 argv[1] = c->argv[j];
8252 c->argv = argv;
8253 c->argc = 2;
8254
8255 /* Also the return value is different, we need to output
8256 * the multi bulk reply header and the key name. The
8257 * "real" command will add the last element (the value)
8258 * for us. If this souds like an hack to you it's just
8259 * because it is... */
8260 addReplySds(c,sdsnew("*2\r\n"));
8261 addReplyBulk(c,argv[1]);
8262 popGenericCommand(c,where);
8263
8264 /* Fix the client structure with the original stuff */
8265 c->argv = orig_argv;
8266 c->argc = orig_argc;
8267 return;
8268 }
8269 }
8270 }
8271 }
8272 /* If the list is empty or the key does not exists we must block */
8273 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8274 if (timeout > 0) timeout += time(NULL);
8275 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8276 }
8277
8278 static void blpopCommand(redisClient *c) {
8279 blockingPopGenericCommand(c,REDIS_HEAD);
8280 }
8281
8282 static void brpopCommand(redisClient *c) {
8283 blockingPopGenericCommand(c,REDIS_TAIL);
8284 }
8285
8286 /* =============================== Replication ============================= */
8287
8288 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8289 ssize_t nwritten, ret = size;
8290 time_t start = time(NULL);
8291
8292 timeout++;
8293 while(size) {
8294 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8295 nwritten = write(fd,ptr,size);
8296 if (nwritten == -1) return -1;
8297 ptr += nwritten;
8298 size -= nwritten;
8299 }
8300 if ((time(NULL)-start) > timeout) {
8301 errno = ETIMEDOUT;
8302 return -1;
8303 }
8304 }
8305 return ret;
8306 }
8307
8308 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8309 ssize_t nread, totread = 0;
8310 time_t start = time(NULL);
8311
8312 timeout++;
8313 while(size) {
8314 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8315 nread = read(fd,ptr,size);
8316 if (nread == -1) return -1;
8317 ptr += nread;
8318 size -= nread;
8319 totread += nread;
8320 }
8321 if ((time(NULL)-start) > timeout) {
8322 errno = ETIMEDOUT;
8323 return -1;
8324 }
8325 }
8326 return totread;
8327 }
8328
/* Read a line from 'fd' one byte at a time using syncRead(), storing it
 * into 'ptr' as a null terminated string without the trailing "\n" or
 * "\r\n". 'size' is the size of the destination buffer, 'timeout' is
 * passed to syncRead() for every single byte.
 *
 * At most 'size'-1 characters are stored: a longer line is truncated and
 * the function returns once the buffer is full, without consuming the rest
 * of the line.
 *
 * Returns the number of characters stored before the line terminator was
 * found (a trailing '\r', when present, is included in the count even if
 * it is removed from the string), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0; /* no room at all, not even for the null term */
    *ptr = '\0';             /* the result is a valid string in any case */

    size--; /* reserve one byte for the null term */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this decrement a
         * line longer than the buffer is written past its end. */
        size--;
    }
    return nread;
}
8349
8350 static void syncCommand(redisClient *c) {
8351 /* ignore SYNC if aleady slave or in monitor mode */
8352 if (c->flags & REDIS_SLAVE) return;
8353
8354 /* SYNC can't be issued when the server has pending data to send to
8355 * the client about already issued commands. We need a fresh reply
8356 * buffer registering the differences between the BGSAVE and the current
8357 * dataset, so that we can copy to other slaves if needed. */
8358 if (listLength(c->reply) != 0) {
8359 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8360 return;
8361 }
8362
8363 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8364 /* Here we need to check if there is a background saving operation
8365 * in progress, or if it is required to start one */
8366 if (server.bgsavechildpid != -1) {
8367 /* Ok a background save is in progress. Let's check if it is a good
8368 * one for replication, i.e. if there is another slave that is
8369 * registering differences since the server forked to save */
8370 redisClient *slave;
8371 listNode *ln;
8372 listIter li;
8373
8374 listRewind(server.slaves,&li);
8375 while((ln = listNext(&li))) {
8376 slave = ln->value;
8377 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8378 }
8379 if (ln) {
8380 /* Perfect, the server is already registering differences for
8381 * another slave. Set the right state, and copy the buffer. */
8382 listRelease(c->reply);
8383 c->reply = listDup(slave->reply);
8384 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8385 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8386 } else {
8387 /* No way, we need to wait for the next BGSAVE in order to
8388 * register differences */
8389 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8390 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8391 }
8392 } else {
8393 /* Ok we don't have a BGSAVE in progress, let's start one */
8394 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8395 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8396 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8397 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8398 return;
8399 }
8400 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8401 }
8402 c->repldbfd = -1;
8403 c->flags |= REDIS_SLAVE;
8404 c->slaveseldb = 0;
8405 listAddNodeTail(server.slaves,c);
8406 return;
8407 }
8408
8409 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8410 redisClient *slave = privdata;
8411 REDIS_NOTUSED(el);
8412 REDIS_NOTUSED(mask);
8413 char buf[REDIS_IOBUF_LEN];
8414 ssize_t nwritten, buflen;
8415
8416 if (slave->repldboff == 0) {
8417 /* Write the bulk write count before to transfer the DB. In theory here
8418 * we don't know how much room there is in the output buffer of the
8419 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8420 * operations) will never be smaller than the few bytes we need. */
8421 sds bulkcount;
8422
8423 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8424 slave->repldbsize);
8425 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8426 {
8427 sdsfree(bulkcount);
8428 freeClient(slave);
8429 return;
8430 }
8431 sdsfree(bulkcount);
8432 }
8433 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8434 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8435 if (buflen <= 0) {
8436 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8437 (buflen == 0) ? "premature EOF" : strerror(errno));
8438 freeClient(slave);
8439 return;
8440 }
8441 if ((nwritten = write(fd,buf,buflen)) == -1) {
8442 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8443 strerror(errno));
8444 freeClient(slave);
8445 return;
8446 }
8447 slave->repldboff += nwritten;
8448 if (slave->repldboff == slave->repldbsize) {
8449 close(slave->repldbfd);
8450 slave->repldbfd = -1;
8451 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8452 slave->replstate = REDIS_REPL_ONLINE;
8453 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8454 sendReplyToClient, slave) == AE_ERR) {
8455 freeClient(slave);
8456 return;
8457 }
8458 addReplySds(slave,sdsempty());
8459 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8460 }
8461 }
8462
8463 /* This function is called at the end of every backgrond saving.
8464 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8465 * otherwise REDIS_ERR is passed to the function.
8466 *
8467 * The goal of this function is to handle slaves waiting for a successful
8468 * background saving in order to perform non-blocking synchronization. */
8469 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8470 listNode *ln;
8471 int startbgsave = 0;
8472 listIter li;
8473
8474 listRewind(server.slaves,&li);
8475 while((ln = listNext(&li))) {
8476 redisClient *slave = ln->value;
8477
8478 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8479 startbgsave = 1;
8480 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8481 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8482 struct redis_stat buf;
8483
8484 if (bgsaveerr != REDIS_OK) {
8485 freeClient(slave);
8486 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8487 continue;
8488 }
8489 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8490 redis_fstat(slave->repldbfd,&buf) == -1) {
8491 freeClient(slave);
8492 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8493 continue;
8494 }
8495 slave->repldboff = 0;
8496 slave->repldbsize = buf.st_size;
8497 slave->replstate = REDIS_REPL_SEND_BULK;
8498 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8499 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8500 freeClient(slave);
8501 continue;
8502 }
8503 }
8504 }
8505 if (startbgsave) {
8506 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8507 listIter li;
8508
8509 listRewind(server.slaves,&li);
8510 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8511 while((ln = listNext(&li))) {
8512 redisClient *slave = ln->value;
8513
8514 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8515 freeClient(slave);
8516 }
8517 }
8518 }
8519 }
8520
8521 static int syncWithMaster(void) {
8522 char buf[1024], tmpfile[256], authcmd[1024];
8523 long dumpsize;
8524 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8525 int dfd, maxtries = 5;
8526
8527 if (fd == -1) {
8528 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8529 strerror(errno));
8530 return REDIS_ERR;
8531 }
8532
8533 /* AUTH with the master if required. */
8534 if(server.masterauth) {
8535 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8536 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8537 close(fd);
8538 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8539 strerror(errno));
8540 return REDIS_ERR;
8541 }
8542 /* Read the AUTH result. */
8543 if (syncReadLine(fd,buf,1024,3600) == -1) {
8544 close(fd);
8545 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8546 strerror(errno));
8547 return REDIS_ERR;
8548 }
8549 if (buf[0] != '+') {
8550 close(fd);
8551 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8552 return REDIS_ERR;
8553 }
8554 }
8555
8556 /* Issue the SYNC command */
8557 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8558 close(fd);
8559 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8560 strerror(errno));
8561 return REDIS_ERR;
8562 }
8563 /* Read the bulk write count */
8564 if (syncReadLine(fd,buf,1024,3600) == -1) {
8565 close(fd);
8566 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8567 strerror(errno));
8568 return REDIS_ERR;
8569 }
8570 if (buf[0] != '$') {
8571 close(fd);
8572 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8573 return REDIS_ERR;
8574 }
8575 dumpsize = strtol(buf+1,NULL,10);
8576 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8577 /* Read the bulk write data on a temp file */
8578 while(maxtries--) {
8579 snprintf(tmpfile,256,
8580 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8581 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8582 if (dfd != -1) break;
8583 sleep(1);
8584 }
8585 if (dfd == -1) {
8586 close(fd);
8587 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8588 return REDIS_ERR;
8589 }
8590 while(dumpsize) {
8591 int nread, nwritten;
8592
8593 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8594 if (nread == -1) {
8595 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8596 strerror(errno));
8597 close(fd);
8598 close(dfd);
8599 return REDIS_ERR;
8600 }
8601 nwritten = write(dfd,buf,nread);
8602 if (nwritten == -1) {
8603 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8604 close(fd);
8605 close(dfd);
8606 return REDIS_ERR;
8607 }
8608 dumpsize -= nread;
8609 }
8610 close(dfd);
8611 if (rename(tmpfile,server.dbfilename) == -1) {
8612 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8613 unlink(tmpfile);
8614 close(fd);
8615 return REDIS_ERR;
8616 }
8617 emptyDb();
8618 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8619 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8620 close(fd);
8621 return REDIS_ERR;
8622 }
8623 server.master = createClient(fd);
8624 server.master->flags |= REDIS_MASTER;
8625 server.master->authenticated = 1;
8626 server.replstate = REDIS_REPL_CONNECTED;
8627 return REDIS_OK;
8628 }
8629
8630 static void slaveofCommand(redisClient *c) {
8631 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8632 !strcasecmp(c->argv[2]->ptr,"one")) {
8633 if (server.masterhost) {
8634 sdsfree(server.masterhost);
8635 server.masterhost = NULL;
8636 if (server.master) freeClient(server.master);
8637 server.replstate = REDIS_REPL_NONE;
8638 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8639 }
8640 } else {
8641 sdsfree(server.masterhost);
8642 server.masterhost = sdsdup(c->argv[1]->ptr);
8643 server.masterport = atoi(c->argv[2]->ptr);
8644 if (server.master) freeClient(server.master);
8645 server.replstate = REDIS_REPL_CONNECT;
8646 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8647 server.masterhost, server.masterport);
8648 }
8649 addReply(c,shared.ok);
8650 }
8651
8652 /* ============================ Maxmemory directive ======================== */
8653
8654 /* Try to free one object form the pre-allocated objects free list.
8655 * This is useful under low mem conditions as by default we take 1 million
8656 * free objects allocated. On success REDIS_OK is returned, otherwise
8657 * REDIS_ERR. */
8658 static int tryFreeOneObjectFromFreelist(void) {
8659 robj *o;
8660
8661 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8662 if (listLength(server.objfreelist)) {
8663 listNode *head = listFirst(server.objfreelist);
8664 o = listNodeValue(head);
8665 listDelNode(server.objfreelist,head);
8666 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8667 zfree(o);
8668 return REDIS_OK;
8669 } else {
8670 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8671 return REDIS_ERR;
8672 }
8673 }
8674
8675 /* This function gets called when 'maxmemory' is set on the config file to limit
8676 * the max memory used by the server, and we are out of memory.
8677 * This function will try to, in order:
8678 *
8679 * - Free objects from the free list
8680 * - Try to remove keys with an EXPIRE set
8681 *
8682 * It is not possible to free enough memory to reach used-memory < maxmemory
8683 * the server will start refusing commands that will enlarge even more the
8684 * memory usage.
8685 */
8686 static void freeMemoryIfNeeded(void) {
8687 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8688 int j, k, freed = 0;
8689
8690 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8691 for (j = 0; j < server.dbnum; j++) {
8692 int minttl = -1;
8693 robj *minkey = NULL;
8694 struct dictEntry *de;
8695
8696 if (dictSize(server.db[j].expires)) {
8697 freed = 1;
8698 /* From a sample of three keys drop the one nearest to
8699 * the natural expire */
8700 for (k = 0; k < 3; k++) {
8701 time_t t;
8702
8703 de = dictGetRandomKey(server.db[j].expires);
8704 t = (time_t) dictGetEntryVal(de);
8705 if (minttl == -1 || t < minttl) {
8706 minkey = dictGetEntryKey(de);
8707 minttl = t;
8708 }
8709 }
8710 dbDelete(server.db+j,minkey);
8711 }
8712 }
8713 if (!freed) return; /* nothing to free... */
8714 }
8715 }
8716
8717 /* ============================== Append Only file ========================== */
8718
8719 /* Called when the user switches from "appendonly yes" to "appendonly no"
8720 * at runtime using the CONFIG command. */
8721 static void stopAppendOnly(void) {
8722 flushAppendOnlyFile();
8723 aof_fsync(server.appendfd);
8724 close(server.appendfd);
8725
8726 server.appendfd = -1;
8727 server.appendseldb = -1;
8728 server.appendonly = 0;
8729 /* rewrite operation in progress? kill it, wait child exit */
8730 if (server.bgsavechildpid != -1) {
8731 int statloc;
8732
8733 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8734 wait3(&statloc,0,NULL);
8735 /* reset the buffer accumulating changes while the child saves */
8736 sdsfree(server.bgrewritebuf);
8737 server.bgrewritebuf = sdsempty();
8738 server.bgsavechildpid = -1;
8739 }
8740 }
8741
8742 /* Called when the user switches from "appendonly no" to "appendonly yes"
8743 * at runtime using the CONFIG command. */
8744 static int startAppendOnly(void) {
8745 server.appendonly = 1;
8746 server.lastfsync = time(NULL);
8747 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8748 if (server.appendfd == -1) {
8749 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8750 return REDIS_ERR;
8751 }
8752 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8753 server.appendonly = 0;
8754 close(server.appendfd);
8755 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8756 return REDIS_ERR;
8757 }
8758 return REDIS_OK;
8759 }
8760
8761 /* Write the append only file buffer on disk.
8762 *
8763 * Since we are required to write the AOF before replying to the client,
8764 * and the only way the client socket can get a write is entering when the
8765 * the event loop, we accumulate all the AOF writes in a memory
8766 * buffer and write it on disk using this function just before entering
8767 * the event loop again. */
8768 static void flushAppendOnlyFile(void) {
8769 time_t now;
8770 ssize_t nwritten;
8771
8772 if (sdslen(server.aofbuf) == 0) return;
8773
8774 /* We want to perform a single write. This should be guaranteed atomic
8775 * at least if the filesystem we are writing is a real physical one.
8776 * While this will save us against the server being killed I don't think
8777 * there is much to do about the whole server stopping for power problems
8778 * or alike */
8779 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8780 if (nwritten != (signed)sdslen(server.aofbuf)) {
8781 /* Ooops, we are in troubles. The best thing to do for now is
8782 * aborting instead of giving the illusion that everything is
8783 * working as expected. */
8784 if (nwritten == -1) {
8785 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8786 } else {
8787 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8788 }
8789 exit(1);
8790 }
8791 sdsfree(server.aofbuf);
8792 server.aofbuf = sdsempty();
8793
8794 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8795 * childs performing heavy I/O on disk. */
8796 if (server.no_appendfsync_on_rewrite &&
8797 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8798 return;
8799 /* Fsync if needed */
8800 now = time(NULL);
8801 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8802 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8803 now-server.lastfsync > 1))
8804 {
8805 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8806 * flushing metadata. */
8807 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8808 server.lastfsync = now;
8809 }
8810 }
8811
8812 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8813 int j;
8814 buf = sdscatprintf(buf,"*%d\r\n",argc);
8815 for (j = 0; j < argc; j++) {
8816 robj *o = getDecodedObject(argv[j]);
8817 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8818 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8819 buf = sdscatlen(buf,"\r\n",2);
8820 decrRefCount(o);
8821 }
8822 return buf;
8823 }
8824
8825 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8826 int argc = 3;
8827 long when;
8828 robj *argv[3];
8829
8830 /* Make sure we can use strtol */
8831 seconds = getDecodedObject(seconds);
8832 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8833 decrRefCount(seconds);
8834
8835 argv[0] = createStringObject("EXPIREAT",8);
8836 argv[1] = key;
8837 argv[2] = createObject(REDIS_STRING,
8838 sdscatprintf(sdsempty(),"%ld",when));
8839 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8840 decrRefCount(argv[0]);
8841 decrRefCount(argv[2]);
8842 return buf;
8843 }
8844
8845 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8846 sds buf = sdsempty();
8847 robj *tmpargv[3];
8848
8849 /* The DB this command was targetting is not the same as the last command
8850 * we appendend. To issue a SELECT command is needed. */
8851 if (dictid != server.appendseldb) {
8852 char seldb[64];
8853
8854 snprintf(seldb,sizeof(seldb),"%d",dictid);
8855 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8856 (unsigned long)strlen(seldb),seldb);
8857 server.appendseldb = dictid;
8858 }
8859
8860 if (cmd->proc == expireCommand) {
8861 /* Translate EXPIRE into EXPIREAT */
8862 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8863 } else if (cmd->proc == setexCommand) {
8864 /* Translate SETEX to SET and EXPIREAT */
8865 tmpargv[0] = createStringObject("SET",3);
8866 tmpargv[1] = argv[1];
8867 tmpargv[2] = argv[3];
8868 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8869 decrRefCount(tmpargv[0]);
8870 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8871 } else {
8872 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8873 }
8874
8875 /* Append to the AOF buffer. This will be flushed on disk just before
8876 * of re-entering the event loop, so before the client will get a
8877 * positive reply about the operation performed. */
8878 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8879
8880 /* If a background append only file rewriting is in progress we want to
8881 * accumulate the differences between the child DB and the current one
8882 * in a buffer, so that when the child process will do its work we
8883 * can append the differences to the new append only file. */
8884 if (server.bgrewritechildpid != -1)
8885 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8886
8887 sdsfree(buf);
8888 }
8889
8890 /* In Redis commands are always executed in the context of a client, so in
8891 * order to load the append only file we need to create a fake client. */
8892 static struct redisClient *createFakeClient(void) {
8893 struct redisClient *c = zmalloc(sizeof(*c));
8894
8895 selectDb(c,0);
8896 c->fd = -1;
8897 c->querybuf = sdsempty();
8898 c->argc = 0;
8899 c->argv = NULL;
8900 c->flags = 0;
8901 /* We set the fake client as a slave waiting for the synchronization
8902 * so that Redis will not try to send replies to this client. */
8903 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8904 c->reply = listCreate();
8905 listSetFreeMethod(c->reply,decrRefCount);
8906 listSetDupMethod(c->reply,dupClientReplyValue);
8907 initClientMultiState(c);
8908 return c;
8909 }
8910
8911 static void freeFakeClient(struct redisClient *c) {
8912 sdsfree(c->querybuf);
8913 listRelease(c->reply);
8914 freeClientMultiState(c);
8915 zfree(c);
8916 }
8917
8918 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8919 * error (the append only file is zero-length) REDIS_ERR is returned. On
8920 * fatal error an error message is logged and the program exists. */
8921 int loadAppendOnlyFile(char *filename) {
8922 struct redisClient *fakeClient;
8923 FILE *fp = fopen(filename,"r");
8924 struct redis_stat sb;
8925 int appendonly = server.appendonly;
8926
8927 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8928 return REDIS_ERR;
8929
8930 if (fp == NULL) {
8931 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8932 exit(1);
8933 }
8934
8935 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8936 * to the same file we're about to read. */
8937 server.appendonly = 0;
8938
8939 fakeClient = createFakeClient();
8940 while(1) {
8941 int argc, j;
8942 unsigned long len;
8943 robj **argv;
8944 char buf[128];
8945 sds argsds;
8946 struct redisCommand *cmd;
8947 int force_swapout;
8948
8949 if (fgets(buf,sizeof(buf),fp) == NULL) {
8950 if (feof(fp))
8951 break;
8952 else
8953 goto readerr;
8954 }
8955 if (buf[0] != '*') goto fmterr;
8956 argc = atoi(buf+1);
8957 argv = zmalloc(sizeof(robj*)*argc);
8958 for (j = 0; j < argc; j++) {
8959 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8960 if (buf[0] != '$') goto fmterr;
8961 len = strtol(buf+1,NULL,10);
8962 argsds = sdsnewlen(NULL,len);
8963 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8964 argv[j] = createObject(REDIS_STRING,argsds);
8965 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8966 }
8967
8968 /* Command lookup */
8969 cmd = lookupCommand(argv[0]->ptr);
8970 if (!cmd) {
8971 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8972 exit(1);
8973 }
8974 /* Try object encoding */
8975 if (cmd->flags & REDIS_CMD_BULK)
8976 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8977 /* Run the command in the context of a fake client */
8978 fakeClient->argc = argc;
8979 fakeClient->argv = argv;
8980 cmd->proc(fakeClient);
8981 /* Discard the reply objects list from the fake client */
8982 while(listLength(fakeClient->reply))
8983 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8984 /* Clean up, ready for the next command */
8985 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8986 zfree(argv);
8987 /* Handle swapping while loading big datasets when VM is on */
8988 force_swapout = 0;
8989 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8990 force_swapout = 1;
8991
8992 if (server.vm_enabled && force_swapout) {
8993 while (zmalloc_used_memory() > server.vm_max_memory) {
8994 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8995 }
8996 }
8997 }
8998
8999 /* This point can only be reached when EOF is reached without errors.
9000 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
9001 if (fakeClient->flags & REDIS_MULTI) goto readerr;
9002
9003 fclose(fp);
9004 freeFakeClient(fakeClient);
9005 server.appendonly = appendonly;
9006 return REDIS_OK;
9007
9008 readerr:
9009 if (feof(fp)) {
9010 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9011 } else {
9012 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9013 }
9014 exit(1);
9015 fmterr:
9016 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9017 exit(1);
9018 }
9019
/* Write the binary-safe string 's' of 'len' bytes into 'fp' using the bulk
 * format: $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char hdr[128];
    int hlen = 0;

    /* Build the "$<count>\r\n" header. */
    hdr[hlen++] = '$';
    hlen += ll2string(hdr+hlen,sizeof(hdr)-hlen,len);
    hdr[hlen++] = '\r';
    hdr[hlen++] = '\n';

    if (fwrite(hdr,hlen,1,fp) == 0) return 0;
    /* Nothing to write for the payload of an empty string. */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) != 0;
}
9034
/* Write the double 'd' into 'fp' in bulk format, $<count>\r\n<payload>\r\n,
 * where the payload is the value printed with the "%.17g" format.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen, hdrlen;

    /* The payload is built first, trailing CRLF included, since the header
     * needs its length (without the two CRLF bytes). */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    hdrlen = strlen(header);

    if (fwrite(header,hdrlen,1,fp) == 0) return 0;
    return fwrite(payload,paylen,1,fp) != 0;
}
9045
/* Write the integer 'l' into 'fp' in bulk format, $<count>\r\n<payload>\r\n,
 * using a single fwrite() call. Returns 1 on success, 0 on write error. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,outlen,1,fp) == 0) ? 0 : 1;
}
9055
9056 /* Delegate writing an object to writing a bulk string or bulk long long. */
9057 static int fwriteBulkObject(FILE *fp, robj *obj) {
9058 /* Avoid using getDecodedObject to help copy-on-write (we are often
9059 * in a child process when this function is called). */
9060 if (obj->encoding == REDIS_ENCODING_INT) {
9061 return fwriteBulkLongLong(fp,(long)obj->ptr);
9062 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9063 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9064 } else {
9065 redisPanic("Unknown string encoding");
9066 }
9067 }
9068
9069 /* Write a sequence of commands able to fully rebuild the dataset into
9070 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9071 static int rewriteAppendOnlyFile(char *filename) {
9072 dictIterator *di = NULL;
9073 dictEntry *de;
9074 FILE *fp;
9075 char tmpfile[256];
9076 int j;
9077 time_t now = time(NULL);
9078
9079 /* Note that we have to use a different temp name here compared to the
9080 * one used by rewriteAppendOnlyFileBackground() function. */
9081 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9082 fp = fopen(tmpfile,"w");
9083 if (!fp) {
9084 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9085 return REDIS_ERR;
9086 }
9087 for (j = 0; j < server.dbnum; j++) {
9088 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9089 redisDb *db = server.db+j;
9090 dict *d = db->dict;
9091 if (dictSize(d) == 0) continue;
9092 di = dictGetIterator(d);
9093 if (!di) {
9094 fclose(fp);
9095 return REDIS_ERR;
9096 }
9097
9098 /* SELECT the new DB */
9099 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9100 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9101
9102 /* Iterate this DB writing every entry */
9103 while((de = dictNext(di)) != NULL) {
9104 sds keystr = dictGetEntryKey(de);
9105 robj key, *o;
9106 time_t expiretime;
9107 int swapped;
9108
9109 keystr = dictGetEntryKey(de);
9110 o = dictGetEntryVal(de);
9111 initStaticStringObject(key,keystr);
9112 /* If the value for this key is swapped, load a preview in memory.
9113 * We use a "swapped" flag to remember if we need to free the
9114 * value object instead to just increment the ref count anyway
9115 * in order to avoid copy-on-write of pages if we are forked() */
9116 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9117 o->storage == REDIS_VM_SWAPPING) {
9118 swapped = 0;
9119 } else {
9120 o = vmPreviewObject(o);
9121 swapped = 1;
9122 }
9123 expiretime = getExpire(db,&key);
9124
9125 /* Save the key and associated value */
9126 if (o->type == REDIS_STRING) {
9127 /* Emit a SET command */
9128 char cmd[]="*3\r\n$3\r\nSET\r\n";
9129 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9130 /* Key and value */
9131 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9132 if (fwriteBulkObject(fp,o) == 0) goto werr;
9133 } else if (o->type == REDIS_LIST) {
9134 /* Emit the RPUSHes needed to rebuild the list */
9135 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9136 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9137 unsigned char *zl = o->ptr;
9138 unsigned char *p = ziplistIndex(zl,0);
9139 unsigned char *vstr;
9140 unsigned int vlen;
9141 long long vlong;
9142
9143 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9144 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9145 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9146 if (vstr) {
9147 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9148 goto werr;
9149 } else {
9150 if (fwriteBulkLongLong(fp,vlong) == 0)
9151 goto werr;
9152 }
9153 p = ziplistNext(zl,p);
9154 }
9155 } else if (o->encoding == REDIS_ENCODING_LIST) {
9156 list *list = o->ptr;
9157 listNode *ln;
9158 listIter li;
9159
9160 listRewind(list,&li);
9161 while((ln = listNext(&li))) {
9162 robj *eleobj = listNodeValue(ln);
9163
9164 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9165 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9166 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9167 }
9168 } else {
9169 redisPanic("Unknown list encoding");
9170 }
9171 } else if (o->type == REDIS_SET) {
9172 /* Emit the SADDs needed to rebuild the set */
9173 dict *set = o->ptr;
9174 dictIterator *di = dictGetIterator(set);
9175 dictEntry *de;
9176
9177 while((de = dictNext(di)) != NULL) {
9178 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9179 robj *eleobj = dictGetEntryKey(de);
9180
9181 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9182 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9183 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9184 }
9185 dictReleaseIterator(di);
9186 } else if (o->type == REDIS_ZSET) {
9187 /* Emit the ZADDs needed to rebuild the sorted set */
9188 zset *zs = o->ptr;
9189 dictIterator *di = dictGetIterator(zs->dict);
9190 dictEntry *de;
9191
9192 while((de = dictNext(di)) != NULL) {
9193 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9194 robj *eleobj = dictGetEntryKey(de);
9195 double *score = dictGetEntryVal(de);
9196
9197 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9198 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9199 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9200 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9201 }
9202 dictReleaseIterator(di);
9203 } else if (o->type == REDIS_HASH) {
9204 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9205
9206 /* Emit the HSETs needed to rebuild the hash */
9207 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9208 unsigned char *p = zipmapRewind(o->ptr);
9209 unsigned char *field, *val;
9210 unsigned int flen, vlen;
9211
9212 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9213 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9214 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9215 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9216 return -1;
9217 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9218 return -1;
9219 }
9220 } else {
9221 dictIterator *di = dictGetIterator(o->ptr);
9222 dictEntry *de;
9223
9224 while((de = dictNext(di)) != NULL) {
9225 robj *field = dictGetEntryKey(de);
9226 robj *val = dictGetEntryVal(de);
9227
9228 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9229 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9230 if (fwriteBulkObject(fp,field) == -1) return -1;
9231 if (fwriteBulkObject(fp,val) == -1) return -1;
9232 }
9233 dictReleaseIterator(di);
9234 }
9235 } else {
9236 redisPanic("Unknown object type");
9237 }
9238 /* Save the expire time */
9239 if (expiretime != -1) {
9240 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9241 /* If this key is already expired skip it */
9242 if (expiretime < now) continue;
9243 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9244 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9245 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9246 }
9247 if (swapped) decrRefCount(o);
9248 }
9249 dictReleaseIterator(di);
9250 }
9251
9252 /* Make sure data will not remain on the OS's output buffers */
9253 fflush(fp);
9254 aof_fsync(fileno(fp));
9255 fclose(fp);
9256
9257 /* Use RENAME to make sure the DB file is changed atomically only
9258 * if the generate DB file is ok. */
9259 if (rename(tmpfile,filename) == -1) {
9260 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9261 unlink(tmpfile);
9262 return REDIS_ERR;
9263 }
9264 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9265 return REDIS_OK;
9266
9267 werr:
9268 fclose(fp);
9269 unlink(tmpfile);
9270 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9271 if (di) dictReleaseIterator(di);
9272 return REDIS_ERR;
9273 }
9274
9275 /* This is how rewriting of the append only file in background works:
9276 *
9277 * 1) The user calls BGREWRITEAOF
9278 * 2) Redis calls this function, that forks():
9279 * 2a) the child rewrite the append only file in a temp file.
9280 * 2b) the parent accumulates differences in server.bgrewritebuf.
9281 * 3) When the child finished '2a' exists.
9282 * 4) The parent will trap the exit code, if it's OK, will append the
9283 * data accumulated into server.bgrewritebuf into the temp file, and
9284 * finally will rename(2) the temp file in the actual file name.
9285 * The the new file is reopened as the new append only file. Profit!
9286 */
9287 static int rewriteAppendOnlyFileBackground(void) {
9288 pid_t childpid;
9289
9290 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9291 if (server.vm_enabled) waitEmptyIOJobsQueue();
9292 if ((childpid = fork()) == 0) {
9293 /* Child */
9294 char tmpfile[256];
9295
9296 if (server.vm_enabled) vmReopenSwapFile();
9297 close(server.fd);
9298 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9299 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9300 _exit(0);
9301 } else {
9302 _exit(1);
9303 }
9304 } else {
9305 /* Parent */
9306 if (childpid == -1) {
9307 redisLog(REDIS_WARNING,
9308 "Can't rewrite append only file in background: fork: %s",
9309 strerror(errno));
9310 return REDIS_ERR;
9311 }
9312 redisLog(REDIS_NOTICE,
9313 "Background append only file rewriting started by pid %d",childpid);
9314 server.bgrewritechildpid = childpid;
9315 updateDictResizePolicy();
9316 /* We set appendseldb to -1 in order to force the next call to the
9317 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9318 * accumulated by the parent into server.bgrewritebuf will start
9319 * with a SELECT statement and it will be safe to merge. */
9320 server.appendseldb = -1;
9321 return REDIS_OK;
9322 }
9323 return REDIS_OK; /* unreached */
9324 }
9325
9326 static void bgrewriteaofCommand(redisClient *c) {
9327 if (server.bgrewritechildpid != -1) {
9328 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9329 return;
9330 }
9331 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9332 char *status = "+Background append only file rewriting started\r\n";
9333 addReplySds(c,sdsnew(status));
9334 } else {
9335 addReply(c,shared.err);
9336 }
9337 }
9338
/* Remove "temp-rewriteaof-bg-<childpid>.aof", the temp file name used by
 * the background rewrite child with the given pid. The result of
 * unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
9345
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
9366
9367 /* =================== Virtual Memory - Blocking Side ====================== */
9368
9369 /* Create a VM pointer object. This kind of objects are used in place of
9370 * values in the key -> value hash table, for swapped out objects. */
9371 static vmpointer *createVmPointer(int vtype) {
9372 vmpointer *vp = zmalloc(sizeof(vmpointer));
9373
9374 vp->type = REDIS_VMPOINTER;
9375 vp->storage = REDIS_VM_SWAPPED;
9376 vp->vtype = vtype;
9377 return vp;
9378 }
9379
9380 static void vmInit(void) {
9381 off_t totsize;
9382 int pipefds[2];
9383 size_t stacksize;
9384 struct flock fl;
9385
9386 if (server.vm_max_threads != 0)
9387 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9388
9389 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9390 /* Try to open the old swap file, otherwise create it */
9391 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9392 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9393 }
9394 if (server.vm_fp == NULL) {
9395 redisLog(REDIS_WARNING,
9396 "Can't open the swap file: %s. Exiting.",
9397 strerror(errno));
9398 exit(1);
9399 }
9400 server.vm_fd = fileno(server.vm_fp);
9401 /* Lock the swap file for writing, this is useful in order to avoid
9402 * another instance to use the same swap file for a config error. */
9403 fl.l_type = F_WRLCK;
9404 fl.l_whence = SEEK_SET;
9405 fl.l_start = fl.l_len = 0;
9406 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9407 redisLog(REDIS_WARNING,
9408 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9409 exit(1);
9410 }
9411 /* Initialize */
9412 server.vm_next_page = 0;
9413 server.vm_near_pages = 0;
9414 server.vm_stats_used_pages = 0;
9415 server.vm_stats_swapped_objects = 0;
9416 server.vm_stats_swapouts = 0;
9417 server.vm_stats_swapins = 0;
9418 totsize = server.vm_pages*server.vm_page_size;
9419 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9420 if (ftruncate(server.vm_fd,totsize) == -1) {
9421 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9422 strerror(errno));
9423 exit(1);
9424 } else {
9425 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9426 }
9427 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9428 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9429 (long long) (server.vm_pages+7)/8, server.vm_pages);
9430 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9431
9432 /* Initialize threaded I/O (used by Virtual Memory) */
9433 server.io_newjobs = listCreate();
9434 server.io_processing = listCreate();
9435 server.io_processed = listCreate();
9436 server.io_ready_clients = listCreate();
9437 pthread_mutex_init(&server.io_mutex,NULL);
9438 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9439 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9440 server.io_active_threads = 0;
9441 if (pipe(pipefds) == -1) {
9442 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9443 ,strerror(errno));
9444 exit(1);
9445 }
9446 server.io_ready_pipe_read = pipefds[0];
9447 server.io_ready_pipe_write = pipefds[1];
9448 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9449 /* LZF requires a lot of stack */
9450 pthread_attr_init(&server.io_threads_attr);
9451 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9452 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9453 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9454 /* Listen for events in the threaded I/O pipe */
9455 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9456 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9457 oom("creating file event");
9458 }
9459
9460 /* Mark the page as used */
9461 static void vmMarkPageUsed(off_t page) {
9462 off_t byte = page/8;
9463 int bit = page&7;
9464 redisAssert(vmFreePage(page) == 1);
9465 server.vm_bitmap[byte] |= 1<<bit;
9466 }
9467
9468 /* Mark N contiguous pages as used, with 'page' being the first. */
9469 static void vmMarkPagesUsed(off_t page, off_t count) {
9470 off_t j;
9471
9472 for (j = 0; j < count; j++)
9473 vmMarkPageUsed(page+j);
9474 server.vm_stats_used_pages += count;
9475 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9476 (long long)count, (long long)page);
9477 }
9478
9479 /* Mark the page as free */
9480 static void vmMarkPageFree(off_t page) {
9481 off_t byte = page/8;
9482 int bit = page&7;
9483 redisAssert(vmFreePage(page) == 0);
9484 server.vm_bitmap[byte] &= ~(1<<bit);
9485 }
9486
9487 /* Mark N contiguous pages as free, with 'page' being the first. */
9488 static void vmMarkPagesFree(off_t page, off_t count) {
9489 off_t j;
9490
9491 for (j = 0; j < count; j++)
9492 vmMarkPageFree(page+j);
9493 server.vm_stats_used_pages -= count;
9494 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9495 (long long)count, (long long)page);
9496 }
9497
9498 /* Test if the page is free */
9499 static int vmFreePage(off_t page) {
9500 off_t byte = page/8;
9501 int bit = page&7;
9502 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9503 }
9504
9505 /* Find N contiguous free pages storing the first page of the cluster in *first.
9506 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9507 * REDIS_ERR is returned.
9508 *
9509 * This function uses a simple algorithm: we try to allocate
9510 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9511 * again from the start of the swap file searching for free spaces.
9512 *
9513 * If it looks pretty clear that there are no free pages near our offset
9514 * we try to find less populated places doing a forward jump of
9515 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9516 * without hurry, and then we jump again and so forth...
9517 *
9518 * This function can be improved using a free list to avoid to guess
9519 * too much, since we could collect data about freed pages.
9520 *
9521 * note: I implemented this function just after watching an episode of
9522 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9523 */
9524 static int vmFindContiguousPages(off_t *first, off_t n) {
9525 off_t base, offset = 0, since_jump = 0, numfree = 0;
9526
9527 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9528 server.vm_near_pages = 0;
9529 server.vm_next_page = 0;
9530 }
9531 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9532 base = server.vm_next_page;
9533
9534 while(offset < server.vm_pages) {
9535 off_t this = base+offset;
9536
9537 /* If we overflow, restart from page zero */
9538 if (this >= server.vm_pages) {
9539 this -= server.vm_pages;
9540 if (this == 0) {
9541 /* Just overflowed, what we found on tail is no longer
9542 * interesting, as it's no longer contiguous. */
9543 numfree = 0;
9544 }
9545 }
9546 if (vmFreePage(this)) {
9547 /* This is a free page */
9548 numfree++;
9549 /* Already got N free pages? Return to the caller, with success */
9550 if (numfree == n) {
9551 *first = this-(n-1);
9552 server.vm_next_page = this+1;
9553 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9554 return REDIS_OK;
9555 }
9556 } else {
9557 /* The current one is not a free page */
9558 numfree = 0;
9559 }
9560
9561 /* Fast-forward if the current page is not free and we already
9562 * searched enough near this place. */
9563 since_jump++;
9564 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9565 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9566 since_jump = 0;
9567 /* Note that even if we rewind after the jump, we are don't need
9568 * to make sure numfree is set to zero as we only jump *if* it
9569 * is set to zero. */
9570 } else {
9571 /* Otherwise just check the next page */
9572 offset++;
9573 }
9574 }
9575 return REDIS_ERR;
9576 }
9577
9578 /* Write the specified object at the specified page of the swap file */
9579 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9580 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9581 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9582 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9583 redisLog(REDIS_WARNING,
9584 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9585 strerror(errno));
9586 return REDIS_ERR;
9587 }
9588 rdbSaveObject(server.vm_fp,o);
9589 fflush(server.vm_fp);
9590 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9591 return REDIS_OK;
9592 }
9593
9594 /* Transfers the 'val' object to disk. Store all the information
9595 * a 'vmpointer' object containing all the information needed to load the
9596 * object back later is returned.
9597 *
9598 * If we can't find enough contiguous empty pages to swap the object on disk
9599 * NULL is returned. */
9600 static vmpointer *vmSwapObjectBlocking(robj *val) {
9601 off_t pages = rdbSavedObjectPages(val,NULL);
9602 off_t page;
9603 vmpointer *vp;
9604
9605 assert(val->storage == REDIS_VM_MEMORY);
9606 assert(val->refcount == 1);
9607 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9608 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9609
9610 vp = createVmPointer(val->type);
9611 vp->page = page;
9612 vp->usedpages = pages;
9613 decrRefCount(val); /* Deallocate the object from memory. */
9614 vmMarkPagesUsed(page,pages);
9615 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9616 (void*) val,
9617 (unsigned long long) page, (unsigned long long) pages);
9618 server.vm_stats_swapped_objects++;
9619 server.vm_stats_swapouts++;
9620 return vp;
9621 }
9622
9623 static robj *vmReadObjectFromSwap(off_t page, int type) {
9624 robj *o;
9625
9626 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9627 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9628 redisLog(REDIS_WARNING,
9629 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9630 strerror(errno));
9631 _exit(1);
9632 }
9633 o = rdbLoadObject(type,server.vm_fp);
9634 if (o == NULL) {
9635 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9636 _exit(1);
9637 }
9638 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9639 return o;
9640 }
9641
9642 /* Load the specified object from swap to memory.
9643 * The newly allocated object is returned.
9644 *
9645 * If preview is true the unserialized object is returned to the caller but
9646 * the pages are not marked as freed, nor the vp object is freed. */
9647 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9648 robj *val;
9649
9650 redisAssert(vp->type == REDIS_VMPOINTER &&
9651 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9652 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9653 if (!preview) {
9654 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9655 vmMarkPagesFree(vp->page,vp->usedpages);
9656 zfree(vp);
9657 server.vm_stats_swapped_objects--;
9658 } else {
9659 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9660 }
9661 server.vm_stats_swapins++;
9662 return val;
9663 }
9664
9665 /* Plain object loading, from swap to memory.
9666 *
9667 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9668 * The return value is the loaded object. */
9669 static robj *vmLoadObject(robj *o) {
9670 /* If we are loading the object in background, stop it, we
9671 * need to load this object synchronously ASAP. */
9672 if (o->storage == REDIS_VM_LOADING)
9673 vmCancelThreadedIOJob(o);
9674 return vmGenericLoadObject((vmpointer*)o,0);
9675 }
9676
9677 /* Just load the value on disk, without to modify the key.
9678 * This is useful when we want to perform some operation on the value
9679 * without to really bring it from swap to memory, like while saving the
9680 * dataset or rewriting the append only log. */
9681 static robj *vmPreviewObject(robj *o) {
9682 return vmGenericLoadObject((vmpointer*)o,1);
9683 }
9684
9685 /* How a good candidate is this object for swapping?
9686 * The better candidate it is, the greater the returned value.
9687 *
9688 * Currently we try to perform a fast estimation of the object size in
9689 * memory, and combine it with aging informations.
9690 *
9691 * Basically swappability = idle-time * log(estimated size)
9692 *
9693 * Bigger objects are preferred over smaller objects, but not
9694 * proportionally, this is why we use the logarithm. This algorithm is
9695 * just a first try and will probably be tuned later. */
9696 static double computeObjectSwappability(robj *o) {
9697 /* actual age can be >= minage, but not < minage. As we use wrapping
9698 * 21 bit clocks with minutes resolution for the LRU. */
9699 time_t minage = abs(server.lruclock - o->lru);
9700 long asize = 0, elesize;
9701 robj *ele;
9702 list *l;
9703 listNode *ln;
9704 dict *d;
9705 struct dictEntry *de;
9706 int z;
9707
9708 if (minage <= 0) return 0;
9709 switch(o->type) {
9710 case REDIS_STRING:
9711 if (o->encoding != REDIS_ENCODING_RAW) {
9712 asize = sizeof(*o);
9713 } else {
9714 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9715 }
9716 break;
9717 case REDIS_LIST:
9718 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9719 asize = sizeof(*o)+ziplistSize(o->ptr);
9720 } else {
9721 l = o->ptr;
9722 ln = listFirst(l);
9723 asize = sizeof(list);
9724 if (ln) {
9725 ele = ln->value;
9726 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9727 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9728 asize += (sizeof(listNode)+elesize)*listLength(l);
9729 }
9730 }
9731 break;
9732 case REDIS_SET:
9733 case REDIS_ZSET:
9734 z = (o->type == REDIS_ZSET);
9735 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9736
9737 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9738 if (z) asize += sizeof(zset)-sizeof(dict);
9739 if (dictSize(d)) {
9740 de = dictGetRandomKey(d);
9741 ele = dictGetEntryKey(de);
9742 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9743 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9744 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9745 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9746 }
9747 break;
9748 case REDIS_HASH:
9749 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9750 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9751 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9752 unsigned int klen, vlen;
9753 unsigned char *key, *val;
9754
9755 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9756 klen = 0;
9757 vlen = 0;
9758 }
9759 asize = len*(klen+vlen+3);
9760 } else if (o->encoding == REDIS_ENCODING_HT) {
9761 d = o->ptr;
9762 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9763 if (dictSize(d)) {
9764 de = dictGetRandomKey(d);
9765 ele = dictGetEntryKey(de);
9766 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9767 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9768 ele = dictGetEntryVal(de);
9769 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9770 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9771 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9772 }
9773 }
9774 break;
9775 }
9776 return (double)minage*log(1+asize);
9777 }
9778
9779 /* Try to swap an object that's a good candidate for swapping.
9780 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9781 * to swap any object at all.
9782 *
9783 * If 'usethreaded' is true, Redis will try to swap the object in background
9784 * using I/O threads. */
9785 static int vmSwapOneObject(int usethreads) {
9786 int j, i;
9787 struct dictEntry *best = NULL;
9788 double best_swappability = 0;
9789 redisDb *best_db = NULL;
9790 robj *val;
9791 sds key;
9792
9793 for (j = 0; j < server.dbnum; j++) {
9794 redisDb *db = server.db+j;
9795 /* Why maxtries is set to 100?
9796 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9797 * are swappable objects */
9798 int maxtries = 100;
9799
9800 if (dictSize(db->dict) == 0) continue;
9801 for (i = 0; i < 5; i++) {
9802 dictEntry *de;
9803 double swappability;
9804
9805 if (maxtries) maxtries--;
9806 de = dictGetRandomKey(db->dict);
9807 val = dictGetEntryVal(de);
9808 /* Only swap objects that are currently in memory.
9809 *
9810 * Also don't swap shared objects: not a good idea in general and
9811 * we need to ensure that the main thread does not touch the
9812 * object while the I/O thread is using it, but we can't
9813 * control other keys without adding additional mutex. */
9814 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9815 if (maxtries) i--; /* don't count this try */
9816 continue;
9817 }
9818 swappability = computeObjectSwappability(val);
9819 if (!best || swappability > best_swappability) {
9820 best = de;
9821 best_swappability = swappability;
9822 best_db = db;
9823 }
9824 }
9825 }
9826 if (best == NULL) return REDIS_ERR;
9827 key = dictGetEntryKey(best);
9828 val = dictGetEntryVal(best);
9829
9830 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9831 key, best_swappability);
9832
9833 /* Swap it */
9834 if (usethreads) {
9835 robj *keyobj = createStringObject(key,sdslen(key));
9836 vmSwapObjectThreaded(keyobj,val,best_db);
9837 decrRefCount(keyobj);
9838 return REDIS_OK;
9839 } else {
9840 vmpointer *vp;
9841
9842 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9843 dictGetEntryVal(best) = vp;
9844 return REDIS_OK;
9845 } else {
9846 return REDIS_ERR;
9847 }
9848 }
9849 }
9850
/* Swap out one object synchronously: shortcut for vmSwapOneObject(0).
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9854
/* Swap out one object using the I/O threads: shortcut for
 * vmSwapOneObject(1). Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9858
9859 /* Return true if it's safe to swap out objects in a given moment.
9860 * Basically we don't want to swap objects out while there is a BGSAVE
9861 * or a BGAEOREWRITE running in backgroud. */
9862 static int vmCanSwapOut(void) {
9863 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9864 }
9865
9866 /* =================== Virtual Memory - Threaded I/O ======================= */
9867
9868 static void freeIOJob(iojob *j) {
9869 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9870 j->type == REDIS_IOJOB_DO_SWAP ||
9871 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9872 {
9873 /* we fix the storage type, otherwise decrRefCount() will try to
9874 * kill the I/O thread Job (that does no longer exists). */
9875 if (j->val->storage == REDIS_VM_SWAPPING)
9876 j->val->storage = REDIS_VM_MEMORY;
9877 decrRefCount(j->val);
9878 }
9879 decrRefCount(j->key);
9880 zfree(j);
9881 }
9882
9883 /* Every time a thread finished a Job, it writes a byte into the write side
9884 * of an unix pipe in order to "awake" the main thread, and this function
9885 * is called. */
9886 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9887 int mask)
9888 {
9889 char buf[1];
9890 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9891 REDIS_NOTUSED(el);
9892 REDIS_NOTUSED(mask);
9893 REDIS_NOTUSED(privdata);
9894
9895 /* For every byte we read in the read side of the pipe, there is one
9896 * I/O job completed to process. */
9897 while((retval = read(fd,buf,1)) == 1) {
9898 iojob *j;
9899 listNode *ln;
9900 struct dictEntry *de;
9901
9902 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9903
9904 /* Get the processed element (the oldest one) */
9905 lockThreadedIO();
9906 assert(listLength(server.io_processed) != 0);
9907 if (toprocess == -1) {
9908 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9909 if (toprocess <= 0) toprocess = 1;
9910 }
9911 ln = listFirst(server.io_processed);
9912 j = ln->value;
9913 listDelNode(server.io_processed,ln);
9914 unlockThreadedIO();
9915 /* If this job is marked as canceled, just ignore it */
9916 if (j->canceled) {
9917 freeIOJob(j);
9918 continue;
9919 }
9920 /* Post process it in the main thread, as there are things we
9921 * can do just here to avoid race conditions and/or invasive locks */
9922 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
9923 de = dictFind(j->db->dict,j->key->ptr);
9924 redisAssert(de != NULL);
9925 if (j->type == REDIS_IOJOB_LOAD) {
9926 redisDb *db;
9927 vmpointer *vp = dictGetEntryVal(de);
9928
9929 /* Key loaded, bring it at home */
9930 vmMarkPagesFree(vp->page,vp->usedpages);
9931 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9932 (unsigned char*) j->key->ptr);
9933 server.vm_stats_swapped_objects--;
9934 server.vm_stats_swapins++;
9935 dictGetEntryVal(de) = j->val;
9936 incrRefCount(j->val);
9937 db = j->db;
9938 /* Handle clients waiting for this key to be loaded. */
9939 handleClientsBlockedOnSwappedKey(db,j->key);
9940 freeIOJob(j);
9941 zfree(vp);
9942 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9943 /* Now we know the amount of pages required to swap this object.
9944 * Let's find some space for it, and queue this task again
9945 * rebranded as REDIS_IOJOB_DO_SWAP. */
9946 if (!vmCanSwapOut() ||
9947 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9948 {
9949 /* Ooops... no space or we can't swap as there is
9950 * a fork()ed Redis trying to save stuff on disk. */
9951 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
9952 freeIOJob(j);
9953 } else {
9954 /* Note that we need to mark this pages as used now,
9955 * if the job will be canceled, we'll mark them as freed
9956 * again. */
9957 vmMarkPagesUsed(j->page,j->pages);
9958 j->type = REDIS_IOJOB_DO_SWAP;
9959 lockThreadedIO();
9960 queueIOJob(j);
9961 unlockThreadedIO();
9962 }
9963 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9964 vmpointer *vp;
9965
9966 /* Key swapped. We can finally free some memory. */
9967 if (j->val->storage != REDIS_VM_SWAPPING) {
9968 vmpointer *vp = (vmpointer*) j->id;
9969 printf("storage: %d\n",vp->storage);
9970 printf("key->name: %s\n",(char*)j->key->ptr);
9971 printf("val: %p\n",(void*)j->val);
9972 printf("val->type: %d\n",j->val->type);
9973 printf("val->ptr: %s\n",(char*)j->val->ptr);
9974 }
9975 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9976 vp = createVmPointer(j->val->type);
9977 vp->page = j->page;
9978 vp->usedpages = j->pages;
9979 dictGetEntryVal(de) = vp;
9980 /* Fix the storage otherwise decrRefCount will attempt to
9981 * remove the associated I/O job */
9982 j->val->storage = REDIS_VM_MEMORY;
9983 decrRefCount(j->val);
9984 redisLog(REDIS_DEBUG,
9985 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9986 (unsigned char*) j->key->ptr,
9987 (unsigned long long) j->page, (unsigned long long) j->pages);
9988 server.vm_stats_swapped_objects++;
9989 server.vm_stats_swapouts++;
9990 freeIOJob(j);
9991 /* Put a few more swap requests in queue if we are still
9992 * out of memory */
9993 if (trytoswap && vmCanSwapOut() &&
9994 zmalloc_used_memory() > server.vm_max_memory)
9995 {
9996 int more = 1;
9997 while(more) {
9998 lockThreadedIO();
9999 more = listLength(server.io_newjobs) <
10000 (unsigned) server.vm_max_threads;
10001 unlockThreadedIO();
10002 /* Don't waste CPU time if swappable objects are rare. */
10003 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
10004 trytoswap = 0;
10005 break;
10006 }
10007 }
10008 }
10009 }
10010 processed++;
10011 if (processed == toprocess) return;
10012 }
10013 if (retval < 0 && errno != EAGAIN) {
10014 redisLog(REDIS_WARNING,
10015 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
10016 strerror(errno));
10017 }
10018 }
10019
10020 static void lockThreadedIO(void) {
10021 pthread_mutex_lock(&server.io_mutex);
10022 }
10023
10024 static void unlockThreadedIO(void) {
10025 pthread_mutex_unlock(&server.io_mutex);
10026 }
10027
10028 /* Remove the specified object from the threaded I/O queue if still not
10029 * processed, otherwise make sure to flag it as canceled. */
10030 static void vmCancelThreadedIOJob(robj *o) {
10031 list *lists[3] = {
10032 server.io_newjobs, /* 0 */
10033 server.io_processing, /* 1 */
10034 server.io_processed /* 2 */
10035 };
10036 int i;
10037
10038 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
10039 again:
10040 lockThreadedIO();
10041 /* Search for a matching object in one of the queues */
10042 for (i = 0; i < 3; i++) {
10043 listNode *ln;
10044 listIter li;
10045
10046 listRewind(lists[i],&li);
10047 while ((ln = listNext(&li)) != NULL) {
10048 iojob *job = ln->value;
10049
10050 if (job->canceled) continue; /* Skip this, already canceled. */
10051 if (job->id == o) {
10052 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10053 (void*)job, (char*)job->key->ptr, job->type, i);
10054 /* Mark the pages as free since the swap didn't happened
10055 * or happened but is now discarded. */
10056 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
10057 vmMarkPagesFree(job->page,job->pages);
10058 /* Cancel the job. It depends on the list the job is
10059 * living in. */
10060 switch(i) {
10061 case 0: /* io_newjobs */
10062 /* If the job was yet not processed the best thing to do
10063 * is to remove it from the queue at all */
10064 freeIOJob(job);
10065 listDelNode(lists[i],ln);
10066 break;
10067 case 1: /* io_processing */
10068 /* Oh Shi- the thread is messing with the Job:
10069 *
10070 * Probably it's accessing the object if this is a
10071 * PREPARE_SWAP or DO_SWAP job.
10072 * If it's a LOAD job it may be reading from disk and
10073 * if we don't wait for the job to terminate before to
10074 * cancel it, maybe in a few microseconds data can be
10075 * corrupted in this pages. So the short story is:
10076 *
10077 * Better to wait for the job to move into the
10078 * next queue (processed)... */
10079
10080 /* We try again and again until the job is completed. */
10081 unlockThreadedIO();
10082 /* But let's wait some time for the I/O thread
10083 * to finish with this job. After all this condition
10084 * should be very rare. */
10085 usleep(1);
10086 goto again;
10087 case 2: /* io_processed */
10088 /* The job was already processed, that's easy...
10089 * just mark it as canceled so that we'll ignore it
10090 * when processing completed jobs. */
10091 job->canceled = 1;
10092 break;
10093 }
10094 /* Finally we have to adjust the storage type of the object
10095 * in order to "UNDO" the operaiton. */
10096 if (o->storage == REDIS_VM_LOADING)
10097 o->storage = REDIS_VM_SWAPPED;
10098 else if (o->storage == REDIS_VM_SWAPPING)
10099 o->storage = REDIS_VM_MEMORY;
10100 unlockThreadedIO();
10101 redisLog(REDIS_DEBUG,"*** DONE");
10102 return;
10103 }
10104 }
10105 }
10106 unlockThreadedIO();
10107 printf("Not found: %p\n", (void*)o);
10108 redisAssert(1 != 1); /* We should never reach this */
10109 }
10110
10111 static void *IOThreadEntryPoint(void *arg) {
10112 iojob *j;
10113 listNode *ln;
10114 REDIS_NOTUSED(arg);
10115
10116 pthread_detach(pthread_self());
10117 while(1) {
10118 /* Get a new job to process */
10119 lockThreadedIO();
10120 if (listLength(server.io_newjobs) == 0) {
10121 /* No new jobs in queue, exit. */
10122 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10123 (long) pthread_self());
10124 server.io_active_threads--;
10125 unlockThreadedIO();
10126 return NULL;
10127 }
10128 ln = listFirst(server.io_newjobs);
10129 j = ln->value;
10130 listDelNode(server.io_newjobs,ln);
10131 /* Add the job in the processing queue */
10132 j->thread = pthread_self();
10133 listAddNodeTail(server.io_processing,j);
10134 ln = listLast(server.io_processing); /* We use ln later to remove it */
10135 unlockThreadedIO();
10136 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10137 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
10138
10139 /* Process the Job */
10140 if (j->type == REDIS_IOJOB_LOAD) {
10141 vmpointer *vp = (vmpointer*)j->id;
10142 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
10143 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10144 FILE *fp = fopen("/dev/null","w+");
10145 j->pages = rdbSavedObjectPages(j->val,fp);
10146 fclose(fp);
10147 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
10148 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10149 j->canceled = 1;
10150 }
10151
10152 /* Done: insert the job into the processed queue */
10153 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10154 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
10155 lockThreadedIO();
10156 listDelNode(server.io_processing,ln);
10157 listAddNodeTail(server.io_processed,j);
10158 unlockThreadedIO();
10159
10160 /* Signal the main thread there is new stuff to process */
10161 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10162 }
10163 return NULL; /* never reached */
10164 }
10165
10166 static void spawnIOThread(void) {
10167 pthread_t thread;
10168 sigset_t mask, omask;
10169 int err;
10170
10171 sigemptyset(&mask);
10172 sigaddset(&mask,SIGCHLD);
10173 sigaddset(&mask,SIGHUP);
10174 sigaddset(&mask,SIGPIPE);
10175 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10176 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10177 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10178 strerror(err));
10179 usleep(1000000);
10180 }
10181 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10182 server.io_active_threads++;
10183 }
10184
10185 /* We need to wait for the last thread to exit before we are able to
10186 * fork() in order to BGSAVE or BGREWRITEAOF. */
10187 static void waitEmptyIOJobsQueue(void) {
10188 while(1) {
10189 int io_processed_len;
10190
10191 lockThreadedIO();
10192 if (listLength(server.io_newjobs) == 0 &&
10193 listLength(server.io_processing) == 0 &&
10194 server.io_active_threads == 0)
10195 {
10196 unlockThreadedIO();
10197 return;
10198 }
10199 /* While waiting for empty jobs queue condition we post-process some
10200 * finshed job, as I/O threads may be hanging trying to write against
10201 * the io_ready_pipe_write FD but there are so much pending jobs that
10202 * it's blocking. */
10203 io_processed_len = listLength(server.io_processed);
10204 unlockThreadedIO();
10205 if (io_processed_len) {
10206 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10207 usleep(1000); /* 1 millisecond */
10208 } else {
10209 usleep(10000); /* 10 milliseconds */
10210 }
10211 }
10212 }
10213
10214 static void vmReopenSwapFile(void) {
10215 /* Note: we don't close the old one as we are in the child process
10216 * and don't want to mess at all with the original file object. */
10217 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10218 if (server.vm_fp == NULL) {
10219 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10220 server.vm_swap_file);
10221 _exit(1);
10222 }
10223 server.vm_fd = fileno(server.vm_fp);
10224 }
10225
10226 /* This function must be called while with threaded IO locked */
10227 static void queueIOJob(iojob *j) {
10228 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10229 (void*)j, j->type, (char*)j->key->ptr);
10230 listAddNodeTail(server.io_newjobs,j);
10231 if (server.io_active_threads < server.vm_max_threads)
10232 spawnIOThread();
10233 }
10234
10235 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10236 iojob *j;
10237
10238 j = zmalloc(sizeof(*j));
10239 j->type = REDIS_IOJOB_PREPARE_SWAP;
10240 j->db = db;
10241 j->key = key;
10242 incrRefCount(key);
10243 j->id = j->val = val;
10244 incrRefCount(val);
10245 j->canceled = 0;
10246 j->thread = (pthread_t) -1;
10247 val->storage = REDIS_VM_SWAPPING;
10248
10249 lockThreadedIO();
10250 queueIOJob(j);
10251 unlockThreadedIO();
10252 return REDIS_OK;
10253 }
10254
10255 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10256
10257 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10258 * If there is not already a job loading the key, it is craeted.
10259 * The key is added to the io_keys list in the client structure, and also
10260 * in the hash table mapping swapped keys to waiting clients, that is,
10261 * server.io_waited_keys. */
10262 static int waitForSwappedKey(redisClient *c, robj *key) {
10263 struct dictEntry *de;
10264 robj *o;
10265 list *l;
10266
10267 /* If the key does not exist or is already in RAM we don't need to
10268 * block the client at all. */
10269 de = dictFind(c->db->dict,key->ptr);
10270 if (de == NULL) return 0;
10271 o = dictGetEntryVal(de);
10272 if (o->storage == REDIS_VM_MEMORY) {
10273 return 0;
10274 } else if (o->storage == REDIS_VM_SWAPPING) {
10275 /* We were swapping the key, undo it! */
10276 vmCancelThreadedIOJob(o);
10277 return 0;
10278 }
10279
10280 /* OK: the key is either swapped, or being loaded just now. */
10281
10282 /* Add the key to the list of keys this client is waiting for.
10283 * This maps clients to keys they are waiting for. */
10284 listAddNodeTail(c->io_keys,key);
10285 incrRefCount(key);
10286
10287 /* Add the client to the swapped keys => clients waiting map. */
10288 de = dictFind(c->db->io_keys,key);
10289 if (de == NULL) {
10290 int retval;
10291
10292 /* For every key we take a list of clients blocked for it */
10293 l = listCreate();
10294 retval = dictAdd(c->db->io_keys,key,l);
10295 incrRefCount(key);
10296 assert(retval == DICT_OK);
10297 } else {
10298 l = dictGetEntryVal(de);
10299 }
10300 listAddNodeTail(l,c);
10301
10302 /* Are we already loading the key from disk? If not create a job */
10303 if (o->storage == REDIS_VM_SWAPPED) {
10304 iojob *j;
10305 vmpointer *vp = (vmpointer*)o;
10306
10307 o->storage = REDIS_VM_LOADING;
10308 j = zmalloc(sizeof(*j));
10309 j->type = REDIS_IOJOB_LOAD;
10310 j->db = c->db;
10311 j->id = (robj*)vp;
10312 j->key = key;
10313 incrRefCount(key);
10314 j->page = vp->page;
10315 j->val = NULL;
10316 j->canceled = 0;
10317 j->thread = (pthread_t) -1;
10318 lockThreadedIO();
10319 queueIOJob(j);
10320 unlockThreadedIO();
10321 }
10322 return 1;
10323 }
10324
10325 /* Preload keys for any command with first, last and step values for
10326 * the command keys prototype, as defined in the command table. */
10327 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10328 int j, last;
10329 if (cmd->vm_firstkey == 0) return;
10330 last = cmd->vm_lastkey;
10331 if (last < 0) last = argc+last;
10332 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10333 redisAssert(j < argc);
10334 waitForSwappedKey(c,argv[j]);
10335 }
10336 }
10337
10338 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10339 * Note that the number of keys to preload is user-defined, so we need to
10340 * apply a sanity check against argc. */
10341 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10342 int i, num;
10343 REDIS_NOTUSED(cmd);
10344
10345 num = atoi(argv[2]->ptr);
10346 if (num > (argc-3)) return;
10347 for (i = 0; i < num; i++) {
10348 waitForSwappedKey(c,argv[3+i]);
10349 }
10350 }
10351
10352 /* Preload keys needed to execute the entire MULTI/EXEC block.
10353 *
10354 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10355 * and will block the client when any command requires a swapped out value. */
10356 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10357 int i, margc;
10358 struct redisCommand *mcmd;
10359 robj **margv;
10360 REDIS_NOTUSED(cmd);
10361 REDIS_NOTUSED(argc);
10362 REDIS_NOTUSED(argv);
10363
10364 if (!(c->flags & REDIS_MULTI)) return;
10365 for (i = 0; i < c->mstate.count; i++) {
10366 mcmd = c->mstate.commands[i].cmd;
10367 margc = c->mstate.commands[i].argc;
10368 margv = c->mstate.commands[i].argv;
10369
10370 if (mcmd->vm_preload_proc != NULL) {
10371 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10372 } else {
10373 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10374 }
10375 }
10376 }
10377
10378 /* Is this client attempting to run a command against swapped keys?
10379 * If so, block it ASAP, load the keys in background, then resume it.
10380 *
10381 * The important idea about this function is that it can fail! If keys will
10382 * still be swapped when the client is resumed, this key lookups will
10383 * just block loading keys from disk. In practical terms this should only
10384 * happen with SORT BY command or if there is a bug in this function.
10385 *
10386 * Return 1 if the client is marked as blocked, 0 if the client can
10387 * continue as the keys it is going to access appear to be in memory. */
10388 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10389 if (cmd->vm_preload_proc != NULL) {
10390 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10391 } else {
10392 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10393 }
10394
10395 /* If the client was blocked for at least one key, mark it as blocked. */
10396 if (listLength(c->io_keys)) {
10397 c->flags |= REDIS_IO_WAIT;
10398 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10399 server.vm_blocked_clients++;
10400 return 1;
10401 } else {
10402 return 0;
10403 }
10404 }
10405
10406 /* Remove the 'key' from the list of blocked keys for a given client.
10407 *
10408 * The function returns 1 when there are no longer blocking keys after
10409 * the current one was removed (and the client can be unblocked). */
10410 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10411 list *l;
10412 listNode *ln;
10413 listIter li;
10414 struct dictEntry *de;
10415
10416 /* Remove the key from the list of keys this client is waiting for. */
10417 listRewind(c->io_keys,&li);
10418 while ((ln = listNext(&li)) != NULL) {
10419 if (equalStringObjects(ln->value,key)) {
10420 listDelNode(c->io_keys,ln);
10421 break;
10422 }
10423 }
10424 assert(ln != NULL);
10425
10426 /* Remove the client form the key => waiting clients map. */
10427 de = dictFind(c->db->io_keys,key);
10428 assert(de != NULL);
10429 l = dictGetEntryVal(de);
10430 ln = listSearchKey(l,c);
10431 assert(ln != NULL);
10432 listDelNode(l,ln);
10433 if (listLength(l) == 0)
10434 dictDelete(c->db->io_keys,key);
10435
10436 return listLength(c->io_keys) == 0;
10437 }
10438
10439 /* Every time we now a key was loaded back in memory, we handle clients
10440 * waiting for this key if any. */
10441 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10442 struct dictEntry *de;
10443 list *l;
10444 listNode *ln;
10445 int len;
10446
10447 de = dictFind(db->io_keys,key);
10448 if (!de) return;
10449
10450 l = dictGetEntryVal(de);
10451 len = listLength(l);
10452 /* Note: we can't use something like while(listLength(l)) as the list
10453 * can be freed by the calling function when we remove the last element. */
10454 while (len--) {
10455 ln = listFirst(l);
10456 redisClient *c = ln->value;
10457
10458 if (dontWaitForSwappedKey(c,key)) {
10459 /* Put the client in the list of clients ready to go as we
10460 * loaded all the keys about it. */
10461 listAddNodeTail(server.io_ready_clients,c);
10462 }
10463 }
10464 }
10465
10466 /* =========================== Remote Configuration ========================= */
10467
10468 static void configSetCommand(redisClient *c) {
10469 robj *o = getDecodedObject(c->argv[3]);
10470 long long ll;
10471
10472 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10473 zfree(server.dbfilename);
10474 server.dbfilename = zstrdup(o->ptr);
10475 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10476 zfree(server.requirepass);
10477 server.requirepass = zstrdup(o->ptr);
10478 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10479 zfree(server.masterauth);
10480 server.masterauth = zstrdup(o->ptr);
10481 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10482 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10483 ll < 0) goto badfmt;
10484 server.maxmemory = ll;
10485 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10486 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10487 ll < 0 || ll > LONG_MAX) goto badfmt;
10488 server.maxidletime = ll;
10489 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10490 if (!strcasecmp(o->ptr,"no")) {
10491 server.appendfsync = APPENDFSYNC_NO;
10492 } else if (!strcasecmp(o->ptr,"everysec")) {
10493 server.appendfsync = APPENDFSYNC_EVERYSEC;
10494 } else if (!strcasecmp(o->ptr,"always")) {
10495 server.appendfsync = APPENDFSYNC_ALWAYS;
10496 } else {
10497 goto badfmt;
10498 }
10499 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10500 int yn = yesnotoi(o->ptr);
10501
10502 if (yn == -1) goto badfmt;
10503 server.no_appendfsync_on_rewrite = yn;
10504 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10505 int old = server.appendonly;
10506 int new = yesnotoi(o->ptr);
10507
10508 if (new == -1) goto badfmt;
10509 if (old != new) {
10510 if (new == 0) {
10511 stopAppendOnly();
10512 } else {
10513 if (startAppendOnly() == REDIS_ERR) {
10514 addReplySds(c,sdscatprintf(sdsempty(),
10515 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10516 decrRefCount(o);
10517 return;
10518 }
10519 }
10520 }
10521 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10522 int vlen, j;
10523 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10524
10525 /* Perform sanity check before setting the new config:
10526 * - Even number of args
10527 * - Seconds >= 1, changes >= 0 */
10528 if (vlen & 1) {
10529 sdsfreesplitres(v,vlen);
10530 goto badfmt;
10531 }
10532 for (j = 0; j < vlen; j++) {
10533 char *eptr;
10534 long val;
10535
10536 val = strtoll(v[j], &eptr, 10);
10537 if (eptr[0] != '\0' ||
10538 ((j & 1) == 0 && val < 1) ||
10539 ((j & 1) == 1 && val < 0)) {
10540 sdsfreesplitres(v,vlen);
10541 goto badfmt;
10542 }
10543 }
10544 /* Finally set the new config */
10545 resetServerSaveParams();
10546 for (j = 0; j < vlen; j += 2) {
10547 time_t seconds;
10548 int changes;
10549
10550 seconds = strtoll(v[j],NULL,10);
10551 changes = strtoll(v[j+1],NULL,10);
10552 appendServerSaveParams(seconds, changes);
10553 }
10554 sdsfreesplitres(v,vlen);
10555 } else {
10556 addReplySds(c,sdscatprintf(sdsempty(),
10557 "-ERR not supported CONFIG parameter %s\r\n",
10558 (char*)c->argv[2]->ptr));
10559 decrRefCount(o);
10560 return;
10561 }
10562 decrRefCount(o);
10563 addReply(c,shared.ok);
10564 return;
10565
10566 badfmt: /* Bad format errors */
10567 addReplySds(c,sdscatprintf(sdsempty(),
10568 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10569 (char*)o->ptr,
10570 (char*)c->argv[2]->ptr));
10571 decrRefCount(o);
10572 }
10573
10574 static void configGetCommand(redisClient *c) {
10575 robj *o = getDecodedObject(c->argv[2]);
10576 robj *lenobj = createObject(REDIS_STRING,NULL);
10577 char *pattern = o->ptr;
10578 int matches = 0;
10579
10580 addReply(c,lenobj);
10581 decrRefCount(lenobj);
10582
10583 if (stringmatch(pattern,"dbfilename",0)) {
10584 addReplyBulkCString(c,"dbfilename");
10585 addReplyBulkCString(c,server.dbfilename);
10586 matches++;
10587 }
10588 if (stringmatch(pattern,"requirepass",0)) {
10589 addReplyBulkCString(c,"requirepass");
10590 addReplyBulkCString(c,server.requirepass);
10591 matches++;
10592 }
10593 if (stringmatch(pattern,"masterauth",0)) {
10594 addReplyBulkCString(c,"masterauth");
10595 addReplyBulkCString(c,server.masterauth);
10596 matches++;
10597 }
10598 if (stringmatch(pattern,"maxmemory",0)) {
10599 char buf[128];
10600
10601 ll2string(buf,128,server.maxmemory);
10602 addReplyBulkCString(c,"maxmemory");
10603 addReplyBulkCString(c,buf);
10604 matches++;
10605 }
10606 if (stringmatch(pattern,"timeout",0)) {
10607 char buf[128];
10608
10609 ll2string(buf,128,server.maxidletime);
10610 addReplyBulkCString(c,"timeout");
10611 addReplyBulkCString(c,buf);
10612 matches++;
10613 }
10614 if (stringmatch(pattern,"appendonly",0)) {
10615 addReplyBulkCString(c,"appendonly");
10616 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10617 matches++;
10618 }
10619 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10620 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10621 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10622 matches++;
10623 }
10624 if (stringmatch(pattern,"appendfsync",0)) {
10625 char *policy;
10626
10627 switch(server.appendfsync) {
10628 case APPENDFSYNC_NO: policy = "no"; break;
10629 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10630 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10631 default: policy = "unknown"; break; /* too harmless to panic */
10632 }
10633 addReplyBulkCString(c,"appendfsync");
10634 addReplyBulkCString(c,policy);
10635 matches++;
10636 }
10637 if (stringmatch(pattern,"save",0)) {
10638 sds buf = sdsempty();
10639 int j;
10640
10641 for (j = 0; j < server.saveparamslen; j++) {
10642 buf = sdscatprintf(buf,"%ld %d",
10643 server.saveparams[j].seconds,
10644 server.saveparams[j].changes);
10645 if (j != server.saveparamslen-1)
10646 buf = sdscatlen(buf," ",1);
10647 }
10648 addReplyBulkCString(c,"save");
10649 addReplyBulkCString(c,buf);
10650 sdsfree(buf);
10651 matches++;
10652 }
10653 decrRefCount(o);
10654 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10655 }
10656
10657 static void configCommand(redisClient *c) {
10658 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10659 if (c->argc != 4) goto badarity;
10660 configSetCommand(c);
10661 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10662 if (c->argc != 3) goto badarity;
10663 configGetCommand(c);
10664 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10665 if (c->argc != 2) goto badarity;
10666 server.stat_numcommands = 0;
10667 server.stat_numconnections = 0;
10668 server.stat_expiredkeys = 0;
10669 server.stat_starttime = time(NULL);
10670 addReply(c,shared.ok);
10671 } else {
10672 addReplySds(c,sdscatprintf(sdsempty(),
10673 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10674 }
10675 return;
10676
10677 badarity:
10678 addReplySds(c,sdscatprintf(sdsempty(),
10679 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10680 (char*) c->argv[1]->ptr));
10681 }
10682
10683 /* =========================== Pubsub implementation ======================== */
10684
10685 static void freePubsubPattern(void *p) {
10686 pubsubPattern *pat = p;
10687
10688 decrRefCount(pat->pattern);
10689 zfree(pat);
10690 }
10691
10692 static int listMatchPubsubPattern(void *a, void *b) {
10693 pubsubPattern *pa = a, *pb = b;
10694
10695 return (pa->client == pb->client) &&
10696 (equalStringObjects(pa->pattern,pb->pattern));
10697 }
10698
10699 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10700 * 0 if the client was already subscribed to that channel. */
10701 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10702 struct dictEntry *de;
10703 list *clients = NULL;
10704 int retval = 0;
10705
10706 /* Add the channel to the client -> channels hash table */
10707 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10708 retval = 1;
10709 incrRefCount(channel);
10710 /* Add the client to the channel -> list of clients hash table */
10711 de = dictFind(server.pubsub_channels,channel);
10712 if (de == NULL) {
10713 clients = listCreate();
10714 dictAdd(server.pubsub_channels,channel,clients);
10715 incrRefCount(channel);
10716 } else {
10717 clients = dictGetEntryVal(de);
10718 }
10719 listAddNodeTail(clients,c);
10720 }
10721 /* Notify the client */
10722 addReply(c,shared.mbulk3);
10723 addReply(c,shared.subscribebulk);
10724 addReplyBulk(c,channel);
10725 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10726 return retval;
10727 }
10728
10729 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10730 * 0 if the client was not subscribed to the specified channel. */
10731 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10732 struct dictEntry *de;
10733 list *clients;
10734 listNode *ln;
10735 int retval = 0;
10736
10737 /* Remove the channel from the client -> channels hash table */
10738 incrRefCount(channel); /* channel may be just a pointer to the same object
10739 we have in the hash tables. Protect it... */
10740 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10741 retval = 1;
10742 /* Remove the client from the channel -> clients list hash table */
10743 de = dictFind(server.pubsub_channels,channel);
10744 assert(de != NULL);
10745 clients = dictGetEntryVal(de);
10746 ln = listSearchKey(clients,c);
10747 assert(ln != NULL);
10748 listDelNode(clients,ln);
10749 if (listLength(clients) == 0) {
10750 /* Free the list and associated hash entry at all if this was
10751 * the latest client, so that it will be possible to abuse
10752 * Redis PUBSUB creating millions of channels. */
10753 dictDelete(server.pubsub_channels,channel);
10754 }
10755 }
10756 /* Notify the client */
10757 if (notify) {
10758 addReply(c,shared.mbulk3);
10759 addReply(c,shared.unsubscribebulk);
10760 addReplyBulk(c,channel);
10761 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10762 listLength(c->pubsub_patterns));
10763
10764 }
10765 decrRefCount(channel); /* it is finally safe to release it */
10766 return retval;
10767 }
10768
10769 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10770 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10771 int retval = 0;
10772
10773 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10774 retval = 1;
10775 pubsubPattern *pat;
10776 listAddNodeTail(c->pubsub_patterns,pattern);
10777 incrRefCount(pattern);
10778 pat = zmalloc(sizeof(*pat));
10779 pat->pattern = getDecodedObject(pattern);
10780 pat->client = c;
10781 listAddNodeTail(server.pubsub_patterns,pat);
10782 }
10783 /* Notify the client */
10784 addReply(c,shared.mbulk3);
10785 addReply(c,shared.psubscribebulk);
10786 addReplyBulk(c,pattern);
10787 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10788 return retval;
10789 }
10790
10791 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10792 * 0 if the client was not subscribed to the specified channel. */
10793 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10794 listNode *ln;
10795 pubsubPattern pat;
10796 int retval = 0;
10797
10798 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10799 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10800 retval = 1;
10801 listDelNode(c->pubsub_patterns,ln);
10802 pat.client = c;
10803 pat.pattern = pattern;
10804 ln = listSearchKey(server.pubsub_patterns,&pat);
10805 listDelNode(server.pubsub_patterns,ln);
10806 }
10807 /* Notify the client */
10808 if (notify) {
10809 addReply(c,shared.mbulk3);
10810 addReply(c,shared.punsubscribebulk);
10811 addReplyBulk(c,pattern);
10812 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10813 listLength(c->pubsub_patterns));
10814 }
10815 decrRefCount(pattern);
10816 return retval;
10817 }
10818
10819 /* Unsubscribe from all the channels. Return the number of channels the
10820 * client was subscribed from. */
10821 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10822 dictIterator *di = dictGetIterator(c->pubsub_channels);
10823 dictEntry *de;
10824 int count = 0;
10825
10826 while((de = dictNext(di)) != NULL) {
10827 robj *channel = dictGetEntryKey(de);
10828
10829 count += pubsubUnsubscribeChannel(c,channel,notify);
10830 }
10831 dictReleaseIterator(di);
10832 return count;
10833 }
10834
10835 /* Unsubscribe from all the patterns. Return the number of patterns the
10836 * client was subscribed from. */
10837 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10838 listNode *ln;
10839 listIter li;
10840 int count = 0;
10841
10842 listRewind(c->pubsub_patterns,&li);
10843 while ((ln = listNext(&li)) != NULL) {
10844 robj *pattern = ln->value;
10845
10846 count += pubsubUnsubscribePattern(c,pattern,notify);
10847 }
10848 return count;
10849 }
10850
10851 /* Publish a message */
10852 static int pubsubPublishMessage(robj *channel, robj *message) {
10853 int receivers = 0;
10854 struct dictEntry *de;
10855 listNode *ln;
10856 listIter li;
10857
10858 /* Send to clients listening for that channel */
10859 de = dictFind(server.pubsub_channels,channel);
10860 if (de) {
10861 list *list = dictGetEntryVal(de);
10862 listNode *ln;
10863 listIter li;
10864
10865 listRewind(list,&li);
10866 while ((ln = listNext(&li)) != NULL) {
10867 redisClient *c = ln->value;
10868
10869 addReply(c,shared.mbulk3);
10870 addReply(c,shared.messagebulk);
10871 addReplyBulk(c,channel);
10872 addReplyBulk(c,message);
10873 receivers++;
10874 }
10875 }
10876 /* Send to clients listening to matching channels */
10877 if (listLength(server.pubsub_patterns)) {
10878 listRewind(server.pubsub_patterns,&li);
10879 channel = getDecodedObject(channel);
10880 while ((ln = listNext(&li)) != NULL) {
10881 pubsubPattern *pat = ln->value;
10882
10883 if (stringmatchlen((char*)pat->pattern->ptr,
10884 sdslen(pat->pattern->ptr),
10885 (char*)channel->ptr,
10886 sdslen(channel->ptr),0)) {
10887 addReply(pat->client,shared.mbulk4);
10888 addReply(pat->client,shared.pmessagebulk);
10889 addReplyBulk(pat->client,pat->pattern);
10890 addReplyBulk(pat->client,channel);
10891 addReplyBulk(pat->client,message);
10892 receivers++;
10893 }
10894 }
10895 decrRefCount(channel);
10896 }
10897 return receivers;
10898 }
10899
10900 static void subscribeCommand(redisClient *c) {
10901 int j;
10902
10903 for (j = 1; j < c->argc; j++)
10904 pubsubSubscribeChannel(c,c->argv[j]);
10905 }
10906
10907 static void unsubscribeCommand(redisClient *c) {
10908 if (c->argc == 1) {
10909 pubsubUnsubscribeAllChannels(c,1);
10910 return;
10911 } else {
10912 int j;
10913
10914 for (j = 1; j < c->argc; j++)
10915 pubsubUnsubscribeChannel(c,c->argv[j],1);
10916 }
10917 }
10918
10919 static void psubscribeCommand(redisClient *c) {
10920 int j;
10921
10922 for (j = 1; j < c->argc; j++)
10923 pubsubSubscribePattern(c,c->argv[j]);
10924 }
10925
10926 static void punsubscribeCommand(redisClient *c) {
10927 if (c->argc == 1) {
10928 pubsubUnsubscribeAllPatterns(c,1);
10929 return;
10930 } else {
10931 int j;
10932
10933 for (j = 1; j < c->argc; j++)
10934 pubsubUnsubscribePattern(c,c->argv[j],1);
10935 }
10936 }
10937
10938 static void publishCommand(redisClient *c) {
10939 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10940 addReplyLongLong(c,receivers);
10941 }
10942
10943 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10944 *
10945 * The implementation uses a per-DB hash table mapping keys to list of clients
10946 * WATCHing those keys, so that given a key that is going to be modified
10947 * we can mark all the associated clients as dirty.
10948 *
10949 * Also every client contains a list of WATCHed keys so that's possible to
10950 * un-watch such keys when the client is freed or when UNWATCH is called. */
10951
10952 /* In the client->watched_keys list we need to use watchedKey structures
10953 * as in order to identify a key in Redis we need both the key name and the
10954 * DB */
10955 typedef struct watchedKey {
10956 robj *key;
10957 redisDb *db;
10958 } watchedKey;
10959
10960 /* Watch for the specified key */
10961 static void watchForKey(redisClient *c, robj *key) {
10962 list *clients = NULL;
10963 listIter li;
10964 listNode *ln;
10965 watchedKey *wk;
10966
10967 /* Check if we are already watching for this key */
10968 listRewind(c->watched_keys,&li);
10969 while((ln = listNext(&li))) {
10970 wk = listNodeValue(ln);
10971 if (wk->db == c->db && equalStringObjects(key,wk->key))
10972 return; /* Key already watched */
10973 }
10974 /* This key is not already watched in this DB. Let's add it */
10975 clients = dictFetchValue(c->db->watched_keys,key);
10976 if (!clients) {
10977 clients = listCreate();
10978 dictAdd(c->db->watched_keys,key,clients);
10979 incrRefCount(key);
10980 }
10981 listAddNodeTail(clients,c);
10982 /* Add the new key to the lits of keys watched by this client */
10983 wk = zmalloc(sizeof(*wk));
10984 wk->key = key;
10985 wk->db = c->db;
10986 incrRefCount(key);
10987 listAddNodeTail(c->watched_keys,wk);
10988 }
10989
10990 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10991 * flag is up to the caller. */
10992 static void unwatchAllKeys(redisClient *c) {
10993 listIter li;
10994 listNode *ln;
10995
10996 if (listLength(c->watched_keys) == 0) return;
10997 listRewind(c->watched_keys,&li);
10998 while((ln = listNext(&li))) {
10999 list *clients;
11000 watchedKey *wk;
11001
11002 /* Lookup the watched key -> clients list and remove the client
11003 * from the list */
11004 wk = listNodeValue(ln);
11005 clients = dictFetchValue(wk->db->watched_keys, wk->key);
11006 assert(clients != NULL);
11007 listDelNode(clients,listSearchKey(clients,c));
11008 /* Kill the entry at all if this was the only client */
11009 if (listLength(clients) == 0)
11010 dictDelete(wk->db->watched_keys, wk->key);
11011 /* Remove this watched key from the client->watched list */
11012 listDelNode(c->watched_keys,ln);
11013 decrRefCount(wk->key);
11014 zfree(wk);
11015 }
11016 }
11017
11018 /* "Touch" a key, so that if this key is being WATCHed by some client the
11019 * next EXEC will fail. */
11020 static void touchWatchedKey(redisDb *db, robj *key) {
11021 list *clients;
11022 listIter li;
11023 listNode *ln;
11024
11025 if (dictSize(db->watched_keys) == 0) return;
11026 clients = dictFetchValue(db->watched_keys, key);
11027 if (!clients) return;
11028
11029 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11030 /* Check if we are already watching for this key */
11031 listRewind(clients,&li);
11032 while((ln = listNext(&li))) {
11033 redisClient *c = listNodeValue(ln);
11034
11035 c->flags |= REDIS_DIRTY_CAS;
11036 }
11037 }
11038
11039 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11040 * flush but will be deleted as effect of the flushing operation should
11041 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11042 * a FLUSHALL operation (all the DBs flushed). */
11043 static void touchWatchedKeysOnFlush(int dbid) {
11044 listIter li1, li2;
11045 listNode *ln;
11046
11047 /* For every client, check all the waited keys */
11048 listRewind(server.clients,&li1);
11049 while((ln = listNext(&li1))) {
11050 redisClient *c = listNodeValue(ln);
11051 listRewind(c->watched_keys,&li2);
11052 while((ln = listNext(&li2))) {
11053 watchedKey *wk = listNodeValue(ln);
11054
11055 /* For every watched key matching the specified DB, if the
11056 * key exists, mark the client as dirty, as the key will be
11057 * removed. */
11058 if (dbid == -1 || wk->db->id == dbid) {
11059 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
11060 c->flags |= REDIS_DIRTY_CAS;
11061 }
11062 }
11063 }
11064 }
11065
11066 static void watchCommand(redisClient *c) {
11067 int j;
11068
11069 if (c->flags & REDIS_MULTI) {
11070 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11071 return;
11072 }
11073 for (j = 1; j < c->argc; j++)
11074 watchForKey(c,c->argv[j]);
11075 addReply(c,shared.ok);
11076 }
11077
11078 static void unwatchCommand(redisClient *c) {
11079 unwatchAllKeys(c);
11080 c->flags &= (~REDIS_DIRTY_CAS);
11081 addReply(c,shared.ok);
11082 }
11083
11084 /* ================================= Debugging ============================== */
11085
11086 /* Compute the sha1 of string at 's' with 'len' bytes long.
11087 * The SHA1 is then xored againt the string pointed by digest.
11088 * Since xor is commutative, this operation is used in order to
11089 * "add" digests relative to unordered elements.
11090 *
11091 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11092 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11093 SHA1_CTX ctx;
11094 unsigned char hash[20], *s = ptr;
11095 int j;
11096
11097 SHA1Init(&ctx);
11098 SHA1Update(&ctx,s,len);
11099 SHA1Final(hash,&ctx);
11100
11101 for (j = 0; j < 20; j++)
11102 digest[j] ^= hash[j];
11103 }
11104
11105 static void xorObjectDigest(unsigned char *digest, robj *o) {
11106 o = getDecodedObject(o);
11107 xorDigest(digest,o->ptr,sdslen(o->ptr));
11108 decrRefCount(o);
11109 }
11110
11111 /* This function instead of just computing the SHA1 and xoring it
11112 * against diget, also perform the digest of "digest" itself and
11113 * replace the old value with the new one.
11114 *
11115 * So the final digest will be:
11116 *
11117 * digest = SHA1(digest xor SHA1(data))
11118 *
11119 * This function is used every time we want to preserve the order so
11120 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11121 *
11122 * Also note that mixdigest("foo") followed by mixdigest("bar")
11123 * will lead to a different digest compared to "fo", "obar".
11124 */
11125 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11126 SHA1_CTX ctx;
11127 char *s = ptr;
11128
11129 xorDigest(digest,s,len);
11130 SHA1Init(&ctx);
11131 SHA1Update(&ctx,digest,20);
11132 SHA1Final(digest,&ctx);
11133 }
11134
11135 static void mixObjectDigest(unsigned char *digest, robj *o) {
11136 o = getDecodedObject(o);
11137 mixDigest(digest,o->ptr,sdslen(o->ptr));
11138 decrRefCount(o);
11139 }
11140
11141 /* Compute the dataset digest. Since keys, sets elements, hashes elements
11142 * are not ordered, we use a trick: every aggregate digest is the xor
11143 * of the digests of their elements. This way the order will not change
11144 * the result. For list instead we use a feedback entering the output digest
11145 * as input in order to ensure that a different ordered list will result in
11146 * a different digest. */
11147 static void computeDatasetDigest(unsigned char *final) {
11148 unsigned char digest[20];
11149 char buf[128];
11150 dictIterator *di = NULL;
11151 dictEntry *de;
11152 int j;
11153 uint32_t aux;
11154
11155 memset(final,0,20); /* Start with a clean result */
11156
11157 for (j = 0; j < server.dbnum; j++) {
11158 redisDb *db = server.db+j;
11159
11160 if (dictSize(db->dict) == 0) continue;
11161 di = dictGetIterator(db->dict);
11162
11163 /* hash the DB id, so the same dataset moved in a different
11164 * DB will lead to a different digest */
11165 aux = htonl(j);
11166 mixDigest(final,&aux,sizeof(aux));
11167
11168 /* Iterate this DB writing every entry */
11169 while((de = dictNext(di)) != NULL) {
11170 sds key;
11171 robj *keyobj, *o;
11172 time_t expiretime;
11173
11174 memset(digest,0,20); /* This key-val digest */
11175 key = dictGetEntryKey(de);
11176 keyobj = createStringObject(key,sdslen(key));
11177
11178 mixDigest(digest,key,sdslen(key));
11179
11180 /* Make sure the key is loaded if VM is active */
11181 o = lookupKeyRead(db,keyobj);
11182
11183 aux = htonl(o->type);
11184 mixDigest(digest,&aux,sizeof(aux));
11185 expiretime = getExpire(db,keyobj);
11186
11187 /* Save the key and associated value */
11188 if (o->type == REDIS_STRING) {
11189 mixObjectDigest(digest,o);
11190 } else if (o->type == REDIS_LIST) {
11191 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11192 listTypeEntry entry;
11193 while(listTypeNext(li,&entry)) {
11194 robj *eleobj = listTypeGet(&entry);
11195 mixObjectDigest(digest,eleobj);
11196 decrRefCount(eleobj);
11197 }
11198 listTypeReleaseIterator(li);
11199 } else if (o->type == REDIS_SET) {
11200 dict *set = o->ptr;
11201 dictIterator *di = dictGetIterator(set);
11202 dictEntry *de;
11203
11204 while((de = dictNext(di)) != NULL) {
11205 robj *eleobj = dictGetEntryKey(de);
11206
11207 xorObjectDigest(digest,eleobj);
11208 }
11209 dictReleaseIterator(di);
11210 } else if (o->type == REDIS_ZSET) {
11211 zset *zs = o->ptr;
11212 dictIterator *di = dictGetIterator(zs->dict);
11213 dictEntry *de;
11214
11215 while((de = dictNext(di)) != NULL) {
11216 robj *eleobj = dictGetEntryKey(de);
11217 double *score = dictGetEntryVal(de);
11218 unsigned char eledigest[20];
11219
11220 snprintf(buf,sizeof(buf),"%.17g",*score);
11221 memset(eledigest,0,20);
11222 mixObjectDigest(eledigest,eleobj);
11223 mixDigest(eledigest,buf,strlen(buf));
11224 xorDigest(digest,eledigest,20);
11225 }
11226 dictReleaseIterator(di);
11227 } else if (o->type == REDIS_HASH) {
11228 hashTypeIterator *hi;
11229 robj *obj;
11230
11231 hi = hashTypeInitIterator(o);
11232 while (hashTypeNext(hi) != REDIS_ERR) {
11233 unsigned char eledigest[20];
11234
11235 memset(eledigest,0,20);
11236 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
11237 mixObjectDigest(eledigest,obj);
11238 decrRefCount(obj);
11239 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
11240 mixObjectDigest(eledigest,obj);
11241 decrRefCount(obj);
11242 xorDigest(digest,eledigest,20);
11243 }
11244 hashTypeReleaseIterator(hi);
11245 } else {
11246 redisPanic("Unknown object type");
11247 }
11248 /* If the key has an expire, add it to the mix */
11249 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11250 /* We can finally xor the key-val digest to the final digest */
11251 xorDigest(final,digest,20);
11252 decrRefCount(keyobj);
11253 }
11254 dictReleaseIterator(di);
11255 }
11256 }
11257
11258 static void debugCommand(redisClient *c) {
11259 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11260 *((char*)-1) = 'x';
11261 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11262 if (rdbSave(server.dbfilename) != REDIS_OK) {
11263 addReply(c,shared.err);
11264 return;
11265 }
11266 emptyDb();
11267 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11268 addReply(c,shared.err);
11269 return;
11270 }
11271 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11272 addReply(c,shared.ok);
11273 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11274 emptyDb();
11275 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11276 addReply(c,shared.err);
11277 return;
11278 }
11279 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11280 addReply(c,shared.ok);
11281 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11282 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11283 robj *val;
11284
11285 if (!de) {
11286 addReply(c,shared.nokeyerr);
11287 return;
11288 }
11289 val = dictGetEntryVal(de);
11290 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11291 val->storage == REDIS_VM_SWAPPING)) {
11292 char *strenc;
11293 char buf[128];
11294
11295 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11296 strenc = strencoding[val->encoding];
11297 } else {
11298 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11299 strenc = buf;
11300 }
11301 addReplySds(c,sdscatprintf(sdsempty(),
11302 "+Value at:%p refcount:%d "
11303 "encoding:%s serializedlength:%lld\r\n",
11304 (void*)val, val->refcount,
11305 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11306 } else {
11307 vmpointer *vp = (vmpointer*) val;
11308 addReplySds(c,sdscatprintf(sdsempty(),
11309 "+Value swapped at: page %llu "
11310 "using %llu pages\r\n",
11311 (unsigned long long) vp->page,
11312 (unsigned long long) vp->usedpages));
11313 }
11314 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11315 lookupKeyRead(c->db,c->argv[2]);
11316 addReply(c,shared.ok);
11317 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11318 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11319 robj *val;
11320 vmpointer *vp;
11321
11322 if (!server.vm_enabled) {
11323 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11324 return;
11325 }
11326 if (!de) {
11327 addReply(c,shared.nokeyerr);
11328 return;
11329 }
11330 val = dictGetEntryVal(de);
11331 /* Swap it */
11332 if (val->storage != REDIS_VM_MEMORY) {
11333 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11334 } else if (val->refcount != 1) {
11335 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11336 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11337 dictGetEntryVal(de) = vp;
11338 addReply(c,shared.ok);
11339 } else {
11340 addReply(c,shared.err);
11341 }
11342 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11343 long keys, j;
11344 robj *key, *val;
11345 char buf[128];
11346
11347 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11348 return;
11349 for (j = 0; j < keys; j++) {
11350 snprintf(buf,sizeof(buf),"key:%lu",j);
11351 key = createStringObject(buf,strlen(buf));
11352 if (lookupKeyRead(c->db,key) != NULL) {
11353 decrRefCount(key);
11354 continue;
11355 }
11356 snprintf(buf,sizeof(buf),"value:%lu",j);
11357 val = createStringObject(buf,strlen(buf));
11358 dbAdd(c->db,key,val);
11359 decrRefCount(key);
11360 }
11361 addReply(c,shared.ok);
11362 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11363 unsigned char digest[20];
11364 sds d = sdsnew("+");
11365 int j;
11366
11367 computeDatasetDigest(digest);
11368 for (j = 0; j < 20; j++)
11369 d = sdscatprintf(d, "%02x",digest[j]);
11370
11371 d = sdscatlen(d,"\r\n",2);
11372 addReplySds(c,d);
11373 } else {
11374 addReplySds(c,sdsnew(
11375 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11376 }
11377 }
11378
11379 static void _redisAssert(char *estr, char *file, int line) {
11380 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11381 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11382 #ifdef HAVE_BACKTRACE
11383 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11384 *((char*)-1) = 'x';
11385 #endif
11386 }
11387
11388 static void _redisPanic(char *msg, char *file, int line) {
11389 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11390 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11391 #ifdef HAVE_BACKTRACE
11392 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11393 *((char*)-1) = 'x';
11394 #endif
11395 }
11396
11397 /* =================================== Main! ================================ */
11398
11399 #ifdef __linux__
/* Read the value stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the number found at the start of the first line of the file, or
 * -1 if the file can't be opened, can't be read, or does not start with a
 * number. Unparsable content is reported as -1 instead of 0, so that the
 * caller can't mistake it for an overcommit setting of 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;
    int gotline;

    if (!fp) return -1;
    gotline = (fgets(buf,sizeof(buf),fp) != NULL);
    fclose(fp);
    if (!gotline) return -1;

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits consumed */
    return (int)value;
}
11413
11414 void linuxOvercommitMemoryWarning(void) {
11415 if (linuxOvercommitMemoryValue() == 0) {
11416 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11417 }
11418 }
11419 #endif /* __linux__ */
11420
11421 static void daemonize(void) {
11422 int fd;
11423 FILE *fp;
11424
11425 if (fork() != 0) exit(0); /* parent exits */
11426 setsid(); /* create a new session */
11427
11428 /* Every output goes to /dev/null. If Redis is daemonized but
11429 * the 'logfile' is set to 'stdout' in the configuration file
11430 * it will not log at all. */
11431 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11432 dup2(fd, STDIN_FILENO);
11433 dup2(fd, STDOUT_FILENO);
11434 dup2(fd, STDERR_FILENO);
11435 if (fd > STDERR_FILENO) close(fd);
11436 }
11437 /* Try to write the pid file */
11438 fp = fopen(server.pidfile,"w");
11439 if (fp) {
11440 fprintf(fp,"%d\n",getpid());
11441 fclose(fp);
11442 }
11443 }
11444
11445 static void version() {
11446 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11447 redisGitSHA1(), atoi(redisGitDirty()) > 0);
11448 exit(0);
11449 }
11450
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11456
11457 int main(int argc, char **argv) {
11458 time_t start;
11459
11460 initServerConfig();
11461 sortCommandTable();
11462 if (argc == 2) {
11463 if (strcmp(argv[1], "-v") == 0 ||
11464 strcmp(argv[1], "--version") == 0) version();
11465 if (strcmp(argv[1], "--help") == 0) usage();
11466 resetServerSaveParams();
11467 loadServerConfig(argv[1]);
11468 } else if ((argc > 2)) {
11469 usage();
11470 } else {
11471 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11472 }
11473 if (server.daemonize) daemonize();
11474 initServer();
11475 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11476 #ifdef __linux__
11477 linuxOvercommitMemoryWarning();
11478 #endif
11479 start = time(NULL);
11480 if (server.appendonly) {
11481 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11482 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11483 } else {
11484 if (rdbLoad(server.dbfilename) == REDIS_OK)
11485 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11486 }
11487 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11488 aeSetBeforeSleepProc(server.el,beforeSleep);
11489 aeMain(server.el);
11490 aeDeleteEventLoop(server.el);
11491 return 0;
11492 }
11493
11494 /* ============================= Backtrace support ========================= */
11495
11496 #ifdef HAVE_BACKTRACE
11497 static char *findFuncName(void *pointer, unsigned long *offset);
11498
11499 static void *getMcontextEip(ucontext_t *uc) {
11500 #if defined(__FreeBSD__)
11501 return (void*) uc->uc_mcontext.mc_eip;
11502 #elif defined(__dietlibc__)
11503 return (void*) uc->uc_mcontext.eip;
11504 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11505 #if __x86_64__
11506 return (void*) uc->uc_mcontext->__ss.__rip;
11507 #else
11508 return (void*) uc->uc_mcontext->__ss.__eip;
11509 #endif
11510 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11511 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11512 return (void*) uc->uc_mcontext->__ss.__rip;
11513 #else
11514 return (void*) uc->uc_mcontext->__ss.__eip;
11515 #endif
11516 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11517 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11518 #elif defined(__ia64__) /* Linux IA64 */
11519 return (void*) uc->uc_mcontext.sc_ip;
11520 #else
11521 return NULL;
11522 #endif
11523 }
11524
11525 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11526 void *trace[100];
11527 char **messages = NULL;
11528 int i, trace_size = 0;
11529 unsigned long offset=0;
11530 ucontext_t *uc = (ucontext_t*) secret;
11531 sds infostring;
11532 REDIS_NOTUSED(info);
11533
11534 redisLog(REDIS_WARNING,
11535 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11536 infostring = genRedisInfoString();
11537 redisLog(REDIS_WARNING, "%s",infostring);
11538 /* It's not safe to sdsfree() the returned string under memory
11539 * corruption conditions. Let it leak as we are going to abort */
11540
11541 trace_size = backtrace(trace, 100);
11542 /* overwrite sigaction with caller's address */
11543 if (getMcontextEip(uc) != NULL) {
11544 trace[1] = getMcontextEip(uc);
11545 }
11546 messages = backtrace_symbols(trace, trace_size);
11547
11548 for (i=1; i<trace_size; ++i) {
11549 char *fn = findFuncName(trace[i], &offset), *p;
11550
11551 p = strchr(messages[i],'+');
11552 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11553 redisLog(REDIS_WARNING,"%s", messages[i]);
11554 } else {
11555 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11556 }
11557 }
11558 /* free(messages); Don't call free() with possibly corrupted memory. */
11559 _exit(0);
11560 }
11561
11562 static void sigtermHandler(int sig) {
11563 REDIS_NOTUSED(sig);
11564
11565 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11566 server.shutdown_asap = 1;
11567 }
11568
11569 static void setupSigSegvAction(void) {
11570 struct sigaction act;
11571
11572 sigemptyset (&act.sa_mask);
11573 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11574 * is used. Otherwise, sa_handler is used */
11575 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11576 act.sa_sigaction = segvHandler;
11577 sigaction (SIGSEGV, &act, NULL);
11578 sigaction (SIGBUS, &act, NULL);
11579 sigaction (SIGFPE, &act, NULL);
11580 sigaction (SIGILL, &act, NULL);
11581 sigaction (SIGBUS, &act, NULL);
11582
11583 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11584 act.sa_handler = sigtermHandler;
11585 sigaction (SIGTERM, &act, NULL);
11586 return;
11587 }
11588
11589 #include "staticsymbols.h"
11590 /* This function try to convert a pointer into a function name. It's used in
11591 * oreder to provide a backtrace under segmentation fault that's able to
11592 * display functions declared as static (otherwise the backtrace is useless). */
11593 static char *findFuncName(void *pointer, unsigned long *offset){
11594 int i, ret = -1;
11595 unsigned long off, minoff = 0;
11596
11597 /* Try to match against the Symbol with the smallest offset */
11598 for (i=0; symsTable[i].pointer; i++) {
11599 unsigned long lp = (unsigned long) pointer;
11600
11601 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11602 off=lp-symsTable[i].pointer;
11603 if (ret < 0 || off < minoff) {
11604 minoff=off;
11605 ret=i;
11606 }
11607 }
11608 }
11609 if (ret == -1) return NULL;
11610 *offset = minoff;
11611 return symsTable[ret].name;
11612 }
11613 #else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: no signal handler is
 * installed. */
static void setupSigSegvAction(void)
{
    /* Nothing to do. */
}
11616 #endif /* HAVE_BACKTRACE */
11617
11618
11619
11620 /* The End */
11621
11622
11623