]> git.saurik.com Git - redis.git/blob - redis.c
merge antirez/smallkeys
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "ziplist.h" /* Compact list data structure */
79 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
80 #include "release.h" /* Release and/or git repository information */
81
/* Error codes */
#define REDIS_OK                        0
#define REDIS_ERR                       -1

/* Static server configuration */
#define REDIS_SERVERPORT                6379            /* TCP port */
#define REDIS_MAXIDLETIME               (60*5)          /* default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               8
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000         /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME             60              /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10              /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT       (1024*64)
#define REDIS_REQUEST_MAX_SIZE          (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
#define REDIS_WRITEV_THRESHOLD          3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT        256

/* Hash table parameters */
#define REDIS_HT_MINFILL                10              /* Minimal hash table fill 10% */

/* Command flags (bit values, may be OR-ed together) */
#define REDIS_CMD_BULK                  1               /* Bulk write command */
#define REDIS_CMD_INLINE                2               /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM               4
#define REDIS_CMD_FORCE_REPLICATION     8               /* Force replication even if dirty is 0 */
117
/* Object types */
#define REDIS_STRING            0
#define REDIS_LIST              1
#define REDIS_SET               2
#define REDIS_ZSET              3
#define REDIS_HASH              4
#define REDIS_VMPOINTER         8

/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of these values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_HT       2   /* Encoded as hash table */
#define REDIS_ENCODING_ZIPMAP   3   /* Encoded as zipmap */
#define REDIS_ENCODING_LIST     4   /* Encoded as list */
#define REDIS_ENCODING_ZIPLIST  5   /* Encoded as ziplist */

/* Human readable encoding names, indexed by the REDIS_ENCODING_* value. */
static char* strencoding[] = {
    [REDIS_ENCODING_RAW]     = "raw",
    [REDIS_ENCODING_INT]     = "int",
    [REDIS_ENCODING_HT]      = "hashtable",
    [REDIS_ENCODING_ZIPMAP]  = "zipmap",
    [REDIS_ENCODING_LIST]    = "list",
    [REDIS_ENCODING_ZIPLIST] = "ziplist"
};
139
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so the two most significant bits of the
 * first byte tell how to interpret the length:
 *
 * 00|000000 => the len is the 6 remaining bits of this byte
 * 01|000000 00000000 => the len is 14 bits: 6 bits + 8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit len will follow
 * 11|000000 => a specially encoded object will follow. The six bits
 *              number specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN       0
#define REDIS_RDB_14BITLEN      1
#define REDIS_RDB_32BITLEN      2
#define REDIS_RDB_ENCVAL        3
#define REDIS_RDB_LENERR        UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8      0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16     1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32     2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF       3   /* string compressed with FASTLZ */
171
/* Virtual memory object->storage values. */
#define REDIS_VM_MEMORY     0   /* The object is in memory */
#define REDIS_VM_SWAPPED    1   /* The object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. While Virtual Memory I/O operations are performed
 * by threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
189
/* Client flags (bit values stored in redisClient.flags) */
#define REDIS_SLAVE         1   /* This client is a slave server */
#define REDIS_MASTER        2   /* This client is a master server */
#define REDIS_MONITOR       4   /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI         8   /* This client is in a MULTI context */
#define REDIS_BLOCKED       16  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT       32  /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS     64  /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state instead the server
 * is waiting to start the next background saving in order to send updates
 * to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */

/* List related stuff */
#define REDIS_HEAD  0
#define REDIS_TAIL  1

/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* Log levels */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3
228
/* Anti-warning macro: evaluates V and discards the result. The argument is
 * parenthesized so that any expression (e.g. "a + b") can be passed. */
#define REDIS_NOTUSED(V) ((void) (V))

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512

/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the
 * stringified expression, file and line, then _exit(1).
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then _exit(1).
 *
 * Both expansions are fully parenthesized so that each macro behaves as a
 * single expression wherever it is used. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
249
250 /*================================= Data types ============================== */
251
/* The actual Redis Object: a reference counted value of one of the
 * REDIS_* object types (string / list / set / ...).
 *
 * The four bit-fields add up to 32 bits (4 + 2 + 4 + 22).
 *
 * NOTE(review): an older comment here described trailing "VM fields" and a
 * redisObjectVM structure; no such fields are declared in this struct. */
typedef struct redisObject {
    unsigned int type:4;        /* one of the REDIS_* object types */
    unsigned int storage:2;     /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
    unsigned int encoding:4;    /* one of the REDIS_ENCODING_* values */
    unsigned int lru:22;        /* lru time (relative to server.lruclock) */
    int refcount;               /* reference count */
    void *ptr;                  /* payload, representation depends on encoding */
} robj;
267
/* The VM pointer structure - identifies an object in the swap file.
 *
 * It is stored in place of the value object in the main key->value hash
 * table representing a database. Its first fields (type, storage) are the
 * same as the ones of the redisObject structure, so a vmPointer can be
 * accessed even when casted to a redisObject.
 *
 * This is useful because we don't know whether a value object is on disk
 * or not, but we are always able to read obj->storage to check this. For
 * vmPointer structures "type" is set to REDIS_VMPOINTER (even if, without
 * this field, it is still possible to tell the kind of object from the
 * value of 'storage'). */
typedef struct vmPointer {
    unsigned int type:4;
    unsigned int storage:2;     /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
    unsigned int notused:26;
    unsigned int vtype;         /* type of the object stored in the swap file */
    off_t page;                 /* the page at which the object is stored on disk */
    off_t usedpages;            /* number of pages used on disk */
} vmpointer;
288
/* Macro used to initialize a Redis object allocated on the stack.
 *
 * _var is the object itself (an lvalue, not a pointer) and _ptr the value
 * stored in its 'ptr' field. The object is set up as a raw-encoded string
 * with refcount 1 and REDIS_VM_MEMORY storage.
 *
 * This macro is kept near the structure definition to make sure we update
 * it when the structure is changed, to avoid bugs like bug #85, which was
 * introduced exactly in this way.
 *
 * There is deliberately no ';' after "while(0)": the caller supplies it, so
 * that the macro acts as a single statement, also inside an unbraced
 * if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
300
301 typedef struct redisDb {
302 dict *dict; /* The keyspace for this DB */
303 dict *expires; /* Timeout of keys with a timeout set */
304 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
305 dict *io_keys; /* Keys with clients waiting for VM I/O */
306 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
307 int id;
308 } redisDb;
309
310 /* Client MULTI/EXEC state */
311 typedef struct multiCmd {
312 robj **argv;
313 int argc;
314 struct redisCommand *cmd;
315 } multiCmd;
316
317 typedef struct multiState {
318 multiCmd *commands; /* Array of MULTI commands */
319 int count; /* Total number of MULTI commands */
320 } multiState;
321
322 /* With multiplexing we need to take per-clinet state.
323 * Clients are taken in a liked list. */
324 typedef struct redisClient {
325 int fd;
326 redisDb *db;
327 int dictid;
328 sds querybuf;
329 robj **argv, **mbargv;
330 int argc, mbargc;
331 int bulklen; /* bulk read len. -1 if not in bulk read mode */
332 int multibulk; /* multi bulk command format active */
333 list *reply;
334 int sentlen;
335 time_t lastinteraction; /* time of the last interaction, used for timeout */
336 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
337 int slaveseldb; /* slave selected db, if this client is a slave */
338 int authenticated; /* when requirepass is non-NULL */
339 int replstate; /* replication state if this is a slave */
340 int repldbfd; /* replication DB file descriptor */
341 long repldboff; /* replication DB file offset */
342 off_t repldbsize; /* replication DB file size */
343 multiState mstate; /* MULTI/EXEC state */
344 robj **blocking_keys; /* The key we are waiting to terminate a blocking
345 * operation such as BLPOP. Otherwise NULL. */
346 int blocking_keys_num; /* Number of blocking keys */
347 time_t blockingto; /* Blocking operation timeout. If UNIX current time
348 * is >= blockingto then the operation timed out. */
349 list *io_keys; /* Keys this client is waiting to be loaded from the
350 * swap file in order to continue. */
351 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
352 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
353 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
354 } redisClient;
355
/* One save point: a pair of a time span and a number of changes. */
struct saveparam {
    time_t seconds;     /* time span */
    int changes;        /* number of changes */
};
360
361 /* Global server state structure */
362 struct redisServer {
363 int port;
364 int fd;
365 redisDb *db;
366 long long dirty; /* changes to DB from the last save */
367 list *clients;
368 list *slaves, *monitors;
369 char neterr[ANET_ERR_LEN];
370 aeEventLoop *el;
371 int cronloops; /* number of times the cron function run */
372 list *objfreelist; /* A list of freed objects to avoid malloc() */
373 time_t lastsave; /* Unix time of last save succeeede */
374 /* Fields used only for stats */
375 time_t stat_starttime; /* server start time */
376 long long stat_numcommands; /* number of processed commands */
377 long long stat_numconnections; /* number of connections received */
378 long long stat_expiredkeys; /* number of expired keys */
379 /* Configuration */
380 int verbosity;
381 int glueoutputbuf;
382 int maxidletime;
383 int dbnum;
384 int daemonize;
385 int appendonly;
386 int appendfsync;
387 int no_appendfsync_on_rewrite;
388 int shutdown_asap;
389 time_t lastfsync;
390 int appendfd;
391 int appendseldb;
392 char *pidfile;
393 pid_t bgsavechildpid;
394 pid_t bgrewritechildpid;
395 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
396 sds aofbuf; /* AOF buffer, written before entering the event loop */
397 struct saveparam *saveparams;
398 int saveparamslen;
399 char *logfile;
400 char *bindaddr;
401 char *dbfilename;
402 char *appendfilename;
403 char *requirepass;
404 int rdbcompression;
405 int activerehashing;
406 /* Replication related */
407 int isslave;
408 char *masterauth;
409 char *masterhost;
410 int masterport;
411 redisClient *master; /* client that is master for this slave */
412 int replstate;
413 unsigned int maxclients;
414 unsigned long long maxmemory;
415 unsigned int blpop_blocked_clients;
416 unsigned int vm_blocked_clients;
417 /* Sort parameters - qsort_r() is only available under BSD so we
418 * have to take this state global, in order to pass it to sortCompare() */
419 int sort_desc;
420 int sort_alpha;
421 int sort_bypattern;
422 /* Virtual memory configuration */
423 int vm_enabled;
424 char *vm_swap_file;
425 off_t vm_page_size;
426 off_t vm_pages;
427 unsigned long long vm_max_memory;
428 /* Hashes config */
429 size_t hash_max_zipmap_entries;
430 size_t hash_max_zipmap_value;
431 /* Virtual memory state */
432 FILE *vm_fp;
433 int vm_fd;
434 off_t vm_next_page; /* Next probably empty page */
435 off_t vm_near_pages; /* Number of pages allocated sequentially */
436 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
437 time_t unixtime; /* Unix time sampled every second. */
438 /* Virtual memory I/O threads stuff */
439 /* An I/O thread process an element taken from the io_jobs queue and
440 * put the result of the operation in the io_done list. While the
441 * job is being processed, it's put on io_processing queue. */
442 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
443 list *io_processing; /* List of VM I/O jobs being processed */
444 list *io_processed; /* List of VM I/O jobs already processed */
445 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
446 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
447 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
448 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
449 pthread_attr_t io_threads_attr; /* attributes for threads creation */
450 int io_active_threads; /* Number of running I/O threads */
451 int vm_max_threads; /* Max number of I/O threads running at the same time */
452 /* Our main thread is blocked on the event loop, locking for sockets ready
453 * to be read or written, so when a threaded I/O operation is ready to be
454 * processed by the main thread, the I/O thread will use a unix pipe to
455 * awake the main thread. The followings are the two pipe FDs. */
456 int io_ready_pipe_read;
457 int io_ready_pipe_write;
458 /* Virtual memory stats */
459 unsigned long long vm_stats_used_pages;
460 unsigned long long vm_stats_swapped_objects;
461 unsigned long long vm_stats_swapouts;
462 unsigned long long vm_stats_swapins;
463 /* Pubsub */
464 dict *pubsub_channels; /* Map channels to list of subscribed clients */
465 list *pubsub_patterns; /* A list of pubsub_patterns */
466 /* Misc */
467 FILE *devnull;
468 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
469 unsigned lruclock_padding:10;
470 };
471
472 typedef struct pubsubPattern {
473 redisClient *client;
474 robj *pattern;
475 } pubsubPattern;
476
477 typedef void redisCommandProc(redisClient *c);
478 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
479 struct redisCommand {
480 char *name;
481 redisCommandProc *proc;
482 int arity;
483 int flags;
484 /* Use a function to determine which keys need to be loaded
485 * in the background prior to executing this command. Takes precedence
486 * over vm_firstkey and others, ignored when NULL */
487 redisVmPreloadProc *vm_preload_proc;
488 /* What keys should be loaded in background when calling this command? */
489 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
490 int vm_lastkey; /* THe last argument that's a key */
491 int vm_keystep; /* The step between first and last key */
492 };
493
/* A symbol name paired with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
498
499 typedef struct _redisSortObject {
500 robj *obj;
501 union {
502 double score;
503 robj *cmpobj;
504 } u;
505 } redisSortObject;
506
507 typedef struct _redisSortOperation {
508 int type;
509 robj *pattern;
510 } redisSortOperation;
511
512 /* ZSETs use a specialized version of Skiplists */
513
514 typedef struct zskiplistNode {
515 struct zskiplistNode **forward;
516 struct zskiplistNode *backward;
517 unsigned int *span;
518 double score;
519 robj *obj;
520 } zskiplistNode;
521
522 typedef struct zskiplist {
523 struct zskiplistNode *header, *tail;
524 unsigned long length;
525 int level;
526 } zskiplist;
527
528 typedef struct zset {
529 dict *dict;
530 zskiplist *zsl;
531 } zset;
532
533 /* Our shared "common" objects */
534
535 #define REDIS_SHARED_INTEGERS 10000
536 struct sharedObjectsStruct {
537 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
538 *colon, *nullbulk, *nullmultibulk, *queued,
539 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
540 *outofrangeerr, *plus,
541 *select0, *select1, *select2, *select3, *select4,
542 *select5, *select6, *select7, *select8, *select9,
543 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
544 *mbulk4, *psubscribebulk, *punsubscribebulk,
545 *integers[REDIS_SHARED_INTEGERS];
546 } shared;
547
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
553
554 /* VM threaded I/O request message */
555 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
556 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
557 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
558 typedef struct iojob {
559 int type; /* Request type, REDIS_IOJOB_* */
560 redisDb *db;/* Redis database */
561 robj *key; /* This I/O request is about swapping this key */
562 robj *id; /* Unique identifier of this job:
563 this is the object to swap for REDIS_IOREQ_*_SWAP, or the
564 vmpointer objct for REDIS_IOREQ_LOAD. */
565 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
566 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
567 off_t page; /* Swap page where to read/write the object */
568 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
569 int canceled; /* True if this command was canceled by blocking side of VM */
570 pthread_t thread; /* ID of the thread processing this entry */
571 } iojob;
572
573 /*================================ Prototypes =============================== */
574
575 static void freeStringObject(robj *o);
576 static void freeListObject(robj *o);
577 static void freeSetObject(robj *o);
578 static void decrRefCount(void *o);
579 static robj *createObject(int type, void *ptr);
580 static void freeClient(redisClient *c);
581 static int rdbLoad(char *filename);
582 static void addReply(redisClient *c, robj *obj);
583 static void addReplySds(redisClient *c, sds s);
584 static void incrRefCount(robj *o);
585 static int rdbSaveBackground(char *filename);
586 static robj *createStringObject(char *ptr, size_t len);
587 static robj *dupStringObject(robj *o);
588 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
589 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
590 static void flushAppendOnlyFile(void);
591 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
592 static int syncWithMaster(void);
593 static robj *tryObjectEncoding(robj *o);
594 static robj *getDecodedObject(robj *o);
595 static int removeExpire(redisDb *db, robj *key);
596 static int expireIfNeeded(redisDb *db, robj *key);
597 static int deleteIfVolatile(redisDb *db, robj *key);
598 static int dbDelete(redisDb *db, robj *key);
599 static time_t getExpire(redisDb *db, robj *key);
600 static int setExpire(redisDb *db, robj *key, time_t when);
601 static void updateSlavesWaitingBgsave(int bgsaveerr);
602 static void freeMemoryIfNeeded(void);
603 static int processCommand(redisClient *c);
604 static void setupSigSegvAction(void);
605 static void rdbRemoveTempFile(pid_t childpid);
606 static void aofRemoveTempFile(pid_t childpid);
607 static size_t stringObjectLen(robj *o);
608 static void processInputBuffer(redisClient *c);
609 static zskiplist *zslCreate(void);
610 static void zslFree(zskiplist *zsl);
611 static void zslInsert(zskiplist *zsl, double score, robj *obj);
612 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
613 static void initClientMultiState(redisClient *c);
614 static void freeClientMultiState(redisClient *c);
615 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
616 static void unblockClientWaitingData(redisClient *c);
617 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
618 static void vmInit(void);
619 static void vmMarkPagesFree(off_t page, off_t count);
620 static robj *vmLoadObject(robj *o);
621 static robj *vmPreviewObject(robj *o);
622 static int vmSwapOneObjectBlocking(void);
623 static int vmSwapOneObjectThreaded(void);
624 static int vmCanSwapOut(void);
625 static int tryFreeOneObjectFromFreelist(void);
626 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
627 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
628 static void vmCancelThreadedIOJob(robj *o);
629 static void lockThreadedIO(void);
630 static void unlockThreadedIO(void);
631 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
632 static void freeIOJob(iojob *j);
633 static void queueIOJob(iojob *j);
634 static int vmWriteObjectOnSwap(robj *o, off_t page);
635 static robj *vmReadObjectFromSwap(off_t page, int type);
636 static void waitEmptyIOJobsQueue(void);
637 static void vmReopenSwapFile(void);
638 static int vmFreePage(off_t page);
639 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
640 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
641 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
642 static int dontWaitForSwappedKey(redisClient *c, robj *key);
643 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
644 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
645 static struct redisCommand *lookupCommand(char *name);
646 static void call(redisClient *c, struct redisCommand *cmd);
647 static void resetClient(redisClient *c);
648 static void convertToRealHash(robj *o);
649 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
650 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
651 static void freePubsubPattern(void *p);
652 static int listMatchPubsubPattern(void *a, void *b);
653 static int compareStringObjects(robj *a, robj *b);
654 static int equalStringObjects(robj *a, robj *b);
655 static void usage();
656 static int rewriteAppendOnlyFileBackground(void);
657 static vmpointer *vmSwapObjectBlocking(robj *val);
658 static int prepareForShutdown();
659 static void touchWatchedKey(redisDb *db, robj *key);
660 static void touchWatchedKeysOnFlush(int dbid);
661 static void unwatchAllKeys(redisClient *c);
662
663 static void authCommand(redisClient *c);
664 static void pingCommand(redisClient *c);
665 static void echoCommand(redisClient *c);
666 static void setCommand(redisClient *c);
667 static void setnxCommand(redisClient *c);
668 static void setexCommand(redisClient *c);
669 static void getCommand(redisClient *c);
670 static void delCommand(redisClient *c);
671 static void existsCommand(redisClient *c);
672 static void incrCommand(redisClient *c);
673 static void decrCommand(redisClient *c);
674 static void incrbyCommand(redisClient *c);
675 static void decrbyCommand(redisClient *c);
676 static void selectCommand(redisClient *c);
677 static void randomkeyCommand(redisClient *c);
678 static void keysCommand(redisClient *c);
679 static void dbsizeCommand(redisClient *c);
680 static void lastsaveCommand(redisClient *c);
681 static void saveCommand(redisClient *c);
682 static void bgsaveCommand(redisClient *c);
683 static void bgrewriteaofCommand(redisClient *c);
684 static void shutdownCommand(redisClient *c);
685 static void moveCommand(redisClient *c);
686 static void renameCommand(redisClient *c);
687 static void renamenxCommand(redisClient *c);
688 static void lpushCommand(redisClient *c);
689 static void rpushCommand(redisClient *c);
690 static void lpopCommand(redisClient *c);
691 static void rpopCommand(redisClient *c);
692 static void llenCommand(redisClient *c);
693 static void lindexCommand(redisClient *c);
694 static void lrangeCommand(redisClient *c);
695 static void ltrimCommand(redisClient *c);
696 static void typeCommand(redisClient *c);
697 static void lsetCommand(redisClient *c);
698 static void saddCommand(redisClient *c);
699 static void sremCommand(redisClient *c);
700 static void smoveCommand(redisClient *c);
701 static void sismemberCommand(redisClient *c);
702 static void scardCommand(redisClient *c);
703 static void spopCommand(redisClient *c);
704 static void srandmemberCommand(redisClient *c);
705 static void sinterCommand(redisClient *c);
706 static void sinterstoreCommand(redisClient *c);
707 static void sunionCommand(redisClient *c);
708 static void sunionstoreCommand(redisClient *c);
709 static void sdiffCommand(redisClient *c);
710 static void sdiffstoreCommand(redisClient *c);
711 static void syncCommand(redisClient *c);
712 static void flushdbCommand(redisClient *c);
713 static void flushallCommand(redisClient *c);
714 static void sortCommand(redisClient *c);
715 static void lremCommand(redisClient *c);
716 static void rpoplpushcommand(redisClient *c);
717 static void infoCommand(redisClient *c);
718 static void mgetCommand(redisClient *c);
719 static void monitorCommand(redisClient *c);
720 static void expireCommand(redisClient *c);
721 static void expireatCommand(redisClient *c);
722 static void getsetCommand(redisClient *c);
723 static void ttlCommand(redisClient *c);
724 static void slaveofCommand(redisClient *c);
725 static void debugCommand(redisClient *c);
726 static void msetCommand(redisClient *c);
727 static void msetnxCommand(redisClient *c);
728 static void zaddCommand(redisClient *c);
729 static void zincrbyCommand(redisClient *c);
730 static void zrangeCommand(redisClient *c);
731 static void zrangebyscoreCommand(redisClient *c);
732 static void zcountCommand(redisClient *c);
733 static void zrevrangeCommand(redisClient *c);
734 static void zcardCommand(redisClient *c);
735 static void zremCommand(redisClient *c);
736 static void zscoreCommand(redisClient *c);
737 static void zremrangebyscoreCommand(redisClient *c);
738 static void multiCommand(redisClient *c);
739 static void execCommand(redisClient *c);
740 static void discardCommand(redisClient *c);
741 static void blpopCommand(redisClient *c);
742 static void brpopCommand(redisClient *c);
743 static void appendCommand(redisClient *c);
744 static void substrCommand(redisClient *c);
745 static void zrankCommand(redisClient *c);
746 static void zrevrankCommand(redisClient *c);
747 static void hsetCommand(redisClient *c);
748 static void hsetnxCommand(redisClient *c);
749 static void hgetCommand(redisClient *c);
750 static void hmsetCommand(redisClient *c);
751 static void hmgetCommand(redisClient *c);
752 static void hdelCommand(redisClient *c);
753 static void hlenCommand(redisClient *c);
754 static void zremrangebyrankCommand(redisClient *c);
755 static void zunionstoreCommand(redisClient *c);
756 static void zinterstoreCommand(redisClient *c);
757 static void hkeysCommand(redisClient *c);
758 static void hvalsCommand(redisClient *c);
759 static void hgetallCommand(redisClient *c);
760 static void hexistsCommand(redisClient *c);
761 static void configCommand(redisClient *c);
762 static void hincrbyCommand(redisClient *c);
763 static void subscribeCommand(redisClient *c);
764 static void unsubscribeCommand(redisClient *c);
765 static void psubscribeCommand(redisClient *c);
766 static void punsubscribeCommand(redisClient *c);
767 static void publishCommand(redisClient *c);
768 static void watchCommand(redisClient *c);
769 static void unwatchCommand(redisClient *c);
770
771 /*================================= Globals ================================= */
772
773 /* Global vars */
774 static struct redisServer server; /* server global state */
775 static struct redisCommand *commandTable;
776 static struct redisCommand readonlyCommandTable[] = {
777 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
779 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
780 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
781 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
786 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
788 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
790 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
800 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
801 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
802 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
803 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
804 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
805 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
809 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
810 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
811 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
812 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
813 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
814 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
815 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
816 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
817 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
818 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
821 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
822 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
826 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
827 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
828 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
829 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
830 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
831 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
832 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
833 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
834 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
835 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
836 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
837 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
838 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
839 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
840 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
841 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
842 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
843 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
844 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
845 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
846 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
847 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
851 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
852 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
859 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
865 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
867 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
869 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
870 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
872 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
875 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
876 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
877 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
878 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
882 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
883 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
885 };
886
887 /*============================ Utility functions ============================ */
888
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, the empty one included
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; "a-z" inside the brackets is a range,
 *           a leading '^' negates the set, a backslash quotes the next byte
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero bytes are compared after tolower(), except for
 * bytes quoted with a backslash inside a bracket expression, which are
 * always compared exactly.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 * The function never reads beyond the given lengths, so neither buffer
 * needs to be nul terminated. A '*' is resolved by recursion. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen > 0) {
        /* An exhausted string can only be matched by '*'. Every other
         * pattern element needs one byte to consume: bail out instead of
         * comparing against (and stepping over) whatever follows the
         * string. */
        if (stringLen <= 0 && pattern[0] != '*')
            return 0; /* no match */

        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', looking only inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while(stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back one byte so that the
                     * advance after the switch leaves us exactly at the
                     * end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may still match. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1011
/* Glob-style matching of two nul terminated C strings: a thin wrapper that
 * measures both arguments with strlen() and calls stringmatchlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
1015
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-' sign, a run of decimal digits and
 * an optional, case insensitive unit:
 *
 *   (none), "b"   bytes
 *   "k"           1000 bytes          "kb"   1024 bytes
 *   "m"           1000*1000 bytes     "mb"   1024*1024 bytes
 *   "g"           1000*1000*1000      "gb"   1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
 * otherwise. Error cases:
 *   - unknown unit: the number is returned as if it was expressed in bytes;
 *   - number too long for the internal buffer: LLONG_MAX is returned;
 *   - result not representable as a long long: LLONG_MAX or LLONG_MIN is
 *     returned, depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * isdigit() is undefined for negative char values. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    /* Refuse to overflow: either strtoll() already saturated, or the
     * multiplication by the unit would not fit (mul is always >= 1). */
    if (errno == ERANGE || val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1062
/* Convert a long long into a nul terminated decimal string stored in 's',
 * a buffer of 'len' bytes.
 *
 * Returns the number of characters stored, not counting the nul term. If
 * the buffer is too small the output is truncated to the first len-1
 * characters of the representation. With len == 0 nothing is written and
 * 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in the unsigned domain: "-value" is a signed
     * overflow when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++; /* now p points to the first character of the number */
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1086
1087 static void redisLog(int level, const char *fmt, ...) {
1088 va_list ap;
1089 FILE *fp;
1090
1091 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1092 if (!fp) return;
1093
1094 va_start(ap, fmt);
1095 if (level >= server.verbosity) {
1096 char *c = ".-*#";
1097 char buf[64];
1098 time_t now;
1099
1100 now = time(NULL);
1101 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1102 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1103 vfprintf(fp, fmt, ap);
1104 fprintf(fp,"\n");
1105 fflush(fp);
1106 }
1107 va_end(ap);
1108
1109 if (server.logfile) fclose(fp);
1110 }
1111
1112 /*====================== Hash table type implementation ==================== */
1113
1114 /* This is a hash table type that uses the SDS dynamic strings library as
1115 * keys and redis objects as values (objects can hold SDS strings,
1116 * lists, sets). */
1117
/* Destructor callback that just hands the value to zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1123
1124 static void dictListDestructor(void *privdata, void *val)
1125 {
1126 DICT_NOTUSED(privdata);
1127 listRelease((list*)val);
1128 }
1129
1130 static int dictSdsKeyCompare(void *privdata, const void *key1,
1131 const void *key2)
1132 {
1133 int l1,l2;
1134 DICT_NOTUSED(privdata);
1135
1136 l1 = sdslen((sds)key1);
1137 l2 = sdslen((sds)key2);
1138 if (l1 != l2) return 0;
1139 return memcmp(key1, key2, l1) == 0;
1140 }
1141
/* Destructor callback for Redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, because values of swapped
 * out keys are set to NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1149
/* Destructor callback for sds strings: the string is released with
 * sdsfree(). 'privdata' is not used. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1156
1157 static int dictObjKeyCompare(void *privdata, const void *key1,
1158 const void *key2)
1159 {
1160 const robj *o1 = key1, *o2 = key2;
1161 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1162 }
1163
1164 static unsigned int dictObjHash(const void *key) {
1165 const robj *o = key;
1166 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1167 }
1168
/* Hash callback for keys that are sds strings: hashes the sdslen() bytes
 * of the key with dictGenHashFunction(). */
static unsigned int dictSdsHash(const void *key) {
    unsigned char *bytes = (unsigned char*)key;

    return dictGenHashFunction(bytes, sdslen((char*)key));
}
1172
1173 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1174 const void *key2)
1175 {
1176 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1177 int cmp;
1178
1179 if (o1->encoding == REDIS_ENCODING_INT &&
1180 o2->encoding == REDIS_ENCODING_INT)
1181 return o1->ptr == o2->ptr;
1182
1183 o1 = getDecodedObject(o1);
1184 o2 = getDecodedObject(o2);
1185 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
1186 decrRefCount(o1);
1187 decrRefCount(o2);
1188 return cmp;
1189 }
1190
1191 static unsigned int dictEncObjHash(const void *key) {
1192 robj *o = (robj*) key;
1193
1194 if (o->encoding == REDIS_ENCODING_RAW) {
1195 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1196 } else {
1197 if (o->encoding == REDIS_ENCODING_INT) {
1198 char buf[32];
1199 int len;
1200
1201 len = ll2string(buf,32,(long)o->ptr);
1202 return dictGenHashFunction((unsigned char*)buf, len);
1203 } else {
1204 unsigned int hash;
1205
1206 o = getDecodedObject(o);
1207 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1208 decrRefCount(o);
1209 return hash;
1210 }
1211 }
1212 }
1213
1214 /* Sets type */
1215 static dictType setDictType = {
1216 dictEncObjHash, /* hash function */
1217 NULL, /* key dup */
1218 NULL, /* val dup */
1219 dictEncObjKeyCompare, /* key compare */
1220 dictRedisObjectDestructor, /* key destructor */
1221 NULL /* val destructor */
1222 };
1223
1224 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1225 static dictType zsetDictType = {
1226 dictEncObjHash, /* hash function */
1227 NULL, /* key dup */
1228 NULL, /* val dup */
1229 dictEncObjKeyCompare, /* key compare */
1230 dictRedisObjectDestructor, /* key destructor */
1231 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1232 };
1233
1234 /* Db->dict, keys are sds strings, vals are Redis objects. */
1235 static dictType dbDictType = {
1236 dictSdsHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictSdsKeyCompare, /* key compare */
1240 dictSdsDestructor, /* key destructor */
1241 dictRedisObjectDestructor /* val destructor */
1242 };
1243
1244 /* Db->expires */
1245 static dictType keyptrDictType = {
1246 dictSdsHash, /* hash function */
1247 NULL, /* key dup */
1248 NULL, /* val dup */
1249 dictSdsKeyCompare, /* key compare */
1250 dictSdsDestructor, /* key destructor */
1251 NULL /* val destructor */
1252 };
1253
1254 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1255 static dictType hashDictType = {
1256 dictEncObjHash, /* hash function */
1257 NULL, /* key dup */
1258 NULL, /* val dup */
1259 dictEncObjKeyCompare, /* key compare */
1260 dictRedisObjectDestructor, /* key destructor */
1261 dictRedisObjectDestructor /* val destructor */
1262 };
1263
1264 /* Keylist hash table type has unencoded redis objects as keys and
1265 * lists as values. It's used for blocking operations (BLPOP) and to
1266 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1267 static dictType keylistDictType = {
1268 dictObjHash, /* hash function */
1269 NULL, /* key dup */
1270 NULL, /* val dup */
1271 dictObjKeyCompare, /* key compare */
1272 dictRedisObjectDestructor, /* key destructor */
1273 dictListDestructor /* val destructor */
1274 };
1275
1276 static void version();
1277
1278 /* ========================= Random utility functions ======================= */
1279
1280 /* Redis generally does not try to recover from out of memory conditions
1281 * when allocating objects or strings, it is not clear if it will be possible
1282 * to report this condition to the client since the networking layer itself
1283 * is based on heap allocation for send buffers, so we simply abort.
1284 * At least the code will be simpler to read... */
1285 static void oom(const char *msg) {
1286 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1287 sleep(1);
1288 abort();
1289 }
1290
1291 /* ====================== Redis server networking stuff ===================== */
1292 static void closeTimedoutClients(void) {
1293 redisClient *c;
1294 listNode *ln;
1295 time_t now = time(NULL);
1296 listIter li;
1297
1298 listRewind(server.clients,&li);
1299 while ((ln = listNext(&li)) != NULL) {
1300 c = listNodeValue(ln);
1301 if (server.maxidletime &&
1302 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1303 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1304 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1305 listLength(c->pubsub_patterns) == 0 &&
1306 (now - c->lastinteraction > server.maxidletime))
1307 {
1308 redisLog(REDIS_VERBOSE,"Closing idle client");
1309 freeClient(c);
1310 } else if (c->flags & REDIS_BLOCKED) {
1311 if (c->blockingto != 0 && c->blockingto < now) {
1312 addReply(c,shared.nullmultibulk);
1313 unblockClientWaitingData(c);
1314 }
1315 }
1316 }
1317 }
1318
1319 static int htNeedsResize(dict *dict) {
1320 long long size, used;
1321
1322 size = dictSlots(dict);
1323 used = dictSize(dict);
1324 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1325 (used*100/size < REDIS_HT_MINFILL));
1326 }
1327
1328 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1329 * we resize the hash table to save memory */
1330 static void tryResizeHashTables(void) {
1331 int j;
1332
1333 for (j = 0; j < server.dbnum; j++) {
1334 if (htNeedsResize(server.db[j].dict))
1335 dictResize(server.db[j].dict);
1336 if (htNeedsResize(server.db[j].expires))
1337 dictResize(server.db[j].expires);
1338 }
1339 }
1340
1341 /* Our hash table implementation performs rehashing incrementally while
1342 * we write/read from the hash table. Still if the server is idle, the hash
1343 * table will use two tables for a long time. So we try to use 1 millisecond
1344 * of CPU time at every serverCron() loop in order to rehash some key. */
1345 static void incrementallyRehash(void) {
1346 int j;
1347
1348 for (j = 0; j < server.dbnum; j++) {
1349 if (dictIsRehashing(server.db[j].dict)) {
1350 dictRehashMilliseconds(server.db[j].dict,1);
1351 break; /* already used our millisecond for this loop... */
1352 }
1353 }
1354 }
1355
1356 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1357 void backgroundSaveDoneHandler(int statloc) {
1358 int exitcode = WEXITSTATUS(statloc);
1359 int bysignal = WIFSIGNALED(statloc);
1360
1361 if (!bysignal && exitcode == 0) {
1362 redisLog(REDIS_NOTICE,
1363 "Background saving terminated with success");
1364 server.dirty = 0;
1365 server.lastsave = time(NULL);
1366 } else if (!bysignal && exitcode != 0) {
1367 redisLog(REDIS_WARNING, "Background saving error");
1368 } else {
1369 redisLog(REDIS_WARNING,
1370 "Background saving terminated by signal %d", WTERMSIG(statloc));
1371 rdbRemoveTempFile(server.bgsavechildpid);
1372 }
1373 server.bgsavechildpid = -1;
1374 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1375 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1376 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1377 }
1378
1379 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1380 * Handle this. */
1381 void backgroundRewriteDoneHandler(int statloc) {
1382 int exitcode = WEXITSTATUS(statloc);
1383 int bysignal = WIFSIGNALED(statloc);
1384
1385 if (!bysignal && exitcode == 0) {
1386 int fd;
1387 char tmpfile[256];
1388
1389 redisLog(REDIS_NOTICE,
1390 "Background append only file rewriting terminated with success");
1391 /* Now it's time to flush the differences accumulated by the parent */
1392 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1393 fd = open(tmpfile,O_WRONLY|O_APPEND);
1394 if (fd == -1) {
1395 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1396 goto cleanup;
1397 }
1398 /* Flush our data... */
1399 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1400 (signed) sdslen(server.bgrewritebuf)) {
1401 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1402 close(fd);
1403 goto cleanup;
1404 }
1405 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1406 /* Now our work is to rename the temp file into the stable file. And
1407 * switch the file descriptor used by the server for append only. */
1408 if (rename(tmpfile,server.appendfilename) == -1) {
1409 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1410 close(fd);
1411 goto cleanup;
1412 }
1413 /* Mission completed... almost */
1414 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1415 if (server.appendfd != -1) {
1416 /* If append only is actually enabled... */
1417 close(server.appendfd);
1418 server.appendfd = fd;
1419 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1420 server.appendseldb = -1; /* Make sure it will issue SELECT */
1421 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1422 } else {
1423 /* If append only is disabled we just generate a dump in this
1424 * format. Why not? */
1425 close(fd);
1426 }
1427 } else if (!bysignal && exitcode != 0) {
1428 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1429 } else {
1430 redisLog(REDIS_WARNING,
1431 "Background append only file rewriting terminated by signal %d",
1432 WTERMSIG(statloc));
1433 }
1434 cleanup:
1435 sdsfree(server.bgrewritebuf);
1436 server.bgrewritebuf = sdsempty();
1437 aofRemoveTempFile(server.bgrewritechildpid);
1438 server.bgrewritechildpid = -1;
1439 }
1440
1441 /* This function is called once a background process of some kind terminates,
1442 * as we want to avoid resizing the hash tables when there is a child in order
1443 * to play well with copy-on-write (otherwise when a resize happens lots of
1444 * memory pages are copied). The goal of this function is to update the ability
1445 * for dict.c to resize the hash tables accordingly to the fact we have o not
1446 * running childs. */
1447 static void updateDictResizePolicy(void) {
1448 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1449 dictEnableResize();
1450 else
1451 dictDisableResize();
1452 }
1453
1454 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1455 int j, loops = server.cronloops++;
1456 REDIS_NOTUSED(eventLoop);
1457 REDIS_NOTUSED(id);
1458 REDIS_NOTUSED(clientData);
1459
1460 /* We take a cached value of the unix time in the global state because
1461 * with virtual memory and aging there is to store the current time
1462 * in objects at every object access, and accuracy is not needed.
1463 * To access a global var is faster than calling time(NULL) */
1464 server.unixtime = time(NULL);
1465 /* We have just 21 bits per object for LRU information.
1466 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1467 *
1468 * When we need to select what object to swap, we compute the minimum
1469 * time distance between the current lruclock and the object last access
1470 * lruclock info. Even if clocks will wrap on overflow, there is
1471 * the interesting property that we are sure that at least
1472 * ABS(A-B) minutes passed between current time and timestamp B.
1473 *
1474 * This is not precise but we don't need at all precision, but just
1475 * something statistically reasonable.
1476 */
1477 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1478
1479 /* We received a SIGTERM, shutting down here in a safe way, as it is
1480 * not ok doing so inside the signal handler. */
1481 if (server.shutdown_asap) {
1482 if (prepareForShutdown() == REDIS_OK) exit(0);
1483 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1484 }
1485
1486 /* Show some info about non-empty databases */
1487 for (j = 0; j < server.dbnum; j++) {
1488 long long size, used, vkeys;
1489
1490 size = dictSlots(server.db[j].dict);
1491 used = dictSize(server.db[j].dict);
1492 vkeys = dictSize(server.db[j].expires);
1493 if (!(loops % 50) && (used || vkeys)) {
1494 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1495 /* dictPrintStats(server.dict); */
1496 }
1497 }
1498
1499 /* We don't want to resize the hash tables while a bacground saving
1500 * is in progress: the saving child is created using fork() that is
1501 * implemented with a copy-on-write semantic in most modern systems, so
1502 * if we resize the HT while there is the saving child at work actually
1503 * a lot of memory movements in the parent will cause a lot of pages
1504 * copied. */
1505 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1506 if (!(loops % 10)) tryResizeHashTables();
1507 if (server.activerehashing) incrementallyRehash();
1508 }
1509
1510 /* Show information about connected clients */
1511 if (!(loops % 50)) {
1512 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1513 listLength(server.clients)-listLength(server.slaves),
1514 listLength(server.slaves),
1515 zmalloc_used_memory());
1516 }
1517
1518 /* Close connections of timedout clients */
1519 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1520 closeTimedoutClients();
1521
1522 /* Check if a background saving or AOF rewrite in progress terminated */
1523 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1524 int statloc;
1525 pid_t pid;
1526
1527 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1528 if (pid == server.bgsavechildpid) {
1529 backgroundSaveDoneHandler(statloc);
1530 } else {
1531 backgroundRewriteDoneHandler(statloc);
1532 }
1533 updateDictResizePolicy();
1534 }
1535 } else {
1536 /* If there is not a background saving in progress check if
1537 * we have to save now */
1538 time_t now = time(NULL);
1539 for (j = 0; j < server.saveparamslen; j++) {
1540 struct saveparam *sp = server.saveparams+j;
1541
1542 if (server.dirty >= sp->changes &&
1543 now-server.lastsave > sp->seconds) {
1544 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1545 sp->changes, sp->seconds);
1546 rdbSaveBackground(server.dbfilename);
1547 break;
1548 }
1549 }
1550 }
1551
1552 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1553 * will use few CPU cycles if there are few expiring keys, otherwise
1554 * it will get more aggressive to avoid that too much memory is used by
1555 * keys that can be removed from the keyspace. */
1556 for (j = 0; j < server.dbnum; j++) {
1557 int expired;
1558 redisDb *db = server.db+j;
1559
1560 /* Continue to expire if at the end of the cycle more than 25%
1561 * of the keys were expired. */
1562 do {
1563 long num = dictSize(db->expires);
1564 time_t now = time(NULL);
1565
1566 expired = 0;
1567 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1568 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1569 while (num--) {
1570 dictEntry *de;
1571 time_t t;
1572
1573 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1574 t = (time_t) dictGetEntryVal(de);
1575 if (now > t) {
1576 sds key = dictGetEntryKey(de);
1577 robj *keyobj = createStringObject(key,sdslen(key));
1578
1579 dbDelete(db,keyobj);
1580 decrRefCount(keyobj);
1581 expired++;
1582 server.stat_expiredkeys++;
1583 }
1584 }
1585 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1586 }
1587
1588 /* Swap a few keys on disk if we are over the memory limit and VM
1589 * is enbled. Try to free objects from the free list first. */
1590 if (vmCanSwapOut()) {
1591 while (server.vm_enabled && zmalloc_used_memory() >
1592 server.vm_max_memory)
1593 {
1594 int retval;
1595
1596 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1597 retval = (server.vm_max_threads == 0) ?
1598 vmSwapOneObjectBlocking() :
1599 vmSwapOneObjectThreaded();
1600 if (retval == REDIS_ERR && !(loops % 300) &&
1601 zmalloc_used_memory() >
1602 (server.vm_max_memory+server.vm_max_memory/10))
1603 {
1604 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1605 }
1606 /* Note that when using threade I/O we free just one object,
1607 * because anyway when the I/O thread in charge to swap this
1608 * object out will finish, the handler of completed jobs
1609 * will try to swap more objects if we are still out of memory. */
1610 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1611 }
1612 }
1613
1614 /* Check if we should connect to a MASTER */
1615 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1616 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1617 if (syncWithMaster() == REDIS_OK) {
1618 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1619 if (server.appendonly) rewriteAppendOnlyFileBackground();
1620 }
1621 }
1622 return 100;
1623 }
1624
1625 /* This function gets called every time Redis is entering the
1626 * main loop of the event driven library, that is, before to sleep
1627 * for ready file descriptors. */
1628 static void beforeSleep(struct aeEventLoop *eventLoop) {
1629 REDIS_NOTUSED(eventLoop);
1630
1631 /* Awake clients that got all the swapped keys they requested */
1632 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1633 listIter li;
1634 listNode *ln;
1635
1636 listRewind(server.io_ready_clients,&li);
1637 while((ln = listNext(&li))) {
1638 redisClient *c = ln->value;
1639 struct redisCommand *cmd;
1640
1641 /* Resume the client. */
1642 listDelNode(server.io_ready_clients,ln);
1643 c->flags &= (~REDIS_IO_WAIT);
1644 server.vm_blocked_clients--;
1645 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1646 readQueryFromClient, c);
1647 cmd = lookupCommand(c->argv[0]->ptr);
1648 assert(cmd != NULL);
1649 call(c,cmd);
1650 resetClient(c);
1651 /* There may be more data to process in the input buffer. */
1652 if (c->querybuf && sdslen(c->querybuf) > 0)
1653 processInputBuffer(c);
1654 }
1655 }
1656 /* Write the AOF buffer on disk */
1657 flushAppendOnlyFile();
1658 }
1659
1660 static void createSharedObjects(void) {
1661 int j;
1662
1663 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1664 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1665 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1666 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1667 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1668 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1669 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1670 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1671 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1672 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1673 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1674 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1675 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1676 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1677 "-ERR no such key\r\n"));
1678 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1679 "-ERR syntax error\r\n"));
1680 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1681 "-ERR source and destination objects are the same\r\n"));
1682 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1683 "-ERR index out of range\r\n"));
1684 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1685 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1686 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1687 shared.select0 = createStringObject("select 0\r\n",10);
1688 shared.select1 = createStringObject("select 1\r\n",10);
1689 shared.select2 = createStringObject("select 2\r\n",10);
1690 shared.select3 = createStringObject("select 3\r\n",10);
1691 shared.select4 = createStringObject("select 4\r\n",10);
1692 shared.select5 = createStringObject("select 5\r\n",10);
1693 shared.select6 = createStringObject("select 6\r\n",10);
1694 shared.select7 = createStringObject("select 7\r\n",10);
1695 shared.select8 = createStringObject("select 8\r\n",10);
1696 shared.select9 = createStringObject("select 9\r\n",10);
1697 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1698 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1699 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1700 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1701 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1702 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1703 shared.mbulk3 = createStringObject("*3\r\n",4);
1704 shared.mbulk4 = createStringObject("*4\r\n",4);
1705 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1706 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1707 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1708 }
1709 }
1710
1711 static void appendServerSaveParams(time_t seconds, int changes) {
1712 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1713 server.saveparams[server.saveparamslen].seconds = seconds;
1714 server.saveparams[server.saveparamslen].changes = changes;
1715 server.saveparamslen++;
1716 }
1717
1718 static void resetServerSaveParams() {
1719 zfree(server.saveparams);
1720 server.saveparams = NULL;
1721 server.saveparamslen = 0;
1722 }
1723
1724 static void initServerConfig() {
1725 server.dbnum = REDIS_DEFAULT_DBNUM;
1726 server.port = REDIS_SERVERPORT;
1727 server.verbosity = REDIS_VERBOSE;
1728 server.maxidletime = REDIS_MAXIDLETIME;
1729 server.saveparams = NULL;
1730 server.logfile = NULL; /* NULL = log on standard output */
1731 server.bindaddr = NULL;
1732 server.glueoutputbuf = 1;
1733 server.daemonize = 0;
1734 server.appendonly = 0;
1735 server.appendfsync = APPENDFSYNC_EVERYSEC;
1736 server.no_appendfsync_on_rewrite = 0;
1737 server.lastfsync = time(NULL);
1738 server.appendfd = -1;
1739 server.appendseldb = -1; /* Make sure the first time will not match */
1740 server.pidfile = zstrdup("/var/run/redis.pid");
1741 server.dbfilename = zstrdup("dump.rdb");
1742 server.appendfilename = zstrdup("appendonly.aof");
1743 server.requirepass = NULL;
1744 server.rdbcompression = 1;
1745 server.activerehashing = 1;
1746 server.maxclients = 0;
1747 server.blpop_blocked_clients = 0;
1748 server.maxmemory = 0;
1749 server.vm_enabled = 0;
1750 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1751 server.vm_page_size = 256; /* 256 bytes per page */
1752 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1753 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1754 server.vm_max_threads = 4;
1755 server.vm_blocked_clients = 0;
1756 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1757 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1758 server.shutdown_asap = 0;
1759
1760 resetServerSaveParams();
1761
1762 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1763 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1764 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1765 /* Replication related */
1766 server.isslave = 0;
1767 server.masterauth = NULL;
1768 server.masterhost = NULL;
1769 server.masterport = 6379;
1770 server.master = NULL;
1771 server.replstate = REDIS_REPL_NONE;
1772
1773 /* Double constants initialization */
1774 R_Zero = 0.0;
1775 R_PosInf = 1.0/R_Zero;
1776 R_NegInf = -1.0/R_Zero;
1777 R_Nan = R_Zero/R_Zero;
1778 }
1779
1780 static void initServer() {
1781 int j;
1782
1783 signal(SIGHUP, SIG_IGN);
1784 signal(SIGPIPE, SIG_IGN);
1785 setupSigSegvAction();
1786
1787 server.devnull = fopen("/dev/null","w");
1788 if (server.devnull == NULL) {
1789 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1790 exit(1);
1791 }
1792 server.clients = listCreate();
1793 server.slaves = listCreate();
1794 server.monitors = listCreate();
1795 server.objfreelist = listCreate();
1796 createSharedObjects();
1797 server.el = aeCreateEventLoop();
1798 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1799 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1800 if (server.fd == -1) {
1801 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1802 exit(1);
1803 }
1804 for (j = 0; j < server.dbnum; j++) {
1805 server.db[j].dict = dictCreate(&dbDictType,NULL);
1806 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1807 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1808 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1809 if (server.vm_enabled)
1810 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1811 server.db[j].id = j;
1812 }
1813 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1814 server.pubsub_patterns = listCreate();
1815 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1816 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1817 server.cronloops = 0;
1818 server.bgsavechildpid = -1;
1819 server.bgrewritechildpid = -1;
1820 server.bgrewritebuf = sdsempty();
1821 server.aofbuf = sdsempty();
1822 server.lastsave = time(NULL);
1823 server.dirty = 0;
1824 server.stat_numcommands = 0;
1825 server.stat_numconnections = 0;
1826 server.stat_expiredkeys = 0;
1827 server.stat_starttime = time(NULL);
1828 server.unixtime = time(NULL);
1829 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1830 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1831 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1832
1833 if (server.appendonly) {
1834 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1835 if (server.appendfd == -1) {
1836 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1837 strerror(errno));
1838 exit(1);
1839 }
1840 }
1841
1842 if (server.vm_enabled) vmInit();
1843 }
1844
1845 /* Empty the whole database */
1846 static long long emptyDb() {
1847 int j;
1848 long long removed = 0;
1849
1850 for (j = 0; j < server.dbnum; j++) {
1851 removed += dictSize(server.db[j].dict);
1852 dictEmpty(server.db[j].dict);
1853 dictEmpty(server.db[j].expires);
1854 }
1855 return removed;
1856 }
1857
/* Map a case-insensitive "yes"/"no" string to 1/0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1863
1864 /* I agree, this is a very rudimental way to load a configuration...
1865 will improve later if the config gets more complex */
1866 static void loadServerConfig(char *filename) {
1867 FILE *fp;
1868 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1869 int linenum = 0;
1870 sds line = NULL;
1871
1872 if (filename[0] == '-' && filename[1] == '\0')
1873 fp = stdin;
1874 else {
1875 if ((fp = fopen(filename,"r")) == NULL) {
1876 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1877 exit(1);
1878 }
1879 }
1880
1881 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1882 sds *argv;
1883 int argc, j;
1884
1885 linenum++;
1886 line = sdsnew(buf);
1887 line = sdstrim(line," \t\r\n");
1888
1889 /* Skip comments and blank lines*/
1890 if (line[0] == '#' || line[0] == '\0') {
1891 sdsfree(line);
1892 continue;
1893 }
1894
1895 /* Split into arguments */
1896 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1897 sdstolower(argv[0]);
1898
1899 /* Execute config directives */
1900 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1901 server.maxidletime = atoi(argv[1]);
1902 if (server.maxidletime < 0) {
1903 err = "Invalid timeout value"; goto loaderr;
1904 }
1905 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1906 server.port = atoi(argv[1]);
1907 if (server.port < 1 || server.port > 65535) {
1908 err = "Invalid port"; goto loaderr;
1909 }
1910 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1911 server.bindaddr = zstrdup(argv[1]);
1912 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1913 int seconds = atoi(argv[1]);
1914 int changes = atoi(argv[2]);
1915 if (seconds < 1 || changes < 0) {
1916 err = "Invalid save parameters"; goto loaderr;
1917 }
1918 appendServerSaveParams(seconds,changes);
1919 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1920 if (chdir(argv[1]) == -1) {
1921 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1922 argv[1], strerror(errno));
1923 exit(1);
1924 }
1925 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1926 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1927 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1928 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1929 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1930 else {
1931 err = "Invalid log level. Must be one of debug, notice, warning";
1932 goto loaderr;
1933 }
1934 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1935 FILE *logfp;
1936
1937 server.logfile = zstrdup(argv[1]);
1938 if (!strcasecmp(server.logfile,"stdout")) {
1939 zfree(server.logfile);
1940 server.logfile = NULL;
1941 }
1942 if (server.logfile) {
1943 /* Test if we are able to open the file. The server will not
1944 * be able to abort just for this problem later... */
1945 logfp = fopen(server.logfile,"a");
1946 if (logfp == NULL) {
1947 err = sdscatprintf(sdsempty(),
1948 "Can't open the log file: %s", strerror(errno));
1949 goto loaderr;
1950 }
1951 fclose(logfp);
1952 }
1953 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1954 server.dbnum = atoi(argv[1]);
1955 if (server.dbnum < 1) {
1956 err = "Invalid number of databases"; goto loaderr;
1957 }
1958 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1959 loadServerConfig(argv[1]);
1960 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1961 server.maxclients = atoi(argv[1]);
1962 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1963 server.maxmemory = memtoll(argv[1],NULL);
1964 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1965 server.masterhost = sdsnew(argv[1]);
1966 server.masterport = atoi(argv[2]);
1967 server.replstate = REDIS_REPL_CONNECT;
1968 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1969 server.masterauth = zstrdup(argv[1]);
1970 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1971 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1972 err = "argument must be 'yes' or 'no'"; goto loaderr;
1973 }
1974 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1975 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1976 err = "argument must be 'yes' or 'no'"; goto loaderr;
1977 }
1978 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1979 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1980 err = "argument must be 'yes' or 'no'"; goto loaderr;
1981 }
1982 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1983 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1984 err = "argument must be 'yes' or 'no'"; goto loaderr;
1985 }
1986 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1987 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1988 err = "argument must be 'yes' or 'no'"; goto loaderr;
1989 }
1990 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1991 zfree(server.appendfilename);
1992 server.appendfilename = zstrdup(argv[1]);
1993 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1994 && argc == 2) {
1995 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1996 err = "argument must be 'yes' or 'no'"; goto loaderr;
1997 }
1998 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1999 if (!strcasecmp(argv[1],"no")) {
2000 server.appendfsync = APPENDFSYNC_NO;
2001 } else if (!strcasecmp(argv[1],"always")) {
2002 server.appendfsync = APPENDFSYNC_ALWAYS;
2003 } else if (!strcasecmp(argv[1],"everysec")) {
2004 server.appendfsync = APPENDFSYNC_EVERYSEC;
2005 } else {
2006 err = "argument must be 'no', 'always' or 'everysec'";
2007 goto loaderr;
2008 }
2009 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
2010 server.requirepass = zstrdup(argv[1]);
2011 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
2012 zfree(server.pidfile);
2013 server.pidfile = zstrdup(argv[1]);
2014 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
2015 zfree(server.dbfilename);
2016 server.dbfilename = zstrdup(argv[1]);
2017 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2018 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2019 err = "argument must be 'yes' or 'no'"; goto loaderr;
2020 }
2021 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2022 zfree(server.vm_swap_file);
2023 server.vm_swap_file = zstrdup(argv[1]);
2024 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2025 server.vm_max_memory = memtoll(argv[1],NULL);
2026 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2027 server.vm_page_size = memtoll(argv[1], NULL);
2028 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2029 server.vm_pages = memtoll(argv[1], NULL);
2030 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2031 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2032 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2033 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2034 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2035 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2036 } else {
2037 err = "Bad directive or wrong number of arguments"; goto loaderr;
2038 }
2039 for (j = 0; j < argc; j++)
2040 sdsfree(argv[j]);
2041 zfree(argv);
2042 sdsfree(line);
2043 }
2044 if (fp != stdin) fclose(fp);
2045 return;
2046
2047 loaderr:
2048 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2049 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2050 fprintf(stderr, ">>> '%s'\n", line);
2051 fprintf(stderr, "%s\n", err);
2052 exit(1);
2053 }
2054
2055 static void freeClientArgv(redisClient *c) {
2056 int j;
2057
2058 for (j = 0; j < c->argc; j++)
2059 decrRefCount(c->argv[j]);
2060 for (j = 0; j < c->mbargc; j++)
2061 decrRefCount(c->mbargv[j]);
2062 c->argc = 0;
2063 c->mbargc = 0;
2064 }
2065
2066 static void freeClient(redisClient *c) {
2067 listNode *ln;
2068
2069 /* Note that if the client we are freeing is blocked into a blocking
2070 * call, we have to set querybuf to NULL *before* to call
2071 * unblockClientWaitingData() to avoid processInputBuffer() will get
2072 * called. Also it is important to remove the file events after
2073 * this, because this call adds the READABLE event. */
2074 sdsfree(c->querybuf);
2075 c->querybuf = NULL;
2076 if (c->flags & REDIS_BLOCKED)
2077 unblockClientWaitingData(c);
2078
2079 /* UNWATCH all the keys */
2080 unwatchAllKeys(c);
2081 listRelease(c->watched_keys);
2082 /* Unsubscribe from all the pubsub channels */
2083 pubsubUnsubscribeAllChannels(c,0);
2084 pubsubUnsubscribeAllPatterns(c,0);
2085 dictRelease(c->pubsub_channels);
2086 listRelease(c->pubsub_patterns);
2087 /* Obvious cleanup */
2088 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2089 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2090 listRelease(c->reply);
2091 freeClientArgv(c);
2092 close(c->fd);
2093 /* Remove from the list of clients */
2094 ln = listSearchKey(server.clients,c);
2095 redisAssert(ln != NULL);
2096 listDelNode(server.clients,ln);
2097 /* Remove from the list of clients that are now ready to be restarted
2098 * after waiting for swapped keys */
2099 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2100 ln = listSearchKey(server.io_ready_clients,c);
2101 if (ln) {
2102 listDelNode(server.io_ready_clients,ln);
2103 server.vm_blocked_clients--;
2104 }
2105 }
2106 /* Remove from the list of clients waiting for swapped keys */
2107 while (server.vm_enabled && listLength(c->io_keys)) {
2108 ln = listFirst(c->io_keys);
2109 dontWaitForSwappedKey(c,ln->value);
2110 }
2111 listRelease(c->io_keys);
2112 /* Master/slave cleanup */
2113 if (c->flags & REDIS_SLAVE) {
2114 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2115 close(c->repldbfd);
2116 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2117 ln = listSearchKey(l,c);
2118 redisAssert(ln != NULL);
2119 listDelNode(l,ln);
2120 }
2121 if (c->flags & REDIS_MASTER) {
2122 server.master = NULL;
2123 server.replstate = REDIS_REPL_CONNECT;
2124 }
2125 /* Release memory */
2126 zfree(c->argv);
2127 zfree(c->mbargv);
2128 freeClientMultiState(c);
2129 zfree(c);
2130 }
2131
2132 #define GLUEREPLY_UP_TO (1024)
2133 static void glueReplyBuffersIfNeeded(redisClient *c) {
2134 int copylen = 0;
2135 char buf[GLUEREPLY_UP_TO];
2136 listNode *ln;
2137 listIter li;
2138 robj *o;
2139
2140 listRewind(c->reply,&li);
2141 while((ln = listNext(&li))) {
2142 int objlen;
2143
2144 o = ln->value;
2145 objlen = sdslen(o->ptr);
2146 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2147 memcpy(buf+copylen,o->ptr,objlen);
2148 copylen += objlen;
2149 listDelNode(c->reply,ln);
2150 } else {
2151 if (copylen == 0) return;
2152 break;
2153 }
2154 }
2155 /* Now the output buffer is empty, add the new single element */
2156 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2157 listAddNodeHead(c->reply,o);
2158 }
2159
2160 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2161 redisClient *c = privdata;
2162 int nwritten = 0, totwritten = 0, objlen;
2163 robj *o;
2164 REDIS_NOTUSED(el);
2165 REDIS_NOTUSED(mask);
2166
2167 /* Use writev() if we have enough buffers to send */
2168 if (!server.glueoutputbuf &&
2169 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2170 !(c->flags & REDIS_MASTER))
2171 {
2172 sendReplyToClientWritev(el, fd, privdata, mask);
2173 return;
2174 }
2175
2176 while(listLength(c->reply)) {
2177 if (server.glueoutputbuf && listLength(c->reply) > 1)
2178 glueReplyBuffersIfNeeded(c);
2179
2180 o = listNodeValue(listFirst(c->reply));
2181 objlen = sdslen(o->ptr);
2182
2183 if (objlen == 0) {
2184 listDelNode(c->reply,listFirst(c->reply));
2185 continue;
2186 }
2187
2188 if (c->flags & REDIS_MASTER) {
2189 /* Don't reply to a master */
2190 nwritten = objlen - c->sentlen;
2191 } else {
2192 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2193 if (nwritten <= 0) break;
2194 }
2195 c->sentlen += nwritten;
2196 totwritten += nwritten;
2197 /* If we fully sent the object on head go to the next one */
2198 if (c->sentlen == objlen) {
2199 listDelNode(c->reply,listFirst(c->reply));
2200 c->sentlen = 0;
2201 }
2202 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2203 * bytes, in a single threaded server it's a good idea to serve
2204 * other clients as well, even if a very large request comes from
2205 * super fast link that is always able to accept data (in real world
2206 * scenario think about 'KEYS *' against the loopback interfae) */
2207 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2208 }
2209 if (nwritten == -1) {
2210 if (errno == EAGAIN) {
2211 nwritten = 0;
2212 } else {
2213 redisLog(REDIS_VERBOSE,
2214 "Error writing to client: %s", strerror(errno));
2215 freeClient(c);
2216 return;
2217 }
2218 }
2219 if (totwritten > 0) c->lastinteraction = time(NULL);
2220 if (listLength(c->reply) == 0) {
2221 c->sentlen = 0;
2222 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2223 }
2224 }
2225
2226 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2227 {
2228 redisClient *c = privdata;
2229 int nwritten = 0, totwritten = 0, objlen, willwrite;
2230 robj *o;
2231 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2232 int offset, ion = 0;
2233 REDIS_NOTUSED(el);
2234 REDIS_NOTUSED(mask);
2235
2236 listNode *node;
2237 while (listLength(c->reply)) {
2238 offset = c->sentlen;
2239 ion = 0;
2240 willwrite = 0;
2241
2242 /* fill-in the iov[] array */
2243 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2244 o = listNodeValue(node);
2245 objlen = sdslen(o->ptr);
2246
2247 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2248 break;
2249
2250 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2251 break; /* no more iovecs */
2252
2253 iov[ion].iov_base = ((char*)o->ptr) + offset;
2254 iov[ion].iov_len = objlen - offset;
2255 willwrite += objlen - offset;
2256 offset = 0; /* just for the first item */
2257 ion++;
2258 }
2259
2260 if(willwrite == 0)
2261 break;
2262
2263 /* write all collected blocks at once */
2264 if((nwritten = writev(fd, iov, ion)) < 0) {
2265 if (errno != EAGAIN) {
2266 redisLog(REDIS_VERBOSE,
2267 "Error writing to client: %s", strerror(errno));
2268 freeClient(c);
2269 return;
2270 }
2271 break;
2272 }
2273
2274 totwritten += nwritten;
2275 offset = c->sentlen;
2276
2277 /* remove written robjs from c->reply */
2278 while (nwritten && listLength(c->reply)) {
2279 o = listNodeValue(listFirst(c->reply));
2280 objlen = sdslen(o->ptr);
2281
2282 if(nwritten >= objlen - offset) {
2283 listDelNode(c->reply, listFirst(c->reply));
2284 nwritten -= objlen - offset;
2285 c->sentlen = 0;
2286 } else {
2287 /* partial write */
2288 c->sentlen += nwritten;
2289 break;
2290 }
2291 offset = 0;
2292 }
2293 }
2294
2295 if (totwritten > 0)
2296 c->lastinteraction = time(NULL);
2297
2298 if (listLength(c->reply) == 0) {
2299 c->sentlen = 0;
2300 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2301 }
2302 }
2303
2304 static int qsortRedisCommands(const void *r1, const void *r2) {
2305 return strcasecmp(
2306 ((struct redisCommand*)r1)->name,
2307 ((struct redisCommand*)r2)->name);
2308 }
2309
2310 static void sortCommandTable() {
2311 /* Copy and sort the read-only version of the command table */
2312 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2313 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2314 qsort(commandTable,
2315 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2316 sizeof(struct redisCommand),qsortRedisCommands);
2317 }
2318
2319 static struct redisCommand *lookupCommand(char *name) {
2320 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2321 return bsearch(
2322 &tmp,
2323 commandTable,
2324 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2325 sizeof(struct redisCommand),
2326 qsortRedisCommands);
2327 }
2328
2329 /* resetClient prepare the client to process the next command */
2330 static void resetClient(redisClient *c) {
2331 freeClientArgv(c);
2332 c->bulklen = -1;
2333 c->multibulk = 0;
2334 }
2335
2336 /* Call() is the core of Redis execution of a command */
2337 static void call(redisClient *c, struct redisCommand *cmd) {
2338 long long dirty;
2339
2340 dirty = server.dirty;
2341 cmd->proc(c);
2342 dirty = server.dirty-dirty;
2343
2344 if (server.appendonly && dirty)
2345 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2346 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2347 listLength(server.slaves))
2348 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2349 if (listLength(server.monitors))
2350 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2351 server.stat_numcommands++;
2352 }
2353
2354 /* If this function gets called we already read a whole
2355 * command, argments are in the client argv/argc fields.
2356 * processCommand() execute the command or prepare the
2357 * server for a bulk read from the client.
2358 *
2359 * If 1 is returned the client is still alive and valid and
2360 * and other operations can be performed by the caller. Otherwise
2361 * if 0 is returned the client was destroied (i.e. after QUIT). */
2362 static int processCommand(redisClient *c) {
2363 struct redisCommand *cmd;
2364
2365 /* Free some memory if needed (maxmemory setting) */
2366 if (server.maxmemory) freeMemoryIfNeeded();
2367
2368 /* Handle the multi bulk command type. This is an alternative protocol
2369 * supported by Redis in order to receive commands that are composed of
2370 * multiple binary-safe "bulk" arguments. The latency of processing is
2371 * a bit higher but this allows things like multi-sets, so if this
2372 * protocol is used only for MSET and similar commands this is a big win. */
2373 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2374 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2375 if (c->multibulk <= 0) {
2376 resetClient(c);
2377 return 1;
2378 } else {
2379 decrRefCount(c->argv[c->argc-1]);
2380 c->argc--;
2381 return 1;
2382 }
2383 } else if (c->multibulk) {
2384 if (c->bulklen == -1) {
2385 if (((char*)c->argv[0]->ptr)[0] != '$') {
2386 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2387 resetClient(c);
2388 return 1;
2389 } else {
2390 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2391 decrRefCount(c->argv[0]);
2392 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2393 c->argc--;
2394 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2395 resetClient(c);
2396 return 1;
2397 }
2398 c->argc--;
2399 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2400 return 1;
2401 }
2402 } else {
2403 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2404 c->mbargv[c->mbargc] = c->argv[0];
2405 c->mbargc++;
2406 c->argc--;
2407 c->multibulk--;
2408 if (c->multibulk == 0) {
2409 robj **auxargv;
2410 int auxargc;
2411
2412 /* Here we need to swap the multi-bulk argc/argv with the
2413 * normal argc/argv of the client structure. */
2414 auxargv = c->argv;
2415 c->argv = c->mbargv;
2416 c->mbargv = auxargv;
2417
2418 auxargc = c->argc;
2419 c->argc = c->mbargc;
2420 c->mbargc = auxargc;
2421
2422 /* We need to set bulklen to something different than -1
2423 * in order for the code below to process the command without
2424 * to try to read the last argument of a bulk command as
2425 * a special argument. */
2426 c->bulklen = 0;
2427 /* continue below and process the command */
2428 } else {
2429 c->bulklen = -1;
2430 return 1;
2431 }
2432 }
2433 }
2434 /* -- end of multi bulk commands processing -- */
2435
2436 /* The QUIT command is handled as a special case. Normal command
2437 * procs are unable to close the client connection safely */
2438 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2439 freeClient(c);
2440 return 0;
2441 }
2442
2443 /* Now lookup the command and check ASAP about trivial error conditions
2444 * such wrong arity, bad command name and so forth. */
2445 cmd = lookupCommand(c->argv[0]->ptr);
2446 if (!cmd) {
2447 addReplySds(c,
2448 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2449 (char*)c->argv[0]->ptr));
2450 resetClient(c);
2451 return 1;
2452 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2453 (c->argc < -cmd->arity)) {
2454 addReplySds(c,
2455 sdscatprintf(sdsempty(),
2456 "-ERR wrong number of arguments for '%s' command\r\n",
2457 cmd->name));
2458 resetClient(c);
2459 return 1;
2460 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2461 /* This is a bulk command, we have to read the last argument yet. */
2462 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2463
2464 decrRefCount(c->argv[c->argc-1]);
2465 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2466 c->argc--;
2467 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2468 resetClient(c);
2469 return 1;
2470 }
2471 c->argc--;
2472 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2473 /* It is possible that the bulk read is already in the
2474 * buffer. Check this condition and handle it accordingly.
2475 * This is just a fast path, alternative to call processInputBuffer().
2476 * It's a good idea since the code is small and this condition
2477 * happens most of the times. */
2478 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2479 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2480 c->argc++;
2481 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2482 } else {
2483 /* Otherwise return... there is to read the last argument
2484 * from the socket. */
2485 return 1;
2486 }
2487 }
2488 /* Let's try to encode the bulk object to save space. */
2489 if (cmd->flags & REDIS_CMD_BULK)
2490 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2491
2492 /* Check if the user is authenticated */
2493 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2494 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2495 resetClient(c);
2496 return 1;
2497 }
2498
2499 /* Handle the maxmemory directive */
2500 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2501 zmalloc_used_memory() > server.maxmemory)
2502 {
2503 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2504 resetClient(c);
2505 return 1;
2506 }
2507
2508 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2509 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2510 &&
2511 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2512 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2513 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2514 resetClient(c);
2515 return 1;
2516 }
2517
2518 /* Exec the command */
2519 if (c->flags & REDIS_MULTI &&
2520 cmd->proc != execCommand && cmd->proc != discardCommand &&
2521 cmd->proc != multiCommand && cmd->proc != watchCommand)
2522 {
2523 queueMultiCommand(c,cmd);
2524 addReply(c,shared.queued);
2525 } else {
2526 if (server.vm_enabled && server.vm_max_threads > 0 &&
2527 blockClientOnSwappedKeys(c,cmd)) return 1;
2528 call(c,cmd);
2529 }
2530
2531 /* Prepare the client for the next command */
2532 resetClient(c);
2533 return 1;
2534 }
2535
2536 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2537 listNode *ln;
2538 listIter li;
2539 int outc = 0, j;
2540 robj **outv;
2541 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2542 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2543 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2544 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2545 robj *lenobj;
2546
2547 if (argc <= REDIS_STATIC_ARGS) {
2548 outv = static_outv;
2549 } else {
2550 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2551 }
2552
2553 lenobj = createObject(REDIS_STRING,
2554 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2555 lenobj->refcount = 0;
2556 outv[outc++] = lenobj;
2557 for (j = 0; j < argc; j++) {
2558 lenobj = createObject(REDIS_STRING,
2559 sdscatprintf(sdsempty(),"$%lu\r\n",
2560 (unsigned long) stringObjectLen(argv[j])));
2561 lenobj->refcount = 0;
2562 outv[outc++] = lenobj;
2563 outv[outc++] = argv[j];
2564 outv[outc++] = shared.crlf;
2565 }
2566
2567 /* Increment all the refcounts at start and decrement at end in order to
2568 * be sure to free objects if there is no slave in a replication state
2569 * able to be feed with commands */
2570 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2571 listRewind(slaves,&li);
2572 while((ln = listNext(&li))) {
2573 redisClient *slave = ln->value;
2574
2575 /* Don't feed slaves that are still waiting for BGSAVE to start */
2576 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2577
2578 /* Feed all the other slaves, MONITORs and so on */
2579 if (slave->slaveseldb != dictid) {
2580 robj *selectcmd;
2581
2582 switch(dictid) {
2583 case 0: selectcmd = shared.select0; break;
2584 case 1: selectcmd = shared.select1; break;
2585 case 2: selectcmd = shared.select2; break;
2586 case 3: selectcmd = shared.select3; break;
2587 case 4: selectcmd = shared.select4; break;
2588 case 5: selectcmd = shared.select5; break;
2589 case 6: selectcmd = shared.select6; break;
2590 case 7: selectcmd = shared.select7; break;
2591 case 8: selectcmd = shared.select8; break;
2592 case 9: selectcmd = shared.select9; break;
2593 default:
2594 selectcmd = createObject(REDIS_STRING,
2595 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2596 selectcmd->refcount = 0;
2597 break;
2598 }
2599 addReply(slave,selectcmd);
2600 slave->slaveseldb = dictid;
2601 }
2602 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2603 }
2604 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2605 if (outv != static_outv) zfree(outv);
2606 }
2607
2608 static sds sdscatrepr(sds s, char *p, size_t len) {
2609 s = sdscatlen(s,"\"",1);
2610 while(len--) {
2611 switch(*p) {
2612 case '\\':
2613 case '"':
2614 s = sdscatprintf(s,"\\%c",*p);
2615 break;
2616 case '\n': s = sdscatlen(s,"\\n",1); break;
2617 case '\r': s = sdscatlen(s,"\\r",1); break;
2618 case '\t': s = sdscatlen(s,"\\t",1); break;
2619 case '\a': s = sdscatlen(s,"\\a",1); break;
2620 case '\b': s = sdscatlen(s,"\\b",1); break;
2621 default:
2622 if (isprint(*p))
2623 s = sdscatprintf(s,"%c",*p);
2624 else
2625 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2626 break;
2627 }
2628 p++;
2629 }
2630 return sdscatlen(s,"\"",1);
2631 }
2632
2633 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2634 listNode *ln;
2635 listIter li;
2636 int j;
2637 sds cmdrepr = sdsnew("+");
2638 robj *cmdobj;
2639 struct timeval tv;
2640
2641 gettimeofday(&tv,NULL);
2642 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2643 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2644
2645 for (j = 0; j < argc; j++) {
2646 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2647 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2648 } else {
2649 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2650 sdslen(argv[j]->ptr));
2651 }
2652 if (j != argc-1)
2653 cmdrepr = sdscatlen(cmdrepr," ",1);
2654 }
2655 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2656 cmdobj = createObject(REDIS_STRING,cmdrepr);
2657
2658 listRewind(monitors,&li);
2659 while((ln = listNext(&li))) {
2660 redisClient *monitor = ln->value;
2661 addReply(monitor,cmdobj);
2662 }
2663 decrRefCount(cmdobj);
2664 }
2665
2666 static void processInputBuffer(redisClient *c) {
2667 again:
2668 /* Before to process the input buffer, make sure the client is not
2669 * waitig for a blocking operation such as BLPOP. Note that the first
2670 * iteration the client is never blocked, otherwise the processInputBuffer
2671 * would not be called at all, but after the execution of the first commands
2672 * in the input buffer the client may be blocked, and the "goto again"
2673 * will try to reiterate. The following line will make it return asap. */
2674 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2675 if (c->bulklen == -1) {
2676 /* Read the first line of the query */
2677 char *p = strchr(c->querybuf,'\n');
2678 size_t querylen;
2679
2680 if (p) {
2681 sds query, *argv;
2682 int argc, j;
2683
2684 query = c->querybuf;
2685 c->querybuf = sdsempty();
2686 querylen = 1+(p-(query));
2687 if (sdslen(query) > querylen) {
2688 /* leave data after the first line of the query in the buffer */
2689 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2690 }
2691 *p = '\0'; /* remove "\n" */
2692 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2693 sdsupdatelen(query);
2694
2695 /* Now we can split the query in arguments */
2696 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2697 sdsfree(query);
2698
2699 if (c->argv) zfree(c->argv);
2700 c->argv = zmalloc(sizeof(robj*)*argc);
2701
2702 for (j = 0; j < argc; j++) {
2703 if (sdslen(argv[j])) {
2704 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2705 c->argc++;
2706 } else {
2707 sdsfree(argv[j]);
2708 }
2709 }
2710 zfree(argv);
2711 if (c->argc) {
2712 /* Execute the command. If the client is still valid
2713 * after processCommand() return and there is something
2714 * on the query buffer try to process the next command. */
2715 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2716 } else {
2717 /* Nothing to process, argc == 0. Just process the query
2718 * buffer if it's not empty or return to the caller */
2719 if (sdslen(c->querybuf)) goto again;
2720 }
2721 return;
2722 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2723 redisLog(REDIS_VERBOSE, "Client protocol error");
2724 freeClient(c);
2725 return;
2726 }
2727 } else {
2728 /* Bulk read handling. Note that if we are at this point
2729 the client already sent a command terminated with a newline,
2730 we are reading the bulk data that is actually the last
2731 argument of the command. */
2732 int qbl = sdslen(c->querybuf);
2733
2734 if (c->bulklen <= qbl) {
2735 /* Copy everything but the final CRLF as final argument */
2736 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2737 c->argc++;
2738 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2739 /* Process the command. If the client is still valid after
2740 * the processing and there is more data in the buffer
2741 * try to parse it. */
2742 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2743 return;
2744 }
2745 }
2746 }
2747
2748 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2749 redisClient *c = (redisClient*) privdata;
2750 char buf[REDIS_IOBUF_LEN];
2751 int nread;
2752 REDIS_NOTUSED(el);
2753 REDIS_NOTUSED(mask);
2754
2755 nread = read(fd, buf, REDIS_IOBUF_LEN);
2756 if (nread == -1) {
2757 if (errno == EAGAIN) {
2758 nread = 0;
2759 } else {
2760 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2761 freeClient(c);
2762 return;
2763 }
2764 } else if (nread == 0) {
2765 redisLog(REDIS_VERBOSE, "Client closed connection");
2766 freeClient(c);
2767 return;
2768 }
2769 if (nread) {
2770 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2771 c->lastinteraction = time(NULL);
2772 } else {
2773 return;
2774 }
2775 processInputBuffer(c);
2776 }
2777
2778 static int selectDb(redisClient *c, int id) {
2779 if (id < 0 || id >= server.dbnum)
2780 return REDIS_ERR;
2781 c->db = &server.db[id];
2782 return REDIS_OK;
2783 }
2784
2785 static void *dupClientReplyValue(void *o) {
2786 incrRefCount((robj*)o);
2787 return o;
2788 }
2789
2790 static int listMatchObjects(void *a, void *b) {
2791 return equalStringObjects(a,b);
2792 }
2793
2794 static redisClient *createClient(int fd) {
2795 redisClient *c = zmalloc(sizeof(*c));
2796
2797 anetNonBlock(NULL,fd);
2798 anetTcpNoDelay(NULL,fd);
2799 if (!c) return NULL;
2800 selectDb(c,0);
2801 c->fd = fd;
2802 c->querybuf = sdsempty();
2803 c->argc = 0;
2804 c->argv = NULL;
2805 c->bulklen = -1;
2806 c->multibulk = 0;
2807 c->mbargc = 0;
2808 c->mbargv = NULL;
2809 c->sentlen = 0;
2810 c->flags = 0;
2811 c->lastinteraction = time(NULL);
2812 c->authenticated = 0;
2813 c->replstate = REDIS_REPL_NONE;
2814 c->reply = listCreate();
2815 listSetFreeMethod(c->reply,decrRefCount);
2816 listSetDupMethod(c->reply,dupClientReplyValue);
2817 c->blocking_keys = NULL;
2818 c->blocking_keys_num = 0;
2819 c->io_keys = listCreate();
2820 c->watched_keys = listCreate();
2821 listSetFreeMethod(c->io_keys,decrRefCount);
2822 c->pubsub_channels = dictCreate(&setDictType,NULL);
2823 c->pubsub_patterns = listCreate();
2824 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2825 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2826 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2827 readQueryFromClient, c) == AE_ERR) {
2828 freeClient(c);
2829 return NULL;
2830 }
2831 listAddNodeTail(server.clients,c);
2832 initClientMultiState(c);
2833 return c;
2834 }
2835
2836 static void addReply(redisClient *c, robj *obj) {
2837 if (listLength(c->reply) == 0 &&
2838 (c->replstate == REDIS_REPL_NONE ||
2839 c->replstate == REDIS_REPL_ONLINE) &&
2840 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2841 sendReplyToClient, c) == AE_ERR) return;
2842
2843 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2844 obj = dupStringObject(obj);
2845 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2846 }
2847 listAddNodeTail(c->reply,getDecodedObject(obj));
2848 }
2849
2850 static void addReplySds(redisClient *c, sds s) {
2851 robj *o = createObject(REDIS_STRING,s);
2852 addReply(c,o);
2853 decrRefCount(o);
2854 }
2855
2856 static void addReplyDouble(redisClient *c, double d) {
2857 char buf[128];
2858
2859 snprintf(buf,sizeof(buf),"%.17g",d);
2860 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2861 (unsigned long) strlen(buf),buf));
2862 }
2863
2864 static void addReplyLongLong(redisClient *c, long long ll) {
2865 char buf[128];
2866 size_t len;
2867
2868 if (ll == 0) {
2869 addReply(c,shared.czero);
2870 return;
2871 } else if (ll == 1) {
2872 addReply(c,shared.cone);
2873 return;
2874 }
2875 buf[0] = ':';
2876 len = ll2string(buf+1,sizeof(buf)-1,ll);
2877 buf[len+1] = '\r';
2878 buf[len+2] = '\n';
2879 addReplySds(c,sdsnewlen(buf,len+3));
2880 }
2881
2882 static void addReplyUlong(redisClient *c, unsigned long ul) {
2883 char buf[128];
2884 size_t len;
2885
2886 if (ul == 0) {
2887 addReply(c,shared.czero);
2888 return;
2889 } else if (ul == 1) {
2890 addReply(c,shared.cone);
2891 return;
2892 }
2893 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2894 addReplySds(c,sdsnewlen(buf,len));
2895 }
2896
2897 static void addReplyBulkLen(redisClient *c, robj *obj) {
2898 size_t len, intlen;
2899 char buf[128];
2900
2901 if (obj->encoding == REDIS_ENCODING_RAW) {
2902 len = sdslen(obj->ptr);
2903 } else {
2904 long n = (long)obj->ptr;
2905
2906 /* Compute how many bytes will take this integer as a radix 10 string */
2907 len = 1;
2908 if (n < 0) {
2909 len++;
2910 n = -n;
2911 }
2912 while((n = n/10) != 0) {
2913 len++;
2914 }
2915 }
2916 buf[0] = '$';
2917 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2918 buf[intlen+1] = '\r';
2919 buf[intlen+2] = '\n';
2920 addReplySds(c,sdsnewlen(buf,intlen+3));
2921 }
2922
2923 static void addReplyBulk(redisClient *c, robj *obj) {
2924 addReplyBulkLen(c,obj);
2925 addReply(c,obj);
2926 addReply(c,shared.crlf);
2927 }
2928
2929 static void addReplyBulkSds(redisClient *c, sds s) {
2930 robj *o = createStringObject(s, sdslen(s));
2931 addReplyBulk(c,o);
2932 decrRefCount(o);
2933 }
2934
2935 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2936 static void addReplyBulkCString(redisClient *c, char *s) {
2937 if (s == NULL) {
2938 addReply(c,shared.nullbulk);
2939 } else {
2940 robj *o = createStringObject(s,strlen(s));
2941 addReplyBulk(c,o);
2942 decrRefCount(o);
2943 }
2944 }
2945
2946 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2947 int cport, cfd;
2948 char cip[128];
2949 redisClient *c;
2950 REDIS_NOTUSED(el);
2951 REDIS_NOTUSED(mask);
2952 REDIS_NOTUSED(privdata);
2953
2954 cfd = anetAccept(server.neterr, fd, cip, &cport);
2955 if (cfd == AE_ERR) {
2956 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2957 return;
2958 }
2959 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2960 if ((c = createClient(cfd)) == NULL) {
2961 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2962 close(cfd); /* May be already closed, just ingore errors */
2963 return;
2964 }
2965 /* If maxclient directive is set and this is one client more... close the
2966 * connection. Note that we create the client instead to check before
2967 * for this condition, since now the socket is already set in nonblocking
2968 * mode and we can send an error for free using the Kernel I/O */
2969 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2970 char *err = "-ERR max number of clients reached\r\n";
2971
2972 /* That's a best effort error message, don't check write errors */
2973 if (write(c->fd,err,strlen(err)) == -1) {
2974 /* Nothing to do, Just to avoid the warning... */
2975 }
2976 freeClient(c);
2977 return;
2978 }
2979 server.stat_numconnections++;
2980 }
2981
2982 /* ======================= Redis objects implementation ===================== */
2983
2984 static robj *createObject(int type, void *ptr) {
2985 robj *o;
2986
2987 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2988 if (listLength(server.objfreelist)) {
2989 listNode *head = listFirst(server.objfreelist);
2990 o = listNodeValue(head);
2991 listDelNode(server.objfreelist,head);
2992 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2993 } else {
2994 if (server.vm_enabled)
2995 pthread_mutex_unlock(&server.obj_freelist_mutex);
2996 o = zmalloc(sizeof(*o));
2997 }
2998 o->type = type;
2999 o->encoding = REDIS_ENCODING_RAW;
3000 o->ptr = ptr;
3001 o->refcount = 1;
3002 if (server.vm_enabled) {
3003 /* Note that this code may run in the context of an I/O thread
3004 * and accessing server.lruclock in theory is an error
3005 * (no locks). But in practice this is safe, and even if we read
3006 * garbage Redis will not fail. */
3007 o->lru = server.lruclock;
3008 o->storage = REDIS_VM_MEMORY;
3009 }
3010 return o;
3011 }
3012
3013 static robj *createStringObject(char *ptr, size_t len) {
3014 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3015 }
3016
3017 static robj *createStringObjectFromLongLong(long long value) {
3018 robj *o;
3019 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3020 incrRefCount(shared.integers[value]);
3021 o = shared.integers[value];
3022 } else {
3023 if (value >= LONG_MIN && value <= LONG_MAX) {
3024 o = createObject(REDIS_STRING, NULL);
3025 o->encoding = REDIS_ENCODING_INT;
3026 o->ptr = (void*)((long)value);
3027 } else {
3028 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3029 }
3030 }
3031 return o;
3032 }
3033
3034 static robj *dupStringObject(robj *o) {
3035 assert(o->encoding == REDIS_ENCODING_RAW);
3036 return createStringObject(o->ptr,sdslen(o->ptr));
3037 }
3038
3039 static robj *createListObject(void) {
3040 list *l = listCreate();
3041 robj *o = createObject(REDIS_LIST,l);
3042 listSetFreeMethod(l,decrRefCount);
3043 o->encoding = REDIS_ENCODING_LIST;
3044 return o;
3045 }
3046
3047 static robj *createZiplistObject(void) {
3048 unsigned char *zl = ziplistNew();
3049 robj *o = createObject(REDIS_LIST,zl);
3050 o->encoding = REDIS_ENCODING_ZIPLIST;
3051 return o;
3052 }
3053
3054 static robj *createSetObject(void) {
3055 dict *d = dictCreate(&setDictType,NULL);
3056 return createObject(REDIS_SET,d);
3057 }
3058
3059 static robj *createHashObject(void) {
3060 /* All the Hashes start as zipmaps. Will be automatically converted
3061 * into hash tables if there are enough elements or big elements
3062 * inside. */
3063 unsigned char *zm = zipmapNew();
3064 robj *o = createObject(REDIS_HASH,zm);
3065 o->encoding = REDIS_ENCODING_ZIPMAP;
3066 return o;
3067 }
3068
3069 static robj *createZsetObject(void) {
3070 zset *zs = zmalloc(sizeof(*zs));
3071
3072 zs->dict = dictCreate(&zsetDictType,NULL);
3073 zs->zsl = zslCreate();
3074 return createObject(REDIS_ZSET,zs);
3075 }
3076
3077 static void freeStringObject(robj *o) {
3078 if (o->encoding == REDIS_ENCODING_RAW) {
3079 sdsfree(o->ptr);
3080 }
3081 }
3082
3083 static void freeListObject(robj *o) {
3084 switch (o->encoding) {
3085 case REDIS_ENCODING_LIST:
3086 listRelease((list*) o->ptr);
3087 break;
3088 case REDIS_ENCODING_ZIPLIST:
3089 zfree(o->ptr);
3090 break;
3091 default:
3092 redisPanic("Unknown list encoding type");
3093 }
3094 }
3095
3096 static void freeSetObject(robj *o) {
3097 dictRelease((dict*) o->ptr);
3098 }
3099
3100 static void freeZsetObject(robj *o) {
3101 zset *zs = o->ptr;
3102
3103 dictRelease(zs->dict);
3104 zslFree(zs->zsl);
3105 zfree(zs);
3106 }
3107
3108 static void freeHashObject(robj *o) {
3109 switch (o->encoding) {
3110 case REDIS_ENCODING_HT:
3111 dictRelease((dict*) o->ptr);
3112 break;
3113 case REDIS_ENCODING_ZIPMAP:
3114 zfree(o->ptr);
3115 break;
3116 default:
3117 redisPanic("Unknown hash encoding type");
3118 break;
3119 }
3120 }
3121
3122 static void incrRefCount(robj *o) {
3123 o->refcount++;
3124 }
3125
3126 static void decrRefCount(void *obj) {
3127 robj *o = obj;
3128
3129 /* Object is a swapped out value, or in the process of being loaded. */
3130 if (server.vm_enabled &&
3131 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3132 {
3133 vmpointer *vp = obj;
3134 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3135 vmMarkPagesFree(vp->page,vp->usedpages);
3136 server.vm_stats_swapped_objects--;
3137 zfree(vp);
3138 return;
3139 }
3140
3141 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3142 /* Object is in memory, or in the process of being swapped out.
3143 *
3144 * If the object is being swapped out, abort the operation on
3145 * decrRefCount even if the refcount does not drop to 0: the object
3146 * is referenced at least two times, as value of the key AND as
3147 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3148 * done but the relevant key was removed in the meantime, the
3149 * complete jobs handler will not find the key about the job and the
3150 * assert will fail. */
3151 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3152 vmCancelThreadedIOJob(o);
3153 if (--(o->refcount) == 0) {
3154 switch(o->type) {
3155 case REDIS_STRING: freeStringObject(o); break;
3156 case REDIS_LIST: freeListObject(o); break;
3157 case REDIS_SET: freeSetObject(o); break;
3158 case REDIS_ZSET: freeZsetObject(o); break;
3159 case REDIS_HASH: freeHashObject(o); break;
3160 default: redisPanic("Unknown object type"); break;
3161 }
3162 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3163 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3164 !listAddNodeHead(server.objfreelist,o))
3165 zfree(o);
3166 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3167 }
3168 }
3169
3170 static int checkType(redisClient *c, robj *o, int type) {
3171 if (o->type != type) {
3172 addReply(c,shared.wrongtypeerr);
3173 return 1;
3174 }
3175 return 0;
3176 }
3177
3178 /* Check if the nul-terminated string 's' can be represented by a long
3179 * (that is, is a number that fits into long without any other space or
3180 * character before or after the digits).
3181 *
3182 * If so, the function returns REDIS_OK and *longval is set to the value
3183 * of the number. Otherwise REDIS_ERR is returned */
3184 static int isStringRepresentableAsLong(sds s, long *longval) {
3185 char buf[32], *endptr;
3186 long value;
3187 int slen;
3188
3189 value = strtol(s, &endptr, 10);
3190 if (endptr[0] != '\0') return REDIS_ERR;
3191 slen = ll2string(buf,32,value);
3192
3193 /* If the number converted back into a string is not identical
3194 * then it's not possible to encode the string as integer */
3195 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3196 if (longval) *longval = value;
3197 return REDIS_OK;
3198 }
3199
3200 /* Try to encode a string object in order to save space */
3201 static robj *tryObjectEncoding(robj *o) {
3202 long value;
3203 sds s = o->ptr;
3204
3205 if (o->encoding != REDIS_ENCODING_RAW)
3206 return o; /* Already encoded */
3207
3208 /* It's not safe to encode shared objects: shared objects can be shared
3209 * everywhere in the "object space" of Redis. Encoded objects can only
3210 * appear as "values" (and not, for instance, as keys) */
3211 if (o->refcount > 1) return o;
3212
3213 /* Currently we try to encode only strings */
3214 redisAssert(o->type == REDIS_STRING);
3215
3216 /* Check if we can represent this string as a long integer */
3217 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3218
3219 /* Ok, this object can be encoded */
3220 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3221 decrRefCount(o);
3222 incrRefCount(shared.integers[value]);
3223 return shared.integers[value];
3224 } else {
3225 o->encoding = REDIS_ENCODING_INT;
3226 sdsfree(o->ptr);
3227 o->ptr = (void*) value;
3228 return o;
3229 }
3230 }
3231
3232 /* Get a decoded version of an encoded object (returned as a new object).
3233 * If the object is already raw-encoded just increment the ref count. */
3234 static robj *getDecodedObject(robj *o) {
3235 robj *dec;
3236
3237 if (o->encoding == REDIS_ENCODING_RAW) {
3238 incrRefCount(o);
3239 return o;
3240 }
3241 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3242 char buf[32];
3243
3244 ll2string(buf,32,(long)o->ptr);
3245 dec = createStringObject(buf,strlen(buf));
3246 return dec;
3247 } else {
3248 redisPanic("Unknown encoding type");
3249 }
3250 }
3251
3252 /* Compare two string objects via strcmp() or alike.
3253 * Note that the objects may be integer-encoded. In such a case we
3254 * use ll2string() to get a string representation of the numbers on the stack
3255 * and compare the strings, it's much faster than calling getDecodedObject().
3256 *
3257 * Important note: if objects are not integer encoded, but binary-safe strings,
3258 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3259 * binary safe. */
3260 static int compareStringObjects(robj *a, robj *b) {
3261 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3262 char bufa[128], bufb[128], *astr, *bstr;
3263 int bothsds = 1;
3264
3265 if (a == b) return 0;
3266 if (a->encoding != REDIS_ENCODING_RAW) {
3267 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3268 astr = bufa;
3269 bothsds = 0;
3270 } else {
3271 astr = a->ptr;
3272 }
3273 if (b->encoding != REDIS_ENCODING_RAW) {
3274 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3275 bstr = bufb;
3276 bothsds = 0;
3277 } else {
3278 bstr = b->ptr;
3279 }
3280 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3281 }
3282
3283 /* Equal string objects return 1 if the two objects are the same from the
3284 * point of view of a string comparison, otherwise 0 is returned. Note that
3285 * this function is faster then checking for (compareStringObject(a,b) == 0)
3286 * because it can perform some more optimization. */
3287 static int equalStringObjects(robj *a, robj *b) {
3288 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3289 return a->ptr == b->ptr;
3290 } else {
3291 return compareStringObjects(a,b) == 0;
3292 }
3293 }
3294
3295 static size_t stringObjectLen(robj *o) {
3296 redisAssert(o->type == REDIS_STRING);
3297 if (o->encoding == REDIS_ENCODING_RAW) {
3298 return sdslen(o->ptr);
3299 } else {
3300 char buf[32];
3301
3302 return ll2string(buf,32,(long)o->ptr);
3303 }
3304 }
3305
3306 static int getDoubleFromObject(robj *o, double *target) {
3307 double value;
3308 char *eptr;
3309
3310 if (o == NULL) {
3311 value = 0;
3312 } else {
3313 redisAssert(o->type == REDIS_STRING);
3314 if (o->encoding == REDIS_ENCODING_RAW) {
3315 value = strtod(o->ptr, &eptr);
3316 if (eptr[0] != '\0') return REDIS_ERR;
3317 } else if (o->encoding == REDIS_ENCODING_INT) {
3318 value = (long)o->ptr;
3319 } else {
3320 redisPanic("Unknown string encoding");
3321 }
3322 }
3323
3324 *target = value;
3325 return REDIS_OK;
3326 }
3327
3328 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3329 double value;
3330 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3331 if (msg != NULL) {
3332 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3333 } else {
3334 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3335 }
3336 return REDIS_ERR;
3337 }
3338
3339 *target = value;
3340 return REDIS_OK;
3341 }
3342
3343 static int getLongLongFromObject(robj *o, long long *target) {
3344 long long value;
3345 char *eptr;
3346
3347 if (o == NULL) {
3348 value = 0;
3349 } else {
3350 redisAssert(o->type == REDIS_STRING);
3351 if (o->encoding == REDIS_ENCODING_RAW) {
3352 value = strtoll(o->ptr, &eptr, 10);
3353 if (eptr[0] != '\0') return REDIS_ERR;
3354 } else if (o->encoding == REDIS_ENCODING_INT) {
3355 value = (long)o->ptr;
3356 } else {
3357 redisPanic("Unknown string encoding");
3358 }
3359 }
3360
3361 *target = value;
3362 return REDIS_OK;
3363 }
3364
3365 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3366 long long value;
3367 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3368 if (msg != NULL) {
3369 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3370 } else {
3371 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3372 }
3373 return REDIS_ERR;
3374 }
3375
3376 *target = value;
3377 return REDIS_OK;
3378 }
3379
3380 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3381 long long value;
3382
3383 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3384 if (value < LONG_MIN || value > LONG_MAX) {
3385 if (msg != NULL) {
3386 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3387 } else {
3388 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3389 }
3390 return REDIS_ERR;
3391 }
3392
3393 *target = value;
3394 return REDIS_OK;
3395 }
3396
3397 /* =========================== Keyspace access API ========================== */
3398
3399 static robj *lookupKey(redisDb *db, robj *key) {
3400 dictEntry *de = dictFind(db->dict,key->ptr);
3401 if (de) {
3402 robj *val = dictGetEntryVal(de);
3403
3404 if (server.vm_enabled) {
3405 if (val->storage == REDIS_VM_MEMORY ||
3406 val->storage == REDIS_VM_SWAPPING)
3407 {
3408 /* If we were swapping the object out, cancel the operation */
3409 if (val->storage == REDIS_VM_SWAPPING)
3410 vmCancelThreadedIOJob(val);
3411 /* Update the access time for the aging algorithm. */
3412 val->lru = server.lruclock;
3413 } else {
3414 int notify = (val->storage == REDIS_VM_LOADING);
3415
3416 /* Our value was swapped on disk. Bring it at home. */
3417 redisAssert(val->type == REDIS_VMPOINTER);
3418 val = vmLoadObject(val);
3419 dictGetEntryVal(de) = val;
3420
3421 /* Clients blocked by the VM subsystem may be waiting for
3422 * this key... */
3423 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3424 }
3425 }
3426 return val;
3427 } else {
3428 return NULL;
3429 }
3430 }
3431
3432 static robj *lookupKeyRead(redisDb *db, robj *key) {
3433 expireIfNeeded(db,key);
3434 return lookupKey(db,key);
3435 }
3436
3437 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3438 deleteIfVolatile(db,key);
3439 touchWatchedKey(db,key);
3440 return lookupKey(db,key);
3441 }
3442
3443 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3444 robj *o = lookupKeyRead(c->db, key);
3445 if (!o) addReply(c,reply);
3446 return o;
3447 }
3448
3449 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3450 robj *o = lookupKeyWrite(c->db, key);
3451 if (!o) addReply(c,reply);
3452 return o;
3453 }
3454
3455 /* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3456 * otherwise REDIS_OK is returned, and the caller should increment the
3457 * refcount of 'val'. */
3458 static int dbAdd(redisDb *db, robj *key, robj *val) {
3459 /* Perform a lookup before adding the key, as we need to copy the
3460 * key value. */
3461 if (dictFind(db->dict, key->ptr) != NULL) {
3462 return REDIS_ERR;
3463 } else {
3464 sds copy = sdsdup(key->ptr);
3465 dictAdd(db->dict, copy, val);
3466 return REDIS_OK;
3467 }
3468 }
3469
3470 /* If the key does not exist, this is just like dbAdd(). Otherwise
3471 * the value associated to the key is replaced with the new one.
3472 *
3473 * On update (key already existed) 0 is returned. Otherwise 1. */
3474 static int dbReplace(redisDb *db, robj *key, robj *val) {
3475 if (dictFind(db->dict,key->ptr) == NULL) {
3476 sds copy = sdsdup(key->ptr);
3477 dictAdd(db->dict, copy, val);
3478 return 1;
3479 } else {
3480 dictReplace(db->dict, key->ptr, val);
3481 return 0;
3482 }
3483 }
3484
3485 static int dbExists(redisDb *db, robj *key) {
3486 return dictFind(db->dict,key->ptr) != NULL;
3487 }
3488
3489 /* Return a random key, in form of a Redis object.
3490 * If there are no keys, NULL is returned.
3491 *
3492 * The function makes sure to return keys not already expired. */
3493 static robj *dbRandomKey(redisDb *db) {
3494 struct dictEntry *de;
3495
3496 while(1) {
3497 sds key;
3498 robj *keyobj;
3499
3500 de = dictGetRandomKey(db->dict);
3501 if (de == NULL) return NULL;
3502
3503 key = dictGetEntryKey(de);
3504 keyobj = createStringObject(key,sdslen(key));
3505 if (dictFind(db->expires,key)) {
3506 if (expireIfNeeded(db,keyobj)) {
3507 decrRefCount(keyobj);
3508 continue; /* search for another key. This expired. */
3509 }
3510 }
3511 return keyobj;
3512 }
3513 }
3514
3515 /* Delete a key, value, and associated expiration entry if any, from the DB */
3516 static int dbDelete(redisDb *db, robj *key) {
3517 int retval;
3518
3519 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3520 retval = dictDelete(db->dict,key->ptr);
3521
3522 return retval == DICT_OK;
3523 }
3524
3525 /*============================ RDB saving/loading =========================== */
3526
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 when the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type, sizeof(type), 1, fp);

    return (written == 1) ? 0 : -1;
}
3531
/* Write 't' to 'fp' as a 4 byte value: the time is first narrowed to an
 * int32_t and then its in-memory representation is written as is.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp, 4, 1, fp) != 0) ? 0 : -1;
}
3537
3538 /* check rdbLoadLen() comments for more info */
3539 static int rdbSaveLen(FILE *fp, uint32_t len) {
3540 unsigned char buf[2];
3541
3542 if (len < (1<<6)) {
3543 /* Save a 6 bit len */
3544 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3545 if (fwrite(buf,1,1,fp) == 0) return -1;
3546 } else if (len < (1<<14)) {
3547 /* Save a 14 bit len */
3548 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3549 buf[1] = len&0xFF;
3550 if (fwrite(buf,2,1,fp) == 0) return -1;
3551 } else {
3552 /* Save a 32 bit len */
3553 buf[0] = (REDIS_RDB_32BITLEN<<6);
3554 if (fwrite(buf,1,1,fp) == 0) return -1;
3555 len = htonl(len);
3556 if (fwrite(&len,4,1,fp) == 0) return -1;
3557 }
3558 return 0;
3559 }
3560
3561 /* Encode 'value' as an integer if possible (if integer will fit the
3562 * supported range). If the function sucessful encoded the integer
3563 * then the (up to 5 bytes) encoded representation is written in the
3564 * string pointed by 'enc' and the length is returned. Otherwise
3565 * 0 is returned. */
3566 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3567 /* Finally check if it fits in our ranges */
3568 if (value >= -(1<<7) && value <= (1<<7)-1) {
3569 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3570 enc[1] = value&0xFF;
3571 return 2;
3572 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3573 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3574 enc[1] = value&0xFF;
3575 enc[2] = (value>>8)&0xFF;
3576 return 3;
3577 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3578 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3579 enc[1] = value&0xFF;
3580 enc[2] = (value>>8)&0xFF;
3581 enc[3] = (value>>16)&0xFF;
3582 enc[4] = (value>>24)&0xFF;
3583 return 5;
3584 } else {
3585 return 0;
3586 }
3587 }
3588
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' / 'len' describe the candidate string; it is treated as a
 * length-delimited buffer and does NOT need to be NUL terminated.
 * 'enc' must have room for 5 bytes.
 *
 * Returns the number of bytes written to 'enc' (see rdbEncodeInteger()),
 * or 0 when the string can't be represented as an encoded integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], tmp[32];

    /* The input is given as pointer + length, so it is not guaranteed to be
     * terminated (rdbSaveRawString() is also called with ziplist and zipmap
     * payloads). strtoll() needs a terminated string: parse a bounded local
     * copy instead of reading past the end of the caller's buffer. Empty
     * strings and strings too long to be the canonical form of a long long
     * can never be encoded. */
    if (len == 0 || len >= sizeof(tmp)) return 0;
    memcpy(tmp,s,len);
    tmp[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(tmp, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3607
3608 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3609 size_t comprlen, outlen;
3610 unsigned char byte;
3611 void *out;
3612
3613 /* We require at least four bytes compression for this to be worth it */
3614 if (len <= 4) return 0;
3615 outlen = len-4;
3616 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3617 comprlen = lzf_compress(s, len, out, outlen);
3618 if (comprlen == 0) {
3619 zfree(out);
3620 return 0;
3621 }
3622 /* Data compressed! Let's save it on disk */
3623 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3624 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3625 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3626 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3627 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3628 zfree(out);
3629 return comprlen;
3630
3631 writeerr:
3632 zfree(out);
3633 return -1;
3634 }
3635
3636 /* Save a string objet as [len][data] on disk. If the object is a string
3637 * representation of an integer value we try to safe it in a special form */
3638 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3639 int enclen;
3640
3641 /* Try integer encoding */
3642 if (len <= 11) {
3643 unsigned char buf[5];
3644 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3645 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3646 return 0;
3647 }
3648 }
3649
3650 /* Try LZF compression - under 20 bytes it's unable to compress even
3651 * aaaaaaaaaaaaaaaaaa so skip it */
3652 if (server.rdbcompression && len > 20) {
3653 int retval;
3654
3655 retval = rdbSaveLzfStringObject(fp,s,len);
3656 if (retval == -1) return -1;
3657 if (retval > 0) return 0;
3658 /* retval == 0 means data can't be compressed, save the old way */
3659 }
3660
3661 /* Store verbatim */
3662 if (rdbSaveLen(fp,len) == -1) return -1;
3663 if (len && fwrite(s,len,1,fp) == 0) return -1;
3664 return 0;
3665 }
3666
/* Save 'value' to 'fp' either in the compact integer encoding (when
 * rdbEncodeInteger() accepts it) or as a [len][digits] string produced by
 * ll2string(). Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int enclen = rdbEncodeInteger(value,buf);

    if (enclen <= 0) {
        /* Out of the encodable ranges: fall back to the decimal string,
         * which needs its length written first. */
        enclen = ll2string((char*)buf,32,value);
        redisAssert(enclen < 32);
        if (rdbSaveLen(fp,enclen) == -1) return -1;
    }
    /* In both cases 'buf' now holds 'enclen' bytes ready to be written. */
    if (fwrite(buf,enclen,1,fp) == 0) return -1;
    return 0;
}
3682
3683 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3684 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3685 /* Avoid to decode the object, then encode it again, if the
3686 * object is alrady integer encoded. */
3687 if (obj->encoding == REDIS_ENCODING_INT) {
3688 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
3689 } else {
3690 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3691 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3692 }
3693 }
3694
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* First byte: length of the textual representation that follows. */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3737
3738 /* Save a Redis object. */
3739 static int rdbSaveObject(FILE *fp, robj *o) {
3740 if (o->type == REDIS_STRING) {
3741 /* Save a string value */
3742 if (rdbSaveStringObject(fp,o) == -1) return -1;
3743 } else if (o->type == REDIS_LIST) {
3744 /* Save a list value */
3745 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3746 unsigned char *p;
3747 unsigned char *vstr;
3748 unsigned int vlen;
3749 long long vlong;
3750
3751 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3752 p = ziplistIndex(o->ptr,0);
3753 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3754 if (vstr) {
3755 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3756 return -1;
3757 } else {
3758 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3759 return -1;
3760 }
3761 p = ziplistNext(o->ptr,p);
3762 }
3763 } else if (o->encoding == REDIS_ENCODING_LIST) {
3764 list *list = o->ptr;
3765 listIter li;
3766 listNode *ln;
3767
3768 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3769 listRewind(list,&li);
3770 while((ln = listNext(&li))) {
3771 robj *eleobj = listNodeValue(ln);
3772 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3773 }
3774 } else {
3775 redisPanic("Unknown list encoding");
3776 }
3777 } else if (o->type == REDIS_SET) {
3778 /* Save a set value */
3779 dict *set = o->ptr;
3780 dictIterator *di = dictGetIterator(set);
3781 dictEntry *de;
3782
3783 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3784 while((de = dictNext(di)) != NULL) {
3785 robj *eleobj = dictGetEntryKey(de);
3786
3787 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3788 }
3789 dictReleaseIterator(di);
3790 } else if (o->type == REDIS_ZSET) {
3791 /* Save a set value */
3792 zset *zs = o->ptr;
3793 dictIterator *di = dictGetIterator(zs->dict);
3794 dictEntry *de;
3795
3796 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3797 while((de = dictNext(di)) != NULL) {
3798 robj *eleobj = dictGetEntryKey(de);
3799 double *score = dictGetEntryVal(de);
3800
3801 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3802 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3803 }
3804 dictReleaseIterator(di);
3805 } else if (o->type == REDIS_HASH) {
3806 /* Save a hash value */
3807 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3808 unsigned char *p = zipmapRewind(o->ptr);
3809 unsigned int count = zipmapLen(o->ptr);
3810 unsigned char *key, *val;
3811 unsigned int klen, vlen;
3812
3813 if (rdbSaveLen(fp,count) == -1) return -1;
3814 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3815 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3816 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3817 }
3818 } else {
3819 dictIterator *di = dictGetIterator(o->ptr);
3820 dictEntry *de;
3821
3822 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3823 while((de = dictNext(di)) != NULL) {
3824 robj *key = dictGetEntryKey(de);
3825 robj *val = dictGetEntryVal(de);
3826
3827 if (rdbSaveStringObject(fp,key) == -1) return -1;
3828 if (rdbSaveStringObject(fp,val) == -1) return -1;
3829 }
3830 dictReleaseIterator(di);
3831 }
3832 } else {
3833 redisPanic("Unknown object type");
3834 }
3835 return 0;
3836 }
3837
3838 /* Return the length the object will have on disk if saved with
3839 * the rdbSaveObject() function. Currently we use a trick to get
3840 * this length with very little changes to the code. In the future
3841 * we could switch to a faster solution. */
3842 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3843 if (fp == NULL) fp = server.devnull;
3844 rewind(fp);
3845 assert(rdbSaveObject(fp,o) != 1);
3846 return ftello(fp);
3847 }
3848
3849 /* Return the number of pages required to save this object in the swap file */
3850 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3851 off_t bytes = rdbSavedObjectLen(o,fp);
3852
3853 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3854 }
3855
3856 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3857 static int rdbSave(char *filename) {
3858 dictIterator *di = NULL;
3859 dictEntry *de;
3860 FILE *fp;
3861 char tmpfile[256];
3862 int j;
3863 time_t now = time(NULL);
3864
3865 /* Wait for I/O therads to terminate, just in case this is a
3866 * foreground-saving, to avoid seeking the swap file descriptor at the
3867 * same time. */
3868 if (server.vm_enabled)
3869 waitEmptyIOJobsQueue();
3870
3871 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3872 fp = fopen(tmpfile,"w");
3873 if (!fp) {
3874 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3875 return REDIS_ERR;
3876 }
3877 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3878 for (j = 0; j < server.dbnum; j++) {
3879 redisDb *db = server.db+j;
3880 dict *d = db->dict;
3881 if (dictSize(d) == 0) continue;
3882 di = dictGetIterator(d);
3883 if (!di) {
3884 fclose(fp);
3885 return REDIS_ERR;
3886 }
3887
3888 /* Write the SELECT DB opcode */
3889 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3890 if (rdbSaveLen(fp,j) == -1) goto werr;
3891
3892 /* Iterate this DB writing every entry */
3893 while((de = dictNext(di)) != NULL) {
3894 sds keystr = dictGetEntryKey(de);
3895 robj key, *o = dictGetEntryVal(de);
3896 time_t expiretime;
3897
3898 initStaticStringObject(key,keystr);
3899 expiretime = getExpire(db,&key);
3900
3901 /* Save the expire time */
3902 if (expiretime != -1) {
3903 /* If this key is already expired skip it */
3904 if (expiretime < now) continue;
3905 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3906 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3907 }
3908 /* Save the key and associated value. This requires special
3909 * handling if the value is swapped out. */
3910 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3911 o->storage == REDIS_VM_SWAPPING) {
3912 /* Save type, key, value */
3913 if (rdbSaveType(fp,o->type) == -1) goto werr;
3914 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3915 if (rdbSaveObject(fp,o) == -1) goto werr;
3916 } else {
3917 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3918 robj *po;
3919 /* Get a preview of the object in memory */
3920 po = vmPreviewObject(o);
3921 /* Save type, key, value */
3922 if (rdbSaveType(fp,po->type) == -1) goto werr;
3923 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
3924 if (rdbSaveObject(fp,po) == -1) goto werr;
3925 /* Remove the loaded object from memory */
3926 decrRefCount(po);
3927 }
3928 }
3929 dictReleaseIterator(di);
3930 }
3931 /* EOF opcode */
3932 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3933
3934 /* Make sure data will not remain on the OS's output buffers */
3935 fflush(fp);
3936 fsync(fileno(fp));
3937 fclose(fp);
3938
3939 /* Use RENAME to make sure the DB file is changed atomically only
3940 * if the generate DB file is ok. */
3941 if (rename(tmpfile,filename) == -1) {
3942 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3943 unlink(tmpfile);
3944 return REDIS_ERR;
3945 }
3946 redisLog(REDIS_NOTICE,"DB saved on disk");
3947 server.dirty = 0;
3948 server.lastsave = time(NULL);
3949 return REDIS_OK;
3950
3951 werr:
3952 fclose(fp);
3953 unlink(tmpfile);
3954 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3955 if (di) dictReleaseIterator(di);
3956 return REDIS_ERR;
3957 }
3958
3959 static int rdbSaveBackground(char *filename) {
3960 pid_t childpid;
3961
3962 if (server.bgsavechildpid != -1) return REDIS_ERR;
3963 if (server.vm_enabled) waitEmptyIOJobsQueue();
3964 if ((childpid = fork()) == 0) {
3965 /* Child */
3966 if (server.vm_enabled) vmReopenSwapFile();
3967 close(server.fd);
3968 if (rdbSave(filename) == REDIS_OK) {
3969 _exit(0);
3970 } else {
3971 _exit(1);
3972 }
3973 } else {
3974 /* Parent */
3975 if (childpid == -1) {
3976 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3977 strerror(errno));
3978 return REDIS_ERR;
3979 }
3980 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3981 server.bgsavechildpid = childpid;
3982 updateDictResizePolicy();
3983 return REDIS_OK;
3984 }
3985 return REDIS_OK; /* unreached */
3986 }
3987
/* Unlink "temp-<childpid>.rdb", the temp file name rdbSave() uses for the
 * process with pid 'childpid'. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path, sizeof(path), "temp-%d.rdb", (int) childpid);
    unlink(path);
}
3994
/* Read one byte from 'fp' and return it as a value in the 0-255 range.
 * Returns -1 when the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;
    size_t nread = fread(&byte, sizeof(byte), 1, fp);

    return (nread == 1) ? (int) byte : -1;
}
4000
/* Read the 4 byte time value written by rdbSaveTime() from 'fp' and return
 * it as a time_t. Returns -1 when the 4 bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    return (fread(&stamp, 4, 1, fp) != 0) ? (time_t) stamp : (time_t) -1;
}
4006
4007 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4008 * of this file for a description of how this are stored on disk.
4009 *
4010 * isencoded is set to 1 if the readed length is not actually a length but
4011 * an "encoding type", check the above comments for more info */
4012 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
4013 unsigned char buf[2];
4014 uint32_t len;
4015 int type;
4016
4017 if (isencoded) *isencoded = 0;
4018 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4019 type = (buf[0]&0xC0)>>6;
4020 if (type == REDIS_RDB_6BITLEN) {
4021 /* Read a 6 bit len */
4022 return buf[0]&0x3F;
4023 } else if (type == REDIS_RDB_ENCVAL) {
4024 /* Read a 6 bit len encoding type */
4025 if (isencoded) *isencoded = 1;
4026 return buf[0]&0x3F;
4027 } else if (type == REDIS_RDB_14BITLEN) {
4028 /* Read a 14 bit len */
4029 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4030 return ((buf[0]&0x3F)<<8)|buf[1];
4031 } else {
4032 /* Read a 32 bit len */
4033 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4034 return ntohl(len);
4035 }
4036 }
4037
4038 /* Load an integer-encoded object from file 'fp', with the specified
4039 * encoding type 'enctype'. If encode is true the function may return
4040 * an integer-encoded object as reply, otherwise the returned object
4041 * will always be encoded as a raw string. */
4042 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
4043 unsigned char enc[4];
4044 long long val;
4045
4046 if (enctype == REDIS_RDB_ENC_INT8) {
4047 if (fread(enc,1,1,fp) == 0) return NULL;
4048 val = (signed char)enc[0];
4049 } else if (enctype == REDIS_RDB_ENC_INT16) {
4050 uint16_t v;
4051 if (fread(enc,2,1,fp) == 0) return NULL;
4052 v = enc[0]|(enc[1]<<8);
4053 val = (int16_t)v;
4054 } else if (enctype == REDIS_RDB_ENC_INT32) {
4055 uint32_t v;
4056 if (fread(enc,4,1,fp) == 0) return NULL;
4057 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4058 val = (int32_t)v;
4059 } else {
4060 val = 0; /* anti-warning */
4061 redisPanic("Unknown RDB integer encoding type");
4062 }
4063 if (encode)
4064 return createStringObjectFromLongLong(val);
4065 else
4066 return createObject(REDIS_STRING,sdsfromlonglong(val));
4067 }
4068
4069 static robj *rdbLoadLzfStringObject(FILE*fp) {
4070 unsigned int len, clen;
4071 unsigned char *c = NULL;
4072 sds val = NULL;
4073
4074 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4075 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4076 if ((c = zmalloc(clen)) == NULL) goto err;
4077 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4078 if (fread(c,clen,1,fp) == 0) goto err;
4079 if (lzf_decompress(c,clen,val,len) == 0) goto err;
4080 zfree(c);
4081 return createObject(REDIS_STRING,val);
4082 err:
4083 zfree(c);
4084 sdsfree(val);
4085 return NULL;
4086 }
4087
4088 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
4089 int isencoded;
4090 uint32_t len;
4091 sds val;
4092
4093 len = rdbLoadLen(fp,&isencoded);
4094 if (isencoded) {
4095 switch(len) {
4096 case REDIS_RDB_ENC_INT8:
4097 case REDIS_RDB_ENC_INT16:
4098 case REDIS_RDB_ENC_INT32:
4099 return rdbLoadIntegerObject(fp,len,encode);
4100 case REDIS_RDB_ENC_LZF:
4101 return rdbLoadLzfStringObject(fp);
4102 default:
4103 redisPanic("Unknown RDB encoding type");
4104 }
4105 }
4106
4107 if (len == REDIS_RDB_LENERR) return NULL;
4108 val = sdsnewlen(NULL,len);
4109 if (len && fread(val,len,1,fp) == 0) {
4110 sdsfree(val);
4111 return NULL;
4112 }
4113 return createObject(REDIS_STRING,val);
4114 }
4115
4116 static robj *rdbLoadStringObject(FILE *fp) {
4117 return rdbGenericLoadStringObject(fp,0);
4118 }
4119
4120 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4121 return rdbGenericLoadStringObject(fp,1);
4122 }
4123
4124 /* For information about double serialization check rdbSaveDoubleValue() */
4125 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4126 char buf[128];
4127 unsigned char len;
4128
4129 if (fread(&len,1,1,fp) == 0) return -1;
4130 switch(len) {
4131 case 255: *val = R_NegInf; return 0;
4132 case 254: *val = R_PosInf; return 0;
4133 case 253: *val = R_Nan; return 0;
4134 default:
4135 if (fread(buf,len,1,fp) == 0) return -1;
4136 buf[len] = '\0';
4137 sscanf(buf, "%lg", val);
4138 return 0;
4139 }
4140 }
4141
4142 /* Load a Redis object of the specified type from the specified file.
4143 * On success a newly allocated object is returned, otherwise NULL. */
4144 static robj *rdbLoadObject(int type, FILE *fp) {
4145 robj *o, *ele, *dec;
4146 size_t len;
4147
4148 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4149 if (type == REDIS_STRING) {
4150 /* Read string value */
4151 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4152 o = tryObjectEncoding(o);
4153 } else if (type == REDIS_LIST) {
4154 /* Read list value */
4155 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4156
4157 o = createZiplistObject();
4158
4159 /* Load every single element of the list */
4160 while(len--) {
4161 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4162
4163 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4164 dec = getDecodedObject(ele);
4165 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4166 decrRefCount(dec);
4167 decrRefCount(ele);
4168 } else {
4169 ele = tryObjectEncoding(ele);
4170 listAddNodeTail(o->ptr,ele);
4171 incrRefCount(ele);
4172 }
4173 }
4174 } else if (type == REDIS_SET) {
4175 /* Read list/set value */
4176 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4177 o = createSetObject();
4178 /* It's faster to expand the dict to the right size asap in order
4179 * to avoid rehashing */
4180 if (len > DICT_HT_INITIAL_SIZE)
4181 dictExpand(o->ptr,len);
4182 /* Load every single element of the list/set */
4183 while(len--) {
4184 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4185 ele = tryObjectEncoding(ele);
4186 dictAdd((dict*)o->ptr,ele,NULL);
4187 }
4188 } else if (type == REDIS_ZSET) {
4189 /* Read list/set value */
4190 size_t zsetlen;
4191 zset *zs;
4192
4193 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4194 o = createZsetObject();
4195 zs = o->ptr;
4196 /* Load every single element of the list/set */
4197 while(zsetlen--) {
4198 robj *ele;
4199 double *score = zmalloc(sizeof(double));
4200
4201 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4202 ele = tryObjectEncoding(ele);
4203 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4204 dictAdd(zs->dict,ele,score);
4205 zslInsert(zs->zsl,*score,ele);
4206 incrRefCount(ele); /* added to skiplist */
4207 }
4208 } else if (type == REDIS_HASH) {
4209 size_t hashlen;
4210
4211 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4212 o = createHashObject();
4213 /* Too many entries? Use an hash table. */
4214 if (hashlen > server.hash_max_zipmap_entries)
4215 convertToRealHash(o);
4216 /* Load every key/value, then set it into the zipmap or hash
4217 * table, as needed. */
4218 while(hashlen--) {
4219 robj *key, *val;
4220
4221 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4222 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4223 /* If we are using a zipmap and there are too big values
4224 * the object is converted to real hash table encoding. */
4225 if (o->encoding != REDIS_ENCODING_HT &&
4226 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4227 sdslen(val->ptr) > server.hash_max_zipmap_value))
4228 {
4229 convertToRealHash(o);
4230 }
4231
4232 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4233 unsigned char *zm = o->ptr;
4234
4235 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4236 val->ptr,sdslen(val->ptr),NULL);
4237 o->ptr = zm;
4238 decrRefCount(key);
4239 decrRefCount(val);
4240 } else {
4241 key = tryObjectEncoding(key);
4242 val = tryObjectEncoding(val);
4243 dictAdd((dict*)o->ptr,key,val);
4244 }
4245 }
4246 } else {
4247 redisPanic("Unknown object type");
4248 }
4249 return o;
4250 }
4251
4252 static int rdbLoad(char *filename) {
4253 FILE *fp;
4254 uint32_t dbid;
4255 int type, retval, rdbver;
4256 int swap_all_values = 0;
4257 redisDb *db = server.db+0;
4258 char buf[1024];
4259 time_t expiretime, now = time(NULL);
4260
4261 fp = fopen(filename,"r");
4262 if (!fp) return REDIS_ERR;
4263 if (fread(buf,9,1,fp) == 0) goto eoferr;
4264 buf[9] = '\0';
4265 if (memcmp(buf,"REDIS",5) != 0) {
4266 fclose(fp);
4267 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4268 return REDIS_ERR;
4269 }
4270 rdbver = atoi(buf+5);
4271 if (rdbver != 1) {
4272 fclose(fp);
4273 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4274 return REDIS_ERR;
4275 }
4276 while(1) {
4277 robj *key, *val;
4278 int force_swapout;
4279
4280 expiretime = -1;
4281 /* Read type. */
4282 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4283 if (type == REDIS_EXPIRETIME) {
4284 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4285 /* We read the time so we need to read the object type again */
4286 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4287 }
4288 if (type == REDIS_EOF) break;
4289 /* Handle SELECT DB opcode as a special case */
4290 if (type == REDIS_SELECTDB) {
4291 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4292 goto eoferr;
4293 if (dbid >= (unsigned)server.dbnum) {
4294 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4295 exit(1);
4296 }
4297 db = server.db+dbid;
4298 continue;
4299 }
4300 /* Read key */
4301 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4302 /* Read value */
4303 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4304 /* Check if the key already expired */
4305 if (expiretime != -1 && expiretime < now) {
4306 decrRefCount(key);
4307 decrRefCount(val);
4308 continue;
4309 }
4310 /* Add the new object in the hash table */
4311 retval = dbAdd(db,key,val);
4312 if (retval == REDIS_ERR) {
4313 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4314 exit(1);
4315 }
4316 /* Set the expire time if needed */
4317 if (expiretime != -1) setExpire(db,key,expiretime);
4318
4319 /* Handle swapping while loading big datasets when VM is on */
4320
4321 /* If we detecter we are hopeless about fitting something in memory
4322 * we just swap every new key on disk. Directly...
4323 * Note that's important to check for this condition before resorting
4324 * to random sampling, otherwise we may try to swap already
4325 * swapped keys. */
4326 if (swap_all_values) {
4327 dictEntry *de = dictFind(db->dict,key->ptr);
4328
4329 /* de may be NULL since the key already expired */
4330 if (de) {
4331 vmpointer *vp;
4332 val = dictGetEntryVal(de);
4333
4334 if (val->refcount == 1 &&
4335 (vp = vmSwapObjectBlocking(val)) != NULL)
4336 dictGetEntryVal(de) = vp;
4337 }
4338 decrRefCount(key);
4339 continue;
4340 }
4341 decrRefCount(key);
4342
4343 /* Flush data on disk once 32 MB of additional RAM are used... */
4344 force_swapout = 0;
4345 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4346 force_swapout = 1;
4347
4348 /* If we have still some hope of having some value fitting memory
4349 * then we try random sampling. */
4350 if (!swap_all_values && server.vm_enabled && force_swapout) {
4351 while (zmalloc_used_memory() > server.vm_max_memory) {
4352 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4353 }
4354 if (zmalloc_used_memory() > server.vm_max_memory)
4355 swap_all_values = 1; /* We are already using too much mem */
4356 }
4357 }
4358 fclose(fp);
4359 return REDIS_OK;
4360
4361 eoferr: /* unexpected end of file is handled here with a fatal exit */
4362 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4363 exit(1);
4364 return REDIS_ERR; /* Just to avoid warning */
4365 }
4366
4367 /*================================== Shutdown =============================== */
4368 static int prepareForShutdown() {
4369 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4370 /* Kill the saving child if there is a background saving in progress.
4371 We want to avoid race conditions, for instance our saving child may
4372 overwrite the synchronous saving did by SHUTDOWN. */
4373 if (server.bgsavechildpid != -1) {
4374 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4375 kill(server.bgsavechildpid,SIGKILL);
4376 rdbRemoveTempFile(server.bgsavechildpid);
4377 }
4378 if (server.appendonly) {
4379 /* Append only file: fsync() the AOF and exit */
4380 aof_fsync(server.appendfd);
4381 if (server.vm_enabled) unlink(server.vm_swap_file);
4382 } else {
4383 /* Snapshotting. Perform a SYNC SAVE and exit */
4384 if (rdbSave(server.dbfilename) == REDIS_OK) {
4385 if (server.daemonize)
4386 unlink(server.pidfile);
4387 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4388 } else {
4389 /* Ooops.. error saving! The best we can do is to continue
4390 * operating. Note that if there was a background saving process,
4391 * in the next cron() Redis will be notified that the background
4392 * saving aborted, handling special stuff like slaves pending for
4393 * synchronization... */
4394 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4395 return REDIS_ERR;
4396 }
4397 }
4398 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4399 return REDIS_OK;
4400 }
4401
4402 /*================================== Commands =============================== */
4403
4404 static void authCommand(redisClient *c) {
4405 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4406 c->authenticated = 1;
4407 addReply(c,shared.ok);
4408 } else {
4409 c->authenticated = 0;
4410 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4411 }
4412 }
4413
4414 static void pingCommand(redisClient *c) {
4415 addReply(c,shared.pong);
4416 }
4417
4418 static void echoCommand(redisClient *c) {
4419 addReplyBulk(c,c->argv[1]);
4420 }
4421
4422 /*=================================== Strings =============================== */
4423
4424 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4425 int retval;
4426 long seconds = 0; /* initialized to avoid an harmness warning */
4427
4428 if (expire) {
4429 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4430 return;
4431 if (seconds <= 0) {
4432 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4433 return;
4434 }
4435 }
4436
4437 touchWatchedKey(c->db,key);
4438 if (nx) deleteIfVolatile(c->db,key);
4439 retval = dbAdd(c->db,key,val);
4440 if (retval == REDIS_ERR) {
4441 if (!nx) {
4442 dbReplace(c->db,key,val);
4443 incrRefCount(val);
4444 } else {
4445 addReply(c,shared.czero);
4446 return;
4447 }
4448 } else {
4449 incrRefCount(val);
4450 }
4451 server.dirty++;
4452 removeExpire(c->db,key);
4453 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4454 addReply(c, nx ? shared.cone : shared.ok);
4455 }
4456
4457 static void setCommand(redisClient *c) {
4458 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4459 }
4460
4461 static void setnxCommand(redisClient *c) {
4462 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4463 }
4464
4465 static void setexCommand(redisClient *c) {
4466 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4467 }
4468
4469 static int getGenericCommand(redisClient *c) {
4470 robj *o;
4471
4472 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4473 return REDIS_OK;
4474
4475 if (o->type != REDIS_STRING) {
4476 addReply(c,shared.wrongtypeerr);
4477 return REDIS_ERR;
4478 } else {
4479 addReplyBulk(c,o);
4480 return REDIS_OK;
4481 }
4482 }
4483
4484 static void getCommand(redisClient *c) {
4485 getGenericCommand(c);
4486 }
4487
4488 static void getsetCommand(redisClient *c) {
4489 if (getGenericCommand(c) == REDIS_ERR) return;
4490 dbReplace(c->db,c->argv[1],c->argv[2]);
4491 incrRefCount(c->argv[2]);
4492 server.dirty++;
4493 removeExpire(c->db,c->argv[1]);
4494 }
4495
4496 static void mgetCommand(redisClient *c) {
4497 int j;
4498
4499 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4500 for (j = 1; j < c->argc; j++) {
4501 robj *o = lookupKeyRead(c->db,c->argv[j]);
4502 if (o == NULL) {
4503 addReply(c,shared.nullbulk);
4504 } else {
4505 if (o->type != REDIS_STRING) {
4506 addReply(c,shared.nullbulk);
4507 } else {
4508 addReplyBulk(c,o);
4509 }
4510 }
4511 }
4512 }
4513
4514 static void msetGenericCommand(redisClient *c, int nx) {
4515 int j, busykeys = 0;
4516
4517 if ((c->argc % 2) == 0) {
4518 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4519 return;
4520 }
4521 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4522 * set nothing at all if at least one already key exists. */
4523 if (nx) {
4524 for (j = 1; j < c->argc; j += 2) {
4525 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4526 busykeys++;
4527 }
4528 }
4529 }
4530 if (busykeys) {
4531 addReply(c, shared.czero);
4532 return;
4533 }
4534
4535 for (j = 1; j < c->argc; j += 2) {
4536 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4537 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4538 incrRefCount(c->argv[j+1]);
4539 removeExpire(c->db,c->argv[j]);
4540 }
4541 server.dirty += (c->argc-1)/2;
4542 addReply(c, nx ? shared.cone : shared.ok);
4543 }
4544
4545 static void msetCommand(redisClient *c) {
4546 msetGenericCommand(c,0);
4547 }
4548
4549 static void msetnxCommand(redisClient *c) {
4550 msetGenericCommand(c,1);
4551 }
4552
4553 static void incrDecrCommand(redisClient *c, long long incr) {
4554 long long value;
4555 robj *o;
4556
4557 o = lookupKeyWrite(c->db,c->argv[1]);
4558 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4559 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4560
4561 value += incr;
4562 o = createStringObjectFromLongLong(value);
4563 dbReplace(c->db,c->argv[1],o);
4564 server.dirty++;
4565 addReply(c,shared.colon);
4566 addReply(c,o);
4567 addReply(c,shared.crlf);
4568 }
4569
4570 static void incrCommand(redisClient *c) {
4571 incrDecrCommand(c,1);
4572 }
4573
4574 static void decrCommand(redisClient *c) {
4575 incrDecrCommand(c,-1);
4576 }
4577
4578 static void incrbyCommand(redisClient *c) {
4579 long long incr;
4580
4581 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4582 incrDecrCommand(c,incr);
4583 }
4584
4585 static void decrbyCommand(redisClient *c) {
4586 long long incr;
4587
4588 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4589 incrDecrCommand(c,-incr);
4590 }
4591
4592 static void appendCommand(redisClient *c) {
4593 int retval;
4594 size_t totlen;
4595 robj *o;
4596
4597 o = lookupKeyWrite(c->db,c->argv[1]);
4598 if (o == NULL) {
4599 /* Create the key */
4600 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4601 incrRefCount(c->argv[2]);
4602 totlen = stringObjectLen(c->argv[2]);
4603 } else {
4604 if (o->type != REDIS_STRING) {
4605 addReply(c,shared.wrongtypeerr);
4606 return;
4607 }
4608 /* If the object is specially encoded or shared we have to make
4609 * a copy */
4610 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4611 robj *decoded = getDecodedObject(o);
4612
4613 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4614 decrRefCount(decoded);
4615 dbReplace(c->db,c->argv[1],o);
4616 }
4617 /* APPEND! */
4618 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4619 o->ptr = sdscatlen(o->ptr,
4620 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4621 } else {
4622 o->ptr = sdscatprintf(o->ptr, "%ld",
4623 (unsigned long) c->argv[2]->ptr);
4624 }
4625 totlen = sdslen(o->ptr);
4626 }
4627 server.dirty++;
4628 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4629 }
4630
4631 static void substrCommand(redisClient *c) {
4632 robj *o;
4633 long start = atoi(c->argv[2]->ptr);
4634 long end = atoi(c->argv[3]->ptr);
4635 size_t rangelen, strlen;
4636 sds range;
4637
4638 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4639 checkType(c,o,REDIS_STRING)) return;
4640
4641 o = getDecodedObject(o);
4642 strlen = sdslen(o->ptr);
4643
4644 /* convert negative indexes */
4645 if (start < 0) start = strlen+start;
4646 if (end < 0) end = strlen+end;
4647 if (start < 0) start = 0;
4648 if (end < 0) end = 0;
4649
4650 /* indexes sanity checks */
4651 if (start > end || (size_t)start >= strlen) {
4652 /* Out of range start or start > end result in null reply */
4653 addReply(c,shared.nullbulk);
4654 decrRefCount(o);
4655 return;
4656 }
4657 if ((size_t)end >= strlen) end = strlen-1;
4658 rangelen = (end-start)+1;
4659
4660 /* Return the result */
4661 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4662 range = sdsnewlen((char*)o->ptr+start,rangelen);
4663 addReplySds(c,range);
4664 addReply(c,shared.crlf);
4665 decrRefCount(o);
4666 }
4667
4668 /* ========================= Type agnostic commands ========================= */
4669
4670 static void delCommand(redisClient *c) {
4671 int deleted = 0, j;
4672
4673 for (j = 1; j < c->argc; j++) {
4674 if (dbDelete(c->db,c->argv[j])) {
4675 touchWatchedKey(c->db,c->argv[j]);
4676 server.dirty++;
4677 deleted++;
4678 }
4679 }
4680 addReplyLongLong(c,deleted);
4681 }
4682
4683 static void existsCommand(redisClient *c) {
4684 expireIfNeeded(c->db,c->argv[1]);
4685 if (dbExists(c->db,c->argv[1])) {
4686 addReply(c, shared.cone);
4687 } else {
4688 addReply(c, shared.czero);
4689 }
4690 }
4691
4692 static void selectCommand(redisClient *c) {
4693 int id = atoi(c->argv[1]->ptr);
4694
4695 if (selectDb(c,id) == REDIS_ERR) {
4696 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4697 } else {
4698 addReply(c,shared.ok);
4699 }
4700 }
4701
4702 static void randomkeyCommand(redisClient *c) {
4703 robj *key;
4704
4705 if ((key = dbRandomKey(c->db)) == NULL) {
4706 addReply(c,shared.nullbulk);
4707 return;
4708 }
4709
4710 addReplyBulk(c,key);
4711 decrRefCount(key);
4712 }
4713
4714 static void keysCommand(redisClient *c) {
4715 dictIterator *di;
4716 dictEntry *de;
4717 sds pattern = c->argv[1]->ptr;
4718 int plen = sdslen(pattern);
4719 unsigned long numkeys = 0;
4720 robj *lenobj = createObject(REDIS_STRING,NULL);
4721
4722 di = dictGetIterator(c->db->dict);
4723 addReply(c,lenobj);
4724 decrRefCount(lenobj);
4725 while((de = dictNext(di)) != NULL) {
4726 sds key = dictGetEntryKey(de);
4727 robj *keyobj;
4728
4729 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4730 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4731 keyobj = createStringObject(key,sdslen(key));
4732 if (expireIfNeeded(c->db,keyobj) == 0) {
4733 addReplyBulk(c,keyobj);
4734 numkeys++;
4735 }
4736 decrRefCount(keyobj);
4737 }
4738 }
4739 dictReleaseIterator(di);
4740 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4741 }
4742
4743 static void dbsizeCommand(redisClient *c) {
4744 addReplySds(c,
4745 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4746 }
4747
4748 static void lastsaveCommand(redisClient *c) {
4749 addReplySds(c,
4750 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4751 }
4752
4753 static void typeCommand(redisClient *c) {
4754 robj *o;
4755 char *type;
4756
4757 o = lookupKeyRead(c->db,c->argv[1]);
4758 if (o == NULL) {
4759 type = "+none";
4760 } else {
4761 switch(o->type) {
4762 case REDIS_STRING: type = "+string"; break;
4763 case REDIS_LIST: type = "+list"; break;
4764 case REDIS_SET: type = "+set"; break;
4765 case REDIS_ZSET: type = "+zset"; break;
4766 case REDIS_HASH: type = "+hash"; break;
4767 default: type = "+unknown"; break;
4768 }
4769 }
4770 addReplySds(c,sdsnew(type));
4771 addReply(c,shared.crlf);
4772 }
4773
4774 static void saveCommand(redisClient *c) {
4775 if (server.bgsavechildpid != -1) {
4776 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4777 return;
4778 }
4779 if (rdbSave(server.dbfilename) == REDIS_OK) {
4780 addReply(c,shared.ok);
4781 } else {
4782 addReply(c,shared.err);
4783 }
4784 }
4785
4786 static void bgsaveCommand(redisClient *c) {
4787 if (server.bgsavechildpid != -1) {
4788 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4789 return;
4790 }
4791 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4792 char *status = "+Background saving started\r\n";
4793 addReplySds(c,sdsnew(status));
4794 } else {
4795 addReply(c,shared.err);
4796 }
4797 }
4798
4799 static void shutdownCommand(redisClient *c) {
4800 if (prepareForShutdown() == REDIS_OK)
4801 exit(0);
4802 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4803 }
4804
4805 static void renameGenericCommand(redisClient *c, int nx) {
4806 robj *o;
4807
4808 /* To use the same key as src and dst is probably an error */
4809 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4810 addReply(c,shared.sameobjecterr);
4811 return;
4812 }
4813
4814 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4815 return;
4816
4817 incrRefCount(o);
4818 deleteIfVolatile(c->db,c->argv[2]);
4819 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
4820 if (nx) {
4821 decrRefCount(o);
4822 addReply(c,shared.czero);
4823 return;
4824 }
4825 dbReplace(c->db,c->argv[2],o);
4826 }
4827 dbDelete(c->db,c->argv[1]);
4828 touchWatchedKey(c->db,c->argv[2]);
4829 server.dirty++;
4830 addReply(c,nx ? shared.cone : shared.ok);
4831 }
4832
4833 static void renameCommand(redisClient *c) {
4834 renameGenericCommand(c,0);
4835 }
4836
4837 static void renamenxCommand(redisClient *c) {
4838 renameGenericCommand(c,1);
4839 }
4840
4841 static void moveCommand(redisClient *c) {
4842 robj *o;
4843 redisDb *src, *dst;
4844 int srcid;
4845
4846 /* Obtain source and target DB pointers */
4847 src = c->db;
4848 srcid = c->db->id;
4849 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4850 addReply(c,shared.outofrangeerr);
4851 return;
4852 }
4853 dst = c->db;
4854 selectDb(c,srcid); /* Back to the source DB */
4855
4856 /* If the user is moving using as target the same
4857 * DB as the source DB it is probably an error. */
4858 if (src == dst) {
4859 addReply(c,shared.sameobjecterr);
4860 return;
4861 }
4862
4863 /* Check if the element exists and get a reference */
4864 o = lookupKeyWrite(c->db,c->argv[1]);
4865 if (!o) {
4866 addReply(c,shared.czero);
4867 return;
4868 }
4869
4870 /* Try to add the element to the target DB */
4871 deleteIfVolatile(dst,c->argv[1]);
4872 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
4873 addReply(c,shared.czero);
4874 return;
4875 }
4876 incrRefCount(o);
4877
4878 /* OK! key moved, free the entry in the source DB */
4879 dbDelete(src,c->argv[1]);
4880 server.dirty++;
4881 addReply(c,shared.cone);
4882 }
4883
4884 /* =================================== Lists ================================ */
4885 static void lPush(robj *subject, robj *value, int where) {
4886 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4887 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4888 value = getDecodedObject(value);
4889 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4890 decrRefCount(value);
4891 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4892 if (where == REDIS_HEAD) {
4893 listAddNodeHead(subject->ptr,value);
4894 } else {
4895 listAddNodeTail(subject->ptr,value);
4896 }
4897 incrRefCount(value);
4898 } else {
4899 redisPanic("Unknown list encoding");
4900 }
4901 }
4902
4903 static robj *lPop(robj *subject, int where) {
4904 robj *value = NULL;
4905 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4906 unsigned char *p;
4907 unsigned char *vstr;
4908 unsigned int vlen;
4909 long long vlong;
4910 int pos = (where == REDIS_HEAD) ? 0 : -1;
4911 p = ziplistIndex(subject->ptr,pos);
4912 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4913 if (vstr) {
4914 value = createStringObject((char*)vstr,vlen);
4915 } else {
4916 value = createStringObjectFromLongLong(vlong);
4917 }
4918 /* We only need to delete an element when it exists */
4919 subject->ptr = ziplistDelete(subject->ptr,&p);
4920 }
4921 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4922 list *list = subject->ptr;
4923 listNode *ln;
4924 if (where == REDIS_HEAD) {
4925 ln = listFirst(list);
4926 } else {
4927 ln = listLast(list);
4928 }
4929 if (ln != NULL) {
4930 value = listNodeValue(ln);
4931 incrRefCount(value);
4932 listDelNode(list,ln);
4933 }
4934 } else {
4935 redisPanic("Unknown list encoding");
4936 }
4937 return value;
4938 }
4939
4940 static unsigned long lLength(robj *subject) {
4941 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4942 return ziplistLen(subject->ptr);
4943 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4944 return listLength((list*)subject->ptr);
4945 } else {
4946 redisPanic("Unknown list encoding");
4947 }
4948 }
4949
4950 /* Structure to hold set iteration abstraction. */
4951 typedef struct {
4952 robj *subject;
4953 unsigned char encoding;
4954 unsigned char direction; /* Iteration direction */
4955 unsigned char *zi;
4956 listNode *ln;
4957 } lIterator;
4958
4959 /* Structure for an entry while iterating over a list. */
4960 typedef struct {
4961 lIterator *li;
4962 unsigned char *zi; /* Entry in ziplist */
4963 listNode *ln; /* Entry in linked list */
4964 } lEntry;
4965
4966 /* Initialize an iterator at the specified index. */
4967 static lIterator *lInitIterator(robj *subject, int index, unsigned char direction) {
4968 lIterator *li = zmalloc(sizeof(lIterator));
4969 li->subject = subject;
4970 li->encoding = subject->encoding;
4971 li->direction = direction;
4972 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
4973 li->zi = ziplistIndex(subject->ptr,index);
4974 } else if (li->encoding == REDIS_ENCODING_LIST) {
4975 li->ln = listIndex(subject->ptr,index);
4976 } else {
4977 redisPanic("Unknown list encoding");
4978 }
4979 return li;
4980 }
4981
4982 /* Clean up the iterator. */
4983 static void lReleaseIterator(lIterator *li) {
4984 zfree(li);
4985 }
4986
4987 /* Stores pointer to current the entry in the provided entry structure
4988 * and advances the position of the iterator. Returns 1 when the current
4989 * entry is in fact an entry, 0 otherwise. */
4990 static int lNext(lIterator *li, lEntry *entry) {
4991 entry->li = li;
4992 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
4993 entry->zi = li->zi;
4994 if (entry->zi != NULL) {
4995 if (li->direction == REDIS_TAIL)
4996 li->zi = ziplistNext(li->subject->ptr,li->zi);
4997 else
4998 li->zi = ziplistPrev(li->subject->ptr,li->zi);
4999 return 1;
5000 }
5001 } else if (li->encoding == REDIS_ENCODING_LIST) {
5002 entry->ln = li->ln;
5003 if (entry->ln != NULL) {
5004 if (li->direction == REDIS_TAIL)
5005 li->ln = li->ln->next;
5006 else
5007 li->ln = li->ln->prev;
5008 return 1;
5009 }
5010 } else {
5011 redisPanic("Unknown list encoding");
5012 }
5013 return 0;
5014 }
5015
5016 /* Return entry or NULL at the current position of the iterator. */
5017 static robj *lGet(lEntry *entry) {
5018 lIterator *li = entry->li;
5019 robj *value = NULL;
5020 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5021 unsigned char *vstr;
5022 unsigned int vlen;
5023 long long vlong;
5024 redisAssert(entry->zi != NULL);
5025 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5026 if (vstr) {
5027 value = createStringObject((char*)vstr,vlen);
5028 } else {
5029 value = createStringObjectFromLongLong(vlong);
5030 }
5031 }
5032 } else if (li->encoding == REDIS_ENCODING_LIST) {
5033 redisAssert(entry->ln != NULL);
5034 value = listNodeValue(entry->ln);
5035 incrRefCount(value);
5036 } else {
5037 redisPanic("Unknown list encoding");
5038 }
5039 return value;
5040 }
5041
5042 /* Compare the given object with the entry at the current position. */
5043 static int lEqual(lEntry *entry, robj *o) {
5044 lIterator *li = entry->li;
5045 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5046 redisAssert(o->encoding == REDIS_ENCODING_RAW);
5047 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
5048 } else if (li->encoding == REDIS_ENCODING_LIST) {
5049 return equalStringObjects(o,listNodeValue(entry->ln));
5050 } else {
5051 redisPanic("Unknown list encoding");
5052 }
5053 }
5054
5055 /* Delete the element pointed to. */
5056 static void lDelete(lEntry *entry) {
5057 lIterator *li = entry->li;
5058 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5059 unsigned char *p = entry->zi;
5060 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5061
5062 /* Update position of the iterator depending on the direction */
5063 if (li->direction == REDIS_TAIL)
5064 li->zi = p;
5065 else
5066 li->zi = ziplistPrev(li->subject->ptr,p);
5067 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5068 listNode *next;
5069 if (li->direction == REDIS_TAIL)
5070 next = entry->ln->next;
5071 else
5072 next = entry->ln->prev;
5073 listDelNode(li->subject->ptr,entry->ln);
5074 li->ln = next;
5075 } else {
5076 redisPanic("Unknown list encoding");
5077 }
5078 }
5079
5080 static void pushGenericCommand(redisClient *c, int where) {
5081 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
5082 if (lobj == NULL) {
5083 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5084 addReply(c,shared.cone);
5085 return;
5086 }
5087 lobj = createZiplistObject();
5088 dbAdd(c->db,c->argv[1],lobj);
5089 } else {
5090 if (lobj->type != REDIS_LIST) {
5091 addReply(c,shared.wrongtypeerr);
5092 return;
5093 }
5094 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
5095 addReply(c,shared.cone);
5096 return;
5097 }
5098 }
5099 lPush(lobj,c->argv[2],where);
5100 addReplyLongLong(c,lLength(lobj));
5101 server.dirty++;
5102 }
5103
5104 static void lpushCommand(redisClient *c) {
5105 pushGenericCommand(c,REDIS_HEAD);
5106 }
5107
5108 static void rpushCommand(redisClient *c) {
5109 pushGenericCommand(c,REDIS_TAIL);
5110 }
5111
5112 static void llenCommand(redisClient *c) {
5113 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5114 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5115 addReplyUlong(c,lLength(o));
5116 }
5117
5118 static void lindexCommand(redisClient *c) {
5119 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5120 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5121 int index = atoi(c->argv[2]->ptr);
5122 robj *value = NULL;
5123
5124 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5125 unsigned char *p;
5126 unsigned char *vstr;
5127 unsigned int vlen;
5128 long long vlong;
5129 p = ziplistIndex(o->ptr,index);
5130 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5131 if (vstr) {
5132 value = createStringObject((char*)vstr,vlen);
5133 } else {
5134 value = createStringObjectFromLongLong(vlong);
5135 }
5136 addReplyBulk(c,value);
5137 decrRefCount(value);
5138 } else {
5139 addReply(c,shared.nullbulk);
5140 }
5141 } else if (o->encoding == REDIS_ENCODING_LIST) {
5142 listNode *ln = listIndex(o->ptr,index);
5143 if (ln != NULL) {
5144 value = listNodeValue(ln);
5145 addReplyBulk(c,value);
5146 } else {
5147 addReply(c,shared.nullbulk);
5148 }
5149 } else {
5150 redisPanic("Unknown list encoding");
5151 }
5152 }
5153
5154 static void lsetCommand(redisClient *c) {
5155 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5156 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5157 int index = atoi(c->argv[2]->ptr);
5158 robj *value = c->argv[3];
5159
5160 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5161 unsigned char *p, *zl = o->ptr;
5162 p = ziplistIndex(zl,index);
5163 if (p == NULL) {
5164 addReply(c,shared.outofrangeerr);
5165 } else {
5166 o->ptr = ziplistDelete(o->ptr,&p);
5167 value = getDecodedObject(value);
5168 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5169 decrRefCount(value);
5170 addReply(c,shared.ok);
5171 server.dirty++;
5172 }
5173 } else if (o->encoding == REDIS_ENCODING_LIST) {
5174 listNode *ln = listIndex(o->ptr,index);
5175 if (ln == NULL) {
5176 addReply(c,shared.outofrangeerr);
5177 } else {
5178 decrRefCount((robj*)listNodeValue(ln));
5179 listNodeValue(ln) = value;
5180 incrRefCount(value);
5181 addReply(c,shared.ok);
5182 server.dirty++;
5183 }
5184 } else {
5185 redisPanic("Unknown list encoding");
5186 }
5187 }
5188
5189 static void popGenericCommand(redisClient *c, int where) {
5190 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5191 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5192
5193 robj *value = lPop(o,where);
5194 if (value == NULL) {
5195 addReply(c,shared.nullbulk);
5196 } else {
5197 addReplyBulk(c,value);
5198 decrRefCount(value);
5199 if (lLength(o) == 0) dbDelete(c->db,c->argv[1]);
5200 server.dirty++;
5201 }
5202 }
5203
5204 static void lpopCommand(redisClient *c) {
5205 popGenericCommand(c,REDIS_HEAD);
5206 }
5207
5208 static void rpopCommand(redisClient *c) {
5209 popGenericCommand(c,REDIS_TAIL);
5210 }
5211
5212 static void lrangeCommand(redisClient *c) {
5213 robj *o, *value;
5214 int start = atoi(c->argv[2]->ptr);
5215 int end = atoi(c->argv[3]->ptr);
5216 int llen;
5217 int rangelen, j;
5218 lEntry entry;
5219
5220 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5221 || checkType(c,o,REDIS_LIST)) return;
5222 llen = lLength(o);
5223
5224 /* convert negative indexes */
5225 if (start < 0) start = llen+start;
5226 if (end < 0) end = llen+end;
5227 if (start < 0) start = 0;
5228 if (end < 0) end = 0;
5229
5230 /* indexes sanity checks */
5231 if (start > end || start >= llen) {
5232 /* Out of range start or start > end result in empty list */
5233 addReply(c,shared.emptymultibulk);
5234 return;
5235 }
5236 if (end >= llen) end = llen-1;
5237 rangelen = (end-start)+1;
5238
5239 /* Return the result in form of a multi-bulk reply */
5240 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
5241 lIterator *li = lInitIterator(o,start,REDIS_TAIL);
5242 for (j = 0; j < rangelen; j++) {
5243 redisAssert(lNext(li,&entry));
5244 value = lGet(&entry);
5245 addReplyBulk(c,value);
5246 decrRefCount(value);
5247 }
5248 lReleaseIterator(li);
5249 }
5250
5251 static void ltrimCommand(redisClient *c) {
5252 robj *o;
5253 int start = atoi(c->argv[2]->ptr);
5254 int end = atoi(c->argv[3]->ptr);
5255 int llen;
5256 int j, ltrim, rtrim;
5257 list *list;
5258 listNode *ln;
5259
5260 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5261 checkType(c,o,REDIS_LIST)) return;
5262 llen = lLength(o);
5263
5264 /* convert negative indexes */
5265 if (start < 0) start = llen+start;
5266 if (end < 0) end = llen+end;
5267 if (start < 0) start = 0;
5268 if (end < 0) end = 0;
5269
5270 /* indexes sanity checks */
5271 if (start > end || start >= llen) {
5272 /* Out of range start or start > end result in empty list */
5273 ltrim = llen;
5274 rtrim = 0;
5275 } else {
5276 if (end >= llen) end = llen-1;
5277 ltrim = start;
5278 rtrim = llen-end-1;
5279 }
5280
5281 /* Remove list elements to perform the trim */
5282 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5283 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5284 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5285 } else if (o->encoding == REDIS_ENCODING_LIST) {
5286 list = o->ptr;
5287 for (j = 0; j < ltrim; j++) {
5288 ln = listFirst(list);
5289 listDelNode(list,ln);
5290 }
5291 for (j = 0; j < rtrim; j++) {
5292 ln = listLast(list);
5293 listDelNode(list,ln);
5294 }
5295 } else {
5296 redisPanic("Unknown list encoding");
5297 }
5298 if (lLength(o) == 0) dbDelete(c->db,c->argv[1]);
5299 server.dirty++;
5300 addReply(c,shared.ok);
5301 }
5302
5303 static void lremCommand(redisClient *c) {
5304 robj *subject, *obj = c->argv[3];
5305 int toremove = atoi(c->argv[2]->ptr);
5306 int removed = 0;
5307 lEntry entry;
5308
5309 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5310 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
5311
5312 /* Make sure obj is raw when we're dealing with a ziplist */
5313 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5314 obj = getDecodedObject(obj);
5315
5316 lIterator *li;
5317 if (toremove < 0) {
5318 toremove = -toremove;
5319 li = lInitIterator(subject,-1,REDIS_HEAD);
5320 } else {
5321 li = lInitIterator(subject,0,REDIS_TAIL);
5322 }
5323
5324 while (lNext(li,&entry)) {
5325 if (lEqual(&entry,obj)) {
5326 lDelete(&entry);
5327 server.dirty++;
5328 removed++;
5329 if (toremove && removed == toremove) break;
5330 }
5331 }
5332 lReleaseIterator(li);
5333
5334 /* Clean up raw encoded object */
5335 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5336 decrRefCount(obj);
5337
5338 if (lLength(subject) == 0) dbDelete(c->db,c->argv[1]);
5339 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5340 }
5341
5342 /* This is the semantic of this command:
5343 * RPOPLPUSH srclist dstlist:
5344 * IF LLEN(srclist) > 0
5345 * element = RPOP srclist
5346 * LPUSH dstlist element
5347 * RETURN element
5348 * ELSE
5349 * RETURN nil
5350 * END
5351 * END
5352 *
5353 * The idea is to be able to get an element from a list in a reliable way
5354 * since the element is not just returned but pushed against another list
5355 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5356 */
5357 static void rpoplpushcommand(redisClient *c) {
5358 robj *sobj, *value;
5359 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5360 checkType(c,sobj,REDIS_LIST)) return;
5361
5362 if (lLength(sobj) == 0) {
5363 addReply(c,shared.nullbulk);
5364 } else {
5365 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5366 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5367 value = lPop(sobj,REDIS_TAIL);
5368
5369 /* Add the element to the target list (unless it's directly
5370 * passed to some BLPOP-ing client */
5371 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5372 /* Create the list if the key does not exist */
5373 if (!dobj) {
5374 dobj = createZiplistObject();
5375 dbAdd(c->db,c->argv[2],dobj);
5376 }
5377 lPush(dobj,value,REDIS_HEAD);
5378 }
5379
5380 /* Send the element to the client as reply as well */
5381 addReplyBulk(c,value);
5382
5383 /* lPop returns an object with its refcount incremented */
5384 decrRefCount(value);
5385
5386 /* Delete the source list when it is empty */
5387 if (lLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
5388 server.dirty++;
5389 }
5390 }
5391
5392 /* ==================================== Sets ================================ */
5393
5394 static void saddCommand(redisClient *c) {
5395 robj *set;
5396
5397 set = lookupKeyWrite(c->db,c->argv[1]);
5398 if (set == NULL) {
5399 set = createSetObject();
5400 dbAdd(c->db,c->argv[1],set);
5401 } else {
5402 if (set->type != REDIS_SET) {
5403 addReply(c,shared.wrongtypeerr);
5404 return;
5405 }
5406 }
5407 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5408 incrRefCount(c->argv[2]);
5409 server.dirty++;
5410 addReply(c,shared.cone);
5411 } else {
5412 addReply(c,shared.czero);
5413 }
5414 }
5415
5416 static void sremCommand(redisClient *c) {
5417 robj *set;
5418
5419 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5420 checkType(c,set,REDIS_SET)) return;
5421
5422 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5423 server.dirty++;
5424 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5425 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5426 addReply(c,shared.cone);
5427 } else {
5428 addReply(c,shared.czero);
5429 }
5430 }
5431
5432 static void smoveCommand(redisClient *c) {
5433 robj *srcset, *dstset;
5434
5435 srcset = lookupKeyWrite(c->db,c->argv[1]);
5436 dstset = lookupKeyWrite(c->db,c->argv[2]);
5437
5438 /* If the source key does not exist return 0, if it's of the wrong type
5439 * raise an error */
5440 if (srcset == NULL || srcset->type != REDIS_SET) {
5441 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5442 return;
5443 }
5444 /* Error if the destination key is not a set as well */
5445 if (dstset && dstset->type != REDIS_SET) {
5446 addReply(c,shared.wrongtypeerr);
5447 return;
5448 }
5449 /* Remove the element from the source set */
5450 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5451 /* Key not found in the src set! return zero */
5452 addReply(c,shared.czero);
5453 return;
5454 }
5455 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5456 dbDelete(c->db,c->argv[1]);
5457 server.dirty++;
5458 /* Add the element to the destination set */
5459 if (!dstset) {
5460 dstset = createSetObject();
5461 dbAdd(c->db,c->argv[2],dstset);
5462 }
5463 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5464 incrRefCount(c->argv[3]);
5465 addReply(c,shared.cone);
5466 }
5467
5468 static void sismemberCommand(redisClient *c) {
5469 robj *set;
5470
5471 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5472 checkType(c,set,REDIS_SET)) return;
5473
5474 if (dictFind(set->ptr,c->argv[2]))
5475 addReply(c,shared.cone);
5476 else
5477 addReply(c,shared.czero);
5478 }
5479
5480 static void scardCommand(redisClient *c) {
5481 robj *o;
5482 dict *s;
5483
5484 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5485 checkType(c,o,REDIS_SET)) return;
5486
5487 s = o->ptr;
5488 addReplyUlong(c,dictSize(s));
5489 }
5490
5491 static void spopCommand(redisClient *c) {
5492 robj *set;
5493 dictEntry *de;
5494
5495 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5496 checkType(c,set,REDIS_SET)) return;
5497
5498 de = dictGetRandomKey(set->ptr);
5499 if (de == NULL) {
5500 addReply(c,shared.nullbulk);
5501 } else {
5502 robj *ele = dictGetEntryKey(de);
5503
5504 addReplyBulk(c,ele);
5505 dictDelete(set->ptr,ele);
5506 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5507 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
5508 server.dirty++;
5509 }
5510 }
5511
5512 static void srandmemberCommand(redisClient *c) {
5513 robj *set;
5514 dictEntry *de;
5515
5516 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5517 checkType(c,set,REDIS_SET)) return;
5518
5519 de = dictGetRandomKey(set->ptr);
5520 if (de == NULL) {
5521 addReply(c,shared.nullbulk);
5522 } else {
5523 robj *ele = dictGetEntryKey(de);
5524
5525 addReplyBulk(c,ele);
5526 }
5527 }
5528
5529 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5530 dict **d1 = (void*) s1, **d2 = (void*) s2;
5531
5532 return dictSize(*d1)-dictSize(*d2);
5533 }
5534
5535 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5536 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5537 dictIterator *di;
5538 dictEntry *de;
5539 robj *lenobj = NULL, *dstset = NULL;
5540 unsigned long j, cardinality = 0;
5541
5542 for (j = 0; j < setsnum; j++) {
5543 robj *setobj;
5544
5545 setobj = dstkey ?
5546 lookupKeyWrite(c->db,setskeys[j]) :
5547 lookupKeyRead(c->db,setskeys[j]);
5548 if (!setobj) {
5549 zfree(dv);
5550 if (dstkey) {
5551 if (dbDelete(c->db,dstkey))
5552 server.dirty++;
5553 addReply(c,shared.czero);
5554 } else {
5555 addReply(c,shared.emptymultibulk);
5556 }
5557 return;
5558 }
5559 if (setobj->type != REDIS_SET) {
5560 zfree(dv);
5561 addReply(c,shared.wrongtypeerr);
5562 return;
5563 }
5564 dv[j] = setobj->ptr;
5565 }
5566 /* Sort sets from the smallest to largest, this will improve our
5567 * algorithm's performace */
5568 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5569
5570 /* The first thing we should output is the total number of elements...
5571 * since this is a multi-bulk write, but at this stage we don't know
5572 * the intersection set size, so we use a trick, append an empty object
5573 * to the output list and save the pointer to later modify it with the
5574 * right length */
5575 if (!dstkey) {
5576 lenobj = createObject(REDIS_STRING,NULL);
5577 addReply(c,lenobj);
5578 decrRefCount(lenobj);
5579 } else {
5580 /* If we have a target key where to store the resulting set
5581 * create this key with an empty set inside */
5582 dstset = createSetObject();
5583 }
5584
5585 /* Iterate all the elements of the first (smallest) set, and test
5586 * the element against all the other sets, if at least one set does
5587 * not include the element it is discarded */
5588 di = dictGetIterator(dv[0]);
5589
5590 while((de = dictNext(di)) != NULL) {
5591 robj *ele;
5592
5593 for (j = 1; j < setsnum; j++)
5594 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5595 if (j != setsnum)
5596 continue; /* at least one set does not contain the member */
5597 ele = dictGetEntryKey(de);
5598 if (!dstkey) {
5599 addReplyBulk(c,ele);
5600 cardinality++;
5601 } else {
5602 dictAdd(dstset->ptr,ele,NULL);
5603 incrRefCount(ele);
5604 }
5605 }
5606 dictReleaseIterator(di);
5607
5608 if (dstkey) {
5609 /* Store the resulting set into the target, if the intersection
5610 * is not an empty set. */
5611 dbDelete(c->db,dstkey);
5612 if (dictSize((dict*)dstset->ptr) > 0) {
5613 dbAdd(c->db,dstkey,dstset);
5614 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5615 } else {
5616 decrRefCount(dstset);
5617 addReply(c,shared.czero);
5618 }
5619 server.dirty++;
5620 } else {
5621 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5622 }
5623 zfree(dv);
5624 }
5625
5626 static void sinterCommand(redisClient *c) {
5627 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5628 }
5629
5630 static void sinterstoreCommand(redisClient *c) {
5631 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5632 }
5633
/* Operation selectors passed as the 'op' argument of
 * sunionDiffGenericCommand() and zunionInterGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5637
5638 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5639 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5640 dictIterator *di;
5641 dictEntry *de;
5642 robj *dstset = NULL;
5643 int j, cardinality = 0;
5644
5645 for (j = 0; j < setsnum; j++) {
5646 robj *setobj;
5647
5648 setobj = dstkey ?
5649 lookupKeyWrite(c->db,setskeys[j]) :
5650 lookupKeyRead(c->db,setskeys[j]);
5651 if (!setobj) {
5652 dv[j] = NULL;
5653 continue;
5654 }
5655 if (setobj->type != REDIS_SET) {
5656 zfree(dv);
5657 addReply(c,shared.wrongtypeerr);
5658 return;
5659 }
5660 dv[j] = setobj->ptr;
5661 }
5662
5663 /* We need a temp set object to store our union. If the dstkey
5664 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5665 * this set object will be the resulting object to set into the target key*/
5666 dstset = createSetObject();
5667
5668 /* Iterate all the elements of all the sets, add every element a single
5669 * time to the result set */
5670 for (j = 0; j < setsnum; j++) {
5671 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5672 if (!dv[j]) continue; /* non existing keys are like empty sets */
5673
5674 di = dictGetIterator(dv[j]);
5675
5676 while((de = dictNext(di)) != NULL) {
5677 robj *ele;
5678
5679 /* dictAdd will not add the same element multiple times */
5680 ele = dictGetEntryKey(de);
5681 if (op == REDIS_OP_UNION || j == 0) {
5682 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5683 incrRefCount(ele);
5684 cardinality++;
5685 }
5686 } else if (op == REDIS_OP_DIFF) {
5687 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5688 cardinality--;
5689 }
5690 }
5691 }
5692 dictReleaseIterator(di);
5693
5694 /* result set is empty? Exit asap. */
5695 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5696 }
5697
5698 /* Output the content of the resulting set, if not in STORE mode */
5699 if (!dstkey) {
5700 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5701 di = dictGetIterator(dstset->ptr);
5702 while((de = dictNext(di)) != NULL) {
5703 robj *ele;
5704
5705 ele = dictGetEntryKey(de);
5706 addReplyBulk(c,ele);
5707 }
5708 dictReleaseIterator(di);
5709 decrRefCount(dstset);
5710 } else {
5711 /* If we have a target key where to store the resulting set
5712 * create this key with the result set inside */
5713 dbDelete(c->db,dstkey);
5714 if (dictSize((dict*)dstset->ptr) > 0) {
5715 dbAdd(c->db,dstkey,dstset);
5716 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5717 } else {
5718 decrRefCount(dstset);
5719 addReply(c,shared.czero);
5720 }
5721 server.dirty++;
5722 }
5723 zfree(dv);
5724 }
5725
5726 static void sunionCommand(redisClient *c) {
5727 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5728 }
5729
5730 static void sunionstoreCommand(redisClient *c) {
5731 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5732 }
5733
5734 static void sdiffCommand(redisClient *c) {
5735 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5736 }
5737
5738 static void sdiffstoreCommand(redisClient *c) {
5739 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5740 }
5741
5742 /* ==================================== ZSets =============================== */
5743
5744 /* ZSETs are ordered sets using two data structures to hold the same elements
5745 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5746 * data structure.
5747 *
5748 * The elements are added to an hash table mapping Redis objects to scores.
5749 * At the same time the elements are added to a skip list mapping scores
5750 * to Redis objects (so objects are sorted by scores in this "view"). */
5751
5752 /* This skiplist implementation is almost a C translation of the original
5753 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5754 * Alternative to Balanced Trees", modified in three ways:
5755 * a) this implementation allows for repeated values.
5756 * b) the comparison is not just by key (our 'score') but by satellite data.
5757 * c) there is a back pointer, so it's a doubly linked list with the back
5758 * pointers being only at "level 1". This allows to traverse the list
5759 * from tail to head, useful for ZREVRANGE. */
5760
5761 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5762 zskiplistNode *zn = zmalloc(sizeof(*zn));
5763
5764 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5765 if (level > 1)
5766 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5767 else
5768 zn->span = NULL;
5769 zn->score = score;
5770 zn->obj = obj;
5771 return zn;
5772 }
5773
5774 static zskiplist *zslCreate(void) {
5775 int j;
5776 zskiplist *zsl;
5777
5778 zsl = zmalloc(sizeof(*zsl));
5779 zsl->level = 1;
5780 zsl->length = 0;
5781 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5782 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5783 zsl->header->forward[j] = NULL;
5784
5785 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5786 if (j < ZSKIPLIST_MAXLEVEL-1)
5787 zsl->header->span[j] = 0;
5788 }
5789 zsl->header->backward = NULL;
5790 zsl->tail = NULL;
5791 return zsl;
5792 }
5793
5794 static void zslFreeNode(zskiplistNode *node) {
5795 decrRefCount(node->obj);
5796 zfree(node->forward);
5797 zfree(node->span);
5798 zfree(node);
5799 }
5800
5801 static void zslFree(zskiplist *zsl) {
5802 zskiplistNode *node = zsl->header->forward[0], *next;
5803
5804 zfree(zsl->header->forward);
5805 zfree(zsl->header->span);
5806 zfree(zsl->header);
5807 while(node) {
5808 next = node->forward[0];
5809 zslFreeNode(node);
5810 node = next;
5811 }
5812 zfree(zsl);
5813 }
5814
5815 static int zslRandomLevel(void) {
5816 int level = 1;
5817 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5818 level += 1;
5819 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5820 }
5821
5822 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5823 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5824 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5825 int i, level;
5826
5827 x = zsl->header;
5828 for (i = zsl->level-1; i >= 0; i--) {
5829 /* store rank that is crossed to reach the insert position */
5830 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5831
5832 while (x->forward[i] &&
5833 (x->forward[i]->score < score ||
5834 (x->forward[i]->score == score &&
5835 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5836 rank[i] += i > 0 ? x->span[i-1] : 1;
5837 x = x->forward[i];
5838 }
5839 update[i] = x;
5840 }
5841 /* we assume the key is not already inside, since we allow duplicated
5842 * scores, and the re-insertion of score and redis object should never
5843 * happpen since the caller of zslInsert() should test in the hash table
5844 * if the element is already inside or not. */
5845 level = zslRandomLevel();
5846 if (level > zsl->level) {
5847 for (i = zsl->level; i < level; i++) {
5848 rank[i] = 0;
5849 update[i] = zsl->header;
5850 update[i]->span[i-1] = zsl->length;
5851 }
5852 zsl->level = level;
5853 }
5854 x = zslCreateNode(level,score,obj);
5855 for (i = 0; i < level; i++) {
5856 x->forward[i] = update[i]->forward[i];
5857 update[i]->forward[i] = x;
5858
5859 /* update span covered by update[i] as x is inserted here */
5860 if (i > 0) {
5861 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5862 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5863 }
5864 }
5865
5866 /* increment span for untouched levels */
5867 for (i = level; i < zsl->level; i++) {
5868 update[i]->span[i-1]++;
5869 }
5870
5871 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5872 if (x->forward[0])
5873 x->forward[0]->backward = x;
5874 else
5875 zsl->tail = x;
5876 zsl->length++;
5877 }
5878
5879 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5880 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5881 int i;
5882 for (i = 0; i < zsl->level; i++) {
5883 if (update[i]->forward[i] == x) {
5884 if (i > 0) {
5885 update[i]->span[i-1] += x->span[i-1] - 1;
5886 }
5887 update[i]->forward[i] = x->forward[i];
5888 } else {
5889 /* invariant: i > 0, because update[0]->forward[0]
5890 * is always equal to x */
5891 update[i]->span[i-1] -= 1;
5892 }
5893 }
5894 if (x->forward[0]) {
5895 x->forward[0]->backward = x->backward;
5896 } else {
5897 zsl->tail = x->backward;
5898 }
5899 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5900 zsl->level--;
5901 zsl->length--;
5902 }
5903
5904 /* Delete an element with matching score/object from the skiplist. */
5905 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5906 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5907 int i;
5908
5909 x = zsl->header;
5910 for (i = zsl->level-1; i >= 0; i--) {
5911 while (x->forward[i] &&
5912 (x->forward[i]->score < score ||
5913 (x->forward[i]->score == score &&
5914 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5915 x = x->forward[i];
5916 update[i] = x;
5917 }
5918 /* We may have multiple elements with the same score, what we need
5919 * is to find the element with both the right score and object. */
5920 x = x->forward[0];
5921 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5922 zslDeleteNode(zsl, x, update);
5923 zslFreeNode(x);
5924 return 1;
5925 } else {
5926 return 0; /* not found */
5927 }
5928 return 0; /* not found */
5929 }
5930
5931 /* Delete all the elements with score between min and max from the skiplist.
5932 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5933 * Note that this function takes the reference to the hash table view of the
5934 * sorted set, in order to remove the elements from the hash table too. */
5935 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5936 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5937 unsigned long removed = 0;
5938 int i;
5939
5940 x = zsl->header;
5941 for (i = zsl->level-1; i >= 0; i--) {
5942 while (x->forward[i] && x->forward[i]->score < min)
5943 x = x->forward[i];
5944 update[i] = x;
5945 }
5946 /* We may have multiple elements with the same score, what we need
5947 * is to find the element with both the right score and object. */
5948 x = x->forward[0];
5949 while (x && x->score <= max) {
5950 zskiplistNode *next = x->forward[0];
5951 zslDeleteNode(zsl, x, update);
5952 dictDelete(dict,x->obj);
5953 zslFreeNode(x);
5954 removed++;
5955 x = next;
5956 }
5957 return removed; /* not found */
5958 }
5959
5960 /* Delete all the elements with rank between start and end from the skiplist.
5961 * Start and end are inclusive. Note that start and end need to be 1-based */
5962 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5963 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5964 unsigned long traversed = 0, removed = 0;
5965 int i;
5966
5967 x = zsl->header;
5968 for (i = zsl->level-1; i >= 0; i--) {
5969 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5970 traversed += i > 0 ? x->span[i-1] : 1;
5971 x = x->forward[i];
5972 }
5973 update[i] = x;
5974 }
5975
5976 traversed++;
5977 x = x->forward[0];
5978 while (x && traversed <= end) {
5979 zskiplistNode *next = x->forward[0];
5980 zslDeleteNode(zsl, x, update);
5981 dictDelete(dict,x->obj);
5982 zslFreeNode(x);
5983 removed++;
5984 traversed++;
5985 x = next;
5986 }
5987 return removed;
5988 }
5989
5990 /* Find the first node having a score equal or greater than the specified one.
5991 * Returns NULL if there is no match. */
5992 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5993 zskiplistNode *x;
5994 int i;
5995
5996 x = zsl->header;
5997 for (i = zsl->level-1; i >= 0; i--) {
5998 while (x->forward[i] && x->forward[i]->score < score)
5999 x = x->forward[i];
6000 }
6001 /* We may have multiple elements with the same score, what we need
6002 * is to find the element with both the right score and object. */
6003 return x->forward[0];
6004 }
6005
6006 /* Find the rank for an element by both score and key.
6007 * Returns 0 when the element cannot be found, rank otherwise.
6008 * Note that the rank is 1-based due to the span of zsl->header to the
6009 * first element. */
6010 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
6011 zskiplistNode *x;
6012 unsigned long rank = 0;
6013 int i;
6014
6015 x = zsl->header;
6016 for (i = zsl->level-1; i >= 0; i--) {
6017 while (x->forward[i] &&
6018 (x->forward[i]->score < score ||
6019 (x->forward[i]->score == score &&
6020 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
6021 rank += i > 0 ? x->span[i-1] : 1;
6022 x = x->forward[i];
6023 }
6024
6025 /* x might be equal to zsl->header, so test if obj is non-NULL */
6026 if (x->obj && equalStringObjects(x->obj,o)) {
6027 return rank;
6028 }
6029 }
6030 return 0;
6031 }
6032
6033 /* Finds an element by its rank. The rank argument needs to be 1-based. */
6034 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
6035 zskiplistNode *x;
6036 unsigned long traversed = 0;
6037 int i;
6038
6039 x = zsl->header;
6040 for (i = zsl->level-1; i >= 0; i--) {
6041 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6042 {
6043 traversed += i > 0 ? x->span[i-1] : 1;
6044 x = x->forward[i];
6045 }
6046 if (traversed == rank) {
6047 return x;
6048 }
6049 }
6050 return NULL;
6051 }
6052
6053 /* The actual Z-commands implementations */
6054
6055 /* This generic command implements both ZADD and ZINCRBY.
6056 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
6057 * the increment if the operation is a ZINCRBY (doincrement == 1). */
6058 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
6059 robj *zsetobj;
6060 zset *zs;
6061 double *score;
6062
6063 if (isnan(scoreval)) {
6064 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6065 return;
6066 }
6067
6068 zsetobj = lookupKeyWrite(c->db,key);
6069 if (zsetobj == NULL) {
6070 zsetobj = createZsetObject();
6071 dbAdd(c->db,key,zsetobj);
6072 } else {
6073 if (zsetobj->type != REDIS_ZSET) {
6074 addReply(c,shared.wrongtypeerr);
6075 return;
6076 }
6077 }
6078 zs = zsetobj->ptr;
6079
6080 /* Ok now since we implement both ZADD and ZINCRBY here the code
6081 * needs to handle the two different conditions. It's all about setting
6082 * '*score', that is, the new score to set, to the right value. */
6083 score = zmalloc(sizeof(double));
6084 if (doincrement) {
6085 dictEntry *de;
6086
6087 /* Read the old score. If the element was not present starts from 0 */
6088 de = dictFind(zs->dict,ele);
6089 if (de) {
6090 double *oldscore = dictGetEntryVal(de);
6091 *score = *oldscore + scoreval;
6092 } else {
6093 *score = scoreval;
6094 }
6095 if (isnan(*score)) {
6096 addReplySds(c,
6097 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6098 zfree(score);
6099 /* Note that we don't need to check if the zset may be empty and
6100 * should be removed here, as we can only obtain Nan as score if
6101 * there was already an element in the sorted set. */
6102 return;
6103 }
6104 } else {
6105 *score = scoreval;
6106 }
6107
6108 /* What follows is a simple remove and re-insert operation that is common
6109 * to both ZADD and ZINCRBY... */
6110 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
6111 /* case 1: New element */
6112 incrRefCount(ele); /* added to hash */
6113 zslInsert(zs->zsl,*score,ele);
6114 incrRefCount(ele); /* added to skiplist */
6115 server.dirty++;
6116 if (doincrement)
6117 addReplyDouble(c,*score);
6118 else
6119 addReply(c,shared.cone);
6120 } else {
6121 dictEntry *de;
6122 double *oldscore;
6123
6124 /* case 2: Score update operation */
6125 de = dictFind(zs->dict,ele);
6126 redisAssert(de != NULL);
6127 oldscore = dictGetEntryVal(de);
6128 if (*score != *oldscore) {
6129 int deleted;
6130
6131 /* Remove and insert the element in the skip list with new score */
6132 deleted = zslDelete(zs->zsl,*oldscore,ele);
6133 redisAssert(deleted != 0);
6134 zslInsert(zs->zsl,*score,ele);
6135 incrRefCount(ele);
6136 /* Update the score in the hash table */
6137 dictReplace(zs->dict,ele,score);
6138 server.dirty++;
6139 } else {
6140 zfree(score);
6141 }
6142 if (doincrement)
6143 addReplyDouble(c,*score);
6144 else
6145 addReply(c,shared.czero);
6146 }
6147 }
6148
6149 static void zaddCommand(redisClient *c) {
6150 double scoreval;
6151
6152 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6153 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6154 }
6155
6156 static void zincrbyCommand(redisClient *c) {
6157 double scoreval;
6158
6159 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
6160 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6161 }
6162
6163 static void zremCommand(redisClient *c) {
6164 robj *zsetobj;
6165 zset *zs;
6166 dictEntry *de;
6167 double *oldscore;
6168 int deleted;
6169
6170 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6171 checkType(c,zsetobj,REDIS_ZSET)) return;
6172
6173 zs = zsetobj->ptr;
6174 de = dictFind(zs->dict,c->argv[2]);
6175 if (de == NULL) {
6176 addReply(c,shared.czero);
6177 return;
6178 }
6179 /* Delete from the skiplist */
6180 oldscore = dictGetEntryVal(de);
6181 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6182 redisAssert(deleted != 0);
6183
6184 /* Delete from the hash table */
6185 dictDelete(zs->dict,c->argv[2]);
6186 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6187 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6188 server.dirty++;
6189 addReply(c,shared.cone);
6190 }
6191
6192 static void zremrangebyscoreCommand(redisClient *c) {
6193 double min;
6194 double max;
6195 long deleted;
6196 robj *zsetobj;
6197 zset *zs;
6198
6199 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6200 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
6201
6202 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6203 checkType(c,zsetobj,REDIS_ZSET)) return;
6204
6205 zs = zsetobj->ptr;
6206 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6207 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6208 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6209 server.dirty += deleted;
6210 addReplyLongLong(c,deleted);
6211 }
6212
6213 static void zremrangebyrankCommand(redisClient *c) {
6214 long start;
6215 long end;
6216 int llen;
6217 long deleted;
6218 robj *zsetobj;
6219 zset *zs;
6220
6221 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6222 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6223
6224 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6225 checkType(c,zsetobj,REDIS_ZSET)) return;
6226 zs = zsetobj->ptr;
6227 llen = zs->zsl->length;
6228
6229 /* convert negative indexes */
6230 if (start < 0) start = llen+start;
6231 if (end < 0) end = llen+end;
6232 if (start < 0) start = 0;
6233 if (end < 0) end = 0;
6234
6235 /* indexes sanity checks */
6236 if (start > end || start >= llen) {
6237 addReply(c,shared.czero);
6238 return;
6239 }
6240 if (end >= llen) end = llen-1;
6241
6242 /* increment start and end because zsl*Rank functions
6243 * use 1-based rank */
6244 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6245 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
6246 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
6247 server.dirty += deleted;
6248 addReplyLongLong(c, deleted);
6249 }
6250
/* One input of ZUNIONSTORE/ZINTERSTORE, as used by
 * zunionInterGenericCommand(). */
typedef struct {
    dict *dict;     /* hash table of the source set/sorted set, NULL when the key is missing */
    double weight;  /* factor every score of this input is multiplied by, 1.0 by default */
} zsetopsrc;
6255
6256 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6257 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6258 unsigned long size1, size2;
6259 size1 = d1->dict ? dictSize(d1->dict) : 0;
6260 size2 = d2->dict ? dictSize(d2->dict) : 0;
6261 return size1 - size2;
6262 }
6263
/* Aggregation modes accepted by zunionInterAggregate(); they are selected
 * by the AGGREGATE option parsed in zunionInterGenericCommand(). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score of a dict entry: the pointed double, or 1.0 when the entry has a
 * NULL value. Note that the argument is evaluated more than once. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
6268
6269 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6270 if (aggregate == REDIS_AGGR_SUM) {
6271 *target = *target + val;
6272 } else if (aggregate == REDIS_AGGR_MIN) {
6273 *target = val < *target ? val : *target;
6274 } else if (aggregate == REDIS_AGGR_MAX) {
6275 *target = val > *target ? val : *target;
6276 } else {
6277 /* safety net */
6278 redisPanic("Unknown ZUNION/INTER aggregate type");
6279 }
6280 }
6281
6282 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6283 int i, j, setnum;
6284 int aggregate = REDIS_AGGR_SUM;
6285 zsetopsrc *src;
6286 robj *dstobj;
6287 zset *dstzset;
6288 dictIterator *di;
6289 dictEntry *de;
6290
6291 /* expect setnum input keys to be given */
6292 setnum = atoi(c->argv[2]->ptr);
6293 if (setnum < 1) {
6294 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6295 return;
6296 }
6297
6298 /* test if the expected number of keys would overflow */
6299 if (3+setnum > c->argc) {
6300 addReply(c,shared.syntaxerr);
6301 return;
6302 }
6303
6304 /* read keys to be used for input */
6305 src = zmalloc(sizeof(zsetopsrc) * setnum);
6306 for (i = 0, j = 3; i < setnum; i++, j++) {
6307 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6308 if (!obj) {
6309 src[i].dict = NULL;
6310 } else {
6311 if (obj->type == REDIS_ZSET) {
6312 src[i].dict = ((zset*)obj->ptr)->dict;
6313 } else if (obj->type == REDIS_SET) {
6314 src[i].dict = (obj->ptr);
6315 } else {
6316 zfree(src);
6317 addReply(c,shared.wrongtypeerr);
6318 return;
6319 }
6320 }
6321
6322 /* default all weights to 1 */
6323 src[i].weight = 1.0;
6324 }
6325
6326 /* parse optional extra arguments */
6327 if (j < c->argc) {
6328 int remaining = c->argc - j;
6329
6330 while (remaining) {
6331 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6332 j++; remaining--;
6333 for (i = 0; i < setnum; i++, j++, remaining--) {
6334 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6335 return;
6336 }
6337 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6338 j++; remaining--;
6339 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6340 aggregate = REDIS_AGGR_SUM;
6341 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6342 aggregate = REDIS_AGGR_MIN;
6343 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6344 aggregate = REDIS_AGGR_MAX;
6345 } else {
6346 zfree(src);
6347 addReply(c,shared.syntaxerr);
6348 return;
6349 }
6350 j++; remaining--;
6351 } else {
6352 zfree(src);
6353 addReply(c,shared.syntaxerr);
6354 return;
6355 }
6356 }
6357 }
6358
6359 /* sort sets from the smallest to largest, this will improve our
6360 * algorithm's performance */
6361 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6362
6363 dstobj = createZsetObject();
6364 dstzset = dstobj->ptr;
6365
6366 if (op == REDIS_OP_INTER) {
6367 /* skip going over all entries if the smallest zset is NULL or empty */
6368 if (src[0].dict && dictSize(src[0].dict) > 0) {
6369 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6370 * from small to large, all src[i > 0].dict are non-empty too */
6371 di = dictGetIterator(src[0].dict);
6372 while((de = dictNext(di)) != NULL) {
6373 double *score = zmalloc(sizeof(double)), value;
6374 *score = src[0].weight * zunionInterDictValue(de);
6375
6376 for (j = 1; j < setnum; j++) {
6377 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6378 if (other) {
6379 value = src[j].weight * zunionInterDictValue(other);
6380 zunionInterAggregate(score, value, aggregate);
6381 } else {
6382 break;
6383 }
6384 }
6385
6386 /* skip entry when not present in every source dict */
6387 if (j != setnum) {
6388 zfree(score);
6389 } else {
6390 robj *o = dictGetEntryKey(de);
6391 dictAdd(dstzset->dict,o,score);
6392 incrRefCount(o); /* added to dictionary */
6393 zslInsert(dstzset->zsl,*score,o);
6394 incrRefCount(o); /* added to skiplist */
6395 }
6396 }
6397 dictReleaseIterator(di);
6398 }
6399 } else if (op == REDIS_OP_UNION) {
6400 for (i = 0; i < setnum; i++) {
6401 if (!src[i].dict) continue;
6402
6403 di = dictGetIterator(src[i].dict);
6404 while((de = dictNext(di)) != NULL) {
6405 /* skip key when already processed */
6406 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6407
6408 double *score = zmalloc(sizeof(double)), value;
6409 *score = src[i].weight * zunionInterDictValue(de);
6410
6411 /* because the zsets are sorted by size, its only possible
6412 * for sets at larger indices to hold this entry */
6413 for (j = (i+1); j < setnum; j++) {
6414 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6415 if (other) {
6416 value = src[j].weight * zunionInterDictValue(other);
6417 zunionInterAggregate(score, value, aggregate);
6418 }
6419 }
6420
6421 robj *o = dictGetEntryKey(de);
6422 dictAdd(dstzset->dict,o,score);
6423 incrRefCount(o); /* added to dictionary */
6424 zslInsert(dstzset->zsl,*score,o);
6425 incrRefCount(o); /* added to skiplist */
6426 }
6427 dictReleaseIterator(di);
6428 }
6429 } else {
6430 /* unknown operator */
6431 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6432 }
6433
6434 dbDelete(c->db,dstkey);
6435 if (dstzset->zsl->length) {
6436 dbAdd(c->db,dstkey,dstobj);
6437 addReplyLongLong(c, dstzset->zsl->length);
6438 server.dirty++;
6439 } else {
6440 decrRefCount(dstobj);
6441 addReply(c, shared.czero);
6442 }
6443 zfree(src);
6444 }
6445
6446 static void zunionstoreCommand(redisClient *c) {
6447 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6448 }
6449
6450 static void zinterstoreCommand(redisClient *c) {
6451 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6452 }
6453
6454 static void zrangeGenericCommand(redisClient *c, int reverse) {
6455 robj *o;
6456 long start;
6457 long end;
6458 int withscores = 0;
6459 int llen;
6460 int rangelen, j;
6461 zset *zsetobj;
6462 zskiplist *zsl;
6463 zskiplistNode *ln;
6464 robj *ele;
6465
6466 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6467 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6468
6469 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6470 withscores = 1;
6471 } else if (c->argc >= 5) {
6472 addReply(c,shared.syntaxerr);
6473 return;
6474 }
6475
6476 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6477 || checkType(c,o,REDIS_ZSET)) return;
6478 zsetobj = o->ptr;
6479 zsl = zsetobj->zsl;
6480 llen = zsl->length;
6481
6482 /* convert negative indexes */
6483 if (start < 0) start = llen+start;
6484 if (end < 0) end = llen+end;
6485 if (start < 0) start = 0;
6486 if (end < 0) end = 0;
6487
6488 /* indexes sanity checks */
6489 if (start > end || start >= llen) {
6490 /* Out of range start or start > end result in empty list */
6491 addReply(c,shared.emptymultibulk);
6492 return;
6493 }
6494 if (end >= llen) end = llen-1;
6495 rangelen = (end-start)+1;
6496
6497 /* check if starting point is trivial, before searching
6498 * the element in log(N) time */
6499 if (reverse) {
6500 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6501 } else {
6502 ln = start == 0 ?
6503 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6504 }
6505
6506 /* Return the result in form of a multi-bulk reply */
6507 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6508 withscores ? (rangelen*2) : rangelen));
6509 for (j = 0; j < rangelen; j++) {
6510 ele = ln->obj;
6511 addReplyBulk(c,ele);
6512 if (withscores)
6513 addReplyDouble(c,ln->score);
6514 ln = reverse ? ln->backward : ln->forward[0];
6515 }
6516 }
6517
6518 static void zrangeCommand(redisClient *c) {
6519 zrangeGenericCommand(c,0);
6520 }
6521
6522 static void zrevrangeCommand(redisClient *c) {
6523 zrangeGenericCommand(c,1);
6524 }
6525
6526 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6527 * If justcount is non-zero, just the count is returned. */
6528 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6529 robj *o;
6530 double min, max;
6531 int minex = 0, maxex = 0; /* are min or max exclusive? */
6532 int offset = 0, limit = -1;
6533 int withscores = 0;
6534 int badsyntax = 0;
6535
6536 /* Parse the min-max interval. If one of the values is prefixed
6537 * by the "(" character, it's considered "open". For instance
6538 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6539 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6540 if (((char*)c->argv[2]->ptr)[0] == '(') {
6541 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6542 minex = 1;
6543 } else {
6544 min = strtod(c->argv[2]->ptr,NULL);
6545 }
6546 if (((char*)c->argv[3]->ptr)[0] == '(') {
6547 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6548 maxex = 1;
6549 } else {
6550 max = strtod(c->argv[3]->ptr,NULL);
6551 }
6552
6553 /* Parse "WITHSCORES": note that if the command was called with
6554 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6555 * enter the following paths to parse WITHSCORES and LIMIT. */
6556 if (c->argc == 5 || c->argc == 8) {
6557 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6558 withscores = 1;
6559 else
6560 badsyntax = 1;
6561 }
6562 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6563 badsyntax = 1;
6564 if (badsyntax) {
6565 addReplySds(c,
6566 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6567 return;
6568 }
6569
6570 /* Parse "LIMIT" */
6571 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6572 addReply(c,shared.syntaxerr);
6573 return;
6574 } else if (c->argc == (7 + withscores)) {
6575 offset = atoi(c->argv[5]->ptr);
6576 limit = atoi(c->argv[6]->ptr);
6577 if (offset < 0) offset = 0;
6578 }
6579
6580 /* Ok, lookup the key and get the range */
6581 o = lookupKeyRead(c->db,c->argv[1]);
6582 if (o == NULL) {
6583 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6584 } else {
6585 if (o->type != REDIS_ZSET) {
6586 addReply(c,shared.wrongtypeerr);
6587 } else {
6588 zset *zsetobj = o->ptr;
6589 zskiplist *zsl = zsetobj->zsl;
6590 zskiplistNode *ln;
6591 robj *ele, *lenobj = NULL;
6592 unsigned long rangelen = 0;
6593
6594 /* Get the first node with the score >= min, or with
6595 * score > min if 'minex' is true. */
6596 ln = zslFirstWithScore(zsl,min);
6597 while (minex && ln && ln->score == min) ln = ln->forward[0];
6598
6599 if (ln == NULL) {
6600 /* No element matching the speciifed interval */
6601 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6602 return;
6603 }
6604
6605 /* We don't know in advance how many matching elements there
6606 * are in the list, so we push this object that will represent
6607 * the multi-bulk length in the output buffer, and will "fix"
6608 * it later */
6609 if (!justcount) {
6610 lenobj = createObject(REDIS_STRING,NULL);
6611 addReply(c,lenobj);
6612 decrRefCount(lenobj);
6613 }
6614
6615 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6616 if (offset) {
6617 offset--;
6618 ln = ln->forward[0];
6619 continue;
6620 }
6621 if (limit == 0) break;
6622 if (!justcount) {
6623 ele = ln->obj;
6624 addReplyBulk(c,ele);
6625 if (withscores)
6626 addReplyDouble(c,ln->score);
6627 }
6628 ln = ln->forward[0];
6629 rangelen++;
6630 if (limit > 0) limit--;
6631 }
6632 if (justcount) {
6633 addReplyLongLong(c,(long)rangelen);
6634 } else {
6635 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6636 withscores ? (rangelen*2) : rangelen);
6637 }
6638 }
6639 }
6640 }
6641
6642 static void zrangebyscoreCommand(redisClient *c) {
6643 genericZrangebyscoreCommand(c,0);
6644 }
6645
6646 static void zcountCommand(redisClient *c) {
6647 genericZrangebyscoreCommand(c,1);
6648 }
6649
6650 static void zcardCommand(redisClient *c) {
6651 robj *o;
6652 zset *zs;
6653
6654 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6655 checkType(c,o,REDIS_ZSET)) return;
6656
6657 zs = o->ptr;
6658 addReplyUlong(c,zs->zsl->length);
6659 }
6660
6661 static void zscoreCommand(redisClient *c) {
6662 robj *o;
6663 zset *zs;
6664 dictEntry *de;
6665
6666 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6667 checkType(c,o,REDIS_ZSET)) return;
6668
6669 zs = o->ptr;
6670 de = dictFind(zs->dict,c->argv[2]);
6671 if (!de) {
6672 addReply(c,shared.nullbulk);
6673 } else {
6674 double *score = dictGetEntryVal(de);
6675
6676 addReplyDouble(c,*score);
6677 }
6678 }
6679
6680 static void zrankGenericCommand(redisClient *c, int reverse) {
6681 robj *o;
6682 zset *zs;
6683 zskiplist *zsl;
6684 dictEntry *de;
6685 unsigned long rank;
6686 double *score;
6687
6688 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6689 checkType(c,o,REDIS_ZSET)) return;
6690
6691 zs = o->ptr;
6692 zsl = zs->zsl;
6693 de = dictFind(zs->dict,c->argv[2]);
6694 if (!de) {
6695 addReply(c,shared.nullbulk);
6696 return;
6697 }
6698
6699 score = dictGetEntryVal(de);
6700 rank = zslGetRank(zsl, *score, c->argv[2]);
6701 if (rank) {
6702 if (reverse) {
6703 addReplyLongLong(c, zsl->length - rank);
6704 } else {
6705 addReplyLongLong(c, rank-1);
6706 }
6707 } else {
6708 addReply(c,shared.nullbulk);
6709 }
6710 }
6711
6712 static void zrankCommand(redisClient *c) {
6713 zrankGenericCommand(c, 0);
6714 }
6715
6716 static void zrevrankCommand(redisClient *c) {
6717 zrankGenericCommand(c, 1);
6718 }
6719
6720 /* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which part of a hash entry is wanted. They are
 * tested with '&' and may be OR-ed together to request both parts. */
#define REDIS_HASH_KEY (1<<0)
#define REDIS_HASH_VALUE (1<<1)
6723
6724 /* Check the length of a number of objects to see if we need to convert a
6725 * zipmap to a real hash. Note that we only check string encoded objects
6726 * as their string length can be queried in constant time. */
6727 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6728 int i;
6729 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6730
6731 for (i = start; i <= end; i++) {
6732 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6733 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6734 {
6735 convertToRealHash(subject);
6736 return;
6737 }
6738 }
6739 }
6740
6741 /* Encode given objects in-place when the hash uses a dict. */
6742 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6743 if (subject->encoding == REDIS_ENCODING_HT) {
6744 if (o1) *o1 = tryObjectEncoding(*o1);
6745 if (o2) *o2 = tryObjectEncoding(*o2);
6746 }
6747 }
6748
6749 /* Get the value from a hash identified by key. Returns either a string
6750 * object or NULL if the value cannot be found. The refcount of the object
6751 * is always increased by 1 when the value was found. */
6752 static robj *hashGet(robj *o, robj *key) {
6753 robj *value = NULL;
6754 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6755 unsigned char *v;
6756 unsigned int vlen;
6757 key = getDecodedObject(key);
6758 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6759 value = createStringObject((char*)v,vlen);
6760 }
6761 decrRefCount(key);
6762 } else {
6763 dictEntry *de = dictFind(o->ptr,key);
6764 if (de != NULL) {
6765 value = dictGetEntryVal(de);
6766 incrRefCount(value);
6767 }
6768 }
6769 return value;
6770 }
6771
6772 /* Test if the key exists in the given hash. Returns 1 if the key
6773 * exists and 0 when it doesn't. */
6774 static int hashExists(robj *o, robj *key) {
6775 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6776 key = getDecodedObject(key);
6777 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6778 decrRefCount(key);
6779 return 1;
6780 }
6781 decrRefCount(key);
6782 } else {
6783 if (dictFind(o->ptr,key) != NULL) {
6784 return 1;
6785 }
6786 }
6787 return 0;
6788 }
6789
6790 /* Add an element, discard the old if the key already exists.
6791 * Return 0 on insert and 1 on update. */
6792 static int hashSet(robj *o, robj *key, robj *value) {
6793 int update = 0;
6794 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6795 key = getDecodedObject(key);
6796 value = getDecodedObject(value);
6797 o->ptr = zipmapSet(o->ptr,
6798 key->ptr,sdslen(key->ptr),
6799 value->ptr,sdslen(value->ptr), &update);
6800 decrRefCount(key);
6801 decrRefCount(value);
6802
6803 /* Check if the zipmap needs to be upgraded to a real hash table */
6804 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6805 convertToRealHash(o);
6806 } else {
6807 if (dictReplace(o->ptr,key,value)) {
6808 /* Insert */
6809 incrRefCount(key);
6810 } else {
6811 /* Update */
6812 update = 1;
6813 }
6814 incrRefCount(value);
6815 }
6816 return update;
6817 }
6818
6819 /* Delete an element from a hash.
6820 * Return 1 on deleted and 0 on not found. */
6821 static int hashDelete(robj *o, robj *key) {
6822 int deleted = 0;
6823 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6824 key = getDecodedObject(key);
6825 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6826 decrRefCount(key);
6827 } else {
6828 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6829 /* Always check if the dictionary needs a resize after a delete. */
6830 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6831 }
6832 return deleted;
6833 }
6834
6835 /* Return the number of elements in a hash. */
6836 static unsigned long hashLength(robj *o) {
6837 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6838 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6839 }
6840
6841 /* Structure to hold hash iteration abstration. Note that iteration over
6842 * hashes involves both fields and values. Because it is possible that
6843 * not both are required, store pointers in the iterator to avoid
6844 * unnecessary memory allocation for fields/values. */
6845 typedef struct {
6846 int encoding;
6847 unsigned char *zi;
6848 unsigned char *zk, *zv;
6849 unsigned int zklen, zvlen;
6850
6851 dictIterator *di;
6852 dictEntry *de;
6853 } hashIterator;
6854
6855 static hashIterator *hashInitIterator(robj *subject) {
6856 hashIterator *hi = zmalloc(sizeof(hashIterator));
6857 hi->encoding = subject->encoding;
6858 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6859 hi->zi = zipmapRewind(subject->ptr);
6860 } else if (hi->encoding == REDIS_ENCODING_HT) {
6861 hi->di = dictGetIterator(subject->ptr);
6862 } else {
6863 redisAssert(NULL);
6864 }
6865 return hi;
6866 }
6867
6868 static void hashReleaseIterator(hashIterator *hi) {
6869 if (hi->encoding == REDIS_ENCODING_HT) {
6870 dictReleaseIterator(hi->di);
6871 }
6872 zfree(hi);
6873 }
6874
6875 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6876 * could be found and REDIS_ERR when the iterator reaches the end. */
6877 static int hashNext(hashIterator *hi) {
6878 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6879 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6880 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6881 } else {
6882 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6883 }
6884 return REDIS_OK;
6885 }
6886
6887 /* Get key or value object at current iteration position.
6888 * This increases the refcount of the field object by 1. */
6889 static robj *hashCurrent(hashIterator *hi, int what) {
6890 robj *o;
6891 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6892 if (what & REDIS_HASH_KEY) {
6893 o = createStringObject((char*)hi->zk,hi->zklen);
6894 } else {
6895 o = createStringObject((char*)hi->zv,hi->zvlen);
6896 }
6897 } else {
6898 if (what & REDIS_HASH_KEY) {
6899 o = dictGetEntryKey(hi->de);
6900 } else {
6901 o = dictGetEntryVal(hi->de);
6902 }
6903 incrRefCount(o);
6904 }
6905 return o;
6906 }
6907
6908 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6909 robj *o = lookupKeyWrite(c->db,key);
6910 if (o == NULL) {
6911 o = createHashObject();
6912 dbAdd(c->db,key,o);
6913 } else {
6914 if (o->type != REDIS_HASH) {
6915 addReply(c,shared.wrongtypeerr);
6916 return NULL;
6917 }
6918 }
6919 return o;
6920 }
6921
6922 /* ============================= Hash commands ============================== */
6923 static void hsetCommand(redisClient *c) {
6924 int update;
6925 robj *o;
6926
6927 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6928 hashTryConversion(o,c->argv,2,3);
6929 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6930 update = hashSet(o,c->argv[2],c->argv[3]);
6931 addReply(c, update ? shared.czero : shared.cone);
6932 server.dirty++;
6933 }
6934
6935 static void hsetnxCommand(redisClient *c) {
6936 robj *o;
6937 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6938 hashTryConversion(o,c->argv,2,3);
6939
6940 if (hashExists(o, c->argv[2])) {
6941 addReply(c, shared.czero);
6942 } else {
6943 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6944 hashSet(o,c->argv[2],c->argv[3]);
6945 addReply(c, shared.cone);
6946 server.dirty++;
6947 }
6948 }
6949
6950 static void hmsetCommand(redisClient *c) {
6951 int i;
6952 robj *o;
6953
6954 if ((c->argc % 2) == 1) {
6955 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6956 return;
6957 }
6958
6959 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6960 hashTryConversion(o,c->argv,2,c->argc-1);
6961 for (i = 2; i < c->argc; i += 2) {
6962 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6963 hashSet(o,c->argv[i],c->argv[i+1]);
6964 }
6965 addReply(c, shared.ok);
6966 server.dirty++;
6967 }
6968
6969 static void hincrbyCommand(redisClient *c) {
6970 long long value, incr;
6971 robj *o, *current, *new;
6972
6973 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6974 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6975 if ((current = hashGet(o,c->argv[2])) != NULL) {
6976 if (getLongLongFromObjectOrReply(c,current,&value,
6977 "hash value is not an integer") != REDIS_OK) {
6978 decrRefCount(current);
6979 return;
6980 }
6981 decrRefCount(current);
6982 } else {
6983 value = 0;
6984 }
6985
6986 value += incr;
6987 new = createStringObjectFromLongLong(value);
6988 hashTryObjectEncoding(o,&c->argv[2],NULL);
6989 hashSet(o,c->argv[2],new);
6990 decrRefCount(new);
6991 addReplyLongLong(c,value);
6992 server.dirty++;
6993 }
6994
6995 static void hgetCommand(redisClient *c) {
6996 robj *o, *value;
6997 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6998 checkType(c,o,REDIS_HASH)) return;
6999
7000 if ((value = hashGet(o,c->argv[2])) != NULL) {
7001 addReplyBulk(c,value);
7002 decrRefCount(value);
7003 } else {
7004 addReply(c,shared.nullbulk);
7005 }
7006 }
7007
7008 static void hmgetCommand(redisClient *c) {
7009 int i;
7010 robj *o, *value;
7011 o = lookupKeyRead(c->db,c->argv[1]);
7012 if (o != NULL && o->type != REDIS_HASH) {
7013 addReply(c,shared.wrongtypeerr);
7014 }
7015
7016 /* Note the check for o != NULL happens inside the loop. This is
7017 * done because objects that cannot be found are considered to be
7018 * an empty hash. The reply should then be a series of NULLs. */
7019 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7020 for (i = 2; i < c->argc; i++) {
7021 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
7022 addReplyBulk(c,value);
7023 decrRefCount(value);
7024 } else {
7025 addReply(c,shared.nullbulk);
7026 }
7027 }
7028 }
7029
7030 static void hdelCommand(redisClient *c) {
7031 robj *o;
7032 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7033 checkType(c,o,REDIS_HASH)) return;
7034
7035 if (hashDelete(o,c->argv[2])) {
7036 if (hashLength(o) == 0) dbDelete(c->db,c->argv[1]);
7037 addReply(c,shared.cone);
7038 server.dirty++;
7039 } else {
7040 addReply(c,shared.czero);
7041 }
7042 }
7043
7044 static void hlenCommand(redisClient *c) {
7045 robj *o;
7046 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7047 checkType(c,o,REDIS_HASH)) return;
7048
7049 addReplyUlong(c,hashLength(o));
7050 }
7051
7052 static void genericHgetallCommand(redisClient *c, int flags) {
7053 robj *o, *lenobj, *obj;
7054 unsigned long count = 0;
7055 hashIterator *hi;
7056
7057 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
7058 || checkType(c,o,REDIS_HASH)) return;
7059
7060 lenobj = createObject(REDIS_STRING,NULL);
7061 addReply(c,lenobj);
7062 decrRefCount(lenobj);
7063
7064 hi = hashInitIterator(o);
7065 while (hashNext(hi) != REDIS_ERR) {
7066 if (flags & REDIS_HASH_KEY) {
7067 obj = hashCurrent(hi,REDIS_HASH_KEY);
7068 addReplyBulk(c,obj);
7069 decrRefCount(obj);
7070 count++;
7071 }
7072 if (flags & REDIS_HASH_VALUE) {
7073 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7074 addReplyBulk(c,obj);
7075 decrRefCount(obj);
7076 count++;
7077 }
7078 }
7079 hashReleaseIterator(hi);
7080
7081 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7082 }
7083
7084 static void hkeysCommand(redisClient *c) {
7085 genericHgetallCommand(c,REDIS_HASH_KEY);
7086 }
7087
7088 static void hvalsCommand(redisClient *c) {
7089 genericHgetallCommand(c,REDIS_HASH_VALUE);
7090 }
7091
7092 static void hgetallCommand(redisClient *c) {
7093 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
7094 }
7095
7096 static void hexistsCommand(redisClient *c) {
7097 robj *o;
7098 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7099 checkType(c,o,REDIS_HASH)) return;
7100
7101 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
7102 }
7103
7104 static void convertToRealHash(robj *o) {
7105 unsigned char *key, *val, *p, *zm = o->ptr;
7106 unsigned int klen, vlen;
7107 dict *dict = dictCreate(&hashDictType,NULL);
7108
7109 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7110 p = zipmapRewind(zm);
7111 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7112 robj *keyobj, *valobj;
7113
7114 keyobj = createStringObject((char*)key,klen);
7115 valobj = createStringObject((char*)val,vlen);
7116 keyobj = tryObjectEncoding(keyobj);
7117 valobj = tryObjectEncoding(valobj);
7118 dictAdd(dict,keyobj,valobj);
7119 }
7120 o->encoding = REDIS_ENCODING_HT;
7121 o->ptr = dict;
7122 zfree(zm);
7123 }
7124
7125 /* ========================= Non type-specific commands ==================== */
7126
7127 static void flushdbCommand(redisClient *c) {
7128 server.dirty += dictSize(c->db->dict);
7129 touchWatchedKeysOnFlush(c->db->id);
7130 dictEmpty(c->db->dict);
7131 dictEmpty(c->db->expires);
7132 addReply(c,shared.ok);
7133 }
7134
7135 static void flushallCommand(redisClient *c) {
7136 touchWatchedKeysOnFlush(-1);
7137 server.dirty += emptyDb();
7138 addReply(c,shared.ok);
7139 if (server.bgsavechildpid != -1) {
7140 kill(server.bgsavechildpid,SIGKILL);
7141 rdbRemoveTempFile(server.bgsavechildpid);
7142 }
7143 rdbSave(server.dbfilename);
7144 server.dirty++;
7145 }
7146
7147 static redisSortOperation *createSortOperation(int type, robj *pattern) {
7148 redisSortOperation *so = zmalloc(sizeof(*so));
7149 so->type = type;
7150 so->pattern = pattern;
7151 return so;
7152 }
7153
7154 /* Return the value associated to the key with a name obtained
7155 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7156 * The returned object will always have its refcount increased by 1
7157 * when it is non-NULL. */
7158 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
7159 char *p, *f;
7160 sds spat, ssub;
7161 robj keyobj, fieldobj, *o;
7162 int prefixlen, sublen, postfixlen, fieldlen;
7163 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7164 struct {
7165 long len;
7166 long free;
7167 char buf[REDIS_SORTKEY_MAX+1];
7168 } keyname, fieldname;
7169
7170 /* If the pattern is "#" return the substitution object itself in order
7171 * to implement the "SORT ... GET #" feature. */
7172 spat = pattern->ptr;
7173 if (spat[0] == '#' && spat[1] == '\0') {
7174 incrRefCount(subst);
7175 return subst;
7176 }
7177
7178 /* The substitution object may be specially encoded. If so we create
7179 * a decoded object on the fly. Otherwise getDecodedObject will just
7180 * increment the ref count, that we'll decrement later. */
7181 subst = getDecodedObject(subst);
7182
7183 ssub = subst->ptr;
7184 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7185 p = strchr(spat,'*');
7186 if (!p) {
7187 decrRefCount(subst);
7188 return NULL;
7189 }
7190
7191 /* Find out if we're dealing with a hash dereference. */
7192 if ((f = strstr(p+1, "->")) != NULL) {
7193 fieldlen = sdslen(spat)-(f-spat);
7194 /* this also copies \0 character */
7195 memcpy(fieldname.buf,f+2,fieldlen-1);
7196 fieldname.len = fieldlen-2;
7197 } else {
7198 fieldlen = 0;
7199 }
7200
7201 prefixlen = p-spat;
7202 sublen = sdslen(ssub);
7203 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
7204 memcpy(keyname.buf,spat,prefixlen);
7205 memcpy(keyname.buf+prefixlen,ssub,sublen);
7206 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7207 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7208 keyname.len = prefixlen+sublen+postfixlen;
7209 decrRefCount(subst);
7210
7211 /* Lookup substituted key */
7212 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7213 o = lookupKeyRead(db,&keyobj);
7214 if (o == NULL) return NULL;
7215
7216 if (fieldlen > 0) {
7217 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
7218
7219 /* Retrieve value from hash by the field name. This operation
7220 * already increases the refcount of the returned object. */
7221 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7222 o = hashGet(o, &fieldobj);
7223 } else {
7224 if (o->type != REDIS_STRING) return NULL;
7225
7226 /* Every object that this function returns needs to have its refcount
7227 * increased. sortCommand decreases it again. */
7228 incrRefCount(o);
7229 }
7230
7231 return o;
7232 }
7233
7234 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7235 * the additional parameter is not standard but a BSD-specific we have to
7236 * pass sorting parameters via the global 'server' structure */
7237 static int sortCompare(const void *s1, const void *s2) {
7238 const redisSortObject *so1 = s1, *so2 = s2;
7239 int cmp;
7240
7241 if (!server.sort_alpha) {
7242 /* Numeric sorting. Here it's trivial as we precomputed scores */
7243 if (so1->u.score > so2->u.score) {
7244 cmp = 1;
7245 } else if (so1->u.score < so2->u.score) {
7246 cmp = -1;
7247 } else {
7248 cmp = 0;
7249 }
7250 } else {
7251 /* Alphanumeric sorting */
7252 if (server.sort_bypattern) {
7253 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7254 /* At least one compare object is NULL */
7255 if (so1->u.cmpobj == so2->u.cmpobj)
7256 cmp = 0;
7257 else if (so1->u.cmpobj == NULL)
7258 cmp = -1;
7259 else
7260 cmp = 1;
7261 } else {
7262 /* We have both the objects, use strcoll */
7263 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7264 }
7265 } else {
7266 /* Compare elements directly. */
7267 cmp = compareStringObjects(so1->obj,so2->obj);
7268 }
7269 }
7270 return server.sort_desc ? -cmp : cmp;
7271 }
7272
7273 /* The SORT command is the most complex command in Redis. Warning: this code
7274 * is optimized for speed and a bit less for readability */
7275 static void sortCommand(redisClient *c) {
7276 list *operations;
7277 unsigned int outputlen = 0;
7278 int desc = 0, alpha = 0;
7279 int limit_start = 0, limit_count = -1, start, end;
7280 int j, dontsort = 0, vectorlen;
7281 int getop = 0; /* GET operation counter */
7282 robj *sortval, *sortby = NULL, *storekey = NULL;
7283 redisSortObject *vector; /* Resulting vector to sort */
7284
7285 /* Lookup the key to sort. It must be of the right types */
7286 sortval = lookupKeyRead(c->db,c->argv[1]);
7287 if (sortval == NULL) {
7288 addReply(c,shared.emptymultibulk);
7289 return;
7290 }
7291 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7292 sortval->type != REDIS_ZSET)
7293 {
7294 addReply(c,shared.wrongtypeerr);
7295 return;
7296 }
7297
7298 /* Create a list of operations to perform for every sorted element.
7299 * Operations can be GET/DEL/INCR/DECR */
7300 operations = listCreate();
7301 listSetFreeMethod(operations,zfree);
7302 j = 2;
7303
7304 /* Now we need to protect sortval incrementing its count, in the future
7305 * SORT may have options able to overwrite/delete keys during the sorting
7306 * and the sorted key itself may get destroied */
7307 incrRefCount(sortval);
7308
7309 /* The SORT command has an SQL-alike syntax, parse it */
7310 while(j < c->argc) {
7311 int leftargs = c->argc-j-1;
7312 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7313 desc = 0;
7314 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7315 desc = 1;
7316 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7317 alpha = 1;
7318 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7319 limit_start = atoi(c->argv[j+1]->ptr);
7320 limit_count = atoi(c->argv[j+2]->ptr);
7321 j+=2;
7322 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7323 storekey = c->argv[j+1];
7324 j++;
7325 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7326 sortby = c->argv[j+1];
7327 /* If the BY pattern does not contain '*', i.e. it is constant,
7328 * we don't need to sort nor to lookup the weight keys. */
7329 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7330 j++;
7331 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7332 listAddNodeTail(operations,createSortOperation(
7333 REDIS_SORT_GET,c->argv[j+1]));
7334 getop++;
7335 j++;
7336 } else {
7337 decrRefCount(sortval);
7338 listRelease(operations);
7339 addReply(c,shared.syntaxerr);
7340 return;
7341 }
7342 j++;
7343 }
7344
7345 /* Load the sorting vector with all the objects to sort */
7346 switch(sortval->type) {
7347 case REDIS_LIST: vectorlen = lLength(sortval); break;
7348 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7349 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7350 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7351 }
7352 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7353 j = 0;
7354
7355 if (sortval->type == REDIS_LIST) {
7356 lIterator *li = lInitIterator(sortval,0,REDIS_TAIL);
7357 lEntry entry;
7358 while(lNext(li,&entry)) {
7359 vector[j].obj = lGet(&entry);
7360 vector[j].u.score = 0;
7361 vector[j].u.cmpobj = NULL;
7362 j++;
7363 }
7364 lReleaseIterator(li);
7365 } else {
7366 dict *set;
7367 dictIterator *di;
7368 dictEntry *setele;
7369
7370 if (sortval->type == REDIS_SET) {
7371 set = sortval->ptr;
7372 } else {
7373 zset *zs = sortval->ptr;
7374 set = zs->dict;
7375 }
7376
7377 di = dictGetIterator(set);
7378 while((setele = dictNext(di)) != NULL) {
7379 vector[j].obj = dictGetEntryKey(setele);
7380 vector[j].u.score = 0;
7381 vector[j].u.cmpobj = NULL;
7382 j++;
7383 }
7384 dictReleaseIterator(di);
7385 }
7386 redisAssert(j == vectorlen);
7387
7388 /* Now it's time to load the right scores in the sorting vector */
7389 if (dontsort == 0) {
7390 for (j = 0; j < vectorlen; j++) {
7391 robj *byval;
7392 if (sortby) {
7393 /* lookup value to sort by */
7394 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7395 if (!byval) continue;
7396 } else {
7397 /* use object itself to sort by */
7398 byval = vector[j].obj;
7399 }
7400
7401 if (alpha) {
7402 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7403 } else {
7404 if (byval->encoding == REDIS_ENCODING_RAW) {
7405 vector[j].u.score = strtod(byval->ptr,NULL);
7406 } else if (byval->encoding == REDIS_ENCODING_INT) {
7407 /* Don't need to decode the object if it's
7408 * integer-encoded (the only encoding supported) so
7409 * far. We can just cast it */
7410 vector[j].u.score = (long)byval->ptr;
7411 } else {
7412 redisAssert(1 != 1);
7413 }
7414 }
7415
7416 /* when the object was retrieved using lookupKeyByPattern,
7417 * its refcount needs to be decreased. */
7418 if (sortby) {
7419 decrRefCount(byval);
7420 }
7421 }
7422 }
7423
7424 /* We are ready to sort the vector... perform a bit of sanity check
7425 * on the LIMIT option too. We'll use a partial version of quicksort. */
7426 start = (limit_start < 0) ? 0 : limit_start;
7427 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7428 if (start >= vectorlen) {
7429 start = vectorlen-1;
7430 end = vectorlen-2;
7431 }
7432 if (end >= vectorlen) end = vectorlen-1;
7433
7434 if (dontsort == 0) {
7435 server.sort_desc = desc;
7436 server.sort_alpha = alpha;
7437 server.sort_bypattern = sortby ? 1 : 0;
7438 if (sortby && (start != 0 || end != vectorlen-1))
7439 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7440 else
7441 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7442 }
7443
7444 /* Send command output to the output buffer, performing the specified
7445 * GET/DEL/INCR/DECR operations if any. */
7446 outputlen = getop ? getop*(end-start+1) : end-start+1;
7447 if (storekey == NULL) {
7448 /* STORE option not specified, sent the sorting result to client */
7449 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7450 for (j = start; j <= end; j++) {
7451 listNode *ln;
7452 listIter li;
7453
7454 if (!getop) addReplyBulk(c,vector[j].obj);
7455 listRewind(operations,&li);
7456 while((ln = listNext(&li))) {
7457 redisSortOperation *sop = ln->value;
7458 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7459 vector[j].obj);
7460
7461 if (sop->type == REDIS_SORT_GET) {
7462 if (!val) {
7463 addReply(c,shared.nullbulk);
7464 } else {
7465 addReplyBulk(c,val);
7466 decrRefCount(val);
7467 }
7468 } else {
7469 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7470 }
7471 }
7472 }
7473 } else {
7474 robj *sobj = createZiplistObject();
7475
7476 /* STORE option specified, set the sorting result as a List object */
7477 for (j = start; j <= end; j++) {
7478 listNode *ln;
7479 listIter li;
7480
7481 if (!getop) {
7482 lPush(sobj,vector[j].obj,REDIS_TAIL);
7483 } else {
7484 listRewind(operations,&li);
7485 while((ln = listNext(&li))) {
7486 redisSortOperation *sop = ln->value;
7487 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7488 vector[j].obj);
7489
7490 if (sop->type == REDIS_SORT_GET) {
7491 if (!val) val = createStringObject("",0);
7492
7493 /* lPush does an incrRefCount, so we should take care
7494 * care of the incremented refcount caused by either
7495 * lookupKeyByPattern or createStringObject("",0) */
7496 lPush(sobj,val,REDIS_TAIL);
7497 decrRefCount(val);
7498 } else {
7499 /* always fails */
7500 redisAssert(sop->type == REDIS_SORT_GET);
7501 }
7502 }
7503 }
7504 }
7505 dbReplace(c->db,storekey,sobj);
7506 /* Note: we add 1 because the DB is dirty anyway since even if the
7507 * SORT result is empty a new key is set and maybe the old content
7508 * replaced. */
7509 server.dirty += 1+outputlen;
7510 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7511 }
7512
7513 /* Cleanup */
7514 if (sortval->type == REDIS_LIST)
7515 for (j = 0; j < vectorlen; j++)
7516 decrRefCount(vector[j].obj);
7517 decrRefCount(sortval);
7518 listRelease(operations);
7519 for (j = 0; j < vectorlen; j++) {
7520 if (alpha && vector[j].u.cmpobj)
7521 decrRefCount(vector[j].u.cmpobj);
7522 }
7523 zfree(vector);
7524 }
7525
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth, using 1024-based units.
 *
 * 's' must point to a buffer large enough for the result; the longest
 * output is 20 digits plus the unit letter and the terminating NUL.
 *
 * Every value of 'n' produces output: amounts of 1T and above used to
 * leave 's' untouched, so callers ended up printing an uninitialized
 * buffer. Values too large even for the P unit fall back to a plain
 * byte count. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units above: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
7546
7547 /* Create the string returned by the INFO command. This is decoupled
7548 * by the INFO command itself as we need to report the same information
7549 * on memory corruption problems. */
7550 static sds genRedisInfoString(void) {
7551 sds info;
7552 time_t uptime = time(NULL)-server.stat_starttime;
7553 int j;
7554 char hmem[64];
7555
7556 bytesToHuman(hmem,zmalloc_used_memory());
7557 info = sdscatprintf(sdsempty(),
7558 "redis_version:%s\r\n"
7559 "redis_git_sha1:%s\r\n"
7560 "redis_git_dirty:%d\r\n"
7561 "arch_bits:%s\r\n"
7562 "multiplexing_api:%s\r\n"
7563 "process_id:%ld\r\n"
7564 "uptime_in_seconds:%ld\r\n"
7565 "uptime_in_days:%ld\r\n"
7566 "connected_clients:%d\r\n"
7567 "connected_slaves:%d\r\n"
7568 "blocked_clients:%d\r\n"
7569 "used_memory:%zu\r\n"
7570 "used_memory_human:%s\r\n"
7571 "changes_since_last_save:%lld\r\n"
7572 "bgsave_in_progress:%d\r\n"
7573 "last_save_time:%ld\r\n"
7574 "bgrewriteaof_in_progress:%d\r\n"
7575 "total_connections_received:%lld\r\n"
7576 "total_commands_processed:%lld\r\n"
7577 "expired_keys:%lld\r\n"
7578 "hash_max_zipmap_entries:%zu\r\n"
7579 "hash_max_zipmap_value:%zu\r\n"
7580 "pubsub_channels:%ld\r\n"
7581 "pubsub_patterns:%u\r\n"
7582 "vm_enabled:%d\r\n"
7583 "role:%s\r\n"
7584 ,REDIS_VERSION,
7585 REDIS_GIT_SHA1,
7586 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7587 (sizeof(long) == 8) ? "64" : "32",
7588 aeGetApiName(),
7589 (long) getpid(),
7590 uptime,
7591 uptime/(3600*24),
7592 listLength(server.clients)-listLength(server.slaves),
7593 listLength(server.slaves),
7594 server.blpop_blocked_clients,
7595 zmalloc_used_memory(),
7596 hmem,
7597 server.dirty,
7598 server.bgsavechildpid != -1,
7599 server.lastsave,
7600 server.bgrewritechildpid != -1,
7601 server.stat_numconnections,
7602 server.stat_numcommands,
7603 server.stat_expiredkeys,
7604 server.hash_max_zipmap_entries,
7605 server.hash_max_zipmap_value,
7606 dictSize(server.pubsub_channels),
7607 listLength(server.pubsub_patterns),
7608 server.vm_enabled != 0,
7609 server.masterhost == NULL ? "master" : "slave"
7610 );
7611 if (server.masterhost) {
7612 info = sdscatprintf(info,
7613 "master_host:%s\r\n"
7614 "master_port:%d\r\n"
7615 "master_link_status:%s\r\n"
7616 "master_last_io_seconds_ago:%d\r\n"
7617 ,server.masterhost,
7618 server.masterport,
7619 (server.replstate == REDIS_REPL_CONNECTED) ?
7620 "up" : "down",
7621 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7622 );
7623 }
7624 if (server.vm_enabled) {
7625 lockThreadedIO();
7626 info = sdscatprintf(info,
7627 "vm_conf_max_memory:%llu\r\n"
7628 "vm_conf_page_size:%llu\r\n"
7629 "vm_conf_pages:%llu\r\n"
7630 "vm_stats_used_pages:%llu\r\n"
7631 "vm_stats_swapped_objects:%llu\r\n"
7632 "vm_stats_swappin_count:%llu\r\n"
7633 "vm_stats_swappout_count:%llu\r\n"
7634 "vm_stats_io_newjobs_len:%lu\r\n"
7635 "vm_stats_io_processing_len:%lu\r\n"
7636 "vm_stats_io_processed_len:%lu\r\n"
7637 "vm_stats_io_active_threads:%lu\r\n"
7638 "vm_stats_blocked_clients:%lu\r\n"
7639 ,(unsigned long long) server.vm_max_memory,
7640 (unsigned long long) server.vm_page_size,
7641 (unsigned long long) server.vm_pages,
7642 (unsigned long long) server.vm_stats_used_pages,
7643 (unsigned long long) server.vm_stats_swapped_objects,
7644 (unsigned long long) server.vm_stats_swapins,
7645 (unsigned long long) server.vm_stats_swapouts,
7646 (unsigned long) listLength(server.io_newjobs),
7647 (unsigned long) listLength(server.io_processing),
7648 (unsigned long) listLength(server.io_processed),
7649 (unsigned long) server.io_active_threads,
7650 (unsigned long) server.vm_blocked_clients
7651 );
7652 unlockThreadedIO();
7653 }
7654 for (j = 0; j < server.dbnum; j++) {
7655 long long keys, vkeys;
7656
7657 keys = dictSize(server.db[j].dict);
7658 vkeys = dictSize(server.db[j].expires);
7659 if (keys || vkeys) {
7660 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7661 j, keys, vkeys);
7662 }
7663 }
7664 return info;
7665 }
7666
7667 static void infoCommand(redisClient *c) {
7668 sds info = genRedisInfoString();
7669 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7670 (unsigned long)sdslen(info)));
7671 addReplySds(c,info);
7672 addReply(c,shared.crlf);
7673 }
7674
7675 static void monitorCommand(redisClient *c) {
7676 /* ignore MONITOR if aleady slave or in monitor mode */
7677 if (c->flags & REDIS_SLAVE) return;
7678
7679 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7680 c->slaveseldb = 0;
7681 listAddNodeTail(server.monitors,c);
7682 addReply(c,shared.ok);
7683 }
7684
7685 /* ================================= Expire ================================= */
7686 static int removeExpire(redisDb *db, robj *key) {
7687 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
7688 return 1;
7689 } else {
7690 return 0;
7691 }
7692 }
7693
7694 static int setExpire(redisDb *db, robj *key, time_t when) {
7695 sds copy = sdsdup(key->ptr);
7696 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7697 sdsfree(copy);
7698 return 0;
7699 } else {
7700 return 1;
7701 }
7702 }
7703
7704 /* Return the expire time of the specified key, or -1 if no expire
7705 * is associated with this key (i.e. the key is non volatile) */
7706 static time_t getExpire(redisDb *db, robj *key) {
7707 dictEntry *de;
7708
7709 /* No expire? return ASAP */
7710 if (dictSize(db->expires) == 0 ||
7711 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
7712
7713 return (time_t) dictGetEntryVal(de);
7714 }
7715
7716 static int expireIfNeeded(redisDb *db, robj *key) {
7717 time_t when;
7718 dictEntry *de;
7719
7720 /* No expire? return ASAP */
7721 if (dictSize(db->expires) == 0 ||
7722 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7723
7724 /* Lookup the expire */
7725 when = (time_t) dictGetEntryVal(de);
7726 if (time(NULL) <= when) return 0;
7727
7728 /* Delete the key */
7729 dbDelete(db,key);
7730 server.stat_expiredkeys++;
7731 return 1;
7732 }
7733
7734 static int deleteIfVolatile(redisDb *db, robj *key) {
7735 dictEntry *de;
7736
7737 /* No expire? return ASAP */
7738 if (dictSize(db->expires) == 0 ||
7739 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
7740
7741 /* Delete the key */
7742 server.dirty++;
7743 server.stat_expiredkeys++;
7744 dictDelete(db->expires,key->ptr);
7745 return dictDelete(db->dict,key->ptr) == DICT_OK;
7746 }
7747
7748 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7749 dictEntry *de;
7750 time_t seconds;
7751
7752 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7753
7754 seconds -= offset;
7755
7756 de = dictFind(c->db->dict,key->ptr);
7757 if (de == NULL) {
7758 addReply(c,shared.czero);
7759 return;
7760 }
7761 if (seconds <= 0) {
7762 if (dbDelete(c->db,key)) server.dirty++;
7763 addReply(c, shared.cone);
7764 return;
7765 } else {
7766 time_t when = time(NULL)+seconds;
7767 if (setExpire(c->db,key,when)) {
7768 addReply(c,shared.cone);
7769 server.dirty++;
7770 } else {
7771 addReply(c,shared.czero);
7772 }
7773 return;
7774 }
7775 }
7776
7777 static void expireCommand(redisClient *c) {
7778 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7779 }
7780
7781 static void expireatCommand(redisClient *c) {
7782 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7783 }
7784
7785 static void ttlCommand(redisClient *c) {
7786 time_t expire;
7787 int ttl = -1;
7788
7789 expire = getExpire(c->db,c->argv[1]);
7790 if (expire != -1) {
7791 ttl = (int) (expire-time(NULL));
7792 if (ttl < 0) ttl = -1;
7793 }
7794 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7795 }
7796
7797 /* ================================ MULTI/EXEC ============================== */
7798
7799 /* Client state initialization for MULTI/EXEC */
7800 static void initClientMultiState(redisClient *c) {
7801 c->mstate.commands = NULL;
7802 c->mstate.count = 0;
7803 }
7804
7805 /* Release all the resources associated with MULTI/EXEC state */
7806 static void freeClientMultiState(redisClient *c) {
7807 int j;
7808
7809 for (j = 0; j < c->mstate.count; j++) {
7810 int i;
7811 multiCmd *mc = c->mstate.commands+j;
7812
7813 for (i = 0; i < mc->argc; i++)
7814 decrRefCount(mc->argv[i]);
7815 zfree(mc->argv);
7816 }
7817 zfree(c->mstate.commands);
7818 }
7819
7820 /* Add a new command into the MULTI commands queue */
7821 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7822 multiCmd *mc;
7823 int j;
7824
7825 c->mstate.commands = zrealloc(c->mstate.commands,
7826 sizeof(multiCmd)*(c->mstate.count+1));
7827 mc = c->mstate.commands+c->mstate.count;
7828 mc->cmd = cmd;
7829 mc->argc = c->argc;
7830 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7831 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7832 for (j = 0; j < c->argc; j++)
7833 incrRefCount(mc->argv[j]);
7834 c->mstate.count++;
7835 }
7836
7837 static void multiCommand(redisClient *c) {
7838 if (c->flags & REDIS_MULTI) {
7839 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7840 return;
7841 }
7842 c->flags |= REDIS_MULTI;
7843 addReply(c,shared.ok);
7844 }
7845
7846 static void discardCommand(redisClient *c) {
7847 if (!(c->flags & REDIS_MULTI)) {
7848 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7849 return;
7850 }
7851
7852 freeClientMultiState(c);
7853 initClientMultiState(c);
7854 c->flags &= (~REDIS_MULTI);
7855 addReply(c,shared.ok);
7856 }
7857
7858 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7859 * implememntation for more information. */
7860 static void execCommandReplicateMulti(redisClient *c) {
7861 struct redisCommand *cmd;
7862 robj *multistring = createStringObject("MULTI",5);
7863
7864 cmd = lookupCommand("multi");
7865 if (server.appendonly)
7866 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7867 if (listLength(server.slaves))
7868 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7869 decrRefCount(multistring);
7870 }
7871
7872 static void execCommand(redisClient *c) {
7873 int j;
7874 robj **orig_argv;
7875 int orig_argc;
7876
7877 if (!(c->flags & REDIS_MULTI)) {
7878 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7879 return;
7880 }
7881
7882 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7883 * A failed EXEC will return a multi bulk nil object. */
7884 if (c->flags & REDIS_DIRTY_CAS) {
7885 freeClientMultiState(c);
7886 initClientMultiState(c);
7887 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7888 unwatchAllKeys(c);
7889 addReply(c,shared.nullmultibulk);
7890 return;
7891 }
7892
7893 /* Replicate a MULTI request now that we are sure the block is executed.
7894 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7895 * both the AOF and the replication link will have the same consistency
7896 * and atomicity guarantees. */
7897 execCommandReplicateMulti(c);
7898
7899 /* Exec all the queued commands */
7900 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7901 orig_argv = c->argv;
7902 orig_argc = c->argc;
7903 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7904 for (j = 0; j < c->mstate.count; j++) {
7905 c->argc = c->mstate.commands[j].argc;
7906 c->argv = c->mstate.commands[j].argv;
7907 call(c,c->mstate.commands[j].cmd);
7908 }
7909 c->argv = orig_argv;
7910 c->argc = orig_argc;
7911 freeClientMultiState(c);
7912 initClientMultiState(c);
7913 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7914 /* Make sure the EXEC command is always replicated / AOF, since we
7915 * always send the MULTI command (we can't know beforehand if the
7916 * next operations will contain at least a modification to the DB). */
7917 server.dirty++;
7918 }
7919
7920 /* =========================== Blocking Operations ========================= */
7921
7922 /* Currently Redis blocking operations support is limited to list POP ops,
7923 * so the current implementation is not fully generic, but it is also not
7924 * completely specific so it will not require a rewrite to support new
7925 * kind of blocking operations in the future.
7926 *
7927 * Still it's important to note that list blocking operations can be already
7928 * used as a notification mechanism in order to implement other blocking
7929 * operations at application level, so there must be a very strong evidence
7930 * of usefulness and generality before new blocking operations are implemented.
7931 *
7932 * This is how the current blocking POP works, we use BLPOP as example:
7933 * - If the user calls BLPOP and the key exists and contains a non empty list
7934 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7935 * if there is no need to block.
7936 * - If instead BLPOP is called and the key does not exist or the list is
7937 * empty we need to block. In order to do so we remove the notification for
7938 * new data to read in the client socket (so that we'll not serve new
7939 * requests if the blocking request is not served). Also we put the client
7940 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7941 * blocking for these keys.
7942 * - If a PUSH operation against a key with blocked clients waiting is
7943 * performed, we serve the first in the list: basically instead of pushing
7944 * the new element inside the list we return it to the (first / oldest)
7945 * blocking client, unblock the client, and remove it from the list.
7946 *
7947 * The above comment and the source code should be enough in order to understand
7948 * the implementation and modify / fix it later.
7949 */
7950
7951 /* Set a client in blocking mode for the specified key, with the specified
7952 * timeout */
7953 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7954 dictEntry *de;
7955 list *l;
7956 int j;
7957
7958 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7959 c->blocking_keys_num = numkeys;
7960 c->blockingto = timeout;
7961 for (j = 0; j < numkeys; j++) {
7962 /* Add the key in the client structure, to map clients -> keys */
7963 c->blocking_keys[j] = keys[j];
7964 incrRefCount(keys[j]);
7965
7966 /* And in the other "side", to map keys -> clients */
7967 de = dictFind(c->db->blocking_keys,keys[j]);
7968 if (de == NULL) {
7969 int retval;
7970
7971 /* For every key we take a list of clients blocked for it */
7972 l = listCreate();
7973 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7974 incrRefCount(keys[j]);
7975 assert(retval == DICT_OK);
7976 } else {
7977 l = dictGetEntryVal(de);
7978 }
7979 listAddNodeTail(l,c);
7980 }
7981 /* Mark the client as a blocked client */
7982 c->flags |= REDIS_BLOCKED;
7983 server.blpop_blocked_clients++;
7984 }
7985
7986 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7987 static void unblockClientWaitingData(redisClient *c) {
7988 dictEntry *de;
7989 list *l;
7990 int j;
7991
7992 assert(c->blocking_keys != NULL);
7993 /* The client may wait for multiple keys, so unblock it for every key. */
7994 for (j = 0; j < c->blocking_keys_num; j++) {
7995 /* Remove this client from the list of clients waiting for this key. */
7996 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7997 assert(de != NULL);
7998 l = dictGetEntryVal(de);
7999 listDelNode(l,listSearchKey(l,c));
8000 /* If the list is empty we need to remove it to avoid wasting memory */
8001 if (listLength(l) == 0)
8002 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8003 decrRefCount(c->blocking_keys[j]);
8004 }
8005 /* Cleanup the client structure */
8006 zfree(c->blocking_keys);
8007 c->blocking_keys = NULL;
8008 c->flags &= (~REDIS_BLOCKED);
8009 server.blpop_blocked_clients--;
8010 /* We want to process data if there is some command waiting
8011 * in the input buffer. Note that this is safe even if
8012 * unblockClientWaitingData() gets called from freeClient() because
8013 * freeClient() will be smart enough to call this function
8014 * *after* c->querybuf was set to NULL. */
8015 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8016 }
8017
8018 /* This should be called from any function PUSHing into lists.
8019 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8020 * 'ele' is the element pushed.
8021 *
8022 * If the function returns 0 there was no client waiting for a list push
8023 * against this key.
8024 *
8025 * If the function returns 1 there was a client waiting for a list push
8026 * against this key, the element was passed to this client thus it's not
8027 * needed to actually add it to the list and the caller should return asap. */
8028 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8029 struct dictEntry *de;
8030 redisClient *receiver;
8031 list *l;
8032 listNode *ln;
8033
8034 de = dictFind(c->db->blocking_keys,key);
8035 if (de == NULL) return 0;
8036 l = dictGetEntryVal(de);
8037 ln = listFirst(l);
8038 assert(ln != NULL);
8039 receiver = ln->value;
8040
8041 addReplySds(receiver,sdsnew("*2\r\n"));
8042 addReplyBulk(receiver,key);
8043 addReplyBulk(receiver,ele);
8044 unblockClientWaitingData(receiver);
8045 return 1;
8046 }
8047
8048 /* Blocking RPOP/LPOP */
8049 static void blockingPopGenericCommand(redisClient *c, int where) {
8050 robj *o;
8051 time_t timeout;
8052 int j;
8053
8054 for (j = 1; j < c->argc-1; j++) {
8055 o = lookupKeyWrite(c->db,c->argv[j]);
8056 if (o != NULL) {
8057 if (o->type != REDIS_LIST) {
8058 addReply(c,shared.wrongtypeerr);
8059 return;
8060 } else {
8061 list *list = o->ptr;
8062 if (listLength(list) != 0) {
8063 /* If the list contains elements fall back to the usual
8064 * non-blocking POP operation */
8065 robj *argv[2], **orig_argv;
8066 int orig_argc;
8067
8068 /* We need to alter the command arguments before to call
8069 * popGenericCommand() as the command takes a single key. */
8070 orig_argv = c->argv;
8071 orig_argc = c->argc;
8072 argv[1] = c->argv[j];
8073 c->argv = argv;
8074 c->argc = 2;
8075
8076 /* Also the return value is different, we need to output
8077 * the multi bulk reply header and the key name. The
8078 * "real" command will add the last element (the value)
8079 * for us. If this souds like an hack to you it's just
8080 * because it is... */
8081 addReplySds(c,sdsnew("*2\r\n"));
8082 addReplyBulk(c,argv[1]);
8083 popGenericCommand(c,where);
8084
8085 /* Fix the client structure with the original stuff */
8086 c->argv = orig_argv;
8087 c->argc = orig_argc;
8088 return;
8089 }
8090 }
8091 }
8092 }
8093 /* If the list is empty or the key does not exists we must block */
8094 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
8095 if (timeout > 0) timeout += time(NULL);
8096 blockForKeys(c,c->argv+1,c->argc-2,timeout);
8097 }
8098
8099 static void blpopCommand(redisClient *c) {
8100 blockingPopGenericCommand(c,REDIS_HEAD);
8101 }
8102
8103 static void brpopCommand(redisClient *c) {
8104 blockingPopGenericCommand(c,REDIS_TAIL);
8105 }
8106
8107 /* =============================== Replication ============================= */
8108
8109 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
8110 ssize_t nwritten, ret = size;
8111 time_t start = time(NULL);
8112
8113 timeout++;
8114 while(size) {
8115 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8116 nwritten = write(fd,ptr,size);
8117 if (nwritten == -1) return -1;
8118 ptr += nwritten;
8119 size -= nwritten;
8120 }
8121 if ((time(NULL)-start) > timeout) {
8122 errno = ETIMEDOUT;
8123 return -1;
8124 }
8125 }
8126 return ret;
8127 }
8128
8129 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
8130 ssize_t nread, totread = 0;
8131 time_t start = time(NULL);
8132
8133 timeout++;
8134 while(size) {
8135 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8136 nread = read(fd,ptr,size);
8137 if (nread == -1) return -1;
8138 ptr += nread;
8139 size -= nread;
8140 totread += nread;
8141 }
8142 if ((time(NULL)-start) > timeout) {
8143 errno = ETIMEDOUT;
8144 return -1;
8145 }
8146 }
8147 return totread;
8148 }
8149
/* Read a line from 'fd', one byte at a time using syncRead() with the
 * given per-byte timeout, into the buffer 'ptr' of 'size' bytes.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' just before
 * it is stripped from the string but still counted. At most size-1
 * characters are stored, and the buffer is kept null terminated after
 * every stored character.
 *
 * Returns the number of characters stored, or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* reserve room for the null term */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the space just used: without this, a line longer
             * than the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
8170
8171 static void syncCommand(redisClient *c) {
8172 /* ignore SYNC if aleady slave or in monitor mode */
8173 if (c->flags & REDIS_SLAVE) return;
8174
8175 /* SYNC can't be issued when the server has pending data to send to
8176 * the client about already issued commands. We need a fresh reply
8177 * buffer registering the differences between the BGSAVE and the current
8178 * dataset, so that we can copy to other slaves if needed. */
8179 if (listLength(c->reply) != 0) {
8180 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8181 return;
8182 }
8183
8184 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8185 /* Here we need to check if there is a background saving operation
8186 * in progress, or if it is required to start one */
8187 if (server.bgsavechildpid != -1) {
8188 /* Ok a background save is in progress. Let's check if it is a good
8189 * one for replication, i.e. if there is another slave that is
8190 * registering differences since the server forked to save */
8191 redisClient *slave;
8192 listNode *ln;
8193 listIter li;
8194
8195 listRewind(server.slaves,&li);
8196 while((ln = listNext(&li))) {
8197 slave = ln->value;
8198 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
8199 }
8200 if (ln) {
8201 /* Perfect, the server is already registering differences for
8202 * another slave. Set the right state, and copy the buffer. */
8203 listRelease(c->reply);
8204 c->reply = listDup(slave->reply);
8205 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8206 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8207 } else {
8208 /* No way, we need to wait for the next BGSAVE in order to
8209 * register differences */
8210 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8211 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8212 }
8213 } else {
8214 /* Ok we don't have a BGSAVE in progress, let's start one */
8215 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8216 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8217 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8218 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8219 return;
8220 }
8221 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8222 }
8223 c->repldbfd = -1;
8224 c->flags |= REDIS_SLAVE;
8225 c->slaveseldb = 0;
8226 listAddNodeTail(server.slaves,c);
8227 return;
8228 }
8229
8230 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8231 redisClient *slave = privdata;
8232 REDIS_NOTUSED(el);
8233 REDIS_NOTUSED(mask);
8234 char buf[REDIS_IOBUF_LEN];
8235 ssize_t nwritten, buflen;
8236
8237 if (slave->repldboff == 0) {
8238 /* Write the bulk write count before to transfer the DB. In theory here
8239 * we don't know how much room there is in the output buffer of the
8240 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8241 * operations) will never be smaller than the few bytes we need. */
8242 sds bulkcount;
8243
8244 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8245 slave->repldbsize);
8246 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8247 {
8248 sdsfree(bulkcount);
8249 freeClient(slave);
8250 return;
8251 }
8252 sdsfree(bulkcount);
8253 }
8254 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8255 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8256 if (buflen <= 0) {
8257 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8258 (buflen == 0) ? "premature EOF" : strerror(errno));
8259 freeClient(slave);
8260 return;
8261 }
8262 if ((nwritten = write(fd,buf,buflen)) == -1) {
8263 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
8264 strerror(errno));
8265 freeClient(slave);
8266 return;
8267 }
8268 slave->repldboff += nwritten;
8269 if (slave->repldboff == slave->repldbsize) {
8270 close(slave->repldbfd);
8271 slave->repldbfd = -1;
8272 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8273 slave->replstate = REDIS_REPL_ONLINE;
8274 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8275 sendReplyToClient, slave) == AE_ERR) {
8276 freeClient(slave);
8277 return;
8278 }
8279 addReplySds(slave,sdsempty());
8280 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8281 }
8282 }
8283
8284 /* This function is called at the end of every backgrond saving.
8285 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8286 * otherwise REDIS_ERR is passed to the function.
8287 *
8288 * The goal of this function is to handle slaves waiting for a successful
8289 * background saving in order to perform non-blocking synchronization. */
8290 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8291 listNode *ln;
8292 int startbgsave = 0;
8293 listIter li;
8294
8295 listRewind(server.slaves,&li);
8296 while((ln = listNext(&li))) {
8297 redisClient *slave = ln->value;
8298
8299 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8300 startbgsave = 1;
8301 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8302 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8303 struct redis_stat buf;
8304
8305 if (bgsaveerr != REDIS_OK) {
8306 freeClient(slave);
8307 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8308 continue;
8309 }
8310 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8311 redis_fstat(slave->repldbfd,&buf) == -1) {
8312 freeClient(slave);
8313 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8314 continue;
8315 }
8316 slave->repldboff = 0;
8317 slave->repldbsize = buf.st_size;
8318 slave->replstate = REDIS_REPL_SEND_BULK;
8319 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8320 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8321 freeClient(slave);
8322 continue;
8323 }
8324 }
8325 }
8326 if (startbgsave) {
8327 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8328 listIter li;
8329
8330 listRewind(server.slaves,&li);
8331 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8332 while((ln = listNext(&li))) {
8333 redisClient *slave = ln->value;
8334
8335 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8336 freeClient(slave);
8337 }
8338 }
8339 }
8340 }
8341
8342 static int syncWithMaster(void) {
8343 char buf[1024], tmpfile[256], authcmd[1024];
8344 long dumpsize;
8345 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8346 int dfd, maxtries = 5;
8347
8348 if (fd == -1) {
8349 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8350 strerror(errno));
8351 return REDIS_ERR;
8352 }
8353
8354 /* AUTH with the master if required. */
8355 if(server.masterauth) {
8356 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8357 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8358 close(fd);
8359 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8360 strerror(errno));
8361 return REDIS_ERR;
8362 }
8363 /* Read the AUTH result. */
8364 if (syncReadLine(fd,buf,1024,3600) == -1) {
8365 close(fd);
8366 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8367 strerror(errno));
8368 return REDIS_ERR;
8369 }
8370 if (buf[0] != '+') {
8371 close(fd);
8372 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8373 return REDIS_ERR;
8374 }
8375 }
8376
8377 /* Issue the SYNC command */
8378 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8379 close(fd);
8380 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8381 strerror(errno));
8382 return REDIS_ERR;
8383 }
8384 /* Read the bulk write count */
8385 if (syncReadLine(fd,buf,1024,3600) == -1) {
8386 close(fd);
8387 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8388 strerror(errno));
8389 return REDIS_ERR;
8390 }
8391 if (buf[0] != '$') {
8392 close(fd);
8393 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8394 return REDIS_ERR;
8395 }
8396 dumpsize = strtol(buf+1,NULL,10);
8397 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8398 /* Read the bulk write data on a temp file */
8399 while(maxtries--) {
8400 snprintf(tmpfile,256,
8401 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8402 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8403 if (dfd != -1) break;
8404 sleep(1);
8405 }
8406 if (dfd == -1) {
8407 close(fd);
8408 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8409 return REDIS_ERR;
8410 }
8411 while(dumpsize) {
8412 int nread, nwritten;
8413
8414 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8415 if (nread == -1) {
8416 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8417 strerror(errno));
8418 close(fd);
8419 close(dfd);
8420 return REDIS_ERR;
8421 }
8422 nwritten = write(dfd,buf,nread);
8423 if (nwritten == -1) {
8424 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8425 close(fd);
8426 close(dfd);
8427 return REDIS_ERR;
8428 }
8429 dumpsize -= nread;
8430 }
8431 close(dfd);
8432 if (rename(tmpfile,server.dbfilename) == -1) {
8433 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8434 unlink(tmpfile);
8435 close(fd);
8436 return REDIS_ERR;
8437 }
8438 emptyDb();
8439 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8440 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8441 close(fd);
8442 return REDIS_ERR;
8443 }
8444 server.master = createClient(fd);
8445 server.master->flags |= REDIS_MASTER;
8446 server.master->authenticated = 1;
8447 server.replstate = REDIS_REPL_CONNECTED;
8448 return REDIS_OK;
8449 }
8450
8451 static void slaveofCommand(redisClient *c) {
8452 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8453 !strcasecmp(c->argv[2]->ptr,"one")) {
8454 if (server.masterhost) {
8455 sdsfree(server.masterhost);
8456 server.masterhost = NULL;
8457 if (server.master) freeClient(server.master);
8458 server.replstate = REDIS_REPL_NONE;
8459 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8460 }
8461 } else {
8462 sdsfree(server.masterhost);
8463 server.masterhost = sdsdup(c->argv[1]->ptr);
8464 server.masterport = atoi(c->argv[2]->ptr);
8465 if (server.master) freeClient(server.master);
8466 server.replstate = REDIS_REPL_CONNECT;
8467 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8468 server.masterhost, server.masterport);
8469 }
8470 addReply(c,shared.ok);
8471 }
8472
8473 /* ============================ Maxmemory directive ======================== */
8474
8475 /* Try to free one object form the pre-allocated objects free list.
8476 * This is useful under low mem conditions as by default we take 1 million
8477 * free objects allocated. On success REDIS_OK is returned, otherwise
8478 * REDIS_ERR. */
8479 static int tryFreeOneObjectFromFreelist(void) {
8480 robj *o;
8481
8482 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8483 if (listLength(server.objfreelist)) {
8484 listNode *head = listFirst(server.objfreelist);
8485 o = listNodeValue(head);
8486 listDelNode(server.objfreelist,head);
8487 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8488 zfree(o);
8489 return REDIS_OK;
8490 } else {
8491 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8492 return REDIS_ERR;
8493 }
8494 }
8495
8496 /* This function gets called when 'maxmemory' is set on the config file to limit
8497 * the max memory used by the server, and we are out of memory.
8498 * This function will try to, in order:
8499 *
8500 * - Free objects from the free list
8501 * - Try to remove keys with an EXPIRE set
8502 *
8503 * It is not possible to free enough memory to reach used-memory < maxmemory
8504 * the server will start refusing commands that will enlarge even more the
8505 * memory usage.
8506 */
8507 static void freeMemoryIfNeeded(void) {
8508 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8509 int j, k, freed = 0;
8510
8511 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8512 for (j = 0; j < server.dbnum; j++) {
8513 int minttl = -1;
8514 robj *minkey = NULL;
8515 struct dictEntry *de;
8516
8517 if (dictSize(server.db[j].expires)) {
8518 freed = 1;
8519 /* From a sample of three keys drop the one nearest to
8520 * the natural expire */
8521 for (k = 0; k < 3; k++) {
8522 time_t t;
8523
8524 de = dictGetRandomKey(server.db[j].expires);
8525 t = (time_t) dictGetEntryVal(de);
8526 if (minttl == -1 || t < minttl) {
8527 minkey = dictGetEntryKey(de);
8528 minttl = t;
8529 }
8530 }
8531 dbDelete(server.db+j,minkey);
8532 }
8533 }
8534 if (!freed) return; /* nothing to free... */
8535 }
8536 }
8537
8538 /* ============================== Append Only file ========================== */
8539
8540 /* Called when the user switches from "appendonly yes" to "appendonly no"
8541 * at runtime using the CONFIG command. */
8542 static void stopAppendOnly(void) {
8543 flushAppendOnlyFile();
8544 aof_fsync(server.appendfd);
8545 close(server.appendfd);
8546
8547 server.appendfd = -1;
8548 server.appendseldb = -1;
8549 server.appendonly = 0;
8550 /* rewrite operation in progress? kill it, wait child exit */
8551 if (server.bgsavechildpid != -1) {
8552 int statloc;
8553
8554 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8555 wait3(&statloc,0,NULL);
8556 /* reset the buffer accumulating changes while the child saves */
8557 sdsfree(server.bgrewritebuf);
8558 server.bgrewritebuf = sdsempty();
8559 server.bgsavechildpid = -1;
8560 }
8561 }
8562
8563 /* Called when the user switches from "appendonly no" to "appendonly yes"
8564 * at runtime using the CONFIG command. */
8565 static int startAppendOnly(void) {
8566 server.appendonly = 1;
8567 server.lastfsync = time(NULL);
8568 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8569 if (server.appendfd == -1) {
8570 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8571 return REDIS_ERR;
8572 }
8573 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8574 server.appendonly = 0;
8575 close(server.appendfd);
8576 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8577 return REDIS_ERR;
8578 }
8579 return REDIS_OK;
8580 }
8581
8582 /* Write the append only file buffer on disk.
8583 *
8584 * Since we are required to write the AOF before replying to the client,
8585 * and the only way the client socket can get a write is entering when the
8586 * the event loop, we accumulate all the AOF writes in a memory
8587 * buffer and write it on disk using this function just before entering
8588 * the event loop again. */
8589 static void flushAppendOnlyFile(void) {
8590 time_t now;
8591 ssize_t nwritten;
8592
8593 if (sdslen(server.aofbuf) == 0) return;
8594
8595 /* We want to perform a single write. This should be guaranteed atomic
8596 * at least if the filesystem we are writing is a real physical one.
8597 * While this will save us against the server being killed I don't think
8598 * there is much to do about the whole server stopping for power problems
8599 * or alike */
8600 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8601 if (nwritten != (signed)sdslen(server.aofbuf)) {
8602 /* Ooops, we are in troubles. The best thing to do for now is
8603 * aborting instead of giving the illusion that everything is
8604 * working as expected. */
8605 if (nwritten == -1) {
8606 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8607 } else {
8608 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8609 }
8610 exit(1);
8611 }
8612 sdsfree(server.aofbuf);
8613 server.aofbuf = sdsempty();
8614
8615 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8616 * childs performing heavy I/O on disk. */
8617 if (server.no_appendfsync_on_rewrite &&
8618 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8619 return;
8620 /* Fsync if needed */
8621 now = time(NULL);
8622 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8623 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8624 now-server.lastfsync > 1))
8625 {
8626 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8627 * flushing metadata. */
8628 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8629 server.lastfsync = now;
8630 }
8631 }
8632
8633 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8634 int j;
8635 buf = sdscatprintf(buf,"*%d\r\n",argc);
8636 for (j = 0; j < argc; j++) {
8637 robj *o = getDecodedObject(argv[j]);
8638 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8639 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8640 buf = sdscatlen(buf,"\r\n",2);
8641 decrRefCount(o);
8642 }
8643 return buf;
8644 }
8645
8646 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8647 int argc = 3;
8648 long when;
8649 robj *argv[3];
8650
8651 /* Make sure we can use strtol */
8652 seconds = getDecodedObject(seconds);
8653 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8654 decrRefCount(seconds);
8655
8656 argv[0] = createStringObject("EXPIREAT",8);
8657 argv[1] = key;
8658 argv[2] = createObject(REDIS_STRING,
8659 sdscatprintf(sdsempty(),"%ld",when));
8660 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8661 decrRefCount(argv[0]);
8662 decrRefCount(argv[2]);
8663 return buf;
8664 }
8665
8666 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8667 sds buf = sdsempty();
8668 robj *tmpargv[3];
8669
8670 /* The DB this command was targetting is not the same as the last command
8671 * we appendend. To issue a SELECT command is needed. */
8672 if (dictid != server.appendseldb) {
8673 char seldb[64];
8674
8675 snprintf(seldb,sizeof(seldb),"%d",dictid);
8676 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8677 (unsigned long)strlen(seldb),seldb);
8678 server.appendseldb = dictid;
8679 }
8680
8681 if (cmd->proc == expireCommand) {
8682 /* Translate EXPIRE into EXPIREAT */
8683 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8684 } else if (cmd->proc == setexCommand) {
8685 /* Translate SETEX to SET and EXPIREAT */
8686 tmpargv[0] = createStringObject("SET",3);
8687 tmpargv[1] = argv[1];
8688 tmpargv[2] = argv[3];
8689 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8690 decrRefCount(tmpargv[0]);
8691 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8692 } else {
8693 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8694 }
8695
8696 /* Append to the AOF buffer. This will be flushed on disk just before
8697 * of re-entering the event loop, so before the client will get a
8698 * positive reply about the operation performed. */
8699 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8700
8701 /* If a background append only file rewriting is in progress we want to
8702 * accumulate the differences between the child DB and the current one
8703 * in a buffer, so that when the child process will do its work we
8704 * can append the differences to the new append only file. */
8705 if (server.bgrewritechildpid != -1)
8706 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8707
8708 sdsfree(buf);
8709 }
8710
8711 /* In Redis commands are always executed in the context of a client, so in
8712 * order to load the append only file we need to create a fake client. */
8713 static struct redisClient *createFakeClient(void) {
8714 struct redisClient *c = zmalloc(sizeof(*c));
8715
8716 selectDb(c,0);
8717 c->fd = -1;
8718 c->querybuf = sdsempty();
8719 c->argc = 0;
8720 c->argv = NULL;
8721 c->flags = 0;
8722 /* We set the fake client as a slave waiting for the synchronization
8723 * so that Redis will not try to send replies to this client. */
8724 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8725 c->reply = listCreate();
8726 listSetFreeMethod(c->reply,decrRefCount);
8727 listSetDupMethod(c->reply,dupClientReplyValue);
8728 initClientMultiState(c);
8729 return c;
8730 }
8731
8732 static void freeFakeClient(struct redisClient *c) {
8733 sdsfree(c->querybuf);
8734 listRelease(c->reply);
8735 freeClientMultiState(c);
8736 zfree(c);
8737 }
8738
8739 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8740 * error (the append only file is zero-length) REDIS_ERR is returned. On
8741 * fatal error an error message is logged and the program exists. */
8742 int loadAppendOnlyFile(char *filename) {
8743 struct redisClient *fakeClient;
8744 FILE *fp = fopen(filename,"r");
8745 struct redis_stat sb;
8746 int appendonly = server.appendonly;
8747
8748 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8749 return REDIS_ERR;
8750
8751 if (fp == NULL) {
8752 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8753 exit(1);
8754 }
8755
8756 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8757 * to the same file we're about to read. */
8758 server.appendonly = 0;
8759
8760 fakeClient = createFakeClient();
8761 while(1) {
8762 int argc, j;
8763 unsigned long len;
8764 robj **argv;
8765 char buf[128];
8766 sds argsds;
8767 struct redisCommand *cmd;
8768 int force_swapout;
8769
8770 if (fgets(buf,sizeof(buf),fp) == NULL) {
8771 if (feof(fp))
8772 break;
8773 else
8774 goto readerr;
8775 }
8776 if (buf[0] != '*') goto fmterr;
8777 argc = atoi(buf+1);
8778 argv = zmalloc(sizeof(robj*)*argc);
8779 for (j = 0; j < argc; j++) {
8780 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8781 if (buf[0] != '$') goto fmterr;
8782 len = strtol(buf+1,NULL,10);
8783 argsds = sdsnewlen(NULL,len);
8784 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8785 argv[j] = createObject(REDIS_STRING,argsds);
8786 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8787 }
8788
8789 /* Command lookup */
8790 cmd = lookupCommand(argv[0]->ptr);
8791 if (!cmd) {
8792 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8793 exit(1);
8794 }
8795 /* Try object encoding */
8796 if (cmd->flags & REDIS_CMD_BULK)
8797 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8798 /* Run the command in the context of a fake client */
8799 fakeClient->argc = argc;
8800 fakeClient->argv = argv;
8801 cmd->proc(fakeClient);
8802 /* Discard the reply objects list from the fake client */
8803 while(listLength(fakeClient->reply))
8804 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8805 /* Clean up, ready for the next command */
8806 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8807 zfree(argv);
8808 /* Handle swapping while loading big datasets when VM is on */
8809 force_swapout = 0;
8810 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8811 force_swapout = 1;
8812
8813 if (server.vm_enabled && force_swapout) {
8814 while (zmalloc_used_memory() > server.vm_max_memory) {
8815 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8816 }
8817 }
8818 }
8819
8820 /* This point can only be reached when EOF is reached without errors.
8821 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8822 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8823
8824 fclose(fp);
8825 freeFakeClient(fakeClient);
8826 server.appendonly = appendonly;
8827 return REDIS_OK;
8828
8829 readerr:
8830 if (feof(fp)) {
8831 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8832 } else {
8833 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8834 }
8835 exit(1);
8836 fmterr:
8837 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8838 exit(1);
8839 }
8840
/* Write the 'len' bytes at 's' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. The payload is written as it is, so it can
 * contain any byte. Returns 1 on success, 0 if a write failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen = 0;

    /* Header: '$', the payload length in decimal, CRLF */
    header[hlen++] = '$';
    hlen += ll2string(header+1,sizeof(header)-1,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';
    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is legal: nothing to write between the CRLFs */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) == 0 ? 0 : 1;
}
8855
/* Write the double 'd' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the number printed with
 * the "%.17g" format. Returns 1 on success, 0 if a write failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t plen;

    /* The payload is built with its trailing CRLF, that is not part of
     * the advertised count: hence the -2. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,plen,1,fp) == 0 ? 0 : 1;
}
8866
/* Write the integer 'l' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the number in decimal.
 * Returns 1 on success, 0 if the write failed. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits = ll2string(digits,32,l);
    unsigned int outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);

    /* Header and payload go out together with a single fwrite() */
    return fwrite(out,outlen,1,fp) == 0 ? 0 : 1;
}
8876
8877 /* Delegate writing an object to writing a bulk string or bulk long long. */
8878 static int fwriteBulkObject(FILE *fp, robj *obj) {
8879 /* Avoid using getDecodedObject to help copy-on-write (we are often
8880 * in a child process when this function is called). */
8881 if (obj->encoding == REDIS_ENCODING_INT) {
8882 return fwriteBulkLongLong(fp,(long)obj->ptr);
8883 } else if (obj->encoding == REDIS_ENCODING_RAW) {
8884 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
8885 } else {
8886 redisPanic("Unknown string encoding");
8887 }
8888 }
8889
8890 /* Write a sequence of commands able to fully rebuild the dataset into
8891 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8892 static int rewriteAppendOnlyFile(char *filename) {
8893 dictIterator *di = NULL;
8894 dictEntry *de;
8895 FILE *fp;
8896 char tmpfile[256];
8897 int j;
8898 time_t now = time(NULL);
8899
8900 /* Note that we have to use a different temp name here compared to the
8901 * one used by rewriteAppendOnlyFileBackground() function. */
8902 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8903 fp = fopen(tmpfile,"w");
8904 if (!fp) {
8905 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8906 return REDIS_ERR;
8907 }
8908 for (j = 0; j < server.dbnum; j++) {
8909 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8910 redisDb *db = server.db+j;
8911 dict *d = db->dict;
8912 if (dictSize(d) == 0) continue;
8913 di = dictGetIterator(d);
8914 if (!di) {
8915 fclose(fp);
8916 return REDIS_ERR;
8917 }
8918
8919 /* SELECT the new DB */
8920 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8921 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
8922
8923 /* Iterate this DB writing every entry */
8924 while((de = dictNext(di)) != NULL) {
8925 sds keystr = dictGetEntryKey(de);
8926 robj key, *o;
8927 time_t expiretime;
8928 int swapped;
8929
8930 keystr = dictGetEntryKey(de);
8931 o = dictGetEntryVal(de);
8932 initStaticStringObject(key,keystr);
8933 /* If the value for this key is swapped, load a preview in memory.
8934 * We use a "swapped" flag to remember if we need to free the
8935 * value object instead to just increment the ref count anyway
8936 * in order to avoid copy-on-write of pages if we are forked() */
8937 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
8938 o->storage == REDIS_VM_SWAPPING) {
8939 swapped = 0;
8940 } else {
8941 o = vmPreviewObject(o);
8942 swapped = 1;
8943 }
8944 expiretime = getExpire(db,&key);
8945
8946 /* Save the key and associated value */
8947 if (o->type == REDIS_STRING) {
8948 /* Emit a SET command */
8949 char cmd[]="*3\r\n$3\r\nSET\r\n";
8950 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8951 /* Key and value */
8952 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8953 if (fwriteBulkObject(fp,o) == 0) goto werr;
8954 } else if (o->type == REDIS_LIST) {
8955 /* Emit the RPUSHes needed to rebuild the list */
8956 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8957 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
8958 unsigned char *zl = o->ptr;
8959 unsigned char *p = ziplistIndex(zl,0);
8960 unsigned char *vstr;
8961 unsigned int vlen;
8962 long long vlong;
8963
8964 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
8965 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8966 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8967 if (vstr) {
8968 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
8969 goto werr;
8970 } else {
8971 if (fwriteBulkLongLong(fp,vlong) == 0)
8972 goto werr;
8973 }
8974 p = ziplistNext(zl,p);
8975 }
8976 } else if (o->encoding == REDIS_ENCODING_LIST) {
8977 list *list = o->ptr;
8978 listNode *ln;
8979 listIter li;
8980
8981 listRewind(list,&li);
8982 while((ln = listNext(&li))) {
8983 robj *eleobj = listNodeValue(ln);
8984
8985 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8986 if (fwriteBulkObject(fp,&key) == 0) goto werr;
8987 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8988 }
8989 } else {
8990 redisPanic("Unknown list encoding");
8991 }
8992 } else if (o->type == REDIS_SET) {
8993 /* Emit the SADDs needed to rebuild the set */
8994 dict *set = o->ptr;
8995 dictIterator *di = dictGetIterator(set);
8996 dictEntry *de;
8997
8998 while((de = dictNext(di)) != NULL) {
8999 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9000 robj *eleobj = dictGetEntryKey(de);
9001
9002 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9003 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9004 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9005 }
9006 dictReleaseIterator(di);
9007 } else if (o->type == REDIS_ZSET) {
9008 /* Emit the ZADDs needed to rebuild the sorted set */
9009 zset *zs = o->ptr;
9010 dictIterator *di = dictGetIterator(zs->dict);
9011 dictEntry *de;
9012
9013 while((de = dictNext(di)) != NULL) {
9014 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9015 robj *eleobj = dictGetEntryKey(de);
9016 double *score = dictGetEntryVal(de);
9017
9018 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9019 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9020 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9021 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9022 }
9023 dictReleaseIterator(di);
9024 } else if (o->type == REDIS_HASH) {
9025 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9026
9027 /* Emit the HSETs needed to rebuild the hash */
9028 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9029 unsigned char *p = zipmapRewind(o->ptr);
9030 unsigned char *field, *val;
9031 unsigned int flen, vlen;
9032
9033 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9034 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9035 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9036 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9037 return -1;
9038 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9039 return -1;
9040 }
9041 } else {
9042 dictIterator *di = dictGetIterator(o->ptr);
9043 dictEntry *de;
9044
9045 while((de = dictNext(di)) != NULL) {
9046 robj *field = dictGetEntryKey(de);
9047 robj *val = dictGetEntryVal(de);
9048
9049 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9050 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9051 if (fwriteBulkObject(fp,field) == -1) return -1;
9052 if (fwriteBulkObject(fp,val) == -1) return -1;
9053 }
9054 dictReleaseIterator(di);
9055 }
9056 } else {
9057 redisPanic("Unknown object type");
9058 }
9059 /* Save the expire time */
9060 if (expiretime != -1) {
9061 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9062 /* If this key is already expired skip it */
9063 if (expiretime < now) continue;
9064 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9065 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9066 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9067 }
9068 if (swapped) decrRefCount(o);
9069 }
9070 dictReleaseIterator(di);
9071 }
9072
9073 /* Make sure data will not remain on the OS's output buffers */
9074 fflush(fp);
9075 aof_fsync(fileno(fp));
9076 fclose(fp);
9077
9078 /* Use RENAME to make sure the DB file is changed atomically only
9079 * if the generate DB file is ok. */
9080 if (rename(tmpfile,filename) == -1) {
9081 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9082 unlink(tmpfile);
9083 return REDIS_ERR;
9084 }
9085 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9086 return REDIS_OK;
9087
9088 werr:
9089 fclose(fp);
9090 unlink(tmpfile);
9091 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9092 if (di) dictReleaseIterator(di);
9093 return REDIS_ERR;
9094 }
9095
9096 /* This is how rewriting of the append only file in background works:
9097 *
9098 * 1) The user calls BGREWRITEAOF
9099 * 2) Redis calls this function, that forks():
9100 * 2a) the child rewrite the append only file in a temp file.
9101 * 2b) the parent accumulates differences in server.bgrewritebuf.
9102 * 3) When the child finished '2a' exists.
9103 * 4) The parent will trap the exit code, if it's OK, will append the
9104 * data accumulated into server.bgrewritebuf into the temp file, and
9105 * finally will rename(2) the temp file in the actual file name.
9106 * The the new file is reopened as the new append only file. Profit!
9107 */
9108 static int rewriteAppendOnlyFileBackground(void) {
9109 pid_t childpid;
9110
9111 if (server.bgrewritechildpid != -1) return REDIS_ERR;
9112 if (server.vm_enabled) waitEmptyIOJobsQueue();
9113 if ((childpid = fork()) == 0) {
9114 /* Child */
9115 char tmpfile[256];
9116
9117 if (server.vm_enabled) vmReopenSwapFile();
9118 close(server.fd);
9119 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9120 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
9121 _exit(0);
9122 } else {
9123 _exit(1);
9124 }
9125 } else {
9126 /* Parent */
9127 if (childpid == -1) {
9128 redisLog(REDIS_WARNING,
9129 "Can't rewrite append only file in background: fork: %s",
9130 strerror(errno));
9131 return REDIS_ERR;
9132 }
9133 redisLog(REDIS_NOTICE,
9134 "Background append only file rewriting started by pid %d",childpid);
9135 server.bgrewritechildpid = childpid;
9136 updateDictResizePolicy();
9137 /* We set appendseldb to -1 in order to force the next call to the
9138 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9139 * accumulated by the parent into server.bgrewritebuf will start
9140 * with a SELECT statement and it will be safe to merge. */
9141 server.appendseldb = -1;
9142 return REDIS_OK;
9143 }
9144 return REDIS_OK; /* unreached */
9145 }
9146
9147 static void bgrewriteaofCommand(redisClient *c) {
9148 if (server.bgrewritechildpid != -1) {
9149 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9150 return;
9151 }
9152 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
9153 char *status = "+Background append only file rewriting started\r\n";
9154 addReplySds(c,sdsnew(status));
9155 } else {
9156 addReply(c,shared.err);
9157 }
9158 }
9159
/* Remove the temp file used by the background AOF rewrite performed by the
 * child with PID 'childpid'. The unlink(2) result is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
9166
9167 /* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
9169 * - Threaded Virtual Memory I/O
9170 * The two parts are not fully decoupled, but functions are split among two
9171 * different sections of the source code (delimited by comments) in order to
9172 * make more clear what functionality is about the blocking VM and what about
9173 * the threaded (not blocking) VM.
9174 *
9175 * Redis VM design:
9176 *
9177 * Redis VM is a blocking VM (one that blocks reading swapped values from
9178 * disk into memory when a value swapped out is needed in memory) that is made
9179 * unblocking by trying to examine the command argument vector in order to
9180 * load in background values that will likely be needed in order to exec
9181 * the command. The command is executed only once all the relevant keys
9182 * are loaded into memory.
9183 *
 * This is basically almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
9186 */
9187
9188 /* =================== Virtual Memory - Blocking Side ====================== */
9189
9190 /* Create a VM pointer object. This kind of objects are used in place of
9191 * values in the key -> value hash table, for swapped out objects. */
9192 static vmpointer *createVmPointer(int vtype) {
9193 vmpointer *vp = zmalloc(sizeof(vmpointer));
9194
9195 vp->type = REDIS_VMPOINTER;
9196 vp->storage = REDIS_VM_SWAPPED;
9197 vp->vtype = vtype;
9198 return vp;
9199 }
9200
9201 static void vmInit(void) {
9202 off_t totsize;
9203 int pipefds[2];
9204 size_t stacksize;
9205 struct flock fl;
9206
9207 if (server.vm_max_threads != 0)
9208 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9209
9210 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
9211 /* Try to open the old swap file, otherwise create it */
9212 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9213 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9214 }
9215 if (server.vm_fp == NULL) {
9216 redisLog(REDIS_WARNING,
9217 "Can't open the swap file: %s. Exiting.",
9218 strerror(errno));
9219 exit(1);
9220 }
9221 server.vm_fd = fileno(server.vm_fp);
9222 /* Lock the swap file for writing, this is useful in order to avoid
9223 * another instance to use the same swap file for a config error. */
9224 fl.l_type = F_WRLCK;
9225 fl.l_whence = SEEK_SET;
9226 fl.l_start = fl.l_len = 0;
9227 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9228 redisLog(REDIS_WARNING,
9229 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9230 exit(1);
9231 }
9232 /* Initialize */
9233 server.vm_next_page = 0;
9234 server.vm_near_pages = 0;
9235 server.vm_stats_used_pages = 0;
9236 server.vm_stats_swapped_objects = 0;
9237 server.vm_stats_swapouts = 0;
9238 server.vm_stats_swapins = 0;
9239 totsize = server.vm_pages*server.vm_page_size;
9240 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9241 if (ftruncate(server.vm_fd,totsize) == -1) {
9242 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9243 strerror(errno));
9244 exit(1);
9245 } else {
9246 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9247 }
9248 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
9249 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
9250 (long long) (server.vm_pages+7)/8, server.vm_pages);
9251 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
9252
9253 /* Initialize threaded I/O (used by Virtual Memory) */
9254 server.io_newjobs = listCreate();
9255 server.io_processing = listCreate();
9256 server.io_processed = listCreate();
9257 server.io_ready_clients = listCreate();
9258 pthread_mutex_init(&server.io_mutex,NULL);
9259 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9260 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
9261 server.io_active_threads = 0;
9262 if (pipe(pipefds) == -1) {
9263 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9264 ,strerror(errno));
9265 exit(1);
9266 }
9267 server.io_ready_pipe_read = pipefds[0];
9268 server.io_ready_pipe_write = pipefds[1];
9269 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
9270 /* LZF requires a lot of stack */
9271 pthread_attr_init(&server.io_threads_attr);
9272 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9273 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9274 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
9275 /* Listen for events in the threaded I/O pipe */
9276 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9277 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9278 oom("creating file event");
9279 }
9280
9281 /* Mark the page as used */
9282 static void vmMarkPageUsed(off_t page) {
9283 off_t byte = page/8;
9284 int bit = page&7;
9285 redisAssert(vmFreePage(page) == 1);
9286 server.vm_bitmap[byte] |= 1<<bit;
9287 }
9288
9289 /* Mark N contiguous pages as used, with 'page' being the first. */
9290 static void vmMarkPagesUsed(off_t page, off_t count) {
9291 off_t j;
9292
9293 for (j = 0; j < count; j++)
9294 vmMarkPageUsed(page+j);
9295 server.vm_stats_used_pages += count;
9296 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9297 (long long)count, (long long)page);
9298 }
9299
9300 /* Mark the page as free */
9301 static void vmMarkPageFree(off_t page) {
9302 off_t byte = page/8;
9303 int bit = page&7;
9304 redisAssert(vmFreePage(page) == 0);
9305 server.vm_bitmap[byte] &= ~(1<<bit);
9306 }
9307
9308 /* Mark N contiguous pages as free, with 'page' being the first. */
9309 static void vmMarkPagesFree(off_t page, off_t count) {
9310 off_t j;
9311
9312 for (j = 0; j < count; j++)
9313 vmMarkPageFree(page+j);
9314 server.vm_stats_used_pages -= count;
9315 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9316 (long long)count, (long long)page);
9317 }
9318
9319 /* Test if the page is free */
9320 static int vmFreePage(off_t page) {
9321 off_t byte = page/8;
9322 int bit = page&7;
9323 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9324 }
9325
9326 /* Find N contiguous free pages storing the first page of the cluster in *first.
9327 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9328 * REDIS_ERR is returned.
9329 *
9330 * This function uses a simple algorithm: we try to allocate
9331 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9332 * again from the start of the swap file searching for free spaces.
9333 *
9334 * If it looks pretty clear that there are no free pages near our offset
9335 * we try to find less populated places doing a forward jump of
9336 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9337 * without hurry, and then we jump again and so forth...
9338 *
9339 * This function can be improved using a free list to avoid to guess
9340 * too much, since we could collect data about freed pages.
9341 *
9342 * note: I implemented this function just after watching an episode of
9343 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9344 */
9345 static int vmFindContiguousPages(off_t *first, off_t n) {
9346 off_t base, offset = 0, since_jump = 0, numfree = 0;
9347
9348 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9349 server.vm_near_pages = 0;
9350 server.vm_next_page = 0;
9351 }
9352 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9353 base = server.vm_next_page;
9354
9355 while(offset < server.vm_pages) {
9356 off_t this = base+offset;
9357
9358 /* If we overflow, restart from page zero */
9359 if (this >= server.vm_pages) {
9360 this -= server.vm_pages;
9361 if (this == 0) {
9362 /* Just overflowed, what we found on tail is no longer
9363 * interesting, as it's no longer contiguous. */
9364 numfree = 0;
9365 }
9366 }
9367 if (vmFreePage(this)) {
9368 /* This is a free page */
9369 numfree++;
9370 /* Already got N free pages? Return to the caller, with success */
9371 if (numfree == n) {
9372 *first = this-(n-1);
9373 server.vm_next_page = this+1;
9374 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9375 return REDIS_OK;
9376 }
9377 } else {
9378 /* The current one is not a free page */
9379 numfree = 0;
9380 }
9381
9382 /* Fast-forward if the current page is not free and we already
9383 * searched enough near this place. */
9384 since_jump++;
9385 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9386 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9387 since_jump = 0;
9388 /* Note that even if we rewind after the jump, we are don't need
9389 * to make sure numfree is set to zero as we only jump *if* it
9390 * is set to zero. */
9391 } else {
9392 /* Otherwise just check the next page */
9393 offset++;
9394 }
9395 }
9396 return REDIS_ERR;
9397 }
9398
9399 /* Write the specified object at the specified page of the swap file */
9400 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9401 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9402 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9403 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9404 redisLog(REDIS_WARNING,
9405 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9406 strerror(errno));
9407 return REDIS_ERR;
9408 }
9409 rdbSaveObject(server.vm_fp,o);
9410 fflush(server.vm_fp);
9411 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9412 return REDIS_OK;
9413 }
9414
9415 /* Transfers the 'val' object to disk. Store all the information
9416 * a 'vmpointer' object containing all the information needed to load the
9417 * object back later is returned.
9418 *
9419 * If we can't find enough contiguous empty pages to swap the object on disk
9420 * NULL is returned. */
9421 static vmpointer *vmSwapObjectBlocking(robj *val) {
9422 off_t pages = rdbSavedObjectPages(val,NULL);
9423 off_t page;
9424 vmpointer *vp;
9425
9426 assert(val->storage == REDIS_VM_MEMORY);
9427 assert(val->refcount == 1);
9428 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9429 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9430
9431 vp = createVmPointer(val->type);
9432 vp->page = page;
9433 vp->usedpages = pages;
9434 decrRefCount(val); /* Deallocate the object from memory. */
9435 vmMarkPagesUsed(page,pages);
9436 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9437 (void*) val,
9438 (unsigned long long) page, (unsigned long long) pages);
9439 server.vm_stats_swapped_objects++;
9440 server.vm_stats_swapouts++;
9441 return vp;
9442 }
9443
9444 static robj *vmReadObjectFromSwap(off_t page, int type) {
9445 robj *o;
9446
9447 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9448 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9449 redisLog(REDIS_WARNING,
9450 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9451 strerror(errno));
9452 _exit(1);
9453 }
9454 o = rdbLoadObject(type,server.vm_fp);
9455 if (o == NULL) {
9456 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9457 _exit(1);
9458 }
9459 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9460 return o;
9461 }
9462
9463 /* Load the specified object from swap to memory.
9464 * The newly allocated object is returned.
9465 *
9466 * If preview is true the unserialized object is returned to the caller but
9467 * the pages are not marked as freed, nor the vp object is freed. */
9468 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9469 robj *val;
9470
9471 redisAssert(vp->type == REDIS_VMPOINTER &&
9472 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9473 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9474 if (!preview) {
9475 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9476 vmMarkPagesFree(vp->page,vp->usedpages);
9477 zfree(vp);
9478 server.vm_stats_swapped_objects--;
9479 } else {
9480 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9481 }
9482 server.vm_stats_swapins++;
9483 return val;
9484 }
9485
9486 /* Plain object loading, from swap to memory.
9487 *
9488 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9489 * The return value is the loaded object. */
9490 static robj *vmLoadObject(robj *o) {
9491 /* If we are loading the object in background, stop it, we
9492 * need to load this object synchronously ASAP. */
9493 if (o->storage == REDIS_VM_LOADING)
9494 vmCancelThreadedIOJob(o);
9495 return vmGenericLoadObject((vmpointer*)o,0);
9496 }
9497
9498 /* Just load the value on disk, without to modify the key.
9499 * This is useful when we want to perform some operation on the value
9500 * without to really bring it from swap to memory, like while saving the
9501 * dataset or rewriting the append only log. */
9502 static robj *vmPreviewObject(robj *o) {
9503 return vmGenericLoadObject((vmpointer*)o,1);
9504 }
9505
9506 /* How a good candidate is this object for swapping?
9507 * The better candidate it is, the greater the returned value.
9508 *
9509 * Currently we try to perform a fast estimation of the object size in
9510 * memory, and combine it with aging informations.
9511 *
9512 * Basically swappability = idle-time * log(estimated size)
9513 *
9514 * Bigger objects are preferred over smaller objects, but not
9515 * proportionally, this is why we use the logarithm. This algorithm is
9516 * just a first try and will probably be tuned later. */
9517 static double computeObjectSwappability(robj *o) {
9518 /* actual age can be >= minage, but not < minage. As we use wrapping
9519 * 21 bit clocks with minutes resolution for the LRU. */
9520 time_t minage = abs(server.lruclock - o->lru);
9521 long asize = 0;
9522 list *l;
9523 dict *d;
9524 struct dictEntry *de;
9525 int z;
9526
9527 if (minage <= 0) return 0;
9528 switch(o->type) {
9529 case REDIS_STRING:
9530 if (o->encoding != REDIS_ENCODING_RAW) {
9531 asize = sizeof(*o);
9532 } else {
9533 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9534 }
9535 break;
9536 case REDIS_LIST:
9537 l = o->ptr;
9538 listNode *ln = listFirst(l);
9539
9540 asize = sizeof(list);
9541 if (ln) {
9542 robj *ele = ln->value;
9543 long elesize;
9544
9545 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9546 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9547 asize += (sizeof(listNode)+elesize)*listLength(l);
9548 }
9549 break;
9550 case REDIS_SET:
9551 case REDIS_ZSET:
9552 z = (o->type == REDIS_ZSET);
9553 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9554
9555 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9556 if (z) asize += sizeof(zset)-sizeof(dict);
9557 if (dictSize(d)) {
9558 long elesize;
9559 robj *ele;
9560
9561 de = dictGetRandomKey(d);
9562 ele = dictGetEntryKey(de);
9563 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9564 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9565 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9566 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9567 }
9568 break;
9569 case REDIS_HASH:
9570 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9571 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9572 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9573 unsigned int klen, vlen;
9574 unsigned char *key, *val;
9575
9576 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9577 klen = 0;
9578 vlen = 0;
9579 }
9580 asize = len*(klen+vlen+3);
9581 } else if (o->encoding == REDIS_ENCODING_HT) {
9582 d = o->ptr;
9583 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9584 if (dictSize(d)) {
9585 long elesize;
9586 robj *ele;
9587
9588 de = dictGetRandomKey(d);
9589 ele = dictGetEntryKey(de);
9590 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9591 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9592 ele = dictGetEntryVal(de);
9593 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9594 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9595 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9596 }
9597 }
9598 break;
9599 }
9600 return (double)minage*log(1+asize);
9601 }
9602
9603 /* Try to swap an object that's a good candidate for swapping.
9604 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9605 * to swap any object at all.
9606 *
9607 * If 'usethreaded' is true, Redis will try to swap the object in background
9608 * using I/O threads. */
9609 static int vmSwapOneObject(int usethreads) {
9610 int j, i;
9611 struct dictEntry *best = NULL;
9612 double best_swappability = 0;
9613 redisDb *best_db = NULL;
9614 robj *val;
9615 sds key;
9616
9617 for (j = 0; j < server.dbnum; j++) {
9618 redisDb *db = server.db+j;
9619 /* Why maxtries is set to 100?
9620 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9621 * are swappable objects */
9622 int maxtries = 100;
9623
9624 if (dictSize(db->dict) == 0) continue;
9625 for (i = 0; i < 5; i++) {
9626 dictEntry *de;
9627 double swappability;
9628
9629 if (maxtries) maxtries--;
9630 de = dictGetRandomKey(db->dict);
9631 val = dictGetEntryVal(de);
9632 /* Only swap objects that are currently in memory.
9633 *
9634 * Also don't swap shared objects: not a good idea in general and
9635 * we need to ensure that the main thread does not touch the
9636 * object while the I/O thread is using it, but we can't
9637 * control other keys without adding additional mutex. */
9638 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9639 if (maxtries) i--; /* don't count this try */
9640 continue;
9641 }
9642 swappability = computeObjectSwappability(val);
9643 if (!best || swappability > best_swappability) {
9644 best = de;
9645 best_swappability = swappability;
9646 best_db = db;
9647 }
9648 }
9649 }
9650 if (best == NULL) return REDIS_ERR;
9651 key = dictGetEntryKey(best);
9652 val = dictGetEntryVal(best);
9653
9654 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9655 key, best_swappability);
9656
9657 /* Swap it */
9658 if (usethreads) {
9659 robj *keyobj = createStringObject(key,sdslen(key));
9660 vmSwapObjectThreaded(keyobj,val,best_db);
9661 decrRefCount(keyobj);
9662 return REDIS_OK;
9663 } else {
9664 vmpointer *vp;
9665
9666 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9667 dictGetEntryVal(best) = vp;
9668 return REDIS_OK;
9669 } else {
9670 return REDIS_ERR;
9671 }
9672 }
9673 }
9674
/* Swap one object out synchronously. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9678
/* Queue the swap of one object to the I/O threads. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9682
9683 /* Return true if it's safe to swap out objects in a given moment.
9684 * Basically we don't want to swap objects out while there is a BGSAVE
9685 * or a BGAEOREWRITE running in backgroud. */
9686 static int vmCanSwapOut(void) {
9687 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9688 }
9689
9690 /* =================== Virtual Memory - Threaded I/O ======================= */
9691
9692 static void freeIOJob(iojob *j) {
9693 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9694 j->type == REDIS_IOJOB_DO_SWAP ||
9695 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9696 {
9697 /* we fix the storage type, otherwise decrRefCount() will try to
9698 * kill the I/O thread Job (that does no longer exists). */
9699 if (j->val->storage == REDIS_VM_SWAPPING)
9700 j->val->storage = REDIS_VM_MEMORY;
9701 decrRefCount(j->val);
9702 }
9703 decrRefCount(j->key);
9704 zfree(j);
9705 }
9706
9707 /* Every time a thread finished a Job, it writes a byte into the write side
9708 * of an unix pipe in order to "awake" the main thread, and this function
9709 * is called. */
9710 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9711 int mask)
9712 {
9713 char buf[1];
9714 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9715 REDIS_NOTUSED(el);
9716 REDIS_NOTUSED(mask);
9717 REDIS_NOTUSED(privdata);
9718
9719 /* For every byte we read in the read side of the pipe, there is one
9720 * I/O job completed to process. */
9721 while((retval = read(fd,buf,1)) == 1) {
9722 iojob *j;
9723 listNode *ln;
9724 struct dictEntry *de;
9725
9726 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9727
9728 /* Get the processed element (the oldest one) */
9729 lockThreadedIO();
9730 assert(listLength(server.io_processed) != 0);
9731 if (toprocess == -1) {
9732 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9733 if (toprocess <= 0) toprocess = 1;
9734 }
9735 ln = listFirst(server.io_processed);
9736 j = ln->value;
9737 listDelNode(server.io_processed,ln);
9738 unlockThreadedIO();
9739 /* If this job is marked as canceled, just ignore it */
9740 if (j->canceled) {
9741 freeIOJob(j);
9742 continue;
9743 }
9744 /* Post process it in the main thread, as there are things we
9745 * can do just here to avoid race conditions and/or invasive locks */
9746 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
9747 de = dictFind(j->db->dict,j->key->ptr);
9748 redisAssert(de != NULL);
9749 if (j->type == REDIS_IOJOB_LOAD) {
9750 redisDb *db;
9751 vmpointer *vp = dictGetEntryVal(de);
9752
9753 /* Key loaded, bring it at home */
9754 vmMarkPagesFree(vp->page,vp->usedpages);
9755 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9756 (unsigned char*) j->key->ptr);
9757 server.vm_stats_swapped_objects--;
9758 server.vm_stats_swapins++;
9759 dictGetEntryVal(de) = j->val;
9760 incrRefCount(j->val);
9761 db = j->db;
9762 /* Handle clients waiting for this key to be loaded. */
9763 handleClientsBlockedOnSwappedKey(db,j->key);
9764 freeIOJob(j);
9765 zfree(vp);
9766 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9767 /* Now we know the amount of pages required to swap this object.
9768 * Let's find some space for it, and queue this task again
9769 * rebranded as REDIS_IOJOB_DO_SWAP. */
9770 if (!vmCanSwapOut() ||
9771 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9772 {
9773 /* Ooops... no space or we can't swap as there is
9774 * a fork()ed Redis trying to save stuff on disk. */
9775 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
9776 freeIOJob(j);
9777 } else {
9778 /* Note that we need to mark this pages as used now,
9779 * if the job will be canceled, we'll mark them as freed
9780 * again. */
9781 vmMarkPagesUsed(j->page,j->pages);
9782 j->type = REDIS_IOJOB_DO_SWAP;
9783 lockThreadedIO();
9784 queueIOJob(j);
9785 unlockThreadedIO();
9786 }
9787 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9788 vmpointer *vp;
9789
9790 /* Key swapped. We can finally free some memory. */
9791 if (j->val->storage != REDIS_VM_SWAPPING) {
9792 vmpointer *vp = (vmpointer*) j->id;
9793 printf("storage: %d\n",vp->storage);
9794 printf("key->name: %s\n",(char*)j->key->ptr);
9795 printf("val: %p\n",(void*)j->val);
9796 printf("val->type: %d\n",j->val->type);
9797 printf("val->ptr: %s\n",(char*)j->val->ptr);
9798 }
9799 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9800 vp = createVmPointer(j->val->type);
9801 vp->page = j->page;
9802 vp->usedpages = j->pages;
9803 dictGetEntryVal(de) = vp;
9804 /* Fix the storage otherwise decrRefCount will attempt to
9805 * remove the associated I/O job */
9806 j->val->storage = REDIS_VM_MEMORY;
9807 decrRefCount(j->val);
9808 redisLog(REDIS_DEBUG,
9809 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9810 (unsigned char*) j->key->ptr,
9811 (unsigned long long) j->page, (unsigned long long) j->pages);
9812 server.vm_stats_swapped_objects++;
9813 server.vm_stats_swapouts++;
9814 freeIOJob(j);
9815 /* Put a few more swap requests in queue if we are still
9816 * out of memory */
9817 if (trytoswap && vmCanSwapOut() &&
9818 zmalloc_used_memory() > server.vm_max_memory)
9819 {
9820 int more = 1;
9821 while(more) {
9822 lockThreadedIO();
9823 more = listLength(server.io_newjobs) <
9824 (unsigned) server.vm_max_threads;
9825 unlockThreadedIO();
9826 /* Don't waste CPU time if swappable objects are rare. */
9827 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9828 trytoswap = 0;
9829 break;
9830 }
9831 }
9832 }
9833 }
9834 processed++;
9835 if (processed == toprocess) return;
9836 }
9837 if (retval < 0 && errno != EAGAIN) {
9838 redisLog(REDIS_WARNING,
9839 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9840 strerror(errno));
9841 }
9842 }
9843
9844 static void lockThreadedIO(void) {
9845 pthread_mutex_lock(&server.io_mutex);
9846 }
9847
9848 static void unlockThreadedIO(void) {
9849 pthread_mutex_unlock(&server.io_mutex);
9850 }
9851
9852 /* Remove the specified object from the threaded I/O queue if still not
9853 * processed, otherwise make sure to flag it as canceled. */
9854 static void vmCancelThreadedIOJob(robj *o) {
9855 list *lists[3] = {
9856 server.io_newjobs, /* 0 */
9857 server.io_processing, /* 1 */
9858 server.io_processed /* 2 */
9859 };
9860 int i;
9861
9862 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9863 again:
9864 lockThreadedIO();
9865 /* Search for a matching object in one of the queues */
9866 for (i = 0; i < 3; i++) {
9867 listNode *ln;
9868 listIter li;
9869
9870 listRewind(lists[i],&li);
9871 while ((ln = listNext(&li)) != NULL) {
9872 iojob *job = ln->value;
9873
9874 if (job->canceled) continue; /* Skip this, already canceled. */
9875 if (job->id == o) {
9876 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
9877 (void*)job, (char*)job->key->ptr, job->type, i);
9878 /* Mark the pages as free since the swap didn't happened
9879 * or happened but is now discarded. */
9880 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9881 vmMarkPagesFree(job->page,job->pages);
9882 /* Cancel the job. It depends on the list the job is
9883 * living in. */
9884 switch(i) {
9885 case 0: /* io_newjobs */
9886 /* If the job was yet not processed the best thing to do
9887 * is to remove it from the queue at all */
9888 freeIOJob(job);
9889 listDelNode(lists[i],ln);
9890 break;
9891 case 1: /* io_processing */
9892 /* Oh Shi- the thread is messing with the Job:
9893 *
9894 * Probably it's accessing the object if this is a
9895 * PREPARE_SWAP or DO_SWAP job.
9896 * If it's a LOAD job it may be reading from disk and
9897 * if we don't wait for the job to terminate before to
9898 * cancel it, maybe in a few microseconds data can be
9899 * corrupted in this pages. So the short story is:
9900 *
9901 * Better to wait for the job to move into the
9902 * next queue (processed)... */
9903
9904 /* We try again and again until the job is completed. */
9905 unlockThreadedIO();
9906 /* But let's wait some time for the I/O thread
9907 * to finish with this job. After all this condition
9908 * should be very rare. */
9909 usleep(1);
9910 goto again;
9911 case 2: /* io_processed */
9912 /* The job was already processed, that's easy...
9913 * just mark it as canceled so that we'll ignore it
9914 * when processing completed jobs. */
9915 job->canceled = 1;
9916 break;
9917 }
9918 /* Finally we have to adjust the storage type of the object
9919 * in order to "UNDO" the operaiton. */
9920 if (o->storage == REDIS_VM_LOADING)
9921 o->storage = REDIS_VM_SWAPPED;
9922 else if (o->storage == REDIS_VM_SWAPPING)
9923 o->storage = REDIS_VM_MEMORY;
9924 unlockThreadedIO();
9925 redisLog(REDIS_DEBUG,"*** DONE");
9926 return;
9927 }
9928 }
9929 }
9930 unlockThreadedIO();
9931 printf("Not found: %p\n", (void*)o);
9932 redisAssert(1 != 1); /* We should never reach this */
9933 }
9934
9935 static void *IOThreadEntryPoint(void *arg) {
9936 iojob *j;
9937 listNode *ln;
9938 REDIS_NOTUSED(arg);
9939
9940 pthread_detach(pthread_self());
9941 while(1) {
9942 /* Get a new job to process */
9943 lockThreadedIO();
9944 if (listLength(server.io_newjobs) == 0) {
9945 /* No new jobs in queue, exit. */
9946 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9947 (long) pthread_self());
9948 server.io_active_threads--;
9949 unlockThreadedIO();
9950 return NULL;
9951 }
9952 ln = listFirst(server.io_newjobs);
9953 j = ln->value;
9954 listDelNode(server.io_newjobs,ln);
9955 /* Add the job in the processing queue */
9956 j->thread = pthread_self();
9957 listAddNodeTail(server.io_processing,j);
9958 ln = listLast(server.io_processing); /* We use ln later to remove it */
9959 unlockThreadedIO();
9960 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9961 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9962
9963 /* Process the Job */
9964 if (j->type == REDIS_IOJOB_LOAD) {
9965 vmpointer *vp = (vmpointer*)j->id;
9966 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
9967 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9968 FILE *fp = fopen("/dev/null","w+");
9969 j->pages = rdbSavedObjectPages(j->val,fp);
9970 fclose(fp);
9971 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9972 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9973 j->canceled = 1;
9974 }
9975
9976 /* Done: insert the job into the processed queue */
9977 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9978 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9979 lockThreadedIO();
9980 listDelNode(server.io_processing,ln);
9981 listAddNodeTail(server.io_processed,j);
9982 unlockThreadedIO();
9983
9984 /* Signal the main thread there is new stuff to process */
9985 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9986 }
9987 return NULL; /* never reached */
9988 }
9989
9990 static void spawnIOThread(void) {
9991 pthread_t thread;
9992 sigset_t mask, omask;
9993 int err;
9994
9995 sigemptyset(&mask);
9996 sigaddset(&mask,SIGCHLD);
9997 sigaddset(&mask,SIGHUP);
9998 sigaddset(&mask,SIGPIPE);
9999 pthread_sigmask(SIG_SETMASK, &mask, &omask);
10000 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10001 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10002 strerror(err));
10003 usleep(1000000);
10004 }
10005 pthread_sigmask(SIG_SETMASK, &omask, NULL);
10006 server.io_active_threads++;
10007 }
10008
10009 /* We need to wait for the last thread to exit before we are able to
10010 * fork() in order to BGSAVE or BGREWRITEAOF. */
10011 static void waitEmptyIOJobsQueue(void) {
10012 while(1) {
10013 int io_processed_len;
10014
10015 lockThreadedIO();
10016 if (listLength(server.io_newjobs) == 0 &&
10017 listLength(server.io_processing) == 0 &&
10018 server.io_active_threads == 0)
10019 {
10020 unlockThreadedIO();
10021 return;
10022 }
10023 /* While waiting for empty jobs queue condition we post-process some
10024 * finshed job, as I/O threads may be hanging trying to write against
10025 * the io_ready_pipe_write FD but there are so much pending jobs that
10026 * it's blocking. */
10027 io_processed_len = listLength(server.io_processed);
10028 unlockThreadedIO();
10029 if (io_processed_len) {
10030 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10031 usleep(1000); /* 1 millisecond */
10032 } else {
10033 usleep(10000); /* 10 milliseconds */
10034 }
10035 }
10036 }
10037
10038 static void vmReopenSwapFile(void) {
10039 /* Note: we don't close the old one as we are in the child process
10040 * and don't want to mess at all with the original file object. */
10041 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10042 if (server.vm_fp == NULL) {
10043 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10044 server.vm_swap_file);
10045 _exit(1);
10046 }
10047 server.vm_fd = fileno(server.vm_fp);
10048 }
10049
10050 /* This function must be called while with threaded IO locked */
10051 static void queueIOJob(iojob *j) {
10052 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10053 (void*)j, j->type, (char*)j->key->ptr);
10054 listAddNodeTail(server.io_newjobs,j);
10055 if (server.io_active_threads < server.vm_max_threads)
10056 spawnIOThread();
10057 }
10058
10059 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10060 iojob *j;
10061
10062 j = zmalloc(sizeof(*j));
10063 j->type = REDIS_IOJOB_PREPARE_SWAP;
10064 j->db = db;
10065 j->key = key;
10066 incrRefCount(key);
10067 j->id = j->val = val;
10068 incrRefCount(val);
10069 j->canceled = 0;
10070 j->thread = (pthread_t) -1;
10071 val->storage = REDIS_VM_SWAPPING;
10072
10073 lockThreadedIO();
10074 queueIOJob(j);
10075 unlockThreadedIO();
10076 return REDIS_OK;
10077 }
10078
10079 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
10080
10081 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10082 * If there is not already a job loading the key, it is craeted.
10083 * The key is added to the io_keys list in the client structure, and also
10084 * in the hash table mapping swapped keys to waiting clients, that is,
10085 * server.io_waited_keys. */
10086 static int waitForSwappedKey(redisClient *c, robj *key) {
10087 struct dictEntry *de;
10088 robj *o;
10089 list *l;
10090
10091 /* If the key does not exist or is already in RAM we don't need to
10092 * block the client at all. */
10093 de = dictFind(c->db->dict,key->ptr);
10094 if (de == NULL) return 0;
10095 o = dictGetEntryVal(de);
10096 if (o->storage == REDIS_VM_MEMORY) {
10097 return 0;
10098 } else if (o->storage == REDIS_VM_SWAPPING) {
10099 /* We were swapping the key, undo it! */
10100 vmCancelThreadedIOJob(o);
10101 return 0;
10102 }
10103
10104 /* OK: the key is either swapped, or being loaded just now. */
10105
10106 /* Add the key to the list of keys this client is waiting for.
10107 * This maps clients to keys they are waiting for. */
10108 listAddNodeTail(c->io_keys,key);
10109 incrRefCount(key);
10110
10111 /* Add the client to the swapped keys => clients waiting map. */
10112 de = dictFind(c->db->io_keys,key);
10113 if (de == NULL) {
10114 int retval;
10115
10116 /* For every key we take a list of clients blocked for it */
10117 l = listCreate();
10118 retval = dictAdd(c->db->io_keys,key,l);
10119 incrRefCount(key);
10120 assert(retval == DICT_OK);
10121 } else {
10122 l = dictGetEntryVal(de);
10123 }
10124 listAddNodeTail(l,c);
10125
10126 /* Are we already loading the key from disk? If not create a job */
10127 if (o->storage == REDIS_VM_SWAPPED) {
10128 iojob *j;
10129 vmpointer *vp = (vmpointer*)o;
10130
10131 o->storage = REDIS_VM_LOADING;
10132 j = zmalloc(sizeof(*j));
10133 j->type = REDIS_IOJOB_LOAD;
10134 j->db = c->db;
10135 j->id = (robj*)vp;
10136 j->key = key;
10137 incrRefCount(key);
10138 j->page = vp->page;
10139 j->val = NULL;
10140 j->canceled = 0;
10141 j->thread = (pthread_t) -1;
10142 lockThreadedIO();
10143 queueIOJob(j);
10144 unlockThreadedIO();
10145 }
10146 return 1;
10147 }
10148
10149 /* Preload keys for any command with first, last and step values for
10150 * the command keys prototype, as defined in the command table. */
10151 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10152 int j, last;
10153 if (cmd->vm_firstkey == 0) return;
10154 last = cmd->vm_lastkey;
10155 if (last < 0) last = argc+last;
10156 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10157 redisAssert(j < argc);
10158 waitForSwappedKey(c,argv[j]);
10159 }
10160 }
10161
10162 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
10163 * Note that the number of keys to preload is user-defined, so we need to
10164 * apply a sanity check against argc. */
10165 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10166 int i, num;
10167 REDIS_NOTUSED(cmd);
10168
10169 num = atoi(argv[2]->ptr);
10170 if (num > (argc-3)) return;
10171 for (i = 0; i < num; i++) {
10172 waitForSwappedKey(c,argv[3+i]);
10173 }
10174 }
10175
10176 /* Preload keys needed to execute the entire MULTI/EXEC block.
10177 *
10178 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10179 * and will block the client when any command requires a swapped out value. */
10180 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10181 int i, margc;
10182 struct redisCommand *mcmd;
10183 robj **margv;
10184 REDIS_NOTUSED(cmd);
10185 REDIS_NOTUSED(argc);
10186 REDIS_NOTUSED(argv);
10187
10188 if (!(c->flags & REDIS_MULTI)) return;
10189 for (i = 0; i < c->mstate.count; i++) {
10190 mcmd = c->mstate.commands[i].cmd;
10191 margc = c->mstate.commands[i].argc;
10192 margv = c->mstate.commands[i].argv;
10193
10194 if (mcmd->vm_preload_proc != NULL) {
10195 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10196 } else {
10197 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10198 }
10199 }
10200 }
10201
10202 /* Is this client attempting to run a command against swapped keys?
10203 * If so, block it ASAP, load the keys in background, then resume it.
10204 *
10205 * The important idea about this function is that it can fail! If keys will
10206 * still be swapped when the client is resumed, this key lookups will
10207 * just block loading keys from disk. In practical terms this should only
10208 * happen with SORT BY command or if there is a bug in this function.
10209 *
10210 * Return 1 if the client is marked as blocked, 0 if the client can
10211 * continue as the keys it is going to access appear to be in memory. */
10212 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
10213 if (cmd->vm_preload_proc != NULL) {
10214 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
10215 } else {
10216 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
10217 }
10218
10219 /* If the client was blocked for at least one key, mark it as blocked. */
10220 if (listLength(c->io_keys)) {
10221 c->flags |= REDIS_IO_WAIT;
10222 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10223 server.vm_blocked_clients++;
10224 return 1;
10225 } else {
10226 return 0;
10227 }
10228 }
10229
10230 /* Remove the 'key' from the list of blocked keys for a given client.
10231 *
10232 * The function returns 1 when there are no longer blocking keys after
10233 * the current one was removed (and the client can be unblocked). */
10234 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10235 list *l;
10236 listNode *ln;
10237 listIter li;
10238 struct dictEntry *de;
10239
10240 /* Remove the key from the list of keys this client is waiting for. */
10241 listRewind(c->io_keys,&li);
10242 while ((ln = listNext(&li)) != NULL) {
10243 if (equalStringObjects(ln->value,key)) {
10244 listDelNode(c->io_keys,ln);
10245 break;
10246 }
10247 }
10248 assert(ln != NULL);
10249
10250 /* Remove the client form the key => waiting clients map. */
10251 de = dictFind(c->db->io_keys,key);
10252 assert(de != NULL);
10253 l = dictGetEntryVal(de);
10254 ln = listSearchKey(l,c);
10255 assert(ln != NULL);
10256 listDelNode(l,ln);
10257 if (listLength(l) == 0)
10258 dictDelete(c->db->io_keys,key);
10259
10260 return listLength(c->io_keys) == 0;
10261 }
10262
10263 /* Every time we now a key was loaded back in memory, we handle clients
10264 * waiting for this key if any. */
10265 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10266 struct dictEntry *de;
10267 list *l;
10268 listNode *ln;
10269 int len;
10270
10271 de = dictFind(db->io_keys,key);
10272 if (!de) return;
10273
10274 l = dictGetEntryVal(de);
10275 len = listLength(l);
10276 /* Note: we can't use something like while(listLength(l)) as the list
10277 * can be freed by the calling function when we remove the last element. */
10278 while (len--) {
10279 ln = listFirst(l);
10280 redisClient *c = ln->value;
10281
10282 if (dontWaitForSwappedKey(c,key)) {
10283 /* Put the client in the list of clients ready to go as we
10284 * loaded all the keys about it. */
10285 listAddNodeTail(server.io_ready_clients,c);
10286 }
10287 }
10288 }
10289
10290 /* =========================== Remote Configuration ========================= */
10291
10292 static void configSetCommand(redisClient *c) {
10293 robj *o = getDecodedObject(c->argv[3]);
10294 long long ll;
10295
10296 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10297 zfree(server.dbfilename);
10298 server.dbfilename = zstrdup(o->ptr);
10299 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10300 zfree(server.requirepass);
10301 server.requirepass = zstrdup(o->ptr);
10302 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10303 zfree(server.masterauth);
10304 server.masterauth = zstrdup(o->ptr);
10305 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10306 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10307 ll < 0) goto badfmt;
10308 server.maxmemory = ll;
10309 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10310 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10311 ll < 0 || ll > LONG_MAX) goto badfmt;
10312 server.maxidletime = ll;
10313 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10314 if (!strcasecmp(o->ptr,"no")) {
10315 server.appendfsync = APPENDFSYNC_NO;
10316 } else if (!strcasecmp(o->ptr,"everysec")) {
10317 server.appendfsync = APPENDFSYNC_EVERYSEC;
10318 } else if (!strcasecmp(o->ptr,"always")) {
10319 server.appendfsync = APPENDFSYNC_ALWAYS;
10320 } else {
10321 goto badfmt;
10322 }
10323 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10324 int yn = yesnotoi(o->ptr);
10325
10326 if (yn == -1) goto badfmt;
10327 server.no_appendfsync_on_rewrite = yn;
10328 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10329 int old = server.appendonly;
10330 int new = yesnotoi(o->ptr);
10331
10332 if (new == -1) goto badfmt;
10333 if (old != new) {
10334 if (new == 0) {
10335 stopAppendOnly();
10336 } else {
10337 if (startAppendOnly() == REDIS_ERR) {
10338 addReplySds(c,sdscatprintf(sdsempty(),
10339 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10340 decrRefCount(o);
10341 return;
10342 }
10343 }
10344 }
10345 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10346 int vlen, j;
10347 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10348
10349 /* Perform sanity check before setting the new config:
10350 * - Even number of args
10351 * - Seconds >= 1, changes >= 0 */
10352 if (vlen & 1) {
10353 sdsfreesplitres(v,vlen);
10354 goto badfmt;
10355 }
10356 for (j = 0; j < vlen; j++) {
10357 char *eptr;
10358 long val;
10359
10360 val = strtoll(v[j], &eptr, 10);
10361 if (eptr[0] != '\0' ||
10362 ((j & 1) == 0 && val < 1) ||
10363 ((j & 1) == 1 && val < 0)) {
10364 sdsfreesplitres(v,vlen);
10365 goto badfmt;
10366 }
10367 }
10368 /* Finally set the new config */
10369 resetServerSaveParams();
10370 for (j = 0; j < vlen; j += 2) {
10371 time_t seconds;
10372 int changes;
10373
10374 seconds = strtoll(v[j],NULL,10);
10375 changes = strtoll(v[j+1],NULL,10);
10376 appendServerSaveParams(seconds, changes);
10377 }
10378 sdsfreesplitres(v,vlen);
10379 } else {
10380 addReplySds(c,sdscatprintf(sdsempty(),
10381 "-ERR not supported CONFIG parameter %s\r\n",
10382 (char*)c->argv[2]->ptr));
10383 decrRefCount(o);
10384 return;
10385 }
10386 decrRefCount(o);
10387 addReply(c,shared.ok);
10388 return;
10389
10390 badfmt: /* Bad format errors */
10391 addReplySds(c,sdscatprintf(sdsempty(),
10392 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10393 (char*)o->ptr,
10394 (char*)c->argv[2]->ptr));
10395 decrRefCount(o);
10396 }
10397
10398 static void configGetCommand(redisClient *c) {
10399 robj *o = getDecodedObject(c->argv[2]);
10400 robj *lenobj = createObject(REDIS_STRING,NULL);
10401 char *pattern = o->ptr;
10402 int matches = 0;
10403
10404 addReply(c,lenobj);
10405 decrRefCount(lenobj);
10406
10407 if (stringmatch(pattern,"dbfilename",0)) {
10408 addReplyBulkCString(c,"dbfilename");
10409 addReplyBulkCString(c,server.dbfilename);
10410 matches++;
10411 }
10412 if (stringmatch(pattern,"requirepass",0)) {
10413 addReplyBulkCString(c,"requirepass");
10414 addReplyBulkCString(c,server.requirepass);
10415 matches++;
10416 }
10417 if (stringmatch(pattern,"masterauth",0)) {
10418 addReplyBulkCString(c,"masterauth");
10419 addReplyBulkCString(c,server.masterauth);
10420 matches++;
10421 }
10422 if (stringmatch(pattern,"maxmemory",0)) {
10423 char buf[128];
10424
10425 ll2string(buf,128,server.maxmemory);
10426 addReplyBulkCString(c,"maxmemory");
10427 addReplyBulkCString(c,buf);
10428 matches++;
10429 }
10430 if (stringmatch(pattern,"timeout",0)) {
10431 char buf[128];
10432
10433 ll2string(buf,128,server.maxidletime);
10434 addReplyBulkCString(c,"timeout");
10435 addReplyBulkCString(c,buf);
10436 matches++;
10437 }
10438 if (stringmatch(pattern,"appendonly",0)) {
10439 addReplyBulkCString(c,"appendonly");
10440 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10441 matches++;
10442 }
10443 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10444 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10445 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10446 matches++;
10447 }
10448 if (stringmatch(pattern,"appendfsync",0)) {
10449 char *policy;
10450
10451 switch(server.appendfsync) {
10452 case APPENDFSYNC_NO: policy = "no"; break;
10453 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10454 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10455 default: policy = "unknown"; break; /* too harmless to panic */
10456 }
10457 addReplyBulkCString(c,"appendfsync");
10458 addReplyBulkCString(c,policy);
10459 matches++;
10460 }
10461 if (stringmatch(pattern,"save",0)) {
10462 sds buf = sdsempty();
10463 int j;
10464
10465 for (j = 0; j < server.saveparamslen; j++) {
10466 buf = sdscatprintf(buf,"%ld %d",
10467 server.saveparams[j].seconds,
10468 server.saveparams[j].changes);
10469 if (j != server.saveparamslen-1)
10470 buf = sdscatlen(buf," ",1);
10471 }
10472 addReplyBulkCString(c,"save");
10473 addReplyBulkCString(c,buf);
10474 sdsfree(buf);
10475 matches++;
10476 }
10477 decrRefCount(o);
10478 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10479 }
10480
10481 static void configCommand(redisClient *c) {
10482 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10483 if (c->argc != 4) goto badarity;
10484 configSetCommand(c);
10485 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10486 if (c->argc != 3) goto badarity;
10487 configGetCommand(c);
10488 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10489 if (c->argc != 2) goto badarity;
10490 server.stat_numcommands = 0;
10491 server.stat_numconnections = 0;
10492 server.stat_expiredkeys = 0;
10493 server.stat_starttime = time(NULL);
10494 addReply(c,shared.ok);
10495 } else {
10496 addReplySds(c,sdscatprintf(sdsempty(),
10497 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10498 }
10499 return;
10500
10501 badarity:
10502 addReplySds(c,sdscatprintf(sdsempty(),
10503 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10504 (char*) c->argv[1]->ptr));
10505 }
10506
10507 /* =========================== Pubsub implementation ======================== */
10508
10509 static void freePubsubPattern(void *p) {
10510 pubsubPattern *pat = p;
10511
10512 decrRefCount(pat->pattern);
10513 zfree(pat);
10514 }
10515
10516 static int listMatchPubsubPattern(void *a, void *b) {
10517 pubsubPattern *pa = a, *pb = b;
10518
10519 return (pa->client == pb->client) &&
10520 (equalStringObjects(pa->pattern,pb->pattern));
10521 }
10522
10523 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10524 * 0 if the client was already subscribed to that channel. */
10525 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10526 struct dictEntry *de;
10527 list *clients = NULL;
10528 int retval = 0;
10529
10530 /* Add the channel to the client -> channels hash table */
10531 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10532 retval = 1;
10533 incrRefCount(channel);
10534 /* Add the client to the channel -> list of clients hash table */
10535 de = dictFind(server.pubsub_channels,channel);
10536 if (de == NULL) {
10537 clients = listCreate();
10538 dictAdd(server.pubsub_channels,channel,clients);
10539 incrRefCount(channel);
10540 } else {
10541 clients = dictGetEntryVal(de);
10542 }
10543 listAddNodeTail(clients,c);
10544 }
10545 /* Notify the client */
10546 addReply(c,shared.mbulk3);
10547 addReply(c,shared.subscribebulk);
10548 addReplyBulk(c,channel);
10549 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10550 return retval;
10551 }
10552
10553 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10554 * 0 if the client was not subscribed to the specified channel. */
10555 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10556 struct dictEntry *de;
10557 list *clients;
10558 listNode *ln;
10559 int retval = 0;
10560
10561 /* Remove the channel from the client -> channels hash table */
10562 incrRefCount(channel); /* channel may be just a pointer to the same object
10563 we have in the hash tables. Protect it... */
10564 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10565 retval = 1;
10566 /* Remove the client from the channel -> clients list hash table */
10567 de = dictFind(server.pubsub_channels,channel);
10568 assert(de != NULL);
10569 clients = dictGetEntryVal(de);
10570 ln = listSearchKey(clients,c);
10571 assert(ln != NULL);
10572 listDelNode(clients,ln);
10573 if (listLength(clients) == 0) {
10574 /* Free the list and associated hash entry at all if this was
10575 * the latest client, so that it will be possible to abuse
10576 * Redis PUBSUB creating millions of channels. */
10577 dictDelete(server.pubsub_channels,channel);
10578 }
10579 }
10580 /* Notify the client */
10581 if (notify) {
10582 addReply(c,shared.mbulk3);
10583 addReply(c,shared.unsubscribebulk);
10584 addReplyBulk(c,channel);
10585 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10586 listLength(c->pubsub_patterns));
10587
10588 }
10589 decrRefCount(channel); /* it is finally safe to release it */
10590 return retval;
10591 }
10592
10593 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10594 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10595 int retval = 0;
10596
10597 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10598 retval = 1;
10599 pubsubPattern *pat;
10600 listAddNodeTail(c->pubsub_patterns,pattern);
10601 incrRefCount(pattern);
10602 pat = zmalloc(sizeof(*pat));
10603 pat->pattern = getDecodedObject(pattern);
10604 pat->client = c;
10605 listAddNodeTail(server.pubsub_patterns,pat);
10606 }
10607 /* Notify the client */
10608 addReply(c,shared.mbulk3);
10609 addReply(c,shared.psubscribebulk);
10610 addReplyBulk(c,pattern);
10611 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10612 return retval;
10613 }
10614
10615 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10616 * 0 if the client was not subscribed to the specified channel. */
10617 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10618 listNode *ln;
10619 pubsubPattern pat;
10620 int retval = 0;
10621
10622 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10623 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10624 retval = 1;
10625 listDelNode(c->pubsub_patterns,ln);
10626 pat.client = c;
10627 pat.pattern = pattern;
10628 ln = listSearchKey(server.pubsub_patterns,&pat);
10629 listDelNode(server.pubsub_patterns,ln);
10630 }
10631 /* Notify the client */
10632 if (notify) {
10633 addReply(c,shared.mbulk3);
10634 addReply(c,shared.punsubscribebulk);
10635 addReplyBulk(c,pattern);
10636 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10637 listLength(c->pubsub_patterns));
10638 }
10639 decrRefCount(pattern);
10640 return retval;
10641 }
10642
10643 /* Unsubscribe from all the channels. Return the number of channels the
10644 * client was subscribed from. */
10645 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10646 dictIterator *di = dictGetIterator(c->pubsub_channels);
10647 dictEntry *de;
10648 int count = 0;
10649
10650 while((de = dictNext(di)) != NULL) {
10651 robj *channel = dictGetEntryKey(de);
10652
10653 count += pubsubUnsubscribeChannel(c,channel,notify);
10654 }
10655 dictReleaseIterator(di);
10656 return count;
10657 }
10658
10659 /* Unsubscribe from all the patterns. Return the number of patterns the
10660 * client was subscribed from. */
10661 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10662 listNode *ln;
10663 listIter li;
10664 int count = 0;
10665
10666 listRewind(c->pubsub_patterns,&li);
10667 while ((ln = listNext(&li)) != NULL) {
10668 robj *pattern = ln->value;
10669
10670 count += pubsubUnsubscribePattern(c,pattern,notify);
10671 }
10672 return count;
10673 }
10674
10675 /* Publish a message */
10676 static int pubsubPublishMessage(robj *channel, robj *message) {
10677 int receivers = 0;
10678 struct dictEntry *de;
10679 listNode *ln;
10680 listIter li;
10681
10682 /* Send to clients listening for that channel */
10683 de = dictFind(server.pubsub_channels,channel);
10684 if (de) {
10685 list *list = dictGetEntryVal(de);
10686 listNode *ln;
10687 listIter li;
10688
10689 listRewind(list,&li);
10690 while ((ln = listNext(&li)) != NULL) {
10691 redisClient *c = ln->value;
10692
10693 addReply(c,shared.mbulk3);
10694 addReply(c,shared.messagebulk);
10695 addReplyBulk(c,channel);
10696 addReplyBulk(c,message);
10697 receivers++;
10698 }
10699 }
10700 /* Send to clients listening to matching channels */
10701 if (listLength(server.pubsub_patterns)) {
10702 listRewind(server.pubsub_patterns,&li);
10703 channel = getDecodedObject(channel);
10704 while ((ln = listNext(&li)) != NULL) {
10705 pubsubPattern *pat = ln->value;
10706
10707 if (stringmatchlen((char*)pat->pattern->ptr,
10708 sdslen(pat->pattern->ptr),
10709 (char*)channel->ptr,
10710 sdslen(channel->ptr),0)) {
10711 addReply(pat->client,shared.mbulk4);
10712 addReply(pat->client,shared.pmessagebulk);
10713 addReplyBulk(pat->client,pat->pattern);
10714 addReplyBulk(pat->client,channel);
10715 addReplyBulk(pat->client,message);
10716 receivers++;
10717 }
10718 }
10719 decrRefCount(channel);
10720 }
10721 return receivers;
10722 }
10723
10724 static void subscribeCommand(redisClient *c) {
10725 int j;
10726
10727 for (j = 1; j < c->argc; j++)
10728 pubsubSubscribeChannel(c,c->argv[j]);
10729 }
10730
10731 static void unsubscribeCommand(redisClient *c) {
10732 if (c->argc == 1) {
10733 pubsubUnsubscribeAllChannels(c,1);
10734 return;
10735 } else {
10736 int j;
10737
10738 for (j = 1; j < c->argc; j++)
10739 pubsubUnsubscribeChannel(c,c->argv[j],1);
10740 }
10741 }
10742
10743 static void psubscribeCommand(redisClient *c) {
10744 int j;
10745
10746 for (j = 1; j < c->argc; j++)
10747 pubsubSubscribePattern(c,c->argv[j]);
10748 }
10749
10750 static void punsubscribeCommand(redisClient *c) {
10751 if (c->argc == 1) {
10752 pubsubUnsubscribeAllPatterns(c,1);
10753 return;
10754 } else {
10755 int j;
10756
10757 for (j = 1; j < c->argc; j++)
10758 pubsubUnsubscribePattern(c,c->argv[j],1);
10759 }
10760 }
10761
10762 static void publishCommand(redisClient *c) {
10763 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10764 addReplyLongLong(c,receivers);
10765 }
10766
10767 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10768 *
10769 * The implementation uses a per-DB hash table mapping keys to list of clients
10770 * WATCHing those keys, so that given a key that is going to be modified
10771 * we can mark all the associated clients as dirty.
10772 *
10773 * Also every client contains a list of WATCHed keys so that's possible to
10774 * un-watch such keys when the client is freed or when UNWATCH is called. */
10775
10776 /* In the client->watched_keys list we need to use watchedKey structures
10777 * as in order to identify a key in Redis we need both the key name and the
10778 * DB */
10779 typedef struct watchedKey {
10780 robj *key;
10781 redisDb *db;
10782 } watchedKey;
10783
10784 /* Watch for the specified key */
10785 static void watchForKey(redisClient *c, robj *key) {
10786 list *clients = NULL;
10787 listIter li;
10788 listNode *ln;
10789 watchedKey *wk;
10790
10791 /* Check if we are already watching for this key */
10792 listRewind(c->watched_keys,&li);
10793 while((ln = listNext(&li))) {
10794 wk = listNodeValue(ln);
10795 if (wk->db == c->db && equalStringObjects(key,wk->key))
10796 return; /* Key already watched */
10797 }
10798 /* This key is not already watched in this DB. Let's add it */
10799 clients = dictFetchValue(c->db->watched_keys,key);
10800 if (!clients) {
10801 clients = listCreate();
10802 dictAdd(c->db->watched_keys,key,clients);
10803 incrRefCount(key);
10804 }
10805 listAddNodeTail(clients,c);
10806 /* Add the new key to the lits of keys watched by this client */
10807 wk = zmalloc(sizeof(*wk));
10808 wk->key = key;
10809 wk->db = c->db;
10810 incrRefCount(key);
10811 listAddNodeTail(c->watched_keys,wk);
10812 }
10813
10814 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10815 * flag is up to the caller. */
10816 static void unwatchAllKeys(redisClient *c) {
10817 listIter li;
10818 listNode *ln;
10819
10820 if (listLength(c->watched_keys) == 0) return;
10821 listRewind(c->watched_keys,&li);
10822 while((ln = listNext(&li))) {
10823 list *clients;
10824 watchedKey *wk;
10825
10826 /* Lookup the watched key -> clients list and remove the client
10827 * from the list */
10828 wk = listNodeValue(ln);
10829 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10830 assert(clients != NULL);
10831 listDelNode(clients,listSearchKey(clients,c));
10832 /* Kill the entry at all if this was the only client */
10833 if (listLength(clients) == 0)
10834 dictDelete(wk->db->watched_keys, wk->key);
10835 /* Remove this watched key from the client->watched list */
10836 listDelNode(c->watched_keys,ln);
10837 decrRefCount(wk->key);
10838 zfree(wk);
10839 }
10840 }
10841
10842 /* "Touch" a key, so that if this key is being WATCHed by some client the
10843 * next EXEC will fail. */
10844 static void touchWatchedKey(redisDb *db, robj *key) {
10845 list *clients;
10846 listIter li;
10847 listNode *ln;
10848
10849 if (dictSize(db->watched_keys) == 0) return;
10850 clients = dictFetchValue(db->watched_keys, key);
10851 if (!clients) return;
10852
10853 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10854 /* Check if we are already watching for this key */
10855 listRewind(clients,&li);
10856 while((ln = listNext(&li))) {
10857 redisClient *c = listNodeValue(ln);
10858
10859 c->flags |= REDIS_DIRTY_CAS;
10860 }
10861 }
10862
10863 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10864 * flush but will be deleted as effect of the flushing operation should
10865 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10866 * a FLUSHALL operation (all the DBs flushed). */
10867 static void touchWatchedKeysOnFlush(int dbid) {
10868 listIter li1, li2;
10869 listNode *ln;
10870
10871 /* For every client, check all the waited keys */
10872 listRewind(server.clients,&li1);
10873 while((ln = listNext(&li1))) {
10874 redisClient *c = listNodeValue(ln);
10875 listRewind(c->watched_keys,&li2);
10876 while((ln = listNext(&li2))) {
10877 watchedKey *wk = listNodeValue(ln);
10878
10879 /* For every watched key matching the specified DB, if the
10880 * key exists, mark the client as dirty, as the key will be
10881 * removed. */
10882 if (dbid == -1 || wk->db->id == dbid) {
10883 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
10884 c->flags |= REDIS_DIRTY_CAS;
10885 }
10886 }
10887 }
10888 }
10889
10890 static void watchCommand(redisClient *c) {
10891 int j;
10892
10893 if (c->flags & REDIS_MULTI) {
10894 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10895 return;
10896 }
10897 for (j = 1; j < c->argc; j++)
10898 watchForKey(c,c->argv[j]);
10899 addReply(c,shared.ok);
10900 }
10901
10902 static void unwatchCommand(redisClient *c) {
10903 unwatchAllKeys(c);
10904 c->flags &= (~REDIS_DIRTY_CAS);
10905 addReply(c,shared.ok);
10906 }
10907
10908 /* ================================= Debugging ============================== */
10909
10910 /* Compute the sha1 of string at 's' with 'len' bytes long.
10911 * The SHA1 is then xored againt the string pointed by digest.
10912 * Since xor is commutative, this operation is used in order to
10913 * "add" digests relative to unordered elements.
10914 *
10915 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10916 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10917 SHA1_CTX ctx;
10918 unsigned char hash[20], *s = ptr;
10919 int j;
10920
10921 SHA1Init(&ctx);
10922 SHA1Update(&ctx,s,len);
10923 SHA1Final(hash,&ctx);
10924
10925 for (j = 0; j < 20; j++)
10926 digest[j] ^= hash[j];
10927 }
10928
10929 static void xorObjectDigest(unsigned char *digest, robj *o) {
10930 o = getDecodedObject(o);
10931 xorDigest(digest,o->ptr,sdslen(o->ptr));
10932 decrRefCount(o);
10933 }
10934
10935 /* This function instead of just computing the SHA1 and xoring it
10936 * against diget, also perform the digest of "digest" itself and
10937 * replace the old value with the new one.
10938 *
10939 * So the final digest will be:
10940 *
10941 * digest = SHA1(digest xor SHA1(data))
10942 *
10943 * This function is used every time we want to preserve the order so
10944 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10945 *
10946 * Also note that mixdigest("foo") followed by mixdigest("bar")
10947 * will lead to a different digest compared to "fo", "obar".
10948 */
10949 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10950 SHA1_CTX ctx;
10951 char *s = ptr;
10952
10953 xorDigest(digest,s,len);
10954 SHA1Init(&ctx);
10955 SHA1Update(&ctx,digest,20);
10956 SHA1Final(digest,&ctx);
10957 }
10958
10959 static void mixObjectDigest(unsigned char *digest, robj *o) {
10960 o = getDecodedObject(o);
10961 mixDigest(digest,o->ptr,sdslen(o->ptr));
10962 decrRefCount(o);
10963 }
10964
10965 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10966 * are not ordered, we use a trick: every aggregate digest is the xor
10967 * of the digests of their elements. This way the order will not change
10968 * the result. For list instead we use a feedback entering the output digest
10969 * as input in order to ensure that a different ordered list will result in
10970 * a different digest. */
10971 static void computeDatasetDigest(unsigned char *final) {
10972 unsigned char digest[20];
10973 char buf[128];
10974 dictIterator *di = NULL;
10975 dictEntry *de;
10976 int j;
10977 uint32_t aux;
10978
10979 memset(final,0,20); /* Start with a clean result */
10980
10981 for (j = 0; j < server.dbnum; j++) {
10982 redisDb *db = server.db+j;
10983
10984 if (dictSize(db->dict) == 0) continue;
10985 di = dictGetIterator(db->dict);
10986
10987 /* hash the DB id, so the same dataset moved in a different
10988 * DB will lead to a different digest */
10989 aux = htonl(j);
10990 mixDigest(final,&aux,sizeof(aux));
10991
10992 /* Iterate this DB writing every entry */
10993 while((de = dictNext(di)) != NULL) {
10994 sds key;
10995 robj *keyobj, *o;
10996 time_t expiretime;
10997
10998 memset(digest,0,20); /* This key-val digest */
10999 key = dictGetEntryKey(de);
11000 keyobj = createStringObject(key,sdslen(key));
11001
11002 mixDigest(digest,key,sdslen(key));
11003
11004 /* Make sure the key is loaded if VM is active */
11005 o = lookupKeyRead(db,keyobj);
11006
11007 aux = htonl(o->type);
11008 mixDigest(digest,&aux,sizeof(aux));
11009 expiretime = getExpire(db,keyobj);
11010
11011 /* Save the key and associated value */
11012 if (o->type == REDIS_STRING) {
11013 mixObjectDigest(digest,o);
11014 } else if (o->type == REDIS_LIST) {
11015 lIterator *li = lInitIterator(o,0,REDIS_TAIL);
11016 lEntry entry;
11017 while(lNext(li,&entry)) {
11018 robj *eleobj = lGet(&entry);
11019 mixObjectDigest(digest,eleobj);
11020 decrRefCount(eleobj);
11021 }
11022 lReleaseIterator(li);
11023 } else if (o->type == REDIS_SET) {
11024 dict *set = o->ptr;
11025 dictIterator *di = dictGetIterator(set);
11026 dictEntry *de;
11027
11028 while((de = dictNext(di)) != NULL) {
11029 robj *eleobj = dictGetEntryKey(de);
11030
11031 xorObjectDigest(digest,eleobj);
11032 }
11033 dictReleaseIterator(di);
11034 } else if (o->type == REDIS_ZSET) {
11035 zset *zs = o->ptr;
11036 dictIterator *di = dictGetIterator(zs->dict);
11037 dictEntry *de;
11038
11039 while((de = dictNext(di)) != NULL) {
11040 robj *eleobj = dictGetEntryKey(de);
11041 double *score = dictGetEntryVal(de);
11042 unsigned char eledigest[20];
11043
11044 snprintf(buf,sizeof(buf),"%.17g",*score);
11045 memset(eledigest,0,20);
11046 mixObjectDigest(eledigest,eleobj);
11047 mixDigest(eledigest,buf,strlen(buf));
11048 xorDigest(digest,eledigest,20);
11049 }
11050 dictReleaseIterator(di);
11051 } else if (o->type == REDIS_HASH) {
11052 hashIterator *hi;
11053 robj *obj;
11054
11055 hi = hashInitIterator(o);
11056 while (hashNext(hi) != REDIS_ERR) {
11057 unsigned char eledigest[20];
11058
11059 memset(eledigest,0,20);
11060 obj = hashCurrent(hi,REDIS_HASH_KEY);
11061 mixObjectDigest(eledigest,obj);
11062 decrRefCount(obj);
11063 obj = hashCurrent(hi,REDIS_HASH_VALUE);
11064 mixObjectDigest(eledigest,obj);
11065 decrRefCount(obj);
11066 xorDigest(digest,eledigest,20);
11067 }
11068 hashReleaseIterator(hi);
11069 } else {
11070 redisPanic("Unknown object type");
11071 }
11072 /* If the key has an expire, add it to the mix */
11073 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11074 /* We can finally xor the key-val digest to the final digest */
11075 xorDigest(final,digest,20);
11076 decrRefCount(keyobj);
11077 }
11078 dictReleaseIterator(di);
11079 }
11080 }
11081
11082 static void debugCommand(redisClient *c) {
11083 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11084 *((char*)-1) = 'x';
11085 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11086 if (rdbSave(server.dbfilename) != REDIS_OK) {
11087 addReply(c,shared.err);
11088 return;
11089 }
11090 emptyDb();
11091 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11092 addReply(c,shared.err);
11093 return;
11094 }
11095 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11096 addReply(c,shared.ok);
11097 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11098 emptyDb();
11099 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11100 addReply(c,shared.err);
11101 return;
11102 }
11103 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11104 addReply(c,shared.ok);
11105 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11106 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11107 robj *val;
11108
11109 if (!de) {
11110 addReply(c,shared.nokeyerr);
11111 return;
11112 }
11113 val = dictGetEntryVal(de);
11114 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11115 val->storage == REDIS_VM_SWAPPING)) {
11116 char *strenc;
11117 char buf[128];
11118
11119 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11120 strenc = strencoding[val->encoding];
11121 } else {
11122 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11123 strenc = buf;
11124 }
11125 addReplySds(c,sdscatprintf(sdsempty(),
11126 "+Value at:%p refcount:%d "
11127 "encoding:%s serializedlength:%lld\r\n",
11128 (void*)val, val->refcount,
11129 strenc, (long long) rdbSavedObjectLen(val,NULL)));
11130 } else {
11131 vmpointer *vp = (vmpointer*) val;
11132 addReplySds(c,sdscatprintf(sdsempty(),
11133 "+Value swapped at: page %llu "
11134 "using %llu pages\r\n",
11135 (unsigned long long) vp->page,
11136 (unsigned long long) vp->usedpages));
11137 }
11138 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11139 lookupKeyRead(c->db,c->argv[2]);
11140 addReply(c,shared.ok);
11141 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11142 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11143 robj *val;
11144 vmpointer *vp;
11145
11146 if (!server.vm_enabled) {
11147 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11148 return;
11149 }
11150 if (!de) {
11151 addReply(c,shared.nokeyerr);
11152 return;
11153 }
11154 val = dictGetEntryVal(de);
11155 /* Swap it */
11156 if (val->storage != REDIS_VM_MEMORY) {
11157 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
11158 } else if (val->refcount != 1) {
11159 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11160 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11161 dictGetEntryVal(de) = vp;
11162 addReply(c,shared.ok);
11163 } else {
11164 addReply(c,shared.err);
11165 }
11166 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11167 long keys, j;
11168 robj *key, *val;
11169 char buf[128];
11170
11171 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11172 return;
11173 for (j = 0; j < keys; j++) {
11174 snprintf(buf,sizeof(buf),"key:%lu",j);
11175 key = createStringObject(buf,strlen(buf));
11176 if (lookupKeyRead(c->db,key) != NULL) {
11177 decrRefCount(key);
11178 continue;
11179 }
11180 snprintf(buf,sizeof(buf),"value:%lu",j);
11181 val = createStringObject(buf,strlen(buf));
11182 dbAdd(c->db,key,val);
11183 decrRefCount(key);
11184 }
11185 addReply(c,shared.ok);
11186 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11187 unsigned char digest[20];
11188 sds d = sdsnew("+");
11189 int j;
11190
11191 computeDatasetDigest(digest);
11192 for (j = 0; j < 20; j++)
11193 d = sdscatprintf(d, "%02x",digest[j]);
11194
11195 d = sdscatlen(d,"\r\n",2);
11196 addReplySds(c,d);
11197 } else {
11198 addReplySds(c,sdsnew(
11199 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
11200 }
11201 }
11202
11203 static void _redisAssert(char *estr, char *file, int line) {
11204 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
11205 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
11206 #ifdef HAVE_BACKTRACE
11207 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11208 *((char*)-1) = 'x';
11209 #endif
11210 }
11211
11212 static void _redisPanic(char *msg, char *file, int line) {
11213 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
11214 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
11215 #ifdef HAVE_BACKTRACE
11216 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11217 *((char*)-1) = 'x';
11218 #endif
11219 }
11220
11221 /* =================================== Main! ================================ */
11222
11223 #ifdef __linux__
/* Read the current vm.overcommit_memory setting from
 * /proc/sys/vm/overcommit_memory.
 *
 * Returns the value found in the file, or -1 when the file can't be
 * opened or read, or when its content does not start with a non
 * negative number fitting in an int. Unlike atoi(), which returns 0
 * for unparsable input, this keeps a read problem from being mistaken
 * for a real setting of 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long val;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    val = strtol(buf,&end,10);
    if (end == buf) return -1;                /* no digits at all */
    if (val < 0 || val > INT_MAX) return -1;  /* out of range */
    return (int) val;
}
11237
11238 void linuxOvercommitMemoryWarning(void) {
11239 if (linuxOvercommitMemoryValue() == 0) {
11240 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
11241 }
11242 }
11243 #endif /* __linux__ */
11244
11245 static void daemonize(void) {
11246 int fd;
11247 FILE *fp;
11248
11249 if (fork() != 0) exit(0); /* parent exits */
11250 setsid(); /* create a new session */
11251
11252 /* Every output goes to /dev/null. If Redis is daemonized but
11253 * the 'logfile' is set to 'stdout' in the configuration file
11254 * it will not log at all. */
11255 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11256 dup2(fd, STDIN_FILENO);
11257 dup2(fd, STDOUT_FILENO);
11258 dup2(fd, STDERR_FILENO);
11259 if (fd > STDERR_FILENO) close(fd);
11260 }
11261 /* Try to write the pid file */
11262 fp = fopen(server.pidfile,"w");
11263 if (fp) {
11264 fprintf(fp,"%d\n",getpid());
11265 fclose(fp);
11266 }
11267 }
11268
11269 static void version() {
11270 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11271 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11272 exit(0);
11273 }
11274
/* Print the command line help on standard error and terminate with
 * exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11280
11281 int main(int argc, char **argv) {
11282 time_t start;
11283
11284 initServerConfig();
11285 sortCommandTable();
11286 if (argc == 2) {
11287 if (strcmp(argv[1], "-v") == 0 ||
11288 strcmp(argv[1], "--version") == 0) version();
11289 if (strcmp(argv[1], "--help") == 0) usage();
11290 resetServerSaveParams();
11291 loadServerConfig(argv[1]);
11292 } else if ((argc > 2)) {
11293 usage();
11294 } else {
11295 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11296 }
11297 if (server.daemonize) daemonize();
11298 initServer();
11299 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11300 #ifdef __linux__
11301 linuxOvercommitMemoryWarning();
11302 #endif
11303 start = time(NULL);
11304 if (server.appendonly) {
11305 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11306 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11307 } else {
11308 if (rdbLoad(server.dbfilename) == REDIS_OK)
11309 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11310 }
11311 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11312 aeSetBeforeSleepProc(server.el,beforeSleep);
11313 aeMain(server.el);
11314 aeDeleteEventLoop(server.el);
11315 return 0;
11316 }
11317
11318 /* ============================= Backtrace support ========================= */
11319
11320 #ifdef HAVE_BACKTRACE
11321 static char *findFuncName(void *pointer, unsigned long *offset);
11322
11323 static void *getMcontextEip(ucontext_t *uc) {
11324 #if defined(__FreeBSD__)
11325 return (void*) uc->uc_mcontext.mc_eip;
11326 #elif defined(__dietlibc__)
11327 return (void*) uc->uc_mcontext.eip;
11328 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11329 #if __x86_64__
11330 return (void*) uc->uc_mcontext->__ss.__rip;
11331 #else
11332 return (void*) uc->uc_mcontext->__ss.__eip;
11333 #endif
11334 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11335 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11336 return (void*) uc->uc_mcontext->__ss.__rip;
11337 #else
11338 return (void*) uc->uc_mcontext->__ss.__eip;
11339 #endif
11340 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11341 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11342 #elif defined(__ia64__) /* Linux IA64 */
11343 return (void*) uc->uc_mcontext.sc_ip;
11344 #else
11345 return NULL;
11346 #endif
11347 }
11348
11349 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11350 void *trace[100];
11351 char **messages = NULL;
11352 int i, trace_size = 0;
11353 unsigned long offset=0;
11354 ucontext_t *uc = (ucontext_t*) secret;
11355 sds infostring;
11356 REDIS_NOTUSED(info);
11357
11358 redisLog(REDIS_WARNING,
11359 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11360 infostring = genRedisInfoString();
11361 redisLog(REDIS_WARNING, "%s",infostring);
11362 /* It's not safe to sdsfree() the returned string under memory
11363 * corruption conditions. Let it leak as we are going to abort */
11364
11365 trace_size = backtrace(trace, 100);
11366 /* overwrite sigaction with caller's address */
11367 if (getMcontextEip(uc) != NULL) {
11368 trace[1] = getMcontextEip(uc);
11369 }
11370 messages = backtrace_symbols(trace, trace_size);
11371
11372 for (i=1; i<trace_size; ++i) {
11373 char *fn = findFuncName(trace[i], &offset), *p;
11374
11375 p = strchr(messages[i],'+');
11376 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11377 redisLog(REDIS_WARNING,"%s", messages[i]);
11378 } else {
11379 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11380 }
11381 }
11382 /* free(messages); Don't call free() with possibly corrupted memory. */
11383 _exit(0);
11384 }
11385
11386 static void sigtermHandler(int sig) {
11387 REDIS_NOTUSED(sig);
11388
11389 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11390 server.shutdown_asap = 1;
11391 }
11392
11393 static void setupSigSegvAction(void) {
11394 struct sigaction act;
11395
11396 sigemptyset (&act.sa_mask);
11397 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11398 * is used. Otherwise, sa_handler is used */
11399 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11400 act.sa_sigaction = segvHandler;
11401 sigaction (SIGSEGV, &act, NULL);
11402 sigaction (SIGBUS, &act, NULL);
11403 sigaction (SIGFPE, &act, NULL);
11404 sigaction (SIGILL, &act, NULL);
11405 sigaction (SIGBUS, &act, NULL);
11406
11407 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11408 act.sa_handler = sigtermHandler;
11409 sigaction (SIGTERM, &act, NULL);
11410 return;
11411 }
11412
11413 #include "staticsymbols.h"
11414 /* This function try to convert a pointer into a function name. It's used in
11415 * oreder to provide a backtrace under segmentation fault that's able to
11416 * display functions declared as static (otherwise the backtrace is useless). */
11417 static char *findFuncName(void *pointer, unsigned long *offset){
11418 int i, ret = -1;
11419 unsigned long off, minoff = 0;
11420
11421 /* Try to match against the Symbol with the smallest offset */
11422 for (i=0; symsTable[i].pointer; i++) {
11423 unsigned long lp = (unsigned long) pointer;
11424
11425 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11426 off=lp-symsTable[i].pointer;
11427 if (ret < 0 || off < minoff) {
11428 minoff=off;
11429 ret=i;
11430 }
11431 }
11432 }
11433 if (ret == -1) return NULL;
11434 *offset = minoff;
11435 return symsTable[ret].name;
11436 }
11437 #else /* HAVE_BACKTRACE */
/* Build without backtrace support: nothing to install, the function only
 * exists so that callers need no conditional compilation of their own. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
11440 #endif /* HAVE_BACKTRACE */
11441
11442
11443
11444 /* The End */
11445
11446
11447