]> git.saurik.com Git - redis.git/blob - redis.c
Merge branch 'smallkeys' of github.com:antirez/redis into smallkeys
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "2.1.1"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR -1

/* Static server configuration */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout (presumably seconds -- confirm) */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 0 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256
103
/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */

/* Command flags. These are distinct bits and are OR-ed together in the
 * 'flags' field of struct redisCommand. */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
/* Object types. The 'type' bitfield of redisObject / vmPointer is 4 bits
 * wide, so every value here must stay below 16. */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4
#define REDIS_VMPOINTER 8

/* Object encodings. Some kinds of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values. */
#define REDIS_ENCODING_RAW 0 /* Raw representation */
#define REDIS_ENCODING_INT 1 /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */

/* Printable encoding names, listed in the same order as the
 * REDIS_ENCODING_* values above (index 0 = "raw" ... index 3 = "hashtable").
 * Keep the two lists in sync when adding an encoding. */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
136
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the two most significant bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
/* NOTE(review): presumably the value returned when a length cannot be read;
 * the code using it is not visible in this part of the file -- confirm. */
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
168
/* Virtual memory object->where field.
 * NOTE(review): the structs in this file name the 2-bit field 'storage',
 * not 'where' -- the wording above is kept from the original comment. */
#define REDIS_VM_MEMORY 0 /* The object is on memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
186
/* Client flags. Distinct bits, OR-ed together in redisClient.flags. */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master.
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it.
 * The values continue the numbering of the slave-side states above, so the
 * two sets never collide. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to end, then starts bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */

/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1
213
/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels, in increasing order of severity */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V and discards it as void so that an
 * otherwise unused parameter/variable does not trigger a compiler warning.
 * NOTE(review): V is not parenthesized, so pass a plain identifier only. */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
240
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the
 *   stringified expression and the source location, then terminates the
 *   process with _exit(1). When _e is true it evaluates to (void)0.
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 *   argument and the source location, then terminates with _exit(1).
 *   The argument is stringified with '#', so a string literal argument is
 *   reported including its quotes.
 *
 * Both macros expand to a single fully parenthesized expression of type
 * void. The outer parentheses on redisPanic matter: without them
 * "cond ? (void)0 : redisPanic(x)" would parse as
 * "(cond ? (void)0 : _redisPanic(...)), _exit(1)" and always exit. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
246
247 /*================================= Data types ============================== */
248
/* A redis object, that is a type able to hold a string / list / set */

/* The actual Redis Object.
 * The four bitfields below add up to exactly 32 bits (4 + 2 + 4 + 22).
 * The leading 'type' and 'storage' fields must keep the same position and
 * width as in struct vmPointer, which is accessed through robj pointers. */
typedef struct redisObject {
    unsigned type:4;        /* one of the object type defines (REDIS_STRING, ...) */
    unsigned storage:2;     /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
    unsigned encoding:4;    /* one of the REDIS_ENCODING_* defines */
    unsigned lru:22;        /* lru time (relative to server.lruclock) */
    int refcount;
    void *ptr;
    /* NOTE(review): the original comment here described trailing "VM fields"
     * that are only allocated when VM is active (sizeof(redisObject) minus
     * sizeof(redisObjectVM)). No such fields or type appear in this
     * definition, so that description looks stale -- confirm against the
     * object allocation code. */
} robj;
264
/* The VM pointer structure - identifies an object in the swap file.
 *
 * This object is stored in place of the value
 * object in the main key->value hash table representing a database.
 * Note that the first fields (type, storage) are the same as the redisObject
 * structure so that vmPointer structures can be accessed even when cast
 * to redisObject structures.
 *
 * This is useful as we don't know if a value object is or not on disk, but we
 * are always free to access obj->storage to check this. For vmPointer
 * structures "type" is set to REDIS_VMPOINTER (even if without this field
 * it is still possible to check the kind of object from the value of
 * 'storage'). */
typedef struct vmPointer {
    unsigned type:4;
    unsigned storage:2;     /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
    unsigned notused:26;    /* pads the bitfields to 32 bits (4 + 2 + 26) */
    unsigned int vtype;     /* type of the object stored in the swap file */
    off_t page;             /* the page at which the object is stored on disk */
    off_t usedpages;        /* number of pages used on disk */
} vmpointer;
285
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var: an lvalue of type robj (not a pointer to one).
 * _ptr: the value stored in _var.ptr.
 *
 * Sets refcount to 1, type to REDIS_STRING, encoding to REDIS_ENCODING_RAW
 * and storage to REDIS_VM_MEMORY. The 'lru' field is left untouched.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller's
 * own ';' terminates the do/while, so the macro behaves as one statement
 * and is safe in "if (x) initStaticStringObject(o,p); else ...".
 * Both arguments are parenthesized so expressions such as *objp or
 * buf+off can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
297
/* One logical database: the keyspace plus the per-database side tables. */
typedef struct redisDb {
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blocking_keys;        /* Keys with clients waiting for data (BLPOP) */
    dict *io_keys;              /* Keys with clients waiting for VM I/O */
    dict *watched_keys;         /* WATCHED keys for MULTI/EXEC CAS */
    int id;                     /* NOTE(review): presumably the DB index -- confirm */
} redisDb;
306
/* Client MULTI/EXEC state: one queued command (argument vector, argument
 * count and the command table entry it resolves to). */
typedef struct multiCmd {
    robj **argv;
    int argc;
    struct redisCommand *cmd;
} multiCmd;
313
/* The queue of commands accumulated by a client inside MULTI. */
typedef struct multiState {
    multiCmd *commands;     /* Array of MULTI commands */
    int count;              /* Total number of MULTI commands */
} multiState;
318
/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
typedef struct redisClient {
    int fd;
    redisDb *db;
    int dictid;
    sds querybuf;
    robj **argv, **mbargv;
    int argc, mbargc;
    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
    int multibulk;          /* multi bulk command format active */
    list *reply;
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
    int slaveseldb;         /* slave selected db, if this client is a slave */
    int authenticated;      /* when requirepass is non-NULL */
    int replstate;          /* replication state if this is a slave */
    int repldbfd;           /* replication DB file descriptor */
    long repldboff;         /* replication DB file offset */
    off_t repldbsize;       /* replication DB file size */
    multiState mstate;      /* MULTI/EXEC state */
    robj **blocking_keys;   /* The keys we are waiting on to terminate a blocking
                             * operation such as BLPOP. Otherwise NULL. */
    int blocking_keys_num;  /* Number of blocking keys */
    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
                             * is >= blockingto then the operation timed out. */
    list *io_keys;          /* Keys this client is waiting to be loaded from the
                             * swap file in order to continue. */
    list *watched_keys;     /* Keys WATCHED for MULTI/EXEC CAS */
    dict *pubsub_channels;  /* channels a client is interested in (SUBSCRIBE) */
    list *pubsub_patterns;  /* patterns a client is interested in (SUBSCRIBE) */
} redisClient;
352
/* One (seconds, changes) pair; redisServer keeps an array of these in
 * 'saveparams' with 'saveparamslen' entries.
 * NOTE(review): presumably "save after <seconds> if at least <changes>
 * changes occurred" -- the code consuming it is not visible here. */
struct saveparam {
    time_t seconds;
    int changes;
};
357
/* Global server state structure */
struct redisServer {
    int port;
    int fd;
    redisDb *db;
    long long dirty;            /* changes to DB from the last save */
    list *clients;
    list *slaves, *monitors;
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops;              /* number of times the cron function run */
    list *objfreelist;          /* A list of freed objects to avoid malloc() */
    time_t lastsave;            /* Unix time of last successful save */
    /* Fields used only for stats */
    time_t stat_starttime;          /* server start time */
    long long stat_numcommands;     /* number of processed commands */
    long long stat_numconnections;  /* number of connections received */
    long long stat_expiredkeys;     /* number of expired keys */
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
    int appendonly;
    int appendfsync;
    int no_appendfsync_on_rewrite;
    int shutdown_asap;
    time_t lastfsync;
    int appendfd;
    int appendseldb;
    char *pidfile;
    pid_t bgsavechildpid;
    pid_t bgrewritechildpid;
    sds bgrewritebuf;   /* buffer kept by parent during append only rewrite */
    sds aofbuf;         /* AOF buffer, written before entering the event loop */
    struct saveparam *saveparams;
    int saveparamslen;
    char *logfile;
    char *bindaddr;
    char *dbfilename;
    char *appendfilename;
    char *requirepass;
    int rdbcompression;
    int activerehashing;
    /* Replication related */
    int isslave;
    char *masterauth;
    char *masterhost;
    int masterport;
    redisClient *master;    /* client that is master for this slave */
    int replstate;
    unsigned int maxclients;
    unsigned long long maxmemory;
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to keep this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
    /* Virtual memory configuration */
    int vm_enabled;
    char *vm_swap_file;
    off_t vm_page_size;
    off_t vm_pages;
    unsigned long long vm_max_memory;
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page;         /* Next probably empty page */
    off_t vm_near_pages;        /* Number of pages allocated sequentially */
    unsigned char *vm_bitmap;   /* Bitmap of free/used pages */
    time_t unixtime;            /* Unix time sampled every second. */
    /* Virtual memory I/O threads stuff */
    /* An I/O thread processes an element taken from the io_newjobs queue and
     * puts the result of the operation in the io_processed list. While the
     * job is being processed, it's put on the io_processing queue. */
    list *io_newjobs;       /* List of VM I/O jobs yet to be processed */
    list *io_processing;    /* List of VM I/O jobs being processed */
    list *io_processed;     /* List of VM I/O jobs already processed */
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
    /* NOTE(review): the original comment on io_mutex named
     * io_jobs/io_done/io_thread_job; the lists declared above are
     * io_newjobs/io_processing/io_processed -- presumably those are meant. */
    pthread_mutex_t io_mutex;           /* lock to access io_jobs/io_done/io_thread_job */
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex;  /* So we can lseek + write */
    pthread_attr_t io_threads_attr;     /* attributes for threads creation */
    int io_active_threads;  /* Number of running I/O threads */
    int vm_max_threads;     /* Max number of I/O threads running at the same time */
    /* Our main thread is blocked on the event loop, looking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * wake up the main thread. The following are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
    /* Pubsub */
    dict *pubsub_channels;  /* Map channels to list of subscribed clients */
    list *pubsub_patterns;  /* A list of pubsub_patterns */
    /* Misc */
    FILE *devnull;
    unsigned lruclock:22;           /* clock incrementing every minute, for LRU;
                                     * same width as redisObject.lru */
    unsigned lruclock_padding:10;   /* pads lruclock to 32 bits (22 + 10) */
};
468
/* A (client, pattern) pair for pattern subscriptions. */
typedef struct pubsubPattern {
    redisClient *client;
    robj *pattern;
} pubsubPattern;
473
/* Signature shared by every command implementation. */
typedef void redisCommandProc(redisClient *c);
/* Signature of the optional per-command VM preload hook. */
typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* One entry of the command table. Table rows are positional initializers,
 * so the field order here must not change. */
struct redisCommand {
    char *name;
    redisCommandProc *proc;
    int arity;
    int flags;      /* REDIS_CMD_* flags OR-ed together */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisVmPreloadProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey;    /* The first argument that's a key (0 = no keys) */
    int vm_lastkey;     /* The last argument that's a key */
    int vm_keystep;     /* The step between first and last key */
};
490
/* A (name, address) pair describing a function symbol.
 * NOTE(review): presumably used to resolve addresses to names when printing
 * a stack trace -- the users are not visible in this part of the file. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
495
/* One element being sorted, together with its sort key: either a numeric
 * score or an object to compare, sharing storage in the union 'u'. */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;
        robj *cmpobj;
    } u;
} redisSortObject;
503
/* A sort operation: its kind and the associated pattern object.
 * NOTE(review): 'type' presumably holds one of the REDIS_SORT_* values --
 * confirm against the SORT implementation. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;
508
/* ZSETs use a specialized version of Skiplists */

/* A skiplist node. 'forward' and 'span' are pointers to separately sized
 * arrays. NOTE(review): presumably one entry per level of the node --
 * confirm in the zsl* implementation. */
typedef struct zskiplistNode {
    struct zskiplistNode **forward;
    struct zskiplistNode *backward;
    unsigned int *span;
    double score;
    robj *obj;
} zskiplistNode;
518
/* The skiplist itself: end nodes, element count and current level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
524
/* A sorted set: a hash table paired with a skiplist. */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;
529
/* Our shared "common" objects */

/* Number of slots in the shared.integers[] array below. */
#define REDIS_SHARED_INTEGERS 10000
/* Pointers to preallocated objects, grouped in one global named 'shared'.
 * Note that 'shared' is declared without 'static', unlike the other
 * file-scope variables visible here. */
struct sharedObjectsStruct {
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
    *colon, *nullbulk, *nullmultibulk, *queued,
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
    *select0, *select1, *select2, *select3, *select4,
    *select5, *select6, *select7, *select8, *select9,
    *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
    *mbulk4, *psubscribebulk, *punsubscribebulk,
    *integers[REDIS_SHARED_INTEGERS];
} shared;
544
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
550
/* VM threaded I/O request message */
#define REDIS_IOJOB_LOAD 0          /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1  /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2       /* Swap from memory to disk */
typedef struct iojob {
    int type;   /* Request type, REDIS_IOJOB_* */
    redisDb *db;/* Redis database */
    robj *key;  /* This I/O request is about swapping this key */
    robj *id;   /* Unique identifier of this job:
                   this is the object to swap for REDIS_IOJOB_*_SWAP, or the
                   vmpointer object for REDIS_IOJOB_LOAD. */
    robj *val;  /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
                 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
    off_t page; /* Swap page where to read/write the object */
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
569
570 /*================================ Prototypes =============================== */
571
572 static void freeStringObject(robj *o);
573 static void freeListObject(robj *o);
574 static void freeSetObject(robj *o);
575 static void decrRefCount(void *o);
576 static robj *createObject(int type, void *ptr);
577 static void freeClient(redisClient *c);
578 static int rdbLoad(char *filename);
579 static void addReply(redisClient *c, robj *obj);
580 static void addReplySds(redisClient *c, sds s);
581 static void incrRefCount(robj *o);
582 static int rdbSaveBackground(char *filename);
583 static robj *createStringObject(char *ptr, size_t len);
584 static robj *dupStringObject(robj *o);
585 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
586 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
587 static void flushAppendOnlyFile(void);
588 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
589 static int syncWithMaster(void);
590 static robj *tryObjectEncoding(robj *o);
591 static robj *getDecodedObject(robj *o);
592 static int removeExpire(redisDb *db, robj *key);
593 static int expireIfNeeded(redisDb *db, robj *key);
594 static int deleteIfVolatile(redisDb *db, robj *key);
595 static int deleteIfSwapped(redisDb *db, robj *key);
596 static int deleteKey(redisDb *db, robj *key);
597 static time_t getExpire(redisDb *db, robj *key);
598 static int setExpire(redisDb *db, robj *key, time_t when);
599 static void updateSlavesWaitingBgsave(int bgsaveerr);
600 static void freeMemoryIfNeeded(void);
601 static int processCommand(redisClient *c);
602 static void setupSigSegvAction(void);
603 static void rdbRemoveTempFile(pid_t childpid);
604 static void aofRemoveTempFile(pid_t childpid);
605 static size_t stringObjectLen(robj *o);
606 static void processInputBuffer(redisClient *c);
607 static zskiplist *zslCreate(void);
608 static void zslFree(zskiplist *zsl);
609 static void zslInsert(zskiplist *zsl, double score, robj *obj);
610 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
611 static void initClientMultiState(redisClient *c);
612 static void freeClientMultiState(redisClient *c);
613 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
614 static void unblockClientWaitingData(redisClient *c);
615 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
616 static void vmInit(void);
617 static void vmMarkPagesFree(off_t page, off_t count);
618 static robj *vmLoadObject(robj *o);
619 static robj *vmPreviewObject(robj *o);
620 static int vmSwapOneObjectBlocking(void);
621 static int vmSwapOneObjectThreaded(void);
622 static int vmCanSwapOut(void);
623 static int tryFreeOneObjectFromFreelist(void);
624 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
625 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
626 static void vmCancelThreadedIOJob(robj *o);
627 static void lockThreadedIO(void);
628 static void unlockThreadedIO(void);
629 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
630 static void freeIOJob(iojob *j);
631 static void queueIOJob(iojob *j);
632 static int vmWriteObjectOnSwap(robj *o, off_t page);
633 static robj *vmReadObjectFromSwap(off_t page, int type);
634 static void waitEmptyIOJobsQueue(void);
635 static void vmReopenSwapFile(void);
636 static int vmFreePage(off_t page);
637 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
638 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
639 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
640 static int dontWaitForSwappedKey(redisClient *c, robj *key);
641 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
642 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
643 static struct redisCommand *lookupCommand(char *name);
644 static void call(redisClient *c, struct redisCommand *cmd);
645 static void resetClient(redisClient *c);
646 static void convertToRealHash(robj *o);
647 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
648 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
649 static void freePubsubPattern(void *p);
650 static int listMatchPubsubPattern(void *a, void *b);
651 static int compareStringObjects(robj *a, robj *b);
652 static int equalStringObjects(robj *a, robj *b);
653 static void usage();
654 static int rewriteAppendOnlyFileBackground(void);
655 static vmpointer *vmSwapObjectBlocking(robj *val);
656 static int prepareForShutdown();
657 static void touchWatchedKey(redisDb *db, robj *key);
658 static void touchWatchedKeysOnFlush(int dbid);
659 static void unwatchAllKeys(redisClient *c);
660
/* Forward declarations of the command implementations. Each one has the
 * redisCommandProc signature (void f(redisClient *c)) so that it can be
 * stored in the 'proc' field of a struct redisCommand table entry.
 * Note: rpoplpushcommand is spelled with a lowercase "command" suffix,
 * unlike its siblings; the name has to match its definition elsewhere in
 * the file. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void setexCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void zrevrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hsetnxCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void hmsetCommand(redisClient *c);
static void hmgetCommand(redisClient *c);
static void hdelCommand(redisClient *c);
static void hlenCommand(redisClient *c);
static void zremrangebyrankCommand(redisClient *c);
static void zunionstoreCommand(redisClient *c);
static void zinterstoreCommand(redisClient *c);
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
static void hexistsCommand(redisClient *c);
static void configCommand(redisClient *c);
static void hincrbyCommand(redisClient *c);
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
static void psubscribeCommand(redisClient *c);
static void punsubscribeCommand(redisClient *c);
static void publishCommand(redisClient *c);
static void watchCommand(redisClient *c);
static void unwatchCommand(redisClient *c);
768
769 /*================================= Globals ================================= */
770
771 /* Global vars */
/* File-wide server state object. */
772 static struct redisServer server; /* server global state */
/* Pointer to a command table; it is not assigned anywhere in this part of
 * the file. */
773 static struct redisCommand *commandTable;
/* Static table with one row per supported command.
 *
 * The layout of struct redisCommand is declared elsewhere, so the column
 * meanings below are read off the rows themselves and should be confirmed
 * against that declaration:
 *   1. command name as a lowercase string;
 *   2. the handler function implementing it;
 *   3. an integer that looks like the arity (negative values presumably
 *      mean "at least N arguments" - TODO confirm);
 *   4. REDIS_CMD_* flag bits (INLINE or BULK, optionally DENYOOM or
 *      FORCE_REPLICATION);
 *   5. an optional hook: NULL for every command except zunionstore,
 *      zinterstore and exec;
 *   6-8. three integers, presumably first key position, last key position
 *      and key step - TODO confirm.
 *
 * NOTE(review): "smembers" is served by sinterCommand.
 * NOTE(review): the rpoplpush handler is spelled rpoplpushcommand (lowercase
 * 'c'), unlike every other handler name; it matches its prototype.
 * NOTE(review): "zscore" carries REDIS_CMD_DENYOOM although its name suggests
 * a read-only lookup - confirm this is intended. */
774 static struct redisCommand readonlyCommandTable[] = {
775 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
777 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
778 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
779 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
784 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
786 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
798 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
799 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
801 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
802 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
803 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
807 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
808 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
809 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
810 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
811 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
812 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
814 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
815 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
819 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
820 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
827 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
828 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
829 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
830 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
831 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
832 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
833 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
834 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
835 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
838 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
839 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
842 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
843 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
844 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
845 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
857 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
863 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
865 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
869 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
870 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
873 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
875 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
876 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
877 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
881 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
882 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
883 };
884
885 /*============================ Utility functions ============================ */
886
/* Glob-style pattern matching on length-delimited buffers.
 *
 * pattern/patternLen: the glob and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             when non-zero, literal characters and [a-z] ranges
 *                     are compared after tolower().
 *
 * Supported syntax:
 *   *        any run of characters, including an empty one
 *   ?        exactly one character
 *   [abc]    one character out of a set; [^abc] negates the set, a-z
 *            denotes a range (endpoints may be given in either order),
 *            and \x inside the brackets is a literal x (always compared
 *            case-sensitively)
 *   \x       literal x
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: every read is checked against
 * patternLen / stringLen first, so bytes past the given lengths are never
 * inspected. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, looking ahead only inside the
             * pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one subject byte. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * pattern++ below lands exactly on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing lone backslash is matched as a literal. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing stars may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1009
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are taken with strlen(). Returns whatever stringmatchlen()
 * returns for them. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
1013
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The input is an optional '-', a run of decimal digits and an optional
 * unit, matched case-insensitively:
 *
 *   (none), "b"  -> 1
 *   "k"          -> 1000            "kb" -> 1024
 *   "m"          -> 1000*1000       "mb" -> 1024*1024
 *   "g"          -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If err is not NULL, *err is set to 0 on success and to 1 when:
 *   - the unit is not recognized (the digits are still converted and
 *     returned with a multiplier of 1);
 *   - there are no digits at all (0 is returned);
 *   - the digits do not fit the internal buffer (LLONG_MAX is returned);
 *   - the number, or the number times the unit, does not fit in a long long
 *     (the result saturates to LLONG_MAX or LLONG_MIN instead of
 *     overflowing). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    const char *first;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    first = u;
    /* <ctype.h> functions need an unsigned char value. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* A bare unit or a bare sign is not a number. */
    if (u == first && err) *err = 1;
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value for us. */
        if (err) *err = 1;
        return val;
    }
    /* Refuse to overflow in the multiplication: signed overflow is
     * undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1060
/* Convert a long long into its decimal string representation.
 *
 * s:     destination buffer.
 * len:   size of s in bytes, terminator included.
 * value: number to convert.
 *
 * Returns the number of characters stored in s, not counting the nul
 * terminator. When the buffer is too small the text is truncated to its
 * first len-1 characters and that shorter count is returned. With len == 0
 * nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the magnitude in the unsigned domain: negating LLONG_MIN as a
     * signed value overflows. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit digits right to left. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1084
1085 static void redisLog(int level, const char *fmt, ...) {
1086 va_list ap;
1087 FILE *fp;
1088
1089 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1090 if (!fp) return;
1091
1092 va_start(ap, fmt);
1093 if (level >= server.verbosity) {
1094 char *c = ".-*#";
1095 char buf[64];
1096 time_t now;
1097
1098 now = time(NULL);
1099 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1100 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1101 vfprintf(fp, fmt, ap);
1102 fprintf(fp,"\n");
1103 fflush(fp);
1104 }
1105 va_end(ap);
1106
1107 if (server.logfile) fclose(fp);
1108 }
1109
1110 /*====================== Hash table type implementation ==================== */
1111
1112 /* This is a hash table type that uses the SDS dynamic strings library as
1113 * keys and redis objects as values (objects can hold SDS strings,
1114 * lists, sets). */
1115
/* Dict destructor callback that simply hands the stored pointer to zfree().
 * The privdata argument is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1121
1122 static void dictListDestructor(void *privdata, void *val)
1123 {
1124 DICT_NOTUSED(privdata);
1125 listRelease((list*)val);
1126 }
1127
1128 static int sdsDictKeyCompare(void *privdata, const void *key1,
1129 const void *key2)
1130 {
1131 int l1,l2;
1132 DICT_NOTUSED(privdata);
1133
1134 l1 = sdslen((sds)key1);
1135 l2 = sdslen((sds)key2);
1136 if (l1 != l2) return 0;
1137 return memcmp(key1, key2, l1) == 0;
1138 }
1139
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). A NULL pointer is tolerated and skipped, because the
 * values of swapped out keys are set to NULL. The privdata argument is
 * ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1147
1148 static int dictObjKeyCompare(void *privdata, const void *key1,
1149 const void *key2)
1150 {
1151 const robj *o1 = key1, *o2 = key2;
1152 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1153 }
1154
1155 static unsigned int dictObjHash(const void *key) {
1156 const robj *o = key;
1157 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1158 }
1159
1160 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1161 const void *key2)
1162 {
1163 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1164 int cmp;
1165
1166 if (o1->encoding == REDIS_ENCODING_INT &&
1167 o2->encoding == REDIS_ENCODING_INT)
1168 return o1->ptr == o2->ptr;
1169
1170 o1 = getDecodedObject(o1);
1171 o2 = getDecodedObject(o2);
1172 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1173 decrRefCount(o1);
1174 decrRefCount(o2);
1175 return cmp;
1176 }
1177
1178 static unsigned int dictEncObjHash(const void *key) {
1179 robj *o = (robj*) key;
1180
1181 if (o->encoding == REDIS_ENCODING_RAW) {
1182 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1183 } else {
1184 if (o->encoding == REDIS_ENCODING_INT) {
1185 char buf[32];
1186 int len;
1187
1188 len = ll2string(buf,32,(long)o->ptr);
1189 return dictGenHashFunction((unsigned char*)buf, len);
1190 } else {
1191 unsigned int hash;
1192
1193 o = getDecodedObject(o);
1194 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1195 decrRefCount(o);
1196 return hash;
1197 }
1198 }
1199 }
1200
/* The tables below bind the callbacks defined above into dictType
 * descriptors. Each initializer is positional; the per-line comments name
 * the slot being filled (hash function, key dup, val dup, key compare, key
 * destructor, val destructor). A NULL slot means no callback is supplied
 * for that role. */
1201 /* Sets type and expires: keys are redis objects hashed and compared with the
 * encoding-aware callbacks; no value destructor is installed. */
1202 static dictType setDictType = {
1203 dictEncObjHash, /* hash function */
1204 NULL, /* key dup */
1205 NULL, /* val dup */
1206 dictEncObjKeyCompare, /* key compare */
1207 dictRedisObjectDestructor, /* key destructor */
1208 NULL /* val destructor */
1209 };
1210
1211 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Values are released with dictVanillaFree(), i.e. zfree(). */
1212 static dictType zsetDictType = {
1213 dictEncObjHash, /* hash function */
1214 NULL, /* key dup */
1215 NULL, /* val dup */
1216 dictEncObjKeyCompare, /* key compare */
1217 dictRedisObjectDestructor, /* key destructor */
1218 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1219 };
1220
1221 /* Db->dict: keys and values are both redis objects, each released through
 * dictRedisObjectDestructor(). */
1222 static dictType dbDictType = {
1223 dictObjHash, /* hash function */
1224 NULL, /* key dup */
1225 NULL, /* val dup */
1226 dictObjKeyCompare, /* key compare */
1227 dictRedisObjectDestructor, /* key destructor */
1228 dictRedisObjectDestructor /* val destructor */
1229 };
1230
1231 /* Db->expires: keys are redis objects; no value destructor is installed. */
1232 static dictType keyptrDictType = {
1233 dictObjHash, /* hash function */
1234 NULL, /* key dup */
1235 NULL, /* val dup */
1236 dictObjKeyCompare, /* key compare */
1237 dictRedisObjectDestructor, /* key destructor */
1238 NULL /* val destructor */
1239 };
1240
1241 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1242 static dictType hashDictType = {
1243 dictEncObjHash, /* hash function */
1244 NULL, /* key dup */
1245 NULL, /* val dup */
1246 dictEncObjKeyCompare, /* key compare */
1247 dictRedisObjectDestructor, /* key destructor */
1248 dictRedisObjectDestructor /* val destructor */
1249 };
1250
1251 /* Keylist hash table type has unencoded redis objects as keys and
1252 * lists as values. It's used for blocking operations (BLPOP) and to
1253 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1254 static dictType keylistDictType = {
1255 dictObjHash, /* hash function */
1256 NULL, /* key dup */
1257 NULL, /* val dup */
1258 dictObjKeyCompare, /* key compare */
1259 dictRedisObjectDestructor, /* key destructor */
1260 dictListDestructor /* val destructor */
1261 };
1262
/* Forward declaration; version() itself is defined elsewhere in this file. */
1263 static void version();
1264
1265 /* ========================= Random utility functions ======================= */
1266
1267 /* Redis generally does not try to recover from out of memory conditions
1268 * when allocating objects or strings, it is not clear if it will be possible
1269 * to report this condition to the client since the networking layer itself
1270 * is based on heap allocation for send buffers, so we simply abort.
1271 * At least the code will be simpler to read... */
1272 static void oom(const char *msg) {
1273 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1274 sleep(1);
1275 abort();
1276 }
1277
1278 /* ====================== Redis server networking stuff ===================== */
1279 static void closeTimedoutClients(void) {
1280 redisClient *c;
1281 listNode *ln;
1282 time_t now = time(NULL);
1283 listIter li;
1284
1285 listRewind(server.clients,&li);
1286 while ((ln = listNext(&li)) != NULL) {
1287 c = listNodeValue(ln);
1288 if (server.maxidletime &&
1289 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1290 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1291 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1292 listLength(c->pubsub_patterns) == 0 &&
1293 (now - c->lastinteraction > server.maxidletime))
1294 {
1295 redisLog(REDIS_VERBOSE,"Closing idle client");
1296 freeClient(c);
1297 } else if (c->flags & REDIS_BLOCKED) {
1298 if (c->blockingto != 0 && c->blockingto < now) {
1299 addReply(c,shared.nullmultibulk);
1300 unblockClientWaitingData(c);
1301 }
1302 }
1303 }
1304 }
1305
1306 static int htNeedsResize(dict *dict) {
1307 long long size, used;
1308
1309 size = dictSlots(dict);
1310 used = dictSize(dict);
1311 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1312 (used*100/size < REDIS_HT_MINFILL));
1313 }
1314
1315 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1316 * we resize the hash table to save memory */
1317 static void tryResizeHashTables(void) {
1318 int j;
1319
1320 for (j = 0; j < server.dbnum; j++) {
1321 if (htNeedsResize(server.db[j].dict))
1322 dictResize(server.db[j].dict);
1323 if (htNeedsResize(server.db[j].expires))
1324 dictResize(server.db[j].expires);
1325 }
1326 }
1327
1328 /* Our hash table implementation performs rehashing incrementally while
1329 * we write/read from the hash table. Still if the server is idle, the hash
1330 * table will use two tables for a long time. So we try to use 1 millisecond
1331 * of CPU time at every serverCron() loop in order to rehash some key. */
1332 static void incrementallyRehash(void) {
1333 int j;
1334
1335 for (j = 0; j < server.dbnum; j++) {
1336 if (dictIsRehashing(server.db[j].dict)) {
1337 dictRehashMilliseconds(server.db[j].dict,1);
1338 break; /* already used our millisecond for this loop... */
1339 }
1340 }
1341 }
1342
1343 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1344 void backgroundSaveDoneHandler(int statloc) {
1345 int exitcode = WEXITSTATUS(statloc);
1346 int bysignal = WIFSIGNALED(statloc);
1347
1348 if (!bysignal && exitcode == 0) {
1349 redisLog(REDIS_NOTICE,
1350 "Background saving terminated with success");
1351 server.dirty = 0;
1352 server.lastsave = time(NULL);
1353 } else if (!bysignal && exitcode != 0) {
1354 redisLog(REDIS_WARNING, "Background saving error");
1355 } else {
1356 redisLog(REDIS_WARNING,
1357 "Background saving terminated by signal %d", WTERMSIG(statloc));
1358 rdbRemoveTempFile(server.bgsavechildpid);
1359 }
1360 server.bgsavechildpid = -1;
1361 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1362 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1363 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1364 }
1365
1366 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1367 * Handle this. */
1368 void backgroundRewriteDoneHandler(int statloc) {
1369 int exitcode = WEXITSTATUS(statloc);
1370 int bysignal = WIFSIGNALED(statloc);
1371
1372 if (!bysignal && exitcode == 0) {
1373 int fd;
1374 char tmpfile[256];
1375
1376 redisLog(REDIS_NOTICE,
1377 "Background append only file rewriting terminated with success");
1378 /* Now it's time to flush the differences accumulated by the parent */
1379 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1380 fd = open(tmpfile,O_WRONLY|O_APPEND);
1381 if (fd == -1) {
1382 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1383 goto cleanup;
1384 }
1385 /* Flush our data... */
1386 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1387 (signed) sdslen(server.bgrewritebuf)) {
1388 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1389 close(fd);
1390 goto cleanup;
1391 }
1392 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1393 /* Now our work is to rename the temp file into the stable file. And
1394 * switch the file descriptor used by the server for append only. */
1395 if (rename(tmpfile,server.appendfilename) == -1) {
1396 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1397 close(fd);
1398 goto cleanup;
1399 }
1400 /* Mission completed... almost */
1401 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1402 if (server.appendfd != -1) {
1403 /* If append only is actually enabled... */
1404 close(server.appendfd);
1405 server.appendfd = fd;
1406 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
1407 server.appendseldb = -1; /* Make sure it will issue SELECT */
1408 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1409 } else {
1410 /* If append only is disabled we just generate a dump in this
1411 * format. Why not? */
1412 close(fd);
1413 }
1414 } else if (!bysignal && exitcode != 0) {
1415 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1416 } else {
1417 redisLog(REDIS_WARNING,
1418 "Background append only file rewriting terminated by signal %d",
1419 WTERMSIG(statloc));
1420 }
1421 cleanup:
1422 sdsfree(server.bgrewritebuf);
1423 server.bgrewritebuf = sdsempty();
1424 aofRemoveTempFile(server.bgrewritechildpid);
1425 server.bgrewritechildpid = -1;
1426 }
1427
1428 /* This function is called once a background process of some kind terminates,
1429 * as we want to avoid resizing the hash tables when there is a child in order
1430 * to play well with copy-on-write (otherwise when a resize happens lots of
1431 * memory pages are copied). The goal of this function is to update the ability
1432 * for dict.c to resize the hash tables accordingly to the fact we have o not
1433 * running childs. */
1434 static void updateDictResizePolicy(void) {
1435 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1436 dictEnableResize();
1437 else
1438 dictDisableResize();
1439 }
1440
1441 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1442 int j, loops = server.cronloops++;
1443 REDIS_NOTUSED(eventLoop);
1444 REDIS_NOTUSED(id);
1445 REDIS_NOTUSED(clientData);
1446
1447 /* We take a cached value of the unix time in the global state because
1448 * with virtual memory and aging there is to store the current time
1449 * in objects at every object access, and accuracy is not needed.
1450 * To access a global var is faster than calling time(NULL) */
1451 server.unixtime = time(NULL);
1452 /* We have just 21 bits per object for LRU information.
1453 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1454 *
1455 * When we need to select what object to swap, we compute the minimum
1456 * time distance between the current lruclock and the object last access
1457 * lruclock info. Even if clocks will wrap on overflow, there is
1458 * the interesting property that we are sure that at least
1459 * ABS(A-B) minutes passed between current time and timestamp B.
1460 *
1461 * This is not precise but we don't need at all precision, but just
1462 * something statistically reasonable.
1463 */
1464 server.lruclock = (time(NULL)/60)&((1<<21)-1);
1465
1466 /* We received a SIGTERM, shutting down here in a safe way, as it is
1467 * not ok doing so inside the signal handler. */
1468 if (server.shutdown_asap) {
1469 if (prepareForShutdown() == REDIS_OK) exit(0);
1470 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1471 }
1472
1473 /* Show some info about non-empty databases */
1474 for (j = 0; j < server.dbnum; j++) {
1475 long long size, used, vkeys;
1476
1477 size = dictSlots(server.db[j].dict);
1478 used = dictSize(server.db[j].dict);
1479 vkeys = dictSize(server.db[j].expires);
1480 if (!(loops % 50) && (used || vkeys)) {
1481 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1482 /* dictPrintStats(server.dict); */
1483 }
1484 }
1485
1486 /* We don't want to resize the hash tables while a bacground saving
1487 * is in progress: the saving child is created using fork() that is
1488 * implemented with a copy-on-write semantic in most modern systems, so
1489 * if we resize the HT while there is the saving child at work actually
1490 * a lot of memory movements in the parent will cause a lot of pages
1491 * copied. */
1492 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1493 if (!(loops % 10)) tryResizeHashTables();
1494 if (server.activerehashing) incrementallyRehash();
1495 }
1496
1497 /* Show information about connected clients */
1498 if (!(loops % 50)) {
1499 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1500 listLength(server.clients)-listLength(server.slaves),
1501 listLength(server.slaves),
1502 zmalloc_used_memory());
1503 }
1504
1505 /* Close connections of timedout clients */
1506 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1507 closeTimedoutClients();
1508
1509 /* Check if a background saving or AOF rewrite in progress terminated */
1510 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1511 int statloc;
1512 pid_t pid;
1513
1514 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1515 if (pid == server.bgsavechildpid) {
1516 backgroundSaveDoneHandler(statloc);
1517 } else {
1518 backgroundRewriteDoneHandler(statloc);
1519 }
1520 updateDictResizePolicy();
1521 }
1522 } else {
1523 /* If there is not a background saving in progress check if
1524 * we have to save now */
1525 time_t now = time(NULL);
1526 for (j = 0; j < server.saveparamslen; j++) {
1527 struct saveparam *sp = server.saveparams+j;
1528
1529 if (server.dirty >= sp->changes &&
1530 now-server.lastsave > sp->seconds) {
1531 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1532 sp->changes, sp->seconds);
1533 rdbSaveBackground(server.dbfilename);
1534 break;
1535 }
1536 }
1537 }
1538
1539 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1540 * will use few CPU cycles if there are few expiring keys, otherwise
1541 * it will get more aggressive to avoid that too much memory is used by
1542 * keys that can be removed from the keyspace. */
1543 for (j = 0; j < server.dbnum; j++) {
1544 int expired;
1545 redisDb *db = server.db+j;
1546
1547 /* Continue to expire if at the end of the cycle more than 25%
1548 * of the keys were expired. */
1549 do {
1550 long num = dictSize(db->expires);
1551 time_t now = time(NULL);
1552
1553 expired = 0;
1554 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1555 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1556 while (num--) {
1557 dictEntry *de;
1558 time_t t;
1559
1560 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1561 t = (time_t) dictGetEntryVal(de);
1562 if (now > t) {
1563 deleteKey(db,dictGetEntryKey(de));
1564 expired++;
1565 server.stat_expiredkeys++;
1566 }
1567 }
1568 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1569 }
1570
1571 /* Swap a few keys on disk if we are over the memory limit and VM
1572 * is enbled. Try to free objects from the free list first. */
1573 if (vmCanSwapOut()) {
1574 while (server.vm_enabled && zmalloc_used_memory() >
1575 server.vm_max_memory)
1576 {
1577 int retval;
1578
1579 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1580 retval = (server.vm_max_threads == 0) ?
1581 vmSwapOneObjectBlocking() :
1582 vmSwapOneObjectThreaded();
1583 if (retval == REDIS_ERR && !(loops % 300) &&
1584 zmalloc_used_memory() >
1585 (server.vm_max_memory+server.vm_max_memory/10))
1586 {
1587 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1588 }
1589 /* Note that when using threade I/O we free just one object,
1590 * because anyway when the I/O thread in charge to swap this
1591 * object out will finish, the handler of completed jobs
1592 * will try to swap more objects if we are still out of memory. */
1593 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1594 }
1595 }
1596
1597 /* Check if we should connect to a MASTER */
1598 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1599 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1600 if (syncWithMaster() == REDIS_OK) {
1601 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1602 if (server.appendonly) rewriteAppendOnlyFileBackground();
1603 }
1604 }
1605 return 100;
1606 }
1607
1608 /* This function gets called every time Redis is entering the
1609 * main loop of the event driven library, that is, before to sleep
1610 * for ready file descriptors. */
1611 static void beforeSleep(struct aeEventLoop *eventLoop) {
1612 REDIS_NOTUSED(eventLoop);
1613
1614 /* Awake clients that got all the swapped keys they requested */
1615 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1616 listIter li;
1617 listNode *ln;
1618
1619 listRewind(server.io_ready_clients,&li);
1620 while((ln = listNext(&li))) {
1621 redisClient *c = ln->value;
1622 struct redisCommand *cmd;
1623
1624 /* Resume the client. */
1625 listDelNode(server.io_ready_clients,ln);
1626 c->flags &= (~REDIS_IO_WAIT);
1627 server.vm_blocked_clients--;
1628 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1629 readQueryFromClient, c);
1630 cmd = lookupCommand(c->argv[0]->ptr);
1631 assert(cmd != NULL);
1632 call(c,cmd);
1633 resetClient(c);
1634 /* There may be more data to process in the input buffer. */
1635 if (c->querybuf && sdslen(c->querybuf) > 0)
1636 processInputBuffer(c);
1637 }
1638 }
1639 /* Write the AOF buffer on disk */
1640 flushAppendOnlyFile();
1641 }
1642
1643 static void createSharedObjects(void) {
1644 int j;
1645
1646 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1647 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1648 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1649 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1650 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1651 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1652 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1653 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1654 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1655 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1656 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1657 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1658 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1659 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1660 "-ERR no such key\r\n"));
1661 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1662 "-ERR syntax error\r\n"));
1663 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1664 "-ERR source and destination objects are the same\r\n"));
1665 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1666 "-ERR index out of range\r\n"));
1667 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1668 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1669 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1670 shared.select0 = createStringObject("select 0\r\n",10);
1671 shared.select1 = createStringObject("select 1\r\n",10);
1672 shared.select2 = createStringObject("select 2\r\n",10);
1673 shared.select3 = createStringObject("select 3\r\n",10);
1674 shared.select4 = createStringObject("select 4\r\n",10);
1675 shared.select5 = createStringObject("select 5\r\n",10);
1676 shared.select6 = createStringObject("select 6\r\n",10);
1677 shared.select7 = createStringObject("select 7\r\n",10);
1678 shared.select8 = createStringObject("select 8\r\n",10);
1679 shared.select9 = createStringObject("select 9\r\n",10);
1680 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1681 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1682 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1683 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1684 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1685 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1686 shared.mbulk3 = createStringObject("*3\r\n",4);
1687 shared.mbulk4 = createStringObject("*4\r\n",4);
1688 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1689 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1690 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1691 }
1692 }
1693
1694 static void appendServerSaveParams(time_t seconds, int changes) {
1695 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1696 server.saveparams[server.saveparamslen].seconds = seconds;
1697 server.saveparams[server.saveparamslen].changes = changes;
1698 server.saveparamslen++;
1699 }
1700
1701 static void resetServerSaveParams() {
1702 zfree(server.saveparams);
1703 server.saveparams = NULL;
1704 server.saveparamslen = 0;
1705 }
1706
1707 static void initServerConfig() {
1708 server.dbnum = REDIS_DEFAULT_DBNUM;
1709 server.port = REDIS_SERVERPORT;
1710 server.verbosity = REDIS_VERBOSE;
1711 server.maxidletime = REDIS_MAXIDLETIME;
1712 server.saveparams = NULL;
1713 server.logfile = NULL; /* NULL = log on standard output */
1714 server.bindaddr = NULL;
1715 server.glueoutputbuf = 1;
1716 server.daemonize = 0;
1717 server.appendonly = 0;
1718 server.appendfsync = APPENDFSYNC_EVERYSEC;
1719 server.no_appendfsync_on_rewrite = 0;
1720 server.lastfsync = time(NULL);
1721 server.appendfd = -1;
1722 server.appendseldb = -1; /* Make sure the first time will not match */
1723 server.pidfile = zstrdup("/var/run/redis.pid");
1724 server.dbfilename = zstrdup("dump.rdb");
1725 server.appendfilename = zstrdup("appendonly.aof");
1726 server.requirepass = NULL;
1727 server.rdbcompression = 1;
1728 server.activerehashing = 1;
1729 server.maxclients = 0;
1730 server.blpop_blocked_clients = 0;
1731 server.maxmemory = 0;
1732 server.vm_enabled = 0;
1733 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1734 server.vm_page_size = 256; /* 256 bytes per page */
1735 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1736 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1737 server.vm_max_threads = 4;
1738 server.vm_blocked_clients = 0;
1739 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1740 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1741 server.shutdown_asap = 0;
1742
1743 resetServerSaveParams();
1744
1745 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1746 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1747 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1748 /* Replication related */
1749 server.isslave = 0;
1750 server.masterauth = NULL;
1751 server.masterhost = NULL;
1752 server.masterport = 6379;
1753 server.master = NULL;
1754 server.replstate = REDIS_REPL_NONE;
1755
1756 /* Double constants initialization */
1757 R_Zero = 0.0;
1758 R_PosInf = 1.0/R_Zero;
1759 R_NegInf = -1.0/R_Zero;
1760 R_Nan = R_Zero/R_Zero;
1761 }
1762
1763 static void initServer() {
1764 int j;
1765
1766 signal(SIGHUP, SIG_IGN);
1767 signal(SIGPIPE, SIG_IGN);
1768 setupSigSegvAction();
1769
1770 server.devnull = fopen("/dev/null","w");
1771 if (server.devnull == NULL) {
1772 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1773 exit(1);
1774 }
1775 server.clients = listCreate();
1776 server.slaves = listCreate();
1777 server.monitors = listCreate();
1778 server.objfreelist = listCreate();
1779 createSharedObjects();
1780 server.el = aeCreateEventLoop();
1781 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1782 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1783 if (server.fd == -1) {
1784 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1785 exit(1);
1786 }
1787 for (j = 0; j < server.dbnum; j++) {
1788 server.db[j].dict = dictCreate(&dbDictType,NULL);
1789 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1790 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1791 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1792 if (server.vm_enabled)
1793 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1794 server.db[j].id = j;
1795 }
1796 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1797 server.pubsub_patterns = listCreate();
1798 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1799 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1800 server.cronloops = 0;
1801 server.bgsavechildpid = -1;
1802 server.bgrewritechildpid = -1;
1803 server.bgrewritebuf = sdsempty();
1804 server.aofbuf = sdsempty();
1805 server.lastsave = time(NULL);
1806 server.dirty = 0;
1807 server.stat_numcommands = 0;
1808 server.stat_numconnections = 0;
1809 server.stat_expiredkeys = 0;
1810 server.stat_starttime = time(NULL);
1811 server.unixtime = time(NULL);
1812 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1813 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1814 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1815
1816 if (server.appendonly) {
1817 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1818 if (server.appendfd == -1) {
1819 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1820 strerror(errno));
1821 exit(1);
1822 }
1823 }
1824
1825 if (server.vm_enabled) vmInit();
1826 }
1827
1828 /* Empty the whole database */
1829 static long long emptyDb() {
1830 int j;
1831 long long removed = 0;
1832
1833 for (j = 0; j < server.dbnum; j++) {
1834 removed += dictSize(server.db[j].dict);
1835 dictEmpty(server.db[j].dict);
1836 dictEmpty(server.db[j].expires);
1837 }
1838 return removed;
1839 }
1840
/* Convert a "yes" / "no" string (compared case insensitively) into
 * 1 / 0. Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1846
1847 /* I agree, this is a very rudimental way to load a configuration...
1848 will improve later if the config gets more complex */
1849 static void loadServerConfig(char *filename) {
1850 FILE *fp;
1851 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1852 int linenum = 0;
1853 sds line = NULL;
1854
1855 if (filename[0] == '-' && filename[1] == '\0')
1856 fp = stdin;
1857 else {
1858 if ((fp = fopen(filename,"r")) == NULL) {
1859 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1860 exit(1);
1861 }
1862 }
1863
1864 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1865 sds *argv;
1866 int argc, j;
1867
1868 linenum++;
1869 line = sdsnew(buf);
1870 line = sdstrim(line," \t\r\n");
1871
1872 /* Skip comments and blank lines*/
1873 if (line[0] == '#' || line[0] == '\0') {
1874 sdsfree(line);
1875 continue;
1876 }
1877
1878 /* Split into arguments */
1879 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1880 sdstolower(argv[0]);
1881
1882 /* Execute config directives */
1883 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1884 server.maxidletime = atoi(argv[1]);
1885 if (server.maxidletime < 0) {
1886 err = "Invalid timeout value"; goto loaderr;
1887 }
1888 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1889 server.port = atoi(argv[1]);
1890 if (server.port < 1 || server.port > 65535) {
1891 err = "Invalid port"; goto loaderr;
1892 }
1893 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1894 server.bindaddr = zstrdup(argv[1]);
1895 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1896 int seconds = atoi(argv[1]);
1897 int changes = atoi(argv[2]);
1898 if (seconds < 1 || changes < 0) {
1899 err = "Invalid save parameters"; goto loaderr;
1900 }
1901 appendServerSaveParams(seconds,changes);
1902 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1903 if (chdir(argv[1]) == -1) {
1904 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1905 argv[1], strerror(errno));
1906 exit(1);
1907 }
1908 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1909 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1910 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1911 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1912 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1913 else {
1914 err = "Invalid log level. Must be one of debug, notice, warning";
1915 goto loaderr;
1916 }
1917 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1918 FILE *logfp;
1919
1920 server.logfile = zstrdup(argv[1]);
1921 if (!strcasecmp(server.logfile,"stdout")) {
1922 zfree(server.logfile);
1923 server.logfile = NULL;
1924 }
1925 if (server.logfile) {
1926 /* Test if we are able to open the file. The server will not
1927 * be able to abort just for this problem later... */
1928 logfp = fopen(server.logfile,"a");
1929 if (logfp == NULL) {
1930 err = sdscatprintf(sdsempty(),
1931 "Can't open the log file: %s", strerror(errno));
1932 goto loaderr;
1933 }
1934 fclose(logfp);
1935 }
1936 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1937 server.dbnum = atoi(argv[1]);
1938 if (server.dbnum < 1) {
1939 err = "Invalid number of databases"; goto loaderr;
1940 }
1941 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1942 loadServerConfig(argv[1]);
1943 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1944 server.maxclients = atoi(argv[1]);
1945 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1946 server.maxmemory = memtoll(argv[1],NULL);
1947 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1948 server.masterhost = sdsnew(argv[1]);
1949 server.masterport = atoi(argv[2]);
1950 server.replstate = REDIS_REPL_CONNECT;
1951 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1952 server.masterauth = zstrdup(argv[1]);
1953 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1954 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1955 err = "argument must be 'yes' or 'no'"; goto loaderr;
1956 }
1957 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1958 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1959 err = "argument must be 'yes' or 'no'"; goto loaderr;
1960 }
1961 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1962 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1963 err = "argument must be 'yes' or 'no'"; goto loaderr;
1964 }
1965 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1966 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1967 err = "argument must be 'yes' or 'no'"; goto loaderr;
1968 }
1969 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1970 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1971 err = "argument must be 'yes' or 'no'"; goto loaderr;
1972 }
1973 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1974 zfree(server.appendfilename);
1975 server.appendfilename = zstrdup(argv[1]);
1976 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1977 && argc == 2) {
1978 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1979 err = "argument must be 'yes' or 'no'"; goto loaderr;
1980 }
1981 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1982 if (!strcasecmp(argv[1],"no")) {
1983 server.appendfsync = APPENDFSYNC_NO;
1984 } else if (!strcasecmp(argv[1],"always")) {
1985 server.appendfsync = APPENDFSYNC_ALWAYS;
1986 } else if (!strcasecmp(argv[1],"everysec")) {
1987 server.appendfsync = APPENDFSYNC_EVERYSEC;
1988 } else {
1989 err = "argument must be 'no', 'always' or 'everysec'";
1990 goto loaderr;
1991 }
1992 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1993 server.requirepass = zstrdup(argv[1]);
1994 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1995 zfree(server.pidfile);
1996 server.pidfile = zstrdup(argv[1]);
1997 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1998 zfree(server.dbfilename);
1999 server.dbfilename = zstrdup(argv[1]);
2000 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2001 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2002 err = "argument must be 'yes' or 'no'"; goto loaderr;
2003 }
2004 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
2005 zfree(server.vm_swap_file);
2006 server.vm_swap_file = zstrdup(argv[1]);
2007 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2008 server.vm_max_memory = memtoll(argv[1],NULL);
2009 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2010 server.vm_page_size = memtoll(argv[1], NULL);
2011 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2012 server.vm_pages = memtoll(argv[1], NULL);
2013 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2014 server.vm_max_threads = strtoll(argv[1], NULL, 10);
2015 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2016 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
2017 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2018 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
2019 } else {
2020 err = "Bad directive or wrong number of arguments"; goto loaderr;
2021 }
2022 for (j = 0; j < argc; j++)
2023 sdsfree(argv[j]);
2024 zfree(argv);
2025 sdsfree(line);
2026 }
2027 if (fp != stdin) fclose(fp);
2028 return;
2029
2030 loaderr:
2031 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2032 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2033 fprintf(stderr, ">>> '%s'\n", line);
2034 fprintf(stderr, "%s\n", err);
2035 exit(1);
2036 }
2037
2038 static void freeClientArgv(redisClient *c) {
2039 int j;
2040
2041 for (j = 0; j < c->argc; j++)
2042 decrRefCount(c->argv[j]);
2043 for (j = 0; j < c->mbargc; j++)
2044 decrRefCount(c->mbargv[j]);
2045 c->argc = 0;
2046 c->mbargc = 0;
2047 }
2048
2049 static void freeClient(redisClient *c) {
2050 listNode *ln;
2051
2052 /* Note that if the client we are freeing is blocked into a blocking
2053 * call, we have to set querybuf to NULL *before* to call
2054 * unblockClientWaitingData() to avoid processInputBuffer() will get
2055 * called. Also it is important to remove the file events after
2056 * this, because this call adds the READABLE event. */
2057 sdsfree(c->querybuf);
2058 c->querybuf = NULL;
2059 if (c->flags & REDIS_BLOCKED)
2060 unblockClientWaitingData(c);
2061
2062 /* UNWATCH all the keys */
2063 unwatchAllKeys(c);
2064 listRelease(c->watched_keys);
2065 /* Unsubscribe from all the pubsub channels */
2066 pubsubUnsubscribeAllChannels(c,0);
2067 pubsubUnsubscribeAllPatterns(c,0);
2068 dictRelease(c->pubsub_channels);
2069 listRelease(c->pubsub_patterns);
2070 /* Obvious cleanup */
2071 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2072 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2073 listRelease(c->reply);
2074 freeClientArgv(c);
2075 close(c->fd);
2076 /* Remove from the list of clients */
2077 ln = listSearchKey(server.clients,c);
2078 redisAssert(ln != NULL);
2079 listDelNode(server.clients,ln);
2080 /* Remove from the list of clients that are now ready to be restarted
2081 * after waiting for swapped keys */
2082 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2083 ln = listSearchKey(server.io_ready_clients,c);
2084 if (ln) {
2085 listDelNode(server.io_ready_clients,ln);
2086 server.vm_blocked_clients--;
2087 }
2088 }
2089 /* Remove from the list of clients waiting for swapped keys */
2090 while (server.vm_enabled && listLength(c->io_keys)) {
2091 ln = listFirst(c->io_keys);
2092 dontWaitForSwappedKey(c,ln->value);
2093 }
2094 listRelease(c->io_keys);
2095 /* Master/slave cleanup */
2096 if (c->flags & REDIS_SLAVE) {
2097 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2098 close(c->repldbfd);
2099 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2100 ln = listSearchKey(l,c);
2101 redisAssert(ln != NULL);
2102 listDelNode(l,ln);
2103 }
2104 if (c->flags & REDIS_MASTER) {
2105 server.master = NULL;
2106 server.replstate = REDIS_REPL_CONNECT;
2107 }
2108 /* Release memory */
2109 zfree(c->argv);
2110 zfree(c->mbargv);
2111 freeClientMultiState(c);
2112 zfree(c);
2113 }
2114
2115 #define GLUEREPLY_UP_TO (1024)
2116 static void glueReplyBuffersIfNeeded(redisClient *c) {
2117 int copylen = 0;
2118 char buf[GLUEREPLY_UP_TO];
2119 listNode *ln;
2120 listIter li;
2121 robj *o;
2122
2123 listRewind(c->reply,&li);
2124 while((ln = listNext(&li))) {
2125 int objlen;
2126
2127 o = ln->value;
2128 objlen = sdslen(o->ptr);
2129 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2130 memcpy(buf+copylen,o->ptr,objlen);
2131 copylen += objlen;
2132 listDelNode(c->reply,ln);
2133 } else {
2134 if (copylen == 0) return;
2135 break;
2136 }
2137 }
2138 /* Now the output buffer is empty, add the new single element */
2139 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2140 listAddNodeHead(c->reply,o);
2141 }
2142
2143 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2144 redisClient *c = privdata;
2145 int nwritten = 0, totwritten = 0, objlen;
2146 robj *o;
2147 REDIS_NOTUSED(el);
2148 REDIS_NOTUSED(mask);
2149
2150 /* Use writev() if we have enough buffers to send */
2151 if (!server.glueoutputbuf &&
2152 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2153 !(c->flags & REDIS_MASTER))
2154 {
2155 sendReplyToClientWritev(el, fd, privdata, mask);
2156 return;
2157 }
2158
2159 while(listLength(c->reply)) {
2160 if (server.glueoutputbuf && listLength(c->reply) > 1)
2161 glueReplyBuffersIfNeeded(c);
2162
2163 o = listNodeValue(listFirst(c->reply));
2164 objlen = sdslen(o->ptr);
2165
2166 if (objlen == 0) {
2167 listDelNode(c->reply,listFirst(c->reply));
2168 continue;
2169 }
2170
2171 if (c->flags & REDIS_MASTER) {
2172 /* Don't reply to a master */
2173 nwritten = objlen - c->sentlen;
2174 } else {
2175 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2176 if (nwritten <= 0) break;
2177 }
2178 c->sentlen += nwritten;
2179 totwritten += nwritten;
2180 /* If we fully sent the object on head go to the next one */
2181 if (c->sentlen == objlen) {
2182 listDelNode(c->reply,listFirst(c->reply));
2183 c->sentlen = 0;
2184 }
2185 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2186 * bytes, in a single threaded server it's a good idea to serve
2187 * other clients as well, even if a very large request comes from
2188 * super fast link that is always able to accept data (in real world
2189 * scenario think about 'KEYS *' against the loopback interfae) */
2190 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2191 }
2192 if (nwritten == -1) {
2193 if (errno == EAGAIN) {
2194 nwritten = 0;
2195 } else {
2196 redisLog(REDIS_VERBOSE,
2197 "Error writing to client: %s", strerror(errno));
2198 freeClient(c);
2199 return;
2200 }
2201 }
2202 if (totwritten > 0) c->lastinteraction = time(NULL);
2203 if (listLength(c->reply) == 0) {
2204 c->sentlen = 0;
2205 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2206 }
2207 }
2208
2209 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2210 {
2211 redisClient *c = privdata;
2212 int nwritten = 0, totwritten = 0, objlen, willwrite;
2213 robj *o;
2214 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2215 int offset, ion = 0;
2216 REDIS_NOTUSED(el);
2217 REDIS_NOTUSED(mask);
2218
2219 listNode *node;
2220 while (listLength(c->reply)) {
2221 offset = c->sentlen;
2222 ion = 0;
2223 willwrite = 0;
2224
2225 /* fill-in the iov[] array */
2226 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2227 o = listNodeValue(node);
2228 objlen = sdslen(o->ptr);
2229
2230 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2231 break;
2232
2233 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2234 break; /* no more iovecs */
2235
2236 iov[ion].iov_base = ((char*)o->ptr) + offset;
2237 iov[ion].iov_len = objlen - offset;
2238 willwrite += objlen - offset;
2239 offset = 0; /* just for the first item */
2240 ion++;
2241 }
2242
2243 if(willwrite == 0)
2244 break;
2245
2246 /* write all collected blocks at once */
2247 if((nwritten = writev(fd, iov, ion)) < 0) {
2248 if (errno != EAGAIN) {
2249 redisLog(REDIS_VERBOSE,
2250 "Error writing to client: %s", strerror(errno));
2251 freeClient(c);
2252 return;
2253 }
2254 break;
2255 }
2256
2257 totwritten += nwritten;
2258 offset = c->sentlen;
2259
2260 /* remove written robjs from c->reply */
2261 while (nwritten && listLength(c->reply)) {
2262 o = listNodeValue(listFirst(c->reply));
2263 objlen = sdslen(o->ptr);
2264
2265 if(nwritten >= objlen - offset) {
2266 listDelNode(c->reply, listFirst(c->reply));
2267 nwritten -= objlen - offset;
2268 c->sentlen = 0;
2269 } else {
2270 /* partial write */
2271 c->sentlen += nwritten;
2272 break;
2273 }
2274 offset = 0;
2275 }
2276 }
2277
2278 if (totwritten > 0)
2279 c->lastinteraction = time(NULL);
2280
2281 if (listLength(c->reply) == 0) {
2282 c->sentlen = 0;
2283 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2284 }
2285 }
2286
2287 static int qsortRedisCommands(const void *r1, const void *r2) {
2288 return strcasecmp(
2289 ((struct redisCommand*)r1)->name,
2290 ((struct redisCommand*)r2)->name);
2291 }
2292
2293 static void sortCommandTable() {
2294 /* Copy and sort the read-only version of the command table */
2295 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2296 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2297 qsort(commandTable,
2298 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2299 sizeof(struct redisCommand),qsortRedisCommands);
2300 }
2301
2302 static struct redisCommand *lookupCommand(char *name) {
2303 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2304 return bsearch(
2305 &tmp,
2306 commandTable,
2307 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2308 sizeof(struct redisCommand),
2309 qsortRedisCommands);
2310 }
2311
2312 /* resetClient prepare the client to process the next command */
2313 static void resetClient(redisClient *c) {
2314 freeClientArgv(c);
2315 c->bulklen = -1;
2316 c->multibulk = 0;
2317 }
2318
2319 /* Call() is the core of Redis execution of a command */
2320 static void call(redisClient *c, struct redisCommand *cmd) {
2321 long long dirty;
2322
2323 dirty = server.dirty;
2324 cmd->proc(c);
2325 dirty = server.dirty-dirty;
2326
2327 if (server.appendonly && dirty)
2328 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2329 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2330 listLength(server.slaves))
2331 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2332 if (listLength(server.monitors))
2333 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2334 server.stat_numcommands++;
2335 }
2336
2337 /* If this function gets called we already read a whole
2338 * command, argments are in the client argv/argc fields.
2339 * processCommand() execute the command or prepare the
2340 * server for a bulk read from the client.
2341 *
2342 * If 1 is returned the client is still alive and valid and
2343 * and other operations can be performed by the caller. Otherwise
2344 * if 0 is returned the client was destroied (i.e. after QUIT). */
2345 static int processCommand(redisClient *c) {
2346 struct redisCommand *cmd;
2347
2348 /* Free some memory if needed (maxmemory setting) */
2349 if (server.maxmemory) freeMemoryIfNeeded();
2350
2351 /* Handle the multi bulk command type. This is an alternative protocol
2352 * supported by Redis in order to receive commands that are composed of
2353 * multiple binary-safe "bulk" arguments. The latency of processing is
2354 * a bit higher but this allows things like multi-sets, so if this
2355 * protocol is used only for MSET and similar commands this is a big win. */
2356 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2357 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2358 if (c->multibulk <= 0) {
2359 resetClient(c);
2360 return 1;
2361 } else {
2362 decrRefCount(c->argv[c->argc-1]);
2363 c->argc--;
2364 return 1;
2365 }
2366 } else if (c->multibulk) {
2367 if (c->bulklen == -1) {
2368 if (((char*)c->argv[0]->ptr)[0] != '$') {
2369 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2370 resetClient(c);
2371 return 1;
2372 } else {
2373 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2374 decrRefCount(c->argv[0]);
2375 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2376 c->argc--;
2377 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2378 resetClient(c);
2379 return 1;
2380 }
2381 c->argc--;
2382 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2383 return 1;
2384 }
2385 } else {
2386 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2387 c->mbargv[c->mbargc] = c->argv[0];
2388 c->mbargc++;
2389 c->argc--;
2390 c->multibulk--;
2391 if (c->multibulk == 0) {
2392 robj **auxargv;
2393 int auxargc;
2394
2395 /* Here we need to swap the multi-bulk argc/argv with the
2396 * normal argc/argv of the client structure. */
2397 auxargv = c->argv;
2398 c->argv = c->mbargv;
2399 c->mbargv = auxargv;
2400
2401 auxargc = c->argc;
2402 c->argc = c->mbargc;
2403 c->mbargc = auxargc;
2404
2405 /* We need to set bulklen to something different than -1
2406 * in order for the code below to process the command without
2407 * to try to read the last argument of a bulk command as
2408 * a special argument. */
2409 c->bulklen = 0;
2410 /* continue below and process the command */
2411 } else {
2412 c->bulklen = -1;
2413 return 1;
2414 }
2415 }
2416 }
2417 /* -- end of multi bulk commands processing -- */
2418
2419 /* The QUIT command is handled as a special case. Normal command
2420 * procs are unable to close the client connection safely */
2421 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2422 freeClient(c);
2423 return 0;
2424 }
2425
2426 /* Now lookup the command and check ASAP about trivial error conditions
2427 * such wrong arity, bad command name and so forth. */
2428 cmd = lookupCommand(c->argv[0]->ptr);
2429 if (!cmd) {
2430 addReplySds(c,
2431 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2432 (char*)c->argv[0]->ptr));
2433 resetClient(c);
2434 return 1;
2435 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2436 (c->argc < -cmd->arity)) {
2437 addReplySds(c,
2438 sdscatprintf(sdsempty(),
2439 "-ERR wrong number of arguments for '%s' command\r\n",
2440 cmd->name));
2441 resetClient(c);
2442 return 1;
2443 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2444 /* This is a bulk command, we have to read the last argument yet. */
2445 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2446
2447 decrRefCount(c->argv[c->argc-1]);
2448 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2449 c->argc--;
2450 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2451 resetClient(c);
2452 return 1;
2453 }
2454 c->argc--;
2455 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2456 /* It is possible that the bulk read is already in the
2457 * buffer. Check this condition and handle it accordingly.
2458 * This is just a fast path, alternative to call processInputBuffer().
2459 * It's a good idea since the code is small and this condition
2460 * happens most of the times. */
2461 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2462 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2463 c->argc++;
2464 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2465 } else {
2466 /* Otherwise return... there is to read the last argument
2467 * from the socket. */
2468 return 1;
2469 }
2470 }
2471 /* Let's try to encode the bulk object to save space. */
2472 if (cmd->flags & REDIS_CMD_BULK)
2473 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2474
2475 /* Check if the user is authenticated */
2476 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2477 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2478 resetClient(c);
2479 return 1;
2480 }
2481
2482 /* Handle the maxmemory directive */
2483 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2484 zmalloc_used_memory() > server.maxmemory)
2485 {
2486 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2487 resetClient(c);
2488 return 1;
2489 }
2490
2491 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2492 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2493 &&
2494 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2495 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2496 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2497 resetClient(c);
2498 return 1;
2499 }
2500
2501 /* Exec the command */
2502 if (c->flags & REDIS_MULTI &&
2503 cmd->proc != execCommand && cmd->proc != discardCommand &&
2504 cmd->proc != multiCommand && cmd->proc != watchCommand)
2505 {
2506 queueMultiCommand(c,cmd);
2507 addReply(c,shared.queued);
2508 } else {
2509 if (server.vm_enabled && server.vm_max_threads > 0 &&
2510 blockClientOnSwappedKeys(c,cmd)) return 1;
2511 call(c,cmd);
2512 }
2513
2514 /* Prepare the client for the next command */
2515 resetClient(c);
2516 return 1;
2517 }
2518
2519 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2520 listNode *ln;
2521 listIter li;
2522 int outc = 0, j;
2523 robj **outv;
2524 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2525 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2526 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2527 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2528 robj *lenobj;
2529
2530 if (argc <= REDIS_STATIC_ARGS) {
2531 outv = static_outv;
2532 } else {
2533 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2534 }
2535
2536 lenobj = createObject(REDIS_STRING,
2537 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2538 lenobj->refcount = 0;
2539 outv[outc++] = lenobj;
2540 for (j = 0; j < argc; j++) {
2541 lenobj = createObject(REDIS_STRING,
2542 sdscatprintf(sdsempty(),"$%lu\r\n",
2543 (unsigned long) stringObjectLen(argv[j])));
2544 lenobj->refcount = 0;
2545 outv[outc++] = lenobj;
2546 outv[outc++] = argv[j];
2547 outv[outc++] = shared.crlf;
2548 }
2549
2550 /* Increment all the refcounts at start and decrement at end in order to
2551 * be sure to free objects if there is no slave in a replication state
2552 * able to be feed with commands */
2553 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2554 listRewind(slaves,&li);
2555 while((ln = listNext(&li))) {
2556 redisClient *slave = ln->value;
2557
2558 /* Don't feed slaves that are still waiting for BGSAVE to start */
2559 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2560
2561 /* Feed all the other slaves, MONITORs and so on */
2562 if (slave->slaveseldb != dictid) {
2563 robj *selectcmd;
2564
2565 switch(dictid) {
2566 case 0: selectcmd = shared.select0; break;
2567 case 1: selectcmd = shared.select1; break;
2568 case 2: selectcmd = shared.select2; break;
2569 case 3: selectcmd = shared.select3; break;
2570 case 4: selectcmd = shared.select4; break;
2571 case 5: selectcmd = shared.select5; break;
2572 case 6: selectcmd = shared.select6; break;
2573 case 7: selectcmd = shared.select7; break;
2574 case 8: selectcmd = shared.select8; break;
2575 case 9: selectcmd = shared.select9; break;
2576 default:
2577 selectcmd = createObject(REDIS_STRING,
2578 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2579 selectcmd->refcount = 0;
2580 break;
2581 }
2582 addReply(slave,selectcmd);
2583 slave->slaveseldb = dictid;
2584 }
2585 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2586 }
2587 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2588 if (outv != static_outv) zfree(outv);
2589 }
2590
2591 static sds sdscatrepr(sds s, char *p, size_t len) {
2592 s = sdscatlen(s,"\"",1);
2593 while(len--) {
2594 switch(*p) {
2595 case '\\':
2596 case '"':
2597 s = sdscatprintf(s,"\\%c",*p);
2598 break;
2599 case '\n': s = sdscatlen(s,"\\n",1); break;
2600 case '\r': s = sdscatlen(s,"\\r",1); break;
2601 case '\t': s = sdscatlen(s,"\\t",1); break;
2602 case '\a': s = sdscatlen(s,"\\a",1); break;
2603 case '\b': s = sdscatlen(s,"\\b",1); break;
2604 default:
2605 if (isprint(*p))
2606 s = sdscatprintf(s,"%c",*p);
2607 else
2608 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2609 break;
2610 }
2611 p++;
2612 }
2613 return sdscatlen(s,"\"",1);
2614 }
2615
2616 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2617 listNode *ln;
2618 listIter li;
2619 int j;
2620 sds cmdrepr = sdsnew("+");
2621 robj *cmdobj;
2622 struct timeval tv;
2623
2624 gettimeofday(&tv,NULL);
2625 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2626 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2627
2628 for (j = 0; j < argc; j++) {
2629 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2630 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2631 } else {
2632 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2633 sdslen(argv[j]->ptr));
2634 }
2635 if (j != argc-1)
2636 cmdrepr = sdscatlen(cmdrepr," ",1);
2637 }
2638 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2639 cmdobj = createObject(REDIS_STRING,cmdrepr);
2640
2641 listRewind(monitors,&li);
2642 while((ln = listNext(&li))) {
2643 redisClient *monitor = ln->value;
2644 addReply(monitor,cmdobj);
2645 }
2646 decrRefCount(cmdobj);
2647 }
2648
2649 static void processInputBuffer(redisClient *c) {
2650 again:
2651 /* Before to process the input buffer, make sure the client is not
2652 * waitig for a blocking operation such as BLPOP. Note that the first
2653 * iteration the client is never blocked, otherwise the processInputBuffer
2654 * would not be called at all, but after the execution of the first commands
2655 * in the input buffer the client may be blocked, and the "goto again"
2656 * will try to reiterate. The following line will make it return asap. */
2657 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2658 if (c->bulklen == -1) {
2659 /* Read the first line of the query */
2660 char *p = strchr(c->querybuf,'\n');
2661 size_t querylen;
2662
2663 if (p) {
2664 sds query, *argv;
2665 int argc, j;
2666
2667 query = c->querybuf;
2668 c->querybuf = sdsempty();
2669 querylen = 1+(p-(query));
2670 if (sdslen(query) > querylen) {
2671 /* leave data after the first line of the query in the buffer */
2672 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2673 }
2674 *p = '\0'; /* remove "\n" */
2675 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2676 sdsupdatelen(query);
2677
2678 /* Now we can split the query in arguments */
2679 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2680 sdsfree(query);
2681
2682 if (c->argv) zfree(c->argv);
2683 c->argv = zmalloc(sizeof(robj*)*argc);
2684
2685 for (j = 0; j < argc; j++) {
2686 if (sdslen(argv[j])) {
2687 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2688 c->argc++;
2689 } else {
2690 sdsfree(argv[j]);
2691 }
2692 }
2693 zfree(argv);
2694 if (c->argc) {
2695 /* Execute the command. If the client is still valid
2696 * after processCommand() return and there is something
2697 * on the query buffer try to process the next command. */
2698 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2699 } else {
2700 /* Nothing to process, argc == 0. Just process the query
2701 * buffer if it's not empty or return to the caller */
2702 if (sdslen(c->querybuf)) goto again;
2703 }
2704 return;
2705 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2706 redisLog(REDIS_VERBOSE, "Client protocol error");
2707 freeClient(c);
2708 return;
2709 }
2710 } else {
2711 /* Bulk read handling. Note that if we are at this point
2712 the client already sent a command terminated with a newline,
2713 we are reading the bulk data that is actually the last
2714 argument of the command. */
2715 int qbl = sdslen(c->querybuf);
2716
2717 if (c->bulklen <= qbl) {
2718 /* Copy everything but the final CRLF as final argument */
2719 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2720 c->argc++;
2721 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2722 /* Process the command. If the client is still valid after
2723 * the processing and there is more data in the buffer
2724 * try to parse it. */
2725 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2726 return;
2727 }
2728 }
2729 }
2730
2731 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2732 redisClient *c = (redisClient*) privdata;
2733 char buf[REDIS_IOBUF_LEN];
2734 int nread;
2735 REDIS_NOTUSED(el);
2736 REDIS_NOTUSED(mask);
2737
2738 nread = read(fd, buf, REDIS_IOBUF_LEN);
2739 if (nread == -1) {
2740 if (errno == EAGAIN) {
2741 nread = 0;
2742 } else {
2743 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2744 freeClient(c);
2745 return;
2746 }
2747 } else if (nread == 0) {
2748 redisLog(REDIS_VERBOSE, "Client closed connection");
2749 freeClient(c);
2750 return;
2751 }
2752 if (nread) {
2753 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2754 c->lastinteraction = time(NULL);
2755 } else {
2756 return;
2757 }
2758 processInputBuffer(c);
2759 }
2760
2761 static int selectDb(redisClient *c, int id) {
2762 if (id < 0 || id >= server.dbnum)
2763 return REDIS_ERR;
2764 c->db = &server.db[id];
2765 return REDIS_OK;
2766 }
2767
2768 static void *dupClientReplyValue(void *o) {
2769 incrRefCount((robj*)o);
2770 return o;
2771 }
2772
/* Match method for lists of string objects: returns whatever
 * equalStringObjects() returns for the two values. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2776
2777 static redisClient *createClient(int fd) {
2778 redisClient *c = zmalloc(sizeof(*c));
2779
2780 anetNonBlock(NULL,fd);
2781 anetTcpNoDelay(NULL,fd);
2782 if (!c) return NULL;
2783 selectDb(c,0);
2784 c->fd = fd;
2785 c->querybuf = sdsempty();
2786 c->argc = 0;
2787 c->argv = NULL;
2788 c->bulklen = -1;
2789 c->multibulk = 0;
2790 c->mbargc = 0;
2791 c->mbargv = NULL;
2792 c->sentlen = 0;
2793 c->flags = 0;
2794 c->lastinteraction = time(NULL);
2795 c->authenticated = 0;
2796 c->replstate = REDIS_REPL_NONE;
2797 c->reply = listCreate();
2798 listSetFreeMethod(c->reply,decrRefCount);
2799 listSetDupMethod(c->reply,dupClientReplyValue);
2800 c->blocking_keys = NULL;
2801 c->blocking_keys_num = 0;
2802 c->io_keys = listCreate();
2803 c->watched_keys = listCreate();
2804 listSetFreeMethod(c->io_keys,decrRefCount);
2805 c->pubsub_channels = dictCreate(&setDictType,NULL);
2806 c->pubsub_patterns = listCreate();
2807 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2808 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2809 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2810 readQueryFromClient, c) == AE_ERR) {
2811 freeClient(c);
2812 return NULL;
2813 }
2814 listAddNodeTail(server.clients,c);
2815 initClientMultiState(c);
2816 return c;
2817 }
2818
2819 static void addReply(redisClient *c, robj *obj) {
2820 if (listLength(c->reply) == 0 &&
2821 (c->replstate == REDIS_REPL_NONE ||
2822 c->replstate == REDIS_REPL_ONLINE) &&
2823 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2824 sendReplyToClient, c) == AE_ERR) return;
2825
2826 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2827 obj = dupStringObject(obj);
2828 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2829 }
2830 listAddNodeTail(c->reply,getDecodedObject(obj));
2831 }
2832
2833 static void addReplySds(redisClient *c, sds s) {
2834 robj *o = createObject(REDIS_STRING,s);
2835 addReply(c,o);
2836 decrRefCount(o);
2837 }
2838
2839 static void addReplyDouble(redisClient *c, double d) {
2840 char buf[128];
2841
2842 snprintf(buf,sizeof(buf),"%.17g",d);
2843 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2844 (unsigned long) strlen(buf),buf));
2845 }
2846
2847 static void addReplyLongLong(redisClient *c, long long ll) {
2848 char buf[128];
2849 size_t len;
2850
2851 if (ll == 0) {
2852 addReply(c,shared.czero);
2853 return;
2854 } else if (ll == 1) {
2855 addReply(c,shared.cone);
2856 return;
2857 }
2858 buf[0] = ':';
2859 len = ll2string(buf+1,sizeof(buf)-1,ll);
2860 buf[len+1] = '\r';
2861 buf[len+2] = '\n';
2862 addReplySds(c,sdsnewlen(buf,len+3));
2863 }
2864
2865 static void addReplyUlong(redisClient *c, unsigned long ul) {
2866 char buf[128];
2867 size_t len;
2868
2869 if (ul == 0) {
2870 addReply(c,shared.czero);
2871 return;
2872 } else if (ul == 1) {
2873 addReply(c,shared.cone);
2874 return;
2875 }
2876 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2877 addReplySds(c,sdsnewlen(buf,len));
2878 }
2879
2880 static void addReplyBulkLen(redisClient *c, robj *obj) {
2881 size_t len, intlen;
2882 char buf[128];
2883
2884 if (obj->encoding == REDIS_ENCODING_RAW) {
2885 len = sdslen(obj->ptr);
2886 } else {
2887 long n = (long)obj->ptr;
2888
2889 /* Compute how many bytes will take this integer as a radix 10 string */
2890 len = 1;
2891 if (n < 0) {
2892 len++;
2893 n = -n;
2894 }
2895 while((n = n/10) != 0) {
2896 len++;
2897 }
2898 }
2899 buf[0] = '$';
2900 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2901 buf[intlen+1] = '\r';
2902 buf[intlen+2] = '\n';
2903 addReplySds(c,sdsnewlen(buf,intlen+3));
2904 }
2905
2906 static void addReplyBulk(redisClient *c, robj *obj) {
2907 addReplyBulkLen(c,obj);
2908 addReply(c,obj);
2909 addReply(c,shared.crlf);
2910 }
2911
2912 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2913 static void addReplyBulkCString(redisClient *c, char *s) {
2914 if (s == NULL) {
2915 addReply(c,shared.nullbulk);
2916 } else {
2917 robj *o = createStringObject(s,strlen(s));
2918 addReplyBulk(c,o);
2919 decrRefCount(o);
2920 }
2921 }
2922
2923 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2924 int cport, cfd;
2925 char cip[128];
2926 redisClient *c;
2927 REDIS_NOTUSED(el);
2928 REDIS_NOTUSED(mask);
2929 REDIS_NOTUSED(privdata);
2930
2931 cfd = anetAccept(server.neterr, fd, cip, &cport);
2932 if (cfd == AE_ERR) {
2933 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2934 return;
2935 }
2936 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2937 if ((c = createClient(cfd)) == NULL) {
2938 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2939 close(cfd); /* May be already closed, just ingore errors */
2940 return;
2941 }
2942 /* If maxclient directive is set and this is one client more... close the
2943 * connection. Note that we create the client instead to check before
2944 * for this condition, since now the socket is already set in nonblocking
2945 * mode and we can send an error for free using the Kernel I/O */
2946 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2947 char *err = "-ERR max number of clients reached\r\n";
2948
2949 /* That's a best effort error message, don't check write errors */
2950 if (write(c->fd,err,strlen(err)) == -1) {
2951 /* Nothing to do, Just to avoid the warning... */
2952 }
2953 freeClient(c);
2954 return;
2955 }
2956 server.stat_numconnections++;
2957 }
2958
2959 /* ======================= Redis objects implementation ===================== */
2960
2961 static robj *createObject(int type, void *ptr) {
2962 robj *o;
2963
2964 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2965 if (listLength(server.objfreelist)) {
2966 listNode *head = listFirst(server.objfreelist);
2967 o = listNodeValue(head);
2968 listDelNode(server.objfreelist,head);
2969 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2970 } else {
2971 if (server.vm_enabled)
2972 pthread_mutex_unlock(&server.obj_freelist_mutex);
2973 o = zmalloc(sizeof(*o));
2974 }
2975 o->type = type;
2976 o->encoding = REDIS_ENCODING_RAW;
2977 o->ptr = ptr;
2978 o->refcount = 1;
2979 if (server.vm_enabled) {
2980 /* Note that this code may run in the context of an I/O thread
2981 * and accessing server.lruclock in theory is an error
2982 * (no locks). But in practice this is safe, and even if we read
2983 * garbage Redis will not fail. */
2984 o->lru = server.lruclock;
2985 o->storage = REDIS_VM_MEMORY;
2986 }
2987 return o;
2988 }
2989
2990 static robj *createStringObject(char *ptr, size_t len) {
2991 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2992 }
2993
2994 static robj *createStringObjectFromLongLong(long long value) {
2995 robj *o;
2996 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2997 incrRefCount(shared.integers[value]);
2998 o = shared.integers[value];
2999 } else {
3000 if (value >= LONG_MIN && value <= LONG_MAX) {
3001 o = createObject(REDIS_STRING, NULL);
3002 o->encoding = REDIS_ENCODING_INT;
3003 o->ptr = (void*)((long)value);
3004 } else {
3005 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3006 }
3007 }
3008 return o;
3009 }
3010
3011 static robj *dupStringObject(robj *o) {
3012 assert(o->encoding == REDIS_ENCODING_RAW);
3013 return createStringObject(o->ptr,sdslen(o->ptr));
3014 }
3015
3016 static robj *createListObject(void) {
3017 list *l = listCreate();
3018
3019 listSetFreeMethod(l,decrRefCount);
3020 return createObject(REDIS_LIST,l);
3021 }
3022
3023 static robj *createSetObject(void) {
3024 dict *d = dictCreate(&setDictType,NULL);
3025 return createObject(REDIS_SET,d);
3026 }
3027
3028 static robj *createHashObject(void) {
3029 /* All the Hashes start as zipmaps. Will be automatically converted
3030 * into hash tables if there are enough elements or big elements
3031 * inside. */
3032 unsigned char *zm = zipmapNew();
3033 robj *o = createObject(REDIS_HASH,zm);
3034 o->encoding = REDIS_ENCODING_ZIPMAP;
3035 return o;
3036 }
3037
3038 static robj *createZsetObject(void) {
3039 zset *zs = zmalloc(sizeof(*zs));
3040
3041 zs->dict = dictCreate(&zsetDictType,NULL);
3042 zs->zsl = zslCreate();
3043 return createObject(REDIS_ZSET,zs);
3044 }
3045
3046 static void freeStringObject(robj *o) {
3047 if (o->encoding == REDIS_ENCODING_RAW) {
3048 sdsfree(o->ptr);
3049 }
3050 }
3051
3052 static void freeListObject(robj *o) {
3053 listRelease((list*) o->ptr);
3054 }
3055
3056 static void freeSetObject(robj *o) {
3057 dictRelease((dict*) o->ptr);
3058 }
3059
3060 static void freeZsetObject(robj *o) {
3061 zset *zs = o->ptr;
3062
3063 dictRelease(zs->dict);
3064 zslFree(zs->zsl);
3065 zfree(zs);
3066 }
3067
3068 static void freeHashObject(robj *o) {
3069 switch (o->encoding) {
3070 case REDIS_ENCODING_HT:
3071 dictRelease((dict*) o->ptr);
3072 break;
3073 case REDIS_ENCODING_ZIPMAP:
3074 zfree(o->ptr);
3075 break;
3076 default:
3077 redisPanic("Unknown hash encoding type");
3078 break;
3079 }
3080 }
3081
3082 static void incrRefCount(robj *o) {
3083 o->refcount++;
3084 }
3085
3086 static void decrRefCount(void *obj) {
3087 robj *o = obj;
3088
3089 /* Object is a swapped out value, or in the process of being loaded. */
3090 if (server.vm_enabled &&
3091 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3092 {
3093 vmpointer *vp = obj;
3094 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3095 vmMarkPagesFree(vp->page,vp->usedpages);
3096 server.vm_stats_swapped_objects--;
3097 zfree(vp);
3098 return;
3099 }
3100
3101 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3102 /* Object is in memory, or in the process of being swapped out.
3103 *
3104 * If the object is being swapped out, abort the operation on
3105 * decrRefCount even if the refcount does not drop to 0: the object
3106 * is referenced at least two times, as value of the key AND as
3107 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3108 * done but the relevant key was removed in the meantime, the
3109 * complete jobs handler will not find the key about the job and the
3110 * assert will fail. */
3111 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3112 vmCancelThreadedIOJob(o);
3113 if (--(o->refcount) == 0) {
3114 switch(o->type) {
3115 case REDIS_STRING: freeStringObject(o); break;
3116 case REDIS_LIST: freeListObject(o); break;
3117 case REDIS_SET: freeSetObject(o); break;
3118 case REDIS_ZSET: freeZsetObject(o); break;
3119 case REDIS_HASH: freeHashObject(o); break;
3120 default: redisPanic("Unknown object type"); break;
3121 }
3122 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3123 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3124 !listAddNodeHead(server.objfreelist,o))
3125 zfree(o);
3126 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3127 }
3128 }
3129
3130 static robj *lookupKey(redisDb *db, robj *key) {
3131 dictEntry *de = dictFind(db->dict,key);
3132 if (de) {
3133 robj *key = dictGetEntryKey(de);
3134 robj *val = dictGetEntryVal(de);
3135
3136 if (server.vm_enabled) {
3137 if (val->storage == REDIS_VM_MEMORY ||
3138 val->storage == REDIS_VM_SWAPPING)
3139 {
3140 /* If we were swapping the object out, cancel the operation */
3141 if (val->storage == REDIS_VM_SWAPPING)
3142 vmCancelThreadedIOJob(val);
3143 /* Update the access time of the key for the aging algorithm. */
3144 val->lru = server.lruclock;
3145 } else {
3146 int notify = (val->storage == REDIS_VM_LOADING);
3147
3148 /* Our value was swapped on disk. Bring it at home. */
3149 redisAssert(val->type == REDIS_VMPOINTER);
3150 val = vmLoadObject(val);
3151 dictGetEntryVal(de) = val;
3152
3153 /* Clients blocked by the VM subsystem may be waiting for
3154 * this key... */
3155 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3156 }
3157 }
3158 return val;
3159 } else {
3160 return NULL;
3161 }
3162 }
3163
3164 static robj *lookupKeyRead(redisDb *db, robj *key) {
3165 expireIfNeeded(db,key);
3166 return lookupKey(db,key);
3167 }
3168
3169 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3170 deleteIfVolatile(db,key);
3171 touchWatchedKey(db,key);
3172 return lookupKey(db,key);
3173 }
3174
3175 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3176 robj *o = lookupKeyRead(c->db, key);
3177 if (!o) addReply(c,reply);
3178 return o;
3179 }
3180
3181 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3182 robj *o = lookupKeyWrite(c->db, key);
3183 if (!o) addReply(c,reply);
3184 return o;
3185 }
3186
3187 static int checkType(redisClient *c, robj *o, int type) {
3188 if (o->type != type) {
3189 addReply(c,shared.wrongtypeerr);
3190 return 1;
3191 }
3192 return 0;
3193 }
3194
3195 static int deleteKey(redisDb *db, robj *key) {
3196 int retval;
3197
3198 /* We need to protect key from destruction: after the first dictDelete()
3199 * it may happen that 'key' is no longer valid if we don't increment
3200 * it's count. This may happen when we get the object reference directly
3201 * from the hash table with dictRandomKey() or dict iterators */
3202 incrRefCount(key);
3203 if (dictSize(db->expires)) dictDelete(db->expires,key);
3204 retval = dictDelete(db->dict,key);
3205 decrRefCount(key);
3206
3207 return retval == DICT_OK;
3208 }
3209
3210 /* Check if the nul-terminated string 's' can be represented by a long
3211 * (that is, is a number that fits into long without any other space or
3212 * character before or after the digits).
3213 *
3214 * If so, the function returns REDIS_OK and *longval is set to the value
3215 * of the number. Otherwise REDIS_ERR is returned */
3216 static int isStringRepresentableAsLong(sds s, long *longval) {
3217 char buf[32], *endptr;
3218 long value;
3219 int slen;
3220
3221 value = strtol(s, &endptr, 10);
3222 if (endptr[0] != '\0') return REDIS_ERR;
3223 slen = ll2string(buf,32,value);
3224
3225 /* If the number converted back into a string is not identical
3226 * then it's not possible to encode the string as integer */
3227 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3228 if (longval) *longval = value;
3229 return REDIS_OK;
3230 }
3231
3232 /* Try to encode a string object in order to save space */
3233 static robj *tryObjectEncoding(robj *o) {
3234 long value;
3235 sds s = o->ptr;
3236
3237 if (o->encoding != REDIS_ENCODING_RAW)
3238 return o; /* Already encoded */
3239
3240 /* It's not safe to encode shared objects: shared objects can be shared
3241 * everywhere in the "object space" of Redis. Encoded objects can only
3242 * appear as "values" (and not, for instance, as keys) */
3243 if (o->refcount > 1) return o;
3244
3245 /* Currently we try to encode only strings */
3246 redisAssert(o->type == REDIS_STRING);
3247
3248 /* Check if we can represent this string as a long integer */
3249 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3250
3251 /* Ok, this object can be encoded */
3252 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3253 decrRefCount(o);
3254 incrRefCount(shared.integers[value]);
3255 return shared.integers[value];
3256 } else {
3257 o->encoding = REDIS_ENCODING_INT;
3258 sdsfree(o->ptr);
3259 o->ptr = (void*) value;
3260 return o;
3261 }
3262 }
3263
3264 /* Get a decoded version of an encoded object (returned as a new object).
3265 * If the object is already raw-encoded just increment the ref count. */
3266 static robj *getDecodedObject(robj *o) {
3267 robj *dec;
3268
3269 if (o->encoding == REDIS_ENCODING_RAW) {
3270 incrRefCount(o);
3271 return o;
3272 }
3273 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3274 char buf[32];
3275
3276 ll2string(buf,32,(long)o->ptr);
3277 dec = createStringObject(buf,strlen(buf));
3278 return dec;
3279 } else {
3280 redisPanic("Unknown encoding type");
3281 }
3282 }
3283
3284 /* Compare two string objects via strcmp() or alike.
3285 * Note that the objects may be integer-encoded. In such a case we
3286 * use ll2string() to get a string representation of the numbers on the stack
3287 * and compare the strings, it's much faster than calling getDecodedObject().
3288 *
3289 * Important note: if objects are not integer encoded, but binary-safe strings,
3290 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3291 * binary safe. */
3292 static int compareStringObjects(robj *a, robj *b) {
3293 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3294 char bufa[128], bufb[128], *astr, *bstr;
3295 int bothsds = 1;
3296
3297 if (a == b) return 0;
3298 if (a->encoding != REDIS_ENCODING_RAW) {
3299 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3300 astr = bufa;
3301 bothsds = 0;
3302 } else {
3303 astr = a->ptr;
3304 }
3305 if (b->encoding != REDIS_ENCODING_RAW) {
3306 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3307 bstr = bufb;
3308 bothsds = 0;
3309 } else {
3310 bstr = b->ptr;
3311 }
3312 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3313 }
3314
3315 /* Equal string objects return 1 if the two objects are the same from the
3316 * point of view of a string comparison, otherwise 0 is returned. Note that
3317 * this function is faster then checking for (compareStringObject(a,b) == 0)
3318 * because it can perform some more optimization. */
3319 static int equalStringObjects(robj *a, robj *b) {
3320 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3321 return a->ptr == b->ptr;
3322 } else {
3323 return compareStringObjects(a,b) == 0;
3324 }
3325 }
3326
3327 static size_t stringObjectLen(robj *o) {
3328 redisAssert(o->type == REDIS_STRING);
3329 if (o->encoding == REDIS_ENCODING_RAW) {
3330 return sdslen(o->ptr);
3331 } else {
3332 char buf[32];
3333
3334 return ll2string(buf,32,(long)o->ptr);
3335 }
3336 }
3337
3338 static int getDoubleFromObject(robj *o, double *target) {
3339 double value;
3340 char *eptr;
3341
3342 if (o == NULL) {
3343 value = 0;
3344 } else {
3345 redisAssert(o->type == REDIS_STRING);
3346 if (o->encoding == REDIS_ENCODING_RAW) {
3347 value = strtod(o->ptr, &eptr);
3348 if (eptr[0] != '\0') return REDIS_ERR;
3349 } else if (o->encoding == REDIS_ENCODING_INT) {
3350 value = (long)o->ptr;
3351 } else {
3352 redisPanic("Unknown string encoding");
3353 }
3354 }
3355
3356 *target = value;
3357 return REDIS_OK;
3358 }
3359
3360 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3361 double value;
3362 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3363 if (msg != NULL) {
3364 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3365 } else {
3366 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3367 }
3368 return REDIS_ERR;
3369 }
3370
3371 *target = value;
3372 return REDIS_OK;
3373 }
3374
3375 static int getLongLongFromObject(robj *o, long long *target) {
3376 long long value;
3377 char *eptr;
3378
3379 if (o == NULL) {
3380 value = 0;
3381 } else {
3382 redisAssert(o->type == REDIS_STRING);
3383 if (o->encoding == REDIS_ENCODING_RAW) {
3384 value = strtoll(o->ptr, &eptr, 10);
3385 if (eptr[0] != '\0') return REDIS_ERR;
3386 } else if (o->encoding == REDIS_ENCODING_INT) {
3387 value = (long)o->ptr;
3388 } else {
3389 redisPanic("Unknown string encoding");
3390 }
3391 }
3392
3393 *target = value;
3394 return REDIS_OK;
3395 }
3396
3397 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3398 long long value;
3399 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3400 if (msg != NULL) {
3401 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3402 } else {
3403 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3404 }
3405 return REDIS_ERR;
3406 }
3407
3408 *target = value;
3409 return REDIS_OK;
3410 }
3411
3412 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3413 long long value;
3414
3415 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3416 if (value < LONG_MIN || value > LONG_MAX) {
3417 if (msg != NULL) {
3418 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3419 } else {
3420 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3421 }
3422 return REDIS_ERR;
3423 }
3424
3425 *target = value;
3426 return REDIS_OK;
3427 }
3428
3429 /*============================ RDB saving/loading =========================== */
3430
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * write failed. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3435
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3441
3442 /* check rdbLoadLen() comments for more info */
3443 static int rdbSaveLen(FILE *fp, uint32_t len) {
3444 unsigned char buf[2];
3445
3446 if (len < (1<<6)) {
3447 /* Save a 6 bit len */
3448 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3449 if (fwrite(buf,1,1,fp) == 0) return -1;
3450 } else if (len < (1<<14)) {
3451 /* Save a 14 bit len */
3452 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3453 buf[1] = len&0xFF;
3454 if (fwrite(buf,2,1,fp) == 0) return -1;
3455 } else {
3456 /* Save a 32 bit len */
3457 buf[0] = (REDIS_RDB_32BITLEN<<6);
3458 if (fwrite(buf,1,1,fp) == 0) return -1;
3459 len = htonl(len);
3460 if (fwrite(&len,4,1,fp) == 0) return -1;
3461 }
3462 return 0;
3463 }
3464
3465 /* Encode 'value' as an integer if possible (if integer will fit the
3466 * supported range). If the function sucessful encoded the integer
3467 * then the (up to 5 bytes) encoded representation is written in the
3468 * string pointed by 'enc' and the length is returned. Otherwise
3469 * 0 is returned. */
3470 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3471 /* Finally check if it fits in our ranges */
3472 if (value >= -(1<<7) && value <= (1<<7)-1) {
3473 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3474 enc[1] = value&0xFF;
3475 return 2;
3476 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3477 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3478 enc[1] = value&0xFF;
3479 enc[2] = (value>>8)&0xFF;
3480 return 3;
3481 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3482 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3483 enc[1] = value&0xFF;
3484 enc[2] = (value>>8)&0xFF;
3485 enc[3] = (value>>16)&0xFF;
3486 enc[4] = (value>>24)&0xFF;
3487 return 5;
3488 } else {
3489 return 0;
3490 }
3491 }
3492
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are not required to be null terminated
 * (zipmap keys and values are handed here as pointer + length), so the
 * digits are parsed from a private, terminated copy and never past 'len'.
 *
 * On success the encoded form is stored in 'enc' and its length is
 * returned, otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, copy[32], buf[32];

    /* An empty string is not a number, and anything that does not fit the
     * local buffer is longer than any printed long long. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3511
3512 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3513 size_t comprlen, outlen;
3514 unsigned char byte;
3515 void *out;
3516
3517 /* We require at least four bytes compression for this to be worth it */
3518 if (len <= 4) return 0;
3519 outlen = len-4;
3520 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3521 comprlen = lzf_compress(s, len, out, outlen);
3522 if (comprlen == 0) {
3523 zfree(out);
3524 return 0;
3525 }
3526 /* Data compressed! Let's save it on disk */
3527 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3528 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3529 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3530 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3531 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3532 zfree(out);
3533 return comprlen;
3534
3535 writeerr:
3536 zfree(out);
3537 return -1;
3538 }
3539
3540 /* Save a string objet as [len][data] on disk. If the object is a string
3541 * representation of an integer value we try to safe it in a special form */
3542 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3543 int enclen;
3544
3545 /* Try integer encoding */
3546 if (len <= 11) {
3547 unsigned char buf[5];
3548 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3549 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3550 return 0;
3551 }
3552 }
3553
3554 /* Try LZF compression - under 20 bytes it's unable to compress even
3555 * aaaaaaaaaaaaaaaaaa so skip it */
3556 if (server.rdbcompression && len > 20) {
3557 int retval;
3558
3559 retval = rdbSaveLzfStringObject(fp,s,len);
3560 if (retval == -1) return -1;
3561 if (retval > 0) return 0;
3562 /* retval == 0 means data can't be compressed, save the old way */
3563 }
3564
3565 /* Store verbatim */
3566 if (rdbSaveLen(fp,len) == -1) return -1;
3567 if (len && fwrite(s,len,1,fp) == 0) return -1;
3568 return 0;
3569 }
3570
3571 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3572 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3573 int retval;
3574
3575 /* Avoid to decode the object, then encode it again, if the
3576 * object is alrady integer encoded. */
3577 if (obj->encoding == REDIS_ENCODING_INT) {
3578 long val = (long) obj->ptr;
3579 unsigned char buf[5];
3580 int enclen;
3581
3582 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3583 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3584 return 0;
3585 }
3586 /* otherwise... fall throught and continue with the usual
3587 * code path. */
3588 }
3589
3590 /* Avoid incr/decr ref count business when possible.
3591 * This plays well with copy-on-write given that we are probably
3592 * in a child process (BGSAVE). Also this makes sure key objects
3593 * of swapped objects are not incRefCount-ed (an assert does not allow
3594 * this in order to avoid bugs) */
3595 if (obj->encoding != REDIS_ENCODING_RAW) {
3596 obj = getDecodedObject(obj);
3597 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3598 decrRefCount(obj);
3599 } else {
3600 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3601 }
3602 return retval;
3603 }
3604
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3647
3648 /* Save a Redis object. */
3649 static int rdbSaveObject(FILE *fp, robj *o) {
3650 if (o->type == REDIS_STRING) {
3651 /* Save a string value */
3652 if (rdbSaveStringObject(fp,o) == -1) return -1;
3653 } else if (o->type == REDIS_LIST) {
3654 /* Save a list value */
3655 list *list = o->ptr;
3656 listIter li;
3657 listNode *ln;
3658
3659 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3660 listRewind(list,&li);
3661 while((ln = listNext(&li))) {
3662 robj *eleobj = listNodeValue(ln);
3663
3664 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3665 }
3666 } else if (o->type == REDIS_SET) {
3667 /* Save a set value */
3668 dict *set = o->ptr;
3669 dictIterator *di = dictGetIterator(set);
3670 dictEntry *de;
3671
3672 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3673 while((de = dictNext(di)) != NULL) {
3674 robj *eleobj = dictGetEntryKey(de);
3675
3676 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3677 }
3678 dictReleaseIterator(di);
3679 } else if (o->type == REDIS_ZSET) {
3680 /* Save a set value */
3681 zset *zs = o->ptr;
3682 dictIterator *di = dictGetIterator(zs->dict);
3683 dictEntry *de;
3684
3685 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3686 while((de = dictNext(di)) != NULL) {
3687 robj *eleobj = dictGetEntryKey(de);
3688 double *score = dictGetEntryVal(de);
3689
3690 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3691 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3692 }
3693 dictReleaseIterator(di);
3694 } else if (o->type == REDIS_HASH) {
3695 /* Save a hash value */
3696 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3697 unsigned char *p = zipmapRewind(o->ptr);
3698 unsigned int count = zipmapLen(o->ptr);
3699 unsigned char *key, *val;
3700 unsigned int klen, vlen;
3701
3702 if (rdbSaveLen(fp,count) == -1) return -1;
3703 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3704 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3705 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3706 }
3707 } else {
3708 dictIterator *di = dictGetIterator(o->ptr);
3709 dictEntry *de;
3710
3711 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3712 while((de = dictNext(di)) != NULL) {
3713 robj *key = dictGetEntryKey(de);
3714 robj *val = dictGetEntryVal(de);
3715
3716 if (rdbSaveStringObject(fp,key) == -1) return -1;
3717 if (rdbSaveStringObject(fp,val) == -1) return -1;
3718 }
3719 dictReleaseIterator(di);
3720 }
3721 } else {
3722 redisPanic("Unknown object type");
3723 }
3724 return 0;
3725 }
3726
3727 /* Return the length the object will have on disk if saved with
3728 * the rdbSaveObject() function. Currently we use a trick to get
3729 * this length with very little changes to the code. In the future
3730 * we could switch to a faster solution. */
3731 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3732 if (fp == NULL) fp = server.devnull;
3733 rewind(fp);
3734 assert(rdbSaveObject(fp,o) != 1);
3735 return ftello(fp);
3736 }
3737
3738 /* Return the number of pages required to save this object in the swap file */
3739 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3740 off_t bytes = rdbSavedObjectLen(o,fp);
3741
3742 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3743 }
3744
3745 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3746 static int rdbSave(char *filename) {
3747 dictIterator *di = NULL;
3748 dictEntry *de;
3749 FILE *fp;
3750 char tmpfile[256];
3751 int j;
3752 time_t now = time(NULL);
3753
3754 /* Wait for I/O therads to terminate, just in case this is a
3755 * foreground-saving, to avoid seeking the swap file descriptor at the
3756 * same time. */
3757 if (server.vm_enabled)
3758 waitEmptyIOJobsQueue();
3759
3760 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3761 fp = fopen(tmpfile,"w");
3762 if (!fp) {
3763 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3764 return REDIS_ERR;
3765 }
3766 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3767 for (j = 0; j < server.dbnum; j++) {
3768 redisDb *db = server.db+j;
3769 dict *d = db->dict;
3770 if (dictSize(d) == 0) continue;
3771 di = dictGetIterator(d);
3772 if (!di) {
3773 fclose(fp);
3774 return REDIS_ERR;
3775 }
3776
3777 /* Write the SELECT DB opcode */
3778 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3779 if (rdbSaveLen(fp,j) == -1) goto werr;
3780
3781 /* Iterate this DB writing every entry */
3782 while((de = dictNext(di)) != NULL) {
3783 robj *key = dictGetEntryKey(de);
3784 robj *o = dictGetEntryVal(de);
3785 time_t expiretime = getExpire(db,key);
3786
3787 /* Save the expire time */
3788 if (expiretime != -1) {
3789 /* If this key is already expired skip it */
3790 if (expiretime < now) continue;
3791 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3792 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3793 }
3794 /* Save the key and associated value. This requires special
3795 * handling if the value is swapped out. */
3796 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3797 o->storage == REDIS_VM_SWAPPING) {
3798 /* Save type, key, value */
3799 if (rdbSaveType(fp,o->type) == -1) goto werr;
3800 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3801 if (rdbSaveObject(fp,o) == -1) goto werr;
3802 } else {
3803 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3804 robj *po;
3805 /* Get a preview of the object in memory */
3806 po = vmPreviewObject(o);
3807 /* Save type, key, value */
3808 if (rdbSaveType(fp,po->type) == -1) goto werr;
3809 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3810 if (rdbSaveObject(fp,po) == -1) goto werr;
3811 /* Remove the loaded object from memory */
3812 decrRefCount(po);
3813 }
3814 }
3815 dictReleaseIterator(di);
3816 }
3817 /* EOF opcode */
3818 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3819
3820 /* Make sure data will not remain on the OS's output buffers */
3821 fflush(fp);
3822 fsync(fileno(fp));
3823 fclose(fp);
3824
3825 /* Use RENAME to make sure the DB file is changed atomically only
3826 * if the generate DB file is ok. */
3827 if (rename(tmpfile,filename) == -1) {
3828 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3829 unlink(tmpfile);
3830 return REDIS_ERR;
3831 }
3832 redisLog(REDIS_NOTICE,"DB saved on disk");
3833 server.dirty = 0;
3834 server.lastsave = time(NULL);
3835 return REDIS_OK;
3836
3837 werr:
3838 fclose(fp);
3839 unlink(tmpfile);
3840 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3841 if (di) dictReleaseIterator(di);
3842 return REDIS_ERR;
3843 }
3844
3845 static int rdbSaveBackground(char *filename) {
3846 pid_t childpid;
3847
3848 if (server.bgsavechildpid != -1) return REDIS_ERR;
3849 if (server.vm_enabled) waitEmptyIOJobsQueue();
3850 if ((childpid = fork()) == 0) {
3851 /* Child */
3852 if (server.vm_enabled) vmReopenSwapFile();
3853 close(server.fd);
3854 if (rdbSave(filename) == REDIS_OK) {
3855 _exit(0);
3856 } else {
3857 _exit(1);
3858 }
3859 } else {
3860 /* Parent */
3861 if (childpid == -1) {
3862 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3863 strerror(errno));
3864 return REDIS_ERR;
3865 }
3866 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3867 server.bgsavechildpid = childpid;
3868 updateDictResizePolicy();
3869 return REDIS_OK;
3870 }
3871 return REDIS_OK; /* unreached */
3872 }
3873
/* Remove the temp-<pid>.rdb file, where <pid> is 'childpid'. The unlink()
 * result is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3880
/* Read a one byte type opcode from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if it can't be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3886
/* Read a time value stored on 'fp' as a 4 byte signed integer (the format
 * written by rdbSaveTime()). Returns the time, or -1 if the four bytes
 * can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    if (fread(&secs,4,1,fp) != 0) return (time_t) secs;
    return -1;
}
3892
3893 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3894 * of this file for a description of how this are stored on disk.
3895 *
3896 * isencoded is set to 1 if the readed length is not actually a length but
3897 * an "encoding type", check the above comments for more info */
3898 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3899 unsigned char buf[2];
3900 uint32_t len;
3901 int type;
3902
3903 if (isencoded) *isencoded = 0;
3904 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3905 type = (buf[0]&0xC0)>>6;
3906 if (type == REDIS_RDB_6BITLEN) {
3907 /* Read a 6 bit len */
3908 return buf[0]&0x3F;
3909 } else if (type == REDIS_RDB_ENCVAL) {
3910 /* Read a 6 bit len encoding type */
3911 if (isencoded) *isencoded = 1;
3912 return buf[0]&0x3F;
3913 } else if (type == REDIS_RDB_14BITLEN) {
3914 /* Read a 14 bit len */
3915 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3916 return ((buf[0]&0x3F)<<8)|buf[1];
3917 } else {
3918 /* Read a 32 bit len */
3919 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3920 return ntohl(len);
3921 }
3922 }
3923
3924 /* Load an integer-encoded object from file 'fp', with the specified
3925 * encoding type 'enctype'. If encode is true the function may return
3926 * an integer-encoded object as reply, otherwise the returned object
3927 * will always be encoded as a raw string. */
3928 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3929 unsigned char enc[4];
3930 long long val;
3931
3932 if (enctype == REDIS_RDB_ENC_INT8) {
3933 if (fread(enc,1,1,fp) == 0) return NULL;
3934 val = (signed char)enc[0];
3935 } else if (enctype == REDIS_RDB_ENC_INT16) {
3936 uint16_t v;
3937 if (fread(enc,2,1,fp) == 0) return NULL;
3938 v = enc[0]|(enc[1]<<8);
3939 val = (int16_t)v;
3940 } else if (enctype == REDIS_RDB_ENC_INT32) {
3941 uint32_t v;
3942 if (fread(enc,4,1,fp) == 0) return NULL;
3943 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3944 val = (int32_t)v;
3945 } else {
3946 val = 0; /* anti-warning */
3947 redisPanic("Unknown RDB integer encoding type");
3948 }
3949 if (encode)
3950 return createStringObjectFromLongLong(val);
3951 else
3952 return createObject(REDIS_STRING,sdsfromlonglong(val));
3953 }
3954
3955 static robj *rdbLoadLzfStringObject(FILE*fp) {
3956 unsigned int len, clen;
3957 unsigned char *c = NULL;
3958 sds val = NULL;
3959
3960 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3961 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3962 if ((c = zmalloc(clen)) == NULL) goto err;
3963 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3964 if (fread(c,clen,1,fp) == 0) goto err;
3965 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3966 zfree(c);
3967 return createObject(REDIS_STRING,val);
3968 err:
3969 zfree(c);
3970 sdsfree(val);
3971 return NULL;
3972 }
3973
3974 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3975 int isencoded;
3976 uint32_t len;
3977 sds val;
3978
3979 len = rdbLoadLen(fp,&isencoded);
3980 if (isencoded) {
3981 switch(len) {
3982 case REDIS_RDB_ENC_INT8:
3983 case REDIS_RDB_ENC_INT16:
3984 case REDIS_RDB_ENC_INT32:
3985 return rdbLoadIntegerObject(fp,len,encode);
3986 case REDIS_RDB_ENC_LZF:
3987 return rdbLoadLzfStringObject(fp);
3988 default:
3989 redisPanic("Unknown RDB encoding type");
3990 }
3991 }
3992
3993 if (len == REDIS_RDB_LENERR) return NULL;
3994 val = sdsnewlen(NULL,len);
3995 if (len && fread(val,len,1,fp) == 0) {
3996 sdsfree(val);
3997 return NULL;
3998 }
3999 return createObject(REDIS_STRING,val);
4000 }
4001
4002 static robj *rdbLoadStringObject(FILE *fp) {
4003 return rdbGenericLoadStringObject(fp,0);
4004 }
4005
4006 static robj *rdbLoadEncodedStringObject(FILE *fp) {
4007 return rdbGenericLoadStringObject(fp,1);
4008 }
4009
4010 /* For information about double serialization check rdbSaveDoubleValue() */
4011 static int rdbLoadDoubleValue(FILE *fp, double *val) {
4012 char buf[128];
4013 unsigned char len;
4014
4015 if (fread(&len,1,1,fp) == 0) return -1;
4016 switch(len) {
4017 case 255: *val = R_NegInf; return 0;
4018 case 254: *val = R_PosInf; return 0;
4019 case 253: *val = R_Nan; return 0;
4020 default:
4021 if (fread(buf,len,1,fp) == 0) return -1;
4022 buf[len] = '\0';
4023 sscanf(buf, "%lg", val);
4024 return 0;
4025 }
4026 }
4027
4028 /* Load a Redis object of the specified type from the specified file.
4029 * On success a newly allocated object is returned, otherwise NULL. */
4030 static robj *rdbLoadObject(int type, FILE *fp) {
4031 robj *o;
4032
4033 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
4034 if (type == REDIS_STRING) {
4035 /* Read string value */
4036 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4037 o = tryObjectEncoding(o);
4038 } else if (type == REDIS_LIST || type == REDIS_SET) {
4039 /* Read list/set value */
4040 uint32_t listlen;
4041
4042 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4043 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
4044 /* It's faster to expand the dict to the right size asap in order
4045 * to avoid rehashing */
4046 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4047 dictExpand(o->ptr,listlen);
4048 /* Load every single element of the list/set */
4049 while(listlen--) {
4050 robj *ele;
4051
4052 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4053 ele = tryObjectEncoding(ele);
4054 if (type == REDIS_LIST) {
4055 listAddNodeTail((list*)o->ptr,ele);
4056 } else {
4057 dictAdd((dict*)o->ptr,ele,NULL);
4058 }
4059 }
4060 } else if (type == REDIS_ZSET) {
4061 /* Read list/set value */
4062 size_t zsetlen;
4063 zset *zs;
4064
4065 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4066 o = createZsetObject();
4067 zs = o->ptr;
4068 /* Load every single element of the list/set */
4069 while(zsetlen--) {
4070 robj *ele;
4071 double *score = zmalloc(sizeof(double));
4072
4073 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4074 ele = tryObjectEncoding(ele);
4075 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4076 dictAdd(zs->dict,ele,score);
4077 zslInsert(zs->zsl,*score,ele);
4078 incrRefCount(ele); /* added to skiplist */
4079 }
4080 } else if (type == REDIS_HASH) {
4081 size_t hashlen;
4082
4083 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4084 o = createHashObject();
4085 /* Too many entries? Use an hash table. */
4086 if (hashlen > server.hash_max_zipmap_entries)
4087 convertToRealHash(o);
4088 /* Load every key/value, then set it into the zipmap or hash
4089 * table, as needed. */
4090 while(hashlen--) {
4091 robj *key, *val;
4092
4093 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4094 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4095 /* If we are using a zipmap and there are too big values
4096 * the object is converted to real hash table encoding. */
4097 if (o->encoding != REDIS_ENCODING_HT &&
4098 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4099 sdslen(val->ptr) > server.hash_max_zipmap_value))
4100 {
4101 convertToRealHash(o);
4102 }
4103
4104 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4105 unsigned char *zm = o->ptr;
4106
4107 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4108 val->ptr,sdslen(val->ptr),NULL);
4109 o->ptr = zm;
4110 decrRefCount(key);
4111 decrRefCount(val);
4112 } else {
4113 key = tryObjectEncoding(key);
4114 val = tryObjectEncoding(val);
4115 dictAdd((dict*)o->ptr,key,val);
4116 }
4117 }
4118 } else {
4119 redisPanic("Unknown object type");
4120 }
4121 return o;
4122 }
4123
4124 static int rdbLoad(char *filename) {
4125 FILE *fp;
4126 uint32_t dbid;
4127 int type, retval, rdbver;
4128 int swap_all_values = 0;
4129 dict *d = server.db[0].dict;
4130 redisDb *db = server.db+0;
4131 char buf[1024];
4132 time_t expiretime, now = time(NULL);
4133 long long loadedkeys = 0;
4134
4135 fp = fopen(filename,"r");
4136 if (!fp) return REDIS_ERR;
4137 if (fread(buf,9,1,fp) == 0) goto eoferr;
4138 buf[9] = '\0';
4139 if (memcmp(buf,"REDIS",5) != 0) {
4140 fclose(fp);
4141 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4142 return REDIS_ERR;
4143 }
4144 rdbver = atoi(buf+5);
4145 if (rdbver != 1) {
4146 fclose(fp);
4147 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4148 return REDIS_ERR;
4149 }
4150 while(1) {
4151 robj *key, *val;
4152
4153 expiretime = -1;
4154 /* Read type. */
4155 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4156 if (type == REDIS_EXPIRETIME) {
4157 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4158 /* We read the time so we need to read the object type again */
4159 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4160 }
4161 if (type == REDIS_EOF) break;
4162 /* Handle SELECT DB opcode as a special case */
4163 if (type == REDIS_SELECTDB) {
4164 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4165 goto eoferr;
4166 if (dbid >= (unsigned)server.dbnum) {
4167 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4168 exit(1);
4169 }
4170 db = server.db+dbid;
4171 d = db->dict;
4172 continue;
4173 }
4174 /* Read key */
4175 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4176 /* Read value */
4177 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4178 /* Check if the key already expired */
4179 if (expiretime != -1 && expiretime < now) {
4180 decrRefCount(key);
4181 decrRefCount(val);
4182 continue;
4183 }
4184 /* Add the new object in the hash table */
4185 retval = dictAdd(d,key,val);
4186 if (retval == DICT_ERR) {
4187 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4188 exit(1);
4189 }
4190 loadedkeys++;
4191 /* Set the expire time if needed */
4192 if (expiretime != -1) setExpire(db,key,expiretime);
4193
4194 /* Handle swapping while loading big datasets when VM is on */
4195
4196 /* If we detecter we are hopeless about fitting something in memory
4197 * we just swap every new key on disk. Directly...
4198 * Note that's important to check for this condition before resorting
4199 * to random sampling, otherwise we may try to swap already
4200 * swapped keys. */
4201 if (swap_all_values) {
4202 dictEntry *de = dictFind(d,key);
4203
4204 /* de may be NULL since the key already expired */
4205 if (de) {
4206 vmpointer *vp;
4207 key = dictGetEntryKey(de);
4208 val = dictGetEntryVal(de);
4209
4210 if (val->refcount == 1 &&
4211 (vp = vmSwapObjectBlocking(val)) != NULL)
4212 dictGetEntryVal(de) = vp;
4213 }
4214 continue;
4215 }
4216
4217 /* If we have still some hope of having some value fitting memory
4218 * then we try random sampling. */
4219 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4220 while (zmalloc_used_memory() > server.vm_max_memory) {
4221 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4222 }
4223 if (zmalloc_used_memory() > server.vm_max_memory)
4224 swap_all_values = 1; /* We are already using too much mem */
4225 }
4226 }
4227 fclose(fp);
4228 return REDIS_OK;
4229
4230 eoferr: /* unexpected end of file is handled here with a fatal exit */
4231 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4232 exit(1);
4233 return REDIS_ERR; /* Just to avoid warning */
4234 }
4235
4236 /*================================== Shutdown =============================== */
4237 static int prepareForShutdown() {
4238 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4239 /* Kill the saving child if there is a background saving in progress.
4240 We want to avoid race conditions, for instance our saving child may
4241 overwrite the synchronous saving did by SHUTDOWN. */
4242 if (server.bgsavechildpid != -1) {
4243 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4244 kill(server.bgsavechildpid,SIGKILL);
4245 rdbRemoveTempFile(server.bgsavechildpid);
4246 }
4247 if (server.appendonly) {
4248 /* Append only file: fsync() the AOF and exit */
4249 aof_fsync(server.appendfd);
4250 if (server.vm_enabled) unlink(server.vm_swap_file);
4251 } else {
4252 /* Snapshotting. Perform a SYNC SAVE and exit */
4253 if (rdbSave(server.dbfilename) == REDIS_OK) {
4254 if (server.daemonize)
4255 unlink(server.pidfile);
4256 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4257 } else {
4258 /* Ooops.. error saving! The best we can do is to continue
4259 * operating. Note that if there was a background saving process,
4260 * in the next cron() Redis will be notified that the background
4261 * saving aborted, handling special stuff like slaves pending for
4262 * synchronization... */
4263 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4264 return REDIS_ERR;
4265 }
4266 }
4267 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4268 return REDIS_OK;
4269 }
4270
4271 /*================================== Commands =============================== */
4272
4273 static void authCommand(redisClient *c) {
4274 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4275 c->authenticated = 1;
4276 addReply(c,shared.ok);
4277 } else {
4278 c->authenticated = 0;
4279 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4280 }
4281 }
4282
4283 static void pingCommand(redisClient *c) {
4284 addReply(c,shared.pong);
4285 }
4286
4287 static void echoCommand(redisClient *c) {
4288 addReplyBulk(c,c->argv[1]);
4289 }
4290
4291 /*=================================== Strings =============================== */
4292
4293 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4294 int retval;
4295 long seconds = 0; /* initialized to avoid an harmness warning */
4296
4297 if (expire) {
4298 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4299 return;
4300 if (seconds <= 0) {
4301 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4302 return;
4303 }
4304 }
4305
4306 touchWatchedKey(c->db,key);
4307 if (nx) deleteIfVolatile(c->db,key);
4308 retval = dictAdd(c->db->dict,key,val);
4309 if (retval == DICT_ERR) {
4310 if (!nx) {
4311 /* If the key is about a swapped value, we want a new key object
4312 * to overwrite the old. So we delete the old key in the database.
4313 * This will also make sure that swap pages about the old object
4314 * will be marked as free. */
4315 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4316 incrRefCount(key);
4317 dictReplace(c->db->dict,key,val);
4318 incrRefCount(val);
4319 } else {
4320 addReply(c,shared.czero);
4321 return;
4322 }
4323 } else {
4324 incrRefCount(key);
4325 incrRefCount(val);
4326 }
4327 server.dirty++;
4328 removeExpire(c->db,key);
4329 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4330 addReply(c, nx ? shared.cone : shared.ok);
4331 }
4332
4333 static void setCommand(redisClient *c) {
4334 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4335 }
4336
4337 static void setnxCommand(redisClient *c) {
4338 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4339 }
4340
4341 static void setexCommand(redisClient *c) {
4342 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4343 }
4344
4345 static int getGenericCommand(redisClient *c) {
4346 robj *o;
4347
4348 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4349 return REDIS_OK;
4350
4351 if (o->type != REDIS_STRING) {
4352 addReply(c,shared.wrongtypeerr);
4353 return REDIS_ERR;
4354 } else {
4355 addReplyBulk(c,o);
4356 return REDIS_OK;
4357 }
4358 }
4359
4360 static void getCommand(redisClient *c) {
4361 getGenericCommand(c);
4362 }
4363
4364 static void getsetCommand(redisClient *c) {
4365 if (getGenericCommand(c) == REDIS_ERR) return;
4366 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4367 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4368 } else {
4369 incrRefCount(c->argv[1]);
4370 }
4371 incrRefCount(c->argv[2]);
4372 server.dirty++;
4373 removeExpire(c->db,c->argv[1]);
4374 }
4375
4376 static void mgetCommand(redisClient *c) {
4377 int j;
4378
4379 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4380 for (j = 1; j < c->argc; j++) {
4381 robj *o = lookupKeyRead(c->db,c->argv[j]);
4382 if (o == NULL) {
4383 addReply(c,shared.nullbulk);
4384 } else {
4385 if (o->type != REDIS_STRING) {
4386 addReply(c,shared.nullbulk);
4387 } else {
4388 addReplyBulk(c,o);
4389 }
4390 }
4391 }
4392 }
4393
4394 static void msetGenericCommand(redisClient *c, int nx) {
4395 int j, busykeys = 0;
4396
4397 if ((c->argc % 2) == 0) {
4398 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4399 return;
4400 }
4401 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4402 * set nothing at all if at least one already key exists. */
4403 if (nx) {
4404 for (j = 1; j < c->argc; j += 2) {
4405 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4406 busykeys++;
4407 }
4408 }
4409 }
4410 if (busykeys) {
4411 addReply(c, shared.czero);
4412 return;
4413 }
4414
4415 for (j = 1; j < c->argc; j += 2) {
4416 int retval;
4417
4418 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4419 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4420 if (retval == DICT_ERR) {
4421 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4422 incrRefCount(c->argv[j+1]);
4423 } else {
4424 incrRefCount(c->argv[j]);
4425 incrRefCount(c->argv[j+1]);
4426 }
4427 removeExpire(c->db,c->argv[j]);
4428 }
4429 server.dirty += (c->argc-1)/2;
4430 addReply(c, nx ? shared.cone : shared.ok);
4431 }
4432
4433 static void msetCommand(redisClient *c) {
4434 msetGenericCommand(c,0);
4435 }
4436
4437 static void msetnxCommand(redisClient *c) {
4438 msetGenericCommand(c,1);
4439 }
4440
4441 static void incrDecrCommand(redisClient *c, long long incr) {
4442 long long value;
4443 int retval;
4444 robj *o;
4445
4446 o = lookupKeyWrite(c->db,c->argv[1]);
4447 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4448 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4449
4450 value += incr;
4451 o = createStringObjectFromLongLong(value);
4452 retval = dictAdd(c->db->dict,c->argv[1],o);
4453 if (retval == DICT_ERR) {
4454 dictReplace(c->db->dict,c->argv[1],o);
4455 removeExpire(c->db,c->argv[1]);
4456 } else {
4457 incrRefCount(c->argv[1]);
4458 }
4459 server.dirty++;
4460 addReply(c,shared.colon);
4461 addReply(c,o);
4462 addReply(c,shared.crlf);
4463 }
4464
4465 static void incrCommand(redisClient *c) {
4466 incrDecrCommand(c,1);
4467 }
4468
4469 static void decrCommand(redisClient *c) {
4470 incrDecrCommand(c,-1);
4471 }
4472
4473 static void incrbyCommand(redisClient *c) {
4474 long long incr;
4475
4476 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4477 incrDecrCommand(c,incr);
4478 }
4479
4480 static void decrbyCommand(redisClient *c) {
4481 long long incr;
4482
4483 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4484 incrDecrCommand(c,-incr);
4485 }
4486
4487 static void appendCommand(redisClient *c) {
4488 int retval;
4489 size_t totlen;
4490 robj *o;
4491
4492 o = lookupKeyWrite(c->db,c->argv[1]);
4493 if (o == NULL) {
4494 /* Create the key */
4495 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4496 incrRefCount(c->argv[1]);
4497 incrRefCount(c->argv[2]);
4498 totlen = stringObjectLen(c->argv[2]);
4499 } else {
4500 dictEntry *de;
4501
4502 de = dictFind(c->db->dict,c->argv[1]);
4503 assert(de != NULL);
4504
4505 o = dictGetEntryVal(de);
4506 if (o->type != REDIS_STRING) {
4507 addReply(c,shared.wrongtypeerr);
4508 return;
4509 }
4510 /* If the object is specially encoded or shared we have to make
4511 * a copy */
4512 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4513 robj *decoded = getDecodedObject(o);
4514
4515 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4516 decrRefCount(decoded);
4517 dictReplace(c->db->dict,c->argv[1],o);
4518 }
4519 /* APPEND! */
4520 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4521 o->ptr = sdscatlen(o->ptr,
4522 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4523 } else {
4524 o->ptr = sdscatprintf(o->ptr, "%ld",
4525 (unsigned long) c->argv[2]->ptr);
4526 }
4527 totlen = sdslen(o->ptr);
4528 }
4529 server.dirty++;
4530 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4531 }
4532
4533 static void substrCommand(redisClient *c) {
4534 robj *o;
4535 long start = atoi(c->argv[2]->ptr);
4536 long end = atoi(c->argv[3]->ptr);
4537 size_t rangelen, strlen;
4538 sds range;
4539
4540 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4541 checkType(c,o,REDIS_STRING)) return;
4542
4543 o = getDecodedObject(o);
4544 strlen = sdslen(o->ptr);
4545
4546 /* convert negative indexes */
4547 if (start < 0) start = strlen+start;
4548 if (end < 0) end = strlen+end;
4549 if (start < 0) start = 0;
4550 if (end < 0) end = 0;
4551
4552 /* indexes sanity checks */
4553 if (start > end || (size_t)start >= strlen) {
4554 /* Out of range start or start > end result in null reply */
4555 addReply(c,shared.nullbulk);
4556 decrRefCount(o);
4557 return;
4558 }
4559 if ((size_t)end >= strlen) end = strlen-1;
4560 rangelen = (end-start)+1;
4561
4562 /* Return the result */
4563 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4564 range = sdsnewlen((char*)o->ptr+start,rangelen);
4565 addReplySds(c,range);
4566 addReply(c,shared.crlf);
4567 decrRefCount(o);
4568 }
4569
4570 /* ========================= Type agnostic commands ========================= */
4571
4572 static void delCommand(redisClient *c) {
4573 int deleted = 0, j;
4574
4575 for (j = 1; j < c->argc; j++) {
4576 if (deleteKey(c->db,c->argv[j])) {
4577 touchWatchedKey(c->db,c->argv[j]);
4578 server.dirty++;
4579 deleted++;
4580 }
4581 }
4582 addReplyLongLong(c,deleted);
4583 }
4584
4585 static void existsCommand(redisClient *c) {
4586 expireIfNeeded(c->db,c->argv[1]);
4587 if (dictFind(c->db->dict,c->argv[1])) {
4588 addReply(c, shared.cone);
4589 } else {
4590 addReply(c, shared.czero);
4591 }
4592 }
4593
4594 static void selectCommand(redisClient *c) {
4595 int id = atoi(c->argv[1]->ptr);
4596
4597 if (selectDb(c,id) == REDIS_ERR) {
4598 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4599 } else {
4600 addReply(c,shared.ok);
4601 }
4602 }
4603
4604 static void randomkeyCommand(redisClient *c) {
4605 dictEntry *de;
4606 robj *key;
4607
4608 while(1) {
4609 de = dictGetRandomKey(c->db->dict);
4610 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4611 }
4612
4613 if (de == NULL) {
4614 addReply(c,shared.nullbulk);
4615 return;
4616 }
4617
4618 key = dictGetEntryKey(de);
4619 if (server.vm_enabled) {
4620 key = dupStringObject(key);
4621 addReplyBulk(c,key);
4622 decrRefCount(key);
4623 } else {
4624 addReplyBulk(c,key);
4625 }
4626 }
4627
4628 static void keysCommand(redisClient *c) {
4629 dictIterator *di;
4630 dictEntry *de;
4631 sds pattern = c->argv[1]->ptr;
4632 int plen = sdslen(pattern);
4633 unsigned long numkeys = 0;
4634 robj *lenobj = createObject(REDIS_STRING,NULL);
4635
4636 di = dictGetIterator(c->db->dict);
4637 addReply(c,lenobj);
4638 decrRefCount(lenobj);
4639 while((de = dictNext(di)) != NULL) {
4640 robj *keyobj = dictGetEntryKey(de);
4641
4642 sds key = keyobj->ptr;
4643 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4644 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4645 if (expireIfNeeded(c->db,keyobj) == 0) {
4646 addReplyBulk(c,keyobj);
4647 numkeys++;
4648 }
4649 }
4650 }
4651 dictReleaseIterator(di);
4652 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4653 }
4654
4655 static void dbsizeCommand(redisClient *c) {
4656 addReplySds(c,
4657 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4658 }
4659
4660 static void lastsaveCommand(redisClient *c) {
4661 addReplySds(c,
4662 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4663 }
4664
4665 static void typeCommand(redisClient *c) {
4666 robj *o;
4667 char *type;
4668
4669 o = lookupKeyRead(c->db,c->argv[1]);
4670 if (o == NULL) {
4671 type = "+none";
4672 } else {
4673 switch(o->type) {
4674 case REDIS_STRING: type = "+string"; break;
4675 case REDIS_LIST: type = "+list"; break;
4676 case REDIS_SET: type = "+set"; break;
4677 case REDIS_ZSET: type = "+zset"; break;
4678 case REDIS_HASH: type = "+hash"; break;
4679 default: type = "+unknown"; break;
4680 }
4681 }
4682 addReplySds(c,sdsnew(type));
4683 addReply(c,shared.crlf);
4684 }
4685
4686 static void saveCommand(redisClient *c) {
4687 if (server.bgsavechildpid != -1) {
4688 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4689 return;
4690 }
4691 if (rdbSave(server.dbfilename) == REDIS_OK) {
4692 addReply(c,shared.ok);
4693 } else {
4694 addReply(c,shared.err);
4695 }
4696 }
4697
4698 static void bgsaveCommand(redisClient *c) {
4699 if (server.bgsavechildpid != -1) {
4700 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4701 return;
4702 }
4703 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4704 char *status = "+Background saving started\r\n";
4705 addReplySds(c,sdsnew(status));
4706 } else {
4707 addReply(c,shared.err);
4708 }
4709 }
4710
4711 static void shutdownCommand(redisClient *c) {
4712 if (prepareForShutdown() == REDIS_OK)
4713 exit(0);
4714 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4715 }
4716
4717 static void renameGenericCommand(redisClient *c, int nx) {
4718 robj *o;
4719
4720 /* To use the same key as src and dst is probably an error */
4721 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4722 addReply(c,shared.sameobjecterr);
4723 return;
4724 }
4725
4726 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4727 return;
4728
4729 incrRefCount(o);
4730 deleteIfVolatile(c->db,c->argv[2]);
4731 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4732 if (nx) {
4733 decrRefCount(o);
4734 addReply(c,shared.czero);
4735 return;
4736 }
4737 dictReplace(c->db->dict,c->argv[2],o);
4738 } else {
4739 incrRefCount(c->argv[2]);
4740 }
4741 deleteKey(c->db,c->argv[1]);
4742 touchWatchedKey(c->db,c->argv[2]);
4743 server.dirty++;
4744 addReply(c,nx ? shared.cone : shared.ok);
4745 }
4746
4747 static void renameCommand(redisClient *c) {
4748 renameGenericCommand(c,0);
4749 }
4750
4751 static void renamenxCommand(redisClient *c) {
4752 renameGenericCommand(c,1);
4753 }
4754
4755 static void moveCommand(redisClient *c) {
4756 robj *o;
4757 redisDb *src, *dst;
4758 int srcid;
4759
4760 /* Obtain source and target DB pointers */
4761 src = c->db;
4762 srcid = c->db->id;
4763 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4764 addReply(c,shared.outofrangeerr);
4765 return;
4766 }
4767 dst = c->db;
4768 selectDb(c,srcid); /* Back to the source DB */
4769
4770 /* If the user is moving using as target the same
4771 * DB as the source DB it is probably an error. */
4772 if (src == dst) {
4773 addReply(c,shared.sameobjecterr);
4774 return;
4775 }
4776
4777 /* Check if the element exists and get a reference */
4778 o = lookupKeyWrite(c->db,c->argv[1]);
4779 if (!o) {
4780 addReply(c,shared.czero);
4781 return;
4782 }
4783
4784 /* Try to add the element to the target DB */
4785 deleteIfVolatile(dst,c->argv[1]);
4786 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4787 addReply(c,shared.czero);
4788 return;
4789 }
4790 incrRefCount(c->argv[1]);
4791 incrRefCount(o);
4792
4793 /* OK! key moved, free the entry in the source DB */
4794 deleteKey(src,c->argv[1]);
4795 server.dirty++;
4796 addReply(c,shared.cone);
4797 }
4798
4799 /* =================================== Lists ================================ */
4800 static void pushGenericCommand(redisClient *c, int where) {
4801 robj *lobj;
4802 list *list;
4803
4804 lobj = lookupKeyWrite(c->db,c->argv[1]);
4805 if (lobj == NULL) {
4806 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4807 addReply(c,shared.cone);
4808 return;
4809 }
4810 lobj = createListObject();
4811 list = lobj->ptr;
4812 if (where == REDIS_HEAD) {
4813 listAddNodeHead(list,c->argv[2]);
4814 } else {
4815 listAddNodeTail(list,c->argv[2]);
4816 }
4817 dictAdd(c->db->dict,c->argv[1],lobj);
4818 incrRefCount(c->argv[1]);
4819 incrRefCount(c->argv[2]);
4820 } else {
4821 if (lobj->type != REDIS_LIST) {
4822 addReply(c,shared.wrongtypeerr);
4823 return;
4824 }
4825 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4826 addReply(c,shared.cone);
4827 return;
4828 }
4829 list = lobj->ptr;
4830 if (where == REDIS_HEAD) {
4831 listAddNodeHead(list,c->argv[2]);
4832 } else {
4833 listAddNodeTail(list,c->argv[2]);
4834 }
4835 incrRefCount(c->argv[2]);
4836 }
4837 server.dirty++;
4838 addReplyLongLong(c,listLength(list));
4839 }
4840
4841 static void lpushCommand(redisClient *c) {
4842 pushGenericCommand(c,REDIS_HEAD);
4843 }
4844
4845 static void rpushCommand(redisClient *c) {
4846 pushGenericCommand(c,REDIS_TAIL);
4847 }
4848
4849 static void llenCommand(redisClient *c) {
4850 robj *o;
4851 list *l;
4852
4853 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4854 checkType(c,o,REDIS_LIST)) return;
4855
4856 l = o->ptr;
4857 addReplyUlong(c,listLength(l));
4858 }
4859
4860 static void lindexCommand(redisClient *c) {
4861 robj *o;
4862 int index = atoi(c->argv[2]->ptr);
4863 list *list;
4864 listNode *ln;
4865
4866 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4867 checkType(c,o,REDIS_LIST)) return;
4868 list = o->ptr;
4869
4870 ln = listIndex(list, index);
4871 if (ln == NULL) {
4872 addReply(c,shared.nullbulk);
4873 } else {
4874 robj *ele = listNodeValue(ln);
4875 addReplyBulk(c,ele);
4876 }
4877 }
4878
4879 static void lsetCommand(redisClient *c) {
4880 robj *o;
4881 int index = atoi(c->argv[2]->ptr);
4882 list *list;
4883 listNode *ln;
4884
4885 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4886 checkType(c,o,REDIS_LIST)) return;
4887 list = o->ptr;
4888
4889 ln = listIndex(list, index);
4890 if (ln == NULL) {
4891 addReply(c,shared.outofrangeerr);
4892 } else {
4893 robj *ele = listNodeValue(ln);
4894
4895 decrRefCount(ele);
4896 listNodeValue(ln) = c->argv[3];
4897 incrRefCount(c->argv[3]);
4898 addReply(c,shared.ok);
4899 server.dirty++;
4900 }
4901 }
4902
4903 static void popGenericCommand(redisClient *c, int where) {
4904 robj *o;
4905 list *list;
4906 listNode *ln;
4907
4908 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4909 checkType(c,o,REDIS_LIST)) return;
4910 list = o->ptr;
4911
4912 if (where == REDIS_HEAD)
4913 ln = listFirst(list);
4914 else
4915 ln = listLast(list);
4916
4917 if (ln == NULL) {
4918 addReply(c,shared.nullbulk);
4919 } else {
4920 robj *ele = listNodeValue(ln);
4921 addReplyBulk(c,ele);
4922 listDelNode(list,ln);
4923 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4924 server.dirty++;
4925 }
4926 }
4927
4928 static void lpopCommand(redisClient *c) {
4929 popGenericCommand(c,REDIS_HEAD);
4930 }
4931
4932 static void rpopCommand(redisClient *c) {
4933 popGenericCommand(c,REDIS_TAIL);
4934 }
4935
4936 static void lrangeCommand(redisClient *c) {
4937 robj *o;
4938 int start = atoi(c->argv[2]->ptr);
4939 int end = atoi(c->argv[3]->ptr);
4940 int llen;
4941 int rangelen, j;
4942 list *list;
4943 listNode *ln;
4944 robj *ele;
4945
4946 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4947 || checkType(c,o,REDIS_LIST)) return;
4948 list = o->ptr;
4949 llen = listLength(list);
4950
4951 /* convert negative indexes */
4952 if (start < 0) start = llen+start;
4953 if (end < 0) end = llen+end;
4954 if (start < 0) start = 0;
4955 if (end < 0) end = 0;
4956
4957 /* indexes sanity checks */
4958 if (start > end || start >= llen) {
4959 /* Out of range start or start > end result in empty list */
4960 addReply(c,shared.emptymultibulk);
4961 return;
4962 }
4963 if (end >= llen) end = llen-1;
4964 rangelen = (end-start)+1;
4965
4966 /* Return the result in form of a multi-bulk reply */
4967 ln = listIndex(list, start);
4968 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4969 for (j = 0; j < rangelen; j++) {
4970 ele = listNodeValue(ln);
4971 addReplyBulk(c,ele);
4972 ln = ln->next;
4973 }
4974 }
4975
4976 static void ltrimCommand(redisClient *c) {
4977 robj *o;
4978 int start = atoi(c->argv[2]->ptr);
4979 int end = atoi(c->argv[3]->ptr);
4980 int llen;
4981 int j, ltrim, rtrim;
4982 list *list;
4983 listNode *ln;
4984
4985 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4986 checkType(c,o,REDIS_LIST)) return;
4987 list = o->ptr;
4988 llen = listLength(list);
4989
4990 /* convert negative indexes */
4991 if (start < 0) start = llen+start;
4992 if (end < 0) end = llen+end;
4993 if (start < 0) start = 0;
4994 if (end < 0) end = 0;
4995
4996 /* indexes sanity checks */
4997 if (start > end || start >= llen) {
4998 /* Out of range start or start > end result in empty list */
4999 ltrim = llen;
5000 rtrim = 0;
5001 } else {
5002 if (end >= llen) end = llen-1;
5003 ltrim = start;
5004 rtrim = llen-end-1;
5005 }
5006
5007 /* Remove list elements to perform the trim */
5008 for (j = 0; j < ltrim; j++) {
5009 ln = listFirst(list);
5010 listDelNode(list,ln);
5011 }
5012 for (j = 0; j < rtrim; j++) {
5013 ln = listLast(list);
5014 listDelNode(list,ln);
5015 }
5016 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
5017 server.dirty++;
5018 addReply(c,shared.ok);
5019 }
5020
5021 static void lremCommand(redisClient *c) {
5022 robj *o;
5023 list *list;
5024 listNode *ln, *next;
5025 int toremove = atoi(c->argv[2]->ptr);
5026 int removed = 0;
5027 int fromtail = 0;
5028
5029 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5030 checkType(c,o,REDIS_LIST)) return;
5031 list = o->ptr;
5032
5033 if (toremove < 0) {
5034 toremove = -toremove;
5035 fromtail = 1;
5036 }
5037 ln = fromtail ? list->tail : list->head;
5038 while (ln) {
5039 robj *ele = listNodeValue(ln);
5040
5041 next = fromtail ? ln->prev : ln->next;
5042 if (equalStringObjects(ele,c->argv[3])) {
5043 listDelNode(list,ln);
5044 server.dirty++;
5045 removed++;
5046 if (toremove && removed == toremove) break;
5047 }
5048 ln = next;
5049 }
5050 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
5051 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
5052 }
5053
5054 /* This is the semantic of this command:
5055 * RPOPLPUSH srclist dstlist:
5056 * IF LLEN(srclist) > 0
5057 * element = RPOP srclist
5058 * LPUSH dstlist element
5059 * RETURN element
5060 * ELSE
5061 * RETURN nil
5062 * END
5063 * END
5064 *
5065 * The idea is to be able to get an element from a list in a reliable way
5066 * since the element is not just returned but pushed against another list
5067 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5068 */
5069 static void rpoplpushcommand(redisClient *c) {
5070 robj *sobj;
5071 list *srclist;
5072 listNode *ln;
5073
5074 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5075 checkType(c,sobj,REDIS_LIST)) return;
5076 srclist = sobj->ptr;
5077 ln = listLast(srclist);
5078
5079 if (ln == NULL) {
5080 addReply(c,shared.nullbulk);
5081 } else {
5082 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5083 robj *ele = listNodeValue(ln);
5084 list *dstlist;
5085
5086 if (dobj && dobj->type != REDIS_LIST) {
5087 addReply(c,shared.wrongtypeerr);
5088 return;
5089 }
5090
5091 /* Add the element to the target list (unless it's directly
5092 * passed to some BLPOP-ing client */
5093 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5094 if (dobj == NULL) {
5095 /* Create the list if the key does not exist */
5096 dobj = createListObject();
5097 dictAdd(c->db->dict,c->argv[2],dobj);
5098 incrRefCount(c->argv[2]);
5099 }
5100 dstlist = dobj->ptr;
5101 listAddNodeHead(dstlist,ele);
5102 incrRefCount(ele);
5103 }
5104
5105 /* Send the element to the client as reply as well */
5106 addReplyBulk(c,ele);
5107
5108 /* Finally remove the element from the source list */
5109 listDelNode(srclist,ln);
5110 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5111 server.dirty++;
5112 }
5113 }
5114
5115 /* ==================================== Sets ================================ */
5116
5117 static void saddCommand(redisClient *c) {
5118 robj *set;
5119
5120 set = lookupKeyWrite(c->db,c->argv[1]);
5121 if (set == NULL) {
5122 set = createSetObject();
5123 dictAdd(c->db->dict,c->argv[1],set);
5124 incrRefCount(c->argv[1]);
5125 } else {
5126 if (set->type != REDIS_SET) {
5127 addReply(c,shared.wrongtypeerr);
5128 return;
5129 }
5130 }
5131 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5132 incrRefCount(c->argv[2]);
5133 server.dirty++;
5134 addReply(c,shared.cone);
5135 } else {
5136 addReply(c,shared.czero);
5137 }
5138 }
5139
5140 static void sremCommand(redisClient *c) {
5141 robj *set;
5142
5143 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5144 checkType(c,set,REDIS_SET)) return;
5145
5146 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5147 server.dirty++;
5148 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5149 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5150 addReply(c,shared.cone);
5151 } else {
5152 addReply(c,shared.czero);
5153 }
5154 }
5155
5156 static void smoveCommand(redisClient *c) {
5157 robj *srcset, *dstset;
5158
5159 srcset = lookupKeyWrite(c->db,c->argv[1]);
5160 dstset = lookupKeyWrite(c->db,c->argv[2]);
5161
5162 /* If the source key does not exist return 0, if it's of the wrong type
5163 * raise an error */
5164 if (srcset == NULL || srcset->type != REDIS_SET) {
5165 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5166 return;
5167 }
5168 /* Error if the destination key is not a set as well */
5169 if (dstset && dstset->type != REDIS_SET) {
5170 addReply(c,shared.wrongtypeerr);
5171 return;
5172 }
5173 /* Remove the element from the source set */
5174 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5175 /* Key not found in the src set! return zero */
5176 addReply(c,shared.czero);
5177 return;
5178 }
5179 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5180 deleteKey(c->db,c->argv[1]);
5181 server.dirty++;
5182 /* Add the element to the destination set */
5183 if (!dstset) {
5184 dstset = createSetObject();
5185 dictAdd(c->db->dict,c->argv[2],dstset);
5186 incrRefCount(c->argv[2]);
5187 }
5188 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5189 incrRefCount(c->argv[3]);
5190 addReply(c,shared.cone);
5191 }
5192
5193 static void sismemberCommand(redisClient *c) {
5194 robj *set;
5195
5196 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5197 checkType(c,set,REDIS_SET)) return;
5198
5199 if (dictFind(set->ptr,c->argv[2]))
5200 addReply(c,shared.cone);
5201 else
5202 addReply(c,shared.czero);
5203 }
5204
5205 static void scardCommand(redisClient *c) {
5206 robj *o;
5207 dict *s;
5208
5209 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5210 checkType(c,o,REDIS_SET)) return;
5211
5212 s = o->ptr;
5213 addReplyUlong(c,dictSize(s));
5214 }
5215
5216 static void spopCommand(redisClient *c) {
5217 robj *set;
5218 dictEntry *de;
5219
5220 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5221 checkType(c,set,REDIS_SET)) return;
5222
5223 de = dictGetRandomKey(set->ptr);
5224 if (de == NULL) {
5225 addReply(c,shared.nullbulk);
5226 } else {
5227 robj *ele = dictGetEntryKey(de);
5228
5229 addReplyBulk(c,ele);
5230 dictDelete(set->ptr,ele);
5231 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5232 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5233 server.dirty++;
5234 }
5235 }
5236
5237 static void srandmemberCommand(redisClient *c) {
5238 robj *set;
5239 dictEntry *de;
5240
5241 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5242 checkType(c,set,REDIS_SET)) return;
5243
5244 de = dictGetRandomKey(set->ptr);
5245 if (de == NULL) {
5246 addReply(c,shared.nullbulk);
5247 } else {
5248 robj *ele = dictGetEntryKey(de);
5249
5250 addReplyBulk(c,ele);
5251 }
5252 }
5253
5254 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5255 dict **d1 = (void*) s1, **d2 = (void*) s2;
5256
5257 return dictSize(*d1)-dictSize(*d2);
5258 }
5259
5260 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5261 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5262 dictIterator *di;
5263 dictEntry *de;
5264 robj *lenobj = NULL, *dstset = NULL;
5265 unsigned long j, cardinality = 0;
5266
5267 for (j = 0; j < setsnum; j++) {
5268 robj *setobj;
5269
5270 setobj = dstkey ?
5271 lookupKeyWrite(c->db,setskeys[j]) :
5272 lookupKeyRead(c->db,setskeys[j]);
5273 if (!setobj) {
5274 zfree(dv);
5275 if (dstkey) {
5276 if (deleteKey(c->db,dstkey))
5277 server.dirty++;
5278 addReply(c,shared.czero);
5279 } else {
5280 addReply(c,shared.emptymultibulk);
5281 }
5282 return;
5283 }
5284 if (setobj->type != REDIS_SET) {
5285 zfree(dv);
5286 addReply(c,shared.wrongtypeerr);
5287 return;
5288 }
5289 dv[j] = setobj->ptr;
5290 }
5291 /* Sort sets from the smallest to largest, this will improve our
5292 * algorithm's performace */
5293 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5294
5295 /* The first thing we should output is the total number of elements...
5296 * since this is a multi-bulk write, but at this stage we don't know
5297 * the intersection set size, so we use a trick, append an empty object
5298 * to the output list and save the pointer to later modify it with the
5299 * right length */
5300 if (!dstkey) {
5301 lenobj = createObject(REDIS_STRING,NULL);
5302 addReply(c,lenobj);
5303 decrRefCount(lenobj);
5304 } else {
5305 /* If we have a target key where to store the resulting set
5306 * create this key with an empty set inside */
5307 dstset = createSetObject();
5308 }
5309
5310 /* Iterate all the elements of the first (smallest) set, and test
5311 * the element against all the other sets, if at least one set does
5312 * not include the element it is discarded */
5313 di = dictGetIterator(dv[0]);
5314
5315 while((de = dictNext(di)) != NULL) {
5316 robj *ele;
5317
5318 for (j = 1; j < setsnum; j++)
5319 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5320 if (j != setsnum)
5321 continue; /* at least one set does not contain the member */
5322 ele = dictGetEntryKey(de);
5323 if (!dstkey) {
5324 addReplyBulk(c,ele);
5325 cardinality++;
5326 } else {
5327 dictAdd(dstset->ptr,ele,NULL);
5328 incrRefCount(ele);
5329 }
5330 }
5331 dictReleaseIterator(di);
5332
5333 if (dstkey) {
5334 /* Store the resulting set into the target, if the intersection
5335 * is not an empty set. */
5336 deleteKey(c->db,dstkey);
5337 if (dictSize((dict*)dstset->ptr) > 0) {
5338 dictAdd(c->db->dict,dstkey,dstset);
5339 incrRefCount(dstkey);
5340 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5341 } else {
5342 decrRefCount(dstset);
5343 addReply(c,shared.czero);
5344 }
5345 server.dirty++;
5346 } else {
5347 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5348 }
5349 zfree(dv);
5350 }
5351
5352 static void sinterCommand(redisClient *c) {
5353 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5354 }
5355
5356 static void sinterstoreCommand(redisClient *c) {
5357 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5358 }
5359
5360 #define REDIS_OP_UNION 0
5361 #define REDIS_OP_DIFF 1
5362 #define REDIS_OP_INTER 2
5363
5364 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5365 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5366 dictIterator *di;
5367 dictEntry *de;
5368 robj *dstset = NULL;
5369 int j, cardinality = 0;
5370
5371 for (j = 0; j < setsnum; j++) {
5372 robj *setobj;
5373
5374 setobj = dstkey ?
5375 lookupKeyWrite(c->db,setskeys[j]) :
5376 lookupKeyRead(c->db,setskeys[j]);
5377 if (!setobj) {
5378 dv[j] = NULL;
5379 continue;
5380 }
5381 if (setobj->type != REDIS_SET) {
5382 zfree(dv);
5383 addReply(c,shared.wrongtypeerr);
5384 return;
5385 }
5386 dv[j] = setobj->ptr;
5387 }
5388
5389 /* We need a temp set object to store our union. If the dstkey
5390 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5391 * this set object will be the resulting object to set into the target key*/
5392 dstset = createSetObject();
5393
5394 /* Iterate all the elements of all the sets, add every element a single
5395 * time to the result set */
5396 for (j = 0; j < setsnum; j++) {
5397 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5398 if (!dv[j]) continue; /* non existing keys are like empty sets */
5399
5400 di = dictGetIterator(dv[j]);
5401
5402 while((de = dictNext(di)) != NULL) {
5403 robj *ele;
5404
5405 /* dictAdd will not add the same element multiple times */
5406 ele = dictGetEntryKey(de);
5407 if (op == REDIS_OP_UNION || j == 0) {
5408 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5409 incrRefCount(ele);
5410 cardinality++;
5411 }
5412 } else if (op == REDIS_OP_DIFF) {
5413 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5414 cardinality--;
5415 }
5416 }
5417 }
5418 dictReleaseIterator(di);
5419
5420 /* result set is empty? Exit asap. */
5421 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5422 }
5423
5424 /* Output the content of the resulting set, if not in STORE mode */
5425 if (!dstkey) {
5426 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5427 di = dictGetIterator(dstset->ptr);
5428 while((de = dictNext(di)) != NULL) {
5429 robj *ele;
5430
5431 ele = dictGetEntryKey(de);
5432 addReplyBulk(c,ele);
5433 }
5434 dictReleaseIterator(di);
5435 decrRefCount(dstset);
5436 } else {
5437 /* If we have a target key where to store the resulting set
5438 * create this key with the result set inside */
5439 deleteKey(c->db,dstkey);
5440 if (dictSize((dict*)dstset->ptr) > 0) {
5441 dictAdd(c->db->dict,dstkey,dstset);
5442 incrRefCount(dstkey);
5443 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5444 } else {
5445 decrRefCount(dstset);
5446 addReply(c,shared.czero);
5447 }
5448 server.dirty++;
5449 }
5450 zfree(dv);
5451 }
5452
5453 static void sunionCommand(redisClient *c) {
5454 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5455 }
5456
5457 static void sunionstoreCommand(redisClient *c) {
5458 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5459 }
5460
5461 static void sdiffCommand(redisClient *c) {
5462 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5463 }
5464
5465 static void sdiffstoreCommand(redisClient *c) {
5466 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5467 }
5468
5469 /* ==================================== ZSets =============================== */
5470
5471 /* ZSETs are ordered sets using two data structures to hold the same elements
5472 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5473 * data structure.
5474 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5478
5479 /* This skiplist implementation is almost a C translation of the original
5480 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5481 * Alternative to Balanced Trees", modified in three ways:
5482 * a) this implementation allows for repeated values.
5483 * b) the comparison is not just by key (our 'score') but by satellite data.
5484 * c) there is a back pointer, so it's a doubly linked list with the back
5485 * pointers being only at "level 1". This allows to traverse the list
5486 * from tail to head, useful for ZREVRANGE. */
5487
5488 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5489 zskiplistNode *zn = zmalloc(sizeof(*zn));
5490
5491 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5492 if (level > 1)
5493 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5494 else
5495 zn->span = NULL;
5496 zn->score = score;
5497 zn->obj = obj;
5498 return zn;
5499 }
5500
5501 static zskiplist *zslCreate(void) {
5502 int j;
5503 zskiplist *zsl;
5504
5505 zsl = zmalloc(sizeof(*zsl));
5506 zsl->level = 1;
5507 zsl->length = 0;
5508 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5509 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5510 zsl->header->forward[j] = NULL;
5511
5512 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5513 if (j < ZSKIPLIST_MAXLEVEL-1)
5514 zsl->header->span[j] = 0;
5515 }
5516 zsl->header->backward = NULL;
5517 zsl->tail = NULL;
5518 return zsl;
5519 }
5520
5521 static void zslFreeNode(zskiplistNode *node) {
5522 decrRefCount(node->obj);
5523 zfree(node->forward);
5524 zfree(node->span);
5525 zfree(node);
5526 }
5527
5528 static void zslFree(zskiplist *zsl) {
5529 zskiplistNode *node = zsl->header->forward[0], *next;
5530
5531 zfree(zsl->header->forward);
5532 zfree(zsl->header->span);
5533 zfree(zsl->header);
5534 while(node) {
5535 next = node->forward[0];
5536 zslFreeNode(node);
5537 node = next;
5538 }
5539 zfree(zsl);
5540 }
5541
5542 static int zslRandomLevel(void) {
5543 int level = 1;
5544 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5545 level += 1;
5546 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5547 }
5548
5549 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5550 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5551 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5552 int i, level;
5553
5554 x = zsl->header;
5555 for (i = zsl->level-1; i >= 0; i--) {
5556 /* store rank that is crossed to reach the insert position */
5557 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5558
5559 while (x->forward[i] &&
5560 (x->forward[i]->score < score ||
5561 (x->forward[i]->score == score &&
5562 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5563 rank[i] += i > 0 ? x->span[i-1] : 1;
5564 x = x->forward[i];
5565 }
5566 update[i] = x;
5567 }
5568 /* we assume the key is not already inside, since we allow duplicated
5569 * scores, and the re-insertion of score and redis object should never
5570 * happpen since the caller of zslInsert() should test in the hash table
5571 * if the element is already inside or not. */
5572 level = zslRandomLevel();
5573 if (level > zsl->level) {
5574 for (i = zsl->level; i < level; i++) {
5575 rank[i] = 0;
5576 update[i] = zsl->header;
5577 update[i]->span[i-1] = zsl->length;
5578 }
5579 zsl->level = level;
5580 }
5581 x = zslCreateNode(level,score,obj);
5582 for (i = 0; i < level; i++) {
5583 x->forward[i] = update[i]->forward[i];
5584 update[i]->forward[i] = x;
5585
5586 /* update span covered by update[i] as x is inserted here */
5587 if (i > 0) {
5588 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5589 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5590 }
5591 }
5592
5593 /* increment span for untouched levels */
5594 for (i = level; i < zsl->level; i++) {
5595 update[i]->span[i-1]++;
5596 }
5597
5598 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5599 if (x->forward[0])
5600 x->forward[0]->backward = x;
5601 else
5602 zsl->tail = x;
5603 zsl->length++;
5604 }
5605
5606 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5607 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5608 int i;
5609 for (i = 0; i < zsl->level; i++) {
5610 if (update[i]->forward[i] == x) {
5611 if (i > 0) {
5612 update[i]->span[i-1] += x->span[i-1] - 1;
5613 }
5614 update[i]->forward[i] = x->forward[i];
5615 } else {
5616 /* invariant: i > 0, because update[0]->forward[0]
5617 * is always equal to x */
5618 update[i]->span[i-1] -= 1;
5619 }
5620 }
5621 if (x->forward[0]) {
5622 x->forward[0]->backward = x->backward;
5623 } else {
5624 zsl->tail = x->backward;
5625 }
5626 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5627 zsl->level--;
5628 zsl->length--;
5629 }
5630
5631 /* Delete an element with matching score/object from the skiplist. */
5632 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5633 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5634 int i;
5635
5636 x = zsl->header;
5637 for (i = zsl->level-1; i >= 0; i--) {
5638 while (x->forward[i] &&
5639 (x->forward[i]->score < score ||
5640 (x->forward[i]->score == score &&
5641 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5642 x = x->forward[i];
5643 update[i] = x;
5644 }
5645 /* We may have multiple elements with the same score, what we need
5646 * is to find the element with both the right score and object. */
5647 x = x->forward[0];
5648 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5649 zslDeleteNode(zsl, x, update);
5650 zslFreeNode(x);
5651 return 1;
5652 } else {
5653 return 0; /* not found */
5654 }
5655 return 0; /* not found */
5656 }
5657
5658 /* Delete all the elements with score between min and max from the skiplist.
5659 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5660 * Note that this function takes the reference to the hash table view of the
5661 * sorted set, in order to remove the elements from the hash table too. */
5662 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5663 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5664 unsigned long removed = 0;
5665 int i;
5666
5667 x = zsl->header;
5668 for (i = zsl->level-1; i >= 0; i--) {
5669 while (x->forward[i] && x->forward[i]->score < min)
5670 x = x->forward[i];
5671 update[i] = x;
5672 }
5673 /* We may have multiple elements with the same score, what we need
5674 * is to find the element with both the right score and object. */
5675 x = x->forward[0];
5676 while (x && x->score <= max) {
5677 zskiplistNode *next = x->forward[0];
5678 zslDeleteNode(zsl, x, update);
5679 dictDelete(dict,x->obj);
5680 zslFreeNode(x);
5681 removed++;
5682 x = next;
5683 }
5684 return removed; /* not found */
5685 }
5686
5687 /* Delete all the elements with rank between start and end from the skiplist.
5688 * Start and end are inclusive. Note that start and end need to be 1-based */
5689 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5690 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5691 unsigned long traversed = 0, removed = 0;
5692 int i;
5693
5694 x = zsl->header;
5695 for (i = zsl->level-1; i >= 0; i--) {
5696 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5697 traversed += i > 0 ? x->span[i-1] : 1;
5698 x = x->forward[i];
5699 }
5700 update[i] = x;
5701 }
5702
5703 traversed++;
5704 x = x->forward[0];
5705 while (x && traversed <= end) {
5706 zskiplistNode *next = x->forward[0];
5707 zslDeleteNode(zsl, x, update);
5708 dictDelete(dict,x->obj);
5709 zslFreeNode(x);
5710 removed++;
5711 traversed++;
5712 x = next;
5713 }
5714 return removed;
5715 }
5716
5717 /* Find the first node having a score equal or greater than the specified one.
5718 * Returns NULL if there is no match. */
5719 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5720 zskiplistNode *x;
5721 int i;
5722
5723 x = zsl->header;
5724 for (i = zsl->level-1; i >= 0; i--) {
5725 while (x->forward[i] && x->forward[i]->score < score)
5726 x = x->forward[i];
5727 }
5728 /* We may have multiple elements with the same score, what we need
5729 * is to find the element with both the right score and object. */
5730 return x->forward[0];
5731 }
5732
5733 /* Find the rank for an element by both score and key.
5734 * Returns 0 when the element cannot be found, rank otherwise.
5735 * Note that the rank is 1-based due to the span of zsl->header to the
5736 * first element. */
5737 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5738 zskiplistNode *x;
5739 unsigned long rank = 0;
5740 int i;
5741
5742 x = zsl->header;
5743 for (i = zsl->level-1; i >= 0; i--) {
5744 while (x->forward[i] &&
5745 (x->forward[i]->score < score ||
5746 (x->forward[i]->score == score &&
5747 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5748 rank += i > 0 ? x->span[i-1] : 1;
5749 x = x->forward[i];
5750 }
5751
5752 /* x might be equal to zsl->header, so test if obj is non-NULL */
5753 if (x->obj && equalStringObjects(x->obj,o)) {
5754 return rank;
5755 }
5756 }
5757 return 0;
5758 }
5759
5760 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5761 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5762 zskiplistNode *x;
5763 unsigned long traversed = 0;
5764 int i;
5765
5766 x = zsl->header;
5767 for (i = zsl->level-1; i >= 0; i--) {
5768 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5769 {
5770 traversed += i > 0 ? x->span[i-1] : 1;
5771 x = x->forward[i];
5772 }
5773 if (traversed == rank) {
5774 return x;
5775 }
5776 }
5777 return NULL;
5778 }
5779
5780 /* The actual Z-commands implementations */
5781
5782 /* This generic command implements both ZADD and ZINCRBY.
5783 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5784 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5785 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5786 robj *zsetobj;
5787 zset *zs;
5788 double *score;
5789
5790 if (isnan(scoreval)) {
5791 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5792 return;
5793 }
5794
5795 zsetobj = lookupKeyWrite(c->db,key);
5796 if (zsetobj == NULL) {
5797 zsetobj = createZsetObject();
5798 dictAdd(c->db->dict,key,zsetobj);
5799 incrRefCount(key);
5800 } else {
5801 if (zsetobj->type != REDIS_ZSET) {
5802 addReply(c,shared.wrongtypeerr);
5803 return;
5804 }
5805 }
5806 zs = zsetobj->ptr;
5807
5808 /* Ok now since we implement both ZADD and ZINCRBY here the code
5809 * needs to handle the two different conditions. It's all about setting
5810 * '*score', that is, the new score to set, to the right value. */
5811 score = zmalloc(sizeof(double));
5812 if (doincrement) {
5813 dictEntry *de;
5814
5815 /* Read the old score. If the element was not present starts from 0 */
5816 de = dictFind(zs->dict,ele);
5817 if (de) {
5818 double *oldscore = dictGetEntryVal(de);
5819 *score = *oldscore + scoreval;
5820 } else {
5821 *score = scoreval;
5822 }
5823 if (isnan(*score)) {
5824 addReplySds(c,
5825 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5826 zfree(score);
5827 /* Note that we don't need to check if the zset may be empty and
5828 * should be removed here, as we can only obtain Nan as score if
5829 * there was already an element in the sorted set. */
5830 return;
5831 }
5832 } else {
5833 *score = scoreval;
5834 }
5835
5836 /* What follows is a simple remove and re-insert operation that is common
5837 * to both ZADD and ZINCRBY... */
5838 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5839 /* case 1: New element */
5840 incrRefCount(ele); /* added to hash */
5841 zslInsert(zs->zsl,*score,ele);
5842 incrRefCount(ele); /* added to skiplist */
5843 server.dirty++;
5844 if (doincrement)
5845 addReplyDouble(c,*score);
5846 else
5847 addReply(c,shared.cone);
5848 } else {
5849 dictEntry *de;
5850 double *oldscore;
5851
5852 /* case 2: Score update operation */
5853 de = dictFind(zs->dict,ele);
5854 redisAssert(de != NULL);
5855 oldscore = dictGetEntryVal(de);
5856 if (*score != *oldscore) {
5857 int deleted;
5858
5859 /* Remove and insert the element in the skip list with new score */
5860 deleted = zslDelete(zs->zsl,*oldscore,ele);
5861 redisAssert(deleted != 0);
5862 zslInsert(zs->zsl,*score,ele);
5863 incrRefCount(ele);
5864 /* Update the score in the hash table */
5865 dictReplace(zs->dict,ele,score);
5866 server.dirty++;
5867 } else {
5868 zfree(score);
5869 }
5870 if (doincrement)
5871 addReplyDouble(c,*score);
5872 else
5873 addReply(c,shared.czero);
5874 }
5875 }
5876
5877 static void zaddCommand(redisClient *c) {
5878 double scoreval;
5879
5880 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5881 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5882 }
5883
5884 static void zincrbyCommand(redisClient *c) {
5885 double scoreval;
5886
5887 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5888 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5889 }
5890
5891 static void zremCommand(redisClient *c) {
5892 robj *zsetobj;
5893 zset *zs;
5894 dictEntry *de;
5895 double *oldscore;
5896 int deleted;
5897
5898 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5899 checkType(c,zsetobj,REDIS_ZSET)) return;
5900
5901 zs = zsetobj->ptr;
5902 de = dictFind(zs->dict,c->argv[2]);
5903 if (de == NULL) {
5904 addReply(c,shared.czero);
5905 return;
5906 }
5907 /* Delete from the skiplist */
5908 oldscore = dictGetEntryVal(de);
5909 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5910 redisAssert(deleted != 0);
5911
5912 /* Delete from the hash table */
5913 dictDelete(zs->dict,c->argv[2]);
5914 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5915 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5916 server.dirty++;
5917 addReply(c,shared.cone);
5918 }
5919
5920 static void zremrangebyscoreCommand(redisClient *c) {
5921 double min;
5922 double max;
5923 long deleted;
5924 robj *zsetobj;
5925 zset *zs;
5926
5927 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5928 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5929
5930 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5931 checkType(c,zsetobj,REDIS_ZSET)) return;
5932
5933 zs = zsetobj->ptr;
5934 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5935 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5936 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5937 server.dirty += deleted;
5938 addReplyLongLong(c,deleted);
5939 }
5940
5941 static void zremrangebyrankCommand(redisClient *c) {
5942 long start;
5943 long end;
5944 int llen;
5945 long deleted;
5946 robj *zsetobj;
5947 zset *zs;
5948
5949 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5950 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5951
5952 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5953 checkType(c,zsetobj,REDIS_ZSET)) return;
5954 zs = zsetobj->ptr;
5955 llen = zs->zsl->length;
5956
5957 /* convert negative indexes */
5958 if (start < 0) start = llen+start;
5959 if (end < 0) end = llen+end;
5960 if (start < 0) start = 0;
5961 if (end < 0) end = 0;
5962
5963 /* indexes sanity checks */
5964 if (start > end || start >= llen) {
5965 addReply(c,shared.czero);
5966 return;
5967 }
5968 if (end >= llen) end = llen-1;
5969
5970 /* increment start and end because zsl*Rank functions
5971 * use 1-based rank */
5972 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5973 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5974 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5975 server.dirty += deleted;
5976 addReplyLongLong(c, deleted);
5977 }
5978
5979 typedef struct {
5980 dict *dict;
5981 double weight;
5982 } zsetopsrc;
5983
5984 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5985 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5986 unsigned long size1, size2;
5987 size1 = d1->dict ? dictSize(d1->dict) : 0;
5988 size2 = d2->dict ? dictSize(d2->dict) : 0;
5989 return size1 - size2;
5990 }
5991
/* Ways of combining the scores of an element found in several inputs. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry; an entry without a value counts as 1.0. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate', which must be one of
 * the REDIS_AGGR_* constants. Any other value is a programming error and
 * triggers redisPanic(). */
static inline void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6009
6010 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
6011 int i, j, setnum;
6012 int aggregate = REDIS_AGGR_SUM;
6013 zsetopsrc *src;
6014 robj *dstobj;
6015 zset *dstzset;
6016 dictIterator *di;
6017 dictEntry *de;
6018
6019 /* expect setnum input keys to be given */
6020 setnum = atoi(c->argv[2]->ptr);
6021 if (setnum < 1) {
6022 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
6023 return;
6024 }
6025
6026 /* test if the expected number of keys would overflow */
6027 if (3+setnum > c->argc) {
6028 addReply(c,shared.syntaxerr);
6029 return;
6030 }
6031
6032 /* read keys to be used for input */
6033 src = zmalloc(sizeof(zsetopsrc) * setnum);
6034 for (i = 0, j = 3; i < setnum; i++, j++) {
6035 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6036 if (!obj) {
6037 src[i].dict = NULL;
6038 } else {
6039 if (obj->type == REDIS_ZSET) {
6040 src[i].dict = ((zset*)obj->ptr)->dict;
6041 } else if (obj->type == REDIS_SET) {
6042 src[i].dict = (obj->ptr);
6043 } else {
6044 zfree(src);
6045 addReply(c,shared.wrongtypeerr);
6046 return;
6047 }
6048 }
6049
6050 /* default all weights to 1 */
6051 src[i].weight = 1.0;
6052 }
6053
6054 /* parse optional extra arguments */
6055 if (j < c->argc) {
6056 int remaining = c->argc - j;
6057
6058 while (remaining) {
6059 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
6060 j++; remaining--;
6061 for (i = 0; i < setnum; i++, j++, remaining--) {
6062 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
6063 return;
6064 }
6065 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6066 j++; remaining--;
6067 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6068 aggregate = REDIS_AGGR_SUM;
6069 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6070 aggregate = REDIS_AGGR_MIN;
6071 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6072 aggregate = REDIS_AGGR_MAX;
6073 } else {
6074 zfree(src);
6075 addReply(c,shared.syntaxerr);
6076 return;
6077 }
6078 j++; remaining--;
6079 } else {
6080 zfree(src);
6081 addReply(c,shared.syntaxerr);
6082 return;
6083 }
6084 }
6085 }
6086
6087 /* sort sets from the smallest to largest, this will improve our
6088 * algorithm's performance */
6089 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
6090
6091 dstobj = createZsetObject();
6092 dstzset = dstobj->ptr;
6093
6094 if (op == REDIS_OP_INTER) {
6095 /* skip going over all entries if the smallest zset is NULL or empty */
6096 if (src[0].dict && dictSize(src[0].dict) > 0) {
6097 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6098 * from small to large, all src[i > 0].dict are non-empty too */
6099 di = dictGetIterator(src[0].dict);
6100 while((de = dictNext(di)) != NULL) {
6101 double *score = zmalloc(sizeof(double)), value;
6102 *score = src[0].weight * zunionInterDictValue(de);
6103
6104 for (j = 1; j < setnum; j++) {
6105 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6106 if (other) {
6107 value = src[j].weight * zunionInterDictValue(other);
6108 zunionInterAggregate(score, value, aggregate);
6109 } else {
6110 break;
6111 }
6112 }
6113
6114 /* skip entry when not present in every source dict */
6115 if (j != setnum) {
6116 zfree(score);
6117 } else {
6118 robj *o = dictGetEntryKey(de);
6119 dictAdd(dstzset->dict,o,score);
6120 incrRefCount(o); /* added to dictionary */
6121 zslInsert(dstzset->zsl,*score,o);
6122 incrRefCount(o); /* added to skiplist */
6123 }
6124 }
6125 dictReleaseIterator(di);
6126 }
6127 } else if (op == REDIS_OP_UNION) {
6128 for (i = 0; i < setnum; i++) {
6129 if (!src[i].dict) continue;
6130
6131 di = dictGetIterator(src[i].dict);
6132 while((de = dictNext(di)) != NULL) {
6133 /* skip key when already processed */
6134 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6135
6136 double *score = zmalloc(sizeof(double)), value;
6137 *score = src[i].weight * zunionInterDictValue(de);
6138
6139 /* because the zsets are sorted by size, its only possible
6140 * for sets at larger indices to hold this entry */
6141 for (j = (i+1); j < setnum; j++) {
6142 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6143 if (other) {
6144 value = src[j].weight * zunionInterDictValue(other);
6145 zunionInterAggregate(score, value, aggregate);
6146 }
6147 }
6148
6149 robj *o = dictGetEntryKey(de);
6150 dictAdd(dstzset->dict,o,score);
6151 incrRefCount(o); /* added to dictionary */
6152 zslInsert(dstzset->zsl,*score,o);
6153 incrRefCount(o); /* added to skiplist */
6154 }
6155 dictReleaseIterator(di);
6156 }
6157 } else {
6158 /* unknown operator */
6159 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6160 }
6161
6162 deleteKey(c->db,dstkey);
6163 if (dstzset->zsl->length) {
6164 dictAdd(c->db->dict,dstkey,dstobj);
6165 incrRefCount(dstkey);
6166 addReplyLongLong(c, dstzset->zsl->length);
6167 server.dirty++;
6168 } else {
6169 decrRefCount(dstobj);
6170 addReply(c, shared.czero);
6171 }
6172 zfree(src);
6173 }
6174
6175 static void zunionstoreCommand(redisClient *c) {
6176 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6177 }
6178
6179 static void zinterstoreCommand(redisClient *c) {
6180 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6181 }
6182
6183 static void zrangeGenericCommand(redisClient *c, int reverse) {
6184 robj *o;
6185 long start;
6186 long end;
6187 int withscores = 0;
6188 int llen;
6189 int rangelen, j;
6190 zset *zsetobj;
6191 zskiplist *zsl;
6192 zskiplistNode *ln;
6193 robj *ele;
6194
6195 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6196 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6197
6198 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6199 withscores = 1;
6200 } else if (c->argc >= 5) {
6201 addReply(c,shared.syntaxerr);
6202 return;
6203 }
6204
6205 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6206 || checkType(c,o,REDIS_ZSET)) return;
6207 zsetobj = o->ptr;
6208 zsl = zsetobj->zsl;
6209 llen = zsl->length;
6210
6211 /* convert negative indexes */
6212 if (start < 0) start = llen+start;
6213 if (end < 0) end = llen+end;
6214 if (start < 0) start = 0;
6215 if (end < 0) end = 0;
6216
6217 /* indexes sanity checks */
6218 if (start > end || start >= llen) {
6219 /* Out of range start or start > end result in empty list */
6220 addReply(c,shared.emptymultibulk);
6221 return;
6222 }
6223 if (end >= llen) end = llen-1;
6224 rangelen = (end-start)+1;
6225
6226 /* check if starting point is trivial, before searching
6227 * the element in log(N) time */
6228 if (reverse) {
6229 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6230 } else {
6231 ln = start == 0 ?
6232 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6233 }
6234
6235 /* Return the result in form of a multi-bulk reply */
6236 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6237 withscores ? (rangelen*2) : rangelen));
6238 for (j = 0; j < rangelen; j++) {
6239 ele = ln->obj;
6240 addReplyBulk(c,ele);
6241 if (withscores)
6242 addReplyDouble(c,ln->score);
6243 ln = reverse ? ln->backward : ln->forward[0];
6244 }
6245 }
6246
6247 static void zrangeCommand(redisClient *c) {
6248 zrangeGenericCommand(c,0);
6249 }
6250
6251 static void zrevrangeCommand(redisClient *c) {
6252 zrangeGenericCommand(c,1);
6253 }
6254
6255 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6256 * If justcount is non-zero, just the count is returned. */
6257 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6258 robj *o;
6259 double min, max;
6260 int minex = 0, maxex = 0; /* are min or max exclusive? */
6261 int offset = 0, limit = -1;
6262 int withscores = 0;
6263 int badsyntax = 0;
6264
6265 /* Parse the min-max interval. If one of the values is prefixed
6266 * by the "(" character, it's considered "open". For instance
6267 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6268 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6269 if (((char*)c->argv[2]->ptr)[0] == '(') {
6270 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6271 minex = 1;
6272 } else {
6273 min = strtod(c->argv[2]->ptr,NULL);
6274 }
6275 if (((char*)c->argv[3]->ptr)[0] == '(') {
6276 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6277 maxex = 1;
6278 } else {
6279 max = strtod(c->argv[3]->ptr,NULL);
6280 }
6281
6282 /* Parse "WITHSCORES": note that if the command was called with
6283 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6284 * enter the following paths to parse WITHSCORES and LIMIT. */
6285 if (c->argc == 5 || c->argc == 8) {
6286 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6287 withscores = 1;
6288 else
6289 badsyntax = 1;
6290 }
6291 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6292 badsyntax = 1;
6293 if (badsyntax) {
6294 addReplySds(c,
6295 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6296 return;
6297 }
6298
6299 /* Parse "LIMIT" */
6300 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6301 addReply(c,shared.syntaxerr);
6302 return;
6303 } else if (c->argc == (7 + withscores)) {
6304 offset = atoi(c->argv[5]->ptr);
6305 limit = atoi(c->argv[6]->ptr);
6306 if (offset < 0) offset = 0;
6307 }
6308
6309 /* Ok, lookup the key and get the range */
6310 o = lookupKeyRead(c->db,c->argv[1]);
6311 if (o == NULL) {
6312 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6313 } else {
6314 if (o->type != REDIS_ZSET) {
6315 addReply(c,shared.wrongtypeerr);
6316 } else {
6317 zset *zsetobj = o->ptr;
6318 zskiplist *zsl = zsetobj->zsl;
6319 zskiplistNode *ln;
6320 robj *ele, *lenobj = NULL;
6321 unsigned long rangelen = 0;
6322
6323 /* Get the first node with the score >= min, or with
6324 * score > min if 'minex' is true. */
6325 ln = zslFirstWithScore(zsl,min);
6326 while (minex && ln && ln->score == min) ln = ln->forward[0];
6327
6328 if (ln == NULL) {
6329 /* No element matching the speciifed interval */
6330 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6331 return;
6332 }
6333
6334 /* We don't know in advance how many matching elements there
6335 * are in the list, so we push this object that will represent
6336 * the multi-bulk length in the output buffer, and will "fix"
6337 * it later */
6338 if (!justcount) {
6339 lenobj = createObject(REDIS_STRING,NULL);
6340 addReply(c,lenobj);
6341 decrRefCount(lenobj);
6342 }
6343
6344 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6345 if (offset) {
6346 offset--;
6347 ln = ln->forward[0];
6348 continue;
6349 }
6350 if (limit == 0) break;
6351 if (!justcount) {
6352 ele = ln->obj;
6353 addReplyBulk(c,ele);
6354 if (withscores)
6355 addReplyDouble(c,ln->score);
6356 }
6357 ln = ln->forward[0];
6358 rangelen++;
6359 if (limit > 0) limit--;
6360 }
6361 if (justcount) {
6362 addReplyLongLong(c,(long)rangelen);
6363 } else {
6364 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6365 withscores ? (rangelen*2) : rangelen);
6366 }
6367 }
6368 }
6369 }
6370
6371 static void zrangebyscoreCommand(redisClient *c) {
6372 genericZrangebyscoreCommand(c,0);
6373 }
6374
6375 static void zcountCommand(redisClient *c) {
6376 genericZrangebyscoreCommand(c,1);
6377 }
6378
6379 static void zcardCommand(redisClient *c) {
6380 robj *o;
6381 zset *zs;
6382
6383 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6384 checkType(c,o,REDIS_ZSET)) return;
6385
6386 zs = o->ptr;
6387 addReplyUlong(c,zs->zsl->length);
6388 }
6389
6390 static void zscoreCommand(redisClient *c) {
6391 robj *o;
6392 zset *zs;
6393 dictEntry *de;
6394
6395 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6396 checkType(c,o,REDIS_ZSET)) return;
6397
6398 zs = o->ptr;
6399 de = dictFind(zs->dict,c->argv[2]);
6400 if (!de) {
6401 addReply(c,shared.nullbulk);
6402 } else {
6403 double *score = dictGetEntryVal(de);
6404
6405 addReplyDouble(c,*score);
6406 }
6407 }
6408
6409 static void zrankGenericCommand(redisClient *c, int reverse) {
6410 robj *o;
6411 zset *zs;
6412 zskiplist *zsl;
6413 dictEntry *de;
6414 unsigned long rank;
6415 double *score;
6416
6417 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6418 checkType(c,o,REDIS_ZSET)) return;
6419
6420 zs = o->ptr;
6421 zsl = zs->zsl;
6422 de = dictFind(zs->dict,c->argv[2]);
6423 if (!de) {
6424 addReply(c,shared.nullbulk);
6425 return;
6426 }
6427
6428 score = dictGetEntryVal(de);
6429 rank = zslGetRank(zsl, *score, c->argv[2]);
6430 if (rank) {
6431 if (reverse) {
6432 addReplyLongLong(c, zsl->length - rank);
6433 } else {
6434 addReplyLongLong(c, rank-1);
6435 }
6436 } else {
6437 addReply(c,shared.nullbulk);
6438 }
6439 }
6440
6441 static void zrankCommand(redisClient *c) {
6442 zrankGenericCommand(c, 0);
6443 }
6444
6445 static void zrevrankCommand(redisClient *c) {
6446 zrankGenericCommand(c, 1);
6447 }
6448
6449 /* ========================= Hashes utility functions ======================= */
6450 #define REDIS_HASH_KEY 1
6451 #define REDIS_HASH_VALUE 2
6452
6453 /* Check the length of a number of objects to see if we need to convert a
6454 * zipmap to a real hash. Note that we only check string encoded objects
6455 * as their string length can be queried in constant time. */
6456 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6457 int i;
6458 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6459
6460 for (i = start; i <= end; i++) {
6461 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6462 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6463 {
6464 convertToRealHash(subject);
6465 return;
6466 }
6467 }
6468 }
6469
6470 /* Encode given objects in-place when the hash uses a dict. */
6471 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6472 if (subject->encoding == REDIS_ENCODING_HT) {
6473 if (o1) *o1 = tryObjectEncoding(*o1);
6474 if (o2) *o2 = tryObjectEncoding(*o2);
6475 }
6476 }
6477
6478 /* Get the value from a hash identified by key. Returns either a string
6479 * object or NULL if the value cannot be found. The refcount of the object
6480 * is always increased by 1 when the value was found. */
6481 static robj *hashGet(robj *o, robj *key) {
6482 robj *value = NULL;
6483 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6484 unsigned char *v;
6485 unsigned int vlen;
6486 key = getDecodedObject(key);
6487 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6488 value = createStringObject((char*)v,vlen);
6489 }
6490 decrRefCount(key);
6491 } else {
6492 dictEntry *de = dictFind(o->ptr,key);
6493 if (de != NULL) {
6494 value = dictGetEntryVal(de);
6495 incrRefCount(value);
6496 }
6497 }
6498 return value;
6499 }
6500
6501 /* Test if the key exists in the given hash. Returns 1 if the key
6502 * exists and 0 when it doesn't. */
6503 static int hashExists(robj *o, robj *key) {
6504 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6505 key = getDecodedObject(key);
6506 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6507 decrRefCount(key);
6508 return 1;
6509 }
6510 decrRefCount(key);
6511 } else {
6512 if (dictFind(o->ptr,key) != NULL) {
6513 return 1;
6514 }
6515 }
6516 return 0;
6517 }
6518
6519 /* Add an element, discard the old if the key already exists.
6520 * Return 0 on insert and 1 on update. */
6521 static int hashSet(robj *o, robj *key, robj *value) {
6522 int update = 0;
6523 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6524 key = getDecodedObject(key);
6525 value = getDecodedObject(value);
6526 o->ptr = zipmapSet(o->ptr,
6527 key->ptr,sdslen(key->ptr),
6528 value->ptr,sdslen(value->ptr), &update);
6529 decrRefCount(key);
6530 decrRefCount(value);
6531
6532 /* Check if the zipmap needs to be upgraded to a real hash table */
6533 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6534 convertToRealHash(o);
6535 } else {
6536 if (dictReplace(o->ptr,key,value)) {
6537 /* Insert */
6538 incrRefCount(key);
6539 } else {
6540 /* Update */
6541 update = 1;
6542 }
6543 incrRefCount(value);
6544 }
6545 return update;
6546 }
6547
6548 /* Delete an element from a hash.
6549 * Return 1 on deleted and 0 on not found. */
6550 static int hashDelete(robj *o, robj *key) {
6551 int deleted = 0;
6552 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6553 key = getDecodedObject(key);
6554 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6555 decrRefCount(key);
6556 } else {
6557 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6558 /* Always check if the dictionary needs a resize after a delete. */
6559 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6560 }
6561 return deleted;
6562 }
6563
6564 /* Return the number of elements in a hash. */
6565 static unsigned long hashLength(robj *o) {
6566 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6567 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6568 }
6569
6570 /* Structure to hold hash iteration abstration. Note that iteration over
6571 * hashes involves both fields and values. Because it is possible that
6572 * not both are required, store pointers in the iterator to avoid
6573 * unnecessary memory allocation for fields/values. */
6574 typedef struct {
6575 int encoding;
6576 unsigned char *zi;
6577 unsigned char *zk, *zv;
6578 unsigned int zklen, zvlen;
6579
6580 dictIterator *di;
6581 dictEntry *de;
6582 } hashIterator;
6583
6584 static hashIterator *hashInitIterator(robj *subject) {
6585 hashIterator *hi = zmalloc(sizeof(hashIterator));
6586 hi->encoding = subject->encoding;
6587 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6588 hi->zi = zipmapRewind(subject->ptr);
6589 } else if (hi->encoding == REDIS_ENCODING_HT) {
6590 hi->di = dictGetIterator(subject->ptr);
6591 } else {
6592 redisAssert(NULL);
6593 }
6594 return hi;
6595 }
6596
6597 static void hashReleaseIterator(hashIterator *hi) {
6598 if (hi->encoding == REDIS_ENCODING_HT) {
6599 dictReleaseIterator(hi->di);
6600 }
6601 zfree(hi);
6602 }
6603
6604 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6605 * could be found and REDIS_ERR when the iterator reaches the end. */
6606 static int hashNext(hashIterator *hi) {
6607 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6608 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6609 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6610 } else {
6611 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6612 }
6613 return REDIS_OK;
6614 }
6615
6616 /* Get key or value object at current iteration position.
6617 * This increases the refcount of the field object by 1. */
6618 static robj *hashCurrent(hashIterator *hi, int what) {
6619 robj *o;
6620 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6621 if (what & REDIS_HASH_KEY) {
6622 o = createStringObject((char*)hi->zk,hi->zklen);
6623 } else {
6624 o = createStringObject((char*)hi->zv,hi->zvlen);
6625 }
6626 } else {
6627 if (what & REDIS_HASH_KEY) {
6628 o = dictGetEntryKey(hi->de);
6629 } else {
6630 o = dictGetEntryVal(hi->de);
6631 }
6632 incrRefCount(o);
6633 }
6634 return o;
6635 }
6636
6637 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6638 robj *o = lookupKeyWrite(c->db,key);
6639 if (o == NULL) {
6640 o = createHashObject();
6641 dictAdd(c->db->dict,key,o);
6642 incrRefCount(key);
6643 } else {
6644 if (o->type != REDIS_HASH) {
6645 addReply(c,shared.wrongtypeerr);
6646 return NULL;
6647 }
6648 }
6649 return o;
6650 }
6651
6652 /* ============================= Hash commands ============================== */
6653 static void hsetCommand(redisClient *c) {
6654 int update;
6655 robj *o;
6656
6657 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6658 hashTryConversion(o,c->argv,2,3);
6659 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6660 update = hashSet(o,c->argv[2],c->argv[3]);
6661 addReply(c, update ? shared.czero : shared.cone);
6662 server.dirty++;
6663 }
6664
6665 static void hsetnxCommand(redisClient *c) {
6666 robj *o;
6667 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6668 hashTryConversion(o,c->argv,2,3);
6669
6670 if (hashExists(o, c->argv[2])) {
6671 addReply(c, shared.czero);
6672 } else {
6673 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6674 hashSet(o,c->argv[2],c->argv[3]);
6675 addReply(c, shared.cone);
6676 server.dirty++;
6677 }
6678 }
6679
6680 static void hmsetCommand(redisClient *c) {
6681 int i;
6682 robj *o;
6683
6684 if ((c->argc % 2) == 1) {
6685 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6686 return;
6687 }
6688
6689 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6690 hashTryConversion(o,c->argv,2,c->argc-1);
6691 for (i = 2; i < c->argc; i += 2) {
6692 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6693 hashSet(o,c->argv[i],c->argv[i+1]);
6694 }
6695 addReply(c, shared.ok);
6696 server.dirty++;
6697 }
6698
6699 static void hincrbyCommand(redisClient *c) {
6700 long long value, incr;
6701 robj *o, *current, *new;
6702
6703 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6704 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6705 if ((current = hashGet(o,c->argv[2])) != NULL) {
6706 if (getLongLongFromObjectOrReply(c,current,&value,
6707 "hash value is not an integer") != REDIS_OK) {
6708 decrRefCount(current);
6709 return;
6710 }
6711 decrRefCount(current);
6712 } else {
6713 value = 0;
6714 }
6715
6716 value += incr;
6717 new = createStringObjectFromLongLong(value);
6718 hashTryObjectEncoding(o,&c->argv[2],NULL);
6719 hashSet(o,c->argv[2],new);
6720 decrRefCount(new);
6721 addReplyLongLong(c,value);
6722 server.dirty++;
6723 }
6724
6725 static void hgetCommand(redisClient *c) {
6726 robj *o, *value;
6727 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6728 checkType(c,o,REDIS_HASH)) return;
6729
6730 if ((value = hashGet(o,c->argv[2])) != NULL) {
6731 addReplyBulk(c,value);
6732 decrRefCount(value);
6733 } else {
6734 addReply(c,shared.nullbulk);
6735 }
6736 }
6737
6738 static void hmgetCommand(redisClient *c) {
6739 int i;
6740 robj *o, *value;
6741 o = lookupKeyRead(c->db,c->argv[1]);
6742 if (o != NULL && o->type != REDIS_HASH) {
6743 addReply(c,shared.wrongtypeerr);
6744 }
6745
6746 /* Note the check for o != NULL happens inside the loop. This is
6747 * done because objects that cannot be found are considered to be
6748 * an empty hash. The reply should then be a series of NULLs. */
6749 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6750 for (i = 2; i < c->argc; i++) {
6751 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6752 addReplyBulk(c,value);
6753 decrRefCount(value);
6754 } else {
6755 addReply(c,shared.nullbulk);
6756 }
6757 }
6758 }
6759
6760 static void hdelCommand(redisClient *c) {
6761 robj *o;
6762 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6763 checkType(c,o,REDIS_HASH)) return;
6764
6765 if (hashDelete(o,c->argv[2])) {
6766 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6767 addReply(c,shared.cone);
6768 server.dirty++;
6769 } else {
6770 addReply(c,shared.czero);
6771 }
6772 }
6773
6774 static void hlenCommand(redisClient *c) {
6775 robj *o;
6776 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6777 checkType(c,o,REDIS_HASH)) return;
6778
6779 addReplyUlong(c,hashLength(o));
6780 }
6781
6782 static void genericHgetallCommand(redisClient *c, int flags) {
6783 robj *o, *lenobj, *obj;
6784 unsigned long count = 0;
6785 hashIterator *hi;
6786
6787 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6788 || checkType(c,o,REDIS_HASH)) return;
6789
6790 lenobj = createObject(REDIS_STRING,NULL);
6791 addReply(c,lenobj);
6792 decrRefCount(lenobj);
6793
6794 hi = hashInitIterator(o);
6795 while (hashNext(hi) != REDIS_ERR) {
6796 if (flags & REDIS_HASH_KEY) {
6797 obj = hashCurrent(hi,REDIS_HASH_KEY);
6798 addReplyBulk(c,obj);
6799 decrRefCount(obj);
6800 count++;
6801 }
6802 if (flags & REDIS_HASH_VALUE) {
6803 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6804 addReplyBulk(c,obj);
6805 decrRefCount(obj);
6806 count++;
6807 }
6808 }
6809 hashReleaseIterator(hi);
6810
6811 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6812 }
6813
6814 static void hkeysCommand(redisClient *c) {
6815 genericHgetallCommand(c,REDIS_HASH_KEY);
6816 }
6817
6818 static void hvalsCommand(redisClient *c) {
6819 genericHgetallCommand(c,REDIS_HASH_VALUE);
6820 }
6821
6822 static void hgetallCommand(redisClient *c) {
6823 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6824 }
6825
6826 static void hexistsCommand(redisClient *c) {
6827 robj *o;
6828 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6829 checkType(c,o,REDIS_HASH)) return;
6830
6831 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6832 }
6833
6834 static void convertToRealHash(robj *o) {
6835 unsigned char *key, *val, *p, *zm = o->ptr;
6836 unsigned int klen, vlen;
6837 dict *dict = dictCreate(&hashDictType,NULL);
6838
6839 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6840 p = zipmapRewind(zm);
6841 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6842 robj *keyobj, *valobj;
6843
6844 keyobj = createStringObject((char*)key,klen);
6845 valobj = createStringObject((char*)val,vlen);
6846 keyobj = tryObjectEncoding(keyobj);
6847 valobj = tryObjectEncoding(valobj);
6848 dictAdd(dict,keyobj,valobj);
6849 }
6850 o->encoding = REDIS_ENCODING_HT;
6851 o->ptr = dict;
6852 zfree(zm);
6853 }
6854
6855 /* ========================= Non type-specific commands ==================== */
6856
6857 static void flushdbCommand(redisClient *c) {
6858 server.dirty += dictSize(c->db->dict);
6859 touchWatchedKeysOnFlush(c->db->id);
6860 dictEmpty(c->db->dict);
6861 dictEmpty(c->db->expires);
6862 addReply(c,shared.ok);
6863 }
6864
6865 static void flushallCommand(redisClient *c) {
6866 touchWatchedKeysOnFlush(-1);
6867 server.dirty += emptyDb();
6868 addReply(c,shared.ok);
6869 if (server.bgsavechildpid != -1) {
6870 kill(server.bgsavechildpid,SIGKILL);
6871 rdbRemoveTempFile(server.bgsavechildpid);
6872 }
6873 rdbSave(server.dbfilename);
6874 server.dirty++;
6875 }
6876
6877 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6878 redisSortOperation *so = zmalloc(sizeof(*so));
6879 so->type = type;
6880 so->pattern = pattern;
6881 return so;
6882 }
6883
6884 /* Return the value associated to the key with a name obtained
6885 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6886 * The returned object will always have its refcount increased by 1
6887 * when it is non-NULL. */
6888 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6889 char *p, *f;
6890 sds spat, ssub;
6891 robj keyobj, fieldobj, *o;
6892 int prefixlen, sublen, postfixlen, fieldlen;
6893 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6894 struct {
6895 long len;
6896 long free;
6897 char buf[REDIS_SORTKEY_MAX+1];
6898 } keyname, fieldname;
6899
6900 /* If the pattern is "#" return the substitution object itself in order
6901 * to implement the "SORT ... GET #" feature. */
6902 spat = pattern->ptr;
6903 if (spat[0] == '#' && spat[1] == '\0') {
6904 incrRefCount(subst);
6905 return subst;
6906 }
6907
6908 /* The substitution object may be specially encoded. If so we create
6909 * a decoded object on the fly. Otherwise getDecodedObject will just
6910 * increment the ref count, that we'll decrement later. */
6911 subst = getDecodedObject(subst);
6912
6913 ssub = subst->ptr;
6914 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6915 p = strchr(spat,'*');
6916 if (!p) {
6917 decrRefCount(subst);
6918 return NULL;
6919 }
6920
6921 /* Find out if we're dealing with a hash dereference. */
6922 if ((f = strstr(p+1, "->")) != NULL) {
6923 fieldlen = sdslen(spat)-(f-spat);
6924 /* this also copies \0 character */
6925 memcpy(fieldname.buf,f+2,fieldlen-1);
6926 fieldname.len = fieldlen-2;
6927 } else {
6928 fieldlen = 0;
6929 }
6930
6931 prefixlen = p-spat;
6932 sublen = sdslen(ssub);
6933 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6934 memcpy(keyname.buf,spat,prefixlen);
6935 memcpy(keyname.buf+prefixlen,ssub,sublen);
6936 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6937 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6938 keyname.len = prefixlen+sublen+postfixlen;
6939 decrRefCount(subst);
6940
6941 /* Lookup substituted key */
6942 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6943 o = lookupKeyRead(db,&keyobj);
6944 if (o == NULL) return NULL;
6945
6946 if (fieldlen > 0) {
6947 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6948
6949 /* Retrieve value from hash by the field name. This operation
6950 * already increases the refcount of the returned object. */
6951 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6952 o = hashGet(o, &fieldobj);
6953 } else {
6954 if (o->type != REDIS_STRING) return NULL;
6955
6956 /* Every object that this function returns needs to have its refcount
6957 * increased. sortCommand decreases it again. */
6958 incrRefCount(o);
6959 }
6960
6961 return o;
6962 }
6963
6964 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6965 * the additional parameter is not standard but a BSD-specific we have to
6966 * pass sorting parameters via the global 'server' structure */
6967 static int sortCompare(const void *s1, const void *s2) {
6968 const redisSortObject *so1 = s1, *so2 = s2;
6969 int cmp;
6970
6971 if (!server.sort_alpha) {
6972 /* Numeric sorting. Here it's trivial as we precomputed scores */
6973 if (so1->u.score > so2->u.score) {
6974 cmp = 1;
6975 } else if (so1->u.score < so2->u.score) {
6976 cmp = -1;
6977 } else {
6978 cmp = 0;
6979 }
6980 } else {
6981 /* Alphanumeric sorting */
6982 if (server.sort_bypattern) {
6983 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6984 /* At least one compare object is NULL */
6985 if (so1->u.cmpobj == so2->u.cmpobj)
6986 cmp = 0;
6987 else if (so1->u.cmpobj == NULL)
6988 cmp = -1;
6989 else
6990 cmp = 1;
6991 } else {
6992 /* We have both the objects, use strcoll */
6993 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6994 }
6995 } else {
6996 /* Compare elements directly. */
6997 cmp = compareStringObjects(so1->obj,so2->obj);
6998 }
6999 }
7000 return server.sort_desc ? -cmp : cmp;
7001 }
7002
7003 /* The SORT command is the most complex command in Redis. Warning: this code
7004 * is optimized for speed and a bit less for readability */
7005 static void sortCommand(redisClient *c) {
7006 list *operations;
7007 int outputlen = 0;
7008 int desc = 0, alpha = 0;
7009 int limit_start = 0, limit_count = -1, start, end;
7010 int j, dontsort = 0, vectorlen;
7011 int getop = 0; /* GET operation counter */
7012 robj *sortval, *sortby = NULL, *storekey = NULL;
7013 redisSortObject *vector; /* Resulting vector to sort */
7014
7015 /* Lookup the key to sort. It must be of the right types */
7016 sortval = lookupKeyRead(c->db,c->argv[1]);
7017 if (sortval == NULL) {
7018 addReply(c,shared.emptymultibulk);
7019 return;
7020 }
7021 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7022 sortval->type != REDIS_ZSET)
7023 {
7024 addReply(c,shared.wrongtypeerr);
7025 return;
7026 }
7027
7028 /* Create a list of operations to perform for every sorted element.
7029 * Operations can be GET/DEL/INCR/DECR */
7030 operations = listCreate();
7031 listSetFreeMethod(operations,zfree);
7032 j = 2;
7033
7034 /* Now we need to protect sortval incrementing its count, in the future
7035 * SORT may have options able to overwrite/delete keys during the sorting
7036 * and the sorted key itself may get destroied */
7037 incrRefCount(sortval);
7038
7039 /* The SORT command has an SQL-alike syntax, parse it */
7040 while(j < c->argc) {
7041 int leftargs = c->argc-j-1;
7042 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7043 desc = 0;
7044 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7045 desc = 1;
7046 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7047 alpha = 1;
7048 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7049 limit_start = atoi(c->argv[j+1]->ptr);
7050 limit_count = atoi(c->argv[j+2]->ptr);
7051 j+=2;
7052 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7053 storekey = c->argv[j+1];
7054 j++;
7055 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7056 sortby = c->argv[j+1];
7057 /* If the BY pattern does not contain '*', i.e. it is constant,
7058 * we don't need to sort nor to lookup the weight keys. */
7059 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7060 j++;
7061 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7062 listAddNodeTail(operations,createSortOperation(
7063 REDIS_SORT_GET,c->argv[j+1]));
7064 getop++;
7065 j++;
7066 } else {
7067 decrRefCount(sortval);
7068 listRelease(operations);
7069 addReply(c,shared.syntaxerr);
7070 return;
7071 }
7072 j++;
7073 }
7074
7075 /* Load the sorting vector with all the objects to sort */
7076 switch(sortval->type) {
7077 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7078 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7079 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7080 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7081 }
7082 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7083 j = 0;
7084
7085 if (sortval->type == REDIS_LIST) {
7086 list *list = sortval->ptr;
7087 listNode *ln;
7088 listIter li;
7089
7090 listRewind(list,&li);
7091 while((ln = listNext(&li))) {
7092 robj *ele = ln->value;
7093 vector[j].obj = ele;
7094 vector[j].u.score = 0;
7095 vector[j].u.cmpobj = NULL;
7096 j++;
7097 }
7098 } else {
7099 dict *set;
7100 dictIterator *di;
7101 dictEntry *setele;
7102
7103 if (sortval->type == REDIS_SET) {
7104 set = sortval->ptr;
7105 } else {
7106 zset *zs = sortval->ptr;
7107 set = zs->dict;
7108 }
7109
7110 di = dictGetIterator(set);
7111 while((setele = dictNext(di)) != NULL) {
7112 vector[j].obj = dictGetEntryKey(setele);
7113 vector[j].u.score = 0;
7114 vector[j].u.cmpobj = NULL;
7115 j++;
7116 }
7117 dictReleaseIterator(di);
7118 }
7119 redisAssert(j == vectorlen);
7120
7121 /* Now it's time to load the right scores in the sorting vector */
7122 if (dontsort == 0) {
7123 for (j = 0; j < vectorlen; j++) {
7124 robj *byval;
7125 if (sortby) {
7126 /* lookup value to sort by */
7127 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
7128 if (!byval) continue;
7129 } else {
7130 /* use object itself to sort by */
7131 byval = vector[j].obj;
7132 }
7133
7134 if (alpha) {
7135 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7136 } else {
7137 if (byval->encoding == REDIS_ENCODING_RAW) {
7138 vector[j].u.score = strtod(byval->ptr,NULL);
7139 } else if (byval->encoding == REDIS_ENCODING_INT) {
7140 /* Don't need to decode the object if it's
7141 * integer-encoded (the only encoding supported) so
7142 * far. We can just cast it */
7143 vector[j].u.score = (long)byval->ptr;
7144 } else {
7145 redisAssert(1 != 1);
7146 }
7147 }
7148
7149 /* when the object was retrieved using lookupKeyByPattern,
7150 * its refcount needs to be decreased. */
7151 if (sortby) {
7152 decrRefCount(byval);
7153 }
7154 }
7155 }
7156
7157 /* We are ready to sort the vector... perform a bit of sanity check
7158 * on the LIMIT option too. We'll use a partial version of quicksort. */
7159 start = (limit_start < 0) ? 0 : limit_start;
7160 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7161 if (start >= vectorlen) {
7162 start = vectorlen-1;
7163 end = vectorlen-2;
7164 }
7165 if (end >= vectorlen) end = vectorlen-1;
7166
7167 if (dontsort == 0) {
7168 server.sort_desc = desc;
7169 server.sort_alpha = alpha;
7170 server.sort_bypattern = sortby ? 1 : 0;
7171 if (sortby && (start != 0 || end != vectorlen-1))
7172 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7173 else
7174 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7175 }
7176
7177 /* Send command output to the output buffer, performing the specified
7178 * GET/DEL/INCR/DECR operations if any. */
7179 outputlen = getop ? getop*(end-start+1) : end-start+1;
7180 if (storekey == NULL) {
7181 /* STORE option not specified, sent the sorting result to client */
7182 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7183 for (j = start; j <= end; j++) {
7184 listNode *ln;
7185 listIter li;
7186
7187 if (!getop) addReplyBulk(c,vector[j].obj);
7188 listRewind(operations,&li);
7189 while((ln = listNext(&li))) {
7190 redisSortOperation *sop = ln->value;
7191 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7192 vector[j].obj);
7193
7194 if (sop->type == REDIS_SORT_GET) {
7195 if (!val) {
7196 addReply(c,shared.nullbulk);
7197 } else {
7198 addReplyBulk(c,val);
7199 decrRefCount(val);
7200 }
7201 } else {
7202 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7203 }
7204 }
7205 }
7206 } else {
7207 robj *listObject = createListObject();
7208 list *listPtr = (list*) listObject->ptr;
7209
7210 /* STORE option specified, set the sorting result as a List object */
7211 for (j = start; j <= end; j++) {
7212 listNode *ln;
7213 listIter li;
7214
7215 if (!getop) {
7216 listAddNodeTail(listPtr,vector[j].obj);
7217 incrRefCount(vector[j].obj);
7218 }
7219 listRewind(operations,&li);
7220 while((ln = listNext(&li))) {
7221 redisSortOperation *sop = ln->value;
7222 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7223 vector[j].obj);
7224
7225 if (sop->type == REDIS_SORT_GET) {
7226 if (!val) {
7227 listAddNodeTail(listPtr,createStringObject("",0));
7228 } else {
7229 /* We should do a incrRefCount on val because it is
7230 * added to the list, but also a decrRefCount because
7231 * it is returned by lookupKeyByPattern. This results
7232 * in doing nothing at all. */
7233 listAddNodeTail(listPtr,val);
7234 }
7235 } else {
7236 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7237 }
7238 }
7239 }
7240 if (dictReplace(c->db->dict,storekey,listObject)) {
7241 incrRefCount(storekey);
7242 }
7243 /* Note: we add 1 because the DB is dirty anyway since even if the
7244 * SORT result is empty a new key is set and maybe the old content
7245 * replaced. */
7246 server.dirty += 1+outputlen;
7247 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7248 }
7249
7250 /* Cleanup */
7251 decrRefCount(sortval);
7252 listRelease(operations);
7253 for (j = 0; j < vectorlen; j++) {
7254 if (alpha && vector[j].u.cmpobj)
7255 decrRefCount(vector[j].u.cmpobj);
7256 }
7257 zfree(vector);
7258 }
7259
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is always written: amounts of one terabyte and more use the T and P
 * units, and anything too large for those is printed as a plain byte count.
 * The caller must provide room for at least 22 bytes (20 digits, the unit
 * letter and the terminator). */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024ULL*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units above: fall back to the raw byte count
         * rather than leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
7280
7281 /* Create the string returned by the INFO command. This is decoupled
7282 * by the INFO command itself as we need to report the same information
7283 * on memory corruption problems. */
7284 static sds genRedisInfoString(void) {
7285 sds info;
7286 time_t uptime = time(NULL)-server.stat_starttime;
7287 int j;
7288 char hmem[64];
7289
7290 bytesToHuman(hmem,zmalloc_used_memory());
7291 info = sdscatprintf(sdsempty(),
7292 "redis_version:%s\r\n"
7293 "redis_git_sha1:%s\r\n"
7294 "redis_git_dirty:%d\r\n"
7295 "arch_bits:%s\r\n"
7296 "multiplexing_api:%s\r\n"
7297 "process_id:%ld\r\n"
7298 "uptime_in_seconds:%ld\r\n"
7299 "uptime_in_days:%ld\r\n"
7300 "connected_clients:%d\r\n"
7301 "connected_slaves:%d\r\n"
7302 "blocked_clients:%d\r\n"
7303 "used_memory:%zu\r\n"
7304 "used_memory_human:%s\r\n"
7305 "changes_since_last_save:%lld\r\n"
7306 "bgsave_in_progress:%d\r\n"
7307 "last_save_time:%ld\r\n"
7308 "bgrewriteaof_in_progress:%d\r\n"
7309 "total_connections_received:%lld\r\n"
7310 "total_commands_processed:%lld\r\n"
7311 "expired_keys:%lld\r\n"
7312 "hash_max_zipmap_entries:%zu\r\n"
7313 "hash_max_zipmap_value:%zu\r\n"
7314 "pubsub_channels:%ld\r\n"
7315 "pubsub_patterns:%u\r\n"
7316 "vm_enabled:%d\r\n"
7317 "role:%s\r\n"
7318 ,REDIS_VERSION,
7319 REDIS_GIT_SHA1,
7320 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7321 (sizeof(long) == 8) ? "64" : "32",
7322 aeGetApiName(),
7323 (long) getpid(),
7324 uptime,
7325 uptime/(3600*24),
7326 listLength(server.clients)-listLength(server.slaves),
7327 listLength(server.slaves),
7328 server.blpop_blocked_clients,
7329 zmalloc_used_memory(),
7330 hmem,
7331 server.dirty,
7332 server.bgsavechildpid != -1,
7333 server.lastsave,
7334 server.bgrewritechildpid != -1,
7335 server.stat_numconnections,
7336 server.stat_numcommands,
7337 server.stat_expiredkeys,
7338 server.hash_max_zipmap_entries,
7339 server.hash_max_zipmap_value,
7340 dictSize(server.pubsub_channels),
7341 listLength(server.pubsub_patterns),
7342 server.vm_enabled != 0,
7343 server.masterhost == NULL ? "master" : "slave"
7344 );
7345 if (server.masterhost) {
7346 info = sdscatprintf(info,
7347 "master_host:%s\r\n"
7348 "master_port:%d\r\n"
7349 "master_link_status:%s\r\n"
7350 "master_last_io_seconds_ago:%d\r\n"
7351 ,server.masterhost,
7352 server.masterport,
7353 (server.replstate == REDIS_REPL_CONNECTED) ?
7354 "up" : "down",
7355 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7356 );
7357 }
7358 if (server.vm_enabled) {
7359 lockThreadedIO();
7360 info = sdscatprintf(info,
7361 "vm_conf_max_memory:%llu\r\n"
7362 "vm_conf_page_size:%llu\r\n"
7363 "vm_conf_pages:%llu\r\n"
7364 "vm_stats_used_pages:%llu\r\n"
7365 "vm_stats_swapped_objects:%llu\r\n"
7366 "vm_stats_swappin_count:%llu\r\n"
7367 "vm_stats_swappout_count:%llu\r\n"
7368 "vm_stats_io_newjobs_len:%lu\r\n"
7369 "vm_stats_io_processing_len:%lu\r\n"
7370 "vm_stats_io_processed_len:%lu\r\n"
7371 "vm_stats_io_active_threads:%lu\r\n"
7372 "vm_stats_blocked_clients:%lu\r\n"
7373 ,(unsigned long long) server.vm_max_memory,
7374 (unsigned long long) server.vm_page_size,
7375 (unsigned long long) server.vm_pages,
7376 (unsigned long long) server.vm_stats_used_pages,
7377 (unsigned long long) server.vm_stats_swapped_objects,
7378 (unsigned long long) server.vm_stats_swapins,
7379 (unsigned long long) server.vm_stats_swapouts,
7380 (unsigned long) listLength(server.io_newjobs),
7381 (unsigned long) listLength(server.io_processing),
7382 (unsigned long) listLength(server.io_processed),
7383 (unsigned long) server.io_active_threads,
7384 (unsigned long) server.vm_blocked_clients
7385 );
7386 unlockThreadedIO();
7387 }
7388 for (j = 0; j < server.dbnum; j++) {
7389 long long keys, vkeys;
7390
7391 keys = dictSize(server.db[j].dict);
7392 vkeys = dictSize(server.db[j].expires);
7393 if (keys || vkeys) {
7394 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7395 j, keys, vkeys);
7396 }
7397 }
7398 return info;
7399 }
7400
7401 static void infoCommand(redisClient *c) {
7402 sds info = genRedisInfoString();
7403 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7404 (unsigned long)sdslen(info)));
7405 addReplySds(c,info);
7406 addReply(c,shared.crlf);
7407 }
7408
7409 static void monitorCommand(redisClient *c) {
7410 /* ignore MONITOR if aleady slave or in monitor mode */
7411 if (c->flags & REDIS_SLAVE) return;
7412
7413 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7414 c->slaveseldb = 0;
7415 listAddNodeTail(server.monitors,c);
7416 addReply(c,shared.ok);
7417 }
7418
7419 /* ================================= Expire ================================= */
7420 static int removeExpire(redisDb *db, robj *key) {
7421 if (dictDelete(db->expires,key) == DICT_OK) {
7422 return 1;
7423 } else {
7424 return 0;
7425 }
7426 }
7427
7428 static int setExpire(redisDb *db, robj *key, time_t when) {
7429 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7430 return 0;
7431 } else {
7432 incrRefCount(key);
7433 return 1;
7434 }
7435 }
7436
7437 /* Return the expire time of the specified key, or -1 if no expire
7438 * is associated with this key (i.e. the key is non volatile) */
7439 static time_t getExpire(redisDb *db, robj *key) {
7440 dictEntry *de;
7441
7442 /* No expire? return ASAP */
7443 if (dictSize(db->expires) == 0 ||
7444 (de = dictFind(db->expires,key)) == NULL) return -1;
7445
7446 return (time_t) dictGetEntryVal(de);
7447 }
7448
7449 static int expireIfNeeded(redisDb *db, robj *key) {
7450 time_t when;
7451 dictEntry *de;
7452
7453 /* No expire? return ASAP */
7454 if (dictSize(db->expires) == 0 ||
7455 (de = dictFind(db->expires,key)) == NULL) return 0;
7456
7457 /* Lookup the expire */
7458 when = (time_t) dictGetEntryVal(de);
7459 if (time(NULL) <= when) return 0;
7460
7461 /* Delete the key */
7462 dictDelete(db->expires,key);
7463 server.stat_expiredkeys++;
7464 return dictDelete(db->dict,key) == DICT_OK;
7465 }
7466
7467 static int deleteIfVolatile(redisDb *db, robj *key) {
7468 dictEntry *de;
7469
7470 /* No expire? return ASAP */
7471 if (dictSize(db->expires) == 0 ||
7472 (de = dictFind(db->expires,key)) == NULL) return 0;
7473
7474 /* Delete the key */
7475 server.dirty++;
7476 server.stat_expiredkeys++;
7477 dictDelete(db->expires,key);
7478 return dictDelete(db->dict,key) == DICT_OK;
7479 }
7480
7481 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7482 dictEntry *de;
7483 time_t seconds;
7484
7485 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7486
7487 seconds -= offset;
7488
7489 de = dictFind(c->db->dict,key);
7490 if (de == NULL) {
7491 addReply(c,shared.czero);
7492 return;
7493 }
7494 if (seconds <= 0) {
7495 if (deleteKey(c->db,key)) server.dirty++;
7496 addReply(c, shared.cone);
7497 return;
7498 } else {
7499 time_t when = time(NULL)+seconds;
7500 if (setExpire(c->db,key,when)) {
7501 addReply(c,shared.cone);
7502 server.dirty++;
7503 } else {
7504 addReply(c,shared.czero);
7505 }
7506 return;
7507 }
7508 }
7509
7510 static void expireCommand(redisClient *c) {
7511 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7512 }
7513
7514 static void expireatCommand(redisClient *c) {
7515 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7516 }
7517
7518 static void ttlCommand(redisClient *c) {
7519 time_t expire;
7520 int ttl = -1;
7521
7522 expire = getExpire(c->db,c->argv[1]);
7523 if (expire != -1) {
7524 ttl = (int) (expire-time(NULL));
7525 if (ttl < 0) ttl = -1;
7526 }
7527 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7528 }
7529
7530 /* ================================ MULTI/EXEC ============================== */
7531
7532 /* Client state initialization for MULTI/EXEC */
7533 static void initClientMultiState(redisClient *c) {
7534 c->mstate.commands = NULL;
7535 c->mstate.count = 0;
7536 }
7537
7538 /* Release all the resources associated with MULTI/EXEC state */
7539 static void freeClientMultiState(redisClient *c) {
7540 int j;
7541
7542 for (j = 0; j < c->mstate.count; j++) {
7543 int i;
7544 multiCmd *mc = c->mstate.commands+j;
7545
7546 for (i = 0; i < mc->argc; i++)
7547 decrRefCount(mc->argv[i]);
7548 zfree(mc->argv);
7549 }
7550 zfree(c->mstate.commands);
7551 }
7552
7553 /* Add a new command into the MULTI commands queue */
7554 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7555 multiCmd *mc;
7556 int j;
7557
7558 c->mstate.commands = zrealloc(c->mstate.commands,
7559 sizeof(multiCmd)*(c->mstate.count+1));
7560 mc = c->mstate.commands+c->mstate.count;
7561 mc->cmd = cmd;
7562 mc->argc = c->argc;
7563 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7564 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7565 for (j = 0; j < c->argc; j++)
7566 incrRefCount(mc->argv[j]);
7567 c->mstate.count++;
7568 }
7569
7570 static void multiCommand(redisClient *c) {
7571 if (c->flags & REDIS_MULTI) {
7572 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7573 return;
7574 }
7575 c->flags |= REDIS_MULTI;
7576 addReply(c,shared.ok);
7577 }
7578
7579 static void discardCommand(redisClient *c) {
7580 if (!(c->flags & REDIS_MULTI)) {
7581 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7582 return;
7583 }
7584
7585 freeClientMultiState(c);
7586 initClientMultiState(c);
7587 c->flags &= (~REDIS_MULTI);
7588 addReply(c,shared.ok);
7589 }
7590
7591 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7592 * implememntation for more information. */
7593 static void execCommandReplicateMulti(redisClient *c) {
7594 struct redisCommand *cmd;
7595 robj *multistring = createStringObject("MULTI",5);
7596
7597 cmd = lookupCommand("multi");
7598 if (server.appendonly)
7599 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7600 if (listLength(server.slaves))
7601 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7602 decrRefCount(multistring);
7603 }
7604
7605 static void execCommand(redisClient *c) {
7606 int j;
7607 robj **orig_argv;
7608 int orig_argc;
7609
7610 if (!(c->flags & REDIS_MULTI)) {
7611 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7612 return;
7613 }
7614
7615 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7616 * A failed EXEC will return a multi bulk nil object. */
7617 if (c->flags & REDIS_DIRTY_CAS) {
7618 freeClientMultiState(c);
7619 initClientMultiState(c);
7620 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7621 unwatchAllKeys(c);
7622 addReply(c,shared.nullmultibulk);
7623 return;
7624 }
7625
7626 /* Replicate a MULTI request now that we are sure the block is executed.
7627 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7628 * both the AOF and the replication link will have the same consistency
7629 * and atomicity guarantees. */
7630 execCommandReplicateMulti(c);
7631
7632 /* Exec all the queued commands */
7633 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
7634 orig_argv = c->argv;
7635 orig_argc = c->argc;
7636 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7637 for (j = 0; j < c->mstate.count; j++) {
7638 c->argc = c->mstate.commands[j].argc;
7639 c->argv = c->mstate.commands[j].argv;
7640 call(c,c->mstate.commands[j].cmd);
7641 }
7642 c->argv = orig_argv;
7643 c->argc = orig_argc;
7644 freeClientMultiState(c);
7645 initClientMultiState(c);
7646 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7647 /* Make sure the EXEC command is always replicated / AOF, since we
7648 * always send the MULTI command (we can't know beforehand if the
7649 * next operations will contain at least a modification to the DB). */
7650 server.dirty++;
7651 }
7652
7653 /* =========================== Blocking Operations ========================= */
7654
7655 /* Currently Redis blocking operations support is limited to list POP ops,
7656 * so the current implementation is not fully generic, but it is also not
7657 * completely specific so it will not require a rewrite to support new
7658 * kind of blocking operations in the future.
7659 *
7660 * Still it's important to note that list blocking operations can be already
7661 * used as a notification mechanism in order to implement other blocking
7662 * operations at application level, so there must be a very strong evidence
7663 * of usefulness and generality before new blocking operations are implemented.
7664 *
7665 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocked on those keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocked client, unblock the client, and remove it from the list.
7679 *
7680 * The above comment and the source code should be enough in order to understand
7681 * the implementation and modify / fix it later.
7682 */
7683
7684 /* Set a client in blocking mode for the specified key, with the specified
7685 * timeout */
7686 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7687 dictEntry *de;
7688 list *l;
7689 int j;
7690
7691 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7692 c->blocking_keys_num = numkeys;
7693 c->blockingto = timeout;
7694 for (j = 0; j < numkeys; j++) {
7695 /* Add the key in the client structure, to map clients -> keys */
7696 c->blocking_keys[j] = keys[j];
7697 incrRefCount(keys[j]);
7698
7699 /* And in the other "side", to map keys -> clients */
7700 de = dictFind(c->db->blocking_keys,keys[j]);
7701 if (de == NULL) {
7702 int retval;
7703
7704 /* For every key we take a list of clients blocked for it */
7705 l = listCreate();
7706 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7707 incrRefCount(keys[j]);
7708 assert(retval == DICT_OK);
7709 } else {
7710 l = dictGetEntryVal(de);
7711 }
7712 listAddNodeTail(l,c);
7713 }
7714 /* Mark the client as a blocked client */
7715 c->flags |= REDIS_BLOCKED;
7716 server.blpop_blocked_clients++;
7717 }
7718
7719 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7720 static void unblockClientWaitingData(redisClient *c) {
7721 dictEntry *de;
7722 list *l;
7723 int j;
7724
7725 assert(c->blocking_keys != NULL);
7726 /* The client may wait for multiple keys, so unblock it for every key. */
7727 for (j = 0; j < c->blocking_keys_num; j++) {
7728 /* Remove this client from the list of clients waiting for this key. */
7729 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7730 assert(de != NULL);
7731 l = dictGetEntryVal(de);
7732 listDelNode(l,listSearchKey(l,c));
7733 /* If the list is empty we need to remove it to avoid wasting memory */
7734 if (listLength(l) == 0)
7735 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7736 decrRefCount(c->blocking_keys[j]);
7737 }
7738 /* Cleanup the client structure */
7739 zfree(c->blocking_keys);
7740 c->blocking_keys = NULL;
7741 c->flags &= (~REDIS_BLOCKED);
7742 server.blpop_blocked_clients--;
7743 /* We want to process data if there is some command waiting
7744 * in the input buffer. Note that this is safe even if
7745 * unblockClientWaitingData() gets called from freeClient() because
7746 * freeClient() will be smart enough to call this function
7747 * *after* c->querybuf was set to NULL. */
7748 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7749 }
7750
7751 /* This should be called from any function PUSHing into lists.
7752 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7753 * 'ele' is the element pushed.
7754 *
7755 * If the function returns 0 there was no client waiting for a list push
7756 * against this key.
7757 *
7758 * If the function returns 1 there was a client waiting for a list push
7759 * against this key, the element was passed to this client thus it's not
7760 * needed to actually add it to the list and the caller should return asap. */
7761 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7762 struct dictEntry *de;
7763 redisClient *receiver;
7764 list *l;
7765 listNode *ln;
7766
7767 de = dictFind(c->db->blocking_keys,key);
7768 if (de == NULL) return 0;
7769 l = dictGetEntryVal(de);
7770 ln = listFirst(l);
7771 assert(ln != NULL);
7772 receiver = ln->value;
7773
7774 addReplySds(receiver,sdsnew("*2\r\n"));
7775 addReplyBulk(receiver,key);
7776 addReplyBulk(receiver,ele);
7777 unblockClientWaitingData(receiver);
7778 return 1;
7779 }
7780
7781 /* Blocking RPOP/LPOP */
7782 static void blockingPopGenericCommand(redisClient *c, int where) {
7783 robj *o;
7784 time_t timeout;
7785 int j;
7786
7787 for (j = 1; j < c->argc-1; j++) {
7788 o = lookupKeyWrite(c->db,c->argv[j]);
7789 if (o != NULL) {
7790 if (o->type != REDIS_LIST) {
7791 addReply(c,shared.wrongtypeerr);
7792 return;
7793 } else {
7794 list *list = o->ptr;
7795 if (listLength(list) != 0) {
7796 /* If the list contains elements fall back to the usual
7797 * non-blocking POP operation */
7798 robj *argv[2], **orig_argv;
7799 int orig_argc;
7800
7801 /* We need to alter the command arguments before to call
7802 * popGenericCommand() as the command takes a single key. */
7803 orig_argv = c->argv;
7804 orig_argc = c->argc;
7805 argv[1] = c->argv[j];
7806 c->argv = argv;
7807 c->argc = 2;
7808
7809 /* Also the return value is different, we need to output
7810 * the multi bulk reply header and the key name. The
7811 * "real" command will add the last element (the value)
7812 * for us. If this souds like an hack to you it's just
7813 * because it is... */
7814 addReplySds(c,sdsnew("*2\r\n"));
7815 addReplyBulk(c,argv[1]);
7816 popGenericCommand(c,where);
7817
7818 /* Fix the client structure with the original stuff */
7819 c->argv = orig_argv;
7820 c->argc = orig_argc;
7821 return;
7822 }
7823 }
7824 }
7825 }
7826 /* If the list is empty or the key does not exists we must block */
7827 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7828 if (timeout > 0) timeout += time(NULL);
7829 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7830 }
7831
7832 static void blpopCommand(redisClient *c) {
7833 blockingPopGenericCommand(c,REDIS_HEAD);
7834 }
7835
7836 static void brpopCommand(redisClient *c) {
7837 blockingPopGenericCommand(c,REDIS_TAIL);
7838 }
7839
7840 /* =============================== Replication ============================= */
7841
7842 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7843 ssize_t nwritten, ret = size;
7844 time_t start = time(NULL);
7845
7846 timeout++;
7847 while(size) {
7848 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7849 nwritten = write(fd,ptr,size);
7850 if (nwritten == -1) return -1;
7851 ptr += nwritten;
7852 size -= nwritten;
7853 }
7854 if ((time(NULL)-start) > timeout) {
7855 errno = ETIMEDOUT;
7856 return -1;
7857 }
7858 }
7859 return ret;
7860 }
7861
7862 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7863 ssize_t nread, totread = 0;
7864 time_t start = time(NULL);
7865
7866 timeout++;
7867 while(size) {
7868 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7869 nread = read(fd,ptr,size);
7870 if (nread == -1) return -1;
7871 ptr += nread;
7872 size -= nread;
7873 totread += nread;
7874 }
7875 if ((time(NULL)-start) > timeout) {
7876 errno = ETIMEDOUT;
7877 return -1;
7878 }
7879 }
7880 return totread;
7881 }
7882
/* Read a line from 'fd' one byte at a time, storing at most size-1
 * characters plus the nul terminator into 'ptr'.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is replaced with a nul byte. If the buffer fills up before a newline is
 * found, the (nul terminated) partial line read so far is returned.
 *
 * 'timeout' is passed as it is to syncRead() for every single byte.
 *
 * Returns the number of characters stored before the newline was found (a
 * stripped trailing '\r' is still counted), or -1 if syncRead() failed. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the nul terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One slot of the buffer was consumed. Without this decrement the
         * loop never terminates on long lines and writes past the end of
         * the caller's buffer. */
        size--;
    }
    return nread;
}
7903
7904 static void syncCommand(redisClient *c) {
7905 /* ignore SYNC if aleady slave or in monitor mode */
7906 if (c->flags & REDIS_SLAVE) return;
7907
7908 /* SYNC can't be issued when the server has pending data to send to
7909 * the client about already issued commands. We need a fresh reply
7910 * buffer registering the differences between the BGSAVE and the current
7911 * dataset, so that we can copy to other slaves if needed. */
7912 if (listLength(c->reply) != 0) {
7913 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7914 return;
7915 }
7916
7917 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7918 /* Here we need to check if there is a background saving operation
7919 * in progress, or if it is required to start one */
7920 if (server.bgsavechildpid != -1) {
7921 /* Ok a background save is in progress. Let's check if it is a good
7922 * one for replication, i.e. if there is another slave that is
7923 * registering differences since the server forked to save */
7924 redisClient *slave;
7925 listNode *ln;
7926 listIter li;
7927
7928 listRewind(server.slaves,&li);
7929 while((ln = listNext(&li))) {
7930 slave = ln->value;
7931 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7932 }
7933 if (ln) {
7934 /* Perfect, the server is already registering differences for
7935 * another slave. Set the right state, and copy the buffer. */
7936 listRelease(c->reply);
7937 c->reply = listDup(slave->reply);
7938 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7939 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7940 } else {
7941 /* No way, we need to wait for the next BGSAVE in order to
7942 * register differences */
7943 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7944 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7945 }
7946 } else {
7947 /* Ok we don't have a BGSAVE in progress, let's start one */
7948 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7949 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7950 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7951 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7952 return;
7953 }
7954 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7955 }
7956 c->repldbfd = -1;
7957 c->flags |= REDIS_SLAVE;
7958 c->slaveseldb = 0;
7959 listAddNodeTail(server.slaves,c);
7960 return;
7961 }
7962
7963 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7964 redisClient *slave = privdata;
7965 REDIS_NOTUSED(el);
7966 REDIS_NOTUSED(mask);
7967 char buf[REDIS_IOBUF_LEN];
7968 ssize_t nwritten, buflen;
7969
7970 if (slave->repldboff == 0) {
7971 /* Write the bulk write count before to transfer the DB. In theory here
7972 * we don't know how much room there is in the output buffer of the
7973 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7974 * operations) will never be smaller than the few bytes we need. */
7975 sds bulkcount;
7976
7977 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7978 slave->repldbsize);
7979 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7980 {
7981 sdsfree(bulkcount);
7982 freeClient(slave);
7983 return;
7984 }
7985 sdsfree(bulkcount);
7986 }
7987 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7988 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7989 if (buflen <= 0) {
7990 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7991 (buflen == 0) ? "premature EOF" : strerror(errno));
7992 freeClient(slave);
7993 return;
7994 }
7995 if ((nwritten = write(fd,buf,buflen)) == -1) {
7996 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7997 strerror(errno));
7998 freeClient(slave);
7999 return;
8000 }
8001 slave->repldboff += nwritten;
8002 if (slave->repldboff == slave->repldbsize) {
8003 close(slave->repldbfd);
8004 slave->repldbfd = -1;
8005 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8006 slave->replstate = REDIS_REPL_ONLINE;
8007 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
8008 sendReplyToClient, slave) == AE_ERR) {
8009 freeClient(slave);
8010 return;
8011 }
8012 addReplySds(slave,sdsempty());
8013 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8014 }
8015 }
8016
8017 /* This function is called at the end of every backgrond saving.
8018 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8019 * otherwise REDIS_ERR is passed to the function.
8020 *
8021 * The goal of this function is to handle slaves waiting for a successful
8022 * background saving in order to perform non-blocking synchronization. */
8023 static void updateSlavesWaitingBgsave(int bgsaveerr) {
8024 listNode *ln;
8025 int startbgsave = 0;
8026 listIter li;
8027
8028 listRewind(server.slaves,&li);
8029 while((ln = listNext(&li))) {
8030 redisClient *slave = ln->value;
8031
8032 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8033 startbgsave = 1;
8034 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8035 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
8036 struct redis_stat buf;
8037
8038 if (bgsaveerr != REDIS_OK) {
8039 freeClient(slave);
8040 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8041 continue;
8042 }
8043 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
8044 redis_fstat(slave->repldbfd,&buf) == -1) {
8045 freeClient(slave);
8046 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8047 continue;
8048 }
8049 slave->repldboff = 0;
8050 slave->repldbsize = buf.st_size;
8051 slave->replstate = REDIS_REPL_SEND_BULK;
8052 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8053 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
8054 freeClient(slave);
8055 continue;
8056 }
8057 }
8058 }
8059 if (startbgsave) {
8060 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8061 listIter li;
8062
8063 listRewind(server.slaves,&li);
8064 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
8065 while((ln = listNext(&li))) {
8066 redisClient *slave = ln->value;
8067
8068 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8069 freeClient(slave);
8070 }
8071 }
8072 }
8073 }
8074
8075 static int syncWithMaster(void) {
8076 char buf[1024], tmpfile[256], authcmd[1024];
8077 long dumpsize;
8078 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8079 int dfd, maxtries = 5;
8080
8081 if (fd == -1) {
8082 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8083 strerror(errno));
8084 return REDIS_ERR;
8085 }
8086
8087 /* AUTH with the master if required. */
8088 if(server.masterauth) {
8089 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8090 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8091 close(fd);
8092 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8093 strerror(errno));
8094 return REDIS_ERR;
8095 }
8096 /* Read the AUTH result. */
8097 if (syncReadLine(fd,buf,1024,3600) == -1) {
8098 close(fd);
8099 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8100 strerror(errno));
8101 return REDIS_ERR;
8102 }
8103 if (buf[0] != '+') {
8104 close(fd);
8105 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8106 return REDIS_ERR;
8107 }
8108 }
8109
8110 /* Issue the SYNC command */
8111 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8112 close(fd);
8113 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8114 strerror(errno));
8115 return REDIS_ERR;
8116 }
8117 /* Read the bulk write count */
8118 if (syncReadLine(fd,buf,1024,3600) == -1) {
8119 close(fd);
8120 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8121 strerror(errno));
8122 return REDIS_ERR;
8123 }
8124 if (buf[0] != '$') {
8125 close(fd);
8126 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8127 return REDIS_ERR;
8128 }
8129 dumpsize = strtol(buf+1,NULL,10);
8130 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8131 /* Read the bulk write data on a temp file */
8132 while(maxtries--) {
8133 snprintf(tmpfile,256,
8134 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8135 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8136 if (dfd != -1) break;
8137 sleep(1);
8138 }
8139 if (dfd == -1) {
8140 close(fd);
8141 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8142 return REDIS_ERR;
8143 }
8144 while(dumpsize) {
8145 int nread, nwritten;
8146
8147 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8148 if (nread == -1) {
8149 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8150 strerror(errno));
8151 close(fd);
8152 close(dfd);
8153 return REDIS_ERR;
8154 }
8155 nwritten = write(dfd,buf,nread);
8156 if (nwritten == -1) {
8157 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8158 close(fd);
8159 close(dfd);
8160 return REDIS_ERR;
8161 }
8162 dumpsize -= nread;
8163 }
8164 close(dfd);
8165 if (rename(tmpfile,server.dbfilename) == -1) {
8166 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8167 unlink(tmpfile);
8168 close(fd);
8169 return REDIS_ERR;
8170 }
8171 emptyDb();
8172 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8173 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8174 close(fd);
8175 return REDIS_ERR;
8176 }
8177 server.master = createClient(fd);
8178 server.master->flags |= REDIS_MASTER;
8179 server.master->authenticated = 1;
8180 server.replstate = REDIS_REPL_CONNECTED;
8181 return REDIS_OK;
8182 }
8183
8184 static void slaveofCommand(redisClient *c) {
8185 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8186 !strcasecmp(c->argv[2]->ptr,"one")) {
8187 if (server.masterhost) {
8188 sdsfree(server.masterhost);
8189 server.masterhost = NULL;
8190 if (server.master) freeClient(server.master);
8191 server.replstate = REDIS_REPL_NONE;
8192 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8193 }
8194 } else {
8195 sdsfree(server.masterhost);
8196 server.masterhost = sdsdup(c->argv[1]->ptr);
8197 server.masterport = atoi(c->argv[2]->ptr);
8198 if (server.master) freeClient(server.master);
8199 server.replstate = REDIS_REPL_CONNECT;
8200 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8201 server.masterhost, server.masterport);
8202 }
8203 addReply(c,shared.ok);
8204 }
8205
8206 /* ============================ Maxmemory directive ======================== */
8207
8208 /* Try to free one object form the pre-allocated objects free list.
8209 * This is useful under low mem conditions as by default we take 1 million
8210 * free objects allocated. On success REDIS_OK is returned, otherwise
8211 * REDIS_ERR. */
8212 static int tryFreeOneObjectFromFreelist(void) {
8213 robj *o;
8214
8215 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8216 if (listLength(server.objfreelist)) {
8217 listNode *head = listFirst(server.objfreelist);
8218 o = listNodeValue(head);
8219 listDelNode(server.objfreelist,head);
8220 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8221 zfree(o);
8222 return REDIS_OK;
8223 } else {
8224 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8225 return REDIS_ERR;
8226 }
8227 }
8228
8229 /* This function gets called when 'maxmemory' is set on the config file to limit
8230 * the max memory used by the server, and we are out of memory.
8231 * This function will try to, in order:
8232 *
8233 * - Free objects from the free list
8234 * - Try to remove keys with an EXPIRE set
8235 *
8236 * It is not possible to free enough memory to reach used-memory < maxmemory
8237 * the server will start refusing commands that will enlarge even more the
8238 * memory usage.
8239 */
8240 static void freeMemoryIfNeeded(void) {
8241 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8242 int j, k, freed = 0;
8243
8244 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8245 for (j = 0; j < server.dbnum; j++) {
8246 int minttl = -1;
8247 robj *minkey = NULL;
8248 struct dictEntry *de;
8249
8250 if (dictSize(server.db[j].expires)) {
8251 freed = 1;
8252 /* From a sample of three keys drop the one nearest to
8253 * the natural expire */
8254 for (k = 0; k < 3; k++) {
8255 time_t t;
8256
8257 de = dictGetRandomKey(server.db[j].expires);
8258 t = (time_t) dictGetEntryVal(de);
8259 if (minttl == -1 || t < minttl) {
8260 minkey = dictGetEntryKey(de);
8261 minttl = t;
8262 }
8263 }
8264 deleteKey(server.db+j,minkey);
8265 }
8266 }
8267 if (!freed) return; /* nothing to free... */
8268 }
8269 }
8270
8271 /* ============================== Append Only file ========================== */
8272
8273 /* Called when the user switches from "appendonly yes" to "appendonly no"
8274 * at runtime using the CONFIG command. */
8275 static void stopAppendOnly(void) {
8276 flushAppendOnlyFile();
8277 aof_fsync(server.appendfd);
8278 close(server.appendfd);
8279
8280 server.appendfd = -1;
8281 server.appendseldb = -1;
8282 server.appendonly = 0;
8283 /* rewrite operation in progress? kill it, wait child exit */
8284 if (server.bgsavechildpid != -1) {
8285 int statloc;
8286
8287 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8288 wait3(&statloc,0,NULL);
8289 /* reset the buffer accumulating changes while the child saves */
8290 sdsfree(server.bgrewritebuf);
8291 server.bgrewritebuf = sdsempty();
8292 server.bgsavechildpid = -1;
8293 }
8294 }
8295
8296 /* Called when the user switches from "appendonly no" to "appendonly yes"
8297 * at runtime using the CONFIG command. */
8298 static int startAppendOnly(void) {
8299 server.appendonly = 1;
8300 server.lastfsync = time(NULL);
8301 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8302 if (server.appendfd == -1) {
8303 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8304 return REDIS_ERR;
8305 }
8306 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8307 server.appendonly = 0;
8308 close(server.appendfd);
8309 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8310 return REDIS_ERR;
8311 }
8312 return REDIS_OK;
8313 }
8314
8315 /* Write the append only file buffer on disk.
8316 *
8317 * Since we are required to write the AOF before replying to the client,
8318 * and the only way the client socket can get a write is entering when the
8319 * the event loop, we accumulate all the AOF writes in a memory
8320 * buffer and write it on disk using this function just before entering
8321 * the event loop again. */
8322 static void flushAppendOnlyFile(void) {
8323 time_t now;
8324 ssize_t nwritten;
8325
8326 if (sdslen(server.aofbuf) == 0) return;
8327
8328 /* We want to perform a single write. This should be guaranteed atomic
8329 * at least if the filesystem we are writing is a real physical one.
8330 * While this will save us against the server being killed I don't think
8331 * there is much to do about the whole server stopping for power problems
8332 * or alike */
8333 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8334 if (nwritten != (signed)sdslen(server.aofbuf)) {
8335 /* Ooops, we are in troubles. The best thing to do for now is
8336 * aborting instead of giving the illusion that everything is
8337 * working as expected. */
8338 if (nwritten == -1) {
8339 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8340 } else {
8341 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8342 }
8343 exit(1);
8344 }
8345 sdsfree(server.aofbuf);
8346 server.aofbuf = sdsempty();
8347
8348 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8349 * childs performing heavy I/O on disk. */
8350 if (server.no_appendfsync_on_rewrite &&
8351 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8352 return;
8353 /* Fsync if needed */
8354 now = time(NULL);
8355 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8356 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8357 now-server.lastfsync > 1))
8358 {
8359 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8360 * flushing metadata. */
8361 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8362 server.lastfsync = now;
8363 }
8364 }
8365
8366 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8367 int j;
8368 buf = sdscatprintf(buf,"*%d\r\n",argc);
8369 for (j = 0; j < argc; j++) {
8370 robj *o = getDecodedObject(argv[j]);
8371 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8372 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8373 buf = sdscatlen(buf,"\r\n",2);
8374 decrRefCount(o);
8375 }
8376 return buf;
8377 }
8378
8379 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8380 int argc = 3;
8381 long when;
8382 robj *argv[3];
8383
8384 /* Make sure we can use strtol */
8385 seconds = getDecodedObject(seconds);
8386 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8387 decrRefCount(seconds);
8388
8389 argv[0] = createStringObject("EXPIREAT",8);
8390 argv[1] = key;
8391 argv[2] = createObject(REDIS_STRING,
8392 sdscatprintf(sdsempty(),"%ld",when));
8393 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8394 decrRefCount(argv[0]);
8395 decrRefCount(argv[2]);
8396 return buf;
8397 }
8398
8399 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8400 sds buf = sdsempty();
8401 robj *tmpargv[3];
8402
8403 /* The DB this command was targetting is not the same as the last command
8404 * we appendend. To issue a SELECT command is needed. */
8405 if (dictid != server.appendseldb) {
8406 char seldb[64];
8407
8408 snprintf(seldb,sizeof(seldb),"%d",dictid);
8409 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8410 (unsigned long)strlen(seldb),seldb);
8411 server.appendseldb = dictid;
8412 }
8413
8414 if (cmd->proc == expireCommand) {
8415 /* Translate EXPIRE into EXPIREAT */
8416 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8417 } else if (cmd->proc == setexCommand) {
8418 /* Translate SETEX to SET and EXPIREAT */
8419 tmpargv[0] = createStringObject("SET",3);
8420 tmpargv[1] = argv[1];
8421 tmpargv[2] = argv[3];
8422 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8423 decrRefCount(tmpargv[0]);
8424 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8425 } else {
8426 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8427 }
8428
8429 /* Append to the AOF buffer. This will be flushed on disk just before
8430 * of re-entering the event loop, so before the client will get a
8431 * positive reply about the operation performed. */
8432 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8433
8434 /* If a background append only file rewriting is in progress we want to
8435 * accumulate the differences between the child DB and the current one
8436 * in a buffer, so that when the child process will do its work we
8437 * can append the differences to the new append only file. */
8438 if (server.bgrewritechildpid != -1)
8439 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8440
8441 sdsfree(buf);
8442 }
8443
8444 /* In Redis commands are always executed in the context of a client, so in
8445 * order to load the append only file we need to create a fake client. */
8446 static struct redisClient *createFakeClient(void) {
8447 struct redisClient *c = zmalloc(sizeof(*c));
8448
8449 selectDb(c,0);
8450 c->fd = -1;
8451 c->querybuf = sdsempty();
8452 c->argc = 0;
8453 c->argv = NULL;
8454 c->flags = 0;
8455 /* We set the fake client as a slave waiting for the synchronization
8456 * so that Redis will not try to send replies to this client. */
8457 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8458 c->reply = listCreate();
8459 listSetFreeMethod(c->reply,decrRefCount);
8460 listSetDupMethod(c->reply,dupClientReplyValue);
8461 initClientMultiState(c);
8462 return c;
8463 }
8464
8465 static void freeFakeClient(struct redisClient *c) {
8466 sdsfree(c->querybuf);
8467 listRelease(c->reply);
8468 freeClientMultiState(c);
8469 zfree(c);
8470 }
8471
8472 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8473 * error (the append only file is zero-length) REDIS_ERR is returned. On
8474 * fatal error an error message is logged and the program exists. */
8475 int loadAppendOnlyFile(char *filename) {
8476 struct redisClient *fakeClient;
8477 FILE *fp = fopen(filename,"r");
8478 struct redis_stat sb;
8479 unsigned long long loadedkeys = 0;
8480 int appendonly = server.appendonly;
8481
8482 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8483 return REDIS_ERR;
8484
8485 if (fp == NULL) {
8486 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8487 exit(1);
8488 }
8489
8490 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8491 * to the same file we're about to read. */
8492 server.appendonly = 0;
8493
8494 fakeClient = createFakeClient();
8495 while(1) {
8496 int argc, j;
8497 unsigned long len;
8498 robj **argv;
8499 char buf[128];
8500 sds argsds;
8501 struct redisCommand *cmd;
8502
8503 if (fgets(buf,sizeof(buf),fp) == NULL) {
8504 if (feof(fp))
8505 break;
8506 else
8507 goto readerr;
8508 }
8509 if (buf[0] != '*') goto fmterr;
8510 argc = atoi(buf+1);
8511 argv = zmalloc(sizeof(robj*)*argc);
8512 for (j = 0; j < argc; j++) {
8513 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8514 if (buf[0] != '$') goto fmterr;
8515 len = strtol(buf+1,NULL,10);
8516 argsds = sdsnewlen(NULL,len);
8517 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8518 argv[j] = createObject(REDIS_STRING,argsds);
8519 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8520 }
8521
8522 /* Command lookup */
8523 cmd = lookupCommand(argv[0]->ptr);
8524 if (!cmd) {
8525 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8526 exit(1);
8527 }
8528 /* Try object encoding */
8529 if (cmd->flags & REDIS_CMD_BULK)
8530 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8531 /* Run the command in the context of a fake client */
8532 fakeClient->argc = argc;
8533 fakeClient->argv = argv;
8534 cmd->proc(fakeClient);
8535 /* Discard the reply objects list from the fake client */
8536 while(listLength(fakeClient->reply))
8537 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8538 /* Clean up, ready for the next command */
8539 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8540 zfree(argv);
8541 /* Handle swapping while loading big datasets when VM is on */
8542 loadedkeys++;
8543 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8544 while (zmalloc_used_memory() > server.vm_max_memory) {
8545 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8546 }
8547 }
8548 }
8549
8550 /* This point can only be reached when EOF is reached without errors.
8551 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8552 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8553
8554 fclose(fp);
8555 freeFakeClient(fakeClient);
8556 server.appendonly = appendonly;
8557 return REDIS_OK;
8558
8559 readerr:
8560 if (feof(fp)) {
8561 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8562 } else {
8563 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8564 }
8565 exit(1);
8566 fmterr:
8567 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8568 exit(1);
8569 }
8570
8571 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8572 static int fwriteBulkObject(FILE *fp, robj *obj) {
8573 char buf[128];
8574 int decrrc = 0;
8575
8576 /* Avoid the incr/decr ref count business if possible to help
8577 * copy-on-write (we are often in a child process when this function
8578 * is called).
8579 * Also makes sure that key objects don't get incrRefCount-ed when VM
8580 * is enabled */
8581 if (obj->encoding != REDIS_ENCODING_RAW) {
8582 obj = getDecodedObject(obj);
8583 decrrc = 1;
8584 }
8585 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8586 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8587 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8588 goto err;
8589 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8590 if (decrrc) decrRefCount(obj);
8591 return 1;
8592 err:
8593 if (decrrc) decrRefCount(obj);
8594 return 0;
8595 }
8596
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it is not accessed when 'len' is
 * zero). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: the conversion specifier has to be %lu, as %ld
     * does not match the argument type. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns 0 for a zero sized item, so skip empty payloads */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8608
/* Write the double 'd', formatted with "%.17g", in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is rendered together with its trailing CRLF, so that it
     * can be written with a single call. The CRLF is not part of the
     * announced count, hence the -2. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
8619
/* Write the decimal representation of 'l' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* Digits and trailing CRLF are rendered together; the CRLF is not
     * counted in the announced length, hence the -2. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
8630
8631 /* Write a sequence of commands able to fully rebuild the dataset into
8632 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8633 static int rewriteAppendOnlyFile(char *filename) {
8634 dictIterator *di = NULL;
8635 dictEntry *de;
8636 FILE *fp;
8637 char tmpfile[256];
8638 int j;
8639 time_t now = time(NULL);
8640
8641 /* Note that we have to use a different temp name here compared to the
8642 * one used by rewriteAppendOnlyFileBackground() function. */
8643 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8644 fp = fopen(tmpfile,"w");
8645 if (!fp) {
8646 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8647 return REDIS_ERR;
8648 }
8649 for (j = 0; j < server.dbnum; j++) {
8650 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8651 redisDb *db = server.db+j;
8652 dict *d = db->dict;
8653 if (dictSize(d) == 0) continue;
8654 di = dictGetIterator(d);
8655 if (!di) {
8656 fclose(fp);
8657 return REDIS_ERR;
8658 }
8659
8660 /* SELECT the new DB */
8661 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8662 if (fwriteBulkLong(fp,j) == 0) goto werr;
8663
8664 /* Iterate this DB writing every entry */
8665 while((de = dictNext(di)) != NULL) {
8666 robj *key, *o;
8667 time_t expiretime;
8668 int swapped;
8669
8670 key = dictGetEntryKey(de);
8671 o = dictGetEntryVal(de);
8672 /* If the value for this key is swapped, load a preview in memory.
8673 * We use a "swapped" flag to remember if we need to free the
8674 * value object instead to just increment the ref count anyway
8675 * in order to avoid copy-on-write of pages if we are forked() */
8676 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
8677 o->storage == REDIS_VM_SWAPPING) {
8678 swapped = 0;
8679 } else {
8680 o = vmPreviewObject(o);
8681 swapped = 1;
8682 }
8683 expiretime = getExpire(db,key);
8684
8685 /* Save the key and associated value */
8686 if (o->type == REDIS_STRING) {
8687 /* Emit a SET command */
8688 char cmd[]="*3\r\n$3\r\nSET\r\n";
8689 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8690 /* Key and value */
8691 if (fwriteBulkObject(fp,key) == 0) goto werr;
8692 if (fwriteBulkObject(fp,o) == 0) goto werr;
8693 } else if (o->type == REDIS_LIST) {
8694 /* Emit the RPUSHes needed to rebuild the list */
8695 list *list = o->ptr;
8696 listNode *ln;
8697 listIter li;
8698
8699 listRewind(list,&li);
8700 while((ln = listNext(&li))) {
8701 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8702 robj *eleobj = listNodeValue(ln);
8703
8704 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8705 if (fwriteBulkObject(fp,key) == 0) goto werr;
8706 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8707 }
8708 } else if (o->type == REDIS_SET) {
8709 /* Emit the SADDs needed to rebuild the set */
8710 dict *set = o->ptr;
8711 dictIterator *di = dictGetIterator(set);
8712 dictEntry *de;
8713
8714 while((de = dictNext(di)) != NULL) {
8715 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8716 robj *eleobj = dictGetEntryKey(de);
8717
8718 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8719 if (fwriteBulkObject(fp,key) == 0) goto werr;
8720 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8721 }
8722 dictReleaseIterator(di);
8723 } else if (o->type == REDIS_ZSET) {
8724 /* Emit the ZADDs needed to rebuild the sorted set */
8725 zset *zs = o->ptr;
8726 dictIterator *di = dictGetIterator(zs->dict);
8727 dictEntry *de;
8728
8729 while((de = dictNext(di)) != NULL) {
8730 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8731 robj *eleobj = dictGetEntryKey(de);
8732 double *score = dictGetEntryVal(de);
8733
8734 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8735 if (fwriteBulkObject(fp,key) == 0) goto werr;
8736 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8737 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8738 }
8739 dictReleaseIterator(di);
8740 } else if (o->type == REDIS_HASH) {
8741 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8742
8743 /* Emit the HSETs needed to rebuild the hash */
8744 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8745 unsigned char *p = zipmapRewind(o->ptr);
8746 unsigned char *field, *val;
8747 unsigned int flen, vlen;
8748
8749 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8750 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8751 if (fwriteBulkObject(fp,key) == 0) goto werr;
8752 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8753 return -1;
8754 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8755 return -1;
8756 }
8757 } else {
8758 dictIterator *di = dictGetIterator(o->ptr);
8759 dictEntry *de;
8760
8761 while((de = dictNext(di)) != NULL) {
8762 robj *field = dictGetEntryKey(de);
8763 robj *val = dictGetEntryVal(de);
8764
8765 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8766 if (fwriteBulkObject(fp,key) == 0) goto werr;
8767 if (fwriteBulkObject(fp,field) == -1) return -1;
8768 if (fwriteBulkObject(fp,val) == -1) return -1;
8769 }
8770 dictReleaseIterator(di);
8771 }
8772 } else {
8773 redisPanic("Unknown object type");
8774 }
8775 /* Save the expire time */
8776 if (expiretime != -1) {
8777 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8778 /* If this key is already expired skip it */
8779 if (expiretime < now) continue;
8780 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8781 if (fwriteBulkObject(fp,key) == 0) goto werr;
8782 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8783 }
8784 if (swapped) decrRefCount(o);
8785 }
8786 dictReleaseIterator(di);
8787 }
8788
8789 /* Make sure data will not remain on the OS's output buffers */
8790 fflush(fp);
8791 aof_fsync(fileno(fp));
8792 fclose(fp);
8793
8794 /* Use RENAME to make sure the DB file is changed atomically only
8795 * if the generate DB file is ok. */
8796 if (rename(tmpfile,filename) == -1) {
8797 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8798 unlink(tmpfile);
8799 return REDIS_ERR;
8800 }
8801 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8802 return REDIS_OK;
8803
8804 werr:
8805 fclose(fp);
8806 unlink(tmpfile);
8807 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8808 if (di) dictReleaseIterator(di);
8809 return REDIS_ERR;
8810 }
8811
8812 /* This is how rewriting of the append only file in background works:
8813 *
8814 * 1) The user calls BGREWRITEAOF
8815 * 2) Redis calls this function, that forks():
8816 * 2a) the child rewrite the append only file in a temp file.
8817 * 2b) the parent accumulates differences in server.bgrewritebuf.
8818 * 3) When the child finished '2a' exists.
8819 * 4) The parent will trap the exit code, if it's OK, will append the
8820 * data accumulated into server.bgrewritebuf into the temp file, and
8821 * finally will rename(2) the temp file in the actual file name.
8822 * The the new file is reopened as the new append only file. Profit!
8823 */
8824 static int rewriteAppendOnlyFileBackground(void) {
8825 pid_t childpid;
8826
8827 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8828 if (server.vm_enabled) waitEmptyIOJobsQueue();
8829 if ((childpid = fork()) == 0) {
8830 /* Child */
8831 char tmpfile[256];
8832
8833 if (server.vm_enabled) vmReopenSwapFile();
8834 close(server.fd);
8835 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8836 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8837 _exit(0);
8838 } else {
8839 _exit(1);
8840 }
8841 } else {
8842 /* Parent */
8843 if (childpid == -1) {
8844 redisLog(REDIS_WARNING,
8845 "Can't rewrite append only file in background: fork: %s",
8846 strerror(errno));
8847 return REDIS_ERR;
8848 }
8849 redisLog(REDIS_NOTICE,
8850 "Background append only file rewriting started by pid %d",childpid);
8851 server.bgrewritechildpid = childpid;
8852 updateDictResizePolicy();
8853 /* We set appendseldb to -1 in order to force the next call to the
8854 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8855 * accumulated by the parent into server.bgrewritebuf will start
8856 * with a SELECT statement and it will be safe to merge. */
8857 server.appendseldb = -1;
8858 return REDIS_OK;
8859 }
8860 return REDIS_OK; /* unreached */
8861 }
8862
8863 static void bgrewriteaofCommand(redisClient *c) {
8864 if (server.bgrewritechildpid != -1) {
8865 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8866 return;
8867 }
8868 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8869 char *status = "+Background append only file rewriting started\r\n";
8870 addReplySds(c,sdsnew(status));
8871 } else {
8872 addReply(c,shared.err);
8873 }
8874 }
8875
/* Remove the temp file named "temp-rewriteaof-bg-<childpid>.aof", that is
 * the name used by the background rewrite child having pid 'childpid'.
 * The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8882
8883 /* Virtual Memory is composed mainly of two subsystems:
8884 * - Blocking Virtual Memory
8885 * - Threaded Virtual Memory I/O
8886 * The two parts are not fully decoupled, but functions are split among two
8887 * different sections of the source code (delimited by comments) in order to
8888 * make more clear what functionality is about the blocking VM and what about
8889 * the threaded (not blocking) VM.
8890 *
8891 * Redis VM design:
8892 *
8893 * Redis VM is a blocking VM (one that blocks reading swapped values from
8894 * disk into memory when a value swapped out is needed in memory) that is made
8895 * unblocking by trying to examine the command argument vector in order to
8896 * load in background values that will likely be needed in order to exec
8897 * the command. The command is executed only once all the relevant keys
8898 * are loaded into memory.
8899 *
8900 * This is basically almost as simple as a blocking VM, but almost as
8901 * parallel as a fully non-blocking VM.
8902 */
8903
8904 /* =================== Virtual Memory - Blocking Side ====================== */
8905
8906 /* Create a VM pointer object. This kind of objects are used in place of
8907 * values in the key -> value hash table, for swapped out objects. */
8908 static vmpointer *createVmPointer(int vtype) {
8909 vmpointer *vp = zmalloc(sizeof(vmpointer));
8910
8911 vp->type = REDIS_VMPOINTER;
8912 vp->storage = REDIS_VM_SWAPPED;
8913 vp->vtype = vtype;
8914 return vp;
8915 }
8916
8917 static void vmInit(void) {
8918 off_t totsize;
8919 int pipefds[2];
8920 size_t stacksize;
8921 struct flock fl;
8922
8923 if (server.vm_max_threads != 0)
8924 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8925
8926 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8927 /* Try to open the old swap file, otherwise create it */
8928 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8929 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8930 }
8931 if (server.vm_fp == NULL) {
8932 redisLog(REDIS_WARNING,
8933 "Can't open the swap file: %s. Exiting.",
8934 strerror(errno));
8935 exit(1);
8936 }
8937 server.vm_fd = fileno(server.vm_fp);
8938 /* Lock the swap file for writing, this is useful in order to avoid
8939 * another instance to use the same swap file for a config error. */
8940 fl.l_type = F_WRLCK;
8941 fl.l_whence = SEEK_SET;
8942 fl.l_start = fl.l_len = 0;
8943 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8944 redisLog(REDIS_WARNING,
8945 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8946 exit(1);
8947 }
8948 /* Initialize */
8949 server.vm_next_page = 0;
8950 server.vm_near_pages = 0;
8951 server.vm_stats_used_pages = 0;
8952 server.vm_stats_swapped_objects = 0;
8953 server.vm_stats_swapouts = 0;
8954 server.vm_stats_swapins = 0;
8955 totsize = server.vm_pages*server.vm_page_size;
8956 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8957 if (ftruncate(server.vm_fd,totsize) == -1) {
8958 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8959 strerror(errno));
8960 exit(1);
8961 } else {
8962 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8963 }
8964 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8965 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8966 (long long) (server.vm_pages+7)/8, server.vm_pages);
8967 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8968
8969 /* Initialize threaded I/O (used by Virtual Memory) */
8970 server.io_newjobs = listCreate();
8971 server.io_processing = listCreate();
8972 server.io_processed = listCreate();
8973 server.io_ready_clients = listCreate();
8974 pthread_mutex_init(&server.io_mutex,NULL);
8975 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8976 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8977 server.io_active_threads = 0;
8978 if (pipe(pipefds) == -1) {
8979 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8980 ,strerror(errno));
8981 exit(1);
8982 }
8983 server.io_ready_pipe_read = pipefds[0];
8984 server.io_ready_pipe_write = pipefds[1];
8985 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8986 /* LZF requires a lot of stack */
8987 pthread_attr_init(&server.io_threads_attr);
8988 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8989 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8990 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8991 /* Listen for events in the threaded I/O pipe */
8992 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8993 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8994 oom("creating file event");
8995 }
8996
8997 /* Mark the page as used */
8998 static void vmMarkPageUsed(off_t page) {
8999 off_t byte = page/8;
9000 int bit = page&7;
9001 redisAssert(vmFreePage(page) == 1);
9002 server.vm_bitmap[byte] |= 1<<bit;
9003 }
9004
9005 /* Mark N contiguous pages as used, with 'page' being the first. */
9006 static void vmMarkPagesUsed(off_t page, off_t count) {
9007 off_t j;
9008
9009 for (j = 0; j < count; j++)
9010 vmMarkPageUsed(page+j);
9011 server.vm_stats_used_pages += count;
9012 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9013 (long long)count, (long long)page);
9014 }
9015
9016 /* Mark the page as free */
9017 static void vmMarkPageFree(off_t page) {
9018 off_t byte = page/8;
9019 int bit = page&7;
9020 redisAssert(vmFreePage(page) == 0);
9021 server.vm_bitmap[byte] &= ~(1<<bit);
9022 }
9023
9024 /* Mark N contiguous pages as free, with 'page' being the first. */
9025 static void vmMarkPagesFree(off_t page, off_t count) {
9026 off_t j;
9027
9028 for (j = 0; j < count; j++)
9029 vmMarkPageFree(page+j);
9030 server.vm_stats_used_pages -= count;
9031 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9032 (long long)count, (long long)page);
9033 }
9034
9035 /* Test if the page is free */
9036 static int vmFreePage(off_t page) {
9037 off_t byte = page/8;
9038 int bit = page&7;
9039 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
9040 }
9041
9042 /* Find N contiguous free pages storing the first page of the cluster in *first.
9043 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
9044 * REDIS_ERR is returned.
9045 *
9046 * This function uses a simple algorithm: we try to allocate
9047 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9048 * again from the start of the swap file searching for free spaces.
9049 *
9050 * If it looks pretty clear that there are no free pages near our offset
9051 * we try to find less populated places doing a forward jump of
9052 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9053 * without hurry, and then we jump again and so forth...
9054 *
9055 * This function can be improved using a free list to avoid to guess
9056 * too much, since we could collect data about freed pages.
9057 *
9058 * note: I implemented this function just after watching an episode of
9059 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9060 */
9061 static int vmFindContiguousPages(off_t *first, off_t n) {
9062 off_t base, offset = 0, since_jump = 0, numfree = 0;
9063
9064 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9065 server.vm_near_pages = 0;
9066 server.vm_next_page = 0;
9067 }
9068 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9069 base = server.vm_next_page;
9070
9071 while(offset < server.vm_pages) {
9072 off_t this = base+offset;
9073
9074 /* If we overflow, restart from page zero */
9075 if (this >= server.vm_pages) {
9076 this -= server.vm_pages;
9077 if (this == 0) {
9078 /* Just overflowed, what we found on tail is no longer
9079 * interesting, as it's no longer contiguous. */
9080 numfree = 0;
9081 }
9082 }
9083 if (vmFreePage(this)) {
9084 /* This is a free page */
9085 numfree++;
9086 /* Already got N free pages? Return to the caller, with success */
9087 if (numfree == n) {
9088 *first = this-(n-1);
9089 server.vm_next_page = this+1;
9090 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
9091 return REDIS_OK;
9092 }
9093 } else {
9094 /* The current one is not a free page */
9095 numfree = 0;
9096 }
9097
9098 /* Fast-forward if the current page is not free and we already
9099 * searched enough near this place. */
9100 since_jump++;
9101 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9102 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9103 since_jump = 0;
9104 /* Note that even if we rewind after the jump, we are don't need
9105 * to make sure numfree is set to zero as we only jump *if* it
9106 * is set to zero. */
9107 } else {
9108 /* Otherwise just check the next page */
9109 offset++;
9110 }
9111 }
9112 return REDIS_ERR;
9113 }
9114
9115 /* Write the specified object at the specified page of the swap file */
9116 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9117 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9118 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9119 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9120 redisLog(REDIS_WARNING,
9121 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9122 strerror(errno));
9123 return REDIS_ERR;
9124 }
9125 rdbSaveObject(server.vm_fp,o);
9126 fflush(server.vm_fp);
9127 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9128 return REDIS_OK;
9129 }
9130
9131 /* Transfers the 'val' object to disk. Store all the information
9132 * a 'vmpointer' object containing all the information needed to load the
9133 * object back later is returned.
9134 *
9135 * If we can't find enough contiguous empty pages to swap the object on disk
9136 * NULL is returned. */
9137 static vmpointer *vmSwapObjectBlocking(robj *val) {
9138 off_t pages = rdbSavedObjectPages(val,NULL);
9139 off_t page;
9140 vmpointer *vp;
9141
9142 assert(val->storage == REDIS_VM_MEMORY);
9143 assert(val->refcount == 1);
9144 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9145 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9146
9147 vp = createVmPointer(val->type);
9148 vp->page = page;
9149 vp->usedpages = pages;
9150 decrRefCount(val); /* Deallocate the object from memory. */
9151 vmMarkPagesUsed(page,pages);
9152 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9153 (void*) val,
9154 (unsigned long long) page, (unsigned long long) pages);
9155 server.vm_stats_swapped_objects++;
9156 server.vm_stats_swapouts++;
9157 return vp;
9158 }
9159
9160 static robj *vmReadObjectFromSwap(off_t page, int type) {
9161 robj *o;
9162
9163 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9164 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9165 redisLog(REDIS_WARNING,
9166 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9167 strerror(errno));
9168 _exit(1);
9169 }
9170 o = rdbLoadObject(type,server.vm_fp);
9171 if (o == NULL) {
9172 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9173 _exit(1);
9174 }
9175 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9176 return o;
9177 }
9178
9179 /* Load the specified object from swap to memory.
9180 * The newly allocated object is returned.
9181 *
9182 * If preview is true the unserialized object is returned to the caller but
9183 * the pages are not marked as freed, nor the vp object is freed. */
9184 static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
9185 robj *val;
9186
9187 redisAssert(vp->type == REDIS_VMPOINTER &&
9188 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9189 val = vmReadObjectFromSwap(vp->page,vp->vtype);
9190 if (!preview) {
9191 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9192 vmMarkPagesFree(vp->page,vp->usedpages);
9193 zfree(vp);
9194 server.vm_stats_swapped_objects--;
9195 } else {
9196 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
9197 }
9198 server.vm_stats_swapins++;
9199 return val;
9200 }
9201
9202 /* Plain object loading, from swap to memory.
9203 *
9204 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9205 * The return value is the loaded object. */
9206 static robj *vmLoadObject(robj *o) {
9207 /* If we are loading the object in background, stop it, we
9208 * need to load this object synchronously ASAP. */
9209 if (o->storage == REDIS_VM_LOADING)
9210 vmCancelThreadedIOJob(o);
9211 return vmGenericLoadObject((vmpointer*)o,0);
9212 }
9213
9214 /* Just load the value on disk, without to modify the key.
9215 * This is useful when we want to perform some operation on the value
9216 * without to really bring it from swap to memory, like while saving the
9217 * dataset or rewriting the append only log. */
9218 static robj *vmPreviewObject(robj *o) {
9219 return vmGenericLoadObject((vmpointer*)o,1);
9220 }
9221
9222 /* How a good candidate is this object for swapping?
9223 * The better candidate it is, the greater the returned value.
9224 *
9225 * Currently we try to perform a fast estimation of the object size in
9226 * memory, and combine it with aging informations.
9227 *
9228 * Basically swappability = idle-time * log(estimated size)
9229 *
9230 * Bigger objects are preferred over smaller objects, but not
9231 * proportionally, this is why we use the logarithm. This algorithm is
9232 * just a first try and will probably be tuned later. */
9233 static double computeObjectSwappability(robj *o) {
9234 /* actual age can be >= minage, but not < minage. As we use wrapping
9235 * 21 bit clocks with minutes resolution for the LRU. */
9236 time_t minage = abs(server.lruclock - o->lru);
9237 long asize = 0;
9238 list *l;
9239 dict *d;
9240 struct dictEntry *de;
9241 int z;
9242
9243 if (minage <= 0) return 0;
9244 switch(o->type) {
9245 case REDIS_STRING:
9246 if (o->encoding != REDIS_ENCODING_RAW) {
9247 asize = sizeof(*o);
9248 } else {
9249 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9250 }
9251 break;
9252 case REDIS_LIST:
9253 l = o->ptr;
9254 listNode *ln = listFirst(l);
9255
9256 asize = sizeof(list);
9257 if (ln) {
9258 robj *ele = ln->value;
9259 long elesize;
9260
9261 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9262 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9263 asize += (sizeof(listNode)+elesize)*listLength(l);
9264 }
9265 break;
9266 case REDIS_SET:
9267 case REDIS_ZSET:
9268 z = (o->type == REDIS_ZSET);
9269 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9270
9271 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9272 if (z) asize += sizeof(zset)-sizeof(dict);
9273 if (dictSize(d)) {
9274 long elesize;
9275 robj *ele;
9276
9277 de = dictGetRandomKey(d);
9278 ele = dictGetEntryKey(de);
9279 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9280 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9281 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9282 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9283 }
9284 break;
9285 case REDIS_HASH:
9286 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9287 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9288 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9289 unsigned int klen, vlen;
9290 unsigned char *key, *val;
9291
9292 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9293 klen = 0;
9294 vlen = 0;
9295 }
9296 asize = len*(klen+vlen+3);
9297 } else if (o->encoding == REDIS_ENCODING_HT) {
9298 d = o->ptr;
9299 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9300 if (dictSize(d)) {
9301 long elesize;
9302 robj *ele;
9303
9304 de = dictGetRandomKey(d);
9305 ele = dictGetEntryKey(de);
9306 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9307 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9308 ele = dictGetEntryVal(de);
9309 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9310 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9311 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9312 }
9313 }
9314 break;
9315 }
9316 return (double)minage*log(1+asize);
9317 }
9318
9319 /* Try to swap an object that's a good candidate for swapping.
9320 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9321 * to swap any object at all.
9322 *
9323 * If 'usethreaded' is true, Redis will try to swap the object in background
9324 * using I/O threads. */
9325 static int vmSwapOneObject(int usethreads) {
9326 int j, i;
9327 struct dictEntry *best = NULL;
9328 double best_swappability = 0;
9329 redisDb *best_db = NULL;
9330 robj *key, *val;
9331
9332 for (j = 0; j < server.dbnum; j++) {
9333 redisDb *db = server.db+j;
9334 /* Why maxtries is set to 100?
9335 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9336 * are swappable objects */
9337 int maxtries = 100;
9338
9339 if (dictSize(db->dict) == 0) continue;
9340 for (i = 0; i < 5; i++) {
9341 dictEntry *de;
9342 double swappability;
9343
9344 if (maxtries) maxtries--;
9345 de = dictGetRandomKey(db->dict);
9346 key = dictGetEntryKey(de);
9347 val = dictGetEntryVal(de);
9348 /* Only swap objects that are currently in memory.
9349 *
9350 * Also don't swap shared objects: not a good idea in general and
9351 * we need to ensure that the main thread does not touch the
9352 * object while the I/O thread is using it, but we can't
9353 * control other keys without adding additional mutex. */
9354 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
9355 if (maxtries) i--; /* don't count this try */
9356 continue;
9357 }
9358 swappability = computeObjectSwappability(val);
9359 if (!best || swappability > best_swappability) {
9360 best = de;
9361 best_swappability = swappability;
9362 best_db = db;
9363 }
9364 }
9365 }
9366 if (best == NULL) return REDIS_ERR;
9367 key = dictGetEntryKey(best);
9368 val = dictGetEntryVal(best);
9369
9370 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9371 key->ptr, best_swappability);
9372
9373 /* Swap it */
9374 if (usethreads) {
9375 vmSwapObjectThreaded(key,val,best_db);
9376 return REDIS_OK;
9377 } else {
9378 vmpointer *vp;
9379
9380 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9381 dictGetEntryVal(best) = vp;
9382 return REDIS_OK;
9383 } else {
9384 return REDIS_ERR;
9385 }
9386 }
9387 }
9388
/* Swap out one object without using the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9392
/* Swap out one object using the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9396
9397 /* Return true if it's safe to swap out objects in a given moment.
9398 * Basically we don't want to swap objects out while there is a BGSAVE
9399 * or a BGAEOREWRITE running in backgroud. */
9400 static int vmCanSwapOut(void) {
9401 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9402 }
9403
9404 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9405 * and was deleted. Otherwise 0 is returned. */
9406 static int deleteIfSwapped(redisDb *db, robj *key) {
9407 robj *val;
9408
9409 if ((val = dictFetchValue(db->dict,key)) == NULL) return 0;
9410 if (val->storage == REDIS_VM_MEMORY) return 0;
9411 deleteKey(db,key);
9412 return 1;
9413 }
9414
9415 /* =================== Virtual Memory - Threaded I/O ======================= */
9416
9417 static void freeIOJob(iojob *j) {
9418 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9419 j->type == REDIS_IOJOB_DO_SWAP ||
9420 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9421 {
9422 /* we fix the storage type, otherwise decrRefCount() will try to
9423 * kill the I/O thread Job (that does no longer exists). */
9424 if (j->val->storage == REDIS_VM_SWAPPING)
9425 j->val->storage = REDIS_VM_MEMORY;
9426 decrRefCount(j->val);
9427 }
9428 decrRefCount(j->key);
9429 zfree(j);
9430 }
9431
9432 /* Every time a thread finished a Job, it writes a byte into the write side
9433 * of an unix pipe in order to "awake" the main thread, and this function
9434 * is called. */
9435 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9436 int mask)
9437 {
9438 char buf[1];
9439 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9440 REDIS_NOTUSED(el);
9441 REDIS_NOTUSED(mask);
9442 REDIS_NOTUSED(privdata);
9443
9444 /* For every byte we read in the read side of the pipe, there is one
9445 * I/O job completed to process. */
9446 while((retval = read(fd,buf,1)) == 1) {
9447 iojob *j;
9448 listNode *ln;
9449 struct dictEntry *de;
9450
9451 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9452
9453 /* Get the processed element (the oldest one) */
9454 lockThreadedIO();
9455 assert(listLength(server.io_processed) != 0);
9456 if (toprocess == -1) {
9457 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9458 if (toprocess <= 0) toprocess = 1;
9459 }
9460 ln = listFirst(server.io_processed);
9461 j = ln->value;
9462 listDelNode(server.io_processed,ln);
9463 unlockThreadedIO();
9464 /* If this job is marked as canceled, just ignore it */
9465 if (j->canceled) {
9466 freeIOJob(j);
9467 continue;
9468 }
9469 /* Post process it in the main thread, as there are things we
9470 * can do just here to avoid race conditions and/or invasive locks */
9471 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
9472 de = dictFind(j->db->dict,j->key);
9473 redisAssert(de != NULL);
9474 if (j->type == REDIS_IOJOB_LOAD) {
9475 redisDb *db;
9476 vmpointer *vp = dictGetEntryVal(de);
9477
9478 /* Key loaded, bring it at home */
9479 vmMarkPagesFree(vp->page,vp->usedpages);
9480 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9481 (unsigned char*) j->key->ptr);
9482 server.vm_stats_swapped_objects--;
9483 server.vm_stats_swapins++;
9484 dictGetEntryVal(de) = j->val;
9485 incrRefCount(j->val);
9486 db = j->db;
9487 /* Handle clients waiting for this key to be loaded. */
9488 handleClientsBlockedOnSwappedKey(db,j->key);
9489 freeIOJob(j);
9490 zfree(vp);
9491 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9492 /* Now we know the amount of pages required to swap this object.
9493 * Let's find some space for it, and queue this task again
9494 * rebranded as REDIS_IOJOB_DO_SWAP. */
9495 if (!vmCanSwapOut() ||
9496 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9497 {
9498 /* Ooops... no space or we can't swap as there is
9499 * a fork()ed Redis trying to save stuff on disk. */
9500 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
9501 freeIOJob(j);
9502 } else {
9503 /* Note that we need to mark this pages as used now,
9504 * if the job will be canceled, we'll mark them as freed
9505 * again. */
9506 vmMarkPagesUsed(j->page,j->pages);
9507 j->type = REDIS_IOJOB_DO_SWAP;
9508 lockThreadedIO();
9509 queueIOJob(j);
9510 unlockThreadedIO();
9511 }
9512 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9513 vmpointer *vp;
9514
9515 /* Key swapped. We can finally free some memory. */
9516 if (j->val->storage != REDIS_VM_SWAPPING) {
9517 vmpointer *vp = (vmpointer*) j->id;
9518 printf("storage: %d\n",vp->storage);
9519 printf("key->name: %s\n",(char*)j->key->ptr);
9520 printf("val: %p\n",(void*)j->val);
9521 printf("val->type: %d\n",j->val->type);
9522 printf("val->ptr: %s\n",(char*)j->val->ptr);
9523 }
9524 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9525 vp = createVmPointer(j->val->type);
9526 vp->page = j->page;
9527 vp->usedpages = j->pages;
9528 dictGetEntryVal(de) = vp;
9529 /* Fix the storage otherwise decrRefCount will attempt to
9530 * remove the associated I/O job */
9531 j->val->storage = REDIS_VM_MEMORY;
9532 decrRefCount(j->val);
9533 redisLog(REDIS_DEBUG,
9534 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9535 (unsigned char*) j->key->ptr,
9536 (unsigned long long) j->page, (unsigned long long) j->pages);
9537 server.vm_stats_swapped_objects++;
9538 server.vm_stats_swapouts++;
9539 freeIOJob(j);
9540 /* Put a few more swap requests in queue if we are still
9541 * out of memory */
9542 if (trytoswap && vmCanSwapOut() &&
9543 zmalloc_used_memory() > server.vm_max_memory)
9544 {
9545 int more = 1;
9546 while(more) {
9547 lockThreadedIO();
9548 more = listLength(server.io_newjobs) <
9549 (unsigned) server.vm_max_threads;
9550 unlockThreadedIO();
9551 /* Don't waste CPU time if swappable objects are rare. */
9552 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9553 trytoswap = 0;
9554 break;
9555 }
9556 }
9557 }
9558 }
9559 processed++;
9560 if (processed == toprocess) return;
9561 }
9562 if (retval < 0 && errno != EAGAIN) {
9563 redisLog(REDIS_WARNING,
9564 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9565 strerror(errno));
9566 }
9567 }
9568
9569 static void lockThreadedIO(void) {
9570 pthread_mutex_lock(&server.io_mutex);
9571 }
9572
9573 static void unlockThreadedIO(void) {
9574 pthread_mutex_unlock(&server.io_mutex);
9575 }
9576
9577 /* Remove the specified object from the threaded I/O queue if still not
9578 * processed, otherwise make sure to flag it as canceled. */
9579 static void vmCancelThreadedIOJob(robj *o) {
9580 list *lists[3] = {
9581 server.io_newjobs, /* 0 */
9582 server.io_processing, /* 1 */
9583 server.io_processed /* 2 */
9584 };
9585 int i;
9586
9587 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9588 again:
9589 lockThreadedIO();
9590 /* Search for a matching object in one of the queues */
9591 for (i = 0; i < 3; i++) {
9592 listNode *ln;
9593 listIter li;
9594
9595 listRewind(lists[i],&li);
9596 while ((ln = listNext(&li)) != NULL) {
9597 iojob *job = ln->value;
9598
9599 if (job->canceled) continue; /* Skip this, already canceled. */
9600 if (job->id == o) {
9601 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9602 (void*)job, (char*)o->ptr, job->type, i);
9603 /* Mark the pages as free since the swap didn't happened
9604 * or happened but is now discarded. */
9605 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9606 vmMarkPagesFree(job->page,job->pages);
9607 /* Cancel the job. It depends on the list the job is
9608 * living in. */
9609 switch(i) {
9610 case 0: /* io_newjobs */
9611 /* If the job was yet not processed the best thing to do
9612 * is to remove it from the queue at all */
9613 freeIOJob(job);
9614 listDelNode(lists[i],ln);
9615 break;
9616 case 1: /* io_processing */
9617 /* Oh Shi- the thread is messing with the Job:
9618 *
9619 * Probably it's accessing the object if this is a
9620 * PREPARE_SWAP or DO_SWAP job.
9621 * If it's a LOAD job it may be reading from disk and
9622 * if we don't wait for the job to terminate before to
9623 * cancel it, maybe in a few microseconds data can be
9624 * corrupted in this pages. So the short story is:
9625 *
9626 * Better to wait for the job to move into the
9627 * next queue (processed)... */
9628
9629 /* We try again and again until the job is completed. */
9630 unlockThreadedIO();
9631 /* But let's wait some time for the I/O thread
9632 * to finish with this job. After all this condition
9633 * should be very rare. */
9634 usleep(1);
9635 goto again;
9636 case 2: /* io_processed */
9637 /* The job was already processed, that's easy...
9638 * just mark it as canceled so that we'll ignore it
9639 * when processing completed jobs. */
9640 job->canceled = 1;
9641 break;
9642 }
9643 /* Finally we have to adjust the storage type of the object
9644 * in order to "UNDO" the operaiton. */
9645 if (o->storage == REDIS_VM_LOADING)
9646 o->storage = REDIS_VM_SWAPPED;
9647 else if (o->storage == REDIS_VM_SWAPPING)
9648 o->storage = REDIS_VM_MEMORY;
9649 unlockThreadedIO();
9650 redisLog(REDIS_DEBUG,"*** DONE");
9651 return;
9652 }
9653 }
9654 }
9655 unlockThreadedIO();
9656 printf("Not found: %p\n", (void*)o);
9657 redisAssert(1 != 1); /* We should never reach this */
9658 }
9659
9660 static void *IOThreadEntryPoint(void *arg) {
9661 iojob *j;
9662 listNode *ln;
9663 REDIS_NOTUSED(arg);
9664
9665 pthread_detach(pthread_self());
9666 while(1) {
9667 /* Get a new job to process */
9668 lockThreadedIO();
9669 if (listLength(server.io_newjobs) == 0) {
9670 /* No new jobs in queue, exit. */
9671 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9672 (long) pthread_self());
9673 server.io_active_threads--;
9674 unlockThreadedIO();
9675 return NULL;
9676 }
9677 ln = listFirst(server.io_newjobs);
9678 j = ln->value;
9679 listDelNode(server.io_newjobs,ln);
9680 /* Add the job in the processing queue */
9681 j->thread = pthread_self();
9682 listAddNodeTail(server.io_processing,j);
9683 ln = listLast(server.io_processing); /* We use ln later to remove it */
9684 unlockThreadedIO();
9685 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9686 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9687
9688 /* Process the Job */
9689 if (j->type == REDIS_IOJOB_LOAD) {
9690 vmpointer *vp = (vmpointer*)j->id;
9691 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
9692 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9693 FILE *fp = fopen("/dev/null","w+");
9694 j->pages = rdbSavedObjectPages(j->val,fp);
9695 fclose(fp);
9696 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9697 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9698 j->canceled = 1;
9699 }
9700
9701 /* Done: insert the job into the processed queue */
9702 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9703 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9704 lockThreadedIO();
9705 listDelNode(server.io_processing,ln);
9706 listAddNodeTail(server.io_processed,j);
9707 unlockThreadedIO();
9708
9709 /* Signal the main thread there is new stuff to process */
9710 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9711 }
9712 return NULL; /* never reached */
9713 }
9714
9715 static void spawnIOThread(void) {
9716 pthread_t thread;
9717 sigset_t mask, omask;
9718 int err;
9719
9720 sigemptyset(&mask);
9721 sigaddset(&mask,SIGCHLD);
9722 sigaddset(&mask,SIGHUP);
9723 sigaddset(&mask,SIGPIPE);
9724 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9725 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9726 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9727 strerror(err));
9728 usleep(1000000);
9729 }
9730 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9731 server.io_active_threads++;
9732 }
9733
9734 /* We need to wait for the last thread to exit before we are able to
9735 * fork() in order to BGSAVE or BGREWRITEAOF. */
9736 static void waitEmptyIOJobsQueue(void) {
9737 while(1) {
9738 int io_processed_len;
9739
9740 lockThreadedIO();
9741 if (listLength(server.io_newjobs) == 0 &&
9742 listLength(server.io_processing) == 0 &&
9743 server.io_active_threads == 0)
9744 {
9745 unlockThreadedIO();
9746 return;
9747 }
9748 /* While waiting for empty jobs queue condition we post-process some
9749 * finshed job, as I/O threads may be hanging trying to write against
9750 * the io_ready_pipe_write FD but there are so much pending jobs that
9751 * it's blocking. */
9752 io_processed_len = listLength(server.io_processed);
9753 unlockThreadedIO();
9754 if (io_processed_len) {
9755 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9756 usleep(1000); /* 1 millisecond */
9757 } else {
9758 usleep(10000); /* 10 milliseconds */
9759 }
9760 }
9761 }
9762
9763 static void vmReopenSwapFile(void) {
9764 /* Note: we don't close the old one as we are in the child process
9765 * and don't want to mess at all with the original file object. */
9766 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9767 if (server.vm_fp == NULL) {
9768 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9769 server.vm_swap_file);
9770 _exit(1);
9771 }
9772 server.vm_fd = fileno(server.vm_fp);
9773 }
9774
9775 /* This function must be called while with threaded IO locked */
9776 static void queueIOJob(iojob *j) {
9777 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9778 (void*)j, j->type, (char*)j->key->ptr);
9779 listAddNodeTail(server.io_newjobs,j);
9780 if (server.io_active_threads < server.vm_max_threads)
9781 spawnIOThread();
9782 }
9783
9784 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9785 iojob *j;
9786
9787 assert(key->storage == REDIS_VM_MEMORY);
9788
9789 j = zmalloc(sizeof(*j));
9790 j->type = REDIS_IOJOB_PREPARE_SWAP;
9791 j->db = db;
9792 j->key = key;
9793 incrRefCount(key);
9794 j->id = j->val = val;
9795 incrRefCount(val);
9796 j->canceled = 0;
9797 j->thread = (pthread_t) -1;
9798 val->storage = REDIS_VM_SWAPPING;
9799
9800 lockThreadedIO();
9801 queueIOJob(j);
9802 unlockThreadedIO();
9803 return REDIS_OK;
9804 }
9805
9806 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9807
9808 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9809 * If there is not already a job loading the key, it is craeted.
9810 * The key is added to the io_keys list in the client structure, and also
9811 * in the hash table mapping swapped keys to waiting clients, that is,
9812 * server.io_waited_keys. */
9813 static int waitForSwappedKey(redisClient *c, robj *key) {
9814 struct dictEntry *de;
9815 robj *o;
9816 list *l;
9817
9818 /* If the key does not exist or is already in RAM we don't need to
9819 * block the client at all. */
9820 de = dictFind(c->db->dict,key);
9821 if (de == NULL) return 0;
9822 o = dictGetEntryVal(de);
9823 if (o->storage == REDIS_VM_MEMORY) {
9824 return 0;
9825 } else if (o->storage == REDIS_VM_SWAPPING) {
9826 /* We were swapping the key, undo it! */
9827 vmCancelThreadedIOJob(o);
9828 return 0;
9829 }
9830
9831 /* OK: the key is either swapped, or being loaded just now. */
9832
9833 /* Add the key to the list of keys this client is waiting for.
9834 * This maps clients to keys they are waiting for. */
9835 listAddNodeTail(c->io_keys,key);
9836 incrRefCount(key);
9837
9838 /* Add the client to the swapped keys => clients waiting map. */
9839 de = dictFind(c->db->io_keys,key);
9840 if (de == NULL) {
9841 int retval;
9842
9843 /* For every key we take a list of clients blocked for it */
9844 l = listCreate();
9845 retval = dictAdd(c->db->io_keys,key,l);
9846 incrRefCount(key);
9847 assert(retval == DICT_OK);
9848 } else {
9849 l = dictGetEntryVal(de);
9850 }
9851 listAddNodeTail(l,c);
9852
9853 /* Are we already loading the key from disk? If not create a job */
9854 if (o->storage == REDIS_VM_SWAPPED) {
9855 iojob *j;
9856 vmpointer *vp = (vmpointer*)o;
9857
9858 o->storage = REDIS_VM_LOADING;
9859 j = zmalloc(sizeof(*j));
9860 j->type = REDIS_IOJOB_LOAD;
9861 j->db = c->db;
9862 j->id = (robj*)vp;
9863 j->key = key;
9864 incrRefCount(key);
9865 j->page = vp->page;
9866 j->val = NULL;
9867 j->canceled = 0;
9868 j->thread = (pthread_t) -1;
9869 lockThreadedIO();
9870 queueIOJob(j);
9871 unlockThreadedIO();
9872 }
9873 return 1;
9874 }
9875
9876 /* Preload keys for any command with first, last and step values for
9877 * the command keys prototype, as defined in the command table. */
9878 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9879 int j, last;
9880 if (cmd->vm_firstkey == 0) return;
9881 last = cmd->vm_lastkey;
9882 if (last < 0) last = argc+last;
9883 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9884 redisAssert(j < argc);
9885 waitForSwappedKey(c,argv[j]);
9886 }
9887 }
9888
9889 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9890 * Note that the number of keys to preload is user-defined, so we need to
9891 * apply a sanity check against argc. */
9892 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9893 int i, num;
9894 REDIS_NOTUSED(cmd);
9895
9896 num = atoi(argv[2]->ptr);
9897 if (num > (argc-3)) return;
9898 for (i = 0; i < num; i++) {
9899 waitForSwappedKey(c,argv[3+i]);
9900 }
9901 }
9902
9903 /* Preload keys needed to execute the entire MULTI/EXEC block.
9904 *
9905 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9906 * and will block the client when any command requires a swapped out value. */
9907 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9908 int i, margc;
9909 struct redisCommand *mcmd;
9910 robj **margv;
9911 REDIS_NOTUSED(cmd);
9912 REDIS_NOTUSED(argc);
9913 REDIS_NOTUSED(argv);
9914
9915 if (!(c->flags & REDIS_MULTI)) return;
9916 for (i = 0; i < c->mstate.count; i++) {
9917 mcmd = c->mstate.commands[i].cmd;
9918 margc = c->mstate.commands[i].argc;
9919 margv = c->mstate.commands[i].argv;
9920
9921 if (mcmd->vm_preload_proc != NULL) {
9922 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9923 } else {
9924 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9925 }
9926 }
9927 }
9928
9929 /* Is this client attempting to run a command against swapped keys?
9930 * If so, block it ASAP, load the keys in background, then resume it.
9931 *
9932 * The important idea about this function is that it can fail! If keys will
9933 * still be swapped when the client is resumed, this key lookups will
9934 * just block loading keys from disk. In practical terms this should only
9935 * happen with SORT BY command or if there is a bug in this function.
9936 *
9937 * Return 1 if the client is marked as blocked, 0 if the client can
9938 * continue as the keys it is going to access appear to be in memory. */
9939 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9940 if (cmd->vm_preload_proc != NULL) {
9941 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9942 } else {
9943 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9944 }
9945
9946 /* If the client was blocked for at least one key, mark it as blocked. */
9947 if (listLength(c->io_keys)) {
9948 c->flags |= REDIS_IO_WAIT;
9949 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9950 server.vm_blocked_clients++;
9951 return 1;
9952 } else {
9953 return 0;
9954 }
9955 }
9956
9957 /* Remove the 'key' from the list of blocked keys for a given client.
9958 *
9959 * The function returns 1 when there are no longer blocking keys after
9960 * the current one was removed (and the client can be unblocked). */
9961 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9962 list *l;
9963 listNode *ln;
9964 listIter li;
9965 struct dictEntry *de;
9966
9967 /* Remove the key from the list of keys this client is waiting for. */
9968 listRewind(c->io_keys,&li);
9969 while ((ln = listNext(&li)) != NULL) {
9970 if (equalStringObjects(ln->value,key)) {
9971 listDelNode(c->io_keys,ln);
9972 break;
9973 }
9974 }
9975 assert(ln != NULL);
9976
9977 /* Remove the client form the key => waiting clients map. */
9978 de = dictFind(c->db->io_keys,key);
9979 assert(de != NULL);
9980 l = dictGetEntryVal(de);
9981 ln = listSearchKey(l,c);
9982 assert(ln != NULL);
9983 listDelNode(l,ln);
9984 if (listLength(l) == 0)
9985 dictDelete(c->db->io_keys,key);
9986
9987 return listLength(c->io_keys) == 0;
9988 }
9989
9990 /* Every time we now a key was loaded back in memory, we handle clients
9991 * waiting for this key if any. */
9992 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9993 struct dictEntry *de;
9994 list *l;
9995 listNode *ln;
9996 int len;
9997
9998 de = dictFind(db->io_keys,key);
9999 if (!de) return;
10000
10001 l = dictGetEntryVal(de);
10002 len = listLength(l);
10003 /* Note: we can't use something like while(listLength(l)) as the list
10004 * can be freed by the calling function when we remove the last element. */
10005 while (len--) {
10006 ln = listFirst(l);
10007 redisClient *c = ln->value;
10008
10009 if (dontWaitForSwappedKey(c,key)) {
10010 /* Put the client in the list of clients ready to go as we
10011 * loaded all the keys about it. */
10012 listAddNodeTail(server.io_ready_clients,c);
10013 }
10014 }
10015 }
10016
10017 /* =========================== Remote Configuration ========================= */
10018
10019 static void configSetCommand(redisClient *c) {
10020 robj *o = getDecodedObject(c->argv[3]);
10021 long long ll;
10022
10023 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10024 zfree(server.dbfilename);
10025 server.dbfilename = zstrdup(o->ptr);
10026 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10027 zfree(server.requirepass);
10028 server.requirepass = zstrdup(o->ptr);
10029 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10030 zfree(server.masterauth);
10031 server.masterauth = zstrdup(o->ptr);
10032 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
10033 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10034 ll < 0) goto badfmt;
10035 server.maxmemory = ll;
10036 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10037 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10038 ll < 0 || ll > LONG_MAX) goto badfmt;
10039 server.maxidletime = ll;
10040 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10041 if (!strcasecmp(o->ptr,"no")) {
10042 server.appendfsync = APPENDFSYNC_NO;
10043 } else if (!strcasecmp(o->ptr,"everysec")) {
10044 server.appendfsync = APPENDFSYNC_EVERYSEC;
10045 } else if (!strcasecmp(o->ptr,"always")) {
10046 server.appendfsync = APPENDFSYNC_ALWAYS;
10047 } else {
10048 goto badfmt;
10049 }
10050 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10051 int yn = yesnotoi(o->ptr);
10052
10053 if (yn == -1) goto badfmt;
10054 server.no_appendfsync_on_rewrite = yn;
10055 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10056 int old = server.appendonly;
10057 int new = yesnotoi(o->ptr);
10058
10059 if (new == -1) goto badfmt;
10060 if (old != new) {
10061 if (new == 0) {
10062 stopAppendOnly();
10063 } else {
10064 if (startAppendOnly() == REDIS_ERR) {
10065 addReplySds(c,sdscatprintf(sdsempty(),
10066 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10067 decrRefCount(o);
10068 return;
10069 }
10070 }
10071 }
10072 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10073 int vlen, j;
10074 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10075
10076 /* Perform sanity check before setting the new config:
10077 * - Even number of args
10078 * - Seconds >= 1, changes >= 0 */
10079 if (vlen & 1) {
10080 sdsfreesplitres(v,vlen);
10081 goto badfmt;
10082 }
10083 for (j = 0; j < vlen; j++) {
10084 char *eptr;
10085 long val;
10086
10087 val = strtoll(v[j], &eptr, 10);
10088 if (eptr[0] != '\0' ||
10089 ((j & 1) == 0 && val < 1) ||
10090 ((j & 1) == 1 && val < 0)) {
10091 sdsfreesplitres(v,vlen);
10092 goto badfmt;
10093 }
10094 }
10095 /* Finally set the new config */
10096 resetServerSaveParams();
10097 for (j = 0; j < vlen; j += 2) {
10098 time_t seconds;
10099 int changes;
10100
10101 seconds = strtoll(v[j],NULL,10);
10102 changes = strtoll(v[j+1],NULL,10);
10103 appendServerSaveParams(seconds, changes);
10104 }
10105 sdsfreesplitres(v,vlen);
10106 } else {
10107 addReplySds(c,sdscatprintf(sdsempty(),
10108 "-ERR not supported CONFIG parameter %s\r\n",
10109 (char*)c->argv[2]->ptr));
10110 decrRefCount(o);
10111 return;
10112 }
10113 decrRefCount(o);
10114 addReply(c,shared.ok);
10115 return;
10116
10117 badfmt: /* Bad format errors */
10118 addReplySds(c,sdscatprintf(sdsempty(),
10119 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10120 (char*)o->ptr,
10121 (char*)c->argv[2]->ptr));
10122 decrRefCount(o);
10123 }
10124
10125 static void configGetCommand(redisClient *c) {
10126 robj *o = getDecodedObject(c->argv[2]);
10127 robj *lenobj = createObject(REDIS_STRING,NULL);
10128 char *pattern = o->ptr;
10129 int matches = 0;
10130
10131 addReply(c,lenobj);
10132 decrRefCount(lenobj);
10133
10134 if (stringmatch(pattern,"dbfilename",0)) {
10135 addReplyBulkCString(c,"dbfilename");
10136 addReplyBulkCString(c,server.dbfilename);
10137 matches++;
10138 }
10139 if (stringmatch(pattern,"requirepass",0)) {
10140 addReplyBulkCString(c,"requirepass");
10141 addReplyBulkCString(c,server.requirepass);
10142 matches++;
10143 }
10144 if (stringmatch(pattern,"masterauth",0)) {
10145 addReplyBulkCString(c,"masterauth");
10146 addReplyBulkCString(c,server.masterauth);
10147 matches++;
10148 }
10149 if (stringmatch(pattern,"maxmemory",0)) {
10150 char buf[128];
10151
10152 ll2string(buf,128,server.maxmemory);
10153 addReplyBulkCString(c,"maxmemory");
10154 addReplyBulkCString(c,buf);
10155 matches++;
10156 }
10157 if (stringmatch(pattern,"timeout",0)) {
10158 char buf[128];
10159
10160 ll2string(buf,128,server.maxidletime);
10161 addReplyBulkCString(c,"timeout");
10162 addReplyBulkCString(c,buf);
10163 matches++;
10164 }
10165 if (stringmatch(pattern,"appendonly",0)) {
10166 addReplyBulkCString(c,"appendonly");
10167 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10168 matches++;
10169 }
10170 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10171 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10172 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10173 matches++;
10174 }
10175 if (stringmatch(pattern,"appendfsync",0)) {
10176 char *policy;
10177
10178 switch(server.appendfsync) {
10179 case APPENDFSYNC_NO: policy = "no"; break;
10180 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10181 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10182 default: policy = "unknown"; break; /* too harmless to panic */
10183 }
10184 addReplyBulkCString(c,"appendfsync");
10185 addReplyBulkCString(c,policy);
10186 matches++;
10187 }
10188 if (stringmatch(pattern,"save",0)) {
10189 sds buf = sdsempty();
10190 int j;
10191
10192 for (j = 0; j < server.saveparamslen; j++) {
10193 buf = sdscatprintf(buf,"%ld %d",
10194 server.saveparams[j].seconds,
10195 server.saveparams[j].changes);
10196 if (j != server.saveparamslen-1)
10197 buf = sdscatlen(buf," ",1);
10198 }
10199 addReplyBulkCString(c,"save");
10200 addReplyBulkCString(c,buf);
10201 sdsfree(buf);
10202 matches++;
10203 }
10204 decrRefCount(o);
10205 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10206 }
10207
10208 static void configCommand(redisClient *c) {
10209 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10210 if (c->argc != 4) goto badarity;
10211 configSetCommand(c);
10212 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10213 if (c->argc != 3) goto badarity;
10214 configGetCommand(c);
10215 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10216 if (c->argc != 2) goto badarity;
10217 server.stat_numcommands = 0;
10218 server.stat_numconnections = 0;
10219 server.stat_expiredkeys = 0;
10220 server.stat_starttime = time(NULL);
10221 addReply(c,shared.ok);
10222 } else {
10223 addReplySds(c,sdscatprintf(sdsempty(),
10224 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10225 }
10226 return;
10227
10228 badarity:
10229 addReplySds(c,sdscatprintf(sdsempty(),
10230 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10231 (char*) c->argv[1]->ptr));
10232 }
10233
10234 /* =========================== Pubsub implementation ======================== */
10235
10236 static void freePubsubPattern(void *p) {
10237 pubsubPattern *pat = p;
10238
10239 decrRefCount(pat->pattern);
10240 zfree(pat);
10241 }
10242
10243 static int listMatchPubsubPattern(void *a, void *b) {
10244 pubsubPattern *pa = a, *pb = b;
10245
10246 return (pa->client == pb->client) &&
10247 (equalStringObjects(pa->pattern,pb->pattern));
10248 }
10249
10250 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10251 * 0 if the client was already subscribed to that channel. */
10252 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10253 struct dictEntry *de;
10254 list *clients = NULL;
10255 int retval = 0;
10256
10257 /* Add the channel to the client -> channels hash table */
10258 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10259 retval = 1;
10260 incrRefCount(channel);
10261 /* Add the client to the channel -> list of clients hash table */
10262 de = dictFind(server.pubsub_channels,channel);
10263 if (de == NULL) {
10264 clients = listCreate();
10265 dictAdd(server.pubsub_channels,channel,clients);
10266 incrRefCount(channel);
10267 } else {
10268 clients = dictGetEntryVal(de);
10269 }
10270 listAddNodeTail(clients,c);
10271 }
10272 /* Notify the client */
10273 addReply(c,shared.mbulk3);
10274 addReply(c,shared.subscribebulk);
10275 addReplyBulk(c,channel);
10276 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10277 return retval;
10278 }
10279
10280 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10281 * 0 if the client was not subscribed to the specified channel. */
10282 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10283 struct dictEntry *de;
10284 list *clients;
10285 listNode *ln;
10286 int retval = 0;
10287
10288 /* Remove the channel from the client -> channels hash table */
10289 incrRefCount(channel); /* channel may be just a pointer to the same object
10290 we have in the hash tables. Protect it... */
10291 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10292 retval = 1;
10293 /* Remove the client from the channel -> clients list hash table */
10294 de = dictFind(server.pubsub_channels,channel);
10295 assert(de != NULL);
10296 clients = dictGetEntryVal(de);
10297 ln = listSearchKey(clients,c);
10298 assert(ln != NULL);
10299 listDelNode(clients,ln);
10300 if (listLength(clients) == 0) {
10301 /* Free the list and associated hash entry at all if this was
10302 * the latest client, so that it will be possible to abuse
10303 * Redis PUBSUB creating millions of channels. */
10304 dictDelete(server.pubsub_channels,channel);
10305 }
10306 }
10307 /* Notify the client */
10308 if (notify) {
10309 addReply(c,shared.mbulk3);
10310 addReply(c,shared.unsubscribebulk);
10311 addReplyBulk(c,channel);
10312 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10313 listLength(c->pubsub_patterns));
10314
10315 }
10316 decrRefCount(channel); /* it is finally safe to release it */
10317 return retval;
10318 }
10319
10320 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10321 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10322 int retval = 0;
10323
10324 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10325 retval = 1;
10326 pubsubPattern *pat;
10327 listAddNodeTail(c->pubsub_patterns,pattern);
10328 incrRefCount(pattern);
10329 pat = zmalloc(sizeof(*pat));
10330 pat->pattern = getDecodedObject(pattern);
10331 pat->client = c;
10332 listAddNodeTail(server.pubsub_patterns,pat);
10333 }
10334 /* Notify the client */
10335 addReply(c,shared.mbulk3);
10336 addReply(c,shared.psubscribebulk);
10337 addReplyBulk(c,pattern);
10338 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10339 return retval;
10340 }
10341
10342 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10343 * 0 if the client was not subscribed to the specified channel. */
10344 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10345 listNode *ln;
10346 pubsubPattern pat;
10347 int retval = 0;
10348
10349 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10350 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10351 retval = 1;
10352 listDelNode(c->pubsub_patterns,ln);
10353 pat.client = c;
10354 pat.pattern = pattern;
10355 ln = listSearchKey(server.pubsub_patterns,&pat);
10356 listDelNode(server.pubsub_patterns,ln);
10357 }
10358 /* Notify the client */
10359 if (notify) {
10360 addReply(c,shared.mbulk3);
10361 addReply(c,shared.punsubscribebulk);
10362 addReplyBulk(c,pattern);
10363 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10364 listLength(c->pubsub_patterns));
10365 }
10366 decrRefCount(pattern);
10367 return retval;
10368 }
10369
10370 /* Unsubscribe from all the channels. Return the number of channels the
10371 * client was subscribed from. */
10372 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10373 dictIterator *di = dictGetIterator(c->pubsub_channels);
10374 dictEntry *de;
10375 int count = 0;
10376
10377 while((de = dictNext(di)) != NULL) {
10378 robj *channel = dictGetEntryKey(de);
10379
10380 count += pubsubUnsubscribeChannel(c,channel,notify);
10381 }
10382 dictReleaseIterator(di);
10383 return count;
10384 }
10385
10386 /* Unsubscribe from all the patterns. Return the number of patterns the
10387 * client was subscribed from. */
10388 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10389 listNode *ln;
10390 listIter li;
10391 int count = 0;
10392
10393 listRewind(c->pubsub_patterns,&li);
10394 while ((ln = listNext(&li)) != NULL) {
10395 robj *pattern = ln->value;
10396
10397 count += pubsubUnsubscribePattern(c,pattern,notify);
10398 }
10399 return count;
10400 }
10401
10402 /* Publish a message */
10403 static int pubsubPublishMessage(robj *channel, robj *message) {
10404 int receivers = 0;
10405 struct dictEntry *de;
10406 listNode *ln;
10407 listIter li;
10408
10409 /* Send to clients listening for that channel */
10410 de = dictFind(server.pubsub_channels,channel);
10411 if (de) {
10412 list *list = dictGetEntryVal(de);
10413 listNode *ln;
10414 listIter li;
10415
10416 listRewind(list,&li);
10417 while ((ln = listNext(&li)) != NULL) {
10418 redisClient *c = ln->value;
10419
10420 addReply(c,shared.mbulk3);
10421 addReply(c,shared.messagebulk);
10422 addReplyBulk(c,channel);
10423 addReplyBulk(c,message);
10424 receivers++;
10425 }
10426 }
10427 /* Send to clients listening to matching channels */
10428 if (listLength(server.pubsub_patterns)) {
10429 listRewind(server.pubsub_patterns,&li);
10430 channel = getDecodedObject(channel);
10431 while ((ln = listNext(&li)) != NULL) {
10432 pubsubPattern *pat = ln->value;
10433
10434 if (stringmatchlen((char*)pat->pattern->ptr,
10435 sdslen(pat->pattern->ptr),
10436 (char*)channel->ptr,
10437 sdslen(channel->ptr),0)) {
10438 addReply(pat->client,shared.mbulk4);
10439 addReply(pat->client,shared.pmessagebulk);
10440 addReplyBulk(pat->client,pat->pattern);
10441 addReplyBulk(pat->client,channel);
10442 addReplyBulk(pat->client,message);
10443 receivers++;
10444 }
10445 }
10446 decrRefCount(channel);
10447 }
10448 return receivers;
10449 }
10450
10451 static void subscribeCommand(redisClient *c) {
10452 int j;
10453
10454 for (j = 1; j < c->argc; j++)
10455 pubsubSubscribeChannel(c,c->argv[j]);
10456 }
10457
10458 static void unsubscribeCommand(redisClient *c) {
10459 if (c->argc == 1) {
10460 pubsubUnsubscribeAllChannels(c,1);
10461 return;
10462 } else {
10463 int j;
10464
10465 for (j = 1; j < c->argc; j++)
10466 pubsubUnsubscribeChannel(c,c->argv[j],1);
10467 }
10468 }
10469
10470 static void psubscribeCommand(redisClient *c) {
10471 int j;
10472
10473 for (j = 1; j < c->argc; j++)
10474 pubsubSubscribePattern(c,c->argv[j]);
10475 }
10476
10477 static void punsubscribeCommand(redisClient *c) {
10478 if (c->argc == 1) {
10479 pubsubUnsubscribeAllPatterns(c,1);
10480 return;
10481 } else {
10482 int j;
10483
10484 for (j = 1; j < c->argc; j++)
10485 pubsubUnsubscribePattern(c,c->argv[j],1);
10486 }
10487 }
10488
10489 static void publishCommand(redisClient *c) {
10490 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10491 addReplyLongLong(c,receivers);
10492 }
10493
/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
 *
 * The implementation uses a per-DB hash table mapping keys to the list of
 * clients WATCHing those keys, so that given a key that is going to be
 * modified we can mark all the associated clients as dirty.
 *
 * Every client also holds a list of its WATCHed keys, so that it is possible
 * to un-watch those keys when the client is freed or when UNWATCH is called. */
10502
10503 /* In the client->watched_keys list we need to use watchedKey structures
10504 * as in order to identify a key in Redis we need both the key name and the
10505 * DB */
10506 typedef struct watchedKey {
10507 robj *key;
10508 redisDb *db;
10509 } watchedKey;
10510
10511 /* Watch for the specified key */
10512 static void watchForKey(redisClient *c, robj *key) {
10513 list *clients = NULL;
10514 listIter li;
10515 listNode *ln;
10516 watchedKey *wk;
10517
10518 /* Check if we are already watching for this key */
10519 listRewind(c->watched_keys,&li);
10520 while((ln = listNext(&li))) {
10521 wk = listNodeValue(ln);
10522 if (wk->db == c->db && equalStringObjects(key,wk->key))
10523 return; /* Key already watched */
10524 }
10525 /* This key is not already watched in this DB. Let's add it */
10526 clients = dictFetchValue(c->db->watched_keys,key);
10527 if (!clients) {
10528 clients = listCreate();
10529 dictAdd(c->db->watched_keys,key,clients);
10530 incrRefCount(key);
10531 }
10532 listAddNodeTail(clients,c);
10533 /* Add the new key to the lits of keys watched by this client */
10534 wk = zmalloc(sizeof(*wk));
10535 wk->key = key;
10536 wk->db = c->db;
10537 incrRefCount(key);
10538 listAddNodeTail(c->watched_keys,wk);
10539 }
10540
10541 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10542 * flag is up to the caller. */
10543 static void unwatchAllKeys(redisClient *c) {
10544 listIter li;
10545 listNode *ln;
10546
10547 if (listLength(c->watched_keys) == 0) return;
10548 listRewind(c->watched_keys,&li);
10549 while((ln = listNext(&li))) {
10550 list *clients;
10551 watchedKey *wk;
10552
10553 /* Lookup the watched key -> clients list and remove the client
10554 * from the list */
10555 wk = listNodeValue(ln);
10556 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10557 assert(clients != NULL);
10558 listDelNode(clients,listSearchKey(clients,c));
10559 /* Kill the entry at all if this was the only client */
10560 if (listLength(clients) == 0)
10561 dictDelete(wk->db->watched_keys, wk->key);
10562 /* Remove this watched key from the client->watched list */
10563 listDelNode(c->watched_keys,ln);
10564 decrRefCount(wk->key);
10565 zfree(wk);
10566 }
10567 }
10568
10569 /* "Touch" a key, so that if this key is being WATCHed by some client the
10570 * next EXEC will fail. */
10571 static void touchWatchedKey(redisDb *db, robj *key) {
10572 list *clients;
10573 listIter li;
10574 listNode *ln;
10575
10576 if (dictSize(db->watched_keys) == 0) return;
10577 clients = dictFetchValue(db->watched_keys, key);
10578 if (!clients) return;
10579
10580 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10581 /* Check if we are already watching for this key */
10582 listRewind(clients,&li);
10583 while((ln = listNext(&li))) {
10584 redisClient *c = listNodeValue(ln);
10585
10586 c->flags |= REDIS_DIRTY_CAS;
10587 }
10588 }
10589
10590 /* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10591 * flush but will be deleted as effect of the flushing operation should
10592 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10593 * a FLUSHALL operation (all the DBs flushed). */
10594 static void touchWatchedKeysOnFlush(int dbid) {
10595 listIter li1, li2;
10596 listNode *ln;
10597
10598 /* For every client, check all the waited keys */
10599 listRewind(server.clients,&li1);
10600 while((ln = listNext(&li1))) {
10601 redisClient *c = listNodeValue(ln);
10602 listRewind(c->watched_keys,&li2);
10603 while((ln = listNext(&li2))) {
10604 watchedKey *wk = listNodeValue(ln);
10605
10606 /* For every watched key matching the specified DB, if the
10607 * key exists, mark the client as dirty, as the key will be
10608 * removed. */
10609 if (dbid == -1 || wk->db->id == dbid) {
10610 if (dictFind(wk->db->dict, wk->key) != NULL)
10611 c->flags |= REDIS_DIRTY_CAS;
10612 }
10613 }
10614 }
10615 }
10616
10617 static void watchCommand(redisClient *c) {
10618 int j;
10619
10620 if (c->flags & REDIS_MULTI) {
10621 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10622 return;
10623 }
10624 for (j = 1; j < c->argc; j++)
10625 watchForKey(c,c->argv[j]);
10626 addReply(c,shared.ok);
10627 }
10628
10629 static void unwatchCommand(redisClient *c) {
10630 unwatchAllKeys(c);
10631 c->flags &= (~REDIS_DIRTY_CAS);
10632 addReply(c,shared.ok);
10633 }
10634
10635 /* ================================= Debugging ============================== */
10636
10637 /* Compute the sha1 of string at 's' with 'len' bytes long.
10638 * The SHA1 is then xored againt the string pointed by digest.
10639 * Since xor is commutative, this operation is used in order to
10640 * "add" digests relative to unordered elements.
10641 *
10642 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10643 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10644 SHA1_CTX ctx;
10645 unsigned char hash[20], *s = ptr;
10646 int j;
10647
10648 SHA1Init(&ctx);
10649 SHA1Update(&ctx,s,len);
10650 SHA1Final(hash,&ctx);
10651
10652 for (j = 0; j < 20; j++)
10653 digest[j] ^= hash[j];
10654 }
10655
10656 static void xorObjectDigest(unsigned char *digest, robj *o) {
10657 o = getDecodedObject(o);
10658 xorDigest(digest,o->ptr,sdslen(o->ptr));
10659 decrRefCount(o);
10660 }
10661
10662 /* This function instead of just computing the SHA1 and xoring it
10663 * against diget, also perform the digest of "digest" itself and
10664 * replace the old value with the new one.
10665 *
10666 * So the final digest will be:
10667 *
10668 * digest = SHA1(digest xor SHA1(data))
10669 *
10670 * This function is used every time we want to preserve the order so
10671 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10672 *
10673 * Also note that mixdigest("foo") followed by mixdigest("bar")
10674 * will lead to a different digest compared to "fo", "obar".
10675 */
10676 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10677 SHA1_CTX ctx;
10678 char *s = ptr;
10679
10680 xorDigest(digest,s,len);
10681 SHA1Init(&ctx);
10682 SHA1Update(&ctx,digest,20);
10683 SHA1Final(digest,&ctx);
10684 }
10685
10686 static void mixObjectDigest(unsigned char *digest, robj *o) {
10687 o = getDecodedObject(o);
10688 mixDigest(digest,o->ptr,sdslen(o->ptr));
10689 decrRefCount(o);
10690 }
10691
10692 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10693 * are not ordered, we use a trick: every aggregate digest is the xor
10694 * of the digests of their elements. This way the order will not change
10695 * the result. For list instead we use a feedback entering the output digest
10696 * as input in order to ensure that a different ordered list will result in
10697 * a different digest. */
10698 static void computeDatasetDigest(unsigned char *final) {
10699 unsigned char digest[20];
10700 char buf[128];
10701 dictIterator *di = NULL;
10702 dictEntry *de;
10703 int j;
10704 uint32_t aux;
10705
10706 memset(final,0,20); /* Start with a clean result */
10707
10708 for (j = 0; j < server.dbnum; j++) {
10709 redisDb *db = server.db+j;
10710
10711 if (dictSize(db->dict) == 0) continue;
10712 di = dictGetIterator(db->dict);
10713
10714 /* hash the DB id, so the same dataset moved in a different
10715 * DB will lead to a different digest */
10716 aux = htonl(j);
10717 mixDigest(final,&aux,sizeof(aux));
10718
10719 /* Iterate this DB writing every entry */
10720 while((de = dictNext(di)) != NULL) {
10721 robj *key, *o, *kcopy;
10722 time_t expiretime;
10723
10724 memset(digest,0,20); /* This key-val digest */
10725 key = dictGetEntryKey(de);
10726
10727 if (!server.vm_enabled) {
10728 mixObjectDigest(digest,key);
10729 o = dictGetEntryVal(de);
10730 } else {
10731 /* Don't work with the key directly as when VM is active
10732 * this is unsafe: TODO: fix decrRefCount to check if the
10733 * count really reached 0 to avoid this mess */
10734 kcopy = dupStringObject(key);
10735 mixObjectDigest(digest,kcopy);
10736 o = lookupKeyRead(db,kcopy);
10737 decrRefCount(kcopy);
10738 }
10739 aux = htonl(o->type);
10740 mixDigest(digest,&aux,sizeof(aux));
10741 expiretime = getExpire(db,key);
10742
10743 /* Save the key and associated value */
10744 if (o->type == REDIS_STRING) {
10745 mixObjectDigest(digest,o);
10746 } else if (o->type == REDIS_LIST) {
10747 list *list = o->ptr;
10748 listNode *ln;
10749 listIter li;
10750
10751 listRewind(list,&li);
10752 while((ln = listNext(&li))) {
10753 robj *eleobj = listNodeValue(ln);
10754
10755 mixObjectDigest(digest,eleobj);
10756 }
10757 } else if (o->type == REDIS_SET) {
10758 dict *set = o->ptr;
10759 dictIterator *di = dictGetIterator(set);
10760 dictEntry *de;
10761
10762 while((de = dictNext(di)) != NULL) {
10763 robj *eleobj = dictGetEntryKey(de);
10764
10765 xorObjectDigest(digest,eleobj);
10766 }
10767 dictReleaseIterator(di);
10768 } else if (o->type == REDIS_ZSET) {
10769 zset *zs = o->ptr;
10770 dictIterator *di = dictGetIterator(zs->dict);
10771 dictEntry *de;
10772
10773 while((de = dictNext(di)) != NULL) {
10774 robj *eleobj = dictGetEntryKey(de);
10775 double *score = dictGetEntryVal(de);
10776 unsigned char eledigest[20];
10777
10778 snprintf(buf,sizeof(buf),"%.17g",*score);
10779 memset(eledigest,0,20);
10780 mixObjectDigest(eledigest,eleobj);
10781 mixDigest(eledigest,buf,strlen(buf));
10782 xorDigest(digest,eledigest,20);
10783 }
10784 dictReleaseIterator(di);
10785 } else if (o->type == REDIS_HASH) {
10786 hashIterator *hi;
10787 robj *obj;
10788
10789 hi = hashInitIterator(o);
10790 while (hashNext(hi) != REDIS_ERR) {
10791 unsigned char eledigest[20];
10792
10793 memset(eledigest,0,20);
10794 obj = hashCurrent(hi,REDIS_HASH_KEY);
10795 mixObjectDigest(eledigest,obj);
10796 decrRefCount(obj);
10797 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10798 mixObjectDigest(eledigest,obj);
10799 decrRefCount(obj);
10800 xorDigest(digest,eledigest,20);
10801 }
10802 hashReleaseIterator(hi);
10803 } else {
10804 redisPanic("Unknown object type");
10805 }
10806 /* If the key has an expire, add it to the mix */
10807 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10808 /* We can finally xor the key-val digest to the final digest */
10809 xorDigest(final,digest,20);
10810 }
10811 dictReleaseIterator(di);
10812 }
10813 }
10814
10815 static void debugCommand(redisClient *c) {
10816 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10817 *((char*)-1) = 'x';
10818 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10819 if (rdbSave(server.dbfilename) != REDIS_OK) {
10820 addReply(c,shared.err);
10821 return;
10822 }
10823 emptyDb();
10824 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10825 addReply(c,shared.err);
10826 return;
10827 }
10828 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10829 addReply(c,shared.ok);
10830 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10831 emptyDb();
10832 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10833 addReply(c,shared.err);
10834 return;
10835 }
10836 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10837 addReply(c,shared.ok);
10838 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10839 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10840 robj *key, *val;
10841
10842 if (!de) {
10843 addReply(c,shared.nokeyerr);
10844 return;
10845 }
10846 key = dictGetEntryKey(de);
10847 val = dictGetEntryVal(de);
10848 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
10849 val->storage == REDIS_VM_SWAPPING)) {
10850 char *strenc;
10851 char buf[128];
10852
10853 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10854 strenc = strencoding[val->encoding];
10855 } else {
10856 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10857 strenc = buf;
10858 }
10859 addReplySds(c,sdscatprintf(sdsempty(),
10860 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10861 "encoding:%s serializedlength:%lld\r\n",
10862 (void*)key, key->refcount, (void*)val, val->refcount,
10863 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10864 } else {
10865 vmpointer *vp = (vmpointer*) val;
10866 addReplySds(c,sdscatprintf(sdsempty(),
10867 "+Key at:%p refcount:%d, value swapped at: page %llu "
10868 "using %llu pages\r\n",
10869 (void*)key, key->refcount, (unsigned long long) vp->page,
10870 (unsigned long long) vp->usedpages));
10871 }
10872 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10873 lookupKeyRead(c->db,c->argv[2]);
10874 addReply(c,shared.ok);
10875 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10876 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10877 robj *key, *val;
10878 vmpointer *vp;
10879
10880 if (!server.vm_enabled) {
10881 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10882 return;
10883 }
10884 if (!de) {
10885 addReply(c,shared.nokeyerr);
10886 return;
10887 }
10888 key = dictGetEntryKey(de);
10889 val = dictGetEntryVal(de);
10890 /* Swap it */
10891 if (val->storage != REDIS_VM_MEMORY) {
10892 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10893 } else if (val->refcount != 1) {
10894 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
10895 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
10896 dictGetEntryVal(de) = vp;
10897 addReply(c,shared.ok);
10898 } else {
10899 addReply(c,shared.err);
10900 }
10901 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10902 long keys, j;
10903 robj *key, *val;
10904 char buf[128];
10905
10906 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10907 return;
10908 for (j = 0; j < keys; j++) {
10909 snprintf(buf,sizeof(buf),"key:%lu",j);
10910 key = createStringObject(buf,strlen(buf));
10911 if (lookupKeyRead(c->db,key) != NULL) {
10912 decrRefCount(key);
10913 continue;
10914 }
10915 snprintf(buf,sizeof(buf),"value:%lu",j);
10916 val = createStringObject(buf,strlen(buf));
10917 dictAdd(c->db->dict,key,val);
10918 }
10919 addReply(c,shared.ok);
10920 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10921 unsigned char digest[20];
10922 sds d = sdsnew("+");
10923 int j;
10924
10925 computeDatasetDigest(digest);
10926 for (j = 0; j < 20; j++)
10927 d = sdscatprintf(d, "%02x",digest[j]);
10928
10929 d = sdscatlen(d,"\r\n",2);
10930 addReplySds(c,d);
10931 } else {
10932 addReplySds(c,sdsnew(
10933 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10934 }
10935 }
10936
10937 static void _redisAssert(char *estr, char *file, int line) {
10938 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10939 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10940 #ifdef HAVE_BACKTRACE
10941 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10942 *((char*)-1) = 'x';
10943 #endif
10944 }
10945
10946 static void _redisPanic(char *msg, char *file, int line) {
10947 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10948 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10949 #ifdef HAVE_BACKTRACE
10950 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10951 *((char*)-1) = 'x';
10952 #endif
10953 }
10954
10955 /* =================================== Main! ================================ */
10956
10957 #ifdef __linux__
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the parsed value, or -1 if the file cannot be opened or read, or
 * if its content does not start with a number. The last case matters
 * because the caller treats 0 as "overcommit disabled"; a parse failure
 * must not be reported as 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
10971
10972 void linuxOvercommitMemoryWarning(void) {
10973 if (linuxOvercommitMemoryValue() == 0) {
10974 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10975 }
10976 }
10977 #endif /* __linux__ */
10978
10979 static void daemonize(void) {
10980 int fd;
10981 FILE *fp;
10982
10983 if (fork() != 0) exit(0); /* parent exits */
10984 setsid(); /* create a new session */
10985
10986 /* Every output goes to /dev/null. If Redis is daemonized but
10987 * the 'logfile' is set to 'stdout' in the configuration file
10988 * it will not log at all. */
10989 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10990 dup2(fd, STDIN_FILENO);
10991 dup2(fd, STDOUT_FILENO);
10992 dup2(fd, STDERR_FILENO);
10993 if (fd > STDERR_FILENO) close(fd);
10994 }
10995 /* Try to write the pid file */
10996 fp = fopen(server.pidfile,"w");
10997 if (fp) {
10998 fprintf(fp,"%d\n",getpid());
10999 fclose(fp);
11000 }
11001 }
11002
11003 static void version() {
11004 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11005 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
11006 exit(0);
11007 }
11008
/* Print the command line help on stderr, then exit with 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11014
11015 int main(int argc, char **argv) {
11016 time_t start;
11017
11018 initServerConfig();
11019 sortCommandTable();
11020 if (argc == 2) {
11021 if (strcmp(argv[1], "-v") == 0 ||
11022 strcmp(argv[1], "--version") == 0) version();
11023 if (strcmp(argv[1], "--help") == 0) usage();
11024 resetServerSaveParams();
11025 loadServerConfig(argv[1]);
11026 } else if ((argc > 2)) {
11027 usage();
11028 } else {
11029 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11030 }
11031 if (server.daemonize) daemonize();
11032 initServer();
11033 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11034 #ifdef __linux__
11035 linuxOvercommitMemoryWarning();
11036 #endif
11037 start = time(NULL);
11038 if (server.appendonly) {
11039 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
11040 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
11041 } else {
11042 if (rdbLoad(server.dbfilename) == REDIS_OK)
11043 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
11044 }
11045 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
11046 aeSetBeforeSleepProc(server.el,beforeSleep);
11047 aeMain(server.el);
11048 aeDeleteEventLoop(server.el);
11049 return 0;
11050 }
11051
11052 /* ============================= Backtrace support ========================= */
11053
11054 #ifdef HAVE_BACKTRACE
11055 static char *findFuncName(void *pointer, unsigned long *offset);
11056
11057 static void *getMcontextEip(ucontext_t *uc) {
11058 #if defined(__FreeBSD__)
11059 return (void*) uc->uc_mcontext.mc_eip;
11060 #elif defined(__dietlibc__)
11061 return (void*) uc->uc_mcontext.eip;
11062 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
11063 #if __x86_64__
11064 return (void*) uc->uc_mcontext->__ss.__rip;
11065 #else
11066 return (void*) uc->uc_mcontext->__ss.__eip;
11067 #endif
11068 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
11069 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
11070 return (void*) uc->uc_mcontext->__ss.__rip;
11071 #else
11072 return (void*) uc->uc_mcontext->__ss.__eip;
11073 #endif
11074 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
11075 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
11076 #elif defined(__ia64__) /* Linux IA64 */
11077 return (void*) uc->uc_mcontext.sc_ip;
11078 #else
11079 return NULL;
11080 #endif
11081 }
11082
11083 static void segvHandler(int sig, siginfo_t *info, void *secret) {
11084 void *trace[100];
11085 char **messages = NULL;
11086 int i, trace_size = 0;
11087 unsigned long offset=0;
11088 ucontext_t *uc = (ucontext_t*) secret;
11089 sds infostring;
11090 REDIS_NOTUSED(info);
11091
11092 redisLog(REDIS_WARNING,
11093 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
11094 infostring = genRedisInfoString();
11095 redisLog(REDIS_WARNING, "%s",infostring);
11096 /* It's not safe to sdsfree() the returned string under memory
11097 * corruption conditions. Let it leak as we are going to abort */
11098
11099 trace_size = backtrace(trace, 100);
11100 /* overwrite sigaction with caller's address */
11101 if (getMcontextEip(uc) != NULL) {
11102 trace[1] = getMcontextEip(uc);
11103 }
11104 messages = backtrace_symbols(trace, trace_size);
11105
11106 for (i=1; i<trace_size; ++i) {
11107 char *fn = findFuncName(trace[i], &offset), *p;
11108
11109 p = strchr(messages[i],'+');
11110 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11111 redisLog(REDIS_WARNING,"%s", messages[i]);
11112 } else {
11113 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11114 }
11115 }
11116 /* free(messages); Don't call free() with possibly corrupted memory. */
11117 _exit(0);
11118 }
11119
11120 static void sigtermHandler(int sig) {
11121 REDIS_NOTUSED(sig);
11122
11123 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11124 server.shutdown_asap = 1;
11125 }
11126
11127 static void setupSigSegvAction(void) {
11128 struct sigaction act;
11129
11130 sigemptyset (&act.sa_mask);
11131 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11132 * is used. Otherwise, sa_handler is used */
11133 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11134 act.sa_sigaction = segvHandler;
11135 sigaction (SIGSEGV, &act, NULL);
11136 sigaction (SIGBUS, &act, NULL);
11137 sigaction (SIGFPE, &act, NULL);
11138 sigaction (SIGILL, &act, NULL);
11139 sigaction (SIGBUS, &act, NULL);
11140
11141 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
11142 act.sa_handler = sigtermHandler;
11143 sigaction (SIGTERM, &act, NULL);
11144 return;
11145 }
11146
11147 #include "staticsymbols.h"
11148 /* This function try to convert a pointer into a function name. It's used in
11149 * oreder to provide a backtrace under segmentation fault that's able to
11150 * display functions declared as static (otherwise the backtrace is useless). */
11151 static char *findFuncName(void *pointer, unsigned long *offset){
11152 int i, ret = -1;
11153 unsigned long off, minoff = 0;
11154
11155 /* Try to match against the Symbol with the smallest offset */
11156 for (i=0; symsTable[i].pointer; i++) {
11157 unsigned long lp = (unsigned long) pointer;
11158
11159 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11160 off=lp-symsTable[i].pointer;
11161 if (ret < 0 || off < minoff) {
11162 minoff=off;
11163 ret=i;
11164 }
11165 }
11166 }
11167 if (ret == -1) return NULL;
11168 *offset = minoff;
11169 return symsTable[ret].name;
11170 }
11171 #else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* Built without HAVE_BACKTRACE: no signal handlers are installed. */
}
11174 #endif /* HAVE_BACKTRACE */
11175
11176
11177
11178 /* The End */
11179
11180
11181