]> git.saurik.com Git - redis.git/blob - redis.c
b6900d0dacdf0193fb951d51c16d417365afcc04
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "2.1.0"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <float.h>
61 #include <math.h>
62 #include <pthread.h>
63
64 #if defined(__sun)
65 #include "solarisfixes.h"
66 #endif
67
68 #include "redis.h"
69 #include "ae.h" /* Event driven programming library */
70 #include "sds.h" /* Dynamic safe strings */
71 #include "anet.h" /* Networking the easy way */
72 #include "dict.h" /* Hash tables */
73 #include "adlist.h" /* Linked lists */
74 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
75 #include "lzf.h" /* LZF compression library */
76 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
77 #include "zipmap.h" /* Compact dictionary-alike data structure */
78 #include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
79 #include "release.h" /* Release and/or git repository information */
80
81 /* Error codes */
82 #define REDIS_OK 0
83 #define REDIS_ERR -1
84
85 /* Static server configuration */
86 #define REDIS_SERVERPORT 6379 /* TCP port */
87 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
88 #define REDIS_IOBUF_LEN 1024
89 #define REDIS_LOADBUF_LEN 1024
90 #define REDIS_STATIC_ARGS 8
91 #define REDIS_DEFAULT_DBNUM 16
92 #define REDIS_CONFIGLINE_MAX 1024
93 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
95 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
96 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
97 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100 #define REDIS_WRITEV_THRESHOLD 3
101 /* Max number of iovecs used for each writev call */
102 #define REDIS_WRITEV_IOVEC_COUNT 256
103
104 /* Hash table parameters */
105 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
106
107 /* Command flags */
108 #define REDIS_CMD_BULK 1 /* Bulk write command */
109 #define REDIS_CMD_INLINE 2 /* Inline command */
110 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114 #define REDIS_CMD_DENYOOM 4
115 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
116
117 /* Object types */
118 #define REDIS_STRING 0
119 #define REDIS_LIST 1
120 #define REDIS_SET 2
121 #define REDIS_ZSET 3
122 #define REDIS_HASH 4
123
124 /* Objects encoding. Some kind of objects like Strings and Hashes can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
127 #define REDIS_ENCODING_RAW 0 /* Raw representation */
128 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
129 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
131
/* Human readable names of the REDIS_ENCODING_* values: the position of each
 * name in the array is the numeric value of the encoding it describes. */
static char *strencoding[] = {
    "raw",          /* REDIS_ENCODING_RAW */
    "int",          /* REDIS_ENCODING_INT */
    "zipmap",       /* REDIS_ENCODING_ZIPMAP */
    "hashtable"     /* REDIS_ENCODING_HT */
};
135
136 /* Object types only used for dumping to disk */
137 #define REDIS_EXPIRETIME 253
138 #define REDIS_SELECTDB 254
139 #define REDIS_EOF 255
140
141 /* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specify the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
151 *
152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
154 #define REDIS_RDB_6BITLEN 0
155 #define REDIS_RDB_14BITLEN 1
156 #define REDIS_RDB_32BITLEN 2
157 #define REDIS_RDB_ENCVAL 3
158 #define REDIS_RDB_LENERR UINT_MAX
159
160 /* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * according to the following defines: */
163 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
166 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
167
168 /* Virtual memory object->where field. */
169 #define REDIS_VM_MEMORY 0 /* The object is on memory */
170 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
171 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
174 /* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about this magic numbers. */
176 #define REDIS_VM_MAX_NEAR_PAGES 65536
177 #define REDIS_VM_MAX_RANDOM_JUMP 4096
178 #define REDIS_VM_MAX_THREADS 32
179 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
180 /* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
184 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
185
186 /* Client flags */
187 #define REDIS_SLAVE 1 /* This client is a slave server */
188 #define REDIS_MASTER 2 /* This client is a master server */
189 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190 #define REDIS_MULTI 8 /* This client is in a MULTI context */
191 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
193 #define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
194
195 /* Slave replication state - slave side */
196 #define REDIS_REPL_NONE 0 /* No active replication */
197 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
198 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
199
200 /* Slave replication state - from the point of view of master
201 * Note that in SEND_BULK and ONLINE state the slave receives new updates
202 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
203 * to start the next background saving in order to send updates to it. */
204 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
205 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
206 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
207 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
209 /* List related stuff */
210 #define REDIS_HEAD 0
211 #define REDIS_TAIL 1
212
213 /* Sort operations */
214 #define REDIS_SORT_GET 0
215 #define REDIS_SORT_ASC 1
216 #define REDIS_SORT_DESC 2
217 #define REDIS_SORTKEY_MAX 1024
218
219 /* Log levels */
220 #define REDIS_DEBUG 0
221 #define REDIS_VERBOSE 1
222 #define REDIS_NOTICE 2
223 #define REDIS_WARNING 3
224
225 /* Anti-warning macro... */
226 #define REDIS_NOTUSED(V) ((void) V)
227
228 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
229 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
230
231 /* Append only defines */
232 #define APPENDFSYNC_NO 0
233 #define APPENDFSYNC_ALWAYS 1
234 #define APPENDFSYNC_EVERYSEC 2
235
236 /* Hashes related defaults */
237 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
238 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression _e evaluates to false, hand the
 * stringified expression plus the source file and line to _redisAssert(),
 * then terminate the process with _exit(1).
 *
 * redisPanic(_e): unconditionally hand the stringified argument plus the
 * source file and line to _redisPanic(), then terminate with _exit(1).
 *
 * Both expansions are fully parenthesized, so each macro is a single
 * expression whatever operators surround the point of use. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
245
246 /*================================= Data types ============================== */
247
248 /* A redis object, that is a type able to hold a string / list / set */
249
/* The VM object structure: on-disk location and last access time of an
 * object handled by the Virtual Memory subsystem. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;   /* NOTE(review): this declarator also defines a global object named
         * 'vm'. Nothing in the visible part of the file reads it; confirm
         * against the rest of the file before removing it. */
256
257 /* The actual Redis Object */
258 typedef struct redisObject {
259 void *ptr;
260 unsigned char type;
261 unsigned char encoding;
262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
266 int refcount;
267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
272 } robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it) and _ptr is the value
 * stored in its 'ptr' field. The object is set up as a raw encoded string
 * with a reference count of 1; 'storage' is only written when
 * server.vm_enabled is true.
 *
 * The expansion is a single do { } while(0) statement WITHOUT a trailing
 * semicolon: the caller supplies it, which keeps the macro usable as the
 * body of an if/else. Arguments are parenthesized so that any expression
 * can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
286 typedef struct redisDb {
287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
290 dict *io_keys; /* Keys with clients waiting for VM I/O */
291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
292 int id;
293 } redisDb;
294
295 /* Client MULTI/EXEC state */
296 typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300 } multiCmd;
301
302 typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305 } multiState;
306
307 /* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309 typedef struct redisClient {
310 int fd;
311 redisDb *db;
312 int dictid;
313 sds querybuf;
314 robj **argv, **mbargv;
315 int argc, mbargc;
316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
317 int multibulk; /* multi bulk command format active */
318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
326 long repldboff; /* replication DB file offset */
327 off_t repldbsize; /* replication DB file size */
328 multiState mstate; /* MULTI/EXEC state */
329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
330 * operation such as BLPOP. Otherwise NULL. */
331 int blocking_keys_num; /* Number of blocking keys */
332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
339 } redisClient;
340
/* One save point: a number of seconds paired with a number of changes.
 * The server keeps an array of these in redisServer.saveparams. */
struct saveparam {
    time_t seconds;     /* time component of the save point */
    int changes;        /* change-count component of the save point */
};
345
346 /* Global server state structure */
347 struct redisServer {
348 int port;
349 int fd;
350 redisDb *db;
351 long long dirty; /* changes to DB from the last save */
352 list *clients;
353 list *slaves, *monitors;
354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
363 long long stat_expiredkeys; /* number of expired keys */
364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
370 int appendonly;
371 int appendfsync;
372 int shutdown_asap;
373 time_t lastfsync;
374 int appendfd;
375 int appendseldb;
376 char *pidfile;
377 pid_t bgsavechildpid;
378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
380 sds aofbuf; /* AOF buffer, written before entering the event loop */
381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
386 char *appendfilename;
387 char *requirepass;
388 int rdbcompression;
389 int activerehashing;
390 /* Replication related */
391 int isslave;
392 char *masterauth;
393 char *masterhost;
394 int masterport;
395 redisClient *master; /* client that is master for this slave */
396 int replstate;
397 unsigned int maxclients;
398 unsigned long long maxmemory;
399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
406 /* Virtual memory configuration */
407 int vm_enabled;
408 char *vm_swap_file;
409 off_t vm_page_size;
410 off_t vm_pages;
411 unsigned long long vm_max_memory;
412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
421 time_t unixtime; /* Unix time sampled every second. */
422 /* Virtual memory I/O threads stuff */
423 /* An I/O thread process an element taken from the io_jobs queue and
424 * put the result of the operation in the io_done list. While the
425 * job is being processed, it's put on io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
436 /* Our main thread is blocked on the event loop, locking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The followings are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
447 /* Pubsub */
448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
450 /* Misc */
451 FILE *devnull;
452 };
453
454 typedef struct pubsubPattern {
455 redisClient *client;
456 robj *pattern;
457 } pubsubPattern;
458
459 typedef void redisCommandProc(redisClient *c);
460 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
461 struct redisCommand {
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags;
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
469 redisVmPreloadProc *vm_preload_proc;
470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* THe last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
474 };
475
/* Pairs a name with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;             /* the name */
    unsigned long pointer;  /* the value associated with the name */
};
480
481 typedef struct _redisSortObject {
482 robj *obj;
483 union {
484 double score;
485 robj *cmpobj;
486 } u;
487 } redisSortObject;
488
489 typedef struct _redisSortOperation {
490 int type;
491 robj *pattern;
492 } redisSortOperation;
493
494 /* ZSETs use a specialized version of Skiplists */
495
496 typedef struct zskiplistNode {
497 struct zskiplistNode **forward;
498 struct zskiplistNode *backward;
499 unsigned int *span;
500 double score;
501 robj *obj;
502 } zskiplistNode;
503
/* The skiplist itself: header and tail nodes, a length and a level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
509
510 typedef struct zset {
511 dict *dict;
512 zskiplist *zsl;
513 } zset;
514
515 /* Our shared "common" objects */
516
517 #define REDIS_SHARED_INTEGERS 10000
518 struct sharedObjectsStruct {
519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
520 *colon, *nullbulk, *nullmultibulk, *queued,
521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
523 *select0, *select1, *select2, *select3, *select4,
524 *select5, *select6, *select7, *select8, *select9,
525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
528 } shared;
529
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */
static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
535
536 /* VM threaded I/O request message */
537 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
540 typedef struct iojob {
541 int type; /* Request type, REDIS_IOJOB_* */
542 redisDb *db;/* Redis database */
543 robj *key; /* This I/O request is about swapping this key */
544 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
545 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550 } iojob;
551
552 /*================================ Prototypes =============================== */
553
554 static void freeStringObject(robj *o);
555 static void freeListObject(robj *o);
556 static void freeSetObject(robj *o);
557 static void decrRefCount(void *o);
558 static robj *createObject(int type, void *ptr);
559 static void freeClient(redisClient *c);
560 static int rdbLoad(char *filename);
561 static void addReply(redisClient *c, robj *obj);
562 static void addReplySds(redisClient *c, sds s);
563 static void incrRefCount(robj *o);
564 static int rdbSaveBackground(char *filename);
565 static robj *createStringObject(char *ptr, size_t len);
566 static robj *dupStringObject(robj *o);
567 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
568 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
569 static void flushAppendOnlyFile(void);
570 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
571 static int syncWithMaster(void);
572 static robj *tryObjectEncoding(robj *o);
573 static robj *getDecodedObject(robj *o);
574 static int removeExpire(redisDb *db, robj *key);
575 static int expireIfNeeded(redisDb *db, robj *key);
576 static int deleteIfVolatile(redisDb *db, robj *key);
577 static int deleteIfSwapped(redisDb *db, robj *key);
578 static int deleteKey(redisDb *db, robj *key);
579 static time_t getExpire(redisDb *db, robj *key);
580 static int setExpire(redisDb *db, robj *key, time_t when);
581 static void updateSlavesWaitingBgsave(int bgsaveerr);
582 static void freeMemoryIfNeeded(void);
583 static int processCommand(redisClient *c);
584 static void setupSigSegvAction(void);
585 static void rdbRemoveTempFile(pid_t childpid);
586 static void aofRemoveTempFile(pid_t childpid);
587 static size_t stringObjectLen(robj *o);
588 static void processInputBuffer(redisClient *c);
589 static zskiplist *zslCreate(void);
590 static void zslFree(zskiplist *zsl);
591 static void zslInsert(zskiplist *zsl, double score, robj *obj);
592 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
593 static void initClientMultiState(redisClient *c);
594 static void freeClientMultiState(redisClient *c);
595 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
596 static void unblockClientWaitingData(redisClient *c);
597 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
598 static void vmInit(void);
599 static void vmMarkPagesFree(off_t page, off_t count);
600 static robj *vmLoadObject(robj *key);
601 static robj *vmPreviewObject(robj *key);
602 static int vmSwapOneObjectBlocking(void);
603 static int vmSwapOneObjectThreaded(void);
604 static int vmCanSwapOut(void);
605 static int tryFreeOneObjectFromFreelist(void);
606 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608 static void vmCancelThreadedIOJob(robj *o);
609 static void lockThreadedIO(void);
610 static void unlockThreadedIO(void);
611 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612 static void freeIOJob(iojob *j);
613 static void queueIOJob(iojob *j);
614 static int vmWriteObjectOnSwap(robj *o, off_t page);
615 static robj *vmReadObjectFromSwap(off_t page, int type);
616 static void waitEmptyIOJobsQueue(void);
617 static void vmReopenSwapFile(void);
618 static int vmFreePage(off_t page);
619 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
620 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
621 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
622 static int dontWaitForSwappedKey(redisClient *c, robj *key);
623 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625 static struct redisCommand *lookupCommand(char *name);
626 static void call(redisClient *c, struct redisCommand *cmd);
627 static void resetClient(redisClient *c);
628 static void convertToRealHash(robj *o);
629 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631 static void freePubsubPattern(void *p);
632 static int listMatchPubsubPattern(void *a, void *b);
633 static int compareStringObjects(robj *a, robj *b);
634 static int equalStringObjects(robj *a, robj *b);
/* Declared with (void): an empty parameter list is not a prototype and
 * would leave the arguments of any call unchecked. */
static void usage(void);
636 static int rewriteAppendOnlyFileBackground(void);
637 static int vmSwapObjectBlocking(robj *key, robj *val);
/* Declared with (void): an empty parameter list is not a prototype and
 * would leave the arguments of any call unchecked. */
static int prepareForShutdown(void);
639 static void touchWatchedKey(redisDb *db, robj *key);
640 static void unwatchAllKeys(redisClient *c);
641
642 static void authCommand(redisClient *c);
643 static void pingCommand(redisClient *c);
644 static void echoCommand(redisClient *c);
645 static void setCommand(redisClient *c);
646 static void setnxCommand(redisClient *c);
647 static void setexCommand(redisClient *c);
648 static void getCommand(redisClient *c);
649 static void delCommand(redisClient *c);
650 static void existsCommand(redisClient *c);
651 static void incrCommand(redisClient *c);
652 static void decrCommand(redisClient *c);
653 static void incrbyCommand(redisClient *c);
654 static void decrbyCommand(redisClient *c);
655 static void selectCommand(redisClient *c);
656 static void randomkeyCommand(redisClient *c);
657 static void keysCommand(redisClient *c);
658 static void dbsizeCommand(redisClient *c);
659 static void lastsaveCommand(redisClient *c);
660 static void saveCommand(redisClient *c);
661 static void bgsaveCommand(redisClient *c);
662 static void bgrewriteaofCommand(redisClient *c);
663 static void shutdownCommand(redisClient *c);
664 static void moveCommand(redisClient *c);
665 static void renameCommand(redisClient *c);
666 static void renamenxCommand(redisClient *c);
667 static void lpushCommand(redisClient *c);
668 static void rpushCommand(redisClient *c);
669 static void lpopCommand(redisClient *c);
670 static void rpopCommand(redisClient *c);
671 static void llenCommand(redisClient *c);
672 static void lindexCommand(redisClient *c);
673 static void lrangeCommand(redisClient *c);
674 static void ltrimCommand(redisClient *c);
675 static void typeCommand(redisClient *c);
676 static void lsetCommand(redisClient *c);
677 static void saddCommand(redisClient *c);
678 static void sremCommand(redisClient *c);
679 static void smoveCommand(redisClient *c);
680 static void sismemberCommand(redisClient *c);
681 static void scardCommand(redisClient *c);
682 static void spopCommand(redisClient *c);
683 static void srandmemberCommand(redisClient *c);
684 static void sinterCommand(redisClient *c);
685 static void sinterstoreCommand(redisClient *c);
686 static void sunionCommand(redisClient *c);
687 static void sunionstoreCommand(redisClient *c);
688 static void sdiffCommand(redisClient *c);
689 static void sdiffstoreCommand(redisClient *c);
690 static void syncCommand(redisClient *c);
691 static void flushdbCommand(redisClient *c);
692 static void flushallCommand(redisClient *c);
693 static void sortCommand(redisClient *c);
694 static void lremCommand(redisClient *c);
695 static void rpoplpushcommand(redisClient *c);
696 static void infoCommand(redisClient *c);
697 static void mgetCommand(redisClient *c);
698 static void monitorCommand(redisClient *c);
699 static void expireCommand(redisClient *c);
700 static void expireatCommand(redisClient *c);
701 static void getsetCommand(redisClient *c);
702 static void ttlCommand(redisClient *c);
703 static void slaveofCommand(redisClient *c);
704 static void debugCommand(redisClient *c);
705 static void msetCommand(redisClient *c);
706 static void msetnxCommand(redisClient *c);
707 static void zaddCommand(redisClient *c);
708 static void zincrbyCommand(redisClient *c);
709 static void zrangeCommand(redisClient *c);
710 static void zrangebyscoreCommand(redisClient *c);
711 static void zcountCommand(redisClient *c);
712 static void zrevrangeCommand(redisClient *c);
713 static void zcardCommand(redisClient *c);
714 static void zremCommand(redisClient *c);
715 static void zscoreCommand(redisClient *c);
716 static void zremrangebyscoreCommand(redisClient *c);
717 static void multiCommand(redisClient *c);
718 static void execCommand(redisClient *c);
719 static void discardCommand(redisClient *c);
720 static void blpopCommand(redisClient *c);
721 static void brpopCommand(redisClient *c);
722 static void appendCommand(redisClient *c);
723 static void substrCommand(redisClient *c);
724 static void zrankCommand(redisClient *c);
725 static void zrevrankCommand(redisClient *c);
726 static void hsetCommand(redisClient *c);
727 static void hsetnxCommand(redisClient *c);
728 static void hgetCommand(redisClient *c);
729 static void hmsetCommand(redisClient *c);
730 static void hmgetCommand(redisClient *c);
731 static void hdelCommand(redisClient *c);
732 static void hlenCommand(redisClient *c);
733 static void zremrangebyrankCommand(redisClient *c);
734 static void zunionstoreCommand(redisClient *c);
735 static void zinterstoreCommand(redisClient *c);
736 static void hkeysCommand(redisClient *c);
737 static void hvalsCommand(redisClient *c);
738 static void hgetallCommand(redisClient *c);
739 static void hexistsCommand(redisClient *c);
740 static void configCommand(redisClient *c);
741 static void hincrbyCommand(redisClient *c);
742 static void subscribeCommand(redisClient *c);
743 static void unsubscribeCommand(redisClient *c);
744 static void psubscribeCommand(redisClient *c);
745 static void punsubscribeCommand(redisClient *c);
746 static void publishCommand(redisClient *c);
747 static void watchCommand(redisClient *c);
748 static void unwatchCommand(redisClient *c);
749
750 /*================================= Globals ================================= */
751
752 /* Global vars */
753 static struct redisServer server; /* server global state */
754 static struct redisCommand cmdTable[] = {
755 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
757 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
758 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
759 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
760 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
762 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
764 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
765 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
766 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
775 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
778 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
779 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
781 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
782 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
787 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
788 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
789 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
790 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
791 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
792 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
796 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
799 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
800 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
806 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
807 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
810 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
811 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
814 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
820 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
821 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
824 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
825 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
837 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
843 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
845 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
850 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
853 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
856 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
861 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {NULL,NULL,0,0,NULL,0,0,0}
864 };
865
866 /*============================ Utility functions ============================ */
867
/* Glob-style pattern matching over explicit-length buffers.
 *
 * Supported syntax: '*' (any run of characters, possibly empty), '?' (any
 * single character), '[...]' character classes with '^' negation, 'a-z'
 * ranges and backslash escapes, and '\x' to match 'x' literally.
 *
 * pattern/patternLen: the glob and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             when non-zero, literals and ranges are compared
 *                     through tolower().
 *
 * Returns 1 on match, 0 otherwise. Neither buffer needs to be
 * NUL-terminated: every read is guarded by the corresponding length. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, never peeking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one subject character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the shared
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a subject character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing stars may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
990
/* Convenience wrapper around stringmatchlen() for NUL-terminated inputs:
 * both lengths are taken with strlen(). Returns whatever stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
994
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The input is an optional '-', a run of decimal digits and an optional
 * case-insensitive unit: "b" (1), "k" (1000), "kb" (1024), "m" (1000^2),
 * "mb" (1024^2), "g" (1000^3), "gb" (1024^3).
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * An unknown unit is an error, but the numeric prefix is still returned
 * (multiplier 1). A numeric part too long for the internal buffer, or a
 * product that does not fit in a long long, is an error and returns
 * LLONG_MAX (LLONG_MIN for a negative overflow). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast keeps isdigit()
     * defined for bytes with the high bit set. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* val*mul would be signed overflow (undefined behavior) beyond these
     * bounds, so report it instead of computing it. mul is always >= 1. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1041
/* Convert a long long into its decimal string representation.
 *
 * s/len: destination buffer and its size in bytes. The output is always
 *        NUL-terminated when len > 0; nothing is written when len == 0.
 * value: the number to convert.
 *
 * Returns the number of characters stored in s, not counting the NUL
 * terminator. When the buffer is too small only the first len-1 characters
 * of the representation are stored, and that shorter count is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the magnitude in unsigned arithmetic: -value is signed overflow
     * (undefined behavior) when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL - (unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1065
1066 static void redisLog(int level, const char *fmt, ...) {
1067 va_list ap;
1068 FILE *fp;
1069
1070 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1071 if (!fp) return;
1072
1073 va_start(ap, fmt);
1074 if (level >= server.verbosity) {
1075 char *c = ".-*#";
1076 char buf[64];
1077 time_t now;
1078
1079 now = time(NULL);
1080 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1081 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1082 vfprintf(fp, fmt, ap);
1083 fprintf(fp,"\n");
1084 fflush(fp);
1085 }
1086 va_end(ap);
1087
1088 if (server.logfile) fclose(fp);
1089 }
1090
1091 /*====================== Hash table type implementation ==================== */
1092
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1096
/* Dict destructor callback that releases the entry with zfree().
 * privdata is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1102
1103 static void dictListDestructor(void *privdata, void *val)
1104 {
1105 DICT_NOTUSED(privdata);
1106 listRelease((list*)val);
1107 }
1108
1109 static int sdsDictKeyCompare(void *privdata, const void *key1,
1110 const void *key2)
1111 {
1112 int l1,l2;
1113 DICT_NOTUSED(privdata);
1114
1115 l1 = sdslen((sds)key1);
1116 l2 = sdslen((sds)key2);
1117 if (l1 != l2) return 0;
1118 return memcmp(key1, key2, l1) == 0;
1119 }
1120
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). privdata is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1128
1129 static int dictObjKeyCompare(void *privdata, const void *key1,
1130 const void *key2)
1131 {
1132 const robj *o1 = key1, *o2 = key2;
1133 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1134 }
1135
1136 static unsigned int dictObjHash(const void *key) {
1137 const robj *o = key;
1138 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1139 }
1140
1141 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1142 const void *key2)
1143 {
1144 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1145 int cmp;
1146
1147 if (o1->encoding == REDIS_ENCODING_INT &&
1148 o2->encoding == REDIS_ENCODING_INT)
1149 return o1->ptr == o2->ptr;
1150
1151 o1 = getDecodedObject(o1);
1152 o2 = getDecodedObject(o2);
1153 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1154 decrRefCount(o1);
1155 decrRefCount(o2);
1156 return cmp;
1157 }
1158
1159 static unsigned int dictEncObjHash(const void *key) {
1160 robj *o = (robj*) key;
1161
1162 if (o->encoding == REDIS_ENCODING_RAW) {
1163 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1164 } else {
1165 if (o->encoding == REDIS_ENCODING_INT) {
1166 char buf[32];
1167 int len;
1168
1169 len = ll2string(buf,32,(long)o->ptr);
1170 return dictGenHashFunction((unsigned char*)buf, len);
1171 } else {
1172 unsigned int hash;
1173
1174 o = getDecodedObject(o);
1175 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1176 decrRefCount(o);
1177 return hash;
1178 }
1179 }
1180 }
1181
1182 /* Sets type and expires */
1183 static dictType setDictType = {
1184 dictEncObjHash, /* hash function */
1185 NULL, /* key dup */
1186 NULL, /* val dup */
1187 dictEncObjKeyCompare, /* key compare */
1188 dictRedisObjectDestructor, /* key destructor */
1189 NULL /* val destructor */
1190 };
1191
1192 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1193 static dictType zsetDictType = {
1194 dictEncObjHash, /* hash function */
1195 NULL, /* key dup */
1196 NULL, /* val dup */
1197 dictEncObjKeyCompare, /* key compare */
1198 dictRedisObjectDestructor, /* key destructor */
1199 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1200 };
1201
1202 /* Db->dict */
1203 static dictType dbDictType = {
1204 dictObjHash, /* hash function */
1205 NULL, /* key dup */
1206 NULL, /* val dup */
1207 dictObjKeyCompare, /* key compare */
1208 dictRedisObjectDestructor, /* key destructor */
1209 dictRedisObjectDestructor /* val destructor */
1210 };
1211
1212 /* Db->expires */
1213 static dictType keyptrDictType = {
1214 dictObjHash, /* hash function */
1215 NULL, /* key dup */
1216 NULL, /* val dup */
1217 dictObjKeyCompare, /* key compare */
1218 dictRedisObjectDestructor, /* key destructor */
1219 NULL /* val destructor */
1220 };
1221
1222 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1223 static dictType hashDictType = {
1224 dictEncObjHash, /* hash function */
1225 NULL, /* key dup */
1226 NULL, /* val dup */
1227 dictEncObjKeyCompare, /* key compare */
1228 dictRedisObjectDestructor, /* key destructor */
1229 dictRedisObjectDestructor /* val destructor */
1230 };
1231
1232 /* Keylist hash table type has unencoded redis objects as keys and
1233 * lists as values. It's used for blocking operations (BLPOP) and to
1234 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1235 static dictType keylistDictType = {
1236 dictObjHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictObjKeyCompare, /* key compare */
1240 dictRedisObjectDestructor, /* key destructor */
1241 dictListDestructor /* val destructor */
1242 };
1243
1244 static void version();
1245
1246 /* ========================= Random utility functions ======================= */
1247
1248 /* Redis generally does not try to recover from out of memory conditions
1249 * when allocating objects or strings, it is not clear if it will be possible
1250 * to report this condition to the client since the networking layer itself
1251 * is based on heap allocation for send buffers, so we simply abort.
1252 * At least the code will be simpler to read... */
1253 static void oom(const char *msg) {
1254 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1255 sleep(1);
1256 abort();
1257 }
1258
1259 /* ====================== Redis server networking stuff ===================== */
1260 static void closeTimedoutClients(void) {
1261 redisClient *c;
1262 listNode *ln;
1263 time_t now = time(NULL);
1264 listIter li;
1265
1266 listRewind(server.clients,&li);
1267 while ((ln = listNext(&li)) != NULL) {
1268 c = listNodeValue(ln);
1269 if (server.maxidletime &&
1270 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1271 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1272 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1273 listLength(c->pubsub_patterns) == 0 &&
1274 (now - c->lastinteraction > server.maxidletime))
1275 {
1276 redisLog(REDIS_VERBOSE,"Closing idle client");
1277 freeClient(c);
1278 } else if (c->flags & REDIS_BLOCKED) {
1279 if (c->blockingto != 0 && c->blockingto < now) {
1280 addReply(c,shared.nullmultibulk);
1281 unblockClientWaitingData(c);
1282 }
1283 }
1284 }
1285 }
1286
1287 static int htNeedsResize(dict *dict) {
1288 long long size, used;
1289
1290 size = dictSlots(dict);
1291 used = dictSize(dict);
1292 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1293 (used*100/size < REDIS_HT_MINFILL));
1294 }
1295
1296 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1297 * we resize the hash table to save memory */
1298 static void tryResizeHashTables(void) {
1299 int j;
1300
1301 for (j = 0; j < server.dbnum; j++) {
1302 if (htNeedsResize(server.db[j].dict))
1303 dictResize(server.db[j].dict);
1304 if (htNeedsResize(server.db[j].expires))
1305 dictResize(server.db[j].expires);
1306 }
1307 }
1308
1309 /* Our hash table implementation performs rehashing incrementally while
1310 * we write/read from the hash table. Still if the server is idle, the hash
1311 * table will use two tables for a long time. So we try to use 1 millisecond
1312 * of CPU time at every serverCron() loop in order to rehash some key. */
1313 static void incrementallyRehash(void) {
1314 int j;
1315
1316 for (j = 0; j < server.dbnum; j++) {
1317 if (dictIsRehashing(server.db[j].dict)) {
1318 dictRehashMilliseconds(server.db[j].dict,1);
1319 break; /* already used our millisecond for this loop... */
1320 }
1321 }
1322 }
1323
1324 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1325 void backgroundSaveDoneHandler(int statloc) {
1326 int exitcode = WEXITSTATUS(statloc);
1327 int bysignal = WIFSIGNALED(statloc);
1328
1329 if (!bysignal && exitcode == 0) {
1330 redisLog(REDIS_NOTICE,
1331 "Background saving terminated with success");
1332 server.dirty = 0;
1333 server.lastsave = time(NULL);
1334 } else if (!bysignal && exitcode != 0) {
1335 redisLog(REDIS_WARNING, "Background saving error");
1336 } else {
1337 redisLog(REDIS_WARNING,
1338 "Background saving terminated by signal %d", WTERMSIG(statloc));
1339 rdbRemoveTempFile(server.bgsavechildpid);
1340 }
1341 server.bgsavechildpid = -1;
1342 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1343 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1344 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1345 }
1346
1347 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1348 * Handle this. */
1349 void backgroundRewriteDoneHandler(int statloc) {
1350 int exitcode = WEXITSTATUS(statloc);
1351 int bysignal = WIFSIGNALED(statloc);
1352
1353 if (!bysignal && exitcode == 0) {
1354 int fd;
1355 char tmpfile[256];
1356
1357 redisLog(REDIS_NOTICE,
1358 "Background append only file rewriting terminated with success");
1359 /* Now it's time to flush the differences accumulated by the parent */
1360 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1361 fd = open(tmpfile,O_WRONLY|O_APPEND);
1362 if (fd == -1) {
1363 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1364 goto cleanup;
1365 }
1366 /* Flush our data... */
1367 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1368 (signed) sdslen(server.bgrewritebuf)) {
1369 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1370 close(fd);
1371 goto cleanup;
1372 }
1373 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1374 /* Now our work is to rename the temp file into the stable file. And
1375 * switch the file descriptor used by the server for append only. */
1376 if (rename(tmpfile,server.appendfilename) == -1) {
1377 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1378 close(fd);
1379 goto cleanup;
1380 }
1381 /* Mission completed... almost */
1382 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1383 if (server.appendfd != -1) {
1384 /* If append only is actually enabled... */
1385 close(server.appendfd);
1386 server.appendfd = fd;
1387 fsync(fd);
1388 server.appendseldb = -1; /* Make sure it will issue SELECT */
1389 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1390 } else {
1391 /* If append only is disabled we just generate a dump in this
1392 * format. Why not? */
1393 close(fd);
1394 }
1395 } else if (!bysignal && exitcode != 0) {
1396 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1397 } else {
1398 redisLog(REDIS_WARNING,
1399 "Background append only file rewriting terminated by signal %d",
1400 WTERMSIG(statloc));
1401 }
1402 cleanup:
1403 sdsfree(server.bgrewritebuf);
1404 server.bgrewritebuf = sdsempty();
1405 aofRemoveTempFile(server.bgrewritechildpid);
1406 server.bgrewritechildpid = -1;
1407 }
1408
1409 /* This function is called once a background process of some kind terminates,
1410 * as we want to avoid resizing the hash tables when there is a child in order
1411 * to play well with copy-on-write (otherwise when a resize happens lots of
1412 * memory pages are copied). The goal of this function is to update the ability
1413 * for dict.c to resize the hash tables accordingly to the fact we have o not
1414 * running childs. */
1415 static void updateDictResizePolicy(void) {
1416 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1417 dictEnableResize();
1418 else
1419 dictDisableResize();
1420 }
1421
1422 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1423 int j, loops = server.cronloops++;
1424 REDIS_NOTUSED(eventLoop);
1425 REDIS_NOTUSED(id);
1426 REDIS_NOTUSED(clientData);
1427
1428 /* We take a cached value of the unix time in the global state because
1429 * with virtual memory and aging there is to store the current time
1430 * in objects at every object access, and accuracy is not needed.
1431 * To access a global var is faster than calling time(NULL) */
1432 server.unixtime = time(NULL);
1433
1434 /* We received a SIGTERM, shutting down here in a safe way, as it is
1435 * not ok doing so inside the signal handler. */
1436 if (server.shutdown_asap) {
1437 if (prepareForShutdown() == REDIS_OK) exit(0);
1438 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1439 }
1440
1441 /* Show some info about non-empty databases */
1442 for (j = 0; j < server.dbnum; j++) {
1443 long long size, used, vkeys;
1444
1445 size = dictSlots(server.db[j].dict);
1446 used = dictSize(server.db[j].dict);
1447 vkeys = dictSize(server.db[j].expires);
1448 if (!(loops % 50) && (used || vkeys)) {
1449 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1450 /* dictPrintStats(server.dict); */
1451 }
1452 }
1453
1454 /* We don't want to resize the hash tables while a bacground saving
1455 * is in progress: the saving child is created using fork() that is
1456 * implemented with a copy-on-write semantic in most modern systems, so
1457 * if we resize the HT while there is the saving child at work actually
1458 * a lot of memory movements in the parent will cause a lot of pages
1459 * copied. */
1460 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1461 if (!(loops % 10)) tryResizeHashTables();
1462 if (server.activerehashing) incrementallyRehash();
1463 }
1464
1465 /* Show information about connected clients */
1466 if (!(loops % 50)) {
1467 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1468 listLength(server.clients)-listLength(server.slaves),
1469 listLength(server.slaves),
1470 zmalloc_used_memory());
1471 }
1472
1473 /* Close connections of timedout clients */
1474 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1475 closeTimedoutClients();
1476
1477 /* Check if a background saving or AOF rewrite in progress terminated */
1478 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1479 int statloc;
1480 pid_t pid;
1481
1482 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1483 if (pid == server.bgsavechildpid) {
1484 backgroundSaveDoneHandler(statloc);
1485 } else {
1486 backgroundRewriteDoneHandler(statloc);
1487 }
1488 updateDictResizePolicy();
1489 }
1490 } else {
1491 /* If there is not a background saving in progress check if
1492 * we have to save now */
1493 time_t now = time(NULL);
1494 for (j = 0; j < server.saveparamslen; j++) {
1495 struct saveparam *sp = server.saveparams+j;
1496
1497 if (server.dirty >= sp->changes &&
1498 now-server.lastsave > sp->seconds) {
1499 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1500 sp->changes, sp->seconds);
1501 rdbSaveBackground(server.dbfilename);
1502 break;
1503 }
1504 }
1505 }
1506
1507 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1508 * will use few CPU cycles if there are few expiring keys, otherwise
1509 * it will get more aggressive to avoid that too much memory is used by
1510 * keys that can be removed from the keyspace. */
1511 for (j = 0; j < server.dbnum; j++) {
1512 int expired;
1513 redisDb *db = server.db+j;
1514
1515 /* Continue to expire if at the end of the cycle more than 25%
1516 * of the keys were expired. */
1517 do {
1518 long num = dictSize(db->expires);
1519 time_t now = time(NULL);
1520
1521 expired = 0;
1522 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1523 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1524 while (num--) {
1525 dictEntry *de;
1526 time_t t;
1527
1528 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1529 t = (time_t) dictGetEntryVal(de);
1530 if (now > t) {
1531 deleteKey(db,dictGetEntryKey(de));
1532 expired++;
1533 server.stat_expiredkeys++;
1534 }
1535 }
1536 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1537 }
1538
1539 /* Swap a few keys on disk if we are over the memory limit and VM
1540 * is enbled. Try to free objects from the free list first. */
1541 if (vmCanSwapOut()) {
1542 while (server.vm_enabled && zmalloc_used_memory() >
1543 server.vm_max_memory)
1544 {
1545 int retval;
1546
1547 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1548 retval = (server.vm_max_threads == 0) ?
1549 vmSwapOneObjectBlocking() :
1550 vmSwapOneObjectThreaded();
1551 if (retval == REDIS_ERR && !(loops % 300) &&
1552 zmalloc_used_memory() >
1553 (server.vm_max_memory+server.vm_max_memory/10))
1554 {
1555 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1556 }
1557 /* Note that when using threade I/O we free just one object,
1558 * because anyway when the I/O thread in charge to swap this
1559 * object out will finish, the handler of completed jobs
1560 * will try to swap more objects if we are still out of memory. */
1561 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1562 }
1563 }
1564
1565 /* Check if we should connect to a MASTER */
1566 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1567 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1568 if (syncWithMaster() == REDIS_OK) {
1569 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1570 if (server.appendonly) rewriteAppendOnlyFileBackground();
1571 }
1572 }
1573 return 100;
1574 }
1575
1576 /* This function gets called every time Redis is entering the
1577 * main loop of the event driven library, that is, before to sleep
1578 * for ready file descriptors. */
1579 static void beforeSleep(struct aeEventLoop *eventLoop) {
1580 REDIS_NOTUSED(eventLoop);
1581
1582 /* Awake clients that got all the swapped keys they requested */
1583 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1584 listIter li;
1585 listNode *ln;
1586
1587 listRewind(server.io_ready_clients,&li);
1588 while((ln = listNext(&li))) {
1589 redisClient *c = ln->value;
1590 struct redisCommand *cmd;
1591
1592 /* Resume the client. */
1593 listDelNode(server.io_ready_clients,ln);
1594 c->flags &= (~REDIS_IO_WAIT);
1595 server.vm_blocked_clients--;
1596 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1597 readQueryFromClient, c);
1598 cmd = lookupCommand(c->argv[0]->ptr);
1599 assert(cmd != NULL);
1600 call(c,cmd);
1601 resetClient(c);
1602 /* There may be more data to process in the input buffer. */
1603 if (c->querybuf && sdslen(c->querybuf) > 0)
1604 processInputBuffer(c);
1605 }
1606 }
1607 /* Write the AOF buffer on disk */
1608 flushAppendOnlyFile();
1609 }
1610
1611 static void createSharedObjects(void) {
1612 int j;
1613
1614 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1615 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1616 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1617 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1618 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1619 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1620 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1621 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1622 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1623 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1624 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1625 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1626 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1627 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR no such key\r\n"));
1629 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR syntax error\r\n"));
1631 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR source and destination objects are the same\r\n"));
1633 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR index out of range\r\n"));
1635 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1636 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1637 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1638 shared.select0 = createStringObject("select 0\r\n",10);
1639 shared.select1 = createStringObject("select 1\r\n",10);
1640 shared.select2 = createStringObject("select 2\r\n",10);
1641 shared.select3 = createStringObject("select 3\r\n",10);
1642 shared.select4 = createStringObject("select 4\r\n",10);
1643 shared.select5 = createStringObject("select 5\r\n",10);
1644 shared.select6 = createStringObject("select 6\r\n",10);
1645 shared.select7 = createStringObject("select 7\r\n",10);
1646 shared.select8 = createStringObject("select 8\r\n",10);
1647 shared.select9 = createStringObject("select 9\r\n",10);
1648 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1649 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1650 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1651 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1652 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1653 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1654 shared.mbulk3 = createStringObject("*3\r\n",4);
1655 shared.mbulk4 = createStringObject("*4\r\n",4);
1656 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1657 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1658 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1659 }
1660 }
1661
1662 static void appendServerSaveParams(time_t seconds, int changes) {
1663 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1664 server.saveparams[server.saveparamslen].seconds = seconds;
1665 server.saveparams[server.saveparamslen].changes = changes;
1666 server.saveparamslen++;
1667 }
1668
1669 static void resetServerSaveParams() {
1670 zfree(server.saveparams);
1671 server.saveparams = NULL;
1672 server.saveparamslen = 0;
1673 }
1674
1675 static void initServerConfig() {
1676 server.dbnum = REDIS_DEFAULT_DBNUM;
1677 server.port = REDIS_SERVERPORT;
1678 server.verbosity = REDIS_VERBOSE;
1679 server.maxidletime = REDIS_MAXIDLETIME;
1680 server.saveparams = NULL;
1681 server.logfile = NULL; /* NULL = log on standard output */
1682 server.bindaddr = NULL;
1683 server.glueoutputbuf = 1;
1684 server.daemonize = 0;
1685 server.appendonly = 0;
1686 server.appendfsync = APPENDFSYNC_EVERYSEC;
1687 server.lastfsync = time(NULL);
1688 server.appendfd = -1;
1689 server.appendseldb = -1; /* Make sure the first time will not match */
1690 server.pidfile = zstrdup("/var/run/redis.pid");
1691 server.dbfilename = zstrdup("dump.rdb");
1692 server.appendfilename = zstrdup("appendonly.aof");
1693 server.requirepass = NULL;
1694 server.rdbcompression = 1;
1695 server.activerehashing = 1;
1696 server.maxclients = 0;
1697 server.blpop_blocked_clients = 0;
1698 server.maxmemory = 0;
1699 server.vm_enabled = 0;
1700 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1701 server.vm_page_size = 256; /* 256 bytes per page */
1702 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1703 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1704 server.vm_max_threads = 4;
1705 server.vm_blocked_clients = 0;
1706 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1707 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1708 server.shutdown_asap = 0;
1709
1710 resetServerSaveParams();
1711
1712 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1713 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1714 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1715 /* Replication related */
1716 server.isslave = 0;
1717 server.masterauth = NULL;
1718 server.masterhost = NULL;
1719 server.masterport = 6379;
1720 server.master = NULL;
1721 server.replstate = REDIS_REPL_NONE;
1722
1723 /* Double constants initialization */
1724 R_Zero = 0.0;
1725 R_PosInf = 1.0/R_Zero;
1726 R_NegInf = -1.0/R_Zero;
1727 R_Nan = R_Zero/R_Zero;
1728 }
1729
1730 static void initServer() {
1731 int j;
1732
1733 signal(SIGHUP, SIG_IGN);
1734 signal(SIGPIPE, SIG_IGN);
1735 setupSigSegvAction();
1736
1737 server.devnull = fopen("/dev/null","w");
1738 if (server.devnull == NULL) {
1739 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1740 exit(1);
1741 }
1742 server.clients = listCreate();
1743 server.slaves = listCreate();
1744 server.monitors = listCreate();
1745 server.objfreelist = listCreate();
1746 createSharedObjects();
1747 server.el = aeCreateEventLoop();
1748 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1749 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1750 if (server.fd == -1) {
1751 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1752 exit(1);
1753 }
1754 for (j = 0; j < server.dbnum; j++) {
1755 server.db[j].dict = dictCreate(&dbDictType,NULL);
1756 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1757 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1758 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
1759 if (server.vm_enabled)
1760 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].id = j;
1762 }
1763 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1764 server.pubsub_patterns = listCreate();
1765 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1766 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1767 server.cronloops = 0;
1768 server.bgsavechildpid = -1;
1769 server.bgrewritechildpid = -1;
1770 server.bgrewritebuf = sdsempty();
1771 server.aofbuf = sdsempty();
1772 server.lastsave = time(NULL);
1773 server.dirty = 0;
1774 server.stat_numcommands = 0;
1775 server.stat_numconnections = 0;
1776 server.stat_expiredkeys = 0;
1777 server.stat_starttime = time(NULL);
1778 server.unixtime = time(NULL);
1779 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1780 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1781 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1782
1783 if (server.appendonly) {
1784 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1785 if (server.appendfd == -1) {
1786 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1787 strerror(errno));
1788 exit(1);
1789 }
1790 }
1791
1792 if (server.vm_enabled) vmInit();
1793 }
1794
1795 /* Empty the whole database */
1796 static long long emptyDb() {
1797 int j;
1798 long long removed = 0;
1799
1800 for (j = 0; j < server.dbnum; j++) {
1801 removed += dictSize(server.db[j].dict);
1802 dictEmpty(server.db[j].dict);
1803 dictEmpty(server.db[j].expires);
1804 }
1805 return removed;
1806 }
1807
/* Convert a "yes"/"no" string (compared ignoring case) into 1/0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1813
1814 /* I agree, this is a very rudimental way to load a configuration...
1815 will improve later if the config gets more complex */
1816 static void loadServerConfig(char *filename) {
1817 FILE *fp;
1818 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1819 int linenum = 0;
1820 sds line = NULL;
1821
1822 if (filename[0] == '-' && filename[1] == '\0')
1823 fp = stdin;
1824 else {
1825 if ((fp = fopen(filename,"r")) == NULL) {
1826 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1827 exit(1);
1828 }
1829 }
1830
1831 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1832 sds *argv;
1833 int argc, j;
1834
1835 linenum++;
1836 line = sdsnew(buf);
1837 line = sdstrim(line," \t\r\n");
1838
1839 /* Skip comments and blank lines*/
1840 if (line[0] == '#' || line[0] == '\0') {
1841 sdsfree(line);
1842 continue;
1843 }
1844
1845 /* Split into arguments */
1846 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1847 sdstolower(argv[0]);
1848
1849 /* Execute config directives */
1850 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1851 server.maxidletime = atoi(argv[1]);
1852 if (server.maxidletime < 0) {
1853 err = "Invalid timeout value"; goto loaderr;
1854 }
1855 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1856 server.port = atoi(argv[1]);
1857 if (server.port < 1 || server.port > 65535) {
1858 err = "Invalid port"; goto loaderr;
1859 }
1860 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1861 server.bindaddr = zstrdup(argv[1]);
1862 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1863 int seconds = atoi(argv[1]);
1864 int changes = atoi(argv[2]);
1865 if (seconds < 1 || changes < 0) {
1866 err = "Invalid save parameters"; goto loaderr;
1867 }
1868 appendServerSaveParams(seconds,changes);
1869 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1870 if (chdir(argv[1]) == -1) {
1871 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1872 argv[1], strerror(errno));
1873 exit(1);
1874 }
1875 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1876 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1877 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1878 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1879 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1880 else {
1881 err = "Invalid log level. Must be one of debug, notice, warning";
1882 goto loaderr;
1883 }
1884 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1885 FILE *logfp;
1886
1887 server.logfile = zstrdup(argv[1]);
1888 if (!strcasecmp(server.logfile,"stdout")) {
1889 zfree(server.logfile);
1890 server.logfile = NULL;
1891 }
1892 if (server.logfile) {
1893 /* Test if we are able to open the file. The server will not
1894 * be able to abort just for this problem later... */
1895 logfp = fopen(server.logfile,"a");
1896 if (logfp == NULL) {
1897 err = sdscatprintf(sdsempty(),
1898 "Can't open the log file: %s", strerror(errno));
1899 goto loaderr;
1900 }
1901 fclose(logfp);
1902 }
1903 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1904 server.dbnum = atoi(argv[1]);
1905 if (server.dbnum < 1) {
1906 err = "Invalid number of databases"; goto loaderr;
1907 }
1908 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1909 loadServerConfig(argv[1]);
1910 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1911 server.maxclients = atoi(argv[1]);
1912 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1913 server.maxmemory = memtoll(argv[1],NULL);
1914 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1915 server.masterhost = sdsnew(argv[1]);
1916 server.masterport = atoi(argv[2]);
1917 server.replstate = REDIS_REPL_CONNECT;
1918 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1919 server.masterauth = zstrdup(argv[1]);
1920 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1921 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1922 err = "argument must be 'yes' or 'no'"; goto loaderr;
1923 }
1924 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1925 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1926 err = "argument must be 'yes' or 'no'"; goto loaderr;
1927 }
1928 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1929 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1930 err = "argument must be 'yes' or 'no'"; goto loaderr;
1931 }
1932 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1933 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1934 err = "argument must be 'yes' or 'no'"; goto loaderr;
1935 }
1936 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1937 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1938 err = "argument must be 'yes' or 'no'"; goto loaderr;
1939 }
1940 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1941 zfree(server.appendfilename);
1942 server.appendfilename = zstrdup(argv[1]);
1943 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1944 if (!strcasecmp(argv[1],"no")) {
1945 server.appendfsync = APPENDFSYNC_NO;
1946 } else if (!strcasecmp(argv[1],"always")) {
1947 server.appendfsync = APPENDFSYNC_ALWAYS;
1948 } else if (!strcasecmp(argv[1],"everysec")) {
1949 server.appendfsync = APPENDFSYNC_EVERYSEC;
1950 } else {
1951 err = "argument must be 'no', 'always' or 'everysec'";
1952 goto loaderr;
1953 }
1954 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1955 server.requirepass = zstrdup(argv[1]);
1956 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1957 zfree(server.pidfile);
1958 server.pidfile = zstrdup(argv[1]);
1959 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1960 zfree(server.dbfilename);
1961 server.dbfilename = zstrdup(argv[1]);
1962 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1963 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1964 err = "argument must be 'yes' or 'no'"; goto loaderr;
1965 }
1966 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1967 zfree(server.vm_swap_file);
1968 server.vm_swap_file = zstrdup(argv[1]);
1969 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1970 server.vm_max_memory = memtoll(argv[1],NULL);
1971 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1972 server.vm_page_size = memtoll(argv[1], NULL);
1973 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1974 server.vm_pages = memtoll(argv[1], NULL);
1975 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1976 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1977 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1978 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1979 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1980 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1981 } else {
1982 err = "Bad directive or wrong number of arguments"; goto loaderr;
1983 }
1984 for (j = 0; j < argc; j++)
1985 sdsfree(argv[j]);
1986 zfree(argv);
1987 sdsfree(line);
1988 }
1989 if (fp != stdin) fclose(fp);
1990 return;
1991
1992 loaderr:
1993 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1994 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1995 fprintf(stderr, ">>> '%s'\n", line);
1996 fprintf(stderr, "%s\n", err);
1997 exit(1);
1998 }
1999
2000 static void freeClientArgv(redisClient *c) {
2001 int j;
2002
2003 for (j = 0; j < c->argc; j++)
2004 decrRefCount(c->argv[j]);
2005 for (j = 0; j < c->mbargc; j++)
2006 decrRefCount(c->mbargv[j]);
2007 c->argc = 0;
2008 c->mbargc = 0;
2009 }
2010
2011 static void freeClient(redisClient *c) {
2012 listNode *ln;
2013
2014 /* Note that if the client we are freeing is blocked into a blocking
2015 * call, we have to set querybuf to NULL *before* to call
2016 * unblockClientWaitingData() to avoid processInputBuffer() will get
2017 * called. Also it is important to remove the file events after
2018 * this, because this call adds the READABLE event. */
2019 sdsfree(c->querybuf);
2020 c->querybuf = NULL;
2021 if (c->flags & REDIS_BLOCKED)
2022 unblockClientWaitingData(c);
2023
2024 /* UNWATCH all the keys */
2025 unwatchAllKeys(c);
2026 listRelease(c->watched_keys);
2027 /* Unsubscribe from all the pubsub channels */
2028 pubsubUnsubscribeAllChannels(c,0);
2029 pubsubUnsubscribeAllPatterns(c,0);
2030 dictRelease(c->pubsub_channels);
2031 listRelease(c->pubsub_patterns);
2032 /* Obvious cleanup */
2033 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2034 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2035 listRelease(c->reply);
2036 freeClientArgv(c);
2037 close(c->fd);
2038 /* Remove from the list of clients */
2039 ln = listSearchKey(server.clients,c);
2040 redisAssert(ln != NULL);
2041 listDelNode(server.clients,ln);
2042 /* Remove from the list of clients that are now ready to be restarted
2043 * after waiting for swapped keys */
2044 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2045 ln = listSearchKey(server.io_ready_clients,c);
2046 if (ln) {
2047 listDelNode(server.io_ready_clients,ln);
2048 server.vm_blocked_clients--;
2049 }
2050 }
2051 /* Remove from the list of clients waiting for swapped keys */
2052 while (server.vm_enabled && listLength(c->io_keys)) {
2053 ln = listFirst(c->io_keys);
2054 dontWaitForSwappedKey(c,ln->value);
2055 }
2056 listRelease(c->io_keys);
2057 /* Master/slave cleanup */
2058 if (c->flags & REDIS_SLAVE) {
2059 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2060 close(c->repldbfd);
2061 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2062 ln = listSearchKey(l,c);
2063 redisAssert(ln != NULL);
2064 listDelNode(l,ln);
2065 }
2066 if (c->flags & REDIS_MASTER) {
2067 server.master = NULL;
2068 server.replstate = REDIS_REPL_CONNECT;
2069 }
2070 /* Release memory */
2071 zfree(c->argv);
2072 zfree(c->mbargv);
2073 freeClientMultiState(c);
2074 zfree(c);
2075 }
2076
2077 #define GLUEREPLY_UP_TO (1024)
2078 static void glueReplyBuffersIfNeeded(redisClient *c) {
2079 int copylen = 0;
2080 char buf[GLUEREPLY_UP_TO];
2081 listNode *ln;
2082 listIter li;
2083 robj *o;
2084
2085 listRewind(c->reply,&li);
2086 while((ln = listNext(&li))) {
2087 int objlen;
2088
2089 o = ln->value;
2090 objlen = sdslen(o->ptr);
2091 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2092 memcpy(buf+copylen,o->ptr,objlen);
2093 copylen += objlen;
2094 listDelNode(c->reply,ln);
2095 } else {
2096 if (copylen == 0) return;
2097 break;
2098 }
2099 }
2100 /* Now the output buffer is empty, add the new single element */
2101 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2102 listAddNodeHead(c->reply,o);
2103 }
2104
2105 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2106 redisClient *c = privdata;
2107 int nwritten = 0, totwritten = 0, objlen;
2108 robj *o;
2109 REDIS_NOTUSED(el);
2110 REDIS_NOTUSED(mask);
2111
2112 /* Use writev() if we have enough buffers to send */
2113 if (!server.glueoutputbuf &&
2114 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2115 !(c->flags & REDIS_MASTER))
2116 {
2117 sendReplyToClientWritev(el, fd, privdata, mask);
2118 return;
2119 }
2120
2121 while(listLength(c->reply)) {
2122 if (server.glueoutputbuf && listLength(c->reply) > 1)
2123 glueReplyBuffersIfNeeded(c);
2124
2125 o = listNodeValue(listFirst(c->reply));
2126 objlen = sdslen(o->ptr);
2127
2128 if (objlen == 0) {
2129 listDelNode(c->reply,listFirst(c->reply));
2130 continue;
2131 }
2132
2133 if (c->flags & REDIS_MASTER) {
2134 /* Don't reply to a master */
2135 nwritten = objlen - c->sentlen;
2136 } else {
2137 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2138 if (nwritten <= 0) break;
2139 }
2140 c->sentlen += nwritten;
2141 totwritten += nwritten;
2142 /* If we fully sent the object on head go to the next one */
2143 if (c->sentlen == objlen) {
2144 listDelNode(c->reply,listFirst(c->reply));
2145 c->sentlen = 0;
2146 }
2147 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2148 * bytes, in a single threaded server it's a good idea to serve
2149 * other clients as well, even if a very large request comes from
2150 * super fast link that is always able to accept data (in real world
2151 * scenario think about 'KEYS *' against the loopback interfae) */
2152 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2153 }
2154 if (nwritten == -1) {
2155 if (errno == EAGAIN) {
2156 nwritten = 0;
2157 } else {
2158 redisLog(REDIS_VERBOSE,
2159 "Error writing to client: %s", strerror(errno));
2160 freeClient(c);
2161 return;
2162 }
2163 }
2164 if (totwritten > 0) c->lastinteraction = time(NULL);
2165 if (listLength(c->reply) == 0) {
2166 c->sentlen = 0;
2167 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2168 }
2169 }
2170
2171 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2172 {
2173 redisClient *c = privdata;
2174 int nwritten = 0, totwritten = 0, objlen, willwrite;
2175 robj *o;
2176 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2177 int offset, ion = 0;
2178 REDIS_NOTUSED(el);
2179 REDIS_NOTUSED(mask);
2180
2181 listNode *node;
2182 while (listLength(c->reply)) {
2183 offset = c->sentlen;
2184 ion = 0;
2185 willwrite = 0;
2186
2187 /* fill-in the iov[] array */
2188 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2189 o = listNodeValue(node);
2190 objlen = sdslen(o->ptr);
2191
2192 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2193 break;
2194
2195 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2196 break; /* no more iovecs */
2197
2198 iov[ion].iov_base = ((char*)o->ptr) + offset;
2199 iov[ion].iov_len = objlen - offset;
2200 willwrite += objlen - offset;
2201 offset = 0; /* just for the first item */
2202 ion++;
2203 }
2204
2205 if(willwrite == 0)
2206 break;
2207
2208 /* write all collected blocks at once */
2209 if((nwritten = writev(fd, iov, ion)) < 0) {
2210 if (errno != EAGAIN) {
2211 redisLog(REDIS_VERBOSE,
2212 "Error writing to client: %s", strerror(errno));
2213 freeClient(c);
2214 return;
2215 }
2216 break;
2217 }
2218
2219 totwritten += nwritten;
2220 offset = c->sentlen;
2221
2222 /* remove written robjs from c->reply */
2223 while (nwritten && listLength(c->reply)) {
2224 o = listNodeValue(listFirst(c->reply));
2225 objlen = sdslen(o->ptr);
2226
2227 if(nwritten >= objlen - offset) {
2228 listDelNode(c->reply, listFirst(c->reply));
2229 nwritten -= objlen - offset;
2230 c->sentlen = 0;
2231 } else {
2232 /* partial write */
2233 c->sentlen += nwritten;
2234 break;
2235 }
2236 offset = 0;
2237 }
2238 }
2239
2240 if (totwritten > 0)
2241 c->lastinteraction = time(NULL);
2242
2243 if (listLength(c->reply) == 0) {
2244 c->sentlen = 0;
2245 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2246 }
2247 }
2248
2249 static struct redisCommand *lookupCommand(char *name) {
2250 int j = 0;
2251 while(cmdTable[j].name != NULL) {
2252 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2253 j++;
2254 }
2255 return NULL;
2256 }
2257
2258 /* resetClient prepare the client to process the next command */
2259 static void resetClient(redisClient *c) {
2260 freeClientArgv(c);
2261 c->bulklen = -1;
2262 c->multibulk = 0;
2263 }
2264
2265 /* Call() is the core of Redis execution of a command */
2266 static void call(redisClient *c, struct redisCommand *cmd) {
2267 long long dirty;
2268
2269 dirty = server.dirty;
2270 cmd->proc(c);
2271 dirty = server.dirty-dirty;
2272
2273 if (server.appendonly && dirty)
2274 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2275 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2276 listLength(server.slaves))
2277 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2278 if (listLength(server.monitors))
2279 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2280 server.stat_numcommands++;
2281 }
2282
2283 /* If this function gets called we already read a whole
2284 * command, argments are in the client argv/argc fields.
2285 * processCommand() execute the command or prepare the
2286 * server for a bulk read from the client.
2287 *
2288 * If 1 is returned the client is still alive and valid and
2289 * and other operations can be performed by the caller. Otherwise
2290 * if 0 is returned the client was destroied (i.e. after QUIT). */
2291 static int processCommand(redisClient *c) {
2292 struct redisCommand *cmd;
2293
2294 /* Free some memory if needed (maxmemory setting) */
2295 if (server.maxmemory) freeMemoryIfNeeded();
2296
2297 /* Handle the multi bulk command type. This is an alternative protocol
2298 * supported by Redis in order to receive commands that are composed of
2299 * multiple binary-safe "bulk" arguments. The latency of processing is
2300 * a bit higher but this allows things like multi-sets, so if this
2301 * protocol is used only for MSET and similar commands this is a big win. */
2302 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2303 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2304 if (c->multibulk <= 0) {
2305 resetClient(c);
2306 return 1;
2307 } else {
2308 decrRefCount(c->argv[c->argc-1]);
2309 c->argc--;
2310 return 1;
2311 }
2312 } else if (c->multibulk) {
2313 if (c->bulklen == -1) {
2314 if (((char*)c->argv[0]->ptr)[0] != '$') {
2315 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2316 resetClient(c);
2317 return 1;
2318 } else {
2319 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2320 decrRefCount(c->argv[0]);
2321 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2322 c->argc--;
2323 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2324 resetClient(c);
2325 return 1;
2326 }
2327 c->argc--;
2328 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2329 return 1;
2330 }
2331 } else {
2332 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2333 c->mbargv[c->mbargc] = c->argv[0];
2334 c->mbargc++;
2335 c->argc--;
2336 c->multibulk--;
2337 if (c->multibulk == 0) {
2338 robj **auxargv;
2339 int auxargc;
2340
2341 /* Here we need to swap the multi-bulk argc/argv with the
2342 * normal argc/argv of the client structure. */
2343 auxargv = c->argv;
2344 c->argv = c->mbargv;
2345 c->mbargv = auxargv;
2346
2347 auxargc = c->argc;
2348 c->argc = c->mbargc;
2349 c->mbargc = auxargc;
2350
2351 /* We need to set bulklen to something different than -1
2352 * in order for the code below to process the command without
2353 * to try to read the last argument of a bulk command as
2354 * a special argument. */
2355 c->bulklen = 0;
2356 /* continue below and process the command */
2357 } else {
2358 c->bulklen = -1;
2359 return 1;
2360 }
2361 }
2362 }
2363 /* -- end of multi bulk commands processing -- */
2364
2365 /* The QUIT command is handled as a special case. Normal command
2366 * procs are unable to close the client connection safely */
2367 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2368 freeClient(c);
2369 return 0;
2370 }
2371
2372 /* Now lookup the command and check ASAP about trivial error conditions
2373 * such wrong arity, bad command name and so forth. */
2374 cmd = lookupCommand(c->argv[0]->ptr);
2375 if (!cmd) {
2376 addReplySds(c,
2377 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2378 (char*)c->argv[0]->ptr));
2379 resetClient(c);
2380 return 1;
2381 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2382 (c->argc < -cmd->arity)) {
2383 addReplySds(c,
2384 sdscatprintf(sdsempty(),
2385 "-ERR wrong number of arguments for '%s' command\r\n",
2386 cmd->name));
2387 resetClient(c);
2388 return 1;
2389 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2390 /* This is a bulk command, we have to read the last argument yet. */
2391 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2392
2393 decrRefCount(c->argv[c->argc-1]);
2394 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2395 c->argc--;
2396 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2397 resetClient(c);
2398 return 1;
2399 }
2400 c->argc--;
2401 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2402 /* It is possible that the bulk read is already in the
2403 * buffer. Check this condition and handle it accordingly.
2404 * This is just a fast path, alternative to call processInputBuffer().
2405 * It's a good idea since the code is small and this condition
2406 * happens most of the times. */
2407 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2408 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2409 c->argc++;
2410 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2411 } else {
2412 /* Otherwise return... there is to read the last argument
2413 * from the socket. */
2414 return 1;
2415 }
2416 }
2417 /* Let's try to encode the bulk object to save space. */
2418 if (cmd->flags & REDIS_CMD_BULK)
2419 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2420
2421 /* Check if the user is authenticated */
2422 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2423 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2424 resetClient(c);
2425 return 1;
2426 }
2427
2428 /* Handle the maxmemory directive */
2429 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2430 zmalloc_used_memory() > server.maxmemory)
2431 {
2432 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2433 resetClient(c);
2434 return 1;
2435 }
2436
2437 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2438 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2439 &&
2440 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2441 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2442 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2443 resetClient(c);
2444 return 1;
2445 }
2446
2447 /* Exec the command */
2448 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2449 queueMultiCommand(c,cmd);
2450 addReply(c,shared.queued);
2451 } else {
2452 if (server.vm_enabled && server.vm_max_threads > 0 &&
2453 blockClientOnSwappedKeys(c,cmd)) return 1;
2454 call(c,cmd);
2455 }
2456
2457 /* Prepare the client for the next command */
2458 resetClient(c);
2459 return 1;
2460 }
2461
2462 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2463 listNode *ln;
2464 listIter li;
2465 int outc = 0, j;
2466 robj **outv;
2467 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2468 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2469 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2470 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2471 robj *lenobj;
2472
2473 if (argc <= REDIS_STATIC_ARGS) {
2474 outv = static_outv;
2475 } else {
2476 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2477 }
2478
2479 lenobj = createObject(REDIS_STRING,
2480 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2481 lenobj->refcount = 0;
2482 outv[outc++] = lenobj;
2483 for (j = 0; j < argc; j++) {
2484 lenobj = createObject(REDIS_STRING,
2485 sdscatprintf(sdsempty(),"$%lu\r\n",
2486 (unsigned long) stringObjectLen(argv[j])));
2487 lenobj->refcount = 0;
2488 outv[outc++] = lenobj;
2489 outv[outc++] = argv[j];
2490 outv[outc++] = shared.crlf;
2491 }
2492
2493 /* Increment all the refcounts at start and decrement at end in order to
2494 * be sure to free objects if there is no slave in a replication state
2495 * able to be feed with commands */
2496 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2497 listRewind(slaves,&li);
2498 while((ln = listNext(&li))) {
2499 redisClient *slave = ln->value;
2500
2501 /* Don't feed slaves that are still waiting for BGSAVE to start */
2502 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2503
2504 /* Feed all the other slaves, MONITORs and so on */
2505 if (slave->slaveseldb != dictid) {
2506 robj *selectcmd;
2507
2508 switch(dictid) {
2509 case 0: selectcmd = shared.select0; break;
2510 case 1: selectcmd = shared.select1; break;
2511 case 2: selectcmd = shared.select2; break;
2512 case 3: selectcmd = shared.select3; break;
2513 case 4: selectcmd = shared.select4; break;
2514 case 5: selectcmd = shared.select5; break;
2515 case 6: selectcmd = shared.select6; break;
2516 case 7: selectcmd = shared.select7; break;
2517 case 8: selectcmd = shared.select8; break;
2518 case 9: selectcmd = shared.select9; break;
2519 default:
2520 selectcmd = createObject(REDIS_STRING,
2521 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2522 selectcmd->refcount = 0;
2523 break;
2524 }
2525 addReply(slave,selectcmd);
2526 slave->slaveseldb = dictid;
2527 }
2528 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2529 }
2530 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2531 if (outv != static_outv) zfree(outv);
2532 }
2533
2534 static sds sdscatrepr(sds s, char *p, size_t len) {
2535 s = sdscatlen(s,"\"",1);
2536 while(len--) {
2537 switch(*p) {
2538 case '\\':
2539 case '"':
2540 s = sdscatprintf(s,"\\%c",*p);
2541 break;
2542 case '\n': s = sdscatlen(s,"\\n",1); break;
2543 case '\r': s = sdscatlen(s,"\\r",1); break;
2544 case '\t': s = sdscatlen(s,"\\t",1); break;
2545 case '\a': s = sdscatlen(s,"\\a",1); break;
2546 case '\b': s = sdscatlen(s,"\\b",1); break;
2547 default:
2548 if (isprint(*p))
2549 s = sdscatprintf(s,"%c",*p);
2550 else
2551 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2552 break;
2553 }
2554 p++;
2555 }
2556 return sdscatlen(s,"\"",1);
2557 }
2558
2559 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2560 listNode *ln;
2561 listIter li;
2562 int j;
2563 sds cmdrepr = sdsnew("+");
2564 robj *cmdobj;
2565 struct timeval tv;
2566
2567 gettimeofday(&tv,NULL);
2568 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2569 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2570
2571 for (j = 0; j < argc; j++) {
2572 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2573 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2574 } else {
2575 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2576 sdslen(argv[j]->ptr));
2577 }
2578 if (j != argc-1)
2579 cmdrepr = sdscatlen(cmdrepr," ",1);
2580 }
2581 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2582 cmdobj = createObject(REDIS_STRING,cmdrepr);
2583
2584 listRewind(monitors,&li);
2585 while((ln = listNext(&li))) {
2586 redisClient *monitor = ln->value;
2587 addReply(monitor,cmdobj);
2588 }
2589 decrRefCount(cmdobj);
2590 }
2591
2592 static void processInputBuffer(redisClient *c) {
2593 again:
2594 /* Before to process the input buffer, make sure the client is not
2595 * waitig for a blocking operation such as BLPOP. Note that the first
2596 * iteration the client is never blocked, otherwise the processInputBuffer
2597 * would not be called at all, but after the execution of the first commands
2598 * in the input buffer the client may be blocked, and the "goto again"
2599 * will try to reiterate. The following line will make it return asap. */
2600 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2601 if (c->bulklen == -1) {
2602 /* Read the first line of the query */
2603 char *p = strchr(c->querybuf,'\n');
2604 size_t querylen;
2605
2606 if (p) {
2607 sds query, *argv;
2608 int argc, j;
2609
2610 query = c->querybuf;
2611 c->querybuf = sdsempty();
2612 querylen = 1+(p-(query));
2613 if (sdslen(query) > querylen) {
2614 /* leave data after the first line of the query in the buffer */
2615 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2616 }
2617 *p = '\0'; /* remove "\n" */
2618 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2619 sdsupdatelen(query);
2620
2621 /* Now we can split the query in arguments */
2622 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2623 sdsfree(query);
2624
2625 if (c->argv) zfree(c->argv);
2626 c->argv = zmalloc(sizeof(robj*)*argc);
2627
2628 for (j = 0; j < argc; j++) {
2629 if (sdslen(argv[j])) {
2630 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2631 c->argc++;
2632 } else {
2633 sdsfree(argv[j]);
2634 }
2635 }
2636 zfree(argv);
2637 if (c->argc) {
2638 /* Execute the command. If the client is still valid
2639 * after processCommand() return and there is something
2640 * on the query buffer try to process the next command. */
2641 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2642 } else {
2643 /* Nothing to process, argc == 0. Just process the query
2644 * buffer if it's not empty or return to the caller */
2645 if (sdslen(c->querybuf)) goto again;
2646 }
2647 return;
2648 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2649 redisLog(REDIS_VERBOSE, "Client protocol error");
2650 freeClient(c);
2651 return;
2652 }
2653 } else {
2654 /* Bulk read handling. Note that if we are at this point
2655 the client already sent a command terminated with a newline,
2656 we are reading the bulk data that is actually the last
2657 argument of the command. */
2658 int qbl = sdslen(c->querybuf);
2659
2660 if (c->bulklen <= qbl) {
2661 /* Copy everything but the final CRLF as final argument */
2662 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2663 c->argc++;
2664 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2665 /* Process the command. If the client is still valid after
2666 * the processing and there is more data in the buffer
2667 * try to parse it. */
2668 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2669 return;
2670 }
2671 }
2672 }
2673
2674 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2675 redisClient *c = (redisClient*) privdata;
2676 char buf[REDIS_IOBUF_LEN];
2677 int nread;
2678 REDIS_NOTUSED(el);
2679 REDIS_NOTUSED(mask);
2680
2681 nread = read(fd, buf, REDIS_IOBUF_LEN);
2682 if (nread == -1) {
2683 if (errno == EAGAIN) {
2684 nread = 0;
2685 } else {
2686 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2687 freeClient(c);
2688 return;
2689 }
2690 } else if (nread == 0) {
2691 redisLog(REDIS_VERBOSE, "Client closed connection");
2692 freeClient(c);
2693 return;
2694 }
2695 if (nread) {
2696 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2697 c->lastinteraction = time(NULL);
2698 } else {
2699 return;
2700 }
2701 processInputBuffer(c);
2702 }
2703
2704 static int selectDb(redisClient *c, int id) {
2705 if (id < 0 || id >= server.dbnum)
2706 return REDIS_ERR;
2707 c->db = &server.db[id];
2708 return REDIS_OK;
2709 }
2710
2711 static void *dupClientReplyValue(void *o) {
2712 incrRefCount((robj*)o);
2713 return o;
2714 }
2715
/* List "match" method: two values match when equalStringObjects() says so. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);
    return same;
}
2719
2720 static redisClient *createClient(int fd) {
2721 redisClient *c = zmalloc(sizeof(*c));
2722
2723 anetNonBlock(NULL,fd);
2724 anetTcpNoDelay(NULL,fd);
2725 if (!c) return NULL;
2726 selectDb(c,0);
2727 c->fd = fd;
2728 c->querybuf = sdsempty();
2729 c->argc = 0;
2730 c->argv = NULL;
2731 c->bulklen = -1;
2732 c->multibulk = 0;
2733 c->mbargc = 0;
2734 c->mbargv = NULL;
2735 c->sentlen = 0;
2736 c->flags = 0;
2737 c->lastinteraction = time(NULL);
2738 c->authenticated = 0;
2739 c->replstate = REDIS_REPL_NONE;
2740 c->reply = listCreate();
2741 listSetFreeMethod(c->reply,decrRefCount);
2742 listSetDupMethod(c->reply,dupClientReplyValue);
2743 c->blocking_keys = NULL;
2744 c->blocking_keys_num = 0;
2745 c->io_keys = listCreate();
2746 listSetFreeMethod(c->io_keys,decrRefCount);
2747 c->pubsub_channels = dictCreate(&setDictType,NULL);
2748 c->pubsub_patterns = listCreate();
2749 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2750 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2751 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2752 readQueryFromClient, c) == AE_ERR) {
2753 freeClient(c);
2754 return NULL;
2755 }
2756 listAddNodeTail(server.clients,c);
2757 initClientMultiState(c);
2758 return c;
2759 }
2760
2761 static void addReply(redisClient *c, robj *obj) {
2762 if (listLength(c->reply) == 0 &&
2763 (c->replstate == REDIS_REPL_NONE ||
2764 c->replstate == REDIS_REPL_ONLINE) &&
2765 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2766 sendReplyToClient, c) == AE_ERR) return;
2767
2768 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2769 obj = dupStringObject(obj);
2770 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2771 }
2772 listAddNodeTail(c->reply,getDecodedObject(obj));
2773 }
2774
2775 static void addReplySds(redisClient *c, sds s) {
2776 robj *o = createObject(REDIS_STRING,s);
2777 addReply(c,o);
2778 decrRefCount(o);
2779 }
2780
2781 static void addReplyDouble(redisClient *c, double d) {
2782 char buf[128];
2783
2784 snprintf(buf,sizeof(buf),"%.17g",d);
2785 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2786 (unsigned long) strlen(buf),buf));
2787 }
2788
2789 static void addReplyLongLong(redisClient *c, long long ll) {
2790 char buf[128];
2791 size_t len;
2792
2793 if (ll == 0) {
2794 addReply(c,shared.czero);
2795 return;
2796 } else if (ll == 1) {
2797 addReply(c,shared.cone);
2798 return;
2799 }
2800 buf[0] = ':';
2801 len = ll2string(buf+1,sizeof(buf)-1,ll);
2802 buf[len+1] = '\r';
2803 buf[len+2] = '\n';
2804 addReplySds(c,sdsnewlen(buf,len+3));
2805 }
2806
2807 static void addReplyUlong(redisClient *c, unsigned long ul) {
2808 char buf[128];
2809 size_t len;
2810
2811 if (ul == 0) {
2812 addReply(c,shared.czero);
2813 return;
2814 } else if (ul == 1) {
2815 addReply(c,shared.cone);
2816 return;
2817 }
2818 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2819 addReplySds(c,sdsnewlen(buf,len));
2820 }
2821
2822 static void addReplyBulkLen(redisClient *c, robj *obj) {
2823 size_t len, intlen;
2824 char buf[128];
2825
2826 if (obj->encoding == REDIS_ENCODING_RAW) {
2827 len = sdslen(obj->ptr);
2828 } else {
2829 long n = (long)obj->ptr;
2830
2831 /* Compute how many bytes will take this integer as a radix 10 string */
2832 len = 1;
2833 if (n < 0) {
2834 len++;
2835 n = -n;
2836 }
2837 while((n = n/10) != 0) {
2838 len++;
2839 }
2840 }
2841 buf[0] = '$';
2842 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2843 buf[intlen+1] = '\r';
2844 buf[intlen+2] = '\n';
2845 addReplySds(c,sdsnewlen(buf,intlen+3));
2846 }
2847
2848 static void addReplyBulk(redisClient *c, robj *obj) {
2849 addReplyBulkLen(c,obj);
2850 addReply(c,obj);
2851 addReply(c,shared.crlf);
2852 }
2853
2854 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2855 static void addReplyBulkCString(redisClient *c, char *s) {
2856 if (s == NULL) {
2857 addReply(c,shared.nullbulk);
2858 } else {
2859 robj *o = createStringObject(s,strlen(s));
2860 addReplyBulk(c,o);
2861 decrRefCount(o);
2862 }
2863 }
2864
2865 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2866 int cport, cfd;
2867 char cip[128];
2868 redisClient *c;
2869 REDIS_NOTUSED(el);
2870 REDIS_NOTUSED(mask);
2871 REDIS_NOTUSED(privdata);
2872
2873 cfd = anetAccept(server.neterr, fd, cip, &cport);
2874 if (cfd == AE_ERR) {
2875 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2876 return;
2877 }
2878 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2879 if ((c = createClient(cfd)) == NULL) {
2880 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2881 close(cfd); /* May be already closed, just ingore errors */
2882 return;
2883 }
2884 /* If maxclient directive is set and this is one client more... close the
2885 * connection. Note that we create the client instead to check before
2886 * for this condition, since now the socket is already set in nonblocking
2887 * mode and we can send an error for free using the Kernel I/O */
2888 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2889 char *err = "-ERR max number of clients reached\r\n";
2890
2891 /* That's a best effort error message, don't check write errors */
2892 if (write(c->fd,err,strlen(err)) == -1) {
2893 /* Nothing to do, Just to avoid the warning... */
2894 }
2895 freeClient(c);
2896 return;
2897 }
2898 server.stat_numconnections++;
2899 }
2900
2901 /* ======================= Redis objects implementation ===================== */
2902
2903 static robj *createObject(int type, void *ptr) {
2904 robj *o;
2905
2906 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2907 if (listLength(server.objfreelist)) {
2908 listNode *head = listFirst(server.objfreelist);
2909 o = listNodeValue(head);
2910 listDelNode(server.objfreelist,head);
2911 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2912 } else {
2913 if (server.vm_enabled) {
2914 pthread_mutex_unlock(&server.obj_freelist_mutex);
2915 o = zmalloc(sizeof(*o));
2916 } else {
2917 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2918 }
2919 }
2920 o->type = type;
2921 o->encoding = REDIS_ENCODING_RAW;
2922 o->ptr = ptr;
2923 o->refcount = 1;
2924 if (server.vm_enabled) {
2925 /* Note that this code may run in the context of an I/O thread
2926 * and accessing to server.unixtime in theory is an error
2927 * (no locks). But in practice this is safe, and even if we read
2928 * garbage Redis will not fail, as it's just a statistical info */
2929 o->vm.atime = server.unixtime;
2930 o->storage = REDIS_VM_MEMORY;
2931 }
2932 return o;
2933 }
2934
2935 static robj *createStringObject(char *ptr, size_t len) {
2936 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2937 }
2938
2939 static robj *createStringObjectFromLongLong(long long value) {
2940 robj *o;
2941 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2942 incrRefCount(shared.integers[value]);
2943 o = shared.integers[value];
2944 } else {
2945 if (value >= LONG_MIN && value <= LONG_MAX) {
2946 o = createObject(REDIS_STRING, NULL);
2947 o->encoding = REDIS_ENCODING_INT;
2948 o->ptr = (void*)((long)value);
2949 } else {
2950 o = createObject(REDIS_STRING,sdsfromlonglong(value));
2951 }
2952 }
2953 return o;
2954 }
2955
2956 static robj *dupStringObject(robj *o) {
2957 assert(o->encoding == REDIS_ENCODING_RAW);
2958 return createStringObject(o->ptr,sdslen(o->ptr));
2959 }
2960
2961 static robj *createListObject(void) {
2962 list *l = listCreate();
2963
2964 listSetFreeMethod(l,decrRefCount);
2965 return createObject(REDIS_LIST,l);
2966 }
2967
2968 static robj *createSetObject(void) {
2969 dict *d = dictCreate(&setDictType,NULL);
2970 return createObject(REDIS_SET,d);
2971 }
2972
2973 static robj *createHashObject(void) {
2974 /* All the Hashes start as zipmaps. Will be automatically converted
2975 * into hash tables if there are enough elements or big elements
2976 * inside. */
2977 unsigned char *zm = zipmapNew();
2978 robj *o = createObject(REDIS_HASH,zm);
2979 o->encoding = REDIS_ENCODING_ZIPMAP;
2980 return o;
2981 }
2982
2983 static robj *createZsetObject(void) {
2984 zset *zs = zmalloc(sizeof(*zs));
2985
2986 zs->dict = dictCreate(&zsetDictType,NULL);
2987 zs->zsl = zslCreate();
2988 return createObject(REDIS_ZSET,zs);
2989 }
2990
2991 static void freeStringObject(robj *o) {
2992 if (o->encoding == REDIS_ENCODING_RAW) {
2993 sdsfree(o->ptr);
2994 }
2995 }
2996
2997 static void freeListObject(robj *o) {
2998 listRelease((list*) o->ptr);
2999 }
3000
3001 static void freeSetObject(robj *o) {
3002 dictRelease((dict*) o->ptr);
3003 }
3004
3005 static void freeZsetObject(robj *o) {
3006 zset *zs = o->ptr;
3007
3008 dictRelease(zs->dict);
3009 zslFree(zs->zsl);
3010 zfree(zs);
3011 }
3012
3013 static void freeHashObject(robj *o) {
3014 switch (o->encoding) {
3015 case REDIS_ENCODING_HT:
3016 dictRelease((dict*) o->ptr);
3017 break;
3018 case REDIS_ENCODING_ZIPMAP:
3019 zfree(o->ptr);
3020 break;
3021 default:
3022 redisPanic("Unknown hash encoding type");
3023 break;
3024 }
3025 }
3026
3027 static void incrRefCount(robj *o) {
3028 o->refcount++;
3029 }
3030
3031 static void decrRefCount(void *obj) {
3032 robj *o = obj;
3033
3034 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
3035 /* Object is a key of a swapped out value, or in the process of being
3036 * loaded. */
3037 if (server.vm_enabled &&
3038 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3039 {
3040 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
3041 redisAssert(o->type == REDIS_STRING);
3042 freeStringObject(o);
3043 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
3044 pthread_mutex_lock(&server.obj_freelist_mutex);
3045 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3046 !listAddNodeHead(server.objfreelist,o))
3047 zfree(o);
3048 pthread_mutex_unlock(&server.obj_freelist_mutex);
3049 server.vm_stats_swapped_objects--;
3050 return;
3051 }
3052 /* Object is in memory, or in the process of being swapped out. */
3053 if (--(o->refcount) == 0) {
3054 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3055 vmCancelThreadedIOJob(obj);
3056 switch(o->type) {
3057 case REDIS_STRING: freeStringObject(o); break;
3058 case REDIS_LIST: freeListObject(o); break;
3059 case REDIS_SET: freeSetObject(o); break;
3060 case REDIS_ZSET: freeZsetObject(o); break;
3061 case REDIS_HASH: freeHashObject(o); break;
3062 default: redisPanic("Unknown object type"); break;
3063 }
3064 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3065 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3066 !listAddNodeHead(server.objfreelist,o))
3067 zfree(o);
3068 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3069 }
3070 }
3071
3072 static robj *lookupKey(redisDb *db, robj *key) {
3073 dictEntry *de = dictFind(db->dict,key);
3074 if (de) {
3075 robj *key = dictGetEntryKey(de);
3076 robj *val = dictGetEntryVal(de);
3077
3078 if (server.vm_enabled) {
3079 if (key->storage == REDIS_VM_MEMORY ||
3080 key->storage == REDIS_VM_SWAPPING)
3081 {
3082 /* If we were swapping the object out, stop it, this key
3083 * was requested. */
3084 if (key->storage == REDIS_VM_SWAPPING)
3085 vmCancelThreadedIOJob(key);
3086 /* Update the access time of the key for the aging algorithm. */
3087 key->vm.atime = server.unixtime;
3088 } else {
3089 int notify = (key->storage == REDIS_VM_LOADING);
3090
3091 /* Our value was swapped on disk. Bring it at home. */
3092 redisAssert(val == NULL);
3093 val = vmLoadObject(key);
3094 dictGetEntryVal(de) = val;
3095
3096 /* Clients blocked by the VM subsystem may be waiting for
3097 * this key... */
3098 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3099 }
3100 }
3101 return val;
3102 } else {
3103 return NULL;
3104 }
3105 }
3106
3107 static robj *lookupKeyRead(redisDb *db, robj *key) {
3108 expireIfNeeded(db,key);
3109 return lookupKey(db,key);
3110 }
3111
3112 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3113 deleteIfVolatile(db,key);
3114 touchWatchedKey(db,key);
3115 return lookupKey(db,key);
3116 }
3117
3118 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3119 robj *o = lookupKeyRead(c->db, key);
3120 if (!o) addReply(c,reply);
3121 return o;
3122 }
3123
3124 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3125 robj *o = lookupKeyWrite(c->db, key);
3126 if (!o) addReply(c,reply);
3127 return o;
3128 }
3129
3130 static int checkType(redisClient *c, robj *o, int type) {
3131 if (o->type != type) {
3132 addReply(c,shared.wrongtypeerr);
3133 return 1;
3134 }
3135 return 0;
3136 }
3137
3138 static int deleteKey(redisDb *db, robj *key) {
3139 int retval;
3140
3141 /* We need to protect key from destruction: after the first dictDelete()
3142 * it may happen that 'key' is no longer valid if we don't increment
3143 * it's count. This may happen when we get the object reference directly
3144 * from the hash table with dictRandomKey() or dict iterators */
3145 incrRefCount(key);
3146 if (dictSize(db->expires)) dictDelete(db->expires,key);
3147 retval = dictDelete(db->dict,key);
3148 decrRefCount(key);
3149
3150 return retval == DICT_OK;
3151 }
3152
3153 /* Check if the nul-terminated string 's' can be represented by a long
3154 * (that is, is a number that fits into long without any other space or
3155 * character before or after the digits).
3156 *
3157 * If so, the function returns REDIS_OK and *longval is set to the value
3158 * of the number. Otherwise REDIS_ERR is returned */
3159 static int isStringRepresentableAsLong(sds s, long *longval) {
3160 char buf[32], *endptr;
3161 long value;
3162 int slen;
3163
3164 value = strtol(s, &endptr, 10);
3165 if (endptr[0] != '\0') return REDIS_ERR;
3166 slen = ll2string(buf,32,value);
3167
3168 /* If the number converted back into a string is not identical
3169 * then it's not possible to encode the string as integer */
3170 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3171 if (longval) *longval = value;
3172 return REDIS_OK;
3173 }
3174
3175 /* Try to encode a string object in order to save space */
3176 static robj *tryObjectEncoding(robj *o) {
3177 long value;
3178 sds s = o->ptr;
3179
3180 if (o->encoding != REDIS_ENCODING_RAW)
3181 return o; /* Already encoded */
3182
3183 /* It's not safe to encode shared objects: shared objects can be shared
3184 * everywhere in the "object space" of Redis. Encoded objects can only
3185 * appear as "values" (and not, for instance, as keys) */
3186 if (o->refcount > 1) return o;
3187
3188 /* Currently we try to encode only strings */
3189 redisAssert(o->type == REDIS_STRING);
3190
3191 /* Check if we can represent this string as a long integer */
3192 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3193
3194 /* Ok, this object can be encoded */
3195 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3196 decrRefCount(o);
3197 incrRefCount(shared.integers[value]);
3198 return shared.integers[value];
3199 } else {
3200 o->encoding = REDIS_ENCODING_INT;
3201 sdsfree(o->ptr);
3202 o->ptr = (void*) value;
3203 return o;
3204 }
3205 }
3206
3207 /* Get a decoded version of an encoded object (returned as a new object).
3208 * If the object is already raw-encoded just increment the ref count. */
3209 static robj *getDecodedObject(robj *o) {
3210 robj *dec;
3211
3212 if (o->encoding == REDIS_ENCODING_RAW) {
3213 incrRefCount(o);
3214 return o;
3215 }
3216 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3217 char buf[32];
3218
3219 ll2string(buf,32,(long)o->ptr);
3220 dec = createStringObject(buf,strlen(buf));
3221 return dec;
3222 } else {
3223 redisPanic("Unknown encoding type");
3224 }
3225 }
3226
3227 /* Compare two string objects via strcmp() or alike.
3228 * Note that the objects may be integer-encoded. In such a case we
3229 * use ll2string() to get a string representation of the numbers on the stack
3230 * and compare the strings, it's much faster than calling getDecodedObject().
3231 *
3232 * Important note: if objects are not integer encoded, but binary-safe strings,
3233 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3234 * binary safe. */
3235 static int compareStringObjects(robj *a, robj *b) {
3236 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3237 char bufa[128], bufb[128], *astr, *bstr;
3238 int bothsds = 1;
3239
3240 if (a == b) return 0;
3241 if (a->encoding != REDIS_ENCODING_RAW) {
3242 ll2string(bufa,sizeof(bufa),(long) a->ptr);
3243 astr = bufa;
3244 bothsds = 0;
3245 } else {
3246 astr = a->ptr;
3247 }
3248 if (b->encoding != REDIS_ENCODING_RAW) {
3249 ll2string(bufb,sizeof(bufb),(long) b->ptr);
3250 bstr = bufb;
3251 bothsds = 0;
3252 } else {
3253 bstr = b->ptr;
3254 }
3255 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3256 }
3257
3258 /* Equal string objects return 1 if the two objects are the same from the
3259 * point of view of a string comparison, otherwise 0 is returned. Note that
3260 * this function is faster then checking for (compareStringObject(a,b) == 0)
3261 * because it can perform some more optimization. */
3262 static int equalStringObjects(robj *a, robj *b) {
3263 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3264 return a->ptr == b->ptr;
3265 } else {
3266 return compareStringObjects(a,b) == 0;
3267 }
3268 }
3269
3270 static size_t stringObjectLen(robj *o) {
3271 redisAssert(o->type == REDIS_STRING);
3272 if (o->encoding == REDIS_ENCODING_RAW) {
3273 return sdslen(o->ptr);
3274 } else {
3275 char buf[32];
3276
3277 return ll2string(buf,32,(long)o->ptr);
3278 }
3279 }
3280
3281 static int getDoubleFromObject(robj *o, double *target) {
3282 double value;
3283 char *eptr;
3284
3285 if (o == NULL) {
3286 value = 0;
3287 } else {
3288 redisAssert(o->type == REDIS_STRING);
3289 if (o->encoding == REDIS_ENCODING_RAW) {
3290 value = strtod(o->ptr, &eptr);
3291 if (eptr[0] != '\0') return REDIS_ERR;
3292 } else if (o->encoding == REDIS_ENCODING_INT) {
3293 value = (long)o->ptr;
3294 } else {
3295 redisPanic("Unknown string encoding");
3296 }
3297 }
3298
3299 *target = value;
3300 return REDIS_OK;
3301 }
3302
3303 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3304 double value;
3305 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3306 if (msg != NULL) {
3307 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3308 } else {
3309 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3310 }
3311 return REDIS_ERR;
3312 }
3313
3314 *target = value;
3315 return REDIS_OK;
3316 }
3317
3318 static int getLongLongFromObject(robj *o, long long *target) {
3319 long long value;
3320 char *eptr;
3321
3322 if (o == NULL) {
3323 value = 0;
3324 } else {
3325 redisAssert(o->type == REDIS_STRING);
3326 if (o->encoding == REDIS_ENCODING_RAW) {
3327 value = strtoll(o->ptr, &eptr, 10);
3328 if (eptr[0] != '\0') return REDIS_ERR;
3329 } else if (o->encoding == REDIS_ENCODING_INT) {
3330 value = (long)o->ptr;
3331 } else {
3332 redisPanic("Unknown string encoding");
3333 }
3334 }
3335
3336 *target = value;
3337 return REDIS_OK;
3338 }
3339
3340 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3341 long long value;
3342 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3343 if (msg != NULL) {
3344 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3345 } else {
3346 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3347 }
3348 return REDIS_ERR;
3349 }
3350
3351 *target = value;
3352 return REDIS_OK;
3353 }
3354
3355 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3356 long long value;
3357
3358 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3359 if (value < LONG_MIN || value > LONG_MAX) {
3360 if (msg != NULL) {
3361 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3362 } else {
3363 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3364 }
3365 return REDIS_ERR;
3366 }
3367
3368 *target = value;
3369 return REDIS_OK;
3370 }
3371
3372 /*============================ RDB saving/loading =========================== */
3373
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3378
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3384
3385 /* check rdbLoadLen() comments for more info */
3386 static int rdbSaveLen(FILE *fp, uint32_t len) {
3387 unsigned char buf[2];
3388
3389 if (len < (1<<6)) {
3390 /* Save a 6 bit len */
3391 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3392 if (fwrite(buf,1,1,fp) == 0) return -1;
3393 } else if (len < (1<<14)) {
3394 /* Save a 14 bit len */
3395 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3396 buf[1] = len&0xFF;
3397 if (fwrite(buf,2,1,fp) == 0) return -1;
3398 } else {
3399 /* Save a 32 bit len */
3400 buf[0] = (REDIS_RDB_32BITLEN<<6);
3401 if (fwrite(buf,1,1,fp) == 0) return -1;
3402 len = htonl(len);
3403 if (fwrite(&len,4,1,fp) == 0) return -1;
3404 }
3405 return 0;
3406 }
3407
3408 /* Encode 'value' as an integer if possible (if integer will fit the
3409 * supported range). If the function sucessful encoded the integer
3410 * then the (up to 5 bytes) encoded representation is written in the
3411 * string pointed by 'enc' and the length is returned. Otherwise
3412 * 0 is returned. */
3413 static int rdbEncodeInteger(long long value, unsigned char *enc) {
3414 /* Finally check if it fits in our ranges */
3415 if (value >= -(1<<7) && value <= (1<<7)-1) {
3416 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3417 enc[1] = value&0xFF;
3418 return 2;
3419 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3420 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3421 enc[1] = value&0xFF;
3422 enc[2] = (value>>8)&0xFF;
3423 return 3;
3424 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3425 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3426 enc[1] = value&0xFF;
3427 enc[2] = (value>>8)&0xFF;
3428 enc[3] = (value>>16)&0xFF;
3429 enc[4] = (value>>24)&0xFF;
3430 return 5;
3431 } else {
3432 return 0;
3433 }
3434 }
3435
/* Strings in the form "2391" or "-100", without any space and with a value
 * fitting in a signed 8, 16 or 32 bit integer, can be saved as integers to
 * save space. Returns the length of the encoding written to 'enc', or 0 if
 * the string cannot be encoded this way. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    char roundtrip[32], *stop;
    long long parsed;

    /* The whole string must parse as a number... */
    parsed = strtoll(s, &stop, 10);
    if (*stop != '\0') return 0;

    /* ...and the number converted back into a string must be identical to
     * the input, otherwise the original could not be reconstructed. */
    ll2string(roundtrip,32,parsed);
    if (strlen(roundtrip) != len) return 0;
    if (memcmp(roundtrip,s,len)) return 0;

    return rdbEncodeInteger(parsed,enc);
}
3454
3455 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3456 size_t comprlen, outlen;
3457 unsigned char byte;
3458 void *out;
3459
3460 /* We require at least four bytes compression for this to be worth it */
3461 if (len <= 4) return 0;
3462 outlen = len-4;
3463 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3464 comprlen = lzf_compress(s, len, out, outlen);
3465 if (comprlen == 0) {
3466 zfree(out);
3467 return 0;
3468 }
3469 /* Data compressed! Let's save it on disk */
3470 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3471 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3472 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3473 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3474 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3475 zfree(out);
3476 return comprlen;
3477
3478 writeerr:
3479 zfree(out);
3480 return -1;
3481 }
3482
3483 /* Save a string objet as [len][data] on disk. If the object is a string
3484 * representation of an integer value we try to safe it in a special form */
3485 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3486 int enclen;
3487
3488 /* Try integer encoding */
3489 if (len <= 11) {
3490 unsigned char buf[5];
3491 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3492 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3493 return 0;
3494 }
3495 }
3496
3497 /* Try LZF compression - under 20 bytes it's unable to compress even
3498 * aaaaaaaaaaaaaaaaaa so skip it */
3499 if (server.rdbcompression && len > 20) {
3500 int retval;
3501
3502 retval = rdbSaveLzfStringObject(fp,s,len);
3503 if (retval == -1) return -1;
3504 if (retval > 0) return 0;
3505 /* retval == 0 means data can't be compressed, save the old way */
3506 }
3507
3508 /* Store verbatim */
3509 if (rdbSaveLen(fp,len) == -1) return -1;
3510 if (len && fwrite(s,len,1,fp) == 0) return -1;
3511 return 0;
3512 }
3513
3514 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3515 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3516 int retval;
3517
3518 /* Avoid to decode the object, then encode it again, if the
3519 * object is alrady integer encoded. */
3520 if (obj->encoding == REDIS_ENCODING_INT) {
3521 long val = (long) obj->ptr;
3522 unsigned char buf[5];
3523 int enclen;
3524
3525 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3526 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3527 return 0;
3528 }
3529 /* otherwise... fall throught and continue with the usual
3530 * code path. */
3531 }
3532
3533 /* Avoid incr/decr ref count business when possible.
3534 * This plays well with copy-on-write given that we are probably
3535 * in a child process (BGSAVE). Also this makes sure key objects
3536 * of swapped objects are not incRefCount-ed (an assert does not allow
3537 * this in order to avoid bugs) */
3538 if (obj->encoding != REDIS_ENCODING_RAW) {
3539 obj = getDecodedObject(obj);
3540 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3541 decrRefCount(obj);
3542 } else {
3543 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3544 }
3545 return retval;
3546 }
3547
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function, that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3590
3591 /* Save a Redis object. */
3592 static int rdbSaveObject(FILE *fp, robj *o) {
3593 if (o->type == REDIS_STRING) {
3594 /* Save a string value */
3595 if (rdbSaveStringObject(fp,o) == -1) return -1;
3596 } else if (o->type == REDIS_LIST) {
3597 /* Save a list value */
3598 list *list = o->ptr;
3599 listIter li;
3600 listNode *ln;
3601
3602 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3603 listRewind(list,&li);
3604 while((ln = listNext(&li))) {
3605 robj *eleobj = listNodeValue(ln);
3606
3607 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3608 }
3609 } else if (o->type == REDIS_SET) {
3610 /* Save a set value */
3611 dict *set = o->ptr;
3612 dictIterator *di = dictGetIterator(set);
3613 dictEntry *de;
3614
3615 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3616 while((de = dictNext(di)) != NULL) {
3617 robj *eleobj = dictGetEntryKey(de);
3618
3619 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3620 }
3621 dictReleaseIterator(di);
3622 } else if (o->type == REDIS_ZSET) {
3623 /* Save a set value */
3624 zset *zs = o->ptr;
3625 dictIterator *di = dictGetIterator(zs->dict);
3626 dictEntry *de;
3627
3628 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3629 while((de = dictNext(di)) != NULL) {
3630 robj *eleobj = dictGetEntryKey(de);
3631 double *score = dictGetEntryVal(de);
3632
3633 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3634 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3635 }
3636 dictReleaseIterator(di);
3637 } else if (o->type == REDIS_HASH) {
3638 /* Save a hash value */
3639 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3640 unsigned char *p = zipmapRewind(o->ptr);
3641 unsigned int count = zipmapLen(o->ptr);
3642 unsigned char *key, *val;
3643 unsigned int klen, vlen;
3644
3645 if (rdbSaveLen(fp,count) == -1) return -1;
3646 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3647 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3648 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3649 }
3650 } else {
3651 dictIterator *di = dictGetIterator(o->ptr);
3652 dictEntry *de;
3653
3654 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3655 while((de = dictNext(di)) != NULL) {
3656 robj *key = dictGetEntryKey(de);
3657 robj *val = dictGetEntryVal(de);
3658
3659 if (rdbSaveStringObject(fp,key) == -1) return -1;
3660 if (rdbSaveStringObject(fp,val) == -1) return -1;
3661 }
3662 dictReleaseIterator(di);
3663 }
3664 } else {
3665 redisPanic("Unknown object type");
3666 }
3667 return 0;
3668 }
3669
3670 /* Return the length the object will have on disk if saved with
3671 * the rdbSaveObject() function. Currently we use a trick to get
3672 * this length with very little changes to the code. In the future
3673 * we could switch to a faster solution. */
3674 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3675 if (fp == NULL) fp = server.devnull;
3676 rewind(fp);
3677 assert(rdbSaveObject(fp,o) != 1);
3678 return ftello(fp);
3679 }
3680
3681 /* Return the number of pages required to save this object in the swap file */
3682 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3683 off_t bytes = rdbSavedObjectLen(o,fp);
3684
3685 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3686 }
3687
3688 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3689 static int rdbSave(char *filename) {
3690 dictIterator *di = NULL;
3691 dictEntry *de;
3692 FILE *fp;
3693 char tmpfile[256];
3694 int j;
3695 time_t now = time(NULL);
3696
3697 /* Wait for I/O therads to terminate, just in case this is a
3698 * foreground-saving, to avoid seeking the swap file descriptor at the
3699 * same time. */
3700 if (server.vm_enabled)
3701 waitEmptyIOJobsQueue();
3702
3703 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3704 fp = fopen(tmpfile,"w");
3705 if (!fp) {
3706 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3707 return REDIS_ERR;
3708 }
3709 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3710 for (j = 0; j < server.dbnum; j++) {
3711 redisDb *db = server.db+j;
3712 dict *d = db->dict;
3713 if (dictSize(d) == 0) continue;
3714 di = dictGetIterator(d);
3715 if (!di) {
3716 fclose(fp);
3717 return REDIS_ERR;
3718 }
3719
3720 /* Write the SELECT DB opcode */
3721 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3722 if (rdbSaveLen(fp,j) == -1) goto werr;
3723
3724 /* Iterate this DB writing every entry */
3725 while((de = dictNext(di)) != NULL) {
3726 robj *key = dictGetEntryKey(de);
3727 robj *o = dictGetEntryVal(de);
3728 time_t expiretime = getExpire(db,key);
3729
3730 /* Save the expire time */
3731 if (expiretime != -1) {
3732 /* If this key is already expired skip it */
3733 if (expiretime < now) continue;
3734 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3735 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3736 }
3737 /* Save the key and associated value. This requires special
3738 * handling if the value is swapped out. */
3739 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3740 key->storage == REDIS_VM_SWAPPING) {
3741 /* Save type, key, value */
3742 if (rdbSaveType(fp,o->type) == -1) goto werr;
3743 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3744 if (rdbSaveObject(fp,o) == -1) goto werr;
3745 } else {
3746 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3747 robj *po;
3748 /* Get a preview of the object in memory */
3749 po = vmPreviewObject(key);
3750 /* Save type, key, value */
3751 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3752 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3753 if (rdbSaveObject(fp,po) == -1) goto werr;
3754 /* Remove the loaded object from memory */
3755 decrRefCount(po);
3756 }
3757 }
3758 dictReleaseIterator(di);
3759 }
3760 /* EOF opcode */
3761 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3762
3763 /* Make sure data will not remain on the OS's output buffers */
3764 fflush(fp);
3765 fsync(fileno(fp));
3766 fclose(fp);
3767
3768 /* Use RENAME to make sure the DB file is changed atomically only
3769 * if the generate DB file is ok. */
3770 if (rename(tmpfile,filename) == -1) {
3771 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3772 unlink(tmpfile);
3773 return REDIS_ERR;
3774 }
3775 redisLog(REDIS_NOTICE,"DB saved on disk");
3776 server.dirty = 0;
3777 server.lastsave = time(NULL);
3778 return REDIS_OK;
3779
3780 werr:
3781 fclose(fp);
3782 unlink(tmpfile);
3783 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3784 if (di) dictReleaseIterator(di);
3785 return REDIS_ERR;
3786 }
3787
3788 static int rdbSaveBackground(char *filename) {
3789 pid_t childpid;
3790
3791 if (server.bgsavechildpid != -1) return REDIS_ERR;
3792 if (server.vm_enabled) waitEmptyIOJobsQueue();
3793 if ((childpid = fork()) == 0) {
3794 /* Child */
3795 if (server.vm_enabled) vmReopenSwapFile();
3796 close(server.fd);
3797 if (rdbSave(filename) == REDIS_OK) {
3798 _exit(0);
3799 } else {
3800 _exit(1);
3801 }
3802 } else {
3803 /* Parent */
3804 if (childpid == -1) {
3805 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3806 strerror(errno));
3807 return REDIS_ERR;
3808 }
3809 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3810 server.bgsavechildpid = childpid;
3811 updateDictResizePolicy();
3812 return REDIS_OK;
3813 }
3814 return REDIS_OK; /* unreached */
3815 }
3816
/* Remove the temp file "temp-<pid>.rdb" that belongs to the saving process
 * 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3823
/* Read a one byte type code from 'fp'. Returns the byte value (0-255), or
 * -1 if the byte could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char code;

    return (fread(&code,1,1,fp) == 0) ? -1 : code;
}
3829
/* Read a 4 byte time value from 'fp' as an int32_t in the host byte layout
 * and return it as a time_t. Returns -1 if the bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;
    size_t items = fread(&raw,4,1,fp);

    if (items == 0) return -1;
    return (time_t) raw;
}
3835
3836 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3837 * of this file for a description of how this are stored on disk.
3838 *
3839 * isencoded is set to 1 if the readed length is not actually a length but
3840 * an "encoding type", check the above comments for more info */
3841 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3842 unsigned char buf[2];
3843 uint32_t len;
3844 int type;
3845
3846 if (isencoded) *isencoded = 0;
3847 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3848 type = (buf[0]&0xC0)>>6;
3849 if (type == REDIS_RDB_6BITLEN) {
3850 /* Read a 6 bit len */
3851 return buf[0]&0x3F;
3852 } else if (type == REDIS_RDB_ENCVAL) {
3853 /* Read a 6 bit len encoding type */
3854 if (isencoded) *isencoded = 1;
3855 return buf[0]&0x3F;
3856 } else if (type == REDIS_RDB_14BITLEN) {
3857 /* Read a 14 bit len */
3858 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3859 return ((buf[0]&0x3F)<<8)|buf[1];
3860 } else {
3861 /* Read a 32 bit len */
3862 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3863 return ntohl(len);
3864 }
3865 }
3866
3867 /* Load an integer-encoded object from file 'fp', with the specified
3868 * encoding type 'enctype'. If encode is true the function may return
3869 * an integer-encoded object as reply, otherwise the returned object
3870 * will always be encoded as a raw string. */
3871 static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
3872 unsigned char enc[4];
3873 long long val;
3874
3875 if (enctype == REDIS_RDB_ENC_INT8) {
3876 if (fread(enc,1,1,fp) == 0) return NULL;
3877 val = (signed char)enc[0];
3878 } else if (enctype == REDIS_RDB_ENC_INT16) {
3879 uint16_t v;
3880 if (fread(enc,2,1,fp) == 0) return NULL;
3881 v = enc[0]|(enc[1]<<8);
3882 val = (int16_t)v;
3883 } else if (enctype == REDIS_RDB_ENC_INT32) {
3884 uint32_t v;
3885 if (fread(enc,4,1,fp) == 0) return NULL;
3886 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3887 val = (int32_t)v;
3888 } else {
3889 val = 0; /* anti-warning */
3890 redisPanic("Unknown RDB integer encoding type");
3891 }
3892 if (encode)
3893 return createStringObjectFromLongLong(val);
3894 else
3895 return createObject(REDIS_STRING,sdsfromlonglong(val));
3896 }
3897
3898 static robj *rdbLoadLzfStringObject(FILE*fp) {
3899 unsigned int len, clen;
3900 unsigned char *c = NULL;
3901 sds val = NULL;
3902
3903 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3904 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3905 if ((c = zmalloc(clen)) == NULL) goto err;
3906 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3907 if (fread(c,clen,1,fp) == 0) goto err;
3908 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3909 zfree(c);
3910 return createObject(REDIS_STRING,val);
3911 err:
3912 zfree(c);
3913 sdsfree(val);
3914 return NULL;
3915 }
3916
3917 static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
3918 int isencoded;
3919 uint32_t len;
3920 sds val;
3921
3922 len = rdbLoadLen(fp,&isencoded);
3923 if (isencoded) {
3924 switch(len) {
3925 case REDIS_RDB_ENC_INT8:
3926 case REDIS_RDB_ENC_INT16:
3927 case REDIS_RDB_ENC_INT32:
3928 return rdbLoadIntegerObject(fp,len,encode);
3929 case REDIS_RDB_ENC_LZF:
3930 return rdbLoadLzfStringObject(fp);
3931 default:
3932 redisPanic("Unknown RDB encoding type");
3933 }
3934 }
3935
3936 if (len == REDIS_RDB_LENERR) return NULL;
3937 val = sdsnewlen(NULL,len);
3938 if (len && fread(val,len,1,fp) == 0) {
3939 sdsfree(val);
3940 return NULL;
3941 }
3942 return createObject(REDIS_STRING,val);
3943 }
3944
3945 static robj *rdbLoadStringObject(FILE *fp) {
3946 return rdbGenericLoadStringObject(fp,0);
3947 }
3948
3949 static robj *rdbLoadEncodedStringObject(FILE *fp) {
3950 return rdbGenericLoadStringObject(fp,1);
3951 }
3952
3953 /* For information about double serialization check rdbSaveDoubleValue() */
3954 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3955 char buf[128];
3956 unsigned char len;
3957
3958 if (fread(&len,1,1,fp) == 0) return -1;
3959 switch(len) {
3960 case 255: *val = R_NegInf; return 0;
3961 case 254: *val = R_PosInf; return 0;
3962 case 253: *val = R_Nan; return 0;
3963 default:
3964 if (fread(buf,len,1,fp) == 0) return -1;
3965 buf[len] = '\0';
3966 sscanf(buf, "%lg", val);
3967 return 0;
3968 }
3969 }
3970
3971 /* Load a Redis object of the specified type from the specified file.
3972 * On success a newly allocated object is returned, otherwise NULL. */
3973 static robj *rdbLoadObject(int type, FILE *fp) {
3974 robj *o;
3975
3976 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3977 if (type == REDIS_STRING) {
3978 /* Read string value */
3979 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3980 o = tryObjectEncoding(o);
3981 } else if (type == REDIS_LIST || type == REDIS_SET) {
3982 /* Read list/set value */
3983 uint32_t listlen;
3984
3985 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3986 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3987 /* It's faster to expand the dict to the right size asap in order
3988 * to avoid rehashing */
3989 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3990 dictExpand(o->ptr,listlen);
3991 /* Load every single element of the list/set */
3992 while(listlen--) {
3993 robj *ele;
3994
3995 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
3996 ele = tryObjectEncoding(ele);
3997 if (type == REDIS_LIST) {
3998 listAddNodeTail((list*)o->ptr,ele);
3999 } else {
4000 dictAdd((dict*)o->ptr,ele,NULL);
4001 }
4002 }
4003 } else if (type == REDIS_ZSET) {
4004 /* Read list/set value */
4005 size_t zsetlen;
4006 zset *zs;
4007
4008 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4009 o = createZsetObject();
4010 zs = o->ptr;
4011 /* Load every single element of the list/set */
4012 while(zsetlen--) {
4013 robj *ele;
4014 double *score = zmalloc(sizeof(double));
4015
4016 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4017 ele = tryObjectEncoding(ele);
4018 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4019 dictAdd(zs->dict,ele,score);
4020 zslInsert(zs->zsl,*score,ele);
4021 incrRefCount(ele); /* added to skiplist */
4022 }
4023 } else if (type == REDIS_HASH) {
4024 size_t hashlen;
4025
4026 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4027 o = createHashObject();
4028 /* Too many entries? Use an hash table. */
4029 if (hashlen > server.hash_max_zipmap_entries)
4030 convertToRealHash(o);
4031 /* Load every key/value, then set it into the zipmap or hash
4032 * table, as needed. */
4033 while(hashlen--) {
4034 robj *key, *val;
4035
4036 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4037 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4038 /* If we are using a zipmap and there are too big values
4039 * the object is converted to real hash table encoding. */
4040 if (o->encoding != REDIS_ENCODING_HT &&
4041 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4042 sdslen(val->ptr) > server.hash_max_zipmap_value))
4043 {
4044 convertToRealHash(o);
4045 }
4046
4047 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4048 unsigned char *zm = o->ptr;
4049
4050 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4051 val->ptr,sdslen(val->ptr),NULL);
4052 o->ptr = zm;
4053 decrRefCount(key);
4054 decrRefCount(val);
4055 } else {
4056 key = tryObjectEncoding(key);
4057 val = tryObjectEncoding(val);
4058 dictAdd((dict*)o->ptr,key,val);
4059 }
4060 }
4061 } else {
4062 redisPanic("Unknown object type");
4063 }
4064 return o;
4065 }
4066
4067 static int rdbLoad(char *filename) {
4068 FILE *fp;
4069 uint32_t dbid;
4070 int type, retval, rdbver;
4071 int swap_all_values = 0;
4072 dict *d = server.db[0].dict;
4073 redisDb *db = server.db+0;
4074 char buf[1024];
4075 time_t expiretime, now = time(NULL);
4076 long long loadedkeys = 0;
4077
4078 fp = fopen(filename,"r");
4079 if (!fp) return REDIS_ERR;
4080 if (fread(buf,9,1,fp) == 0) goto eoferr;
4081 buf[9] = '\0';
4082 if (memcmp(buf,"REDIS",5) != 0) {
4083 fclose(fp);
4084 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4085 return REDIS_ERR;
4086 }
4087 rdbver = atoi(buf+5);
4088 if (rdbver != 1) {
4089 fclose(fp);
4090 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4091 return REDIS_ERR;
4092 }
4093 while(1) {
4094 robj *key, *val;
4095
4096 expiretime = -1;
4097 /* Read type. */
4098 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4099 if (type == REDIS_EXPIRETIME) {
4100 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4101 /* We read the time so we need to read the object type again */
4102 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4103 }
4104 if (type == REDIS_EOF) break;
4105 /* Handle SELECT DB opcode as a special case */
4106 if (type == REDIS_SELECTDB) {
4107 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4108 goto eoferr;
4109 if (dbid >= (unsigned)server.dbnum) {
4110 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4111 exit(1);
4112 }
4113 db = server.db+dbid;
4114 d = db->dict;
4115 continue;
4116 }
4117 /* Read key */
4118 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4119 /* Read value */
4120 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4121 /* Check if the key already expired */
4122 if (expiretime != -1 && expiretime < now) {
4123 decrRefCount(key);
4124 decrRefCount(val);
4125 continue;
4126 }
4127 /* Add the new object in the hash table */
4128 retval = dictAdd(d,key,val);
4129 if (retval == DICT_ERR) {
4130 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4131 exit(1);
4132 }
4133 loadedkeys++;
4134 /* Set the expire time if needed */
4135 if (expiretime != -1) setExpire(db,key,expiretime);
4136
4137 /* Handle swapping while loading big datasets when VM is on */
4138
4139 /* If we detecter we are hopeless about fitting something in memory
4140 * we just swap every new key on disk. Directly...
4141 * Note that's important to check for this condition before resorting
4142 * to random sampling, otherwise we may try to swap already
4143 * swapped keys. */
4144 if (swap_all_values) {
4145 dictEntry *de = dictFind(d,key);
4146
4147 /* de may be NULL since the key already expired */
4148 if (de) {
4149 key = dictGetEntryKey(de);
4150 val = dictGetEntryVal(de);
4151
4152 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4153 dictGetEntryVal(de) = NULL;
4154 }
4155 }
4156 continue;
4157 }
4158
4159 /* If we have still some hope of having some value fitting memory
4160 * then we try random sampling. */
4161 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4162 while (zmalloc_used_memory() > server.vm_max_memory) {
4163 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4164 }
4165 if (zmalloc_used_memory() > server.vm_max_memory)
4166 swap_all_values = 1; /* We are already using too much mem */
4167 }
4168 }
4169 fclose(fp);
4170 return REDIS_OK;
4171
4172 eoferr: /* unexpected end of file is handled here with a fatal exit */
4173 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4174 exit(1);
4175 return REDIS_ERR; /* Just to avoid warning */
4176 }
4177
4178 /*================================== Shutdown =============================== */
4179 static int prepareForShutdown() {
4180 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4181 /* Kill the saving child if there is a background saving in progress.
4182 We want to avoid race conditions, for instance our saving child may
4183 overwrite the synchronous saving did by SHUTDOWN. */
4184 if (server.bgsavechildpid != -1) {
4185 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4186 kill(server.bgsavechildpid,SIGKILL);
4187 rdbRemoveTempFile(server.bgsavechildpid);
4188 }
4189 if (server.appendonly) {
4190 /* Append only file: fsync() the AOF and exit */
4191 fsync(server.appendfd);
4192 if (server.vm_enabled) unlink(server.vm_swap_file);
4193 } else {
4194 /* Snapshotting. Perform a SYNC SAVE and exit */
4195 if (rdbSave(server.dbfilename) == REDIS_OK) {
4196 if (server.daemonize)
4197 unlink(server.pidfile);
4198 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4199 } else {
4200 /* Ooops.. error saving! The best we can do is to continue
4201 * operating. Note that if there was a background saving process,
4202 * in the next cron() Redis will be notified that the background
4203 * saving aborted, handling special stuff like slaves pending for
4204 * synchronization... */
4205 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4206 return REDIS_ERR;
4207 }
4208 }
4209 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4210 return REDIS_OK;
4211 }
4212
4213 /*================================== Commands =============================== */
4214
4215 static void authCommand(redisClient *c) {
4216 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4217 c->authenticated = 1;
4218 addReply(c,shared.ok);
4219 } else {
4220 c->authenticated = 0;
4221 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4222 }
4223 }
4224
4225 static void pingCommand(redisClient *c) {
4226 addReply(c,shared.pong);
4227 }
4228
4229 static void echoCommand(redisClient *c) {
4230 addReplyBulk(c,c->argv[1]);
4231 }
4232
4233 /*=================================== Strings =============================== */
4234
4235 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4236 int retval;
4237 long seconds = 0; /* initialized to avoid an harmness warning */
4238
4239 if (expire) {
4240 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4241 return;
4242 if (seconds <= 0) {
4243 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4244 return;
4245 }
4246 }
4247
4248 touchWatchedKey(c->db,key);
4249 if (nx) deleteIfVolatile(c->db,key);
4250 retval = dictAdd(c->db->dict,key,val);
4251 if (retval == DICT_ERR) {
4252 if (!nx) {
4253 /* If the key is about a swapped value, we want a new key object
4254 * to overwrite the old. So we delete the old key in the database.
4255 * This will also make sure that swap pages about the old object
4256 * will be marked as free. */
4257 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4258 incrRefCount(key);
4259 dictReplace(c->db->dict,key,val);
4260 incrRefCount(val);
4261 } else {
4262 addReply(c,shared.czero);
4263 return;
4264 }
4265 } else {
4266 incrRefCount(key);
4267 incrRefCount(val);
4268 }
4269 server.dirty++;
4270 removeExpire(c->db,key);
4271 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4272 addReply(c, nx ? shared.cone : shared.ok);
4273 }
4274
4275 static void setCommand(redisClient *c) {
4276 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4277 }
4278
4279 static void setnxCommand(redisClient *c) {
4280 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4281 }
4282
4283 static void setexCommand(redisClient *c) {
4284 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4285 }
4286
4287 static int getGenericCommand(redisClient *c) {
4288 robj *o;
4289
4290 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4291 return REDIS_OK;
4292
4293 if (o->type != REDIS_STRING) {
4294 addReply(c,shared.wrongtypeerr);
4295 return REDIS_ERR;
4296 } else {
4297 addReplyBulk(c,o);
4298 return REDIS_OK;
4299 }
4300 }
4301
4302 static void getCommand(redisClient *c) {
4303 getGenericCommand(c);
4304 }
4305
4306 static void getsetCommand(redisClient *c) {
4307 if (getGenericCommand(c) == REDIS_ERR) return;
4308 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4309 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4310 } else {
4311 incrRefCount(c->argv[1]);
4312 }
4313 incrRefCount(c->argv[2]);
4314 server.dirty++;
4315 removeExpire(c->db,c->argv[1]);
4316 }
4317
4318 static void mgetCommand(redisClient *c) {
4319 int j;
4320
4321 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4322 for (j = 1; j < c->argc; j++) {
4323 robj *o = lookupKeyRead(c->db,c->argv[j]);
4324 if (o == NULL) {
4325 addReply(c,shared.nullbulk);
4326 } else {
4327 if (o->type != REDIS_STRING) {
4328 addReply(c,shared.nullbulk);
4329 } else {
4330 addReplyBulk(c,o);
4331 }
4332 }
4333 }
4334 }
4335
4336 static void msetGenericCommand(redisClient *c, int nx) {
4337 int j, busykeys = 0;
4338
4339 if ((c->argc % 2) == 0) {
4340 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4341 return;
4342 }
4343 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4344 * set nothing at all if at least one already key exists. */
4345 if (nx) {
4346 for (j = 1; j < c->argc; j += 2) {
4347 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4348 busykeys++;
4349 }
4350 }
4351 }
4352 if (busykeys) {
4353 addReply(c, shared.czero);
4354 return;
4355 }
4356
4357 for (j = 1; j < c->argc; j += 2) {
4358 int retval;
4359
4360 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4361 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4362 if (retval == DICT_ERR) {
4363 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4364 incrRefCount(c->argv[j+1]);
4365 } else {
4366 incrRefCount(c->argv[j]);
4367 incrRefCount(c->argv[j+1]);
4368 }
4369 removeExpire(c->db,c->argv[j]);
4370 }
4371 server.dirty += (c->argc-1)/2;
4372 addReply(c, nx ? shared.cone : shared.ok);
4373 }
4374
4375 static void msetCommand(redisClient *c) {
4376 msetGenericCommand(c,0);
4377 }
4378
4379 static void msetnxCommand(redisClient *c) {
4380 msetGenericCommand(c,1);
4381 }
4382
4383 static void incrDecrCommand(redisClient *c, long long incr) {
4384 long long value;
4385 int retval;
4386 robj *o;
4387
4388 o = lookupKeyWrite(c->db,c->argv[1]);
4389 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4390 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
4391
4392 value += incr;
4393 o = createStringObjectFromLongLong(value);
4394 retval = dictAdd(c->db->dict,c->argv[1],o);
4395 if (retval == DICT_ERR) {
4396 dictReplace(c->db->dict,c->argv[1],o);
4397 removeExpire(c->db,c->argv[1]);
4398 } else {
4399 incrRefCount(c->argv[1]);
4400 }
4401 server.dirty++;
4402 addReply(c,shared.colon);
4403 addReply(c,o);
4404 addReply(c,shared.crlf);
4405 }
4406
4407 static void incrCommand(redisClient *c) {
4408 incrDecrCommand(c,1);
4409 }
4410
4411 static void decrCommand(redisClient *c) {
4412 incrDecrCommand(c,-1);
4413 }
4414
4415 static void incrbyCommand(redisClient *c) {
4416 long long incr;
4417
4418 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4419 incrDecrCommand(c,incr);
4420 }
4421
4422 static void decrbyCommand(redisClient *c) {
4423 long long incr;
4424
4425 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4426 incrDecrCommand(c,-incr);
4427 }
4428
4429 static void appendCommand(redisClient *c) {
4430 int retval;
4431 size_t totlen;
4432 robj *o;
4433
4434 o = lookupKeyWrite(c->db,c->argv[1]);
4435 if (o == NULL) {
4436 /* Create the key */
4437 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4438 incrRefCount(c->argv[1]);
4439 incrRefCount(c->argv[2]);
4440 totlen = stringObjectLen(c->argv[2]);
4441 } else {
4442 dictEntry *de;
4443
4444 de = dictFind(c->db->dict,c->argv[1]);
4445 assert(de != NULL);
4446
4447 o = dictGetEntryVal(de);
4448 if (o->type != REDIS_STRING) {
4449 addReply(c,shared.wrongtypeerr);
4450 return;
4451 }
4452 /* If the object is specially encoded or shared we have to make
4453 * a copy */
4454 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4455 robj *decoded = getDecodedObject(o);
4456
4457 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4458 decrRefCount(decoded);
4459 dictReplace(c->db->dict,c->argv[1],o);
4460 }
4461 /* APPEND! */
4462 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4463 o->ptr = sdscatlen(o->ptr,
4464 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4465 } else {
4466 o->ptr = sdscatprintf(o->ptr, "%ld",
4467 (unsigned long) c->argv[2]->ptr);
4468 }
4469 totlen = sdslen(o->ptr);
4470 }
4471 server.dirty++;
4472 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4473 }
4474
4475 static void substrCommand(redisClient *c) {
4476 robj *o;
4477 long start = atoi(c->argv[2]->ptr);
4478 long end = atoi(c->argv[3]->ptr);
4479 size_t rangelen, strlen;
4480 sds range;
4481
4482 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4483 checkType(c,o,REDIS_STRING)) return;
4484
4485 o = getDecodedObject(o);
4486 strlen = sdslen(o->ptr);
4487
4488 /* convert negative indexes */
4489 if (start < 0) start = strlen+start;
4490 if (end < 0) end = strlen+end;
4491 if (start < 0) start = 0;
4492 if (end < 0) end = 0;
4493
4494 /* indexes sanity checks */
4495 if (start > end || (size_t)start >= strlen) {
4496 /* Out of range start or start > end result in null reply */
4497 addReply(c,shared.nullbulk);
4498 decrRefCount(o);
4499 return;
4500 }
4501 if ((size_t)end >= strlen) end = strlen-1;
4502 rangelen = (end-start)+1;
4503
4504 /* Return the result */
4505 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4506 range = sdsnewlen((char*)o->ptr+start,rangelen);
4507 addReplySds(c,range);
4508 addReply(c,shared.crlf);
4509 decrRefCount(o);
4510 }
4511
4512 /* ========================= Type agnostic commands ========================= */
4513
4514 static void delCommand(redisClient *c) {
4515 int deleted = 0, j;
4516
4517 for (j = 1; j < c->argc; j++) {
4518 if (deleteKey(c->db,c->argv[j])) {
4519 touchWatchedKey(c->db,c->argv[j]);
4520 server.dirty++;
4521 deleted++;
4522 }
4523 }
4524 addReplyLongLong(c,deleted);
4525 }
4526
4527 static void existsCommand(redisClient *c) {
4528 expireIfNeeded(c->db,c->argv[1]);
4529 if (dictFind(c->db->dict,c->argv[1])) {
4530 addReply(c, shared.cone);
4531 } else {
4532 addReply(c, shared.czero);
4533 }
4534 }
4535
4536 static void selectCommand(redisClient *c) {
4537 int id = atoi(c->argv[1]->ptr);
4538
4539 if (selectDb(c,id) == REDIS_ERR) {
4540 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4541 } else {
4542 addReply(c,shared.ok);
4543 }
4544 }
4545
4546 static void randomkeyCommand(redisClient *c) {
4547 dictEntry *de;
4548 robj *key;
4549
4550 while(1) {
4551 de = dictGetRandomKey(c->db->dict);
4552 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4553 }
4554
4555 if (de == NULL) {
4556 addReply(c,shared.nullbulk);
4557 return;
4558 }
4559
4560 key = dictGetEntryKey(de);
4561 if (server.vm_enabled) {
4562 key = dupStringObject(key);
4563 addReplyBulk(c,key);
4564 decrRefCount(key);
4565 } else {
4566 addReplyBulk(c,key);
4567 }
4568 }
4569
4570 static void keysCommand(redisClient *c) {
4571 dictIterator *di;
4572 dictEntry *de;
4573 sds pattern = c->argv[1]->ptr;
4574 int plen = sdslen(pattern);
4575 unsigned long numkeys = 0;
4576 robj *lenobj = createObject(REDIS_STRING,NULL);
4577
4578 di = dictGetIterator(c->db->dict);
4579 addReply(c,lenobj);
4580 decrRefCount(lenobj);
4581 while((de = dictNext(di)) != NULL) {
4582 robj *keyobj = dictGetEntryKey(de);
4583
4584 sds key = keyobj->ptr;
4585 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4586 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4587 if (expireIfNeeded(c->db,keyobj) == 0) {
4588 addReplyBulk(c,keyobj);
4589 numkeys++;
4590 }
4591 }
4592 }
4593 dictReleaseIterator(di);
4594 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4595 }
4596
4597 static void dbsizeCommand(redisClient *c) {
4598 addReplySds(c,
4599 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4600 }
4601
4602 static void lastsaveCommand(redisClient *c) {
4603 addReplySds(c,
4604 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4605 }
4606
4607 static void typeCommand(redisClient *c) {
4608 robj *o;
4609 char *type;
4610
4611 o = lookupKeyRead(c->db,c->argv[1]);
4612 if (o == NULL) {
4613 type = "+none";
4614 } else {
4615 switch(o->type) {
4616 case REDIS_STRING: type = "+string"; break;
4617 case REDIS_LIST: type = "+list"; break;
4618 case REDIS_SET: type = "+set"; break;
4619 case REDIS_ZSET: type = "+zset"; break;
4620 case REDIS_HASH: type = "+hash"; break;
4621 default: type = "+unknown"; break;
4622 }
4623 }
4624 addReplySds(c,sdsnew(type));
4625 addReply(c,shared.crlf);
4626 }
4627
4628 static void saveCommand(redisClient *c) {
4629 if (server.bgsavechildpid != -1) {
4630 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4631 return;
4632 }
4633 if (rdbSave(server.dbfilename) == REDIS_OK) {
4634 addReply(c,shared.ok);
4635 } else {
4636 addReply(c,shared.err);
4637 }
4638 }
4639
4640 static void bgsaveCommand(redisClient *c) {
4641 if (server.bgsavechildpid != -1) {
4642 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4643 return;
4644 }
4645 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4646 char *status = "+Background saving started\r\n";
4647 addReplySds(c,sdsnew(status));
4648 } else {
4649 addReply(c,shared.err);
4650 }
4651 }
4652
4653 static void shutdownCommand(redisClient *c) {
4654 if (prepareForShutdown() == REDIS_OK)
4655 exit(0);
4656 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
4657 }
4658
4659 static void renameGenericCommand(redisClient *c, int nx) {
4660 robj *o;
4661
4662 /* To use the same key as src and dst is probably an error */
4663 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4664 addReply(c,shared.sameobjecterr);
4665 return;
4666 }
4667
4668 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4669 return;
4670
4671 incrRefCount(o);
4672 deleteIfVolatile(c->db,c->argv[2]);
4673 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4674 if (nx) {
4675 decrRefCount(o);
4676 addReply(c,shared.czero);
4677 return;
4678 }
4679 dictReplace(c->db->dict,c->argv[2],o);
4680 } else {
4681 incrRefCount(c->argv[2]);
4682 }
4683 deleteKey(c->db,c->argv[1]);
4684 server.dirty++;
4685 addReply(c,nx ? shared.cone : shared.ok);
4686 }
4687
4688 static void renameCommand(redisClient *c) {
4689 renameGenericCommand(c,0);
4690 }
4691
4692 static void renamenxCommand(redisClient *c) {
4693 renameGenericCommand(c,1);
4694 }
4695
4696 static void moveCommand(redisClient *c) {
4697 robj *o;
4698 redisDb *src, *dst;
4699 int srcid;
4700
4701 /* Obtain source and target DB pointers */
4702 src = c->db;
4703 srcid = c->db->id;
4704 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4705 addReply(c,shared.outofrangeerr);
4706 return;
4707 }
4708 dst = c->db;
4709 selectDb(c,srcid); /* Back to the source DB */
4710
4711 /* If the user is moving using as target the same
4712 * DB as the source DB it is probably an error. */
4713 if (src == dst) {
4714 addReply(c,shared.sameobjecterr);
4715 return;
4716 }
4717
4718 /* Check if the element exists and get a reference */
4719 o = lookupKeyWrite(c->db,c->argv[1]);
4720 if (!o) {
4721 addReply(c,shared.czero);
4722 return;
4723 }
4724
4725 /* Try to add the element to the target DB */
4726 deleteIfVolatile(dst,c->argv[1]);
4727 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4728 addReply(c,shared.czero);
4729 return;
4730 }
4731 incrRefCount(c->argv[1]);
4732 incrRefCount(o);
4733
4734 /* OK! key moved, free the entry in the source DB */
4735 deleteKey(src,c->argv[1]);
4736 server.dirty++;
4737 addReply(c,shared.cone);
4738 }
4739
4740 /* =================================== Lists ================================ */
4741 static void pushGenericCommand(redisClient *c, int where) {
4742 robj *lobj;
4743 list *list;
4744
4745 lobj = lookupKeyWrite(c->db,c->argv[1]);
4746 if (lobj == NULL) {
4747 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4748 addReply(c,shared.cone);
4749 return;
4750 }
4751 lobj = createListObject();
4752 list = lobj->ptr;
4753 if (where == REDIS_HEAD) {
4754 listAddNodeHead(list,c->argv[2]);
4755 } else {
4756 listAddNodeTail(list,c->argv[2]);
4757 }
4758 dictAdd(c->db->dict,c->argv[1],lobj);
4759 incrRefCount(c->argv[1]);
4760 incrRefCount(c->argv[2]);
4761 } else {
4762 if (lobj->type != REDIS_LIST) {
4763 addReply(c,shared.wrongtypeerr);
4764 return;
4765 }
4766 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4767 addReply(c,shared.cone);
4768 return;
4769 }
4770 list = lobj->ptr;
4771 if (where == REDIS_HEAD) {
4772 listAddNodeHead(list,c->argv[2]);
4773 } else {
4774 listAddNodeTail(list,c->argv[2]);
4775 }
4776 incrRefCount(c->argv[2]);
4777 }
4778 server.dirty++;
4779 addReplyLongLong(c,listLength(list));
4780 }
4781
4782 static void lpushCommand(redisClient *c) {
4783 pushGenericCommand(c,REDIS_HEAD);
4784 }
4785
4786 static void rpushCommand(redisClient *c) {
4787 pushGenericCommand(c,REDIS_TAIL);
4788 }
4789
4790 static void llenCommand(redisClient *c) {
4791 robj *o;
4792 list *l;
4793
4794 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4795 checkType(c,o,REDIS_LIST)) return;
4796
4797 l = o->ptr;
4798 addReplyUlong(c,listLength(l));
4799 }
4800
4801 static void lindexCommand(redisClient *c) {
4802 robj *o;
4803 int index = atoi(c->argv[2]->ptr);
4804 list *list;
4805 listNode *ln;
4806
4807 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4808 checkType(c,o,REDIS_LIST)) return;
4809 list = o->ptr;
4810
4811 ln = listIndex(list, index);
4812 if (ln == NULL) {
4813 addReply(c,shared.nullbulk);
4814 } else {
4815 robj *ele = listNodeValue(ln);
4816 addReplyBulk(c,ele);
4817 }
4818 }
4819
4820 static void lsetCommand(redisClient *c) {
4821 robj *o;
4822 int index = atoi(c->argv[2]->ptr);
4823 list *list;
4824 listNode *ln;
4825
4826 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4827 checkType(c,o,REDIS_LIST)) return;
4828 list = o->ptr;
4829
4830 ln = listIndex(list, index);
4831 if (ln == NULL) {
4832 addReply(c,shared.outofrangeerr);
4833 } else {
4834 robj *ele = listNodeValue(ln);
4835
4836 decrRefCount(ele);
4837 listNodeValue(ln) = c->argv[3];
4838 incrRefCount(c->argv[3]);
4839 addReply(c,shared.ok);
4840 server.dirty++;
4841 }
4842 }
4843
4844 static void popGenericCommand(redisClient *c, int where) {
4845 robj *o;
4846 list *list;
4847 listNode *ln;
4848
4849 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4850 checkType(c,o,REDIS_LIST)) return;
4851 list = o->ptr;
4852
4853 if (where == REDIS_HEAD)
4854 ln = listFirst(list);
4855 else
4856 ln = listLast(list);
4857
4858 if (ln == NULL) {
4859 addReply(c,shared.nullbulk);
4860 } else {
4861 robj *ele = listNodeValue(ln);
4862 addReplyBulk(c,ele);
4863 listDelNode(list,ln);
4864 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4865 server.dirty++;
4866 }
4867 }
4868
4869 static void lpopCommand(redisClient *c) {
4870 popGenericCommand(c,REDIS_HEAD);
4871 }
4872
4873 static void rpopCommand(redisClient *c) {
4874 popGenericCommand(c,REDIS_TAIL);
4875 }
4876
4877 static void lrangeCommand(redisClient *c) {
4878 robj *o;
4879 int start = atoi(c->argv[2]->ptr);
4880 int end = atoi(c->argv[3]->ptr);
4881 int llen;
4882 int rangelen, j;
4883 list *list;
4884 listNode *ln;
4885 robj *ele;
4886
4887 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4888 || checkType(c,o,REDIS_LIST)) return;
4889 list = o->ptr;
4890 llen = listLength(list);
4891
4892 /* convert negative indexes */
4893 if (start < 0) start = llen+start;
4894 if (end < 0) end = llen+end;
4895 if (start < 0) start = 0;
4896 if (end < 0) end = 0;
4897
4898 /* indexes sanity checks */
4899 if (start > end || start >= llen) {
4900 /* Out of range start or start > end result in empty list */
4901 addReply(c,shared.emptymultibulk);
4902 return;
4903 }
4904 if (end >= llen) end = llen-1;
4905 rangelen = (end-start)+1;
4906
4907 /* Return the result in form of a multi-bulk reply */
4908 ln = listIndex(list, start);
4909 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4910 for (j = 0; j < rangelen; j++) {
4911 ele = listNodeValue(ln);
4912 addReplyBulk(c,ele);
4913 ln = ln->next;
4914 }
4915 }
4916
4917 static void ltrimCommand(redisClient *c) {
4918 robj *o;
4919 int start = atoi(c->argv[2]->ptr);
4920 int end = atoi(c->argv[3]->ptr);
4921 int llen;
4922 int j, ltrim, rtrim;
4923 list *list;
4924 listNode *ln;
4925
4926 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4927 checkType(c,o,REDIS_LIST)) return;
4928 list = o->ptr;
4929 llen = listLength(list);
4930
4931 /* convert negative indexes */
4932 if (start < 0) start = llen+start;
4933 if (end < 0) end = llen+end;
4934 if (start < 0) start = 0;
4935 if (end < 0) end = 0;
4936
4937 /* indexes sanity checks */
4938 if (start > end || start >= llen) {
4939 /* Out of range start or start > end result in empty list */
4940 ltrim = llen;
4941 rtrim = 0;
4942 } else {
4943 if (end >= llen) end = llen-1;
4944 ltrim = start;
4945 rtrim = llen-end-1;
4946 }
4947
4948 /* Remove list elements to perform the trim */
4949 for (j = 0; j < ltrim; j++) {
4950 ln = listFirst(list);
4951 listDelNode(list,ln);
4952 }
4953 for (j = 0; j < rtrim; j++) {
4954 ln = listLast(list);
4955 listDelNode(list,ln);
4956 }
4957 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4958 server.dirty++;
4959 addReply(c,shared.ok);
4960 }
4961
4962 static void lremCommand(redisClient *c) {
4963 robj *o;
4964 list *list;
4965 listNode *ln, *next;
4966 int toremove = atoi(c->argv[2]->ptr);
4967 int removed = 0;
4968 int fromtail = 0;
4969
4970 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4971 checkType(c,o,REDIS_LIST)) return;
4972 list = o->ptr;
4973
4974 if (toremove < 0) {
4975 toremove = -toremove;
4976 fromtail = 1;
4977 }
4978 ln = fromtail ? list->tail : list->head;
4979 while (ln) {
4980 robj *ele = listNodeValue(ln);
4981
4982 next = fromtail ? ln->prev : ln->next;
4983 if (equalStringObjects(ele,c->argv[3])) {
4984 listDelNode(list,ln);
4985 server.dirty++;
4986 removed++;
4987 if (toremove && removed == toremove) break;
4988 }
4989 ln = next;
4990 }
4991 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4992 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4993 }
4994
4995 /* This is the semantic of this command:
4996 * RPOPLPUSH srclist dstlist:
4997 * IF LLEN(srclist) > 0
4998 * element = RPOP srclist
4999 * LPUSH dstlist element
5000 * RETURN element
5001 * ELSE
5002 * RETURN nil
5003 * END
5004 * END
5005 *
5006 * The idea is to be able to get an element from a list in a reliable way
5007 * since the element is not just returned but pushed against another list
5008 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5009 */
5010 static void rpoplpushcommand(redisClient *c) {
5011 robj *sobj;
5012 list *srclist;
5013 listNode *ln;
5014
5015 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5016 checkType(c,sobj,REDIS_LIST)) return;
5017 srclist = sobj->ptr;
5018 ln = listLast(srclist);
5019
5020 if (ln == NULL) {
5021 addReply(c,shared.nullbulk);
5022 } else {
5023 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5024 robj *ele = listNodeValue(ln);
5025 list *dstlist;
5026
5027 if (dobj && dobj->type != REDIS_LIST) {
5028 addReply(c,shared.wrongtypeerr);
5029 return;
5030 }
5031
5032 /* Add the element to the target list (unless it's directly
5033 * passed to some BLPOP-ing client */
5034 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5035 if (dobj == NULL) {
5036 /* Create the list if the key does not exist */
5037 dobj = createListObject();
5038 dictAdd(c->db->dict,c->argv[2],dobj);
5039 incrRefCount(c->argv[2]);
5040 }
5041 dstlist = dobj->ptr;
5042 listAddNodeHead(dstlist,ele);
5043 incrRefCount(ele);
5044 }
5045
5046 /* Send the element to the client as reply as well */
5047 addReplyBulk(c,ele);
5048
5049 /* Finally remove the element from the source list */
5050 listDelNode(srclist,ln);
5051 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
5052 server.dirty++;
5053 }
5054 }
5055
5056 /* ==================================== Sets ================================ */
5057
5058 static void saddCommand(redisClient *c) {
5059 robj *set;
5060
5061 set = lookupKeyWrite(c->db,c->argv[1]);
5062 if (set == NULL) {
5063 set = createSetObject();
5064 dictAdd(c->db->dict,c->argv[1],set);
5065 incrRefCount(c->argv[1]);
5066 } else {
5067 if (set->type != REDIS_SET) {
5068 addReply(c,shared.wrongtypeerr);
5069 return;
5070 }
5071 }
5072 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5073 incrRefCount(c->argv[2]);
5074 server.dirty++;
5075 addReply(c,shared.cone);
5076 } else {
5077 addReply(c,shared.czero);
5078 }
5079 }
5080
5081 static void sremCommand(redisClient *c) {
5082 robj *set;
5083
5084 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5085 checkType(c,set,REDIS_SET)) return;
5086
5087 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5088 server.dirty++;
5089 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5090 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5091 addReply(c,shared.cone);
5092 } else {
5093 addReply(c,shared.czero);
5094 }
5095 }
5096
5097 static void smoveCommand(redisClient *c) {
5098 robj *srcset, *dstset;
5099
5100 srcset = lookupKeyWrite(c->db,c->argv[1]);
5101 dstset = lookupKeyWrite(c->db,c->argv[2]);
5102
5103 /* If the source key does not exist return 0, if it's of the wrong type
5104 * raise an error */
5105 if (srcset == NULL || srcset->type != REDIS_SET) {
5106 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5107 return;
5108 }
5109 /* Error if the destination key is not a set as well */
5110 if (dstset && dstset->type != REDIS_SET) {
5111 addReply(c,shared.wrongtypeerr);
5112 return;
5113 }
5114 /* Remove the element from the source set */
5115 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5116 /* Key not found in the src set! return zero */
5117 addReply(c,shared.czero);
5118 return;
5119 }
5120 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5121 deleteKey(c->db,c->argv[1]);
5122 server.dirty++;
5123 /* Add the element to the destination set */
5124 if (!dstset) {
5125 dstset = createSetObject();
5126 dictAdd(c->db->dict,c->argv[2],dstset);
5127 incrRefCount(c->argv[2]);
5128 }
5129 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5130 incrRefCount(c->argv[3]);
5131 addReply(c,shared.cone);
5132 }
5133
5134 static void sismemberCommand(redisClient *c) {
5135 robj *set;
5136
5137 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5138 checkType(c,set,REDIS_SET)) return;
5139
5140 if (dictFind(set->ptr,c->argv[2]))
5141 addReply(c,shared.cone);
5142 else
5143 addReply(c,shared.czero);
5144 }
5145
5146 static void scardCommand(redisClient *c) {
5147 robj *o;
5148 dict *s;
5149
5150 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5151 checkType(c,o,REDIS_SET)) return;
5152
5153 s = o->ptr;
5154 addReplyUlong(c,dictSize(s));
5155 }
5156
5157 static void spopCommand(redisClient *c) {
5158 robj *set;
5159 dictEntry *de;
5160
5161 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5162 checkType(c,set,REDIS_SET)) return;
5163
5164 de = dictGetRandomKey(set->ptr);
5165 if (de == NULL) {
5166 addReply(c,shared.nullbulk);
5167 } else {
5168 robj *ele = dictGetEntryKey(de);
5169
5170 addReplyBulk(c,ele);
5171 dictDelete(set->ptr,ele);
5172 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5173 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5174 server.dirty++;
5175 }
5176 }
5177
5178 static void srandmemberCommand(redisClient *c) {
5179 robj *set;
5180 dictEntry *de;
5181
5182 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5183 checkType(c,set,REDIS_SET)) return;
5184
5185 de = dictGetRandomKey(set->ptr);
5186 if (de == NULL) {
5187 addReply(c,shared.nullbulk);
5188 } else {
5189 robj *ele = dictGetEntryKey(de);
5190
5191 addReplyBulk(c,ele);
5192 }
5193 }
5194
5195 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5196 dict **d1 = (void*) s1, **d2 = (void*) s2;
5197
5198 return dictSize(*d1)-dictSize(*d2);
5199 }
5200
5201 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5202 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5203 dictIterator *di;
5204 dictEntry *de;
5205 robj *lenobj = NULL, *dstset = NULL;
5206 unsigned long j, cardinality = 0;
5207
5208 for (j = 0; j < setsnum; j++) {
5209 robj *setobj;
5210
5211 setobj = dstkey ?
5212 lookupKeyWrite(c->db,setskeys[j]) :
5213 lookupKeyRead(c->db,setskeys[j]);
5214 if (!setobj) {
5215 zfree(dv);
5216 if (dstkey) {
5217 if (deleteKey(c->db,dstkey))
5218 server.dirty++;
5219 addReply(c,shared.czero);
5220 } else {
5221 addReply(c,shared.emptymultibulk);
5222 }
5223 return;
5224 }
5225 if (setobj->type != REDIS_SET) {
5226 zfree(dv);
5227 addReply(c,shared.wrongtypeerr);
5228 return;
5229 }
5230 dv[j] = setobj->ptr;
5231 }
5232 /* Sort sets from the smallest to largest, this will improve our
5233 * algorithm's performace */
5234 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5235
5236 /* The first thing we should output is the total number of elements...
5237 * since this is a multi-bulk write, but at this stage we don't know
5238 * the intersection set size, so we use a trick, append an empty object
5239 * to the output list and save the pointer to later modify it with the
5240 * right length */
5241 if (!dstkey) {
5242 lenobj = createObject(REDIS_STRING,NULL);
5243 addReply(c,lenobj);
5244 decrRefCount(lenobj);
5245 } else {
5246 /* If we have a target key where to store the resulting set
5247 * create this key with an empty set inside */
5248 dstset = createSetObject();
5249 }
5250
5251 /* Iterate all the elements of the first (smallest) set, and test
5252 * the element against all the other sets, if at least one set does
5253 * not include the element it is discarded */
5254 di = dictGetIterator(dv[0]);
5255
5256 while((de = dictNext(di)) != NULL) {
5257 robj *ele;
5258
5259 for (j = 1; j < setsnum; j++)
5260 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5261 if (j != setsnum)
5262 continue; /* at least one set does not contain the member */
5263 ele = dictGetEntryKey(de);
5264 if (!dstkey) {
5265 addReplyBulk(c,ele);
5266 cardinality++;
5267 } else {
5268 dictAdd(dstset->ptr,ele,NULL);
5269 incrRefCount(ele);
5270 }
5271 }
5272 dictReleaseIterator(di);
5273
5274 if (dstkey) {
5275 /* Store the resulting set into the target, if the intersection
5276 * is not an empty set. */
5277 deleteKey(c->db,dstkey);
5278 if (dictSize((dict*)dstset->ptr) > 0) {
5279 dictAdd(c->db->dict,dstkey,dstset);
5280 incrRefCount(dstkey);
5281 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5282 } else {
5283 decrRefCount(dstset);
5284 addReply(c,shared.czero);
5285 }
5286 server.dirty++;
5287 } else {
5288 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5289 }
5290 zfree(dv);
5291 }
5292
5293 static void sinterCommand(redisClient *c) {
5294 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5295 }
5296
5297 static void sinterstoreCommand(redisClient *c) {
5298 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5299 }
5300
/* Set operation selectors. UNION and DIFF are the values passed as the
 * 'op' argument of sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5304
5305 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5306 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5307 dictIterator *di;
5308 dictEntry *de;
5309 robj *dstset = NULL;
5310 int j, cardinality = 0;
5311
5312 for (j = 0; j < setsnum; j++) {
5313 robj *setobj;
5314
5315 setobj = dstkey ?
5316 lookupKeyWrite(c->db,setskeys[j]) :
5317 lookupKeyRead(c->db,setskeys[j]);
5318 if (!setobj) {
5319 dv[j] = NULL;
5320 continue;
5321 }
5322 if (setobj->type != REDIS_SET) {
5323 zfree(dv);
5324 addReply(c,shared.wrongtypeerr);
5325 return;
5326 }
5327 dv[j] = setobj->ptr;
5328 }
5329
5330 /* We need a temp set object to store our union. If the dstkey
5331 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5332 * this set object will be the resulting object to set into the target key*/
5333 dstset = createSetObject();
5334
5335 /* Iterate all the elements of all the sets, add every element a single
5336 * time to the result set */
5337 for (j = 0; j < setsnum; j++) {
5338 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5339 if (!dv[j]) continue; /* non existing keys are like empty sets */
5340
5341 di = dictGetIterator(dv[j]);
5342
5343 while((de = dictNext(di)) != NULL) {
5344 robj *ele;
5345
5346 /* dictAdd will not add the same element multiple times */
5347 ele = dictGetEntryKey(de);
5348 if (op == REDIS_OP_UNION || j == 0) {
5349 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5350 incrRefCount(ele);
5351 cardinality++;
5352 }
5353 } else if (op == REDIS_OP_DIFF) {
5354 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5355 cardinality--;
5356 }
5357 }
5358 }
5359 dictReleaseIterator(di);
5360
5361 /* result set is empty? Exit asap. */
5362 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5363 }
5364
5365 /* Output the content of the resulting set, if not in STORE mode */
5366 if (!dstkey) {
5367 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5368 di = dictGetIterator(dstset->ptr);
5369 while((de = dictNext(di)) != NULL) {
5370 robj *ele;
5371
5372 ele = dictGetEntryKey(de);
5373 addReplyBulk(c,ele);
5374 }
5375 dictReleaseIterator(di);
5376 decrRefCount(dstset);
5377 } else {
5378 /* If we have a target key where to store the resulting set
5379 * create this key with the result set inside */
5380 deleteKey(c->db,dstkey);
5381 if (dictSize((dict*)dstset->ptr) > 0) {
5382 dictAdd(c->db->dict,dstkey,dstset);
5383 incrRefCount(dstkey);
5384 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
5385 } else {
5386 decrRefCount(dstset);
5387 addReply(c,shared.czero);
5388 }
5389 server.dirty++;
5390 }
5391 zfree(dv);
5392 }
5393
5394 static void sunionCommand(redisClient *c) {
5395 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5396 }
5397
5398 static void sunionstoreCommand(redisClient *c) {
5399 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5400 }
5401
5402 static void sdiffCommand(redisClient *c) {
5403 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5404 }
5405
5406 static void sdiffstoreCommand(redisClient *c) {
5407 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5408 }
5409
5410 /* ==================================== ZSets =============================== */
5411
5412 /* ZSETs are ordered sets using two data structures to hold the same elements
5413 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5414 * data structure.
5415 *
5416 * The elements are added to a hash table mapping Redis objects to scores.
5417 * At the same time the elements are added to a skip list mapping scores
5418 * to Redis objects (so objects are sorted by scores in this "view"). */
5419
5420 /* This skiplist implementation is almost a C translation of the original
5421 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5422 * Alternative to Balanced Trees", modified in three ways:
5423 * a) this implementation allows for repeated values.
5424 * b) the comparison is not just by key (our 'score') but by satellite data.
5425 * c) there is a back pointer, so it's a doubly linked list with the back
5426 * pointers being only at "level 1". This allows to traverse the list
5427 * from tail to head, useful for ZREVRANGE. */
5428
5429 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5430 zskiplistNode *zn = zmalloc(sizeof(*zn));
5431
5432 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5433 if (level > 1)
5434 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5435 else
5436 zn->span = NULL;
5437 zn->score = score;
5438 zn->obj = obj;
5439 return zn;
5440 }
5441
5442 static zskiplist *zslCreate(void) {
5443 int j;
5444 zskiplist *zsl;
5445
5446 zsl = zmalloc(sizeof(*zsl));
5447 zsl->level = 1;
5448 zsl->length = 0;
5449 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5450 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5451 zsl->header->forward[j] = NULL;
5452
5453 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5454 if (j < ZSKIPLIST_MAXLEVEL-1)
5455 zsl->header->span[j] = 0;
5456 }
5457 zsl->header->backward = NULL;
5458 zsl->tail = NULL;
5459 return zsl;
5460 }
5461
5462 static void zslFreeNode(zskiplistNode *node) {
5463 decrRefCount(node->obj);
5464 zfree(node->forward);
5465 zfree(node->span);
5466 zfree(node);
5467 }
5468
5469 static void zslFree(zskiplist *zsl) {
5470 zskiplistNode *node = zsl->header->forward[0], *next;
5471
5472 zfree(zsl->header->forward);
5473 zfree(zsl->header->span);
5474 zfree(zsl->header);
5475 while(node) {
5476 next = node->forward[0];
5477 zslFreeNode(node);
5478 node = next;
5479 }
5480 zfree(zsl);
5481 }
5482
5483 static int zslRandomLevel(void) {
5484 int level = 1;
5485 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5486 level += 1;
5487 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5488 }
5489
5490 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5491 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5492 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5493 int i, level;
5494
5495 x = zsl->header;
5496 for (i = zsl->level-1; i >= 0; i--) {
5497 /* store rank that is crossed to reach the insert position */
5498 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5499
5500 while (x->forward[i] &&
5501 (x->forward[i]->score < score ||
5502 (x->forward[i]->score == score &&
5503 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5504 rank[i] += i > 0 ? x->span[i-1] : 1;
5505 x = x->forward[i];
5506 }
5507 update[i] = x;
5508 }
5509 /* we assume the key is not already inside, since we allow duplicated
5510 * scores, and the re-insertion of score and redis object should never
5511 * happpen since the caller of zslInsert() should test in the hash table
5512 * if the element is already inside or not. */
5513 level = zslRandomLevel();
5514 if (level > zsl->level) {
5515 for (i = zsl->level; i < level; i++) {
5516 rank[i] = 0;
5517 update[i] = zsl->header;
5518 update[i]->span[i-1] = zsl->length;
5519 }
5520 zsl->level = level;
5521 }
5522 x = zslCreateNode(level,score,obj);
5523 for (i = 0; i < level; i++) {
5524 x->forward[i] = update[i]->forward[i];
5525 update[i]->forward[i] = x;
5526
5527 /* update span covered by update[i] as x is inserted here */
5528 if (i > 0) {
5529 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5530 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5531 }
5532 }
5533
5534 /* increment span for untouched levels */
5535 for (i = level; i < zsl->level; i++) {
5536 update[i]->span[i-1]++;
5537 }
5538
5539 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5540 if (x->forward[0])
5541 x->forward[0]->backward = x;
5542 else
5543 zsl->tail = x;
5544 zsl->length++;
5545 }
5546
5547 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5548 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5549 int i;
5550 for (i = 0; i < zsl->level; i++) {
5551 if (update[i]->forward[i] == x) {
5552 if (i > 0) {
5553 update[i]->span[i-1] += x->span[i-1] - 1;
5554 }
5555 update[i]->forward[i] = x->forward[i];
5556 } else {
5557 /* invariant: i > 0, because update[0]->forward[0]
5558 * is always equal to x */
5559 update[i]->span[i-1] -= 1;
5560 }
5561 }
5562 if (x->forward[0]) {
5563 x->forward[0]->backward = x->backward;
5564 } else {
5565 zsl->tail = x->backward;
5566 }
5567 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5568 zsl->level--;
5569 zsl->length--;
5570 }
5571
5572 /* Delete an element with matching score/object from the skiplist. */
5573 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5574 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5575 int i;
5576
5577 x = zsl->header;
5578 for (i = zsl->level-1; i >= 0; i--) {
5579 while (x->forward[i] &&
5580 (x->forward[i]->score < score ||
5581 (x->forward[i]->score == score &&
5582 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5583 x = x->forward[i];
5584 update[i] = x;
5585 }
5586 /* We may have multiple elements with the same score, what we need
5587 * is to find the element with both the right score and object. */
5588 x = x->forward[0];
5589 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5590 zslDeleteNode(zsl, x, update);
5591 zslFreeNode(x);
5592 return 1;
5593 } else {
5594 return 0; /* not found */
5595 }
5596 return 0; /* not found */
5597 }
5598
5599 /* Delete all the elements with score between min and max from the skiplist.
5600 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5601 * Note that this function takes the reference to the hash table view of the
5602 * sorted set, in order to remove the elements from the hash table too. */
5603 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5604 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5605 unsigned long removed = 0;
5606 int i;
5607
5608 x = zsl->header;
5609 for (i = zsl->level-1; i >= 0; i--) {
5610 while (x->forward[i] && x->forward[i]->score < min)
5611 x = x->forward[i];
5612 update[i] = x;
5613 }
5614 /* We may have multiple elements with the same score, what we need
5615 * is to find the element with both the right score and object. */
5616 x = x->forward[0];
5617 while (x && x->score <= max) {
5618 zskiplistNode *next = x->forward[0];
5619 zslDeleteNode(zsl, x, update);
5620 dictDelete(dict,x->obj);
5621 zslFreeNode(x);
5622 removed++;
5623 x = next;
5624 }
5625 return removed; /* not found */
5626 }
5627
5628 /* Delete all the elements with rank between start and end from the skiplist.
5629 * Start and end are inclusive. Note that start and end need to be 1-based */
5630 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5631 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5632 unsigned long traversed = 0, removed = 0;
5633 int i;
5634
5635 x = zsl->header;
5636 for (i = zsl->level-1; i >= 0; i--) {
5637 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5638 traversed += i > 0 ? x->span[i-1] : 1;
5639 x = x->forward[i];
5640 }
5641 update[i] = x;
5642 }
5643
5644 traversed++;
5645 x = x->forward[0];
5646 while (x && traversed <= end) {
5647 zskiplistNode *next = x->forward[0];
5648 zslDeleteNode(zsl, x, update);
5649 dictDelete(dict,x->obj);
5650 zslFreeNode(x);
5651 removed++;
5652 traversed++;
5653 x = next;
5654 }
5655 return removed;
5656 }
5657
5658 /* Find the first node having a score equal or greater than the specified one.
5659 * Returns NULL if there is no match. */
5660 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5661 zskiplistNode *x;
5662 int i;
5663
5664 x = zsl->header;
5665 for (i = zsl->level-1; i >= 0; i--) {
5666 while (x->forward[i] && x->forward[i]->score < score)
5667 x = x->forward[i];
5668 }
5669 /* We may have multiple elements with the same score, what we need
5670 * is to find the element with both the right score and object. */
5671 return x->forward[0];
5672 }
5673
5674 /* Find the rank for an element by both score and key.
5675 * Returns 0 when the element cannot be found, rank otherwise.
5676 * Note that the rank is 1-based due to the span of zsl->header to the
5677 * first element. */
5678 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5679 zskiplistNode *x;
5680 unsigned long rank = 0;
5681 int i;
5682
5683 x = zsl->header;
5684 for (i = zsl->level-1; i >= 0; i--) {
5685 while (x->forward[i] &&
5686 (x->forward[i]->score < score ||
5687 (x->forward[i]->score == score &&
5688 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5689 rank += i > 0 ? x->span[i-1] : 1;
5690 x = x->forward[i];
5691 }
5692
5693 /* x might be equal to zsl->header, so test if obj is non-NULL */
5694 if (x->obj && equalStringObjects(x->obj,o)) {
5695 return rank;
5696 }
5697 }
5698 return 0;
5699 }
5700
5701 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5702 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5703 zskiplistNode *x;
5704 unsigned long traversed = 0;
5705 int i;
5706
5707 x = zsl->header;
5708 for (i = zsl->level-1; i >= 0; i--) {
5709 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5710 {
5711 traversed += i > 0 ? x->span[i-1] : 1;
5712 x = x->forward[i];
5713 }
5714 if (traversed == rank) {
5715 return x;
5716 }
5717 }
5718 return NULL;
5719 }
5720
5721 /* The actual Z-commands implementations */
5722
5723 /* This generic command implements both ZADD and ZINCRBY.
5724 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5725 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5726 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5727 robj *zsetobj;
5728 zset *zs;
5729 double *score;
5730
5731 zsetobj = lookupKeyWrite(c->db,key);
5732 if (zsetobj == NULL) {
5733 zsetobj = createZsetObject();
5734 dictAdd(c->db->dict,key,zsetobj);
5735 incrRefCount(key);
5736 } else {
5737 if (zsetobj->type != REDIS_ZSET) {
5738 addReply(c,shared.wrongtypeerr);
5739 return;
5740 }
5741 }
5742 zs = zsetobj->ptr;
5743
5744 /* Ok now since we implement both ZADD and ZINCRBY here the code
5745 * needs to handle the two different conditions. It's all about setting
5746 * '*score', that is, the new score to set, to the right value. */
5747 score = zmalloc(sizeof(double));
5748 if (doincrement) {
5749 dictEntry *de;
5750
5751 /* Read the old score. If the element was not present starts from 0 */
5752 de = dictFind(zs->dict,ele);
5753 if (de) {
5754 double *oldscore = dictGetEntryVal(de);
5755 *score = *oldscore + scoreval;
5756 } else {
5757 *score = scoreval;
5758 }
5759 } else {
5760 *score = scoreval;
5761 }
5762
5763 /* What follows is a simple remove and re-insert operation that is common
5764 * to both ZADD and ZINCRBY... */
5765 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5766 /* case 1: New element */
5767 incrRefCount(ele); /* added to hash */
5768 zslInsert(zs->zsl,*score,ele);
5769 incrRefCount(ele); /* added to skiplist */
5770 server.dirty++;
5771 if (doincrement)
5772 addReplyDouble(c,*score);
5773 else
5774 addReply(c,shared.cone);
5775 } else {
5776 dictEntry *de;
5777 double *oldscore;
5778
5779 /* case 2: Score update operation */
5780 de = dictFind(zs->dict,ele);
5781 redisAssert(de != NULL);
5782 oldscore = dictGetEntryVal(de);
5783 if (*score != *oldscore) {
5784 int deleted;
5785
5786 /* Remove and insert the element in the skip list with new score */
5787 deleted = zslDelete(zs->zsl,*oldscore,ele);
5788 redisAssert(deleted != 0);
5789 zslInsert(zs->zsl,*score,ele);
5790 incrRefCount(ele);
5791 /* Update the score in the hash table */
5792 dictReplace(zs->dict,ele,score);
5793 server.dirty++;
5794 } else {
5795 zfree(score);
5796 }
5797 if (doincrement)
5798 addReplyDouble(c,*score);
5799 else
5800 addReply(c,shared.czero);
5801 }
5802 }
5803
5804 static void zaddCommand(redisClient *c) {
5805 double scoreval;
5806
5807 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5808 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5809 }
5810
5811 static void zincrbyCommand(redisClient *c) {
5812 double scoreval;
5813
5814 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5815 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5816 }
5817
5818 static void zremCommand(redisClient *c) {
5819 robj *zsetobj;
5820 zset *zs;
5821 dictEntry *de;
5822 double *oldscore;
5823 int deleted;
5824
5825 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5826 checkType(c,zsetobj,REDIS_ZSET)) return;
5827
5828 zs = zsetobj->ptr;
5829 de = dictFind(zs->dict,c->argv[2]);
5830 if (de == NULL) {
5831 addReply(c,shared.czero);
5832 return;
5833 }
5834 /* Delete from the skiplist */
5835 oldscore = dictGetEntryVal(de);
5836 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5837 redisAssert(deleted != 0);
5838
5839 /* Delete from the hash table */
5840 dictDelete(zs->dict,c->argv[2]);
5841 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5842 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5843 server.dirty++;
5844 addReply(c,shared.cone);
5845 }
5846
5847 static void zremrangebyscoreCommand(redisClient *c) {
5848 double min;
5849 double max;
5850 long deleted;
5851 robj *zsetobj;
5852 zset *zs;
5853
5854 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5855 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5856
5857 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5858 checkType(c,zsetobj,REDIS_ZSET)) return;
5859
5860 zs = zsetobj->ptr;
5861 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5862 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5863 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5864 server.dirty += deleted;
5865 addReplyLongLong(c,deleted);
5866 }
5867
5868 static void zremrangebyrankCommand(redisClient *c) {
5869 long start;
5870 long end;
5871 int llen;
5872 long deleted;
5873 robj *zsetobj;
5874 zset *zs;
5875
5876 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5877 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5878
5879 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5880 checkType(c,zsetobj,REDIS_ZSET)) return;
5881 zs = zsetobj->ptr;
5882 llen = zs->zsl->length;
5883
5884 /* convert negative indexes */
5885 if (start < 0) start = llen+start;
5886 if (end < 0) end = llen+end;
5887 if (start < 0) start = 0;
5888 if (end < 0) end = 0;
5889
5890 /* indexes sanity checks */
5891 if (start > end || start >= llen) {
5892 addReply(c,shared.czero);
5893 return;
5894 }
5895 if (end >= llen) end = llen-1;
5896
5897 /* increment start and end because zsl*Rank functions
5898 * use 1-based rank */
5899 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5900 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5901 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5902 server.dirty += deleted;
5903 addReplyLongLong(c, deleted);
5904 }
5905
5906 typedef struct {
5907 dict *dict;
5908 double weight;
5909 } zsetopsrc;
5910
5911 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5912 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5913 unsigned long size1, size2;
5914 size1 = d1->dict ? dictSize(d1->dict) : 0;
5915 size2 = d2->dict ? dictSize(d2->dict) : 0;
5916 return size1 - size2;
5917 }
5918
/* How the scores of the same element found in different input sets are
 * combined by ZUNIONSTORE/ZINTERSTORE. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to the aggregation mode: add it,
 * or keep the smaller / the greater of the two values. An unknown mode
 * triggers a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5935
5936 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5937 int i, j, zsetnum;
5938 int aggregate = REDIS_AGGR_SUM;
5939 zsetopsrc *src;
5940 robj *dstobj;
5941 zset *dstzset;
5942 dictIterator *di;
5943 dictEntry *de;
5944
5945 /* expect zsetnum input keys to be given */
5946 zsetnum = atoi(c->argv[2]->ptr);
5947 if (zsetnum < 1) {
5948 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
5949 return;
5950 }
5951
5952 /* test if the expected number of keys would overflow */
5953 if (3+zsetnum > c->argc) {
5954 addReply(c,shared.syntaxerr);
5955 return;
5956 }
5957
5958 /* read keys to be used for input */
5959 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5960 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5961 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5962 if (!zsetobj) {
5963 src[i].dict = NULL;
5964 } else {
5965 if (zsetobj->type != REDIS_ZSET) {
5966 zfree(src);
5967 addReply(c,shared.wrongtypeerr);
5968 return;
5969 }
5970 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5971 }
5972
5973 /* default all weights to 1 */
5974 src[i].weight = 1.0;
5975 }
5976
5977 /* parse optional extra arguments */
5978 if (j < c->argc) {
5979 int remaining = c->argc - j;
5980
5981 while (remaining) {
5982 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5983 j++; remaining--;
5984 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5985 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5986 return;
5987 }
5988 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5989 j++; remaining--;
5990 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5991 aggregate = REDIS_AGGR_SUM;
5992 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5993 aggregate = REDIS_AGGR_MIN;
5994 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5995 aggregate = REDIS_AGGR_MAX;
5996 } else {
5997 zfree(src);
5998 addReply(c,shared.syntaxerr);
5999 return;
6000 }
6001 j++; remaining--;
6002 } else {
6003 zfree(src);
6004 addReply(c,shared.syntaxerr);
6005 return;
6006 }
6007 }
6008 }
6009
6010 /* sort sets from the smallest to largest, this will improve our
6011 * algorithm's performance */
6012 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
6013
6014 dstobj = createZsetObject();
6015 dstzset = dstobj->ptr;
6016
6017 if (op == REDIS_OP_INTER) {
6018 /* skip going over all entries if the smallest zset is NULL or empty */
6019 if (src[0].dict && dictSize(src[0].dict) > 0) {
6020 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6021 * from small to large, all src[i > 0].dict are non-empty too */
6022 di = dictGetIterator(src[0].dict);
6023 while((de = dictNext(di)) != NULL) {
6024 double *score = zmalloc(sizeof(double)), value;
6025 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
6026
6027 for (j = 1; j < zsetnum; j++) {
6028 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6029 if (other) {
6030 value = src[j].weight * (*(double*)dictGetEntryVal(other));
6031 zunionInterAggregate(score, value, aggregate);
6032 } else {
6033 break;
6034 }
6035 }
6036
6037 /* skip entry when not present in every source dict */
6038 if (j != zsetnum) {
6039 zfree(score);
6040 } else {
6041 robj *o = dictGetEntryKey(de);
6042 dictAdd(dstzset->dict,o,score);
6043 incrRefCount(o); /* added to dictionary */
6044 zslInsert(dstzset->zsl,*score,o);
6045 incrRefCount(o); /* added to skiplist */
6046 }
6047 }
6048 dictReleaseIterator(di);
6049 }
6050 } else if (op == REDIS_OP_UNION) {
6051 for (i = 0; i < zsetnum; i++) {
6052 if (!src[i].dict) continue;
6053
6054 di = dictGetIterator(src[i].dict);
6055 while((de = dictNext(di)) != NULL) {
6056 /* skip key when already processed */
6057 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6058
6059 double *score = zmalloc(sizeof(double)), value;
6060 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
6061
6062 /* because the zsets are sorted by size, its only possible
6063 * for sets at larger indices to hold this entry */
6064 for (j = (i+1); j < zsetnum; j++) {
6065 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
6066 if (other) {
6067 value = src[j].weight * (*(double*)dictGetEntryVal(other));
6068 zunionInterAggregate(score, value, aggregate);
6069 }
6070 }
6071
6072 robj *o = dictGetEntryKey(de);
6073 dictAdd(dstzset->dict,o,score);
6074 incrRefCount(o); /* added to dictionary */
6075 zslInsert(dstzset->zsl,*score,o);
6076 incrRefCount(o); /* added to skiplist */
6077 }
6078 dictReleaseIterator(di);
6079 }
6080 } else {
6081 /* unknown operator */
6082 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
6083 }
6084
6085 deleteKey(c->db,dstkey);
6086 if (dstzset->zsl->length) {
6087 dictAdd(c->db->dict,dstkey,dstobj);
6088 incrRefCount(dstkey);
6089 addReplyLongLong(c, dstzset->zsl->length);
6090 server.dirty++;
6091 } else {
6092 decrRefCount(dstobj);
6093 addReply(c, shared.czero);
6094 }
6095 zfree(src);
6096 }
6097
6098 static void zunionstoreCommand(redisClient *c) {
6099 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
6100 }
6101
6102 static void zinterstoreCommand(redisClient *c) {
6103 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
6104 }
6105
6106 static void zrangeGenericCommand(redisClient *c, int reverse) {
6107 robj *o;
6108 long start;
6109 long end;
6110 int withscores = 0;
6111 int llen;
6112 int rangelen, j;
6113 zset *zsetobj;
6114 zskiplist *zsl;
6115 zskiplistNode *ln;
6116 robj *ele;
6117
6118 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6119 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6120
6121 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6122 withscores = 1;
6123 } else if (c->argc >= 5) {
6124 addReply(c,shared.syntaxerr);
6125 return;
6126 }
6127
6128 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6129 || checkType(c,o,REDIS_ZSET)) return;
6130 zsetobj = o->ptr;
6131 zsl = zsetobj->zsl;
6132 llen = zsl->length;
6133
6134 /* convert negative indexes */
6135 if (start < 0) start = llen+start;
6136 if (end < 0) end = llen+end;
6137 if (start < 0) start = 0;
6138 if (end < 0) end = 0;
6139
6140 /* indexes sanity checks */
6141 if (start > end || start >= llen) {
6142 /* Out of range start or start > end result in empty list */
6143 addReply(c,shared.emptymultibulk);
6144 return;
6145 }
6146 if (end >= llen) end = llen-1;
6147 rangelen = (end-start)+1;
6148
6149 /* check if starting point is trivial, before searching
6150 * the element in log(N) time */
6151 if (reverse) {
6152 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6153 } else {
6154 ln = start == 0 ?
6155 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6156 }
6157
6158 /* Return the result in form of a multi-bulk reply */
6159 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6160 withscores ? (rangelen*2) : rangelen));
6161 for (j = 0; j < rangelen; j++) {
6162 ele = ln->obj;
6163 addReplyBulk(c,ele);
6164 if (withscores)
6165 addReplyDouble(c,ln->score);
6166 ln = reverse ? ln->backward : ln->forward[0];
6167 }
6168 }
6169
6170 static void zrangeCommand(redisClient *c) {
6171 zrangeGenericCommand(c,0);
6172 }
6173
6174 static void zrevrangeCommand(redisClient *c) {
6175 zrangeGenericCommand(c,1);
6176 }
6177
6178 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6179 * If justcount is non-zero, just the count is returned. */
6180 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6181 robj *o;
6182 double min, max;
6183 int minex = 0, maxex = 0; /* are min or max exclusive? */
6184 int offset = 0, limit = -1;
6185 int withscores = 0;
6186 int badsyntax = 0;
6187
6188 /* Parse the min-max interval. If one of the values is prefixed
6189 * by the "(" character, it's considered "open". For instance
6190 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6191 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6192 if (((char*)c->argv[2]->ptr)[0] == '(') {
6193 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6194 minex = 1;
6195 } else {
6196 min = strtod(c->argv[2]->ptr,NULL);
6197 }
6198 if (((char*)c->argv[3]->ptr)[0] == '(') {
6199 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6200 maxex = 1;
6201 } else {
6202 max = strtod(c->argv[3]->ptr,NULL);
6203 }
6204
6205 /* Parse "WITHSCORES": note that if the command was called with
6206 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6207 * enter the following paths to parse WITHSCORES and LIMIT. */
6208 if (c->argc == 5 || c->argc == 8) {
6209 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6210 withscores = 1;
6211 else
6212 badsyntax = 1;
6213 }
6214 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6215 badsyntax = 1;
6216 if (badsyntax) {
6217 addReplySds(c,
6218 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6219 return;
6220 }
6221
6222 /* Parse "LIMIT" */
6223 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6224 addReply(c,shared.syntaxerr);
6225 return;
6226 } else if (c->argc == (7 + withscores)) {
6227 offset = atoi(c->argv[5]->ptr);
6228 limit = atoi(c->argv[6]->ptr);
6229 if (offset < 0) offset = 0;
6230 }
6231
6232 /* Ok, lookup the key and get the range */
6233 o = lookupKeyRead(c->db,c->argv[1]);
6234 if (o == NULL) {
6235 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6236 } else {
6237 if (o->type != REDIS_ZSET) {
6238 addReply(c,shared.wrongtypeerr);
6239 } else {
6240 zset *zsetobj = o->ptr;
6241 zskiplist *zsl = zsetobj->zsl;
6242 zskiplistNode *ln;
6243 robj *ele, *lenobj = NULL;
6244 unsigned long rangelen = 0;
6245
6246 /* Get the first node with the score >= min, or with
6247 * score > min if 'minex' is true. */
6248 ln = zslFirstWithScore(zsl,min);
6249 while (minex && ln && ln->score == min) ln = ln->forward[0];
6250
6251 if (ln == NULL) {
6252 /* No element matching the speciifed interval */
6253 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6254 return;
6255 }
6256
6257 /* We don't know in advance how many matching elements there
6258 * are in the list, so we push this object that will represent
6259 * the multi-bulk length in the output buffer, and will "fix"
6260 * it later */
6261 if (!justcount) {
6262 lenobj = createObject(REDIS_STRING,NULL);
6263 addReply(c,lenobj);
6264 decrRefCount(lenobj);
6265 }
6266
6267 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6268 if (offset) {
6269 offset--;
6270 ln = ln->forward[0];
6271 continue;
6272 }
6273 if (limit == 0) break;
6274 if (!justcount) {
6275 ele = ln->obj;
6276 addReplyBulk(c,ele);
6277 if (withscores)
6278 addReplyDouble(c,ln->score);
6279 }
6280 ln = ln->forward[0];
6281 rangelen++;
6282 if (limit > 0) limit--;
6283 }
6284 if (justcount) {
6285 addReplyLongLong(c,(long)rangelen);
6286 } else {
6287 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6288 withscores ? (rangelen*2) : rangelen);
6289 }
6290 }
6291 }
6292 }
6293
6294 static void zrangebyscoreCommand(redisClient *c) {
6295 genericZrangebyscoreCommand(c,0);
6296 }
6297
6298 static void zcountCommand(redisClient *c) {
6299 genericZrangebyscoreCommand(c,1);
6300 }
6301
6302 static void zcardCommand(redisClient *c) {
6303 robj *o;
6304 zset *zs;
6305
6306 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6307 checkType(c,o,REDIS_ZSET)) return;
6308
6309 zs = o->ptr;
6310 addReplyUlong(c,zs->zsl->length);
6311 }
6312
6313 static void zscoreCommand(redisClient *c) {
6314 robj *o;
6315 zset *zs;
6316 dictEntry *de;
6317
6318 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6319 checkType(c,o,REDIS_ZSET)) return;
6320
6321 zs = o->ptr;
6322 de = dictFind(zs->dict,c->argv[2]);
6323 if (!de) {
6324 addReply(c,shared.nullbulk);
6325 } else {
6326 double *score = dictGetEntryVal(de);
6327
6328 addReplyDouble(c,*score);
6329 }
6330 }
6331
6332 static void zrankGenericCommand(redisClient *c, int reverse) {
6333 robj *o;
6334 zset *zs;
6335 zskiplist *zsl;
6336 dictEntry *de;
6337 unsigned long rank;
6338 double *score;
6339
6340 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6341 checkType(c,o,REDIS_ZSET)) return;
6342
6343 zs = o->ptr;
6344 zsl = zs->zsl;
6345 de = dictFind(zs->dict,c->argv[2]);
6346 if (!de) {
6347 addReply(c,shared.nullbulk);
6348 return;
6349 }
6350
6351 score = dictGetEntryVal(de);
6352 rank = zslGetRank(zsl, *score, c->argv[2]);
6353 if (rank) {
6354 if (reverse) {
6355 addReplyLongLong(c, zsl->length - rank);
6356 } else {
6357 addReplyLongLong(c, rank-1);
6358 }
6359 } else {
6360 addReply(c,shared.nullbulk);
6361 }
6362 }
6363
6364 static void zrankCommand(redisClient *c) {
6365 zrankGenericCommand(c, 0);
6366 }
6367
6368 static void zrevrankCommand(redisClient *c) {
6369 zrankGenericCommand(c, 1);
6370 }
6371
6372 /* ========================= Hashes utility functions ======================= */
/* Selectors telling which part of a hash entry is wanted. They are tested
 * as bit flags (e.g. 'what & REDIS_HASH_KEY'). */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
6375
6376 /* Check the length of a number of objects to see if we need to convert a
6377 * zipmap to a real hash. Note that we only check string encoded objects
6378 * as their string length can be queried in constant time. */
6379 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6380 int i;
6381 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6382
6383 for (i = start; i <= end; i++) {
6384 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6385 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6386 {
6387 convertToRealHash(subject);
6388 return;
6389 }
6390 }
6391 }
6392
6393 /* Encode given objects in-place when the hash uses a dict. */
6394 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6395 if (subject->encoding == REDIS_ENCODING_HT) {
6396 if (o1) *o1 = tryObjectEncoding(*o1);
6397 if (o2) *o2 = tryObjectEncoding(*o2);
6398 }
6399 }
6400
6401 /* Get the value from a hash identified by key. Returns either a string
6402 * object or NULL if the value cannot be found. The refcount of the object
6403 * is always increased by 1 when the value was found. */
6404 static robj *hashGet(robj *o, robj *key) {
6405 robj *value = NULL;
6406 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6407 unsigned char *v;
6408 unsigned int vlen;
6409 key = getDecodedObject(key);
6410 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6411 value = createStringObject((char*)v,vlen);
6412 }
6413 decrRefCount(key);
6414 } else {
6415 dictEntry *de = dictFind(o->ptr,key);
6416 if (de != NULL) {
6417 value = dictGetEntryVal(de);
6418 incrRefCount(value);
6419 }
6420 }
6421 return value;
6422 }
6423
6424 /* Test if the key exists in the given hash. Returns 1 if the key
6425 * exists and 0 when it doesn't. */
6426 static int hashExists(robj *o, robj *key) {
6427 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6428 key = getDecodedObject(key);
6429 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6430 decrRefCount(key);
6431 return 1;
6432 }
6433 decrRefCount(key);
6434 } else {
6435 if (dictFind(o->ptr,key) != NULL) {
6436 return 1;
6437 }
6438 }
6439 return 0;
6440 }
6441
6442 /* Add an element, discard the old if the key already exists.
6443 * Return 0 on insert and 1 on update. */
6444 static int hashSet(robj *o, robj *key, robj *value) {
6445 int update = 0;
6446 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6447 key = getDecodedObject(key);
6448 value = getDecodedObject(value);
6449 o->ptr = zipmapSet(o->ptr,
6450 key->ptr,sdslen(key->ptr),
6451 value->ptr,sdslen(value->ptr), &update);
6452 decrRefCount(key);
6453 decrRefCount(value);
6454
6455 /* Check if the zipmap needs to be upgraded to a real hash table */
6456 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6457 convertToRealHash(o);
6458 } else {
6459 if (dictReplace(o->ptr,key,value)) {
6460 /* Insert */
6461 incrRefCount(key);
6462 } else {
6463 /* Update */
6464 update = 1;
6465 }
6466 incrRefCount(value);
6467 }
6468 return update;
6469 }
6470
6471 /* Delete an element from a hash.
6472 * Return 1 on deleted and 0 on not found. */
6473 static int hashDelete(robj *o, robj *key) {
6474 int deleted = 0;
6475 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6476 key = getDecodedObject(key);
6477 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6478 decrRefCount(key);
6479 } else {
6480 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6481 /* Always check if the dictionary needs a resize after a delete. */
6482 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6483 }
6484 return deleted;
6485 }
6486
6487 /* Return the number of elements in a hash. */
6488 static unsigned long hashLength(robj *o) {
6489 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6490 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6491 }
6492
6493 /* Structure to hold hash iteration abstration. Note that iteration over
6494 * hashes involves both fields and values. Because it is possible that
6495 * not both are required, store pointers in the iterator to avoid
6496 * unnecessary memory allocation for fields/values. */
6497 typedef struct {
6498 int encoding;
6499 unsigned char *zi;
6500 unsigned char *zk, *zv;
6501 unsigned int zklen, zvlen;
6502
6503 dictIterator *di;
6504 dictEntry *de;
6505 } hashIterator;
6506
6507 static hashIterator *hashInitIterator(robj *subject) {
6508 hashIterator *hi = zmalloc(sizeof(hashIterator));
6509 hi->encoding = subject->encoding;
6510 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6511 hi->zi = zipmapRewind(subject->ptr);
6512 } else if (hi->encoding == REDIS_ENCODING_HT) {
6513 hi->di = dictGetIterator(subject->ptr);
6514 } else {
6515 redisAssert(NULL);
6516 }
6517 return hi;
6518 }
6519
6520 static void hashReleaseIterator(hashIterator *hi) {
6521 if (hi->encoding == REDIS_ENCODING_HT) {
6522 dictReleaseIterator(hi->di);
6523 }
6524 zfree(hi);
6525 }
6526
6527 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6528 * could be found and REDIS_ERR when the iterator reaches the end. */
6529 static int hashNext(hashIterator *hi) {
6530 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6531 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6532 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6533 } else {
6534 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6535 }
6536 return REDIS_OK;
6537 }
6538
6539 /* Get key or value object at current iteration position.
6540 * This increases the refcount of the field object by 1. */
6541 static robj *hashCurrent(hashIterator *hi, int what) {
6542 robj *o;
6543 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6544 if (what & REDIS_HASH_KEY) {
6545 o = createStringObject((char*)hi->zk,hi->zklen);
6546 } else {
6547 o = createStringObject((char*)hi->zv,hi->zvlen);
6548 }
6549 } else {
6550 if (what & REDIS_HASH_KEY) {
6551 o = dictGetEntryKey(hi->de);
6552 } else {
6553 o = dictGetEntryVal(hi->de);
6554 }
6555 incrRefCount(o);
6556 }
6557 return o;
6558 }
6559
6560 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6561 robj *o = lookupKeyWrite(c->db,key);
6562 if (o == NULL) {
6563 o = createHashObject();
6564 dictAdd(c->db->dict,key,o);
6565 incrRefCount(key);
6566 } else {
6567 if (o->type != REDIS_HASH) {
6568 addReply(c,shared.wrongtypeerr);
6569 return NULL;
6570 }
6571 }
6572 return o;
6573 }
6574
6575 /* ============================= Hash commands ============================== */
6576 static void hsetCommand(redisClient *c) {
6577 int update;
6578 robj *o;
6579
6580 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6581 hashTryConversion(o,c->argv,2,3);
6582 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6583 update = hashSet(o,c->argv[2],c->argv[3]);
6584 addReply(c, update ? shared.czero : shared.cone);
6585 server.dirty++;
6586 }
6587
6588 static void hsetnxCommand(redisClient *c) {
6589 robj *o;
6590 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6591 hashTryConversion(o,c->argv,2,3);
6592
6593 if (hashExists(o, c->argv[2])) {
6594 addReply(c, shared.czero);
6595 } else {
6596 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6597 hashSet(o,c->argv[2],c->argv[3]);
6598 addReply(c, shared.cone);
6599 server.dirty++;
6600 }
6601 }
6602
6603 static void hmsetCommand(redisClient *c) {
6604 int i;
6605 robj *o;
6606
6607 if ((c->argc % 2) == 1) {
6608 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6609 return;
6610 }
6611
6612 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6613 hashTryConversion(o,c->argv,2,c->argc-1);
6614 for (i = 2; i < c->argc; i += 2) {
6615 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6616 hashSet(o,c->argv[i],c->argv[i+1]);
6617 }
6618 addReply(c, shared.ok);
6619 server.dirty++;
6620 }
6621
6622 static void hincrbyCommand(redisClient *c) {
6623 long long value, incr;
6624 robj *o, *current, *new;
6625
6626 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6627 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6628 if ((current = hashGet(o,c->argv[2])) != NULL) {
6629 if (getLongLongFromObjectOrReply(c,current,&value,
6630 "hash value is not an integer") != REDIS_OK) {
6631 decrRefCount(current);
6632 return;
6633 }
6634 decrRefCount(current);
6635 } else {
6636 value = 0;
6637 }
6638
6639 value += incr;
6640 new = createStringObjectFromLongLong(value);
6641 hashTryObjectEncoding(o,&c->argv[2],NULL);
6642 hashSet(o,c->argv[2],new);
6643 decrRefCount(new);
6644 addReplyLongLong(c,value);
6645 server.dirty++;
6646 }
6647
6648 static void hgetCommand(redisClient *c) {
6649 robj *o, *value;
6650 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6651 checkType(c,o,REDIS_HASH)) return;
6652
6653 if ((value = hashGet(o,c->argv[2])) != NULL) {
6654 addReplyBulk(c,value);
6655 decrRefCount(value);
6656 } else {
6657 addReply(c,shared.nullbulk);
6658 }
6659 }
6660
6661 static void hmgetCommand(redisClient *c) {
6662 int i;
6663 robj *o, *value;
6664 o = lookupKeyRead(c->db,c->argv[1]);
6665 if (o != NULL && o->type != REDIS_HASH) {
6666 addReply(c,shared.wrongtypeerr);
6667 }
6668
6669 /* Note the check for o != NULL happens inside the loop. This is
6670 * done because objects that cannot be found are considered to be
6671 * an empty hash. The reply should then be a series of NULLs. */
6672 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6673 for (i = 2; i < c->argc; i++) {
6674 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6675 addReplyBulk(c,value);
6676 decrRefCount(value);
6677 } else {
6678 addReply(c,shared.nullbulk);
6679 }
6680 }
6681 }
6682
6683 static void hdelCommand(redisClient *c) {
6684 robj *o;
6685 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6686 checkType(c,o,REDIS_HASH)) return;
6687
6688 if (hashDelete(o,c->argv[2])) {
6689 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6690 addReply(c,shared.cone);
6691 server.dirty++;
6692 } else {
6693 addReply(c,shared.czero);
6694 }
6695 }
6696
6697 static void hlenCommand(redisClient *c) {
6698 robj *o;
6699 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6700 checkType(c,o,REDIS_HASH)) return;
6701
6702 addReplyUlong(c,hashLength(o));
6703 }
6704
6705 static void genericHgetallCommand(redisClient *c, int flags) {
6706 robj *o, *lenobj, *obj;
6707 unsigned long count = 0;
6708 hashIterator *hi;
6709
6710 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6711 || checkType(c,o,REDIS_HASH)) return;
6712
6713 lenobj = createObject(REDIS_STRING,NULL);
6714 addReply(c,lenobj);
6715 decrRefCount(lenobj);
6716
6717 hi = hashInitIterator(o);
6718 while (hashNext(hi) != REDIS_ERR) {
6719 if (flags & REDIS_HASH_KEY) {
6720 obj = hashCurrent(hi,REDIS_HASH_KEY);
6721 addReplyBulk(c,obj);
6722 decrRefCount(obj);
6723 count++;
6724 }
6725 if (flags & REDIS_HASH_VALUE) {
6726 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6727 addReplyBulk(c,obj);
6728 decrRefCount(obj);
6729 count++;
6730 }
6731 }
6732 hashReleaseIterator(hi);
6733
6734 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6735 }
6736
6737 static void hkeysCommand(redisClient *c) {
6738 genericHgetallCommand(c,REDIS_HASH_KEY);
6739 }
6740
6741 static void hvalsCommand(redisClient *c) {
6742 genericHgetallCommand(c,REDIS_HASH_VALUE);
6743 }
6744
6745 static void hgetallCommand(redisClient *c) {
6746 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6747 }
6748
6749 static void hexistsCommand(redisClient *c) {
6750 robj *o;
6751 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6752 checkType(c,o,REDIS_HASH)) return;
6753
6754 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6755 }
6756
6757 static void convertToRealHash(robj *o) {
6758 unsigned char *key, *val, *p, *zm = o->ptr;
6759 unsigned int klen, vlen;
6760 dict *dict = dictCreate(&hashDictType,NULL);
6761
6762 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6763 p = zipmapRewind(zm);
6764 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6765 robj *keyobj, *valobj;
6766
6767 keyobj = createStringObject((char*)key,klen);
6768 valobj = createStringObject((char*)val,vlen);
6769 keyobj = tryObjectEncoding(keyobj);
6770 valobj = tryObjectEncoding(valobj);
6771 dictAdd(dict,keyobj,valobj);
6772 }
6773 o->encoding = REDIS_ENCODING_HT;
6774 o->ptr = dict;
6775 zfree(zm);
6776 }
6777
6778 /* ========================= Non type-specific commands ==================== */
6779
6780 static void flushdbCommand(redisClient *c) {
6781 server.dirty += dictSize(c->db->dict);
6782 dictEmpty(c->db->dict);
6783 dictEmpty(c->db->expires);
6784 addReply(c,shared.ok);
6785 }
6786
6787 static void flushallCommand(redisClient *c) {
6788 server.dirty += emptyDb();
6789 addReply(c,shared.ok);
6790 if (server.bgsavechildpid != -1) {
6791 kill(server.bgsavechildpid,SIGKILL);
6792 rdbRemoveTempFile(server.bgsavechildpid);
6793 }
6794 rdbSave(server.dbfilename);
6795 server.dirty++;
6796 }
6797
6798 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6799 redisSortOperation *so = zmalloc(sizeof(*so));
6800 so->type = type;
6801 so->pattern = pattern;
6802 return so;
6803 }
6804
6805 /* Return the value associated to the key with a name obtained
6806 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6807 * The returned object will always have its refcount increased by 1
6808 * when it is non-NULL. */
6809 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6810 char *p, *f;
6811 sds spat, ssub;
6812 robj keyobj, fieldobj, *o;
6813 int prefixlen, sublen, postfixlen, fieldlen;
6814 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6815 struct {
6816 long len;
6817 long free;
6818 char buf[REDIS_SORTKEY_MAX+1];
6819 } keyname, fieldname;
6820
6821 /* If the pattern is "#" return the substitution object itself in order
6822 * to implement the "SORT ... GET #" feature. */
6823 spat = pattern->ptr;
6824 if (spat[0] == '#' && spat[1] == '\0') {
6825 incrRefCount(subst);
6826 return subst;
6827 }
6828
6829 /* The substitution object may be specially encoded. If so we create
6830 * a decoded object on the fly. Otherwise getDecodedObject will just
6831 * increment the ref count, that we'll decrement later. */
6832 subst = getDecodedObject(subst);
6833
6834 ssub = subst->ptr;
6835 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6836 p = strchr(spat,'*');
6837 if (!p) {
6838 decrRefCount(subst);
6839 return NULL;
6840 }
6841
6842 /* Find out if we're dealing with a hash dereference. */
6843 if ((f = strstr(p+1, "->")) != NULL) {
6844 fieldlen = sdslen(spat)-(f-spat);
6845 /* this also copies \0 character */
6846 memcpy(fieldname.buf,f+2,fieldlen-1);
6847 fieldname.len = fieldlen-2;
6848 } else {
6849 fieldlen = 0;
6850 }
6851
6852 prefixlen = p-spat;
6853 sublen = sdslen(ssub);
6854 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6855 memcpy(keyname.buf,spat,prefixlen);
6856 memcpy(keyname.buf+prefixlen,ssub,sublen);
6857 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6858 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6859 keyname.len = prefixlen+sublen+postfixlen;
6860 decrRefCount(subst);
6861
6862 /* Lookup substituted key */
6863 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6864 o = lookupKeyRead(db,&keyobj);
6865 if (o == NULL) return NULL;
6866
6867 if (fieldlen > 0) {
6868 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6869
6870 /* Retrieve value from hash by the field name. This operation
6871 * already increases the refcount of the returned object. */
6872 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6873 o = hashGet(o, &fieldobj);
6874 } else {
6875 if (o->type != REDIS_STRING) return NULL;
6876
6877 /* Every object that this function returns needs to have its refcount
6878 * increased. sortCommand decreases it again. */
6879 incrRefCount(o);
6880 }
6881
6882 return o;
6883 }
6884
6885 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6886 * the additional parameter is not standard but a BSD-specific we have to
6887 * pass sorting parameters via the global 'server' structure */
6888 static int sortCompare(const void *s1, const void *s2) {
6889 const redisSortObject *so1 = s1, *so2 = s2;
6890 int cmp;
6891
6892 if (!server.sort_alpha) {
6893 /* Numeric sorting. Here it's trivial as we precomputed scores */
6894 if (so1->u.score > so2->u.score) {
6895 cmp = 1;
6896 } else if (so1->u.score < so2->u.score) {
6897 cmp = -1;
6898 } else {
6899 cmp = 0;
6900 }
6901 } else {
6902 /* Alphanumeric sorting */
6903 if (server.sort_bypattern) {
6904 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6905 /* At least one compare object is NULL */
6906 if (so1->u.cmpobj == so2->u.cmpobj)
6907 cmp = 0;
6908 else if (so1->u.cmpobj == NULL)
6909 cmp = -1;
6910 else
6911 cmp = 1;
6912 } else {
6913 /* We have both the objects, use strcoll */
6914 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6915 }
6916 } else {
6917 /* Compare elements directly. */
6918 cmp = compareStringObjects(so1->obj,so2->obj);
6919 }
6920 }
6921 return server.sort_desc ? -cmp : cmp;
6922 }
6923
6924 /* The SORT command is the most complex command in Redis. Warning: this code
6925 * is optimized for speed and a bit less for readability */
/* Overview of the steps performed below:
 *
 * 1. Look up argv[1]: a missing key gives shared.emptymultibulk, a value
 *    that is not a list, set or sorted set gives shared.wrongtypeerr.
 * 2. Parse the options ASC, DESC, ALPHA, LIMIT <start> <count>,
 *    STORE <key>, BY <pattern> and GET <pattern>. Anything else, or an
 *    option missing its arguments, gives shared.syntaxerr.
 * 3. Fill 'vector' with one redisSortObject per element.
 * 4. Unless the BY pattern has no '*' (dontsort), compute what every
 *    element is compared by: a numeric score, or with ALPHA and BY a
 *    decoded compare object.
 * 5. Clamp the LIMIT range to the vector and sort.
 * 6. Emit the range start..end: as a multi bulk to the client, or with
 *    STORE as a new list object set at the destination key, in which case
 *    the number of stored elements is the reply.
 * 7. Release the reference on the sorted value, the operations list, the
 *    compare objects and the vector. */
6926 static void sortCommand(redisClient *c) {
6927 list *operations;
6928 int outputlen = 0;
6929 int desc = 0, alpha = 0;
6930 int limit_start = 0, limit_count = -1, start, end;
6931 int j, dontsort = 0, vectorlen;
6932 int getop = 0; /* GET operation counter */
6933 robj *sortval, *sortby = NULL, *storekey = NULL;
6934 redisSortObject *vector; /* Resulting vector to sort */
6935
6936 /* Lookup the key to sort. It must be of the right types */
6937 sortval = lookupKeyRead(c->db,c->argv[1]);
6938 if (sortval == NULL) {
6939 addReply(c,shared.emptymultibulk);
6940 return;
6941 }
6942 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6943 sortval->type != REDIS_ZSET)
6944 {
6945 addReply(c,shared.wrongtypeerr);
6946 return;
6947 }
6948
6949 /* Create a list of operations to perform for every sorted element.
6950 * Operations can be GET/DEL/INCR/DECR */
/* NOTE(review): only REDIS_SORT_GET operations are created by the parser
 * in this function. */
6951 operations = listCreate();
6952 listSetFreeMethod(operations,zfree);
6953 j = 2;
6954
6955 /* Now we need to protect sortval incrementing its count, in the future
6956 * SORT may have options able to overwrite/delete keys during the sorting
6957 * and the sorted key itself may get destroyed */
6958 incrRefCount(sortval);
6959
6960 /* The SORT command has an SQL-alike syntax, parse it */
6961 while(j < c->argc) {
/* Number of arguments that follow argv[j]. */
6962 int leftargs = c->argc-j-1;
6963 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6964 desc = 0;
6965 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6966 desc = 1;
6967 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6968 alpha = 1;
6969 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
/* atoi() does no validation: text that is not a number counts as 0. */
6970 limit_start = atoi(c->argv[j+1]->ptr);
6971 limit_count = atoi(c->argv[j+2]->ptr);
6972 j+=2;
6973 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6974 storekey = c->argv[j+1];
6975 j++;
6976 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6977 sortby = c->argv[j+1];
6978 /* If the BY pattern does not contain '*', i.e. it is constant,
6979 * we don't need to sort nor to lookup the weight keys. */
6980 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6981 j++;
6982 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6983 listAddNodeTail(operations,createSortOperation(
6984 REDIS_SORT_GET,c->argv[j+1]));
6985 getop++;
6986 j++;
6987 } else {
/* Unknown option, or an option without enough arguments: undo what
 * was set up so far and reply with a syntax error. */
6988 decrRefCount(sortval);
6989 listRelease(operations);
6990 addReply(c,shared.syntaxerr);
6991 return;
6992 }
6993 j++;
6994 }
6995
6996 /* Load the sorting vector with all the objects to sort */
6997 switch(sortval->type) {
6998 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6999 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7000 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
7001 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
7002 }
7003 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
7004 j = 0;
7005
7006 if (sortval->type == REDIS_LIST) {
7007 list *list = sortval->ptr;
7008 listNode *ln;
7009 listIter li;
7010
7011 listRewind(list,&li);
7012 while((ln = listNext(&li))) {
7013 robj *ele = ln->value;
7014 vector[j].obj = ele;
7015 vector[j].u.score = 0;
7016 vector[j].u.cmpobj = NULL;
7017 j++;
7018 }
7019 } else {
7020 dict *set;
7021 dictIterator *di;
7022 dictEntry *setele;
7023
/* Sets and sorted sets are both walked through a dict: for a sorted set
 * the dict member of the zset is used. */
7024 if (sortval->type == REDIS_SET) {
7025 set = sortval->ptr;
7026 } else {
7027 zset *zs = sortval->ptr;
7028 set = zs->dict;
7029 }
7030
7031 di = dictGetIterator(set);
7032 while((setele = dictNext(di)) != NULL) {
7033 vector[j].obj = dictGetEntryKey(setele);
7034 vector[j].u.score = 0;
7035 vector[j].u.cmpobj = NULL;
7036 j++;
7037 }
7038 dictReleaseIterator(di);
7039 }
7040 redisAssert(j == vectorlen);
7041
7042 /* Now it's time to load the right scores in the sorting vector */
7043 if (dontsort == 0) {
7044 for (j = 0; j < vectorlen; j++) {
7045 robj *byval;
7046 if (sortby) {
7047 /* lookup value to sort by */
7048 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* No weight found: the element keeps the zero score / NULL compare
 * object set when the vector was loaded. */
7049 if (!byval) continue;
7050 } else {
7051 /* use object itself to sort by */
7052 byval = vector[j].obj;
7053 }
7054
7055 if (alpha) {
/* Without BY the elements themselves are compared by sortCompare(),
 * so no compare object is needed. */
7056 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
7057 } else {
7058 if (byval->encoding == REDIS_ENCODING_RAW) {
7059 vector[j].u.score = strtod(byval->ptr,NULL);
7060 } else if (byval->encoding == REDIS_ENCODING_INT) {
7061 /* Don't need to decode the object if it's
7062 * integer-encoded (the only encoding supported) so
7063 * far. We can just cast it */
7064 vector[j].u.score = (long)byval->ptr;
7065 } else {
7066 redisAssert(1 != 1);
7067 }
7068 }
7069
7070 /* when the object was retrieved using lookupKeyByPattern,
7071 * its refcount needs to be decreased. */
7072 if (sortby) {
7073 decrRefCount(byval);
7074 }
7075 }
7076 }
7077
7078 /* We are ready to sort the vector... perform a bit of sanity check
7079 * on the LIMIT option too. We'll use a partial version of quicksort. */
/* After the clamping below a start past the end of the vector gives
 * end == start-1, that is an empty range. */
7080 start = (limit_start < 0) ? 0 : limit_start;
7081 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7082 if (start >= vectorlen) {
7083 start = vectorlen-1;
7084 end = vectorlen-2;
7085 }
7086 if (end >= vectorlen) end = vectorlen-1;
7087
7088 if (dontsort == 0) {
/* sortCompare() reads its parameters from these globals. */
7089 server.sort_desc = desc;
7090 server.sort_alpha = alpha;
7091 server.sort_bypattern = sortby ? 1 : 0;
/* pqsort() is given the start..end range and is used only with BY and
 * a range that does not cover the whole vector. */
7092 if (sortby && (start != 0 || end != vectorlen-1))
7093 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7094 else
7095 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
7096 }
7097
7098 /* Send command output to the output buffer, performing the specified
7099 * GET/DEL/INCR/DECR operations if any. */
/* With GET operations every element in range produces one output item
 * per GET, otherwise one item for the element itself. */
7100 outputlen = getop ? getop*(end-start+1) : end-start+1;
7101 if (storekey == NULL) {
7102 /* STORE option not specified, sent the sorting result to client */
7103 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7104 for (j = start; j <= end; j++) {
7105 listNode *ln;
7106 listIter li;
7107
7108 if (!getop) addReplyBulk(c,vector[j].obj);
7109 listRewind(operations,&li);
7110 while((ln = listNext(&li))) {
7111 redisSortOperation *sop = ln->value;
7112 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7113 vector[j].obj);
7114
7115 if (sop->type == REDIS_SORT_GET) {
7116 if (!val) {
7117 addReply(c,shared.nullbulk);
7118 } else {
7119 addReplyBulk(c,val);
7120 decrRefCount(val);
7121 }
7122 } else {
7123 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7124 }
7125 }
7126 }
7127 } else {
7128 robj *listObject = createListObject();
7129 list *listPtr = (list*) listObject->ptr;
7130
7131 /* STORE option specified, set the sorting result as a List object */
7132 for (j = start; j <= end; j++) {
7133 listNode *ln;
7134 listIter li;
7135
7136 if (!getop) {
7137 listAddNodeTail(listPtr,vector[j].obj);
7138 incrRefCount(vector[j].obj);
7139 }
7140 listRewind(operations,&li);
7141 while((ln = listNext(&li))) {
7142 redisSortOperation *sop = ln->value;
7143 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7144 vector[j].obj);
7145
7146 if (sop->type == REDIS_SORT_GET) {
7147 if (!val) {
/* A missing GET value is stored as an empty string. */
7148 listAddNodeTail(listPtr,createStringObject("",0));
7149 } else {
7150 /* We should do a incrRefCount on val because it is
7151 * added to the list, but also a decrRefCount because
7152 * it is returned by lookupKeyByPattern. This results
7153 * in doing nothing at all. */
7154 listAddNodeTail(listPtr,val);
7155 }
7156 } else {
7157 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7158 }
7159 }
7160 }
/* NOTE(review): a true return value of dictReplace() presumably means
 * that a new entry was added, so the dict now also references
 * 'storekey' as its key -- confirm in dictReplace(). */
7161 if (dictReplace(c->db->dict,storekey,listObject)) {
7162 incrRefCount(storekey);
7163 }
7164 /* Note: we add 1 because the DB is dirty anyway since even if the
7165 * SORT result is empty a new key is set and maybe the old content
7166 * replaced. */
7167 server.dirty += 1+outputlen;
7168 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7169 }
7170
7171 /* Cleanup */
7172 decrRefCount(sortval);
7173 listRelease(operations);
7174 for (j = 0; j < vectorlen; j++) {
/* Compare objects are only set in ALPHA mode (see the score loading
 * loop above); in numeric mode the same union holds the score. */
7175 if (alpha && vector[j].u.cmpobj)
7176 decrRefCount(vector[j].u.cmpobj);
7177 }
7178 zfree(vector);
7179 }
7180
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer, 'n' the number of bytes. Amounts below 1024
 * are printed as an integer followed by 'B'; larger amounts are printed
 * with two decimals followed by the unit K, M, G, T or P (powers of 1024).
 * Amounts of 1024 P or more are printed as a plain number of bytes, so
 * that 's' always holds a valid string when the function returns.
 *
 * The longest output is the 20 digits of a 64 bit 'n' plus 'B' and the
 * terminator: the caller must provide a buffer of at least 22 bytes. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kb = 1024ULL;
    const unsigned long long mb = kb*1024;
    const unsigned long long gb = mb*1024;
    const unsigned long long tb = gb*1024;
    const unsigned long long pb = tb*1024;
    double d;

    if (n < kb) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mb) {
        d = (double)n/kb;
        sprintf(s,"%.2fK",d);
    } else if (n < gb) {
        d = (double)n/mb;
        sprintf(s,"%.2fM",d);
    } else if (n < tb) {
        d = (double)n/gb;
        sprintf(s,"%.2fG",d);
    } else if (n < pb) {
        d = (double)n/tb;
        sprintf(s,"%.2fT",d);
    } else if (n < pb*1024) {
        d = (double)n/pb;
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units above: fall back to plain bytes instead
         * of leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
7201
7202 /* Create the string returned by the INFO command. This is decoupled
7203 * by the INFO command itself as we need to report the same information
7204 * on memory corruption problems. */
7205 static sds genRedisInfoString(void) {
7206 sds info;
7207 time_t uptime = time(NULL)-server.stat_starttime;
7208 int j;
7209 char hmem[64];
7210
7211 bytesToHuman(hmem,zmalloc_used_memory());
7212 info = sdscatprintf(sdsempty(),
7213 "redis_version:%s\r\n"
7214 "redis_git_sha1:%s\r\n"
7215 "redis_git_dirty:%d\r\n"
7216 "arch_bits:%s\r\n"
7217 "multiplexing_api:%s\r\n"
7218 "process_id:%ld\r\n"
7219 "uptime_in_seconds:%ld\r\n"
7220 "uptime_in_days:%ld\r\n"
7221 "connected_clients:%d\r\n"
7222 "connected_slaves:%d\r\n"
7223 "blocked_clients:%d\r\n"
7224 "used_memory:%zu\r\n"
7225 "used_memory_human:%s\r\n"
7226 "changes_since_last_save:%lld\r\n"
7227 "bgsave_in_progress:%d\r\n"
7228 "last_save_time:%ld\r\n"
7229 "bgrewriteaof_in_progress:%d\r\n"
7230 "total_connections_received:%lld\r\n"
7231 "total_commands_processed:%lld\r\n"
7232 "expired_keys:%lld\r\n"
7233 "hash_max_zipmap_entries:%zu\r\n"
7234 "hash_max_zipmap_value:%zu\r\n"
7235 "pubsub_channels:%ld\r\n"
7236 "pubsub_patterns:%u\r\n"
7237 "vm_enabled:%d\r\n"
7238 "role:%s\r\n"
7239 ,REDIS_VERSION,
7240 REDIS_GIT_SHA1,
7241 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
7242 (sizeof(long) == 8) ? "64" : "32",
7243 aeGetApiName(),
7244 (long) getpid(),
7245 uptime,
7246 uptime/(3600*24),
7247 listLength(server.clients)-listLength(server.slaves),
7248 listLength(server.slaves),
7249 server.blpop_blocked_clients,
7250 zmalloc_used_memory(),
7251 hmem,
7252 server.dirty,
7253 server.bgsavechildpid != -1,
7254 server.lastsave,
7255 server.bgrewritechildpid != -1,
7256 server.stat_numconnections,
7257 server.stat_numcommands,
7258 server.stat_expiredkeys,
7259 server.hash_max_zipmap_entries,
7260 server.hash_max_zipmap_value,
7261 dictSize(server.pubsub_channels),
7262 listLength(server.pubsub_patterns),
7263 server.vm_enabled != 0,
7264 server.masterhost == NULL ? "master" : "slave"
7265 );
7266 if (server.masterhost) {
7267 info = sdscatprintf(info,
7268 "master_host:%s\r\n"
7269 "master_port:%d\r\n"
7270 "master_link_status:%s\r\n"
7271 "master_last_io_seconds_ago:%d\r\n"
7272 ,server.masterhost,
7273 server.masterport,
7274 (server.replstate == REDIS_REPL_CONNECTED) ?
7275 "up" : "down",
7276 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7277 );
7278 }
7279 if (server.vm_enabled) {
7280 lockThreadedIO();
7281 info = sdscatprintf(info,
7282 "vm_conf_max_memory:%llu\r\n"
7283 "vm_conf_page_size:%llu\r\n"
7284 "vm_conf_pages:%llu\r\n"
7285 "vm_stats_used_pages:%llu\r\n"
7286 "vm_stats_swapped_objects:%llu\r\n"
7287 "vm_stats_swappin_count:%llu\r\n"
7288 "vm_stats_swappout_count:%llu\r\n"
7289 "vm_stats_io_newjobs_len:%lu\r\n"
7290 "vm_stats_io_processing_len:%lu\r\n"
7291 "vm_stats_io_processed_len:%lu\r\n"
7292 "vm_stats_io_active_threads:%lu\r\n"
7293 "vm_stats_blocked_clients:%lu\r\n"
7294 ,(unsigned long long) server.vm_max_memory,
7295 (unsigned long long) server.vm_page_size,
7296 (unsigned long long) server.vm_pages,
7297 (unsigned long long) server.vm_stats_used_pages,
7298 (unsigned long long) server.vm_stats_swapped_objects,
7299 (unsigned long long) server.vm_stats_swapins,
7300 (unsigned long long) server.vm_stats_swapouts,
7301 (unsigned long) listLength(server.io_newjobs),
7302 (unsigned long) listLength(server.io_processing),
7303 (unsigned long) listLength(server.io_processed),
7304 (unsigned long) server.io_active_threads,
7305 (unsigned long) server.vm_blocked_clients
7306 );
7307 unlockThreadedIO();
7308 }
7309 for (j = 0; j < server.dbnum; j++) {
7310 long long keys, vkeys;
7311
7312 keys = dictSize(server.db[j].dict);
7313 vkeys = dictSize(server.db[j].expires);
7314 if (keys || vkeys) {
7315 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7316 j, keys, vkeys);
7317 }
7318 }
7319 return info;
7320 }
7321
7322 static void infoCommand(redisClient *c) {
7323 sds info = genRedisInfoString();
7324 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7325 (unsigned long)sdslen(info)));
7326 addReplySds(c,info);
7327 addReply(c,shared.crlf);
7328 }
7329
7330 static void monitorCommand(redisClient *c) {
7331 /* ignore MONITOR if aleady slave or in monitor mode */
7332 if (c->flags & REDIS_SLAVE) return;
7333
7334 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7335 c->slaveseldb = 0;
7336 listAddNodeTail(server.monitors,c);
7337 addReply(c,shared.ok);
7338 }
7339
7340 /* ================================= Expire ================================= */
7341 static int removeExpire(redisDb *db, robj *key) {
7342 if (dictDelete(db->expires,key) == DICT_OK) {
7343 return 1;
7344 } else {
7345 return 0;
7346 }
7347 }
7348
7349 static int setExpire(redisDb *db, robj *key, time_t when) {
7350 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7351 return 0;
7352 } else {
7353 incrRefCount(key);
7354 return 1;
7355 }
7356 }
7357
7358 /* Return the expire time of the specified key, or -1 if no expire
7359 * is associated with this key (i.e. the key is non volatile) */
7360 static time_t getExpire(redisDb *db, robj *key) {
7361 dictEntry *de;
7362
7363 /* No expire? return ASAP */
7364 if (dictSize(db->expires) == 0 ||
7365 (de = dictFind(db->expires,key)) == NULL) return -1;
7366
7367 return (time_t) dictGetEntryVal(de);
7368 }
7369
7370 static int expireIfNeeded(redisDb *db, robj *key) {
7371 time_t when;
7372 dictEntry *de;
7373
7374 /* No expire? return ASAP */
7375 if (dictSize(db->expires) == 0 ||
7376 (de = dictFind(db->expires,key)) == NULL) return 0;
7377
7378 /* Lookup the expire */
7379 when = (time_t) dictGetEntryVal(de);
7380 if (time(NULL) <= when) return 0;
7381
7382 /* Delete the key */
7383 dictDelete(db->expires,key);
7384 server.stat_expiredkeys++;
7385 return dictDelete(db->dict,key) == DICT_OK;
7386 }
7387
7388 static int deleteIfVolatile(redisDb *db, robj *key) {
7389 dictEntry *de;
7390
7391 /* No expire? return ASAP */
7392 if (dictSize(db->expires) == 0 ||
7393 (de = dictFind(db->expires,key)) == NULL) return 0;
7394
7395 /* Delete the key */
7396 server.dirty++;
7397 server.stat_expiredkeys++;
7398 dictDelete(db->expires,key);
7399 return dictDelete(db->dict,key) == DICT_OK;
7400 }
7401
7402 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7403 dictEntry *de;
7404 time_t seconds;
7405
7406 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7407
7408 seconds -= offset;
7409
7410 de = dictFind(c->db->dict,key);
7411 if (de == NULL) {
7412 addReply(c,shared.czero);
7413 return;
7414 }
7415 if (seconds <= 0) {
7416 if (deleteKey(c->db,key)) server.dirty++;
7417 addReply(c, shared.cone);
7418 return;
7419 } else {
7420 time_t when = time(NULL)+seconds;
7421 if (setExpire(c->db,key,when)) {
7422 addReply(c,shared.cone);
7423 server.dirty++;
7424 } else {
7425 addReply(c,shared.czero);
7426 }
7427 return;
7428 }
7429 }
7430
7431 static void expireCommand(redisClient *c) {
7432 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7433 }
7434
7435 static void expireatCommand(redisClient *c) {
7436 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7437 }
7438
7439 static void ttlCommand(redisClient *c) {
7440 time_t expire;
7441 int ttl = -1;
7442
7443 expire = getExpire(c->db,c->argv[1]);
7444 if (expire != -1) {
7445 ttl = (int) (expire-time(NULL));
7446 if (ttl < 0) ttl = -1;
7447 }
7448 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7449 }
7450
7451 /* ================================ MULTI/EXEC ============================== */
7452
7453 /* Client state initialization for MULTI/EXEC */
7454 static void initClientMultiState(redisClient *c) {
7455 c->mstate.commands = NULL;
7456 c->mstate.count = 0;
7457 }
7458
7459 /* Release all the resources associated with MULTI/EXEC state */
7460 static void freeClientMultiState(redisClient *c) {
7461 int j;
7462
7463 for (j = 0; j < c->mstate.count; j++) {
7464 int i;
7465 multiCmd *mc = c->mstate.commands+j;
7466
7467 for (i = 0; i < mc->argc; i++)
7468 decrRefCount(mc->argv[i]);
7469 zfree(mc->argv);
7470 }
7471 zfree(c->mstate.commands);
7472 }
7473
7474 /* Add a new command into the MULTI commands queue */
7475 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7476 multiCmd *mc;
7477 int j;
7478
7479 c->mstate.commands = zrealloc(c->mstate.commands,
7480 sizeof(multiCmd)*(c->mstate.count+1));
7481 mc = c->mstate.commands+c->mstate.count;
7482 mc->cmd = cmd;
7483 mc->argc = c->argc;
7484 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7485 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7486 for (j = 0; j < c->argc; j++)
7487 incrRefCount(mc->argv[j]);
7488 c->mstate.count++;
7489 }
7490
7491 static void multiCommand(redisClient *c) {
7492 c->flags |= REDIS_MULTI;
7493 addReply(c,shared.ok);
7494 }
7495
7496 static void discardCommand(redisClient *c) {
7497 if (!(c->flags & REDIS_MULTI)) {
7498 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7499 return;
7500 }
7501
7502 freeClientMultiState(c);
7503 initClientMultiState(c);
7504 c->flags &= (~REDIS_MULTI);
7505 addReply(c,shared.ok);
7506 }
7507
7508 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7509 * implememntation for more information. */
7510 static void execCommandReplicateMulti(redisClient *c) {
7511 struct redisCommand *cmd;
7512 robj *multistring = createStringObject("MULTI",5);
7513
7514 cmd = lookupCommand("multi");
7515 if (server.appendonly)
7516 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7517 if (listLength(server.slaves))
7518 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7519 decrRefCount(multistring);
7520 }
7521
7522 static void execCommand(redisClient *c) {
7523 int j;
7524 robj **orig_argv;
7525 int orig_argc;
7526
7527 if (!(c->flags & REDIS_MULTI)) {
7528 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7529 return;
7530 }
7531
7532 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7533 * A failed EXEC will return a multi bulk nil object. */
7534 if (c->flags & REDIS_DIRTY_CAS) {
7535 freeClientMultiState(c);
7536 initClientMultiState(c);
7537 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7538 unwatchAllKeys(c);
7539 addReply(c,shared.nullmultibulk);
7540 return;
7541 }
7542
7543 /* Replicate a MULTI request now that we are sure the block is executed.
7544 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7545 * both the AOF and the replication link will have the same consistency
7546 * and atomicity guarantees. */
7547 execCommandReplicateMulti(c);
7548
7549 /* Exec all the queued commands */
7550 orig_argv = c->argv;
7551 orig_argc = c->argc;
7552 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7553 for (j = 0; j < c->mstate.count; j++) {
7554 c->argc = c->mstate.commands[j].argc;
7555 c->argv = c->mstate.commands[j].argv;
7556 call(c,c->mstate.commands[j].cmd);
7557 }
7558 c->argv = orig_argv;
7559 c->argc = orig_argc;
7560 freeClientMultiState(c);
7561 initClientMultiState(c);
7562 c->flags &= (~REDIS_MULTI);
7563 unwatchAllKeys(c);
7564 /* Make sure the EXEC command is always replicated / AOF, since we
7565 * always send the MULTI command (we can't know beforehand if the
7566 * next operations will contain at least a modification to the DB). */
7567 server.dirty++;
7568 }
7569
7570 /* =========================== Blocking Operations ========================= */
7571
7572 /* Currently Redis blocking operations support is limited to list POP ops,
7573 * so the current implementation is not fully generic, but it is also not
7574 * completely specific so it will not require a rewrite to support new
7575 * kind of blocking operations in the future.
7576 *
7577 * Still it's important to note that list blocking operations can be already
7578 * used as a notification mechanism in order to implement other blocking
7579 * operations at application level, so there must be a very strong evidence
7580 * of usefulness and generality before new blocking operations are implemented.
7581 *
7582 * This is how the current blocking POP works, we use BLPOP as example:
7583 * - If the user calls BLPOP and the key exists and contains a non empty list
7584 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7585 * if there is no need to block.
7586 * - If instead BLPOP is called and the key does not exist or the list is
7587 * empty we need to block. In order to do so we remove the notification for
7588 * new data to read in the client socket (so that we'll not serve new
7589 * requests if the blocking request is not served). Also we put the client
7590 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
7591 * blocked on these keys.
7592 * - If a PUSH operation against a key with blocked clients waiting is
7593 * performed, we serve the first in the list: basically instead of pushing
7594 * the new element inside the list we return it to the (first / oldest)
7595 * blocking client, unblock the client, and remove it from the list.
7596 *
7597 * The above comment and the source code should be enough in order to understand
7598 * the implementation and modify / fix it later.
7599 */
7600
7601 /* Set a client in blocking mode for the specified key, with the specified
7602 * timeout */
7603 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7604 dictEntry *de;
7605 list *l;
7606 int j;
7607
7608 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7609 c->blocking_keys_num = numkeys;
7610 c->blockingto = timeout;
7611 for (j = 0; j < numkeys; j++) {
7612 /* Add the key in the client structure, to map clients -> keys */
7613 c->blocking_keys[j] = keys[j];
7614 incrRefCount(keys[j]);
7615
7616 /* And in the other "side", to map keys -> clients */
7617 de = dictFind(c->db->blocking_keys,keys[j]);
7618 if (de == NULL) {
7619 int retval;
7620
7621 /* For every key we take a list of clients blocked for it */
7622 l = listCreate();
7623 retval = dictAdd(c->db->blocking_keys,keys[j],l);
7624 incrRefCount(keys[j]);
7625 assert(retval == DICT_OK);
7626 } else {
7627 l = dictGetEntryVal(de);
7628 }
7629 listAddNodeTail(l,c);
7630 }
7631 /* Mark the client as a blocked client */
7632 c->flags |= REDIS_BLOCKED;
7633 server.blpop_blocked_clients++;
7634 }
7635
7636 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7637 static void unblockClientWaitingData(redisClient *c) {
7638 dictEntry *de;
7639 list *l;
7640 int j;
7641
7642 assert(c->blocking_keys != NULL);
7643 /* The client may wait for multiple keys, so unblock it for every key. */
7644 for (j = 0; j < c->blocking_keys_num; j++) {
7645 /* Remove this client from the list of clients waiting for this key. */
7646 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
7647 assert(de != NULL);
7648 l = dictGetEntryVal(de);
7649 listDelNode(l,listSearchKey(l,c));
7650 /* If the list is empty we need to remove it to avoid wasting memory */
7651 if (listLength(l) == 0)
7652 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7653 decrRefCount(c->blocking_keys[j]);
7654 }
7655 /* Cleanup the client structure */
7656 zfree(c->blocking_keys);
7657 c->blocking_keys = NULL;
7658 c->flags &= (~REDIS_BLOCKED);
7659 server.blpop_blocked_clients--;
7660 /* We want to process data if there is some command waiting
7661 * in the input buffer. Note that this is safe even if
7662 * unblockClientWaitingData() gets called from freeClient() because
7663 * freeClient() will be smart enough to call this function
7664 * *after* c->querybuf was set to NULL. */
7665 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7666 }
7667
7668 /* This should be called from any function PUSHing into lists.
7669 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7670 * 'ele' is the element pushed.
7671 *
7672 * If the function returns 0 there was no client waiting for a list push
7673 * against this key.
7674 *
7675 * If the function returns 1 there was a client waiting for a list push
7676 * against this key, the element was passed to this client thus it's not
7677 * needed to actually add it to the list and the caller should return asap. */
7678 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7679 struct dictEntry *de;
7680 redisClient *receiver;
7681 list *l;
7682 listNode *ln;
7683
7684 de = dictFind(c->db->blocking_keys,key);
7685 if (de == NULL) return 0;
7686 l = dictGetEntryVal(de);
7687 ln = listFirst(l);
7688 assert(ln != NULL);
7689 receiver = ln->value;
7690
7691 addReplySds(receiver,sdsnew("*2\r\n"));
7692 addReplyBulk(receiver,key);
7693 addReplyBulk(receiver,ele);
7694 unblockClientWaitingData(receiver);
7695 return 1;
7696 }
7697
7698 /* Blocking RPOP/LPOP */
7699 static void blockingPopGenericCommand(redisClient *c, int where) {
7700 robj *o;
7701 time_t timeout;
7702 int j;
7703
7704 for (j = 1; j < c->argc-1; j++) {
7705 o = lookupKeyWrite(c->db,c->argv[j]);
7706 if (o != NULL) {
7707 if (o->type != REDIS_LIST) {
7708 addReply(c,shared.wrongtypeerr);
7709 return;
7710 } else {
7711 list *list = o->ptr;
7712 if (listLength(list) != 0) {
7713 /* If the list contains elements fall back to the usual
7714 * non-blocking POP operation */
7715 robj *argv[2], **orig_argv;
7716 int orig_argc;
7717
7718 /* We need to alter the command arguments before to call
7719 * popGenericCommand() as the command takes a single key. */
7720 orig_argv = c->argv;
7721 orig_argc = c->argc;
7722 argv[1] = c->argv[j];
7723 c->argv = argv;
7724 c->argc = 2;
7725
7726 /* Also the return value is different, we need to output
7727 * the multi bulk reply header and the key name. The
7728 * "real" command will add the last element (the value)
7729 * for us. If this souds like an hack to you it's just
7730 * because it is... */
7731 addReplySds(c,sdsnew("*2\r\n"));
7732 addReplyBulk(c,argv[1]);
7733 popGenericCommand(c,where);
7734
7735 /* Fix the client structure with the original stuff */
7736 c->argv = orig_argv;
7737 c->argc = orig_argc;
7738 return;
7739 }
7740 }
7741 }
7742 }
7743 /* If the list is empty or the key does not exists we must block */
7744 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7745 if (timeout > 0) timeout += time(NULL);
7746 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7747 }
7748
7749 static void blpopCommand(redisClient *c) {
7750 blockingPopGenericCommand(c,REDIS_HEAD);
7751 }
7752
7753 static void brpopCommand(redisClient *c) {
7754 blockingPopGenericCommand(c,REDIS_TAIL);
7755 }
7756
7757 /* =============================== Replication ============================= */
7758
7759 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7760 ssize_t nwritten, ret = size;
7761 time_t start = time(NULL);
7762
7763 timeout++;
7764 while(size) {
7765 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7766 nwritten = write(fd,ptr,size);
7767 if (nwritten == -1) return -1;
7768 ptr += nwritten;
7769 size -= nwritten;
7770 }
7771 if ((time(NULL)-start) > timeout) {
7772 errno = ETIMEDOUT;
7773 return -1;
7774 }
7775 }
7776 return ret;
7777 }
7778
7779 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7780 ssize_t nread, totread = 0;
7781 time_t start = time(NULL);
7782
7783 timeout++;
7784 while(size) {
7785 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7786 nread = read(fd,ptr,size);
7787 if (nread == -1) return -1;
7788 ptr += nread;
7789 size -= nread;
7790 totread += nread;
7791 }
7792 if ((time(NULL)-start) > timeout) {
7793 errno = ETIMEDOUT;
7794 return -1;
7795 }
7796 }
7797 return totread;
7798 }
7799
/* Read a line from 'fd' one byte at a time using syncRead(), storing at
 * most 'size'-1 characters into 'ptr' followed by a nul terminator.
 *
 * The line ends at the first '\n', which is not stored; a '\r' right before
 * it is replaced by the terminator. 'timeout' is passed to every single
 * syncRead() call.
 *
 * Returns the number of characters stored before the newline was found (a
 * stripped '\r' is still counted), or the number stored so far when the
 * buffer filled up before a newline arrived. Returns -1 if syncRead()
 * fails, or with errno set to EINVAL when 'size' is not positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0'; /* The buffer is always terminated, even when size is 1. */
    size--;      /* Reserve room for the terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        /* One slot less available: without this the loop never stops on
         * lines longer than the buffer and writes past its end. */
        size--;
    }
    return nread;
}
7820
7821 static void syncCommand(redisClient *c) {
7822 /* ignore SYNC if aleady slave or in monitor mode */
7823 if (c->flags & REDIS_SLAVE) return;
7824
7825 /* SYNC can't be issued when the server has pending data to send to
7826 * the client about already issued commands. We need a fresh reply
7827 * buffer registering the differences between the BGSAVE and the current
7828 * dataset, so that we can copy to other slaves if needed. */
7829 if (listLength(c->reply) != 0) {
7830 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7831 return;
7832 }
7833
7834 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7835 /* Here we need to check if there is a background saving operation
7836 * in progress, or if it is required to start one */
7837 if (server.bgsavechildpid != -1) {
7838 /* Ok a background save is in progress. Let's check if it is a good
7839 * one for replication, i.e. if there is another slave that is
7840 * registering differences since the server forked to save */
7841 redisClient *slave;
7842 listNode *ln;
7843 listIter li;
7844
7845 listRewind(server.slaves,&li);
7846 while((ln = listNext(&li))) {
7847 slave = ln->value;
7848 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7849 }
7850 if (ln) {
7851 /* Perfect, the server is already registering differences for
7852 * another slave. Set the right state, and copy the buffer. */
7853 listRelease(c->reply);
7854 c->reply = listDup(slave->reply);
7855 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7856 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7857 } else {
7858 /* No way, we need to wait for the next BGSAVE in order to
7859 * register differences */
7860 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7861 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7862 }
7863 } else {
7864 /* Ok we don't have a BGSAVE in progress, let's start one */
7865 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7866 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7867 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7868 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7869 return;
7870 }
7871 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7872 }
7873 c->repldbfd = -1;
7874 c->flags |= REDIS_SLAVE;
7875 c->slaveseldb = 0;
7876 listAddNodeTail(server.slaves,c);
7877 return;
7878 }
7879
7880 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7881 redisClient *slave = privdata;
7882 REDIS_NOTUSED(el);
7883 REDIS_NOTUSED(mask);
7884 char buf[REDIS_IOBUF_LEN];
7885 ssize_t nwritten, buflen;
7886
7887 if (slave->repldboff == 0) {
7888 /* Write the bulk write count before to transfer the DB. In theory here
7889 * we don't know how much room there is in the output buffer of the
7890 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7891 * operations) will never be smaller than the few bytes we need. */
7892 sds bulkcount;
7893
7894 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7895 slave->repldbsize);
7896 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7897 {
7898 sdsfree(bulkcount);
7899 freeClient(slave);
7900 return;
7901 }
7902 sdsfree(bulkcount);
7903 }
7904 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7905 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7906 if (buflen <= 0) {
7907 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7908 (buflen == 0) ? "premature EOF" : strerror(errno));
7909 freeClient(slave);
7910 return;
7911 }
7912 if ((nwritten = write(fd,buf,buflen)) == -1) {
7913 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7914 strerror(errno));
7915 freeClient(slave);
7916 return;
7917 }
7918 slave->repldboff += nwritten;
7919 if (slave->repldboff == slave->repldbsize) {
7920 close(slave->repldbfd);
7921 slave->repldbfd = -1;
7922 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7923 slave->replstate = REDIS_REPL_ONLINE;
7924 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7925 sendReplyToClient, slave) == AE_ERR) {
7926 freeClient(slave);
7927 return;
7928 }
7929 addReplySds(slave,sdsempty());
7930 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7931 }
7932 }
7933
7934 /* This function is called at the end of every backgrond saving.
7935 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7936 * otherwise REDIS_ERR is passed to the function.
7937 *
7938 * The goal of this function is to handle slaves waiting for a successful
7939 * background saving in order to perform non-blocking synchronization. */
7940 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7941 listNode *ln;
7942 int startbgsave = 0;
7943 listIter li;
7944
7945 listRewind(server.slaves,&li);
7946 while((ln = listNext(&li))) {
7947 redisClient *slave = ln->value;
7948
7949 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7950 startbgsave = 1;
7951 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7952 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7953 struct redis_stat buf;
7954
7955 if (bgsaveerr != REDIS_OK) {
7956 freeClient(slave);
7957 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7958 continue;
7959 }
7960 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7961 redis_fstat(slave->repldbfd,&buf) == -1) {
7962 freeClient(slave);
7963 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7964 continue;
7965 }
7966 slave->repldboff = 0;
7967 slave->repldbsize = buf.st_size;
7968 slave->replstate = REDIS_REPL_SEND_BULK;
7969 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7970 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7971 freeClient(slave);
7972 continue;
7973 }
7974 }
7975 }
7976 if (startbgsave) {
7977 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7978 listIter li;
7979
7980 listRewind(server.slaves,&li);
7981 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7982 while((ln = listNext(&li))) {
7983 redisClient *slave = ln->value;
7984
7985 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7986 freeClient(slave);
7987 }
7988 }
7989 }
7990 }
7991
7992 static int syncWithMaster(void) {
7993 char buf[1024], tmpfile[256], authcmd[1024];
7994 long dumpsize;
7995 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7996 int dfd, maxtries = 5;
7997
7998 if (fd == -1) {
7999 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8000 strerror(errno));
8001 return REDIS_ERR;
8002 }
8003
8004 /* AUTH with the master if required. */
8005 if(server.masterauth) {
8006 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8007 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8008 close(fd);
8009 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8010 strerror(errno));
8011 return REDIS_ERR;
8012 }
8013 /* Read the AUTH result. */
8014 if (syncReadLine(fd,buf,1024,3600) == -1) {
8015 close(fd);
8016 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8017 strerror(errno));
8018 return REDIS_ERR;
8019 }
8020 if (buf[0] != '+') {
8021 close(fd);
8022 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8023 return REDIS_ERR;
8024 }
8025 }
8026
8027 /* Issue the SYNC command */
8028 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8029 close(fd);
8030 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8031 strerror(errno));
8032 return REDIS_ERR;
8033 }
8034 /* Read the bulk write count */
8035 if (syncReadLine(fd,buf,1024,3600) == -1) {
8036 close(fd);
8037 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8038 strerror(errno));
8039 return REDIS_ERR;
8040 }
8041 if (buf[0] != '$') {
8042 close(fd);
8043 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8044 return REDIS_ERR;
8045 }
8046 dumpsize = strtol(buf+1,NULL,10);
8047 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
8048 /* Read the bulk write data on a temp file */
8049 while(maxtries--) {
8050 snprintf(tmpfile,256,
8051 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8052 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8053 if (dfd != -1) break;
8054 sleep(1);
8055 }
8056 if (dfd == -1) {
8057 close(fd);
8058 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8059 return REDIS_ERR;
8060 }
8061 while(dumpsize) {
8062 int nread, nwritten;
8063
8064 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8065 if (nread == -1) {
8066 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8067 strerror(errno));
8068 close(fd);
8069 close(dfd);
8070 return REDIS_ERR;
8071 }
8072 nwritten = write(dfd,buf,nread);
8073 if (nwritten == -1) {
8074 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8075 close(fd);
8076 close(dfd);
8077 return REDIS_ERR;
8078 }
8079 dumpsize -= nread;
8080 }
8081 close(dfd);
8082 if (rename(tmpfile,server.dbfilename) == -1) {
8083 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8084 unlink(tmpfile);
8085 close(fd);
8086 return REDIS_ERR;
8087 }
8088 emptyDb();
8089 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8090 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8091 close(fd);
8092 return REDIS_ERR;
8093 }
8094 server.master = createClient(fd);
8095 server.master->flags |= REDIS_MASTER;
8096 server.master->authenticated = 1;
8097 server.replstate = REDIS_REPL_CONNECTED;
8098 return REDIS_OK;
8099 }
8100
8101 static void slaveofCommand(redisClient *c) {
8102 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8103 !strcasecmp(c->argv[2]->ptr,"one")) {
8104 if (server.masterhost) {
8105 sdsfree(server.masterhost);
8106 server.masterhost = NULL;
8107 if (server.master) freeClient(server.master);
8108 server.replstate = REDIS_REPL_NONE;
8109 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8110 }
8111 } else {
8112 sdsfree(server.masterhost);
8113 server.masterhost = sdsdup(c->argv[1]->ptr);
8114 server.masterport = atoi(c->argv[2]->ptr);
8115 if (server.master) freeClient(server.master);
8116 server.replstate = REDIS_REPL_CONNECT;
8117 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8118 server.masterhost, server.masterport);
8119 }
8120 addReply(c,shared.ok);
8121 }
8122
8123 /* ============================ Maxmemory directive ======================== */
8124
8125 /* Try to free one object form the pre-allocated objects free list.
8126 * This is useful under low mem conditions as by default we take 1 million
8127 * free objects allocated. On success REDIS_OK is returned, otherwise
8128 * REDIS_ERR. */
8129 static int tryFreeOneObjectFromFreelist(void) {
8130 robj *o;
8131
8132 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8133 if (listLength(server.objfreelist)) {
8134 listNode *head = listFirst(server.objfreelist);
8135 o = listNodeValue(head);
8136 listDelNode(server.objfreelist,head);
8137 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8138 zfree(o);
8139 return REDIS_OK;
8140 } else {
8141 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8142 return REDIS_ERR;
8143 }
8144 }
8145
8146 /* This function gets called when 'maxmemory' is set on the config file to limit
8147 * the max memory used by the server, and we are out of memory.
8148 * This function will try to, in order:
8149 *
8150 * - Free objects from the free list
8151 * - Try to remove keys with an EXPIRE set
8152 *
8153 * It is not possible to free enough memory to reach used-memory < maxmemory
8154 * the server will start refusing commands that will enlarge even more the
8155 * memory usage.
8156 */
8157 static void freeMemoryIfNeeded(void) {
8158 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8159 int j, k, freed = 0;
8160
8161 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8162 for (j = 0; j < server.dbnum; j++) {
8163 int minttl = -1;
8164 robj *minkey = NULL;
8165 struct dictEntry *de;
8166
8167 if (dictSize(server.db[j].expires)) {
8168 freed = 1;
8169 /* From a sample of three keys drop the one nearest to
8170 * the natural expire */
8171 for (k = 0; k < 3; k++) {
8172 time_t t;
8173
8174 de = dictGetRandomKey(server.db[j].expires);
8175 t = (time_t) dictGetEntryVal(de);
8176 if (minttl == -1 || t < minttl) {
8177 minkey = dictGetEntryKey(de);
8178 minttl = t;
8179 }
8180 }
8181 deleteKey(server.db+j,minkey);
8182 }
8183 }
8184 if (!freed) return; /* nothing to free... */
8185 }
8186 }
8187
8188 /* ============================== Append Only file ========================== */
8189
8190 /* Write the append only file buffer on disk.
8191 *
8192 * Since we are required to write the AOF before replying to the client,
8193 * and the only way the client socket can get a write is entering when the
8194 * the event loop, we accumulate all the AOF writes in a memory
8195 * buffer and write it on disk using this function just before entering
8196 * the event loop again. */
8197 static void flushAppendOnlyFile(void) {
8198 time_t now;
8199 ssize_t nwritten;
8200
8201 if (sdslen(server.aofbuf) == 0) return;
8202
8203 /* We want to perform a single write. This should be guaranteed atomic
8204 * at least if the filesystem we are writing is a real physical one.
8205 * While this will save us against the server being killed I don't think
8206 * there is much to do about the whole server stopping for power problems
8207 * or alike */
8208 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8209 if (nwritten != (signed)sdslen(server.aofbuf)) {
8210 /* Ooops, we are in troubles. The best thing to do for now is
8211 * aborting instead of giving the illusion that everything is
8212 * working as expected. */
8213 if (nwritten == -1) {
8214 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8215 } else {
8216 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8217 }
8218 exit(1);
8219 }
8220 sdsfree(server.aofbuf);
8221 server.aofbuf = sdsempty();
8222
8223 /* Fsync if needed */
8224 now = time(NULL);
8225 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8226 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8227 now-server.lastfsync > 1))
8228 {
8229 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8230 * flushing metadata. */
8231 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8232 server.lastfsync = now;
8233 }
8234 }
8235
8236 static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8237 int j;
8238 buf = sdscatprintf(buf,"*%d\r\n",argc);
8239 for (j = 0; j < argc; j++) {
8240 robj *o = getDecodedObject(argv[j]);
8241 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8242 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8243 buf = sdscatlen(buf,"\r\n",2);
8244 decrRefCount(o);
8245 }
8246 return buf;
8247 }
8248
8249 static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8250 int argc = 3;
8251 long when;
8252 robj *argv[3];
8253
8254 /* Make sure we can use strtol */
8255 seconds = getDecodedObject(seconds);
8256 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8257 decrRefCount(seconds);
8258
8259 argv[0] = createStringObject("EXPIREAT",8);
8260 argv[1] = key;
8261 argv[2] = createObject(REDIS_STRING,
8262 sdscatprintf(sdsempty(),"%ld",when));
8263 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8264 decrRefCount(argv[0]);
8265 decrRefCount(argv[2]);
8266 return buf;
8267 }
8268
8269 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8270 sds buf = sdsempty();
8271 robj *tmpargv[3];
8272
8273 /* The DB this command was targetting is not the same as the last command
8274 * we appendend. To issue a SELECT command is needed. */
8275 if (dictid != server.appendseldb) {
8276 char seldb[64];
8277
8278 snprintf(seldb,sizeof(seldb),"%d",dictid);
8279 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8280 (unsigned long)strlen(seldb),seldb);
8281 server.appendseldb = dictid;
8282 }
8283
8284 if (cmd->proc == expireCommand) {
8285 /* Translate EXPIRE into EXPIREAT */
8286 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8287 } else if (cmd->proc == setexCommand) {
8288 /* Translate SETEX to SET and EXPIREAT */
8289 tmpargv[0] = createStringObject("SET",3);
8290 tmpargv[1] = argv[1];
8291 tmpargv[2] = argv[3];
8292 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8293 decrRefCount(tmpargv[0]);
8294 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8295 } else {
8296 buf = catAppendOnlyGenericCommand(buf,argc,argv);
8297 }
8298
8299 /* Append to the AOF buffer. This will be flushed on disk just before
8300 * of re-entering the event loop, so before the client will get a
8301 * positive reply about the operation performed. */
8302 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8303
8304 /* If a background append only file rewriting is in progress we want to
8305 * accumulate the differences between the child DB and the current one
8306 * in a buffer, so that when the child process will do its work we
8307 * can append the differences to the new append only file. */
8308 if (server.bgrewritechildpid != -1)
8309 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8310
8311 sdsfree(buf);
8312 }
8313
8314 /* In Redis commands are always executed in the context of a client, so in
8315 * order to load the append only file we need to create a fake client. */
8316 static struct redisClient *createFakeClient(void) {
8317 struct redisClient *c = zmalloc(sizeof(*c));
8318
8319 selectDb(c,0);
8320 c->fd = -1;
8321 c->querybuf = sdsempty();
8322 c->argc = 0;
8323 c->argv = NULL;
8324 c->flags = 0;
8325 /* We set the fake client as a slave waiting for the synchronization
8326 * so that Redis will not try to send replies to this client. */
8327 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8328 c->reply = listCreate();
8329 listSetFreeMethod(c->reply,decrRefCount);
8330 listSetDupMethod(c->reply,dupClientReplyValue);
8331 initClientMultiState(c);
8332 return c;
8333 }
8334
8335 static void freeFakeClient(struct redisClient *c) {
8336 sdsfree(c->querybuf);
8337 listRelease(c->reply);
8338 freeClientMultiState(c);
8339 zfree(c);
8340 }
8341
8342 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8343 * error (the append only file is zero-length) REDIS_ERR is returned. On
8344 * fatal error an error message is logged and the program exists. */
8345 int loadAppendOnlyFile(char *filename) {
8346 struct redisClient *fakeClient;
8347 FILE *fp = fopen(filename,"r");
8348 struct redis_stat sb;
8349 unsigned long long loadedkeys = 0;
8350 int appendonly = server.appendonly;
8351
8352 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8353 return REDIS_ERR;
8354
8355 if (fp == NULL) {
8356 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8357 exit(1);
8358 }
8359
8360 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8361 * to the same file we're about to read. */
8362 server.appendonly = 0;
8363
8364 fakeClient = createFakeClient();
8365 while(1) {
8366 int argc, j;
8367 unsigned long len;
8368 robj **argv;
8369 char buf[128];
8370 sds argsds;
8371 struct redisCommand *cmd;
8372
8373 if (fgets(buf,sizeof(buf),fp) == NULL) {
8374 if (feof(fp))
8375 break;
8376 else
8377 goto readerr;
8378 }
8379 if (buf[0] != '*') goto fmterr;
8380 argc = atoi(buf+1);
8381 argv = zmalloc(sizeof(robj*)*argc);
8382 for (j = 0; j < argc; j++) {
8383 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8384 if (buf[0] != '$') goto fmterr;
8385 len = strtol(buf+1,NULL,10);
8386 argsds = sdsnewlen(NULL,len);
8387 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8388 argv[j] = createObject(REDIS_STRING,argsds);
8389 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8390 }
8391
8392 /* Command lookup */
8393 cmd = lookupCommand(argv[0]->ptr);
8394 if (!cmd) {
8395 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8396 exit(1);
8397 }
8398 /* Try object encoding */
8399 if (cmd->flags & REDIS_CMD_BULK)
8400 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8401 /* Run the command in the context of a fake client */
8402 fakeClient->argc = argc;
8403 fakeClient->argv = argv;
8404 cmd->proc(fakeClient);
8405 /* Discard the reply objects list from the fake client */
8406 while(listLength(fakeClient->reply))
8407 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8408 /* Clean up, ready for the next command */
8409 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8410 zfree(argv);
8411 /* Handle swapping while loading big datasets when VM is on */
8412 loadedkeys++;
8413 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8414 while (zmalloc_used_memory() > server.vm_max_memory) {
8415 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8416 }
8417 }
8418 }
8419
8420 /* This point can only be reached when EOF is reached without errors.
8421 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8422 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8423
8424 fclose(fp);
8425 freeFakeClient(fakeClient);
8426 server.appendonly = appendonly;
8427 return REDIS_OK;
8428
8429 readerr:
8430 if (feof(fp)) {
8431 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8432 } else {
8433 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8434 }
8435 exit(1);
8436 fmterr:
8437 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8438 exit(1);
8439 }
8440
8441 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8442 static int fwriteBulkObject(FILE *fp, robj *obj) {
8443 char buf[128];
8444 int decrrc = 0;
8445
8446 /* Avoid the incr/decr ref count business if possible to help
8447 * copy-on-write (we are often in a child process when this function
8448 * is called).
8449 * Also makes sure that key objects don't get incrRefCount-ed when VM
8450 * is enabled */
8451 if (obj->encoding != REDIS_ENCODING_RAW) {
8452 obj = getDecodedObject(obj);
8453 decrrc = 1;
8454 }
8455 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8456 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8457 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8458 goto err;
8459 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8460 if (decrrc) decrRefCount(obj);
8461 return 1;
8462 err:
8463 if (decrrc) decrRefCount(obj);
8464 return 0;
8465 }
8466
/* Write the binary-safe string 's', 'len' bytes long, into 'fp' using the
 * bulk format $<count>\r\n<payload>\r\n.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so the matching conversion is %lu: with %ld a
     * length above LONG_MAX would be printed as a negative count. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* An empty payload is not written: fwrite() would return 0, which
     * would be mistaken for an error. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8478
8479 /* Write a double value in bulk format $<count>\r\n<payload>\r\n */
8480 static int fwriteBulkDouble(FILE *fp, double d) {
8481 char buf[128], dbuf[128];
8482
8483 snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
8484 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2);
8485 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
8486 if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0;
8487 return 1;
8488 }
8489
8490 /* Write a long value in bulk format $<count>\r\n<payload>\r\n */
8491 static int fwriteBulkLong(FILE *fp, long l) {
8492 char buf[128], lbuf[128];
8493
8494 snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
8495 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(lbuf)-2);
8496 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
8497 if (fwrite(lbuf,strlen(lbuf),1,fp) == 0) return 0;
8498 return 1;
8499 }
8500
8501 /* Write a sequence of commands able to fully rebuild the dataset into
8502 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8503 static int rewriteAppendOnlyFile(char *filename) {
8504 dictIterator *di = NULL;
8505 dictEntry *de;
8506 FILE *fp;
8507 char tmpfile[256];
8508 int j;
8509 time_t now = time(NULL);
8510
8511 /* Note that we have to use a different temp name here compared to the
8512 * one used by rewriteAppendOnlyFileBackground() function. */
8513 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8514 fp = fopen(tmpfile,"w");
8515 if (!fp) {
8516 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8517 return REDIS_ERR;
8518 }
8519 for (j = 0; j < server.dbnum; j++) {
8520 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8521 redisDb *db = server.db+j;
8522 dict *d = db->dict;
8523 if (dictSize(d) == 0) continue;
8524 di = dictGetIterator(d);
8525 if (!di) {
8526 fclose(fp);
8527 return REDIS_ERR;
8528 }
8529
8530 /* SELECT the new DB */
8531 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8532 if (fwriteBulkLong(fp,j) == 0) goto werr;
8533
8534 /* Iterate this DB writing every entry */
8535 while((de = dictNext(di)) != NULL) {
8536 robj *key, *o;
8537 time_t expiretime;
8538 int swapped;
8539
8540 key = dictGetEntryKey(de);
8541 /* If the value for this key is swapped, load a preview in memory.
8542 * We use a "swapped" flag to remember if we need to free the
8543 * value object instead to just increment the ref count anyway
8544 * in order to avoid copy-on-write of pages if we are forked() */
8545 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8546 key->storage == REDIS_VM_SWAPPING) {
8547 o = dictGetEntryVal(de);
8548 swapped = 0;
8549 } else {
8550 o = vmPreviewObject(key);
8551 swapped = 1;
8552 }
8553 expiretime = getExpire(db,key);
8554
8555 /* Save the key and associated value */
8556 if (o->type == REDIS_STRING) {
8557 /* Emit a SET command */
8558 char cmd[]="*3\r\n$3\r\nSET\r\n";
8559 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8560 /* Key and value */
8561 if (fwriteBulkObject(fp,key) == 0) goto werr;
8562 if (fwriteBulkObject(fp,o) == 0) goto werr;
8563 } else if (o->type == REDIS_LIST) {
8564 /* Emit the RPUSHes needed to rebuild the list */
8565 list *list = o->ptr;
8566 listNode *ln;
8567 listIter li;
8568
8569 listRewind(list,&li);
8570 while((ln = listNext(&li))) {
8571 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8572 robj *eleobj = listNodeValue(ln);
8573
8574 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8575 if (fwriteBulkObject(fp,key) == 0) goto werr;
8576 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8577 }
8578 } else if (o->type == REDIS_SET) {
8579 /* Emit the SADDs needed to rebuild the set */
8580 dict *set = o->ptr;
8581 dictIterator *di = dictGetIterator(set);
8582 dictEntry *de;
8583
8584 while((de = dictNext(di)) != NULL) {
8585 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8586 robj *eleobj = dictGetEntryKey(de);
8587
8588 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8589 if (fwriteBulkObject(fp,key) == 0) goto werr;
8590 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8591 }
8592 dictReleaseIterator(di);
8593 } else if (o->type == REDIS_ZSET) {
8594 /* Emit the ZADDs needed to rebuild the sorted set */
8595 zset *zs = o->ptr;
8596 dictIterator *di = dictGetIterator(zs->dict);
8597 dictEntry *de;
8598
8599 while((de = dictNext(di)) != NULL) {
8600 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8601 robj *eleobj = dictGetEntryKey(de);
8602 double *score = dictGetEntryVal(de);
8603
8604 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8605 if (fwriteBulkObject(fp,key) == 0) goto werr;
8606 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8607 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8608 }
8609 dictReleaseIterator(di);
8610 } else if (o->type == REDIS_HASH) {
8611 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8612
8613 /* Emit the HSETs needed to rebuild the hash */
8614 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8615 unsigned char *p = zipmapRewind(o->ptr);
8616 unsigned char *field, *val;
8617 unsigned int flen, vlen;
8618
8619 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8620 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8621 if (fwriteBulkObject(fp,key) == 0) goto werr;
8622 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8623 return -1;
8624 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8625 return -1;
8626 }
8627 } else {
8628 dictIterator *di = dictGetIterator(o->ptr);
8629 dictEntry *de;
8630
8631 while((de = dictNext(di)) != NULL) {
8632 robj *field = dictGetEntryKey(de);
8633 robj *val = dictGetEntryVal(de);
8634
8635 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8636 if (fwriteBulkObject(fp,key) == 0) goto werr;
8637 if (fwriteBulkObject(fp,field) == -1) return -1;
8638 if (fwriteBulkObject(fp,val) == -1) return -1;
8639 }
8640 dictReleaseIterator(di);
8641 }
8642 } else {
8643 redisPanic("Unknown object type");
8644 }
8645 /* Save the expire time */
8646 if (expiretime != -1) {
8647 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8648 /* If this key is already expired skip it */
8649 if (expiretime < now) continue;
8650 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8651 if (fwriteBulkObject(fp,key) == 0) goto werr;
8652 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8653 }
8654 if (swapped) decrRefCount(o);
8655 }
8656 dictReleaseIterator(di);
8657 }
8658
8659 /* Make sure data will not remain on the OS's output buffers */
8660 fflush(fp);
8661 fsync(fileno(fp));
8662 fclose(fp);
8663
8664 /* Use RENAME to make sure the DB file is changed atomically only
8665 * if the generate DB file is ok. */
8666 if (rename(tmpfile,filename) == -1) {
8667 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8668 unlink(tmpfile);
8669 return REDIS_ERR;
8670 }
8671 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8672 return REDIS_OK;
8673
8674 werr:
8675 fclose(fp);
8676 unlink(tmpfile);
8677 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8678 if (di) dictReleaseIterator(di);
8679 return REDIS_ERR;
8680 }
8681
8682 /* This is how rewriting of the append only file in background works:
8683 *
8684 * 1) The user calls BGREWRITEAOF
8685 * 2) Redis calls this function, that forks():
8686 * 2a) the child rewrite the append only file in a temp file.
8687 * 2b) the parent accumulates differences in server.bgrewritebuf.
8688 * 3) When the child finished '2a' exists.
8689 * 4) The parent will trap the exit code, if it's OK, will append the
8690 * data accumulated into server.bgrewritebuf into the temp file, and
8691 * finally will rename(2) the temp file in the actual file name.
8692 * The the new file is reopened as the new append only file. Profit!
8693 */
8694 static int rewriteAppendOnlyFileBackground(void) {
8695 pid_t childpid;
8696
8697 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8698 if (server.vm_enabled) waitEmptyIOJobsQueue();
8699 if ((childpid = fork()) == 0) {
8700 /* Child */
8701 char tmpfile[256];
8702
8703 if (server.vm_enabled) vmReopenSwapFile();
8704 close(server.fd);
8705 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8706 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8707 _exit(0);
8708 } else {
8709 _exit(1);
8710 }
8711 } else {
8712 /* Parent */
8713 if (childpid == -1) {
8714 redisLog(REDIS_WARNING,
8715 "Can't rewrite append only file in background: fork: %s",
8716 strerror(errno));
8717 return REDIS_ERR;
8718 }
8719 redisLog(REDIS_NOTICE,
8720 "Background append only file rewriting started by pid %d",childpid);
8721 server.bgrewritechildpid = childpid;
8722 updateDictResizePolicy();
8723 /* We set appendseldb to -1 in order to force the next call to the
8724 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8725 * accumulated by the parent into server.bgrewritebuf will start
8726 * with a SELECT statement and it will be safe to merge. */
8727 server.appendseldb = -1;
8728 return REDIS_OK;
8729 }
8730 return REDIS_OK; /* unreached */
8731 }
8732
8733 static void bgrewriteaofCommand(redisClient *c) {
8734 if (server.bgrewritechildpid != -1) {
8735 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8736 return;
8737 }
8738 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8739 char *status = "+Background append only file rewriting started\r\n";
8740 addReplySds(c,sdsnew(status));
8741 } else {
8742 addReply(c,shared.err);
8743 }
8744 }
8745
/* Unlink the temp file named after 'childpid' that a background AOF
 * rewrite child writes to ("temp-rewriteaof-bg-<pid>.aof"). The result of
 * unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8752
8753 /* Virtual Memory is composed mainly of two subsystems:
8754 * - Blocking Virtual Memory
8755 * - Threaded Virtual Memory I/O
8756 * The two parts are not fully decoupled, but functions are split among two
8757 * different sections of the source code (delimited by comments) in order to
8758 * make more clear what functionality is about the blocking VM and what about
8759 * the threaded (not blocking) VM.
8760 *
8761 * Redis VM design:
8762 *
8763 * Redis VM is a blocking VM (one that blocks reading swapped values from
8764 * disk into memory when a value swapped out is needed in memory) that is made
8765 * unblocking by trying to examine the command argument vector in order to
8766 * load in background values that will likely be needed in order to exec
8767 * the command. The command is executed only once all the relevant keys
8768 * are loaded into memory.
8769 *
8770 * This is basically almost as simple as a blocking VM, but almost as parallel
8771 * as a fully non-blocking VM.
8772 */
8773
8774 /* Called when the user switches from "appendonly yes" to "appendonly no"
8775 * at runtime using the CONFIG command. */
8776 static void stopAppendOnly(void) {
8777 flushAppendOnlyFile();
8778 fsync(server.appendfd);
8779 close(server.appendfd);
8780
8781 server.appendfd = -1;
8782 server.appendseldb = -1;
8783 server.appendonly = 0;
8784 /* rewrite operation in progress? kill it, wait child exit */
8785 if (server.bgsavechildpid != -1) {
8786 int statloc;
8787
8788 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8789 wait3(&statloc,0,NULL);
8790 /* reset the buffer accumulating changes while the child saves */
8791 sdsfree(server.bgrewritebuf);
8792 server.bgrewritebuf = sdsempty();
8793 server.bgsavechildpid = -1;
8794 }
8795 }
8796
8797 /* Called when the user switches from "appendonly no" to "appendonly yes"
8798 * at runtime using the CONFIG command. */
8799 static int startAppendOnly(void) {
8800 server.appendonly = 1;
8801 server.lastfsync = time(NULL);
8802 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8803 if (server.appendfd == -1) {
8804 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8805 return REDIS_ERR;
8806 }
8807 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8808 server.appendonly = 0;
8809 close(server.appendfd);
8810 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8811 return REDIS_ERR;
8812 }
8813 return REDIS_OK;
8814 }
8815
8816 /* =================== Virtual Memory - Blocking Side ====================== */
8817
8818 static void vmInit(void) {
8819 off_t totsize;
8820 int pipefds[2];
8821 size_t stacksize;
8822 struct flock fl;
8823
8824 if (server.vm_max_threads != 0)
8825 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8826
8827 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8828 /* Try to open the old swap file, otherwise create it */
8829 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8830 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8831 }
8832 if (server.vm_fp == NULL) {
8833 redisLog(REDIS_WARNING,
8834 "Can't open the swap file: %s. Exiting.",
8835 strerror(errno));
8836 exit(1);
8837 }
8838 server.vm_fd = fileno(server.vm_fp);
8839 /* Lock the swap file for writing, this is useful in order to avoid
8840 * another instance to use the same swap file for a config error. */
8841 fl.l_type = F_WRLCK;
8842 fl.l_whence = SEEK_SET;
8843 fl.l_start = fl.l_len = 0;
8844 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8845 redisLog(REDIS_WARNING,
8846 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8847 exit(1);
8848 }
8849 /* Initialize */
8850 server.vm_next_page = 0;
8851 server.vm_near_pages = 0;
8852 server.vm_stats_used_pages = 0;
8853 server.vm_stats_swapped_objects = 0;
8854 server.vm_stats_swapouts = 0;
8855 server.vm_stats_swapins = 0;
8856 totsize = server.vm_pages*server.vm_page_size;
8857 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8858 if (ftruncate(server.vm_fd,totsize) == -1) {
8859 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8860 strerror(errno));
8861 exit(1);
8862 } else {
8863 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8864 }
8865 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8866 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8867 (long long) (server.vm_pages+7)/8, server.vm_pages);
8868 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8869
8870 /* Initialize threaded I/O (used by Virtual Memory) */
8871 server.io_newjobs = listCreate();
8872 server.io_processing = listCreate();
8873 server.io_processed = listCreate();
8874 server.io_ready_clients = listCreate();
8875 pthread_mutex_init(&server.io_mutex,NULL);
8876 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8877 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8878 server.io_active_threads = 0;
8879 if (pipe(pipefds) == -1) {
8880 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8881 ,strerror(errno));
8882 exit(1);
8883 }
8884 server.io_ready_pipe_read = pipefds[0];
8885 server.io_ready_pipe_write = pipefds[1];
8886 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8887 /* LZF requires a lot of stack */
8888 pthread_attr_init(&server.io_threads_attr);
8889 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8890 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8891 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8892 /* Listen for events in the threaded I/O pipe */
8893 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8894 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8895 oom("creating file event");
8896 }
8897
8898 /* Mark the page as used */
8899 static void vmMarkPageUsed(off_t page) {
8900 off_t byte = page/8;
8901 int bit = page&7;
8902 redisAssert(vmFreePage(page) == 1);
8903 server.vm_bitmap[byte] |= 1<<bit;
8904 }
8905
8906 /* Mark N contiguous pages as used, with 'page' being the first. */
8907 static void vmMarkPagesUsed(off_t page, off_t count) {
8908 off_t j;
8909
8910 for (j = 0; j < count; j++)
8911 vmMarkPageUsed(page+j);
8912 server.vm_stats_used_pages += count;
8913 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8914 (long long)count, (long long)page);
8915 }
8916
8917 /* Mark the page as free */
8918 static void vmMarkPageFree(off_t page) {
8919 off_t byte = page/8;
8920 int bit = page&7;
8921 redisAssert(vmFreePage(page) == 0);
8922 server.vm_bitmap[byte] &= ~(1<<bit);
8923 }
8924
8925 /* Mark N contiguous pages as free, with 'page' being the first. */
8926 static void vmMarkPagesFree(off_t page, off_t count) {
8927 off_t j;
8928
8929 for (j = 0; j < count; j++)
8930 vmMarkPageFree(page+j);
8931 server.vm_stats_used_pages -= count;
8932 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8933 (long long)count, (long long)page);
8934 }
8935
8936 /* Test if the page is free */
8937 static int vmFreePage(off_t page) {
8938 off_t byte = page/8;
8939 int bit = page&7;
8940 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8941 }
8942
8943 /* Find N contiguous free pages storing the first page of the cluster in *first.
8944 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8945 * REDIS_ERR is returned.
8946 *
8947 * This function uses a simple algorithm: we try to allocate
8948 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8949 * again from the start of the swap file searching for free spaces.
8950 *
8951 * If it looks pretty clear that there are no free pages near our offset
8952 * we try to find less populated places doing a forward jump of
8953 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8954 * without hurry, and then we jump again and so forth...
8955 *
8956 * This function can be improved using a free list to avoid to guess
8957 * too much, since we could collect data about freed pages.
8958 *
8959 * note: I implemented this function just after watching an episode of
8960 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8961 */
8962 static int vmFindContiguousPages(off_t *first, off_t n) {
8963 off_t base, offset = 0, since_jump = 0, numfree = 0;
8964
8965 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8966 server.vm_near_pages = 0;
8967 server.vm_next_page = 0;
8968 }
8969 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8970 base = server.vm_next_page;
8971
8972 while(offset < server.vm_pages) {
8973 off_t this = base+offset;
8974
8975 /* If we overflow, restart from page zero */
8976 if (this >= server.vm_pages) {
8977 this -= server.vm_pages;
8978 if (this == 0) {
8979 /* Just overflowed, what we found on tail is no longer
8980 * interesting, as it's no longer contiguous. */
8981 numfree = 0;
8982 }
8983 }
8984 if (vmFreePage(this)) {
8985 /* This is a free page */
8986 numfree++;
8987 /* Already got N free pages? Return to the caller, with success */
8988 if (numfree == n) {
8989 *first = this-(n-1);
8990 server.vm_next_page = this+1;
8991 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8992 return REDIS_OK;
8993 }
8994 } else {
8995 /* The current one is not a free page */
8996 numfree = 0;
8997 }
8998
8999 /* Fast-forward if the current page is not free and we already
9000 * searched enough near this place. */
9001 since_jump++;
9002 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9003 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9004 since_jump = 0;
9005 /* Note that even if we rewind after the jump, we are don't need
9006 * to make sure numfree is set to zero as we only jump *if* it
9007 * is set to zero. */
9008 } else {
9009 /* Otherwise just check the next page */
9010 offset++;
9011 }
9012 }
9013 return REDIS_ERR;
9014 }
9015
9016 /* Write the specified object at the specified page of the swap file */
9017 static int vmWriteObjectOnSwap(robj *o, off_t page) {
9018 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9019 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9020 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9021 redisLog(REDIS_WARNING,
9022 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
9023 strerror(errno));
9024 return REDIS_ERR;
9025 }
9026 rdbSaveObject(server.vm_fp,o);
9027 fflush(server.vm_fp);
9028 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9029 return REDIS_OK;
9030 }
9031
9032 /* Swap the 'val' object relative to 'key' into disk. Store all the information
9033 * needed to later retrieve the object into the key object.
9034 * If we can't find enough contiguous empty pages to swap the object on disk
9035 * REDIS_ERR is returned. */
9036 static int vmSwapObjectBlocking(robj *key, robj *val) {
9037 off_t pages = rdbSavedObjectPages(val,NULL);
9038 off_t page;
9039
9040 assert(key->storage == REDIS_VM_MEMORY);
9041 assert(key->refcount == 1);
9042 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
9043 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
9044 key->vm.page = page;
9045 key->vm.usedpages = pages;
9046 key->storage = REDIS_VM_SWAPPED;
9047 key->vtype = val->type;
9048 decrRefCount(val); /* Deallocate the object from memory. */
9049 vmMarkPagesUsed(page,pages);
9050 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9051 (unsigned char*) key->ptr,
9052 (unsigned long long) page, (unsigned long long) pages);
9053 server.vm_stats_swapped_objects++;
9054 server.vm_stats_swapouts++;
9055 return REDIS_OK;
9056 }
9057
9058 static robj *vmReadObjectFromSwap(off_t page, int type) {
9059 robj *o;
9060
9061 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9062 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9063 redisLog(REDIS_WARNING,
9064 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
9065 strerror(errno));
9066 _exit(1);
9067 }
9068 o = rdbLoadObject(type,server.vm_fp);
9069 if (o == NULL) {
9070 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
9071 _exit(1);
9072 }
9073 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9074 return o;
9075 }
9076
9077 /* Load the value object relative to the 'key' object from swap to memory.
9078 * The newly allocated object is returned.
9079 *
9080 * If preview is true the unserialized object is returned to the caller but
9081 * no changes are made to the key object, nor the pages are marked as freed */
9082 static robj *vmGenericLoadObject(robj *key, int preview) {
9083 robj *val;
9084
9085 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
9086 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
9087 if (!preview) {
9088 key->storage = REDIS_VM_MEMORY;
9089 key->vm.atime = server.unixtime;
9090 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9091 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9092 (unsigned char*) key->ptr);
9093 server.vm_stats_swapped_objects--;
9094 } else {
9095 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9096 (unsigned char*) key->ptr);
9097 }
9098 server.vm_stats_swapins++;
9099 return val;
9100 }
9101
9102 /* Plain object loading, from swap to memory */
9103 static robj *vmLoadObject(robj *key) {
9104 /* If we are loading the object in background, stop it, we
9105 * need to load this object synchronously ASAP. */
9106 if (key->storage == REDIS_VM_LOADING)
9107 vmCancelThreadedIOJob(key);
9108 return vmGenericLoadObject(key,0);
9109 }
9110
9111 /* Just load the value on disk, without to modify the key.
9112 * This is useful when we want to perform some operation on the value
9113 * without to really bring it from swap to memory, like while saving the
9114 * dataset or rewriting the append only log. */
9115 static robj *vmPreviewObject(robj *key) {
9116 return vmGenericLoadObject(key,1);
9117 }
9118
9119 /* How a good candidate is this object for swapping?
9120 * The better candidate it is, the greater the returned value.
9121 *
9122 * Currently we try to perform a fast estimation of the object size in
9123 * memory, and combine it with aging informations.
9124 *
9125 * Basically swappability = idle-time * log(estimated size)
9126 *
9127 * Bigger objects are preferred over smaller objects, but not
9128 * proportionally, this is why we use the logarithm. This algorithm is
9129 * just a first try and will probably be tuned later. */
9130 static double computeObjectSwappability(robj *o) {
9131 time_t age = server.unixtime - o->vm.atime;
9132 long asize = 0;
9133 list *l;
9134 dict *d;
9135 struct dictEntry *de;
9136 int z;
9137
9138 if (age <= 0) return 0;
9139 switch(o->type) {
9140 case REDIS_STRING:
9141 if (o->encoding != REDIS_ENCODING_RAW) {
9142 asize = sizeof(*o);
9143 } else {
9144 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9145 }
9146 break;
9147 case REDIS_LIST:
9148 l = o->ptr;
9149 listNode *ln = listFirst(l);
9150
9151 asize = sizeof(list);
9152 if (ln) {
9153 robj *ele = ln->value;
9154 long elesize;
9155
9156 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9157 (sizeof(*o)+sdslen(ele->ptr)) :
9158 sizeof(*o);
9159 asize += (sizeof(listNode)+elesize)*listLength(l);
9160 }
9161 break;
9162 case REDIS_SET:
9163 case REDIS_ZSET:
9164 z = (o->type == REDIS_ZSET);
9165 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9166
9167 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9168 if (z) asize += sizeof(zset)-sizeof(dict);
9169 if (dictSize(d)) {
9170 long elesize;
9171 robj *ele;
9172
9173 de = dictGetRandomKey(d);
9174 ele = dictGetEntryKey(de);
9175 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9176 (sizeof(*o)+sdslen(ele->ptr)) :
9177 sizeof(*o);
9178 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9179 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9180 }
9181 break;
9182 case REDIS_HASH:
9183 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9184 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9185 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9186 unsigned int klen, vlen;
9187 unsigned char *key, *val;
9188
9189 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9190 klen = 0;
9191 vlen = 0;
9192 }
9193 asize = len*(klen+vlen+3);
9194 } else if (o->encoding == REDIS_ENCODING_HT) {
9195 d = o->ptr;
9196 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9197 if (dictSize(d)) {
9198 long elesize;
9199 robj *ele;
9200
9201 de = dictGetRandomKey(d);
9202 ele = dictGetEntryKey(de);
9203 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9204 (sizeof(*o)+sdslen(ele->ptr)) :
9205 sizeof(*o);
9206 ele = dictGetEntryVal(de);
9207 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9208 (sizeof(*o)+sdslen(ele->ptr)) :
9209 sizeof(*o);
9210 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9211 }
9212 }
9213 break;
9214 }
9215 return (double)age*log(1+asize);
9216 }
9217
9218 /* Try to swap an object that's a good candidate for swapping.
9219 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9220 * to swap any object at all.
9221 *
9222 * If 'usethreaded' is true, Redis will try to swap the object in background
9223 * using I/O threads. */
9224 static int vmSwapOneObject(int usethreads) {
9225 int j, i;
9226 struct dictEntry *best = NULL;
9227 double best_swappability = 0;
9228 redisDb *best_db = NULL;
9229 robj *key, *val;
9230
9231 for (j = 0; j < server.dbnum; j++) {
9232 redisDb *db = server.db+j;
9233 /* Why maxtries is set to 100?
9234 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9235 * are swappable objects */
9236 int maxtries = 100;
9237
9238 if (dictSize(db->dict) == 0) continue;
9239 for (i = 0; i < 5; i++) {
9240 dictEntry *de;
9241 double swappability;
9242
9243 if (maxtries) maxtries--;
9244 de = dictGetRandomKey(db->dict);
9245 key = dictGetEntryKey(de);
9246 val = dictGetEntryVal(de);
9247 /* Only swap objects that are currently in memory.
9248 *
9249 * Also don't swap shared objects if threaded VM is on, as we
9250 * try to ensure that the main thread does not touch the
9251 * object while the I/O thread is using it, but we can't
9252 * control other keys without adding additional mutex. */
9253 if (key->storage != REDIS_VM_MEMORY ||
9254 (server.vm_max_threads != 0 && val->refcount != 1)) {
9255 if (maxtries) i--; /* don't count this try */
9256 continue;
9257 }
9258 swappability = computeObjectSwappability(val);
9259 if (!best || swappability > best_swappability) {
9260 best = de;
9261 best_swappability = swappability;
9262 best_db = db;
9263 }
9264 }
9265 }
9266 if (best == NULL) return REDIS_ERR;
9267 key = dictGetEntryKey(best);
9268 val = dictGetEntryVal(best);
9269
9270 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9271 key->ptr, best_swappability);
9272
9273 /* Unshare the key if needed */
9274 if (key->refcount > 1) {
9275 robj *newkey = dupStringObject(key);
9276 decrRefCount(key);
9277 key = dictGetEntryKey(best) = newkey;
9278 }
9279 /* Swap it */
9280 if (usethreads) {
9281 vmSwapObjectThreaded(key,val,best_db);
9282 return REDIS_OK;
9283 } else {
9284 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9285 dictGetEntryVal(best) = NULL;
9286 return REDIS_OK;
9287 } else {
9288 return REDIS_ERR;
9289 }
9290 }
9291 }
9292
/* Swap one object out synchronously: vmSwapOneObject() without threads. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9296
/* Swap one object out in background: vmSwapOneObject() using threads. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9300
9301 /* Return true if it's safe to swap out objects in a given moment.
9302 * Basically we don't want to swap objects out while there is a BGSAVE
9303 * or a BGAEOREWRITE running in backgroud. */
9304 static int vmCanSwapOut(void) {
9305 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9306 }
9307
9308 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9309 * and was deleted. Otherwise 0 is returned. */
9310 static int deleteIfSwapped(redisDb *db, robj *key) {
9311 dictEntry *de;
9312 robj *foundkey;
9313
9314 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9315 foundkey = dictGetEntryKey(de);
9316 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9317 deleteKey(db,key);
9318 return 1;
9319 }
9320
9321 /* =================== Virtual Memory - Threaded I/O ======================= */
9322
9323 static void freeIOJob(iojob *j) {
9324 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9325 j->type == REDIS_IOJOB_DO_SWAP ||
9326 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9327 decrRefCount(j->val);
9328 /* We don't decrRefCount the j->key field as we did't incremented
9329 * the count creating IO Jobs. This is because the key field here is
9330 * just used as an indentifier and if a key is removed the Job should
9331 * never be touched again. */
9332 zfree(j);
9333 }
9334
9335 /* Every time a thread finished a Job, it writes a byte into the write side
9336 * of an unix pipe in order to "awake" the main thread, and this function
9337 * is called. */
9338 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9339 int mask)
9340 {
9341 char buf[1];
9342 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9343 REDIS_NOTUSED(el);
9344 REDIS_NOTUSED(mask);
9345 REDIS_NOTUSED(privdata);
9346
9347 /* For every byte we read in the read side of the pipe, there is one
9348 * I/O job completed to process. */
9349 while((retval = read(fd,buf,1)) == 1) {
9350 iojob *j;
9351 listNode *ln;
9352 robj *key;
9353 struct dictEntry *de;
9354
9355 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9356
9357 /* Get the processed element (the oldest one) */
9358 lockThreadedIO();
9359 assert(listLength(server.io_processed) != 0);
9360 if (toprocess == -1) {
9361 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9362 if (toprocess <= 0) toprocess = 1;
9363 }
9364 ln = listFirst(server.io_processed);
9365 j = ln->value;
9366 listDelNode(server.io_processed,ln);
9367 unlockThreadedIO();
9368 /* If this job is marked as canceled, just ignore it */
9369 if (j->canceled) {
9370 freeIOJob(j);
9371 continue;
9372 }
9373 /* Post process it in the main thread, as there are things we
9374 * can do just here to avoid race conditions and/or invasive locks */
9375 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9376 de = dictFind(j->db->dict,j->key);
9377 assert(de != NULL);
9378 key = dictGetEntryKey(de);
9379 if (j->type == REDIS_IOJOB_LOAD) {
9380 redisDb *db;
9381
9382 /* Key loaded, bring it at home */
9383 key->storage = REDIS_VM_MEMORY;
9384 key->vm.atime = server.unixtime;
9385 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9386 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9387 (unsigned char*) key->ptr);
9388 server.vm_stats_swapped_objects--;
9389 server.vm_stats_swapins++;
9390 dictGetEntryVal(de) = j->val;
9391 incrRefCount(j->val);
9392 db = j->db;
9393 freeIOJob(j);
9394 /* Handle clients waiting for this key to be loaded. */
9395 handleClientsBlockedOnSwappedKey(db,key);
9396 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9397 /* Now we know the amount of pages required to swap this object.
9398 * Let's find some space for it, and queue this task again
9399 * rebranded as REDIS_IOJOB_DO_SWAP. */
9400 if (!vmCanSwapOut() ||
9401 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9402 {
9403 /* Ooops... no space or we can't swap as there is
9404 * a fork()ed Redis trying to save stuff on disk. */
9405 freeIOJob(j);
9406 key->storage = REDIS_VM_MEMORY; /* undo operation */
9407 } else {
9408 /* Note that we need to mark this pages as used now,
9409 * if the job will be canceled, we'll mark them as freed
9410 * again. */
9411 vmMarkPagesUsed(j->page,j->pages);
9412 j->type = REDIS_IOJOB_DO_SWAP;
9413 lockThreadedIO();
9414 queueIOJob(j);
9415 unlockThreadedIO();
9416 }
9417 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9418 robj *val;
9419
9420 /* Key swapped. We can finally free some memory. */
9421 if (key->storage != REDIS_VM_SWAPPING) {
9422 printf("key->storage: %d\n",key->storage);
9423 printf("key->name: %s\n",(char*)key->ptr);
9424 printf("key->refcount: %d\n",key->refcount);
9425 printf("val: %p\n",(void*)j->val);
9426 printf("val->type: %d\n",j->val->type);
9427 printf("val->ptr: %s\n",(char*)j->val->ptr);
9428 }
9429 redisAssert(key->storage == REDIS_VM_SWAPPING);
9430 val = dictGetEntryVal(de);
9431 key->vm.page = j->page;
9432 key->vm.usedpages = j->pages;
9433 key->storage = REDIS_VM_SWAPPED;
9434 key->vtype = j->val->type;
9435 decrRefCount(val); /* Deallocate the object from memory. */
9436 dictGetEntryVal(de) = NULL;
9437 redisLog(REDIS_DEBUG,
9438 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9439 (unsigned char*) key->ptr,
9440 (unsigned long long) j->page, (unsigned long long) j->pages);
9441 server.vm_stats_swapped_objects++;
9442 server.vm_stats_swapouts++;
9443 freeIOJob(j);
9444 /* Put a few more swap requests in queue if we are still
9445 * out of memory */
9446 if (trytoswap && vmCanSwapOut() &&
9447 zmalloc_used_memory() > server.vm_max_memory)
9448 {
9449 int more = 1;
9450 while(more) {
9451 lockThreadedIO();
9452 more = listLength(server.io_newjobs) <
9453 (unsigned) server.vm_max_threads;
9454 unlockThreadedIO();
9455 /* Don't waste CPU time if swappable objects are rare. */
9456 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9457 trytoswap = 0;
9458 break;
9459 }
9460 }
9461 }
9462 }
9463 processed++;
9464 if (processed == toprocess) return;
9465 }
9466 if (retval < 0 && errno != EAGAIN) {
9467 redisLog(REDIS_WARNING,
9468 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9469 strerror(errno));
9470 }
9471 }
9472
9473 static void lockThreadedIO(void) {
9474 pthread_mutex_lock(&server.io_mutex);
9475 }
9476
9477 static void unlockThreadedIO(void) {
9478 pthread_mutex_unlock(&server.io_mutex);
9479 }
9480
9481 /* Remove the specified object from the threaded I/O queue if still not
9482 * processed, otherwise make sure to flag it as canceled. */
9483 static void vmCancelThreadedIOJob(robj *o) {
9484 list *lists[3] = {
9485 server.io_newjobs, /* 0 */
9486 server.io_processing, /* 1 */
9487 server.io_processed /* 2 */
9488 };
9489 int i;
9490
9491 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9492 again:
9493 lockThreadedIO();
9494 /* Search for a matching key in one of the queues */
9495 for (i = 0; i < 3; i++) {
9496 listNode *ln;
9497 listIter li;
9498
9499 listRewind(lists[i],&li);
9500 while ((ln = listNext(&li)) != NULL) {
9501 iojob *job = ln->value;
9502
9503 if (job->canceled) continue; /* Skip this, already canceled. */
9504 if (job->key == o) {
9505 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9506 (void*)job, (char*)o->ptr, job->type, i);
9507 /* Mark the pages as free since the swap didn't happened
9508 * or happened but is now discarded. */
9509 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9510 vmMarkPagesFree(job->page,job->pages);
9511 /* Cancel the job. It depends on the list the job is
9512 * living in. */
9513 switch(i) {
9514 case 0: /* io_newjobs */
9515 /* If the job was yet not processed the best thing to do
9516 * is to remove it from the queue at all */
9517 freeIOJob(job);
9518 listDelNode(lists[i],ln);
9519 break;
9520 case 1: /* io_processing */
9521 /* Oh Shi- the thread is messing with the Job:
9522 *
9523 * Probably it's accessing the object if this is a
9524 * PREPARE_SWAP or DO_SWAP job.
9525 * If it's a LOAD job it may be reading from disk and
9526 * if we don't wait for the job to terminate before to
9527 * cancel it, maybe in a few microseconds data can be
9528 * corrupted in this pages. So the short story is:
9529 *
9530 * Better to wait for the job to move into the
9531 * next queue (processed)... */
9532
9533 /* We try again and again until the job is completed. */
9534 unlockThreadedIO();
9535 /* But let's wait some time for the I/O thread
9536 * to finish with this job. After all this condition
9537 * should be very rare. */
9538 usleep(1);
9539 goto again;
9540 case 2: /* io_processed */
9541 /* The job was already processed, that's easy...
9542 * just mark it as canceled so that we'll ignore it
9543 * when processing completed jobs. */
9544 job->canceled = 1;
9545 break;
9546 }
9547 /* Finally we have to adjust the storage type of the object
9548 * in order to "UNDO" the operaiton. */
9549 if (o->storage == REDIS_VM_LOADING)
9550 o->storage = REDIS_VM_SWAPPED;
9551 else if (o->storage == REDIS_VM_SWAPPING)
9552 o->storage = REDIS_VM_MEMORY;
9553 unlockThreadedIO();
9554 return;
9555 }
9556 }
9557 }
9558 unlockThreadedIO();
9559 assert(1 != 1); /* We should never reach this */
9560 }
9561
9562 static void *IOThreadEntryPoint(void *arg) {
9563 iojob *j;
9564 listNode *ln;
9565 REDIS_NOTUSED(arg);
9566
9567 pthread_detach(pthread_self());
9568 while(1) {
9569 /* Get a new job to process */
9570 lockThreadedIO();
9571 if (listLength(server.io_newjobs) == 0) {
9572 /* No new jobs in queue, exit. */
9573 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9574 (long) pthread_self());
9575 server.io_active_threads--;
9576 unlockThreadedIO();
9577 return NULL;
9578 }
9579 ln = listFirst(server.io_newjobs);
9580 j = ln->value;
9581 listDelNode(server.io_newjobs,ln);
9582 /* Add the job in the processing queue */
9583 j->thread = pthread_self();
9584 listAddNodeTail(server.io_processing,j);
9585 ln = listLast(server.io_processing); /* We use ln later to remove it */
9586 unlockThreadedIO();
9587 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9588 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9589
9590 /* Process the Job */
9591 if (j->type == REDIS_IOJOB_LOAD) {
9592 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9593 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9594 FILE *fp = fopen("/dev/null","w+");
9595 j->pages = rdbSavedObjectPages(j->val,fp);
9596 fclose(fp);
9597 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9598 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9599 j->canceled = 1;
9600 }
9601
9602 /* Done: insert the job into the processed queue */
9603 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9604 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9605 lockThreadedIO();
9606 listDelNode(server.io_processing,ln);
9607 listAddNodeTail(server.io_processed,j);
9608 unlockThreadedIO();
9609
9610 /* Signal the main thread there is new stuff to process */
9611 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9612 }
9613 return NULL; /* never reached */
9614 }
9615
9616 static void spawnIOThread(void) {
9617 pthread_t thread;
9618 sigset_t mask, omask;
9619 int err;
9620
9621 sigemptyset(&mask);
9622 sigaddset(&mask,SIGCHLD);
9623 sigaddset(&mask,SIGHUP);
9624 sigaddset(&mask,SIGPIPE);
9625 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9626 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9627 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9628 strerror(err));
9629 usleep(1000000);
9630 }
9631 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9632 server.io_active_threads++;
9633 }
9634
9635 /* We need to wait for the last thread to exit before we are able to
9636 * fork() in order to BGSAVE or BGREWRITEAOF. */
9637 static void waitEmptyIOJobsQueue(void) {
9638 while(1) {
9639 int io_processed_len;
9640
9641 lockThreadedIO();
9642 if (listLength(server.io_newjobs) == 0 &&
9643 listLength(server.io_processing) == 0 &&
9644 server.io_active_threads == 0)
9645 {
9646 unlockThreadedIO();
9647 return;
9648 }
9649 /* While waiting for empty jobs queue condition we post-process some
9650 * finshed job, as I/O threads may be hanging trying to write against
9651 * the io_ready_pipe_write FD but there are so much pending jobs that
9652 * it's blocking. */
9653 io_processed_len = listLength(server.io_processed);
9654 unlockThreadedIO();
9655 if (io_processed_len) {
9656 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9657 usleep(1000); /* 1 millisecond */
9658 } else {
9659 usleep(10000); /* 10 milliseconds */
9660 }
9661 }
9662 }
9663
9664 static void vmReopenSwapFile(void) {
9665 /* Note: we don't close the old one as we are in the child process
9666 * and don't want to mess at all with the original file object. */
9667 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9668 if (server.vm_fp == NULL) {
9669 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9670 server.vm_swap_file);
9671 _exit(1);
9672 }
9673 server.vm_fd = fileno(server.vm_fp);
9674 }
9675
9676 /* This function must be called while with threaded IO locked */
9677 static void queueIOJob(iojob *j) {
9678 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9679 (void*)j, j->type, (char*)j->key->ptr);
9680 listAddNodeTail(server.io_newjobs,j);
9681 if (server.io_active_threads < server.vm_max_threads)
9682 spawnIOThread();
9683 }
9684
9685 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9686 iojob *j;
9687
9688 assert(key->storage == REDIS_VM_MEMORY);
9689 assert(key->refcount == 1);
9690
9691 j = zmalloc(sizeof(*j));
9692 j->type = REDIS_IOJOB_PREPARE_SWAP;
9693 j->db = db;
9694 j->key = key;
9695 j->val = val;
9696 incrRefCount(val);
9697 j->canceled = 0;
9698 j->thread = (pthread_t) -1;
9699 key->storage = REDIS_VM_SWAPPING;
9700
9701 lockThreadedIO();
9702 queueIOJob(j);
9703 unlockThreadedIO();
9704 return REDIS_OK;
9705 }
9706
9707 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9708
9709 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9710 * If there is not already a job loading the key, it is craeted.
9711 * The key is added to the io_keys list in the client structure, and also
9712 * in the hash table mapping swapped keys to waiting clients, that is,
9713 * server.io_waited_keys. */
9714 static int waitForSwappedKey(redisClient *c, robj *key) {
9715 struct dictEntry *de;
9716 robj *o;
9717 list *l;
9718
9719 /* If the key does not exist or is already in RAM we don't need to
9720 * block the client at all. */
9721 de = dictFind(c->db->dict,key);
9722 if (de == NULL) return 0;
9723 o = dictGetEntryKey(de);
9724 if (o->storage == REDIS_VM_MEMORY) {
9725 return 0;
9726 } else if (o->storage == REDIS_VM_SWAPPING) {
9727 /* We were swapping the key, undo it! */
9728 vmCancelThreadedIOJob(o);
9729 return 0;
9730 }
9731
9732 /* OK: the key is either swapped, or being loaded just now. */
9733
9734 /* Add the key to the list of keys this client is waiting for.
9735 * This maps clients to keys they are waiting for. */
9736 listAddNodeTail(c->io_keys,key);
9737 incrRefCount(key);
9738
9739 /* Add the client to the swapped keys => clients waiting map. */
9740 de = dictFind(c->db->io_keys,key);
9741 if (de == NULL) {
9742 int retval;
9743
9744 /* For every key we take a list of clients blocked for it */
9745 l = listCreate();
9746 retval = dictAdd(c->db->io_keys,key,l);
9747 incrRefCount(key);
9748 assert(retval == DICT_OK);
9749 } else {
9750 l = dictGetEntryVal(de);
9751 }
9752 listAddNodeTail(l,c);
9753
9754 /* Are we already loading the key from disk? If not create a job */
9755 if (o->storage == REDIS_VM_SWAPPED) {
9756 iojob *j;
9757
9758 o->storage = REDIS_VM_LOADING;
9759 j = zmalloc(sizeof(*j));
9760 j->type = REDIS_IOJOB_LOAD;
9761 j->db = c->db;
9762 j->key = o;
9763 j->key->vtype = o->vtype;
9764 j->page = o->vm.page;
9765 j->val = NULL;
9766 j->canceled = 0;
9767 j->thread = (pthread_t) -1;
9768 lockThreadedIO();
9769 queueIOJob(j);
9770 unlockThreadedIO();
9771 }
9772 return 1;
9773 }
9774
9775 /* Preload keys for any command with first, last and step values for
9776 * the command keys prototype, as defined in the command table. */
9777 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9778 int j, last;
9779 if (cmd->vm_firstkey == 0) return;
9780 last = cmd->vm_lastkey;
9781 if (last < 0) last = argc+last;
9782 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9783 redisAssert(j < argc);
9784 waitForSwappedKey(c,argv[j]);
9785 }
9786 }
9787
9788 /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
9789 * Note that the number of keys to preload is user-defined, so we need to
9790 * apply a sanity check against argc. */
9791 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9792 int i, num;
9793 REDIS_NOTUSED(cmd);
9794
9795 num = atoi(argv[2]->ptr);
9796 if (num > (argc-3)) return;
9797 for (i = 0; i < num; i++) {
9798 waitForSwappedKey(c,argv[3+i]);
9799 }
9800 }
9801
9802 /* Preload keys needed to execute the entire MULTI/EXEC block.
9803 *
9804 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9805 * and will block the client when any command requires a swapped out value. */
9806 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9807 int i, margc;
9808 struct redisCommand *mcmd;
9809 robj **margv;
9810 REDIS_NOTUSED(cmd);
9811 REDIS_NOTUSED(argc);
9812 REDIS_NOTUSED(argv);
9813
9814 if (!(c->flags & REDIS_MULTI)) return;
9815 for (i = 0; i < c->mstate.count; i++) {
9816 mcmd = c->mstate.commands[i].cmd;
9817 margc = c->mstate.commands[i].argc;
9818 margv = c->mstate.commands[i].argv;
9819
9820 if (mcmd->vm_preload_proc != NULL) {
9821 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9822 } else {
9823 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9824 }
9825 }
9826 }
9827
9828 /* Is this client attempting to run a command against swapped keys?
9829 * If so, block it ASAP, load the keys in background, then resume it.
9830 *
9831 * The important idea about this function is that it can fail! If keys will
9832 * still be swapped when the client is resumed, this key lookups will
9833 * just block loading keys from disk. In practical terms this should only
9834 * happen with SORT BY command or if there is a bug in this function.
9835 *
9836 * Return 1 if the client is marked as blocked, 0 if the client can
9837 * continue as the keys it is going to access appear to be in memory. */
9838 static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
9839 if (cmd->vm_preload_proc != NULL) {
9840 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9841 } else {
9842 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9843 }
9844
9845 /* If the client was blocked for at least one key, mark it as blocked. */
9846 if (listLength(c->io_keys)) {
9847 c->flags |= REDIS_IO_WAIT;
9848 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9849 server.vm_blocked_clients++;
9850 return 1;
9851 } else {
9852 return 0;
9853 }
9854 }
9855
9856 /* Remove the 'key' from the list of blocked keys for a given client.
9857 *
9858 * The function returns 1 when there are no longer blocking keys after
9859 * the current one was removed (and the client can be unblocked). */
9860 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9861 list *l;
9862 listNode *ln;
9863 listIter li;
9864 struct dictEntry *de;
9865
9866 /* Remove the key from the list of keys this client is waiting for. */
9867 listRewind(c->io_keys,&li);
9868 while ((ln = listNext(&li)) != NULL) {
9869 if (equalStringObjects(ln->value,key)) {
9870 listDelNode(c->io_keys,ln);
9871 break;
9872 }
9873 }
9874 assert(ln != NULL);
9875
9876 /* Remove the client form the key => waiting clients map. */
9877 de = dictFind(c->db->io_keys,key);
9878 assert(de != NULL);
9879 l = dictGetEntryVal(de);
9880 ln = listSearchKey(l,c);
9881 assert(ln != NULL);
9882 listDelNode(l,ln);
9883 if (listLength(l) == 0)
9884 dictDelete(c->db->io_keys,key);
9885
9886 return listLength(c->io_keys) == 0;
9887 }
9888
9889 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9890 struct dictEntry *de;
9891 list *l;
9892 listNode *ln;
9893 int len;
9894
9895 de = dictFind(db->io_keys,key);
9896 if (!de) return;
9897
9898 l = dictGetEntryVal(de);
9899 len = listLength(l);
9900 /* Note: we can't use something like while(listLength(l)) as the list
9901 * can be freed by the calling function when we remove the last element. */
9902 while (len--) {
9903 ln = listFirst(l);
9904 redisClient *c = ln->value;
9905
9906 if (dontWaitForSwappedKey(c,key)) {
9907 /* Put the client in the list of clients ready to go as we
9908 * loaded all the keys about it. */
9909 listAddNodeTail(server.io_ready_clients,c);
9910 }
9911 }
9912 }
9913
9914 /* =========================== Remote Configuration ========================= */
9915
9916 static void configSetCommand(redisClient *c) {
9917 robj *o = getDecodedObject(c->argv[3]);
9918 long long ll;
9919
9920 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9921 zfree(server.dbfilename);
9922 server.dbfilename = zstrdup(o->ptr);
9923 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9924 zfree(server.requirepass);
9925 server.requirepass = zstrdup(o->ptr);
9926 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9927 zfree(server.masterauth);
9928 server.masterauth = zstrdup(o->ptr);
9929 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9930 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9931 ll < 0) goto badfmt;
9932 server.maxmemory = ll;
9933 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9934 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9935 ll < 0 || ll > LONG_MAX) goto badfmt;
9936 server.maxidletime = ll;
9937 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9938 if (!strcasecmp(o->ptr,"no")) {
9939 server.appendfsync = APPENDFSYNC_NO;
9940 } else if (!strcasecmp(o->ptr,"everysec")) {
9941 server.appendfsync = APPENDFSYNC_EVERYSEC;
9942 } else if (!strcasecmp(o->ptr,"always")) {
9943 server.appendfsync = APPENDFSYNC_ALWAYS;
9944 } else {
9945 goto badfmt;
9946 }
9947 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9948 int old = server.appendonly;
9949 int new = yesnotoi(o->ptr);
9950
9951 if (new == -1) goto badfmt;
9952 if (old != new) {
9953 if (new == 0) {
9954 stopAppendOnly();
9955 } else {
9956 if (startAppendOnly() == REDIS_ERR) {
9957 addReplySds(c,sdscatprintf(sdsempty(),
9958 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9959 decrRefCount(o);
9960 return;
9961 }
9962 }
9963 }
9964 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9965 int vlen, j;
9966 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9967
9968 /* Perform sanity check before setting the new config:
9969 * - Even number of args
9970 * - Seconds >= 1, changes >= 0 */
9971 if (vlen & 1) {
9972 sdsfreesplitres(v,vlen);
9973 goto badfmt;
9974 }
9975 for (j = 0; j < vlen; j++) {
9976 char *eptr;
9977 long val;
9978
9979 val = strtoll(v[j], &eptr, 10);
9980 if (eptr[0] != '\0' ||
9981 ((j & 1) == 0 && val < 1) ||
9982 ((j & 1) == 1 && val < 0)) {
9983 sdsfreesplitres(v,vlen);
9984 goto badfmt;
9985 }
9986 }
9987 /* Finally set the new config */
9988 resetServerSaveParams();
9989 for (j = 0; j < vlen; j += 2) {
9990 time_t seconds;
9991 int changes;
9992
9993 seconds = strtoll(v[j],NULL,10);
9994 changes = strtoll(v[j+1],NULL,10);
9995 appendServerSaveParams(seconds, changes);
9996 }
9997 sdsfreesplitres(v,vlen);
9998 } else {
9999 addReplySds(c,sdscatprintf(sdsempty(),
10000 "-ERR not supported CONFIG parameter %s\r\n",
10001 (char*)c->argv[2]->ptr));
10002 decrRefCount(o);
10003 return;
10004 }
10005 decrRefCount(o);
10006 addReply(c,shared.ok);
10007 return;
10008
10009 badfmt: /* Bad format errors */
10010 addReplySds(c,sdscatprintf(sdsempty(),
10011 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10012 (char*)o->ptr,
10013 (char*)c->argv[2]->ptr));
10014 decrRefCount(o);
10015 }
10016
10017 static void configGetCommand(redisClient *c) {
10018 robj *o = getDecodedObject(c->argv[2]);
10019 robj *lenobj = createObject(REDIS_STRING,NULL);
10020 char *pattern = o->ptr;
10021 int matches = 0;
10022
10023 addReply(c,lenobj);
10024 decrRefCount(lenobj);
10025
10026 if (stringmatch(pattern,"dbfilename",0)) {
10027 addReplyBulkCString(c,"dbfilename");
10028 addReplyBulkCString(c,server.dbfilename);
10029 matches++;
10030 }
10031 if (stringmatch(pattern,"requirepass",0)) {
10032 addReplyBulkCString(c,"requirepass");
10033 addReplyBulkCString(c,server.requirepass);
10034 matches++;
10035 }
10036 if (stringmatch(pattern,"masterauth",0)) {
10037 addReplyBulkCString(c,"masterauth");
10038 addReplyBulkCString(c,server.masterauth);
10039 matches++;
10040 }
10041 if (stringmatch(pattern,"maxmemory",0)) {
10042 char buf[128];
10043
10044 ll2string(buf,128,server.maxmemory);
10045 addReplyBulkCString(c,"maxmemory");
10046 addReplyBulkCString(c,buf);
10047 matches++;
10048 }
10049 if (stringmatch(pattern,"timeout",0)) {
10050 char buf[128];
10051
10052 ll2string(buf,128,server.maxidletime);
10053 addReplyBulkCString(c,"timeout");
10054 addReplyBulkCString(c,buf);
10055 matches++;
10056 }
10057 if (stringmatch(pattern,"appendonly",0)) {
10058 addReplyBulkCString(c,"appendonly");
10059 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10060 matches++;
10061 }
10062 if (stringmatch(pattern,"appendfsync",0)) {
10063 char *policy;
10064
10065 switch(server.appendfsync) {
10066 case APPENDFSYNC_NO: policy = "no"; break;
10067 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10068 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10069 default: policy = "unknown"; break; /* too harmless to panic */
10070 }
10071 addReplyBulkCString(c,"appendfsync");
10072 addReplyBulkCString(c,policy);
10073 matches++;
10074 }
10075 if (stringmatch(pattern,"save",0)) {
10076 sds buf = sdsempty();
10077 int j;
10078
10079 for (j = 0; j < server.saveparamslen; j++) {
10080 buf = sdscatprintf(buf,"%ld %d",
10081 server.saveparams[j].seconds,
10082 server.saveparams[j].changes);
10083 if (j != server.saveparamslen-1)
10084 buf = sdscatlen(buf," ",1);
10085 }
10086 addReplyBulkCString(c,"save");
10087 addReplyBulkCString(c,buf);
10088 sdsfree(buf);
10089 matches++;
10090 }
10091 decrRefCount(o);
10092 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10093 }
10094
10095 static void configCommand(redisClient *c) {
10096 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10097 if (c->argc != 4) goto badarity;
10098 configSetCommand(c);
10099 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10100 if (c->argc != 3) goto badarity;
10101 configGetCommand(c);
10102 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10103 if (c->argc != 2) goto badarity;
10104 server.stat_numcommands = 0;
10105 server.stat_numconnections = 0;
10106 server.stat_expiredkeys = 0;
10107 server.stat_starttime = time(NULL);
10108 addReply(c,shared.ok);
10109 } else {
10110 addReplySds(c,sdscatprintf(sdsempty(),
10111 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10112 }
10113 return;
10114
10115 badarity:
10116 addReplySds(c,sdscatprintf(sdsempty(),
10117 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10118 (char*) c->argv[1]->ptr));
10119 }
10120
10121 /* =========================== Pubsub implementation ======================== */
10122
10123 static void freePubsubPattern(void *p) {
10124 pubsubPattern *pat = p;
10125
10126 decrRefCount(pat->pattern);
10127 zfree(pat);
10128 }
10129
10130 static int listMatchPubsubPattern(void *a, void *b) {
10131 pubsubPattern *pa = a, *pb = b;
10132
10133 return (pa->client == pb->client) &&
10134 (equalStringObjects(pa->pattern,pb->pattern));
10135 }
10136
10137 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10138 * 0 if the client was already subscribed to that channel. */
10139 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
10140 struct dictEntry *de;
10141 list *clients = NULL;
10142 int retval = 0;
10143
10144 /* Add the channel to the client -> channels hash table */
10145 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
10146 retval = 1;
10147 incrRefCount(channel);
10148 /* Add the client to the channel -> list of clients hash table */
10149 de = dictFind(server.pubsub_channels,channel);
10150 if (de == NULL) {
10151 clients = listCreate();
10152 dictAdd(server.pubsub_channels,channel,clients);
10153 incrRefCount(channel);
10154 } else {
10155 clients = dictGetEntryVal(de);
10156 }
10157 listAddNodeTail(clients,c);
10158 }
10159 /* Notify the client */
10160 addReply(c,shared.mbulk3);
10161 addReply(c,shared.subscribebulk);
10162 addReplyBulk(c,channel);
10163 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10164 return retval;
10165 }
10166
10167 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10168 * 0 if the client was not subscribed to the specified channel. */
10169 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
10170 struct dictEntry *de;
10171 list *clients;
10172 listNode *ln;
10173 int retval = 0;
10174
10175 /* Remove the channel from the client -> channels hash table */
10176 incrRefCount(channel); /* channel may be just a pointer to the same object
10177 we have in the hash tables. Protect it... */
10178 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
10179 retval = 1;
10180 /* Remove the client from the channel -> clients list hash table */
10181 de = dictFind(server.pubsub_channels,channel);
10182 assert(de != NULL);
10183 clients = dictGetEntryVal(de);
10184 ln = listSearchKey(clients,c);
10185 assert(ln != NULL);
10186 listDelNode(clients,ln);
10187 if (listLength(clients) == 0) {
10188 /* Free the list and associated hash entry at all if this was
10189 * the latest client, so that it will be possible to abuse
10190 * Redis PUBSUB creating millions of channels. */
10191 dictDelete(server.pubsub_channels,channel);
10192 }
10193 }
10194 /* Notify the client */
10195 if (notify) {
10196 addReply(c,shared.mbulk3);
10197 addReply(c,shared.unsubscribebulk);
10198 addReplyBulk(c,channel);
10199 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10200 listLength(c->pubsub_patterns));
10201
10202 }
10203 decrRefCount(channel); /* it is finally safe to release it */
10204 return retval;
10205 }
10206
10207 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10208 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10209 int retval = 0;
10210
10211 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10212 retval = 1;
10213 pubsubPattern *pat;
10214 listAddNodeTail(c->pubsub_patterns,pattern);
10215 incrRefCount(pattern);
10216 pat = zmalloc(sizeof(*pat));
10217 pat->pattern = getDecodedObject(pattern);
10218 pat->client = c;
10219 listAddNodeTail(server.pubsub_patterns,pat);
10220 }
10221 /* Notify the client */
10222 addReply(c,shared.mbulk3);
10223 addReply(c,shared.psubscribebulk);
10224 addReplyBulk(c,pattern);
10225 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10226 return retval;
10227 }
10228
10229 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10230 * 0 if the client was not subscribed to the specified channel. */
10231 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10232 listNode *ln;
10233 pubsubPattern pat;
10234 int retval = 0;
10235
10236 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10237 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10238 retval = 1;
10239 listDelNode(c->pubsub_patterns,ln);
10240 pat.client = c;
10241 pat.pattern = pattern;
10242 ln = listSearchKey(server.pubsub_patterns,&pat);
10243 listDelNode(server.pubsub_patterns,ln);
10244 }
10245 /* Notify the client */
10246 if (notify) {
10247 addReply(c,shared.mbulk3);
10248 addReply(c,shared.punsubscribebulk);
10249 addReplyBulk(c,pattern);
10250 addReplyLongLong(c,dictSize(c->pubsub_channels)+
10251 listLength(c->pubsub_patterns));
10252 }
10253 decrRefCount(pattern);
10254 return retval;
10255 }
10256
10257 /* Unsubscribe from all the channels. Return the number of channels the
10258 * client was subscribed from. */
10259 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10260 dictIterator *di = dictGetIterator(c->pubsub_channels);
10261 dictEntry *de;
10262 int count = 0;
10263
10264 while((de = dictNext(di)) != NULL) {
10265 robj *channel = dictGetEntryKey(de);
10266
10267 count += pubsubUnsubscribeChannel(c,channel,notify);
10268 }
10269 dictReleaseIterator(di);
10270 return count;
10271 }
10272
10273 /* Unsubscribe from all the patterns. Return the number of patterns the
10274 * client was subscribed from. */
10275 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10276 listNode *ln;
10277 listIter li;
10278 int count = 0;
10279
10280 listRewind(c->pubsub_patterns,&li);
10281 while ((ln = listNext(&li)) != NULL) {
10282 robj *pattern = ln->value;
10283
10284 count += pubsubUnsubscribePattern(c,pattern,notify);
10285 }
10286 return count;
10287 }
10288
10289 /* Publish a message */
10290 static int pubsubPublishMessage(robj *channel, robj *message) {
10291 int receivers = 0;
10292 struct dictEntry *de;
10293 listNode *ln;
10294 listIter li;
10295
10296 /* Send to clients listening for that channel */
10297 de = dictFind(server.pubsub_channels,channel);
10298 if (de) {
10299 list *list = dictGetEntryVal(de);
10300 listNode *ln;
10301 listIter li;
10302
10303 listRewind(list,&li);
10304 while ((ln = listNext(&li)) != NULL) {
10305 redisClient *c = ln->value;
10306
10307 addReply(c,shared.mbulk3);
10308 addReply(c,shared.messagebulk);
10309 addReplyBulk(c,channel);
10310 addReplyBulk(c,message);
10311 receivers++;
10312 }
10313 }
10314 /* Send to clients listening to matching channels */
10315 if (listLength(server.pubsub_patterns)) {
10316 listRewind(server.pubsub_patterns,&li);
10317 channel = getDecodedObject(channel);
10318 while ((ln = listNext(&li)) != NULL) {
10319 pubsubPattern *pat = ln->value;
10320
10321 if (stringmatchlen((char*)pat->pattern->ptr,
10322 sdslen(pat->pattern->ptr),
10323 (char*)channel->ptr,
10324 sdslen(channel->ptr),0)) {
10325 addReply(pat->client,shared.mbulk4);
10326 addReply(pat->client,shared.pmessagebulk);
10327 addReplyBulk(pat->client,pat->pattern);
10328 addReplyBulk(pat->client,channel);
10329 addReplyBulk(pat->client,message);
10330 receivers++;
10331 }
10332 }
10333 decrRefCount(channel);
10334 }
10335 return receivers;
10336 }
10337
10338 static void subscribeCommand(redisClient *c) {
10339 int j;
10340
10341 for (j = 1; j < c->argc; j++)
10342 pubsubSubscribeChannel(c,c->argv[j]);
10343 }
10344
10345 static void unsubscribeCommand(redisClient *c) {
10346 if (c->argc == 1) {
10347 pubsubUnsubscribeAllChannels(c,1);
10348 return;
10349 } else {
10350 int j;
10351
10352 for (j = 1; j < c->argc; j++)
10353 pubsubUnsubscribeChannel(c,c->argv[j],1);
10354 }
10355 }
10356
10357 static void psubscribeCommand(redisClient *c) {
10358 int j;
10359
10360 for (j = 1; j < c->argc; j++)
10361 pubsubSubscribePattern(c,c->argv[j]);
10362 }
10363
10364 static void punsubscribeCommand(redisClient *c) {
10365 if (c->argc == 1) {
10366 pubsubUnsubscribeAllPatterns(c,1);
10367 return;
10368 } else {
10369 int j;
10370
10371 for (j = 1; j < c->argc; j++)
10372 pubsubUnsubscribePattern(c,c->argv[j],1);
10373 }
10374 }
10375
10376 static void publishCommand(redisClient *c) {
10377 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10378 addReplyLongLong(c,receivers);
10379 }
10380
10381 /* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10382 *
10383 * The implementation uses a per-DB hash table mapping keys to list of clients
10384 * WATCHing those keys, so that given a key that is going to be modified
10385 * we can mark all the associated clients as dirty.
10386 *
10387 * Also every client contains a list of WATCHed keys so that it's possible to
10388 * un-watch such keys when the client is freed or when UNWATCH is called. */
10389
10390 /* In the client->watched_keys list we need to use watchedKey structures
10391 * as in order to identify a key in Redis we need both the key name and the
10392 * DB */
10393 typedef struct watchedKey {
10394 robj *key;
10395 redisDb *db;
10396 } watchedKey;
10397
10398 /* Watch for the specified key */
10399 static void watchForKey(redisClient *c, robj *key) {
10400 list *clients = NULL;
10401 listIter li;
10402 listNode *ln;
10403 watchedKey *wk;
10404
10405 /* Check if we are already watching for this key */
10406 listRewind(c->watched_keys,&li);
10407 while((ln = listNext(&li))) {
10408 wk = listNodeValue(ln);
10409 if (wk->db == c->db && equalStringObjects(key,wk->key))
10410 return; /* Key already watched */
10411 }
10412 /* This key is not already watched in this DB. Let's add it */
10413 clients = dictFetchValue(c->db->watched_keys,key);
10414 if (!clients) {
10415 clients = listCreate();
10416 dictAdd(c->db->watched_keys,key,clients);
10417 incrRefCount(key);
10418 }
10419 listAddNodeTail(clients,c);
10420 /* Add the new key to the lits of keys watched by this client */
10421 wk = zmalloc(sizeof(*wk));
10422 wk->key = key;
10423 wk->db = c->db;
10424 incrRefCount(key);
10425 listAddNodeTail(c->watched_keys,wk);
10426 }
10427
10428 /* Unwatch all the keys watched by this client. To clean the EXEC dirty
10429 * flag is up to the caller. */
10430 static void unwatchAllKeys(redisClient *c) {
10431 listIter li;
10432 listNode *ln;
10433
10434 if (listLength(c->watched_keys) == 0) return;
10435 listRewind(c->watched_keys,&li);
10436 while((ln = listNext(&li))) {
10437 list *clients;
10438 watchedKey *wk;
10439
10440 /* Lookup the watched key -> clients list and remove the client
10441 * from the list */
10442 wk = listNodeValue(ln);
10443 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10444 assert(clients != NULL);
10445 listDelNode(clients,listSearchKey(clients,c));
10446 /* Kill the entry at all if this was the only client */
10447 if (listLength(clients) == 0)
10448 dictDelete(wk->db->watched_keys, wk->key);
10449 /* Remove this watched key from the client->watched list */
10450 listDelNode(c->watched_keys,ln);
10451 decrRefCount(wk->key);
10452 zfree(wk);
10453 }
10454 }
10455
10456 /* "Touch" a key, so that if this key is being WATCHed by soem client the
10457 * next EXEC will fail. */
10458 static void touchWatchedKey(redisDb *db, robj *key) {
10459 list *clients;
10460 listIter li;
10461 listNode *ln;
10462
10463 if (dictSize(db->watched_keys) == 0) return;
10464 clients = dictFetchValue(db->watched_keys, key);
10465 if (!clients) return;
10466
10467 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10468 /* Check if we are already watching for this key */
10469 listRewind(clients,&li);
10470 while((ln = listNext(&li))) {
10471 redisClient *c = listNodeValue(ln);
10472
10473 c->flags |= REDIS_DIRTY_CAS;
10474 }
10475 }
10476
10477 static void watchCommand(redisClient *c) {
10478 int j;
10479
10480 for (j = 1; j < c->argc; j++)
10481 watchForKey(c,c->argv[j]);
10482 addReply(c,shared.ok);
10483 }
10484
10485 static void unwatchCommand(redisClient *c) {
10486 unwatchAllKeys(c);
10487 c->flags &= (~REDIS_DIRTY_CAS);
10488 addReply(c,shared.ok);
10489 }
10490
10491 /* ================================= Debugging ============================== */
10492
10493 /* Compute the sha1 of string at 's' with 'len' bytes long.
10494 * The SHA1 is then xored againt the string pointed by digest.
10495 * Since xor is commutative, this operation is used in order to
10496 * "add" digests relative to unordered elements.
10497 *
10498 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10499 static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10500 SHA1_CTX ctx;
10501 unsigned char hash[20], *s = ptr;
10502 int j;
10503
10504 SHA1Init(&ctx);
10505 SHA1Update(&ctx,s,len);
10506 SHA1Final(hash,&ctx);
10507
10508 for (j = 0; j < 20; j++)
10509 digest[j] ^= hash[j];
10510 }
10511
10512 static void xorObjectDigest(unsigned char *digest, robj *o) {
10513 o = getDecodedObject(o);
10514 xorDigest(digest,o->ptr,sdslen(o->ptr));
10515 decrRefCount(o);
10516 }
10517
10518 /* This function instead of just computing the SHA1 and xoring it
10519 * against diget, also perform the digest of "digest" itself and
10520 * replace the old value with the new one.
10521 *
10522 * So the final digest will be:
10523 *
10524 * digest = SHA1(digest xor SHA1(data))
10525 *
10526 * This function is used every time we want to preserve the order so
10527 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10528 *
10529 * Also note that mixdigest("foo") followed by mixdigest("bar")
10530 * will lead to a different digest compared to "fo", "obar".
10531 */
10532 static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10533 SHA1_CTX ctx;
10534 char *s = ptr;
10535
10536 xorDigest(digest,s,len);
10537 SHA1Init(&ctx);
10538 SHA1Update(&ctx,digest,20);
10539 SHA1Final(digest,&ctx);
10540 }
10541
10542 static void mixObjectDigest(unsigned char *digest, robj *o) {
10543 o = getDecodedObject(o);
10544 mixDigest(digest,o->ptr,sdslen(o->ptr));
10545 decrRefCount(o);
10546 }
10547
10548 /* Compute the dataset digest. Since keys, sets elements, hashes elements
10549 * are not ordered, we use a trick: every aggregate digest is the xor
10550 * of the digests of their elements. This way the order will not change
10551 * the result. For list instead we use a feedback entering the output digest
10552 * as input in order to ensure that a different ordered list will result in
10553 * a different digest. */
10554 static void computeDatasetDigest(unsigned char *final) {
10555 unsigned char digest[20];
10556 char buf[128];
10557 dictIterator *di = NULL;
10558 dictEntry *de;
10559 int j;
10560 uint32_t aux;
10561
10562 memset(final,0,20); /* Start with a clean result */
10563
10564 for (j = 0; j < server.dbnum; j++) {
10565 redisDb *db = server.db+j;
10566
10567 if (dictSize(db->dict) == 0) continue;
10568 di = dictGetIterator(db->dict);
10569
10570 /* hash the DB id, so the same dataset moved in a different
10571 * DB will lead to a different digest */
10572 aux = htonl(j);
10573 mixDigest(final,&aux,sizeof(aux));
10574
10575 /* Iterate this DB writing every entry */
10576 while((de = dictNext(di)) != NULL) {
10577 robj *key, *o, *kcopy;
10578 time_t expiretime;
10579
10580 memset(digest,0,20); /* This key-val digest */
10581 key = dictGetEntryKey(de);
10582
10583 if (!server.vm_enabled) {
10584 mixObjectDigest(digest,key);
10585 o = dictGetEntryVal(de);
10586 } else {
10587 /* Don't work with the key directly as when VM is active
10588 * this is unsafe: TODO: fix decrRefCount to check if the
10589 * count really reached 0 to avoid this mess */
10590 kcopy = dupStringObject(key);
10591 mixObjectDigest(digest,kcopy);
10592 o = lookupKeyRead(db,kcopy);
10593 decrRefCount(kcopy);
10594 }
10595 aux = htonl(o->type);
10596 mixDigest(digest,&aux,sizeof(aux));
10597 expiretime = getExpire(db,key);
10598
10599 /* Save the key and associated value */
10600 if (o->type == REDIS_STRING) {
10601 mixObjectDigest(digest,o);
10602 } else if (o->type == REDIS_LIST) {
10603 list *list = o->ptr;
10604 listNode *ln;
10605 listIter li;
10606
10607 listRewind(list,&li);
10608 while((ln = listNext(&li))) {
10609 robj *eleobj = listNodeValue(ln);
10610
10611 mixObjectDigest(digest,eleobj);
10612 }
10613 } else if (o->type == REDIS_SET) {
10614 dict *set = o->ptr;
10615 dictIterator *di = dictGetIterator(set);
10616 dictEntry *de;
10617
10618 while((de = dictNext(di)) != NULL) {
10619 robj *eleobj = dictGetEntryKey(de);
10620
10621 xorObjectDigest(digest,eleobj);
10622 }
10623 dictReleaseIterator(di);
10624 } else if (o->type == REDIS_ZSET) {
10625 zset *zs = o->ptr;
10626 dictIterator *di = dictGetIterator(zs->dict);
10627 dictEntry *de;
10628
10629 while((de = dictNext(di)) != NULL) {
10630 robj *eleobj = dictGetEntryKey(de);
10631 double *score = dictGetEntryVal(de);
10632 unsigned char eledigest[20];
10633
10634 snprintf(buf,sizeof(buf),"%.17g",*score);
10635 memset(eledigest,0,20);
10636 mixObjectDigest(eledigest,eleobj);
10637 mixDigest(eledigest,buf,strlen(buf));
10638 xorDigest(digest,eledigest,20);
10639 }
10640 dictReleaseIterator(di);
10641 } else if (o->type == REDIS_HASH) {
10642 hashIterator *hi;
10643 robj *obj;
10644
10645 hi = hashInitIterator(o);
10646 while (hashNext(hi) != REDIS_ERR) {
10647 unsigned char eledigest[20];
10648
10649 memset(eledigest,0,20);
10650 obj = hashCurrent(hi,REDIS_HASH_KEY);
10651 mixObjectDigest(eledigest,obj);
10652 decrRefCount(obj);
10653 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10654 mixObjectDigest(eledigest,obj);
10655 decrRefCount(obj);
10656 xorDigest(digest,eledigest,20);
10657 }
10658 hashReleaseIterator(hi);
10659 } else {
10660 redisPanic("Unknown object type");
10661 }
10662 /* If the key has an expire, add it to the mix */
10663 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10664 /* We can finally xor the key-val digest to the final digest */
10665 xorDigest(final,digest,20);
10666 }
10667 dictReleaseIterator(di);
10668 }
10669 }
10670
10671 static void debugCommand(redisClient *c) {
10672 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10673 *((char*)-1) = 'x';
10674 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10675 if (rdbSave(server.dbfilename) != REDIS_OK) {
10676 addReply(c,shared.err);
10677 return;
10678 }
10679 emptyDb();
10680 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10681 addReply(c,shared.err);
10682 return;
10683 }
10684 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10685 addReply(c,shared.ok);
10686 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10687 emptyDb();
10688 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10689 addReply(c,shared.err);
10690 return;
10691 }
10692 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10693 addReply(c,shared.ok);
10694 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10695 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10696 robj *key, *val;
10697
10698 if (!de) {
10699 addReply(c,shared.nokeyerr);
10700 return;
10701 }
10702 key = dictGetEntryKey(de);
10703 val = dictGetEntryVal(de);
10704 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10705 key->storage == REDIS_VM_SWAPPING)) {
10706 char *strenc;
10707 char buf[128];
10708
10709 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10710 strenc = strencoding[val->encoding];
10711 } else {
10712 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10713 strenc = buf;
10714 }
10715 addReplySds(c,sdscatprintf(sdsempty(),
10716 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10717 "encoding:%s serializedlength:%lld\r\n",
10718 (void*)key, key->refcount, (void*)val, val->refcount,
10719 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10720 } else {
10721 addReplySds(c,sdscatprintf(sdsempty(),
10722 "+Key at:%p refcount:%d, value swapped at: page %llu "
10723 "using %llu pages\r\n",
10724 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10725 (unsigned long long) key->vm.usedpages));
10726 }
10727 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10728 lookupKeyRead(c->db,c->argv[2]);
10729 addReply(c,shared.ok);
10730 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10731 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10732 robj *key, *val;
10733
10734 if (!server.vm_enabled) {
10735 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10736 return;
10737 }
10738 if (!de) {
10739 addReply(c,shared.nokeyerr);
10740 return;
10741 }
10742 key = dictGetEntryKey(de);
10743 val = dictGetEntryVal(de);
10744 /* If the key is shared we want to create a copy */
10745 if (key->refcount > 1) {
10746 robj *newkey = dupStringObject(key);
10747 decrRefCount(key);
10748 key = dictGetEntryKey(de) = newkey;
10749 }
10750 /* Swap it */
10751 if (key->storage != REDIS_VM_MEMORY) {
10752 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10753 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10754 dictGetEntryVal(de) = NULL;
10755 addReply(c,shared.ok);
10756 } else {
10757 addReply(c,shared.err);
10758 }
10759 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10760 long keys, j;
10761 robj *key, *val;
10762 char buf[128];
10763
10764 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10765 return;
10766 for (j = 0; j < keys; j++) {
10767 snprintf(buf,sizeof(buf),"key:%lu",j);
10768 key = createStringObject(buf,strlen(buf));
10769 if (lookupKeyRead(c->db,key) != NULL) {
10770 decrRefCount(key);
10771 continue;
10772 }
10773 snprintf(buf,sizeof(buf),"value:%lu",j);
10774 val = createStringObject(buf,strlen(buf));
10775 dictAdd(c->db->dict,key,val);
10776 }
10777 addReply(c,shared.ok);
10778 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10779 unsigned char digest[20];
10780 sds d = sdsnew("+");
10781 int j;
10782
10783 computeDatasetDigest(digest);
10784 for (j = 0; j < 20; j++)
10785 d = sdscatprintf(d, "%02x",digest[j]);
10786
10787 d = sdscatlen(d,"\r\n",2);
10788 addReplySds(c,d);
10789 } else {
10790 addReplySds(c,sdsnew(
10791 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10792 }
10793 }
10794
10795 static void _redisAssert(char *estr, char *file, int line) {
10796 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10797 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
10798 #ifdef HAVE_BACKTRACE
10799 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10800 *((char*)-1) = 'x';
10801 #endif
10802 }
10803
10804 static void _redisPanic(char *msg, char *file, int line) {
10805 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10806 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10807 #ifdef HAVE_BACKTRACE
10808 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10809 *((char*)-1) = 'x';
10810 #endif
10811 }
10812
10813 /* =================================== Main! ================================ */
10814
10815 #ifdef __linux__
/* Return the integer stored in /proc/sys/vm/overcommit_memory, or -1 if
 * the file cannot be opened or its first line cannot be read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;

    /* Read first, close unconditionally, then look at the outcome */
    got = fgets(line,64,fp);
    fclose(fp);
    if (got == NULL) return -1;

    return atoi(line);
}
10829
10830 void linuxOvercommitMemoryWarning(void) {
10831 if (linuxOvercommitMemoryValue() == 0) {
10832 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10833 }
10834 }
10835 #endif /* __linux__ */
10836
10837 static void daemonize(void) {
10838 int fd;
10839 FILE *fp;
10840
10841 if (fork() != 0) exit(0); /* parent exits */
10842 setsid(); /* create a new session */
10843
10844 /* Every output goes to /dev/null. If Redis is daemonized but
10845 * the 'logfile' is set to 'stdout' in the configuration file
10846 * it will not log at all. */
10847 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10848 dup2(fd, STDIN_FILENO);
10849 dup2(fd, STDOUT_FILENO);
10850 dup2(fd, STDERR_FILENO);
10851 if (fd > STDERR_FILENO) close(fd);
10852 }
10853 /* Try to write the pid file */
10854 fp = fopen(server.pidfile,"w");
10855 if (fp) {
10856 fprintf(fp,"%d\n",getpid());
10857 fclose(fp);
10858 }
10859 }
10860
10861 static void version() {
10862 printf("Redis server version %s\n", REDIS_VERSION);
10863 exit(0);
10864 }
10865
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10871
10872 int main(int argc, char **argv) {
10873 time_t start;
10874
10875 initServerConfig();
10876 if (argc == 2) {
10877 if (strcmp(argv[1], "-v") == 0 ||
10878 strcmp(argv[1], "--version") == 0) version();
10879 if (strcmp(argv[1], "--help") == 0) usage();
10880 resetServerSaveParams();
10881 loadServerConfig(argv[1]);
10882 } else if ((argc > 2)) {
10883 usage();
10884 } else {
10885 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10886 }
10887 if (server.daemonize) daemonize();
10888 initServer();
10889 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10890 #ifdef __linux__
10891 linuxOvercommitMemoryWarning();
10892 #endif
10893 start = time(NULL);
10894 if (server.appendonly) {
10895 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10896 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10897 } else {
10898 if (rdbLoad(server.dbfilename) == REDIS_OK)
10899 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10900 }
10901 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10902 aeSetBeforeSleepProc(server.el,beforeSleep);
10903 aeMain(server.el);
10904 aeDeleteEventLoop(server.el);
10905 return 0;
10906 }
10907
10908 /* ============================= Backtrace support ========================= */
10909
10910 #ifdef HAVE_BACKTRACE
10911 static char *findFuncName(void *pointer, unsigned long *offset);
10912
10913 static void *getMcontextEip(ucontext_t *uc) {
10914 #if defined(__FreeBSD__)
10915 return (void*) uc->uc_mcontext.mc_eip;
10916 #elif defined(__dietlibc__)
10917 return (void*) uc->uc_mcontext.eip;
10918 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10919 #if __x86_64__
10920 return (void*) uc->uc_mcontext->__ss.__rip;
10921 #else
10922 return (void*) uc->uc_mcontext->__ss.__eip;
10923 #endif
10924 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10925 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10926 return (void*) uc->uc_mcontext->__ss.__rip;
10927 #else
10928 return (void*) uc->uc_mcontext->__ss.__eip;
10929 #endif
10930 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10931 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10932 #elif defined(__ia64__) /* Linux IA64 */
10933 return (void*) uc->uc_mcontext.sc_ip;
10934 #else
10935 return NULL;
10936 #endif
10937 }
10938
10939 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10940 void *trace[100];
10941 char **messages = NULL;
10942 int i, trace_size = 0;
10943 unsigned long offset=0;
10944 ucontext_t *uc = (ucontext_t*) secret;
10945 sds infostring;
10946 REDIS_NOTUSED(info);
10947
10948 redisLog(REDIS_WARNING,
10949 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10950 infostring = genRedisInfoString();
10951 redisLog(REDIS_WARNING, "%s",infostring);
10952 /* It's not safe to sdsfree() the returned string under memory
10953 * corruption conditions. Let it leak as we are going to abort */
10954
10955 trace_size = backtrace(trace, 100);
10956 /* overwrite sigaction with caller's address */
10957 if (getMcontextEip(uc) != NULL) {
10958 trace[1] = getMcontextEip(uc);
10959 }
10960 messages = backtrace_symbols(trace, trace_size);
10961
10962 for (i=1; i<trace_size; ++i) {
10963 char *fn = findFuncName(trace[i], &offset), *p;
10964
10965 p = strchr(messages[i],'+');
10966 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10967 redisLog(REDIS_WARNING,"%s", messages[i]);
10968 } else {
10969 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10970 }
10971 }
10972 /* free(messages); Don't call free() with possibly corrupted memory. */
10973 _exit(0);
10974 }
10975
10976 static void sigtermHandler(int sig) {
10977 REDIS_NOTUSED(sig);
10978
10979 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
10980 server.shutdown_asap = 1;
10981 }
10982
10983 static void setupSigSegvAction(void) {
10984 struct sigaction act;
10985
10986 sigemptyset (&act.sa_mask);
10987 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10988 * is used. Otherwise, sa_handler is used */
10989 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10990 act.sa_sigaction = segvHandler;
10991 sigaction (SIGSEGV, &act, NULL);
10992 sigaction (SIGBUS, &act, NULL);
10993 sigaction (SIGFPE, &act, NULL);
10994 sigaction (SIGILL, &act, NULL);
10995 sigaction (SIGBUS, &act, NULL);
10996
10997 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
10998 act.sa_handler = sigtermHandler;
10999 sigaction (SIGTERM, &act, NULL);
11000 return;
11001 }
11002
11003 #include "staticsymbols.h"
11004 /* This function try to convert a pointer into a function name. It's used in
11005 * oreder to provide a backtrace under segmentation fault that's able to
11006 * display functions declared as static (otherwise the backtrace is useless). */
11007 static char *findFuncName(void *pointer, unsigned long *offset){
11008 int i, ret = -1;
11009 unsigned long off, minoff = 0;
11010
11011 /* Try to match against the Symbol with the smallest offset */
11012 for (i=0; symsTable[i].pointer; i++) {
11013 unsigned long lp = (unsigned long) pointer;
11014
11015 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11016 off=lp-symsTable[i].pointer;
11017 if (ret < 0 || off < minoff) {
11018 minoff=off;
11019 ret=i;
11020 }
11021 }
11022 }
11023 if (ret == -1) return NULL;
11024 *offset = minoff;
11025 return symsTable[ret].name;
11026 }
11027 #else /* HAVE_BACKTRACE */
/* Build without backtrace support: no signal handler is installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty */
}
11030 #endif /* HAVE_BACKTRACE */
11031
11032
11033
11034 /* The End */
11035
11036
11037