]> git.saurik.com Git - redis.git/blob - redis.c
e342416f1b8a48808d3a681992b9ba6aa5602b84
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.10"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <math.h>
61 #include <pthread.h>
62
63 #if defined(__sun)
64 #include "solarisfixes.h"
65 #endif
66
67 #include "redis.h"
68 #include "ae.h" /* Event driven programming library */
69 #include "sds.h" /* Dynamic safe strings */
70 #include "anet.h" /* Networking the easy way */
71 #include "dict.h" /* Hash tables */
72 #include "adlist.h" /* Linked lists */
73 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
74 #include "lzf.h" /* LZF compression library */
75 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
76 #include "zipmap.h"
77
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR -1

/* Static server configuration */
#define REDIS_SERVERPORT 6379               /* TCP port */
#define REDIS_MAXIDLETIME (60*5)            /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 1000000       /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60              /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10     /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256
100
/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */

/* Command flags */
#define REDIS_CMD_BULK 1   /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
113
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST   1
#define REDIS_SET    2
#define REDIS_ZSET   3
#define REDIS_HASH   4
120
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of these values. */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as a hash table */

/* Printable name of every encoding, indexed by the REDIS_ENCODING_* value. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
132
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so the two most significant bits of the
 * first byte are checked to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: a specially encoded object will follow. The six
 *           bits specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0  /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3   /* string compressed with FASTLZ */
164
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0   /* The object is on memory */
#define REDIS_VM_SWAPPED 1  /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3  /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. While Virtual Memory I/O operations are performed
 * by threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
182
/* Client flags */
#define REDIS_SLAVE 1    /* This client is a slave server */
#define REDIS_MASTER 2   /* This client is a master server */
#define REDIS_MONITOR 4  /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8    /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0      /* No active replication */
#define REDIS_REPL_CONNECT 1   /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state instead the server
 * is waiting to start the next background saving in order to send updates
 * to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5         /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6            /* bulk DB already transmitted, receive updates */
204
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V and discards the result. */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
235
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the source
 * text of the expression, the file name and the line number, then
 * terminates the process with _exit(1). When _e is true it is a no-op.
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the source text
 * of its argument (the argument is stringified, not evaluated), the file
 * name and the line number, then terminates the process with _exit(1).
 *
 * Both macros expand to a single parenthesized expression, so they can be
 * used safely wherever an expression is allowed. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure: swap-file location and access time of an object.
 * Only the type is declared here; the original trailing `vm` declarator,
 * which defined an unrelated global object, has been dropped. */
struct redisObjectVM {
    off_t page;      /* the page at which the object is stored on disk */
    off_t usedpages; /* number of pages used on disk */
    time_t atime;    /* Last access time */
};
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type;
257 unsigned char encoding;
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis string object allocated on the stack:
 * sets refcount to 1, type to REDIS_STRING, encoding to REDIS_ENCODING_RAW,
 * ptr to _ptr and, only when server.vm_enabled is set, storage to
 * REDIS_VM_MEMORY.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, which keeps `if (x) initStaticStringObject(o,p); else ...` valid. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* One save parameter: a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 sds aofbuf; /* AOF buffer, written before entering the event loop */
374 struct saveparam *saveparams;
375 int saveparamslen;
376 char *logfile;
377 char *bindaddr;
378 char *dbfilename;
379 char *appendfilename;
380 char *requirepass;
381 int rdbcompression;
382 int activerehashing;
383 /* Replication related */
384 int isslave;
385 char *masterauth;
386 char *masterhost;
387 int masterport;
388 redisClient *master; /* client that is master for this slave */
389 int replstate;
390 unsigned int maxclients;
391 unsigned long long maxmemory;
392 unsigned int blpop_blocked_clients;
393 unsigned int vm_blocked_clients;
394 /* Sort parameters - qsort_r() is only available under BSD so we
395 * have to take this state global, in order to pass it to sortCompare() */
396 int sort_desc;
397 int sort_alpha;
398 int sort_bypattern;
399 /* Virtual memory configuration */
400 int vm_enabled;
401 char *vm_swap_file;
402 off_t vm_page_size;
403 off_t vm_pages;
404 unsigned long long vm_max_memory;
405 /* Hashes config */
406 size_t hash_max_zipmap_entries;
407 size_t hash_max_zipmap_value;
408 /* Virtual memory state */
409 FILE *vm_fp;
410 int vm_fd;
411 off_t vm_next_page; /* Next probably empty page */
412 off_t vm_near_pages; /* Number of pages allocated sequentially */
413 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
414 time_t unixtime; /* Unix time sampled every second. */
415 /* Virtual memory I/O threads stuff */
416 /* An I/O thread process an element taken from the io_jobs queue and
417 * put the result of the operation in the io_done list. While the
418 * job is being processed, it's put on io_processing queue. */
419 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
420 list *io_processing; /* List of VM I/O jobs being processed */
421 list *io_processed; /* List of VM I/O jobs already processed */
422 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
423 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
424 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
425 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
426 pthread_attr_t io_threads_attr; /* attributes for threads creation */
427 int io_active_threads; /* Number of running I/O threads */
428 int vm_max_threads; /* Max number of I/O threads running at the same time */
429 /* Our main thread is blocked on the event loop, locking for sockets ready
430 * to be read or written, so when a threaded I/O operation is ready to be
431 * processed by the main thread, the I/O thread will use a unix pipe to
432 * awake the main thread. The followings are the two pipe FDs. */
433 int io_ready_pipe_read;
434 int io_ready_pipe_write;
435 /* Virtual memory stats */
436 unsigned long long vm_stats_used_pages;
437 unsigned long long vm_stats_swapped_objects;
438 unsigned long long vm_stats_swapouts;
439 unsigned long long vm_stats_swapins;
440 /* Pubsub */
441 dict *pubsub_channels; /* Map channels to list of subscribed clients */
442 list *pubsub_patterns; /* A list of pubsub_patterns */
443 /* Misc */
444 FILE *devnull;
445 };
446
447 typedef struct pubsubPattern {
448 redisClient *client;
449 robj *pattern;
450 } pubsubPattern;
451
452 typedef void redisCommandProc(redisClient *c);
453 struct redisCommand {
454 char *name;
455 redisCommandProc *proc;
456 int arity;
457 int flags;
458 /* Use a function to determine which keys need to be loaded
459 * in the background prior to executing this command. Takes precedence
460 * over vm_firstkey and others, ignored when NULL */
461 redisCommandProc *vm_preload_proc;
462 /* What keys should be loaded in background when calling this command? */
463 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
464 int vm_lastkey; /* THe last argument that's a key */
465 int vm_keystep; /* The step between first and last key */
466 };
467
/* Associates a function name with a pointer value stored as an integer. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
472
473 typedef struct _redisSortObject {
474 robj *obj;
475 union {
476 double score;
477 robj *cmpobj;
478 } u;
479 } redisSortObject;
480
481 typedef struct _redisSortOperation {
482 int type;
483 robj *pattern;
484 } redisSortOperation;
485
486 /* ZSETs use a specialized version of Skiplists */
487
488 typedef struct zskiplistNode {
489 struct zskiplistNode **forward;
490 struct zskiplistNode *backward;
491 unsigned int *span;
492 double score;
493 robj *obj;
494 } zskiplistNode;
495
/* The skiplist itself: header and tail nodes, element count, level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
501
502 typedef struct zset {
503 dict *dict;
504 zskiplist *zsl;
505 } zset;
506
507 /* Our shared "common" objects */
508
509 #define REDIS_SHARED_INTEGERS 10000
510 struct sharedObjectsStruct {
511 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
512 *colon, *nullbulk, *nullmultibulk, *queued,
513 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
514 *outofrangeerr, *plus,
515 *select0, *select1, *select2, *select3, *select4,
516 *select5, *select6, *select7, *select8, *select9,
517 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
518 *mbulk4, *psubscribebulk, *punsubscribebulk,
519 *integers[REDIS_SHARED_INTEGERS];
520 } shared;
521
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
527
528 /* VM threaded I/O request message */
529 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
530 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
531 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
532 typedef struct iojob {
533 int type; /* Request type, REDIS_IOJOB_* */
534 redisDb *db;/* Redis database */
535 robj *key; /* This I/O request is about swapping this key */
536 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
537 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
538 off_t page; /* Swap page where to read/write the object */
539 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
540 int canceled; /* True if this command was canceled by blocking side of VM */
541 pthread_t thread; /* ID of the thread processing this entry */
542 } iojob;
543
544 /*================================ Prototypes =============================== */
545
546 static void freeStringObject(robj *o);
547 static void freeListObject(robj *o);
548 static void freeSetObject(robj *o);
549 static void decrRefCount(void *o);
550 static robj *createObject(int type, void *ptr);
551 static void freeClient(redisClient *c);
552 static int rdbLoad(char *filename);
553 static void addReply(redisClient *c, robj *obj);
554 static void addReplySds(redisClient *c, sds s);
555 static void incrRefCount(robj *o);
556 static int rdbSaveBackground(char *filename);
557 static robj *createStringObject(char *ptr, size_t len);
558 static robj *dupStringObject(robj *o);
559 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
560 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
561 static void flushAppendOnlyFile(void);
562 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
563 static int syncWithMaster(void);
564 static robj *tryObjectEncoding(robj *o);
565 static robj *getDecodedObject(robj *o);
566 static int removeExpire(redisDb *db, robj *key);
567 static int expireIfNeeded(redisDb *db, robj *key);
568 static int deleteIfVolatile(redisDb *db, robj *key);
569 static int deleteIfSwapped(redisDb *db, robj *key);
570 static int deleteKey(redisDb *db, robj *key);
571 static time_t getExpire(redisDb *db, robj *key);
572 static int setExpire(redisDb *db, robj *key, time_t when);
573 static void updateSlavesWaitingBgsave(int bgsaveerr);
574 static void freeMemoryIfNeeded(void);
575 static int processCommand(redisClient *c);
576 static void setupSigSegvAction(void);
577 static void rdbRemoveTempFile(pid_t childpid);
578 static void aofRemoveTempFile(pid_t childpid);
579 static size_t stringObjectLen(robj *o);
580 static void processInputBuffer(redisClient *c);
581 static zskiplist *zslCreate(void);
582 static void zslFree(zskiplist *zsl);
583 static void zslInsert(zskiplist *zsl, double score, robj *obj);
584 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
585 static void initClientMultiState(redisClient *c);
586 static void freeClientMultiState(redisClient *c);
587 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
588 static void unblockClientWaitingData(redisClient *c);
589 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
590 static void vmInit(void);
591 static void vmMarkPagesFree(off_t page, off_t count);
592 static robj *vmLoadObject(robj *key);
593 static robj *vmPreviewObject(robj *key);
594 static int vmSwapOneObjectBlocking(void);
595 static int vmSwapOneObjectThreaded(void);
596 static int vmCanSwapOut(void);
597 static int tryFreeOneObjectFromFreelist(void);
598 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
599 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
600 static void vmCancelThreadedIOJob(robj *o);
601 static void lockThreadedIO(void);
602 static void unlockThreadedIO(void);
603 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
604 static void freeIOJob(iojob *j);
605 static void queueIOJob(iojob *j);
606 static int vmWriteObjectOnSwap(robj *o, off_t page);
607 static robj *vmReadObjectFromSwap(off_t page, int type);
608 static void waitEmptyIOJobsQueue(void);
609 static void vmReopenSwapFile(void);
610 static int vmFreePage(off_t page);
611 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
612 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
613 static int dontWaitForSwappedKey(redisClient *c, robj *key);
614 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
615 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
616 static struct redisCommand *lookupCommand(char *name);
617 static void call(redisClient *c, struct redisCommand *cmd);
618 static void resetClient(redisClient *c);
619 static void convertToRealHash(robj *o);
620 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
621 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
622 static void freePubsubPattern(void *p);
623 static int listMatchPubsubPattern(void *a, void *b);
624 static int compareStringObjects(robj *a, robj *b);
625 static void usage();
626 static int rewriteAppendOnlyFileBackground(void);
627
628 static void authCommand(redisClient *c);
629 static void pingCommand(redisClient *c);
630 static void echoCommand(redisClient *c);
631 static void setCommand(redisClient *c);
632 static void setnxCommand(redisClient *c);
633 static void setexCommand(redisClient *c);
634 static void getCommand(redisClient *c);
635 static void delCommand(redisClient *c);
636 static void existsCommand(redisClient *c);
637 static void incrCommand(redisClient *c);
638 static void decrCommand(redisClient *c);
639 static void incrbyCommand(redisClient *c);
640 static void decrbyCommand(redisClient *c);
641 static void selectCommand(redisClient *c);
642 static void randomkeyCommand(redisClient *c);
643 static void keysCommand(redisClient *c);
644 static void dbsizeCommand(redisClient *c);
645 static void lastsaveCommand(redisClient *c);
646 static void saveCommand(redisClient *c);
647 static void bgsaveCommand(redisClient *c);
648 static void bgrewriteaofCommand(redisClient *c);
649 static void shutdownCommand(redisClient *c);
650 static void moveCommand(redisClient *c);
651 static void renameCommand(redisClient *c);
652 static void renamenxCommand(redisClient *c);
653 static void lpushCommand(redisClient *c);
654 static void rpushCommand(redisClient *c);
655 static void lpopCommand(redisClient *c);
656 static void rpopCommand(redisClient *c);
657 static void llenCommand(redisClient *c);
658 static void lindexCommand(redisClient *c);
659 static void lrangeCommand(redisClient *c);
660 static void ltrimCommand(redisClient *c);
661 static void typeCommand(redisClient *c);
662 static void lsetCommand(redisClient *c);
663 static void saddCommand(redisClient *c);
664 static void sremCommand(redisClient *c);
665 static void smoveCommand(redisClient *c);
666 static void sismemberCommand(redisClient *c);
667 static void scardCommand(redisClient *c);
668 static void spopCommand(redisClient *c);
669 static void srandmemberCommand(redisClient *c);
670 static void sinterCommand(redisClient *c);
671 static void sinterstoreCommand(redisClient *c);
672 static void sunionCommand(redisClient *c);
673 static void sunionstoreCommand(redisClient *c);
674 static void sdiffCommand(redisClient *c);
675 static void sdiffstoreCommand(redisClient *c);
676 static void syncCommand(redisClient *c);
677 static void flushdbCommand(redisClient *c);
678 static void flushallCommand(redisClient *c);
679 static void sortCommand(redisClient *c);
680 static void lremCommand(redisClient *c);
681 static void rpoplpushcommand(redisClient *c);
682 static void infoCommand(redisClient *c);
683 static void mgetCommand(redisClient *c);
684 static void monitorCommand(redisClient *c);
685 static void expireCommand(redisClient *c);
686 static void expireatCommand(redisClient *c);
687 static void getsetCommand(redisClient *c);
688 static void ttlCommand(redisClient *c);
689 static void slaveofCommand(redisClient *c);
690 static void debugCommand(redisClient *c);
691 static void msetCommand(redisClient *c);
692 static void msetnxCommand(redisClient *c);
693 static void zaddCommand(redisClient *c);
694 static void zincrbyCommand(redisClient *c);
695 static void zrangeCommand(redisClient *c);
696 static void zrangebyscoreCommand(redisClient *c);
697 static void zcountCommand(redisClient *c);
698 static void zrevrangeCommand(redisClient *c);
699 static void zcardCommand(redisClient *c);
700 static void zremCommand(redisClient *c);
701 static void zscoreCommand(redisClient *c);
702 static void zremrangebyscoreCommand(redisClient *c);
703 static void multiCommand(redisClient *c);
704 static void execCommand(redisClient *c);
705 static void discardCommand(redisClient *c);
706 static void blpopCommand(redisClient *c);
707 static void brpopCommand(redisClient *c);
708 static void appendCommand(redisClient *c);
709 static void substrCommand(redisClient *c);
710 static void zrankCommand(redisClient *c);
711 static void zrevrankCommand(redisClient *c);
712 static void hsetCommand(redisClient *c);
713 static void hsetnxCommand(redisClient *c);
714 static void hgetCommand(redisClient *c);
715 static void hmsetCommand(redisClient *c);
716 static void hmgetCommand(redisClient *c);
717 static void hdelCommand(redisClient *c);
718 static void hlenCommand(redisClient *c);
719 static void zremrangebyrankCommand(redisClient *c);
720 static void zunionCommand(redisClient *c);
721 static void zinterCommand(redisClient *c);
722 static void hkeysCommand(redisClient *c);
723 static void hvalsCommand(redisClient *c);
724 static void hgetallCommand(redisClient *c);
725 static void hexistsCommand(redisClient *c);
726 static void configCommand(redisClient *c);
727 static void hincrbyCommand(redisClient *c);
728 static void subscribeCommand(redisClient *c);
729 static void unsubscribeCommand(redisClient *c);
730 static void psubscribeCommand(redisClient *c);
731 static void punsubscribeCommand(redisClient *c);
732 static void publishCommand(redisClient *c);
733
734 /*================================= Globals ================================= */
735
/* Global vars */
static struct redisServer server; /* server global state */

/* Table of the commands known to the server, terminated by an entry whose
 * fields are all NULL/zero.
 *
 * NOTE(review): struct redisCommand is declared outside this section. Judging
 * from the initializers the fields appear to be, in order: the command name,
 * the implementing function, the arity (negative values presumably meaning
 * "at least N arguments"), the REDIS_CMD_* flags, an optional function used
 * to preload swapped keys (only set for ZUNION/ZINTER here), and three
 * integers that look like first key / last key / key step argument
 * positions. Confirm against the struct definition before relying on this. */
static struct redisCommand cmdTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {NULL,NULL,0,0,NULL,0,0,0}
};
847
848 /*============================ Utility functions ============================ */
849
/* Glob-style pattern matching.
 *
 * Returns 1 if the first 'stringLen' bytes of 'string' match the first
 * 'patternLen' bytes of 'pattern', 0 otherwise. When 'nocase' is non zero
 * literal characters and ranges are compared case insensitively (escaped
 * characters inside a class are always compared exactly).
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte among the listed ones
 *   [^abc]  one byte that is not among the listed ones
 *   [a-z]   one byte in the inclusive range (bounds may be given reversed)
 *   \x      the byte x taken literally
 *
 * Both lengths must be non negative. Neither buffer needs to be nul
 * terminated: every read is bounded by the corresponding length. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*'. The length check keeps us from looking
             * at the byte that follows the end of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern against every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte: it can't match the empty
             * string, not even when negated. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the increment
                     * after the switch leaves us exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash, unless it is the last pattern byte, in
             * which case it is matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
972
/* Convenience wrapper around stringmatchlen() for nul terminated strings:
 * the lengths of both arguments are computed with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
976
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optionally negative decimal number followed by an
 * optional unit, matched case insensitively:
 *
 *   (none) or b -> 1
 *   k  -> 1000              kb -> 1024
 *   m  -> 1000*1000         mb -> 1024*1024
 *   g  -> 1000*1000*1000    gb -> 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * Errors are:
 *   - an unknown unit: the number is returned as if no unit was given;
 *   - a numeric part that does not fit the internal buffer: LLONG_MAX is
 *     returned;
 *   - a result that does not fit a long long: LLONG_MAX or LLONG_MIN is
 *     returned, depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * passing a negative char to isdigit() is undefined behavior. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    /* Refuse to overflow: signed overflow in val*mul would be undefined
     * behavior. 'mul' is always positive here. */
    if (errno == ERANGE || val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1023
1024 static void redisLog(int level, const char *fmt, ...) {
1025 va_list ap;
1026 FILE *fp;
1027
1028 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1029 if (!fp) return;
1030
1031 va_start(ap, fmt);
1032 if (level >= server.verbosity) {
1033 char *c = ".-*#";
1034 char buf[64];
1035 time_t now;
1036
1037 now = time(NULL);
1038 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1039 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1040 vfprintf(fp, fmt, ap);
1041 fprintf(fp,"\n");
1042 fflush(fp);
1043 }
1044 va_end(ap);
1045
1046 if (server.logfile) fclose(fp);
1047 }
1048
1049 /*====================== Hash table type implementation ==================== */
1050
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1054
/* Destructor that simply releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1060
1061 static void dictListDestructor(void *privdata, void *val)
1062 {
1063 DICT_NOTUSED(privdata);
1064 listRelease((list*)val);
1065 }
1066
1067 static int sdsDictKeyCompare(void *privdata, const void *key1,
1068 const void *key2)
1069 {
1070 int l1,l2;
1071 DICT_NOTUSED(privdata);
1072
1073 l1 = sdslen((sds)key1);
1074 l2 = sdslen((sds)key2);
1075 if (l1 != l2) return 0;
1076 return memcmp(key1, key2, l1) == 0;
1077 }
1078
/* Destructor for redis objects: drops one reference from 'val' via
 * decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL)
        decrRefCount(val);
}
1086
1087 static int dictObjKeyCompare(void *privdata, const void *key1,
1088 const void *key2)
1089 {
1090 const robj *o1 = key1, *o2 = key2;
1091 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1092 }
1093
1094 static unsigned int dictObjHash(const void *key) {
1095 const robj *o = key;
1096 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1097 }
1098
1099 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1100 const void *key2)
1101 {
1102 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1103 int cmp;
1104
1105 if (o1->encoding == REDIS_ENCODING_INT &&
1106 o2->encoding == REDIS_ENCODING_INT &&
1107 o1->ptr == o2->ptr) return 1;
1108
1109 o1 = getDecodedObject(o1);
1110 o2 = getDecodedObject(o2);
1111 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1112 decrRefCount(o1);
1113 decrRefCount(o2);
1114 return cmp;
1115 }
1116
1117 static unsigned int dictEncObjHash(const void *key) {
1118 robj *o = (robj*) key;
1119
1120 if (o->encoding == REDIS_ENCODING_RAW) {
1121 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1122 } else {
1123 if (o->encoding == REDIS_ENCODING_INT) {
1124 char buf[32];
1125 int len;
1126
1127 len = snprintf(buf,32,"%ld",(long)o->ptr);
1128 return dictGenHashFunction((unsigned char*)buf, len);
1129 } else {
1130 unsigned int hash;
1131
1132 o = getDecodedObject(o);
1133 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1134 decrRefCount(o);
1135 return hash;
1136 }
1137 }
1138 }
1139
/* Sets type and expires. Keys are possibly encoded redis objects released
 * by the dictionary, values are not managed. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};

/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};

/* Db->dict */
static dictType dbDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};

/* Db->expires */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};

/* Hash type hash table (note that small hashes are represented with zipmaps) */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};

/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded. */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};

/* Forward declaration: version() is defined outside this section. */
static void version();
1203
1204 /* ========================= Random utility functions ======================= */
1205
1206 /* Redis generally does not try to recover from out of memory conditions
1207 * when allocating objects or strings, it is not clear if it will be possible
1208 * to report this condition to the client since the networking layer itself
1209 * is based on heap allocation for send buffers, so we simply abort.
1210 * At least the code will be simpler to read... */
1211 static void oom(const char *msg) {
1212 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1213 sleep(1);
1214 abort();
1215 }
1216
1217 /* ====================== Redis server networking stuff ===================== */
1218 static void closeTimedoutClients(void) {
1219 redisClient *c;
1220 listNode *ln;
1221 time_t now = time(NULL);
1222 listIter li;
1223
1224 listRewind(server.clients,&li);
1225 while ((ln = listNext(&li)) != NULL) {
1226 c = listNodeValue(ln);
1227 if (server.maxidletime &&
1228 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1229 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1230 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1231 listLength(c->pubsub_patterns) == 0 &&
1232 (now - c->lastinteraction > server.maxidletime))
1233 {
1234 redisLog(REDIS_VERBOSE,"Closing idle client");
1235 freeClient(c);
1236 } else if (c->flags & REDIS_BLOCKED) {
1237 if (c->blockingto != 0 && c->blockingto < now) {
1238 addReply(c,shared.nullmultibulk);
1239 unblockClientWaitingData(c);
1240 }
1241 }
1242 }
1243 }
1244
1245 static int htNeedsResize(dict *dict) {
1246 long long size, used;
1247
1248 size = dictSlots(dict);
1249 used = dictSize(dict);
1250 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1251 (used*100/size < REDIS_HT_MINFILL));
1252 }
1253
1254 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1255 * we resize the hash table to save memory */
1256 static void tryResizeHashTables(void) {
1257 int j;
1258
1259 for (j = 0; j < server.dbnum; j++) {
1260 if (htNeedsResize(server.db[j].dict))
1261 dictResize(server.db[j].dict);
1262 if (htNeedsResize(server.db[j].expires))
1263 dictResize(server.db[j].expires);
1264 }
1265 }
1266
1267 /* Our hash table implementation performs rehashing incrementally while
1268 * we write/read from the hash table. Still if the server is idle, the hash
1269 * table will use two tables for a long time. So we try to use 1 millisecond
1270 * of CPU time at every serverCron() loop in order to rehash some key. */
1271 static void incrementallyRehash(void) {
1272 int j;
1273
1274 for (j = 0; j < server.dbnum; j++) {
1275 if (dictIsRehashing(server.db[j].dict)) {
1276 dictRehashMilliseconds(server.db[j].dict,1);
1277 break; /* already used our millisecond for this loop... */
1278 }
1279 }
1280 }
1281
1282 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1283 void backgroundSaveDoneHandler(int statloc) {
1284 int exitcode = WEXITSTATUS(statloc);
1285 int bysignal = WIFSIGNALED(statloc);
1286
1287 if (!bysignal && exitcode == 0) {
1288 redisLog(REDIS_NOTICE,
1289 "Background saving terminated with success");
1290 server.dirty = 0;
1291 server.lastsave = time(NULL);
1292 } else if (!bysignal && exitcode != 0) {
1293 redisLog(REDIS_WARNING, "Background saving error");
1294 } else {
1295 redisLog(REDIS_WARNING,
1296 "Background saving terminated by signal %d", WTERMSIG(statloc));
1297 rdbRemoveTempFile(server.bgsavechildpid);
1298 }
1299 server.bgsavechildpid = -1;
1300 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1301 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1302 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1303 }
1304
1305 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1306 * Handle this. */
1307 void backgroundRewriteDoneHandler(int statloc) {
1308 int exitcode = WEXITSTATUS(statloc);
1309 int bysignal = WIFSIGNALED(statloc);
1310
1311 if (!bysignal && exitcode == 0) {
1312 int fd;
1313 char tmpfile[256];
1314
1315 redisLog(REDIS_NOTICE,
1316 "Background append only file rewriting terminated with success");
1317 /* Now it's time to flush the differences accumulated by the parent */
1318 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1319 fd = open(tmpfile,O_WRONLY|O_APPEND);
1320 if (fd == -1) {
1321 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1322 goto cleanup;
1323 }
1324 /* Flush our data... */
1325 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1326 (signed) sdslen(server.bgrewritebuf)) {
1327 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1328 close(fd);
1329 goto cleanup;
1330 }
1331 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1332 /* Now our work is to rename the temp file into the stable file. And
1333 * switch the file descriptor used by the server for append only. */
1334 if (rename(tmpfile,server.appendfilename) == -1) {
1335 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1336 close(fd);
1337 goto cleanup;
1338 }
1339 /* Mission completed... almost */
1340 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1341 if (server.appendfd != -1) {
1342 /* If append only is actually enabled... */
1343 close(server.appendfd);
1344 server.appendfd = fd;
1345 fsync(fd);
1346 server.appendseldb = -1; /* Make sure it will issue SELECT */
1347 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1348 } else {
1349 /* If append only is disabled we just generate a dump in this
1350 * format. Why not? */
1351 close(fd);
1352 }
1353 } else if (!bysignal && exitcode != 0) {
1354 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1355 } else {
1356 redisLog(REDIS_WARNING,
1357 "Background append only file rewriting terminated by signal %d",
1358 WTERMSIG(statloc));
1359 }
1360 cleanup:
1361 sdsfree(server.bgrewritebuf);
1362 server.bgrewritebuf = sdsempty();
1363 aofRemoveTempFile(server.bgrewritechildpid);
1364 server.bgrewritechildpid = -1;
1365 }
1366
1367 /* This function is called once a background process of some kind terminates,
1368 * as we want to avoid resizing the hash tables when there is a child in order
1369 * to play well with copy-on-write (otherwise when a resize happens lots of
1370 * memory pages are copied). The goal of this function is to update the ability
1371 * for dict.c to resize the hash tables accordingly to the fact we have o not
1372 * running childs. */
1373 static void updateDictResizePolicy(void) {
1374 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1375 dictEnableResize();
1376 else
1377 dictDisableResize();
1378 }
1379
/* Periodic housekeeping callback, registered as a time event of the event
 * loop. The three arguments are unused. At every call it:
 *
 *  - refreshes the cached unix time and bumps server.cronloops;
 *  - logs statistics about databases and clients every 50 calls;
 *  - resizes / incrementally rehashes the hash tables when no child exists;
 *  - closes timed out clients and unblocks clients whose timeout expired;
 *  - reaps a terminated BGSAVE / BGREWRITEAOF child, or starts a background
 *    save when one of the configured save points is reached;
 *  - expires a sample of volatile keys;
 *  - swaps objects out when VM is enabled and we are over the memory limit;
 *  - tries to connect to the master when a replication link is needed.
 *
 * Always returns 100. NOTE(review): presumably this is the number of
 * milliseconds after which the event loop calls the function again; confirm
 * against the time event API in ae.c. */
static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    int j, loops = server.cronloops++;
    REDIS_NOTUSED(eventLoop);
    REDIS_NOTUSED(id);
    REDIS_NOTUSED(clientData);

    /* We take a cached value of the unix time in the global state because
     * with virtual memory and aging there is to store the current time
     * in objects at every object access, and accuracy is not needed.
     * To access a global var is faster than calling time(NULL) */
    server.unixtime = time(NULL);

    /* Show some info about non-empty databases */
    for (j = 0; j < server.dbnum; j++) {
        long long size, used, vkeys;

        size = dictSlots(server.db[j].dict);
        used = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (!(loops % 50) && (used || vkeys)) {
            redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
            /* dictPrintStats(server.dict); */
        }
    }

    /* We don't want to resize the hash tables while a background saving
     * is in progress: the saving child is created using fork() that is
     * implemented with a copy-on-write semantic in most modern systems, so
     * if we resize the HT while there is the saving child at work actually
     * a lot of memory movements in the parent will cause a lot of pages
     * copied. */
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
        if (!(loops % 10)) tryResizeHashTables();
        if (server.activerehashing) incrementallyRehash();
    }

    /* Show information about connected clients */
    if (!(loops % 50)) {
        redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
            listLength(server.clients)-listLength(server.slaves),
            listLength(server.slaves),
            zmalloc_used_memory());
    }

    /* Close connections of timedout clients. This is done every 100 calls
     * when an idle timeout is configured, but at every call if there are
     * clients blocked in a blocking pop, so that their timeouts are honored
     * with a finer resolution. */
    if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
        closeTimedoutClients();

    /* Check if a background saving or AOF rewrite in progress terminated */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
        int statloc;
        pid_t pid;

        /* NOTE(review): a wait3() failure (-1) is not told apart from a
         * terminated child: it ends in the AOF rewrite handler with
         * 'statloc' possibly never written. Worth confirming whether this
         * can happen in practice. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
            } else {
                backgroundRewriteDoneHandler(statloc);
            }
            updateDictResizePolicy();
        }
    } else {
        /* If there is not a background saving in progress check if
         * we have to save now */
        time_t now = time(NULL);
        for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            if (server.dirty >= sp->changes &&
                now-server.lastsave > sp->seconds) {
                redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, sp->seconds);
                rdbSaveBackground(server.dbfilename);
                break;
            }
        }
    }

    /* Try to expire a few timed out keys. The algorithm used is adaptive and
     * will use few CPU cycles if there are few expiring keys, otherwise
     * it will get more aggressive to avoid that too much memory is used by
     * keys that can be removed from the keyspace. */
    for (j = 0; j < server.dbnum; j++) {
        int expired;
        redisDb *db = server.db+j;

        /* Continue to expire if at the end of the cycle more than 25%
         * of the keys were expired. */
        do {
            long num = dictSize(db->expires);
            time_t now = time(NULL);

            expired = 0;
            if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
                num = REDIS_EXPIRELOOKUPS_PER_CRON;
            while (num--) {
                dictEntry *de;
                time_t t;

                if ((de = dictGetRandomKey(db->expires)) == NULL) break;
                /* The expire time is stored directly in the value pointer. */
                t = (time_t) dictGetEntryVal(de);
                if (now > t) {
                    deleteKey(db,dictGetEntryKey(de));
                    expired++;
                    server.stat_expiredkeys++;
                }
            }
        } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
    }

    /* Swap a few keys on disk if we are over the memory limit and VM
     * is enabled. Try to free objects from the free list first. */
    if (vmCanSwapOut()) {
        while (server.vm_enabled && zmalloc_used_memory() >
                server.vm_max_memory)
        {
            int retval;

            if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
            retval = (server.vm_max_threads == 0) ?
                        vmSwapOneObjectBlocking() :
                        vmSwapOneObjectThreaded();
            if (retval == REDIS_ERR && !(loops % 300) &&
                zmalloc_used_memory() >
                (server.vm_max_memory+server.vm_max_memory/10))
            {
                redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
            }
            /* Note that when using threaded I/O we free just one object,
             * because anyway when the I/O thread in charge to swap this
             * object out will finish, the handler of completed jobs
             * will try to swap more objects if we are still out of memory. */
            if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
        }
    }

    /* Check if we should connect to a MASTER */
    if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
        redisLog(REDIS_NOTICE,"Connecting to MASTER...");
        if (syncWithMaster() == REDIS_OK) {
            redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
            if (server.appendonly) rewriteAppendOnlyFileBackground();
        }
    }
    return 100;
}
1526
1527 /* This function gets called every time Redis is entering the
1528 * main loop of the event driven library, that is, before to sleep
1529 * for ready file descriptors. */
1530 static void beforeSleep(struct aeEventLoop *eventLoop) {
1531 REDIS_NOTUSED(eventLoop);
1532
1533 /* Awake clients that got all the swapped keys they requested */
1534 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1535 listIter li;
1536 listNode *ln;
1537
1538 listRewind(server.io_ready_clients,&li);
1539 while((ln = listNext(&li))) {
1540 redisClient *c = ln->value;
1541 struct redisCommand *cmd;
1542
1543 /* Resume the client. */
1544 listDelNode(server.io_ready_clients,ln);
1545 c->flags &= (~REDIS_IO_WAIT);
1546 server.vm_blocked_clients--;
1547 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1548 readQueryFromClient, c);
1549 cmd = lookupCommand(c->argv[0]->ptr);
1550 assert(cmd != NULL);
1551 call(c,cmd);
1552 resetClient(c);
1553 /* There may be more data to process in the input buffer. */
1554 if (c->querybuf && sdslen(c->querybuf) > 0)
1555 processInputBuffer(c);
1556 }
1557 }
1558 /* Write the AOF buffer on disk */
1559 flushAppendOnlyFile();
1560 }
1561
/* Fill the global 'shared' structure with objects created once and reused
 * afterwards: common protocol replies and error messages, small string
 * fragments, "select N" commands for the first ten databases, the bulk
 * headers used by Pub/Sub messages, and the integers from 0 to
 * REDIS_SHARED_INTEGERS-1. */
static void createSharedObjects(void) {
    int j;

    /* Common replies */
    shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
    shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
    shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
    shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
    shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
    shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
    shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
    shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
    shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
    shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
    shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
    /* Common error messages */
    shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
        "-ERR Operation against a key holding the wrong kind of value\r\n"));
    shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
        "-ERR no such key\r\n"));
    shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
        "-ERR syntax error\r\n"));
    shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
        "-ERR source and destination objects are the same\r\n"));
    shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
        "-ERR index out of range\r\n"));
    /* Small fragments */
    shared.space = createObject(REDIS_STRING,sdsnew(" "));
    shared.colon = createObject(REDIS_STRING,sdsnew(":"));
    shared.plus = createObject(REDIS_STRING,sdsnew("+"));
    /* SELECT commands for databases 0-9. The second argument of
     * createStringObject() is the length of the string in bytes. */
    shared.select0 = createStringObject("select 0\r\n",10);
    shared.select1 = createStringObject("select 1\r\n",10);
    shared.select2 = createStringObject("select 2\r\n",10);
    shared.select3 = createStringObject("select 3\r\n",10);
    shared.select4 = createStringObject("select 4\r\n",10);
    shared.select5 = createStringObject("select 5\r\n",10);
    shared.select6 = createStringObject("select 6\r\n",10);
    shared.select7 = createStringObject("select 7\r\n",10);
    shared.select8 = createStringObject("select 8\r\n",10);
    shared.select9 = createStringObject("select 9\r\n",10);
    /* Pub/Sub message type names as bulk strings, and multi bulk headers */
    shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
    shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
    shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
    shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
    shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
    shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
    shared.mbulk3 = createStringObject("*3\r\n",4);
    shared.mbulk4 = createStringObject("*4\r\n",4);
    /* Shared integers: the value is stored in the pointer field itself and
     * the object is marked as integer encoded. */
    for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
        shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
        shared.integers[j]->encoding = REDIS_ENCODING_INT;
    }
}
1612
1613 static void appendServerSaveParams(time_t seconds, int changes) {
1614 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1615 server.saveparams[server.saveparamslen].seconds = seconds;
1616 server.saveparams[server.saveparamslen].changes = changes;
1617 server.saveparamslen++;
1618 }
1619
1620 static void resetServerSaveParams() {
1621 zfree(server.saveparams);
1622 server.saveparams = NULL;
1623 server.saveparamslen = 0;
1624 }
1625
1626 static void initServerConfig() {
1627 server.dbnum = REDIS_DEFAULT_DBNUM;
1628 server.port = REDIS_SERVERPORT;
1629 server.verbosity = REDIS_VERBOSE;
1630 server.maxidletime = REDIS_MAXIDLETIME;
1631 server.saveparams = NULL;
1632 server.logfile = NULL; /* NULL = log on standard output */
1633 server.bindaddr = NULL;
1634 server.glueoutputbuf = 1;
1635 server.daemonize = 0;
1636 server.appendonly = 0;
1637 server.appendfsync = APPENDFSYNC_ALWAYS;
1638 server.lastfsync = time(NULL);
1639 server.appendfd = -1;
1640 server.appendseldb = -1; /* Make sure the first time will not match */
1641 server.pidfile = zstrdup("/var/run/redis.pid");
1642 server.dbfilename = zstrdup("dump.rdb");
1643 server.appendfilename = zstrdup("appendonly.aof");
1644 server.requirepass = NULL;
1645 server.rdbcompression = 1;
1646 server.activerehashing = 1;
1647 server.maxclients = 0;
1648 server.blpop_blocked_clients = 0;
1649 server.maxmemory = 0;
1650 server.vm_enabled = 0;
1651 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1652 server.vm_page_size = 256; /* 256 bytes per page */
1653 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1654 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1655 server.vm_max_threads = 4;
1656 server.vm_blocked_clients = 0;
1657 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1658 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1659
1660 resetServerSaveParams();
1661
1662 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1663 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1664 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1665 /* Replication related */
1666 server.isslave = 0;
1667 server.masterauth = NULL;
1668 server.masterhost = NULL;
1669 server.masterport = 6379;
1670 server.master = NULL;
1671 server.replstate = REDIS_REPL_NONE;
1672
1673 /* Double constants initialization */
1674 R_Zero = 0.0;
1675 R_PosInf = 1.0/R_Zero;
1676 R_NegInf = -1.0/R_Zero;
1677 R_Nan = R_Zero/R_Zero;
1678 }
1679
1680 static void initServer() {
1681 int j;
1682
1683 signal(SIGHUP, SIG_IGN);
1684 signal(SIGPIPE, SIG_IGN);
1685 setupSigSegvAction();
1686
1687 server.devnull = fopen("/dev/null","w");
1688 if (server.devnull == NULL) {
1689 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1690 exit(1);
1691 }
1692 server.clients = listCreate();
1693 server.slaves = listCreate();
1694 server.monitors = listCreate();
1695 server.objfreelist = listCreate();
1696 createSharedObjects();
1697 server.el = aeCreateEventLoop();
1698 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1699 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1700 if (server.fd == -1) {
1701 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1702 exit(1);
1703 }
1704 for (j = 0; j < server.dbnum; j++) {
1705 server.db[j].dict = dictCreate(&dbDictType,NULL);
1706 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1707 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1708 if (server.vm_enabled)
1709 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1710 server.db[j].id = j;
1711 }
1712 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1713 server.pubsub_patterns = listCreate();
1714 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1715 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1716 server.cronloops = 0;
1717 server.bgsavechildpid = -1;
1718 server.bgrewritechildpid = -1;
1719 server.bgrewritebuf = sdsempty();
1720 server.aofbuf = sdsempty();
1721 server.lastsave = time(NULL);
1722 server.dirty = 0;
1723 server.stat_numcommands = 0;
1724 server.stat_numconnections = 0;
1725 server.stat_expiredkeys = 0;
1726 server.stat_starttime = time(NULL);
1727 server.unixtime = time(NULL);
1728 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1729 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1730 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1731
1732 if (server.appendonly) {
1733 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1734 if (server.appendfd == -1) {
1735 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1736 strerror(errno));
1737 exit(1);
1738 }
1739 }
1740
1741 if (server.vm_enabled) vmInit();
1742 }
1743
1744 /* Empty the whole database */
1745 static long long emptyDb() {
1746 int j;
1747 long long removed = 0;
1748
1749 for (j = 0; j < server.dbnum; j++) {
1750 removed += dictSize(server.db[j].dict);
1751 dictEmpty(server.db[j].dict);
1752 dictEmpty(server.db[j].expires);
1753 }
1754 return removed;
1755 }
1756
/* Translate a case-insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1762
1763 /* I agree, this is a very rudimental way to load a configuration...
1764 will improve later if the config gets more complex */
1765 static void loadServerConfig(char *filename) {
1766 FILE *fp;
1767 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1768 int linenum = 0;
1769 sds line = NULL;
1770
1771 if (filename[0] == '-' && filename[1] == '\0')
1772 fp = stdin;
1773 else {
1774 if ((fp = fopen(filename,"r")) == NULL) {
1775 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1776 exit(1);
1777 }
1778 }
1779
1780 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1781 sds *argv;
1782 int argc, j;
1783
1784 linenum++;
1785 line = sdsnew(buf);
1786 line = sdstrim(line," \t\r\n");
1787
1788 /* Skip comments and blank lines*/
1789 if (line[0] == '#' || line[0] == '\0') {
1790 sdsfree(line);
1791 continue;
1792 }
1793
1794 /* Split into arguments */
1795 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1796 sdstolower(argv[0]);
1797
1798 /* Execute config directives */
1799 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1800 server.maxidletime = atoi(argv[1]);
1801 if (server.maxidletime < 0) {
1802 err = "Invalid timeout value"; goto loaderr;
1803 }
1804 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1805 server.port = atoi(argv[1]);
1806 if (server.port < 1 || server.port > 65535) {
1807 err = "Invalid port"; goto loaderr;
1808 }
1809 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1810 server.bindaddr = zstrdup(argv[1]);
1811 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1812 int seconds = atoi(argv[1]);
1813 int changes = atoi(argv[2]);
1814 if (seconds < 1 || changes < 0) {
1815 err = "Invalid save parameters"; goto loaderr;
1816 }
1817 appendServerSaveParams(seconds,changes);
1818 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1819 if (chdir(argv[1]) == -1) {
1820 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1821 argv[1], strerror(errno));
1822 exit(1);
1823 }
1824 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1825 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1826 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1827 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1828 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1829 else {
1830 err = "Invalid log level. Must be one of debug, notice, warning";
1831 goto loaderr;
1832 }
1833 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1834 FILE *logfp;
1835
1836 server.logfile = zstrdup(argv[1]);
1837 if (!strcasecmp(server.logfile,"stdout")) {
1838 zfree(server.logfile);
1839 server.logfile = NULL;
1840 }
1841 if (server.logfile) {
1842 /* Test if we are able to open the file. The server will not
1843 * be able to abort just for this problem later... */
1844 logfp = fopen(server.logfile,"a");
1845 if (logfp == NULL) {
1846 err = sdscatprintf(sdsempty(),
1847 "Can't open the log file: %s", strerror(errno));
1848 goto loaderr;
1849 }
1850 fclose(logfp);
1851 }
1852 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1853 server.dbnum = atoi(argv[1]);
1854 if (server.dbnum < 1) {
1855 err = "Invalid number of databases"; goto loaderr;
1856 }
1857 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1858 loadServerConfig(argv[1]);
1859 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1860 server.maxclients = atoi(argv[1]);
1861 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1862 server.maxmemory = memtoll(argv[1],NULL);
1863 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1864 server.masterhost = sdsnew(argv[1]);
1865 server.masterport = atoi(argv[2]);
1866 server.replstate = REDIS_REPL_CONNECT;
1867 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1868 server.masterauth = zstrdup(argv[1]);
1869 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1870 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1871 err = "argument must be 'yes' or 'no'"; goto loaderr;
1872 }
1873 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1874 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1875 err = "argument must be 'yes' or 'no'"; goto loaderr;
1876 }
1877 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1878 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1879 err = "argument must be 'yes' or 'no'"; goto loaderr;
1880 }
1881 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1882 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1883 err = "argument must be 'yes' or 'no'"; goto loaderr;
1884 }
1885 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1886 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1887 err = "argument must be 'yes' or 'no'"; goto loaderr;
1888 }
1889 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1890 zfree(server.appendfilename);
1891 server.appendfilename = zstrdup(argv[1]);
1892 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1893 if (!strcasecmp(argv[1],"no")) {
1894 server.appendfsync = APPENDFSYNC_NO;
1895 } else if (!strcasecmp(argv[1],"always")) {
1896 server.appendfsync = APPENDFSYNC_ALWAYS;
1897 } else if (!strcasecmp(argv[1],"everysec")) {
1898 server.appendfsync = APPENDFSYNC_EVERYSEC;
1899 } else {
1900 err = "argument must be 'no', 'always' or 'everysec'";
1901 goto loaderr;
1902 }
1903 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1904 server.requirepass = zstrdup(argv[1]);
1905 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1906 zfree(server.pidfile);
1907 server.pidfile = zstrdup(argv[1]);
1908 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1909 zfree(server.dbfilename);
1910 server.dbfilename = zstrdup(argv[1]);
1911 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1912 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1913 err = "argument must be 'yes' or 'no'"; goto loaderr;
1914 }
1915 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1916 zfree(server.vm_swap_file);
1917 server.vm_swap_file = zstrdup(argv[1]);
1918 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1919 server.vm_max_memory = memtoll(argv[1],NULL);
1920 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1921 server.vm_page_size = memtoll(argv[1], NULL);
1922 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1923 server.vm_pages = memtoll(argv[1], NULL);
1924 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1925 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1926 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1927 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1928 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1929 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1930 } else {
1931 err = "Bad directive or wrong number of arguments"; goto loaderr;
1932 }
1933 for (j = 0; j < argc; j++)
1934 sdsfree(argv[j]);
1935 zfree(argv);
1936 sdsfree(line);
1937 }
1938 if (fp != stdin) fclose(fp);
1939 return;
1940
1941 loaderr:
1942 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1943 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1944 fprintf(stderr, ">>> '%s'\n", line);
1945 fprintf(stderr, "%s\n", err);
1946 exit(1);
1947 }
1948
1949 static void freeClientArgv(redisClient *c) {
1950 int j;
1951
1952 for (j = 0; j < c->argc; j++)
1953 decrRefCount(c->argv[j]);
1954 for (j = 0; j < c->mbargc; j++)
1955 decrRefCount(c->mbargv[j]);
1956 c->argc = 0;
1957 c->mbargc = 0;
1958 }
1959
1960 static void freeClient(redisClient *c) {
1961 listNode *ln;
1962
1963 /* Note that if the client we are freeing is blocked into a blocking
1964 * call, we have to set querybuf to NULL *before* to call
1965 * unblockClientWaitingData() to avoid processInputBuffer() will get
1966 * called. Also it is important to remove the file events after
1967 * this, because this call adds the READABLE event. */
1968 sdsfree(c->querybuf);
1969 c->querybuf = NULL;
1970 if (c->flags & REDIS_BLOCKED)
1971 unblockClientWaitingData(c);
1972
1973 /* Unsubscribe from all the pubsub channels */
1974 pubsubUnsubscribeAllChannels(c,0);
1975 pubsubUnsubscribeAllPatterns(c,0);
1976 dictRelease(c->pubsub_channels);
1977 listRelease(c->pubsub_patterns);
1978 /* Obvious cleanup */
1979 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1980 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1981 listRelease(c->reply);
1982 freeClientArgv(c);
1983 close(c->fd);
1984 /* Remove from the list of clients */
1985 ln = listSearchKey(server.clients,c);
1986 redisAssert(ln != NULL);
1987 listDelNode(server.clients,ln);
1988 /* Remove from the list of clients waiting for swapped keys */
1989 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1990 ln = listSearchKey(server.io_ready_clients,c);
1991 if (ln) {
1992 listDelNode(server.io_ready_clients,ln);
1993 server.vm_blocked_clients--;
1994 }
1995 }
1996 while (server.vm_enabled && listLength(c->io_keys)) {
1997 ln = listFirst(c->io_keys);
1998 dontWaitForSwappedKey(c,ln->value);
1999 }
2000 listRelease(c->io_keys);
2001 /* Master/slave cleanup */
2002 if (c->flags & REDIS_SLAVE) {
2003 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2004 close(c->repldbfd);
2005 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2006 ln = listSearchKey(l,c);
2007 redisAssert(ln != NULL);
2008 listDelNode(l,ln);
2009 }
2010 if (c->flags & REDIS_MASTER) {
2011 server.master = NULL;
2012 server.replstate = REDIS_REPL_CONNECT;
2013 }
2014 /* Release memory */
2015 zfree(c->argv);
2016 zfree(c->mbargv);
2017 freeClientMultiState(c);
2018 zfree(c);
2019 }
2020
2021 #define GLUEREPLY_UP_TO (1024)
2022 static void glueReplyBuffersIfNeeded(redisClient *c) {
2023 int copylen = 0;
2024 char buf[GLUEREPLY_UP_TO];
2025 listNode *ln;
2026 listIter li;
2027 robj *o;
2028
2029 listRewind(c->reply,&li);
2030 while((ln = listNext(&li))) {
2031 int objlen;
2032
2033 o = ln->value;
2034 objlen = sdslen(o->ptr);
2035 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2036 memcpy(buf+copylen,o->ptr,objlen);
2037 copylen += objlen;
2038 listDelNode(c->reply,ln);
2039 } else {
2040 if (copylen == 0) return;
2041 break;
2042 }
2043 }
2044 /* Now the output buffer is empty, add the new single element */
2045 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2046 listAddNodeHead(c->reply,o);
2047 }
2048
2049 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2050 redisClient *c = privdata;
2051 int nwritten = 0, totwritten = 0, objlen;
2052 robj *o;
2053 REDIS_NOTUSED(el);
2054 REDIS_NOTUSED(mask);
2055
2056 /* Use writev() if we have enough buffers to send */
2057 if (!server.glueoutputbuf &&
2058 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2059 !(c->flags & REDIS_MASTER))
2060 {
2061 sendReplyToClientWritev(el, fd, privdata, mask);
2062 return;
2063 }
2064
2065 while(listLength(c->reply)) {
2066 if (server.glueoutputbuf && listLength(c->reply) > 1)
2067 glueReplyBuffersIfNeeded(c);
2068
2069 o = listNodeValue(listFirst(c->reply));
2070 objlen = sdslen(o->ptr);
2071
2072 if (objlen == 0) {
2073 listDelNode(c->reply,listFirst(c->reply));
2074 continue;
2075 }
2076
2077 if (c->flags & REDIS_MASTER) {
2078 /* Don't reply to a master */
2079 nwritten = objlen - c->sentlen;
2080 } else {
2081 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2082 if (nwritten <= 0) break;
2083 }
2084 c->sentlen += nwritten;
2085 totwritten += nwritten;
2086 /* If we fully sent the object on head go to the next one */
2087 if (c->sentlen == objlen) {
2088 listDelNode(c->reply,listFirst(c->reply));
2089 c->sentlen = 0;
2090 }
2091 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2092 * bytes, in a single threaded server it's a good idea to serve
2093 * other clients as well, even if a very large request comes from
2094 * super fast link that is always able to accept data (in real world
2095 * scenario think about 'KEYS *' against the loopback interfae) */
2096 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2097 }
2098 if (nwritten == -1) {
2099 if (errno == EAGAIN) {
2100 nwritten = 0;
2101 } else {
2102 redisLog(REDIS_VERBOSE,
2103 "Error writing to client: %s", strerror(errno));
2104 freeClient(c);
2105 return;
2106 }
2107 }
2108 if (totwritten > 0) c->lastinteraction = time(NULL);
2109 if (listLength(c->reply) == 0) {
2110 c->sentlen = 0;
2111 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2112 }
2113 }
2114
2115 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2116 {
2117 redisClient *c = privdata;
2118 int nwritten = 0, totwritten = 0, objlen, willwrite;
2119 robj *o;
2120 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2121 int offset, ion = 0;
2122 REDIS_NOTUSED(el);
2123 REDIS_NOTUSED(mask);
2124
2125 listNode *node;
2126 while (listLength(c->reply)) {
2127 offset = c->sentlen;
2128 ion = 0;
2129 willwrite = 0;
2130
2131 /* fill-in the iov[] array */
2132 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2133 o = listNodeValue(node);
2134 objlen = sdslen(o->ptr);
2135
2136 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2137 break;
2138
2139 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2140 break; /* no more iovecs */
2141
2142 iov[ion].iov_base = ((char*)o->ptr) + offset;
2143 iov[ion].iov_len = objlen - offset;
2144 willwrite += objlen - offset;
2145 offset = 0; /* just for the first item */
2146 ion++;
2147 }
2148
2149 if(willwrite == 0)
2150 break;
2151
2152 /* write all collected blocks at once */
2153 if((nwritten = writev(fd, iov, ion)) < 0) {
2154 if (errno != EAGAIN) {
2155 redisLog(REDIS_VERBOSE,
2156 "Error writing to client: %s", strerror(errno));
2157 freeClient(c);
2158 return;
2159 }
2160 break;
2161 }
2162
2163 totwritten += nwritten;
2164 offset = c->sentlen;
2165
2166 /* remove written robjs from c->reply */
2167 while (nwritten && listLength(c->reply)) {
2168 o = listNodeValue(listFirst(c->reply));
2169 objlen = sdslen(o->ptr);
2170
2171 if(nwritten >= objlen - offset) {
2172 listDelNode(c->reply, listFirst(c->reply));
2173 nwritten -= objlen - offset;
2174 c->sentlen = 0;
2175 } else {
2176 /* partial write */
2177 c->sentlen += nwritten;
2178 break;
2179 }
2180 offset = 0;
2181 }
2182 }
2183
2184 if (totwritten > 0)
2185 c->lastinteraction = time(NULL);
2186
2187 if (listLength(c->reply) == 0) {
2188 c->sentlen = 0;
2189 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2190 }
2191 }
2192
2193 static struct redisCommand *lookupCommand(char *name) {
2194 int j = 0;
2195 while(cmdTable[j].name != NULL) {
2196 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2197 j++;
2198 }
2199 return NULL;
2200 }
2201
2202 /* resetClient prepare the client to process the next command */
2203 static void resetClient(redisClient *c) {
2204 freeClientArgv(c);
2205 c->bulklen = -1;
2206 c->multibulk = 0;
2207 }
2208
2209 /* Call() is the core of Redis execution of a command */
2210 static void call(redisClient *c, struct redisCommand *cmd) {
2211 long long dirty;
2212
2213 dirty = server.dirty;
2214 cmd->proc(c);
2215 dirty = server.dirty-dirty;
2216
2217 if (server.appendonly && dirty)
2218 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2219 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2220 listLength(server.slaves))
2221 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2222 if (listLength(server.monitors))
2223 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2224 server.stat_numcommands++;
2225 }
2226
2227 /* If this function gets called we already read a whole
2228 * command, argments are in the client argv/argc fields.
2229 * processCommand() execute the command or prepare the
2230 * server for a bulk read from the client.
2231 *
2232 * If 1 is returned the client is still alive and valid and
2233 * and other operations can be performed by the caller. Otherwise
2234 * if 0 is returned the client was destroied (i.e. after QUIT). */
2235 static int processCommand(redisClient *c) {
2236 struct redisCommand *cmd;
2237
2238 /* Free some memory if needed (maxmemory setting) */
2239 if (server.maxmemory) freeMemoryIfNeeded();
2240
2241 /* Handle the multi bulk command type. This is an alternative protocol
2242 * supported by Redis in order to receive commands that are composed of
2243 * multiple binary-safe "bulk" arguments. The latency of processing is
2244 * a bit higher but this allows things like multi-sets, so if this
2245 * protocol is used only for MSET and similar commands this is a big win. */
2246 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2247 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2248 if (c->multibulk <= 0) {
2249 resetClient(c);
2250 return 1;
2251 } else {
2252 decrRefCount(c->argv[c->argc-1]);
2253 c->argc--;
2254 return 1;
2255 }
2256 } else if (c->multibulk) {
2257 if (c->bulklen == -1) {
2258 if (((char*)c->argv[0]->ptr)[0] != '$') {
2259 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2260 resetClient(c);
2261 return 1;
2262 } else {
2263 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2264 decrRefCount(c->argv[0]);
2265 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2266 c->argc--;
2267 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2268 resetClient(c);
2269 return 1;
2270 }
2271 c->argc--;
2272 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2273 return 1;
2274 }
2275 } else {
2276 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2277 c->mbargv[c->mbargc] = c->argv[0];
2278 c->mbargc++;
2279 c->argc--;
2280 c->multibulk--;
2281 if (c->multibulk == 0) {
2282 robj **auxargv;
2283 int auxargc;
2284
2285 /* Here we need to swap the multi-bulk argc/argv with the
2286 * normal argc/argv of the client structure. */
2287 auxargv = c->argv;
2288 c->argv = c->mbargv;
2289 c->mbargv = auxargv;
2290
2291 auxargc = c->argc;
2292 c->argc = c->mbargc;
2293 c->mbargc = auxargc;
2294
2295 /* We need to set bulklen to something different than -1
2296 * in order for the code below to process the command without
2297 * to try to read the last argument of a bulk command as
2298 * a special argument. */
2299 c->bulklen = 0;
2300 /* continue below and process the command */
2301 } else {
2302 c->bulklen = -1;
2303 return 1;
2304 }
2305 }
2306 }
2307 /* -- end of multi bulk commands processing -- */
2308
2309 /* The QUIT command is handled as a special case. Normal command
2310 * procs are unable to close the client connection safely */
2311 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2312 freeClient(c);
2313 return 0;
2314 }
2315
2316 /* Now lookup the command and check ASAP about trivial error conditions
2317 * such wrong arity, bad command name and so forth. */
2318 cmd = lookupCommand(c->argv[0]->ptr);
2319 if (!cmd) {
2320 addReplySds(c,
2321 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2322 (char*)c->argv[0]->ptr));
2323 resetClient(c);
2324 return 1;
2325 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2326 (c->argc < -cmd->arity)) {
2327 addReplySds(c,
2328 sdscatprintf(sdsempty(),
2329 "-ERR wrong number of arguments for '%s' command\r\n",
2330 cmd->name));
2331 resetClient(c);
2332 return 1;
2333 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2334 /* This is a bulk command, we have to read the last argument yet. */
2335 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2336
2337 decrRefCount(c->argv[c->argc-1]);
2338 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2339 c->argc--;
2340 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2341 resetClient(c);
2342 return 1;
2343 }
2344 c->argc--;
2345 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2346 /* It is possible that the bulk read is already in the
2347 * buffer. Check this condition and handle it accordingly.
2348 * This is just a fast path, alternative to call processInputBuffer().
2349 * It's a good idea since the code is small and this condition
2350 * happens most of the times. */
2351 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2352 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2353 c->argc++;
2354 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2355 } else {
2356 /* Otherwise return... there is to read the last argument
2357 * from the socket. */
2358 return 1;
2359 }
2360 }
2361 /* Let's try to encode the bulk object to save space. */
2362 if (cmd->flags & REDIS_CMD_BULK)
2363 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2364
2365 /* Check if the user is authenticated */
2366 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2367 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2368 resetClient(c);
2369 return 1;
2370 }
2371
2372 /* Handle the maxmemory directive */
2373 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2374 zmalloc_used_memory() > server.maxmemory)
2375 {
2376 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2377 resetClient(c);
2378 return 1;
2379 }
2380
2381 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2382 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2383 &&
2384 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2385 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2386 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2387 resetClient(c);
2388 return 1;
2389 }
2390
2391 /* Exec the command */
2392 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2393 queueMultiCommand(c,cmd);
2394 addReply(c,shared.queued);
2395 } else {
2396 if (server.vm_enabled && server.vm_max_threads > 0 &&
2397 blockClientOnSwappedKeys(cmd,c)) return 1;
2398 call(c,cmd);
2399 }
2400
2401 /* Prepare the client for the next command */
2402 resetClient(c);
2403 return 1;
2404 }
2405
2406 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2407 listNode *ln;
2408 listIter li;
2409 int outc = 0, j;
2410 robj **outv;
2411 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2412 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2413 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2414 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2415 robj *lenobj;
2416
2417 if (argc <= REDIS_STATIC_ARGS) {
2418 outv = static_outv;
2419 } else {
2420 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2421 }
2422
2423 lenobj = createObject(REDIS_STRING,
2424 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2425 lenobj->refcount = 0;
2426 outv[outc++] = lenobj;
2427 for (j = 0; j < argc; j++) {
2428 lenobj = createObject(REDIS_STRING,
2429 sdscatprintf(sdsempty(),"$%lu\r\n",
2430 (unsigned long) stringObjectLen(argv[j])));
2431 lenobj->refcount = 0;
2432 outv[outc++] = lenobj;
2433 outv[outc++] = argv[j];
2434 outv[outc++] = shared.crlf;
2435 }
2436
2437 /* Increment all the refcounts at start and decrement at end in order to
2438 * be sure to free objects if there is no slave in a replication state
2439 * able to be feed with commands */
2440 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2441 listRewind(slaves,&li);
2442 while((ln = listNext(&li))) {
2443 redisClient *slave = ln->value;
2444
2445 /* Don't feed slaves that are still waiting for BGSAVE to start */
2446 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2447
2448 /* Feed all the other slaves, MONITORs and so on */
2449 if (slave->slaveseldb != dictid) {
2450 robj *selectcmd;
2451
2452 switch(dictid) {
2453 case 0: selectcmd = shared.select0; break;
2454 case 1: selectcmd = shared.select1; break;
2455 case 2: selectcmd = shared.select2; break;
2456 case 3: selectcmd = shared.select3; break;
2457 case 4: selectcmd = shared.select4; break;
2458 case 5: selectcmd = shared.select5; break;
2459 case 6: selectcmd = shared.select6; break;
2460 case 7: selectcmd = shared.select7; break;
2461 case 8: selectcmd = shared.select8; break;
2462 case 9: selectcmd = shared.select9; break;
2463 default:
2464 selectcmd = createObject(REDIS_STRING,
2465 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2466 selectcmd->refcount = 0;
2467 break;
2468 }
2469 addReply(slave,selectcmd);
2470 slave->slaveseldb = dictid;
2471 }
2472 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2473 }
2474 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2475 if (outv != static_outv) zfree(outv);
2476 }
2477
2478 static sds sdscatrepr(sds s, char *p, size_t len) {
2479 s = sdscatlen(s,"\"",1);
2480 while(len--) {
2481 switch(*p) {
2482 case '\\':
2483 case '"':
2484 s = sdscatprintf(s,"\\%c",*p);
2485 break;
2486 case '\n': s = sdscatlen(s,"\\n",1); break;
2487 case '\r': s = sdscatlen(s,"\\r",1); break;
2488 case '\t': s = sdscatlen(s,"\\t",1); break;
2489 case '\a': s = sdscatlen(s,"\\a",1); break;
2490 case '\b': s = sdscatlen(s,"\\b",1); break;
2491 default:
2492 if (isprint(*p))
2493 s = sdscatprintf(s,"%c",*p);
2494 else
2495 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2496 break;
2497 }
2498 p++;
2499 }
2500 return sdscatlen(s,"\"",1);
2501 }
2502
2503 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2504 listNode *ln;
2505 listIter li;
2506 int j;
2507 sds cmdrepr = sdsnew("+");
2508 robj *cmdobj;
2509 struct timeval tv;
2510
2511 gettimeofday(&tv,NULL);
2512 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2513 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2514
2515 for (j = 0; j < argc; j++) {
2516 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2517 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2518 } else {
2519 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2520 sdslen(argv[j]->ptr));
2521 }
2522 if (j != argc-1)
2523 cmdrepr = sdscatlen(cmdrepr," ",1);
2524 }
2525 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2526 cmdobj = createObject(REDIS_STRING,cmdrepr);
2527
2528 listRewind(monitors,&li);
2529 while((ln = listNext(&li))) {
2530 redisClient *monitor = ln->value;
2531 addReply(monitor,cmdobj);
2532 }
2533 decrRefCount(cmdobj);
2534 }
2535
2536 static void processInputBuffer(redisClient *c) {
2537 again:
2538 /* Before to process the input buffer, make sure the client is not
2539 * waitig for a blocking operation such as BLPOP. Note that the first
2540 * iteration the client is never blocked, otherwise the processInputBuffer
2541 * would not be called at all, but after the execution of the first commands
2542 * in the input buffer the client may be blocked, and the "goto again"
2543 * will try to reiterate. The following line will make it return asap. */
2544 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2545 if (c->bulklen == -1) {
2546 /* Read the first line of the query */
2547 char *p = strchr(c->querybuf,'\n');
2548 size_t querylen;
2549
2550 if (p) {
2551 sds query, *argv;
2552 int argc, j;
2553
2554 query = c->querybuf;
2555 c->querybuf = sdsempty();
2556 querylen = 1+(p-(query));
2557 if (sdslen(query) > querylen) {
2558 /* leave data after the first line of the query in the buffer */
2559 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2560 }
2561 *p = '\0'; /* remove "\n" */
2562 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2563 sdsupdatelen(query);
2564
2565 /* Now we can split the query in arguments */
2566 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2567 sdsfree(query);
2568
2569 if (c->argv) zfree(c->argv);
2570 c->argv = zmalloc(sizeof(robj*)*argc);
2571
2572 for (j = 0; j < argc; j++) {
2573 if (sdslen(argv[j])) {
2574 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2575 c->argc++;
2576 } else {
2577 sdsfree(argv[j]);
2578 }
2579 }
2580 zfree(argv);
2581 if (c->argc) {
2582 /* Execute the command. If the client is still valid
2583 * after processCommand() return and there is something
2584 * on the query buffer try to process the next command. */
2585 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2586 } else {
2587 /* Nothing to process, argc == 0. Just process the query
2588 * buffer if it's not empty or return to the caller */
2589 if (sdslen(c->querybuf)) goto again;
2590 }
2591 return;
2592 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2593 redisLog(REDIS_VERBOSE, "Client protocol error");
2594 freeClient(c);
2595 return;
2596 }
2597 } else {
2598 /* Bulk read handling. Note that if we are at this point
2599 the client already sent a command terminated with a newline,
2600 we are reading the bulk data that is actually the last
2601 argument of the command. */
2602 int qbl = sdslen(c->querybuf);
2603
2604 if (c->bulklen <= qbl) {
2605 /* Copy everything but the final CRLF as final argument */
2606 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2607 c->argc++;
2608 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2609 /* Process the command. If the client is still valid after
2610 * the processing and there is more data in the buffer
2611 * try to parse it. */
2612 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2613 return;
2614 }
2615 }
2616 }
2617
2618 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2619 redisClient *c = (redisClient*) privdata;
2620 char buf[REDIS_IOBUF_LEN];
2621 int nread;
2622 REDIS_NOTUSED(el);
2623 REDIS_NOTUSED(mask);
2624
2625 nread = read(fd, buf, REDIS_IOBUF_LEN);
2626 if (nread == -1) {
2627 if (errno == EAGAIN) {
2628 nread = 0;
2629 } else {
2630 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2631 freeClient(c);
2632 return;
2633 }
2634 } else if (nread == 0) {
2635 redisLog(REDIS_VERBOSE, "Client closed connection");
2636 freeClient(c);
2637 return;
2638 }
2639 if (nread) {
2640 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2641 c->lastinteraction = time(NULL);
2642 } else {
2643 return;
2644 }
2645 processInputBuffer(c);
2646 }
2647
2648 static int selectDb(redisClient *c, int id) {
2649 if (id < 0 || id >= server.dbnum)
2650 return REDIS_ERR;
2651 c->db = &server.db[id];
2652 return REDIS_OK;
2653 }
2654
2655 static void *dupClientReplyValue(void *o) {
2656 incrRefCount((robj*)o);
2657 return o;
2658 }
2659
/* Match method for lists of string objects: returns 1 when 'a' and 'b'
 * compare as equal according to compareStringObjects(), 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0 ? 1 : 0;
}
2663
2664 static redisClient *createClient(int fd) {
2665 redisClient *c = zmalloc(sizeof(*c));
2666
2667 anetNonBlock(NULL,fd);
2668 anetTcpNoDelay(NULL,fd);
2669 if (!c) return NULL;
2670 selectDb(c,0);
2671 c->fd = fd;
2672 c->querybuf = sdsempty();
2673 c->argc = 0;
2674 c->argv = NULL;
2675 c->bulklen = -1;
2676 c->multibulk = 0;
2677 c->mbargc = 0;
2678 c->mbargv = NULL;
2679 c->sentlen = 0;
2680 c->flags = 0;
2681 c->lastinteraction = time(NULL);
2682 c->authenticated = 0;
2683 c->replstate = REDIS_REPL_NONE;
2684 c->reply = listCreate();
2685 listSetFreeMethod(c->reply,decrRefCount);
2686 listSetDupMethod(c->reply,dupClientReplyValue);
2687 c->blockingkeys = NULL;
2688 c->blockingkeysnum = 0;
2689 c->io_keys = listCreate();
2690 listSetFreeMethod(c->io_keys,decrRefCount);
2691 c->pubsub_channels = dictCreate(&setDictType,NULL);
2692 c->pubsub_patterns = listCreate();
2693 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2694 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2695 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2696 readQueryFromClient, c) == AE_ERR) {
2697 freeClient(c);
2698 return NULL;
2699 }
2700 listAddNodeTail(server.clients,c);
2701 initClientMultiState(c);
2702 return c;
2703 }
2704
2705 static void addReply(redisClient *c, robj *obj) {
2706 if (listLength(c->reply) == 0 &&
2707 (c->replstate == REDIS_REPL_NONE ||
2708 c->replstate == REDIS_REPL_ONLINE) &&
2709 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2710 sendReplyToClient, c) == AE_ERR) return;
2711
2712 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2713 obj = dupStringObject(obj);
2714 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2715 }
2716 listAddNodeTail(c->reply,getDecodedObject(obj));
2717 }
2718
2719 static void addReplySds(redisClient *c, sds s) {
2720 robj *o = createObject(REDIS_STRING,s);
2721 addReply(c,o);
2722 decrRefCount(o);
2723 }
2724
2725 static void addReplyDouble(redisClient *c, double d) {
2726 char buf[128];
2727
2728 snprintf(buf,sizeof(buf),"%.17g",d);
2729 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2730 (unsigned long) strlen(buf),buf));
2731 }
2732
2733 static void addReplyLong(redisClient *c, long l) {
2734 char buf[128];
2735 size_t len;
2736
2737 if (l == 0) {
2738 addReply(c,shared.czero);
2739 return;
2740 } else if (l == 1) {
2741 addReply(c,shared.cone);
2742 return;
2743 }
2744 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2745 addReplySds(c,sdsnewlen(buf,len));
2746 }
2747
2748 static void addReplyLongLong(redisClient *c, long long ll) {
2749 char buf[128];
2750 size_t len;
2751
2752 if (ll == 0) {
2753 addReply(c,shared.czero);
2754 return;
2755 } else if (ll == 1) {
2756 addReply(c,shared.cone);
2757 return;
2758 }
2759 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2760 addReplySds(c,sdsnewlen(buf,len));
2761 }
2762
2763 static void addReplyUlong(redisClient *c, unsigned long ul) {
2764 char buf[128];
2765 size_t len;
2766
2767 if (ul == 0) {
2768 addReply(c,shared.czero);
2769 return;
2770 } else if (ul == 1) {
2771 addReply(c,shared.cone);
2772 return;
2773 }
2774 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2775 addReplySds(c,sdsnewlen(buf,len));
2776 }
2777
2778 static void addReplyBulkLen(redisClient *c, robj *obj) {
2779 size_t len;
2780
2781 if (obj->encoding == REDIS_ENCODING_RAW) {
2782 len = sdslen(obj->ptr);
2783 } else {
2784 long n = (long)obj->ptr;
2785
2786 /* Compute how many bytes will take this integer as a radix 10 string */
2787 len = 1;
2788 if (n < 0) {
2789 len++;
2790 n = -n;
2791 }
2792 while((n = n/10) != 0) {
2793 len++;
2794 }
2795 }
2796 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2797 }
2798
2799 static void addReplyBulk(redisClient *c, robj *obj) {
2800 addReplyBulkLen(c,obj);
2801 addReply(c,obj);
2802 addReply(c,shared.crlf);
2803 }
2804
2805 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2806 static void addReplyBulkCString(redisClient *c, char *s) {
2807 if (s == NULL) {
2808 addReply(c,shared.nullbulk);
2809 } else {
2810 robj *o = createStringObject(s,strlen(s));
2811 addReplyBulk(c,o);
2812 decrRefCount(o);
2813 }
2814 }
2815
2816 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2817 int cport, cfd;
2818 char cip[128];
2819 redisClient *c;
2820 REDIS_NOTUSED(el);
2821 REDIS_NOTUSED(mask);
2822 REDIS_NOTUSED(privdata);
2823
2824 cfd = anetAccept(server.neterr, fd, cip, &cport);
2825 if (cfd == AE_ERR) {
2826 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2827 return;
2828 }
2829 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2830 if ((c = createClient(cfd)) == NULL) {
2831 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2832 close(cfd); /* May be already closed, just ingore errors */
2833 return;
2834 }
2835 /* If maxclient directive is set and this is one client more... close the
2836 * connection. Note that we create the client instead to check before
2837 * for this condition, since now the socket is already set in nonblocking
2838 * mode and we can send an error for free using the Kernel I/O */
2839 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2840 char *err = "-ERR max number of clients reached\r\n";
2841
2842 /* That's a best effort error message, don't check write errors */
2843 if (write(c->fd,err,strlen(err)) == -1) {
2844 /* Nothing to do, Just to avoid the warning... */
2845 }
2846 freeClient(c);
2847 return;
2848 }
2849 server.stat_numconnections++;
2850 }
2851
2852 /* ======================= Redis objects implementation ===================== */
2853
2854 static robj *createObject(int type, void *ptr) {
2855 robj *o;
2856
2857 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2858 if (listLength(server.objfreelist)) {
2859 listNode *head = listFirst(server.objfreelist);
2860 o = listNodeValue(head);
2861 listDelNode(server.objfreelist,head);
2862 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2863 } else {
2864 if (server.vm_enabled) {
2865 pthread_mutex_unlock(&server.obj_freelist_mutex);
2866 o = zmalloc(sizeof(*o));
2867 } else {
2868 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2869 }
2870 }
2871 o->type = type;
2872 o->encoding = REDIS_ENCODING_RAW;
2873 o->ptr = ptr;
2874 o->refcount = 1;
2875 if (server.vm_enabled) {
2876 /* Note that this code may run in the context of an I/O thread
2877 * and accessing to server.unixtime in theory is an error
2878 * (no locks). But in practice this is safe, and even if we read
2879 * garbage Redis will not fail, as it's just a statistical info */
2880 o->vm.atime = server.unixtime;
2881 o->storage = REDIS_VM_MEMORY;
2882 }
2883 return o;
2884 }
2885
2886 static robj *createStringObject(char *ptr, size_t len) {
2887 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2888 }
2889
2890 static robj *createStringObjectFromLongLong(long long value) {
2891 robj *o;
2892 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2893 incrRefCount(shared.integers[value]);
2894 o = shared.integers[value];
2895 } else {
2896 o = createObject(REDIS_STRING, NULL);
2897 if (value >= LONG_MIN && value <= LONG_MAX) {
2898 o->encoding = REDIS_ENCODING_INT;
2899 o->ptr = (void*)((long)value);
2900 } else {
2901 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2902 }
2903 }
2904 return o;
2905 }
2906
2907 static robj *dupStringObject(robj *o) {
2908 assert(o->encoding == REDIS_ENCODING_RAW);
2909 return createStringObject(o->ptr,sdslen(o->ptr));
2910 }
2911
2912 static robj *createListObject(void) {
2913 list *l = listCreate();
2914
2915 listSetFreeMethod(l,decrRefCount);
2916 return createObject(REDIS_LIST,l);
2917 }
2918
2919 static robj *createSetObject(void) {
2920 dict *d = dictCreate(&setDictType,NULL);
2921 return createObject(REDIS_SET,d);
2922 }
2923
2924 static robj *createHashObject(void) {
2925 /* All the Hashes start as zipmaps. Will be automatically converted
2926 * into hash tables if there are enough elements or big elements
2927 * inside. */
2928 unsigned char *zm = zipmapNew();
2929 robj *o = createObject(REDIS_HASH,zm);
2930 o->encoding = REDIS_ENCODING_ZIPMAP;
2931 return o;
2932 }
2933
2934 static robj *createZsetObject(void) {
2935 zset *zs = zmalloc(sizeof(*zs));
2936
2937 zs->dict = dictCreate(&zsetDictType,NULL);
2938 zs->zsl = zslCreate();
2939 return createObject(REDIS_ZSET,zs);
2940 }
2941
2942 static void freeStringObject(robj *o) {
2943 if (o->encoding == REDIS_ENCODING_RAW) {
2944 sdsfree(o->ptr);
2945 }
2946 }
2947
2948 static void freeListObject(robj *o) {
2949 listRelease((list*) o->ptr);
2950 }
2951
2952 static void freeSetObject(robj *o) {
2953 dictRelease((dict*) o->ptr);
2954 }
2955
2956 static void freeZsetObject(robj *o) {
2957 zset *zs = o->ptr;
2958
2959 dictRelease(zs->dict);
2960 zslFree(zs->zsl);
2961 zfree(zs);
2962 }
2963
2964 static void freeHashObject(robj *o) {
2965 switch (o->encoding) {
2966 case REDIS_ENCODING_HT:
2967 dictRelease((dict*) o->ptr);
2968 break;
2969 case REDIS_ENCODING_ZIPMAP:
2970 zfree(o->ptr);
2971 break;
2972 default:
2973 redisPanic("Unknown hash encoding type");
2974 break;
2975 }
2976 }
2977
2978 static void incrRefCount(robj *o) {
2979 o->refcount++;
2980 }
2981
2982 static void decrRefCount(void *obj) {
2983 robj *o = obj;
2984
2985 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2986 /* Object is a key of a swapped out value, or in the process of being
2987 * loaded. */
2988 if (server.vm_enabled &&
2989 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2990 {
2991 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2992 redisAssert(o->type == REDIS_STRING);
2993 freeStringObject(o);
2994 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2995 pthread_mutex_lock(&server.obj_freelist_mutex);
2996 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2997 !listAddNodeHead(server.objfreelist,o))
2998 zfree(o);
2999 pthread_mutex_unlock(&server.obj_freelist_mutex);
3000 server.vm_stats_swapped_objects--;
3001 return;
3002 }
3003 /* Object is in memory, or in the process of being swapped out. */
3004 if (--(o->refcount) == 0) {
3005 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3006 vmCancelThreadedIOJob(obj);
3007 switch(o->type) {
3008 case REDIS_STRING: freeStringObject(o); break;
3009 case REDIS_LIST: freeListObject(o); break;
3010 case REDIS_SET: freeSetObject(o); break;
3011 case REDIS_ZSET: freeZsetObject(o); break;
3012 case REDIS_HASH: freeHashObject(o); break;
3013 default: redisPanic("Unknown object type"); break;
3014 }
3015 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3016 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3017 !listAddNodeHead(server.objfreelist,o))
3018 zfree(o);
3019 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3020 }
3021 }
3022
3023 static robj *lookupKey(redisDb *db, robj *key) {
3024 dictEntry *de = dictFind(db->dict,key);
3025 if (de) {
3026 robj *key = dictGetEntryKey(de);
3027 robj *val = dictGetEntryVal(de);
3028
3029 if (server.vm_enabled) {
3030 if (key->storage == REDIS_VM_MEMORY ||
3031 key->storage == REDIS_VM_SWAPPING)
3032 {
3033 /* If we were swapping the object out, stop it, this key
3034 * was requested. */
3035 if (key->storage == REDIS_VM_SWAPPING)
3036 vmCancelThreadedIOJob(key);
3037 /* Update the access time of the key for the aging algorithm. */
3038 key->vm.atime = server.unixtime;
3039 } else {
3040 int notify = (key->storage == REDIS_VM_LOADING);
3041
3042 /* Our value was swapped on disk. Bring it at home. */
3043 redisAssert(val == NULL);
3044 val = vmLoadObject(key);
3045 dictGetEntryVal(de) = val;
3046
3047 /* Clients blocked by the VM subsystem may be waiting for
3048 * this key... */
3049 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3050 }
3051 }
3052 return val;
3053 } else {
3054 return NULL;
3055 }
3056 }
3057
3058 static robj *lookupKeyRead(redisDb *db, robj *key) {
3059 expireIfNeeded(db,key);
3060 return lookupKey(db,key);
3061 }
3062
3063 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3064 deleteIfVolatile(db,key);
3065 return lookupKey(db,key);
3066 }
3067
3068 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3069 robj *o = lookupKeyRead(c->db, key);
3070 if (!o) addReply(c,reply);
3071 return o;
3072 }
3073
3074 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3075 robj *o = lookupKeyWrite(c->db, key);
3076 if (!o) addReply(c,reply);
3077 return o;
3078 }
3079
3080 static int checkType(redisClient *c, robj *o, int type) {
3081 if (o->type != type) {
3082 addReply(c,shared.wrongtypeerr);
3083 return 1;
3084 }
3085 return 0;
3086 }
3087
3088 static int deleteKey(redisDb *db, robj *key) {
3089 int retval;
3090
3091 /* We need to protect key from destruction: after the first dictDelete()
3092 * it may happen that 'key' is no longer valid if we don't increment
3093 * it's count. This may happen when we get the object reference directly
3094 * from the hash table with dictRandomKey() or dict iterators */
3095 incrRefCount(key);
3096 if (dictSize(db->expires)) dictDelete(db->expires,key);
3097 retval = dictDelete(db->dict,key);
3098 decrRefCount(key);
3099
3100 return retval == DICT_OK;
3101 }
3102
3103 /* Check if the nul-terminated string 's' can be represented by a long
3104 * (that is, is a number that fits into long without any other space or
3105 * character before or after the digits).
3106 *
3107 * If so, the function returns REDIS_OK and *longval is set to the value
3108 * of the number. Otherwise REDIS_ERR is returned */
3109 static int isStringRepresentableAsLong(sds s, long *longval) {
3110 char buf[32], *endptr;
3111 long value;
3112 int slen;
3113
3114 value = strtol(s, &endptr, 10);
3115 if (endptr[0] != '\0') return REDIS_ERR;
3116 slen = snprintf(buf,32,"%ld",value);
3117
3118 /* If the number converted back into a string is not identical
3119 * then it's not possible to encode the string as integer */
3120 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3121 if (longval) *longval = value;
3122 return REDIS_OK;
3123 }
3124
3125 /* Try to encode a string object in order to save space */
3126 static robj *tryObjectEncoding(robj *o) {
3127 long value;
3128 sds s = o->ptr;
3129
3130 if (o->encoding != REDIS_ENCODING_RAW)
3131 return o; /* Already encoded */
3132
3133 /* It's not safe to encode shared objects: shared objects can be shared
3134 * everywhere in the "object space" of Redis. Encoded objects can only
3135 * appear as "values" (and not, for instance, as keys) */
3136 if (o->refcount > 1) return o;
3137
3138 /* Currently we try to encode only strings */
3139 redisAssert(o->type == REDIS_STRING);
3140
3141 /* Check if we can represent this string as a long integer */
3142 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3143
3144 /* Ok, this object can be encoded */
3145 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3146 decrRefCount(o);
3147 incrRefCount(shared.integers[value]);
3148 return shared.integers[value];
3149 } else {
3150 o->encoding = REDIS_ENCODING_INT;
3151 sdsfree(o->ptr);
3152 o->ptr = (void*) value;
3153 return o;
3154 }
3155 }
3156
3157 /* Get a decoded version of an encoded object (returned as a new object).
3158 * If the object is already raw-encoded just increment the ref count. */
3159 static robj *getDecodedObject(robj *o) {
3160 robj *dec;
3161
3162 if (o->encoding == REDIS_ENCODING_RAW) {
3163 incrRefCount(o);
3164 return o;
3165 }
3166 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3167 char buf[32];
3168
3169 snprintf(buf,32,"%ld",(long)o->ptr);
3170 dec = createStringObject(buf,strlen(buf));
3171 return dec;
3172 } else {
3173 redisPanic("Unknown encoding type");
3174 }
3175 }
3176
3177 /* Compare two string objects via strcmp() or alike.
3178 * Note that the objects may be integer-encoded. In such a case we
3179 * use snprintf() to get a string representation of the numbers on the stack
3180 * and compare the strings, it's much faster than calling getDecodedObject().
3181 *
3182 * Important note: if objects are not integer encoded, but binary-safe strings,
3183 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3184 * binary safe. */
3185 static int compareStringObjects(robj *a, robj *b) {
3186 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3187 char bufa[128], bufb[128], *astr, *bstr;
3188 int bothsds = 1;
3189
3190 if (a == b) return 0;
3191 if (a->encoding != REDIS_ENCODING_RAW) {
3192 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3193 astr = bufa;
3194 bothsds = 0;
3195 } else {
3196 astr = a->ptr;
3197 }
3198 if (b->encoding != REDIS_ENCODING_RAW) {
3199 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3200 bstr = bufb;
3201 bothsds = 0;
3202 } else {
3203 bstr = b->ptr;
3204 }
3205 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3206 }
3207
3208 static size_t stringObjectLen(robj *o) {
3209 redisAssert(o->type == REDIS_STRING);
3210 if (o->encoding == REDIS_ENCODING_RAW) {
3211 return sdslen(o->ptr);
3212 } else {
3213 char buf[32];
3214
3215 return snprintf(buf,32,"%ld",(long)o->ptr);
3216 }
3217 }
3218
3219 static int getDoubleFromObject(robj *o, double *target) {
3220 double value;
3221 char *eptr;
3222
3223 if (o == NULL) {
3224 value = 0;
3225 } else {
3226 redisAssert(o->type == REDIS_STRING);
3227 if (o->encoding == REDIS_ENCODING_RAW) {
3228 value = strtod(o->ptr, &eptr);
3229 if (eptr[0] != '\0') return REDIS_ERR;
3230 } else if (o->encoding == REDIS_ENCODING_INT) {
3231 value = (long)o->ptr;
3232 } else {
3233 redisPanic("Unknown string encoding");
3234 }
3235 }
3236
3237 *target = value;
3238 return REDIS_OK;
3239 }
3240
3241 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3242 double value;
3243 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3244 if (msg != NULL) {
3245 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3246 } else {
3247 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3248 }
3249 return REDIS_ERR;
3250 }
3251
3252 *target = value;
3253 return REDIS_OK;
3254 }
3255
3256 static int getLongLongFromObject(robj *o, long long *target) {
3257 long long value;
3258 char *eptr;
3259
3260 if (o == NULL) {
3261 value = 0;
3262 } else {
3263 redisAssert(o->type == REDIS_STRING);
3264 if (o->encoding == REDIS_ENCODING_RAW) {
3265 value = strtoll(o->ptr, &eptr, 10);
3266 if (eptr[0] != '\0') return REDIS_ERR;
3267 } else if (o->encoding == REDIS_ENCODING_INT) {
3268 value = (long)o->ptr;
3269 } else {
3270 redisPanic("Unknown string encoding");
3271 }
3272 }
3273
3274 *target = value;
3275 return REDIS_OK;
3276 }
3277
3278 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3279 long long value;
3280 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3281 if (msg != NULL) {
3282 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3283 } else {
3284 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3285 }
3286 return REDIS_ERR;
3287 }
3288
3289 *target = value;
3290 return REDIS_OK;
3291 }
3292
3293 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3294 long long value;
3295
3296 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3297 if (value < LONG_MIN || value > LONG_MAX) {
3298 if (msg != NULL) {
3299 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3300 } else {
3301 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3302 }
3303 return REDIS_ERR;
3304 }
3305
3306 *target = value;
3307 return REDIS_OK;
3308 }
3309
3310 /*============================ RDB saving/loading =========================== */
3311
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3316
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3322
3323 /* check rdbLoadLen() comments for more info */
3324 static int rdbSaveLen(FILE *fp, uint32_t len) {
3325 unsigned char buf[2];
3326
3327 if (len < (1<<6)) {
3328 /* Save a 6 bit len */
3329 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3330 if (fwrite(buf,1,1,fp) == 0) return -1;
3331 } else if (len < (1<<14)) {
3332 /* Save a 14 bit len */
3333 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3334 buf[1] = len&0xFF;
3335 if (fwrite(buf,2,1,fp) == 0) return -1;
3336 } else {
3337 /* Save a 32 bit len */
3338 buf[0] = (REDIS_RDB_32BITLEN<<6);
3339 if (fwrite(buf,1,1,fp) == 0) return -1;
3340 len = htonl(len);
3341 if (fwrite(&len,4,1,fp) == 0) return -1;
3342 }
3343 return 0;
3344 }
3345
3346 /* String objects in the form "2391" "-100" without any space and with a
3347 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3348 * encoded as integers to save space */
3349 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3350 long long value;
3351 char *endptr, buf[32];
3352
3353 /* Check if it's possible to encode this value as a number */
3354 value = strtoll(s, &endptr, 10);
3355 if (endptr[0] != '\0') return 0;
3356 snprintf(buf,32,"%lld",value);
3357
3358 /* If the number converted back into a string is not identical
3359 * then it's not possible to encode the string as integer */
3360 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3361
3362 /* Finally check if it fits in our ranges */
3363 if (value >= -(1<<7) && value <= (1<<7)-1) {
3364 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3365 enc[1] = value&0xFF;
3366 return 2;
3367 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3368 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3369 enc[1] = value&0xFF;
3370 enc[2] = (value>>8)&0xFF;
3371 return 3;
3372 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3373 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3374 enc[1] = value&0xFF;
3375 enc[2] = (value>>8)&0xFF;
3376 enc[3] = (value>>16)&0xFF;
3377 enc[4] = (value>>24)&0xFF;
3378 return 5;
3379 } else {
3380 return 0;
3381 }
3382 }
3383
3384 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3385 size_t comprlen, outlen;
3386 unsigned char byte;
3387 void *out;
3388
3389 /* We require at least four bytes compression for this to be worth it */
3390 if (len <= 4) return 0;
3391 outlen = len-4;
3392 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3393 comprlen = lzf_compress(s, len, out, outlen);
3394 if (comprlen == 0) {
3395 zfree(out);
3396 return 0;
3397 }
3398 /* Data compressed! Let's save it on disk */
3399 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3400 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3401 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3402 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3403 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3404 zfree(out);
3405 return comprlen;
3406
3407 writeerr:
3408 zfree(out);
3409 return -1;
3410 }
3411
3412 /* Save a string objet as [len][data] on disk. If the object is a string
3413 * representation of an integer value we try to safe it in a special form */
3414 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3415 int enclen;
3416
3417 /* Try integer encoding */
3418 if (len <= 11) {
3419 unsigned char buf[5];
3420 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3421 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3422 return 0;
3423 }
3424 }
3425
3426 /* Try LZF compression - under 20 bytes it's unable to compress even
3427 * aaaaaaaaaaaaaaaaaa so skip it */
3428 if (server.rdbcompression && len > 20) {
3429 int retval;
3430
3431 retval = rdbSaveLzfStringObject(fp,s,len);
3432 if (retval == -1) return -1;
3433 if (retval > 0) return 0;
3434 /* retval == 0 means data can't be compressed, save the old way */
3435 }
3436
3437 /* Store verbatim */
3438 if (rdbSaveLen(fp,len) == -1) return -1;
3439 if (len && fwrite(s,len,1,fp) == 0) return -1;
3440 return 0;
3441 }
3442
3443 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3444 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3445 int retval;
3446
3447 /* Avoid incr/decr ref count business when possible.
3448 * This plays well with copy-on-write given that we are probably
3449 * in a child process (BGSAVE). Also this makes sure key objects
3450 * of swapped objects are not incRefCount-ed (an assert does not allow
3451 * this in order to avoid bugs) */
3452 if (obj->encoding != REDIS_ENCODING_RAW) {
3453 obj = getDecodedObject(obj);
3454 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3455 decrRefCount(obj);
3456 } else {
3457 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3458 }
3459 return retval;
3460 }
3461
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values, written alone,
 * in order to specify the following conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int outlen = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *digits = (char*)out+1;

        snprintf(digits,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(digits);
        outlen = out[0]+1;
    }
    return (fwrite(out,outlen,1,fp) == 0) ? -1 : 0;
}
3488
3489 /* Save a Redis object. */
3490 static int rdbSaveObject(FILE *fp, robj *o) {
3491 if (o->type == REDIS_STRING) {
3492 /* Save a string value */
3493 if (rdbSaveStringObject(fp,o) == -1) return -1;
3494 } else if (o->type == REDIS_LIST) {
3495 /* Save a list value */
3496 list *list = o->ptr;
3497 listIter li;
3498 listNode *ln;
3499
3500 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3501 listRewind(list,&li);
3502 while((ln = listNext(&li))) {
3503 robj *eleobj = listNodeValue(ln);
3504
3505 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3506 }
3507 } else if (o->type == REDIS_SET) {
3508 /* Save a set value */
3509 dict *set = o->ptr;
3510 dictIterator *di = dictGetIterator(set);
3511 dictEntry *de;
3512
3513 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3514 while((de = dictNext(di)) != NULL) {
3515 robj *eleobj = dictGetEntryKey(de);
3516
3517 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3518 }
3519 dictReleaseIterator(di);
3520 } else if (o->type == REDIS_ZSET) {
3521 /* Save a set value */
3522 zset *zs = o->ptr;
3523 dictIterator *di = dictGetIterator(zs->dict);
3524 dictEntry *de;
3525
3526 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3527 while((de = dictNext(di)) != NULL) {
3528 robj *eleobj = dictGetEntryKey(de);
3529 double *score = dictGetEntryVal(de);
3530
3531 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3532 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3533 }
3534 dictReleaseIterator(di);
3535 } else if (o->type == REDIS_HASH) {
3536 /* Save a hash value */
3537 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3538 unsigned char *p = zipmapRewind(o->ptr);
3539 unsigned int count = zipmapLen(o->ptr);
3540 unsigned char *key, *val;
3541 unsigned int klen, vlen;
3542
3543 if (rdbSaveLen(fp,count) == -1) return -1;
3544 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3545 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3546 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3547 }
3548 } else {
3549 dictIterator *di = dictGetIterator(o->ptr);
3550 dictEntry *de;
3551
3552 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3553 while((de = dictNext(di)) != NULL) {
3554 robj *key = dictGetEntryKey(de);
3555 robj *val = dictGetEntryVal(de);
3556
3557 if (rdbSaveStringObject(fp,key) == -1) return -1;
3558 if (rdbSaveStringObject(fp,val) == -1) return -1;
3559 }
3560 dictReleaseIterator(di);
3561 }
3562 } else {
3563 redisPanic("Unknown object type");
3564 }
3565 return 0;
3566 }
3567
3568 /* Return the length the object will have on disk if saved with
3569 * the rdbSaveObject() function. Currently we use a trick to get
3570 * this length with very little changes to the code. In the future
3571 * we could switch to a faster solution. */
3572 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3573 if (fp == NULL) fp = server.devnull;
3574 rewind(fp);
3575 assert(rdbSaveObject(fp,o) != 1);
3576 return ftello(fp);
3577 }
3578
3579 /* Return the number of pages required to save this object in the swap file */
3580 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3581 off_t bytes = rdbSavedObjectLen(o,fp);
3582
3583 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3584 }
3585
3586 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3587 static int rdbSave(char *filename) {
3588 dictIterator *di = NULL;
3589 dictEntry *de;
3590 FILE *fp;
3591 char tmpfile[256];
3592 int j;
3593 time_t now = time(NULL);
3594
3595 /* Wait for I/O therads to terminate, just in case this is a
3596 * foreground-saving, to avoid seeking the swap file descriptor at the
3597 * same time. */
3598 if (server.vm_enabled)
3599 waitEmptyIOJobsQueue();
3600
3601 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3602 fp = fopen(tmpfile,"w");
3603 if (!fp) {
3604 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3605 return REDIS_ERR;
3606 }
3607 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3608 for (j = 0; j < server.dbnum; j++) {
3609 redisDb *db = server.db+j;
3610 dict *d = db->dict;
3611 if (dictSize(d) == 0) continue;
3612 di = dictGetIterator(d);
3613 if (!di) {
3614 fclose(fp);
3615 return REDIS_ERR;
3616 }
3617
3618 /* Write the SELECT DB opcode */
3619 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3620 if (rdbSaveLen(fp,j) == -1) goto werr;
3621
3622 /* Iterate this DB writing every entry */
3623 while((de = dictNext(di)) != NULL) {
3624 robj *key = dictGetEntryKey(de);
3625 robj *o = dictGetEntryVal(de);
3626 time_t expiretime = getExpire(db,key);
3627
3628 /* Save the expire time */
3629 if (expiretime != -1) {
3630 /* If this key is already expired skip it */
3631 if (expiretime < now) continue;
3632 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3633 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3634 }
3635 /* Save the key and associated value. This requires special
3636 * handling if the value is swapped out. */
3637 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3638 key->storage == REDIS_VM_SWAPPING) {
3639 /* Save type, key, value */
3640 if (rdbSaveType(fp,o->type) == -1) goto werr;
3641 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3642 if (rdbSaveObject(fp,o) == -1) goto werr;
3643 } else {
3644 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3645 robj *po;
3646 /* Get a preview of the object in memory */
3647 po = vmPreviewObject(key);
3648 /* Save type, key, value */
3649 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3650 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3651 if (rdbSaveObject(fp,po) == -1) goto werr;
3652 /* Remove the loaded object from memory */
3653 decrRefCount(po);
3654 }
3655 }
3656 dictReleaseIterator(di);
3657 }
3658 /* EOF opcode */
3659 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3660
3661 /* Make sure data will not remain on the OS's output buffers */
3662 fflush(fp);
3663 fsync(fileno(fp));
3664 fclose(fp);
3665
3666 /* Use RENAME to make sure the DB file is changed atomically only
3667 * if the generate DB file is ok. */
3668 if (rename(tmpfile,filename) == -1) {
3669 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3670 unlink(tmpfile);
3671 return REDIS_ERR;
3672 }
3673 redisLog(REDIS_NOTICE,"DB saved on disk");
3674 server.dirty = 0;
3675 server.lastsave = time(NULL);
3676 return REDIS_OK;
3677
3678 werr:
3679 fclose(fp);
3680 unlink(tmpfile);
3681 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3682 if (di) dictReleaseIterator(di);
3683 return REDIS_ERR;
3684 }
3685
3686 static int rdbSaveBackground(char *filename) {
3687 pid_t childpid;
3688
3689 if (server.bgsavechildpid != -1) return REDIS_ERR;
3690 if (server.vm_enabled) waitEmptyIOJobsQueue();
3691 if ((childpid = fork()) == 0) {
3692 /* Child */
3693 if (server.vm_enabled) vmReopenSwapFile();
3694 close(server.fd);
3695 if (rdbSave(filename) == REDIS_OK) {
3696 _exit(0);
3697 } else {
3698 _exit(1);
3699 }
3700 } else {
3701 /* Parent */
3702 if (childpid == -1) {
3703 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3704 strerror(errno));
3705 return REDIS_ERR;
3706 }
3707 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3708 server.bgsavechildpid = childpid;
3709 updateDictResizePolicy();
3710 return REDIS_OK;
3711 }
3712 return REDIS_OK; /* unreached */
3713 }
3714
/* Remove "temp-<childpid>.rdb", the name rdbSave() uses for the temp dump
 * written by the process with pid 'childpid'. The unlink() result is
 * ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3721
/* Read a single byte from 'fp' and return it as a non negative integer,
 * or -1 if the byte cannot be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 1) ? opcode : -1;
}
3727
/* Read a 32 bit signed value from 'fp', stored in the byte order of the
 * running host, and return it as a time_t. Returns -1 on short read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3733
3734 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3735 * of this file for a description of how this are stored on disk.
3736 *
3737 * isencoded is set to 1 if the readed length is not actually a length but
3738 * an "encoding type", check the above comments for more info */
3739 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3740 unsigned char buf[2];
3741 uint32_t len;
3742 int type;
3743
3744 if (isencoded) *isencoded = 0;
3745 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3746 type = (buf[0]&0xC0)>>6;
3747 if (type == REDIS_RDB_6BITLEN) {
3748 /* Read a 6 bit len */
3749 return buf[0]&0x3F;
3750 } else if (type == REDIS_RDB_ENCVAL) {
3751 /* Read a 6 bit len encoding type */
3752 if (isencoded) *isencoded = 1;
3753 return buf[0]&0x3F;
3754 } else if (type == REDIS_RDB_14BITLEN) {
3755 /* Read a 14 bit len */
3756 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3757 return ((buf[0]&0x3F)<<8)|buf[1];
3758 } else {
3759 /* Read a 32 bit len */
3760 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3761 return ntohl(len);
3762 }
3763 }
3764
3765 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3766 unsigned char enc[4];
3767 long long val;
3768
3769 if (enctype == REDIS_RDB_ENC_INT8) {
3770 if (fread(enc,1,1,fp) == 0) return NULL;
3771 val = (signed char)enc[0];
3772 } else if (enctype == REDIS_RDB_ENC_INT16) {
3773 uint16_t v;
3774 if (fread(enc,2,1,fp) == 0) return NULL;
3775 v = enc[0]|(enc[1]<<8);
3776 val = (int16_t)v;
3777 } else if (enctype == REDIS_RDB_ENC_INT32) {
3778 uint32_t v;
3779 if (fread(enc,4,1,fp) == 0) return NULL;
3780 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3781 val = (int32_t)v;
3782 } else {
3783 val = 0; /* anti-warning */
3784 redisPanic("Unknown RDB integer encoding type");
3785 }
3786 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3787 }
3788
3789 static robj *rdbLoadLzfStringObject(FILE*fp) {
3790 unsigned int len, clen;
3791 unsigned char *c = NULL;
3792 sds val = NULL;
3793
3794 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3795 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3796 if ((c = zmalloc(clen)) == NULL) goto err;
3797 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3798 if (fread(c,clen,1,fp) == 0) goto err;
3799 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3800 zfree(c);
3801 return createObject(REDIS_STRING,val);
3802 err:
3803 zfree(c);
3804 sdsfree(val);
3805 return NULL;
3806 }
3807
3808 static robj *rdbLoadStringObject(FILE*fp) {
3809 int isencoded;
3810 uint32_t len;
3811 sds val;
3812
3813 len = rdbLoadLen(fp,&isencoded);
3814 if (isencoded) {
3815 switch(len) {
3816 case REDIS_RDB_ENC_INT8:
3817 case REDIS_RDB_ENC_INT16:
3818 case REDIS_RDB_ENC_INT32:
3819 return rdbLoadIntegerObject(fp,len);
3820 case REDIS_RDB_ENC_LZF:
3821 return rdbLoadLzfStringObject(fp);
3822 default:
3823 redisPanic("Unknown RDB encoding type");
3824 }
3825 }
3826
3827 if (len == REDIS_RDB_LENERR) return NULL;
3828 val = sdsnewlen(NULL,len);
3829 if (len && fread(val,len,1,fp) == 0) {
3830 sdsfree(val);
3831 return NULL;
3832 }
3833 return createObject(REDIS_STRING,val);
3834 }
3835
3836 /* For information about double serialization check rdbSaveDoubleValue() */
3837 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3838 char buf[128];
3839 unsigned char len;
3840
3841 if (fread(&len,1,1,fp) == 0) return -1;
3842 switch(len) {
3843 case 255: *val = R_NegInf; return 0;
3844 case 254: *val = R_PosInf; return 0;
3845 case 253: *val = R_Nan; return 0;
3846 default:
3847 if (fread(buf,len,1,fp) == 0) return -1;
3848 buf[len] = '\0';
3849 sscanf(buf, "%lg", val);
3850 return 0;
3851 }
3852 }
3853
3854 /* Load a Redis object of the specified type from the specified file.
3855 * On success a newly allocated object is returned, otherwise NULL. */
3856 static robj *rdbLoadObject(int type, FILE *fp) {
3857 robj *o;
3858
3859 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3860 if (type == REDIS_STRING) {
3861 /* Read string value */
3862 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3863 o = tryObjectEncoding(o);
3864 } else if (type == REDIS_LIST || type == REDIS_SET) {
3865 /* Read list/set value */
3866 uint32_t listlen;
3867
3868 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3869 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3870 /* It's faster to expand the dict to the right size asap in order
3871 * to avoid rehashing */
3872 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3873 dictExpand(o->ptr,listlen);
3874 /* Load every single element of the list/set */
3875 while(listlen--) {
3876 robj *ele;
3877
3878 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3879 ele = tryObjectEncoding(ele);
3880 if (type == REDIS_LIST) {
3881 listAddNodeTail((list*)o->ptr,ele);
3882 } else {
3883 dictAdd((dict*)o->ptr,ele,NULL);
3884 }
3885 }
3886 } else if (type == REDIS_ZSET) {
3887 /* Read list/set value */
3888 size_t zsetlen;
3889 zset *zs;
3890
3891 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3892 o = createZsetObject();
3893 zs = o->ptr;
3894 /* Load every single element of the list/set */
3895 while(zsetlen--) {
3896 robj *ele;
3897 double *score = zmalloc(sizeof(double));
3898
3899 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3900 ele = tryObjectEncoding(ele);
3901 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3902 dictAdd(zs->dict,ele,score);
3903 zslInsert(zs->zsl,*score,ele);
3904 incrRefCount(ele); /* added to skiplist */
3905 }
3906 } else if (type == REDIS_HASH) {
3907 size_t hashlen;
3908
3909 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3910 o = createHashObject();
3911 /* Too many entries? Use an hash table. */
3912 if (hashlen > server.hash_max_zipmap_entries)
3913 convertToRealHash(o);
3914 /* Load every key/value, then set it into the zipmap or hash
3915 * table, as needed. */
3916 while(hashlen--) {
3917 robj *key, *val;
3918
3919 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3920 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3921 /* If we are using a zipmap and there are too big values
3922 * the object is converted to real hash table encoding. */
3923 if (o->encoding != REDIS_ENCODING_HT &&
3924 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3925 sdslen(val->ptr) > server.hash_max_zipmap_value))
3926 {
3927 convertToRealHash(o);
3928 }
3929
3930 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3931 unsigned char *zm = o->ptr;
3932
3933 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3934 val->ptr,sdslen(val->ptr),NULL);
3935 o->ptr = zm;
3936 decrRefCount(key);
3937 decrRefCount(val);
3938 } else {
3939 key = tryObjectEncoding(key);
3940 val = tryObjectEncoding(val);
3941 dictAdd((dict*)o->ptr,key,val);
3942 }
3943 }
3944 } else {
3945 redisPanic("Unknown object type");
3946 }
3947 return o;
3948 }
3949
3950 static int rdbLoad(char *filename) {
3951 FILE *fp;
3952 robj *keyobj = NULL;
3953 uint32_t dbid;
3954 int type, retval, rdbver;
3955 dict *d = server.db[0].dict;
3956 redisDb *db = server.db+0;
3957 char buf[1024];
3958 time_t expiretime = -1, now = time(NULL);
3959 long long loadedkeys = 0;
3960
3961 fp = fopen(filename,"r");
3962 if (!fp) return REDIS_ERR;
3963 if (fread(buf,9,1,fp) == 0) goto eoferr;
3964 buf[9] = '\0';
3965 if (memcmp(buf,"REDIS",5) != 0) {
3966 fclose(fp);
3967 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3968 return REDIS_ERR;
3969 }
3970 rdbver = atoi(buf+5);
3971 if (rdbver != 1) {
3972 fclose(fp);
3973 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3974 return REDIS_ERR;
3975 }
3976 while(1) {
3977 robj *o;
3978
3979 /* Read type. */
3980 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3981 if (type == REDIS_EXPIRETIME) {
3982 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3983 /* We read the time so we need to read the object type again */
3984 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3985 }
3986 if (type == REDIS_EOF) break;
3987 /* Handle SELECT DB opcode as a special case */
3988 if (type == REDIS_SELECTDB) {
3989 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3990 goto eoferr;
3991 if (dbid >= (unsigned)server.dbnum) {
3992 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3993 exit(1);
3994 }
3995 db = server.db+dbid;
3996 d = db->dict;
3997 continue;
3998 }
3999 /* Read key */
4000 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4001 /* Read value */
4002 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4003 /* Add the new object in the hash table */
4004 retval = dictAdd(d,keyobj,o);
4005 if (retval == DICT_ERR) {
4006 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
4007 exit(1);
4008 }
4009 /* Set the expire time if needed */
4010 if (expiretime != -1) {
4011 setExpire(db,keyobj,expiretime);
4012 /* Delete this key if already expired */
4013 if (expiretime < now) deleteKey(db,keyobj);
4014 expiretime = -1;
4015 }
4016 keyobj = o = NULL;
4017 /* Handle swapping while loading big datasets when VM is on */
4018 loadedkeys++;
4019 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
4020 while (zmalloc_used_memory() > server.vm_max_memory) {
4021 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4022 }
4023 }
4024 }
4025 fclose(fp);
4026 return REDIS_OK;
4027
4028 eoferr: /* unexpected end of file is handled here with a fatal exit */
4029 if (keyobj) decrRefCount(keyobj);
4030 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4031 exit(1);
4032 return REDIS_ERR; /* Just to avoid warning */
4033 }
4034
4035 /*================================== Commands =============================== */
4036
4037 static void authCommand(redisClient *c) {
4038 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4039 c->authenticated = 1;
4040 addReply(c,shared.ok);
4041 } else {
4042 c->authenticated = 0;
4043 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4044 }
4045 }
4046
4047 static void pingCommand(redisClient *c) {
4048 addReply(c,shared.pong);
4049 }
4050
4051 static void echoCommand(redisClient *c) {
4052 addReplyBulk(c,c->argv[1]);
4053 }
4054
4055 /*=================================== Strings =============================== */
4056
4057 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4058 int retval;
4059 long seconds = 0; /* initialized to avoid an harmness warning */
4060
4061 if (expire) {
4062 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4063 return;
4064 if (seconds <= 0) {
4065 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4066 return;
4067 }
4068 }
4069
4070 if (nx) deleteIfVolatile(c->db,key);
4071 retval = dictAdd(c->db->dict,key,val);
4072 if (retval == DICT_ERR) {
4073 if (!nx) {
4074 /* If the key is about a swapped value, we want a new key object
4075 * to overwrite the old. So we delete the old key in the database.
4076 * This will also make sure that swap pages about the old object
4077 * will be marked as free. */
4078 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4079 incrRefCount(key);
4080 dictReplace(c->db->dict,key,val);
4081 incrRefCount(val);
4082 } else {
4083 addReply(c,shared.czero);
4084 return;
4085 }
4086 } else {
4087 incrRefCount(key);
4088 incrRefCount(val);
4089 }
4090 server.dirty++;
4091 removeExpire(c->db,key);
4092 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4093 addReply(c, nx ? shared.cone : shared.ok);
4094 }
4095
4096 static void setCommand(redisClient *c) {
4097 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4098 }
4099
4100 static void setnxCommand(redisClient *c) {
4101 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4102 }
4103
4104 static void setexCommand(redisClient *c) {
4105 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4106 }
4107
4108 static int getGenericCommand(redisClient *c) {
4109 robj *o;
4110
4111 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4112 return REDIS_OK;
4113
4114 if (o->type != REDIS_STRING) {
4115 addReply(c,shared.wrongtypeerr);
4116 return REDIS_ERR;
4117 } else {
4118 addReplyBulk(c,o);
4119 return REDIS_OK;
4120 }
4121 }
4122
4123 static void getCommand(redisClient *c) {
4124 getGenericCommand(c);
4125 }
4126
4127 static void getsetCommand(redisClient *c) {
4128 if (getGenericCommand(c) == REDIS_ERR) return;
4129 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4130 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4131 } else {
4132 incrRefCount(c->argv[1]);
4133 }
4134 incrRefCount(c->argv[2]);
4135 server.dirty++;
4136 removeExpire(c->db,c->argv[1]);
4137 }
4138
4139 static void mgetCommand(redisClient *c) {
4140 int j;
4141
4142 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4143 for (j = 1; j < c->argc; j++) {
4144 robj *o = lookupKeyRead(c->db,c->argv[j]);
4145 if (o == NULL) {
4146 addReply(c,shared.nullbulk);
4147 } else {
4148 if (o->type != REDIS_STRING) {
4149 addReply(c,shared.nullbulk);
4150 } else {
4151 addReplyBulk(c,o);
4152 }
4153 }
4154 }
4155 }
4156
4157 static void msetGenericCommand(redisClient *c, int nx) {
4158 int j, busykeys = 0;
4159
4160 if ((c->argc % 2) == 0) {
4161 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4162 return;
4163 }
4164 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4165 * set nothing at all if at least one already key exists. */
4166 if (nx) {
4167 for (j = 1; j < c->argc; j += 2) {
4168 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4169 busykeys++;
4170 }
4171 }
4172 }
4173 if (busykeys) {
4174 addReply(c, shared.czero);
4175 return;
4176 }
4177
4178 for (j = 1; j < c->argc; j += 2) {
4179 int retval;
4180
4181 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4182 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4183 if (retval == DICT_ERR) {
4184 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4185 incrRefCount(c->argv[j+1]);
4186 } else {
4187 incrRefCount(c->argv[j]);
4188 incrRefCount(c->argv[j+1]);
4189 }
4190 removeExpire(c->db,c->argv[j]);
4191 }
4192 server.dirty += (c->argc-1)/2;
4193 addReply(c, nx ? shared.cone : shared.ok);
4194 }
4195
4196 static void msetCommand(redisClient *c) {
4197 msetGenericCommand(c,0);
4198 }
4199
4200 static void msetnxCommand(redisClient *c) {
4201 msetGenericCommand(c,1);
4202 }
4203
4204 static void incrDecrCommand(redisClient *c, long long incr) {
4205 long long value;
4206 int retval;
4207 robj *o;
4208
4209 o = lookupKeyWrite(c->db,c->argv[1]);
4210
4211 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4212
4213 value += incr;
4214 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4215 o = tryObjectEncoding(o);
4216 retval = dictAdd(c->db->dict,c->argv[1],o);
4217 if (retval == DICT_ERR) {
4218 dictReplace(c->db->dict,c->argv[1],o);
4219 removeExpire(c->db,c->argv[1]);
4220 } else {
4221 incrRefCount(c->argv[1]);
4222 }
4223 server.dirty++;
4224 addReply(c,shared.colon);
4225 addReply(c,o);
4226 addReply(c,shared.crlf);
4227 }
4228
4229 static void incrCommand(redisClient *c) {
4230 incrDecrCommand(c,1);
4231 }
4232
4233 static void decrCommand(redisClient *c) {
4234 incrDecrCommand(c,-1);
4235 }
4236
4237 static void incrbyCommand(redisClient *c) {
4238 long long incr;
4239
4240 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4241 incrDecrCommand(c,incr);
4242 }
4243
4244 static void decrbyCommand(redisClient *c) {
4245 long long incr;
4246
4247 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4248 incrDecrCommand(c,-incr);
4249 }
4250
4251 static void appendCommand(redisClient *c) {
4252 int retval;
4253 size_t totlen;
4254 robj *o;
4255
4256 o = lookupKeyWrite(c->db,c->argv[1]);
4257 if (o == NULL) {
4258 /* Create the key */
4259 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4260 incrRefCount(c->argv[1]);
4261 incrRefCount(c->argv[2]);
4262 totlen = stringObjectLen(c->argv[2]);
4263 } else {
4264 dictEntry *de;
4265
4266 de = dictFind(c->db->dict,c->argv[1]);
4267 assert(de != NULL);
4268
4269 o = dictGetEntryVal(de);
4270 if (o->type != REDIS_STRING) {
4271 addReply(c,shared.wrongtypeerr);
4272 return;
4273 }
4274 /* If the object is specially encoded or shared we have to make
4275 * a copy */
4276 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4277 robj *decoded = getDecodedObject(o);
4278
4279 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4280 decrRefCount(decoded);
4281 dictReplace(c->db->dict,c->argv[1],o);
4282 }
4283 /* APPEND! */
4284 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4285 o->ptr = sdscatlen(o->ptr,
4286 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4287 } else {
4288 o->ptr = sdscatprintf(o->ptr, "%ld",
4289 (unsigned long) c->argv[2]->ptr);
4290 }
4291 totlen = sdslen(o->ptr);
4292 }
4293 server.dirty++;
4294 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4295 }
4296
4297 static void substrCommand(redisClient *c) {
4298 robj *o;
4299 long start = atoi(c->argv[2]->ptr);
4300 long end = atoi(c->argv[3]->ptr);
4301 size_t rangelen, strlen;
4302 sds range;
4303
4304 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4305 checkType(c,o,REDIS_STRING)) return;
4306
4307 o = getDecodedObject(o);
4308 strlen = sdslen(o->ptr);
4309
4310 /* convert negative indexes */
4311 if (start < 0) start = strlen+start;
4312 if (end < 0) end = strlen+end;
4313 if (start < 0) start = 0;
4314 if (end < 0) end = 0;
4315
4316 /* indexes sanity checks */
4317 if (start > end || (size_t)start >= strlen) {
4318 /* Out of range start or start > end result in null reply */
4319 addReply(c,shared.nullbulk);
4320 decrRefCount(o);
4321 return;
4322 }
4323 if ((size_t)end >= strlen) end = strlen-1;
4324 rangelen = (end-start)+1;
4325
4326 /* Return the result */
4327 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4328 range = sdsnewlen((char*)o->ptr+start,rangelen);
4329 addReplySds(c,range);
4330 addReply(c,shared.crlf);
4331 decrRefCount(o);
4332 }
4333
4334 /* ========================= Type agnostic commands ========================= */
4335
4336 static void delCommand(redisClient *c) {
4337 int deleted = 0, j;
4338
4339 for (j = 1; j < c->argc; j++) {
4340 if (deleteKey(c->db,c->argv[j])) {
4341 server.dirty++;
4342 deleted++;
4343 }
4344 }
4345 addReplyLong(c,deleted);
4346 }
4347
4348 static void existsCommand(redisClient *c) {
4349 expireIfNeeded(c->db,c->argv[1]);
4350 if (dictFind(c->db->dict,c->argv[1])) {
4351 addReply(c, shared.cone);
4352 } else {
4353 addReply(c, shared.czero);
4354 }
4355 }
4356
4357 static void selectCommand(redisClient *c) {
4358 int id = atoi(c->argv[1]->ptr);
4359
4360 if (selectDb(c,id) == REDIS_ERR) {
4361 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4362 } else {
4363 addReply(c,shared.ok);
4364 }
4365 }
4366
4367 static void randomkeyCommand(redisClient *c) {
4368 dictEntry *de;
4369 robj *key;
4370
4371 while(1) {
4372 de = dictGetRandomKey(c->db->dict);
4373 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4374 }
4375
4376 if (de == NULL) {
4377 addReply(c,shared.nullbulk);
4378 return;
4379 }
4380
4381 key = dictGetEntryKey(de);
4382 if (server.vm_enabled) {
4383 key = dupStringObject(key);
4384 addReplyBulk(c,key);
4385 decrRefCount(key);
4386 } else {
4387 addReplyBulk(c,key);
4388 }
4389 }
4390
4391 static void keysCommand(redisClient *c) {
4392 dictIterator *di;
4393 dictEntry *de;
4394 sds pattern = c->argv[1]->ptr;
4395 int plen = sdslen(pattern);
4396 unsigned long numkeys = 0;
4397 robj *lenobj = createObject(REDIS_STRING,NULL);
4398
4399 di = dictGetIterator(c->db->dict);
4400 addReply(c,lenobj);
4401 decrRefCount(lenobj);
4402 while((de = dictNext(di)) != NULL) {
4403 robj *keyobj = dictGetEntryKey(de);
4404
4405 sds key = keyobj->ptr;
4406 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4407 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4408 if (expireIfNeeded(c->db,keyobj) == 0) {
4409 addReplyBulk(c,keyobj);
4410 numkeys++;
4411 }
4412 }
4413 }
4414 dictReleaseIterator(di);
4415 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4416 }
4417
4418 static void dbsizeCommand(redisClient *c) {
4419 addReplySds(c,
4420 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4421 }
4422
4423 static void lastsaveCommand(redisClient *c) {
4424 addReplySds(c,
4425 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4426 }
4427
4428 static void typeCommand(redisClient *c) {
4429 robj *o;
4430 char *type;
4431
4432 o = lookupKeyRead(c->db,c->argv[1]);
4433 if (o == NULL) {
4434 type = "+none";
4435 } else {
4436 switch(o->type) {
4437 case REDIS_STRING: type = "+string"; break;
4438 case REDIS_LIST: type = "+list"; break;
4439 case REDIS_SET: type = "+set"; break;
4440 case REDIS_ZSET: type = "+zset"; break;
4441 case REDIS_HASH: type = "+hash"; break;
4442 default: type = "+unknown"; break;
4443 }
4444 }
4445 addReplySds(c,sdsnew(type));
4446 addReply(c,shared.crlf);
4447 }
4448
4449 static void saveCommand(redisClient *c) {
4450 if (server.bgsavechildpid != -1) {
4451 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4452 return;
4453 }
4454 if (rdbSave(server.dbfilename) == REDIS_OK) {
4455 addReply(c,shared.ok);
4456 } else {
4457 addReply(c,shared.err);
4458 }
4459 }
4460
4461 static void bgsaveCommand(redisClient *c) {
4462 if (server.bgsavechildpid != -1) {
4463 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4464 return;
4465 }
4466 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4467 char *status = "+Background saving started\r\n";
4468 addReplySds(c,sdsnew(status));
4469 } else {
4470 addReply(c,shared.err);
4471 }
4472 }
4473
4474 static void shutdownCommand(redisClient *c) {
4475 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4476 /* Kill the saving child if there is a background saving in progress.
4477 We want to avoid race conditions, for instance our saving child may
4478 overwrite the synchronous saving did by SHUTDOWN. */
4479 if (server.bgsavechildpid != -1) {
4480 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4481 kill(server.bgsavechildpid,SIGKILL);
4482 rdbRemoveTempFile(server.bgsavechildpid);
4483 }
4484 if (server.appendonly) {
4485 /* Append only file: fsync() the AOF and exit */
4486 fsync(server.appendfd);
4487 if (server.vm_enabled) unlink(server.vm_swap_file);
4488 exit(0);
4489 } else {
4490 /* Snapshotting. Perform a SYNC SAVE and exit */
4491 if (rdbSave(server.dbfilename) == REDIS_OK) {
4492 if (server.daemonize)
4493 unlink(server.pidfile);
4494 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4495 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4496 exit(0);
4497 } else {
4498 /* Ooops.. error saving! The best we can do is to continue
4499 * operating. Note that if there was a background saving process,
4500 * in the next cron() Redis will be notified that the background
4501 * saving aborted, handling special stuff like slaves pending for
4502 * synchronization... */
4503 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4504 addReplySds(c,
4505 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4506 }
4507 }
4508 }
4509
4510 static void renameGenericCommand(redisClient *c, int nx) {
4511 robj *o;
4512
4513 /* To use the same key as src and dst is probably an error */
4514 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4515 addReply(c,shared.sameobjecterr);
4516 return;
4517 }
4518
4519 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4520 return;
4521
4522 incrRefCount(o);
4523 deleteIfVolatile(c->db,c->argv[2]);
4524 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4525 if (nx) {
4526 decrRefCount(o);
4527 addReply(c,shared.czero);
4528 return;
4529 }
4530 dictReplace(c->db->dict,c->argv[2],o);
4531 } else {
4532 incrRefCount(c->argv[2]);
4533 }
4534 deleteKey(c->db,c->argv[1]);
4535 server.dirty++;
4536 addReply(c,nx ? shared.cone : shared.ok);
4537 }
4538
4539 static void renameCommand(redisClient *c) {
4540 renameGenericCommand(c,0);
4541 }
4542
4543 static void renamenxCommand(redisClient *c) {
4544 renameGenericCommand(c,1);
4545 }
4546
4547 static void moveCommand(redisClient *c) {
4548 robj *o;
4549 redisDb *src, *dst;
4550 int srcid;
4551
4552 /* Obtain source and target DB pointers */
4553 src = c->db;
4554 srcid = c->db->id;
4555 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4556 addReply(c,shared.outofrangeerr);
4557 return;
4558 }
4559 dst = c->db;
4560 selectDb(c,srcid); /* Back to the source DB */
4561
4562 /* If the user is moving using as target the same
4563 * DB as the source DB it is probably an error. */
4564 if (src == dst) {
4565 addReply(c,shared.sameobjecterr);
4566 return;
4567 }
4568
4569 /* Check if the element exists and get a reference */
4570 o = lookupKeyWrite(c->db,c->argv[1]);
4571 if (!o) {
4572 addReply(c,shared.czero);
4573 return;
4574 }
4575
4576 /* Try to add the element to the target DB */
4577 deleteIfVolatile(dst,c->argv[1]);
4578 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4579 addReply(c,shared.czero);
4580 return;
4581 }
4582 incrRefCount(c->argv[1]);
4583 incrRefCount(o);
4584
4585 /* OK! key moved, free the entry in the source DB */
4586 deleteKey(src,c->argv[1]);
4587 server.dirty++;
4588 addReply(c,shared.cone);
4589 }
4590
4591 /* =================================== Lists ================================ */
4592 static void pushGenericCommand(redisClient *c, int where) {
4593 robj *lobj;
4594 list *list;
4595
4596 lobj = lookupKeyWrite(c->db,c->argv[1]);
4597 if (lobj == NULL) {
4598 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4599 addReply(c,shared.cone);
4600 return;
4601 }
4602 lobj = createListObject();
4603 list = lobj->ptr;
4604 if (where == REDIS_HEAD) {
4605 listAddNodeHead(list,c->argv[2]);
4606 } else {
4607 listAddNodeTail(list,c->argv[2]);
4608 }
4609 dictAdd(c->db->dict,c->argv[1],lobj);
4610 incrRefCount(c->argv[1]);
4611 incrRefCount(c->argv[2]);
4612 } else {
4613 if (lobj->type != REDIS_LIST) {
4614 addReply(c,shared.wrongtypeerr);
4615 return;
4616 }
4617 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4618 addReply(c,shared.cone);
4619 return;
4620 }
4621 list = lobj->ptr;
4622 if (where == REDIS_HEAD) {
4623 listAddNodeHead(list,c->argv[2]);
4624 } else {
4625 listAddNodeTail(list,c->argv[2]);
4626 }
4627 incrRefCount(c->argv[2]);
4628 }
4629 server.dirty++;
4630 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4631 }
4632
4633 static void lpushCommand(redisClient *c) {
4634 pushGenericCommand(c,REDIS_HEAD);
4635 }
4636
4637 static void rpushCommand(redisClient *c) {
4638 pushGenericCommand(c,REDIS_TAIL);
4639 }
4640
4641 static void llenCommand(redisClient *c) {
4642 robj *o;
4643 list *l;
4644
4645 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4646 checkType(c,o,REDIS_LIST)) return;
4647
4648 l = o->ptr;
4649 addReplyUlong(c,listLength(l));
4650 }
4651
4652 static void lindexCommand(redisClient *c) {
4653 robj *o;
4654 int index = atoi(c->argv[2]->ptr);
4655 list *list;
4656 listNode *ln;
4657
4658 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4659 checkType(c,o,REDIS_LIST)) return;
4660 list = o->ptr;
4661
4662 ln = listIndex(list, index);
4663 if (ln == NULL) {
4664 addReply(c,shared.nullbulk);
4665 } else {
4666 robj *ele = listNodeValue(ln);
4667 addReplyBulk(c,ele);
4668 }
4669 }
4670
4671 static void lsetCommand(redisClient *c) {
4672 robj *o;
4673 int index = atoi(c->argv[2]->ptr);
4674 list *list;
4675 listNode *ln;
4676
4677 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4678 checkType(c,o,REDIS_LIST)) return;
4679 list = o->ptr;
4680
4681 ln = listIndex(list, index);
4682 if (ln == NULL) {
4683 addReply(c,shared.outofrangeerr);
4684 } else {
4685 robj *ele = listNodeValue(ln);
4686
4687 decrRefCount(ele);
4688 listNodeValue(ln) = c->argv[3];
4689 incrRefCount(c->argv[3]);
4690 addReply(c,shared.ok);
4691 server.dirty++;
4692 }
4693 }
4694
4695 static void popGenericCommand(redisClient *c, int where) {
4696 robj *o;
4697 list *list;
4698 listNode *ln;
4699
4700 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4701 checkType(c,o,REDIS_LIST)) return;
4702 list = o->ptr;
4703
4704 if (where == REDIS_HEAD)
4705 ln = listFirst(list);
4706 else
4707 ln = listLast(list);
4708
4709 if (ln == NULL) {
4710 addReply(c,shared.nullbulk);
4711 } else {
4712 robj *ele = listNodeValue(ln);
4713 addReplyBulk(c,ele);
4714 listDelNode(list,ln);
4715 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4716 server.dirty++;
4717 }
4718 }
4719
4720 static void lpopCommand(redisClient *c) {
4721 popGenericCommand(c,REDIS_HEAD);
4722 }
4723
4724 static void rpopCommand(redisClient *c) {
4725 popGenericCommand(c,REDIS_TAIL);
4726 }
4727
4728 static void lrangeCommand(redisClient *c) {
4729 robj *o;
4730 int start = atoi(c->argv[2]->ptr);
4731 int end = atoi(c->argv[3]->ptr);
4732 int llen;
4733 int rangelen, j;
4734 list *list;
4735 listNode *ln;
4736 robj *ele;
4737
4738 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4739 || checkType(c,o,REDIS_LIST)) return;
4740 list = o->ptr;
4741 llen = listLength(list);
4742
4743 /* convert negative indexes */
4744 if (start < 0) start = llen+start;
4745 if (end < 0) end = llen+end;
4746 if (start < 0) start = 0;
4747 if (end < 0) end = 0;
4748
4749 /* indexes sanity checks */
4750 if (start > end || start >= llen) {
4751 /* Out of range start or start > end result in empty list */
4752 addReply(c,shared.emptymultibulk);
4753 return;
4754 }
4755 if (end >= llen) end = llen-1;
4756 rangelen = (end-start)+1;
4757
4758 /* Return the result in form of a multi-bulk reply */
4759 ln = listIndex(list, start);
4760 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4761 for (j = 0; j < rangelen; j++) {
4762 ele = listNodeValue(ln);
4763 addReplyBulk(c,ele);
4764 ln = ln->next;
4765 }
4766 }
4767
4768 static void ltrimCommand(redisClient *c) {
4769 robj *o;
4770 int start = atoi(c->argv[2]->ptr);
4771 int end = atoi(c->argv[3]->ptr);
4772 int llen;
4773 int j, ltrim, rtrim;
4774 list *list;
4775 listNode *ln;
4776
4777 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4778 checkType(c,o,REDIS_LIST)) return;
4779 list = o->ptr;
4780 llen = listLength(list);
4781
4782 /* convert negative indexes */
4783 if (start < 0) start = llen+start;
4784 if (end < 0) end = llen+end;
4785 if (start < 0) start = 0;
4786 if (end < 0) end = 0;
4787
4788 /* indexes sanity checks */
4789 if (start > end || start >= llen) {
4790 /* Out of range start or start > end result in empty list */
4791 ltrim = llen;
4792 rtrim = 0;
4793 } else {
4794 if (end >= llen) end = llen-1;
4795 ltrim = start;
4796 rtrim = llen-end-1;
4797 }
4798
4799 /* Remove list elements to perform the trim */
4800 for (j = 0; j < ltrim; j++) {
4801 ln = listFirst(list);
4802 listDelNode(list,ln);
4803 }
4804 for (j = 0; j < rtrim; j++) {
4805 ln = listLast(list);
4806 listDelNode(list,ln);
4807 }
4808 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4809 server.dirty++;
4810 addReply(c,shared.ok);
4811 }
4812
4813 static void lremCommand(redisClient *c) {
4814 robj *o;
4815 list *list;
4816 listNode *ln, *next;
4817 int toremove = atoi(c->argv[2]->ptr);
4818 int removed = 0;
4819 int fromtail = 0;
4820
4821 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4822 checkType(c,o,REDIS_LIST)) return;
4823 list = o->ptr;
4824
4825 if (toremove < 0) {
4826 toremove = -toremove;
4827 fromtail = 1;
4828 }
4829 ln = fromtail ? list->tail : list->head;
4830 while (ln) {
4831 robj *ele = listNodeValue(ln);
4832
4833 next = fromtail ? ln->prev : ln->next;
4834 if (compareStringObjects(ele,c->argv[3]) == 0) {
4835 listDelNode(list,ln);
4836 server.dirty++;
4837 removed++;
4838 if (toremove && removed == toremove) break;
4839 }
4840 ln = next;
4841 }
4842 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4843 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4844 }
4845
4846 /* This is the semantic of this command:
4847 * RPOPLPUSH srclist dstlist:
4848 * IF LLEN(srclist) > 0
4849 * element = RPOP srclist
4850 * LPUSH dstlist element
4851 * RETURN element
4852 * ELSE
4853 * RETURN nil
4854 * END
4855 * END
4856 *
4857 * The idea is to be able to get an element from a list in a reliable way
4858 * since the element is not just returned but pushed against another list
4859 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4860 */
4861 static void rpoplpushcommand(redisClient *c) {
4862 robj *sobj;
4863 list *srclist;
4864 listNode *ln;
4865
4866 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4867 checkType(c,sobj,REDIS_LIST)) return;
4868 srclist = sobj->ptr;
4869 ln = listLast(srclist);
4870
4871 if (ln == NULL) {
4872 addReply(c,shared.nullbulk);
4873 } else {
4874 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4875 robj *ele = listNodeValue(ln);
4876 list *dstlist;
4877
4878 if (dobj && dobj->type != REDIS_LIST) {
4879 addReply(c,shared.wrongtypeerr);
4880 return;
4881 }
4882
4883 /* Add the element to the target list (unless it's directly
4884 * passed to some BLPOP-ing client */
4885 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4886 if (dobj == NULL) {
4887 /* Create the list if the key does not exist */
4888 dobj = createListObject();
4889 dictAdd(c->db->dict,c->argv[2],dobj);
4890 incrRefCount(c->argv[2]);
4891 }
4892 dstlist = dobj->ptr;
4893 listAddNodeHead(dstlist,ele);
4894 incrRefCount(ele);
4895 }
4896
4897 /* Send the element to the client as reply as well */
4898 addReplyBulk(c,ele);
4899
4900 /* Finally remove the element from the source list */
4901 listDelNode(srclist,ln);
4902 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4903 server.dirty++;
4904 }
4905 }
4906
4907 /* ==================================== Sets ================================ */
4908
4909 static void saddCommand(redisClient *c) {
4910 robj *set;
4911
4912 set = lookupKeyWrite(c->db,c->argv[1]);
4913 if (set == NULL) {
4914 set = createSetObject();
4915 dictAdd(c->db->dict,c->argv[1],set);
4916 incrRefCount(c->argv[1]);
4917 } else {
4918 if (set->type != REDIS_SET) {
4919 addReply(c,shared.wrongtypeerr);
4920 return;
4921 }
4922 }
4923 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4924 incrRefCount(c->argv[2]);
4925 server.dirty++;
4926 addReply(c,shared.cone);
4927 } else {
4928 addReply(c,shared.czero);
4929 }
4930 }
4931
4932 static void sremCommand(redisClient *c) {
4933 robj *set;
4934
4935 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4936 checkType(c,set,REDIS_SET)) return;
4937
4938 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4939 server.dirty++;
4940 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4941 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4942 addReply(c,shared.cone);
4943 } else {
4944 addReply(c,shared.czero);
4945 }
4946 }
4947
4948 static void smoveCommand(redisClient *c) {
4949 robj *srcset, *dstset;
4950
4951 srcset = lookupKeyWrite(c->db,c->argv[1]);
4952 dstset = lookupKeyWrite(c->db,c->argv[2]);
4953
4954 /* If the source key does not exist return 0, if it's of the wrong type
4955 * raise an error */
4956 if (srcset == NULL || srcset->type != REDIS_SET) {
4957 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4958 return;
4959 }
4960 /* Error if the destination key is not a set as well */
4961 if (dstset && dstset->type != REDIS_SET) {
4962 addReply(c,shared.wrongtypeerr);
4963 return;
4964 }
4965 /* Remove the element from the source set */
4966 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4967 /* Key not found in the src set! return zero */
4968 addReply(c,shared.czero);
4969 return;
4970 }
4971 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4972 deleteKey(c->db,c->argv[1]);
4973 server.dirty++;
4974 /* Add the element to the destination set */
4975 if (!dstset) {
4976 dstset = createSetObject();
4977 dictAdd(c->db->dict,c->argv[2],dstset);
4978 incrRefCount(c->argv[2]);
4979 }
4980 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4981 incrRefCount(c->argv[3]);
4982 addReply(c,shared.cone);
4983 }
4984
4985 static void sismemberCommand(redisClient *c) {
4986 robj *set;
4987
4988 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4989 checkType(c,set,REDIS_SET)) return;
4990
4991 if (dictFind(set->ptr,c->argv[2]))
4992 addReply(c,shared.cone);
4993 else
4994 addReply(c,shared.czero);
4995 }
4996
4997 static void scardCommand(redisClient *c) {
4998 robj *o;
4999 dict *s;
5000
5001 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5002 checkType(c,o,REDIS_SET)) return;
5003
5004 s = o->ptr;
5005 addReplyUlong(c,dictSize(s));
5006 }
5007
5008 static void spopCommand(redisClient *c) {
5009 robj *set;
5010 dictEntry *de;
5011
5012 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5013 checkType(c,set,REDIS_SET)) return;
5014
5015 de = dictGetRandomKey(set->ptr);
5016 if (de == NULL) {
5017 addReply(c,shared.nullbulk);
5018 } else {
5019 robj *ele = dictGetEntryKey(de);
5020
5021 addReplyBulk(c,ele);
5022 dictDelete(set->ptr,ele);
5023 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5024 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5025 server.dirty++;
5026 }
5027 }
5028
5029 static void srandmemberCommand(redisClient *c) {
5030 robj *set;
5031 dictEntry *de;
5032
5033 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5034 checkType(c,set,REDIS_SET)) return;
5035
5036 de = dictGetRandomKey(set->ptr);
5037 if (de == NULL) {
5038 addReply(c,shared.nullbulk);
5039 } else {
5040 robj *ele = dictGetEntryKey(de);
5041
5042 addReplyBulk(c,ele);
5043 }
5044 }
5045
5046 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5047 dict **d1 = (void*) s1, **d2 = (void*) s2;
5048
5049 return dictSize(*d1)-dictSize(*d2);
5050 }
5051
5052 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5053 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5054 dictIterator *di;
5055 dictEntry *de;
5056 robj *lenobj = NULL, *dstset = NULL;
5057 unsigned long j, cardinality = 0;
5058
5059 for (j = 0; j < setsnum; j++) {
5060 robj *setobj;
5061
5062 setobj = dstkey ?
5063 lookupKeyWrite(c->db,setskeys[j]) :
5064 lookupKeyRead(c->db,setskeys[j]);
5065 if (!setobj) {
5066 zfree(dv);
5067 if (dstkey) {
5068 if (deleteKey(c->db,dstkey))
5069 server.dirty++;
5070 addReply(c,shared.czero);
5071 } else {
5072 addReply(c,shared.emptymultibulk);
5073 }
5074 return;
5075 }
5076 if (setobj->type != REDIS_SET) {
5077 zfree(dv);
5078 addReply(c,shared.wrongtypeerr);
5079 return;
5080 }
5081 dv[j] = setobj->ptr;
5082 }
5083 /* Sort sets from the smallest to largest, this will improve our
5084 * algorithm's performace */
5085 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5086
5087 /* The first thing we should output is the total number of elements...
5088 * since this is a multi-bulk write, but at this stage we don't know
5089 * the intersection set size, so we use a trick, append an empty object
5090 * to the output list and save the pointer to later modify it with the
5091 * right length */
5092 if (!dstkey) {
5093 lenobj = createObject(REDIS_STRING,NULL);
5094 addReply(c,lenobj);
5095 decrRefCount(lenobj);
5096 } else {
5097 /* If we have a target key where to store the resulting set
5098 * create this key with an empty set inside */
5099 dstset = createSetObject();
5100 }
5101
5102 /* Iterate all the elements of the first (smallest) set, and test
5103 * the element against all the other sets, if at least one set does
5104 * not include the element it is discarded */
5105 di = dictGetIterator(dv[0]);
5106
5107 while((de = dictNext(di)) != NULL) {
5108 robj *ele;
5109
5110 for (j = 1; j < setsnum; j++)
5111 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5112 if (j != setsnum)
5113 continue; /* at least one set does not contain the member */
5114 ele = dictGetEntryKey(de);
5115 if (!dstkey) {
5116 addReplyBulk(c,ele);
5117 cardinality++;
5118 } else {
5119 dictAdd(dstset->ptr,ele,NULL);
5120 incrRefCount(ele);
5121 }
5122 }
5123 dictReleaseIterator(di);
5124
5125 if (dstkey) {
5126 /* Store the resulting set into the target, if the intersection
5127 * is not an empty set. */
5128 deleteKey(c->db,dstkey);
5129 if (dictSize((dict*)dstset->ptr) > 0) {
5130 dictAdd(c->db->dict,dstkey,dstset);
5131 incrRefCount(dstkey);
5132 addReplyLong(c,dictSize((dict*)dstset->ptr));
5133 } else {
5134 decrRefCount(dstset);
5135 addReply(c,shared.czero);
5136 }
5137 server.dirty++;
5138 } else {
5139 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5140 }
5141 zfree(dv);
5142 }
5143
5144 static void sinterCommand(redisClient *c) {
5145 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5146 }
5147
5148 static void sinterstoreCommand(redisClient *c) {
5149 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5150 }
5151
5152 #define REDIS_OP_UNION 0
5153 #define REDIS_OP_DIFF 1
5154 #define REDIS_OP_INTER 2
5155
5156 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5157 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5158 dictIterator *di;
5159 dictEntry *de;
5160 robj *dstset = NULL;
5161 int j, cardinality = 0;
5162
5163 for (j = 0; j < setsnum; j++) {
5164 robj *setobj;
5165
5166 setobj = dstkey ?
5167 lookupKeyWrite(c->db,setskeys[j]) :
5168 lookupKeyRead(c->db,setskeys[j]);
5169 if (!setobj) {
5170 dv[j] = NULL;
5171 continue;
5172 }
5173 if (setobj->type != REDIS_SET) {
5174 zfree(dv);
5175 addReply(c,shared.wrongtypeerr);
5176 return;
5177 }
5178 dv[j] = setobj->ptr;
5179 }
5180
5181 /* We need a temp set object to store our union. If the dstkey
5182 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5183 * this set object will be the resulting object to set into the target key*/
5184 dstset = createSetObject();
5185
5186 /* Iterate all the elements of all the sets, add every element a single
5187 * time to the result set */
5188 for (j = 0; j < setsnum; j++) {
5189 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5190 if (!dv[j]) continue; /* non existing keys are like empty sets */
5191
5192 di = dictGetIterator(dv[j]);
5193
5194 while((de = dictNext(di)) != NULL) {
5195 robj *ele;
5196
5197 /* dictAdd will not add the same element multiple times */
5198 ele = dictGetEntryKey(de);
5199 if (op == REDIS_OP_UNION || j == 0) {
5200 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5201 incrRefCount(ele);
5202 cardinality++;
5203 }
5204 } else if (op == REDIS_OP_DIFF) {
5205 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5206 cardinality--;
5207 }
5208 }
5209 }
5210 dictReleaseIterator(di);
5211
5212 /* result set is empty? Exit asap. */
5213 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5214 }
5215
5216 /* Output the content of the resulting set, if not in STORE mode */
5217 if (!dstkey) {
5218 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5219 di = dictGetIterator(dstset->ptr);
5220 while((de = dictNext(di)) != NULL) {
5221 robj *ele;
5222
5223 ele = dictGetEntryKey(de);
5224 addReplyBulk(c,ele);
5225 }
5226 dictReleaseIterator(di);
5227 decrRefCount(dstset);
5228 } else {
5229 /* If we have a target key where to store the resulting set
5230 * create this key with the result set inside */
5231 deleteKey(c->db,dstkey);
5232 if (dictSize((dict*)dstset->ptr) > 0) {
5233 dictAdd(c->db->dict,dstkey,dstset);
5234 incrRefCount(dstkey);
5235 addReplyLong(c,dictSize((dict*)dstset->ptr));
5236 } else {
5237 decrRefCount(dstset);
5238 addReply(c,shared.czero);
5239 }
5240 server.dirty++;
5241 }
5242 zfree(dv);
5243 }
5244
5245 static void sunionCommand(redisClient *c) {
5246 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5247 }
5248
5249 static void sunionstoreCommand(redisClient *c) {
5250 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5251 }
5252
5253 static void sdiffCommand(redisClient *c) {
5254 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5255 }
5256
5257 static void sdiffstoreCommand(redisClient *c) {
5258 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5259 }
5260
5261 /* ==================================== ZSets =============================== */
5262
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5270
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows traversing the list
 * from tail to head, which is useful for ZREVRANGE. */
5279
5280 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5281 zskiplistNode *zn = zmalloc(sizeof(*zn));
5282
5283 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5284 if (level > 0)
5285 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5286 zn->score = score;
5287 zn->obj = obj;
5288 return zn;
5289 }
5290
5291 static zskiplist *zslCreate(void) {
5292 int j;
5293 zskiplist *zsl;
5294
5295 zsl = zmalloc(sizeof(*zsl));
5296 zsl->level = 1;
5297 zsl->length = 0;
5298 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5299 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5300 zsl->header->forward[j] = NULL;
5301
5302 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5303 if (j < ZSKIPLIST_MAXLEVEL-1)
5304 zsl->header->span[j] = 0;
5305 }
5306 zsl->header->backward = NULL;
5307 zsl->tail = NULL;
5308 return zsl;
5309 }
5310
5311 static void zslFreeNode(zskiplistNode *node) {
5312 decrRefCount(node->obj);
5313 zfree(node->forward);
5314 zfree(node->span);
5315 zfree(node);
5316 }
5317
5318 static void zslFree(zskiplist *zsl) {
5319 zskiplistNode *node = zsl->header->forward[0], *next;
5320
5321 zfree(zsl->header->forward);
5322 zfree(zsl->header->span);
5323 zfree(zsl->header);
5324 while(node) {
5325 next = node->forward[0];
5326 zslFreeNode(node);
5327 node = next;
5328 }
5329 zfree(zsl);
5330 }
5331
5332 static int zslRandomLevel(void) {
5333 int level = 1;
5334 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5335 level += 1;
5336 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5337 }
5338
5339 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5340 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5341 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5342 int i, level;
5343
5344 x = zsl->header;
5345 for (i = zsl->level-1; i >= 0; i--) {
5346 /* store rank that is crossed to reach the insert position */
5347 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5348
5349 while (x->forward[i] &&
5350 (x->forward[i]->score < score ||
5351 (x->forward[i]->score == score &&
5352 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5353 rank[i] += i > 0 ? x->span[i-1] : 1;
5354 x = x->forward[i];
5355 }
5356 update[i] = x;
5357 }
5358 /* we assume the key is not already inside, since we allow duplicated
5359 * scores, and the re-insertion of score and redis object should never
5360 * happpen since the caller of zslInsert() should test in the hash table
5361 * if the element is already inside or not. */
5362 level = zslRandomLevel();
5363 if (level > zsl->level) {
5364 for (i = zsl->level; i < level; i++) {
5365 rank[i] = 0;
5366 update[i] = zsl->header;
5367 update[i]->span[i-1] = zsl->length;
5368 }
5369 zsl->level = level;
5370 }
5371 x = zslCreateNode(level,score,obj);
5372 for (i = 0; i < level; i++) {
5373 x->forward[i] = update[i]->forward[i];
5374 update[i]->forward[i] = x;
5375
5376 /* update span covered by update[i] as x is inserted here */
5377 if (i > 0) {
5378 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5379 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5380 }
5381 }
5382
5383 /* increment span for untouched levels */
5384 for (i = level; i < zsl->level; i++) {
5385 update[i]->span[i-1]++;
5386 }
5387
5388 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5389 if (x->forward[0])
5390 x->forward[0]->backward = x;
5391 else
5392 zsl->tail = x;
5393 zsl->length++;
5394 }
5395
5396 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5397 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5398 int i;
5399 for (i = 0; i < zsl->level; i++) {
5400 if (update[i]->forward[i] == x) {
5401 if (i > 0) {
5402 update[i]->span[i-1] += x->span[i-1] - 1;
5403 }
5404 update[i]->forward[i] = x->forward[i];
5405 } else {
5406 /* invariant: i > 0, because update[0]->forward[0]
5407 * is always equal to x */
5408 update[i]->span[i-1] -= 1;
5409 }
5410 }
5411 if (x->forward[0]) {
5412 x->forward[0]->backward = x->backward;
5413 } else {
5414 zsl->tail = x->backward;
5415 }
5416 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5417 zsl->level--;
5418 zsl->length--;
5419 }
5420
5421 /* Delete an element with matching score/object from the skiplist. */
5422 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5423 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5424 int i;
5425
5426 x = zsl->header;
5427 for (i = zsl->level-1; i >= 0; i--) {
5428 while (x->forward[i] &&
5429 (x->forward[i]->score < score ||
5430 (x->forward[i]->score == score &&
5431 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5432 x = x->forward[i];
5433 update[i] = x;
5434 }
5435 /* We may have multiple elements with the same score, what we need
5436 * is to find the element with both the right score and object. */
5437 x = x->forward[0];
5438 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5439 zslDeleteNode(zsl, x, update);
5440 zslFreeNode(x);
5441 return 1;
5442 } else {
5443 return 0; /* not found */
5444 }
5445 return 0; /* not found */
5446 }
5447
5448 /* Delete all the elements with score between min and max from the skiplist.
5449 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5450 * Note that this function takes the reference to the hash table view of the
5451 * sorted set, in order to remove the elements from the hash table too. */
5452 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5453 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5454 unsigned long removed = 0;
5455 int i;
5456
5457 x = zsl->header;
5458 for (i = zsl->level-1; i >= 0; i--) {
5459 while (x->forward[i] && x->forward[i]->score < min)
5460 x = x->forward[i];
5461 update[i] = x;
5462 }
5463 /* We may have multiple elements with the same score, what we need
5464 * is to find the element with both the right score and object. */
5465 x = x->forward[0];
5466 while (x && x->score <= max) {
5467 zskiplistNode *next = x->forward[0];
5468 zslDeleteNode(zsl, x, update);
5469 dictDelete(dict,x->obj);
5470 zslFreeNode(x);
5471 removed++;
5472 x = next;
5473 }
5474 return removed; /* not found */
5475 }
5476
5477 /* Delete all the elements with rank between start and end from the skiplist.
5478 * Start and end are inclusive. Note that start and end need to be 1-based */
5479 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5480 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5481 unsigned long traversed = 0, removed = 0;
5482 int i;
5483
5484 x = zsl->header;
5485 for (i = zsl->level-1; i >= 0; i--) {
5486 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5487 traversed += i > 0 ? x->span[i-1] : 1;
5488 x = x->forward[i];
5489 }
5490 update[i] = x;
5491 }
5492
5493 traversed++;
5494 x = x->forward[0];
5495 while (x && traversed <= end) {
5496 zskiplistNode *next = x->forward[0];
5497 zslDeleteNode(zsl, x, update);
5498 dictDelete(dict,x->obj);
5499 zslFreeNode(x);
5500 removed++;
5501 traversed++;
5502 x = next;
5503 }
5504 return removed;
5505 }
5506
5507 /* Find the first node having a score equal or greater than the specified one.
5508 * Returns NULL if there is no match. */
5509 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5510 zskiplistNode *x;
5511 int i;
5512
5513 x = zsl->header;
5514 for (i = zsl->level-1; i >= 0; i--) {
5515 while (x->forward[i] && x->forward[i]->score < score)
5516 x = x->forward[i];
5517 }
5518 /* We may have multiple elements with the same score, what we need
5519 * is to find the element with both the right score and object. */
5520 return x->forward[0];
5521 }
5522
5523 /* Find the rank for an element by both score and key.
5524 * Returns 0 when the element cannot be found, rank otherwise.
5525 * Note that the rank is 1-based due to the span of zsl->header to the
5526 * first element. */
5527 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5528 zskiplistNode *x;
5529 unsigned long rank = 0;
5530 int i;
5531
5532 x = zsl->header;
5533 for (i = zsl->level-1; i >= 0; i--) {
5534 while (x->forward[i] &&
5535 (x->forward[i]->score < score ||
5536 (x->forward[i]->score == score &&
5537 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5538 rank += i > 0 ? x->span[i-1] : 1;
5539 x = x->forward[i];
5540 }
5541
5542 /* x might be equal to zsl->header, so test if obj is non-NULL */
5543 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5544 return rank;
5545 }
5546 }
5547 return 0;
5548 }
5549
5550 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5551 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5552 zskiplistNode *x;
5553 unsigned long traversed = 0;
5554 int i;
5555
5556 x = zsl->header;
5557 for (i = zsl->level-1; i >= 0; i--) {
5558 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5559 {
5560 traversed += i > 0 ? x->span[i-1] : 1;
5561 x = x->forward[i];
5562 }
5563 if (traversed == rank) {
5564 return x;
5565 }
5566 }
5567 return NULL;
5568 }
5569
5570 /* The actual Z-commands implementations */
5571
5572 /* This generic command implements both ZADD and ZINCRBY.
5573 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5574 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5575 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5576 robj *zsetobj;
5577 zset *zs;
5578 double *score;
5579
5580 zsetobj = lookupKeyWrite(c->db,key);
5581 if (zsetobj == NULL) {
5582 zsetobj = createZsetObject();
5583 dictAdd(c->db->dict,key,zsetobj);
5584 incrRefCount(key);
5585 } else {
5586 if (zsetobj->type != REDIS_ZSET) {
5587 addReply(c,shared.wrongtypeerr);
5588 return;
5589 }
5590 }
5591 zs = zsetobj->ptr;
5592
5593 /* Ok now since we implement both ZADD and ZINCRBY here the code
5594 * needs to handle the two different conditions. It's all about setting
5595 * '*score', that is, the new score to set, to the right value. */
5596 score = zmalloc(sizeof(double));
5597 if (doincrement) {
5598 dictEntry *de;
5599
5600 /* Read the old score. If the element was not present starts from 0 */
5601 de = dictFind(zs->dict,ele);
5602 if (de) {
5603 double *oldscore = dictGetEntryVal(de);
5604 *score = *oldscore + scoreval;
5605 } else {
5606 *score = scoreval;
5607 }
5608 } else {
5609 *score = scoreval;
5610 }
5611
5612 /* What follows is a simple remove and re-insert operation that is common
5613 * to both ZADD and ZINCRBY... */
5614 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5615 /* case 1: New element */
5616 incrRefCount(ele); /* added to hash */
5617 zslInsert(zs->zsl,*score,ele);
5618 incrRefCount(ele); /* added to skiplist */
5619 server.dirty++;
5620 if (doincrement)
5621 addReplyDouble(c,*score);
5622 else
5623 addReply(c,shared.cone);
5624 } else {
5625 dictEntry *de;
5626 double *oldscore;
5627
5628 /* case 2: Score update operation */
5629 de = dictFind(zs->dict,ele);
5630 redisAssert(de != NULL);
5631 oldscore = dictGetEntryVal(de);
5632 if (*score != *oldscore) {
5633 int deleted;
5634
5635 /* Remove and insert the element in the skip list with new score */
5636 deleted = zslDelete(zs->zsl,*oldscore,ele);
5637 redisAssert(deleted != 0);
5638 zslInsert(zs->zsl,*score,ele);
5639 incrRefCount(ele);
5640 /* Update the score in the hash table */
5641 dictReplace(zs->dict,ele,score);
5642 server.dirty++;
5643 } else {
5644 zfree(score);
5645 }
5646 if (doincrement)
5647 addReplyDouble(c,*score);
5648 else
5649 addReply(c,shared.czero);
5650 }
5651 }
5652
5653 static void zaddCommand(redisClient *c) {
5654 double scoreval;
5655
5656 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5657 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5658 }
5659
5660 static void zincrbyCommand(redisClient *c) {
5661 double scoreval;
5662
5663 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5664 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5665 }
5666
5667 static void zremCommand(redisClient *c) {
5668 robj *zsetobj;
5669 zset *zs;
5670 dictEntry *de;
5671 double *oldscore;
5672 int deleted;
5673
5674 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5675 checkType(c,zsetobj,REDIS_ZSET)) return;
5676
5677 zs = zsetobj->ptr;
5678 de = dictFind(zs->dict,c->argv[2]);
5679 if (de == NULL) {
5680 addReply(c,shared.czero);
5681 return;
5682 }
5683 /* Delete from the skiplist */
5684 oldscore = dictGetEntryVal(de);
5685 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5686 redisAssert(deleted != 0);
5687
5688 /* Delete from the hash table */
5689 dictDelete(zs->dict,c->argv[2]);
5690 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5691 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5692 server.dirty++;
5693 addReply(c,shared.cone);
5694 }
5695
5696 static void zremrangebyscoreCommand(redisClient *c) {
5697 double min;
5698 double max;
5699 long deleted;
5700 robj *zsetobj;
5701 zset *zs;
5702
5703 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5704 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5705
5706 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5707 checkType(c,zsetobj,REDIS_ZSET)) return;
5708
5709 zs = zsetobj->ptr;
5710 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5711 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5712 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5713 server.dirty += deleted;
5714 addReplyLong(c,deleted);
5715 }
5716
5717 static void zremrangebyrankCommand(redisClient *c) {
5718 long start;
5719 long end;
5720 int llen;
5721 long deleted;
5722 robj *zsetobj;
5723 zset *zs;
5724
5725 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5726 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5727
5728 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5729 checkType(c,zsetobj,REDIS_ZSET)) return;
5730 zs = zsetobj->ptr;
5731 llen = zs->zsl->length;
5732
5733 /* convert negative indexes */
5734 if (start < 0) start = llen+start;
5735 if (end < 0) end = llen+end;
5736 if (start < 0) start = 0;
5737 if (end < 0) end = 0;
5738
5739 /* indexes sanity checks */
5740 if (start > end || start >= llen) {
5741 addReply(c,shared.czero);
5742 return;
5743 }
5744 if (end >= llen) end = llen-1;
5745
5746 /* increment start and end because zsl*Rank functions
5747 * use 1-based rank */
5748 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5749 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5750 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5751 server.dirty += deleted;
5752 addReplyLong(c, deleted);
5753 }
5754
5755 typedef struct {
5756 dict *dict;
5757 double weight;
5758 } zsetopsrc;
5759
5760 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5761 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5762 unsigned long size1, size2;
5763 size1 = d1->dict ? dictSize(d1->dict) : 0;
5764 size2 = d2->dict ? dictSize(d2->dict) : 0;
5765 return size1 - size2;
5766 }
5767
/* Ways of combining the scores of an element found in several sources. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into the running score '*target' according to 'aggregate'
 * (one of the REDIS_AGGR_* constants). An unknown aggregate type panics. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5784
5785 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5786 int i, j, zsetnum;
5787 int aggregate = REDIS_AGGR_SUM;
5788 zsetopsrc *src;
5789 robj *dstobj;
5790 zset *dstzset;
5791 dictIterator *di;
5792 dictEntry *de;
5793
5794 /* expect zsetnum input keys to be given */
5795 zsetnum = atoi(c->argv[2]->ptr);
5796 if (zsetnum < 1) {
5797 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5798 return;
5799 }
5800
5801 /* test if the expected number of keys would overflow */
5802 if (3+zsetnum > c->argc) {
5803 addReply(c,shared.syntaxerr);
5804 return;
5805 }
5806
5807 /* read keys to be used for input */
5808 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5809 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5810 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5811 if (!zsetobj) {
5812 src[i].dict = NULL;
5813 } else {
5814 if (zsetobj->type != REDIS_ZSET) {
5815 zfree(src);
5816 addReply(c,shared.wrongtypeerr);
5817 return;
5818 }
5819 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5820 }
5821
5822 /* default all weights to 1 */
5823 src[i].weight = 1.0;
5824 }
5825
5826 /* parse optional extra arguments */
5827 if (j < c->argc) {
5828 int remaining = c->argc - j;
5829
5830 while (remaining) {
5831 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5832 j++; remaining--;
5833 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5834 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5835 return;
5836 }
5837 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5838 j++; remaining--;
5839 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5840 aggregate = REDIS_AGGR_SUM;
5841 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5842 aggregate = REDIS_AGGR_MIN;
5843 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5844 aggregate = REDIS_AGGR_MAX;
5845 } else {
5846 zfree(src);
5847 addReply(c,shared.syntaxerr);
5848 return;
5849 }
5850 j++; remaining--;
5851 } else {
5852 zfree(src);
5853 addReply(c,shared.syntaxerr);
5854 return;
5855 }
5856 }
5857 }
5858
5859 /* sort sets from the smallest to largest, this will improve our
5860 * algorithm's performance */
5861 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5862
5863 dstobj = createZsetObject();
5864 dstzset = dstobj->ptr;
5865
5866 if (op == REDIS_OP_INTER) {
5867 /* skip going over all entries if the smallest zset is NULL or empty */
5868 if (src[0].dict && dictSize(src[0].dict) > 0) {
5869 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5870 * from small to large, all src[i > 0].dict are non-empty too */
5871 di = dictGetIterator(src[0].dict);
5872 while((de = dictNext(di)) != NULL) {
5873 double *score = zmalloc(sizeof(double)), value;
5874 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5875
5876 for (j = 1; j < zsetnum; j++) {
5877 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5878 if (other) {
5879 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5880 zunionInterAggregate(score, value, aggregate);
5881 } else {
5882 break;
5883 }
5884 }
5885
5886 /* skip entry when not present in every source dict */
5887 if (j != zsetnum) {
5888 zfree(score);
5889 } else {
5890 robj *o = dictGetEntryKey(de);
5891 dictAdd(dstzset->dict,o,score);
5892 incrRefCount(o); /* added to dictionary */
5893 zslInsert(dstzset->zsl,*score,o);
5894 incrRefCount(o); /* added to skiplist */
5895 }
5896 }
5897 dictReleaseIterator(di);
5898 }
5899 } else if (op == REDIS_OP_UNION) {
5900 for (i = 0; i < zsetnum; i++) {
5901 if (!src[i].dict) continue;
5902
5903 di = dictGetIterator(src[i].dict);
5904 while((de = dictNext(di)) != NULL) {
5905 /* skip key when already processed */
5906 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5907
5908 double *score = zmalloc(sizeof(double)), value;
5909 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5910
5911 /* because the zsets are sorted by size, its only possible
5912 * for sets at larger indices to hold this entry */
5913 for (j = (i+1); j < zsetnum; j++) {
5914 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5915 if (other) {
5916 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5917 zunionInterAggregate(score, value, aggregate);
5918 }
5919 }
5920
5921 robj *o = dictGetEntryKey(de);
5922 dictAdd(dstzset->dict,o,score);
5923 incrRefCount(o); /* added to dictionary */
5924 zslInsert(dstzset->zsl,*score,o);
5925 incrRefCount(o); /* added to skiplist */
5926 }
5927 dictReleaseIterator(di);
5928 }
5929 } else {
5930 /* unknown operator */
5931 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5932 }
5933
5934 deleteKey(c->db,dstkey);
5935 if (dstzset->zsl->length) {
5936 dictAdd(c->db->dict,dstkey,dstobj);
5937 incrRefCount(dstkey);
5938 addReplyLong(c, dstzset->zsl->length);
5939 server.dirty++;
5940 } else {
5941 decrRefCount(dstobj);
5942 addReply(c, shared.czero);
5943 }
5944 zfree(src);
5945 }
5946
5947 static void zunionCommand(redisClient *c) {
5948 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5949 }
5950
5951 static void zinterCommand(redisClient *c) {
5952 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5953 }
5954
5955 static void zrangeGenericCommand(redisClient *c, int reverse) {
5956 robj *o;
5957 long start;
5958 long end;
5959 int withscores = 0;
5960 int llen;
5961 int rangelen, j;
5962 zset *zsetobj;
5963 zskiplist *zsl;
5964 zskiplistNode *ln;
5965 robj *ele;
5966
5967 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5968 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5969
5970 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5971 withscores = 1;
5972 } else if (c->argc >= 5) {
5973 addReply(c,shared.syntaxerr);
5974 return;
5975 }
5976
5977 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5978 || checkType(c,o,REDIS_ZSET)) return;
5979 zsetobj = o->ptr;
5980 zsl = zsetobj->zsl;
5981 llen = zsl->length;
5982
5983 /* convert negative indexes */
5984 if (start < 0) start = llen+start;
5985 if (end < 0) end = llen+end;
5986 if (start < 0) start = 0;
5987 if (end < 0) end = 0;
5988
5989 /* indexes sanity checks */
5990 if (start > end || start >= llen) {
5991 /* Out of range start or start > end result in empty list */
5992 addReply(c,shared.emptymultibulk);
5993 return;
5994 }
5995 if (end >= llen) end = llen-1;
5996 rangelen = (end-start)+1;
5997
5998 /* check if starting point is trivial, before searching
5999 * the element in log(N) time */
6000 if (reverse) {
6001 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6002 } else {
6003 ln = start == 0 ?
6004 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6005 }
6006
6007 /* Return the result in form of a multi-bulk reply */
6008 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6009 withscores ? (rangelen*2) : rangelen));
6010 for (j = 0; j < rangelen; j++) {
6011 ele = ln->obj;
6012 addReplyBulk(c,ele);
6013 if (withscores)
6014 addReplyDouble(c,ln->score);
6015 ln = reverse ? ln->backward : ln->forward[0];
6016 }
6017 }
6018
6019 static void zrangeCommand(redisClient *c) {
6020 zrangeGenericCommand(c,0);
6021 }
6022
6023 static void zrevrangeCommand(redisClient *c) {
6024 zrangeGenericCommand(c,1);
6025 }
6026
6027 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6028 * If justcount is non-zero, just the count is returned. */
6029 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6030 robj *o;
6031 double min, max;
6032 int minex = 0, maxex = 0; /* are min or max exclusive? */
6033 int offset = 0, limit = -1;
6034 int withscores = 0;
6035 int badsyntax = 0;
6036
6037 /* Parse the min-max interval. If one of the values is prefixed
6038 * by the "(" character, it's considered "open". For instance
6039 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6040 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6041 if (((char*)c->argv[2]->ptr)[0] == '(') {
6042 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6043 minex = 1;
6044 } else {
6045 min = strtod(c->argv[2]->ptr,NULL);
6046 }
6047 if (((char*)c->argv[3]->ptr)[0] == '(') {
6048 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6049 maxex = 1;
6050 } else {
6051 max = strtod(c->argv[3]->ptr,NULL);
6052 }
6053
6054 /* Parse "WITHSCORES": note that if the command was called with
6055 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6056 * enter the following paths to parse WITHSCORES and LIMIT. */
6057 if (c->argc == 5 || c->argc == 8) {
6058 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6059 withscores = 1;
6060 else
6061 badsyntax = 1;
6062 }
6063 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6064 badsyntax = 1;
6065 if (badsyntax) {
6066 addReplySds(c,
6067 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6068 return;
6069 }
6070
6071 /* Parse "LIMIT" */
6072 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6073 addReply(c,shared.syntaxerr);
6074 return;
6075 } else if (c->argc == (7 + withscores)) {
6076 offset = atoi(c->argv[5]->ptr);
6077 limit = atoi(c->argv[6]->ptr);
6078 if (offset < 0) offset = 0;
6079 }
6080
6081 /* Ok, lookup the key and get the range */
6082 o = lookupKeyRead(c->db,c->argv[1]);
6083 if (o == NULL) {
6084 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6085 } else {
6086 if (o->type != REDIS_ZSET) {
6087 addReply(c,shared.wrongtypeerr);
6088 } else {
6089 zset *zsetobj = o->ptr;
6090 zskiplist *zsl = zsetobj->zsl;
6091 zskiplistNode *ln;
6092 robj *ele, *lenobj = NULL;
6093 unsigned long rangelen = 0;
6094
6095 /* Get the first node with the score >= min, or with
6096 * score > min if 'minex' is true. */
6097 ln = zslFirstWithScore(zsl,min);
6098 while (minex && ln && ln->score == min) ln = ln->forward[0];
6099
6100 if (ln == NULL) {
6101 /* No element matching the speciifed interval */
6102 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6103 return;
6104 }
6105
6106 /* We don't know in advance how many matching elements there
6107 * are in the list, so we push this object that will represent
6108 * the multi-bulk length in the output buffer, and will "fix"
6109 * it later */
6110 if (!justcount) {
6111 lenobj = createObject(REDIS_STRING,NULL);
6112 addReply(c,lenobj);
6113 decrRefCount(lenobj);
6114 }
6115
6116 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6117 if (offset) {
6118 offset--;
6119 ln = ln->forward[0];
6120 continue;
6121 }
6122 if (limit == 0) break;
6123 if (!justcount) {
6124 ele = ln->obj;
6125 addReplyBulk(c,ele);
6126 if (withscores)
6127 addReplyDouble(c,ln->score);
6128 }
6129 ln = ln->forward[0];
6130 rangelen++;
6131 if (limit > 0) limit--;
6132 }
6133 if (justcount) {
6134 addReplyLong(c,(long)rangelen);
6135 } else {
6136 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6137 withscores ? (rangelen*2) : rangelen);
6138 }
6139 }
6140 }
6141 }
6142
6143 static void zrangebyscoreCommand(redisClient *c) {
6144 genericZrangebyscoreCommand(c,0);
6145 }
6146
6147 static void zcountCommand(redisClient *c) {
6148 genericZrangebyscoreCommand(c,1);
6149 }
6150
6151 static void zcardCommand(redisClient *c) {
6152 robj *o;
6153 zset *zs;
6154
6155 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6156 checkType(c,o,REDIS_ZSET)) return;
6157
6158 zs = o->ptr;
6159 addReplyUlong(c,zs->zsl->length);
6160 }
6161
6162 static void zscoreCommand(redisClient *c) {
6163 robj *o;
6164 zset *zs;
6165 dictEntry *de;
6166
6167 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6168 checkType(c,o,REDIS_ZSET)) return;
6169
6170 zs = o->ptr;
6171 de = dictFind(zs->dict,c->argv[2]);
6172 if (!de) {
6173 addReply(c,shared.nullbulk);
6174 } else {
6175 double *score = dictGetEntryVal(de);
6176
6177 addReplyDouble(c,*score);
6178 }
6179 }
6180
6181 static void zrankGenericCommand(redisClient *c, int reverse) {
6182 robj *o;
6183 zset *zs;
6184 zskiplist *zsl;
6185 dictEntry *de;
6186 unsigned long rank;
6187 double *score;
6188
6189 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6190 checkType(c,o,REDIS_ZSET)) return;
6191
6192 zs = o->ptr;
6193 zsl = zs->zsl;
6194 de = dictFind(zs->dict,c->argv[2]);
6195 if (!de) {
6196 addReply(c,shared.nullbulk);
6197 return;
6198 }
6199
6200 score = dictGetEntryVal(de);
6201 rank = zslGetRank(zsl, *score, c->argv[2]);
6202 if (rank) {
6203 if (reverse) {
6204 addReplyLong(c, zsl->length - rank);
6205 } else {
6206 addReplyLong(c, rank-1);
6207 }
6208 } else {
6209 addReply(c,shared.nullbulk);
6210 }
6211 }
6212
6213 static void zrankCommand(redisClient *c) {
6214 zrankGenericCommand(c, 0);
6215 }
6216
6217 static void zrevrankCommand(redisClient *c) {
6218 zrankGenericCommand(c, 1);
6219 }
6220
6221 /* ========================= Hashes utility functions ======================= */
/* Bit flags selecting the field and/or the value of a hash entry. */
#define REDIS_HASH_KEY (1<<0)
#define REDIS_HASH_VALUE (1<<1)
6224
6225 /* Check the length of a number of objects to see if we need to convert a
6226 * zipmap to a real hash. Note that we only check string encoded objects
6227 * as their string length can be queried in constant time. */
6228 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6229 int i;
6230 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6231
6232 for (i = start; i <= end; i++) {
6233 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6234 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6235 {
6236 convertToRealHash(subject);
6237 return;
6238 }
6239 }
6240 }
6241
6242 /* Encode given objects in-place when the hash uses a dict. */
6243 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6244 if (subject->encoding == REDIS_ENCODING_HT) {
6245 if (o1) *o1 = tryObjectEncoding(*o1);
6246 if (o2) *o2 = tryObjectEncoding(*o2);
6247 }
6248 }
6249
6250 /* Get the value from a hash identified by key. Returns either a string
6251 * object or NULL if the value cannot be found. The refcount of the object
6252 * is always increased by 1 when the value was found. */
6253 static robj *hashGet(robj *o, robj *key) {
6254 robj *value = NULL;
6255 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6256 unsigned char *v;
6257 unsigned int vlen;
6258 key = getDecodedObject(key);
6259 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6260 value = createStringObject((char*)v,vlen);
6261 }
6262 decrRefCount(key);
6263 } else {
6264 dictEntry *de = dictFind(o->ptr,key);
6265 if (de != NULL) {
6266 value = dictGetEntryVal(de);
6267 incrRefCount(value);
6268 }
6269 }
6270 return value;
6271 }
6272
6273 /* Test if the key exists in the given hash. Returns 1 if the key
6274 * exists and 0 when it doesn't. */
6275 static int hashExists(robj *o, robj *key) {
6276 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6277 key = getDecodedObject(key);
6278 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6279 decrRefCount(key);
6280 return 1;
6281 }
6282 decrRefCount(key);
6283 } else {
6284 if (dictFind(o->ptr,key) != NULL) {
6285 return 1;
6286 }
6287 }
6288 return 0;
6289 }
6290
6291 /* Add an element, discard the old if the key already exists.
6292 * Return 0 on insert and 1 on update. */
6293 static int hashSet(robj *o, robj *key, robj *value) {
6294 int update = 0;
6295 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6296 key = getDecodedObject(key);
6297 value = getDecodedObject(value);
6298 o->ptr = zipmapSet(o->ptr,
6299 key->ptr,sdslen(key->ptr),
6300 value->ptr,sdslen(value->ptr), &update);
6301 decrRefCount(key);
6302 decrRefCount(value);
6303
6304 /* Check if the zipmap needs to be upgraded to a real hash table */
6305 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6306 convertToRealHash(o);
6307 } else {
6308 if (dictReplace(o->ptr,key,value)) {
6309 /* Insert */
6310 incrRefCount(key);
6311 } else {
6312 /* Update */
6313 update = 1;
6314 }
6315 incrRefCount(value);
6316 }
6317 return update;
6318 }
6319
6320 /* Delete an element from a hash.
6321 * Return 1 on deleted and 0 on not found. */
6322 static int hashDelete(robj *o, robj *key) {
6323 int deleted = 0;
6324 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6325 key = getDecodedObject(key);
6326 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6327 decrRefCount(key);
6328 } else {
6329 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6330 /* Always check if the dictionary needs a resize after a delete. */
6331 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6332 }
6333 return deleted;
6334 }
6335
6336 /* Return the number of elements in a hash. */
6337 static unsigned long hashLength(robj *o) {
6338 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6339 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6340 }
6341
6342 /* Structure to hold hash iteration abstration. Note that iteration over
6343 * hashes involves both fields and values. Because it is possible that
6344 * not both are required, store pointers in the iterator to avoid
6345 * unnecessary memory allocation for fields/values. */
6346 typedef struct {
6347 int encoding;
6348 unsigned char *zi;
6349 unsigned char *zk, *zv;
6350 unsigned int zklen, zvlen;
6351
6352 dictIterator *di;
6353 dictEntry *de;
6354 } hashIterator;
6355
6356 static hashIterator *hashInitIterator(robj *subject) {
6357 hashIterator *hi = zmalloc(sizeof(hashIterator));
6358 hi->encoding = subject->encoding;
6359 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6360 hi->zi = zipmapRewind(subject->ptr);
6361 } else if (hi->encoding == REDIS_ENCODING_HT) {
6362 hi->di = dictGetIterator(subject->ptr);
6363 } else {
6364 redisAssert(NULL);
6365 }
6366 return hi;
6367 }
6368
6369 static void hashReleaseIterator(hashIterator *hi) {
6370 if (hi->encoding == REDIS_ENCODING_HT) {
6371 dictReleaseIterator(hi->di);
6372 }
6373 zfree(hi);
6374 }
6375
6376 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6377 * could be found and REDIS_ERR when the iterator reaches the end. */
6378 static int hashNext(hashIterator *hi) {
6379 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6380 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6381 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6382 } else {
6383 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6384 }
6385 return REDIS_OK;
6386 }
6387
6388 /* Get key or value object at current iteration position.
6389 * This increases the refcount of the field object by 1. */
6390 static robj *hashCurrent(hashIterator *hi, int what) {
6391 robj *o;
6392 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6393 if (what & REDIS_HASH_KEY) {
6394 o = createStringObject((char*)hi->zk,hi->zklen);
6395 } else {
6396 o = createStringObject((char*)hi->zv,hi->zvlen);
6397 }
6398 } else {
6399 if (what & REDIS_HASH_KEY) {
6400 o = dictGetEntryKey(hi->de);
6401 } else {
6402 o = dictGetEntryVal(hi->de);
6403 }
6404 incrRefCount(o);
6405 }
6406 return o;
6407 }
6408
6409 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6410 robj *o = lookupKeyWrite(c->db,key);
6411 if (o == NULL) {
6412 o = createHashObject();
6413 dictAdd(c->db->dict,key,o);
6414 incrRefCount(key);
6415 } else {
6416 if (o->type != REDIS_HASH) {
6417 addReply(c,shared.wrongtypeerr);
6418 return NULL;
6419 }
6420 }
6421 return o;
6422 }
6423
6424 /* ============================= Hash commands ============================== */
6425 static void hsetCommand(redisClient *c) {
6426 int update;
6427 robj *o;
6428
6429 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6430 hashTryConversion(o,c->argv,2,3);
6431 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6432 update = hashSet(o,c->argv[2],c->argv[3]);
6433 addReply(c, update ? shared.czero : shared.cone);
6434 server.dirty++;
6435 }
6436
6437 static void hsetnxCommand(redisClient *c) {
6438 robj *o;
6439 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6440 hashTryConversion(o,c->argv,2,3);
6441
6442 if (hashExists(o, c->argv[2])) {
6443 addReply(c, shared.czero);
6444 } else {
6445 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6446 hashSet(o,c->argv[2],c->argv[3]);
6447 addReply(c, shared.cone);
6448 server.dirty++;
6449 }
6450 }
6451
6452 static void hmsetCommand(redisClient *c) {
6453 int i;
6454 robj *o;
6455
6456 if ((c->argc % 2) == 1) {
6457 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6458 return;
6459 }
6460
6461 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6462 hashTryConversion(o,c->argv,2,c->argc-1);
6463 for (i = 2; i < c->argc; i += 2) {
6464 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6465 hashSet(o,c->argv[i],c->argv[i+1]);
6466 }
6467 addReply(c, shared.ok);
6468 server.dirty++;
6469 }
6470
6471 static void hincrbyCommand(redisClient *c) {
6472 long long value, incr;
6473 robj *o, *current, *new;
6474
6475 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6476 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6477 if ((current = hashGet(o,c->argv[2])) != NULL) {
6478 if (getLongLongFromObjectOrReply(c,current,&value,
6479 "hash value is not an integer") != REDIS_OK) {
6480 decrRefCount(current);
6481 return;
6482 }
6483 decrRefCount(current);
6484 } else {
6485 value = 0;
6486 }
6487
6488 value += incr;
6489 new = createStringObjectFromLongLong(value);
6490 hashTryObjectEncoding(o,&c->argv[2],NULL);
6491 hashSet(o,c->argv[2],new);
6492 decrRefCount(new);
6493 addReplyLongLong(c,value);
6494 server.dirty++;
6495 }
6496
6497 static void hgetCommand(redisClient *c) {
6498 robj *o, *value;
6499 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6500 checkType(c,o,REDIS_HASH)) return;
6501
6502 if ((value = hashGet(o,c->argv[2])) != NULL) {
6503 addReplyBulk(c,value);
6504 decrRefCount(value);
6505 } else {
6506 addReply(c,shared.nullbulk);
6507 }
6508 }
6509
6510 static void hmgetCommand(redisClient *c) {
6511 int i;
6512 robj *o, *value;
6513 o = lookupKeyRead(c->db,c->argv[1]);
6514 if (o != NULL && o->type != REDIS_HASH) {
6515 addReply(c,shared.wrongtypeerr);
6516 }
6517
6518 /* Note the check for o != NULL happens inside the loop. This is
6519 * done because objects that cannot be found are considered to be
6520 * an empty hash. The reply should then be a series of NULLs. */
6521 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6522 for (i = 2; i < c->argc; i++) {
6523 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6524 addReplyBulk(c,value);
6525 decrRefCount(value);
6526 } else {
6527 addReply(c,shared.nullbulk);
6528 }
6529 }
6530 }
6531
6532 static void hdelCommand(redisClient *c) {
6533 robj *o;
6534 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6535 checkType(c,o,REDIS_HASH)) return;
6536
6537 if (hashDelete(o,c->argv[2])) {
6538 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6539 addReply(c,shared.cone);
6540 server.dirty++;
6541 } else {
6542 addReply(c,shared.czero);
6543 }
6544 }
6545
6546 static void hlenCommand(redisClient *c) {
6547 robj *o;
6548 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6549 checkType(c,o,REDIS_HASH)) return;
6550
6551 addReplyUlong(c,hashLength(o));
6552 }
6553
6554 static void genericHgetallCommand(redisClient *c, int flags) {
6555 robj *o, *lenobj, *obj;
6556 unsigned long count = 0;
6557 hashIterator *hi;
6558
6559 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6560 || checkType(c,o,REDIS_HASH)) return;
6561
6562 lenobj = createObject(REDIS_STRING,NULL);
6563 addReply(c,lenobj);
6564 decrRefCount(lenobj);
6565
6566 hi = hashInitIterator(o);
6567 while (hashNext(hi) != REDIS_ERR) {
6568 if (flags & REDIS_HASH_KEY) {
6569 obj = hashCurrent(hi,REDIS_HASH_KEY);
6570 addReplyBulk(c,obj);
6571 decrRefCount(obj);
6572 count++;
6573 }
6574 if (flags & REDIS_HASH_VALUE) {
6575 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6576 addReplyBulk(c,obj);
6577 decrRefCount(obj);
6578 count++;
6579 }
6580 }
6581 hashReleaseIterator(hi);
6582
6583 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6584 }
6585
6586 static void hkeysCommand(redisClient *c) {
6587 genericHgetallCommand(c,REDIS_HASH_KEY);
6588 }
6589
6590 static void hvalsCommand(redisClient *c) {
6591 genericHgetallCommand(c,REDIS_HASH_VALUE);
6592 }
6593
6594 static void hgetallCommand(redisClient *c) {
6595 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6596 }
6597
6598 static void hexistsCommand(redisClient *c) {
6599 robj *o;
6600 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6601 checkType(c,o,REDIS_HASH)) return;
6602
6603 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6604 }
6605
6606 static void convertToRealHash(robj *o) {
6607 unsigned char *key, *val, *p, *zm = o->ptr;
6608 unsigned int klen, vlen;
6609 dict *dict = dictCreate(&hashDictType,NULL);
6610
6611 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6612 p = zipmapRewind(zm);
6613 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6614 robj *keyobj, *valobj;
6615
6616 keyobj = createStringObject((char*)key,klen);
6617 valobj = createStringObject((char*)val,vlen);
6618 keyobj = tryObjectEncoding(keyobj);
6619 valobj = tryObjectEncoding(valobj);
6620 dictAdd(dict,keyobj,valobj);
6621 }
6622 o->encoding = REDIS_ENCODING_HT;
6623 o->ptr = dict;
6624 zfree(zm);
6625 }
6626
6627 /* ========================= Non type-specific commands ==================== */
6628
6629 static void flushdbCommand(redisClient *c) {
6630 server.dirty += dictSize(c->db->dict);
6631 dictEmpty(c->db->dict);
6632 dictEmpty(c->db->expires);
6633 addReply(c,shared.ok);
6634 }
6635
6636 static void flushallCommand(redisClient *c) {
6637 server.dirty += emptyDb();
6638 addReply(c,shared.ok);
6639 if (server.bgsavechildpid != -1) {
6640 kill(server.bgsavechildpid,SIGKILL);
6641 rdbRemoveTempFile(server.bgsavechildpid);
6642 }
6643 rdbSave(server.dbfilename);
6644 server.dirty++;
6645 }
6646
6647 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6648 redisSortOperation *so = zmalloc(sizeof(*so));
6649 so->type = type;
6650 so->pattern = pattern;
6651 return so;
6652 }
6653
6654 /* Return the value associated to the key with a name obtained
6655 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6656 * The returned object will always have its refcount increased by 1
6657 * when it is non-NULL. */
6658 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6659 char *p, *f;
6660 sds spat, ssub;
6661 robj keyobj, fieldobj, *o;
6662 int prefixlen, sublen, postfixlen, fieldlen;
6663 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6664 struct {
6665 long len;
6666 long free;
6667 char buf[REDIS_SORTKEY_MAX+1];
6668 } keyname, fieldname;
6669
6670 /* If the pattern is "#" return the substitution object itself in order
6671 * to implement the "SORT ... GET #" feature. */
6672 spat = pattern->ptr;
6673 if (spat[0] == '#' && spat[1] == '\0') {
6674 incrRefCount(subst);
6675 return subst;
6676 }
6677
6678 /* The substitution object may be specially encoded. If so we create
6679 * a decoded object on the fly. Otherwise getDecodedObject will just
6680 * increment the ref count, that we'll decrement later. */
6681 subst = getDecodedObject(subst);
6682
6683 ssub = subst->ptr;
6684 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6685 p = strchr(spat,'*');
6686 if (!p) {
6687 decrRefCount(subst);
6688 return NULL;
6689 }
6690
6691 /* Find out if we're dealing with a hash dereference. */
6692 if ((f = strstr(p+1, "->")) != NULL) {
6693 fieldlen = sdslen(spat)-(f-spat);
6694 /* this also copies \0 character */
6695 memcpy(fieldname.buf,f+2,fieldlen-1);
6696 fieldname.len = fieldlen-2;
6697 } else {
6698 fieldlen = 0;
6699 }
6700
6701 prefixlen = p-spat;
6702 sublen = sdslen(ssub);
6703 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6704 memcpy(keyname.buf,spat,prefixlen);
6705 memcpy(keyname.buf+prefixlen,ssub,sublen);
6706 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6707 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6708 keyname.len = prefixlen+sublen+postfixlen;
6709 decrRefCount(subst);
6710
6711 /* Lookup substituted key */
6712 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6713 o = lookupKeyRead(db,&keyobj);
6714 if (o == NULL) return NULL;
6715
6716 if (fieldlen > 0) {
6717 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6718
6719 /* Retrieve value from hash by the field name. This operation
6720 * already increases the refcount of the returned object. */
6721 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6722 o = hashGet(o, &fieldobj);
6723 } else {
6724 if (o->type != REDIS_STRING) return NULL;
6725
6726 /* Every object that this function returns needs to have its refcount
6727 * increased. sortCommand decreases it again. */
6728 incrRefCount(o);
6729 }
6730
6731 return o;
6732 }
6733
6734 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6735 * the additional parameter is not standard but a BSD-specific we have to
6736 * pass sorting parameters via the global 'server' structure */
6737 static int sortCompare(const void *s1, const void *s2) {
6738 const redisSortObject *so1 = s1, *so2 = s2;
6739 int cmp;
6740
6741 if (!server.sort_alpha) {
6742 /* Numeric sorting. Here it's trivial as we precomputed scores */
6743 if (so1->u.score > so2->u.score) {
6744 cmp = 1;
6745 } else if (so1->u.score < so2->u.score) {
6746 cmp = -1;
6747 } else {
6748 cmp = 0;
6749 }
6750 } else {
6751 /* Alphanumeric sorting */
6752 if (server.sort_bypattern) {
6753 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6754 /* At least one compare object is NULL */
6755 if (so1->u.cmpobj == so2->u.cmpobj)
6756 cmp = 0;
6757 else if (so1->u.cmpobj == NULL)
6758 cmp = -1;
6759 else
6760 cmp = 1;
6761 } else {
6762 /* We have both the objects, use strcoll */
6763 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6764 }
6765 } else {
6766 /* Compare elements directly. */
6767 cmp = compareStringObjects(so1->obj,so2->obj);
6768 }
6769 }
6770 return server.sort_desc ? -cmp : cmp;
6771 }
6772
6773 /* The SORT command is the most complex command in Redis. Warning: this code
6774 * is optimized for speed and a bit less for readability */
6775 static void sortCommand(redisClient *c) {
6776 list *operations;
6777 int outputlen = 0;
6778 int desc = 0, alpha = 0;
6779 int limit_start = 0, limit_count = -1, start, end;
6780 int j, dontsort = 0, vectorlen;
6781 int getop = 0; /* GET operation counter */
6782 robj *sortval, *sortby = NULL, *storekey = NULL;
6783 redisSortObject *vector; /* Resulting vector to sort */
6784
6785 /* Lookup the key to sort. It must be of the right types */
6786 sortval = lookupKeyRead(c->db,c->argv[1]);
6787 if (sortval == NULL) {
6788 addReply(c,shared.emptymultibulk);
6789 return;
6790 }
6791 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6792 sortval->type != REDIS_ZSET)
6793 {
6794 addReply(c,shared.wrongtypeerr);
6795 return;
6796 }
6797
6798 /* Create a list of operations to perform for every sorted element.
6799 * Operations can be GET/DEL/INCR/DECR */
6800 operations = listCreate();
6801 listSetFreeMethod(operations,zfree);
6802 j = 2;
6803
6804 /* Now we need to protect sortval incrementing its count, in the future
6805 * SORT may have options able to overwrite/delete keys during the sorting
6806 * and the sorted key itself may get destroied */
6807 incrRefCount(sortval);
6808
6809 /* The SORT command has an SQL-alike syntax, parse it */
6810 while(j < c->argc) {
6811 int leftargs = c->argc-j-1;
6812 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6813 desc = 0;
6814 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6815 desc = 1;
6816 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6817 alpha = 1;
6818 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6819 limit_start = atoi(c->argv[j+1]->ptr);
6820 limit_count = atoi(c->argv[j+2]->ptr);
6821 j+=2;
6822 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6823 storekey = c->argv[j+1];
6824 j++;
6825 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6826 sortby = c->argv[j+1];
6827 /* If the BY pattern does not contain '*', i.e. it is constant,
6828 * we don't need to sort nor to lookup the weight keys. */
6829 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6830 j++;
6831 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6832 listAddNodeTail(operations,createSortOperation(
6833 REDIS_SORT_GET,c->argv[j+1]));
6834 getop++;
6835 j++;
6836 } else {
6837 decrRefCount(sortval);
6838 listRelease(operations);
6839 addReply(c,shared.syntaxerr);
6840 return;
6841 }
6842 j++;
6843 }
6844
6845 /* Load the sorting vector with all the objects to sort */
6846 switch(sortval->type) {
6847 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6848 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6849 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6850 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6851 }
6852 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6853 j = 0;
6854
6855 if (sortval->type == REDIS_LIST) {
6856 list *list = sortval->ptr;
6857 listNode *ln;
6858 listIter li;
6859
6860 listRewind(list,&li);
6861 while((ln = listNext(&li))) {
6862 robj *ele = ln->value;
6863 vector[j].obj = ele;
6864 vector[j].u.score = 0;
6865 vector[j].u.cmpobj = NULL;
6866 j++;
6867 }
6868 } else {
6869 dict *set;
6870 dictIterator *di;
6871 dictEntry *setele;
6872
6873 if (sortval->type == REDIS_SET) {
6874 set = sortval->ptr;
6875 } else {
6876 zset *zs = sortval->ptr;
6877 set = zs->dict;
6878 }
6879
6880 di = dictGetIterator(set);
6881 while((setele = dictNext(di)) != NULL) {
6882 vector[j].obj = dictGetEntryKey(setele);
6883 vector[j].u.score = 0;
6884 vector[j].u.cmpobj = NULL;
6885 j++;
6886 }
6887 dictReleaseIterator(di);
6888 }
6889 redisAssert(j == vectorlen);
6890
6891 /* Now it's time to load the right scores in the sorting vector */
6892 if (dontsort == 0) {
6893 for (j = 0; j < vectorlen; j++) {
6894 robj *byval;
6895 if (sortby) {
6896 /* lookup value to sort by */
6897 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6898 if (!byval) continue;
6899 } else {
6900 /* use object itself to sort by */
6901 byval = vector[j].obj;
6902 }
6903
6904 if (alpha) {
6905 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6906 } else {
6907 if (byval->encoding == REDIS_ENCODING_RAW) {
6908 vector[j].u.score = strtod(byval->ptr,NULL);
6909 } else if (byval->encoding == REDIS_ENCODING_INT) {
6910 /* Don't need to decode the object if it's
6911 * integer-encoded (the only encoding supported) so
6912 * far. We can just cast it */
6913 vector[j].u.score = (long)byval->ptr;
6914 } else {
6915 redisAssert(1 != 1);
6916 }
6917 }
6918
6919 /* when the object was retrieved using lookupKeyByPattern,
6920 * its refcount needs to be decreased. */
6921 if (sortby) {
6922 decrRefCount(byval);
6923 }
6924 }
6925 }
6926
6927 /* We are ready to sort the vector... perform a bit of sanity check
6928 * on the LIMIT option too. We'll use a partial version of quicksort. */
6929 start = (limit_start < 0) ? 0 : limit_start;
6930 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6931 if (start >= vectorlen) {
6932 start = vectorlen-1;
6933 end = vectorlen-2;
6934 }
6935 if (end >= vectorlen) end = vectorlen-1;
6936
6937 if (dontsort == 0) {
6938 server.sort_desc = desc;
6939 server.sort_alpha = alpha;
6940 server.sort_bypattern = sortby ? 1 : 0;
6941 if (sortby && (start != 0 || end != vectorlen-1))
6942 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6943 else
6944 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6945 }
6946
6947 /* Send command output to the output buffer, performing the specified
6948 * GET/DEL/INCR/DECR operations if any. */
6949 outputlen = getop ? getop*(end-start+1) : end-start+1;
6950 if (storekey == NULL) {
6951 /* STORE option not specified, sent the sorting result to client */
6952 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6953 for (j = start; j <= end; j++) {
6954 listNode *ln;
6955 listIter li;
6956
6957 if (!getop) addReplyBulk(c,vector[j].obj);
6958 listRewind(operations,&li);
6959 while((ln = listNext(&li))) {
6960 redisSortOperation *sop = ln->value;
6961 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6962 vector[j].obj);
6963
6964 if (sop->type == REDIS_SORT_GET) {
6965 if (!val) {
6966 addReply(c,shared.nullbulk);
6967 } else {
6968 addReplyBulk(c,val);
6969 decrRefCount(val);
6970 }
6971 } else {
6972 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6973 }
6974 }
6975 }
6976 } else {
6977 robj *listObject = createListObject();
6978 list *listPtr = (list*) listObject->ptr;
6979
6980 /* STORE option specified, set the sorting result as a List object */
6981 for (j = start; j <= end; j++) {
6982 listNode *ln;
6983 listIter li;
6984
6985 if (!getop) {
6986 listAddNodeTail(listPtr,vector[j].obj);
6987 incrRefCount(vector[j].obj);
6988 }
6989 listRewind(operations,&li);
6990 while((ln = listNext(&li))) {
6991 redisSortOperation *sop = ln->value;
6992 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6993 vector[j].obj);
6994
6995 if (sop->type == REDIS_SORT_GET) {
6996 if (!val) {
6997 listAddNodeTail(listPtr,createStringObject("",0));
6998 } else {
6999 /* We should do a incrRefCount on val because it is
7000 * added to the list, but also a decrRefCount because
7001 * it is returned by lookupKeyByPattern. This results
7002 * in doing nothing at all. */
7003 listAddNodeTail(listPtr,val);
7004 }
7005 } else {
7006 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7007 }
7008 }
7009 }
7010 if (dictReplace(c->db->dict,storekey,listObject)) {
7011 incrRefCount(storekey);
7012 }
7013 /* Note: we add 1 because the DB is dirty anyway since even if the
7014 * SORT result is empty a new key is set and maybe the old content
7015 * replaced. */
7016 server.dirty += 1+outputlen;
7017 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7018 }
7019
7020 /* Cleanup */
7021 decrRefCount(sortval);
7022 listRelease(operations);
7023 for (j = 0; j < vectorlen; j++) {
7024 if (alpha && vector[j].u.cmpobj)
7025 decrRefCount(vector[j].u.cmpobj);
7026 }
7027 zfree(vector);
7028 }
7029
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the caller in
 * this file uses 64 bytes). Values below 1024 are printed exactly with a
 * 'B' suffix; larger values are printed with two decimals using the K, M,
 * G, T and P binary units. Every possible 'n' writes a string into 's':
 * amounts too large for the P unit fall back to the exact byte count. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Beyond the largest unit: report the exact number of bytes so
         * that the output buffer is never left untouched. */
        sprintf(s,"%lluB",n);
    }
}
7050
7051 /* Create the string returned by the INFO command. This is decoupled
7052 * by the INFO command itself as we need to report the same information
7053 * on memory corruption problems. */
7054 static sds genRedisInfoString(void) {
7055 sds info;
7056 time_t uptime = time(NULL)-server.stat_starttime;
7057 int j;
7058 char hmem[64];
7059
7060 bytesToHuman(hmem,zmalloc_used_memory());
7061 info = sdscatprintf(sdsempty(),
7062 "redis_version:%s\r\n"
7063 "arch_bits:%s\r\n"
7064 "multiplexing_api:%s\r\n"
7065 "process_id:%ld\r\n"
7066 "uptime_in_seconds:%ld\r\n"
7067 "uptime_in_days:%ld\r\n"
7068 "connected_clients:%d\r\n"
7069 "connected_slaves:%d\r\n"
7070 "blocked_clients:%d\r\n"
7071 "used_memory:%zu\r\n"
7072 "used_memory_human:%s\r\n"
7073 "changes_since_last_save:%lld\r\n"
7074 "bgsave_in_progress:%d\r\n"
7075 "last_save_time:%ld\r\n"
7076 "bgrewriteaof_in_progress:%d\r\n"
7077 "total_connections_received:%lld\r\n"
7078 "total_commands_processed:%lld\r\n"
7079 "expired_keys:%lld\r\n"
7080 "hash_max_zipmap_entries:%ld\r\n"
7081 "hash_max_zipmap_value:%ld\r\n"
7082 "pubsub_channels:%ld\r\n"
7083 "pubsub_patterns:%u\r\n"
7084 "vm_enabled:%d\r\n"
7085 "role:%s\r\n"
7086 ,REDIS_VERSION,
7087 (sizeof(long) == 8) ? "64" : "32",
7088 aeGetApiName(),
7089 (long) getpid(),
7090 uptime,
7091 uptime/(3600*24),
7092 listLength(server.clients)-listLength(server.slaves),
7093 listLength(server.slaves),
7094 server.blpop_blocked_clients,
7095 zmalloc_used_memory(),
7096 hmem,
7097 server.dirty,
7098 server.bgsavechildpid != -1,
7099 server.lastsave,
7100 server.bgrewritechildpid != -1,
7101 server.stat_numconnections,
7102 server.stat_numcommands,
7103 server.stat_expiredkeys,
7104 server.hash_max_zipmap_entries,
7105 server.hash_max_zipmap_value,
7106 dictSize(server.pubsub_channels),
7107 listLength(server.pubsub_patterns),
7108 server.vm_enabled != 0,
7109 server.masterhost == NULL ? "master" : "slave"
7110 );
7111 if (server.masterhost) {
7112 info = sdscatprintf(info,
7113 "master_host:%s\r\n"
7114 "master_port:%d\r\n"
7115 "master_link_status:%s\r\n"
7116 "master_last_io_seconds_ago:%d\r\n"
7117 ,server.masterhost,
7118 server.masterport,
7119 (server.replstate == REDIS_REPL_CONNECTED) ?
7120 "up" : "down",
7121 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7122 );
7123 }
7124 if (server.vm_enabled) {
7125 lockThreadedIO();
7126 info = sdscatprintf(info,
7127 "vm_conf_max_memory:%llu\r\n"
7128 "vm_conf_page_size:%llu\r\n"
7129 "vm_conf_pages:%llu\r\n"
7130 "vm_stats_used_pages:%llu\r\n"
7131 "vm_stats_swapped_objects:%llu\r\n"
7132 "vm_stats_swappin_count:%llu\r\n"
7133 "vm_stats_swappout_count:%llu\r\n"
7134 "vm_stats_io_newjobs_len:%lu\r\n"
7135 "vm_stats_io_processing_len:%lu\r\n"
7136 "vm_stats_io_processed_len:%lu\r\n"
7137 "vm_stats_io_active_threads:%lu\r\n"
7138 "vm_stats_blocked_clients:%lu\r\n"
7139 ,(unsigned long long) server.vm_max_memory,
7140 (unsigned long long) server.vm_page_size,
7141 (unsigned long long) server.vm_pages,
7142 (unsigned long long) server.vm_stats_used_pages,
7143 (unsigned long long) server.vm_stats_swapped_objects,
7144 (unsigned long long) server.vm_stats_swapins,
7145 (unsigned long long) server.vm_stats_swapouts,
7146 (unsigned long) listLength(server.io_newjobs),
7147 (unsigned long) listLength(server.io_processing),
7148 (unsigned long) listLength(server.io_processed),
7149 (unsigned long) server.io_active_threads,
7150 (unsigned long) server.vm_blocked_clients
7151 );
7152 unlockThreadedIO();
7153 }
7154 for (j = 0; j < server.dbnum; j++) {
7155 long long keys, vkeys;
7156
7157 keys = dictSize(server.db[j].dict);
7158 vkeys = dictSize(server.db[j].expires);
7159 if (keys || vkeys) {
7160 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7161 j, keys, vkeys);
7162 }
7163 }
7164 return info;
7165 }
7166
7167 static void infoCommand(redisClient *c) {
7168 sds info = genRedisInfoString();
7169 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7170 (unsigned long)sdslen(info)));
7171 addReplySds(c,info);
7172 addReply(c,shared.crlf);
7173 }
7174
7175 static void monitorCommand(redisClient *c) {
7176 /* ignore MONITOR if aleady slave or in monitor mode */
7177 if (c->flags & REDIS_SLAVE) return;
7178
7179 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7180 c->slaveseldb = 0;
7181 listAddNodeTail(server.monitors,c);
7182 addReply(c,shared.ok);
7183 }
7184
7185 /* ================================= Expire ================================= */
7186 static int removeExpire(redisDb *db, robj *key) {
7187 if (dictDelete(db->expires,key) == DICT_OK) {
7188 return 1;
7189 } else {
7190 return 0;
7191 }
7192 }
7193
7194 static int setExpire(redisDb *db, robj *key, time_t when) {
7195 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7196 return 0;
7197 } else {
7198 incrRefCount(key);
7199 return 1;
7200 }
7201 }
7202
7203 /* Return the expire time of the specified key, or -1 if no expire
7204 * is associated with this key (i.e. the key is non volatile) */
7205 static time_t getExpire(redisDb *db, robj *key) {
7206 dictEntry *de;
7207
7208 /* No expire? return ASAP */
7209 if (dictSize(db->expires) == 0 ||
7210 (de = dictFind(db->expires,key)) == NULL) return -1;
7211
7212 return (time_t) dictGetEntryVal(de);
7213 }
7214
7215 static int expireIfNeeded(redisDb *db, robj *key) {
7216 time_t when;
7217 dictEntry *de;
7218
7219 /* No expire? return ASAP */
7220 if (dictSize(db->expires) == 0 ||
7221 (de = dictFind(db->expires,key)) == NULL) return 0;
7222
7223 /* Lookup the expire */
7224 when = (time_t) dictGetEntryVal(de);
7225 if (time(NULL) <= when) return 0;
7226
7227 /* Delete the key */
7228 dictDelete(db->expires,key);
7229 server.stat_expiredkeys++;
7230 return dictDelete(db->dict,key) == DICT_OK;
7231 }
7232
7233 static int deleteIfVolatile(redisDb *db, robj *key) {
7234 dictEntry *de;
7235
7236 /* No expire? return ASAP */
7237 if (dictSize(db->expires) == 0 ||
7238 (de = dictFind(db->expires,key)) == NULL) return 0;
7239
7240 /* Delete the key */
7241 server.dirty++;
7242 server.stat_expiredkeys++;
7243 dictDelete(db->expires,key);
7244 return dictDelete(db->dict,key) == DICT_OK;
7245 }
7246
7247 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7248 dictEntry *de;
7249 time_t seconds;
7250
7251 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7252
7253 seconds -= offset;
7254
7255 de = dictFind(c->db->dict,key);
7256 if (de == NULL) {
7257 addReply(c,shared.czero);
7258 return;
7259 }
7260 if (seconds <= 0) {
7261 if (deleteKey(c->db,key)) server.dirty++;
7262 addReply(c, shared.cone);
7263 return;
7264 } else {
7265 time_t when = time(NULL)+seconds;
7266 if (setExpire(c->db,key,when)) {
7267 addReply(c,shared.cone);
7268 server.dirty++;
7269 } else {
7270 addReply(c,shared.czero);
7271 }
7272 return;
7273 }
7274 }
7275
7276 static void expireCommand(redisClient *c) {
7277 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7278 }
7279
7280 static void expireatCommand(redisClient *c) {
7281 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7282 }
7283
7284 static void ttlCommand(redisClient *c) {
7285 time_t expire;
7286 int ttl = -1;
7287
7288 expire = getExpire(c->db,c->argv[1]);
7289 if (expire != -1) {
7290 ttl = (int) (expire-time(NULL));
7291 if (ttl < 0) ttl = -1;
7292 }
7293 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7294 }
7295
7296 /* ================================ MULTI/EXEC ============================== */
7297
7298 /* Client state initialization for MULTI/EXEC */
7299 static void initClientMultiState(redisClient *c) {
7300 c->mstate.commands = NULL;
7301 c->mstate.count = 0;
7302 }
7303
7304 /* Release all the resources associated with MULTI/EXEC state */
7305 static void freeClientMultiState(redisClient *c) {
7306 int j;
7307
7308 for (j = 0; j < c->mstate.count; j++) {
7309 int i;
7310 multiCmd *mc = c->mstate.commands+j;
7311
7312 for (i = 0; i < mc->argc; i++)
7313 decrRefCount(mc->argv[i]);
7314 zfree(mc->argv);
7315 }
7316 zfree(c->mstate.commands);
7317 }
7318
7319 /* Add a new command into the MULTI commands queue */
7320 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7321 multiCmd *mc;
7322 int j;
7323
7324 c->mstate.commands = zrealloc(c->mstate.commands,
7325 sizeof(multiCmd)*(c->mstate.count+1));
7326 mc = c->mstate.commands+c->mstate.count;
7327 mc->cmd = cmd;
7328 mc->argc = c->argc;
7329 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7330 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7331 for (j = 0; j < c->argc; j++)
7332 incrRefCount(mc->argv[j]);
7333 c->mstate.count++;
7334 }
7335
7336 static void multiCommand(redisClient *c) {
7337 c->flags |= REDIS_MULTI;
7338 addReply(c,shared.ok);
7339 }
7340
7341 static void discardCommand(redisClient *c) {
7342 if (!(c->flags & REDIS_MULTI)) {
7343 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7344 return;
7345 }
7346
7347 freeClientMultiState(c);
7348 initClientMultiState(c);
7349 c->flags &= (~REDIS_MULTI);
7350 addReply(c,shared.ok);
7351 }
7352
7353 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7354 * implememntation for more information. */
7355 static void execCommandReplicateMulti(redisClient *c) {
7356 struct redisCommand *cmd;
7357 robj *multistring = createStringObject("MULTI",5);
7358
7359 cmd = lookupCommand("multi");
7360 if (server.appendonly)
7361 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7362 if (listLength(server.slaves))
7363 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7364 decrRefCount(multistring);
7365 }
7366
7367 static void execCommand(redisClient *c) {
7368 int j;
7369 robj **orig_argv;
7370 int orig_argc;
7371
7372 if (!(c->flags & REDIS_MULTI)) {
7373 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7374 return;
7375 }
7376
7377 /* Replicate a MULTI request now that we are sure the block is executed.
7378 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7379 * both the AOF and the replication link will have the same consistency
7380 * and atomicity guarantees. */
7381 execCommandReplicateMulti(c);
7382
7383 /* Exec all the queued commands */
7384 orig_argv = c->argv;
7385 orig_argc = c->argc;
7386 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7387 for (j = 0; j < c->mstate.count; j++) {
7388 c->argc = c->mstate.commands[j].argc;
7389 c->argv = c->mstate.commands[j].argv;
7390 call(c,c->mstate.commands[j].cmd);
7391 }
7392 c->argv = orig_argv;
7393 c->argc = orig_argc;
7394 freeClientMultiState(c);
7395 initClientMultiState(c);
7396 c->flags &= (~REDIS_MULTI);
7397 /* Make sure the EXEC command is always replicated / AOF, since we
7398 * always send the MULTI command (we can't know beforehand if the
7399 * next operations will contain at least a modification to the DB). */
7400 server.dirty++;
7401 }
7402
7403 /* =========================== Blocking Operations ========================= */
7404
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7433
7434 /* Set a client in blocking mode for the specified key, with the specified
7435 * timeout */
7436 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7437 dictEntry *de;
7438 list *l;
7439 int j;
7440
7441 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7442 c->blockingkeysnum = numkeys;
7443 c->blockingto = timeout;
7444 for (j = 0; j < numkeys; j++) {
7445 /* Add the key in the client structure, to map clients -> keys */
7446 c->blockingkeys[j] = keys[j];
7447 incrRefCount(keys[j]);
7448
7449 /* And in the other "side", to map keys -> clients */
7450 de = dictFind(c->db->blockingkeys,keys[j]);
7451 if (de == NULL) {
7452 int retval;
7453
7454 /* For every key we take a list of clients blocked for it */
7455 l = listCreate();
7456 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7457 incrRefCount(keys[j]);
7458 assert(retval == DICT_OK);
7459 } else {
7460 l = dictGetEntryVal(de);
7461 }
7462 listAddNodeTail(l,c);
7463 }
7464 /* Mark the client as a blocked client */
7465 c->flags |= REDIS_BLOCKED;
7466 server.blpop_blocked_clients++;
7467 }
7468
7469 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7470 static void unblockClientWaitingData(redisClient *c) {
7471 dictEntry *de;
7472 list *l;
7473 int j;
7474
7475 assert(c->blockingkeys != NULL);
7476 /* The client may wait for multiple keys, so unblock it for every key. */
7477 for (j = 0; j < c->blockingkeysnum; j++) {
7478 /* Remove this client from the list of clients waiting for this key. */
7479 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7480 assert(de != NULL);
7481 l = dictGetEntryVal(de);
7482 listDelNode(l,listSearchKey(l,c));
7483 /* If the list is empty we need to remove it to avoid wasting memory */
7484 if (listLength(l) == 0)
7485 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7486 decrRefCount(c->blockingkeys[j]);
7487 }
7488 /* Cleanup the client structure */
7489 zfree(c->blockingkeys);
7490 c->blockingkeys = NULL;
7491 c->flags &= (~REDIS_BLOCKED);
7492 server.blpop_blocked_clients--;
7493 /* We want to process data if there is some command waiting
7494 * in the input buffer. Note that this is safe even if
7495 * unblockClientWaitingData() gets called from freeClient() because
7496 * freeClient() will be smart enough to call this function
7497 * *after* c->querybuf was set to NULL. */
7498 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7499 }
7500
7501 /* This should be called from any function PUSHing into lists.
7502 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7503 * 'ele' is the element pushed.
7504 *
7505 * If the function returns 0 there was no client waiting for a list push
7506 * against this key.
7507 *
7508 * If the function returns 1 there was a client waiting for a list push
7509 * against this key, the element was passed to this client thus it's not
7510 * needed to actually add it to the list and the caller should return asap. */
7511 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7512 struct dictEntry *de;
7513 redisClient *receiver;
7514 list *l;
7515 listNode *ln;
7516
7517 de = dictFind(c->db->blockingkeys,key);
7518 if (de == NULL) return 0;
7519 l = dictGetEntryVal(de);
7520 ln = listFirst(l);
7521 assert(ln != NULL);
7522 receiver = ln->value;
7523
7524 addReplySds(receiver,sdsnew("*2\r\n"));
7525 addReplyBulk(receiver,key);
7526 addReplyBulk(receiver,ele);
7527 unblockClientWaitingData(receiver);
7528 return 1;
7529 }
7530
7531 /* Blocking RPOP/LPOP */
7532 static void blockingPopGenericCommand(redisClient *c, int where) {
7533 robj *o;
7534 time_t timeout;
7535 int j;
7536
7537 for (j = 1; j < c->argc-1; j++) {
7538 o = lookupKeyWrite(c->db,c->argv[j]);
7539 if (o != NULL) {
7540 if (o->type != REDIS_LIST) {
7541 addReply(c,shared.wrongtypeerr);
7542 return;
7543 } else {
7544 list *list = o->ptr;
7545 if (listLength(list) != 0) {
7546 /* If the list contains elements fall back to the usual
7547 * non-blocking POP operation */
7548 robj *argv[2], **orig_argv;
7549 int orig_argc;
7550
7551 /* We need to alter the command arguments before to call
7552 * popGenericCommand() as the command takes a single key. */
7553 orig_argv = c->argv;
7554 orig_argc = c->argc;
7555 argv[1] = c->argv[j];
7556 c->argv = argv;
7557 c->argc = 2;
7558
7559 /* Also the return value is different, we need to output
7560 * the multi bulk reply header and the key name. The
7561 * "real" command will add the last element (the value)
7562 * for us. If this souds like an hack to you it's just
7563 * because it is... */
7564 addReplySds(c,sdsnew("*2\r\n"));
7565 addReplyBulk(c,argv[1]);
7566 popGenericCommand(c,where);
7567
7568 /* Fix the client structure with the original stuff */
7569 c->argv = orig_argv;
7570 c->argc = orig_argc;
7571 return;
7572 }
7573 }
7574 }
7575 }
7576 /* If the list is empty or the key does not exists we must block */
7577 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7578 if (timeout > 0) timeout += time(NULL);
7579 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7580 }
7581
7582 static void blpopCommand(redisClient *c) {
7583 blockingPopGenericCommand(c,REDIS_HEAD);
7584 }
7585
7586 static void brpopCommand(redisClient *c) {
7587 blockingPopGenericCommand(c,REDIS_TAIL);
7588 }
7589
7590 /* =============================== Replication ============================= */
7591
7592 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7593 ssize_t nwritten, ret = size;
7594 time_t start = time(NULL);
7595
7596 timeout++;
7597 while(size) {
7598 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7599 nwritten = write(fd,ptr,size);
7600 if (nwritten == -1) return -1;
7601 ptr += nwritten;
7602 size -= nwritten;
7603 }
7604 if ((time(NULL)-start) > timeout) {
7605 errno = ETIMEDOUT;
7606 return -1;
7607 }
7608 }
7609 return ret;
7610 }
7611
7612 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7613 ssize_t nread, totread = 0;
7614 time_t start = time(NULL);
7615
7616 timeout++;
7617 while(size) {
7618 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7619 nread = read(fd,ptr,size);
7620 if (nread == -1) return -1;
7621 ptr += nread;
7622 size -= nread;
7623 totread += nread;
7624 }
7625 if ((time(NULL)-start) > timeout) {
7626 errno = ETIMEDOUT;
7627 return -1;
7628 }
7629 }
7630 return totread;
7631 }
7632
/* Read a line from 'fd', one byte at a time, into the buffer 'ptr' of
 * 'size' bytes. 'timeout' is applied by syncRead() to every single byte.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is replaced by the terminator. At most size-1 bytes are stored, so the
 * buffer is always null terminated and never overflowed: a longer line is
 * truncated and the rest is left unread.
 *
 * Returns the number of bytes stored before the newline was found (a
 * stripped '\r' is still counted), or -1 on read error, timeout, or when
 * 'size' leaves no room even for the terminator. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return -1;
    *ptr = '\0';

    /* Reserve one byte for the null terminator. */
    size--;
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One less byte of room: without this the loop would write past
         * the end of the buffer on lines longer than the buffer itself. */
        size--;
    }
    return nread;
}
7653
7654 static void syncCommand(redisClient *c) {
7655 /* ignore SYNC if aleady slave or in monitor mode */
7656 if (c->flags & REDIS_SLAVE) return;
7657
7658 /* SYNC can't be issued when the server has pending data to send to
7659 * the client about already issued commands. We need a fresh reply
7660 * buffer registering the differences between the BGSAVE and the current
7661 * dataset, so that we can copy to other slaves if needed. */
7662 if (listLength(c->reply) != 0) {
7663 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7664 return;
7665 }
7666
7667 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7668 /* Here we need to check if there is a background saving operation
7669 * in progress, or if it is required to start one */
7670 if (server.bgsavechildpid != -1) {
7671 /* Ok a background save is in progress. Let's check if it is a good
7672 * one for replication, i.e. if there is another slave that is
7673 * registering differences since the server forked to save */
7674 redisClient *slave;
7675 listNode *ln;
7676 listIter li;
7677
7678 listRewind(server.slaves,&li);
7679 while((ln = listNext(&li))) {
7680 slave = ln->value;
7681 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7682 }
7683 if (ln) {
7684 /* Perfect, the server is already registering differences for
7685 * another slave. Set the right state, and copy the buffer. */
7686 listRelease(c->reply);
7687 c->reply = listDup(slave->reply);
7688 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7689 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7690 } else {
7691 /* No way, we need to wait for the next BGSAVE in order to
7692 * register differences */
7693 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7694 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7695 }
7696 } else {
7697 /* Ok we don't have a BGSAVE in progress, let's start one */
7698 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7699 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7700 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7701 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7702 return;
7703 }
7704 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7705 }
7706 c->repldbfd = -1;
7707 c->flags |= REDIS_SLAVE;
7708 c->slaveseldb = 0;
7709 listAddNodeTail(server.slaves,c);
7710 return;
7711 }
7712
7713 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7714 redisClient *slave = privdata;
7715 REDIS_NOTUSED(el);
7716 REDIS_NOTUSED(mask);
7717 char buf[REDIS_IOBUF_LEN];
7718 ssize_t nwritten, buflen;
7719
7720 if (slave->repldboff == 0) {
7721 /* Write the bulk write count before to transfer the DB. In theory here
7722 * we don't know how much room there is in the output buffer of the
7723 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7724 * operations) will never be smaller than the few bytes we need. */
7725 sds bulkcount;
7726
7727 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7728 slave->repldbsize);
7729 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7730 {
7731 sdsfree(bulkcount);
7732 freeClient(slave);
7733 return;
7734 }
7735 sdsfree(bulkcount);
7736 }
7737 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7738 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7739 if (buflen <= 0) {
7740 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7741 (buflen == 0) ? "premature EOF" : strerror(errno));
7742 freeClient(slave);
7743 return;
7744 }
7745 if ((nwritten = write(fd,buf,buflen)) == -1) {
7746 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7747 strerror(errno));
7748 freeClient(slave);
7749 return;
7750 }
7751 slave->repldboff += nwritten;
7752 if (slave->repldboff == slave->repldbsize) {
7753 close(slave->repldbfd);
7754 slave->repldbfd = -1;
7755 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7756 slave->replstate = REDIS_REPL_ONLINE;
7757 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7758 sendReplyToClient, slave) == AE_ERR) {
7759 freeClient(slave);
7760 return;
7761 }
7762 addReplySds(slave,sdsempty());
7763 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7764 }
7765 }
7766
7767 /* This function is called at the end of every backgrond saving.
7768 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7769 * otherwise REDIS_ERR is passed to the function.
7770 *
7771 * The goal of this function is to handle slaves waiting for a successful
7772 * background saving in order to perform non-blocking synchronization. */
7773 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7774 listNode *ln;
7775 int startbgsave = 0;
7776 listIter li;
7777
7778 listRewind(server.slaves,&li);
7779 while((ln = listNext(&li))) {
7780 redisClient *slave = ln->value;
7781
7782 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7783 startbgsave = 1;
7784 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7785 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7786 struct redis_stat buf;
7787
7788 if (bgsaveerr != REDIS_OK) {
7789 freeClient(slave);
7790 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7791 continue;
7792 }
7793 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7794 redis_fstat(slave->repldbfd,&buf) == -1) {
7795 freeClient(slave);
7796 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7797 continue;
7798 }
7799 slave->repldboff = 0;
7800 slave->repldbsize = buf.st_size;
7801 slave->replstate = REDIS_REPL_SEND_BULK;
7802 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7803 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7804 freeClient(slave);
7805 continue;
7806 }
7807 }
7808 }
7809 if (startbgsave) {
7810 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7811 listIter li;
7812
7813 listRewind(server.slaves,&li);
7814 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7815 while((ln = listNext(&li))) {
7816 redisClient *slave = ln->value;
7817
7818 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7819 freeClient(slave);
7820 }
7821 }
7822 }
7823 }
7824
7825 static int syncWithMaster(void) {
7826 char buf[1024], tmpfile[256], authcmd[1024];
7827 long dumpsize;
7828 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7829 int dfd, maxtries = 5;
7830
7831 if (fd == -1) {
7832 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7833 strerror(errno));
7834 return REDIS_ERR;
7835 }
7836
7837 /* AUTH with the master if required. */
7838 if(server.masterauth) {
7839 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7840 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7841 close(fd);
7842 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7843 strerror(errno));
7844 return REDIS_ERR;
7845 }
7846 /* Read the AUTH result. */
7847 if (syncReadLine(fd,buf,1024,3600) == -1) {
7848 close(fd);
7849 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7850 strerror(errno));
7851 return REDIS_ERR;
7852 }
7853 if (buf[0] != '+') {
7854 close(fd);
7855 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7856 return REDIS_ERR;
7857 }
7858 }
7859
7860 /* Issue the SYNC command */
7861 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7862 close(fd);
7863 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7864 strerror(errno));
7865 return REDIS_ERR;
7866 }
7867 /* Read the bulk write count */
7868 if (syncReadLine(fd,buf,1024,3600) == -1) {
7869 close(fd);
7870 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7871 strerror(errno));
7872 return REDIS_ERR;
7873 }
7874 if (buf[0] != '$') {
7875 close(fd);
7876 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7877 return REDIS_ERR;
7878 }
7879 dumpsize = strtol(buf+1,NULL,10);
7880 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7881 /* Read the bulk write data on a temp file */
7882 while(maxtries--) {
7883 snprintf(tmpfile,256,
7884 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7885 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7886 if (dfd != -1) break;
7887 sleep(1);
7888 }
7889 if (dfd == -1) {
7890 close(fd);
7891 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7892 return REDIS_ERR;
7893 }
7894 while(dumpsize) {
7895 int nread, nwritten;
7896
7897 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7898 if (nread == -1) {
7899 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7900 strerror(errno));
7901 close(fd);
7902 close(dfd);
7903 return REDIS_ERR;
7904 }
7905 nwritten = write(dfd,buf,nread);
7906 if (nwritten == -1) {
7907 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7908 close(fd);
7909 close(dfd);
7910 return REDIS_ERR;
7911 }
7912 dumpsize -= nread;
7913 }
7914 close(dfd);
7915 if (rename(tmpfile,server.dbfilename) == -1) {
7916 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7917 unlink(tmpfile);
7918 close(fd);
7919 return REDIS_ERR;
7920 }
7921 emptyDb();
7922 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7923 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7924 close(fd);
7925 return REDIS_ERR;
7926 }
7927 server.master = createClient(fd);
7928 server.master->flags |= REDIS_MASTER;
7929 server.master->authenticated = 1;
7930 server.replstate = REDIS_REPL_CONNECTED;
7931 return REDIS_OK;
7932 }
7933
7934 static void slaveofCommand(redisClient *c) {
7935 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7936 !strcasecmp(c->argv[2]->ptr,"one")) {
7937 if (server.masterhost) {
7938 sdsfree(server.masterhost);
7939 server.masterhost = NULL;
7940 if (server.master) freeClient(server.master);
7941 server.replstate = REDIS_REPL_NONE;
7942 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7943 }
7944 } else {
7945 sdsfree(server.masterhost);
7946 server.masterhost = sdsdup(c->argv[1]->ptr);
7947 server.masterport = atoi(c->argv[2]->ptr);
7948 if (server.master) freeClient(server.master);
7949 server.replstate = REDIS_REPL_CONNECT;
7950 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7951 server.masterhost, server.masterport);
7952 }
7953 addReply(c,shared.ok);
7954 }
7955
7956 /* ============================ Maxmemory directive ======================== */
7957
7958 /* Try to free one object form the pre-allocated objects free list.
7959 * This is useful under low mem conditions as by default we take 1 million
7960 * free objects allocated. On success REDIS_OK is returned, otherwise
7961 * REDIS_ERR. */
7962 static int tryFreeOneObjectFromFreelist(void) {
7963 robj *o;
7964
7965 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7966 if (listLength(server.objfreelist)) {
7967 listNode *head = listFirst(server.objfreelist);
7968 o = listNodeValue(head);
7969 listDelNode(server.objfreelist,head);
7970 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7971 zfree(o);
7972 return REDIS_OK;
7973 } else {
7974 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7975 return REDIS_ERR;
7976 }
7977 }
7978
7979 /* This function gets called when 'maxmemory' is set on the config file to limit
7980 * the max memory used by the server, and we are out of memory.
7981 * This function will try to, in order:
7982 *
7983 * - Free objects from the free list
7984 * - Try to remove keys with an EXPIRE set
7985 *
7986 * It is not possible to free enough memory to reach used-memory < maxmemory
7987 * the server will start refusing commands that will enlarge even more the
7988 * memory usage.
7989 */
7990 static void freeMemoryIfNeeded(void) {
7991 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7992 int j, k, freed = 0;
7993
7994 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7995 for (j = 0; j < server.dbnum; j++) {
7996 int minttl = -1;
7997 robj *minkey = NULL;
7998 struct dictEntry *de;
7999
8000 if (dictSize(server.db[j].expires)) {
8001 freed = 1;
8002 /* From a sample of three keys drop the one nearest to
8003 * the natural expire */
8004 for (k = 0; k < 3; k++) {
8005 time_t t;
8006
8007 de = dictGetRandomKey(server.db[j].expires);
8008 t = (time_t) dictGetEntryVal(de);
8009 if (minttl == -1 || t < minttl) {
8010 minkey = dictGetEntryKey(de);
8011 minttl = t;
8012 }
8013 }
8014 deleteKey(server.db+j,minkey);
8015 }
8016 }
8017 if (!freed) return; /* nothing to free... */
8018 }
8019 }
8020
8021 /* ============================== Append Only file ========================== */
8022
8023 /* Write the append only file buffer on disk.
8024 *
8025 * Since we are required to write the AOF before replying to the client,
8026 * and the only way the client socket can get a write is entering when the
8027 * the event loop, we accumulate all the AOF writes in a memory
8028 * buffer and write it on disk using this function just before entering
8029 * the event loop again. */
8030 static void flushAppendOnlyFile(void) {
8031 time_t now;
8032 ssize_t nwritten;
8033
8034 if (sdslen(server.aofbuf) == 0) return;
8035
8036 /* We want to perform a single write. This should be guaranteed atomic
8037 * at least if the filesystem we are writing is a real physical one.
8038 * While this will save us against the server being killed I don't think
8039 * there is much to do about the whole server stopping for power problems
8040 * or alike */
8041 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8042 if (nwritten != (signed)sdslen(server.aofbuf)) {
8043 /* Ooops, we are in troubles. The best thing to do for now is
8044 * aborting instead of giving the illusion that everything is
8045 * working as expected. */
8046 if (nwritten == -1) {
8047 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8048 } else {
8049 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8050 }
8051 exit(1);
8052 }
8053 sdsfree(server.aofbuf);
8054 server.aofbuf = sdsempty();
8055
8056 /* Fsync if needed */
8057 now = time(NULL);
8058 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8059 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8060 now-server.lastfsync > 1))
8061 {
8062 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8063 * flushing metadata. */
8064 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8065 server.lastfsync = now;
8066 }
8067 }
8068
8069 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8070 sds buf = sdsempty();
8071 int j;
8072 robj *tmpargv[3];
8073
8074 /* The DB this command was targetting is not the same as the last command
8075 * we appendend. To issue a SELECT command is needed. */
8076 if (dictid != server.appendseldb) {
8077 char seldb[64];
8078
8079 snprintf(seldb,sizeof(seldb),"%d",dictid);
8080 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8081 (unsigned long)strlen(seldb),seldb);
8082 server.appendseldb = dictid;
8083 }
8084
8085 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8086 * EXPIREs into EXPIREATs calls */
8087 if (cmd->proc == expireCommand) {
8088 long when;
8089
8090 tmpargv[0] = createStringObject("EXPIREAT",8);
8091 tmpargv[1] = argv[1];
8092 incrRefCount(argv[1]);
8093 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8094 tmpargv[2] = createObject(REDIS_STRING,
8095 sdscatprintf(sdsempty(),"%ld",when));
8096 argv = tmpargv;
8097 }
8098
8099 /* Append the actual command */
8100 buf = sdscatprintf(buf,"*%d\r\n",argc);
8101 for (j = 0; j < argc; j++) {
8102 robj *o = argv[j];
8103
8104 o = getDecodedObject(o);
8105 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8106 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8107 buf = sdscatlen(buf,"\r\n",2);
8108 decrRefCount(o);
8109 }
8110
8111 /* Free the objects from the modified argv for EXPIREAT */
8112 if (cmd->proc == expireCommand) {
8113 for (j = 0; j < 3; j++)
8114 decrRefCount(argv[j]);
8115 }
8116
8117 /* Append to the AOF buffer. This will be flushed on disk just before
8118 * of re-entering the event loop, so before the client will get a
8119 * positive reply about the operation performed. */
8120 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8121
8122 /* If a background append only file rewriting is in progress we want to
8123 * accumulate the differences between the child DB and the current one
8124 * in a buffer, so that when the child process will do its work we
8125 * can append the differences to the new append only file. */
8126 if (server.bgrewritechildpid != -1)
8127 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8128
8129 sdsfree(buf);
8130 }
8131
8132 /* In Redis commands are always executed in the context of a client, so in
8133 * order to load the append only file we need to create a fake client. */
8134 static struct redisClient *createFakeClient(void) {
8135 struct redisClient *c = zmalloc(sizeof(*c));
8136
8137 selectDb(c,0);
8138 c->fd = -1;
8139 c->querybuf = sdsempty();
8140 c->argc = 0;
8141 c->argv = NULL;
8142 c->flags = 0;
8143 /* We set the fake client as a slave waiting for the synchronization
8144 * so that Redis will not try to send replies to this client. */
8145 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8146 c->reply = listCreate();
8147 listSetFreeMethod(c->reply,decrRefCount);
8148 listSetDupMethod(c->reply,dupClientReplyValue);
8149 initClientMultiState(c);
8150 return c;
8151 }
8152
8153 static void freeFakeClient(struct redisClient *c) {
8154 sdsfree(c->querybuf);
8155 listRelease(c->reply);
8156 freeClientMultiState(c);
8157 zfree(c);
8158 }
8159
8160 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8161 * error (the append only file is zero-length) REDIS_ERR is returned. On
8162 * fatal error an error message is logged and the program exists. */
8163 int loadAppendOnlyFile(char *filename) {
8164 struct redisClient *fakeClient;
8165 FILE *fp = fopen(filename,"r");
8166 struct redis_stat sb;
8167 unsigned long long loadedkeys = 0;
8168 int appendonly = server.appendonly;
8169
8170 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8171 return REDIS_ERR;
8172
8173 if (fp == NULL) {
8174 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8175 exit(1);
8176 }
8177
8178 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8179 * to the same file we're about to read. */
8180 server.appendonly = 0;
8181
8182 fakeClient = createFakeClient();
8183 while(1) {
8184 int argc, j;
8185 unsigned long len;
8186 robj **argv;
8187 char buf[128];
8188 sds argsds;
8189 struct redisCommand *cmd;
8190
8191 if (fgets(buf,sizeof(buf),fp) == NULL) {
8192 if (feof(fp))
8193 break;
8194 else
8195 goto readerr;
8196 }
8197 if (buf[0] != '*') goto fmterr;
8198 argc = atoi(buf+1);
8199 argv = zmalloc(sizeof(robj*)*argc);
8200 for (j = 0; j < argc; j++) {
8201 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8202 if (buf[0] != '$') goto fmterr;
8203 len = strtol(buf+1,NULL,10);
8204 argsds = sdsnewlen(NULL,len);
8205 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8206 argv[j] = createObject(REDIS_STRING,argsds);
8207 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8208 }
8209
8210 /* Command lookup */
8211 cmd = lookupCommand(argv[0]->ptr);
8212 if (!cmd) {
8213 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8214 exit(1);
8215 }
8216 /* Try object encoding */
8217 if (cmd->flags & REDIS_CMD_BULK)
8218 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8219 /* Run the command in the context of a fake client */
8220 fakeClient->argc = argc;
8221 fakeClient->argv = argv;
8222 cmd->proc(fakeClient);
8223 /* Discard the reply objects list from the fake client */
8224 while(listLength(fakeClient->reply))
8225 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8226 /* Clean up, ready for the next command */
8227 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8228 zfree(argv);
8229 /* Handle swapping while loading big datasets when VM is on */
8230 loadedkeys++;
8231 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8232 while (zmalloc_used_memory() > server.vm_max_memory) {
8233 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8234 }
8235 }
8236 }
8237
8238 /* This point can only be reached when EOF is reached without errors.
8239 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8240 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8241
8242 fclose(fp);
8243 freeFakeClient(fakeClient);
8244 server.appendonly = appendonly;
8245 return REDIS_OK;
8246
8247 readerr:
8248 if (feof(fp)) {
8249 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8250 } else {
8251 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8252 }
8253 exit(1);
8254 fmterr:
8255 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8256 exit(1);
8257 }
8258
8259 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8260 static int fwriteBulkObject(FILE *fp, robj *obj) {
8261 char buf[128];
8262 int decrrc = 0;
8263
8264 /* Avoid the incr/decr ref count business if possible to help
8265 * copy-on-write (we are often in a child process when this function
8266 * is called).
8267 * Also makes sure that key objects don't get incrRefCount-ed when VM
8268 * is enabled */
8269 if (obj->encoding != REDIS_ENCODING_RAW) {
8270 obj = getDecodedObject(obj);
8271 decrrc = 1;
8272 }
8273 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8274 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8275 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8276 goto err;
8277 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8278 if (decrrc) decrRefCount(obj);
8279 return 1;
8280 err:
8281 if (decrrc) decrRefCount(obj);
8282 return 0;
8283 }
8284
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes that may contain any value, nul included.
 * When 'len' is zero only the header and the trailing CRLF are written.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: print it with the matching conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8296
/* Write a double to 'fp' in the bulk format $<count>\r\n<payload>\r\n
 * where the payload is the value printed with the "%.17g" format.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
8307
/* Write a long to 'fp' in the bulk format $<count>\r\n<payload>\r\n
 * where the payload is the value in decimal.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
8318
8319 /* Write a sequence of commands able to fully rebuild the dataset into
8320 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8321 static int rewriteAppendOnlyFile(char *filename) {
8322 dictIterator *di = NULL;
8323 dictEntry *de;
8324 FILE *fp;
8325 char tmpfile[256];
8326 int j;
8327 time_t now = time(NULL);
8328
8329 /* Note that we have to use a different temp name here compared to the
8330 * one used by rewriteAppendOnlyFileBackground() function. */
8331 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8332 fp = fopen(tmpfile,"w");
8333 if (!fp) {
8334 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8335 return REDIS_ERR;
8336 }
8337 for (j = 0; j < server.dbnum; j++) {
8338 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8339 redisDb *db = server.db+j;
8340 dict *d = db->dict;
8341 if (dictSize(d) == 0) continue;
8342 di = dictGetIterator(d);
8343 if (!di) {
8344 fclose(fp);
8345 return REDIS_ERR;
8346 }
8347
8348 /* SELECT the new DB */
8349 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8350 if (fwriteBulkLong(fp,j) == 0) goto werr;
8351
8352 /* Iterate this DB writing every entry */
8353 while((de = dictNext(di)) != NULL) {
8354 robj *key, *o;
8355 time_t expiretime;
8356 int swapped;
8357
8358 key = dictGetEntryKey(de);
8359 /* If the value for this key is swapped, load a preview in memory.
8360 * We use a "swapped" flag to remember if we need to free the
8361 * value object instead to just increment the ref count anyway
8362 * in order to avoid copy-on-write of pages if we are forked() */
8363 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8364 key->storage == REDIS_VM_SWAPPING) {
8365 o = dictGetEntryVal(de);
8366 swapped = 0;
8367 } else {
8368 o = vmPreviewObject(key);
8369 swapped = 1;
8370 }
8371 expiretime = getExpire(db,key);
8372
8373 /* Save the key and associated value */
8374 if (o->type == REDIS_STRING) {
8375 /* Emit a SET command */
8376 char cmd[]="*3\r\n$3\r\nSET\r\n";
8377 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8378 /* Key and value */
8379 if (fwriteBulkObject(fp,key) == 0) goto werr;
8380 if (fwriteBulkObject(fp,o) == 0) goto werr;
8381 } else if (o->type == REDIS_LIST) {
8382 /* Emit the RPUSHes needed to rebuild the list */
8383 list *list = o->ptr;
8384 listNode *ln;
8385 listIter li;
8386
8387 listRewind(list,&li);
8388 while((ln = listNext(&li))) {
8389 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8390 robj *eleobj = listNodeValue(ln);
8391
8392 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8393 if (fwriteBulkObject(fp,key) == 0) goto werr;
8394 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8395 }
8396 } else if (o->type == REDIS_SET) {
8397 /* Emit the SADDs needed to rebuild the set */
8398 dict *set = o->ptr;
8399 dictIterator *di = dictGetIterator(set);
8400 dictEntry *de;
8401
8402 while((de = dictNext(di)) != NULL) {
8403 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8404 robj *eleobj = dictGetEntryKey(de);
8405
8406 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8407 if (fwriteBulkObject(fp,key) == 0) goto werr;
8408 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8409 }
8410 dictReleaseIterator(di);
8411 } else if (o->type == REDIS_ZSET) {
8412 /* Emit the ZADDs needed to rebuild the sorted set */
8413 zset *zs = o->ptr;
8414 dictIterator *di = dictGetIterator(zs->dict);
8415 dictEntry *de;
8416
8417 while((de = dictNext(di)) != NULL) {
8418 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8419 robj *eleobj = dictGetEntryKey(de);
8420 double *score = dictGetEntryVal(de);
8421
8422 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8423 if (fwriteBulkObject(fp,key) == 0) goto werr;
8424 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8425 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8426 }
8427 dictReleaseIterator(di);
8428 } else if (o->type == REDIS_HASH) {
8429 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8430
8431 /* Emit the HSETs needed to rebuild the hash */
8432 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8433 unsigned char *p = zipmapRewind(o->ptr);
8434 unsigned char *field, *val;
8435 unsigned int flen, vlen;
8436
8437 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8438 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8439 if (fwriteBulkObject(fp,key) == 0) goto werr;
8440 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8441 return -1;
8442 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8443 return -1;
8444 }
8445 } else {
8446 dictIterator *di = dictGetIterator(o->ptr);
8447 dictEntry *de;
8448
8449 while((de = dictNext(di)) != NULL) {
8450 robj *field = dictGetEntryKey(de);
8451 robj *val = dictGetEntryVal(de);
8452
8453 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8454 if (fwriteBulkObject(fp,key) == 0) goto werr;
8455 if (fwriteBulkObject(fp,field) == -1) return -1;
8456 if (fwriteBulkObject(fp,val) == -1) return -1;
8457 }
8458 dictReleaseIterator(di);
8459 }
8460 } else {
8461 redisPanic("Unknown object type");
8462 }
8463 /* Save the expire time */
8464 if (expiretime != -1) {
8465 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8466 /* If this key is already expired skip it */
8467 if (expiretime < now) continue;
8468 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8469 if (fwriteBulkObject(fp,key) == 0) goto werr;
8470 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8471 }
8472 if (swapped) decrRefCount(o);
8473 }
8474 dictReleaseIterator(di);
8475 }
8476
8477 /* Make sure data will not remain on the OS's output buffers */
8478 fflush(fp);
8479 fsync(fileno(fp));
8480 fclose(fp);
8481
8482 /* Use RENAME to make sure the DB file is changed atomically only
8483 * if the generate DB file is ok. */
8484 if (rename(tmpfile,filename) == -1) {
8485 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8486 unlink(tmpfile);
8487 return REDIS_ERR;
8488 }
8489 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8490 return REDIS_OK;
8491
8492 werr:
8493 fclose(fp);
8494 unlink(tmpfile);
8495 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8496 if (di) dictReleaseIterator(di);
8497 return REDIS_ERR;
8498 }
8499
8500 /* This is how rewriting of the append only file in background works:
8501 *
8502 * 1) The user calls BGREWRITEAOF
8503 * 2) Redis calls this function, that forks():
8504 * 2a) the child rewrite the append only file in a temp file.
8505 * 2b) the parent accumulates differences in server.bgrewritebuf.
8506 * 3) When the child finished '2a' exists.
8507 * 4) The parent will trap the exit code, if it's OK, will append the
8508 * data accumulated into server.bgrewritebuf into the temp file, and
8509 * finally will rename(2) the temp file in the actual file name.
8510 * The the new file is reopened as the new append only file. Profit!
8511 */
8512 static int rewriteAppendOnlyFileBackground(void) {
8513 pid_t childpid;
8514
8515 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8516 if (server.vm_enabled) waitEmptyIOJobsQueue();
8517 if ((childpid = fork()) == 0) {
8518 /* Child */
8519 char tmpfile[256];
8520
8521 if (server.vm_enabled) vmReopenSwapFile();
8522 close(server.fd);
8523 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8524 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8525 _exit(0);
8526 } else {
8527 _exit(1);
8528 }
8529 } else {
8530 /* Parent */
8531 if (childpid == -1) {
8532 redisLog(REDIS_WARNING,
8533 "Can't rewrite append only file in background: fork: %s",
8534 strerror(errno));
8535 return REDIS_ERR;
8536 }
8537 redisLog(REDIS_NOTICE,
8538 "Background append only file rewriting started by pid %d",childpid);
8539 server.bgrewritechildpid = childpid;
8540 updateDictResizePolicy();
8541 /* We set appendseldb to -1 in order to force the next call to the
8542 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8543 * accumulated by the parent into server.bgrewritebuf will start
8544 * with a SELECT statement and it will be safe to merge. */
8545 server.appendseldb = -1;
8546 return REDIS_OK;
8547 }
8548 return REDIS_OK; /* unreached */
8549 }
8550
8551 static void bgrewriteaofCommand(redisClient *c) {
8552 if (server.bgrewritechildpid != -1) {
8553 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8554 return;
8555 }
8556 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8557 char *status = "+Background append only file rewriting started\r\n";
8558 addReplySds(c,sdsnew(status));
8559 } else {
8560 addReply(c,shared.err);
8561 }
8562 }
8563
/* Remove the temp file written by the background AOF rewrite child having
 * pid 'childpid'. The file name is built with the same pattern used by the
 * child. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8570
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8591
8592 /* =================== Virtual Memory - Blocking Side ====================== */
8593
8594 static void vmInit(void) {
8595 off_t totsize;
8596 int pipefds[2];
8597 size_t stacksize;
8598 struct flock fl;
8599
8600 if (server.vm_max_threads != 0)
8601 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8602
8603 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8604 /* Try to open the old swap file, otherwise create it */
8605 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8606 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8607 }
8608 if (server.vm_fp == NULL) {
8609 redisLog(REDIS_WARNING,
8610 "Can't open the swap file: %s. Exiting.",
8611 strerror(errno));
8612 exit(1);
8613 }
8614 server.vm_fd = fileno(server.vm_fp);
8615 /* Lock the swap file for writing, this is useful in order to avoid
8616 * another instance to use the same swap file for a config error. */
8617 fl.l_type = F_WRLCK;
8618 fl.l_whence = SEEK_SET;
8619 fl.l_start = fl.l_len = 0;
8620 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8621 redisLog(REDIS_WARNING,
8622 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8623 exit(1);
8624 }
8625 /* Initialize */
8626 server.vm_next_page = 0;
8627 server.vm_near_pages = 0;
8628 server.vm_stats_used_pages = 0;
8629 server.vm_stats_swapped_objects = 0;
8630 server.vm_stats_swapouts = 0;
8631 server.vm_stats_swapins = 0;
8632 totsize = server.vm_pages*server.vm_page_size;
8633 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8634 if (ftruncate(server.vm_fd,totsize) == -1) {
8635 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8636 strerror(errno));
8637 exit(1);
8638 } else {
8639 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8640 }
8641 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8642 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8643 (long long) (server.vm_pages+7)/8, server.vm_pages);
8644 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8645
8646 /* Initialize threaded I/O (used by Virtual Memory) */
8647 server.io_newjobs = listCreate();
8648 server.io_processing = listCreate();
8649 server.io_processed = listCreate();
8650 server.io_ready_clients = listCreate();
8651 pthread_mutex_init(&server.io_mutex,NULL);
8652 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8653 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8654 server.io_active_threads = 0;
8655 if (pipe(pipefds) == -1) {
8656 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8657 ,strerror(errno));
8658 exit(1);
8659 }
8660 server.io_ready_pipe_read = pipefds[0];
8661 server.io_ready_pipe_write = pipefds[1];
8662 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8663 /* LZF requires a lot of stack */
8664 pthread_attr_init(&server.io_threads_attr);
8665 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8666 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8667 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8668 /* Listen for events in the threaded I/O pipe */
8669 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8670 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8671 oom("creating file event");
8672 }
8673
8674 /* Mark the page as used */
8675 static void vmMarkPageUsed(off_t page) {
8676 off_t byte = page/8;
8677 int bit = page&7;
8678 redisAssert(vmFreePage(page) == 1);
8679 server.vm_bitmap[byte] |= 1<<bit;
8680 }
8681
8682 /* Mark N contiguous pages as used, with 'page' being the first. */
8683 static void vmMarkPagesUsed(off_t page, off_t count) {
8684 off_t j;
8685
8686 for (j = 0; j < count; j++)
8687 vmMarkPageUsed(page+j);
8688 server.vm_stats_used_pages += count;
8689 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8690 (long long)count, (long long)page);
8691 }
8692
8693 /* Mark the page as free */
8694 static void vmMarkPageFree(off_t page) {
8695 off_t byte = page/8;
8696 int bit = page&7;
8697 redisAssert(vmFreePage(page) == 0);
8698 server.vm_bitmap[byte] &= ~(1<<bit);
8699 }
8700
8701 /* Mark N contiguous pages as free, with 'page' being the first. */
8702 static void vmMarkPagesFree(off_t page, off_t count) {
8703 off_t j;
8704
8705 for (j = 0; j < count; j++)
8706 vmMarkPageFree(page+j);
8707 server.vm_stats_used_pages -= count;
8708 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8709 (long long)count, (long long)page);
8710 }
8711
8712 /* Test if the page is free */
8713 static int vmFreePage(off_t page) {
8714 off_t byte = page/8;
8715 int bit = page&7;
8716 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8717 }
8718
8719 /* Find N contiguous free pages storing the first page of the cluster in *first.
8720 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8721 * REDIS_ERR is returned.
8722 *
8723 * This function uses a simple algorithm: we try to allocate
8724 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8725 * again from the start of the swap file searching for free spaces.
8726 *
8727 * If it looks pretty clear that there are no free pages near our offset
8728 * we try to find less populated places doing a forward jump of
8729 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8730 * without hurry, and then we jump again and so forth...
8731 *
8732 * This function can be improved using a free list to avoid to guess
8733 * too much, since we could collect data about freed pages.
8734 *
8735 * note: I implemented this function just after watching an episode of
8736 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8737 */
8738 static int vmFindContiguousPages(off_t *first, off_t n) {
8739 off_t base, offset = 0, since_jump = 0, numfree = 0;
8740
8741 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8742 server.vm_near_pages = 0;
8743 server.vm_next_page = 0;
8744 }
8745 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8746 base = server.vm_next_page;
8747
8748 while(offset < server.vm_pages) {
8749 off_t this = base+offset;
8750
8751 /* If we overflow, restart from page zero */
8752 if (this >= server.vm_pages) {
8753 this -= server.vm_pages;
8754 if (this == 0) {
8755 /* Just overflowed, what we found on tail is no longer
8756 * interesting, as it's no longer contiguous. */
8757 numfree = 0;
8758 }
8759 }
8760 if (vmFreePage(this)) {
8761 /* This is a free page */
8762 numfree++;
8763 /* Already got N free pages? Return to the caller, with success */
8764 if (numfree == n) {
8765 *first = this-(n-1);
8766 server.vm_next_page = this+1;
8767 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8768 return REDIS_OK;
8769 }
8770 } else {
8771 /* The current one is not a free page */
8772 numfree = 0;
8773 }
8774
8775 /* Fast-forward if the current page is not free and we already
8776 * searched enough near this place. */
8777 since_jump++;
8778 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8779 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8780 since_jump = 0;
8781 /* Note that even if we rewind after the jump, we are don't need
8782 * to make sure numfree is set to zero as we only jump *if* it
8783 * is set to zero. */
8784 } else {
8785 /* Otherwise just check the next page */
8786 offset++;
8787 }
8788 }
8789 return REDIS_ERR;
8790 }
8791
8792 /* Write the specified object at the specified page of the swap file */
8793 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8794 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8795 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8796 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8797 redisLog(REDIS_WARNING,
8798 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8799 strerror(errno));
8800 return REDIS_ERR;
8801 }
8802 rdbSaveObject(server.vm_fp,o);
8803 fflush(server.vm_fp);
8804 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8805 return REDIS_OK;
8806 }
8807
8808 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8809 * needed to later retrieve the object into the key object.
8810 * If we can't find enough contiguous empty pages to swap the object on disk
8811 * REDIS_ERR is returned. */
8812 static int vmSwapObjectBlocking(robj *key, robj *val) {
8813 off_t pages = rdbSavedObjectPages(val,NULL);
8814 off_t page;
8815
8816 assert(key->storage == REDIS_VM_MEMORY);
8817 assert(key->refcount == 1);
8818 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8819 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8820 key->vm.page = page;
8821 key->vm.usedpages = pages;
8822 key->storage = REDIS_VM_SWAPPED;
8823 key->vtype = val->type;
8824 decrRefCount(val); /* Deallocate the object from memory. */
8825 vmMarkPagesUsed(page,pages);
8826 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8827 (unsigned char*) key->ptr,
8828 (unsigned long long) page, (unsigned long long) pages);
8829 server.vm_stats_swapped_objects++;
8830 server.vm_stats_swapouts++;
8831 return REDIS_OK;
8832 }
8833
8834 static robj *vmReadObjectFromSwap(off_t page, int type) {
8835 robj *o;
8836
8837 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8838 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8839 redisLog(REDIS_WARNING,
8840 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8841 strerror(errno));
8842 _exit(1);
8843 }
8844 o = rdbLoadObject(type,server.vm_fp);
8845 if (o == NULL) {
8846 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8847 _exit(1);
8848 }
8849 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8850 return o;
8851 }
8852
8853 /* Load the value object relative to the 'key' object from swap to memory.
8854 * The newly allocated object is returned.
8855 *
8856 * If preview is true the unserialized object is returned to the caller but
8857 * no changes are made to the key object, nor the pages are marked as freed */
8858 static robj *vmGenericLoadObject(robj *key, int preview) {
8859 robj *val;
8860
8861 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8862 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8863 if (!preview) {
8864 key->storage = REDIS_VM_MEMORY;
8865 key->vm.atime = server.unixtime;
8866 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8867 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8868 (unsigned char*) key->ptr);
8869 server.vm_stats_swapped_objects--;
8870 } else {
8871 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8872 (unsigned char*) key->ptr);
8873 }
8874 server.vm_stats_swapins++;
8875 return val;
8876 }
8877
8878 /* Plain object loading, from swap to memory */
8879 static robj *vmLoadObject(robj *key) {
8880 /* If we are loading the object in background, stop it, we
8881 * need to load this object synchronously ASAP. */
8882 if (key->storage == REDIS_VM_LOADING)
8883 vmCancelThreadedIOJob(key);
8884 return vmGenericLoadObject(key,0);
8885 }
8886
8887 /* Just load the value on disk, without to modify the key.
8888 * This is useful when we want to perform some operation on the value
8889 * without to really bring it from swap to memory, like while saving the
8890 * dataset or rewriting the append only log. */
8891 static robj *vmPreviewObject(robj *key) {
8892 return vmGenericLoadObject(key,1);
8893 }
8894
8895 /* How a good candidate is this object for swapping?
8896 * The better candidate it is, the greater the returned value.
8897 *
8898 * Currently we try to perform a fast estimation of the object size in
8899 * memory, and combine it with aging informations.
8900 *
8901 * Basically swappability = idle-time * log(estimated size)
8902 *
8903 * Bigger objects are preferred over smaller objects, but not
8904 * proportionally, this is why we use the logarithm. This algorithm is
8905 * just a first try and will probably be tuned later. */
8906 static double computeObjectSwappability(robj *o) {
8907 time_t age = server.unixtime - o->vm.atime;
8908 long asize = 0;
8909 list *l;
8910 dict *d;
8911 struct dictEntry *de;
8912 int z;
8913
8914 if (age <= 0) return 0;
8915 switch(o->type) {
8916 case REDIS_STRING:
8917 if (o->encoding != REDIS_ENCODING_RAW) {
8918 asize = sizeof(*o);
8919 } else {
8920 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8921 }
8922 break;
8923 case REDIS_LIST:
8924 l = o->ptr;
8925 listNode *ln = listFirst(l);
8926
8927 asize = sizeof(list);
8928 if (ln) {
8929 robj *ele = ln->value;
8930 long elesize;
8931
8932 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8933 (sizeof(*o)+sdslen(ele->ptr)) :
8934 sizeof(*o);
8935 asize += (sizeof(listNode)+elesize)*listLength(l);
8936 }
8937 break;
8938 case REDIS_SET:
8939 case REDIS_ZSET:
8940 z = (o->type == REDIS_ZSET);
8941 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8942
8943 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8944 if (z) asize += sizeof(zset)-sizeof(dict);
8945 if (dictSize(d)) {
8946 long elesize;
8947 robj *ele;
8948
8949 de = dictGetRandomKey(d);
8950 ele = dictGetEntryKey(de);
8951 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8952 (sizeof(*o)+sdslen(ele->ptr)) :
8953 sizeof(*o);
8954 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8955 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8956 }
8957 break;
8958 case REDIS_HASH:
8959 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8960 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8961 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8962 unsigned int klen, vlen;
8963 unsigned char *key, *val;
8964
8965 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8966 klen = 0;
8967 vlen = 0;
8968 }
8969 asize = len*(klen+vlen+3);
8970 } else if (o->encoding == REDIS_ENCODING_HT) {
8971 d = o->ptr;
8972 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8973 if (dictSize(d)) {
8974 long elesize;
8975 robj *ele;
8976
8977 de = dictGetRandomKey(d);
8978 ele = dictGetEntryKey(de);
8979 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8980 (sizeof(*o)+sdslen(ele->ptr)) :
8981 sizeof(*o);
8982 ele = dictGetEntryVal(de);
8983 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8984 (sizeof(*o)+sdslen(ele->ptr)) :
8985 sizeof(*o);
8986 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8987 }
8988 }
8989 break;
8990 }
8991 return (double)age*log(1+asize);
8992 }
8993
8994 /* Try to swap an object that's a good candidate for swapping.
8995 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8996 * to swap any object at all.
8997 *
8998 * If 'usethreaded' is true, Redis will try to swap the object in background
8999 * using I/O threads. */
9000 static int vmSwapOneObject(int usethreads) {
9001 int j, i;
9002 struct dictEntry *best = NULL;
9003 double best_swappability = 0;
9004 redisDb *best_db = NULL;
9005 robj *key, *val;
9006
9007 for (j = 0; j < server.dbnum; j++) {
9008 redisDb *db = server.db+j;
9009 /* Why maxtries is set to 100?
9010 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9011 * are swappable objects */
9012 int maxtries = 100;
9013
9014 if (dictSize(db->dict) == 0) continue;
9015 for (i = 0; i < 5; i++) {
9016 dictEntry *de;
9017 double swappability;
9018
9019 if (maxtries) maxtries--;
9020 de = dictGetRandomKey(db->dict);
9021 key = dictGetEntryKey(de);
9022 val = dictGetEntryVal(de);
9023 /* Only swap objects that are currently in memory.
9024 *
9025 * Also don't swap shared objects if threaded VM is on, as we
9026 * try to ensure that the main thread does not touch the
9027 * object while the I/O thread is using it, but we can't
9028 * control other keys without adding additional mutex. */
9029 if (key->storage != REDIS_VM_MEMORY ||
9030 (server.vm_max_threads != 0 && val->refcount != 1)) {
9031 if (maxtries) i--; /* don't count this try */
9032 continue;
9033 }
9034 swappability = computeObjectSwappability(val);
9035 if (!best || swappability > best_swappability) {
9036 best = de;
9037 best_swappability = swappability;
9038 best_db = db;
9039 }
9040 }
9041 }
9042 if (best == NULL) return REDIS_ERR;
9043 key = dictGetEntryKey(best);
9044 val = dictGetEntryVal(best);
9045
9046 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9047 key->ptr, best_swappability);
9048
9049 /* Unshare the key if needed */
9050 if (key->refcount > 1) {
9051 robj *newkey = dupStringObject(key);
9052 decrRefCount(key);
9053 key = dictGetEntryKey(best) = newkey;
9054 }
9055 /* Swap it */
9056 if (usethreads) {
9057 vmSwapObjectThreaded(key,val,best_db);
9058 return REDIS_OK;
9059 } else {
9060 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9061 dictGetEntryVal(best) = NULL;
9062 return REDIS_OK;
9063 } else {
9064 return REDIS_ERR;
9065 }
9066 }
9067 }
9068
/* Swap one object out synchronously: vmSwapOneObject() without threads. */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9072
/* Swap one object out in background: vmSwapOneObject() using I/O threads. */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9076
9077 /* Return true if it's safe to swap out objects in a given moment.
9078 * Basically we don't want to swap objects out while there is a BGSAVE
9079 * or a BGAEOREWRITE running in backgroud. */
9080 static int vmCanSwapOut(void) {
9081 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9082 }
9083
9084 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9085 * and was deleted. Otherwise 0 is returned. */
9086 static int deleteIfSwapped(redisDb *db, robj *key) {
9087 dictEntry *de;
9088 robj *foundkey;
9089
9090 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9091 foundkey = dictGetEntryKey(de);
9092 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9093 deleteKey(db,key);
9094 return 1;
9095 }
9096
9097 /* =================== Virtual Memory - Threaded I/O ======================= */
9098
9099 static void freeIOJob(iojob *j) {
9100 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9101 j->type == REDIS_IOJOB_DO_SWAP ||
9102 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9103 decrRefCount(j->val);
9104 /* We don't decrRefCount the j->key field as we did't incremented
9105 * the count creating IO Jobs. This is because the key field here is
9106 * just used as an indentifier and if a key is removed the Job should
9107 * never be touched again. */
9108 zfree(j);
9109 }
9110
9111 /* Every time a thread finished a Job, it writes a byte into the write side
9112 * of an unix pipe in order to "awake" the main thread, and this function
9113 * is called. */
9114 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9115 int mask)
9116 {
9117 char buf[1];
9118 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9119 REDIS_NOTUSED(el);
9120 REDIS_NOTUSED(mask);
9121 REDIS_NOTUSED(privdata);
9122
9123 /* For every byte we read in the read side of the pipe, there is one
9124 * I/O job completed to process. */
9125 while((retval = read(fd,buf,1)) == 1) {
9126 iojob *j;
9127 listNode *ln;
9128 robj *key;
9129 struct dictEntry *de;
9130
9131 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9132
9133 /* Get the processed element (the oldest one) */
9134 lockThreadedIO();
9135 assert(listLength(server.io_processed) != 0);
9136 if (toprocess == -1) {
9137 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9138 if (toprocess <= 0) toprocess = 1;
9139 }
9140 ln = listFirst(server.io_processed);
9141 j = ln->value;
9142 listDelNode(server.io_processed,ln);
9143 unlockThreadedIO();
9144 /* If this job is marked as canceled, just ignore it */
9145 if (j->canceled) {
9146 freeIOJob(j);
9147 continue;
9148 }
9149 /* Post process it in the main thread, as there are things we
9150 * can do just here to avoid race conditions and/or invasive locks */
9151 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9152 de = dictFind(j->db->dict,j->key);
9153 assert(de != NULL);
9154 key = dictGetEntryKey(de);
9155 if (j->type == REDIS_IOJOB_LOAD) {
9156 redisDb *db;
9157
9158 /* Key loaded, bring it at home */
9159 key->storage = REDIS_VM_MEMORY;
9160 key->vm.atime = server.unixtime;
9161 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9162 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9163 (unsigned char*) key->ptr);
9164 server.vm_stats_swapped_objects--;
9165 server.vm_stats_swapins++;
9166 dictGetEntryVal(de) = j->val;
9167 incrRefCount(j->val);
9168 db = j->db;
9169 freeIOJob(j);
9170 /* Handle clients waiting for this key to be loaded. */
9171 handleClientsBlockedOnSwappedKey(db,key);
9172 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9173 /* Now we know the amount of pages required to swap this object.
9174 * Let's find some space for it, and queue this task again
9175 * rebranded as REDIS_IOJOB_DO_SWAP. */
9176 if (!vmCanSwapOut() ||
9177 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9178 {
9179 /* Ooops... no space or we can't swap as there is
9180 * a fork()ed Redis trying to save stuff on disk. */
9181 freeIOJob(j);
9182 key->storage = REDIS_VM_MEMORY; /* undo operation */
9183 } else {
9184 /* Note that we need to mark this pages as used now,
9185 * if the job will be canceled, we'll mark them as freed
9186 * again. */
9187 vmMarkPagesUsed(j->page,j->pages);
9188 j->type = REDIS_IOJOB_DO_SWAP;
9189 lockThreadedIO();
9190 queueIOJob(j);
9191 unlockThreadedIO();
9192 }
9193 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9194 robj *val;
9195
9196 /* Key swapped. We can finally free some memory. */
9197 if (key->storage != REDIS_VM_SWAPPING) {
9198 printf("key->storage: %d\n",key->storage);
9199 printf("key->name: %s\n",(char*)key->ptr);
9200 printf("key->refcount: %d\n",key->refcount);
9201 printf("val: %p\n",(void*)j->val);
9202 printf("val->type: %d\n",j->val->type);
9203 printf("val->ptr: %s\n",(char*)j->val->ptr);
9204 }
9205 redisAssert(key->storage == REDIS_VM_SWAPPING);
9206 val = dictGetEntryVal(de);
9207 key->vm.page = j->page;
9208 key->vm.usedpages = j->pages;
9209 key->storage = REDIS_VM_SWAPPED;
9210 key->vtype = j->val->type;
9211 decrRefCount(val); /* Deallocate the object from memory. */
9212 dictGetEntryVal(de) = NULL;
9213 redisLog(REDIS_DEBUG,
9214 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9215 (unsigned char*) key->ptr,
9216 (unsigned long long) j->page, (unsigned long long) j->pages);
9217 server.vm_stats_swapped_objects++;
9218 server.vm_stats_swapouts++;
9219 freeIOJob(j);
9220 /* Put a few more swap requests in queue if we are still
9221 * out of memory */
9222 if (trytoswap && vmCanSwapOut() &&
9223 zmalloc_used_memory() > server.vm_max_memory)
9224 {
9225 int more = 1;
9226 while(more) {
9227 lockThreadedIO();
9228 more = listLength(server.io_newjobs) <
9229 (unsigned) server.vm_max_threads;
9230 unlockThreadedIO();
9231 /* Don't waste CPU time if swappable objects are rare. */
9232 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9233 trytoswap = 0;
9234 break;
9235 }
9236 }
9237 }
9238 }
9239 processed++;
9240 if (processed == toprocess) return;
9241 }
9242 if (retval < 0 && errno != EAGAIN) {
9243 redisLog(REDIS_WARNING,
9244 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9245 strerror(errno));
9246 }
9247 }
9248
9249 static void lockThreadedIO(void) {
9250 pthread_mutex_lock(&server.io_mutex);
9251 }
9252
9253 static void unlockThreadedIO(void) {
9254 pthread_mutex_unlock(&server.io_mutex);
9255 }
9256
9257 /* Remove the specified object from the threaded I/O queue if still not
9258 * processed, otherwise make sure to flag it as canceled. */
9259 static void vmCancelThreadedIOJob(robj *o) {
9260 list *lists[3] = {
9261 server.io_newjobs, /* 0 */
9262 server.io_processing, /* 1 */
9263 server.io_processed /* 2 */
9264 };
9265 int i;
9266
9267 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9268 again:
9269 lockThreadedIO();
9270 /* Search for a matching key in one of the queues */
9271 for (i = 0; i < 3; i++) {
9272 listNode *ln;
9273 listIter li;
9274
9275 listRewind(lists[i],&li);
9276 while ((ln = listNext(&li)) != NULL) {
9277 iojob *job = ln->value;
9278
9279 if (job->canceled) continue; /* Skip this, already canceled. */
9280 if (job->key == o) {
9281 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9282 (void*)job, (char*)o->ptr, job->type, i);
9283 /* Mark the pages as free since the swap didn't happened
9284 * or happened but is now discarded. */
9285 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9286 vmMarkPagesFree(job->page,job->pages);
9287 /* Cancel the job. It depends on the list the job is
9288 * living in. */
9289 switch(i) {
9290 case 0: /* io_newjobs */
9291 /* If the job was yet not processed the best thing to do
9292 * is to remove it from the queue at all */
9293 freeIOJob(job);
9294 listDelNode(lists[i],ln);
9295 break;
9296 case 1: /* io_processing */
9297 /* Oh Shi- the thread is messing with the Job:
9298 *
9299 * Probably it's accessing the object if this is a
9300 * PREPARE_SWAP or DO_SWAP job.
9301 * If it's a LOAD job it may be reading from disk and
9302 * if we don't wait for the job to terminate before to
9303 * cancel it, maybe in a few microseconds data can be
9304 * corrupted in this pages. So the short story is:
9305 *
9306 * Better to wait for the job to move into the
9307 * next queue (processed)... */
9308
9309 /* We try again and again until the job is completed. */
9310 unlockThreadedIO();
9311 /* But let's wait some time for the I/O thread
9312 * to finish with this job. After all this condition
9313 * should be very rare. */
9314 usleep(1);
9315 goto again;
9316 case 2: /* io_processed */
9317 /* The job was already processed, that's easy...
9318 * just mark it as canceled so that we'll ignore it
9319 * when processing completed jobs. */
9320 job->canceled = 1;
9321 break;
9322 }
9323 /* Finally we have to adjust the storage type of the object
9324 * in order to "UNDO" the operaiton. */
9325 if (o->storage == REDIS_VM_LOADING)
9326 o->storage = REDIS_VM_SWAPPED;
9327 else if (o->storage == REDIS_VM_SWAPPING)
9328 o->storage = REDIS_VM_MEMORY;
9329 unlockThreadedIO();
9330 return;
9331 }
9332 }
9333 }
9334 unlockThreadedIO();
9335 assert(1 != 1); /* We should never reach this */
9336 }
9337
9338 static void *IOThreadEntryPoint(void *arg) {
9339 iojob *j;
9340 listNode *ln;
9341 REDIS_NOTUSED(arg);
9342
9343 pthread_detach(pthread_self());
9344 while(1) {
9345 /* Get a new job to process */
9346 lockThreadedIO();
9347 if (listLength(server.io_newjobs) == 0) {
9348 /* No new jobs in queue, exit. */
9349 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9350 (long) pthread_self());
9351 server.io_active_threads--;
9352 unlockThreadedIO();
9353 return NULL;
9354 }
9355 ln = listFirst(server.io_newjobs);
9356 j = ln->value;
9357 listDelNode(server.io_newjobs,ln);
9358 /* Add the job in the processing queue */
9359 j->thread = pthread_self();
9360 listAddNodeTail(server.io_processing,j);
9361 ln = listLast(server.io_processing); /* We use ln later to remove it */
9362 unlockThreadedIO();
9363 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9364 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9365
9366 /* Process the Job */
9367 if (j->type == REDIS_IOJOB_LOAD) {
9368 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9369 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9370 FILE *fp = fopen("/dev/null","w+");
9371 j->pages = rdbSavedObjectPages(j->val,fp);
9372 fclose(fp);
9373 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9374 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9375 j->canceled = 1;
9376 }
9377
9378 /* Done: insert the job into the processed queue */
9379 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9380 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9381 lockThreadedIO();
9382 listDelNode(server.io_processing,ln);
9383 listAddNodeTail(server.io_processed,j);
9384 unlockThreadedIO();
9385
9386 /* Signal the main thread there is new stuff to process */
9387 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9388 }
9389 return NULL; /* never reached */
9390 }
9391
9392 static void spawnIOThread(void) {
9393 pthread_t thread;
9394 sigset_t mask, omask;
9395 int err;
9396
9397 sigemptyset(&mask);
9398 sigaddset(&mask,SIGCHLD);
9399 sigaddset(&mask,SIGHUP);
9400 sigaddset(&mask,SIGPIPE);
9401 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9402 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9403 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9404 strerror(err));
9405 usleep(1000000);
9406 }
9407 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9408 server.io_active_threads++;
9409 }
9410
9411 /* We need to wait for the last thread to exit before we are able to
9412 * fork() in order to BGSAVE or BGREWRITEAOF. */
9413 static void waitEmptyIOJobsQueue(void) {
9414 while(1) {
9415 int io_processed_len;
9416
9417 lockThreadedIO();
9418 if (listLength(server.io_newjobs) == 0 &&
9419 listLength(server.io_processing) == 0 &&
9420 server.io_active_threads == 0)
9421 {
9422 unlockThreadedIO();
9423 return;
9424 }
9425 /* While waiting for empty jobs queue condition we post-process some
9426 * finshed job, as I/O threads may be hanging trying to write against
9427 * the io_ready_pipe_write FD but there are so much pending jobs that
9428 * it's blocking. */
9429 io_processed_len = listLength(server.io_processed);
9430 unlockThreadedIO();
9431 if (io_processed_len) {
9432 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9433 usleep(1000); /* 1 millisecond */
9434 } else {
9435 usleep(10000); /* 10 milliseconds */
9436 }
9437 }
9438 }
9439
9440 static void vmReopenSwapFile(void) {
9441 /* Note: we don't close the old one as we are in the child process
9442 * and don't want to mess at all with the original file object. */
9443 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9444 if (server.vm_fp == NULL) {
9445 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9446 server.vm_swap_file);
9447 _exit(1);
9448 }
9449 server.vm_fd = fileno(server.vm_fp);
9450 }
9451
9452 /* This function must be called while with threaded IO locked */
9453 static void queueIOJob(iojob *j) {
9454 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9455 (void*)j, j->type, (char*)j->key->ptr);
9456 listAddNodeTail(server.io_newjobs,j);
9457 if (server.io_active_threads < server.vm_max_threads)
9458 spawnIOThread();
9459 }
9460
9461 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9462 iojob *j;
9463
9464 assert(key->storage == REDIS_VM_MEMORY);
9465 assert(key->refcount == 1);
9466
9467 j = zmalloc(sizeof(*j));
9468 j->type = REDIS_IOJOB_PREPARE_SWAP;
9469 j->db = db;
9470 j->key = key;
9471 j->val = val;
9472 incrRefCount(val);
9473 j->canceled = 0;
9474 j->thread = (pthread_t) -1;
9475 key->storage = REDIS_VM_SWAPPING;
9476
9477 lockThreadedIO();
9478 queueIOJob(j);
9479 unlockThreadedIO();
9480 return REDIS_OK;
9481 }
9482
9483 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9484
9485 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9486 * If there is not already a job loading the key, it is craeted.
9487 * The key is added to the io_keys list in the client structure, and also
9488 * in the hash table mapping swapped keys to waiting clients, that is,
9489 * server.io_waited_keys. */
9490 static int waitForSwappedKey(redisClient *c, robj *key) {
9491 struct dictEntry *de;
9492 robj *o;
9493 list *l;
9494
9495 /* If the key does not exist or is already in RAM we don't need to
9496 * block the client at all. */
9497 de = dictFind(c->db->dict,key);
9498 if (de == NULL) return 0;
9499 o = dictGetEntryKey(de);
9500 if (o->storage == REDIS_VM_MEMORY) {
9501 return 0;
9502 } else if (o->storage == REDIS_VM_SWAPPING) {
9503 /* We were swapping the key, undo it! */
9504 vmCancelThreadedIOJob(o);
9505 return 0;
9506 }
9507
9508 /* OK: the key is either swapped, or being loaded just now. */
9509
9510 /* Add the key to the list of keys this client is waiting for.
9511 * This maps clients to keys they are waiting for. */
9512 listAddNodeTail(c->io_keys,key);
9513 incrRefCount(key);
9514
9515 /* Add the client to the swapped keys => clients waiting map. */
9516 de = dictFind(c->db->io_keys,key);
9517 if (de == NULL) {
9518 int retval;
9519
9520 /* For every key we take a list of clients blocked for it */
9521 l = listCreate();
9522 retval = dictAdd(c->db->io_keys,key,l);
9523 incrRefCount(key);
9524 assert(retval == DICT_OK);
9525 } else {
9526 l = dictGetEntryVal(de);
9527 }
9528 listAddNodeTail(l,c);
9529
9530 /* Are we already loading the key from disk? If not create a job */
9531 if (o->storage == REDIS_VM_SWAPPED) {
9532 iojob *j;
9533
9534 o->storage = REDIS_VM_LOADING;
9535 j = zmalloc(sizeof(*j));
9536 j->type = REDIS_IOJOB_LOAD;
9537 j->db = c->db;
9538 j->key = o;
9539 j->key->vtype = o->vtype;
9540 j->page = o->vm.page;
9541 j->val = NULL;
9542 j->canceled = 0;
9543 j->thread = (pthread_t) -1;
9544 lockThreadedIO();
9545 queueIOJob(j);
9546 unlockThreadedIO();
9547 }
9548 return 1;
9549 }
9550
9551 /* Preload keys for any command with first, last and step values for
9552 * the command keys prototype, as defined in the command table. */
9553 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9554 int j, last;
9555 if (cmd->vm_firstkey == 0) return;
9556 last = cmd->vm_lastkey;
9557 if (last < 0) last = argc+last;
9558 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9559 redisAssert(j < argc);
9560 waitForSwappedKey(c,argv[j]);
9561 }
9562 }
9563
9564 /* Preload keys needed for the ZUNION and ZINTER commands. */
9565 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9566 int i, num;
9567 num = atoi(c->argv[2]->ptr);
9568 for (i = 0; i < num; i++) {
9569 waitForSwappedKey(c,c->argv[3+i]);
9570 }
9571 }
9572
9573 /* Is this client attempting to run a command against swapped keys?
9574 * If so, block it ASAP, load the keys in background, then resume it.
9575 *
9576 * The important idea about this function is that it can fail! If keys will
9577 * still be swapped when the client is resumed, this key lookups will
9578 * just block loading keys from disk. In practical terms this should only
9579 * happen with SORT BY command or if there is a bug in this function.
9580 *
9581 * Return 1 if the client is marked as blocked, 0 if the client can
9582 * continue as the keys it is going to access appear to be in memory. */
9583 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9584 if (cmd->vm_preload_proc != NULL) {
9585 cmd->vm_preload_proc(c);
9586 } else {
9587 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9588 }
9589
9590 /* If the client was blocked for at least one key, mark it as blocked. */
9591 if (listLength(c->io_keys)) {
9592 c->flags |= REDIS_IO_WAIT;
9593 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9594 server.vm_blocked_clients++;
9595 return 1;
9596 } else {
9597 return 0;
9598 }
9599 }
9600
9601 /* Remove the 'key' from the list of blocked keys for a given client.
9602 *
9603 * The function returns 1 when there are no longer blocking keys after
9604 * the current one was removed (and the client can be unblocked). */
9605 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9606 list *l;
9607 listNode *ln;
9608 listIter li;
9609 struct dictEntry *de;
9610
9611 /* Remove the key from the list of keys this client is waiting for. */
9612 listRewind(c->io_keys,&li);
9613 while ((ln = listNext(&li)) != NULL) {
9614 if (compareStringObjects(ln->value,key) == 0) {
9615 listDelNode(c->io_keys,ln);
9616 break;
9617 }
9618 }
9619 assert(ln != NULL);
9620
9621 /* Remove the client form the key => waiting clients map. */
9622 de = dictFind(c->db->io_keys,key);
9623 assert(de != NULL);
9624 l = dictGetEntryVal(de);
9625 ln = listSearchKey(l,c);
9626 assert(ln != NULL);
9627 listDelNode(l,ln);
9628 if (listLength(l) == 0)
9629 dictDelete(c->db->io_keys,key);
9630
9631 return listLength(c->io_keys) == 0;
9632 }
9633
9634 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9635 struct dictEntry *de;
9636 list *l;
9637 listNode *ln;
9638 int len;
9639
9640 de = dictFind(db->io_keys,key);
9641 if (!de) return;
9642
9643 l = dictGetEntryVal(de);
9644 len = listLength(l);
9645 /* Note: we can't use something like while(listLength(l)) as the list
9646 * can be freed by the calling function when we remove the last element. */
9647 while (len--) {
9648 ln = listFirst(l);
9649 redisClient *c = ln->value;
9650
9651 if (dontWaitForSwappedKey(c,key)) {
9652 /* Put the client in the list of clients ready to go as we
9653 * loaded all the keys about it. */
9654 listAddNodeTail(server.io_ready_clients,c);
9655 }
9656 }
9657 }
9658
9659 /* =========================== Remote Configuration ========================= */
9660
9661 static void configSetCommand(redisClient *c) {
9662 robj *o = getDecodedObject(c->argv[3]);
9663 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9664 zfree(server.dbfilename);
9665 server.dbfilename = zstrdup(o->ptr);
9666 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9667 zfree(server.requirepass);
9668 server.requirepass = zstrdup(o->ptr);
9669 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9670 zfree(server.masterauth);
9671 server.masterauth = zstrdup(o->ptr);
9672 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9673 server.maxmemory = strtoll(o->ptr, NULL, 10);
9674 } else {
9675 addReplySds(c,sdscatprintf(sdsempty(),
9676 "-ERR not supported CONFIG parameter %s\r\n",
9677 (char*)c->argv[2]->ptr));
9678 decrRefCount(o);
9679 return;
9680 }
9681 decrRefCount(o);
9682 addReply(c,shared.ok);
9683 }
9684
9685 static void configGetCommand(redisClient *c) {
9686 robj *o = getDecodedObject(c->argv[2]);
9687 robj *lenobj = createObject(REDIS_STRING,NULL);
9688 char *pattern = o->ptr;
9689 int matches = 0;
9690
9691 addReply(c,lenobj);
9692 decrRefCount(lenobj);
9693
9694 if (stringmatch(pattern,"dbfilename",0)) {
9695 addReplyBulkCString(c,"dbfilename");
9696 addReplyBulkCString(c,server.dbfilename);
9697 matches++;
9698 }
9699 if (stringmatch(pattern,"requirepass",0)) {
9700 addReplyBulkCString(c,"requirepass");
9701 addReplyBulkCString(c,server.requirepass);
9702 matches++;
9703 }
9704 if (stringmatch(pattern,"masterauth",0)) {
9705 addReplyBulkCString(c,"masterauth");
9706 addReplyBulkCString(c,server.masterauth);
9707 matches++;
9708 }
9709 if (stringmatch(pattern,"maxmemory",0)) {
9710 char buf[128];
9711
9712 snprintf(buf,128,"%llu\n",server.maxmemory);
9713 addReplyBulkCString(c,"maxmemory");
9714 addReplyBulkCString(c,buf);
9715 matches++;
9716 }
9717 decrRefCount(o);
9718 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9719 }
9720
9721 static void configCommand(redisClient *c) {
9722 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9723 if (c->argc != 4) goto badarity;
9724 configSetCommand(c);
9725 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9726 if (c->argc != 3) goto badarity;
9727 configGetCommand(c);
9728 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9729 if (c->argc != 2) goto badarity;
9730 server.stat_numcommands = 0;
9731 server.stat_numconnections = 0;
9732 server.stat_expiredkeys = 0;
9733 server.stat_starttime = time(NULL);
9734 addReply(c,shared.ok);
9735 } else {
9736 addReplySds(c,sdscatprintf(sdsempty(),
9737 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9738 }
9739 return;
9740
9741 badarity:
9742 addReplySds(c,sdscatprintf(sdsempty(),
9743 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9744 (char*) c->argv[1]->ptr));
9745 }
9746
9747 /* =========================== Pubsub implementation ======================== */
9748
9749 static void freePubsubPattern(void *p) {
9750 pubsubPattern *pat = p;
9751
9752 decrRefCount(pat->pattern);
9753 zfree(pat);
9754 }
9755
9756 static int listMatchPubsubPattern(void *a, void *b) {
9757 pubsubPattern *pa = a, *pb = b;
9758
9759 return (pa->client == pb->client) &&
9760 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9761 }
9762
9763 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9764 * 0 if the client was already subscribed to that channel. */
9765 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9766 struct dictEntry *de;
9767 list *clients = NULL;
9768 int retval = 0;
9769
9770 /* Add the channel to the client -> channels hash table */
9771 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9772 retval = 1;
9773 incrRefCount(channel);
9774 /* Add the client to the channel -> list of clients hash table */
9775 de = dictFind(server.pubsub_channels,channel);
9776 if (de == NULL) {
9777 clients = listCreate();
9778 dictAdd(server.pubsub_channels,channel,clients);
9779 incrRefCount(channel);
9780 } else {
9781 clients = dictGetEntryVal(de);
9782 }
9783 listAddNodeTail(clients,c);
9784 }
9785 /* Notify the client */
9786 addReply(c,shared.mbulk3);
9787 addReply(c,shared.subscribebulk);
9788 addReplyBulk(c,channel);
9789 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9790 return retval;
9791 }
9792
9793 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9794 * 0 if the client was not subscribed to the specified channel. */
9795 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9796 struct dictEntry *de;
9797 list *clients;
9798 listNode *ln;
9799 int retval = 0;
9800
9801 /* Remove the channel from the client -> channels hash table */
9802 incrRefCount(channel); /* channel may be just a pointer to the same object
9803 we have in the hash tables. Protect it... */
9804 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9805 retval = 1;
9806 /* Remove the client from the channel -> clients list hash table */
9807 de = dictFind(server.pubsub_channels,channel);
9808 assert(de != NULL);
9809 clients = dictGetEntryVal(de);
9810 ln = listSearchKey(clients,c);
9811 assert(ln != NULL);
9812 listDelNode(clients,ln);
9813 if (listLength(clients) == 0) {
9814 /* Free the list and associated hash entry at all if this was
9815 * the latest client, so that it will be possible to abuse
9816 * Redis PUBSUB creating millions of channels. */
9817 dictDelete(server.pubsub_channels,channel);
9818 }
9819 }
9820 /* Notify the client */
9821 if (notify) {
9822 addReply(c,shared.mbulk3);
9823 addReply(c,shared.unsubscribebulk);
9824 addReplyBulk(c,channel);
9825 addReplyLong(c,dictSize(c->pubsub_channels)+
9826 listLength(c->pubsub_patterns));
9827
9828 }
9829 decrRefCount(channel); /* it is finally safe to release it */
9830 return retval;
9831 }
9832
9833 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9834 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9835 int retval = 0;
9836
9837 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9838 retval = 1;
9839 pubsubPattern *pat;
9840 listAddNodeTail(c->pubsub_patterns,pattern);
9841 incrRefCount(pattern);
9842 pat = zmalloc(sizeof(*pat));
9843 pat->pattern = getDecodedObject(pattern);
9844 pat->client = c;
9845 listAddNodeTail(server.pubsub_patterns,pat);
9846 }
9847 /* Notify the client */
9848 addReply(c,shared.mbulk3);
9849 addReply(c,shared.psubscribebulk);
9850 addReplyBulk(c,pattern);
9851 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9852 return retval;
9853 }
9854
9855 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9856 * 0 if the client was not subscribed to the specified channel. */
9857 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9858 listNode *ln;
9859 pubsubPattern pat;
9860 int retval = 0;
9861
9862 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9863 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9864 retval = 1;
9865 listDelNode(c->pubsub_patterns,ln);
9866 pat.client = c;
9867 pat.pattern = pattern;
9868 ln = listSearchKey(server.pubsub_patterns,&pat);
9869 listDelNode(server.pubsub_patterns,ln);
9870 }
9871 /* Notify the client */
9872 if (notify) {
9873 addReply(c,shared.mbulk3);
9874 addReply(c,shared.punsubscribebulk);
9875 addReplyBulk(c,pattern);
9876 addReplyLong(c,dictSize(c->pubsub_channels)+
9877 listLength(c->pubsub_patterns));
9878 }
9879 decrRefCount(pattern);
9880 return retval;
9881 }
9882
9883 /* Unsubscribe from all the channels. Return the number of channels the
9884 * client was subscribed from. */
9885 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9886 dictIterator *di = dictGetIterator(c->pubsub_channels);
9887 dictEntry *de;
9888 int count = 0;
9889
9890 while((de = dictNext(di)) != NULL) {
9891 robj *channel = dictGetEntryKey(de);
9892
9893 count += pubsubUnsubscribeChannel(c,channel,notify);
9894 }
9895 dictReleaseIterator(di);
9896 return count;
9897 }
9898
9899 /* Unsubscribe from all the patterns. Return the number of patterns the
9900 * client was subscribed from. */
9901 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9902 listNode *ln;
9903 listIter li;
9904 int count = 0;
9905
9906 listRewind(c->pubsub_patterns,&li);
9907 while ((ln = listNext(&li)) != NULL) {
9908 robj *pattern = ln->value;
9909
9910 count += pubsubUnsubscribePattern(c,pattern,notify);
9911 }
9912 return count;
9913 }
9914
9915 /* Publish a message */
9916 static int pubsubPublishMessage(robj *channel, robj *message) {
9917 int receivers = 0;
9918 struct dictEntry *de;
9919 listNode *ln;
9920 listIter li;
9921
9922 /* Send to clients listening for that channel */
9923 de = dictFind(server.pubsub_channels,channel);
9924 if (de) {
9925 list *list = dictGetEntryVal(de);
9926 listNode *ln;
9927 listIter li;
9928
9929 listRewind(list,&li);
9930 while ((ln = listNext(&li)) != NULL) {
9931 redisClient *c = ln->value;
9932
9933 addReply(c,shared.mbulk3);
9934 addReply(c,shared.messagebulk);
9935 addReplyBulk(c,channel);
9936 addReplyBulk(c,message);
9937 receivers++;
9938 }
9939 }
9940 /* Send to clients listening to matching channels */
9941 if (listLength(server.pubsub_patterns)) {
9942 listRewind(server.pubsub_patterns,&li);
9943 channel = getDecodedObject(channel);
9944 while ((ln = listNext(&li)) != NULL) {
9945 pubsubPattern *pat = ln->value;
9946
9947 if (stringmatchlen((char*)pat->pattern->ptr,
9948 sdslen(pat->pattern->ptr),
9949 (char*)channel->ptr,
9950 sdslen(channel->ptr),0)) {
9951 addReply(pat->client,shared.mbulk4);
9952 addReply(pat->client,shared.pmessagebulk);
9953 addReplyBulk(pat->client,pat->pattern);
9954 addReplyBulk(pat->client,channel);
9955 addReplyBulk(pat->client,message);
9956 receivers++;
9957 }
9958 }
9959 decrRefCount(channel);
9960 }
9961 return receivers;
9962 }
9963
9964 static void subscribeCommand(redisClient *c) {
9965 int j;
9966
9967 for (j = 1; j < c->argc; j++)
9968 pubsubSubscribeChannel(c,c->argv[j]);
9969 }
9970
9971 static void unsubscribeCommand(redisClient *c) {
9972 if (c->argc == 1) {
9973 pubsubUnsubscribeAllChannels(c,1);
9974 return;
9975 } else {
9976 int j;
9977
9978 for (j = 1; j < c->argc; j++)
9979 pubsubUnsubscribeChannel(c,c->argv[j],1);
9980 }
9981 }
9982
9983 static void psubscribeCommand(redisClient *c) {
9984 int j;
9985
9986 for (j = 1; j < c->argc; j++)
9987 pubsubSubscribePattern(c,c->argv[j]);
9988 }
9989
9990 static void punsubscribeCommand(redisClient *c) {
9991 if (c->argc == 1) {
9992 pubsubUnsubscribeAllPatterns(c,1);
9993 return;
9994 } else {
9995 int j;
9996
9997 for (j = 1; j < c->argc; j++)
9998 pubsubUnsubscribePattern(c,c->argv[j],1);
9999 }
10000 }
10001
10002 static void publishCommand(redisClient *c) {
10003 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10004 addReplyLong(c,receivers);
10005 }
10006
10007 /* ================================= Debugging ============================== */
10008
10009 static void debugCommand(redisClient *c) {
10010 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10011 *((char*)-1) = 'x';
10012 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10013 if (rdbSave(server.dbfilename) != REDIS_OK) {
10014 addReply(c,shared.err);
10015 return;
10016 }
10017 emptyDb();
10018 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10019 addReply(c,shared.err);
10020 return;
10021 }
10022 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10023 addReply(c,shared.ok);
10024 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10025 emptyDb();
10026 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10027 addReply(c,shared.err);
10028 return;
10029 }
10030 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10031 addReply(c,shared.ok);
10032 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10033 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10034 robj *key, *val;
10035
10036 if (!de) {
10037 addReply(c,shared.nokeyerr);
10038 return;
10039 }
10040 key = dictGetEntryKey(de);
10041 val = dictGetEntryVal(de);
10042 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10043 key->storage == REDIS_VM_SWAPPING)) {
10044 char *strenc;
10045 char buf[128];
10046
10047 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10048 strenc = strencoding[val->encoding];
10049 } else {
10050 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10051 strenc = buf;
10052 }
10053 addReplySds(c,sdscatprintf(sdsempty(),
10054 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10055 "encoding:%s serializedlength:%lld\r\n",
10056 (void*)key, key->refcount, (void*)val, val->refcount,
10057 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10058 } else {
10059 addReplySds(c,sdscatprintf(sdsempty(),
10060 "+Key at:%p refcount:%d, value swapped at: page %llu "
10061 "using %llu pages\r\n",
10062 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10063 (unsigned long long) key->vm.usedpages));
10064 }
10065 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10066 lookupKeyRead(c->db,c->argv[2]);
10067 addReply(c,shared.ok);
10068 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10069 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10070 robj *key, *val;
10071
10072 if (!server.vm_enabled) {
10073 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10074 return;
10075 }
10076 if (!de) {
10077 addReply(c,shared.nokeyerr);
10078 return;
10079 }
10080 key = dictGetEntryKey(de);
10081 val = dictGetEntryVal(de);
10082 /* If the key is shared we want to create a copy */
10083 if (key->refcount > 1) {
10084 robj *newkey = dupStringObject(key);
10085 decrRefCount(key);
10086 key = dictGetEntryKey(de) = newkey;
10087 }
10088 /* Swap it */
10089 if (key->storage != REDIS_VM_MEMORY) {
10090 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10091 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10092 dictGetEntryVal(de) = NULL;
10093 addReply(c,shared.ok);
10094 } else {
10095 addReply(c,shared.err);
10096 }
10097 } else {
10098 addReplySds(c,sdsnew(
10099 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10100 }
10101 }
10102
10103 static void _redisAssert(char *estr, char *file, int line) {
10104 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10105 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
10106 #ifdef HAVE_BACKTRACE
10107 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10108 *((char*)-1) = 'x';
10109 #endif
10110 }
10111
10112 static void _redisPanic(char *msg, char *file, int line) {
10113 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10114 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10115 #ifdef HAVE_BACKTRACE
10116 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10117 *((char*)-1) = 'x';
10118 #endif
10119 }
10120
10121 /* =================================== Main! ================================ */
10122
10123 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted to an integer with atoi(). Returns -1 if the file can't be
 * opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return value;
    if (fgets(line,sizeof(line),procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
10137
10138 void linuxOvercommitMemoryWarning(void) {
10139 if (linuxOvercommitMemoryValue() == 0) {
10140 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10141 }
10142 }
10143 #endif /* __linux__ */
10144
10145 static void daemonize(void) {
10146 int fd;
10147 FILE *fp;
10148
10149 if (fork() != 0) exit(0); /* parent exits */
10150 setsid(); /* create a new session */
10151
10152 /* Every output goes to /dev/null. If Redis is daemonized but
10153 * the 'logfile' is set to 'stdout' in the configuration file
10154 * it will not log at all. */
10155 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10156 dup2(fd, STDIN_FILENO);
10157 dup2(fd, STDOUT_FILENO);
10158 dup2(fd, STDERR_FILENO);
10159 if (fd > STDERR_FILENO) close(fd);
10160 }
10161 /* Try to write the pid file */
10162 fp = fopen(server.pidfile,"w");
10163 if (fp) {
10164 fprintf(fp,"%d\n",getpid());
10165 fclose(fp);
10166 }
10167 }
10168
10169 static void version() {
10170 printf("Redis server version %s\n", REDIS_VERSION);
10171 exit(0);
10172 }
10173
/* Print the command line usage on standard error and exit with status 1. */
static void usage() {
    FILE *out = stderr;

    fprintf(out,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(out,"       ./redis-server - (read config from stdin)\n");
    exit(1);
}
10179
10180 int main(int argc, char **argv) {
10181 time_t start;
10182
10183 initServerConfig();
10184 if (argc == 2) {
10185 if (strcmp(argv[1], "-v") == 0 ||
10186 strcmp(argv[1], "--version") == 0) version();
10187 if (strcmp(argv[1], "--help") == 0) usage();
10188 resetServerSaveParams();
10189 loadServerConfig(argv[1]);
10190 } else if ((argc > 2)) {
10191 usage();
10192 } else {
10193 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10194 }
10195 if (server.daemonize) daemonize();
10196 initServer();
10197 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10198 #ifdef __linux__
10199 linuxOvercommitMemoryWarning();
10200 #endif
10201 start = time(NULL);
10202 if (server.appendonly) {
10203 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10204 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10205 } else {
10206 if (rdbLoad(server.dbfilename) == REDIS_OK)
10207 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10208 }
10209 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10210 aeSetBeforeSleepProc(server.el,beforeSleep);
10211 aeMain(server.el);
10212 aeDeleteEventLoop(server.el);
10213 return 0;
10214 }
10215
10216 /* ============================= Backtrace support ========================= */
10217
10218 #ifdef HAVE_BACKTRACE
10219 static char *findFuncName(void *pointer, unsigned long *offset);
10220
10221 static void *getMcontextEip(ucontext_t *uc) {
10222 #if defined(__FreeBSD__)
10223 return (void*) uc->uc_mcontext.mc_eip;
10224 #elif defined(__dietlibc__)
10225 return (void*) uc->uc_mcontext.eip;
10226 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10227 #if __x86_64__
10228 return (void*) uc->uc_mcontext->__ss.__rip;
10229 #else
10230 return (void*) uc->uc_mcontext->__ss.__eip;
10231 #endif
10232 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10233 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10234 return (void*) uc->uc_mcontext->__ss.__rip;
10235 #else
10236 return (void*) uc->uc_mcontext->__ss.__eip;
10237 #endif
10238 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10239 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10240 #elif defined(__ia64__) /* Linux IA64 */
10241 return (void*) uc->uc_mcontext.sc_ip;
10242 #else
10243 return NULL;
10244 #endif
10245 }
10246
10247 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10248 void *trace[100];
10249 char **messages = NULL;
10250 int i, trace_size = 0;
10251 unsigned long offset=0;
10252 ucontext_t *uc = (ucontext_t*) secret;
10253 sds infostring;
10254 REDIS_NOTUSED(info);
10255
10256 redisLog(REDIS_WARNING,
10257 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10258 infostring = genRedisInfoString();
10259 redisLog(REDIS_WARNING, "%s",infostring);
10260 /* It's not safe to sdsfree() the returned string under memory
10261 * corruption conditions. Let it leak as we are going to abort */
10262
10263 trace_size = backtrace(trace, 100);
10264 /* overwrite sigaction with caller's address */
10265 if (getMcontextEip(uc) != NULL) {
10266 trace[1] = getMcontextEip(uc);
10267 }
10268 messages = backtrace_symbols(trace, trace_size);
10269
10270 for (i=1; i<trace_size; ++i) {
10271 char *fn = findFuncName(trace[i], &offset), *p;
10272
10273 p = strchr(messages[i],'+');
10274 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10275 redisLog(REDIS_WARNING,"%s", messages[i]);
10276 } else {
10277 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10278 }
10279 }
10280 /* free(messages); Don't call free() with possibly corrupted memory. */
10281 _exit(0);
10282 }
10283
10284 static void setupSigSegvAction(void) {
10285 struct sigaction act;
10286
10287 sigemptyset (&act.sa_mask);
10288 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10289 * is used. Otherwise, sa_handler is used */
10290 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10291 act.sa_sigaction = segvHandler;
10292 sigaction (SIGSEGV, &act, NULL);
10293 sigaction (SIGBUS, &act, NULL);
10294 sigaction (SIGFPE, &act, NULL);
10295 sigaction (SIGILL, &act, NULL);
10296 sigaction (SIGBUS, &act, NULL);
10297 return;
10298 }
10299
10300 #include "staticsymbols.h"
10301 /* This function try to convert a pointer into a function name. It's used in
10302 * oreder to provide a backtrace under segmentation fault that's able to
10303 * display functions declared as static (otherwise the backtrace is useless). */
10304 static char *findFuncName(void *pointer, unsigned long *offset){
10305 int i, ret = -1;
10306 unsigned long off, minoff = 0;
10307
10308 /* Try to match against the Symbol with the smallest offset */
10309 for (i=0; symsTable[i].pointer; i++) {
10310 unsigned long lp = (unsigned long) pointer;
10311
10312 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10313 off=lp-symsTable[i].pointer;
10314 if (ret < 0 || off < minoff) {
10315 minoff=off;
10316 ret=i;
10317 }
10318 }
10319 }
10320 if (ret == -1) return NULL;
10321 *offset = minoff;
10322 return symsTable[ret].name;
10323 }
10324 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no handler to install. */
static void setupSigSegvAction(void) {
    return;
}
10327 #endif /* HAVE_BACKTRACE */
10328
10329
10330
10331 /* The End */
10332
10333
10334