]> git.saurik.com Git - redis.git/blob - redis.c
f81db41cd9ca4888511ecd16a62c0d851fd9f7c5
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.10"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <math.h>
61 #include <pthread.h>
62
63 #if defined(__sun)
64 #include "solarisfixes.h"
65 #endif
66
67 #include "redis.h"
68 #include "ae.h" /* Event driven programming library */
69 #include "sds.h" /* Dynamic safe strings */
70 #include "anet.h" /* Networking the easy way */
71 #include "dict.h" /* Hash tables */
72 #include "adlist.h" /* Linked lists */
73 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
74 #include "lzf.h" /* LZF compression library */
75 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
76 #include "zipmap.h"
77
78 /* Error codes */
79 #define REDIS_OK 0
80 #define REDIS_ERR -1
81
82 /* Static server configuration */
83 #define REDIS_SERVERPORT 6379 /* TCP port */
84 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (60*5 = 300; presumably seconds -- TODO confirm) */
85 #define REDIS_IOBUF_LEN 1024
86 #define REDIS_LOADBUF_LEN 1024
87 #define REDIS_STATIC_ARGS 8
88 #define REDIS_DEFAULT_DBNUM 16
89 #define REDIS_CONFIGLINE_MAX 1024
90 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
91 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
92 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
93 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) /* 64 KiB */
94 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MiB) */
95
96 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
97 #define REDIS_WRITEV_THRESHOLD 3
98 /* Max number of iovecs used for each writev call */
99 #define REDIS_WRITEV_IOVEC_COUNT 256
100
101 /* Hash table parameters */
102 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
103
104 /* Command flags. Each value is a distinct bit, so flags can be OR-ed together
 * in the 'flags' member of a struct redisCommand table entry. */
105 #define REDIS_CMD_BULK 1 /* Bulk write command */
106 #define REDIS_CMD_INLINE 2 /* Inline command */
107 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
108 this flag will return an error when the 'maxmemory' option is set in the
109 config file and the server is using more than maxmemory bytes of memory.
110 In short these commands are denied on low memory conditions. */
111 #define REDIS_CMD_DENYOOM 4
112 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
113
114 /* Object types (stored in the 'type' member of a Redis object) */
115 #define REDIS_STRING 0
116 #define REDIS_LIST 1
117 #define REDIS_SET 2
118 #define REDIS_ZSET 3
119 #define REDIS_HASH 4
120
121 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
122 * internally represented in multiple ways. The 'encoding' field of the object
123 * is set to one of these values for this object. */
124 #define REDIS_ENCODING_RAW 0 /* Raw representation */
125 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
126 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
127 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
128
/* Printable names for the encodings. The entries are listed in the same order
 * as the REDIS_ENCODING_* values above (index 0 = "raw" ... index 3 =
 * "hashtable"); presumably indexed by an object's encoding -- TODO confirm
 * against the users of this table. */
129 static char* strencoding[] = {
130 "raw", "int", "zipmap", "hashtable"
131 };
132
133 /* Object types only used for dumping to disk */
134 #define REDIS_EXPIRETIME 253
135 #define REDIS_SELECTDB 254
136 #define REDIS_EOF 255
137
138 /* Defines related to the dump file format. To store 32 bits lengths for short
139 * keys requires a lot of space, so we check the most significant 2 bits of
140 * the first byte to interpret the length:
141 *
142 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
143 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
144 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
145 * 11|000000 this means: specially encoded object will follow. The six bits
146 * number specify the kind of object that follows.
147 * See the REDIS_RDB_ENC_* defines.
148 *
149 * Lengths up to 63 are stored using a single byte, most DB keys, and many
150 * values, will fit inside.
 *
 * The four constants below have the same numeric values as the two-bit
 * prefixes in the table above (00, 01, 10, 11). */
151 #define REDIS_RDB_6BITLEN 0
152 #define REDIS_RDB_14BITLEN 1
153 #define REDIS_RDB_32BITLEN 2
154 #define REDIS_RDB_ENCVAL 3
155 #define REDIS_RDB_LENERR UINT_MAX /* presumably the "could not read a length" marker -- TODO confirm */
156
157 /* When a length of a string object stored on disk has the first two bits
158 * set, the remaining six bits specify a special encoding for the object
159 * accordingly to the following defines: */
160 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
161 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
162 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
163 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
164
165 /* Virtual memory: values for the 'storage' field of a Redis object. */
166 #define REDIS_VM_MEMORY 0 /* The object is on memory */
167 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
168 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
169 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
170
171 /* Virtual memory static configuration stuff.
172 * Check vmFindContiguousPages() to know more about these magic numbers. */
173 #define REDIS_VM_MAX_NEAR_PAGES 65536
174 #define REDIS_VM_MAX_RANDOM_JUMP 4096
175 #define REDIS_VM_MAX_THREADS 32
176 #define REDIS_THREAD_STACK_SIZE (1024*1024*4) /* 4 MiB */
177 /* The following is the *percentage* of completed I/O jobs to process when the
178 * handler is called. While Virtual Memory I/O operations are performed by
179 * threads, these operations must be processed by the main thread when completed
180 * in order to take effect. */
181 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
182
183 /* Client flags. Distinct bits, OR-ed together in the client 'flags' field. */
184 #define REDIS_SLAVE 1 /* This client is a slave server */
185 #define REDIS_MASTER 2 /* This client is a master server */
186 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
187 #define REDIS_MULTI 8 /* This client is in a MULTI context */
188 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
189 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
190
191 /* Slave replication state - slave side */
192 #define REDIS_REPL_NONE 0 /* No active replication */
193 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
194 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
195
196 /* Slave replication state - from the point of view of master
197 * Note that in SEND_BULK and ONLINE state the slave receives new updates
198 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
199 * to start the next background saving in order to send updates to it.
 * The numbering continues after the slave-side states above, so the two
 * groups never share a value. */
200 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
201 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
202 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
203 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
204
205 /* List related stuff */
206 #define REDIS_HEAD 0
207 #define REDIS_TAIL 1
208
209 /* Sort operations */
210 #define REDIS_SORT_GET 0
211 #define REDIS_SORT_ASC 1
212 #define REDIS_SORT_DESC 2
213 #define REDIS_SORTKEY_MAX 1024
214
215 /* Log levels, in increasing order of severity */
216 #define REDIS_DEBUG 0
217 #define REDIS_VERBOSE 1
218 #define REDIS_NOTICE 2
219 #define REDIS_WARNING 3
220
221 /* Anti-warning macro: evaluates V and discards the result via a void cast. */
222 #define REDIS_NOTUSED(V) ((void) V)
223
224 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
225 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
226
227 /* Append only defines */
228 #define APPENDFSYNC_NO 0
229 #define APPENDFSYNC_ALWAYS 1
230 #define APPENDFSYNC_EVERYSEC 2
231
232 /* Hashes related defaults */
233 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
234 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
235
236 /* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(_e): when _e evaluates to false, calls _redisAssert() with the
 * stringified expression, __FILE__ and __LINE__, then calls _exit(1).
 * When _e is true the macro evaluates to (void)0.
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, __FILE__ and __LINE__, then calls _exit(1).
 *
 * NOTE(review): redisPanic expands to an unparenthesized comma expression, so
 * it is only safe when used as a whole statement. Also, because of '#_e', a
 * string literal argument is stringified with its quotes included. */
237 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
238 #define redisPanic(_e) _redisPanic(#_e,__FILE__,__LINE__),_exit(1)
239 static void _redisAssert(char *estr, char *file, int line);
240 static void _redisPanic(char *msg, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set / sorted set / hash */
245
246 /* The VM object structure: swap-file bookkeeping embedded as the last member
 * of every Redis object (see the 'vm' member of struct redisObject). */
247 struct redisObjectVM {
248 off_t page; /* the page at which the object is stored on disk */
249 off_t usedpages; /* number of pages used on disk */
250 time_t atime; /* Last access time */
251 } vm; /* NOTE(review): this declarator also defines a file-scope object named 'vm' -- confirm it is intentional */
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr; /* payload pointer */
256 unsigned char type; /* object type, e.g. REDIS_STRING */
257 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount; /* reference count */
263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead.
 * Because of that truncated allocation this member has to stay the last
 * one in the structure. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Initialize a Redis string object that was allocated on the stack (or in any
 * other caller-owned storage) rather than through the object allocator.
 *
 *   _var  an lvalue of the Redis object type (the object itself, not a
 *         pointer to it; pass *p to initialize through a pointer).
 *   _ptr  the payload pointer stored in the object's 'ptr' member.
 *
 * The object gets refcount 1, type REDIS_STRING and encoding
 * REDIS_ENCODING_RAW. The 'storage' member is written (REDIS_VM_MEMORY) only
 * when server.vm_enabled is non-zero; otherwise it is left untouched.
 *
 * This macro is kept next to the structure definition to make sure it is
 * updated whenever the structure changes, to avoid bugs like bug #85, which
 * was introduced exactly in this way.
 *
 * Usage notes:
 *  - There is deliberately no ';' after while(0): the caller's own semicolon
 *    terminates the statement, which keeps the macro usable as the body of
 *    an unbraced if/else.
 *  - Both arguments are parenthesized in the expansion, but _var is evaluated
 *    several times, so it must not have side effects. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
/* Per-database state: the key space plus the auxiliary tables tracking
 * expires and the keys that clients are blocked on. */
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id; /* presumably the index of this DB -- TODO confirm */
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
/* One queued command: its argument vector, argument count and the command
 * table entry it resolves to. */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
/* The whole queue of commands accumulated for a client. */
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-client state.
303 * Clients are taken in a linked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset (NOTE(review): 'long' here, while repldbsize below is off_t) */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE; presumably PSUBSCRIBE is meant -- TODO confirm) */
333 } redisClient;
334
/* One save point: a (seconds, changes) pair. The server keeps an array of
 * these in 'saveparams'. Presumably a save is triggered once 'changes'
 * modifications happened within 'seconds' -- TODO confirm against the cron. */
335 struct saveparam {
336 time_t seconds;
337 int changes;
338 };
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last successful save */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync; /* presumably one of the APPENDFSYNC_* values -- TODO confirm */
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
373 sds aofbuf; /* AOF buffer, written before entering the event loop */
374 struct saveparam *saveparams;
375 int saveparamslen;
376 char *logfile;
377 char *bindaddr;
378 char *dbfilename;
379 char *appendfilename;
380 char *requirepass;
381 int rdbcompression;
382 int activerehashing;
383 /* Replication related */
384 int isslave;
385 char *masterauth;
386 char *masterhost;
387 int masterport;
388 redisClient *master; /* client that is master for this slave */
389 int replstate;
390 unsigned int maxclients;
391 unsigned long long maxmemory;
392 unsigned int blpop_blocked_clients;
393 unsigned int vm_blocked_clients;
394 /* Sort parameters - qsort_r() is only available under BSD so we
395 * have to take this state global, in order to pass it to sortCompare() */
396 int sort_desc;
397 int sort_alpha;
398 int sort_bypattern;
399 /* Virtual memory configuration */
400 int vm_enabled;
401 char *vm_swap_file;
402 off_t vm_page_size;
403 off_t vm_pages;
404 unsigned long long vm_max_memory;
405 /* Hashes config */
406 size_t hash_max_zipmap_entries;
407 size_t hash_max_zipmap_value;
408 /* Virtual memory state */
409 FILE *vm_fp;
410 int vm_fd;
411 off_t vm_next_page; /* Next probably empty page */
412 off_t vm_near_pages; /* Number of pages allocated sequentially */
413 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
414 time_t unixtime; /* Unix time sampled every second. */
415 /* Virtual memory I/O threads stuff */
416 /* An I/O thread processes an element taken from the io_jobs queue and
417 * puts the result of the operation in the io_done list. While the
418 * job is being processed, it's put on io_processing queue.
 * NOTE(review): 'io_jobs' and 'io_done' do not match the member names
 * below (io_newjobs / io_processed); presumably those are the lists
 * meant -- TODO confirm. */
419 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
420 list *io_processing; /* List of VM I/O jobs being processed */
421 list *io_processed; /* List of VM I/O jobs already processed */
422 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
423 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
424 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
425 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
426 pthread_attr_t io_threads_attr; /* attributes for threads creation */
427 int io_active_threads; /* Number of running I/O threads */
428 int vm_max_threads; /* Max number of I/O threads running at the same time */
429 /* Our main thread is blocked on the event loop, looking for sockets ready
430 * to be read or written, so when a threaded I/O operation is ready to be
431 * processed by the main thread, the I/O thread will use a unix pipe to
432 * awake the main thread. The followings are the two pipe FDs. */
433 int io_ready_pipe_read;
434 int io_ready_pipe_write;
435 /* Virtual memory stats */
436 unsigned long long vm_stats_used_pages;
437 unsigned long long vm_stats_swapped_objects;
438 unsigned long long vm_stats_swapouts;
439 unsigned long long vm_stats_swapins;
440 /* Pubsub */
441 dict *pubsub_channels; /* Map channels to list of subscribed clients */
442 list *pubsub_patterns; /* A list of pubsub_patterns */
443 /* Misc */
444 FILE *devnull;
445 };
446
/* Pairs a client with one pattern object; the element type suggested by the
 * 'pubsub_patterns' lists (see their comments) -- TODO confirm. */
447 typedef struct pubsubPattern {
448 redisClient *client;
449 robj *pattern;
450 } pubsubPattern;
451
/* Signature of a command implementation: it receives the calling client. */
452 typedef void redisCommandProc(redisClient *c);
/* Signature of a custom "which keys must be preloaded" hook (see
 * vm_preload_proc below). */
453 typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* One entry of the command table. Table initializers list the members
 * positionally, so the member order here must not change. */
454 struct redisCommand {
455 char *name;
456 redisCommandProc *proc;
457 int arity; /* both positive and negative values appear in the table; presumably negative means "at least N arguments" -- TODO confirm */
458 int flags; /* OR of REDIS_CMD_* flags */
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisVmPreloadProc *vm_preload_proc;
463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey; /* The last argument that's a key */
466 int vm_keystep; /* The step between first and last key */
467 };
468
/* A (name, address) pair describing one function symbol; the address is kept
 * as an unsigned long. Presumably used to resolve addresses when printing a
 * stack trace -- TODO confirm against the users of this type. */
469 struct redisFunctionSym {
470 char *name;
471 unsigned long pointer;
472 };
473
/* One element being sorted: the object itself plus its sort key, which is
 * either a numeric score or another object to compare by (only one member of
 * the union is meaningful at a time). */
474 typedef struct _redisSortObject {
475 robj *obj;
476 union {
477 double score;
478 robj *cmpobj;
479 } u;
480 } redisSortObject;
481
/* One sort operation: a type (presumably one of the REDIS_SORT_* values --
 * TODO confirm) and the pattern it applies to. */
482 typedef struct _redisSortOperation {
483 int type;
484 robj *pattern;
485 } redisSortOperation;
486
487 /* ZSETs use a specialized version of Skiplists */
488
/* A skiplist node: a (score, obj) pair with a back pointer. 'forward' and
 * 'span' are pointers to separately sized arrays; presumably one entry per
 * level of the node -- TODO confirm against zslInsert(). */
489 typedef struct zskiplistNode {
490 struct zskiplistNode **forward;
491 struct zskiplistNode *backward;
492 unsigned int *span;
493 double score;
494 robj *obj;
495 } zskiplistNode;
496
/* The skiplist itself: header/tail nodes, element count and current level. */
497 typedef struct zskiplist {
498 struct zskiplistNode *header, *tail;
499 unsigned long length;
500 int level;
501 } zskiplist;
502
/* A sorted set is represented by a dict together with a skiplist. */
503 typedef struct zset {
504 dict *dict;
505 zskiplist *zsl;
506 } zset;
507
508 /* Our shared "common" objects */
509
510 #define REDIS_SHARED_INTEGERS 10000
/* Pointers to the shared objects, plus an array of REDIS_SHARED_INTEGERS
 * shared integer objects. The single instance is the global 'shared'. */
511 struct sharedObjectsStruct {
512 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
513 *colon, *nullbulk, *nullmultibulk, *queued,
514 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
515 *outofrangeerr, *plus,
516 *select0, *select1, *select2, *select3, *select4,
517 *select5, *select6, *select7, *select8, *select9,
518 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
519 *mbulk4, *psubscribebulk, *punsubscribebulk,
520 *integers[REDIS_SHARED_INTEGERS];
521 } shared;
522
523 /* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
526
527 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
528
529 /* VM threaded I/O request message */
530 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
533 typedef struct iojob {
534 int type; /* Request type, REDIS_IOJOB_* */
535 redisDb *db;/* Redis database */
536 robj *key; /* This I/O request is about swapping this key */
537 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
538 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
539 off_t page; /* Swap page where to read/write the object */
540 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
541 int canceled; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread; /* ID of the thread processing this entry */
543 } iojob;
544
545 /*================================ Prototypes =============================== */
/* Forward declarations of file-local (static) helpers, so that definitions
 * can appear in any order later in the file. */
546
547 static void freeStringObject(robj *o);
548 static void freeListObject(robj *o);
549 static void freeSetObject(robj *o);
550 static void decrRefCount(void *o); /* takes void*, unlike incrRefCount(robj*) below */
551 static robj *createObject(int type, void *ptr);
552 static void freeClient(redisClient *c);
553 static int rdbLoad(char *filename);
554 static void addReply(redisClient *c, robj *obj);
555 static void addReplySds(redisClient *c, sds s);
556 static void incrRefCount(robj *o);
557 static int rdbSaveBackground(char *filename);
558 static robj *createStringObject(char *ptr, size_t len);
559 static robj *dupStringObject(robj *o);
560 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
561 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
562 static void flushAppendOnlyFile(void);
563 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
564 static int syncWithMaster(void);
565 static robj *tryObjectEncoding(robj *o);
566 static robj *getDecodedObject(robj *o);
567 static int removeExpire(redisDb *db, robj *key);
568 static int expireIfNeeded(redisDb *db, robj *key);
569 static int deleteIfVolatile(redisDb *db, robj *key);
570 static int deleteIfSwapped(redisDb *db, robj *key);
571 static int deleteKey(redisDb *db, robj *key);
572 static time_t getExpire(redisDb *db, robj *key);
573 static int setExpire(redisDb *db, robj *key, time_t when);
574 static void updateSlavesWaitingBgsave(int bgsaveerr);
575 static void freeMemoryIfNeeded(void);
576 static int processCommand(redisClient *c);
577 static void setupSigSegvAction(void);
578 static void rdbRemoveTempFile(pid_t childpid);
579 static void aofRemoveTempFile(pid_t childpid);
580 static size_t stringObjectLen(robj *o);
581 static void processInputBuffer(redisClient *c);
582 static zskiplist *zslCreate(void);
583 static void zslFree(zskiplist *zsl);
584 static void zslInsert(zskiplist *zsl, double score, robj *obj);
585 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
586 static void initClientMultiState(redisClient *c);
587 static void freeClientMultiState(redisClient *c);
588 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
589 static void unblockClientWaitingData(redisClient *c);
590 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
591 static void vmInit(void);
592 static void vmMarkPagesFree(off_t page, off_t count);
593 static robj *vmLoadObject(robj *key);
594 static robj *vmPreviewObject(robj *key);
595 static int vmSwapOneObjectBlocking(void);
596 static int vmSwapOneObjectThreaded(void);
597 static int vmCanSwapOut(void);
598 static int tryFreeOneObjectFromFreelist(void);
599 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
600 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
601 static void vmCancelThreadedIOJob(robj *o);
602 static void lockThreadedIO(void);
603 static void unlockThreadedIO(void);
604 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
605 static void freeIOJob(iojob *j);
606 static void queueIOJob(iojob *j);
607 static int vmWriteObjectOnSwap(robj *o, off_t page);
608 static robj *vmReadObjectFromSwap(off_t page, int type);
609 static void waitEmptyIOJobsQueue(void);
610 static void vmReopenSwapFile(void);
611 static int vmFreePage(off_t page);
/* The next two match the redisVmPreloadProc signature. */
612 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
613 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
614 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
615 static int dontWaitForSwappedKey(redisClient *c, robj *key);
616 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
617 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
618 static struct redisCommand *lookupCommand(char *name);
619 static void call(redisClient *c, struct redisCommand *cmd);
620 static void resetClient(redisClient *c);
621 static void convertToRealHash(robj *o);
622 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
623 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
624 static void freePubsubPattern(void *p);
625 static int listMatchPubsubPattern(void *a, void *b);
626 static int compareStringObjects(robj *a, robj *b);
627 static void usage(); /* NOTE(review): empty parentheses declare no prototype in C; the other no-argument declarations here use (void) */
628 static int rewriteAppendOnlyFileBackground(void);
629
/* Command implementations. Every one has the redisCommandProc signature
 * (void fn(redisClient *c)) so that it can be stored in the 'proc' member of
 * a command table entry. */
630 static void authCommand(redisClient *c);
631 static void pingCommand(redisClient *c);
632 static void echoCommand(redisClient *c);
633 static void setCommand(redisClient *c);
634 static void setnxCommand(redisClient *c);
635 static void setexCommand(redisClient *c);
636 static void getCommand(redisClient *c);
637 static void delCommand(redisClient *c);
638 static void existsCommand(redisClient *c);
639 static void incrCommand(redisClient *c);
640 static void decrCommand(redisClient *c);
641 static void incrbyCommand(redisClient *c);
642 static void decrbyCommand(redisClient *c);
643 static void selectCommand(redisClient *c);
644 static void randomkeyCommand(redisClient *c);
645 static void keysCommand(redisClient *c);
646 static void dbsizeCommand(redisClient *c);
647 static void lastsaveCommand(redisClient *c);
648 static void saveCommand(redisClient *c);
649 static void bgsaveCommand(redisClient *c);
650 static void bgrewriteaofCommand(redisClient *c);
651 static void shutdownCommand(redisClient *c);
652 static void moveCommand(redisClient *c);
653 static void renameCommand(redisClient *c);
654 static void renamenxCommand(redisClient *c);
655 static void lpushCommand(redisClient *c);
656 static void rpushCommand(redisClient *c);
657 static void lpopCommand(redisClient *c);
658 static void rpopCommand(redisClient *c);
659 static void llenCommand(redisClient *c);
660 static void lindexCommand(redisClient *c);
661 static void lrangeCommand(redisClient *c);
662 static void ltrimCommand(redisClient *c);
663 static void typeCommand(redisClient *c);
664 static void lsetCommand(redisClient *c);
665 static void saddCommand(redisClient *c);
666 static void sremCommand(redisClient *c);
667 static void smoveCommand(redisClient *c);
668 static void sismemberCommand(redisClient *c);
669 static void scardCommand(redisClient *c);
670 static void spopCommand(redisClient *c);
671 static void srandmemberCommand(redisClient *c);
672 static void sinterCommand(redisClient *c);
673 static void sinterstoreCommand(redisClient *c);
674 static void sunionCommand(redisClient *c);
675 static void sunionstoreCommand(redisClient *c);
676 static void sdiffCommand(redisClient *c);
677 static void sdiffstoreCommand(redisClient *c);
678 static void syncCommand(redisClient *c);
679 static void flushdbCommand(redisClient *c);
680 static void flushallCommand(redisClient *c);
681 static void sortCommand(redisClient *c);
682 static void lremCommand(redisClient *c);
683 static void rpoplpushcommand(redisClient *c); /* NOTE(review): lower-case "command", unlike every other name in this list; the command table uses the same spelling */
684 static void infoCommand(redisClient *c);
685 static void mgetCommand(redisClient *c);
686 static void monitorCommand(redisClient *c);
687 static void expireCommand(redisClient *c);
688 static void expireatCommand(redisClient *c);
689 static void getsetCommand(redisClient *c);
690 static void ttlCommand(redisClient *c);
691 static void slaveofCommand(redisClient *c);
692 static void debugCommand(redisClient *c);
693 static void msetCommand(redisClient *c);
694 static void msetnxCommand(redisClient *c);
695 static void zaddCommand(redisClient *c);
696 static void zincrbyCommand(redisClient *c);
697 static void zrangeCommand(redisClient *c);
698 static void zrangebyscoreCommand(redisClient *c);
699 static void zcountCommand(redisClient *c);
700 static void zrevrangeCommand(redisClient *c);
701 static void zcardCommand(redisClient *c);
702 static void zremCommand(redisClient *c);
703 static void zscoreCommand(redisClient *c);
704 static void zremrangebyscoreCommand(redisClient *c);
705 static void multiCommand(redisClient *c);
706 static void execCommand(redisClient *c);
707 static void discardCommand(redisClient *c);
708 static void blpopCommand(redisClient *c);
709 static void brpopCommand(redisClient *c);
710 static void appendCommand(redisClient *c);
711 static void substrCommand(redisClient *c);
712 static void zrankCommand(redisClient *c);
713 static void zrevrankCommand(redisClient *c);
714 static void hsetCommand(redisClient *c);
715 static void hsetnxCommand(redisClient *c);
716 static void hgetCommand(redisClient *c);
717 static void hmsetCommand(redisClient *c);
718 static void hmgetCommand(redisClient *c);
719 static void hdelCommand(redisClient *c);
720 static void hlenCommand(redisClient *c);
721 static void zremrangebyrankCommand(redisClient *c);
722 static void zunionCommand(redisClient *c);
723 static void zinterCommand(redisClient *c);
724 static void hkeysCommand(redisClient *c);
725 static void hvalsCommand(redisClient *c);
726 static void hgetallCommand(redisClient *c);
727 static void hexistsCommand(redisClient *c);
728 static void configCommand(redisClient *c);
729 static void hincrbyCommand(redisClient *c);
730 static void subscribeCommand(redisClient *c);
731 static void unsubscribeCommand(redisClient *c);
732 static void psubscribeCommand(redisClient *c);
733 static void punsubscribeCommand(redisClient *c);
734 static void publishCommand(redisClient *c);
735
736 /*================================= Globals ================================= */
737
738 /* Global vars */
739 static struct redisServer server; /* server global state; the initStaticStringObject macro reads server.vm_enabled */
740 static struct redisCommand cmdTable[] = {
741 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
742 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
743 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
744 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
745 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
748 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
752 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
753 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
754 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
764 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
765 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
767 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
768 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
769 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
773 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
774 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
775 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
776 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
777 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
778 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
785 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
786 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
793 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
794 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
801 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
806 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
810 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
811 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
814 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
815 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
823 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
831 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
836 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
839 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
842 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
847 {NULL,NULL,0,0,NULL,0,0,0}
848 };
849
850 /*============================ Utility functions ============================ */
851
/* Glob-style pattern matching on length-delimited buffers.
 *
 * pattern/patternLen: the glob pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             non-zero to compare case-insensitively (via tolower()).
 *
 * Supported syntax:
 *   *        any sequence of characters, including the empty one
 *   ?        exactly one character
 *   [abc]    one character out of the listed ones
 *   [^abc]   one character NOT in the list
 *   [a-z]    one character in the (inclusive) range
 *   \x       the character x taken literally
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: every read is bounded by the
 * supplied length. An unterminated character class ("[ab") is accepted as if
 * the closing bracket were present. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* A run of stars is equivalent to a single one. The length
             * check keeps pattern[1] inside the pattern buffer. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star: matches whatever is left */
            /* Try to match the rest of the pattern at every suffix. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character. Without this check an
             * empty subject made stringLen negative and e.g. "[^a]*"
             * wrongly matched "". */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back one byte so that the
                     * common advance after the switch lands exactly on the
                     * end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped class member, compared literally. A lone
                     * trailing backslash is handled by the generic branch
                     * below, so patternLen can never go negative. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* <ctype.h> functions need unsigned char values. */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern byte, in
             * which case it stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* a literal needs a character to match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing stars may remain. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
974
/* Convenience wrapper around stringmatchlen() for NUL terminated inputs:
 * the length of both the pattern and the string is taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern, patternLen, string, stringLen, nocase);
}
978
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * Accepted format: an optional '-', decimal digits, then an optional unit
 * (compared case-insensitively):
 *
 *   none or "b"  -> 1
 *   "k"          -> 1000            "kb" -> 1024
 *   "m"          -> 1000*1000       "mb" -> 1024*1024
 *   "g"          -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If err is not NULL, *err is set to 0 on success and to 1 on error:
 *   - unknown unit: the digits are still converted and returned as bytes;
 *   - 128 or more leading sign/digit characters: LLONG_MAX is returned;
 *   - the product does not fit in a long long: the result saturates to
 *     LLONG_MAX (or LLONG_MIN for negative input) instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is required because
     * isdigit() is only defined for unsigned char values (and EOF). */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy sign and digits (without the unit) so strtoll() can parse them. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1, so these divisions are safe. Signed overflow in
     * val*mul would be undefined behavior: saturate and report instead. */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val > 0) ? LLONG_MAX : LLONG_MIN;
    }
    return val*mul;
}
1025
1026 static void redisLog(int level, const char *fmt, ...) {
1027 va_list ap;
1028 FILE *fp;
1029
1030 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1031 if (!fp) return;
1032
1033 va_start(ap, fmt);
1034 if (level >= server.verbosity) {
1035 char *c = ".-*#";
1036 char buf[64];
1037 time_t now;
1038
1039 now = time(NULL);
1040 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1041 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1042 vfprintf(fp, fmt, ap);
1043 fprintf(fp,"\n");
1044 fflush(fp);
1045 }
1046 va_end(ap);
1047
1048 if (server.logfile) fclose(fp);
1049 }
1050
1051 /*====================== Hash table type implementation ==================== */
1052
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1056
/* Value destructor that simply hands the value to zfree().
 * The privdata argument is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1062
1063 static void dictListDestructor(void *privdata, void *val)
1064 {
1065 DICT_NOTUSED(privdata);
1066 listRelease((list*)val);
1067 }
1068
1069 static int sdsDictKeyCompare(void *privdata, const void *key1,
1070 const void *key2)
1071 {
1072 int l1,l2;
1073 DICT_NOTUSED(privdata);
1074
1075 l1 = sdslen((sds)key1);
1076 l2 = sdslen((sds)key2);
1077 if (l1 != l2) return 0;
1078 return memcmp(key1, key2, l1) == 0;
1079 }
1080
/* Destructor for keys/values that are redis objects: drops one reference
 * with decrRefCount(). NULL is tolerated and ignored, since the value of a
 * swapped out key is set to NULL. The privdata argument is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1088
1089 static int dictObjKeyCompare(void *privdata, const void *key1,
1090 const void *key2)
1091 {
1092 const robj *o1 = key1, *o2 = key2;
1093 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1094 }
1095
1096 static unsigned int dictObjHash(const void *key) {
1097 const robj *o = key;
1098 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1099 }
1100
1101 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1102 const void *key2)
1103 {
1104 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1105 int cmp;
1106
1107 if (o1->encoding == REDIS_ENCODING_INT &&
1108 o2->encoding == REDIS_ENCODING_INT &&
1109 o1->ptr == o2->ptr) return 1;
1110
1111 o1 = getDecodedObject(o1);
1112 o2 = getDecodedObject(o2);
1113 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1114 decrRefCount(o1);
1115 decrRefCount(o2);
1116 return cmp;
1117 }
1118
1119 static unsigned int dictEncObjHash(const void *key) {
1120 robj *o = (robj*) key;
1121
1122 if (o->encoding == REDIS_ENCODING_RAW) {
1123 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1124 } else {
1125 if (o->encoding == REDIS_ENCODING_INT) {
1126 char buf[32];
1127 int len;
1128
1129 len = snprintf(buf,32,"%ld",(long)o->ptr);
1130 return dictGenHashFunction((unsigned char*)buf, len);
1131 } else {
1132 unsigned int hash;
1133
1134 o = getDecodedObject(o);
1135 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1136 decrRefCount(o);
1137 return hash;
1138 }
1139 }
1140 }
1141
/* Sets type and expires.
 * Keys are (possibly encoded) redis objects: hashed with dictEncObjHash,
 * compared with dictEncObjKeyCompare and released through
 * dictRedisObjectDestructor. No value destructor is installed.
 * NOTE(review): the table commented as "Db->expires" in this file is
 * keyptrDictType; confirm whether "and expires" still applies here. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1151
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are (possibly encoded) redis objects; values are released with
 * dictVanillaFree, i.e. plain zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1161
/* Db->dict: the main key space of a database.
 * Keys are hashed and compared through their ->ptr SDS string (dictObjHash,
 * dictObjKeyCompare); both keys and values are redis objects released with
 * dictRedisObjectDestructor. */
static dictType dbDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1171
/* Db->expires.
 * Same key handling as the main dictionary type (dictObjHash,
 * dictObjKeyCompare, dictRedisObjectDestructor), but no value destructor:
 * the value is not a pointer that has to be freed (serverCron() reads it
 * back by casting the entry value to time_t). */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1181
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both field names (keys) and values are (possibly encoded) redis objects,
 * released with dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1191
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Values are released with dictListDestructor, i.e. listRelease(). */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1203
1204 static void version();
1205
1206 /* ========================= Random utility functions ======================= */
1207
1208 /* Redis generally does not try to recover from out of memory conditions
1209 * when allocating objects or strings, it is not clear if it will be possible
1210 * to report this condition to the client since the networking layer itself
1211 * is based on heap allocation for send buffers, so we simply abort.
1212 * At least the code will be simpler to read... */
1213 static void oom(const char *msg) {
1214 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1215 sleep(1);
1216 abort();
1217 }
1218
1219 /* ====================== Redis server networking stuff ===================== */
1220 static void closeTimedoutClients(void) {
1221 redisClient *c;
1222 listNode *ln;
1223 time_t now = time(NULL);
1224 listIter li;
1225
1226 listRewind(server.clients,&li);
1227 while ((ln = listNext(&li)) != NULL) {
1228 c = listNodeValue(ln);
1229 if (server.maxidletime &&
1230 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1231 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1232 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1233 listLength(c->pubsub_patterns) == 0 &&
1234 (now - c->lastinteraction > server.maxidletime))
1235 {
1236 redisLog(REDIS_VERBOSE,"Closing idle client");
1237 freeClient(c);
1238 } else if (c->flags & REDIS_BLOCKED) {
1239 if (c->blockingto != 0 && c->blockingto < now) {
1240 addReply(c,shared.nullmultibulk);
1241 unblockClientWaitingData(c);
1242 }
1243 }
1244 }
1245 }
1246
1247 static int htNeedsResize(dict *dict) {
1248 long long size, used;
1249
1250 size = dictSlots(dict);
1251 used = dictSize(dict);
1252 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1253 (used*100/size < REDIS_HT_MINFILL));
1254 }
1255
1256 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1257 * we resize the hash table to save memory */
1258 static void tryResizeHashTables(void) {
1259 int j;
1260
1261 for (j = 0; j < server.dbnum; j++) {
1262 if (htNeedsResize(server.db[j].dict))
1263 dictResize(server.db[j].dict);
1264 if (htNeedsResize(server.db[j].expires))
1265 dictResize(server.db[j].expires);
1266 }
1267 }
1268
1269 /* Our hash table implementation performs rehashing incrementally while
1270 * we write/read from the hash table. Still if the server is idle, the hash
1271 * table will use two tables for a long time. So we try to use 1 millisecond
1272 * of CPU time at every serverCron() loop in order to rehash some key. */
1273 static void incrementallyRehash(void) {
1274 int j;
1275
1276 for (j = 0; j < server.dbnum; j++) {
1277 if (dictIsRehashing(server.db[j].dict)) {
1278 dictRehashMilliseconds(server.db[j].dict,1);
1279 break; /* already used our millisecond for this loop... */
1280 }
1281 }
1282 }
1283
1284 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1285 void backgroundSaveDoneHandler(int statloc) {
1286 int exitcode = WEXITSTATUS(statloc);
1287 int bysignal = WIFSIGNALED(statloc);
1288
1289 if (!bysignal && exitcode == 0) {
1290 redisLog(REDIS_NOTICE,
1291 "Background saving terminated with success");
1292 server.dirty = 0;
1293 server.lastsave = time(NULL);
1294 } else if (!bysignal && exitcode != 0) {
1295 redisLog(REDIS_WARNING, "Background saving error");
1296 } else {
1297 redisLog(REDIS_WARNING,
1298 "Background saving terminated by signal %d", WTERMSIG(statloc));
1299 rdbRemoveTempFile(server.bgsavechildpid);
1300 }
1301 server.bgsavechildpid = -1;
1302 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1303 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1304 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1305 }
1306
1307 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1308 * Handle this. */
1309 void backgroundRewriteDoneHandler(int statloc) {
1310 int exitcode = WEXITSTATUS(statloc);
1311 int bysignal = WIFSIGNALED(statloc);
1312
1313 if (!bysignal && exitcode == 0) {
1314 int fd;
1315 char tmpfile[256];
1316
1317 redisLog(REDIS_NOTICE,
1318 "Background append only file rewriting terminated with success");
1319 /* Now it's time to flush the differences accumulated by the parent */
1320 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1321 fd = open(tmpfile,O_WRONLY|O_APPEND);
1322 if (fd == -1) {
1323 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1324 goto cleanup;
1325 }
1326 /* Flush our data... */
1327 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1328 (signed) sdslen(server.bgrewritebuf)) {
1329 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1330 close(fd);
1331 goto cleanup;
1332 }
1333 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1334 /* Now our work is to rename the temp file into the stable file. And
1335 * switch the file descriptor used by the server for append only. */
1336 if (rename(tmpfile,server.appendfilename) == -1) {
1337 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1338 close(fd);
1339 goto cleanup;
1340 }
1341 /* Mission completed... almost */
1342 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1343 if (server.appendfd != -1) {
1344 /* If append only is actually enabled... */
1345 close(server.appendfd);
1346 server.appendfd = fd;
1347 fsync(fd);
1348 server.appendseldb = -1; /* Make sure it will issue SELECT */
1349 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1350 } else {
1351 /* If append only is disabled we just generate a dump in this
1352 * format. Why not? */
1353 close(fd);
1354 }
1355 } else if (!bysignal && exitcode != 0) {
1356 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1357 } else {
1358 redisLog(REDIS_WARNING,
1359 "Background append only file rewriting terminated by signal %d",
1360 WTERMSIG(statloc));
1361 }
1362 cleanup:
1363 sdsfree(server.bgrewritebuf);
1364 server.bgrewritebuf = sdsempty();
1365 aofRemoveTempFile(server.bgrewritechildpid);
1366 server.bgrewritechildpid = -1;
1367 }
1368
1369 /* This function is called once a background process of some kind terminates,
1370 * as we want to avoid resizing the hash tables when there is a child in order
1371 * to play well with copy-on-write (otherwise when a resize happens lots of
1372 * memory pages are copied). The goal of this function is to update the ability
1373 * for dict.c to resize the hash tables accordingly to the fact we have o not
1374 * running childs. */
1375 static void updateDictResizePolicy(void) {
1376 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1377 dictEnableResize();
1378 else
1379 dictDisableResize();
1380 }
1381
1382 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1383 int j, loops = server.cronloops++;
1384 REDIS_NOTUSED(eventLoop);
1385 REDIS_NOTUSED(id);
1386 REDIS_NOTUSED(clientData);
1387
1388 /* We take a cached value of the unix time in the global state because
1389 * with virtual memory and aging there is to store the current time
1390 * in objects at every object access, and accuracy is not needed.
1391 * To access a global var is faster than calling time(NULL) */
1392 server.unixtime = time(NULL);
1393
1394 /* Show some info about non-empty databases */
1395 for (j = 0; j < server.dbnum; j++) {
1396 long long size, used, vkeys;
1397
1398 size = dictSlots(server.db[j].dict);
1399 used = dictSize(server.db[j].dict);
1400 vkeys = dictSize(server.db[j].expires);
1401 if (!(loops % 50) && (used || vkeys)) {
1402 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1403 /* dictPrintStats(server.dict); */
1404 }
1405 }
1406
1407 /* We don't want to resize the hash tables while a bacground saving
1408 * is in progress: the saving child is created using fork() that is
1409 * implemented with a copy-on-write semantic in most modern systems, so
1410 * if we resize the HT while there is the saving child at work actually
1411 * a lot of memory movements in the parent will cause a lot of pages
1412 * copied. */
1413 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1414 if (!(loops % 10)) tryResizeHashTables();
1415 if (server.activerehashing) incrementallyRehash();
1416 }
1417
1418 /* Show information about connected clients */
1419 if (!(loops % 50)) {
1420 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1421 listLength(server.clients)-listLength(server.slaves),
1422 listLength(server.slaves),
1423 zmalloc_used_memory());
1424 }
1425
1426 /* Close connections of timedout clients */
1427 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1428 closeTimedoutClients();
1429
1430 /* Check if a background saving or AOF rewrite in progress terminated */
1431 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1432 int statloc;
1433 pid_t pid;
1434
1435 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1436 if (pid == server.bgsavechildpid) {
1437 backgroundSaveDoneHandler(statloc);
1438 } else {
1439 backgroundRewriteDoneHandler(statloc);
1440 }
1441 updateDictResizePolicy();
1442 }
1443 } else {
1444 /* If there is not a background saving in progress check if
1445 * we have to save now */
1446 time_t now = time(NULL);
1447 for (j = 0; j < server.saveparamslen; j++) {
1448 struct saveparam *sp = server.saveparams+j;
1449
1450 if (server.dirty >= sp->changes &&
1451 now-server.lastsave > sp->seconds) {
1452 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1453 sp->changes, sp->seconds);
1454 rdbSaveBackground(server.dbfilename);
1455 break;
1456 }
1457 }
1458 }
1459
1460 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1461 * will use few CPU cycles if there are few expiring keys, otherwise
1462 * it will get more aggressive to avoid that too much memory is used by
1463 * keys that can be removed from the keyspace. */
1464 for (j = 0; j < server.dbnum; j++) {
1465 int expired;
1466 redisDb *db = server.db+j;
1467
1468 /* Continue to expire if at the end of the cycle more than 25%
1469 * of the keys were expired. */
1470 do {
1471 long num = dictSize(db->expires);
1472 time_t now = time(NULL);
1473
1474 expired = 0;
1475 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1476 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1477 while (num--) {
1478 dictEntry *de;
1479 time_t t;
1480
1481 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1482 t = (time_t) dictGetEntryVal(de);
1483 if (now > t) {
1484 deleteKey(db,dictGetEntryKey(de));
1485 expired++;
1486 server.stat_expiredkeys++;
1487 }
1488 }
1489 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1490 }
1491
1492 /* Swap a few keys on disk if we are over the memory limit and VM
1493 * is enbled. Try to free objects from the free list first. */
1494 if (vmCanSwapOut()) {
1495 while (server.vm_enabled && zmalloc_used_memory() >
1496 server.vm_max_memory)
1497 {
1498 int retval;
1499
1500 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1501 retval = (server.vm_max_threads == 0) ?
1502 vmSwapOneObjectBlocking() :
1503 vmSwapOneObjectThreaded();
1504 if (retval == REDIS_ERR && !(loops % 300) &&
1505 zmalloc_used_memory() >
1506 (server.vm_max_memory+server.vm_max_memory/10))
1507 {
1508 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1509 }
1510 /* Note that when using threade I/O we free just one object,
1511 * because anyway when the I/O thread in charge to swap this
1512 * object out will finish, the handler of completed jobs
1513 * will try to swap more objects if we are still out of memory. */
1514 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1515 }
1516 }
1517
1518 /* Check if we should connect to a MASTER */
1519 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1520 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1521 if (syncWithMaster() == REDIS_OK) {
1522 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1523 if (server.appendonly) rewriteAppendOnlyFileBackground();
1524 }
1525 }
1526 return 100;
1527 }
1528
1529 /* This function gets called every time Redis is entering the
1530 * main loop of the event driven library, that is, before to sleep
1531 * for ready file descriptors. */
1532 static void beforeSleep(struct aeEventLoop *eventLoop) {
1533 REDIS_NOTUSED(eventLoop);
1534
1535 /* Awake clients that got all the swapped keys they requested */
1536 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1537 listIter li;
1538 listNode *ln;
1539
1540 listRewind(server.io_ready_clients,&li);
1541 while((ln = listNext(&li))) {
1542 redisClient *c = ln->value;
1543 struct redisCommand *cmd;
1544
1545 /* Resume the client. */
1546 listDelNode(server.io_ready_clients,ln);
1547 c->flags &= (~REDIS_IO_WAIT);
1548 server.vm_blocked_clients--;
1549 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1550 readQueryFromClient, c);
1551 cmd = lookupCommand(c->argv[0]->ptr);
1552 assert(cmd != NULL);
1553 call(c,cmd);
1554 resetClient(c);
1555 /* There may be more data to process in the input buffer. */
1556 if (c->querybuf && sdslen(c->querybuf) > 0)
1557 processInputBuffer(c);
1558 }
1559 }
1560 /* Write the AOF buffer on disk */
1561 flushAppendOnlyFile();
1562 }
1563
1564 static void createSharedObjects(void) {
1565 int j;
1566
1567 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1568 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1569 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1570 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1571 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1572 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1573 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1574 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1575 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1576 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1577 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1578 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1579 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1580 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1581 "-ERR no such key\r\n"));
1582 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1583 "-ERR syntax error\r\n"));
1584 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1585 "-ERR source and destination objects are the same\r\n"));
1586 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1587 "-ERR index out of range\r\n"));
1588 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1589 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1590 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1591 shared.select0 = createStringObject("select 0\r\n",10);
1592 shared.select1 = createStringObject("select 1\r\n",10);
1593 shared.select2 = createStringObject("select 2\r\n",10);
1594 shared.select3 = createStringObject("select 3\r\n",10);
1595 shared.select4 = createStringObject("select 4\r\n",10);
1596 shared.select5 = createStringObject("select 5\r\n",10);
1597 shared.select6 = createStringObject("select 6\r\n",10);
1598 shared.select7 = createStringObject("select 7\r\n",10);
1599 shared.select8 = createStringObject("select 8\r\n",10);
1600 shared.select9 = createStringObject("select 9\r\n",10);
1601 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1602 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1603 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1604 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1605 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1606 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1607 shared.mbulk3 = createStringObject("*3\r\n",4);
1608 shared.mbulk4 = createStringObject("*4\r\n",4);
1609 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1610 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1611 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1612 }
1613 }
1614
1615 static void appendServerSaveParams(time_t seconds, int changes) {
1616 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1617 server.saveparams[server.saveparamslen].seconds = seconds;
1618 server.saveparams[server.saveparamslen].changes = changes;
1619 server.saveparamslen++;
1620 }
1621
1622 static void resetServerSaveParams() {
1623 zfree(server.saveparams);
1624 server.saveparams = NULL;
1625 server.saveparamslen = 0;
1626 }
1627
1628 static void initServerConfig() {
1629 server.dbnum = REDIS_DEFAULT_DBNUM;
1630 server.port = REDIS_SERVERPORT;
1631 server.verbosity = REDIS_VERBOSE;
1632 server.maxidletime = REDIS_MAXIDLETIME;
1633 server.saveparams = NULL;
1634 server.logfile = NULL; /* NULL = log on standard output */
1635 server.bindaddr = NULL;
1636 server.glueoutputbuf = 1;
1637 server.daemonize = 0;
1638 server.appendonly = 0;
1639 server.appendfsync = APPENDFSYNC_ALWAYS;
1640 server.lastfsync = time(NULL);
1641 server.appendfd = -1;
1642 server.appendseldb = -1; /* Make sure the first time will not match */
1643 server.pidfile = zstrdup("/var/run/redis.pid");
1644 server.dbfilename = zstrdup("dump.rdb");
1645 server.appendfilename = zstrdup("appendonly.aof");
1646 server.requirepass = NULL;
1647 server.rdbcompression = 1;
1648 server.activerehashing = 1;
1649 server.maxclients = 0;
1650 server.blpop_blocked_clients = 0;
1651 server.maxmemory = 0;
1652 server.vm_enabled = 0;
1653 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1654 server.vm_page_size = 256; /* 256 bytes per page */
1655 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1656 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1657 server.vm_max_threads = 4;
1658 server.vm_blocked_clients = 0;
1659 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1660 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1661
1662 resetServerSaveParams();
1663
1664 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1665 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1666 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1667 /* Replication related */
1668 server.isslave = 0;
1669 server.masterauth = NULL;
1670 server.masterhost = NULL;
1671 server.masterport = 6379;
1672 server.master = NULL;
1673 server.replstate = REDIS_REPL_NONE;
1674
1675 /* Double constants initialization */
1676 R_Zero = 0.0;
1677 R_PosInf = 1.0/R_Zero;
1678 R_NegInf = -1.0/R_Zero;
1679 R_Nan = R_Zero/R_Zero;
1680 }
1681
1682 static void initServer() {
1683 int j;
1684
1685 signal(SIGHUP, SIG_IGN);
1686 signal(SIGPIPE, SIG_IGN);
1687 setupSigSegvAction();
1688
1689 server.devnull = fopen("/dev/null","w");
1690 if (server.devnull == NULL) {
1691 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1692 exit(1);
1693 }
1694 server.clients = listCreate();
1695 server.slaves = listCreate();
1696 server.monitors = listCreate();
1697 server.objfreelist = listCreate();
1698 createSharedObjects();
1699 server.el = aeCreateEventLoop();
1700 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1701 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1702 if (server.fd == -1) {
1703 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1704 exit(1);
1705 }
1706 for (j = 0; j < server.dbnum; j++) {
1707 server.db[j].dict = dictCreate(&dbDictType,NULL);
1708 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1709 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1710 if (server.vm_enabled)
1711 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1712 server.db[j].id = j;
1713 }
1714 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1715 server.pubsub_patterns = listCreate();
1716 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1717 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1718 server.cronloops = 0;
1719 server.bgsavechildpid = -1;
1720 server.bgrewritechildpid = -1;
1721 server.bgrewritebuf = sdsempty();
1722 server.aofbuf = sdsempty();
1723 server.lastsave = time(NULL);
1724 server.dirty = 0;
1725 server.stat_numcommands = 0;
1726 server.stat_numconnections = 0;
1727 server.stat_expiredkeys = 0;
1728 server.stat_starttime = time(NULL);
1729 server.unixtime = time(NULL);
1730 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1731 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1732 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1733
1734 if (server.appendonly) {
1735 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1736 if (server.appendfd == -1) {
1737 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1738 strerror(errno));
1739 exit(1);
1740 }
1741 }
1742
1743 if (server.vm_enabled) vmInit();
1744 }
1745
1746 /* Empty the whole database */
1747 static long long emptyDb() {
1748 int j;
1749 long long removed = 0;
1750
1751 for (j = 0; j < server.dbnum; j++) {
1752 removed += dictSize(server.db[j].dict);
1753 dictEmpty(server.db[j].dict);
1754 dictEmpty(server.db[j].expires);
1755 }
1756 return removed;
1757 }
1758
/* Convert a case-insensitive "yes"/"no" string into 1/0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1764
1765 /* I agree, this is a very rudimental way to load a configuration...
1766 will improve later if the config gets more complex */
1767 static void loadServerConfig(char *filename) {
1768 FILE *fp;
1769 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1770 int linenum = 0;
1771 sds line = NULL;
1772
1773 if (filename[0] == '-' && filename[1] == '\0')
1774 fp = stdin;
1775 else {
1776 if ((fp = fopen(filename,"r")) == NULL) {
1777 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1778 exit(1);
1779 }
1780 }
1781
1782 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1783 sds *argv;
1784 int argc, j;
1785
1786 linenum++;
1787 line = sdsnew(buf);
1788 line = sdstrim(line," \t\r\n");
1789
1790 /* Skip comments and blank lines*/
1791 if (line[0] == '#' || line[0] == '\0') {
1792 sdsfree(line);
1793 continue;
1794 }
1795
1796 /* Split into arguments */
1797 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1798 sdstolower(argv[0]);
1799
1800 /* Execute config directives */
1801 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1802 server.maxidletime = atoi(argv[1]);
1803 if (server.maxidletime < 0) {
1804 err = "Invalid timeout value"; goto loaderr;
1805 }
1806 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1807 server.port = atoi(argv[1]);
1808 if (server.port < 1 || server.port > 65535) {
1809 err = "Invalid port"; goto loaderr;
1810 }
1811 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1812 server.bindaddr = zstrdup(argv[1]);
1813 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1814 int seconds = atoi(argv[1]);
1815 int changes = atoi(argv[2]);
1816 if (seconds < 1 || changes < 0) {
1817 err = "Invalid save parameters"; goto loaderr;
1818 }
1819 appendServerSaveParams(seconds,changes);
1820 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1821 if (chdir(argv[1]) == -1) {
1822 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1823 argv[1], strerror(errno));
1824 exit(1);
1825 }
1826 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1827 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1828 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1829 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1830 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1831 else {
1832 err = "Invalid log level. Must be one of debug, notice, warning";
1833 goto loaderr;
1834 }
1835 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1836 FILE *logfp;
1837
1838 server.logfile = zstrdup(argv[1]);
1839 if (!strcasecmp(server.logfile,"stdout")) {
1840 zfree(server.logfile);
1841 server.logfile = NULL;
1842 }
1843 if (server.logfile) {
1844 /* Test if we are able to open the file. The server will not
1845 * be able to abort just for this problem later... */
1846 logfp = fopen(server.logfile,"a");
1847 if (logfp == NULL) {
1848 err = sdscatprintf(sdsempty(),
1849 "Can't open the log file: %s", strerror(errno));
1850 goto loaderr;
1851 }
1852 fclose(logfp);
1853 }
1854 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1855 server.dbnum = atoi(argv[1]);
1856 if (server.dbnum < 1) {
1857 err = "Invalid number of databases"; goto loaderr;
1858 }
1859 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1860 loadServerConfig(argv[1]);
1861 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1862 server.maxclients = atoi(argv[1]);
1863 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1864 server.maxmemory = memtoll(argv[1],NULL);
1865 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1866 server.masterhost = sdsnew(argv[1]);
1867 server.masterport = atoi(argv[2]);
1868 server.replstate = REDIS_REPL_CONNECT;
1869 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1870 server.masterauth = zstrdup(argv[1]);
1871 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1872 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1873 err = "argument must be 'yes' or 'no'"; goto loaderr;
1874 }
1875 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1876 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1877 err = "argument must be 'yes' or 'no'"; goto loaderr;
1878 }
1879 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1880 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1881 err = "argument must be 'yes' or 'no'"; goto loaderr;
1882 }
1883 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1884 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1885 err = "argument must be 'yes' or 'no'"; goto loaderr;
1886 }
1887 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1888 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1889 err = "argument must be 'yes' or 'no'"; goto loaderr;
1890 }
1891 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1892 zfree(server.appendfilename);
1893 server.appendfilename = zstrdup(argv[1]);
1894 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1895 if (!strcasecmp(argv[1],"no")) {
1896 server.appendfsync = APPENDFSYNC_NO;
1897 } else if (!strcasecmp(argv[1],"always")) {
1898 server.appendfsync = APPENDFSYNC_ALWAYS;
1899 } else if (!strcasecmp(argv[1],"everysec")) {
1900 server.appendfsync = APPENDFSYNC_EVERYSEC;
1901 } else {
1902 err = "argument must be 'no', 'always' or 'everysec'";
1903 goto loaderr;
1904 }
1905 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1906 server.requirepass = zstrdup(argv[1]);
1907 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1908 zfree(server.pidfile);
1909 server.pidfile = zstrdup(argv[1]);
1910 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1911 zfree(server.dbfilename);
1912 server.dbfilename = zstrdup(argv[1]);
1913 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1914 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1915 err = "argument must be 'yes' or 'no'"; goto loaderr;
1916 }
1917 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1918 zfree(server.vm_swap_file);
1919 server.vm_swap_file = zstrdup(argv[1]);
1920 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1921 server.vm_max_memory = memtoll(argv[1],NULL);
1922 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1923 server.vm_page_size = memtoll(argv[1], NULL);
1924 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1925 server.vm_pages = memtoll(argv[1], NULL);
1926 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1927 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1928 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1929 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1930 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1931 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1932 } else {
1933 err = "Bad directive or wrong number of arguments"; goto loaderr;
1934 }
1935 for (j = 0; j < argc; j++)
1936 sdsfree(argv[j]);
1937 zfree(argv);
1938 sdsfree(line);
1939 }
1940 if (fp != stdin) fclose(fp);
1941 return;
1942
1943 loaderr:
1944 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1945 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1946 fprintf(stderr, ">>> '%s'\n", line);
1947 fprintf(stderr, "%s\n", err);
1948 exit(1);
1949 }
1950
1951 static void freeClientArgv(redisClient *c) {
1952 int j;
1953
1954 for (j = 0; j < c->argc; j++)
1955 decrRefCount(c->argv[j]);
1956 for (j = 0; j < c->mbargc; j++)
1957 decrRefCount(c->mbargv[j]);
1958 c->argc = 0;
1959 c->mbargc = 0;
1960 }
1961
1962 static void freeClient(redisClient *c) {
1963 listNode *ln;
1964
1965 /* Note that if the client we are freeing is blocked into a blocking
1966 * call, we have to set querybuf to NULL *before* to call
1967 * unblockClientWaitingData() to avoid processInputBuffer() will get
1968 * called. Also it is important to remove the file events after
1969 * this, because this call adds the READABLE event. */
1970 sdsfree(c->querybuf);
1971 c->querybuf = NULL;
1972 if (c->flags & REDIS_BLOCKED)
1973 unblockClientWaitingData(c);
1974
1975 /* Unsubscribe from all the pubsub channels */
1976 pubsubUnsubscribeAllChannels(c,0);
1977 pubsubUnsubscribeAllPatterns(c,0);
1978 dictRelease(c->pubsub_channels);
1979 listRelease(c->pubsub_patterns);
1980 /* Obvious cleanup */
1981 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1982 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1983 listRelease(c->reply);
1984 freeClientArgv(c);
1985 close(c->fd);
1986 /* Remove from the list of clients */
1987 ln = listSearchKey(server.clients,c);
1988 redisAssert(ln != NULL);
1989 listDelNode(server.clients,ln);
1990 /* Remove from the list of clients waiting for swapped keys */
1991 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1992 ln = listSearchKey(server.io_ready_clients,c);
1993 if (ln) {
1994 listDelNode(server.io_ready_clients,ln);
1995 server.vm_blocked_clients--;
1996 }
1997 }
1998 while (server.vm_enabled && listLength(c->io_keys)) {
1999 ln = listFirst(c->io_keys);
2000 dontWaitForSwappedKey(c,ln->value);
2001 }
2002 listRelease(c->io_keys);
2003 /* Master/slave cleanup */
2004 if (c->flags & REDIS_SLAVE) {
2005 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2006 close(c->repldbfd);
2007 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2008 ln = listSearchKey(l,c);
2009 redisAssert(ln != NULL);
2010 listDelNode(l,ln);
2011 }
2012 if (c->flags & REDIS_MASTER) {
2013 server.master = NULL;
2014 server.replstate = REDIS_REPL_CONNECT;
2015 }
2016 /* Release memory */
2017 zfree(c->argv);
2018 zfree(c->mbargv);
2019 freeClientMultiState(c);
2020 zfree(c);
2021 }
2022
2023 #define GLUEREPLY_UP_TO (1024)
2024 static void glueReplyBuffersIfNeeded(redisClient *c) {
2025 int copylen = 0;
2026 char buf[GLUEREPLY_UP_TO];
2027 listNode *ln;
2028 listIter li;
2029 robj *o;
2030
2031 listRewind(c->reply,&li);
2032 while((ln = listNext(&li))) {
2033 int objlen;
2034
2035 o = ln->value;
2036 objlen = sdslen(o->ptr);
2037 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2038 memcpy(buf+copylen,o->ptr,objlen);
2039 copylen += objlen;
2040 listDelNode(c->reply,ln);
2041 } else {
2042 if (copylen == 0) return;
2043 break;
2044 }
2045 }
2046 /* Now the output buffer is empty, add the new single element */
2047 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2048 listAddNodeHead(c->reply,o);
2049 }
2050
2051 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2052 redisClient *c = privdata;
2053 int nwritten = 0, totwritten = 0, objlen;
2054 robj *o;
2055 REDIS_NOTUSED(el);
2056 REDIS_NOTUSED(mask);
2057
2058 /* Use writev() if we have enough buffers to send */
2059 if (!server.glueoutputbuf &&
2060 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2061 !(c->flags & REDIS_MASTER))
2062 {
2063 sendReplyToClientWritev(el, fd, privdata, mask);
2064 return;
2065 }
2066
2067 while(listLength(c->reply)) {
2068 if (server.glueoutputbuf && listLength(c->reply) > 1)
2069 glueReplyBuffersIfNeeded(c);
2070
2071 o = listNodeValue(listFirst(c->reply));
2072 objlen = sdslen(o->ptr);
2073
2074 if (objlen == 0) {
2075 listDelNode(c->reply,listFirst(c->reply));
2076 continue;
2077 }
2078
2079 if (c->flags & REDIS_MASTER) {
2080 /* Don't reply to a master */
2081 nwritten = objlen - c->sentlen;
2082 } else {
2083 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2084 if (nwritten <= 0) break;
2085 }
2086 c->sentlen += nwritten;
2087 totwritten += nwritten;
2088 /* If we fully sent the object on head go to the next one */
2089 if (c->sentlen == objlen) {
2090 listDelNode(c->reply,listFirst(c->reply));
2091 c->sentlen = 0;
2092 }
2093 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2094 * bytes, in a single threaded server it's a good idea to serve
2095 * other clients as well, even if a very large request comes from
2096 * super fast link that is always able to accept data (in real world
2097 * scenario think about 'KEYS *' against the loopback interfae) */
2098 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2099 }
2100 if (nwritten == -1) {
2101 if (errno == EAGAIN) {
2102 nwritten = 0;
2103 } else {
2104 redisLog(REDIS_VERBOSE,
2105 "Error writing to client: %s", strerror(errno));
2106 freeClient(c);
2107 return;
2108 }
2109 }
2110 if (totwritten > 0) c->lastinteraction = time(NULL);
2111 if (listLength(c->reply) == 0) {
2112 c->sentlen = 0;
2113 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2114 }
2115 }
2116
2117 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2118 {
2119 redisClient *c = privdata;
2120 int nwritten = 0, totwritten = 0, objlen, willwrite;
2121 robj *o;
2122 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2123 int offset, ion = 0;
2124 REDIS_NOTUSED(el);
2125 REDIS_NOTUSED(mask);
2126
2127 listNode *node;
2128 while (listLength(c->reply)) {
2129 offset = c->sentlen;
2130 ion = 0;
2131 willwrite = 0;
2132
2133 /* fill-in the iov[] array */
2134 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2135 o = listNodeValue(node);
2136 objlen = sdslen(o->ptr);
2137
2138 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2139 break;
2140
2141 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2142 break; /* no more iovecs */
2143
2144 iov[ion].iov_base = ((char*)o->ptr) + offset;
2145 iov[ion].iov_len = objlen - offset;
2146 willwrite += objlen - offset;
2147 offset = 0; /* just for the first item */
2148 ion++;
2149 }
2150
2151 if(willwrite == 0)
2152 break;
2153
2154 /* write all collected blocks at once */
2155 if((nwritten = writev(fd, iov, ion)) < 0) {
2156 if (errno != EAGAIN) {
2157 redisLog(REDIS_VERBOSE,
2158 "Error writing to client: %s", strerror(errno));
2159 freeClient(c);
2160 return;
2161 }
2162 break;
2163 }
2164
2165 totwritten += nwritten;
2166 offset = c->sentlen;
2167
2168 /* remove written robjs from c->reply */
2169 while (nwritten && listLength(c->reply)) {
2170 o = listNodeValue(listFirst(c->reply));
2171 objlen = sdslen(o->ptr);
2172
2173 if(nwritten >= objlen - offset) {
2174 listDelNode(c->reply, listFirst(c->reply));
2175 nwritten -= objlen - offset;
2176 c->sentlen = 0;
2177 } else {
2178 /* partial write */
2179 c->sentlen += nwritten;
2180 break;
2181 }
2182 offset = 0;
2183 }
2184 }
2185
2186 if (totwritten > 0)
2187 c->lastinteraction = time(NULL);
2188
2189 if (listLength(c->reply) == 0) {
2190 c->sentlen = 0;
2191 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2192 }
2193 }
2194
2195 static struct redisCommand *lookupCommand(char *name) {
2196 int j = 0;
2197 while(cmdTable[j].name != NULL) {
2198 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2199 j++;
2200 }
2201 return NULL;
2202 }
2203
2204 /* resetClient prepare the client to process the next command */
2205 static void resetClient(redisClient *c) {
2206 freeClientArgv(c);
2207 c->bulklen = -1;
2208 c->multibulk = 0;
2209 }
2210
2211 /* Call() is the core of Redis execution of a command */
2212 static void call(redisClient *c, struct redisCommand *cmd) {
2213 long long dirty;
2214
2215 dirty = server.dirty;
2216 cmd->proc(c);
2217 dirty = server.dirty-dirty;
2218
2219 if (server.appendonly && dirty)
2220 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2221 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2222 listLength(server.slaves))
2223 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2224 if (listLength(server.monitors))
2225 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2226 server.stat_numcommands++;
2227 }
2228
2229 /* If this function gets called we already read a whole
2230 * command, argments are in the client argv/argc fields.
2231 * processCommand() execute the command or prepare the
2232 * server for a bulk read from the client.
2233 *
2234 * If 1 is returned the client is still alive and valid and
2235 * and other operations can be performed by the caller. Otherwise
2236 * if 0 is returned the client was destroied (i.e. after QUIT). */
2237 static int processCommand(redisClient *c) {
2238 struct redisCommand *cmd;
2239
2240 /* Free some memory if needed (maxmemory setting) */
2241 if (server.maxmemory) freeMemoryIfNeeded();
2242
2243 /* Handle the multi bulk command type. This is an alternative protocol
2244 * supported by Redis in order to receive commands that are composed of
2245 * multiple binary-safe "bulk" arguments. The latency of processing is
2246 * a bit higher but this allows things like multi-sets, so if this
2247 * protocol is used only for MSET and similar commands this is a big win. */
2248 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2249 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2250 if (c->multibulk <= 0) {
2251 resetClient(c);
2252 return 1;
2253 } else {
2254 decrRefCount(c->argv[c->argc-1]);
2255 c->argc--;
2256 return 1;
2257 }
2258 } else if (c->multibulk) {
2259 if (c->bulklen == -1) {
2260 if (((char*)c->argv[0]->ptr)[0] != '$') {
2261 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2262 resetClient(c);
2263 return 1;
2264 } else {
2265 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2266 decrRefCount(c->argv[0]);
2267 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2268 c->argc--;
2269 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2270 resetClient(c);
2271 return 1;
2272 }
2273 c->argc--;
2274 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2275 return 1;
2276 }
2277 } else {
2278 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2279 c->mbargv[c->mbargc] = c->argv[0];
2280 c->mbargc++;
2281 c->argc--;
2282 c->multibulk--;
2283 if (c->multibulk == 0) {
2284 robj **auxargv;
2285 int auxargc;
2286
2287 /* Here we need to swap the multi-bulk argc/argv with the
2288 * normal argc/argv of the client structure. */
2289 auxargv = c->argv;
2290 c->argv = c->mbargv;
2291 c->mbargv = auxargv;
2292
2293 auxargc = c->argc;
2294 c->argc = c->mbargc;
2295 c->mbargc = auxargc;
2296
2297 /* We need to set bulklen to something different than -1
2298 * in order for the code below to process the command without
2299 * to try to read the last argument of a bulk command as
2300 * a special argument. */
2301 c->bulklen = 0;
2302 /* continue below and process the command */
2303 } else {
2304 c->bulklen = -1;
2305 return 1;
2306 }
2307 }
2308 }
2309 /* -- end of multi bulk commands processing -- */
2310
2311 /* The QUIT command is handled as a special case. Normal command
2312 * procs are unable to close the client connection safely */
2313 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2314 freeClient(c);
2315 return 0;
2316 }
2317
2318 /* Now lookup the command and check ASAP about trivial error conditions
2319 * such wrong arity, bad command name and so forth. */
2320 cmd = lookupCommand(c->argv[0]->ptr);
2321 if (!cmd) {
2322 addReplySds(c,
2323 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2324 (char*)c->argv[0]->ptr));
2325 resetClient(c);
2326 return 1;
2327 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2328 (c->argc < -cmd->arity)) {
2329 addReplySds(c,
2330 sdscatprintf(sdsempty(),
2331 "-ERR wrong number of arguments for '%s' command\r\n",
2332 cmd->name));
2333 resetClient(c);
2334 return 1;
2335 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2336 /* This is a bulk command, we have to read the last argument yet. */
2337 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2338
2339 decrRefCount(c->argv[c->argc-1]);
2340 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2341 c->argc--;
2342 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2343 resetClient(c);
2344 return 1;
2345 }
2346 c->argc--;
2347 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2348 /* It is possible that the bulk read is already in the
2349 * buffer. Check this condition and handle it accordingly.
2350 * This is just a fast path, alternative to call processInputBuffer().
2351 * It's a good idea since the code is small and this condition
2352 * happens most of the times. */
2353 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2354 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2355 c->argc++;
2356 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2357 } else {
2358 /* Otherwise return... there is to read the last argument
2359 * from the socket. */
2360 return 1;
2361 }
2362 }
2363 /* Let's try to encode the bulk object to save space. */
2364 if (cmd->flags & REDIS_CMD_BULK)
2365 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2366
2367 /* Check if the user is authenticated */
2368 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2369 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2370 resetClient(c);
2371 return 1;
2372 }
2373
2374 /* Handle the maxmemory directive */
2375 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2376 zmalloc_used_memory() > server.maxmemory)
2377 {
2378 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2379 resetClient(c);
2380 return 1;
2381 }
2382
2383 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2384 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2385 &&
2386 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2387 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2388 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2389 resetClient(c);
2390 return 1;
2391 }
2392
2393 /* Exec the command */
2394 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2395 queueMultiCommand(c,cmd);
2396 addReply(c,shared.queued);
2397 } else {
2398 if (server.vm_enabled && server.vm_max_threads > 0 &&
2399 blockClientOnSwappedKeys(cmd,c)) return 1;
2400 call(c,cmd);
2401 }
2402
2403 /* Prepare the client for the next command */
2404 resetClient(c);
2405 return 1;
2406 }
2407
2408 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2409 listNode *ln;
2410 listIter li;
2411 int outc = 0, j;
2412 robj **outv;
2413 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2414 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2415 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2416 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2417 robj *lenobj;
2418
2419 if (argc <= REDIS_STATIC_ARGS) {
2420 outv = static_outv;
2421 } else {
2422 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2423 }
2424
2425 lenobj = createObject(REDIS_STRING,
2426 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2427 lenobj->refcount = 0;
2428 outv[outc++] = lenobj;
2429 for (j = 0; j < argc; j++) {
2430 lenobj = createObject(REDIS_STRING,
2431 sdscatprintf(sdsempty(),"$%lu\r\n",
2432 (unsigned long) stringObjectLen(argv[j])));
2433 lenobj->refcount = 0;
2434 outv[outc++] = lenobj;
2435 outv[outc++] = argv[j];
2436 outv[outc++] = shared.crlf;
2437 }
2438
2439 /* Increment all the refcounts at start and decrement at end in order to
2440 * be sure to free objects if there is no slave in a replication state
2441 * able to be feed with commands */
2442 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2443 listRewind(slaves,&li);
2444 while((ln = listNext(&li))) {
2445 redisClient *slave = ln->value;
2446
2447 /* Don't feed slaves that are still waiting for BGSAVE to start */
2448 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2449
2450 /* Feed all the other slaves, MONITORs and so on */
2451 if (slave->slaveseldb != dictid) {
2452 robj *selectcmd;
2453
2454 switch(dictid) {
2455 case 0: selectcmd = shared.select0; break;
2456 case 1: selectcmd = shared.select1; break;
2457 case 2: selectcmd = shared.select2; break;
2458 case 3: selectcmd = shared.select3; break;
2459 case 4: selectcmd = shared.select4; break;
2460 case 5: selectcmd = shared.select5; break;
2461 case 6: selectcmd = shared.select6; break;
2462 case 7: selectcmd = shared.select7; break;
2463 case 8: selectcmd = shared.select8; break;
2464 case 9: selectcmd = shared.select9; break;
2465 default:
2466 selectcmd = createObject(REDIS_STRING,
2467 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2468 selectcmd->refcount = 0;
2469 break;
2470 }
2471 addReply(slave,selectcmd);
2472 slave->slaveseldb = dictid;
2473 }
2474 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2475 }
2476 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2477 if (outv != static_outv) zfree(outv);
2478 }
2479
2480 static sds sdscatrepr(sds s, char *p, size_t len) {
2481 s = sdscatlen(s,"\"",1);
2482 while(len--) {
2483 switch(*p) {
2484 case '\\':
2485 case '"':
2486 s = sdscatprintf(s,"\\%c",*p);
2487 break;
2488 case '\n': s = sdscatlen(s,"\\n",1); break;
2489 case '\r': s = sdscatlen(s,"\\r",1); break;
2490 case '\t': s = sdscatlen(s,"\\t",1); break;
2491 case '\a': s = sdscatlen(s,"\\a",1); break;
2492 case '\b': s = sdscatlen(s,"\\b",1); break;
2493 default:
2494 if (isprint(*p))
2495 s = sdscatprintf(s,"%c",*p);
2496 else
2497 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2498 break;
2499 }
2500 p++;
2501 }
2502 return sdscatlen(s,"\"",1);
2503 }
2504
2505 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2506 listNode *ln;
2507 listIter li;
2508 int j;
2509 sds cmdrepr = sdsnew("+");
2510 robj *cmdobj;
2511 struct timeval tv;
2512
2513 gettimeofday(&tv,NULL);
2514 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2515 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2516
2517 for (j = 0; j < argc; j++) {
2518 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2519 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2520 } else {
2521 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2522 sdslen(argv[j]->ptr));
2523 }
2524 if (j != argc-1)
2525 cmdrepr = sdscatlen(cmdrepr," ",1);
2526 }
2527 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2528 cmdobj = createObject(REDIS_STRING,cmdrepr);
2529
2530 listRewind(monitors,&li);
2531 while((ln = listNext(&li))) {
2532 redisClient *monitor = ln->value;
2533 addReply(monitor,cmdobj);
2534 }
2535 decrRefCount(cmdobj);
2536 }
2537
2538 static void processInputBuffer(redisClient *c) {
2539 again:
2540 /* Before to process the input buffer, make sure the client is not
2541 * waitig for a blocking operation such as BLPOP. Note that the first
2542 * iteration the client is never blocked, otherwise the processInputBuffer
2543 * would not be called at all, but after the execution of the first commands
2544 * in the input buffer the client may be blocked, and the "goto again"
2545 * will try to reiterate. The following line will make it return asap. */
2546 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2547 if (c->bulklen == -1) {
2548 /* Read the first line of the query */
2549 char *p = strchr(c->querybuf,'\n');
2550 size_t querylen;
2551
2552 if (p) {
2553 sds query, *argv;
2554 int argc, j;
2555
2556 query = c->querybuf;
2557 c->querybuf = sdsempty();
2558 querylen = 1+(p-(query));
2559 if (sdslen(query) > querylen) {
2560 /* leave data after the first line of the query in the buffer */
2561 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2562 }
2563 *p = '\0'; /* remove "\n" */
2564 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2565 sdsupdatelen(query);
2566
2567 /* Now we can split the query in arguments */
2568 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2569 sdsfree(query);
2570
2571 if (c->argv) zfree(c->argv);
2572 c->argv = zmalloc(sizeof(robj*)*argc);
2573
2574 for (j = 0; j < argc; j++) {
2575 if (sdslen(argv[j])) {
2576 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2577 c->argc++;
2578 } else {
2579 sdsfree(argv[j]);
2580 }
2581 }
2582 zfree(argv);
2583 if (c->argc) {
2584 /* Execute the command. If the client is still valid
2585 * after processCommand() return and there is something
2586 * on the query buffer try to process the next command. */
2587 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2588 } else {
2589 /* Nothing to process, argc == 0. Just process the query
2590 * buffer if it's not empty or return to the caller */
2591 if (sdslen(c->querybuf)) goto again;
2592 }
2593 return;
2594 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2595 redisLog(REDIS_VERBOSE, "Client protocol error");
2596 freeClient(c);
2597 return;
2598 }
2599 } else {
2600 /* Bulk read handling. Note that if we are at this point
2601 the client already sent a command terminated with a newline,
2602 we are reading the bulk data that is actually the last
2603 argument of the command. */
2604 int qbl = sdslen(c->querybuf);
2605
2606 if (c->bulklen <= qbl) {
2607 /* Copy everything but the final CRLF as final argument */
2608 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2609 c->argc++;
2610 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2611 /* Process the command. If the client is still valid after
2612 * the processing and there is more data in the buffer
2613 * try to parse it. */
2614 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2615 return;
2616 }
2617 }
2618 }
2619
2620 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2621 redisClient *c = (redisClient*) privdata;
2622 char buf[REDIS_IOBUF_LEN];
2623 int nread;
2624 REDIS_NOTUSED(el);
2625 REDIS_NOTUSED(mask);
2626
2627 nread = read(fd, buf, REDIS_IOBUF_LEN);
2628 if (nread == -1) {
2629 if (errno == EAGAIN) {
2630 nread = 0;
2631 } else {
2632 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2633 freeClient(c);
2634 return;
2635 }
2636 } else if (nread == 0) {
2637 redisLog(REDIS_VERBOSE, "Client closed connection");
2638 freeClient(c);
2639 return;
2640 }
2641 if (nread) {
2642 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2643 c->lastinteraction = time(NULL);
2644 } else {
2645 return;
2646 }
2647 processInputBuffer(c);
2648 }
2649
2650 static int selectDb(redisClient *c, int id) {
2651 if (id < 0 || id >= server.dbnum)
2652 return REDIS_ERR;
2653 c->db = &server.db[id];
2654 return REDIS_OK;
2655 }
2656
2657 static void *dupClientReplyValue(void *o) {
2658 incrRefCount((robj*)o);
2659 return o;
2660 }
2661
/* Match method for lists of string objects: returns 1 if the two objects
 * compare as equal according to compareStringObjects(), 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0 ? 1 : 0;
}
2665
2666 static redisClient *createClient(int fd) {
2667 redisClient *c = zmalloc(sizeof(*c));
2668
2669 anetNonBlock(NULL,fd);
2670 anetTcpNoDelay(NULL,fd);
2671 if (!c) return NULL;
2672 selectDb(c,0);
2673 c->fd = fd;
2674 c->querybuf = sdsempty();
2675 c->argc = 0;
2676 c->argv = NULL;
2677 c->bulklen = -1;
2678 c->multibulk = 0;
2679 c->mbargc = 0;
2680 c->mbargv = NULL;
2681 c->sentlen = 0;
2682 c->flags = 0;
2683 c->lastinteraction = time(NULL);
2684 c->authenticated = 0;
2685 c->replstate = REDIS_REPL_NONE;
2686 c->reply = listCreate();
2687 listSetFreeMethod(c->reply,decrRefCount);
2688 listSetDupMethod(c->reply,dupClientReplyValue);
2689 c->blockingkeys = NULL;
2690 c->blockingkeysnum = 0;
2691 c->io_keys = listCreate();
2692 listSetFreeMethod(c->io_keys,decrRefCount);
2693 c->pubsub_channels = dictCreate(&setDictType,NULL);
2694 c->pubsub_patterns = listCreate();
2695 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2696 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2697 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2698 readQueryFromClient, c) == AE_ERR) {
2699 freeClient(c);
2700 return NULL;
2701 }
2702 listAddNodeTail(server.clients,c);
2703 initClientMultiState(c);
2704 return c;
2705 }
2706
2707 static void addReply(redisClient *c, robj *obj) {
2708 if (listLength(c->reply) == 0 &&
2709 (c->replstate == REDIS_REPL_NONE ||
2710 c->replstate == REDIS_REPL_ONLINE) &&
2711 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2712 sendReplyToClient, c) == AE_ERR) return;
2713
2714 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2715 obj = dupStringObject(obj);
2716 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2717 }
2718 listAddNodeTail(c->reply,getDecodedObject(obj));
2719 }
2720
2721 static void addReplySds(redisClient *c, sds s) {
2722 robj *o = createObject(REDIS_STRING,s);
2723 addReply(c,o);
2724 decrRefCount(o);
2725 }
2726
2727 static void addReplyDouble(redisClient *c, double d) {
2728 char buf[128];
2729
2730 snprintf(buf,sizeof(buf),"%.17g",d);
2731 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2732 (unsigned long) strlen(buf),buf));
2733 }
2734
2735 static void addReplyLong(redisClient *c, long l) {
2736 char buf[128];
2737 size_t len;
2738
2739 if (l == 0) {
2740 addReply(c,shared.czero);
2741 return;
2742 } else if (l == 1) {
2743 addReply(c,shared.cone);
2744 return;
2745 }
2746 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2747 addReplySds(c,sdsnewlen(buf,len));
2748 }
2749
2750 static void addReplyLongLong(redisClient *c, long long ll) {
2751 char buf[128];
2752 size_t len;
2753
2754 if (ll == 0) {
2755 addReply(c,shared.czero);
2756 return;
2757 } else if (ll == 1) {
2758 addReply(c,shared.cone);
2759 return;
2760 }
2761 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2762 addReplySds(c,sdsnewlen(buf,len));
2763 }
2764
2765 static void addReplyUlong(redisClient *c, unsigned long ul) {
2766 char buf[128];
2767 size_t len;
2768
2769 if (ul == 0) {
2770 addReply(c,shared.czero);
2771 return;
2772 } else if (ul == 1) {
2773 addReply(c,shared.cone);
2774 return;
2775 }
2776 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2777 addReplySds(c,sdsnewlen(buf,len));
2778 }
2779
2780 static void addReplyBulkLen(redisClient *c, robj *obj) {
2781 size_t len;
2782
2783 if (obj->encoding == REDIS_ENCODING_RAW) {
2784 len = sdslen(obj->ptr);
2785 } else {
2786 long n = (long)obj->ptr;
2787
2788 /* Compute how many bytes will take this integer as a radix 10 string */
2789 len = 1;
2790 if (n < 0) {
2791 len++;
2792 n = -n;
2793 }
2794 while((n = n/10) != 0) {
2795 len++;
2796 }
2797 }
2798 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2799 }
2800
2801 static void addReplyBulk(redisClient *c, robj *obj) {
2802 addReplyBulkLen(c,obj);
2803 addReply(c,obj);
2804 addReply(c,shared.crlf);
2805 }
2806
2807 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2808 static void addReplyBulkCString(redisClient *c, char *s) {
2809 if (s == NULL) {
2810 addReply(c,shared.nullbulk);
2811 } else {
2812 robj *o = createStringObject(s,strlen(s));
2813 addReplyBulk(c,o);
2814 decrRefCount(o);
2815 }
2816 }
2817
2818 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2819 int cport, cfd;
2820 char cip[128];
2821 redisClient *c;
2822 REDIS_NOTUSED(el);
2823 REDIS_NOTUSED(mask);
2824 REDIS_NOTUSED(privdata);
2825
2826 cfd = anetAccept(server.neterr, fd, cip, &cport);
2827 if (cfd == AE_ERR) {
2828 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2829 return;
2830 }
2831 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2832 if ((c = createClient(cfd)) == NULL) {
2833 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2834 close(cfd); /* May be already closed, just ingore errors */
2835 return;
2836 }
2837 /* If maxclient directive is set and this is one client more... close the
2838 * connection. Note that we create the client instead to check before
2839 * for this condition, since now the socket is already set in nonblocking
2840 * mode and we can send an error for free using the Kernel I/O */
2841 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2842 char *err = "-ERR max number of clients reached\r\n";
2843
2844 /* That's a best effort error message, don't check write errors */
2845 if (write(c->fd,err,strlen(err)) == -1) {
2846 /* Nothing to do, Just to avoid the warning... */
2847 }
2848 freeClient(c);
2849 return;
2850 }
2851 server.stat_numconnections++;
2852 }
2853
2854 /* ======================= Redis objects implementation ===================== */
2855
2856 static robj *createObject(int type, void *ptr) {
2857 robj *o;
2858
2859 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2860 if (listLength(server.objfreelist)) {
2861 listNode *head = listFirst(server.objfreelist);
2862 o = listNodeValue(head);
2863 listDelNode(server.objfreelist,head);
2864 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2865 } else {
2866 if (server.vm_enabled) {
2867 pthread_mutex_unlock(&server.obj_freelist_mutex);
2868 o = zmalloc(sizeof(*o));
2869 } else {
2870 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2871 }
2872 }
2873 o->type = type;
2874 o->encoding = REDIS_ENCODING_RAW;
2875 o->ptr = ptr;
2876 o->refcount = 1;
2877 if (server.vm_enabled) {
2878 /* Note that this code may run in the context of an I/O thread
2879 * and accessing to server.unixtime in theory is an error
2880 * (no locks). But in practice this is safe, and even if we read
2881 * garbage Redis will not fail, as it's just a statistical info */
2882 o->vm.atime = server.unixtime;
2883 o->storage = REDIS_VM_MEMORY;
2884 }
2885 return o;
2886 }
2887
2888 static robj *createStringObject(char *ptr, size_t len) {
2889 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2890 }
2891
2892 static robj *createStringObjectFromLongLong(long long value) {
2893 robj *o;
2894 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2895 incrRefCount(shared.integers[value]);
2896 o = shared.integers[value];
2897 } else {
2898 o = createObject(REDIS_STRING, NULL);
2899 if (value >= LONG_MIN && value <= LONG_MAX) {
2900 o->encoding = REDIS_ENCODING_INT;
2901 o->ptr = (void*)((long)value);
2902 } else {
2903 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2904 }
2905 }
2906 return o;
2907 }
2908
2909 static robj *dupStringObject(robj *o) {
2910 assert(o->encoding == REDIS_ENCODING_RAW);
2911 return createStringObject(o->ptr,sdslen(o->ptr));
2912 }
2913
2914 static robj *createListObject(void) {
2915 list *l = listCreate();
2916
2917 listSetFreeMethod(l,decrRefCount);
2918 return createObject(REDIS_LIST,l);
2919 }
2920
2921 static robj *createSetObject(void) {
2922 dict *d = dictCreate(&setDictType,NULL);
2923 return createObject(REDIS_SET,d);
2924 }
2925
2926 static robj *createHashObject(void) {
2927 /* All the Hashes start as zipmaps. Will be automatically converted
2928 * into hash tables if there are enough elements or big elements
2929 * inside. */
2930 unsigned char *zm = zipmapNew();
2931 robj *o = createObject(REDIS_HASH,zm);
2932 o->encoding = REDIS_ENCODING_ZIPMAP;
2933 return o;
2934 }
2935
2936 static robj *createZsetObject(void) {
2937 zset *zs = zmalloc(sizeof(*zs));
2938
2939 zs->dict = dictCreate(&zsetDictType,NULL);
2940 zs->zsl = zslCreate();
2941 return createObject(REDIS_ZSET,zs);
2942 }
2943
2944 static void freeStringObject(robj *o) {
2945 if (o->encoding == REDIS_ENCODING_RAW) {
2946 sdsfree(o->ptr);
2947 }
2948 }
2949
2950 static void freeListObject(robj *o) {
2951 listRelease((list*) o->ptr);
2952 }
2953
2954 static void freeSetObject(robj *o) {
2955 dictRelease((dict*) o->ptr);
2956 }
2957
2958 static void freeZsetObject(robj *o) {
2959 zset *zs = o->ptr;
2960
2961 dictRelease(zs->dict);
2962 zslFree(zs->zsl);
2963 zfree(zs);
2964 }
2965
2966 static void freeHashObject(robj *o) {
2967 switch (o->encoding) {
2968 case REDIS_ENCODING_HT:
2969 dictRelease((dict*) o->ptr);
2970 break;
2971 case REDIS_ENCODING_ZIPMAP:
2972 zfree(o->ptr);
2973 break;
2974 default:
2975 redisPanic("Unknown hash encoding type");
2976 break;
2977 }
2978 }
2979
2980 static void incrRefCount(robj *o) {
2981 o->refcount++;
2982 }
2983
2984 static void decrRefCount(void *obj) {
2985 robj *o = obj;
2986
2987 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2988 /* Object is a key of a swapped out value, or in the process of being
2989 * loaded. */
2990 if (server.vm_enabled &&
2991 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2992 {
2993 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2994 redisAssert(o->type == REDIS_STRING);
2995 freeStringObject(o);
2996 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2997 pthread_mutex_lock(&server.obj_freelist_mutex);
2998 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2999 !listAddNodeHead(server.objfreelist,o))
3000 zfree(o);
3001 pthread_mutex_unlock(&server.obj_freelist_mutex);
3002 server.vm_stats_swapped_objects--;
3003 return;
3004 }
3005 /* Object is in memory, or in the process of being swapped out. */
3006 if (--(o->refcount) == 0) {
3007 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3008 vmCancelThreadedIOJob(obj);
3009 switch(o->type) {
3010 case REDIS_STRING: freeStringObject(o); break;
3011 case REDIS_LIST: freeListObject(o); break;
3012 case REDIS_SET: freeSetObject(o); break;
3013 case REDIS_ZSET: freeZsetObject(o); break;
3014 case REDIS_HASH: freeHashObject(o); break;
3015 default: redisPanic("Unknown object type"); break;
3016 }
3017 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3018 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3019 !listAddNodeHead(server.objfreelist,o))
3020 zfree(o);
3021 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3022 }
3023 }
3024
3025 static robj *lookupKey(redisDb *db, robj *key) {
3026 dictEntry *de = dictFind(db->dict,key);
3027 if (de) {
3028 robj *key = dictGetEntryKey(de);
3029 robj *val = dictGetEntryVal(de);
3030
3031 if (server.vm_enabled) {
3032 if (key->storage == REDIS_VM_MEMORY ||
3033 key->storage == REDIS_VM_SWAPPING)
3034 {
3035 /* If we were swapping the object out, stop it, this key
3036 * was requested. */
3037 if (key->storage == REDIS_VM_SWAPPING)
3038 vmCancelThreadedIOJob(key);
3039 /* Update the access time of the key for the aging algorithm. */
3040 key->vm.atime = server.unixtime;
3041 } else {
3042 int notify = (key->storage == REDIS_VM_LOADING);
3043
3044 /* Our value was swapped on disk. Bring it at home. */
3045 redisAssert(val == NULL);
3046 val = vmLoadObject(key);
3047 dictGetEntryVal(de) = val;
3048
3049 /* Clients blocked by the VM subsystem may be waiting for
3050 * this key... */
3051 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3052 }
3053 }
3054 return val;
3055 } else {
3056 return NULL;
3057 }
3058 }
3059
3060 static robj *lookupKeyRead(redisDb *db, robj *key) {
3061 expireIfNeeded(db,key);
3062 return lookupKey(db,key);
3063 }
3064
3065 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3066 deleteIfVolatile(db,key);
3067 return lookupKey(db,key);
3068 }
3069
3070 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3071 robj *o = lookupKeyRead(c->db, key);
3072 if (!o) addReply(c,reply);
3073 return o;
3074 }
3075
3076 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3077 robj *o = lookupKeyWrite(c->db, key);
3078 if (!o) addReply(c,reply);
3079 return o;
3080 }
3081
3082 static int checkType(redisClient *c, robj *o, int type) {
3083 if (o->type != type) {
3084 addReply(c,shared.wrongtypeerr);
3085 return 1;
3086 }
3087 return 0;
3088 }
3089
3090 static int deleteKey(redisDb *db, robj *key) {
3091 int retval;
3092
3093 /* We need to protect key from destruction: after the first dictDelete()
3094 * it may happen that 'key' is no longer valid if we don't increment
3095 * it's count. This may happen when we get the object reference directly
3096 * from the hash table with dictRandomKey() or dict iterators */
3097 incrRefCount(key);
3098 if (dictSize(db->expires)) dictDelete(db->expires,key);
3099 retval = dictDelete(db->dict,key);
3100 decrRefCount(key);
3101
3102 return retval == DICT_OK;
3103 }
3104
3105 /* Check if the nul-terminated string 's' can be represented by a long
3106 * (that is, is a number that fits into long without any other space or
3107 * character before or after the digits).
3108 *
3109 * If so, the function returns REDIS_OK and *longval is set to the value
3110 * of the number. Otherwise REDIS_ERR is returned */
3111 static int isStringRepresentableAsLong(sds s, long *longval) {
3112 char buf[32], *endptr;
3113 long value;
3114 int slen;
3115
3116 value = strtol(s, &endptr, 10);
3117 if (endptr[0] != '\0') return REDIS_ERR;
3118 slen = snprintf(buf,32,"%ld",value);
3119
3120 /* If the number converted back into a string is not identical
3121 * then it's not possible to encode the string as integer */
3122 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3123 if (longval) *longval = value;
3124 return REDIS_OK;
3125 }
3126
3127 /* Try to encode a string object in order to save space */
3128 static robj *tryObjectEncoding(robj *o) {
3129 long value;
3130 sds s = o->ptr;
3131
3132 if (o->encoding != REDIS_ENCODING_RAW)
3133 return o; /* Already encoded */
3134
3135 /* It's not safe to encode shared objects: shared objects can be shared
3136 * everywhere in the "object space" of Redis. Encoded objects can only
3137 * appear as "values" (and not, for instance, as keys) */
3138 if (o->refcount > 1) return o;
3139
3140 /* Currently we try to encode only strings */
3141 redisAssert(o->type == REDIS_STRING);
3142
3143 /* Check if we can represent this string as a long integer */
3144 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3145
3146 /* Ok, this object can be encoded */
3147 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3148 decrRefCount(o);
3149 incrRefCount(shared.integers[value]);
3150 return shared.integers[value];
3151 } else {
3152 o->encoding = REDIS_ENCODING_INT;
3153 sdsfree(o->ptr);
3154 o->ptr = (void*) value;
3155 return o;
3156 }
3157 }
3158
3159 /* Get a decoded version of an encoded object (returned as a new object).
3160 * If the object is already raw-encoded just increment the ref count. */
3161 static robj *getDecodedObject(robj *o) {
3162 robj *dec;
3163
3164 if (o->encoding == REDIS_ENCODING_RAW) {
3165 incrRefCount(o);
3166 return o;
3167 }
3168 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3169 char buf[32];
3170
3171 snprintf(buf,32,"%ld",(long)o->ptr);
3172 dec = createStringObject(buf,strlen(buf));
3173 return dec;
3174 } else {
3175 redisPanic("Unknown encoding type");
3176 }
3177 }
3178
3179 /* Compare two string objects via strcmp() or alike.
3180 * Note that the objects may be integer-encoded. In such a case we
3181 * use snprintf() to get a string representation of the numbers on the stack
3182 * and compare the strings, it's much faster than calling getDecodedObject().
3183 *
3184 * Important note: if objects are not integer encoded, but binary-safe strings,
3185 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3186 * binary safe. */
3187 static int compareStringObjects(robj *a, robj *b) {
3188 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3189 char bufa[128], bufb[128], *astr, *bstr;
3190 int bothsds = 1;
3191
3192 if (a == b) return 0;
3193 if (a->encoding != REDIS_ENCODING_RAW) {
3194 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3195 astr = bufa;
3196 bothsds = 0;
3197 } else {
3198 astr = a->ptr;
3199 }
3200 if (b->encoding != REDIS_ENCODING_RAW) {
3201 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3202 bstr = bufb;
3203 bothsds = 0;
3204 } else {
3205 bstr = b->ptr;
3206 }
3207 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3208 }
3209
3210 static size_t stringObjectLen(robj *o) {
3211 redisAssert(o->type == REDIS_STRING);
3212 if (o->encoding == REDIS_ENCODING_RAW) {
3213 return sdslen(o->ptr);
3214 } else {
3215 char buf[32];
3216
3217 return snprintf(buf,32,"%ld",(long)o->ptr);
3218 }
3219 }
3220
3221 static int getDoubleFromObject(robj *o, double *target) {
3222 double value;
3223 char *eptr;
3224
3225 if (o == NULL) {
3226 value = 0;
3227 } else {
3228 redisAssert(o->type == REDIS_STRING);
3229 if (o->encoding == REDIS_ENCODING_RAW) {
3230 value = strtod(o->ptr, &eptr);
3231 if (eptr[0] != '\0') return REDIS_ERR;
3232 } else if (o->encoding == REDIS_ENCODING_INT) {
3233 value = (long)o->ptr;
3234 } else {
3235 redisPanic("Unknown string encoding");
3236 }
3237 }
3238
3239 *target = value;
3240 return REDIS_OK;
3241 }
3242
3243 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3244 double value;
3245 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3246 if (msg != NULL) {
3247 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3248 } else {
3249 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3250 }
3251 return REDIS_ERR;
3252 }
3253
3254 *target = value;
3255 return REDIS_OK;
3256 }
3257
3258 static int getLongLongFromObject(robj *o, long long *target) {
3259 long long value;
3260 char *eptr;
3261
3262 if (o == NULL) {
3263 value = 0;
3264 } else {
3265 redisAssert(o->type == REDIS_STRING);
3266 if (o->encoding == REDIS_ENCODING_RAW) {
3267 value = strtoll(o->ptr, &eptr, 10);
3268 if (eptr[0] != '\0') return REDIS_ERR;
3269 } else if (o->encoding == REDIS_ENCODING_INT) {
3270 value = (long)o->ptr;
3271 } else {
3272 redisPanic("Unknown string encoding");
3273 }
3274 }
3275
3276 *target = value;
3277 return REDIS_OK;
3278 }
3279
3280 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3281 long long value;
3282 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3283 if (msg != NULL) {
3284 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3285 } else {
3286 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3287 }
3288 return REDIS_ERR;
3289 }
3290
3291 *target = value;
3292 return REDIS_OK;
3293 }
3294
3295 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3296 long long value;
3297
3298 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3299 if (value < LONG_MIN || value > LONG_MAX) {
3300 if (msg != NULL) {
3301 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3302 } else {
3303 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3304 }
3305 return REDIS_ERR;
3306 }
3307
3308 *target = value;
3309 return REDIS_OK;
3310 }
3311
3312 /*============================ RDB saving/loading =========================== */
3313
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * write failed. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return written ? 0 : -1;
}
3318
/* Write the time 't' to 'fp' as a 32 bit signed integer in the byte order
 * of the host. Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;
    size_t written = fwrite(&secs,sizeof(secs),1,fp);

    return written ? 0 : -1;
}
3324
3325 /* check rdbLoadLen() comments for more info */
3326 static int rdbSaveLen(FILE *fp, uint32_t len) {
3327 unsigned char buf[2];
3328
3329 if (len < (1<<6)) {
3330 /* Save a 6 bit len */
3331 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3332 if (fwrite(buf,1,1,fp) == 0) return -1;
3333 } else if (len < (1<<14)) {
3334 /* Save a 14 bit len */
3335 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3336 buf[1] = len&0xFF;
3337 if (fwrite(buf,2,1,fp) == 0) return -1;
3338 } else {
3339 /* Save a 32 bit len */
3340 buf[0] = (REDIS_RDB_32BITLEN<<6);
3341 if (fwrite(buf,1,1,fp) == 0) return -1;
3342 len = htonl(len);
3343 if (fwrite(&len,4,1,fp) == 0) return -1;
3344 }
3345 return 0;
3346 }
3347
3348 /* String objects in the form "2391" "-100" without any space and with a
3349 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3350 * encoded as integers to save space */
3351 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3352 long long value;
3353 char *endptr, buf[32];
3354
3355 /* Check if it's possible to encode this value as a number */
3356 value = strtoll(s, &endptr, 10);
3357 if (endptr[0] != '\0') return 0;
3358 snprintf(buf,32,"%lld",value);
3359
3360 /* If the number converted back into a string is not identical
3361 * then it's not possible to encode the string as integer */
3362 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3363
3364 /* Finally check if it fits in our ranges */
3365 if (value >= -(1<<7) && value <= (1<<7)-1) {
3366 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3367 enc[1] = value&0xFF;
3368 return 2;
3369 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3370 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3371 enc[1] = value&0xFF;
3372 enc[2] = (value>>8)&0xFF;
3373 return 3;
3374 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3375 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3376 enc[1] = value&0xFF;
3377 enc[2] = (value>>8)&0xFF;
3378 enc[3] = (value>>16)&0xFF;
3379 enc[4] = (value>>24)&0xFF;
3380 return 5;
3381 } else {
3382 return 0;
3383 }
3384 }
3385
3386 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3387 size_t comprlen, outlen;
3388 unsigned char byte;
3389 void *out;
3390
3391 /* We require at least four bytes compression for this to be worth it */
3392 if (len <= 4) return 0;
3393 outlen = len-4;
3394 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3395 comprlen = lzf_compress(s, len, out, outlen);
3396 if (comprlen == 0) {
3397 zfree(out);
3398 return 0;
3399 }
3400 /* Data compressed! Let's save it on disk */
3401 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3402 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3403 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3404 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3405 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3406 zfree(out);
3407 return comprlen;
3408
3409 writeerr:
3410 zfree(out);
3411 return -1;
3412 }
3413
3414 /* Save a string objet as [len][data] on disk. If the object is a string
3415 * representation of an integer value we try to safe it in a special form */
3416 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3417 int enclen;
3418
3419 /* Try integer encoding */
3420 if (len <= 11) {
3421 unsigned char buf[5];
3422 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3423 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3424 return 0;
3425 }
3426 }
3427
3428 /* Try LZF compression - under 20 bytes it's unable to compress even
3429 * aaaaaaaaaaaaaaaaaa so skip it */
3430 if (server.rdbcompression && len > 20) {
3431 int retval;
3432
3433 retval = rdbSaveLzfStringObject(fp,s,len);
3434 if (retval == -1) return -1;
3435 if (retval > 0) return 0;
3436 /* retval == 0 means data can't be compressed, save the old way */
3437 }
3438
3439 /* Store verbatim */
3440 if (rdbSaveLen(fp,len) == -1) return -1;
3441 if (len && fwrite(s,len,1,fp) == 0) return -1;
3442 return 0;
3443 }
3444
3445 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3446 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3447 int retval;
3448
3449 /* Avoid incr/decr ref count business when possible.
3450 * This plays well with copy-on-write given that we are probably
3451 * in a child process (BGSAVE). Also this makes sure key objects
3452 * of swapped objects are not incRefCount-ed (an assert does not allow
3453 * this in order to avoid bugs) */
3454 if (obj->encoding != REDIS_ENCODING_RAW) {
3455 obj = getDecodedObject(obj);
3456 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3457 decrRefCount(obj);
3458 } else {
3459 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3460 }
3461 return retval;
3462 }
3463
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values, not followed by
 * any string, in order to specify the following conditions:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int outlen = 1; /* Special values take just the prefix byte */

    if (isnan(val)) {
        out[0] = 253;
    } else if (isfinite(val)) {
        char *digits = (char*)out+1;

        snprintf(digits,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(digits);
        outlen = out[0]+1;
    } else {
        out[0] = (val < 0) ? 255 : 254;
    }
    return fwrite(out,outlen,1,fp) == 0 ? -1 : 0;
}
3490
3491 /* Save a Redis object. */
3492 static int rdbSaveObject(FILE *fp, robj *o) {
3493 if (o->type == REDIS_STRING) {
3494 /* Save a string value */
3495 if (rdbSaveStringObject(fp,o) == -1) return -1;
3496 } else if (o->type == REDIS_LIST) {
3497 /* Save a list value */
3498 list *list = o->ptr;
3499 listIter li;
3500 listNode *ln;
3501
3502 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3503 listRewind(list,&li);
3504 while((ln = listNext(&li))) {
3505 robj *eleobj = listNodeValue(ln);
3506
3507 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3508 }
3509 } else if (o->type == REDIS_SET) {
3510 /* Save a set value */
3511 dict *set = o->ptr;
3512 dictIterator *di = dictGetIterator(set);
3513 dictEntry *de;
3514
3515 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3516 while((de = dictNext(di)) != NULL) {
3517 robj *eleobj = dictGetEntryKey(de);
3518
3519 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3520 }
3521 dictReleaseIterator(di);
3522 } else if (o->type == REDIS_ZSET) {
3523 /* Save a set value */
3524 zset *zs = o->ptr;
3525 dictIterator *di = dictGetIterator(zs->dict);
3526 dictEntry *de;
3527
3528 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3529 while((de = dictNext(di)) != NULL) {
3530 robj *eleobj = dictGetEntryKey(de);
3531 double *score = dictGetEntryVal(de);
3532
3533 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3534 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3535 }
3536 dictReleaseIterator(di);
3537 } else if (o->type == REDIS_HASH) {
3538 /* Save a hash value */
3539 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3540 unsigned char *p = zipmapRewind(o->ptr);
3541 unsigned int count = zipmapLen(o->ptr);
3542 unsigned char *key, *val;
3543 unsigned int klen, vlen;
3544
3545 if (rdbSaveLen(fp,count) == -1) return -1;
3546 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3547 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3548 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3549 }
3550 } else {
3551 dictIterator *di = dictGetIterator(o->ptr);
3552 dictEntry *de;
3553
3554 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3555 while((de = dictNext(di)) != NULL) {
3556 robj *key = dictGetEntryKey(de);
3557 robj *val = dictGetEntryVal(de);
3558
3559 if (rdbSaveStringObject(fp,key) == -1) return -1;
3560 if (rdbSaveStringObject(fp,val) == -1) return -1;
3561 }
3562 dictReleaseIterator(di);
3563 }
3564 } else {
3565 redisPanic("Unknown object type");
3566 }
3567 return 0;
3568 }
3569
3570 /* Return the length the object will have on disk if saved with
3571 * the rdbSaveObject() function. Currently we use a trick to get
3572 * this length with very little changes to the code. In the future
3573 * we could switch to a faster solution. */
3574 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3575 if (fp == NULL) fp = server.devnull;
3576 rewind(fp);
3577 assert(rdbSaveObject(fp,o) != 1);
3578 return ftello(fp);
3579 }
3580
3581 /* Return the number of pages required to save this object in the swap file */
3582 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3583 off_t bytes = rdbSavedObjectLen(o,fp);
3584
3585 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3586 }
3587
3588 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3589 static int rdbSave(char *filename) {
3590 dictIterator *di = NULL;
3591 dictEntry *de;
3592 FILE *fp;
3593 char tmpfile[256];
3594 int j;
3595 time_t now = time(NULL);
3596
3597 /* Wait for I/O therads to terminate, just in case this is a
3598 * foreground-saving, to avoid seeking the swap file descriptor at the
3599 * same time. */
3600 if (server.vm_enabled)
3601 waitEmptyIOJobsQueue();
3602
3603 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3604 fp = fopen(tmpfile,"w");
3605 if (!fp) {
3606 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3607 return REDIS_ERR;
3608 }
3609 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3610 for (j = 0; j < server.dbnum; j++) {
3611 redisDb *db = server.db+j;
3612 dict *d = db->dict;
3613 if (dictSize(d) == 0) continue;
3614 di = dictGetIterator(d);
3615 if (!di) {
3616 fclose(fp);
3617 return REDIS_ERR;
3618 }
3619
3620 /* Write the SELECT DB opcode */
3621 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3622 if (rdbSaveLen(fp,j) == -1) goto werr;
3623
3624 /* Iterate this DB writing every entry */
3625 while((de = dictNext(di)) != NULL) {
3626 robj *key = dictGetEntryKey(de);
3627 robj *o = dictGetEntryVal(de);
3628 time_t expiretime = getExpire(db,key);
3629
3630 /* Save the expire time */
3631 if (expiretime != -1) {
3632 /* If this key is already expired skip it */
3633 if (expiretime < now) continue;
3634 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3635 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3636 }
3637 /* Save the key and associated value. This requires special
3638 * handling if the value is swapped out. */
3639 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3640 key->storage == REDIS_VM_SWAPPING) {
3641 /* Save type, key, value */
3642 if (rdbSaveType(fp,o->type) == -1) goto werr;
3643 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3644 if (rdbSaveObject(fp,o) == -1) goto werr;
3645 } else {
3646 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3647 robj *po;
3648 /* Get a preview of the object in memory */
3649 po = vmPreviewObject(key);
3650 /* Save type, key, value */
3651 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3652 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3653 if (rdbSaveObject(fp,po) == -1) goto werr;
3654 /* Remove the loaded object from memory */
3655 decrRefCount(po);
3656 }
3657 }
3658 dictReleaseIterator(di);
3659 }
3660 /* EOF opcode */
3661 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3662
3663 /* Make sure data will not remain on the OS's output buffers */
3664 fflush(fp);
3665 fsync(fileno(fp));
3666 fclose(fp);
3667
3668 /* Use RENAME to make sure the DB file is changed atomically only
3669 * if the generate DB file is ok. */
3670 if (rename(tmpfile,filename) == -1) {
3671 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3672 unlink(tmpfile);
3673 return REDIS_ERR;
3674 }
3675 redisLog(REDIS_NOTICE,"DB saved on disk");
3676 server.dirty = 0;
3677 server.lastsave = time(NULL);
3678 return REDIS_OK;
3679
3680 werr:
3681 fclose(fp);
3682 unlink(tmpfile);
3683 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3684 if (di) dictReleaseIterator(di);
3685 return REDIS_ERR;
3686 }
3687
3688 static int rdbSaveBackground(char *filename) {
3689 pid_t childpid;
3690
3691 if (server.bgsavechildpid != -1) return REDIS_ERR;
3692 if (server.vm_enabled) waitEmptyIOJobsQueue();
3693 if ((childpid = fork()) == 0) {
3694 /* Child */
3695 if (server.vm_enabled) vmReopenSwapFile();
3696 close(server.fd);
3697 if (rdbSave(filename) == REDIS_OK) {
3698 _exit(0);
3699 } else {
3700 _exit(1);
3701 }
3702 } else {
3703 /* Parent */
3704 if (childpid == -1) {
3705 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3706 strerror(errno));
3707 return REDIS_ERR;
3708 }
3709 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3710 server.bgsavechildpid = childpid;
3711 updateDictResizePolicy();
3712 return REDIS_OK;
3713 }
3714 return REDIS_OK; /* unreached */
3715 }
3716
/* Unlink the "temp-<childpid>.rdb" file in the current working directory.
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int) childpid);
    unlink(path);
}
3723
/* Read a single byte from 'fp' and return it as a non negative int.
 * Returns -1 when the byte cannot be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3729
/* Read a 4 byte signed integer from 'fp', in the byte order of the host,
 * and return it as a time_t. Returns -1 when the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) == 0) return -1;
    return (time_t) raw;
}
3735
3736 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3737 * of this file for a description of how this are stored on disk.
3738 *
3739 * isencoded is set to 1 if the readed length is not actually a length but
3740 * an "encoding type", check the above comments for more info */
3741 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3742 unsigned char buf[2];
3743 uint32_t len;
3744 int type;
3745
3746 if (isencoded) *isencoded = 0;
3747 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3748 type = (buf[0]&0xC0)>>6;
3749 if (type == REDIS_RDB_6BITLEN) {
3750 /* Read a 6 bit len */
3751 return buf[0]&0x3F;
3752 } else if (type == REDIS_RDB_ENCVAL) {
3753 /* Read a 6 bit len encoding type */
3754 if (isencoded) *isencoded = 1;
3755 return buf[0]&0x3F;
3756 } else if (type == REDIS_RDB_14BITLEN) {
3757 /* Read a 14 bit len */
3758 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3759 return ((buf[0]&0x3F)<<8)|buf[1];
3760 } else {
3761 /* Read a 32 bit len */
3762 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3763 return ntohl(len);
3764 }
3765 }
3766
3767 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3768 unsigned char enc[4];
3769 long long val;
3770
3771 if (enctype == REDIS_RDB_ENC_INT8) {
3772 if (fread(enc,1,1,fp) == 0) return NULL;
3773 val = (signed char)enc[0];
3774 } else if (enctype == REDIS_RDB_ENC_INT16) {
3775 uint16_t v;
3776 if (fread(enc,2,1,fp) == 0) return NULL;
3777 v = enc[0]|(enc[1]<<8);
3778 val = (int16_t)v;
3779 } else if (enctype == REDIS_RDB_ENC_INT32) {
3780 uint32_t v;
3781 if (fread(enc,4,1,fp) == 0) return NULL;
3782 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3783 val = (int32_t)v;
3784 } else {
3785 val = 0; /* anti-warning */
3786 redisPanic("Unknown RDB integer encoding type");
3787 }
3788 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3789 }
3790
3791 static robj *rdbLoadLzfStringObject(FILE*fp) {
3792 unsigned int len, clen;
3793 unsigned char *c = NULL;
3794 sds val = NULL;
3795
3796 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3797 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3798 if ((c = zmalloc(clen)) == NULL) goto err;
3799 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3800 if (fread(c,clen,1,fp) == 0) goto err;
3801 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3802 zfree(c);
3803 return createObject(REDIS_STRING,val);
3804 err:
3805 zfree(c);
3806 sdsfree(val);
3807 return NULL;
3808 }
3809
3810 static robj *rdbLoadStringObject(FILE*fp) {
3811 int isencoded;
3812 uint32_t len;
3813 sds val;
3814
3815 len = rdbLoadLen(fp,&isencoded);
3816 if (isencoded) {
3817 switch(len) {
3818 case REDIS_RDB_ENC_INT8:
3819 case REDIS_RDB_ENC_INT16:
3820 case REDIS_RDB_ENC_INT32:
3821 return rdbLoadIntegerObject(fp,len);
3822 case REDIS_RDB_ENC_LZF:
3823 return rdbLoadLzfStringObject(fp);
3824 default:
3825 redisPanic("Unknown RDB encoding type");
3826 }
3827 }
3828
3829 if (len == REDIS_RDB_LENERR) return NULL;
3830 val = sdsnewlen(NULL,len);
3831 if (len && fread(val,len,1,fp) == 0) {
3832 sdsfree(val);
3833 return NULL;
3834 }
3835 return createObject(REDIS_STRING,val);
3836 }
3837
3838 /* For information about double serialization check rdbSaveDoubleValue() */
3839 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3840 char buf[128];
3841 unsigned char len;
3842
3843 if (fread(&len,1,1,fp) == 0) return -1;
3844 switch(len) {
3845 case 255: *val = R_NegInf; return 0;
3846 case 254: *val = R_PosInf; return 0;
3847 case 253: *val = R_Nan; return 0;
3848 default:
3849 if (fread(buf,len,1,fp) == 0) return -1;
3850 buf[len] = '\0';
3851 sscanf(buf, "%lg", val);
3852 return 0;
3853 }
3854 }
3855
3856 /* Load a Redis object of the specified type from the specified file.
3857 * On success a newly allocated object is returned, otherwise NULL. */
3858 static robj *rdbLoadObject(int type, FILE *fp) {
3859 robj *o;
3860
3861 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3862 if (type == REDIS_STRING) {
3863 /* Read string value */
3864 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3865 o = tryObjectEncoding(o);
3866 } else if (type == REDIS_LIST || type == REDIS_SET) {
3867 /* Read list/set value */
3868 uint32_t listlen;
3869
3870 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3871 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3872 /* It's faster to expand the dict to the right size asap in order
3873 * to avoid rehashing */
3874 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3875 dictExpand(o->ptr,listlen);
3876 /* Load every single element of the list/set */
3877 while(listlen--) {
3878 robj *ele;
3879
3880 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3881 ele = tryObjectEncoding(ele);
3882 if (type == REDIS_LIST) {
3883 listAddNodeTail((list*)o->ptr,ele);
3884 } else {
3885 dictAdd((dict*)o->ptr,ele,NULL);
3886 }
3887 }
3888 } else if (type == REDIS_ZSET) {
3889 /* Read list/set value */
3890 size_t zsetlen;
3891 zset *zs;
3892
3893 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3894 o = createZsetObject();
3895 zs = o->ptr;
3896 /* Load every single element of the list/set */
3897 while(zsetlen--) {
3898 robj *ele;
3899 double *score = zmalloc(sizeof(double));
3900
3901 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3902 ele = tryObjectEncoding(ele);
3903 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3904 dictAdd(zs->dict,ele,score);
3905 zslInsert(zs->zsl,*score,ele);
3906 incrRefCount(ele); /* added to skiplist */
3907 }
3908 } else if (type == REDIS_HASH) {
3909 size_t hashlen;
3910
3911 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3912 o = createHashObject();
3913 /* Too many entries? Use an hash table. */
3914 if (hashlen > server.hash_max_zipmap_entries)
3915 convertToRealHash(o);
3916 /* Load every key/value, then set it into the zipmap or hash
3917 * table, as needed. */
3918 while(hashlen--) {
3919 robj *key, *val;
3920
3921 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3922 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3923 /* If we are using a zipmap and there are too big values
3924 * the object is converted to real hash table encoding. */
3925 if (o->encoding != REDIS_ENCODING_HT &&
3926 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3927 sdslen(val->ptr) > server.hash_max_zipmap_value))
3928 {
3929 convertToRealHash(o);
3930 }
3931
3932 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3933 unsigned char *zm = o->ptr;
3934
3935 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3936 val->ptr,sdslen(val->ptr),NULL);
3937 o->ptr = zm;
3938 decrRefCount(key);
3939 decrRefCount(val);
3940 } else {
3941 key = tryObjectEncoding(key);
3942 val = tryObjectEncoding(val);
3943 dictAdd((dict*)o->ptr,key,val);
3944 }
3945 }
3946 } else {
3947 redisPanic("Unknown object type");
3948 }
3949 return o;
3950 }
3951
3952 static int rdbLoad(char *filename) {
3953 FILE *fp;
3954 robj *keyobj = NULL;
3955 uint32_t dbid;
3956 int type, retval, rdbver;
3957 dict *d = server.db[0].dict;
3958 redisDb *db = server.db+0;
3959 char buf[1024];
3960 time_t expiretime = -1, now = time(NULL);
3961 long long loadedkeys = 0;
3962
3963 fp = fopen(filename,"r");
3964 if (!fp) return REDIS_ERR;
3965 if (fread(buf,9,1,fp) == 0) goto eoferr;
3966 buf[9] = '\0';
3967 if (memcmp(buf,"REDIS",5) != 0) {
3968 fclose(fp);
3969 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3970 return REDIS_ERR;
3971 }
3972 rdbver = atoi(buf+5);
3973 if (rdbver != 1) {
3974 fclose(fp);
3975 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3976 return REDIS_ERR;
3977 }
3978 while(1) {
3979 robj *o;
3980
3981 /* Read type. */
3982 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3983 if (type == REDIS_EXPIRETIME) {
3984 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3985 /* We read the time so we need to read the object type again */
3986 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3987 }
3988 if (type == REDIS_EOF) break;
3989 /* Handle SELECT DB opcode as a special case */
3990 if (type == REDIS_SELECTDB) {
3991 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3992 goto eoferr;
3993 if (dbid >= (unsigned)server.dbnum) {
3994 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3995 exit(1);
3996 }
3997 db = server.db+dbid;
3998 d = db->dict;
3999 continue;
4000 }
4001 /* Read key */
4002 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4003 /* Read value */
4004 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4005 /* Add the new object in the hash table */
4006 retval = dictAdd(d,keyobj,o);
4007 if (retval == DICT_ERR) {
4008 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
4009 exit(1);
4010 }
4011 /* Set the expire time if needed */
4012 if (expiretime != -1) {
4013 setExpire(db,keyobj,expiretime);
4014 /* Delete this key if already expired */
4015 if (expiretime < now) deleteKey(db,keyobj);
4016 expiretime = -1;
4017 }
4018 keyobj = o = NULL;
4019 /* Handle swapping while loading big datasets when VM is on */
4020 loadedkeys++;
4021 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
4022 while (zmalloc_used_memory() > server.vm_max_memory) {
4023 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4024 }
4025 }
4026 }
4027 fclose(fp);
4028 return REDIS_OK;
4029
4030 eoferr: /* unexpected end of file is handled here with a fatal exit */
4031 if (keyobj) decrRefCount(keyobj);
4032 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4033 exit(1);
4034 return REDIS_ERR; /* Just to avoid warning */
4035 }
4036
4037 /*================================== Commands =============================== */
4038
4039 static void authCommand(redisClient *c) {
4040 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4041 c->authenticated = 1;
4042 addReply(c,shared.ok);
4043 } else {
4044 c->authenticated = 0;
4045 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4046 }
4047 }
4048
4049 static void pingCommand(redisClient *c) {
4050 addReply(c,shared.pong);
4051 }
4052
4053 static void echoCommand(redisClient *c) {
4054 addReplyBulk(c,c->argv[1]);
4055 }
4056
4057 /*=================================== Strings =============================== */
4058
4059 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4060 int retval;
4061 long seconds = 0; /* initialized to avoid an harmness warning */
4062
4063 if (expire) {
4064 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4065 return;
4066 if (seconds <= 0) {
4067 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4068 return;
4069 }
4070 }
4071
4072 if (nx) deleteIfVolatile(c->db,key);
4073 retval = dictAdd(c->db->dict,key,val);
4074 if (retval == DICT_ERR) {
4075 if (!nx) {
4076 /* If the key is about a swapped value, we want a new key object
4077 * to overwrite the old. So we delete the old key in the database.
4078 * This will also make sure that swap pages about the old object
4079 * will be marked as free. */
4080 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4081 incrRefCount(key);
4082 dictReplace(c->db->dict,key,val);
4083 incrRefCount(val);
4084 } else {
4085 addReply(c,shared.czero);
4086 return;
4087 }
4088 } else {
4089 incrRefCount(key);
4090 incrRefCount(val);
4091 }
4092 server.dirty++;
4093 removeExpire(c->db,key);
4094 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4095 addReply(c, nx ? shared.cone : shared.ok);
4096 }
4097
4098 static void setCommand(redisClient *c) {
4099 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4100 }
4101
4102 static void setnxCommand(redisClient *c) {
4103 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4104 }
4105
4106 static void setexCommand(redisClient *c) {
4107 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4108 }
4109
4110 static int getGenericCommand(redisClient *c) {
4111 robj *o;
4112
4113 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4114 return REDIS_OK;
4115
4116 if (o->type != REDIS_STRING) {
4117 addReply(c,shared.wrongtypeerr);
4118 return REDIS_ERR;
4119 } else {
4120 addReplyBulk(c,o);
4121 return REDIS_OK;
4122 }
4123 }
4124
4125 static void getCommand(redisClient *c) {
4126 getGenericCommand(c);
4127 }
4128
4129 static void getsetCommand(redisClient *c) {
4130 if (getGenericCommand(c) == REDIS_ERR) return;
4131 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4132 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4133 } else {
4134 incrRefCount(c->argv[1]);
4135 }
4136 incrRefCount(c->argv[2]);
4137 server.dirty++;
4138 removeExpire(c->db,c->argv[1]);
4139 }
4140
4141 static void mgetCommand(redisClient *c) {
4142 int j;
4143
4144 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4145 for (j = 1; j < c->argc; j++) {
4146 robj *o = lookupKeyRead(c->db,c->argv[j]);
4147 if (o == NULL) {
4148 addReply(c,shared.nullbulk);
4149 } else {
4150 if (o->type != REDIS_STRING) {
4151 addReply(c,shared.nullbulk);
4152 } else {
4153 addReplyBulk(c,o);
4154 }
4155 }
4156 }
4157 }
4158
4159 static void msetGenericCommand(redisClient *c, int nx) {
4160 int j, busykeys = 0;
4161
4162 if ((c->argc % 2) == 0) {
4163 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4164 return;
4165 }
4166 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4167 * set nothing at all if at least one already key exists. */
4168 if (nx) {
4169 for (j = 1; j < c->argc; j += 2) {
4170 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4171 busykeys++;
4172 }
4173 }
4174 }
4175 if (busykeys) {
4176 addReply(c, shared.czero);
4177 return;
4178 }
4179
4180 for (j = 1; j < c->argc; j += 2) {
4181 int retval;
4182
4183 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4184 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4185 if (retval == DICT_ERR) {
4186 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4187 incrRefCount(c->argv[j+1]);
4188 } else {
4189 incrRefCount(c->argv[j]);
4190 incrRefCount(c->argv[j+1]);
4191 }
4192 removeExpire(c->db,c->argv[j]);
4193 }
4194 server.dirty += (c->argc-1)/2;
4195 addReply(c, nx ? shared.cone : shared.ok);
4196 }
4197
4198 static void msetCommand(redisClient *c) {
4199 msetGenericCommand(c,0);
4200 }
4201
4202 static void msetnxCommand(redisClient *c) {
4203 msetGenericCommand(c,1);
4204 }
4205
4206 static void incrDecrCommand(redisClient *c, long long incr) {
4207 long long value;
4208 int retval;
4209 robj *o;
4210
4211 o = lookupKeyWrite(c->db,c->argv[1]);
4212
4213 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4214
4215 value += incr;
4216 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4217 o = tryObjectEncoding(o);
4218 retval = dictAdd(c->db->dict,c->argv[1],o);
4219 if (retval == DICT_ERR) {
4220 dictReplace(c->db->dict,c->argv[1],o);
4221 removeExpire(c->db,c->argv[1]);
4222 } else {
4223 incrRefCount(c->argv[1]);
4224 }
4225 server.dirty++;
4226 addReply(c,shared.colon);
4227 addReply(c,o);
4228 addReply(c,shared.crlf);
4229 }
4230
4231 static void incrCommand(redisClient *c) {
4232 incrDecrCommand(c,1);
4233 }
4234
4235 static void decrCommand(redisClient *c) {
4236 incrDecrCommand(c,-1);
4237 }
4238
4239 static void incrbyCommand(redisClient *c) {
4240 long long incr;
4241
4242 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4243 incrDecrCommand(c,incr);
4244 }
4245
4246 static void decrbyCommand(redisClient *c) {
4247 long long incr;
4248
4249 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4250 incrDecrCommand(c,-incr);
4251 }
4252
4253 static void appendCommand(redisClient *c) {
4254 int retval;
4255 size_t totlen;
4256 robj *o;
4257
4258 o = lookupKeyWrite(c->db,c->argv[1]);
4259 if (o == NULL) {
4260 /* Create the key */
4261 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4262 incrRefCount(c->argv[1]);
4263 incrRefCount(c->argv[2]);
4264 totlen = stringObjectLen(c->argv[2]);
4265 } else {
4266 dictEntry *de;
4267
4268 de = dictFind(c->db->dict,c->argv[1]);
4269 assert(de != NULL);
4270
4271 o = dictGetEntryVal(de);
4272 if (o->type != REDIS_STRING) {
4273 addReply(c,shared.wrongtypeerr);
4274 return;
4275 }
4276 /* If the object is specially encoded or shared we have to make
4277 * a copy */
4278 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4279 robj *decoded = getDecodedObject(o);
4280
4281 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4282 decrRefCount(decoded);
4283 dictReplace(c->db->dict,c->argv[1],o);
4284 }
4285 /* APPEND! */
4286 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4287 o->ptr = sdscatlen(o->ptr,
4288 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4289 } else {
4290 o->ptr = sdscatprintf(o->ptr, "%ld",
4291 (unsigned long) c->argv[2]->ptr);
4292 }
4293 totlen = sdslen(o->ptr);
4294 }
4295 server.dirty++;
4296 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4297 }
4298
4299 static void substrCommand(redisClient *c) {
4300 robj *o;
4301 long start = atoi(c->argv[2]->ptr);
4302 long end = atoi(c->argv[3]->ptr);
4303 size_t rangelen, strlen;
4304 sds range;
4305
4306 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4307 checkType(c,o,REDIS_STRING)) return;
4308
4309 o = getDecodedObject(o);
4310 strlen = sdslen(o->ptr);
4311
4312 /* convert negative indexes */
4313 if (start < 0) start = strlen+start;
4314 if (end < 0) end = strlen+end;
4315 if (start < 0) start = 0;
4316 if (end < 0) end = 0;
4317
4318 /* indexes sanity checks */
4319 if (start > end || (size_t)start >= strlen) {
4320 /* Out of range start or start > end result in null reply */
4321 addReply(c,shared.nullbulk);
4322 decrRefCount(o);
4323 return;
4324 }
4325 if ((size_t)end >= strlen) end = strlen-1;
4326 rangelen = (end-start)+1;
4327
4328 /* Return the result */
4329 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4330 range = sdsnewlen((char*)o->ptr+start,rangelen);
4331 addReplySds(c,range);
4332 addReply(c,shared.crlf);
4333 decrRefCount(o);
4334 }
4335
4336 /* ========================= Type agnostic commands ========================= */
4337
4338 static void delCommand(redisClient *c) {
4339 int deleted = 0, j;
4340
4341 for (j = 1; j < c->argc; j++) {
4342 if (deleteKey(c->db,c->argv[j])) {
4343 server.dirty++;
4344 deleted++;
4345 }
4346 }
4347 addReplyLong(c,deleted);
4348 }
4349
4350 static void existsCommand(redisClient *c) {
4351 expireIfNeeded(c->db,c->argv[1]);
4352 if (dictFind(c->db->dict,c->argv[1])) {
4353 addReply(c, shared.cone);
4354 } else {
4355 addReply(c, shared.czero);
4356 }
4357 }
4358
4359 static void selectCommand(redisClient *c) {
4360 int id = atoi(c->argv[1]->ptr);
4361
4362 if (selectDb(c,id) == REDIS_ERR) {
4363 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4364 } else {
4365 addReply(c,shared.ok);
4366 }
4367 }
4368
4369 static void randomkeyCommand(redisClient *c) {
4370 dictEntry *de;
4371 robj *key;
4372
4373 while(1) {
4374 de = dictGetRandomKey(c->db->dict);
4375 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4376 }
4377
4378 if (de == NULL) {
4379 addReply(c,shared.nullbulk);
4380 return;
4381 }
4382
4383 key = dictGetEntryKey(de);
4384 if (server.vm_enabled) {
4385 key = dupStringObject(key);
4386 addReplyBulk(c,key);
4387 decrRefCount(key);
4388 } else {
4389 addReplyBulk(c,key);
4390 }
4391 }
4392
4393 static void keysCommand(redisClient *c) {
4394 dictIterator *di;
4395 dictEntry *de;
4396 sds pattern = c->argv[1]->ptr;
4397 int plen = sdslen(pattern);
4398 unsigned long numkeys = 0;
4399 robj *lenobj = createObject(REDIS_STRING,NULL);
4400
4401 di = dictGetIterator(c->db->dict);
4402 addReply(c,lenobj);
4403 decrRefCount(lenobj);
4404 while((de = dictNext(di)) != NULL) {
4405 robj *keyobj = dictGetEntryKey(de);
4406
4407 sds key = keyobj->ptr;
4408 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4409 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4410 if (expireIfNeeded(c->db,keyobj) == 0) {
4411 addReplyBulk(c,keyobj);
4412 numkeys++;
4413 }
4414 }
4415 }
4416 dictReleaseIterator(di);
4417 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4418 }
4419
4420 static void dbsizeCommand(redisClient *c) {
4421 addReplySds(c,
4422 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4423 }
4424
4425 static void lastsaveCommand(redisClient *c) {
4426 addReplySds(c,
4427 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4428 }
4429
4430 static void typeCommand(redisClient *c) {
4431 robj *o;
4432 char *type;
4433
4434 o = lookupKeyRead(c->db,c->argv[1]);
4435 if (o == NULL) {
4436 type = "+none";
4437 } else {
4438 switch(o->type) {
4439 case REDIS_STRING: type = "+string"; break;
4440 case REDIS_LIST: type = "+list"; break;
4441 case REDIS_SET: type = "+set"; break;
4442 case REDIS_ZSET: type = "+zset"; break;
4443 case REDIS_HASH: type = "+hash"; break;
4444 default: type = "+unknown"; break;
4445 }
4446 }
4447 addReplySds(c,sdsnew(type));
4448 addReply(c,shared.crlf);
4449 }
4450
4451 static void saveCommand(redisClient *c) {
4452 if (server.bgsavechildpid != -1) {
4453 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4454 return;
4455 }
4456 if (rdbSave(server.dbfilename) == REDIS_OK) {
4457 addReply(c,shared.ok);
4458 } else {
4459 addReply(c,shared.err);
4460 }
4461 }
4462
4463 static void bgsaveCommand(redisClient *c) {
4464 if (server.bgsavechildpid != -1) {
4465 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4466 return;
4467 }
4468 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4469 char *status = "+Background saving started\r\n";
4470 addReplySds(c,sdsnew(status));
4471 } else {
4472 addReply(c,shared.err);
4473 }
4474 }
4475
4476 static void shutdownCommand(redisClient *c) {
4477 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4478 /* Kill the saving child if there is a background saving in progress.
4479 We want to avoid race conditions, for instance our saving child may
4480 overwrite the synchronous saving did by SHUTDOWN. */
4481 if (server.bgsavechildpid != -1) {
4482 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4483 kill(server.bgsavechildpid,SIGKILL);
4484 rdbRemoveTempFile(server.bgsavechildpid);
4485 }
4486 if (server.appendonly) {
4487 /* Append only file: fsync() the AOF and exit */
4488 fsync(server.appendfd);
4489 if (server.vm_enabled) unlink(server.vm_swap_file);
4490 exit(0);
4491 } else {
4492 /* Snapshotting. Perform a SYNC SAVE and exit */
4493 if (rdbSave(server.dbfilename) == REDIS_OK) {
4494 if (server.daemonize)
4495 unlink(server.pidfile);
4496 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4497 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4498 exit(0);
4499 } else {
4500 /* Ooops.. error saving! The best we can do is to continue
4501 * operating. Note that if there was a background saving process,
4502 * in the next cron() Redis will be notified that the background
4503 * saving aborted, handling special stuff like slaves pending for
4504 * synchronization... */
4505 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4506 addReplySds(c,
4507 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4508 }
4509 }
4510 }
4511
4512 static void renameGenericCommand(redisClient *c, int nx) {
4513 robj *o;
4514
4515 /* To use the same key as src and dst is probably an error */
4516 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4517 addReply(c,shared.sameobjecterr);
4518 return;
4519 }
4520
4521 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4522 return;
4523
4524 incrRefCount(o);
4525 deleteIfVolatile(c->db,c->argv[2]);
4526 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4527 if (nx) {
4528 decrRefCount(o);
4529 addReply(c,shared.czero);
4530 return;
4531 }
4532 dictReplace(c->db->dict,c->argv[2],o);
4533 } else {
4534 incrRefCount(c->argv[2]);
4535 }
4536 deleteKey(c->db,c->argv[1]);
4537 server.dirty++;
4538 addReply(c,nx ? shared.cone : shared.ok);
4539 }
4540
4541 static void renameCommand(redisClient *c) {
4542 renameGenericCommand(c,0);
4543 }
4544
4545 static void renamenxCommand(redisClient *c) {
4546 renameGenericCommand(c,1);
4547 }
4548
4549 static void moveCommand(redisClient *c) {
4550 robj *o;
4551 redisDb *src, *dst;
4552 int srcid;
4553
4554 /* Obtain source and target DB pointers */
4555 src = c->db;
4556 srcid = c->db->id;
4557 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4558 addReply(c,shared.outofrangeerr);
4559 return;
4560 }
4561 dst = c->db;
4562 selectDb(c,srcid); /* Back to the source DB */
4563
4564 /* If the user is moving using as target the same
4565 * DB as the source DB it is probably an error. */
4566 if (src == dst) {
4567 addReply(c,shared.sameobjecterr);
4568 return;
4569 }
4570
4571 /* Check if the element exists and get a reference */
4572 o = lookupKeyWrite(c->db,c->argv[1]);
4573 if (!o) {
4574 addReply(c,shared.czero);
4575 return;
4576 }
4577
4578 /* Try to add the element to the target DB */
4579 deleteIfVolatile(dst,c->argv[1]);
4580 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4581 addReply(c,shared.czero);
4582 return;
4583 }
4584 incrRefCount(c->argv[1]);
4585 incrRefCount(o);
4586
4587 /* OK! key moved, free the entry in the source DB */
4588 deleteKey(src,c->argv[1]);
4589 server.dirty++;
4590 addReply(c,shared.cone);
4591 }
4592
4593 /* =================================== Lists ================================ */
4594 static void pushGenericCommand(redisClient *c, int where) {
4595 robj *lobj;
4596 list *list;
4597
4598 lobj = lookupKeyWrite(c->db,c->argv[1]);
4599 if (lobj == NULL) {
4600 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4601 addReply(c,shared.cone);
4602 return;
4603 }
4604 lobj = createListObject();
4605 list = lobj->ptr;
4606 if (where == REDIS_HEAD) {
4607 listAddNodeHead(list,c->argv[2]);
4608 } else {
4609 listAddNodeTail(list,c->argv[2]);
4610 }
4611 dictAdd(c->db->dict,c->argv[1],lobj);
4612 incrRefCount(c->argv[1]);
4613 incrRefCount(c->argv[2]);
4614 } else {
4615 if (lobj->type != REDIS_LIST) {
4616 addReply(c,shared.wrongtypeerr);
4617 return;
4618 }
4619 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4620 addReply(c,shared.cone);
4621 return;
4622 }
4623 list = lobj->ptr;
4624 if (where == REDIS_HEAD) {
4625 listAddNodeHead(list,c->argv[2]);
4626 } else {
4627 listAddNodeTail(list,c->argv[2]);
4628 }
4629 incrRefCount(c->argv[2]);
4630 }
4631 server.dirty++;
4632 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4633 }
4634
4635 static void lpushCommand(redisClient *c) {
4636 pushGenericCommand(c,REDIS_HEAD);
4637 }
4638
4639 static void rpushCommand(redisClient *c) {
4640 pushGenericCommand(c,REDIS_TAIL);
4641 }
4642
4643 static void llenCommand(redisClient *c) {
4644 robj *o;
4645 list *l;
4646
4647 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4648 checkType(c,o,REDIS_LIST)) return;
4649
4650 l = o->ptr;
4651 addReplyUlong(c,listLength(l));
4652 }
4653
4654 static void lindexCommand(redisClient *c) {
4655 robj *o;
4656 int index = atoi(c->argv[2]->ptr);
4657 list *list;
4658 listNode *ln;
4659
4660 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4661 checkType(c,o,REDIS_LIST)) return;
4662 list = o->ptr;
4663
4664 ln = listIndex(list, index);
4665 if (ln == NULL) {
4666 addReply(c,shared.nullbulk);
4667 } else {
4668 robj *ele = listNodeValue(ln);
4669 addReplyBulk(c,ele);
4670 }
4671 }
4672
4673 static void lsetCommand(redisClient *c) {
4674 robj *o;
4675 int index = atoi(c->argv[2]->ptr);
4676 list *list;
4677 listNode *ln;
4678
4679 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4680 checkType(c,o,REDIS_LIST)) return;
4681 list = o->ptr;
4682
4683 ln = listIndex(list, index);
4684 if (ln == NULL) {
4685 addReply(c,shared.outofrangeerr);
4686 } else {
4687 robj *ele = listNodeValue(ln);
4688
4689 decrRefCount(ele);
4690 listNodeValue(ln) = c->argv[3];
4691 incrRefCount(c->argv[3]);
4692 addReply(c,shared.ok);
4693 server.dirty++;
4694 }
4695 }
4696
4697 static void popGenericCommand(redisClient *c, int where) {
4698 robj *o;
4699 list *list;
4700 listNode *ln;
4701
4702 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4703 checkType(c,o,REDIS_LIST)) return;
4704 list = o->ptr;
4705
4706 if (where == REDIS_HEAD)
4707 ln = listFirst(list);
4708 else
4709 ln = listLast(list);
4710
4711 if (ln == NULL) {
4712 addReply(c,shared.nullbulk);
4713 } else {
4714 robj *ele = listNodeValue(ln);
4715 addReplyBulk(c,ele);
4716 listDelNode(list,ln);
4717 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4718 server.dirty++;
4719 }
4720 }
4721
4722 static void lpopCommand(redisClient *c) {
4723 popGenericCommand(c,REDIS_HEAD);
4724 }
4725
4726 static void rpopCommand(redisClient *c) {
4727 popGenericCommand(c,REDIS_TAIL);
4728 }
4729
4730 static void lrangeCommand(redisClient *c) {
4731 robj *o;
4732 int start = atoi(c->argv[2]->ptr);
4733 int end = atoi(c->argv[3]->ptr);
4734 int llen;
4735 int rangelen, j;
4736 list *list;
4737 listNode *ln;
4738 robj *ele;
4739
4740 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4741 || checkType(c,o,REDIS_LIST)) return;
4742 list = o->ptr;
4743 llen = listLength(list);
4744
4745 /* convert negative indexes */
4746 if (start < 0) start = llen+start;
4747 if (end < 0) end = llen+end;
4748 if (start < 0) start = 0;
4749 if (end < 0) end = 0;
4750
4751 /* indexes sanity checks */
4752 if (start > end || start >= llen) {
4753 /* Out of range start or start > end result in empty list */
4754 addReply(c,shared.emptymultibulk);
4755 return;
4756 }
4757 if (end >= llen) end = llen-1;
4758 rangelen = (end-start)+1;
4759
4760 /* Return the result in form of a multi-bulk reply */
4761 ln = listIndex(list, start);
4762 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4763 for (j = 0; j < rangelen; j++) {
4764 ele = listNodeValue(ln);
4765 addReplyBulk(c,ele);
4766 ln = ln->next;
4767 }
4768 }
4769
4770 static void ltrimCommand(redisClient *c) {
4771 robj *o;
4772 int start = atoi(c->argv[2]->ptr);
4773 int end = atoi(c->argv[3]->ptr);
4774 int llen;
4775 int j, ltrim, rtrim;
4776 list *list;
4777 listNode *ln;
4778
4779 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4780 checkType(c,o,REDIS_LIST)) return;
4781 list = o->ptr;
4782 llen = listLength(list);
4783
4784 /* convert negative indexes */
4785 if (start < 0) start = llen+start;
4786 if (end < 0) end = llen+end;
4787 if (start < 0) start = 0;
4788 if (end < 0) end = 0;
4789
4790 /* indexes sanity checks */
4791 if (start > end || start >= llen) {
4792 /* Out of range start or start > end result in empty list */
4793 ltrim = llen;
4794 rtrim = 0;
4795 } else {
4796 if (end >= llen) end = llen-1;
4797 ltrim = start;
4798 rtrim = llen-end-1;
4799 }
4800
4801 /* Remove list elements to perform the trim */
4802 for (j = 0; j < ltrim; j++) {
4803 ln = listFirst(list);
4804 listDelNode(list,ln);
4805 }
4806 for (j = 0; j < rtrim; j++) {
4807 ln = listLast(list);
4808 listDelNode(list,ln);
4809 }
4810 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4811 server.dirty++;
4812 addReply(c,shared.ok);
4813 }
4814
4815 static void lremCommand(redisClient *c) {
4816 robj *o;
4817 list *list;
4818 listNode *ln, *next;
4819 int toremove = atoi(c->argv[2]->ptr);
4820 int removed = 0;
4821 int fromtail = 0;
4822
4823 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4824 checkType(c,o,REDIS_LIST)) return;
4825 list = o->ptr;
4826
4827 if (toremove < 0) {
4828 toremove = -toremove;
4829 fromtail = 1;
4830 }
4831 ln = fromtail ? list->tail : list->head;
4832 while (ln) {
4833 robj *ele = listNodeValue(ln);
4834
4835 next = fromtail ? ln->prev : ln->next;
4836 if (compareStringObjects(ele,c->argv[3]) == 0) {
4837 listDelNode(list,ln);
4838 server.dirty++;
4839 removed++;
4840 if (toremove && removed == toremove) break;
4841 }
4842 ln = next;
4843 }
4844 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4845 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4846 }
4847
4848 /* This is the semantic of this command:
4849 * RPOPLPUSH srclist dstlist:
4850 * IF LLEN(srclist) > 0
4851 * element = RPOP srclist
4852 * LPUSH dstlist element
4853 * RETURN element
4854 * ELSE
4855 * RETURN nil
4856 * END
4857 * END
4858 *
4859 * The idea is to be able to get an element from a list in a reliable way
4860 * since the element is not just returned but pushed against another list
4861 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4862 */
4863 static void rpoplpushcommand(redisClient *c) {
4864 robj *sobj;
4865 list *srclist;
4866 listNode *ln;
4867
4868 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4869 checkType(c,sobj,REDIS_LIST)) return;
4870 srclist = sobj->ptr;
4871 ln = listLast(srclist);
4872
4873 if (ln == NULL) {
4874 addReply(c,shared.nullbulk);
4875 } else {
4876 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4877 robj *ele = listNodeValue(ln);
4878 list *dstlist;
4879
4880 if (dobj && dobj->type != REDIS_LIST) {
4881 addReply(c,shared.wrongtypeerr);
4882 return;
4883 }
4884
4885 /* Add the element to the target list (unless it's directly
4886 * passed to some BLPOP-ing client */
4887 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4888 if (dobj == NULL) {
4889 /* Create the list if the key does not exist */
4890 dobj = createListObject();
4891 dictAdd(c->db->dict,c->argv[2],dobj);
4892 incrRefCount(c->argv[2]);
4893 }
4894 dstlist = dobj->ptr;
4895 listAddNodeHead(dstlist,ele);
4896 incrRefCount(ele);
4897 }
4898
4899 /* Send the element to the client as reply as well */
4900 addReplyBulk(c,ele);
4901
4902 /* Finally remove the element from the source list */
4903 listDelNode(srclist,ln);
4904 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4905 server.dirty++;
4906 }
4907 }
4908
4909 /* ==================================== Sets ================================ */
4910
4911 static void saddCommand(redisClient *c) {
4912 robj *set;
4913
4914 set = lookupKeyWrite(c->db,c->argv[1]);
4915 if (set == NULL) {
4916 set = createSetObject();
4917 dictAdd(c->db->dict,c->argv[1],set);
4918 incrRefCount(c->argv[1]);
4919 } else {
4920 if (set->type != REDIS_SET) {
4921 addReply(c,shared.wrongtypeerr);
4922 return;
4923 }
4924 }
4925 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4926 incrRefCount(c->argv[2]);
4927 server.dirty++;
4928 addReply(c,shared.cone);
4929 } else {
4930 addReply(c,shared.czero);
4931 }
4932 }
4933
4934 static void sremCommand(redisClient *c) {
4935 robj *set;
4936
4937 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4938 checkType(c,set,REDIS_SET)) return;
4939
4940 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4941 server.dirty++;
4942 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4943 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4944 addReply(c,shared.cone);
4945 } else {
4946 addReply(c,shared.czero);
4947 }
4948 }
4949
4950 static void smoveCommand(redisClient *c) {
4951 robj *srcset, *dstset;
4952
4953 srcset = lookupKeyWrite(c->db,c->argv[1]);
4954 dstset = lookupKeyWrite(c->db,c->argv[2]);
4955
4956 /* If the source key does not exist return 0, if it's of the wrong type
4957 * raise an error */
4958 if (srcset == NULL || srcset->type != REDIS_SET) {
4959 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4960 return;
4961 }
4962 /* Error if the destination key is not a set as well */
4963 if (dstset && dstset->type != REDIS_SET) {
4964 addReply(c,shared.wrongtypeerr);
4965 return;
4966 }
4967 /* Remove the element from the source set */
4968 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4969 /* Key not found in the src set! return zero */
4970 addReply(c,shared.czero);
4971 return;
4972 }
4973 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4974 deleteKey(c->db,c->argv[1]);
4975 server.dirty++;
4976 /* Add the element to the destination set */
4977 if (!dstset) {
4978 dstset = createSetObject();
4979 dictAdd(c->db->dict,c->argv[2],dstset);
4980 incrRefCount(c->argv[2]);
4981 }
4982 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4983 incrRefCount(c->argv[3]);
4984 addReply(c,shared.cone);
4985 }
4986
4987 static void sismemberCommand(redisClient *c) {
4988 robj *set;
4989
4990 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4991 checkType(c,set,REDIS_SET)) return;
4992
4993 if (dictFind(set->ptr,c->argv[2]))
4994 addReply(c,shared.cone);
4995 else
4996 addReply(c,shared.czero);
4997 }
4998
4999 static void scardCommand(redisClient *c) {
5000 robj *o;
5001 dict *s;
5002
5003 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5004 checkType(c,o,REDIS_SET)) return;
5005
5006 s = o->ptr;
5007 addReplyUlong(c,dictSize(s));
5008 }
5009
5010 static void spopCommand(redisClient *c) {
5011 robj *set;
5012 dictEntry *de;
5013
5014 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5015 checkType(c,set,REDIS_SET)) return;
5016
5017 de = dictGetRandomKey(set->ptr);
5018 if (de == NULL) {
5019 addReply(c,shared.nullbulk);
5020 } else {
5021 robj *ele = dictGetEntryKey(de);
5022
5023 addReplyBulk(c,ele);
5024 dictDelete(set->ptr,ele);
5025 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5026 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5027 server.dirty++;
5028 }
5029 }
5030
5031 static void srandmemberCommand(redisClient *c) {
5032 robj *set;
5033 dictEntry *de;
5034
5035 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5036 checkType(c,set,REDIS_SET)) return;
5037
5038 de = dictGetRandomKey(set->ptr);
5039 if (de == NULL) {
5040 addReply(c,shared.nullbulk);
5041 } else {
5042 robj *ele = dictGetEntryKey(de);
5043
5044 addReplyBulk(c,ele);
5045 }
5046 }
5047
5048 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5049 dict **d1 = (void*) s1, **d2 = (void*) s2;
5050
5051 return dictSize(*d1)-dictSize(*d2);
5052 }
5053
5054 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5055 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5056 dictIterator *di;
5057 dictEntry *de;
5058 robj *lenobj = NULL, *dstset = NULL;
5059 unsigned long j, cardinality = 0;
5060
5061 for (j = 0; j < setsnum; j++) {
5062 robj *setobj;
5063
5064 setobj = dstkey ?
5065 lookupKeyWrite(c->db,setskeys[j]) :
5066 lookupKeyRead(c->db,setskeys[j]);
5067 if (!setobj) {
5068 zfree(dv);
5069 if (dstkey) {
5070 if (deleteKey(c->db,dstkey))
5071 server.dirty++;
5072 addReply(c,shared.czero);
5073 } else {
5074 addReply(c,shared.emptymultibulk);
5075 }
5076 return;
5077 }
5078 if (setobj->type != REDIS_SET) {
5079 zfree(dv);
5080 addReply(c,shared.wrongtypeerr);
5081 return;
5082 }
5083 dv[j] = setobj->ptr;
5084 }
5085 /* Sort sets from the smallest to largest, this will improve our
5086 * algorithm's performace */
5087 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5088
5089 /* The first thing we should output is the total number of elements...
5090 * since this is a multi-bulk write, but at this stage we don't know
5091 * the intersection set size, so we use a trick, append an empty object
5092 * to the output list and save the pointer to later modify it with the
5093 * right length */
5094 if (!dstkey) {
5095 lenobj = createObject(REDIS_STRING,NULL);
5096 addReply(c,lenobj);
5097 decrRefCount(lenobj);
5098 } else {
5099 /* If we have a target key where to store the resulting set
5100 * create this key with an empty set inside */
5101 dstset = createSetObject();
5102 }
5103
5104 /* Iterate all the elements of the first (smallest) set, and test
5105 * the element against all the other sets, if at least one set does
5106 * not include the element it is discarded */
5107 di = dictGetIterator(dv[0]);
5108
5109 while((de = dictNext(di)) != NULL) {
5110 robj *ele;
5111
5112 for (j = 1; j < setsnum; j++)
5113 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5114 if (j != setsnum)
5115 continue; /* at least one set does not contain the member */
5116 ele = dictGetEntryKey(de);
5117 if (!dstkey) {
5118 addReplyBulk(c,ele);
5119 cardinality++;
5120 } else {
5121 dictAdd(dstset->ptr,ele,NULL);
5122 incrRefCount(ele);
5123 }
5124 }
5125 dictReleaseIterator(di);
5126
5127 if (dstkey) {
5128 /* Store the resulting set into the target, if the intersection
5129 * is not an empty set. */
5130 deleteKey(c->db,dstkey);
5131 if (dictSize((dict*)dstset->ptr) > 0) {
5132 dictAdd(c->db->dict,dstkey,dstset);
5133 incrRefCount(dstkey);
5134 addReplyLong(c,dictSize((dict*)dstset->ptr));
5135 } else {
5136 decrRefCount(dstset);
5137 addReply(c,shared.czero);
5138 }
5139 server.dirty++;
5140 } else {
5141 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5142 }
5143 zfree(dv);
5144 }
5145
5146 static void sinterCommand(redisClient *c) {
5147 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5148 }
5149
5150 static void sinterstoreCommand(redisClient *c) {
5151 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5152 }
5153
5154 #define REDIS_OP_UNION 0
5155 #define REDIS_OP_DIFF 1
5156 #define REDIS_OP_INTER 2
5157
5158 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5159 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5160 dictIterator *di;
5161 dictEntry *de;
5162 robj *dstset = NULL;
5163 int j, cardinality = 0;
5164
5165 for (j = 0; j < setsnum; j++) {
5166 robj *setobj;
5167
5168 setobj = dstkey ?
5169 lookupKeyWrite(c->db,setskeys[j]) :
5170 lookupKeyRead(c->db,setskeys[j]);
5171 if (!setobj) {
5172 dv[j] = NULL;
5173 continue;
5174 }
5175 if (setobj->type != REDIS_SET) {
5176 zfree(dv);
5177 addReply(c,shared.wrongtypeerr);
5178 return;
5179 }
5180 dv[j] = setobj->ptr;
5181 }
5182
5183 /* We need a temp set object to store our union. If the dstkey
5184 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5185 * this set object will be the resulting object to set into the target key*/
5186 dstset = createSetObject();
5187
5188 /* Iterate all the elements of all the sets, add every element a single
5189 * time to the result set */
5190 for (j = 0; j < setsnum; j++) {
5191 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5192 if (!dv[j]) continue; /* non existing keys are like empty sets */
5193
5194 di = dictGetIterator(dv[j]);
5195
5196 while((de = dictNext(di)) != NULL) {
5197 robj *ele;
5198
5199 /* dictAdd will not add the same element multiple times */
5200 ele = dictGetEntryKey(de);
5201 if (op == REDIS_OP_UNION || j == 0) {
5202 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5203 incrRefCount(ele);
5204 cardinality++;
5205 }
5206 } else if (op == REDIS_OP_DIFF) {
5207 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5208 cardinality--;
5209 }
5210 }
5211 }
5212 dictReleaseIterator(di);
5213
5214 /* result set is empty? Exit asap. */
5215 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5216 }
5217
5218 /* Output the content of the resulting set, if not in STORE mode */
5219 if (!dstkey) {
5220 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5221 di = dictGetIterator(dstset->ptr);
5222 while((de = dictNext(di)) != NULL) {
5223 robj *ele;
5224
5225 ele = dictGetEntryKey(de);
5226 addReplyBulk(c,ele);
5227 }
5228 dictReleaseIterator(di);
5229 decrRefCount(dstset);
5230 } else {
5231 /* If we have a target key where to store the resulting set
5232 * create this key with the result set inside */
5233 deleteKey(c->db,dstkey);
5234 if (dictSize((dict*)dstset->ptr) > 0) {
5235 dictAdd(c->db->dict,dstkey,dstset);
5236 incrRefCount(dstkey);
5237 addReplyLong(c,dictSize((dict*)dstset->ptr));
5238 } else {
5239 decrRefCount(dstset);
5240 addReply(c,shared.czero);
5241 }
5242 server.dirty++;
5243 }
5244 zfree(dv);
5245 }
5246
5247 static void sunionCommand(redisClient *c) {
5248 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5249 }
5250
5251 static void sunionstoreCommand(redisClient *c) {
5252 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5253 }
5254
5255 static void sdiffCommand(redisClient *c) {
5256 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5257 }
5258
5259 static void sdiffstoreCommand(redisClient *c) {
5260 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5261 }
5262
5263 /* ==================================== ZSets =============================== */
5264
5265 /* ZSETs are ordered sets using two data structures to hold the same elements
5266 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5267 * data structure.
5268 *
5269 * The elements are added to a hash table mapping Redis objects to scores.
5270 * At the same time the elements are added to a skip list mapping scores
5271 * to Redis objects (so objects are sorted by scores in this "view"). */
5272
5273 /* This skiplist implementation is almost a C translation of the original
5274 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5275 * Alternative to Balanced Trees", modified in three ways:
5276 * a) this implementation allows for repeated values.
5277 * b) the comparison is not just by key (our 'score') but by satellite data.
5278 * c) there is a back pointer, so it's a doubly linked list with the back
5279 * pointers being only at "level 1". This allows to traverse the list
5280 * from tail to head, useful for ZREVRANGE. */
5281
5282 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5283 zskiplistNode *zn = zmalloc(sizeof(*zn));
5284
5285 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5286 if (level > 0)
5287 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5288 zn->score = score;
5289 zn->obj = obj;
5290 return zn;
5291 }
5292
5293 static zskiplist *zslCreate(void) {
5294 int j;
5295 zskiplist *zsl;
5296
5297 zsl = zmalloc(sizeof(*zsl));
5298 zsl->level = 1;
5299 zsl->length = 0;
5300 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5301 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5302 zsl->header->forward[j] = NULL;
5303
5304 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5305 if (j < ZSKIPLIST_MAXLEVEL-1)
5306 zsl->header->span[j] = 0;
5307 }
5308 zsl->header->backward = NULL;
5309 zsl->tail = NULL;
5310 return zsl;
5311 }
5312
5313 static void zslFreeNode(zskiplistNode *node) {
5314 decrRefCount(node->obj);
5315 zfree(node->forward);
5316 zfree(node->span);
5317 zfree(node);
5318 }
5319
5320 static void zslFree(zskiplist *zsl) {
5321 zskiplistNode *node = zsl->header->forward[0], *next;
5322
5323 zfree(zsl->header->forward);
5324 zfree(zsl->header->span);
5325 zfree(zsl->header);
5326 while(node) {
5327 next = node->forward[0];
5328 zslFreeNode(node);
5329 node = next;
5330 }
5331 zfree(zsl);
5332 }
5333
5334 static int zslRandomLevel(void) {
5335 int level = 1;
5336 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5337 level += 1;
5338 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5339 }
5340
5341 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5342 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5343 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5344 int i, level;
5345
5346 x = zsl->header;
5347 for (i = zsl->level-1; i >= 0; i--) {
5348 /* store rank that is crossed to reach the insert position */
5349 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5350
5351 while (x->forward[i] &&
5352 (x->forward[i]->score < score ||
5353 (x->forward[i]->score == score &&
5354 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5355 rank[i] += i > 0 ? x->span[i-1] : 1;
5356 x = x->forward[i];
5357 }
5358 update[i] = x;
5359 }
5360 /* we assume the key is not already inside, since we allow duplicated
5361 * scores, and the re-insertion of score and redis object should never
5362 * happpen since the caller of zslInsert() should test in the hash table
5363 * if the element is already inside or not. */
5364 level = zslRandomLevel();
5365 if (level > zsl->level) {
5366 for (i = zsl->level; i < level; i++) {
5367 rank[i] = 0;
5368 update[i] = zsl->header;
5369 update[i]->span[i-1] = zsl->length;
5370 }
5371 zsl->level = level;
5372 }
5373 x = zslCreateNode(level,score,obj);
5374 for (i = 0; i < level; i++) {
5375 x->forward[i] = update[i]->forward[i];
5376 update[i]->forward[i] = x;
5377
5378 /* update span covered by update[i] as x is inserted here */
5379 if (i > 0) {
5380 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5381 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5382 }
5383 }
5384
5385 /* increment span for untouched levels */
5386 for (i = level; i < zsl->level; i++) {
5387 update[i]->span[i-1]++;
5388 }
5389
5390 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5391 if (x->forward[0])
5392 x->forward[0]->backward = x;
5393 else
5394 zsl->tail = x;
5395 zsl->length++;
5396 }
5397
5398 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5399 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5400 int i;
5401 for (i = 0; i < zsl->level; i++) {
5402 if (update[i]->forward[i] == x) {
5403 if (i > 0) {
5404 update[i]->span[i-1] += x->span[i-1] - 1;
5405 }
5406 update[i]->forward[i] = x->forward[i];
5407 } else {
5408 /* invariant: i > 0, because update[0]->forward[0]
5409 * is always equal to x */
5410 update[i]->span[i-1] -= 1;
5411 }
5412 }
5413 if (x->forward[0]) {
5414 x->forward[0]->backward = x->backward;
5415 } else {
5416 zsl->tail = x->backward;
5417 }
5418 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5419 zsl->level--;
5420 zsl->length--;
5421 }
5422
5423 /* Delete an element with matching score/object from the skiplist. */
5424 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5425 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5426 int i;
5427
5428 x = zsl->header;
5429 for (i = zsl->level-1; i >= 0; i--) {
5430 while (x->forward[i] &&
5431 (x->forward[i]->score < score ||
5432 (x->forward[i]->score == score &&
5433 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5434 x = x->forward[i];
5435 update[i] = x;
5436 }
5437 /* We may have multiple elements with the same score, what we need
5438 * is to find the element with both the right score and object. */
5439 x = x->forward[0];
5440 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5441 zslDeleteNode(zsl, x, update);
5442 zslFreeNode(x);
5443 return 1;
5444 } else {
5445 return 0; /* not found */
5446 }
5447 return 0; /* not found */
5448 }
5449
5450 /* Delete all the elements with score between min and max from the skiplist.
5451 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5452 * Note that this function takes the reference to the hash table view of the
5453 * sorted set, in order to remove the elements from the hash table too. */
5454 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5455 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5456 unsigned long removed = 0;
5457 int i;
5458
5459 x = zsl->header;
5460 for (i = zsl->level-1; i >= 0; i--) {
5461 while (x->forward[i] && x->forward[i]->score < min)
5462 x = x->forward[i];
5463 update[i] = x;
5464 }
5465 /* We may have multiple elements with the same score, what we need
5466 * is to find the element with both the right score and object. */
5467 x = x->forward[0];
5468 while (x && x->score <= max) {
5469 zskiplistNode *next = x->forward[0];
5470 zslDeleteNode(zsl, x, update);
5471 dictDelete(dict,x->obj);
5472 zslFreeNode(x);
5473 removed++;
5474 x = next;
5475 }
5476 return removed; /* not found */
5477 }
5478
5479 /* Delete all the elements with rank between start and end from the skiplist.
5480 * Start and end are inclusive. Note that start and end need to be 1-based */
5481 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5482 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5483 unsigned long traversed = 0, removed = 0;
5484 int i;
5485
5486 x = zsl->header;
5487 for (i = zsl->level-1; i >= 0; i--) {
5488 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5489 traversed += i > 0 ? x->span[i-1] : 1;
5490 x = x->forward[i];
5491 }
5492 update[i] = x;
5493 }
5494
5495 traversed++;
5496 x = x->forward[0];
5497 while (x && traversed <= end) {
5498 zskiplistNode *next = x->forward[0];
5499 zslDeleteNode(zsl, x, update);
5500 dictDelete(dict,x->obj);
5501 zslFreeNode(x);
5502 removed++;
5503 traversed++;
5504 x = next;
5505 }
5506 return removed;
5507 }
5508
5509 /* Find the first node having a score equal or greater than the specified one.
5510 * Returns NULL if there is no match. */
5511 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5512 zskiplistNode *x;
5513 int i;
5514
5515 x = zsl->header;
5516 for (i = zsl->level-1; i >= 0; i--) {
5517 while (x->forward[i] && x->forward[i]->score < score)
5518 x = x->forward[i];
5519 }
5520 /* We may have multiple elements with the same score, what we need
5521 * is to find the element with both the right score and object. */
5522 return x->forward[0];
5523 }
5524
5525 /* Find the rank for an element by both score and key.
5526 * Returns 0 when the element cannot be found, rank otherwise.
5527 * Note that the rank is 1-based due to the span of zsl->header to the
5528 * first element. */
5529 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5530 zskiplistNode *x;
5531 unsigned long rank = 0;
5532 int i;
5533
5534 x = zsl->header;
5535 for (i = zsl->level-1; i >= 0; i--) {
5536 while (x->forward[i] &&
5537 (x->forward[i]->score < score ||
5538 (x->forward[i]->score == score &&
5539 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5540 rank += i > 0 ? x->span[i-1] : 1;
5541 x = x->forward[i];
5542 }
5543
5544 /* x might be equal to zsl->header, so test if obj is non-NULL */
5545 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5546 return rank;
5547 }
5548 }
5549 return 0;
5550 }
5551
5552 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5553 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5554 zskiplistNode *x;
5555 unsigned long traversed = 0;
5556 int i;
5557
5558 x = zsl->header;
5559 for (i = zsl->level-1; i >= 0; i--) {
5560 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5561 {
5562 traversed += i > 0 ? x->span[i-1] : 1;
5563 x = x->forward[i];
5564 }
5565 if (traversed == rank) {
5566 return x;
5567 }
5568 }
5569 return NULL;
5570 }
5571
5572 /* The actual Z-commands implementations */
5573
5574 /* This generic command implements both ZADD and ZINCRBY.
5575 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5576 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5577 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5578 robj *zsetobj;
5579 zset *zs;
5580 double *score;
5581
5582 zsetobj = lookupKeyWrite(c->db,key);
5583 if (zsetobj == NULL) {
5584 zsetobj = createZsetObject();
5585 dictAdd(c->db->dict,key,zsetobj);
5586 incrRefCount(key);
5587 } else {
5588 if (zsetobj->type != REDIS_ZSET) {
5589 addReply(c,shared.wrongtypeerr);
5590 return;
5591 }
5592 }
5593 zs = zsetobj->ptr;
5594
5595 /* Ok now since we implement both ZADD and ZINCRBY here the code
5596 * needs to handle the two different conditions. It's all about setting
5597 * '*score', that is, the new score to set, to the right value. */
5598 score = zmalloc(sizeof(double));
5599 if (doincrement) {
5600 dictEntry *de;
5601
5602 /* Read the old score. If the element was not present starts from 0 */
5603 de = dictFind(zs->dict,ele);
5604 if (de) {
5605 double *oldscore = dictGetEntryVal(de);
5606 *score = *oldscore + scoreval;
5607 } else {
5608 *score = scoreval;
5609 }
5610 } else {
5611 *score = scoreval;
5612 }
5613
5614 /* What follows is a simple remove and re-insert operation that is common
5615 * to both ZADD and ZINCRBY... */
5616 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5617 /* case 1: New element */
5618 incrRefCount(ele); /* added to hash */
5619 zslInsert(zs->zsl,*score,ele);
5620 incrRefCount(ele); /* added to skiplist */
5621 server.dirty++;
5622 if (doincrement)
5623 addReplyDouble(c,*score);
5624 else
5625 addReply(c,shared.cone);
5626 } else {
5627 dictEntry *de;
5628 double *oldscore;
5629
5630 /* case 2: Score update operation */
5631 de = dictFind(zs->dict,ele);
5632 redisAssert(de != NULL);
5633 oldscore = dictGetEntryVal(de);
5634 if (*score != *oldscore) {
5635 int deleted;
5636
5637 /* Remove and insert the element in the skip list with new score */
5638 deleted = zslDelete(zs->zsl,*oldscore,ele);
5639 redisAssert(deleted != 0);
5640 zslInsert(zs->zsl,*score,ele);
5641 incrRefCount(ele);
5642 /* Update the score in the hash table */
5643 dictReplace(zs->dict,ele,score);
5644 server.dirty++;
5645 } else {
5646 zfree(score);
5647 }
5648 if (doincrement)
5649 addReplyDouble(c,*score);
5650 else
5651 addReply(c,shared.czero);
5652 }
5653 }
5654
5655 static void zaddCommand(redisClient *c) {
5656 double scoreval;
5657
5658 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5659 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5660 }
5661
5662 static void zincrbyCommand(redisClient *c) {
5663 double scoreval;
5664
5665 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5666 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5667 }
5668
5669 static void zremCommand(redisClient *c) {
5670 robj *zsetobj;
5671 zset *zs;
5672 dictEntry *de;
5673 double *oldscore;
5674 int deleted;
5675
5676 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5677 checkType(c,zsetobj,REDIS_ZSET)) return;
5678
5679 zs = zsetobj->ptr;
5680 de = dictFind(zs->dict,c->argv[2]);
5681 if (de == NULL) {
5682 addReply(c,shared.czero);
5683 return;
5684 }
5685 /* Delete from the skiplist */
5686 oldscore = dictGetEntryVal(de);
5687 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5688 redisAssert(deleted != 0);
5689
5690 /* Delete from the hash table */
5691 dictDelete(zs->dict,c->argv[2]);
5692 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5693 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5694 server.dirty++;
5695 addReply(c,shared.cone);
5696 }
5697
5698 static void zremrangebyscoreCommand(redisClient *c) {
5699 double min;
5700 double max;
5701 long deleted;
5702 robj *zsetobj;
5703 zset *zs;
5704
5705 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5706 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5707
5708 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5709 checkType(c,zsetobj,REDIS_ZSET)) return;
5710
5711 zs = zsetobj->ptr;
5712 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5713 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5714 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5715 server.dirty += deleted;
5716 addReplyLong(c,deleted);
5717 }
5718
5719 static void zremrangebyrankCommand(redisClient *c) {
5720 long start;
5721 long end;
5722 int llen;
5723 long deleted;
5724 robj *zsetobj;
5725 zset *zs;
5726
5727 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5728 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5729
5730 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5731 checkType(c,zsetobj,REDIS_ZSET)) return;
5732 zs = zsetobj->ptr;
5733 llen = zs->zsl->length;
5734
5735 /* convert negative indexes */
5736 if (start < 0) start = llen+start;
5737 if (end < 0) end = llen+end;
5738 if (start < 0) start = 0;
5739 if (end < 0) end = 0;
5740
5741 /* indexes sanity checks */
5742 if (start > end || start >= llen) {
5743 addReply(c,shared.czero);
5744 return;
5745 }
5746 if (end >= llen) end = llen-1;
5747
5748 /* increment start and end because zsl*Rank functions
5749 * use 1-based rank */
5750 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5751 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5752 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5753 server.dirty += deleted;
5754 addReplyLong(c, deleted);
5755 }
5756
5757 typedef struct {
5758 dict *dict;
5759 double weight;
5760 } zsetopsrc;
5761
5762 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5763 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5764 unsigned long size1, size2;
5765 size1 = d1->dict ? dictSize(d1->dict) : 0;
5766 size2 = d2->dict ? dictSize(d2->dict) : 0;
5767 return size1 - size2;
5768 }
5769
5770 #define REDIS_AGGR_SUM 1
5771 #define REDIS_AGGR_MIN 2
5772 #define REDIS_AGGR_MAX 3
5773
5774 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5775 if (aggregate == REDIS_AGGR_SUM) {
5776 *target = *target + val;
5777 } else if (aggregate == REDIS_AGGR_MIN) {
5778 *target = val < *target ? val : *target;
5779 } else if (aggregate == REDIS_AGGR_MAX) {
5780 *target = val > *target ? val : *target;
5781 } else {
5782 /* safety net */
5783 redisPanic("Unknown ZUNION/INTER aggregate type");
5784 }
5785 }
5786
5787 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5788 int i, j, zsetnum;
5789 int aggregate = REDIS_AGGR_SUM;
5790 zsetopsrc *src;
5791 robj *dstobj;
5792 zset *dstzset;
5793 dictIterator *di;
5794 dictEntry *de;
5795
5796 /* expect zsetnum input keys to be given */
5797 zsetnum = atoi(c->argv[2]->ptr);
5798 if (zsetnum < 1) {
5799 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5800 return;
5801 }
5802
5803 /* test if the expected number of keys would overflow */
5804 if (3+zsetnum > c->argc) {
5805 addReply(c,shared.syntaxerr);
5806 return;
5807 }
5808
5809 /* read keys to be used for input */
5810 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5811 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5812 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5813 if (!zsetobj) {
5814 src[i].dict = NULL;
5815 } else {
5816 if (zsetobj->type != REDIS_ZSET) {
5817 zfree(src);
5818 addReply(c,shared.wrongtypeerr);
5819 return;
5820 }
5821 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5822 }
5823
5824 /* default all weights to 1 */
5825 src[i].weight = 1.0;
5826 }
5827
5828 /* parse optional extra arguments */
5829 if (j < c->argc) {
5830 int remaining = c->argc - j;
5831
5832 while (remaining) {
5833 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5834 j++; remaining--;
5835 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5836 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5837 return;
5838 }
5839 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5840 j++; remaining--;
5841 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5842 aggregate = REDIS_AGGR_SUM;
5843 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5844 aggregate = REDIS_AGGR_MIN;
5845 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5846 aggregate = REDIS_AGGR_MAX;
5847 } else {
5848 zfree(src);
5849 addReply(c,shared.syntaxerr);
5850 return;
5851 }
5852 j++; remaining--;
5853 } else {
5854 zfree(src);
5855 addReply(c,shared.syntaxerr);
5856 return;
5857 }
5858 }
5859 }
5860
5861 /* sort sets from the smallest to largest, this will improve our
5862 * algorithm's performance */
5863 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5864
5865 dstobj = createZsetObject();
5866 dstzset = dstobj->ptr;
5867
5868 if (op == REDIS_OP_INTER) {
5869 /* skip going over all entries if the smallest zset is NULL or empty */
5870 if (src[0].dict && dictSize(src[0].dict) > 0) {
5871 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5872 * from small to large, all src[i > 0].dict are non-empty too */
5873 di = dictGetIterator(src[0].dict);
5874 while((de = dictNext(di)) != NULL) {
5875 double *score = zmalloc(sizeof(double)), value;
5876 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5877
5878 for (j = 1; j < zsetnum; j++) {
5879 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5880 if (other) {
5881 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5882 zunionInterAggregate(score, value, aggregate);
5883 } else {
5884 break;
5885 }
5886 }
5887
5888 /* skip entry when not present in every source dict */
5889 if (j != zsetnum) {
5890 zfree(score);
5891 } else {
5892 robj *o = dictGetEntryKey(de);
5893 dictAdd(dstzset->dict,o,score);
5894 incrRefCount(o); /* added to dictionary */
5895 zslInsert(dstzset->zsl,*score,o);
5896 incrRefCount(o); /* added to skiplist */
5897 }
5898 }
5899 dictReleaseIterator(di);
5900 }
5901 } else if (op == REDIS_OP_UNION) {
5902 for (i = 0; i < zsetnum; i++) {
5903 if (!src[i].dict) continue;
5904
5905 di = dictGetIterator(src[i].dict);
5906 while((de = dictNext(di)) != NULL) {
5907 /* skip key when already processed */
5908 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5909
5910 double *score = zmalloc(sizeof(double)), value;
5911 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5912
5913 /* because the zsets are sorted by size, its only possible
5914 * for sets at larger indices to hold this entry */
5915 for (j = (i+1); j < zsetnum; j++) {
5916 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5917 if (other) {
5918 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5919 zunionInterAggregate(score, value, aggregate);
5920 }
5921 }
5922
5923 robj *o = dictGetEntryKey(de);
5924 dictAdd(dstzset->dict,o,score);
5925 incrRefCount(o); /* added to dictionary */
5926 zslInsert(dstzset->zsl,*score,o);
5927 incrRefCount(o); /* added to skiplist */
5928 }
5929 dictReleaseIterator(di);
5930 }
5931 } else {
5932 /* unknown operator */
5933 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5934 }
5935
5936 deleteKey(c->db,dstkey);
5937 if (dstzset->zsl->length) {
5938 dictAdd(c->db->dict,dstkey,dstobj);
5939 incrRefCount(dstkey);
5940 addReplyLong(c, dstzset->zsl->length);
5941 server.dirty++;
5942 } else {
5943 decrRefCount(dstobj);
5944 addReply(c, shared.czero);
5945 }
5946 zfree(src);
5947 }
5948
5949 static void zunionCommand(redisClient *c) {
5950 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5951 }
5952
5953 static void zinterCommand(redisClient *c) {
5954 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5955 }
5956
5957 static void zrangeGenericCommand(redisClient *c, int reverse) {
5958 robj *o;
5959 long start;
5960 long end;
5961 int withscores = 0;
5962 int llen;
5963 int rangelen, j;
5964 zset *zsetobj;
5965 zskiplist *zsl;
5966 zskiplistNode *ln;
5967 robj *ele;
5968
5969 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5970 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5971
5972 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5973 withscores = 1;
5974 } else if (c->argc >= 5) {
5975 addReply(c,shared.syntaxerr);
5976 return;
5977 }
5978
5979 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5980 || checkType(c,o,REDIS_ZSET)) return;
5981 zsetobj = o->ptr;
5982 zsl = zsetobj->zsl;
5983 llen = zsl->length;
5984
5985 /* convert negative indexes */
5986 if (start < 0) start = llen+start;
5987 if (end < 0) end = llen+end;
5988 if (start < 0) start = 0;
5989 if (end < 0) end = 0;
5990
5991 /* indexes sanity checks */
5992 if (start > end || start >= llen) {
5993 /* Out of range start or start > end result in empty list */
5994 addReply(c,shared.emptymultibulk);
5995 return;
5996 }
5997 if (end >= llen) end = llen-1;
5998 rangelen = (end-start)+1;
5999
6000 /* check if starting point is trivial, before searching
6001 * the element in log(N) time */
6002 if (reverse) {
6003 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6004 } else {
6005 ln = start == 0 ?
6006 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6007 }
6008
6009 /* Return the result in form of a multi-bulk reply */
6010 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6011 withscores ? (rangelen*2) : rangelen));
6012 for (j = 0; j < rangelen; j++) {
6013 ele = ln->obj;
6014 addReplyBulk(c,ele);
6015 if (withscores)
6016 addReplyDouble(c,ln->score);
6017 ln = reverse ? ln->backward : ln->forward[0];
6018 }
6019 }
6020
6021 static void zrangeCommand(redisClient *c) {
6022 zrangeGenericCommand(c,0);
6023 }
6024
6025 static void zrevrangeCommand(redisClient *c) {
6026 zrangeGenericCommand(c,1);
6027 }
6028
6029 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6030 * If justcount is non-zero, just the count is returned. */
6031 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6032 robj *o;
6033 double min, max;
6034 int minex = 0, maxex = 0; /* are min or max exclusive? */
6035 int offset = 0, limit = -1;
6036 int withscores = 0;
6037 int badsyntax = 0;
6038
6039 /* Parse the min-max interval. If one of the values is prefixed
6040 * by the "(" character, it's considered "open". For instance
6041 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6042 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6043 if (((char*)c->argv[2]->ptr)[0] == '(') {
6044 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6045 minex = 1;
6046 } else {
6047 min = strtod(c->argv[2]->ptr,NULL);
6048 }
6049 if (((char*)c->argv[3]->ptr)[0] == '(') {
6050 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6051 maxex = 1;
6052 } else {
6053 max = strtod(c->argv[3]->ptr,NULL);
6054 }
6055
6056 /* Parse "WITHSCORES": note that if the command was called with
6057 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6058 * enter the following paths to parse WITHSCORES and LIMIT. */
6059 if (c->argc == 5 || c->argc == 8) {
6060 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6061 withscores = 1;
6062 else
6063 badsyntax = 1;
6064 }
6065 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6066 badsyntax = 1;
6067 if (badsyntax) {
6068 addReplySds(c,
6069 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6070 return;
6071 }
6072
6073 /* Parse "LIMIT" */
6074 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6075 addReply(c,shared.syntaxerr);
6076 return;
6077 } else if (c->argc == (7 + withscores)) {
6078 offset = atoi(c->argv[5]->ptr);
6079 limit = atoi(c->argv[6]->ptr);
6080 if (offset < 0) offset = 0;
6081 }
6082
6083 /* Ok, lookup the key and get the range */
6084 o = lookupKeyRead(c->db,c->argv[1]);
6085 if (o == NULL) {
6086 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6087 } else {
6088 if (o->type != REDIS_ZSET) {
6089 addReply(c,shared.wrongtypeerr);
6090 } else {
6091 zset *zsetobj = o->ptr;
6092 zskiplist *zsl = zsetobj->zsl;
6093 zskiplistNode *ln;
6094 robj *ele, *lenobj = NULL;
6095 unsigned long rangelen = 0;
6096
6097 /* Get the first node with the score >= min, or with
6098 * score > min if 'minex' is true. */
6099 ln = zslFirstWithScore(zsl,min);
6100 while (minex && ln && ln->score == min) ln = ln->forward[0];
6101
6102 if (ln == NULL) {
6103 /* No element matching the speciifed interval */
6104 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6105 return;
6106 }
6107
6108 /* We don't know in advance how many matching elements there
6109 * are in the list, so we push this object that will represent
6110 * the multi-bulk length in the output buffer, and will "fix"
6111 * it later */
6112 if (!justcount) {
6113 lenobj = createObject(REDIS_STRING,NULL);
6114 addReply(c,lenobj);
6115 decrRefCount(lenobj);
6116 }
6117
6118 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6119 if (offset) {
6120 offset--;
6121 ln = ln->forward[0];
6122 continue;
6123 }
6124 if (limit == 0) break;
6125 if (!justcount) {
6126 ele = ln->obj;
6127 addReplyBulk(c,ele);
6128 if (withscores)
6129 addReplyDouble(c,ln->score);
6130 }
6131 ln = ln->forward[0];
6132 rangelen++;
6133 if (limit > 0) limit--;
6134 }
6135 if (justcount) {
6136 addReplyLong(c,(long)rangelen);
6137 } else {
6138 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6139 withscores ? (rangelen*2) : rangelen);
6140 }
6141 }
6142 }
6143 }
6144
6145 static void zrangebyscoreCommand(redisClient *c) {
6146 genericZrangebyscoreCommand(c,0);
6147 }
6148
6149 static void zcountCommand(redisClient *c) {
6150 genericZrangebyscoreCommand(c,1);
6151 }
6152
6153 static void zcardCommand(redisClient *c) {
6154 robj *o;
6155 zset *zs;
6156
6157 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6158 checkType(c,o,REDIS_ZSET)) return;
6159
6160 zs = o->ptr;
6161 addReplyUlong(c,zs->zsl->length);
6162 }
6163
6164 static void zscoreCommand(redisClient *c) {
6165 robj *o;
6166 zset *zs;
6167 dictEntry *de;
6168
6169 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6170 checkType(c,o,REDIS_ZSET)) return;
6171
6172 zs = o->ptr;
6173 de = dictFind(zs->dict,c->argv[2]);
6174 if (!de) {
6175 addReply(c,shared.nullbulk);
6176 } else {
6177 double *score = dictGetEntryVal(de);
6178
6179 addReplyDouble(c,*score);
6180 }
6181 }
6182
6183 static void zrankGenericCommand(redisClient *c, int reverse) {
6184 robj *o;
6185 zset *zs;
6186 zskiplist *zsl;
6187 dictEntry *de;
6188 unsigned long rank;
6189 double *score;
6190
6191 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6192 checkType(c,o,REDIS_ZSET)) return;
6193
6194 zs = o->ptr;
6195 zsl = zs->zsl;
6196 de = dictFind(zs->dict,c->argv[2]);
6197 if (!de) {
6198 addReply(c,shared.nullbulk);
6199 return;
6200 }
6201
6202 score = dictGetEntryVal(de);
6203 rank = zslGetRank(zsl, *score, c->argv[2]);
6204 if (rank) {
6205 if (reverse) {
6206 addReplyLong(c, zsl->length - rank);
6207 } else {
6208 addReplyLong(c, rank-1);
6209 }
6210 } else {
6211 addReply(c,shared.nullbulk);
6212 }
6213 }
6214
6215 static void zrankCommand(redisClient *c) {
6216 zrankGenericCommand(c, 0);
6217 }
6218
6219 static void zrevrankCommand(redisClient *c) {
6220 zrankGenericCommand(c, 1);
6221 }
6222
6223 /* ========================= Hashes utility functions ======================= */
6224 #define REDIS_HASH_KEY 1
6225 #define REDIS_HASH_VALUE 2
6226
6227 /* Check the length of a number of objects to see if we need to convert a
6228 * zipmap to a real hash. Note that we only check string encoded objects
6229 * as their string length can be queried in constant time. */
6230 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6231 int i;
6232 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6233
6234 for (i = start; i <= end; i++) {
6235 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6236 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6237 {
6238 convertToRealHash(subject);
6239 return;
6240 }
6241 }
6242 }
6243
6244 /* Encode given objects in-place when the hash uses a dict. */
6245 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6246 if (subject->encoding == REDIS_ENCODING_HT) {
6247 if (o1) *o1 = tryObjectEncoding(*o1);
6248 if (o2) *o2 = tryObjectEncoding(*o2);
6249 }
6250 }
6251
6252 /* Get the value from a hash identified by key. Returns either a string
6253 * object or NULL if the value cannot be found. The refcount of the object
6254 * is always increased by 1 when the value was found. */
6255 static robj *hashGet(robj *o, robj *key) {
6256 robj *value = NULL;
6257 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6258 unsigned char *v;
6259 unsigned int vlen;
6260 key = getDecodedObject(key);
6261 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6262 value = createStringObject((char*)v,vlen);
6263 }
6264 decrRefCount(key);
6265 } else {
6266 dictEntry *de = dictFind(o->ptr,key);
6267 if (de != NULL) {
6268 value = dictGetEntryVal(de);
6269 incrRefCount(value);
6270 }
6271 }
6272 return value;
6273 }
6274
6275 /* Test if the key exists in the given hash. Returns 1 if the key
6276 * exists and 0 when it doesn't. */
6277 static int hashExists(robj *o, robj *key) {
6278 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6279 key = getDecodedObject(key);
6280 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6281 decrRefCount(key);
6282 return 1;
6283 }
6284 decrRefCount(key);
6285 } else {
6286 if (dictFind(o->ptr,key) != NULL) {
6287 return 1;
6288 }
6289 }
6290 return 0;
6291 }
6292
6293 /* Add an element, discard the old if the key already exists.
6294 * Return 0 on insert and 1 on update. */
6295 static int hashSet(robj *o, robj *key, robj *value) {
6296 int update = 0;
6297 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6298 key = getDecodedObject(key);
6299 value = getDecodedObject(value);
6300 o->ptr = zipmapSet(o->ptr,
6301 key->ptr,sdslen(key->ptr),
6302 value->ptr,sdslen(value->ptr), &update);
6303 decrRefCount(key);
6304 decrRefCount(value);
6305
6306 /* Check if the zipmap needs to be upgraded to a real hash table */
6307 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6308 convertToRealHash(o);
6309 } else {
6310 if (dictReplace(o->ptr,key,value)) {
6311 /* Insert */
6312 incrRefCount(key);
6313 } else {
6314 /* Update */
6315 update = 1;
6316 }
6317 incrRefCount(value);
6318 }
6319 return update;
6320 }
6321
6322 /* Delete an element from a hash.
6323 * Return 1 on deleted and 0 on not found. */
6324 static int hashDelete(robj *o, robj *key) {
6325 int deleted = 0;
6326 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6327 key = getDecodedObject(key);
6328 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6329 decrRefCount(key);
6330 } else {
6331 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6332 /* Always check if the dictionary needs a resize after a delete. */
6333 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6334 }
6335 return deleted;
6336 }
6337
6338 /* Return the number of elements in a hash. */
6339 static unsigned long hashLength(robj *o) {
6340 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6341 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6342 }
6343
6344 /* Structure to hold hash iteration abstration. Note that iteration over
6345 * hashes involves both fields and values. Because it is possible that
6346 * not both are required, store pointers in the iterator to avoid
6347 * unnecessary memory allocation for fields/values. */
6348 typedef struct {
6349 int encoding;
6350 unsigned char *zi;
6351 unsigned char *zk, *zv;
6352 unsigned int zklen, zvlen;
6353
6354 dictIterator *di;
6355 dictEntry *de;
6356 } hashIterator;
6357
6358 static hashIterator *hashInitIterator(robj *subject) {
6359 hashIterator *hi = zmalloc(sizeof(hashIterator));
6360 hi->encoding = subject->encoding;
6361 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6362 hi->zi = zipmapRewind(subject->ptr);
6363 } else if (hi->encoding == REDIS_ENCODING_HT) {
6364 hi->di = dictGetIterator(subject->ptr);
6365 } else {
6366 redisAssert(NULL);
6367 }
6368 return hi;
6369 }
6370
6371 static void hashReleaseIterator(hashIterator *hi) {
6372 if (hi->encoding == REDIS_ENCODING_HT) {
6373 dictReleaseIterator(hi->di);
6374 }
6375 zfree(hi);
6376 }
6377
6378 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6379 * could be found and REDIS_ERR when the iterator reaches the end. */
6380 static int hashNext(hashIterator *hi) {
6381 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6382 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6383 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6384 } else {
6385 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6386 }
6387 return REDIS_OK;
6388 }
6389
6390 /* Get key or value object at current iteration position.
6391 * This increases the refcount of the field object by 1. */
6392 static robj *hashCurrent(hashIterator *hi, int what) {
6393 robj *o;
6394 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6395 if (what & REDIS_HASH_KEY) {
6396 o = createStringObject((char*)hi->zk,hi->zklen);
6397 } else {
6398 o = createStringObject((char*)hi->zv,hi->zvlen);
6399 }
6400 } else {
6401 if (what & REDIS_HASH_KEY) {
6402 o = dictGetEntryKey(hi->de);
6403 } else {
6404 o = dictGetEntryVal(hi->de);
6405 }
6406 incrRefCount(o);
6407 }
6408 return o;
6409 }
6410
6411 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6412 robj *o = lookupKeyWrite(c->db,key);
6413 if (o == NULL) {
6414 o = createHashObject();
6415 dictAdd(c->db->dict,key,o);
6416 incrRefCount(key);
6417 } else {
6418 if (o->type != REDIS_HASH) {
6419 addReply(c,shared.wrongtypeerr);
6420 return NULL;
6421 }
6422 }
6423 return o;
6424 }
6425
6426 /* ============================= Hash commands ============================== */
6427 static void hsetCommand(redisClient *c) {
6428 int update;
6429 robj *o;
6430
6431 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6432 hashTryConversion(o,c->argv,2,3);
6433 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6434 update = hashSet(o,c->argv[2],c->argv[3]);
6435 addReply(c, update ? shared.czero : shared.cone);
6436 server.dirty++;
6437 }
6438
6439 static void hsetnxCommand(redisClient *c) {
6440 robj *o;
6441 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6442 hashTryConversion(o,c->argv,2,3);
6443
6444 if (hashExists(o, c->argv[2])) {
6445 addReply(c, shared.czero);
6446 } else {
6447 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6448 hashSet(o,c->argv[2],c->argv[3]);
6449 addReply(c, shared.cone);
6450 server.dirty++;
6451 }
6452 }
6453
6454 static void hmsetCommand(redisClient *c) {
6455 int i;
6456 robj *o;
6457
6458 if ((c->argc % 2) == 1) {
6459 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6460 return;
6461 }
6462
6463 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6464 hashTryConversion(o,c->argv,2,c->argc-1);
6465 for (i = 2; i < c->argc; i += 2) {
6466 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6467 hashSet(o,c->argv[i],c->argv[i+1]);
6468 }
6469 addReply(c, shared.ok);
6470 server.dirty++;
6471 }
6472
6473 static void hincrbyCommand(redisClient *c) {
6474 long long value, incr;
6475 robj *o, *current, *new;
6476
6477 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6478 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6479 if ((current = hashGet(o,c->argv[2])) != NULL) {
6480 if (getLongLongFromObjectOrReply(c,current,&value,
6481 "hash value is not an integer") != REDIS_OK) {
6482 decrRefCount(current);
6483 return;
6484 }
6485 decrRefCount(current);
6486 } else {
6487 value = 0;
6488 }
6489
6490 value += incr;
6491 new = createStringObjectFromLongLong(value);
6492 hashTryObjectEncoding(o,&c->argv[2],NULL);
6493 hashSet(o,c->argv[2],new);
6494 decrRefCount(new);
6495 addReplyLongLong(c,value);
6496 server.dirty++;
6497 }
6498
6499 static void hgetCommand(redisClient *c) {
6500 robj *o, *value;
6501 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6502 checkType(c,o,REDIS_HASH)) return;
6503
6504 if ((value = hashGet(o,c->argv[2])) != NULL) {
6505 addReplyBulk(c,value);
6506 decrRefCount(value);
6507 } else {
6508 addReply(c,shared.nullbulk);
6509 }
6510 }
6511
6512 static void hmgetCommand(redisClient *c) {
6513 int i;
6514 robj *o, *value;
6515 o = lookupKeyRead(c->db,c->argv[1]);
6516 if (o != NULL && o->type != REDIS_HASH) {
6517 addReply(c,shared.wrongtypeerr);
6518 }
6519
6520 /* Note the check for o != NULL happens inside the loop. This is
6521 * done because objects that cannot be found are considered to be
6522 * an empty hash. The reply should then be a series of NULLs. */
6523 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6524 for (i = 2; i < c->argc; i++) {
6525 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6526 addReplyBulk(c,value);
6527 decrRefCount(value);
6528 } else {
6529 addReply(c,shared.nullbulk);
6530 }
6531 }
6532 }
6533
6534 static void hdelCommand(redisClient *c) {
6535 robj *o;
6536 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6537 checkType(c,o,REDIS_HASH)) return;
6538
6539 if (hashDelete(o,c->argv[2])) {
6540 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6541 addReply(c,shared.cone);
6542 server.dirty++;
6543 } else {
6544 addReply(c,shared.czero);
6545 }
6546 }
6547
6548 static void hlenCommand(redisClient *c) {
6549 robj *o;
6550 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6551 checkType(c,o,REDIS_HASH)) return;
6552
6553 addReplyUlong(c,hashLength(o));
6554 }
6555
6556 static void genericHgetallCommand(redisClient *c, int flags) {
6557 robj *o, *lenobj, *obj;
6558 unsigned long count = 0;
6559 hashIterator *hi;
6560
6561 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6562 || checkType(c,o,REDIS_HASH)) return;
6563
6564 lenobj = createObject(REDIS_STRING,NULL);
6565 addReply(c,lenobj);
6566 decrRefCount(lenobj);
6567
6568 hi = hashInitIterator(o);
6569 while (hashNext(hi) != REDIS_ERR) {
6570 if (flags & REDIS_HASH_KEY) {
6571 obj = hashCurrent(hi,REDIS_HASH_KEY);
6572 addReplyBulk(c,obj);
6573 decrRefCount(obj);
6574 count++;
6575 }
6576 if (flags & REDIS_HASH_VALUE) {
6577 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6578 addReplyBulk(c,obj);
6579 decrRefCount(obj);
6580 count++;
6581 }
6582 }
6583 hashReleaseIterator(hi);
6584
6585 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6586 }
6587
6588 static void hkeysCommand(redisClient *c) {
6589 genericHgetallCommand(c,REDIS_HASH_KEY);
6590 }
6591
6592 static void hvalsCommand(redisClient *c) {
6593 genericHgetallCommand(c,REDIS_HASH_VALUE);
6594 }
6595
6596 static void hgetallCommand(redisClient *c) {
6597 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6598 }
6599
6600 static void hexistsCommand(redisClient *c) {
6601 robj *o;
6602 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6603 checkType(c,o,REDIS_HASH)) return;
6604
6605 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6606 }
6607
6608 static void convertToRealHash(robj *o) {
6609 unsigned char *key, *val, *p, *zm = o->ptr;
6610 unsigned int klen, vlen;
6611 dict *dict = dictCreate(&hashDictType,NULL);
6612
6613 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6614 p = zipmapRewind(zm);
6615 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6616 robj *keyobj, *valobj;
6617
6618 keyobj = createStringObject((char*)key,klen);
6619 valobj = createStringObject((char*)val,vlen);
6620 keyobj = tryObjectEncoding(keyobj);
6621 valobj = tryObjectEncoding(valobj);
6622 dictAdd(dict,keyobj,valobj);
6623 }
6624 o->encoding = REDIS_ENCODING_HT;
6625 o->ptr = dict;
6626 zfree(zm);
6627 }
6628
6629 /* ========================= Non type-specific commands ==================== */
6630
6631 static void flushdbCommand(redisClient *c) {
6632 server.dirty += dictSize(c->db->dict);
6633 dictEmpty(c->db->dict);
6634 dictEmpty(c->db->expires);
6635 addReply(c,shared.ok);
6636 }
6637
6638 static void flushallCommand(redisClient *c) {
6639 server.dirty += emptyDb();
6640 addReply(c,shared.ok);
6641 if (server.bgsavechildpid != -1) {
6642 kill(server.bgsavechildpid,SIGKILL);
6643 rdbRemoveTempFile(server.bgsavechildpid);
6644 }
6645 rdbSave(server.dbfilename);
6646 server.dirty++;
6647 }
6648
6649 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6650 redisSortOperation *so = zmalloc(sizeof(*so));
6651 so->type = type;
6652 so->pattern = pattern;
6653 return so;
6654 }
6655
6656 /* Return the value associated to the key with a name obtained
6657 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6658 * The returned object will always have its refcount increased by 1
6659 * when it is non-NULL. */
6660 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6661 char *p, *f;
6662 sds spat, ssub;
6663 robj keyobj, fieldobj, *o;
6664 int prefixlen, sublen, postfixlen, fieldlen;
6665 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6666 struct {
6667 long len;
6668 long free;
6669 char buf[REDIS_SORTKEY_MAX+1];
6670 } keyname, fieldname;
6671
6672 /* If the pattern is "#" return the substitution object itself in order
6673 * to implement the "SORT ... GET #" feature. */
6674 spat = pattern->ptr;
6675 if (spat[0] == '#' && spat[1] == '\0') {
6676 incrRefCount(subst);
6677 return subst;
6678 }
6679
6680 /* The substitution object may be specially encoded. If so we create
6681 * a decoded object on the fly. Otherwise getDecodedObject will just
6682 * increment the ref count, that we'll decrement later. */
6683 subst = getDecodedObject(subst);
6684
6685 ssub = subst->ptr;
6686 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6687 p = strchr(spat,'*');
6688 if (!p) {
6689 decrRefCount(subst);
6690 return NULL;
6691 }
6692
6693 /* Find out if we're dealing with a hash dereference. */
6694 if ((f = strstr(p+1, "->")) != NULL) {
6695 fieldlen = sdslen(spat)-(f-spat);
6696 /* this also copies \0 character */
6697 memcpy(fieldname.buf,f+2,fieldlen-1);
6698 fieldname.len = fieldlen-2;
6699 } else {
6700 fieldlen = 0;
6701 }
6702
6703 prefixlen = p-spat;
6704 sublen = sdslen(ssub);
6705 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6706 memcpy(keyname.buf,spat,prefixlen);
6707 memcpy(keyname.buf+prefixlen,ssub,sublen);
6708 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6709 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6710 keyname.len = prefixlen+sublen+postfixlen;
6711 decrRefCount(subst);
6712
6713 /* Lookup substituted key */
6714 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6715 o = lookupKeyRead(db,&keyobj);
6716 if (o == NULL) return NULL;
6717
6718 if (fieldlen > 0) {
6719 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6720
6721 /* Retrieve value from hash by the field name. This operation
6722 * already increases the refcount of the returned object. */
6723 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6724 o = hashGet(o, &fieldobj);
6725 } else {
6726 if (o->type != REDIS_STRING) return NULL;
6727
6728 /* Every object that this function returns needs to have its refcount
6729 * increased. sortCommand decreases it again. */
6730 incrRefCount(o);
6731 }
6732
6733 return o;
6734 }
6735
6736 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6737 * the additional parameter is not standard but a BSD-specific we have to
6738 * pass sorting parameters via the global 'server' structure */
6739 static int sortCompare(const void *s1, const void *s2) {
6740 const redisSortObject *so1 = s1, *so2 = s2;
6741 int cmp;
6742
6743 if (!server.sort_alpha) {
6744 /* Numeric sorting. Here it's trivial as we precomputed scores */
6745 if (so1->u.score > so2->u.score) {
6746 cmp = 1;
6747 } else if (so1->u.score < so2->u.score) {
6748 cmp = -1;
6749 } else {
6750 cmp = 0;
6751 }
6752 } else {
6753 /* Alphanumeric sorting */
6754 if (server.sort_bypattern) {
6755 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6756 /* At least one compare object is NULL */
6757 if (so1->u.cmpobj == so2->u.cmpobj)
6758 cmp = 0;
6759 else if (so1->u.cmpobj == NULL)
6760 cmp = -1;
6761 else
6762 cmp = 1;
6763 } else {
6764 /* We have both the objects, use strcoll */
6765 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6766 }
6767 } else {
6768 /* Compare elements directly. */
6769 cmp = compareStringObjects(so1->obj,so2->obj);
6770 }
6771 }
6772 return server.sort_desc ? -cmp : cmp;
6773 }
6774
6775 /* The SORT command is the most complex command in Redis. Warning: this code
6776 * is optimized for speed and a bit less for readability */
6777 static void sortCommand(redisClient *c) {
6778 list *operations;
6779 int outputlen = 0;
6780 int desc = 0, alpha = 0;
6781 int limit_start = 0, limit_count = -1, start, end;
6782 int j, dontsort = 0, vectorlen;
6783 int getop = 0; /* GET operation counter */
6784 robj *sortval, *sortby = NULL, *storekey = NULL;
6785 redisSortObject *vector; /* Resulting vector to sort */
6786
6787 /* Lookup the key to sort. It must be of the right types */
6788 sortval = lookupKeyRead(c->db,c->argv[1]);
6789 if (sortval == NULL) {
6790 addReply(c,shared.emptymultibulk);
6791 return;
6792 }
6793 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6794 sortval->type != REDIS_ZSET)
6795 {
6796 addReply(c,shared.wrongtypeerr);
6797 return;
6798 }
6799
6800 /* Create a list of operations to perform for every sorted element.
6801 * Operations can be GET/DEL/INCR/DECR */
6802 operations = listCreate();
6803 listSetFreeMethod(operations,zfree);
6804 j = 2;
6805
6806 /* Now we need to protect sortval incrementing its count, in the future
6807 * SORT may have options able to overwrite/delete keys during the sorting
6808 * and the sorted key itself may get destroied */
6809 incrRefCount(sortval);
6810
6811 /* The SORT command has an SQL-alike syntax, parse it */
6812 while(j < c->argc) {
6813 int leftargs = c->argc-j-1;
6814 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6815 desc = 0;
6816 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6817 desc = 1;
6818 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6819 alpha = 1;
6820 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6821 limit_start = atoi(c->argv[j+1]->ptr);
6822 limit_count = atoi(c->argv[j+2]->ptr);
6823 j+=2;
6824 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6825 storekey = c->argv[j+1];
6826 j++;
6827 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6828 sortby = c->argv[j+1];
6829 /* If the BY pattern does not contain '*', i.e. it is constant,
6830 * we don't need to sort nor to lookup the weight keys. */
6831 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6832 j++;
6833 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6834 listAddNodeTail(operations,createSortOperation(
6835 REDIS_SORT_GET,c->argv[j+1]));
6836 getop++;
6837 j++;
6838 } else {
6839 decrRefCount(sortval);
6840 listRelease(operations);
6841 addReply(c,shared.syntaxerr);
6842 return;
6843 }
6844 j++;
6845 }
6846
6847 /* Load the sorting vector with all the objects to sort */
6848 switch(sortval->type) {
6849 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6850 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6851 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6852 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6853 }
6854 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6855 j = 0;
6856
6857 if (sortval->type == REDIS_LIST) {
6858 list *list = sortval->ptr;
6859 listNode *ln;
6860 listIter li;
6861
6862 listRewind(list,&li);
6863 while((ln = listNext(&li))) {
6864 robj *ele = ln->value;
6865 vector[j].obj = ele;
6866 vector[j].u.score = 0;
6867 vector[j].u.cmpobj = NULL;
6868 j++;
6869 }
6870 } else {
6871 dict *set;
6872 dictIterator *di;
6873 dictEntry *setele;
6874
6875 if (sortval->type == REDIS_SET) {
6876 set = sortval->ptr;
6877 } else {
6878 zset *zs = sortval->ptr;
6879 set = zs->dict;
6880 }
6881
6882 di = dictGetIterator(set);
6883 while((setele = dictNext(di)) != NULL) {
6884 vector[j].obj = dictGetEntryKey(setele);
6885 vector[j].u.score = 0;
6886 vector[j].u.cmpobj = NULL;
6887 j++;
6888 }
6889 dictReleaseIterator(di);
6890 }
6891 redisAssert(j == vectorlen);
6892
6893 /* Now it's time to load the right scores in the sorting vector */
6894 if (dontsort == 0) {
6895 for (j = 0; j < vectorlen; j++) {
6896 robj *byval;
6897 if (sortby) {
6898 /* lookup value to sort by */
6899 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6900 if (!byval) continue;
6901 } else {
6902 /* use object itself to sort by */
6903 byval = vector[j].obj;
6904 }
6905
6906 if (alpha) {
6907 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6908 } else {
6909 if (byval->encoding == REDIS_ENCODING_RAW) {
6910 vector[j].u.score = strtod(byval->ptr,NULL);
6911 } else if (byval->encoding == REDIS_ENCODING_INT) {
6912 /* Don't need to decode the object if it's
6913 * integer-encoded (the only encoding supported) so
6914 * far. We can just cast it */
6915 vector[j].u.score = (long)byval->ptr;
6916 } else {
6917 redisAssert(1 != 1);
6918 }
6919 }
6920
6921 /* when the object was retrieved using lookupKeyByPattern,
6922 * its refcount needs to be decreased. */
6923 if (sortby) {
6924 decrRefCount(byval);
6925 }
6926 }
6927 }
6928
6929 /* We are ready to sort the vector... perform a bit of sanity check
6930 * on the LIMIT option too. We'll use a partial version of quicksort. */
6931 start = (limit_start < 0) ? 0 : limit_start;
6932 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6933 if (start >= vectorlen) {
6934 start = vectorlen-1;
6935 end = vectorlen-2;
6936 }
6937 if (end >= vectorlen) end = vectorlen-1;
6938
6939 if (dontsort == 0) {
6940 server.sort_desc = desc;
6941 server.sort_alpha = alpha;
6942 server.sort_bypattern = sortby ? 1 : 0;
6943 if (sortby && (start != 0 || end != vectorlen-1))
6944 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6945 else
6946 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6947 }
6948
6949 /* Send command output to the output buffer, performing the specified
6950 * GET/DEL/INCR/DECR operations if any. */
6951 outputlen = getop ? getop*(end-start+1) : end-start+1;
6952 if (storekey == NULL) {
6953 /* STORE option not specified, sent the sorting result to client */
6954 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6955 for (j = start; j <= end; j++) {
6956 listNode *ln;
6957 listIter li;
6958
6959 if (!getop) addReplyBulk(c,vector[j].obj);
6960 listRewind(operations,&li);
6961 while((ln = listNext(&li))) {
6962 redisSortOperation *sop = ln->value;
6963 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6964 vector[j].obj);
6965
6966 if (sop->type == REDIS_SORT_GET) {
6967 if (!val) {
6968 addReply(c,shared.nullbulk);
6969 } else {
6970 addReplyBulk(c,val);
6971 decrRefCount(val);
6972 }
6973 } else {
6974 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6975 }
6976 }
6977 }
6978 } else {
6979 robj *listObject = createListObject();
6980 list *listPtr = (list*) listObject->ptr;
6981
6982 /* STORE option specified, set the sorting result as a List object */
6983 for (j = start; j <= end; j++) {
6984 listNode *ln;
6985 listIter li;
6986
6987 if (!getop) {
6988 listAddNodeTail(listPtr,vector[j].obj);
6989 incrRefCount(vector[j].obj);
6990 }
6991 listRewind(operations,&li);
6992 while((ln = listNext(&li))) {
6993 redisSortOperation *sop = ln->value;
6994 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6995 vector[j].obj);
6996
6997 if (sop->type == REDIS_SORT_GET) {
6998 if (!val) {
6999 listAddNodeTail(listPtr,createStringObject("",0));
7000 } else {
7001 /* We should do a incrRefCount on val because it is
7002 * added to the list, but also a decrRefCount because
7003 * it is returned by lookupKeyByPattern. This results
7004 * in doing nothing at all. */
7005 listAddNodeTail(listPtr,val);
7006 }
7007 } else {
7008 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7009 }
7010 }
7011 }
7012 if (dictReplace(c->db->dict,storekey,listObject)) {
7013 incrRefCount(storekey);
7014 }
7015 /* Note: we add 1 because the DB is dirty anyway since even if the
7016 * SORT result is empty a new key is set and maybe the old content
7017 * replaced. */
7018 server.dirty += 1+outputlen;
7019 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7020 }
7021
7022 /* Cleanup */
7023 decrRefCount(sortval);
7024 listRelease(operations);
7025 for (j = 0; j < vectorlen; j++) {
7026 if (alpha && vector[j].u.cmpobj)
7027 decrRefCount(vector[j].u.cmpobj);
7028 }
7029 zfree(vector);
7030 }
7031
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and must be large enough for the result
 * (the longest possible output, "1024.00X", plus the terminator fits in
 * 16 bytes). 'n' is the amount of bytes.
 *
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are printed with two decimals and the largest unit among
 * K, M, G, T, P, E that keeps the integer part below 1024. Every possible
 * value of 'n' produces output: amounts of one terabyte or more used to
 * leave 's' untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    static const char units[] = "KMGTPE";
    unsigned long long unit = 1024; /* size in bytes of units[u] */
    int u = 0;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
        return;
    }
    /* Step up to the next unit while there is one and the value would
     * still be >= 1024 in the current unit. 1024^6 fits in 64 bits, so
     * 'unit' can't overflow. */
    while (units[u+1] != '\0' && n/unit >= 1024) {
        unit *= 1024;
        u++;
    }
    sprintf(s,"%.2f%c",(double)n/(double)unit,units[u]);
}
7052
7053 /* Create the string returned by the INFO command. This is decoupled
7054 * by the INFO command itself as we need to report the same information
7055 * on memory corruption problems. */
7056 static sds genRedisInfoString(void) {
7057 sds info;
7058 time_t uptime = time(NULL)-server.stat_starttime;
7059 int j;
7060 char hmem[64];
7061
7062 bytesToHuman(hmem,zmalloc_used_memory());
7063 info = sdscatprintf(sdsempty(),
7064 "redis_version:%s\r\n"
7065 "arch_bits:%s\r\n"
7066 "multiplexing_api:%s\r\n"
7067 "process_id:%ld\r\n"
7068 "uptime_in_seconds:%ld\r\n"
7069 "uptime_in_days:%ld\r\n"
7070 "connected_clients:%d\r\n"
7071 "connected_slaves:%d\r\n"
7072 "blocked_clients:%d\r\n"
7073 "used_memory:%zu\r\n"
7074 "used_memory_human:%s\r\n"
7075 "changes_since_last_save:%lld\r\n"
7076 "bgsave_in_progress:%d\r\n"
7077 "last_save_time:%ld\r\n"
7078 "bgrewriteaof_in_progress:%d\r\n"
7079 "total_connections_received:%lld\r\n"
7080 "total_commands_processed:%lld\r\n"
7081 "expired_keys:%lld\r\n"
7082 "hash_max_zipmap_entries:%ld\r\n"
7083 "hash_max_zipmap_value:%ld\r\n"
7084 "pubsub_channels:%ld\r\n"
7085 "pubsub_patterns:%u\r\n"
7086 "vm_enabled:%d\r\n"
7087 "role:%s\r\n"
7088 ,REDIS_VERSION,
7089 (sizeof(long) == 8) ? "64" : "32",
7090 aeGetApiName(),
7091 (long) getpid(),
7092 uptime,
7093 uptime/(3600*24),
7094 listLength(server.clients)-listLength(server.slaves),
7095 listLength(server.slaves),
7096 server.blpop_blocked_clients,
7097 zmalloc_used_memory(),
7098 hmem,
7099 server.dirty,
7100 server.bgsavechildpid != -1,
7101 server.lastsave,
7102 server.bgrewritechildpid != -1,
7103 server.stat_numconnections,
7104 server.stat_numcommands,
7105 server.stat_expiredkeys,
7106 server.hash_max_zipmap_entries,
7107 server.hash_max_zipmap_value,
7108 dictSize(server.pubsub_channels),
7109 listLength(server.pubsub_patterns),
7110 server.vm_enabled != 0,
7111 server.masterhost == NULL ? "master" : "slave"
7112 );
7113 if (server.masterhost) {
7114 info = sdscatprintf(info,
7115 "master_host:%s\r\n"
7116 "master_port:%d\r\n"
7117 "master_link_status:%s\r\n"
7118 "master_last_io_seconds_ago:%d\r\n"
7119 ,server.masterhost,
7120 server.masterport,
7121 (server.replstate == REDIS_REPL_CONNECTED) ?
7122 "up" : "down",
7123 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7124 );
7125 }
7126 if (server.vm_enabled) {
7127 lockThreadedIO();
7128 info = sdscatprintf(info,
7129 "vm_conf_max_memory:%llu\r\n"
7130 "vm_conf_page_size:%llu\r\n"
7131 "vm_conf_pages:%llu\r\n"
7132 "vm_stats_used_pages:%llu\r\n"
7133 "vm_stats_swapped_objects:%llu\r\n"
7134 "vm_stats_swappin_count:%llu\r\n"
7135 "vm_stats_swappout_count:%llu\r\n"
7136 "vm_stats_io_newjobs_len:%lu\r\n"
7137 "vm_stats_io_processing_len:%lu\r\n"
7138 "vm_stats_io_processed_len:%lu\r\n"
7139 "vm_stats_io_active_threads:%lu\r\n"
7140 "vm_stats_blocked_clients:%lu\r\n"
7141 ,(unsigned long long) server.vm_max_memory,
7142 (unsigned long long) server.vm_page_size,
7143 (unsigned long long) server.vm_pages,
7144 (unsigned long long) server.vm_stats_used_pages,
7145 (unsigned long long) server.vm_stats_swapped_objects,
7146 (unsigned long long) server.vm_stats_swapins,
7147 (unsigned long long) server.vm_stats_swapouts,
7148 (unsigned long) listLength(server.io_newjobs),
7149 (unsigned long) listLength(server.io_processing),
7150 (unsigned long) listLength(server.io_processed),
7151 (unsigned long) server.io_active_threads,
7152 (unsigned long) server.vm_blocked_clients
7153 );
7154 unlockThreadedIO();
7155 }
7156 for (j = 0; j < server.dbnum; j++) {
7157 long long keys, vkeys;
7158
7159 keys = dictSize(server.db[j].dict);
7160 vkeys = dictSize(server.db[j].expires);
7161 if (keys || vkeys) {
7162 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7163 j, keys, vkeys);
7164 }
7165 }
7166 return info;
7167 }
7168
7169 static void infoCommand(redisClient *c) {
7170 sds info = genRedisInfoString();
7171 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7172 (unsigned long)sdslen(info)));
7173 addReplySds(c,info);
7174 addReply(c,shared.crlf);
7175 }
7176
7177 static void monitorCommand(redisClient *c) {
7178 /* ignore MONITOR if aleady slave or in monitor mode */
7179 if (c->flags & REDIS_SLAVE) return;
7180
7181 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7182 c->slaveseldb = 0;
7183 listAddNodeTail(server.monitors,c);
7184 addReply(c,shared.ok);
7185 }
7186
7187 /* ================================= Expire ================================= */
7188 static int removeExpire(redisDb *db, robj *key) {
7189 if (dictDelete(db->expires,key) == DICT_OK) {
7190 return 1;
7191 } else {
7192 return 0;
7193 }
7194 }
7195
7196 static int setExpire(redisDb *db, robj *key, time_t when) {
7197 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7198 return 0;
7199 } else {
7200 incrRefCount(key);
7201 return 1;
7202 }
7203 }
7204
7205 /* Return the expire time of the specified key, or -1 if no expire
7206 * is associated with this key (i.e. the key is non volatile) */
7207 static time_t getExpire(redisDb *db, robj *key) {
7208 dictEntry *de;
7209
7210 /* No expire? return ASAP */
7211 if (dictSize(db->expires) == 0 ||
7212 (de = dictFind(db->expires,key)) == NULL) return -1;
7213
7214 return (time_t) dictGetEntryVal(de);
7215 }
7216
7217 static int expireIfNeeded(redisDb *db, robj *key) {
7218 time_t when;
7219 dictEntry *de;
7220
7221 /* No expire? return ASAP */
7222 if (dictSize(db->expires) == 0 ||
7223 (de = dictFind(db->expires,key)) == NULL) return 0;
7224
7225 /* Lookup the expire */
7226 when = (time_t) dictGetEntryVal(de);
7227 if (time(NULL) <= when) return 0;
7228
7229 /* Delete the key */
7230 dictDelete(db->expires,key);
7231 server.stat_expiredkeys++;
7232 return dictDelete(db->dict,key) == DICT_OK;
7233 }
7234
7235 static int deleteIfVolatile(redisDb *db, robj *key) {
7236 dictEntry *de;
7237
7238 /* No expire? return ASAP */
7239 if (dictSize(db->expires) == 0 ||
7240 (de = dictFind(db->expires,key)) == NULL) return 0;
7241
7242 /* Delete the key */
7243 server.dirty++;
7244 server.stat_expiredkeys++;
7245 dictDelete(db->expires,key);
7246 return dictDelete(db->dict,key) == DICT_OK;
7247 }
7248
7249 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7250 dictEntry *de;
7251 time_t seconds;
7252
7253 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7254
7255 seconds -= offset;
7256
7257 de = dictFind(c->db->dict,key);
7258 if (de == NULL) {
7259 addReply(c,shared.czero);
7260 return;
7261 }
7262 if (seconds <= 0) {
7263 if (deleteKey(c->db,key)) server.dirty++;
7264 addReply(c, shared.cone);
7265 return;
7266 } else {
7267 time_t when = time(NULL)+seconds;
7268 if (setExpire(c->db,key,when)) {
7269 addReply(c,shared.cone);
7270 server.dirty++;
7271 } else {
7272 addReply(c,shared.czero);
7273 }
7274 return;
7275 }
7276 }
7277
7278 static void expireCommand(redisClient *c) {
7279 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7280 }
7281
7282 static void expireatCommand(redisClient *c) {
7283 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7284 }
7285
7286 static void ttlCommand(redisClient *c) {
7287 time_t expire;
7288 int ttl = -1;
7289
7290 expire = getExpire(c->db,c->argv[1]);
7291 if (expire != -1) {
7292 ttl = (int) (expire-time(NULL));
7293 if (ttl < 0) ttl = -1;
7294 }
7295 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7296 }
7297
7298 /* ================================ MULTI/EXEC ============================== */
7299
7300 /* Client state initialization for MULTI/EXEC */
7301 static void initClientMultiState(redisClient *c) {
7302 c->mstate.commands = NULL;
7303 c->mstate.count = 0;
7304 }
7305
7306 /* Release all the resources associated with MULTI/EXEC state */
7307 static void freeClientMultiState(redisClient *c) {
7308 int j;
7309
7310 for (j = 0; j < c->mstate.count; j++) {
7311 int i;
7312 multiCmd *mc = c->mstate.commands+j;
7313
7314 for (i = 0; i < mc->argc; i++)
7315 decrRefCount(mc->argv[i]);
7316 zfree(mc->argv);
7317 }
7318 zfree(c->mstate.commands);
7319 }
7320
7321 /* Add a new command into the MULTI commands queue */
7322 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7323 multiCmd *mc;
7324 int j;
7325
7326 c->mstate.commands = zrealloc(c->mstate.commands,
7327 sizeof(multiCmd)*(c->mstate.count+1));
7328 mc = c->mstate.commands+c->mstate.count;
7329 mc->cmd = cmd;
7330 mc->argc = c->argc;
7331 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7332 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7333 for (j = 0; j < c->argc; j++)
7334 incrRefCount(mc->argv[j]);
7335 c->mstate.count++;
7336 }
7337
7338 static void multiCommand(redisClient *c) {
7339 c->flags |= REDIS_MULTI;
7340 addReply(c,shared.ok);
7341 }
7342
7343 static void discardCommand(redisClient *c) {
7344 if (!(c->flags & REDIS_MULTI)) {
7345 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7346 return;
7347 }
7348
7349 freeClientMultiState(c);
7350 initClientMultiState(c);
7351 c->flags &= (~REDIS_MULTI);
7352 addReply(c,shared.ok);
7353 }
7354
7355 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7356 * implememntation for more information. */
7357 static void execCommandReplicateMulti(redisClient *c) {
7358 struct redisCommand *cmd;
7359 robj *multistring = createStringObject("MULTI",5);
7360
7361 cmd = lookupCommand("multi");
7362 if (server.appendonly)
7363 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7364 if (listLength(server.slaves))
7365 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7366 decrRefCount(multistring);
7367 }
7368
7369 static void execCommand(redisClient *c) {
7370 int j;
7371 robj **orig_argv;
7372 int orig_argc;
7373
7374 if (!(c->flags & REDIS_MULTI)) {
7375 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7376 return;
7377 }
7378
7379 /* Replicate a MULTI request now that we are sure the block is executed.
7380 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7381 * both the AOF and the replication link will have the same consistency
7382 * and atomicity guarantees. */
7383 execCommandReplicateMulti(c);
7384
7385 /* Exec all the queued commands */
7386 orig_argv = c->argv;
7387 orig_argc = c->argc;
7388 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7389 for (j = 0; j < c->mstate.count; j++) {
7390 c->argc = c->mstate.commands[j].argc;
7391 c->argv = c->mstate.commands[j].argv;
7392 call(c,c->mstate.commands[j].cmd);
7393 }
7394 c->argv = orig_argv;
7395 c->argc = orig_argc;
7396 freeClientMultiState(c);
7397 initClientMultiState(c);
7398 c->flags &= (~REDIS_MULTI);
7399 /* Make sure the EXEC command is always replicated / AOF, since we
7400 * always send the MULTI command (we can't know beforehand if the
7401 * next operations will contain at least a modification to the DB). */
7402 server.dirty++;
7403 }
7404
7405 /* =========================== Blocking Operations ========================= */
7406
7407 /* Currently Redis blocking operations support is limited to list POP ops,
7408 * so the current implementation is not fully generic, but it is also not
7409 * completely specific so it will not require a rewrite to support new
7410 * kind of blocking operations in the future.
7411 *
7412 * Still it's important to note that list blocking operations can be already
7413 * used as a notification mechanism in order to implement other blocking
7414 * operations at application level, so there must be a very strong evidence
7415 * of usefulness and generality before new blocking operations are implemented.
7416 *
7417 * This is how the current blocking POP works, we use BLPOP as example:
7418 * - If the user calls BLPOP and the key exists and contains a non empty list
7419 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7420 * when there is no need to block.
7421 * - If instead BLPOP is called and the key does not exist or the list is
7422 * empty we need to block. In order to do so we remove the notification for
7423 * new data to read in the client socket (so that we'll not serve new
7424 * requests if the blocking request is not served). Also we put the client
7425 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
7426 * blocked on these keys.
7427 * - If a PUSH operation against a key with blocked clients waiting is
7428 * performed, we serve the first in the list: basically instead of pushing
7429 * the new element inside the list we return it to the (first / oldest)
7430 * blocking client, unblock the client, and remove it from the list.
7431 *
7432 * The above comment and the source code should be enough in order to understand
7433 * the implementation and modify / fix it later.
7434 */
7435
7436 /* Set a client in blocking mode for the specified key, with the specified
7437 * timeout */
7438 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7439 dictEntry *de;
7440 list *l;
7441 int j;
7442
7443 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7444 c->blockingkeysnum = numkeys;
7445 c->blockingto = timeout;
7446 for (j = 0; j < numkeys; j++) {
7447 /* Add the key in the client structure, to map clients -> keys */
7448 c->blockingkeys[j] = keys[j];
7449 incrRefCount(keys[j]);
7450
7451 /* And in the other "side", to map keys -> clients */
7452 de = dictFind(c->db->blockingkeys,keys[j]);
7453 if (de == NULL) {
7454 int retval;
7455
7456 /* For every key we take a list of clients blocked for it */
7457 l = listCreate();
7458 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7459 incrRefCount(keys[j]);
7460 assert(retval == DICT_OK);
7461 } else {
7462 l = dictGetEntryVal(de);
7463 }
7464 listAddNodeTail(l,c);
7465 }
7466 /* Mark the client as a blocked client */
7467 c->flags |= REDIS_BLOCKED;
7468 server.blpop_blocked_clients++;
7469 }
7470
7471 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7472 static void unblockClientWaitingData(redisClient *c) {
7473 dictEntry *de;
7474 list *l;
7475 int j;
7476
7477 assert(c->blockingkeys != NULL);
7478 /* The client may wait for multiple keys, so unblock it for every key. */
7479 for (j = 0; j < c->blockingkeysnum; j++) {
7480 /* Remove this client from the list of clients waiting for this key. */
7481 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7482 assert(de != NULL);
7483 l = dictGetEntryVal(de);
7484 listDelNode(l,listSearchKey(l,c));
7485 /* If the list is empty we need to remove it to avoid wasting memory */
7486 if (listLength(l) == 0)
7487 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7488 decrRefCount(c->blockingkeys[j]);
7489 }
7490 /* Cleanup the client structure */
7491 zfree(c->blockingkeys);
7492 c->blockingkeys = NULL;
7493 c->flags &= (~REDIS_BLOCKED);
7494 server.blpop_blocked_clients--;
7495 /* We want to process data if there is some command waiting
7496 * in the input buffer. Note that this is safe even if
7497 * unblockClientWaitingData() gets called from freeClient() because
7498 * freeClient() will be smart enough to call this function
7499 * *after* c->querybuf was set to NULL. */
7500 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7501 }
7502
7503 /* This should be called from any function PUSHing into lists.
7504 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7505 * 'ele' is the element pushed.
7506 *
7507 * If the function returns 0 there was no client waiting for a list push
7508 * against this key.
7509 *
7510 * If the function returns 1 there was a client waiting for a list push
7511 * against this key, the element was passed to this client thus it's not
7512 * needed to actually add it to the list and the caller should return asap. */
7513 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7514 struct dictEntry *de;
7515 redisClient *receiver;
7516 list *l;
7517 listNode *ln;
7518
7519 de = dictFind(c->db->blockingkeys,key);
7520 if (de == NULL) return 0;
7521 l = dictGetEntryVal(de);
7522 ln = listFirst(l);
7523 assert(ln != NULL);
7524 receiver = ln->value;
7525
7526 addReplySds(receiver,sdsnew("*2\r\n"));
7527 addReplyBulk(receiver,key);
7528 addReplyBulk(receiver,ele);
7529 unblockClientWaitingData(receiver);
7530 return 1;
7531 }
7532
7533 /* Blocking RPOP/LPOP */
7534 static void blockingPopGenericCommand(redisClient *c, int where) {
7535 robj *o;
7536 time_t timeout;
7537 int j;
7538
7539 for (j = 1; j < c->argc-1; j++) {
7540 o = lookupKeyWrite(c->db,c->argv[j]);
7541 if (o != NULL) {
7542 if (o->type != REDIS_LIST) {
7543 addReply(c,shared.wrongtypeerr);
7544 return;
7545 } else {
7546 list *list = o->ptr;
7547 if (listLength(list) != 0) {
7548 /* If the list contains elements fall back to the usual
7549 * non-blocking POP operation */
7550 robj *argv[2], **orig_argv;
7551 int orig_argc;
7552
7553 /* We need to alter the command arguments before to call
7554 * popGenericCommand() as the command takes a single key. */
7555 orig_argv = c->argv;
7556 orig_argc = c->argc;
7557 argv[1] = c->argv[j];
7558 c->argv = argv;
7559 c->argc = 2;
7560
7561 /* Also the return value is different, we need to output
7562 * the multi bulk reply header and the key name. The
7563 * "real" command will add the last element (the value)
7564 * for us. If this souds like an hack to you it's just
7565 * because it is... */
7566 addReplySds(c,sdsnew("*2\r\n"));
7567 addReplyBulk(c,argv[1]);
7568 popGenericCommand(c,where);
7569
7570 /* Fix the client structure with the original stuff */
7571 c->argv = orig_argv;
7572 c->argc = orig_argc;
7573 return;
7574 }
7575 }
7576 }
7577 }
7578 /* If the list is empty or the key does not exists we must block */
7579 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7580 if (timeout > 0) timeout += time(NULL);
7581 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7582 }
7583
7584 static void blpopCommand(redisClient *c) {
7585 blockingPopGenericCommand(c,REDIS_HEAD);
7586 }
7587
7588 static void brpopCommand(redisClient *c) {
7589 blockingPopGenericCommand(c,REDIS_TAIL);
7590 }
7591
7592 /* =============================== Replication ============================= */
7593
7594 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7595 ssize_t nwritten, ret = size;
7596 time_t start = time(NULL);
7597
7598 timeout++;
7599 while(size) {
7600 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7601 nwritten = write(fd,ptr,size);
7602 if (nwritten == -1) return -1;
7603 ptr += nwritten;
7604 size -= nwritten;
7605 }
7606 if ((time(NULL)-start) > timeout) {
7607 errno = ETIMEDOUT;
7608 return -1;
7609 }
7610 }
7611 return ret;
7612 }
7613
7614 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7615 ssize_t nread, totread = 0;
7616 time_t start = time(NULL);
7617
7618 timeout++;
7619 while(size) {
7620 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7621 nread = read(fd,ptr,size);
7622 if (nread == -1) return -1;
7623 ptr += nread;
7624 size -= nread;
7625 totread += nread;
7626 }
7627 if ((time(NULL)-start) > timeout) {
7628 errno = ETIMEDOUT;
7629 return -1;
7630 }
7631 }
7632 return totread;
7633 }
7634
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time via syncRead() with the given per-byte 'timeout'.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is stripped as well. Reading also stops when the buffer is
 * full, so that at most size-1 characters plus the nul terminator are ever
 * written. The missing decrement of 'size' used to let longer lines
 * overflow the buffer.
 *
 * Returns the number of characters stored before the line ending (a
 * stripped '\r' is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* reserve room for the nul terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--;
    }
    return nread;
}
7655
7656 static void syncCommand(redisClient *c) {
7657 /* ignore SYNC if aleady slave or in monitor mode */
7658 if (c->flags & REDIS_SLAVE) return;
7659
7660 /* SYNC can't be issued when the server has pending data to send to
7661 * the client about already issued commands. We need a fresh reply
7662 * buffer registering the differences between the BGSAVE and the current
7663 * dataset, so that we can copy to other slaves if needed. */
7664 if (listLength(c->reply) != 0) {
7665 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7666 return;
7667 }
7668
7669 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7670 /* Here we need to check if there is a background saving operation
7671 * in progress, or if it is required to start one */
7672 if (server.bgsavechildpid != -1) {
7673 /* Ok a background save is in progress. Let's check if it is a good
7674 * one for replication, i.e. if there is another slave that is
7675 * registering differences since the server forked to save */
7676 redisClient *slave;
7677 listNode *ln;
7678 listIter li;
7679
7680 listRewind(server.slaves,&li);
7681 while((ln = listNext(&li))) {
7682 slave = ln->value;
7683 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7684 }
7685 if (ln) {
7686 /* Perfect, the server is already registering differences for
7687 * another slave. Set the right state, and copy the buffer. */
7688 listRelease(c->reply);
7689 c->reply = listDup(slave->reply);
7690 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7691 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7692 } else {
7693 /* No way, we need to wait for the next BGSAVE in order to
7694 * register differences */
7695 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7696 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7697 }
7698 } else {
7699 /* Ok we don't have a BGSAVE in progress, let's start one */
7700 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7701 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7702 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7703 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7704 return;
7705 }
7706 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7707 }
7708 c->repldbfd = -1;
7709 c->flags |= REDIS_SLAVE;
7710 c->slaveseldb = 0;
7711 listAddNodeTail(server.slaves,c);
7712 return;
7713 }
7714
7715 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7716 redisClient *slave = privdata;
7717 REDIS_NOTUSED(el);
7718 REDIS_NOTUSED(mask);
7719 char buf[REDIS_IOBUF_LEN];
7720 ssize_t nwritten, buflen;
7721
7722 if (slave->repldboff == 0) {
7723 /* Write the bulk write count before to transfer the DB. In theory here
7724 * we don't know how much room there is in the output buffer of the
7725 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7726 * operations) will never be smaller than the few bytes we need. */
7727 sds bulkcount;
7728
7729 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7730 slave->repldbsize);
7731 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7732 {
7733 sdsfree(bulkcount);
7734 freeClient(slave);
7735 return;
7736 }
7737 sdsfree(bulkcount);
7738 }
7739 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7740 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7741 if (buflen <= 0) {
7742 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7743 (buflen == 0) ? "premature EOF" : strerror(errno));
7744 freeClient(slave);
7745 return;
7746 }
7747 if ((nwritten = write(fd,buf,buflen)) == -1) {
7748 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7749 strerror(errno));
7750 freeClient(slave);
7751 return;
7752 }
7753 slave->repldboff += nwritten;
7754 if (slave->repldboff == slave->repldbsize) {
7755 close(slave->repldbfd);
7756 slave->repldbfd = -1;
7757 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7758 slave->replstate = REDIS_REPL_ONLINE;
7759 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7760 sendReplyToClient, slave) == AE_ERR) {
7761 freeClient(slave);
7762 return;
7763 }
7764 addReplySds(slave,sdsempty());
7765 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7766 }
7767 }
7768
7769 /* This function is called at the end of every backgrond saving.
7770 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7771 * otherwise REDIS_ERR is passed to the function.
7772 *
7773 * The goal of this function is to handle slaves waiting for a successful
7774 * background saving in order to perform non-blocking synchronization. */
7775 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7776 listNode *ln;
7777 int startbgsave = 0;
7778 listIter li;
7779
7780 listRewind(server.slaves,&li);
7781 while((ln = listNext(&li))) {
7782 redisClient *slave = ln->value;
7783
7784 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7785 startbgsave = 1;
7786 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7787 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7788 struct redis_stat buf;
7789
7790 if (bgsaveerr != REDIS_OK) {
7791 freeClient(slave);
7792 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7793 continue;
7794 }
7795 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7796 redis_fstat(slave->repldbfd,&buf) == -1) {
7797 freeClient(slave);
7798 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7799 continue;
7800 }
7801 slave->repldboff = 0;
7802 slave->repldbsize = buf.st_size;
7803 slave->replstate = REDIS_REPL_SEND_BULK;
7804 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7805 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7806 freeClient(slave);
7807 continue;
7808 }
7809 }
7810 }
7811 if (startbgsave) {
7812 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7813 listIter li;
7814
7815 listRewind(server.slaves,&li);
7816 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7817 while((ln = listNext(&li))) {
7818 redisClient *slave = ln->value;
7819
7820 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7821 freeClient(slave);
7822 }
7823 }
7824 }
7825 }
7826
7827 static int syncWithMaster(void) {
7828 char buf[1024], tmpfile[256], authcmd[1024];
7829 long dumpsize;
7830 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7831 int dfd, maxtries = 5;
7832
7833 if (fd == -1) {
7834 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7835 strerror(errno));
7836 return REDIS_ERR;
7837 }
7838
7839 /* AUTH with the master if required. */
7840 if(server.masterauth) {
7841 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7842 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7843 close(fd);
7844 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7845 strerror(errno));
7846 return REDIS_ERR;
7847 }
7848 /* Read the AUTH result. */
7849 if (syncReadLine(fd,buf,1024,3600) == -1) {
7850 close(fd);
7851 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7852 strerror(errno));
7853 return REDIS_ERR;
7854 }
7855 if (buf[0] != '+') {
7856 close(fd);
7857 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7858 return REDIS_ERR;
7859 }
7860 }
7861
7862 /* Issue the SYNC command */
7863 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7864 close(fd);
7865 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7866 strerror(errno));
7867 return REDIS_ERR;
7868 }
7869 /* Read the bulk write count */
7870 if (syncReadLine(fd,buf,1024,3600) == -1) {
7871 close(fd);
7872 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7873 strerror(errno));
7874 return REDIS_ERR;
7875 }
7876 if (buf[0] != '$') {
7877 close(fd);
7878 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7879 return REDIS_ERR;
7880 }
7881 dumpsize = strtol(buf+1,NULL,10);
7882 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7883 /* Read the bulk write data on a temp file */
7884 while(maxtries--) {
7885 snprintf(tmpfile,256,
7886 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7887 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7888 if (dfd != -1) break;
7889 sleep(1);
7890 }
7891 if (dfd == -1) {
7892 close(fd);
7893 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7894 return REDIS_ERR;
7895 }
7896 while(dumpsize) {
7897 int nread, nwritten;
7898
7899 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7900 if (nread == -1) {
7901 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7902 strerror(errno));
7903 close(fd);
7904 close(dfd);
7905 return REDIS_ERR;
7906 }
7907 nwritten = write(dfd,buf,nread);
7908 if (nwritten == -1) {
7909 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7910 close(fd);
7911 close(dfd);
7912 return REDIS_ERR;
7913 }
7914 dumpsize -= nread;
7915 }
7916 close(dfd);
7917 if (rename(tmpfile,server.dbfilename) == -1) {
7918 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7919 unlink(tmpfile);
7920 close(fd);
7921 return REDIS_ERR;
7922 }
7923 emptyDb();
7924 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7925 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7926 close(fd);
7927 return REDIS_ERR;
7928 }
7929 server.master = createClient(fd);
7930 server.master->flags |= REDIS_MASTER;
7931 server.master->authenticated = 1;
7932 server.replstate = REDIS_REPL_CONNECTED;
7933 return REDIS_OK;
7934 }
7935
7936 static void slaveofCommand(redisClient *c) {
7937 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7938 !strcasecmp(c->argv[2]->ptr,"one")) {
7939 if (server.masterhost) {
7940 sdsfree(server.masterhost);
7941 server.masterhost = NULL;
7942 if (server.master) freeClient(server.master);
7943 server.replstate = REDIS_REPL_NONE;
7944 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7945 }
7946 } else {
7947 sdsfree(server.masterhost);
7948 server.masterhost = sdsdup(c->argv[1]->ptr);
7949 server.masterport = atoi(c->argv[2]->ptr);
7950 if (server.master) freeClient(server.master);
7951 server.replstate = REDIS_REPL_CONNECT;
7952 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7953 server.masterhost, server.masterport);
7954 }
7955 addReply(c,shared.ok);
7956 }
7957
7958 /* ============================ Maxmemory directive ======================== */
7959
7960 /* Try to free one object form the pre-allocated objects free list.
7961 * This is useful under low mem conditions as by default we take 1 million
7962 * free objects allocated. On success REDIS_OK is returned, otherwise
7963 * REDIS_ERR. */
7964 static int tryFreeOneObjectFromFreelist(void) {
7965 robj *o;
7966
7967 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7968 if (listLength(server.objfreelist)) {
7969 listNode *head = listFirst(server.objfreelist);
7970 o = listNodeValue(head);
7971 listDelNode(server.objfreelist,head);
7972 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7973 zfree(o);
7974 return REDIS_OK;
7975 } else {
7976 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7977 return REDIS_ERR;
7978 }
7979 }
7980
7981 /* This function gets called when 'maxmemory' is set on the config file to limit
7982 * the max memory used by the server, and we are out of memory.
7983 * This function will try to, in order:
7984 *
7985 * - Free objects from the free list
7986 * - Try to remove keys with an EXPIRE set
7987 *
7988 * It is not possible to free enough memory to reach used-memory < maxmemory
7989 * the server will start refusing commands that will enlarge even more the
7990 * memory usage.
7991 */
7992 static void freeMemoryIfNeeded(void) {
7993 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7994 int j, k, freed = 0;
7995
7996 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7997 for (j = 0; j < server.dbnum; j++) {
7998 int minttl = -1;
7999 robj *minkey = NULL;
8000 struct dictEntry *de;
8001
8002 if (dictSize(server.db[j].expires)) {
8003 freed = 1;
8004 /* From a sample of three keys drop the one nearest to
8005 * the natural expire */
8006 for (k = 0; k < 3; k++) {
8007 time_t t;
8008
8009 de = dictGetRandomKey(server.db[j].expires);
8010 t = (time_t) dictGetEntryVal(de);
8011 if (minttl == -1 || t < minttl) {
8012 minkey = dictGetEntryKey(de);
8013 minttl = t;
8014 }
8015 }
8016 deleteKey(server.db+j,minkey);
8017 }
8018 }
8019 if (!freed) return; /* nothing to free... */
8020 }
8021 }
8022
8023 /* ============================== Append Only file ========================== */
8024
8025 /* Write the append only file buffer on disk.
8026 *
8027 * Since we are required to write the AOF before replying to the client,
8028 * and the only way the client socket can get a write is entering when the
8029 * the event loop, we accumulate all the AOF writes in a memory
8030 * buffer and write it on disk using this function just before entering
8031 * the event loop again. */
8032 static void flushAppendOnlyFile(void) {
8033 time_t now;
8034 ssize_t nwritten;
8035
8036 if (sdslen(server.aofbuf) == 0) return;
8037
8038 /* We want to perform a single write. This should be guaranteed atomic
8039 * at least if the filesystem we are writing is a real physical one.
8040 * While this will save us against the server being killed I don't think
8041 * there is much to do about the whole server stopping for power problems
8042 * or alike */
8043 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8044 if (nwritten != (signed)sdslen(server.aofbuf)) {
8045 /* Ooops, we are in troubles. The best thing to do for now is
8046 * aborting instead of giving the illusion that everything is
8047 * working as expected. */
8048 if (nwritten == -1) {
8049 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8050 } else {
8051 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8052 }
8053 exit(1);
8054 }
8055 sdsfree(server.aofbuf);
8056 server.aofbuf = sdsempty();
8057
8058 /* Fsync if needed */
8059 now = time(NULL);
8060 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8061 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8062 now-server.lastfsync > 1))
8063 {
8064 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8065 * flushing metadata. */
8066 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8067 server.lastfsync = now;
8068 }
8069 }
8070
8071 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8072 sds buf = sdsempty();
8073 int j;
8074 robj *tmpargv[3];
8075
8076 /* The DB this command was targetting is not the same as the last command
8077 * we appendend. To issue a SELECT command is needed. */
8078 if (dictid != server.appendseldb) {
8079 char seldb[64];
8080
8081 snprintf(seldb,sizeof(seldb),"%d",dictid);
8082 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8083 (unsigned long)strlen(seldb),seldb);
8084 server.appendseldb = dictid;
8085 }
8086
8087 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8088 * EXPIREs into EXPIREATs calls */
8089 if (cmd->proc == expireCommand) {
8090 long when;
8091
8092 tmpargv[0] = createStringObject("EXPIREAT",8);
8093 tmpargv[1] = argv[1];
8094 incrRefCount(argv[1]);
8095 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8096 tmpargv[2] = createObject(REDIS_STRING,
8097 sdscatprintf(sdsempty(),"%ld",when));
8098 argv = tmpargv;
8099 }
8100
8101 /* Append the actual command */
8102 buf = sdscatprintf(buf,"*%d\r\n",argc);
8103 for (j = 0; j < argc; j++) {
8104 robj *o = argv[j];
8105
8106 o = getDecodedObject(o);
8107 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8108 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8109 buf = sdscatlen(buf,"\r\n",2);
8110 decrRefCount(o);
8111 }
8112
8113 /* Free the objects from the modified argv for EXPIREAT */
8114 if (cmd->proc == expireCommand) {
8115 for (j = 0; j < 3; j++)
8116 decrRefCount(argv[j]);
8117 }
8118
8119 /* Append to the AOF buffer. This will be flushed on disk just before
8120 * of re-entering the event loop, so before the client will get a
8121 * positive reply about the operation performed. */
8122 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8123
8124 /* If a background append only file rewriting is in progress we want to
8125 * accumulate the differences between the child DB and the current one
8126 * in a buffer, so that when the child process will do its work we
8127 * can append the differences to the new append only file. */
8128 if (server.bgrewritechildpid != -1)
8129 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8130
8131 sdsfree(buf);
8132 }
8133
8134 /* In Redis commands are always executed in the context of a client, so in
8135 * order to load the append only file we need to create a fake client. */
8136 static struct redisClient *createFakeClient(void) {
8137 struct redisClient *c = zmalloc(sizeof(*c));
8138
8139 selectDb(c,0);
8140 c->fd = -1;
8141 c->querybuf = sdsempty();
8142 c->argc = 0;
8143 c->argv = NULL;
8144 c->flags = 0;
8145 /* We set the fake client as a slave waiting for the synchronization
8146 * so that Redis will not try to send replies to this client. */
8147 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8148 c->reply = listCreate();
8149 listSetFreeMethod(c->reply,decrRefCount);
8150 listSetDupMethod(c->reply,dupClientReplyValue);
8151 initClientMultiState(c);
8152 return c;
8153 }
8154
8155 static void freeFakeClient(struct redisClient *c) {
8156 sdsfree(c->querybuf);
8157 listRelease(c->reply);
8158 freeClientMultiState(c);
8159 zfree(c);
8160 }
8161
8162 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8163 * error (the append only file is zero-length) REDIS_ERR is returned. On
8164 * fatal error an error message is logged and the program exists. */
8165 int loadAppendOnlyFile(char *filename) {
8166 struct redisClient *fakeClient;
8167 FILE *fp = fopen(filename,"r");
8168 struct redis_stat sb;
8169 unsigned long long loadedkeys = 0;
8170 int appendonly = server.appendonly;
8171
8172 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8173 return REDIS_ERR;
8174
8175 if (fp == NULL) {
8176 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8177 exit(1);
8178 }
8179
8180 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8181 * to the same file we're about to read. */
8182 server.appendonly = 0;
8183
8184 fakeClient = createFakeClient();
8185 while(1) {
8186 int argc, j;
8187 unsigned long len;
8188 robj **argv;
8189 char buf[128];
8190 sds argsds;
8191 struct redisCommand *cmd;
8192
8193 if (fgets(buf,sizeof(buf),fp) == NULL) {
8194 if (feof(fp))
8195 break;
8196 else
8197 goto readerr;
8198 }
8199 if (buf[0] != '*') goto fmterr;
8200 argc = atoi(buf+1);
8201 argv = zmalloc(sizeof(robj*)*argc);
8202 for (j = 0; j < argc; j++) {
8203 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8204 if (buf[0] != '$') goto fmterr;
8205 len = strtol(buf+1,NULL,10);
8206 argsds = sdsnewlen(NULL,len);
8207 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8208 argv[j] = createObject(REDIS_STRING,argsds);
8209 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8210 }
8211
8212 /* Command lookup */
8213 cmd = lookupCommand(argv[0]->ptr);
8214 if (!cmd) {
8215 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8216 exit(1);
8217 }
8218 /* Try object encoding */
8219 if (cmd->flags & REDIS_CMD_BULK)
8220 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8221 /* Run the command in the context of a fake client */
8222 fakeClient->argc = argc;
8223 fakeClient->argv = argv;
8224 cmd->proc(fakeClient);
8225 /* Discard the reply objects list from the fake client */
8226 while(listLength(fakeClient->reply))
8227 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8228 /* Clean up, ready for the next command */
8229 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8230 zfree(argv);
8231 /* Handle swapping while loading big datasets when VM is on */
8232 loadedkeys++;
8233 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8234 while (zmalloc_used_memory() > server.vm_max_memory) {
8235 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8236 }
8237 }
8238 }
8239
8240 /* This point can only be reached when EOF is reached without errors.
8241 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8242 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8243
8244 fclose(fp);
8245 freeFakeClient(fakeClient);
8246 server.appendonly = appendonly;
8247 return REDIS_OK;
8248
8249 readerr:
8250 if (feof(fp)) {
8251 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8252 } else {
8253 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8254 }
8255 exit(1);
8256 fmterr:
8257 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8258 exit(1);
8259 }
8260
8261 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8262 static int fwriteBulkObject(FILE *fp, robj *obj) {
8263 char buf[128];
8264 int decrrc = 0;
8265
8266 /* Avoid the incr/decr ref count business if possible to help
8267 * copy-on-write (we are often in a child process when this function
8268 * is called).
8269 * Also makes sure that key objects don't get incrRefCount-ed when VM
8270 * is enabled */
8271 if (obj->encoding != REDIS_ENCODING_RAW) {
8272 obj = getDecodedObject(obj);
8273 decrrc = 1;
8274 }
8275 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8276 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8277 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8278 goto err;
8279 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8280 if (decrrc) decrRefCount(obj);
8281 return 1;
8282 err:
8283 if (decrrc) decrRefCount(obj);
8284 return 0;
8285 }
8286
/* Write the binary-safe string 's' of 'len' bytes on 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * When 'len' is zero no payload is written, only the header and the
 * trailing CRLF. Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it must be formatted with %lu */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8298
/* Write the double 'd' on 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 * where the payload is the value formatted with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    int plen;

    /* The payload is built together with its CRLF, so the count announced
     * in the header is the formatted length minus those two bytes. */
    plen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,(size_t)plen,1,fp) == 0) return 0;
    return 1;
}
8309
/* Write the long 'l' on 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 * where the payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    int plen;

    /* The payload is built together with its CRLF, so the count announced
     * in the header is the formatted length minus those two bytes. */
    plen = snprintf(payload,sizeof(payload),"%ld\r\n",l);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,(size_t)plen,1,fp) == 0) return 0;
    return 1;
}
8320
8321 /* Write a sequence of commands able to fully rebuild the dataset into
8322 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8323 static int rewriteAppendOnlyFile(char *filename) {
8324 dictIterator *di = NULL;
8325 dictEntry *de;
8326 FILE *fp;
8327 char tmpfile[256];
8328 int j;
8329 time_t now = time(NULL);
8330
8331 /* Note that we have to use a different temp name here compared to the
8332 * one used by rewriteAppendOnlyFileBackground() function. */
8333 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8334 fp = fopen(tmpfile,"w");
8335 if (!fp) {
8336 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8337 return REDIS_ERR;
8338 }
8339 for (j = 0; j < server.dbnum; j++) {
8340 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8341 redisDb *db = server.db+j;
8342 dict *d = db->dict;
8343 if (dictSize(d) == 0) continue;
8344 di = dictGetIterator(d);
8345 if (!di) {
8346 fclose(fp);
8347 return REDIS_ERR;
8348 }
8349
8350 /* SELECT the new DB */
8351 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8352 if (fwriteBulkLong(fp,j) == 0) goto werr;
8353
8354 /* Iterate this DB writing every entry */
8355 while((de = dictNext(di)) != NULL) {
8356 robj *key, *o;
8357 time_t expiretime;
8358 int swapped;
8359
8360 key = dictGetEntryKey(de);
8361 /* If the value for this key is swapped, load a preview in memory.
8362 * We use a "swapped" flag to remember if we need to free the
8363 * value object instead to just increment the ref count anyway
8364 * in order to avoid copy-on-write of pages if we are forked() */
8365 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8366 key->storage == REDIS_VM_SWAPPING) {
8367 o = dictGetEntryVal(de);
8368 swapped = 0;
8369 } else {
8370 o = vmPreviewObject(key);
8371 swapped = 1;
8372 }
8373 expiretime = getExpire(db,key);
8374
8375 /* Save the key and associated value */
8376 if (o->type == REDIS_STRING) {
8377 /* Emit a SET command */
8378 char cmd[]="*3\r\n$3\r\nSET\r\n";
8379 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8380 /* Key and value */
8381 if (fwriteBulkObject(fp,key) == 0) goto werr;
8382 if (fwriteBulkObject(fp,o) == 0) goto werr;
8383 } else if (o->type == REDIS_LIST) {
8384 /* Emit the RPUSHes needed to rebuild the list */
8385 list *list = o->ptr;
8386 listNode *ln;
8387 listIter li;
8388
8389 listRewind(list,&li);
8390 while((ln = listNext(&li))) {
8391 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8392 robj *eleobj = listNodeValue(ln);
8393
8394 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8395 if (fwriteBulkObject(fp,key) == 0) goto werr;
8396 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8397 }
8398 } else if (o->type == REDIS_SET) {
8399 /* Emit the SADDs needed to rebuild the set */
8400 dict *set = o->ptr;
8401 dictIterator *di = dictGetIterator(set);
8402 dictEntry *de;
8403
8404 while((de = dictNext(di)) != NULL) {
8405 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8406 robj *eleobj = dictGetEntryKey(de);
8407
8408 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8409 if (fwriteBulkObject(fp,key) == 0) goto werr;
8410 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8411 }
8412 dictReleaseIterator(di);
8413 } else if (o->type == REDIS_ZSET) {
8414 /* Emit the ZADDs needed to rebuild the sorted set */
8415 zset *zs = o->ptr;
8416 dictIterator *di = dictGetIterator(zs->dict);
8417 dictEntry *de;
8418
8419 while((de = dictNext(di)) != NULL) {
8420 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8421 robj *eleobj = dictGetEntryKey(de);
8422 double *score = dictGetEntryVal(de);
8423
8424 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8425 if (fwriteBulkObject(fp,key) == 0) goto werr;
8426 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8427 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8428 }
8429 dictReleaseIterator(di);
8430 } else if (o->type == REDIS_HASH) {
8431 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8432
8433 /* Emit the HSETs needed to rebuild the hash */
8434 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8435 unsigned char *p = zipmapRewind(o->ptr);
8436 unsigned char *field, *val;
8437 unsigned int flen, vlen;
8438
8439 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8440 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8441 if (fwriteBulkObject(fp,key) == 0) goto werr;
8442 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8443 return -1;
8444 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8445 return -1;
8446 }
8447 } else {
8448 dictIterator *di = dictGetIterator(o->ptr);
8449 dictEntry *de;
8450
8451 while((de = dictNext(di)) != NULL) {
8452 robj *field = dictGetEntryKey(de);
8453 robj *val = dictGetEntryVal(de);
8454
8455 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8456 if (fwriteBulkObject(fp,key) == 0) goto werr;
8457 if (fwriteBulkObject(fp,field) == -1) return -1;
8458 if (fwriteBulkObject(fp,val) == -1) return -1;
8459 }
8460 dictReleaseIterator(di);
8461 }
8462 } else {
8463 redisPanic("Unknown object type");
8464 }
8465 /* Save the expire time */
8466 if (expiretime != -1) {
8467 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8468 /* If this key is already expired skip it */
8469 if (expiretime < now) continue;
8470 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8471 if (fwriteBulkObject(fp,key) == 0) goto werr;
8472 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8473 }
8474 if (swapped) decrRefCount(o);
8475 }
8476 dictReleaseIterator(di);
8477 }
8478
8479 /* Make sure data will not remain on the OS's output buffers */
8480 fflush(fp);
8481 fsync(fileno(fp));
8482 fclose(fp);
8483
8484 /* Use RENAME to make sure the DB file is changed atomically only
8485 * if the generate DB file is ok. */
8486 if (rename(tmpfile,filename) == -1) {
8487 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8488 unlink(tmpfile);
8489 return REDIS_ERR;
8490 }
8491 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8492 return REDIS_OK;
8493
8494 werr:
8495 fclose(fp);
8496 unlink(tmpfile);
8497 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8498 if (di) dictReleaseIterator(di);
8499 return REDIS_ERR;
8500 }
8501
8502 /* This is how rewriting of the append only file in background works:
8503 *
8504 * 1) The user calls BGREWRITEAOF
8505 * 2) Redis calls this function, that forks():
8506 * 2a) the child rewrite the append only file in a temp file.
8507 * 2b) the parent accumulates differences in server.bgrewritebuf.
8508 * 3) When the child finished '2a' exists.
8509 * 4) The parent will trap the exit code, if it's OK, will append the
8510 * data accumulated into server.bgrewritebuf into the temp file, and
8511 * finally will rename(2) the temp file in the actual file name.
8512 * The the new file is reopened as the new append only file. Profit!
8513 */
8514 static int rewriteAppendOnlyFileBackground(void) {
8515 pid_t childpid;
8516
8517 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8518 if (server.vm_enabled) waitEmptyIOJobsQueue();
8519 if ((childpid = fork()) == 0) {
8520 /* Child */
8521 char tmpfile[256];
8522
8523 if (server.vm_enabled) vmReopenSwapFile();
8524 close(server.fd);
8525 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8526 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8527 _exit(0);
8528 } else {
8529 _exit(1);
8530 }
8531 } else {
8532 /* Parent */
8533 if (childpid == -1) {
8534 redisLog(REDIS_WARNING,
8535 "Can't rewrite append only file in background: fork: %s",
8536 strerror(errno));
8537 return REDIS_ERR;
8538 }
8539 redisLog(REDIS_NOTICE,
8540 "Background append only file rewriting started by pid %d",childpid);
8541 server.bgrewritechildpid = childpid;
8542 updateDictResizePolicy();
8543 /* We set appendseldb to -1 in order to force the next call to the
8544 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8545 * accumulated by the parent into server.bgrewritebuf will start
8546 * with a SELECT statement and it will be safe to merge. */
8547 server.appendseldb = -1;
8548 return REDIS_OK;
8549 }
8550 return REDIS_OK; /* unreached */
8551 }
8552
8553 static void bgrewriteaofCommand(redisClient *c) {
8554 if (server.bgrewritechildpid != -1) {
8555 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8556 return;
8557 }
8558 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8559 char *status = "+Background append only file rewriting started\r\n";
8560 addReplySds(c,sdsnew(status));
8561 } else {
8562 addReply(c,shared.err);
8563 }
8564 }
8565
/* Remove the temp file used by the background AOF rewrite child with pid
 * 'childpid'. The name is built with the same pattern used by
 * rewriteAppendOnlyFileBackground(). The result of unlink() is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8572
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
8580 *
8581 * Redis VM design:
8582 *
8583 * Redis VM is a blocking VM (one that blocks reading swapped values from
8584 * disk into memory when a value swapped out is needed in memory) that is made
8585 * unblocking by trying to examine the command argument vector in order to
8586 * load in background values that will likely be needed in order to exec
8587 * the command. The command is executed only once all the relevant keys
8588 * are loaded into memory.
8589 *
8590 * This basically is almost as simple of a blocking VM, but almost as parallel
8591 * as a fully non-blocking VM.
8592 */
8593
8594 /* =================== Virtual Memory - Blocking Side ====================== */
8595
8596 static void vmInit(void) {
8597 off_t totsize;
8598 int pipefds[2];
8599 size_t stacksize;
8600 struct flock fl;
8601
8602 if (server.vm_max_threads != 0)
8603 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8604
8605 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8606 /* Try to open the old swap file, otherwise create it */
8607 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8608 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8609 }
8610 if (server.vm_fp == NULL) {
8611 redisLog(REDIS_WARNING,
8612 "Can't open the swap file: %s. Exiting.",
8613 strerror(errno));
8614 exit(1);
8615 }
8616 server.vm_fd = fileno(server.vm_fp);
8617 /* Lock the swap file for writing, this is useful in order to avoid
8618 * another instance to use the same swap file for a config error. */
8619 fl.l_type = F_WRLCK;
8620 fl.l_whence = SEEK_SET;
8621 fl.l_start = fl.l_len = 0;
8622 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8623 redisLog(REDIS_WARNING,
8624 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8625 exit(1);
8626 }
8627 /* Initialize */
8628 server.vm_next_page = 0;
8629 server.vm_near_pages = 0;
8630 server.vm_stats_used_pages = 0;
8631 server.vm_stats_swapped_objects = 0;
8632 server.vm_stats_swapouts = 0;
8633 server.vm_stats_swapins = 0;
8634 totsize = server.vm_pages*server.vm_page_size;
8635 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8636 if (ftruncate(server.vm_fd,totsize) == -1) {
8637 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8638 strerror(errno));
8639 exit(1);
8640 } else {
8641 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8642 }
8643 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8644 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8645 (long long) (server.vm_pages+7)/8, server.vm_pages);
8646 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8647
8648 /* Initialize threaded I/O (used by Virtual Memory) */
8649 server.io_newjobs = listCreate();
8650 server.io_processing = listCreate();
8651 server.io_processed = listCreate();
8652 server.io_ready_clients = listCreate();
8653 pthread_mutex_init(&server.io_mutex,NULL);
8654 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8655 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8656 server.io_active_threads = 0;
8657 if (pipe(pipefds) == -1) {
8658 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8659 ,strerror(errno));
8660 exit(1);
8661 }
8662 server.io_ready_pipe_read = pipefds[0];
8663 server.io_ready_pipe_write = pipefds[1];
8664 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8665 /* LZF requires a lot of stack */
8666 pthread_attr_init(&server.io_threads_attr);
8667 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8668 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8669 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8670 /* Listen for events in the threaded I/O pipe */
8671 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8672 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8673 oom("creating file event");
8674 }
8675
8676 /* Mark the page as used */
8677 static void vmMarkPageUsed(off_t page) {
8678 off_t byte = page/8;
8679 int bit = page&7;
8680 redisAssert(vmFreePage(page) == 1);
8681 server.vm_bitmap[byte] |= 1<<bit;
8682 }
8683
8684 /* Mark N contiguous pages as used, with 'page' being the first. */
8685 static void vmMarkPagesUsed(off_t page, off_t count) {
8686 off_t j;
8687
8688 for (j = 0; j < count; j++)
8689 vmMarkPageUsed(page+j);
8690 server.vm_stats_used_pages += count;
8691 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8692 (long long)count, (long long)page);
8693 }
8694
8695 /* Mark the page as free */
8696 static void vmMarkPageFree(off_t page) {
8697 off_t byte = page/8;
8698 int bit = page&7;
8699 redisAssert(vmFreePage(page) == 0);
8700 server.vm_bitmap[byte] &= ~(1<<bit);
8701 }
8702
8703 /* Mark N contiguous pages as free, with 'page' being the first. */
8704 static void vmMarkPagesFree(off_t page, off_t count) {
8705 off_t j;
8706
8707 for (j = 0; j < count; j++)
8708 vmMarkPageFree(page+j);
8709 server.vm_stats_used_pages -= count;
8710 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8711 (long long)count, (long long)page);
8712 }
8713
8714 /* Test if the page is free */
8715 static int vmFreePage(off_t page) {
8716 off_t byte = page/8;
8717 int bit = page&7;
8718 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8719 }
8720
8721 /* Find N contiguous free pages storing the first page of the cluster in *first.
8722 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8723 * REDIS_ERR is returned.
8724 *
8725 * This function uses a simple algorithm: we try to allocate
8726 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8727 * again from the start of the swap file searching for free spaces.
8728 *
8729 * If it looks pretty clear that there are no free pages near our offset
8730 * we try to find less populated places doing a forward jump of
8731 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8732 * without hurry, and then we jump again and so forth...
8733 *
8734 * This function can be improved using a free list to avoid to guess
8735 * too much, since we could collect data about freed pages.
8736 *
8737 * note: I implemented this function just after watching an episode of
8738 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8739 */
8740 static int vmFindContiguousPages(off_t *first, off_t n) {
8741 off_t base, offset = 0, since_jump = 0, numfree = 0;
8742
8743 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8744 server.vm_near_pages = 0;
8745 server.vm_next_page = 0;
8746 }
8747 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8748 base = server.vm_next_page;
8749
8750 while(offset < server.vm_pages) {
8751 off_t this = base+offset;
8752
8753 /* If we overflow, restart from page zero */
8754 if (this >= server.vm_pages) {
8755 this -= server.vm_pages;
8756 if (this == 0) {
8757 /* Just overflowed, what we found on tail is no longer
8758 * interesting, as it's no longer contiguous. */
8759 numfree = 0;
8760 }
8761 }
8762 if (vmFreePage(this)) {
8763 /* This is a free page */
8764 numfree++;
8765 /* Already got N free pages? Return to the caller, with success */
8766 if (numfree == n) {
8767 *first = this-(n-1);
8768 server.vm_next_page = this+1;
8769 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8770 return REDIS_OK;
8771 }
8772 } else {
8773 /* The current one is not a free page */
8774 numfree = 0;
8775 }
8776
8777 /* Fast-forward if the current page is not free and we already
8778 * searched enough near this place. */
8779 since_jump++;
8780 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8781 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8782 since_jump = 0;
8783 /* Note that even if we rewind after the jump, we are don't need
8784 * to make sure numfree is set to zero as we only jump *if* it
8785 * is set to zero. */
8786 } else {
8787 /* Otherwise just check the next page */
8788 offset++;
8789 }
8790 }
8791 return REDIS_ERR;
8792 }
8793
8794 /* Write the specified object at the specified page of the swap file */
8795 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8796 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8797 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8798 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8799 redisLog(REDIS_WARNING,
8800 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8801 strerror(errno));
8802 return REDIS_ERR;
8803 }
8804 rdbSaveObject(server.vm_fp,o);
8805 fflush(server.vm_fp);
8806 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8807 return REDIS_OK;
8808 }
8809
8810 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8811 * needed to later retrieve the object into the key object.
8812 * If we can't find enough contiguous empty pages to swap the object on disk
8813 * REDIS_ERR is returned. */
8814 static int vmSwapObjectBlocking(robj *key, robj *val) {
8815 off_t pages = rdbSavedObjectPages(val,NULL);
8816 off_t page;
8817
8818 assert(key->storage == REDIS_VM_MEMORY);
8819 assert(key->refcount == 1);
8820 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8821 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8822 key->vm.page = page;
8823 key->vm.usedpages = pages;
8824 key->storage = REDIS_VM_SWAPPED;
8825 key->vtype = val->type;
8826 decrRefCount(val); /* Deallocate the object from memory. */
8827 vmMarkPagesUsed(page,pages);
8828 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8829 (unsigned char*) key->ptr,
8830 (unsigned long long) page, (unsigned long long) pages);
8831 server.vm_stats_swapped_objects++;
8832 server.vm_stats_swapouts++;
8833 return REDIS_OK;
8834 }
8835
8836 static robj *vmReadObjectFromSwap(off_t page, int type) {
8837 robj *o;
8838
8839 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8840 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8841 redisLog(REDIS_WARNING,
8842 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8843 strerror(errno));
8844 _exit(1);
8845 }
8846 o = rdbLoadObject(type,server.vm_fp);
8847 if (o == NULL) {
8848 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8849 _exit(1);
8850 }
8851 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8852 return o;
8853 }
8854
8855 /* Load the value object relative to the 'key' object from swap to memory.
8856 * The newly allocated object is returned.
8857 *
8858 * If preview is true the unserialized object is returned to the caller but
8859 * no changes are made to the key object, nor the pages are marked as freed */
8860 static robj *vmGenericLoadObject(robj *key, int preview) {
8861 robj *val;
8862
8863 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8864 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8865 if (!preview) {
8866 key->storage = REDIS_VM_MEMORY;
8867 key->vm.atime = server.unixtime;
8868 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8869 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8870 (unsigned char*) key->ptr);
8871 server.vm_stats_swapped_objects--;
8872 } else {
8873 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8874 (unsigned char*) key->ptr);
8875 }
8876 server.vm_stats_swapins++;
8877 return val;
8878 }
8879
8880 /* Plain object loading, from swap to memory */
8881 static robj *vmLoadObject(robj *key) {
8882 /* If we are loading the object in background, stop it, we
8883 * need to load this object synchronously ASAP. */
8884 if (key->storage == REDIS_VM_LOADING)
8885 vmCancelThreadedIOJob(key);
8886 return vmGenericLoadObject(key,0);
8887 }
8888
8889 /* Just load the value on disk, without to modify the key.
8890 * This is useful when we want to perform some operation on the value
8891 * without to really bring it from swap to memory, like while saving the
8892 * dataset or rewriting the append only log. */
8893 static robj *vmPreviewObject(robj *key) {
8894 return vmGenericLoadObject(key,1);
8895 }
8896
8897 /* How a good candidate is this object for swapping?
8898 * The better candidate it is, the greater the returned value.
8899 *
8900 * Currently we try to perform a fast estimation of the object size in
8901 * memory, and combine it with aging informations.
8902 *
8903 * Basically swappability = idle-time * log(estimated size)
8904 *
8905 * Bigger objects are preferred over smaller objects, but not
8906 * proportionally, this is why we use the logarithm. This algorithm is
8907 * just a first try and will probably be tuned later. */
8908 static double computeObjectSwappability(robj *o) {
8909 time_t age = server.unixtime - o->vm.atime;
8910 long asize = 0;
8911 list *l;
8912 dict *d;
8913 struct dictEntry *de;
8914 int z;
8915
8916 if (age <= 0) return 0;
8917 switch(o->type) {
8918 case REDIS_STRING:
8919 if (o->encoding != REDIS_ENCODING_RAW) {
8920 asize = sizeof(*o);
8921 } else {
8922 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8923 }
8924 break;
8925 case REDIS_LIST:
8926 l = o->ptr;
8927 listNode *ln = listFirst(l);
8928
8929 asize = sizeof(list);
8930 if (ln) {
8931 robj *ele = ln->value;
8932 long elesize;
8933
8934 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8935 (sizeof(*o)+sdslen(ele->ptr)) :
8936 sizeof(*o);
8937 asize += (sizeof(listNode)+elesize)*listLength(l);
8938 }
8939 break;
8940 case REDIS_SET:
8941 case REDIS_ZSET:
8942 z = (o->type == REDIS_ZSET);
8943 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8944
8945 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8946 if (z) asize += sizeof(zset)-sizeof(dict);
8947 if (dictSize(d)) {
8948 long elesize;
8949 robj *ele;
8950
8951 de = dictGetRandomKey(d);
8952 ele = dictGetEntryKey(de);
8953 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8954 (sizeof(*o)+sdslen(ele->ptr)) :
8955 sizeof(*o);
8956 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8957 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8958 }
8959 break;
8960 case REDIS_HASH:
8961 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8962 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8963 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8964 unsigned int klen, vlen;
8965 unsigned char *key, *val;
8966
8967 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8968 klen = 0;
8969 vlen = 0;
8970 }
8971 asize = len*(klen+vlen+3);
8972 } else if (o->encoding == REDIS_ENCODING_HT) {
8973 d = o->ptr;
8974 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8975 if (dictSize(d)) {
8976 long elesize;
8977 robj *ele;
8978
8979 de = dictGetRandomKey(d);
8980 ele = dictGetEntryKey(de);
8981 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8982 (sizeof(*o)+sdslen(ele->ptr)) :
8983 sizeof(*o);
8984 ele = dictGetEntryVal(de);
8985 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8986 (sizeof(*o)+sdslen(ele->ptr)) :
8987 sizeof(*o);
8988 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8989 }
8990 }
8991 break;
8992 }
8993 return (double)age*log(1+asize);
8994 }
8995
8996 /* Try to swap an object that's a good candidate for swapping.
8997 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8998 * to swap any object at all.
8999 *
9000 * If 'usethreaded' is true, Redis will try to swap the object in background
9001 * using I/O threads. */
9002 static int vmSwapOneObject(int usethreads) {
9003 int j, i;
9004 struct dictEntry *best = NULL;
9005 double best_swappability = 0;
9006 redisDb *best_db = NULL;
9007 robj *key, *val;
9008
9009 for (j = 0; j < server.dbnum; j++) {
9010 redisDb *db = server.db+j;
9011 /* Why maxtries is set to 100?
9012 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9013 * are swappable objects */
9014 int maxtries = 100;
9015
9016 if (dictSize(db->dict) == 0) continue;
9017 for (i = 0; i < 5; i++) {
9018 dictEntry *de;
9019 double swappability;
9020
9021 if (maxtries) maxtries--;
9022 de = dictGetRandomKey(db->dict);
9023 key = dictGetEntryKey(de);
9024 val = dictGetEntryVal(de);
9025 /* Only swap objects that are currently in memory.
9026 *
9027 * Also don't swap shared objects if threaded VM is on, as we
9028 * try to ensure that the main thread does not touch the
9029 * object while the I/O thread is using it, but we can't
9030 * control other keys without adding additional mutex. */
9031 if (key->storage != REDIS_VM_MEMORY ||
9032 (server.vm_max_threads != 0 && val->refcount != 1)) {
9033 if (maxtries) i--; /* don't count this try */
9034 continue;
9035 }
9036 swappability = computeObjectSwappability(val);
9037 if (!best || swappability > best_swappability) {
9038 best = de;
9039 best_swappability = swappability;
9040 best_db = db;
9041 }
9042 }
9043 }
9044 if (best == NULL) return REDIS_ERR;
9045 key = dictGetEntryKey(best);
9046 val = dictGetEntryVal(best);
9047
9048 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9049 key->ptr, best_swappability);
9050
9051 /* Unshare the key if needed */
9052 if (key->refcount > 1) {
9053 robj *newkey = dupStringObject(key);
9054 decrRefCount(key);
9055 key = dictGetEntryKey(best) = newkey;
9056 }
9057 /* Swap it */
9058 if (usethreads) {
9059 vmSwapObjectThreaded(key,val,best_db);
9060 return REDIS_OK;
9061 } else {
9062 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9063 dictGetEntryVal(best) = NULL;
9064 return REDIS_OK;
9065 } else {
9066 return REDIS_ERR;
9067 }
9068 }
9069 }
9070
/* Swap out one object synchronously: vmSwapOneObject() without I/O threads.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9074
/* Swap out one object in background: vmSwapOneObject() using I/O threads.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9078
9079 /* Return true if it's safe to swap out objects in a given moment.
9080 * Basically we don't want to swap objects out while there is a BGSAVE
9081 * or a BGAEOREWRITE running in backgroud. */
9082 static int vmCanSwapOut(void) {
9083 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9084 }
9085
9086 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9087 * and was deleted. Otherwise 0 is returned. */
9088 static int deleteIfSwapped(redisDb *db, robj *key) {
9089 dictEntry *de;
9090 robj *foundkey;
9091
9092 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9093 foundkey = dictGetEntryKey(de);
9094 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9095 deleteKey(db,key);
9096 return 1;
9097 }
9098
9099 /* =================== Virtual Memory - Threaded I/O ======================= */
9100
9101 static void freeIOJob(iojob *j) {
9102 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9103 j->type == REDIS_IOJOB_DO_SWAP ||
9104 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9105 decrRefCount(j->val);
9106 /* We don't decrRefCount the j->key field as we did't incremented
9107 * the count creating IO Jobs. This is because the key field here is
9108 * just used as an indentifier and if a key is removed the Job should
9109 * never be touched again. */
9110 zfree(j);
9111 }
9112
9113 /* Every time a thread finished a Job, it writes a byte into the write side
9114 * of an unix pipe in order to "awake" the main thread, and this function
9115 * is called. */
9116 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9117 int mask)
9118 {
9119 char buf[1];
9120 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9121 REDIS_NOTUSED(el);
9122 REDIS_NOTUSED(mask);
9123 REDIS_NOTUSED(privdata);
9124
9125 /* For every byte we read in the read side of the pipe, there is one
9126 * I/O job completed to process. */
9127 while((retval = read(fd,buf,1)) == 1) {
9128 iojob *j;
9129 listNode *ln;
9130 robj *key;
9131 struct dictEntry *de;
9132
9133 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9134
9135 /* Get the processed element (the oldest one) */
9136 lockThreadedIO();
9137 assert(listLength(server.io_processed) != 0);
9138 if (toprocess == -1) {
9139 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9140 if (toprocess <= 0) toprocess = 1;
9141 }
9142 ln = listFirst(server.io_processed);
9143 j = ln->value;
9144 listDelNode(server.io_processed,ln);
9145 unlockThreadedIO();
9146 /* If this job is marked as canceled, just ignore it */
9147 if (j->canceled) {
9148 freeIOJob(j);
9149 continue;
9150 }
9151 /* Post process it in the main thread, as there are things we
9152 * can do just here to avoid race conditions and/or invasive locks */
9153 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9154 de = dictFind(j->db->dict,j->key);
9155 assert(de != NULL);
9156 key = dictGetEntryKey(de);
9157 if (j->type == REDIS_IOJOB_LOAD) {
9158 redisDb *db;
9159
9160 /* Key loaded, bring it at home */
9161 key->storage = REDIS_VM_MEMORY;
9162 key->vm.atime = server.unixtime;
9163 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9164 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9165 (unsigned char*) key->ptr);
9166 server.vm_stats_swapped_objects--;
9167 server.vm_stats_swapins++;
9168 dictGetEntryVal(de) = j->val;
9169 incrRefCount(j->val);
9170 db = j->db;
9171 freeIOJob(j);
9172 /* Handle clients waiting for this key to be loaded. */
9173 handleClientsBlockedOnSwappedKey(db,key);
9174 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9175 /* Now we know the amount of pages required to swap this object.
9176 * Let's find some space for it, and queue this task again
9177 * rebranded as REDIS_IOJOB_DO_SWAP. */
9178 if (!vmCanSwapOut() ||
9179 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9180 {
9181 /* Ooops... no space or we can't swap as there is
9182 * a fork()ed Redis trying to save stuff on disk. */
9183 freeIOJob(j);
9184 key->storage = REDIS_VM_MEMORY; /* undo operation */
9185 } else {
9186 /* Note that we need to mark this pages as used now,
9187 * if the job will be canceled, we'll mark them as freed
9188 * again. */
9189 vmMarkPagesUsed(j->page,j->pages);
9190 j->type = REDIS_IOJOB_DO_SWAP;
9191 lockThreadedIO();
9192 queueIOJob(j);
9193 unlockThreadedIO();
9194 }
9195 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9196 robj *val;
9197
9198 /* Key swapped. We can finally free some memory. */
9199 if (key->storage != REDIS_VM_SWAPPING) {
9200 printf("key->storage: %d\n",key->storage);
9201 printf("key->name: %s\n",(char*)key->ptr);
9202 printf("key->refcount: %d\n",key->refcount);
9203 printf("val: %p\n",(void*)j->val);
9204 printf("val->type: %d\n",j->val->type);
9205 printf("val->ptr: %s\n",(char*)j->val->ptr);
9206 }
9207 redisAssert(key->storage == REDIS_VM_SWAPPING);
9208 val = dictGetEntryVal(de);
9209 key->vm.page = j->page;
9210 key->vm.usedpages = j->pages;
9211 key->storage = REDIS_VM_SWAPPED;
9212 key->vtype = j->val->type;
9213 decrRefCount(val); /* Deallocate the object from memory. */
9214 dictGetEntryVal(de) = NULL;
9215 redisLog(REDIS_DEBUG,
9216 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9217 (unsigned char*) key->ptr,
9218 (unsigned long long) j->page, (unsigned long long) j->pages);
9219 server.vm_stats_swapped_objects++;
9220 server.vm_stats_swapouts++;
9221 freeIOJob(j);
9222 /* Put a few more swap requests in queue if we are still
9223 * out of memory */
9224 if (trytoswap && vmCanSwapOut() &&
9225 zmalloc_used_memory() > server.vm_max_memory)
9226 {
9227 int more = 1;
9228 while(more) {
9229 lockThreadedIO();
9230 more = listLength(server.io_newjobs) <
9231 (unsigned) server.vm_max_threads;
9232 unlockThreadedIO();
9233 /* Don't waste CPU time if swappable objects are rare. */
9234 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9235 trytoswap = 0;
9236 break;
9237 }
9238 }
9239 }
9240 }
9241 processed++;
9242 if (processed == toprocess) return;
9243 }
9244 if (retval < 0 && errno != EAGAIN) {
9245 redisLog(REDIS_WARNING,
9246 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9247 strerror(errno));
9248 }
9249 }
9250
9251 static void lockThreadedIO(void) {
9252 pthread_mutex_lock(&server.io_mutex);
9253 }
9254
9255 static void unlockThreadedIO(void) {
9256 pthread_mutex_unlock(&server.io_mutex);
9257 }
9258
9259 /* Remove the specified object from the threaded I/O queue if still not
9260 * processed, otherwise make sure to flag it as canceled. */
9261 static void vmCancelThreadedIOJob(robj *o) {
9262 list *lists[3] = {
9263 server.io_newjobs, /* 0 */
9264 server.io_processing, /* 1 */
9265 server.io_processed /* 2 */
9266 };
9267 int i;
9268
9269 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9270 again:
9271 lockThreadedIO();
9272 /* Search for a matching key in one of the queues */
9273 for (i = 0; i < 3; i++) {
9274 listNode *ln;
9275 listIter li;
9276
9277 listRewind(lists[i],&li);
9278 while ((ln = listNext(&li)) != NULL) {
9279 iojob *job = ln->value;
9280
9281 if (job->canceled) continue; /* Skip this, already canceled. */
9282 if (job->key == o) {
9283 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9284 (void*)job, (char*)o->ptr, job->type, i);
9285 /* Mark the pages as free since the swap didn't happened
9286 * or happened but is now discarded. */
9287 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9288 vmMarkPagesFree(job->page,job->pages);
9289 /* Cancel the job. It depends on the list the job is
9290 * living in. */
9291 switch(i) {
9292 case 0: /* io_newjobs */
9293 /* If the job was yet not processed the best thing to do
9294 * is to remove it from the queue at all */
9295 freeIOJob(job);
9296 listDelNode(lists[i],ln);
9297 break;
9298 case 1: /* io_processing */
9299 /* Oh Shi- the thread is messing with the Job:
9300 *
9301 * Probably it's accessing the object if this is a
9302 * PREPARE_SWAP or DO_SWAP job.
9303 * If it's a LOAD job it may be reading from disk and
9304 * if we don't wait for the job to terminate before to
9305 * cancel it, maybe in a few microseconds data can be
9306 * corrupted in this pages. So the short story is:
9307 *
9308 * Better to wait for the job to move into the
9309 * next queue (processed)... */
9310
9311 /* We try again and again until the job is completed. */
9312 unlockThreadedIO();
9313 /* But let's wait some time for the I/O thread
9314 * to finish with this job. After all this condition
9315 * should be very rare. */
9316 usleep(1);
9317 goto again;
9318 case 2: /* io_processed */
9319 /* The job was already processed, that's easy...
9320 * just mark it as canceled so that we'll ignore it
9321 * when processing completed jobs. */
9322 job->canceled = 1;
9323 break;
9324 }
9325 /* Finally we have to adjust the storage type of the object
9326 * in order to "UNDO" the operaiton. */
9327 if (o->storage == REDIS_VM_LOADING)
9328 o->storage = REDIS_VM_SWAPPED;
9329 else if (o->storage == REDIS_VM_SWAPPING)
9330 o->storage = REDIS_VM_MEMORY;
9331 unlockThreadedIO();
9332 return;
9333 }
9334 }
9335 }
9336 unlockThreadedIO();
9337 assert(1 != 1); /* We should never reach this */
9338 }
9339
9340 static void *IOThreadEntryPoint(void *arg) {
9341 iojob *j;
9342 listNode *ln;
9343 REDIS_NOTUSED(arg);
9344
9345 pthread_detach(pthread_self());
9346 while(1) {
9347 /* Get a new job to process */
9348 lockThreadedIO();
9349 if (listLength(server.io_newjobs) == 0) {
9350 /* No new jobs in queue, exit. */
9351 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9352 (long) pthread_self());
9353 server.io_active_threads--;
9354 unlockThreadedIO();
9355 return NULL;
9356 }
9357 ln = listFirst(server.io_newjobs);
9358 j = ln->value;
9359 listDelNode(server.io_newjobs,ln);
9360 /* Add the job in the processing queue */
9361 j->thread = pthread_self();
9362 listAddNodeTail(server.io_processing,j);
9363 ln = listLast(server.io_processing); /* We use ln later to remove it */
9364 unlockThreadedIO();
9365 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9366 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9367
9368 /* Process the Job */
9369 if (j->type == REDIS_IOJOB_LOAD) {
9370 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9371 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9372 FILE *fp = fopen("/dev/null","w+");
9373 j->pages = rdbSavedObjectPages(j->val,fp);
9374 fclose(fp);
9375 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9376 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9377 j->canceled = 1;
9378 }
9379
9380 /* Done: insert the job into the processed queue */
9381 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9382 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9383 lockThreadedIO();
9384 listDelNode(server.io_processing,ln);
9385 listAddNodeTail(server.io_processed,j);
9386 unlockThreadedIO();
9387
9388 /* Signal the main thread there is new stuff to process */
9389 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9390 }
9391 return NULL; /* never reached */
9392 }
9393
9394 static void spawnIOThread(void) {
9395 pthread_t thread;
9396 sigset_t mask, omask;
9397 int err;
9398
9399 sigemptyset(&mask);
9400 sigaddset(&mask,SIGCHLD);
9401 sigaddset(&mask,SIGHUP);
9402 sigaddset(&mask,SIGPIPE);
9403 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9404 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9405 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9406 strerror(err));
9407 usleep(1000000);
9408 }
9409 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9410 server.io_active_threads++;
9411 }
9412
9413 /* We need to wait for the last thread to exit before we are able to
9414 * fork() in order to BGSAVE or BGREWRITEAOF. */
9415 static void waitEmptyIOJobsQueue(void) {
9416 while(1) {
9417 int io_processed_len;
9418
9419 lockThreadedIO();
9420 if (listLength(server.io_newjobs) == 0 &&
9421 listLength(server.io_processing) == 0 &&
9422 server.io_active_threads == 0)
9423 {
9424 unlockThreadedIO();
9425 return;
9426 }
9427 /* While waiting for empty jobs queue condition we post-process some
9428 * finshed job, as I/O threads may be hanging trying to write against
9429 * the io_ready_pipe_write FD but there are so much pending jobs that
9430 * it's blocking. */
9431 io_processed_len = listLength(server.io_processed);
9432 unlockThreadedIO();
9433 if (io_processed_len) {
9434 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9435 usleep(1000); /* 1 millisecond */
9436 } else {
9437 usleep(10000); /* 10 milliseconds */
9438 }
9439 }
9440 }
9441
9442 static void vmReopenSwapFile(void) {
9443 /* Note: we don't close the old one as we are in the child process
9444 * and don't want to mess at all with the original file object. */
9445 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9446 if (server.vm_fp == NULL) {
9447 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9448 server.vm_swap_file);
9449 _exit(1);
9450 }
9451 server.vm_fd = fileno(server.vm_fp);
9452 }
9453
9454 /* This function must be called while with threaded IO locked */
9455 static void queueIOJob(iojob *j) {
9456 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9457 (void*)j, j->type, (char*)j->key->ptr);
9458 listAddNodeTail(server.io_newjobs,j);
9459 if (server.io_active_threads < server.vm_max_threads)
9460 spawnIOThread();
9461 }
9462
9463 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9464 iojob *j;
9465
9466 assert(key->storage == REDIS_VM_MEMORY);
9467 assert(key->refcount == 1);
9468
9469 j = zmalloc(sizeof(*j));
9470 j->type = REDIS_IOJOB_PREPARE_SWAP;
9471 j->db = db;
9472 j->key = key;
9473 j->val = val;
9474 incrRefCount(val);
9475 j->canceled = 0;
9476 j->thread = (pthread_t) -1;
9477 key->storage = REDIS_VM_SWAPPING;
9478
9479 lockThreadedIO();
9480 queueIOJob(j);
9481 unlockThreadedIO();
9482 return REDIS_OK;
9483 }
9484
9485 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9486
9487 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9488 * If there is not already a job loading the key, it is craeted.
9489 * The key is added to the io_keys list in the client structure, and also
9490 * in the hash table mapping swapped keys to waiting clients, that is,
9491 * server.io_waited_keys. */
9492 static int waitForSwappedKey(redisClient *c, robj *key) {
9493 struct dictEntry *de;
9494 robj *o;
9495 list *l;
9496
9497 /* If the key does not exist or is already in RAM we don't need to
9498 * block the client at all. */
9499 de = dictFind(c->db->dict,key);
9500 if (de == NULL) return 0;
9501 o = dictGetEntryKey(de);
9502 if (o->storage == REDIS_VM_MEMORY) {
9503 return 0;
9504 } else if (o->storage == REDIS_VM_SWAPPING) {
9505 /* We were swapping the key, undo it! */
9506 vmCancelThreadedIOJob(o);
9507 return 0;
9508 }
9509
9510 /* OK: the key is either swapped, or being loaded just now. */
9511
9512 /* Add the key to the list of keys this client is waiting for.
9513 * This maps clients to keys they are waiting for. */
9514 listAddNodeTail(c->io_keys,key);
9515 incrRefCount(key);
9516
9517 /* Add the client to the swapped keys => clients waiting map. */
9518 de = dictFind(c->db->io_keys,key);
9519 if (de == NULL) {
9520 int retval;
9521
9522 /* For every key we take a list of clients blocked for it */
9523 l = listCreate();
9524 retval = dictAdd(c->db->io_keys,key,l);
9525 incrRefCount(key);
9526 assert(retval == DICT_OK);
9527 } else {
9528 l = dictGetEntryVal(de);
9529 }
9530 listAddNodeTail(l,c);
9531
9532 /* Are we already loading the key from disk? If not create a job */
9533 if (o->storage == REDIS_VM_SWAPPED) {
9534 iojob *j;
9535
9536 o->storage = REDIS_VM_LOADING;
9537 j = zmalloc(sizeof(*j));
9538 j->type = REDIS_IOJOB_LOAD;
9539 j->db = c->db;
9540 j->key = o;
9541 j->key->vtype = o->vtype;
9542 j->page = o->vm.page;
9543 j->val = NULL;
9544 j->canceled = 0;
9545 j->thread = (pthread_t) -1;
9546 lockThreadedIO();
9547 queueIOJob(j);
9548 unlockThreadedIO();
9549 }
9550 return 1;
9551 }
9552
9553 /* Preload keys for any command with first, last and step values for
9554 * the command keys prototype, as defined in the command table. */
9555 static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9556 int j, last;
9557 if (cmd->vm_firstkey == 0) return;
9558 last = cmd->vm_lastkey;
9559 if (last < 0) last = argc+last;
9560 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9561 redisAssert(j < argc);
9562 waitForSwappedKey(c,argv[j]);
9563 }
9564 }
9565
9566 /* Preload keys needed for the ZUNION and ZINTER commands.
9567 * Note that the number of keys to preload is user-defined, so we need to
9568 * apply a sanity check against argc. */
9569 static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9570 int i, num;
9571 REDIS_NOTUSED(cmd);
9572
9573 num = atoi(argv[2]->ptr);
9574 if (num > (argc-3)) return;
9575 for (i = 0; i < num; i++) {
9576 waitForSwappedKey(c,argv[3+i]);
9577 }
9578 }
9579
9580 /* Preload keys needed to execute the entire MULTI/EXEC block.
9581 *
9582 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9583 * and will block the client when any command requires a swapped out value. */
9584 static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9585 int i, margc;
9586 struct redisCommand *mcmd;
9587 robj **margv;
9588 REDIS_NOTUSED(cmd);
9589 REDIS_NOTUSED(argc);
9590 REDIS_NOTUSED(argv);
9591
9592 if (!(c->flags & REDIS_MULTI)) return;
9593 for (i = 0; i < c->mstate.count; i++) {
9594 mcmd = c->mstate.commands[i].cmd;
9595 margc = c->mstate.commands[i].argc;
9596 margv = c->mstate.commands[i].argv;
9597
9598 if (mcmd->vm_preload_proc != NULL) {
9599 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9600 } else {
9601 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9602 }
9603 }
9604 }
9605
9606 /* Is this client attempting to run a command against swapped keys?
9607 * If so, block it ASAP, load the keys in background, then resume it.
9608 *
9609 * The important idea about this function is that it can fail! If keys will
9610 * still be swapped when the client is resumed, this key lookups will
9611 * just block loading keys from disk. In practical terms this should only
9612 * happen with SORT BY command or if there is a bug in this function.
9613 *
9614 * Return 1 if the client is marked as blocked, 0 if the client can
9615 * continue as the keys it is going to access appear to be in memory. */
9616 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9617 if (cmd->vm_preload_proc != NULL) {
9618 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
9619 } else {
9620 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
9621 }
9622
9623 /* If the client was blocked for at least one key, mark it as blocked. */
9624 if (listLength(c->io_keys)) {
9625 c->flags |= REDIS_IO_WAIT;
9626 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9627 server.vm_blocked_clients++;
9628 return 1;
9629 } else {
9630 return 0;
9631 }
9632 }
9633
9634 /* Remove the 'key' from the list of blocked keys for a given client.
9635 *
9636 * The function returns 1 when there are no longer blocking keys after
9637 * the current one was removed (and the client can be unblocked). */
9638 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9639 list *l;
9640 listNode *ln;
9641 listIter li;
9642 struct dictEntry *de;
9643
9644 /* Remove the key from the list of keys this client is waiting for. */
9645 listRewind(c->io_keys,&li);
9646 while ((ln = listNext(&li)) != NULL) {
9647 if (compareStringObjects(ln->value,key) == 0) {
9648 listDelNode(c->io_keys,ln);
9649 break;
9650 }
9651 }
9652 assert(ln != NULL);
9653
9654 /* Remove the client form the key => waiting clients map. */
9655 de = dictFind(c->db->io_keys,key);
9656 assert(de != NULL);
9657 l = dictGetEntryVal(de);
9658 ln = listSearchKey(l,c);
9659 assert(ln != NULL);
9660 listDelNode(l,ln);
9661 if (listLength(l) == 0)
9662 dictDelete(c->db->io_keys,key);
9663
9664 return listLength(c->io_keys) == 0;
9665 }
9666
9667 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9668 struct dictEntry *de;
9669 list *l;
9670 listNode *ln;
9671 int len;
9672
9673 de = dictFind(db->io_keys,key);
9674 if (!de) return;
9675
9676 l = dictGetEntryVal(de);
9677 len = listLength(l);
9678 /* Note: we can't use something like while(listLength(l)) as the list
9679 * can be freed by the calling function when we remove the last element. */
9680 while (len--) {
9681 ln = listFirst(l);
9682 redisClient *c = ln->value;
9683
9684 if (dontWaitForSwappedKey(c,key)) {
9685 /* Put the client in the list of clients ready to go as we
9686 * loaded all the keys about it. */
9687 listAddNodeTail(server.io_ready_clients,c);
9688 }
9689 }
9690 }
9691
9692 /* =========================== Remote Configuration ========================= */
9693
9694 static void configSetCommand(redisClient *c) {
9695 robj *o = getDecodedObject(c->argv[3]);
9696 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9697 zfree(server.dbfilename);
9698 server.dbfilename = zstrdup(o->ptr);
9699 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9700 zfree(server.requirepass);
9701 server.requirepass = zstrdup(o->ptr);
9702 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9703 zfree(server.masterauth);
9704 server.masterauth = zstrdup(o->ptr);
9705 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9706 server.maxmemory = strtoll(o->ptr, NULL, 10);
9707 } else {
9708 addReplySds(c,sdscatprintf(sdsempty(),
9709 "-ERR not supported CONFIG parameter %s\r\n",
9710 (char*)c->argv[2]->ptr));
9711 decrRefCount(o);
9712 return;
9713 }
9714 decrRefCount(o);
9715 addReply(c,shared.ok);
9716 }
9717
9718 static void configGetCommand(redisClient *c) {
9719 robj *o = getDecodedObject(c->argv[2]);
9720 robj *lenobj = createObject(REDIS_STRING,NULL);
9721 char *pattern = o->ptr;
9722 int matches = 0;
9723
9724 addReply(c,lenobj);
9725 decrRefCount(lenobj);
9726
9727 if (stringmatch(pattern,"dbfilename",0)) {
9728 addReplyBulkCString(c,"dbfilename");
9729 addReplyBulkCString(c,server.dbfilename);
9730 matches++;
9731 }
9732 if (stringmatch(pattern,"requirepass",0)) {
9733 addReplyBulkCString(c,"requirepass");
9734 addReplyBulkCString(c,server.requirepass);
9735 matches++;
9736 }
9737 if (stringmatch(pattern,"masterauth",0)) {
9738 addReplyBulkCString(c,"masterauth");
9739 addReplyBulkCString(c,server.masterauth);
9740 matches++;
9741 }
9742 if (stringmatch(pattern,"maxmemory",0)) {
9743 char buf[128];
9744
9745 snprintf(buf,128,"%llu\n",server.maxmemory);
9746 addReplyBulkCString(c,"maxmemory");
9747 addReplyBulkCString(c,buf);
9748 matches++;
9749 }
9750 decrRefCount(o);
9751 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9752 }
9753
9754 static void configCommand(redisClient *c) {
9755 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9756 if (c->argc != 4) goto badarity;
9757 configSetCommand(c);
9758 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9759 if (c->argc != 3) goto badarity;
9760 configGetCommand(c);
9761 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9762 if (c->argc != 2) goto badarity;
9763 server.stat_numcommands = 0;
9764 server.stat_numconnections = 0;
9765 server.stat_expiredkeys = 0;
9766 server.stat_starttime = time(NULL);
9767 addReply(c,shared.ok);
9768 } else {
9769 addReplySds(c,sdscatprintf(sdsempty(),
9770 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9771 }
9772 return;
9773
9774 badarity:
9775 addReplySds(c,sdscatprintf(sdsempty(),
9776 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9777 (char*) c->argv[1]->ptr));
9778 }
9779
9780 /* =========================== Pubsub implementation ======================== */
9781
9782 static void freePubsubPattern(void *p) {
9783 pubsubPattern *pat = p;
9784
9785 decrRefCount(pat->pattern);
9786 zfree(pat);
9787 }
9788
9789 static int listMatchPubsubPattern(void *a, void *b) {
9790 pubsubPattern *pa = a, *pb = b;
9791
9792 return (pa->client == pb->client) &&
9793 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9794 }
9795
9796 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9797 * 0 if the client was already subscribed to that channel. */
9798 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9799 struct dictEntry *de;
9800 list *clients = NULL;
9801 int retval = 0;
9802
9803 /* Add the channel to the client -> channels hash table */
9804 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9805 retval = 1;
9806 incrRefCount(channel);
9807 /* Add the client to the channel -> list of clients hash table */
9808 de = dictFind(server.pubsub_channels,channel);
9809 if (de == NULL) {
9810 clients = listCreate();
9811 dictAdd(server.pubsub_channels,channel,clients);
9812 incrRefCount(channel);
9813 } else {
9814 clients = dictGetEntryVal(de);
9815 }
9816 listAddNodeTail(clients,c);
9817 }
9818 /* Notify the client */
9819 addReply(c,shared.mbulk3);
9820 addReply(c,shared.subscribebulk);
9821 addReplyBulk(c,channel);
9822 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9823 return retval;
9824 }
9825
9826 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9827 * 0 if the client was not subscribed to the specified channel. */
9828 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9829 struct dictEntry *de;
9830 list *clients;
9831 listNode *ln;
9832 int retval = 0;
9833
9834 /* Remove the channel from the client -> channels hash table */
9835 incrRefCount(channel); /* channel may be just a pointer to the same object
9836 we have in the hash tables. Protect it... */
9837 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9838 retval = 1;
9839 /* Remove the client from the channel -> clients list hash table */
9840 de = dictFind(server.pubsub_channels,channel);
9841 assert(de != NULL);
9842 clients = dictGetEntryVal(de);
9843 ln = listSearchKey(clients,c);
9844 assert(ln != NULL);
9845 listDelNode(clients,ln);
9846 if (listLength(clients) == 0) {
9847 /* Free the list and associated hash entry at all if this was
9848 * the latest client, so that it will be possible to abuse
9849 * Redis PUBSUB creating millions of channels. */
9850 dictDelete(server.pubsub_channels,channel);
9851 }
9852 }
9853 /* Notify the client */
9854 if (notify) {
9855 addReply(c,shared.mbulk3);
9856 addReply(c,shared.unsubscribebulk);
9857 addReplyBulk(c,channel);
9858 addReplyLong(c,dictSize(c->pubsub_channels)+
9859 listLength(c->pubsub_patterns));
9860
9861 }
9862 decrRefCount(channel); /* it is finally safe to release it */
9863 return retval;
9864 }
9865
9866 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9867 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9868 int retval = 0;
9869
9870 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9871 retval = 1;
9872 pubsubPattern *pat;
9873 listAddNodeTail(c->pubsub_patterns,pattern);
9874 incrRefCount(pattern);
9875 pat = zmalloc(sizeof(*pat));
9876 pat->pattern = getDecodedObject(pattern);
9877 pat->client = c;
9878 listAddNodeTail(server.pubsub_patterns,pat);
9879 }
9880 /* Notify the client */
9881 addReply(c,shared.mbulk3);
9882 addReply(c,shared.psubscribebulk);
9883 addReplyBulk(c,pattern);
9884 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9885 return retval;
9886 }
9887
9888 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9889 * 0 if the client was not subscribed to the specified channel. */
9890 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9891 listNode *ln;
9892 pubsubPattern pat;
9893 int retval = 0;
9894
9895 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9896 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9897 retval = 1;
9898 listDelNode(c->pubsub_patterns,ln);
9899 pat.client = c;
9900 pat.pattern = pattern;
9901 ln = listSearchKey(server.pubsub_patterns,&pat);
9902 listDelNode(server.pubsub_patterns,ln);
9903 }
9904 /* Notify the client */
9905 if (notify) {
9906 addReply(c,shared.mbulk3);
9907 addReply(c,shared.punsubscribebulk);
9908 addReplyBulk(c,pattern);
9909 addReplyLong(c,dictSize(c->pubsub_channels)+
9910 listLength(c->pubsub_patterns));
9911 }
9912 decrRefCount(pattern);
9913 return retval;
9914 }
9915
9916 /* Unsubscribe from all the channels. Return the number of channels the
9917 * client was subscribed from. */
9918 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9919 dictIterator *di = dictGetIterator(c->pubsub_channels);
9920 dictEntry *de;
9921 int count = 0;
9922
9923 while((de = dictNext(di)) != NULL) {
9924 robj *channel = dictGetEntryKey(de);
9925
9926 count += pubsubUnsubscribeChannel(c,channel,notify);
9927 }
9928 dictReleaseIterator(di);
9929 return count;
9930 }
9931
9932 /* Unsubscribe from all the patterns. Return the number of patterns the
9933 * client was subscribed from. */
9934 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9935 listNode *ln;
9936 listIter li;
9937 int count = 0;
9938
9939 listRewind(c->pubsub_patterns,&li);
9940 while ((ln = listNext(&li)) != NULL) {
9941 robj *pattern = ln->value;
9942
9943 count += pubsubUnsubscribePattern(c,pattern,notify);
9944 }
9945 return count;
9946 }
9947
9948 /* Publish a message */
9949 static int pubsubPublishMessage(robj *channel, robj *message) {
9950 int receivers = 0;
9951 struct dictEntry *de;
9952 listNode *ln;
9953 listIter li;
9954
9955 /* Send to clients listening for that channel */
9956 de = dictFind(server.pubsub_channels,channel);
9957 if (de) {
9958 list *list = dictGetEntryVal(de);
9959 listNode *ln;
9960 listIter li;
9961
9962 listRewind(list,&li);
9963 while ((ln = listNext(&li)) != NULL) {
9964 redisClient *c = ln->value;
9965
9966 addReply(c,shared.mbulk3);
9967 addReply(c,shared.messagebulk);
9968 addReplyBulk(c,channel);
9969 addReplyBulk(c,message);
9970 receivers++;
9971 }
9972 }
9973 /* Send to clients listening to matching channels */
9974 if (listLength(server.pubsub_patterns)) {
9975 listRewind(server.pubsub_patterns,&li);
9976 channel = getDecodedObject(channel);
9977 while ((ln = listNext(&li)) != NULL) {
9978 pubsubPattern *pat = ln->value;
9979
9980 if (stringmatchlen((char*)pat->pattern->ptr,
9981 sdslen(pat->pattern->ptr),
9982 (char*)channel->ptr,
9983 sdslen(channel->ptr),0)) {
9984 addReply(pat->client,shared.mbulk4);
9985 addReply(pat->client,shared.pmessagebulk);
9986 addReplyBulk(pat->client,pat->pattern);
9987 addReplyBulk(pat->client,channel);
9988 addReplyBulk(pat->client,message);
9989 receivers++;
9990 }
9991 }
9992 decrRefCount(channel);
9993 }
9994 return receivers;
9995 }
9996
9997 static void subscribeCommand(redisClient *c) {
9998 int j;
9999
10000 for (j = 1; j < c->argc; j++)
10001 pubsubSubscribeChannel(c,c->argv[j]);
10002 }
10003
10004 static void unsubscribeCommand(redisClient *c) {
10005 if (c->argc == 1) {
10006 pubsubUnsubscribeAllChannels(c,1);
10007 return;
10008 } else {
10009 int j;
10010
10011 for (j = 1; j < c->argc; j++)
10012 pubsubUnsubscribeChannel(c,c->argv[j],1);
10013 }
10014 }
10015
10016 static void psubscribeCommand(redisClient *c) {
10017 int j;
10018
10019 for (j = 1; j < c->argc; j++)
10020 pubsubSubscribePattern(c,c->argv[j]);
10021 }
10022
10023 static void punsubscribeCommand(redisClient *c) {
10024 if (c->argc == 1) {
10025 pubsubUnsubscribeAllPatterns(c,1);
10026 return;
10027 } else {
10028 int j;
10029
10030 for (j = 1; j < c->argc; j++)
10031 pubsubUnsubscribePattern(c,c->argv[j],1);
10032 }
10033 }
10034
10035 static void publishCommand(redisClient *c) {
10036 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10037 addReplyLong(c,receivers);
10038 }
10039
10040 /* ================================= Debugging ============================== */
10041
10042 static void debugCommand(redisClient *c) {
10043 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10044 *((char*)-1) = 'x';
10045 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10046 if (rdbSave(server.dbfilename) != REDIS_OK) {
10047 addReply(c,shared.err);
10048 return;
10049 }
10050 emptyDb();
10051 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10052 addReply(c,shared.err);
10053 return;
10054 }
10055 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10056 addReply(c,shared.ok);
10057 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10058 emptyDb();
10059 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10060 addReply(c,shared.err);
10061 return;
10062 }
10063 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10064 addReply(c,shared.ok);
10065 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10066 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10067 robj *key, *val;
10068
10069 if (!de) {
10070 addReply(c,shared.nokeyerr);
10071 return;
10072 }
10073 key = dictGetEntryKey(de);
10074 val = dictGetEntryVal(de);
10075 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10076 key->storage == REDIS_VM_SWAPPING)) {
10077 char *strenc;
10078 char buf[128];
10079
10080 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10081 strenc = strencoding[val->encoding];
10082 } else {
10083 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10084 strenc = buf;
10085 }
10086 addReplySds(c,sdscatprintf(sdsempty(),
10087 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10088 "encoding:%s serializedlength:%lld\r\n",
10089 (void*)key, key->refcount, (void*)val, val->refcount,
10090 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10091 } else {
10092 addReplySds(c,sdscatprintf(sdsempty(),
10093 "+Key at:%p refcount:%d, value swapped at: page %llu "
10094 "using %llu pages\r\n",
10095 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10096 (unsigned long long) key->vm.usedpages));
10097 }
10098 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10099 lookupKeyRead(c->db,c->argv[2]);
10100 addReply(c,shared.ok);
10101 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10102 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10103 robj *key, *val;
10104
10105 if (!server.vm_enabled) {
10106 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10107 return;
10108 }
10109 if (!de) {
10110 addReply(c,shared.nokeyerr);
10111 return;
10112 }
10113 key = dictGetEntryKey(de);
10114 val = dictGetEntryVal(de);
10115 /* If the key is shared we want to create a copy */
10116 if (key->refcount > 1) {
10117 robj *newkey = dupStringObject(key);
10118 decrRefCount(key);
10119 key = dictGetEntryKey(de) = newkey;
10120 }
10121 /* Swap it */
10122 if (key->storage != REDIS_VM_MEMORY) {
10123 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10124 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10125 dictGetEntryVal(de) = NULL;
10126 addReply(c,shared.ok);
10127 } else {
10128 addReply(c,shared.err);
10129 }
10130 } else {
10131 addReplySds(c,sdsnew(
10132 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10133 }
10134 }
10135
10136 static void _redisAssert(char *estr, char *file, int line) {
10137 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10138 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
10139 #ifdef HAVE_BACKTRACE
10140 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10141 *((char*)-1) = 'x';
10142 #endif
10143 }
10144
10145 static void _redisPanic(char *msg, char *file, int line) {
10146 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10147 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10148 #ifdef HAVE_BACKTRACE
10149 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10150 *((char*)-1) = 'x';
10151 #endif
10152 }
10153
10154 /* =================================== Main! ================================ */
10155
10156 #ifdef __linux__
/* Read the current overcommit policy from /proc/sys/vm/overcommit_memory.
 * Returns the integer found on the first line of the file, or -1 if the
 * file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return value;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10170
10171 void linuxOvercommitMemoryWarning(void) {
10172 if (linuxOvercommitMemoryValue() == 0) {
10173 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10174 }
10175 }
10176 #endif /* __linux__ */
10177
10178 static void daemonize(void) {
10179 int fd;
10180 FILE *fp;
10181
10182 if (fork() != 0) exit(0); /* parent exits */
10183 setsid(); /* create a new session */
10184
10185 /* Every output goes to /dev/null. If Redis is daemonized but
10186 * the 'logfile' is set to 'stdout' in the configuration file
10187 * it will not log at all. */
10188 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10189 dup2(fd, STDIN_FILENO);
10190 dup2(fd, STDOUT_FILENO);
10191 dup2(fd, STDERR_FILENO);
10192 if (fd > STDERR_FILENO) close(fd);
10193 }
10194 /* Try to write the pid file */
10195 fp = fopen(server.pidfile,"w");
10196 if (fp) {
10197 fprintf(fp,"%d\n",getpid());
10198 fclose(fp);
10199 }
10200 }
10201
10202 static void version() {
10203 printf("Redis server version %s\n", REDIS_VERSION);
10204 exit(0);
10205 }
10206
/* Print the command line synopsis on the standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10212
10213 int main(int argc, char **argv) {
10214 time_t start;
10215
10216 initServerConfig();
10217 if (argc == 2) {
10218 if (strcmp(argv[1], "-v") == 0 ||
10219 strcmp(argv[1], "--version") == 0) version();
10220 if (strcmp(argv[1], "--help") == 0) usage();
10221 resetServerSaveParams();
10222 loadServerConfig(argv[1]);
10223 } else if ((argc > 2)) {
10224 usage();
10225 } else {
10226 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10227 }
10228 if (server.daemonize) daemonize();
10229 initServer();
10230 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10231 #ifdef __linux__
10232 linuxOvercommitMemoryWarning();
10233 #endif
10234 start = time(NULL);
10235 if (server.appendonly) {
10236 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10237 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10238 } else {
10239 if (rdbLoad(server.dbfilename) == REDIS_OK)
10240 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10241 }
10242 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10243 aeSetBeforeSleepProc(server.el,beforeSleep);
10244 aeMain(server.el);
10245 aeDeleteEventLoop(server.el);
10246 return 0;
10247 }
10248
10249 /* ============================= Backtrace support ========================= */
10250
10251 #ifdef HAVE_BACKTRACE
10252 static char *findFuncName(void *pointer, unsigned long *offset);
10253
10254 static void *getMcontextEip(ucontext_t *uc) {
10255 #if defined(__FreeBSD__)
10256 return (void*) uc->uc_mcontext.mc_eip;
10257 #elif defined(__dietlibc__)
10258 return (void*) uc->uc_mcontext.eip;
10259 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10260 #if __x86_64__
10261 return (void*) uc->uc_mcontext->__ss.__rip;
10262 #else
10263 return (void*) uc->uc_mcontext->__ss.__eip;
10264 #endif
10265 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10266 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10267 return (void*) uc->uc_mcontext->__ss.__rip;
10268 #else
10269 return (void*) uc->uc_mcontext->__ss.__eip;
10270 #endif
10271 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10272 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10273 #elif defined(__ia64__) /* Linux IA64 */
10274 return (void*) uc->uc_mcontext.sc_ip;
10275 #else
10276 return NULL;
10277 #endif
10278 }
10279
10280 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10281 void *trace[100];
10282 char **messages = NULL;
10283 int i, trace_size = 0;
10284 unsigned long offset=0;
10285 ucontext_t *uc = (ucontext_t*) secret;
10286 sds infostring;
10287 REDIS_NOTUSED(info);
10288
10289 redisLog(REDIS_WARNING,
10290 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10291 infostring = genRedisInfoString();
10292 redisLog(REDIS_WARNING, "%s",infostring);
10293 /* It's not safe to sdsfree() the returned string under memory
10294 * corruption conditions. Let it leak as we are going to abort */
10295
10296 trace_size = backtrace(trace, 100);
10297 /* overwrite sigaction with caller's address */
10298 if (getMcontextEip(uc) != NULL) {
10299 trace[1] = getMcontextEip(uc);
10300 }
10301 messages = backtrace_symbols(trace, trace_size);
10302
10303 for (i=1; i<trace_size; ++i) {
10304 char *fn = findFuncName(trace[i], &offset), *p;
10305
10306 p = strchr(messages[i],'+');
10307 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10308 redisLog(REDIS_WARNING,"%s", messages[i]);
10309 } else {
10310 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10311 }
10312 }
10313 /* free(messages); Don't call free() with possibly corrupted memory. */
10314 _exit(0);
10315 }
10316
10317 static void setupSigSegvAction(void) {
10318 struct sigaction act;
10319
10320 sigemptyset (&act.sa_mask);
10321 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10322 * is used. Otherwise, sa_handler is used */
10323 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10324 act.sa_sigaction = segvHandler;
10325 sigaction (SIGSEGV, &act, NULL);
10326 sigaction (SIGBUS, &act, NULL);
10327 sigaction (SIGFPE, &act, NULL);
10328 sigaction (SIGILL, &act, NULL);
10329 sigaction (SIGBUS, &act, NULL);
10330 return;
10331 }
10332
10333 #include "staticsymbols.h"
10334 /* This function try to convert a pointer into a function name. It's used in
10335 * oreder to provide a backtrace under segmentation fault that's able to
10336 * display functions declared as static (otherwise the backtrace is useless). */
10337 static char *findFuncName(void *pointer, unsigned long *offset){
10338 int i, ret = -1;
10339 unsigned long off, minoff = 0;
10340
10341 /* Try to match against the Symbol with the smallest offset */
10342 for (i=0; symsTable[i].pointer; i++) {
10343 unsigned long lp = (unsigned long) pointer;
10344
10345 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10346 off=lp-symsTable[i].pointer;
10347 if (ret < 0 || off < minoff) {
10348 minoff=off;
10349 ret=i;
10350 }
10351 }
10352 }
10353 if (ret == -1) return NULL;
10354 *offset = minoff;
10355 return symsTable[ret].name;
10356 }
10357 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no handler to install,
 * so this variant does nothing. */
static void setupSigSegvAction(void) {
    return;
}
10360 #endif /* HAVE_BACKTRACE */
10361
10362
10363
10364 /* The End */
10365
10366
10367