]> git.saurik.com Git - redis.git/blob - redis.c
39da764749d19d71eb9d0ab0f2f6fa50c1e39d78
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source. */
#define REDIS_VERSION "1.3.10"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <math.h>
61 #include <pthread.h>
62
63 #if defined(__sun)
64 #include "solarisfixes.h"
65 #endif
66
67 #include "redis.h"
68 #include "ae.h" /* Event driven programming library */
69 #include "sds.h" /* Dynamic safe strings */
70 #include "anet.h" /* Networking the easy way */
71 #include "dict.h" /* Hash tables */
72 #include "adlist.h" /* Linked lists */
73 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
74 #include "lzf.h" /* LZF compression library */
75 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
76 #include "zipmap.h"
77
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR -1

/* Static server configuration */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */

/* Command flags (bit values, combined with '|' in the command table) */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */

/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4

/* Objects encoding. Some kinds of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
#define REDIS_ENCODING_RAW 0 /* Raw representation */
#define REDIS_ENCODING_INT 1 /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */

/* Human readable names of the encodings above; the entries are listed in the
 * same order as the REDIS_ENCODING_* values (0..3). */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
132
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: a specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
164
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0 /* The object is in memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
182
/* Client flags (bit values, stored in the 'flags' field of redisClient) */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master.
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits for bgsave to start before feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits for bgsave before starting bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */

/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V as a void expression so that an otherwise
 * unused variable/parameter counts as used. */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): if the expression _e is false, report it through
 * _redisAssert() (stringified expression, file and line) and terminate the
 * process with _exit(1).
 *
 * redisPanic(_e): unconditionally report through _redisPanic() and terminate
 * the process with _exit(1). The argument is stringified with '#', so a
 * string literal argument is reported including its quotes.
 *
 * Both macros expand to a single fully parenthesized expression, so they
 * behave as one operand wherever they are used (e.g. inside a conditional
 * expression or next to other operators) instead of leaking a bare comma
 * operator into the caller's expression. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure: where a swapped object lives on disk and when it
 * was last accessed. It is embedded as the 'vm' member of struct redisObject.
 *
 * Only the type is declared here. The original declaration also carried a
 * trailing 'vm' declarator, which defined a stray file-scope object of this
 * type with external linkage; that accidental global is not defined anymore. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
252
/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;
    unsigned char type;     /* presumably one of the REDIS_STRING, REDIS_LIST,
                             * ... object types -- confirm against users */
    unsigned char encoding; /* presumably one of REDIS_ENCODING_* -- confirm */
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype;    /* If this object is a key, and value is swapped out,
                             * this is the type of the swapped out object. */
    int refcount;
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead.
     * For this reason 'vm' must remain the last member. */
    struct redisObjectVM vm;
} robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (an lvalue, not a pointer); _ptr is the value
 * stored in its 'ptr' field. The object is set up as a raw-encoded string
 * with a reference count of 1. 'storage' is only written when
 * server.vm_enabled is true; the 'vm' member is never touched.
 *
 * Both arguments are parenthesized so that expressions such as *p or
 * buf+1 can be passed safely, and the expansion does not end with ';' so
 * that "initStaticStringObject(o,p);" is exactly one statement and can be
 * used as the body of an if/else without braces. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
/* A single Redis database: the keyspace plus the auxiliary per-database
 * dictionaries listed below. */
typedef struct redisDb {
    dict *dict;         /* The keyspace for this DB */
    dict *expires;      /* Timeout of keys with a timeout set */
    dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
    dict *io_keys;      /* Keys with clients waiting for VM I/O */
    int id;
} redisDb;
289
/* Client MULTI/EXEC state */

/* One command recorded in a MULTI context: its argument vector, the number
 * of arguments, and the command table entry it refers to. */
typedef struct multiCmd {
    robj **argv;
    int argc;
    struct redisCommand *cmd;
} multiCmd;
296
/* The set of commands accumulated by a client in a MULTI context. */
typedef struct multiState {
    multiCmd *commands; /* Array of MULTI commands */
    int count;          /* Total number of MULTI commands */
} multiState;
301
/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
typedef struct redisClient {
    int fd;
    redisDb *db;
    int dictid;
    sds querybuf;
    robj **argv, **mbargv;
    int argc, mbargc;
    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
    int multibulk;          /* multi bulk command format active */
    list *reply;
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
    int slaveseldb;         /* slave selected db, if this client is a slave */
    int authenticated;      /* when requirepass is non-NULL */
    int replstate;          /* replication state if this is a slave */
    int repldbfd;           /* replication DB file descriptor */
    long repldboff;         /* replication DB file offset */
    off_t repldbsize;       /* replication DB file size */
    multiState mstate;      /* MULTI/EXEC state */
    robj **blockingkeys;    /* The keys we are waiting on to terminate a blocking
                             * operation such as BLPOP. Otherwise NULL. */
    int blockingkeysnum;    /* Number of blocking keys */
    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
                             * is >= blockingto then the operation timed out. */
    list *io_keys;          /* Keys this client is waiting to be loaded from the
                             * swap file in order to continue. */
    dict *pubsub_channels;  /* channels a client is interested in (SUBSCRIBE) */
    list *pubsub_patterns;  /* patterns a client is interested in (SUBSCRIBE) */
} redisClient;
334
/* A (seconds, changes) pair; struct redisServer holds an array of these in
 * its 'saveparams' field. NOTE(review): presumably one entry per configured
 * save point -- confirm against the code that fills the array. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
/* Global server state structure */
struct redisServer {
    int port;
    int fd;
    redisDb *db;
    long long dirty;            /* changes to DB from the last save */
    list *clients;
    list *slaves, *monitors;
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops;              /* number of times the cron function run */
    list *objfreelist;          /* A list of freed objects to avoid malloc() */
    time_t lastsave;            /* Unix time of last successful save */
    /* Fields used only for stats */
    time_t stat_starttime;          /* server start time */
    long long stat_numcommands;     /* number of processed commands */
    long long stat_numconnections;  /* number of connections received */
    long long stat_expiredkeys;     /* number of expired keys */
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
    int appendonly;
    int appendfsync;
    time_t lastfsync;
    int appendfd;
    int appendseldb;
    char *pidfile;
    pid_t bgsavechildpid;
    pid_t bgrewritechildpid;
    sds bgrewritebuf; /* buffer kept by parent during append only rewrite */
    sds aofbuf;       /* AOF buffer, written before entering the event loop */
    struct saveparam *saveparams;
    int saveparamslen;
    char *logfile;
    char *bindaddr;
    char *dbfilename;
    char *appendfilename;
    char *requirepass;
    int rdbcompression;
    int activerehashing;
    /* Replication related */
    int isslave;
    char *masterauth;
    char *masterhost;
    int masterport;
    redisClient *master;    /* client that is master for this slave */
    int replstate;
    unsigned int maxclients;
    unsigned long long maxmemory;
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to keep this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
    /* Virtual memory configuration */
    int vm_enabled;
    char *vm_swap_file;
    off_t vm_page_size;
    off_t vm_pages;
    unsigned long long vm_max_memory;
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page; /* Next probably empty page */
    off_t vm_near_pages; /* Number of pages allocated sequentially */
    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
    time_t unixtime;    /* Unix time sampled every second. */
    /* Virtual memory I/O threads stuff */
    /* An I/O thread processes an element taken from the io_jobs queue and
     * puts the result of the operation in the io_done list. While the
     * job is being processed, it's put on the io_processing queue. */
    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
    list *io_processing; /* List of VM I/O jobs being processed */
    list *io_processed; /* List of VM I/O jobs already processed */
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
    pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
    pthread_attr_t io_threads_attr; /* attributes for threads creation */
    int io_active_threads; /* Number of running I/O threads */
    int vm_max_threads; /* Max number of I/O threads running at the same time */
    /* Our main thread is blocked on the event loop, looking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * awake the main thread. The following are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
    /* Pubsub */
    dict *pubsub_channels; /* Map channels to list of subscribed clients */
    list *pubsub_patterns; /* A list of pubsub_patterns */
    /* Misc */
    FILE *devnull;
};
446
/* Pairs a client with a pattern object. NOTE(review): presumably the element
 * type of the 'pubsub_patterns' lists -- confirm against the list users. */
typedef struct pubsubPattern {
    redisClient *client;
    robj *pattern;
} pubsubPattern;
451
/* Signature shared by every command implementation: it receives the client
 * that issued the command and returns nothing. */
typedef void redisCommandProc(redisClient *c);

/* One entry of the command table. */
struct redisCommand {
    char *name;
    redisCommandProc *proc;
    int arity;
    int flags;          /* REDIS_CMD_* flags, OR-ed together */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisCommandProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
    int vm_lastkey; /* The last argument that's a key */
    int vm_keystep; /* The step between first and last key */
};
467
/* Associates a name with an address stored as an unsigned long.
 * NOTE(review): presumably used to map addresses back to function names
 * when printing stack traces -- confirm against the users of this struct. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
472
/* An object together with the value it is compared by: either a numeric
 * score or another object (the two share storage through the union).
 * NOTE(review): presumably the element type of the vector sorted by SORT --
 * confirm against sortCommand(). */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;
        robj *cmpobj;
    } u;
} redisSortObject;

/* A typed operation with an associated pattern object.
 * NOTE(review): 'type' presumably holds a REDIS_SORT_* value -- confirm. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;
485
/* ZSETs use a specialized version of Skiplists */

/* A skiplist node: an object with its score, a pointer to the previous node,
 * and per-level arrays of forward pointers and spans. */
typedef struct zskiplistNode {
    struct zskiplistNode **forward;
    struct zskiplistNode *backward;
    unsigned int *span;
    double score;
    robj *obj;
} zskiplistNode;

/* The skiplist itself: first and last node, element count and level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;

/* A sorted set is a dictionary paired with a skiplist. */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;
506
/* Our shared "common" objects */

#define REDIS_SHARED_INTEGERS 10000
/* Pointers to pre-built objects, plus an array of REDIS_SHARED_INTEGERS
 * integer objects. The single instance is the global 'shared' defined by
 * this declaration. */
struct sharedObjectsStruct {
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
    *colon, *nullbulk, *nullmultibulk, *queued,
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
    *select0, *select1, *select2, *select3, *select4,
    *select5, *select6, *select7, *select8, *select9,
    *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
    *mbulk4, *psubscribebulk, *punsubscribebulk,
    *integers[REDIS_SHARED_INTEGERS];
} shared;
521
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
527
/* VM threaded I/O request message */
#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
typedef struct iojob {
    int type;   /* Request type, REDIS_IOJOB_* */
    redisDb *db;/* Redis database */
    robj *key;  /* This I/O request is about swapping this key */
    robj *val;  /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
                 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
    off_t page; /* Swap page where to read/write the object */
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
543
/*================================ Prototypes =============================== */

/* Forward declarations of file-local (static) helper functions, so they can
 * be referenced before their definitions appear. */
static void freeStringObject(robj *o);
static void freeListObject(robj *o);
static void freeSetObject(robj *o);
static void decrRefCount(void *o);
static robj *createObject(int type, void *ptr);
static void freeClient(redisClient *c);
static int rdbLoad(char *filename);
static void addReply(redisClient *c, robj *obj);
static void addReplySds(redisClient *c, sds s);
static void incrRefCount(robj *o);
static int rdbSaveBackground(char *filename);
static robj *createStringObject(char *ptr, size_t len);
static robj *dupStringObject(robj *o);
static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
static void flushAppendOnlyFile(void);
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
static int syncWithMaster(void);
static robj *tryObjectEncoding(robj *o);
static robj *getDecodedObject(robj *o);
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
static int deleteIfSwapped(redisDb *db, robj *key);
static int deleteKey(redisDb *db, robj *key);
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
static void updateSlavesWaitingBgsave(int bgsaveerr);
static void freeMemoryIfNeeded(void);
static int processCommand(redisClient *c);
static void setupSigSegvAction(void);
static void rdbRemoveTempFile(pid_t childpid);
static void aofRemoveTempFile(pid_t childpid);
static size_t stringObjectLen(robj *o);
static void processInputBuffer(redisClient *c);
static zskiplist *zslCreate(void);
static void zslFree(zskiplist *zsl);
static void zslInsert(zskiplist *zsl, double score, robj *obj);
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
static void initClientMultiState(redisClient *c);
static void freeClientMultiState(redisClient *c);
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
static void unblockClientWaitingData(redisClient *c);
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
static void vmInit(void);
static void vmMarkPagesFree(off_t page, off_t count);
static robj *vmLoadObject(robj *key);
static robj *vmPreviewObject(robj *key);
static int vmSwapOneObjectBlocking(void);
static int vmSwapOneObjectThreaded(void);
static int vmCanSwapOut(void);
static int tryFreeOneObjectFromFreelist(void);
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
static void lockThreadedIO(void);
static void unlockThreadedIO(void);
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
static void freeIOJob(iojob *j);
static void queueIOJob(iojob *j);
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
static void waitEmptyIOJobsQueue(void);
static void vmReopenSwapFile(void);
static int vmFreePage(off_t page);
static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
static int dontWaitForSwappedKey(redisClient *c, robj *key);
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
static struct redisCommand *lookupCommand(char *name);
static void call(redisClient *c, struct redisCommand *cmd);
static void resetClient(redisClient *c);
static void convertToRealHash(robj *o);
static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
static void freePubsubPattern(void *p);
static int listMatchPubsubPattern(void *a, void *b);
static int compareStringObjects(robj *a, robj *b);
static int equalStringObjects(robj *a, robj *b);
626 static void usage();
627 static int rewriteAppendOnlyFileBackground(void);
628 static int vmSwapObjectBlocking(robj *key, robj *val);
629
/* Forward declarations of the command implementations. Every function has
 * the redisCommandProc signature (void f(redisClient *c)), so its address can
 * be stored in the 'proc' field of a struct redisCommand entry (see cmdTable).
 * NOTE(review): rpoplpushcommand is spelled with a lowercase 'c', unlike the
 * other *Command names; renaming it requires changing its definition and the
 * command table entry at the same time. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void setexCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void zrevrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hsetnxCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void hmsetCommand(redisClient *c);
static void hmgetCommand(redisClient *c);
static void hdelCommand(redisClient *c);
static void hlenCommand(redisClient *c);
static void zremrangebyrankCommand(redisClient *c);
static void zunionCommand(redisClient *c);
static void zinterCommand(redisClient *c);
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
static void hexistsCommand(redisClient *c);
static void configCommand(redisClient *c);
static void hincrbyCommand(redisClient *c);
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
static void psubscribeCommand(redisClient *c);
static void punsubscribeCommand(redisClient *c);
static void publishCommand(redisClient *c);
735
736 /*================================= Globals ================================= */
737
/* Global vars */
/* The single, file-local instance of the server state. Macros in this file
 * such as initStaticStringObject() read it directly (server.vm_enabled). */
static struct redisServer server; /* server global state */
/* Table of every command the server knows, terminated by an all-NULL/zero
 * sentinel entry. Each initializer lists, in order:
 *   - the command name as typed by the client,
 *   - the handler function,
 *   - an integer that looks like the expected argument count (presumably a
 *     negative value means "at least that many" -- confirm against the
 *     definition of struct redisCommand),
 *   - the REDIS_CMD_* flags (INLINE/BULK, DENYOOM, FORCE_REPLICATION),
 *   - an optional callback (only ZUNION/ZINTER set one,
 *     zunionInterBlockClientOnSwappedKeys; everything else passes NULL),
 *   - three integers that presumably describe where the key arguments sit
 *     (first, last, step) -- TODO confirm against struct redisCommand. */
static struct redisCommand cmdTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    /* SMEMBERS reuses the SINTER handler with a single key. */
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    /* Sentinel: marks the end of the table. */
    {NULL,NULL,0,0,NULL,0,0,0}
};
849
850 /*============================ Utility functions ============================ */
851
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of bytes (including none)
 *   ?        matches exactly one byte
 *   [abc]    matches one byte from the set; [^abc] negates the set;
 *            [a-z] is a range; a backslash escapes the next set member
 *   \x       matches the byte x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * needs to be NUL terminated and no byte outside the given lengths is read.
 * When nocase is non-zero literal bytes and ranges are compared
 * case-insensitively (escaped bytes inside a set are always compared
 * exactly).
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*'; only peek while another byte exists. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* <ctype.h> functions require unsigned char values. */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a byte to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is exhausted: trailing stars still match. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
974
/* Glob-match two NUL-terminated strings: measures both with strlen() and
 * delegates to stringmatchlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
978
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1Gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * case-insensitive unit: none or "b" (bytes), "k", "m", "g" (powers of
 * 1000), "kb", "mb", "gb" (powers of 1024).
 *
 * If err is not NULL, *err is set to 1 on error and to 0 otherwise. With an
 * unknown unit the numeric part is still returned (multiplier 1). When the
 * numeric part is too long, or the result does not fit in a long long, the
 * value saturates to LLONG_MAX (LLONG_MIN for negative overflow). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: check the range before multiplying, since signed
     * overflow is undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1025
1026 static void redisLog(int level, const char *fmt, ...) {
1027 va_list ap;
1028 FILE *fp;
1029
1030 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1031 if (!fp) return;
1032
1033 va_start(ap, fmt);
1034 if (level >= server.verbosity) {
1035 char *c = ".-*#";
1036 char buf[64];
1037 time_t now;
1038
1039 now = time(NULL);
1040 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1041 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1042 vfprintf(fp, fmt, ap);
1043 fprintf(fp,"\n");
1044 fflush(fp);
1045 }
1046 va_end(ap);
1047
1048 if (server.logfile) fclose(fp);
1049 }
1050
1051 /*====================== Hash table type implementation ==================== */
1052
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1056
/* Dict destructor callback that simply releases val with zfree().
 * privdata is unused. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1062
1063 static void dictListDestructor(void *privdata, void *val)
1064 {
1065 DICT_NOTUSED(privdata);
1066 listRelease((list*)val);
1067 }
1068
1069 static int sdsDictKeyCompare(void *privdata, const void *key1,
1070 const void *key2)
1071 {
1072 int l1,l2;
1073 DICT_NOTUSED(privdata);
1074
1075 l1 = sdslen((sds)key1);
1076 l2 = sdslen((sds)key2);
1077 if (l1 != l2) return 0;
1078 return memcmp(key1, key2, l1) == 0;
1079 }
1080
1081 static void dictRedisObjectDestructor(void *privdata, void *val)
1082 {
1083 DICT_NOTUSED(privdata);
1084
1085 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
1086 decrRefCount(val);
1087 }
1088
1089 static int dictObjKeyCompare(void *privdata, const void *key1,
1090 const void *key2)
1091 {
1092 const robj *o1 = key1, *o2 = key2;
1093 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1094 }
1095
1096 static unsigned int dictObjHash(const void *key) {
1097 const robj *o = key;
1098 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1099 }
1100
1101 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1102 const void *key2)
1103 {
1104 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1105 int cmp;
1106
1107 if (o1->encoding == REDIS_ENCODING_INT &&
1108 o2->encoding == REDIS_ENCODING_INT)
1109 return o1->ptr == o2->ptr;
1110
1111 o1 = getDecodedObject(o1);
1112 o2 = getDecodedObject(o2);
1113 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1114 decrRefCount(o1);
1115 decrRefCount(o2);
1116 return cmp;
1117 }
1118
1119 static unsigned int dictEncObjHash(const void *key) {
1120 robj *o = (robj*) key;
1121
1122 if (o->encoding == REDIS_ENCODING_RAW) {
1123 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1124 } else {
1125 if (o->encoding == REDIS_ENCODING_INT) {
1126 char buf[32];
1127 int len;
1128
1129 len = snprintf(buf,32,"%ld",(long)o->ptr);
1130 return dictGenHashFunction((unsigned char*)buf, len);
1131 } else {
1132 unsigned int hash;
1133
1134 o = getDecodedObject(o);
1135 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1136 decrRefCount(o);
1137 return hash;
1138 }
1139 }
1140 }
1141
/* Sets type and expires.
 * Keys are possibly-encoded redis objects (hashed and compared through the
 * dictEncObj* callbacks) and are released with dictRedisObjectDestructor;
 * values have no destructor. */
static dictType setDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    NULL                        /* val destructor */
};
1151
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are possibly-encoded redis objects; values are released with
 * dictVanillaFree, i.e. a plain zfree() of the stored pointer. */
static dictType zsetDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictVanillaFree             /* val destructor of malloc(sizeof(double)) */
};
1161
/* Db->dict: the main keyspace of a database.
 * Keys are hashed and compared through their ->ptr SDS string
 * (dictObjHash / dictObjKeyCompare); both keys and values are redis objects
 * released with dictRedisObjectDestructor. */
static dictType dbDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1171
/* Db->expires.
 * Same key handling as dbDictType, but values have no destructor
 * (serverCron reads them back with a cast to time_t). */
static dictType keyptrDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    NULL                        /* val destructor */
};
1181
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both fields (keys) and values are redis objects released with
 * dictRedisObjectDestructor; keys may be encoded, hence the dictEncObj*
 * callbacks. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1191
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Values are released with dictListDestructor, i.e. listRelease(). */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1203
1204 static void version();
1205
1206 /* ========================= Random utility functions ======================= */
1207
1208 /* Redis generally does not try to recover from out of memory conditions
1209 * when allocating objects or strings, it is not clear if it will be possible
1210 * to report this condition to the client since the networking layer itself
1211 * is based on heap allocation for send buffers, so we simply abort.
1212 * At least the code will be simpler to read... */
1213 static void oom(const char *msg) {
1214 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1215 sleep(1);
1216 abort();
1217 }
1218
1219 /* ====================== Redis server networking stuff ===================== */
1220 static void closeTimedoutClients(void) {
1221 redisClient *c;
1222 listNode *ln;
1223 time_t now = time(NULL);
1224 listIter li;
1225
1226 listRewind(server.clients,&li);
1227 while ((ln = listNext(&li)) != NULL) {
1228 c = listNodeValue(ln);
1229 if (server.maxidletime &&
1230 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1231 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1232 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1233 listLength(c->pubsub_patterns) == 0 &&
1234 (now - c->lastinteraction > server.maxidletime))
1235 {
1236 redisLog(REDIS_VERBOSE,"Closing idle client");
1237 freeClient(c);
1238 } else if (c->flags & REDIS_BLOCKED) {
1239 if (c->blockingto != 0 && c->blockingto < now) {
1240 addReply(c,shared.nullmultibulk);
1241 unblockClientWaitingData(c);
1242 }
1243 }
1244 }
1245 }
1246
1247 static int htNeedsResize(dict *dict) {
1248 long long size, used;
1249
1250 size = dictSlots(dict);
1251 used = dictSize(dict);
1252 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1253 (used*100/size < REDIS_HT_MINFILL));
1254 }
1255
1256 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1257 * we resize the hash table to save memory */
1258 static void tryResizeHashTables(void) {
1259 int j;
1260
1261 for (j = 0; j < server.dbnum; j++) {
1262 if (htNeedsResize(server.db[j].dict))
1263 dictResize(server.db[j].dict);
1264 if (htNeedsResize(server.db[j].expires))
1265 dictResize(server.db[j].expires);
1266 }
1267 }
1268
1269 /* Our hash table implementation performs rehashing incrementally while
1270 * we write/read from the hash table. Still if the server is idle, the hash
1271 * table will use two tables for a long time. So we try to use 1 millisecond
1272 * of CPU time at every serverCron() loop in order to rehash some key. */
1273 static void incrementallyRehash(void) {
1274 int j;
1275
1276 for (j = 0; j < server.dbnum; j++) {
1277 if (dictIsRehashing(server.db[j].dict)) {
1278 dictRehashMilliseconds(server.db[j].dict,1);
1279 break; /* already used our millisecond for this loop... */
1280 }
1281 }
1282 }
1283
1284 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1285 void backgroundSaveDoneHandler(int statloc) {
1286 int exitcode = WEXITSTATUS(statloc);
1287 int bysignal = WIFSIGNALED(statloc);
1288
1289 if (!bysignal && exitcode == 0) {
1290 redisLog(REDIS_NOTICE,
1291 "Background saving terminated with success");
1292 server.dirty = 0;
1293 server.lastsave = time(NULL);
1294 } else if (!bysignal && exitcode != 0) {
1295 redisLog(REDIS_WARNING, "Background saving error");
1296 } else {
1297 redisLog(REDIS_WARNING,
1298 "Background saving terminated by signal %d", WTERMSIG(statloc));
1299 rdbRemoveTempFile(server.bgsavechildpid);
1300 }
1301 server.bgsavechildpid = -1;
1302 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1303 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1304 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1305 }
1306
1307 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1308 * Handle this. */
1309 void backgroundRewriteDoneHandler(int statloc) {
1310 int exitcode = WEXITSTATUS(statloc);
1311 int bysignal = WIFSIGNALED(statloc);
1312
1313 if (!bysignal && exitcode == 0) {
1314 int fd;
1315 char tmpfile[256];
1316
1317 redisLog(REDIS_NOTICE,
1318 "Background append only file rewriting terminated with success");
1319 /* Now it's time to flush the differences accumulated by the parent */
1320 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1321 fd = open(tmpfile,O_WRONLY|O_APPEND);
1322 if (fd == -1) {
1323 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1324 goto cleanup;
1325 }
1326 /* Flush our data... */
1327 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1328 (signed) sdslen(server.bgrewritebuf)) {
1329 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1330 close(fd);
1331 goto cleanup;
1332 }
1333 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1334 /* Now our work is to rename the temp file into the stable file. And
1335 * switch the file descriptor used by the server for append only. */
1336 if (rename(tmpfile,server.appendfilename) == -1) {
1337 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1338 close(fd);
1339 goto cleanup;
1340 }
1341 /* Mission completed... almost */
1342 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1343 if (server.appendfd != -1) {
1344 /* If append only is actually enabled... */
1345 close(server.appendfd);
1346 server.appendfd = fd;
1347 fsync(fd);
1348 server.appendseldb = -1; /* Make sure it will issue SELECT */
1349 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1350 } else {
1351 /* If append only is disabled we just generate a dump in this
1352 * format. Why not? */
1353 close(fd);
1354 }
1355 } else if (!bysignal && exitcode != 0) {
1356 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1357 } else {
1358 redisLog(REDIS_WARNING,
1359 "Background append only file rewriting terminated by signal %d",
1360 WTERMSIG(statloc));
1361 }
1362 cleanup:
1363 sdsfree(server.bgrewritebuf);
1364 server.bgrewritebuf = sdsempty();
1365 aofRemoveTempFile(server.bgrewritechildpid);
1366 server.bgrewritechildpid = -1;
1367 }
1368
1369 /* This function is called once a background process of some kind terminates,
1370 * as we want to avoid resizing the hash tables when there is a child in order
1371 * to play well with copy-on-write (otherwise when a resize happens lots of
1372 * memory pages are copied). The goal of this function is to update the ability
1373 * for dict.c to resize the hash tables accordingly to the fact we have o not
1374 * running childs. */
1375 static void updateDictResizePolicy(void) {
1376 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1377 dictEnableResize();
1378 else
1379 dictDisableResize();
1380 }
1381
/* Periodic housekeeping routine registered with the event loop.
 *
 * At every call it, in order: refreshes the cached unix time, logs keyspace
 * and client statistics (every 50 calls), resizes / incrementally rehashes
 * the hash tables when no child process is running, closes timed out
 * clients, reaps a terminated BGSAVE / BGREWRITEAOF child or starts a new
 * background save when a save point is reached, expires a sample of
 * volatile keys, swaps objects out when VM is over its memory limit, and
 * tries to connect to the master when replication requires it.
 *
 * The three parameters are unused. Always returns 100 (presumably the
 * number of milliseconds before the event loop calls it again -- confirm
 * against the ae time event API). */
static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    /* loops is the value of the call counter before this call. */
    int j, loops = server.cronloops++;
    REDIS_NOTUSED(eventLoop);
    REDIS_NOTUSED(id);
    REDIS_NOTUSED(clientData);

    /* We take a cached value of the unix time in the global state because
     * with virtual memory and aging there is to store the current time
     * in objects at every object access, and accuracy is not needed.
     * To access a global var is faster than calling time(NULL) */
    server.unixtime = time(NULL);

    /* Show some info about non-empty databases (once every 50 calls) */
    for (j = 0; j < server.dbnum; j++) {
        long long size, used, vkeys;

        size = dictSlots(server.db[j].dict);
        used = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (!(loops % 50) && (used || vkeys)) {
            redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
            /* dictPrintStats(server.dict); */
        }
    }

    /* We don't want to resize the hash tables while a background saving
     * is in progress: the saving child is created using fork() that is
     * implemented with a copy-on-write semantic in most modern systems, so
     * if we resize the HT while there is the saving child at work actually
     * a lot of memory movements in the parent will cause a lot of pages
     * copied. */
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
        if (!(loops % 10)) tryResizeHashTables();
        if (server.activerehashing) incrementallyRehash();
    }

    /* Show information about connected clients (once every 50 calls) */
    if (!(loops % 50)) {
        redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
            listLength(server.clients)-listLength(server.slaves),
            listLength(server.slaves),
            zmalloc_used_memory());
    }

    /* Close connections of timedout clients. This runs every 100 calls when
     * an idle timeout is configured, and at every call while there are
     * clients blocked in BLPOP-style operations. */
    if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
        closeTimedoutClients();

    /* Check if a background saving or AOF rewrite in progress terminated */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
        int statloc;
        pid_t pid;

        /* Non-blocking wait: 0 means no child changed state yet.
         * NOTE(review): any other return value that is not the BGSAVE pid,
         * including -1 on error, is handled as the AOF rewrite child. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
            } else {
                backgroundRewriteDoneHandler(statloc);
            }
            updateDictResizePolicy();
        }
    } else {
        /* If there is not a background saving in progress check if
         * we have to save now */
         time_t now = time(NULL);
         for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            /* The first save point whose change count and elapsed time are
             * both reached triggers the background save. */
            if (server.dirty >= sp->changes &&
                now-server.lastsave > sp->seconds) {
                redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, sp->seconds);
                rdbSaveBackground(server.dbfilename);
                break;
            }
         }
    }

    /* Try to expire a few timed out keys. The algorithm used is adaptive and
     * will use few CPU cycles if there are few expiring keys, otherwise
     * it will get more aggressive to avoid that too much memory is used by
     * keys that can be removed from the keyspace. */
    for (j = 0; j < server.dbnum; j++) {
        int expired;
        redisDb *db = server.db+j;

        /* Continue to expire if at the end of the cycle more than 25%
         * of the keys were expired. */
        do {
            long num = dictSize(db->expires);
            time_t now = time(NULL);

            expired = 0;
            /* Sample at most REDIS_EXPIRELOOKUPS_PER_CRON random entries. */
            if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
                num = REDIS_EXPIRELOOKUPS_PER_CRON;
            while (num--) {
                dictEntry *de;
                time_t t;

                if ((de = dictGetRandomKey(db->expires)) == NULL) break;
                /* The entry value is the expire time itself. */
                t = (time_t) dictGetEntryVal(de);
                if (now > t) {
                    deleteKey(db,dictGetEntryKey(de));
                    expired++;
                    server.stat_expiredkeys++;
                }
            }
        } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
    }

    /* Swap a few keys on disk if we are over the memory limit and VM
     * is enabled. Try to free objects from the free list first. */
    if (vmCanSwapOut()) {
        while (server.vm_enabled && zmalloc_used_memory() >
                server.vm_max_memory)
        {
            int retval;

            if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
            retval = (server.vm_max_threads == 0) ?
                        vmSwapOneObjectBlocking() :
                        vmSwapOneObjectThreaded();
            /* Warn (at most once every 300 calls) when nothing more can be
             * swapped and usage is more than 10% over the limit. */
            if (retval == REDIS_ERR && !(loops % 300) &&
                zmalloc_used_memory() >
                (server.vm_max_memory+server.vm_max_memory/10))
            {
                redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
            }
            /* Note that when using threaded I/O we free just one object,
             * because anyway when the I/O thread in charge to swap this
             * object out will finish, the handler of completed jobs
             * will try to swap more objects if we are still out of memory. */
            if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
        }
    }

    /* Check if we should connect to a MASTER (attempted every 10 calls) */
    if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
        redisLog(REDIS_NOTICE,"Connecting to MASTER...");
        if (syncWithMaster() == REDIS_OK) {
            redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
            /* After a successful sync a background AOF rewrite is started
             * when append only mode is on. */
            if (server.appendonly) rewriteAppendOnlyFileBackground();
        }
    }
    return 100;
}
1528
1529 /* This function gets called every time Redis is entering the
1530 * main loop of the event driven library, that is, before to sleep
1531 * for ready file descriptors. */
1532 static void beforeSleep(struct aeEventLoop *eventLoop) {
1533 REDIS_NOTUSED(eventLoop);
1534
1535 /* Awake clients that got all the swapped keys they requested */
1536 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1537 listIter li;
1538 listNode *ln;
1539
1540 listRewind(server.io_ready_clients,&li);
1541 while((ln = listNext(&li))) {
1542 redisClient *c = ln->value;
1543 struct redisCommand *cmd;
1544
1545 /* Resume the client. */
1546 listDelNode(server.io_ready_clients,ln);
1547 c->flags &= (~REDIS_IO_WAIT);
1548 server.vm_blocked_clients--;
1549 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1550 readQueryFromClient, c);
1551 cmd = lookupCommand(c->argv[0]->ptr);
1552 assert(cmd != NULL);
1553 call(c,cmd);
1554 resetClient(c);
1555 /* There may be more data to process in the input buffer. */
1556 if (c->querybuf && sdslen(c->querybuf) > 0)
1557 processInputBuffer(c);
1558 }
1559 }
1560 /* Write the AOF buffer on disk */
1561 flushAppendOnlyFile();
1562 }
1563
1564 static void createSharedObjects(void) {
1565 int j;
1566
1567 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1568 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1569 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1570 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1571 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1572 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1573 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1574 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1575 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1576 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1577 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1578 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1579 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1580 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1581 "-ERR no such key\r\n"));
1582 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1583 "-ERR syntax error\r\n"));
1584 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1585 "-ERR source and destination objects are the same\r\n"));
1586 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1587 "-ERR index out of range\r\n"));
1588 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1589 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1590 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1591 shared.select0 = createStringObject("select 0\r\n",10);
1592 shared.select1 = createStringObject("select 1\r\n",10);
1593 shared.select2 = createStringObject("select 2\r\n",10);
1594 shared.select3 = createStringObject("select 3\r\n",10);
1595 shared.select4 = createStringObject("select 4\r\n",10);
1596 shared.select5 = createStringObject("select 5\r\n",10);
1597 shared.select6 = createStringObject("select 6\r\n",10);
1598 shared.select7 = createStringObject("select 7\r\n",10);
1599 shared.select8 = createStringObject("select 8\r\n",10);
1600 shared.select9 = createStringObject("select 9\r\n",10);
1601 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1602 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1603 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1604 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1605 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1606 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1607 shared.mbulk3 = createStringObject("*3\r\n",4);
1608 shared.mbulk4 = createStringObject("*4\r\n",4);
1609 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1610 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1611 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1612 }
1613 }
1614
1615 static void appendServerSaveParams(time_t seconds, int changes) {
1616 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1617 server.saveparams[server.saveparamslen].seconds = seconds;
1618 server.saveparams[server.saveparamslen].changes = changes;
1619 server.saveparamslen++;
1620 }
1621
1622 static void resetServerSaveParams() {
1623 zfree(server.saveparams);
1624 server.saveparams = NULL;
1625 server.saveparamslen = 0;
1626 }
1627
1628 static void initServerConfig() {
1629 server.dbnum = REDIS_DEFAULT_DBNUM;
1630 server.port = REDIS_SERVERPORT;
1631 server.verbosity = REDIS_VERBOSE;
1632 server.maxidletime = REDIS_MAXIDLETIME;
1633 server.saveparams = NULL;
1634 server.logfile = NULL; /* NULL = log on standard output */
1635 server.bindaddr = NULL;
1636 server.glueoutputbuf = 1;
1637 server.daemonize = 0;
1638 server.appendonly = 0;
1639 server.appendfsync = APPENDFSYNC_EVERYSEC;
1640 server.lastfsync = time(NULL);
1641 server.appendfd = -1;
1642 server.appendseldb = -1; /* Make sure the first time will not match */
1643 server.pidfile = zstrdup("/var/run/redis.pid");
1644 server.dbfilename = zstrdup("dump.rdb");
1645 server.appendfilename = zstrdup("appendonly.aof");
1646 server.requirepass = NULL;
1647 server.rdbcompression = 1;
1648 server.activerehashing = 1;
1649 server.maxclients = 0;
1650 server.blpop_blocked_clients = 0;
1651 server.maxmemory = 0;
1652 server.vm_enabled = 0;
1653 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1654 server.vm_page_size = 256; /* 256 bytes per page */
1655 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1656 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1657 server.vm_max_threads = 4;
1658 server.vm_blocked_clients = 0;
1659 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1660 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1661
1662 resetServerSaveParams();
1663
1664 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1665 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1666 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1667 /* Replication related */
1668 server.isslave = 0;
1669 server.masterauth = NULL;
1670 server.masterhost = NULL;
1671 server.masterport = 6379;
1672 server.master = NULL;
1673 server.replstate = REDIS_REPL_NONE;
1674
1675 /* Double constants initialization */
1676 R_Zero = 0.0;
1677 R_PosInf = 1.0/R_Zero;
1678 R_NegInf = -1.0/R_Zero;
1679 R_Nan = R_Zero/R_Zero;
1680 }
1681
1682 static void initServer() {
1683 int j;
1684
1685 signal(SIGHUP, SIG_IGN);
1686 signal(SIGPIPE, SIG_IGN);
1687 setupSigSegvAction();
1688
1689 server.devnull = fopen("/dev/null","w");
1690 if (server.devnull == NULL) {
1691 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1692 exit(1);
1693 }
1694 server.clients = listCreate();
1695 server.slaves = listCreate();
1696 server.monitors = listCreate();
1697 server.objfreelist = listCreate();
1698 createSharedObjects();
1699 server.el = aeCreateEventLoop();
1700 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1701 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1702 if (server.fd == -1) {
1703 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1704 exit(1);
1705 }
1706 for (j = 0; j < server.dbnum; j++) {
1707 server.db[j].dict = dictCreate(&dbDictType,NULL);
1708 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1709 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1710 if (server.vm_enabled)
1711 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1712 server.db[j].id = j;
1713 }
1714 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1715 server.pubsub_patterns = listCreate();
1716 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1717 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1718 server.cronloops = 0;
1719 server.bgsavechildpid = -1;
1720 server.bgrewritechildpid = -1;
1721 server.bgrewritebuf = sdsempty();
1722 server.aofbuf = sdsempty();
1723 server.lastsave = time(NULL);
1724 server.dirty = 0;
1725 server.stat_numcommands = 0;
1726 server.stat_numconnections = 0;
1727 server.stat_expiredkeys = 0;
1728 server.stat_starttime = time(NULL);
1729 server.unixtime = time(NULL);
1730 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1731 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1732 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1733
1734 if (server.appendonly) {
1735 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1736 if (server.appendfd == -1) {
1737 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1738 strerror(errno));
1739 exit(1);
1740 }
1741 }
1742
1743 if (server.vm_enabled) vmInit();
1744 }
1745
1746 /* Empty the whole database */
1747 static long long emptyDb() {
1748 int j;
1749 long long removed = 0;
1750
1751 for (j = 0; j < server.dbnum; j++) {
1752 removed += dictSize(server.db[j].dict);
1753 dictEmpty(server.db[j].dict);
1754 dictEmpty(server.db[j].expires);
1755 }
1756 return removed;
1757 }
1758
/* Convert a case-insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1764
1765 /* I agree, this is a very rudimental way to load a configuration...
1766 will improve later if the config gets more complex */
1767 static void loadServerConfig(char *filename) {
1768 FILE *fp;
1769 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1770 int linenum = 0;
1771 sds line = NULL;
1772
1773 if (filename[0] == '-' && filename[1] == '\0')
1774 fp = stdin;
1775 else {
1776 if ((fp = fopen(filename,"r")) == NULL) {
1777 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1778 exit(1);
1779 }
1780 }
1781
1782 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1783 sds *argv;
1784 int argc, j;
1785
1786 linenum++;
1787 line = sdsnew(buf);
1788 line = sdstrim(line," \t\r\n");
1789
1790 /* Skip comments and blank lines*/
1791 if (line[0] == '#' || line[0] == '\0') {
1792 sdsfree(line);
1793 continue;
1794 }
1795
1796 /* Split into arguments */
1797 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1798 sdstolower(argv[0]);
1799
1800 /* Execute config directives */
1801 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1802 server.maxidletime = atoi(argv[1]);
1803 if (server.maxidletime < 0) {
1804 err = "Invalid timeout value"; goto loaderr;
1805 }
1806 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1807 server.port = atoi(argv[1]);
1808 if (server.port < 1 || server.port > 65535) {
1809 err = "Invalid port"; goto loaderr;
1810 }
1811 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1812 server.bindaddr = zstrdup(argv[1]);
1813 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1814 int seconds = atoi(argv[1]);
1815 int changes = atoi(argv[2]);
1816 if (seconds < 1 || changes < 0) {
1817 err = "Invalid save parameters"; goto loaderr;
1818 }
1819 appendServerSaveParams(seconds,changes);
1820 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1821 if (chdir(argv[1]) == -1) {
1822 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1823 argv[1], strerror(errno));
1824 exit(1);
1825 }
1826 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1827 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1828 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1829 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1830 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1831 else {
1832 err = "Invalid log level. Must be one of debug, notice, warning";
1833 goto loaderr;
1834 }
1835 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1836 FILE *logfp;
1837
1838 server.logfile = zstrdup(argv[1]);
1839 if (!strcasecmp(server.logfile,"stdout")) {
1840 zfree(server.logfile);
1841 server.logfile = NULL;
1842 }
1843 if (server.logfile) {
1844 /* Test if we are able to open the file. The server will not
1845 * be able to abort just for this problem later... */
1846 logfp = fopen(server.logfile,"a");
1847 if (logfp == NULL) {
1848 err = sdscatprintf(sdsempty(),
1849 "Can't open the log file: %s", strerror(errno));
1850 goto loaderr;
1851 }
1852 fclose(logfp);
1853 }
1854 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1855 server.dbnum = atoi(argv[1]);
1856 if (server.dbnum < 1) {
1857 err = "Invalid number of databases"; goto loaderr;
1858 }
1859 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1860 loadServerConfig(argv[1]);
1861 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1862 server.maxclients = atoi(argv[1]);
1863 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1864 server.maxmemory = memtoll(argv[1],NULL);
1865 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1866 server.masterhost = sdsnew(argv[1]);
1867 server.masterport = atoi(argv[2]);
1868 server.replstate = REDIS_REPL_CONNECT;
1869 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1870 server.masterauth = zstrdup(argv[1]);
1871 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1872 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1873 err = "argument must be 'yes' or 'no'"; goto loaderr;
1874 }
1875 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1876 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1877 err = "argument must be 'yes' or 'no'"; goto loaderr;
1878 }
1879 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1880 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1881 err = "argument must be 'yes' or 'no'"; goto loaderr;
1882 }
1883 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1884 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1885 err = "argument must be 'yes' or 'no'"; goto loaderr;
1886 }
1887 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1888 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1889 err = "argument must be 'yes' or 'no'"; goto loaderr;
1890 }
1891 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1892 if (!strcasecmp(argv[1],"no")) {
1893 server.appendfsync = APPENDFSYNC_NO;
1894 } else if (!strcasecmp(argv[1],"always")) {
1895 server.appendfsync = APPENDFSYNC_ALWAYS;
1896 } else if (!strcasecmp(argv[1],"everysec")) {
1897 server.appendfsync = APPENDFSYNC_EVERYSEC;
1898 } else {
1899 err = "argument must be 'no', 'always' or 'everysec'";
1900 goto loaderr;
1901 }
1902 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1903 server.requirepass = zstrdup(argv[1]);
1904 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1905 zfree(server.pidfile);
1906 server.pidfile = zstrdup(argv[1]);
1907 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1908 zfree(server.dbfilename);
1909 server.dbfilename = zstrdup(argv[1]);
1910 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1911 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1912 err = "argument must be 'yes' or 'no'"; goto loaderr;
1913 }
1914 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1915 zfree(server.vm_swap_file);
1916 server.vm_swap_file = zstrdup(argv[1]);
1917 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1918 server.vm_max_memory = memtoll(argv[1],NULL);
1919 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1920 server.vm_page_size = memtoll(argv[1], NULL);
1921 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1922 server.vm_pages = memtoll(argv[1], NULL);
1923 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1924 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1925 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1926 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1927 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1928 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1929 } else {
1930 err = "Bad directive or wrong number of arguments"; goto loaderr;
1931 }
1932 for (j = 0; j < argc; j++)
1933 sdsfree(argv[j]);
1934 zfree(argv);
1935 sdsfree(line);
1936 }
1937 if (fp != stdin) fclose(fp);
1938 return;
1939
1940 loaderr:
1941 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1942 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1943 fprintf(stderr, ">>> '%s'\n", line);
1944 fprintf(stderr, "%s\n", err);
1945 exit(1);
1946 }
1947
1948 static void freeClientArgv(redisClient *c) {
1949 int j;
1950
1951 for (j = 0; j < c->argc; j++)
1952 decrRefCount(c->argv[j]);
1953 for (j = 0; j < c->mbargc; j++)
1954 decrRefCount(c->mbargv[j]);
1955 c->argc = 0;
1956 c->mbargc = 0;
1957 }
1958
1959 static void freeClient(redisClient *c) {
1960 listNode *ln;
1961
1962 /* Note that if the client we are freeing is blocked into a blocking
1963 * call, we have to set querybuf to NULL *before* to call
1964 * unblockClientWaitingData() to avoid processInputBuffer() will get
1965 * called. Also it is important to remove the file events after
1966 * this, because this call adds the READABLE event. */
1967 sdsfree(c->querybuf);
1968 c->querybuf = NULL;
1969 if (c->flags & REDIS_BLOCKED)
1970 unblockClientWaitingData(c);
1971
1972 /* Unsubscribe from all the pubsub channels */
1973 pubsubUnsubscribeAllChannels(c,0);
1974 pubsubUnsubscribeAllPatterns(c,0);
1975 dictRelease(c->pubsub_channels);
1976 listRelease(c->pubsub_patterns);
1977 /* Obvious cleanup */
1978 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1979 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1980 listRelease(c->reply);
1981 freeClientArgv(c);
1982 close(c->fd);
1983 /* Remove from the list of clients */
1984 ln = listSearchKey(server.clients,c);
1985 redisAssert(ln != NULL);
1986 listDelNode(server.clients,ln);
1987 /* Remove from the list of clients waiting for swapped keys */
1988 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1989 ln = listSearchKey(server.io_ready_clients,c);
1990 if (ln) {
1991 listDelNode(server.io_ready_clients,ln);
1992 server.vm_blocked_clients--;
1993 }
1994 }
1995 while (server.vm_enabled && listLength(c->io_keys)) {
1996 ln = listFirst(c->io_keys);
1997 dontWaitForSwappedKey(c,ln->value);
1998 }
1999 listRelease(c->io_keys);
2000 /* Master/slave cleanup */
2001 if (c->flags & REDIS_SLAVE) {
2002 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2003 close(c->repldbfd);
2004 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2005 ln = listSearchKey(l,c);
2006 redisAssert(ln != NULL);
2007 listDelNode(l,ln);
2008 }
2009 if (c->flags & REDIS_MASTER) {
2010 server.master = NULL;
2011 server.replstate = REDIS_REPL_CONNECT;
2012 }
2013 /* Release memory */
2014 zfree(c->argv);
2015 zfree(c->mbargv);
2016 freeClientMultiState(c);
2017 zfree(c);
2018 }
2019
2020 #define GLUEREPLY_UP_TO (1024)
2021 static void glueReplyBuffersIfNeeded(redisClient *c) {
2022 int copylen = 0;
2023 char buf[GLUEREPLY_UP_TO];
2024 listNode *ln;
2025 listIter li;
2026 robj *o;
2027
2028 listRewind(c->reply,&li);
2029 while((ln = listNext(&li))) {
2030 int objlen;
2031
2032 o = ln->value;
2033 objlen = sdslen(o->ptr);
2034 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2035 memcpy(buf+copylen,o->ptr,objlen);
2036 copylen += objlen;
2037 listDelNode(c->reply,ln);
2038 } else {
2039 if (copylen == 0) return;
2040 break;
2041 }
2042 }
2043 /* Now the output buffer is empty, add the new single element */
2044 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2045 listAddNodeHead(c->reply,o);
2046 }
2047
2048 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2049 redisClient *c = privdata;
2050 int nwritten = 0, totwritten = 0, objlen;
2051 robj *o;
2052 REDIS_NOTUSED(el);
2053 REDIS_NOTUSED(mask);
2054
2055 /* Use writev() if we have enough buffers to send */
2056 if (!server.glueoutputbuf &&
2057 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2058 !(c->flags & REDIS_MASTER))
2059 {
2060 sendReplyToClientWritev(el, fd, privdata, mask);
2061 return;
2062 }
2063
2064 while(listLength(c->reply)) {
2065 if (server.glueoutputbuf && listLength(c->reply) > 1)
2066 glueReplyBuffersIfNeeded(c);
2067
2068 o = listNodeValue(listFirst(c->reply));
2069 objlen = sdslen(o->ptr);
2070
2071 if (objlen == 0) {
2072 listDelNode(c->reply,listFirst(c->reply));
2073 continue;
2074 }
2075
2076 if (c->flags & REDIS_MASTER) {
2077 /* Don't reply to a master */
2078 nwritten = objlen - c->sentlen;
2079 } else {
2080 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2081 if (nwritten <= 0) break;
2082 }
2083 c->sentlen += nwritten;
2084 totwritten += nwritten;
2085 /* If we fully sent the object on head go to the next one */
2086 if (c->sentlen == objlen) {
2087 listDelNode(c->reply,listFirst(c->reply));
2088 c->sentlen = 0;
2089 }
2090 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2091 * bytes, in a single threaded server it's a good idea to serve
2092 * other clients as well, even if a very large request comes from
2093 * super fast link that is always able to accept data (in real world
2094 * scenario think about 'KEYS *' against the loopback interfae) */
2095 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2096 }
2097 if (nwritten == -1) {
2098 if (errno == EAGAIN) {
2099 nwritten = 0;
2100 } else {
2101 redisLog(REDIS_VERBOSE,
2102 "Error writing to client: %s", strerror(errno));
2103 freeClient(c);
2104 return;
2105 }
2106 }
2107 if (totwritten > 0) c->lastinteraction = time(NULL);
2108 if (listLength(c->reply) == 0) {
2109 c->sentlen = 0;
2110 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2111 }
2112 }
2113
2114 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2115 {
2116 redisClient *c = privdata;
2117 int nwritten = 0, totwritten = 0, objlen, willwrite;
2118 robj *o;
2119 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2120 int offset, ion = 0;
2121 REDIS_NOTUSED(el);
2122 REDIS_NOTUSED(mask);
2123
2124 listNode *node;
2125 while (listLength(c->reply)) {
2126 offset = c->sentlen;
2127 ion = 0;
2128 willwrite = 0;
2129
2130 /* fill-in the iov[] array */
2131 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2132 o = listNodeValue(node);
2133 objlen = sdslen(o->ptr);
2134
2135 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2136 break;
2137
2138 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2139 break; /* no more iovecs */
2140
2141 iov[ion].iov_base = ((char*)o->ptr) + offset;
2142 iov[ion].iov_len = objlen - offset;
2143 willwrite += objlen - offset;
2144 offset = 0; /* just for the first item */
2145 ion++;
2146 }
2147
2148 if(willwrite == 0)
2149 break;
2150
2151 /* write all collected blocks at once */
2152 if((nwritten = writev(fd, iov, ion)) < 0) {
2153 if (errno != EAGAIN) {
2154 redisLog(REDIS_VERBOSE,
2155 "Error writing to client: %s", strerror(errno));
2156 freeClient(c);
2157 return;
2158 }
2159 break;
2160 }
2161
2162 totwritten += nwritten;
2163 offset = c->sentlen;
2164
2165 /* remove written robjs from c->reply */
2166 while (nwritten && listLength(c->reply)) {
2167 o = listNodeValue(listFirst(c->reply));
2168 objlen = sdslen(o->ptr);
2169
2170 if(nwritten >= objlen - offset) {
2171 listDelNode(c->reply, listFirst(c->reply));
2172 nwritten -= objlen - offset;
2173 c->sentlen = 0;
2174 } else {
2175 /* partial write */
2176 c->sentlen += nwritten;
2177 break;
2178 }
2179 offset = 0;
2180 }
2181 }
2182
2183 if (totwritten > 0)
2184 c->lastinteraction = time(NULL);
2185
2186 if (listLength(c->reply) == 0) {
2187 c->sentlen = 0;
2188 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2189 }
2190 }
2191
2192 static struct redisCommand *lookupCommand(char *name) {
2193 int j = 0;
2194 while(cmdTable[j].name != NULL) {
2195 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2196 j++;
2197 }
2198 return NULL;
2199 }
2200
2201 /* resetClient prepare the client to process the next command */
2202 static void resetClient(redisClient *c) {
2203 freeClientArgv(c);
2204 c->bulklen = -1;
2205 c->multibulk = 0;
2206 }
2207
2208 /* Call() is the core of Redis execution of a command */
2209 static void call(redisClient *c, struct redisCommand *cmd) {
2210 long long dirty;
2211
2212 dirty = server.dirty;
2213 cmd->proc(c);
2214 dirty = server.dirty-dirty;
2215
2216 if (server.appendonly && dirty)
2217 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2218 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2219 listLength(server.slaves))
2220 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2221 if (listLength(server.monitors))
2222 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2223 server.stat_numcommands++;
2224 }
2225
2226 /* If this function gets called we already read a whole
2227 * command, argments are in the client argv/argc fields.
2228 * processCommand() execute the command or prepare the
2229 * server for a bulk read from the client.
2230 *
2231 * If 1 is returned the client is still alive and valid and
2232 * and other operations can be performed by the caller. Otherwise
2233 * if 0 is returned the client was destroied (i.e. after QUIT). */
2234 static int processCommand(redisClient *c) {
2235 struct redisCommand *cmd;
2236
2237 /* Free some memory if needed (maxmemory setting) */
2238 if (server.maxmemory) freeMemoryIfNeeded();
2239
2240 /* Handle the multi bulk command type. This is an alternative protocol
2241 * supported by Redis in order to receive commands that are composed of
2242 * multiple binary-safe "bulk" arguments. The latency of processing is
2243 * a bit higher but this allows things like multi-sets, so if this
2244 * protocol is used only for MSET and similar commands this is a big win. */
2245 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2246 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2247 if (c->multibulk <= 0) {
2248 resetClient(c);
2249 return 1;
2250 } else {
2251 decrRefCount(c->argv[c->argc-1]);
2252 c->argc--;
2253 return 1;
2254 }
2255 } else if (c->multibulk) {
2256 if (c->bulklen == -1) {
2257 if (((char*)c->argv[0]->ptr)[0] != '$') {
2258 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2259 resetClient(c);
2260 return 1;
2261 } else {
2262 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2263 decrRefCount(c->argv[0]);
2264 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2265 c->argc--;
2266 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2267 resetClient(c);
2268 return 1;
2269 }
2270 c->argc--;
2271 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2272 return 1;
2273 }
2274 } else {
2275 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2276 c->mbargv[c->mbargc] = c->argv[0];
2277 c->mbargc++;
2278 c->argc--;
2279 c->multibulk--;
2280 if (c->multibulk == 0) {
2281 robj **auxargv;
2282 int auxargc;
2283
2284 /* Here we need to swap the multi-bulk argc/argv with the
2285 * normal argc/argv of the client structure. */
2286 auxargv = c->argv;
2287 c->argv = c->mbargv;
2288 c->mbargv = auxargv;
2289
2290 auxargc = c->argc;
2291 c->argc = c->mbargc;
2292 c->mbargc = auxargc;
2293
2294 /* We need to set bulklen to something different than -1
2295 * in order for the code below to process the command without
2296 * to try to read the last argument of a bulk command as
2297 * a special argument. */
2298 c->bulklen = 0;
2299 /* continue below and process the command */
2300 } else {
2301 c->bulklen = -1;
2302 return 1;
2303 }
2304 }
2305 }
2306 /* -- end of multi bulk commands processing -- */
2307
2308 /* The QUIT command is handled as a special case. Normal command
2309 * procs are unable to close the client connection safely */
2310 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2311 freeClient(c);
2312 return 0;
2313 }
2314
2315 /* Now lookup the command and check ASAP about trivial error conditions
2316 * such wrong arity, bad command name and so forth. */
2317 cmd = lookupCommand(c->argv[0]->ptr);
2318 if (!cmd) {
2319 addReplySds(c,
2320 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2321 (char*)c->argv[0]->ptr));
2322 resetClient(c);
2323 return 1;
2324 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2325 (c->argc < -cmd->arity)) {
2326 addReplySds(c,
2327 sdscatprintf(sdsempty(),
2328 "-ERR wrong number of arguments for '%s' command\r\n",
2329 cmd->name));
2330 resetClient(c);
2331 return 1;
2332 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2333 /* This is a bulk command, we have to read the last argument yet. */
2334 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2335
2336 decrRefCount(c->argv[c->argc-1]);
2337 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2338 c->argc--;
2339 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2340 resetClient(c);
2341 return 1;
2342 }
2343 c->argc--;
2344 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2345 /* It is possible that the bulk read is already in the
2346 * buffer. Check this condition and handle it accordingly.
2347 * This is just a fast path, alternative to call processInputBuffer().
2348 * It's a good idea since the code is small and this condition
2349 * happens most of the times. */
2350 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2351 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2352 c->argc++;
2353 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2354 } else {
2355 /* Otherwise return... there is to read the last argument
2356 * from the socket. */
2357 return 1;
2358 }
2359 }
2360 /* Let's try to encode the bulk object to save space. */
2361 if (cmd->flags & REDIS_CMD_BULK)
2362 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2363
2364 /* Check if the user is authenticated */
2365 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2366 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2367 resetClient(c);
2368 return 1;
2369 }
2370
2371 /* Handle the maxmemory directive */
2372 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2373 zmalloc_used_memory() > server.maxmemory)
2374 {
2375 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2376 resetClient(c);
2377 return 1;
2378 }
2379
2380 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2381 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2382 &&
2383 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2384 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2385 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2386 resetClient(c);
2387 return 1;
2388 }
2389
2390 /* Exec the command */
2391 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2392 queueMultiCommand(c,cmd);
2393 addReply(c,shared.queued);
2394 } else {
2395 if (server.vm_enabled && server.vm_max_threads > 0 &&
2396 blockClientOnSwappedKeys(cmd,c)) return 1;
2397 call(c,cmd);
2398 }
2399
2400 /* Prepare the client for the next command */
2401 resetClient(c);
2402 return 1;
2403 }
2404
2405 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2406 listNode *ln;
2407 listIter li;
2408 int outc = 0, j;
2409 robj **outv;
2410 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2411 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2412 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2413 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2414 robj *lenobj;
2415
2416 if (argc <= REDIS_STATIC_ARGS) {
2417 outv = static_outv;
2418 } else {
2419 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2420 }
2421
2422 lenobj = createObject(REDIS_STRING,
2423 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2424 lenobj->refcount = 0;
2425 outv[outc++] = lenobj;
2426 for (j = 0; j < argc; j++) {
2427 lenobj = createObject(REDIS_STRING,
2428 sdscatprintf(sdsempty(),"$%lu\r\n",
2429 (unsigned long) stringObjectLen(argv[j])));
2430 lenobj->refcount = 0;
2431 outv[outc++] = lenobj;
2432 outv[outc++] = argv[j];
2433 outv[outc++] = shared.crlf;
2434 }
2435
2436 /* Increment all the refcounts at start and decrement at end in order to
2437 * be sure to free objects if there is no slave in a replication state
2438 * able to be feed with commands */
2439 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2440 listRewind(slaves,&li);
2441 while((ln = listNext(&li))) {
2442 redisClient *slave = ln->value;
2443
2444 /* Don't feed slaves that are still waiting for BGSAVE to start */
2445 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2446
2447 /* Feed all the other slaves, MONITORs and so on */
2448 if (slave->slaveseldb != dictid) {
2449 robj *selectcmd;
2450
2451 switch(dictid) {
2452 case 0: selectcmd = shared.select0; break;
2453 case 1: selectcmd = shared.select1; break;
2454 case 2: selectcmd = shared.select2; break;
2455 case 3: selectcmd = shared.select3; break;
2456 case 4: selectcmd = shared.select4; break;
2457 case 5: selectcmd = shared.select5; break;
2458 case 6: selectcmd = shared.select6; break;
2459 case 7: selectcmd = shared.select7; break;
2460 case 8: selectcmd = shared.select8; break;
2461 case 9: selectcmd = shared.select9; break;
2462 default:
2463 selectcmd = createObject(REDIS_STRING,
2464 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2465 selectcmd->refcount = 0;
2466 break;
2467 }
2468 addReply(slave,selectcmd);
2469 slave->slaveseldb = dictid;
2470 }
2471 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2472 }
2473 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2474 if (outv != static_outv) zfree(outv);
2475 }
2476
2477 static sds sdscatrepr(sds s, char *p, size_t len) {
2478 s = sdscatlen(s,"\"",1);
2479 while(len--) {
2480 switch(*p) {
2481 case '\\':
2482 case '"':
2483 s = sdscatprintf(s,"\\%c",*p);
2484 break;
2485 case '\n': s = sdscatlen(s,"\\n",1); break;
2486 case '\r': s = sdscatlen(s,"\\r",1); break;
2487 case '\t': s = sdscatlen(s,"\\t",1); break;
2488 case '\a': s = sdscatlen(s,"\\a",1); break;
2489 case '\b': s = sdscatlen(s,"\\b",1); break;
2490 default:
2491 if (isprint(*p))
2492 s = sdscatprintf(s,"%c",*p);
2493 else
2494 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2495 break;
2496 }
2497 p++;
2498 }
2499 return sdscatlen(s,"\"",1);
2500 }
2501
2502 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2503 listNode *ln;
2504 listIter li;
2505 int j;
2506 sds cmdrepr = sdsnew("+");
2507 robj *cmdobj;
2508 struct timeval tv;
2509
2510 gettimeofday(&tv,NULL);
2511 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2512 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2513
2514 for (j = 0; j < argc; j++) {
2515 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2516 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2517 } else {
2518 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2519 sdslen(argv[j]->ptr));
2520 }
2521 if (j != argc-1)
2522 cmdrepr = sdscatlen(cmdrepr," ",1);
2523 }
2524 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2525 cmdobj = createObject(REDIS_STRING,cmdrepr);
2526
2527 listRewind(monitors,&li);
2528 while((ln = listNext(&li))) {
2529 redisClient *monitor = ln->value;
2530 addReply(monitor,cmdobj);
2531 }
2532 decrRefCount(cmdobj);
2533 }
2534
2535 static void processInputBuffer(redisClient *c) {
2536 again:
2537 /* Before to process the input buffer, make sure the client is not
2538 * waitig for a blocking operation such as BLPOP. Note that the first
2539 * iteration the client is never blocked, otherwise the processInputBuffer
2540 * would not be called at all, but after the execution of the first commands
2541 * in the input buffer the client may be blocked, and the "goto again"
2542 * will try to reiterate. The following line will make it return asap. */
2543 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2544 if (c->bulklen == -1) {
2545 /* Read the first line of the query */
2546 char *p = strchr(c->querybuf,'\n');
2547 size_t querylen;
2548
2549 if (p) {
2550 sds query, *argv;
2551 int argc, j;
2552
2553 query = c->querybuf;
2554 c->querybuf = sdsempty();
2555 querylen = 1+(p-(query));
2556 if (sdslen(query) > querylen) {
2557 /* leave data after the first line of the query in the buffer */
2558 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2559 }
2560 *p = '\0'; /* remove "\n" */
2561 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2562 sdsupdatelen(query);
2563
2564 /* Now we can split the query in arguments */
2565 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2566 sdsfree(query);
2567
2568 if (c->argv) zfree(c->argv);
2569 c->argv = zmalloc(sizeof(robj*)*argc);
2570
2571 for (j = 0; j < argc; j++) {
2572 if (sdslen(argv[j])) {
2573 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2574 c->argc++;
2575 } else {
2576 sdsfree(argv[j]);
2577 }
2578 }
2579 zfree(argv);
2580 if (c->argc) {
2581 /* Execute the command. If the client is still valid
2582 * after processCommand() return and there is something
2583 * on the query buffer try to process the next command. */
2584 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2585 } else {
2586 /* Nothing to process, argc == 0. Just process the query
2587 * buffer if it's not empty or return to the caller */
2588 if (sdslen(c->querybuf)) goto again;
2589 }
2590 return;
2591 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2592 redisLog(REDIS_VERBOSE, "Client protocol error");
2593 freeClient(c);
2594 return;
2595 }
2596 } else {
2597 /* Bulk read handling. Note that if we are at this point
2598 the client already sent a command terminated with a newline,
2599 we are reading the bulk data that is actually the last
2600 argument of the command. */
2601 int qbl = sdslen(c->querybuf);
2602
2603 if (c->bulklen <= qbl) {
2604 /* Copy everything but the final CRLF as final argument */
2605 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2606 c->argc++;
2607 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2608 /* Process the command. If the client is still valid after
2609 * the processing and there is more data in the buffer
2610 * try to parse it. */
2611 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2612 return;
2613 }
2614 }
2615 }
2616
2617 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2618 redisClient *c = (redisClient*) privdata;
2619 char buf[REDIS_IOBUF_LEN];
2620 int nread;
2621 REDIS_NOTUSED(el);
2622 REDIS_NOTUSED(mask);
2623
2624 nread = read(fd, buf, REDIS_IOBUF_LEN);
2625 if (nread == -1) {
2626 if (errno == EAGAIN) {
2627 nread = 0;
2628 } else {
2629 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2630 freeClient(c);
2631 return;
2632 }
2633 } else if (nread == 0) {
2634 redisLog(REDIS_VERBOSE, "Client closed connection");
2635 freeClient(c);
2636 return;
2637 }
2638 if (nread) {
2639 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2640 c->lastinteraction = time(NULL);
2641 } else {
2642 return;
2643 }
2644 processInputBuffer(c);
2645 }
2646
2647 static int selectDb(redisClient *c, int id) {
2648 if (id < 0 || id >= server.dbnum)
2649 return REDIS_ERR;
2650 c->db = &server.db[id];
2651 return REDIS_OK;
2652 }
2653
2654 static void *dupClientReplyValue(void *o) {
2655 incrRefCount((robj*)o);
2656 return o;
2657 }
2658
2659 static int listMatchObjects(void *a, void *b) {
2660 return equalStringObjects(a,b);
2661 }
2662
2663 static redisClient *createClient(int fd) {
2664 redisClient *c = zmalloc(sizeof(*c));
2665
2666 anetNonBlock(NULL,fd);
2667 anetTcpNoDelay(NULL,fd);
2668 if (!c) return NULL;
2669 selectDb(c,0);
2670 c->fd = fd;
2671 c->querybuf = sdsempty();
2672 c->argc = 0;
2673 c->argv = NULL;
2674 c->bulklen = -1;
2675 c->multibulk = 0;
2676 c->mbargc = 0;
2677 c->mbargv = NULL;
2678 c->sentlen = 0;
2679 c->flags = 0;
2680 c->lastinteraction = time(NULL);
2681 c->authenticated = 0;
2682 c->replstate = REDIS_REPL_NONE;
2683 c->reply = listCreate();
2684 listSetFreeMethod(c->reply,decrRefCount);
2685 listSetDupMethod(c->reply,dupClientReplyValue);
2686 c->blockingkeys = NULL;
2687 c->blockingkeysnum = 0;
2688 c->io_keys = listCreate();
2689 listSetFreeMethod(c->io_keys,decrRefCount);
2690 c->pubsub_channels = dictCreate(&setDictType,NULL);
2691 c->pubsub_patterns = listCreate();
2692 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2693 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2694 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2695 readQueryFromClient, c) == AE_ERR) {
2696 freeClient(c);
2697 return NULL;
2698 }
2699 listAddNodeTail(server.clients,c);
2700 initClientMultiState(c);
2701 return c;
2702 }
2703
2704 static void addReply(redisClient *c, robj *obj) {
2705 if (listLength(c->reply) == 0 &&
2706 (c->replstate == REDIS_REPL_NONE ||
2707 c->replstate == REDIS_REPL_ONLINE) &&
2708 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2709 sendReplyToClient, c) == AE_ERR) return;
2710
2711 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2712 obj = dupStringObject(obj);
2713 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2714 }
2715 listAddNodeTail(c->reply,getDecodedObject(obj));
2716 }
2717
2718 static void addReplySds(redisClient *c, sds s) {
2719 robj *o = createObject(REDIS_STRING,s);
2720 addReply(c,o);
2721 decrRefCount(o);
2722 }
2723
2724 static void addReplyDouble(redisClient *c, double d) {
2725 char buf[128];
2726
2727 snprintf(buf,sizeof(buf),"%.17g",d);
2728 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2729 (unsigned long) strlen(buf),buf));
2730 }
2731
2732 static void addReplyLong(redisClient *c, long l) {
2733 char buf[128];
2734 size_t len;
2735
2736 if (l == 0) {
2737 addReply(c,shared.czero);
2738 return;
2739 } else if (l == 1) {
2740 addReply(c,shared.cone);
2741 return;
2742 }
2743 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2744 addReplySds(c,sdsnewlen(buf,len));
2745 }
2746
2747 static void addReplyLongLong(redisClient *c, long long ll) {
2748 char buf[128];
2749 size_t len;
2750
2751 if (ll == 0) {
2752 addReply(c,shared.czero);
2753 return;
2754 } else if (ll == 1) {
2755 addReply(c,shared.cone);
2756 return;
2757 }
2758 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2759 addReplySds(c,sdsnewlen(buf,len));
2760 }
2761
2762 static void addReplyUlong(redisClient *c, unsigned long ul) {
2763 char buf[128];
2764 size_t len;
2765
2766 if (ul == 0) {
2767 addReply(c,shared.czero);
2768 return;
2769 } else if (ul == 1) {
2770 addReply(c,shared.cone);
2771 return;
2772 }
2773 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2774 addReplySds(c,sdsnewlen(buf,len));
2775 }
2776
2777 static void addReplyBulkLen(redisClient *c, robj *obj) {
2778 size_t len;
2779
2780 if (obj->encoding == REDIS_ENCODING_RAW) {
2781 len = sdslen(obj->ptr);
2782 } else {
2783 long n = (long)obj->ptr;
2784
2785 /* Compute how many bytes will take this integer as a radix 10 string */
2786 len = 1;
2787 if (n < 0) {
2788 len++;
2789 n = -n;
2790 }
2791 while((n = n/10) != 0) {
2792 len++;
2793 }
2794 }
2795 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2796 }
2797
2798 static void addReplyBulk(redisClient *c, robj *obj) {
2799 addReplyBulkLen(c,obj);
2800 addReply(c,obj);
2801 addReply(c,shared.crlf);
2802 }
2803
2804 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2805 static void addReplyBulkCString(redisClient *c, char *s) {
2806 if (s == NULL) {
2807 addReply(c,shared.nullbulk);
2808 } else {
2809 robj *o = createStringObject(s,strlen(s));
2810 addReplyBulk(c,o);
2811 decrRefCount(o);
2812 }
2813 }
2814
2815 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2816 int cport, cfd;
2817 char cip[128];
2818 redisClient *c;
2819 REDIS_NOTUSED(el);
2820 REDIS_NOTUSED(mask);
2821 REDIS_NOTUSED(privdata);
2822
2823 cfd = anetAccept(server.neterr, fd, cip, &cport);
2824 if (cfd == AE_ERR) {
2825 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2826 return;
2827 }
2828 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2829 if ((c = createClient(cfd)) == NULL) {
2830 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2831 close(cfd); /* May be already closed, just ingore errors */
2832 return;
2833 }
2834 /* If maxclient directive is set and this is one client more... close the
2835 * connection. Note that we create the client instead to check before
2836 * for this condition, since now the socket is already set in nonblocking
2837 * mode and we can send an error for free using the Kernel I/O */
2838 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2839 char *err = "-ERR max number of clients reached\r\n";
2840
2841 /* That's a best effort error message, don't check write errors */
2842 if (write(c->fd,err,strlen(err)) == -1) {
2843 /* Nothing to do, Just to avoid the warning... */
2844 }
2845 freeClient(c);
2846 return;
2847 }
2848 server.stat_numconnections++;
2849 }
2850
2851 /* ======================= Redis objects implementation ===================== */
2852
2853 static robj *createObject(int type, void *ptr) {
2854 robj *o;
2855
2856 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2857 if (listLength(server.objfreelist)) {
2858 listNode *head = listFirst(server.objfreelist);
2859 o = listNodeValue(head);
2860 listDelNode(server.objfreelist,head);
2861 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2862 } else {
2863 if (server.vm_enabled) {
2864 pthread_mutex_unlock(&server.obj_freelist_mutex);
2865 o = zmalloc(sizeof(*o));
2866 } else {
2867 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2868 }
2869 }
2870 o->type = type;
2871 o->encoding = REDIS_ENCODING_RAW;
2872 o->ptr = ptr;
2873 o->refcount = 1;
2874 if (server.vm_enabled) {
2875 /* Note that this code may run in the context of an I/O thread
2876 * and accessing to server.unixtime in theory is an error
2877 * (no locks). But in practice this is safe, and even if we read
2878 * garbage Redis will not fail, as it's just a statistical info */
2879 o->vm.atime = server.unixtime;
2880 o->storage = REDIS_VM_MEMORY;
2881 }
2882 return o;
2883 }
2884
2885 static robj *createStringObject(char *ptr, size_t len) {
2886 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2887 }
2888
2889 static robj *createStringObjectFromLongLong(long long value) {
2890 robj *o;
2891 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2892 incrRefCount(shared.integers[value]);
2893 o = shared.integers[value];
2894 } else {
2895 o = createObject(REDIS_STRING, NULL);
2896 if (value >= LONG_MIN && value <= LONG_MAX) {
2897 o->encoding = REDIS_ENCODING_INT;
2898 o->ptr = (void*)((long)value);
2899 } else {
2900 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2901 }
2902 }
2903 return o;
2904 }
2905
2906 static robj *dupStringObject(robj *o) {
2907 assert(o->encoding == REDIS_ENCODING_RAW);
2908 return createStringObject(o->ptr,sdslen(o->ptr));
2909 }
2910
2911 static robj *createListObject(void) {
2912 list *l = listCreate();
2913
2914 listSetFreeMethod(l,decrRefCount);
2915 return createObject(REDIS_LIST,l);
2916 }
2917
2918 static robj *createSetObject(void) {
2919 dict *d = dictCreate(&setDictType,NULL);
2920 return createObject(REDIS_SET,d);
2921 }
2922
2923 static robj *createHashObject(void) {
2924 /* All the Hashes start as zipmaps. Will be automatically converted
2925 * into hash tables if there are enough elements or big elements
2926 * inside. */
2927 unsigned char *zm = zipmapNew();
2928 robj *o = createObject(REDIS_HASH,zm);
2929 o->encoding = REDIS_ENCODING_ZIPMAP;
2930 return o;
2931 }
2932
2933 static robj *createZsetObject(void) {
2934 zset *zs = zmalloc(sizeof(*zs));
2935
2936 zs->dict = dictCreate(&zsetDictType,NULL);
2937 zs->zsl = zslCreate();
2938 return createObject(REDIS_ZSET,zs);
2939 }
2940
2941 static void freeStringObject(robj *o) {
2942 if (o->encoding == REDIS_ENCODING_RAW) {
2943 sdsfree(o->ptr);
2944 }
2945 }
2946
2947 static void freeListObject(robj *o) {
2948 listRelease((list*) o->ptr);
2949 }
2950
2951 static void freeSetObject(robj *o) {
2952 dictRelease((dict*) o->ptr);
2953 }
2954
2955 static void freeZsetObject(robj *o) {
2956 zset *zs = o->ptr;
2957
2958 dictRelease(zs->dict);
2959 zslFree(zs->zsl);
2960 zfree(zs);
2961 }
2962
2963 static void freeHashObject(robj *o) {
2964 switch (o->encoding) {
2965 case REDIS_ENCODING_HT:
2966 dictRelease((dict*) o->ptr);
2967 break;
2968 case REDIS_ENCODING_ZIPMAP:
2969 zfree(o->ptr);
2970 break;
2971 default:
2972 redisPanic("Unknown hash encoding type");
2973 break;
2974 }
2975 }
2976
2977 static void incrRefCount(robj *o) {
2978 o->refcount++;
2979 }
2980
2981 static void decrRefCount(void *obj) {
2982 robj *o = obj;
2983
2984 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2985 /* Object is a key of a swapped out value, or in the process of being
2986 * loaded. */
2987 if (server.vm_enabled &&
2988 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2989 {
2990 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2991 redisAssert(o->type == REDIS_STRING);
2992 freeStringObject(o);
2993 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2994 pthread_mutex_lock(&server.obj_freelist_mutex);
2995 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2996 !listAddNodeHead(server.objfreelist,o))
2997 zfree(o);
2998 pthread_mutex_unlock(&server.obj_freelist_mutex);
2999 server.vm_stats_swapped_objects--;
3000 return;
3001 }
3002 /* Object is in memory, or in the process of being swapped out. */
3003 if (--(o->refcount) == 0) {
3004 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3005 vmCancelThreadedIOJob(obj);
3006 switch(o->type) {
3007 case REDIS_STRING: freeStringObject(o); break;
3008 case REDIS_LIST: freeListObject(o); break;
3009 case REDIS_SET: freeSetObject(o); break;
3010 case REDIS_ZSET: freeZsetObject(o); break;
3011 case REDIS_HASH: freeHashObject(o); break;
3012 default: redisPanic("Unknown object type"); break;
3013 }
3014 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3015 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3016 !listAddNodeHead(server.objfreelist,o))
3017 zfree(o);
3018 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3019 }
3020 }
3021
3022 static robj *lookupKey(redisDb *db, robj *key) {
3023 dictEntry *de = dictFind(db->dict,key);
3024 if (de) {
3025 robj *key = dictGetEntryKey(de);
3026 robj *val = dictGetEntryVal(de);
3027
3028 if (server.vm_enabled) {
3029 if (key->storage == REDIS_VM_MEMORY ||
3030 key->storage == REDIS_VM_SWAPPING)
3031 {
3032 /* If we were swapping the object out, stop it, this key
3033 * was requested. */
3034 if (key->storage == REDIS_VM_SWAPPING)
3035 vmCancelThreadedIOJob(key);
3036 /* Update the access time of the key for the aging algorithm. */
3037 key->vm.atime = server.unixtime;
3038 } else {
3039 int notify = (key->storage == REDIS_VM_LOADING);
3040
3041 /* Our value was swapped on disk. Bring it at home. */
3042 redisAssert(val == NULL);
3043 val = vmLoadObject(key);
3044 dictGetEntryVal(de) = val;
3045
3046 /* Clients blocked by the VM subsystem may be waiting for
3047 * this key... */
3048 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3049 }
3050 }
3051 return val;
3052 } else {
3053 return NULL;
3054 }
3055 }
3056
3057 static robj *lookupKeyRead(redisDb *db, robj *key) {
3058 expireIfNeeded(db,key);
3059 return lookupKey(db,key);
3060 }
3061
3062 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3063 deleteIfVolatile(db,key);
3064 return lookupKey(db,key);
3065 }
3066
3067 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3068 robj *o = lookupKeyRead(c->db, key);
3069 if (!o) addReply(c,reply);
3070 return o;
3071 }
3072
3073 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3074 robj *o = lookupKeyWrite(c->db, key);
3075 if (!o) addReply(c,reply);
3076 return o;
3077 }
3078
3079 static int checkType(redisClient *c, robj *o, int type) {
3080 if (o->type != type) {
3081 addReply(c,shared.wrongtypeerr);
3082 return 1;
3083 }
3084 return 0;
3085 }
3086
3087 static int deleteKey(redisDb *db, robj *key) {
3088 int retval;
3089
3090 /* We need to protect key from destruction: after the first dictDelete()
3091 * it may happen that 'key' is no longer valid if we don't increment
3092 * it's count. This may happen when we get the object reference directly
3093 * from the hash table with dictRandomKey() or dict iterators */
3094 incrRefCount(key);
3095 if (dictSize(db->expires)) dictDelete(db->expires,key);
3096 retval = dictDelete(db->dict,key);
3097 decrRefCount(key);
3098
3099 return retval == DICT_OK;
3100 }
3101
3102 /* Check if the nul-terminated string 's' can be represented by a long
3103 * (that is, is a number that fits into long without any other space or
3104 * character before or after the digits).
3105 *
3106 * If so, the function returns REDIS_OK and *longval is set to the value
3107 * of the number. Otherwise REDIS_ERR is returned */
3108 static int isStringRepresentableAsLong(sds s, long *longval) {
3109 char buf[32], *endptr;
3110 long value;
3111 int slen;
3112
3113 value = strtol(s, &endptr, 10);
3114 if (endptr[0] != '\0') return REDIS_ERR;
3115 slen = snprintf(buf,32,"%ld",value);
3116
3117 /* If the number converted back into a string is not identical
3118 * then it's not possible to encode the string as integer */
3119 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3120 if (longval) *longval = value;
3121 return REDIS_OK;
3122 }
3123
3124 /* Try to encode a string object in order to save space */
3125 static robj *tryObjectEncoding(robj *o) {
3126 long value;
3127 sds s = o->ptr;
3128
3129 if (o->encoding != REDIS_ENCODING_RAW)
3130 return o; /* Already encoded */
3131
3132 /* It's not safe to encode shared objects: shared objects can be shared
3133 * everywhere in the "object space" of Redis. Encoded objects can only
3134 * appear as "values" (and not, for instance, as keys) */
3135 if (o->refcount > 1) return o;
3136
3137 /* Currently we try to encode only strings */
3138 redisAssert(o->type == REDIS_STRING);
3139
3140 /* Check if we can represent this string as a long integer */
3141 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3142
3143 /* Ok, this object can be encoded */
3144 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3145 decrRefCount(o);
3146 incrRefCount(shared.integers[value]);
3147 return shared.integers[value];
3148 } else {
3149 o->encoding = REDIS_ENCODING_INT;
3150 sdsfree(o->ptr);
3151 o->ptr = (void*) value;
3152 return o;
3153 }
3154 }
3155
3156 /* Get a decoded version of an encoded object (returned as a new object).
3157 * If the object is already raw-encoded just increment the ref count. */
3158 static robj *getDecodedObject(robj *o) {
3159 robj *dec;
3160
3161 if (o->encoding == REDIS_ENCODING_RAW) {
3162 incrRefCount(o);
3163 return o;
3164 }
3165 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3166 char buf[32];
3167
3168 snprintf(buf,32,"%ld",(long)o->ptr);
3169 dec = createStringObject(buf,strlen(buf));
3170 return dec;
3171 } else {
3172 redisPanic("Unknown encoding type");
3173 }
3174 }
3175
3176 /* Compare two string objects via strcmp() or alike.
3177 * Note that the objects may be integer-encoded. In such a case we
3178 * use snprintf() to get a string representation of the numbers on the stack
3179 * and compare the strings, it's much faster than calling getDecodedObject().
3180 *
3181 * Important note: if objects are not integer encoded, but binary-safe strings,
3182 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3183 * binary safe. */
3184 static int compareStringObjects(robj *a, robj *b) {
3185 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3186 char bufa[128], bufb[128], *astr, *bstr;
3187 int bothsds = 1;
3188
3189 if (a == b) return 0;
3190 if (a->encoding != REDIS_ENCODING_RAW) {
3191 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3192 astr = bufa;
3193 bothsds = 0;
3194 } else {
3195 astr = a->ptr;
3196 }
3197 if (b->encoding != REDIS_ENCODING_RAW) {
3198 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3199 bstr = bufb;
3200 bothsds = 0;
3201 } else {
3202 bstr = b->ptr;
3203 }
3204 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3205 }
3206
3207 /* Equal string objects return 1 if the two objects are the same from the
3208 * point of view of a string comparison, otherwise 0 is returned. Note that
3209 * this function is faster then checking for (compareStringObject(a,b) == 0)
3210 * because it can perform some more optimization. */
3211 static int equalStringObjects(robj *a, robj *b) {
3212 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3213 return a->ptr == b->ptr;
3214 } else {
3215 return compareStringObjects(a,b) == 0;
3216 }
3217 }
3218
3219 static size_t stringObjectLen(robj *o) {
3220 redisAssert(o->type == REDIS_STRING);
3221 if (o->encoding == REDIS_ENCODING_RAW) {
3222 return sdslen(o->ptr);
3223 } else {
3224 char buf[32];
3225
3226 return snprintf(buf,32,"%ld",(long)o->ptr);
3227 }
3228 }
3229
3230 static int getDoubleFromObject(robj *o, double *target) {
3231 double value;
3232 char *eptr;
3233
3234 if (o == NULL) {
3235 value = 0;
3236 } else {
3237 redisAssert(o->type == REDIS_STRING);
3238 if (o->encoding == REDIS_ENCODING_RAW) {
3239 value = strtod(o->ptr, &eptr);
3240 if (eptr[0] != '\0') return REDIS_ERR;
3241 } else if (o->encoding == REDIS_ENCODING_INT) {
3242 value = (long)o->ptr;
3243 } else {
3244 redisPanic("Unknown string encoding");
3245 }
3246 }
3247
3248 *target = value;
3249 return REDIS_OK;
3250 }
3251
3252 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3253 double value;
3254 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3255 if (msg != NULL) {
3256 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3257 } else {
3258 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3259 }
3260 return REDIS_ERR;
3261 }
3262
3263 *target = value;
3264 return REDIS_OK;
3265 }
3266
3267 static int getLongLongFromObject(robj *o, long long *target) {
3268 long long value;
3269 char *eptr;
3270
3271 if (o == NULL) {
3272 value = 0;
3273 } else {
3274 redisAssert(o->type == REDIS_STRING);
3275 if (o->encoding == REDIS_ENCODING_RAW) {
3276 value = strtoll(o->ptr, &eptr, 10);
3277 if (eptr[0] != '\0') return REDIS_ERR;
3278 } else if (o->encoding == REDIS_ENCODING_INT) {
3279 value = (long)o->ptr;
3280 } else {
3281 redisPanic("Unknown string encoding");
3282 }
3283 }
3284
3285 *target = value;
3286 return REDIS_OK;
3287 }
3288
3289 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3290 long long value;
3291 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3292 if (msg != NULL) {
3293 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3294 } else {
3295 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3296 }
3297 return REDIS_ERR;
3298 }
3299
3300 *target = value;
3301 return REDIS_OK;
3302 }
3303
3304 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3305 long long value;
3306
3307 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3308 if (value < LONG_MIN || value > LONG_MAX) {
3309 if (msg != NULL) {
3310 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3311 } else {
3312 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3313 }
3314 return REDIS_ERR;
3315 }
3316
3317 *target = value;
3318 return REDIS_OK;
3319 }
3320
3321 /*============================ RDB saving/loading =========================== */
3322
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if
 * nothing was written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3327
/* Write the time 't' to 'fp' as a 32 bit signed integer, copying the four
 * bytes as they are stored in memory. Returns 0 on success, -1 if nothing
 * was written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;
    size_t written = fwrite(&t32,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3333
3334 /* check rdbLoadLen() comments for more info */
3335 static int rdbSaveLen(FILE *fp, uint32_t len) {
3336 unsigned char buf[2];
3337
3338 if (len < (1<<6)) {
3339 /* Save a 6 bit len */
3340 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3341 if (fwrite(buf,1,1,fp) == 0) return -1;
3342 } else if (len < (1<<14)) {
3343 /* Save a 14 bit len */
3344 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3345 buf[1] = len&0xFF;
3346 if (fwrite(buf,2,1,fp) == 0) return -1;
3347 } else {
3348 /* Save a 32 bit len */
3349 buf[0] = (REDIS_RDB_32BITLEN<<6);
3350 if (fwrite(buf,1,1,fp) == 0) return -1;
3351 len = htonl(len);
3352 if (fwrite(&len,4,1,fp) == 0) return -1;
3353 }
3354 return 0;
3355 }
3356
3357 /* String objects in the form "2391" "-100" without any space and with a
3358 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3359 * encoded as integers to save space */
3360 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3361 long long value;
3362 char *endptr, buf[32];
3363
3364 /* Check if it's possible to encode this value as a number */
3365 value = strtoll(s, &endptr, 10);
3366 if (endptr[0] != '\0') return 0;
3367 snprintf(buf,32,"%lld",value);
3368
3369 /* If the number converted back into a string is not identical
3370 * then it's not possible to encode the string as integer */
3371 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3372
3373 /* Finally check if it fits in our ranges */
3374 if (value >= -(1<<7) && value <= (1<<7)-1) {
3375 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3376 enc[1] = value&0xFF;
3377 return 2;
3378 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3379 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3380 enc[1] = value&0xFF;
3381 enc[2] = (value>>8)&0xFF;
3382 return 3;
3383 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3384 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3385 enc[1] = value&0xFF;
3386 enc[2] = (value>>8)&0xFF;
3387 enc[3] = (value>>16)&0xFF;
3388 enc[4] = (value>>24)&0xFF;
3389 return 5;
3390 } else {
3391 return 0;
3392 }
3393 }
3394
3395 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3396 size_t comprlen, outlen;
3397 unsigned char byte;
3398 void *out;
3399
3400 /* We require at least four bytes compression for this to be worth it */
3401 if (len <= 4) return 0;
3402 outlen = len-4;
3403 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3404 comprlen = lzf_compress(s, len, out, outlen);
3405 if (comprlen == 0) {
3406 zfree(out);
3407 return 0;
3408 }
3409 /* Data compressed! Let's save it on disk */
3410 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3411 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3412 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3413 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3414 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3415 zfree(out);
3416 return comprlen;
3417
3418 writeerr:
3419 zfree(out);
3420 return -1;
3421 }
3422
3423 /* Save a string objet as [len][data] on disk. If the object is a string
3424 * representation of an integer value we try to safe it in a special form */
3425 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3426 int enclen;
3427
3428 /* Try integer encoding */
3429 if (len <= 11) {
3430 unsigned char buf[5];
3431 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3432 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3433 return 0;
3434 }
3435 }
3436
3437 /* Try LZF compression - under 20 bytes it's unable to compress even
3438 * aaaaaaaaaaaaaaaaaa so skip it */
3439 if (server.rdbcompression && len > 20) {
3440 int retval;
3441
3442 retval = rdbSaveLzfStringObject(fp,s,len);
3443 if (retval == -1) return -1;
3444 if (retval > 0) return 0;
3445 /* retval == 0 means data can't be compressed, save the old way */
3446 }
3447
3448 /* Store verbatim */
3449 if (rdbSaveLen(fp,len) == -1) return -1;
3450 if (len && fwrite(s,len,1,fp) == 0) return -1;
3451 return 0;
3452 }
3453
3454 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3455 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3456 int retval;
3457
3458 /* Avoid incr/decr ref count business when possible.
3459 * This plays well with copy-on-write given that we are probably
3460 * in a child process (BGSAVE). Also this makes sure key objects
3461 * of swapped objects are not incRefCount-ed (an assert does not allow
3462 * this in order to avoid bugs) */
3463 if (obj->encoding != REDIS_ENCODING_RAW) {
3464 obj = getDecodedObject(obj);
3465 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3466 decrRefCount(obj);
3467 } else {
3468 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3469 }
3470 return retval;
3471 }
3472
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    char *digits = (char*)buf+1;
    int len = 1; /* The special values take just the prefix byte. */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf(digits,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(digits);
        len += buf[0];
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3499
3500 /* Save a Redis object. */
3501 static int rdbSaveObject(FILE *fp, robj *o) {
3502 if (o->type == REDIS_STRING) {
3503 /* Save a string value */
3504 if (rdbSaveStringObject(fp,o) == -1) return -1;
3505 } else if (o->type == REDIS_LIST) {
3506 /* Save a list value */
3507 list *list = o->ptr;
3508 listIter li;
3509 listNode *ln;
3510
3511 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3512 listRewind(list,&li);
3513 while((ln = listNext(&li))) {
3514 robj *eleobj = listNodeValue(ln);
3515
3516 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3517 }
3518 } else if (o->type == REDIS_SET) {
3519 /* Save a set value */
3520 dict *set = o->ptr;
3521 dictIterator *di = dictGetIterator(set);
3522 dictEntry *de;
3523
3524 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3525 while((de = dictNext(di)) != NULL) {
3526 robj *eleobj = dictGetEntryKey(de);
3527
3528 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3529 }
3530 dictReleaseIterator(di);
3531 } else if (o->type == REDIS_ZSET) {
3532 /* Save a set value */
3533 zset *zs = o->ptr;
3534 dictIterator *di = dictGetIterator(zs->dict);
3535 dictEntry *de;
3536
3537 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3538 while((de = dictNext(di)) != NULL) {
3539 robj *eleobj = dictGetEntryKey(de);
3540 double *score = dictGetEntryVal(de);
3541
3542 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3543 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3544 }
3545 dictReleaseIterator(di);
3546 } else if (o->type == REDIS_HASH) {
3547 /* Save a hash value */
3548 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3549 unsigned char *p = zipmapRewind(o->ptr);
3550 unsigned int count = zipmapLen(o->ptr);
3551 unsigned char *key, *val;
3552 unsigned int klen, vlen;
3553
3554 if (rdbSaveLen(fp,count) == -1) return -1;
3555 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3556 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3557 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3558 }
3559 } else {
3560 dictIterator *di = dictGetIterator(o->ptr);
3561 dictEntry *de;
3562
3563 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3564 while((de = dictNext(di)) != NULL) {
3565 robj *key = dictGetEntryKey(de);
3566 robj *val = dictGetEntryVal(de);
3567
3568 if (rdbSaveStringObject(fp,key) == -1) return -1;
3569 if (rdbSaveStringObject(fp,val) == -1) return -1;
3570 }
3571 dictReleaseIterator(di);
3572 }
3573 } else {
3574 redisPanic("Unknown object type");
3575 }
3576 return 0;
3577 }
3578
3579 /* Return the length the object will have on disk if saved with
3580 * the rdbSaveObject() function. Currently we use a trick to get
3581 * this length with very little changes to the code. In the future
3582 * we could switch to a faster solution. */
3583 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3584 if (fp == NULL) fp = server.devnull;
3585 rewind(fp);
3586 assert(rdbSaveObject(fp,o) != 1);
3587 return ftello(fp);
3588 }
3589
3590 /* Return the number of pages required to save this object in the swap file */
3591 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3592 off_t bytes = rdbSavedObjectLen(o,fp);
3593
3594 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3595 }
3596
3597 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3598 static int rdbSave(char *filename) {
3599 dictIterator *di = NULL;
3600 dictEntry *de;
3601 FILE *fp;
3602 char tmpfile[256];
3603 int j;
3604 time_t now = time(NULL);
3605
3606 /* Wait for I/O therads to terminate, just in case this is a
3607 * foreground-saving, to avoid seeking the swap file descriptor at the
3608 * same time. */
3609 if (server.vm_enabled)
3610 waitEmptyIOJobsQueue();
3611
3612 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3613 fp = fopen(tmpfile,"w");
3614 if (!fp) {
3615 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3616 return REDIS_ERR;
3617 }
3618 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3619 for (j = 0; j < server.dbnum; j++) {
3620 redisDb *db = server.db+j;
3621 dict *d = db->dict;
3622 if (dictSize(d) == 0) continue;
3623 di = dictGetIterator(d);
3624 if (!di) {
3625 fclose(fp);
3626 return REDIS_ERR;
3627 }
3628
3629 /* Write the SELECT DB opcode */
3630 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3631 if (rdbSaveLen(fp,j) == -1) goto werr;
3632
3633 /* Iterate this DB writing every entry */
3634 while((de = dictNext(di)) != NULL) {
3635 robj *key = dictGetEntryKey(de);
3636 robj *o = dictGetEntryVal(de);
3637 time_t expiretime = getExpire(db,key);
3638
3639 /* Save the expire time */
3640 if (expiretime != -1) {
3641 /* If this key is already expired skip it */
3642 if (expiretime < now) continue;
3643 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3644 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3645 }
3646 /* Save the key and associated value. This requires special
3647 * handling if the value is swapped out. */
3648 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3649 key->storage == REDIS_VM_SWAPPING) {
3650 /* Save type, key, value */
3651 if (rdbSaveType(fp,o->type) == -1) goto werr;
3652 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3653 if (rdbSaveObject(fp,o) == -1) goto werr;
3654 } else {
3655 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3656 robj *po;
3657 /* Get a preview of the object in memory */
3658 po = vmPreviewObject(key);
3659 /* Save type, key, value */
3660 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3661 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3662 if (rdbSaveObject(fp,po) == -1) goto werr;
3663 /* Remove the loaded object from memory */
3664 decrRefCount(po);
3665 }
3666 }
3667 dictReleaseIterator(di);
3668 }
3669 /* EOF opcode */
3670 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3671
3672 /* Make sure data will not remain on the OS's output buffers */
3673 fflush(fp);
3674 fsync(fileno(fp));
3675 fclose(fp);
3676
3677 /* Use RENAME to make sure the DB file is changed atomically only
3678 * if the generate DB file is ok. */
3679 if (rename(tmpfile,filename) == -1) {
3680 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3681 unlink(tmpfile);
3682 return REDIS_ERR;
3683 }
3684 redisLog(REDIS_NOTICE,"DB saved on disk");
3685 server.dirty = 0;
3686 server.lastsave = time(NULL);
3687 return REDIS_OK;
3688
3689 werr:
3690 fclose(fp);
3691 unlink(tmpfile);
3692 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3693 if (di) dictReleaseIterator(di);
3694 return REDIS_ERR;
3695 }
3696
3697 static int rdbSaveBackground(char *filename) {
3698 pid_t childpid;
3699
3700 if (server.bgsavechildpid != -1) return REDIS_ERR;
3701 if (server.vm_enabled) waitEmptyIOJobsQueue();
3702 if ((childpid = fork()) == 0) {
3703 /* Child */
3704 if (server.vm_enabled) vmReopenSwapFile();
3705 close(server.fd);
3706 if (rdbSave(filename) == REDIS_OK) {
3707 _exit(0);
3708 } else {
3709 _exit(1);
3710 }
3711 } else {
3712 /* Parent */
3713 if (childpid == -1) {
3714 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3715 strerror(errno));
3716 return REDIS_ERR;
3717 }
3718 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3719 server.bgsavechildpid = childpid;
3720 updateDictResizePolicy();
3721 return REDIS_OK;
3722 }
3723 return REDIS_OK; /* unreached */
3724 }
3725
/* Remove the temporary dump file "temp-<pid>.rdb" of the process with
 * pid 'childpid' from the current directory. The result of unlink() is
 * not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3732
/* Read a single byte type/opcode from 'fp'. Returns the byte as an
 * unsigned value (0-255), or -1 if it was not possible to read it. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : (int) byte;
}
3738
/* Read a 4 byte time value from 'fp', in the byte order of the host, and
 * return it as a time_t. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,4,1,fp) != 1) return -1;
    return (time_t) raw;
}
3744
3745 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3746 * of this file for a description of how this are stored on disk.
3747 *
3748 * isencoded is set to 1 if the readed length is not actually a length but
3749 * an "encoding type", check the above comments for more info */
3750 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3751 unsigned char buf[2];
3752 uint32_t len;
3753 int type;
3754
3755 if (isencoded) *isencoded = 0;
3756 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3757 type = (buf[0]&0xC0)>>6;
3758 if (type == REDIS_RDB_6BITLEN) {
3759 /* Read a 6 bit len */
3760 return buf[0]&0x3F;
3761 } else if (type == REDIS_RDB_ENCVAL) {
3762 /* Read a 6 bit len encoding type */
3763 if (isencoded) *isencoded = 1;
3764 return buf[0]&0x3F;
3765 } else if (type == REDIS_RDB_14BITLEN) {
3766 /* Read a 14 bit len */
3767 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3768 return ((buf[0]&0x3F)<<8)|buf[1];
3769 } else {
3770 /* Read a 32 bit len */
3771 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3772 return ntohl(len);
3773 }
3774 }
3775
3776 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3777 unsigned char enc[4];
3778 long long val;
3779
3780 if (enctype == REDIS_RDB_ENC_INT8) {
3781 if (fread(enc,1,1,fp) == 0) return NULL;
3782 val = (signed char)enc[0];
3783 } else if (enctype == REDIS_RDB_ENC_INT16) {
3784 uint16_t v;
3785 if (fread(enc,2,1,fp) == 0) return NULL;
3786 v = enc[0]|(enc[1]<<8);
3787 val = (int16_t)v;
3788 } else if (enctype == REDIS_RDB_ENC_INT32) {
3789 uint32_t v;
3790 if (fread(enc,4,1,fp) == 0) return NULL;
3791 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3792 val = (int32_t)v;
3793 } else {
3794 val = 0; /* anti-warning */
3795 redisPanic("Unknown RDB integer encoding type");
3796 }
3797 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3798 }
3799
3800 static robj *rdbLoadLzfStringObject(FILE*fp) {
3801 unsigned int len, clen;
3802 unsigned char *c = NULL;
3803 sds val = NULL;
3804
3805 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3806 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3807 if ((c = zmalloc(clen)) == NULL) goto err;
3808 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3809 if (fread(c,clen,1,fp) == 0) goto err;
3810 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3811 zfree(c);
3812 return createObject(REDIS_STRING,val);
3813 err:
3814 zfree(c);
3815 sdsfree(val);
3816 return NULL;
3817 }
3818
3819 static robj *rdbLoadStringObject(FILE*fp) {
3820 int isencoded;
3821 uint32_t len;
3822 sds val;
3823
3824 len = rdbLoadLen(fp,&isencoded);
3825 if (isencoded) {
3826 switch(len) {
3827 case REDIS_RDB_ENC_INT8:
3828 case REDIS_RDB_ENC_INT16:
3829 case REDIS_RDB_ENC_INT32:
3830 return rdbLoadIntegerObject(fp,len);
3831 case REDIS_RDB_ENC_LZF:
3832 return rdbLoadLzfStringObject(fp);
3833 default:
3834 redisPanic("Unknown RDB encoding type");
3835 }
3836 }
3837
3838 if (len == REDIS_RDB_LENERR) return NULL;
3839 val = sdsnewlen(NULL,len);
3840 if (len && fread(val,len,1,fp) == 0) {
3841 sdsfree(val);
3842 return NULL;
3843 }
3844 return createObject(REDIS_STRING,val);
3845 }
3846
3847 /* For information about double serialization check rdbSaveDoubleValue() */
3848 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3849 char buf[128];
3850 unsigned char len;
3851
3852 if (fread(&len,1,1,fp) == 0) return -1;
3853 switch(len) {
3854 case 255: *val = R_NegInf; return 0;
3855 case 254: *val = R_PosInf; return 0;
3856 case 253: *val = R_Nan; return 0;
3857 default:
3858 if (fread(buf,len,1,fp) == 0) return -1;
3859 buf[len] = '\0';
3860 sscanf(buf, "%lg", val);
3861 return 0;
3862 }
3863 }
3864
3865 /* Load a Redis object of the specified type from the specified file.
3866 * On success a newly allocated object is returned, otherwise NULL. */
3867 static robj *rdbLoadObject(int type, FILE *fp) {
3868 robj *o;
3869
3870 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3871 if (type == REDIS_STRING) {
3872 /* Read string value */
3873 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3874 o = tryObjectEncoding(o);
3875 } else if (type == REDIS_LIST || type == REDIS_SET) {
3876 /* Read list/set value */
3877 uint32_t listlen;
3878
3879 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3880 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3881 /* It's faster to expand the dict to the right size asap in order
3882 * to avoid rehashing */
3883 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3884 dictExpand(o->ptr,listlen);
3885 /* Load every single element of the list/set */
3886 while(listlen--) {
3887 robj *ele;
3888
3889 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3890 ele = tryObjectEncoding(ele);
3891 if (type == REDIS_LIST) {
3892 listAddNodeTail((list*)o->ptr,ele);
3893 } else {
3894 dictAdd((dict*)o->ptr,ele,NULL);
3895 }
3896 }
3897 } else if (type == REDIS_ZSET) {
3898 /* Read list/set value */
3899 size_t zsetlen;
3900 zset *zs;
3901
3902 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3903 o = createZsetObject();
3904 zs = o->ptr;
3905 /* Load every single element of the list/set */
3906 while(zsetlen--) {
3907 robj *ele;
3908 double *score = zmalloc(sizeof(double));
3909
3910 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3911 ele = tryObjectEncoding(ele);
3912 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3913 dictAdd(zs->dict,ele,score);
3914 zslInsert(zs->zsl,*score,ele);
3915 incrRefCount(ele); /* added to skiplist */
3916 }
3917 } else if (type == REDIS_HASH) {
3918 size_t hashlen;
3919
3920 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3921 o = createHashObject();
3922 /* Too many entries? Use an hash table. */
3923 if (hashlen > server.hash_max_zipmap_entries)
3924 convertToRealHash(o);
3925 /* Load every key/value, then set it into the zipmap or hash
3926 * table, as needed. */
3927 while(hashlen--) {
3928 robj *key, *val;
3929
3930 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3931 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3932 /* If we are using a zipmap and there are too big values
3933 * the object is converted to real hash table encoding. */
3934 if (o->encoding != REDIS_ENCODING_HT &&
3935 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3936 sdslen(val->ptr) > server.hash_max_zipmap_value))
3937 {
3938 convertToRealHash(o);
3939 }
3940
3941 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3942 unsigned char *zm = o->ptr;
3943
3944 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3945 val->ptr,sdslen(val->ptr),NULL);
3946 o->ptr = zm;
3947 decrRefCount(key);
3948 decrRefCount(val);
3949 } else {
3950 key = tryObjectEncoding(key);
3951 val = tryObjectEncoding(val);
3952 dictAdd((dict*)o->ptr,key,val);
3953 }
3954 }
3955 } else {
3956 redisPanic("Unknown object type");
3957 }
3958 return o;
3959 }
3960
3961 static int rdbLoad(char *filename) {
3962 FILE *fp;
3963 uint32_t dbid;
3964 int type, retval, rdbver;
3965 int swap_all_values = 0;
3966 dict *d = server.db[0].dict;
3967 redisDb *db = server.db+0;
3968 char buf[1024];
3969 time_t expiretime, now = time(NULL);
3970 long long loadedkeys = 0;
3971
3972 fp = fopen(filename,"r");
3973 if (!fp) return REDIS_ERR;
3974 if (fread(buf,9,1,fp) == 0) goto eoferr;
3975 buf[9] = '\0';
3976 if (memcmp(buf,"REDIS",5) != 0) {
3977 fclose(fp);
3978 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3979 return REDIS_ERR;
3980 }
3981 rdbver = atoi(buf+5);
3982 if (rdbver != 1) {
3983 fclose(fp);
3984 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3985 return REDIS_ERR;
3986 }
3987 while(1) {
3988 robj *key, *val;
3989
3990 expiretime = -1;
3991 /* Read type. */
3992 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3993 if (type == REDIS_EXPIRETIME) {
3994 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3995 /* We read the time so we need to read the object type again */
3996 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3997 }
3998 if (type == REDIS_EOF) break;
3999 /* Handle SELECT DB opcode as a special case */
4000 if (type == REDIS_SELECTDB) {
4001 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
4002 goto eoferr;
4003 if (dbid >= (unsigned)server.dbnum) {
4004 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
4005 exit(1);
4006 }
4007 db = server.db+dbid;
4008 d = db->dict;
4009 continue;
4010 }
4011 /* Read key */
4012 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
4013 /* Read value */
4014 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
4015 /* Check if the key already expired */
4016 if (expiretime != -1 && expiretime < now) {
4017 decrRefCount(key);
4018 decrRefCount(val);
4019 continue;
4020 }
4021 /* Add the new object in the hash table */
4022 retval = dictAdd(d,key,val);
4023 if (retval == DICT_ERR) {
4024 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
4025 exit(1);
4026 }
4027 loadedkeys++;
4028 /* Set the expire time if needed */
4029 if (expiretime != -1) setExpire(db,key,expiretime);
4030
4031 /* Handle swapping while loading big datasets when VM is on */
4032
4033 /* If we detecter we are hopeless about fitting something in memory
4034 * we just swap every new key on disk. Directly...
4035 * Note that's important to check for this condition before resorting
4036 * to random sampling, otherwise we may try to swap already
4037 * swapped keys. */
4038 if (swap_all_values) {
4039 dictEntry *de = dictFind(d,key);
4040
4041 /* de may be NULL since the key already expired */
4042 if (de) {
4043 key = dictGetEntryKey(de);
4044 val = dictGetEntryVal(de);
4045
4046 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
4047 dictGetEntryVal(de) = NULL;
4048 }
4049 }
4050 continue;
4051 }
4052
4053 /* If we have still some hope of having some value fitting memory
4054 * then we try random sampling. */
4055 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
4056 while (zmalloc_used_memory() > server.vm_max_memory) {
4057 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4058 }
4059 if (zmalloc_used_memory() > server.vm_max_memory)
4060 swap_all_values = 1; /* We are already using too much mem */
4061 }
4062 }
4063 fclose(fp);
4064 return REDIS_OK;
4065
4066 eoferr: /* unexpected end of file is handled here with a fatal exit */
4067 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4068 exit(1);
4069 return REDIS_ERR; /* Just to avoid warning */
4070 }
4071
4072 /*================================== Commands =============================== */
4073
4074 static void authCommand(redisClient *c) {
4075 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4076 c->authenticated = 1;
4077 addReply(c,shared.ok);
4078 } else {
4079 c->authenticated = 0;
4080 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4081 }
4082 }
4083
4084 static void pingCommand(redisClient *c) {
4085 addReply(c,shared.pong);
4086 }
4087
4088 static void echoCommand(redisClient *c) {
4089 addReplyBulk(c,c->argv[1]);
4090 }
4091
4092 /*=================================== Strings =============================== */
4093
4094 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4095 int retval;
4096 long seconds = 0; /* initialized to avoid an harmness warning */
4097
4098 if (expire) {
4099 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4100 return;
4101 if (seconds <= 0) {
4102 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4103 return;
4104 }
4105 }
4106
4107 if (nx) deleteIfVolatile(c->db,key);
4108 retval = dictAdd(c->db->dict,key,val);
4109 if (retval == DICT_ERR) {
4110 if (!nx) {
4111 /* If the key is about a swapped value, we want a new key object
4112 * to overwrite the old. So we delete the old key in the database.
4113 * This will also make sure that swap pages about the old object
4114 * will be marked as free. */
4115 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4116 incrRefCount(key);
4117 dictReplace(c->db->dict,key,val);
4118 incrRefCount(val);
4119 } else {
4120 addReply(c,shared.czero);
4121 return;
4122 }
4123 } else {
4124 incrRefCount(key);
4125 incrRefCount(val);
4126 }
4127 server.dirty++;
4128 removeExpire(c->db,key);
4129 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4130 addReply(c, nx ? shared.cone : shared.ok);
4131 }
4132
4133 static void setCommand(redisClient *c) {
4134 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4135 }
4136
4137 static void setnxCommand(redisClient *c) {
4138 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4139 }
4140
4141 static void setexCommand(redisClient *c) {
4142 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4143 }
4144
4145 static int getGenericCommand(redisClient *c) {
4146 robj *o;
4147
4148 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4149 return REDIS_OK;
4150
4151 if (o->type != REDIS_STRING) {
4152 addReply(c,shared.wrongtypeerr);
4153 return REDIS_ERR;
4154 } else {
4155 addReplyBulk(c,o);
4156 return REDIS_OK;
4157 }
4158 }
4159
4160 static void getCommand(redisClient *c) {
4161 getGenericCommand(c);
4162 }
4163
4164 static void getsetCommand(redisClient *c) {
4165 if (getGenericCommand(c) == REDIS_ERR) return;
4166 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4167 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4168 } else {
4169 incrRefCount(c->argv[1]);
4170 }
4171 incrRefCount(c->argv[2]);
4172 server.dirty++;
4173 removeExpire(c->db,c->argv[1]);
4174 }
4175
4176 static void mgetCommand(redisClient *c) {
4177 int j;
4178
4179 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4180 for (j = 1; j < c->argc; j++) {
4181 robj *o = lookupKeyRead(c->db,c->argv[j]);
4182 if (o == NULL) {
4183 addReply(c,shared.nullbulk);
4184 } else {
4185 if (o->type != REDIS_STRING) {
4186 addReply(c,shared.nullbulk);
4187 } else {
4188 addReplyBulk(c,o);
4189 }
4190 }
4191 }
4192 }
4193
4194 static void msetGenericCommand(redisClient *c, int nx) {
4195 int j, busykeys = 0;
4196
4197 if ((c->argc % 2) == 0) {
4198 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4199 return;
4200 }
4201 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4202 * set nothing at all if at least one already key exists. */
4203 if (nx) {
4204 for (j = 1; j < c->argc; j += 2) {
4205 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4206 busykeys++;
4207 }
4208 }
4209 }
4210 if (busykeys) {
4211 addReply(c, shared.czero);
4212 return;
4213 }
4214
4215 for (j = 1; j < c->argc; j += 2) {
4216 int retval;
4217
4218 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4219 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4220 if (retval == DICT_ERR) {
4221 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4222 incrRefCount(c->argv[j+1]);
4223 } else {
4224 incrRefCount(c->argv[j]);
4225 incrRefCount(c->argv[j+1]);
4226 }
4227 removeExpire(c->db,c->argv[j]);
4228 }
4229 server.dirty += (c->argc-1)/2;
4230 addReply(c, nx ? shared.cone : shared.ok);
4231 }
4232
4233 static void msetCommand(redisClient *c) {
4234 msetGenericCommand(c,0);
4235 }
4236
4237 static void msetnxCommand(redisClient *c) {
4238 msetGenericCommand(c,1);
4239 }
4240
4241 static void incrDecrCommand(redisClient *c, long long incr) {
4242 long long value;
4243 int retval;
4244 robj *o;
4245
4246 o = lookupKeyWrite(c->db,c->argv[1]);
4247
4248 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4249
4250 value += incr;
4251 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4252 o = tryObjectEncoding(o);
4253 retval = dictAdd(c->db->dict,c->argv[1],o);
4254 if (retval == DICT_ERR) {
4255 dictReplace(c->db->dict,c->argv[1],o);
4256 removeExpire(c->db,c->argv[1]);
4257 } else {
4258 incrRefCount(c->argv[1]);
4259 }
4260 server.dirty++;
4261 addReply(c,shared.colon);
4262 addReply(c,o);
4263 addReply(c,shared.crlf);
4264 }
4265
4266 static void incrCommand(redisClient *c) {
4267 incrDecrCommand(c,1);
4268 }
4269
4270 static void decrCommand(redisClient *c) {
4271 incrDecrCommand(c,-1);
4272 }
4273
4274 static void incrbyCommand(redisClient *c) {
4275 long long incr;
4276
4277 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4278 incrDecrCommand(c,incr);
4279 }
4280
4281 static void decrbyCommand(redisClient *c) {
4282 long long incr;
4283
4284 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4285 incrDecrCommand(c,-incr);
4286 }
4287
4288 static void appendCommand(redisClient *c) {
4289 int retval;
4290 size_t totlen;
4291 robj *o;
4292
4293 o = lookupKeyWrite(c->db,c->argv[1]);
4294 if (o == NULL) {
4295 /* Create the key */
4296 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4297 incrRefCount(c->argv[1]);
4298 incrRefCount(c->argv[2]);
4299 totlen = stringObjectLen(c->argv[2]);
4300 } else {
4301 dictEntry *de;
4302
4303 de = dictFind(c->db->dict,c->argv[1]);
4304 assert(de != NULL);
4305
4306 o = dictGetEntryVal(de);
4307 if (o->type != REDIS_STRING) {
4308 addReply(c,shared.wrongtypeerr);
4309 return;
4310 }
4311 /* If the object is specially encoded or shared we have to make
4312 * a copy */
4313 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4314 robj *decoded = getDecodedObject(o);
4315
4316 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4317 decrRefCount(decoded);
4318 dictReplace(c->db->dict,c->argv[1],o);
4319 }
4320 /* APPEND! */
4321 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4322 o->ptr = sdscatlen(o->ptr,
4323 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4324 } else {
4325 o->ptr = sdscatprintf(o->ptr, "%ld",
4326 (unsigned long) c->argv[2]->ptr);
4327 }
4328 totlen = sdslen(o->ptr);
4329 }
4330 server.dirty++;
4331 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4332 }
4333
4334 static void substrCommand(redisClient *c) {
4335 robj *o;
4336 long start = atoi(c->argv[2]->ptr);
4337 long end = atoi(c->argv[3]->ptr);
4338 size_t rangelen, strlen;
4339 sds range;
4340
4341 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4342 checkType(c,o,REDIS_STRING)) return;
4343
4344 o = getDecodedObject(o);
4345 strlen = sdslen(o->ptr);
4346
4347 /* convert negative indexes */
4348 if (start < 0) start = strlen+start;
4349 if (end < 0) end = strlen+end;
4350 if (start < 0) start = 0;
4351 if (end < 0) end = 0;
4352
4353 /* indexes sanity checks */
4354 if (start > end || (size_t)start >= strlen) {
4355 /* Out of range start or start > end result in null reply */
4356 addReply(c,shared.nullbulk);
4357 decrRefCount(o);
4358 return;
4359 }
4360 if ((size_t)end >= strlen) end = strlen-1;
4361 rangelen = (end-start)+1;
4362
4363 /* Return the result */
4364 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4365 range = sdsnewlen((char*)o->ptr+start,rangelen);
4366 addReplySds(c,range);
4367 addReply(c,shared.crlf);
4368 decrRefCount(o);
4369 }
4370
4371 /* ========================= Type agnostic commands ========================= */
4372
4373 static void delCommand(redisClient *c) {
4374 int deleted = 0, j;
4375
4376 for (j = 1; j < c->argc; j++) {
4377 if (deleteKey(c->db,c->argv[j])) {
4378 server.dirty++;
4379 deleted++;
4380 }
4381 }
4382 addReplyLong(c,deleted);
4383 }
4384
4385 static void existsCommand(redisClient *c) {
4386 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4387 }
4388
4389 static void selectCommand(redisClient *c) {
4390 int id = atoi(c->argv[1]->ptr);
4391
4392 if (selectDb(c,id) == REDIS_ERR) {
4393 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4394 } else {
4395 addReply(c,shared.ok);
4396 }
4397 }
4398
4399 static void randomkeyCommand(redisClient *c) {
4400 dictEntry *de;
4401 robj *key;
4402
4403 while(1) {
4404 de = dictGetRandomKey(c->db->dict);
4405 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4406 }
4407
4408 if (de == NULL) {
4409 addReply(c,shared.nullbulk);
4410 return;
4411 }
4412
4413 key = dictGetEntryKey(de);
4414 if (server.vm_enabled) {
4415 key = dupStringObject(key);
4416 addReplyBulk(c,key);
4417 decrRefCount(key);
4418 } else {
4419 addReplyBulk(c,key);
4420 }
4421 }
4422
4423 static void keysCommand(redisClient *c) {
4424 dictIterator *di;
4425 dictEntry *de;
4426 sds pattern = c->argv[1]->ptr;
4427 int plen = sdslen(pattern);
4428 unsigned long numkeys = 0;
4429 robj *lenobj = createObject(REDIS_STRING,NULL);
4430
4431 di = dictGetIterator(c->db->dict);
4432 addReply(c,lenobj);
4433 decrRefCount(lenobj);
4434 while((de = dictNext(di)) != NULL) {
4435 robj *keyobj = dictGetEntryKey(de);
4436
4437 sds key = keyobj->ptr;
4438 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4439 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4440 if (expireIfNeeded(c->db,keyobj) == 0) {
4441 addReplyBulk(c,keyobj);
4442 numkeys++;
4443 }
4444 }
4445 }
4446 dictReleaseIterator(di);
4447 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4448 }
4449
4450 static void dbsizeCommand(redisClient *c) {
4451 addReplySds(c,
4452 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4453 }
4454
4455 static void lastsaveCommand(redisClient *c) {
4456 addReplySds(c,
4457 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4458 }
4459
4460 static void typeCommand(redisClient *c) {
4461 robj *o;
4462 char *type;
4463
4464 o = lookupKeyRead(c->db,c->argv[1]);
4465 if (o == NULL) {
4466 type = "+none";
4467 } else {
4468 switch(o->type) {
4469 case REDIS_STRING: type = "+string"; break;
4470 case REDIS_LIST: type = "+list"; break;
4471 case REDIS_SET: type = "+set"; break;
4472 case REDIS_ZSET: type = "+zset"; break;
4473 case REDIS_HASH: type = "+hash"; break;
4474 default: type = "+unknown"; break;
4475 }
4476 }
4477 addReplySds(c,sdsnew(type));
4478 addReply(c,shared.crlf);
4479 }
4480
4481 static void saveCommand(redisClient *c) {
4482 if (server.bgsavechildpid != -1) {
4483 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4484 return;
4485 }
4486 if (rdbSave(server.dbfilename) == REDIS_OK) {
4487 addReply(c,shared.ok);
4488 } else {
4489 addReply(c,shared.err);
4490 }
4491 }
4492
4493 static void bgsaveCommand(redisClient *c) {
4494 if (server.bgsavechildpid != -1) {
4495 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4496 return;
4497 }
4498 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4499 char *status = "+Background saving started\r\n";
4500 addReplySds(c,sdsnew(status));
4501 } else {
4502 addReply(c,shared.err);
4503 }
4504 }
4505
4506 static void shutdownCommand(redisClient *c) {
4507 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4508 /* Kill the saving child if there is a background saving in progress.
4509 We want to avoid race conditions, for instance our saving child may
4510 overwrite the synchronous saving did by SHUTDOWN. */
4511 if (server.bgsavechildpid != -1) {
4512 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4513 kill(server.bgsavechildpid,SIGKILL);
4514 rdbRemoveTempFile(server.bgsavechildpid);
4515 }
4516 if (server.appendonly) {
4517 /* Append only file: fsync() the AOF and exit */
4518 fsync(server.appendfd);
4519 if (server.vm_enabled) unlink(server.vm_swap_file);
4520 exit(0);
4521 } else {
4522 /* Snapshotting. Perform a SYNC SAVE and exit */
4523 if (rdbSave(server.dbfilename) == REDIS_OK) {
4524 if (server.daemonize)
4525 unlink(server.pidfile);
4526 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4527 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4528 exit(0);
4529 } else {
4530 /* Ooops.. error saving! The best we can do is to continue
4531 * operating. Note that if there was a background saving process,
4532 * in the next cron() Redis will be notified that the background
4533 * saving aborted, handling special stuff like slaves pending for
4534 * synchronization... */
4535 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4536 addReplySds(c,
4537 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4538 }
4539 }
4540 }
4541
4542 static void renameGenericCommand(redisClient *c, int nx) {
4543 robj *o;
4544
4545 /* To use the same key as src and dst is probably an error */
4546 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4547 addReply(c,shared.sameobjecterr);
4548 return;
4549 }
4550
4551 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4552 return;
4553
4554 incrRefCount(o);
4555 deleteIfVolatile(c->db,c->argv[2]);
4556 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4557 if (nx) {
4558 decrRefCount(o);
4559 addReply(c,shared.czero);
4560 return;
4561 }
4562 dictReplace(c->db->dict,c->argv[2],o);
4563 } else {
4564 incrRefCount(c->argv[2]);
4565 }
4566 deleteKey(c->db,c->argv[1]);
4567 server.dirty++;
4568 addReply(c,nx ? shared.cone : shared.ok);
4569 }
4570
4571 static void renameCommand(redisClient *c) {
4572 renameGenericCommand(c,0);
4573 }
4574
4575 static void renamenxCommand(redisClient *c) {
4576 renameGenericCommand(c,1);
4577 }
4578
4579 static void moveCommand(redisClient *c) {
4580 robj *o;
4581 redisDb *src, *dst;
4582 int srcid;
4583
4584 /* Obtain source and target DB pointers */
4585 src = c->db;
4586 srcid = c->db->id;
4587 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4588 addReply(c,shared.outofrangeerr);
4589 return;
4590 }
4591 dst = c->db;
4592 selectDb(c,srcid); /* Back to the source DB */
4593
4594 /* If the user is moving using as target the same
4595 * DB as the source DB it is probably an error. */
4596 if (src == dst) {
4597 addReply(c,shared.sameobjecterr);
4598 return;
4599 }
4600
4601 /* Check if the element exists and get a reference */
4602 o = lookupKeyWrite(c->db,c->argv[1]);
4603 if (!o) {
4604 addReply(c,shared.czero);
4605 return;
4606 }
4607
4608 /* Try to add the element to the target DB */
4609 deleteIfVolatile(dst,c->argv[1]);
4610 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4611 addReply(c,shared.czero);
4612 return;
4613 }
4614 incrRefCount(c->argv[1]);
4615 incrRefCount(o);
4616
4617 /* OK! key moved, free the entry in the source DB */
4618 deleteKey(src,c->argv[1]);
4619 server.dirty++;
4620 addReply(c,shared.cone);
4621 }
4622
4623 /* =================================== Lists ================================ */
4624 static void pushGenericCommand(redisClient *c, int where) {
4625 robj *lobj;
4626 list *list;
4627
4628 lobj = lookupKeyWrite(c->db,c->argv[1]);
4629 if (lobj == NULL) {
4630 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4631 addReply(c,shared.cone);
4632 return;
4633 }
4634 lobj = createListObject();
4635 list = lobj->ptr;
4636 if (where == REDIS_HEAD) {
4637 listAddNodeHead(list,c->argv[2]);
4638 } else {
4639 listAddNodeTail(list,c->argv[2]);
4640 }
4641 dictAdd(c->db->dict,c->argv[1],lobj);
4642 incrRefCount(c->argv[1]);
4643 incrRefCount(c->argv[2]);
4644 } else {
4645 if (lobj->type != REDIS_LIST) {
4646 addReply(c,shared.wrongtypeerr);
4647 return;
4648 }
4649 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4650 addReply(c,shared.cone);
4651 return;
4652 }
4653 list = lobj->ptr;
4654 if (where == REDIS_HEAD) {
4655 listAddNodeHead(list,c->argv[2]);
4656 } else {
4657 listAddNodeTail(list,c->argv[2]);
4658 }
4659 incrRefCount(c->argv[2]);
4660 }
4661 server.dirty++;
4662 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4663 }
4664
4665 static void lpushCommand(redisClient *c) {
4666 pushGenericCommand(c,REDIS_HEAD);
4667 }
4668
4669 static void rpushCommand(redisClient *c) {
4670 pushGenericCommand(c,REDIS_TAIL);
4671 }
4672
4673 static void llenCommand(redisClient *c) {
4674 robj *o;
4675 list *l;
4676
4677 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4678 checkType(c,o,REDIS_LIST)) return;
4679
4680 l = o->ptr;
4681 addReplyUlong(c,listLength(l));
4682 }
4683
4684 static void lindexCommand(redisClient *c) {
4685 robj *o;
4686 int index = atoi(c->argv[2]->ptr);
4687 list *list;
4688 listNode *ln;
4689
4690 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4691 checkType(c,o,REDIS_LIST)) return;
4692 list = o->ptr;
4693
4694 ln = listIndex(list, index);
4695 if (ln == NULL) {
4696 addReply(c,shared.nullbulk);
4697 } else {
4698 robj *ele = listNodeValue(ln);
4699 addReplyBulk(c,ele);
4700 }
4701 }
4702
4703 static void lsetCommand(redisClient *c) {
4704 robj *o;
4705 int index = atoi(c->argv[2]->ptr);
4706 list *list;
4707 listNode *ln;
4708
4709 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4710 checkType(c,o,REDIS_LIST)) return;
4711 list = o->ptr;
4712
4713 ln = listIndex(list, index);
4714 if (ln == NULL) {
4715 addReply(c,shared.outofrangeerr);
4716 } else {
4717 robj *ele = listNodeValue(ln);
4718
4719 decrRefCount(ele);
4720 listNodeValue(ln) = c->argv[3];
4721 incrRefCount(c->argv[3]);
4722 addReply(c,shared.ok);
4723 server.dirty++;
4724 }
4725 }
4726
4727 static void popGenericCommand(redisClient *c, int where) {
4728 robj *o;
4729 list *list;
4730 listNode *ln;
4731
4732 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4733 checkType(c,o,REDIS_LIST)) return;
4734 list = o->ptr;
4735
4736 if (where == REDIS_HEAD)
4737 ln = listFirst(list);
4738 else
4739 ln = listLast(list);
4740
4741 if (ln == NULL) {
4742 addReply(c,shared.nullbulk);
4743 } else {
4744 robj *ele = listNodeValue(ln);
4745 addReplyBulk(c,ele);
4746 listDelNode(list,ln);
4747 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4748 server.dirty++;
4749 }
4750 }
4751
4752 static void lpopCommand(redisClient *c) {
4753 popGenericCommand(c,REDIS_HEAD);
4754 }
4755
4756 static void rpopCommand(redisClient *c) {
4757 popGenericCommand(c,REDIS_TAIL);
4758 }
4759
4760 static void lrangeCommand(redisClient *c) {
4761 robj *o;
4762 int start = atoi(c->argv[2]->ptr);
4763 int end = atoi(c->argv[3]->ptr);
4764 int llen;
4765 int rangelen, j;
4766 list *list;
4767 listNode *ln;
4768 robj *ele;
4769
4770 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4771 || checkType(c,o,REDIS_LIST)) return;
4772 list = o->ptr;
4773 llen = listLength(list);
4774
4775 /* convert negative indexes */
4776 if (start < 0) start = llen+start;
4777 if (end < 0) end = llen+end;
4778 if (start < 0) start = 0;
4779 if (end < 0) end = 0;
4780
4781 /* indexes sanity checks */
4782 if (start > end || start >= llen) {
4783 /* Out of range start or start > end result in empty list */
4784 addReply(c,shared.emptymultibulk);
4785 return;
4786 }
4787 if (end >= llen) end = llen-1;
4788 rangelen = (end-start)+1;
4789
4790 /* Return the result in form of a multi-bulk reply */
4791 ln = listIndex(list, start);
4792 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4793 for (j = 0; j < rangelen; j++) {
4794 ele = listNodeValue(ln);
4795 addReplyBulk(c,ele);
4796 ln = ln->next;
4797 }
4798 }
4799
4800 static void ltrimCommand(redisClient *c) {
4801 robj *o;
4802 int start = atoi(c->argv[2]->ptr);
4803 int end = atoi(c->argv[3]->ptr);
4804 int llen;
4805 int j, ltrim, rtrim;
4806 list *list;
4807 listNode *ln;
4808
4809 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4810 checkType(c,o,REDIS_LIST)) return;
4811 list = o->ptr;
4812 llen = listLength(list);
4813
4814 /* convert negative indexes */
4815 if (start < 0) start = llen+start;
4816 if (end < 0) end = llen+end;
4817 if (start < 0) start = 0;
4818 if (end < 0) end = 0;
4819
4820 /* indexes sanity checks */
4821 if (start > end || start >= llen) {
4822 /* Out of range start or start > end result in empty list */
4823 ltrim = llen;
4824 rtrim = 0;
4825 } else {
4826 if (end >= llen) end = llen-1;
4827 ltrim = start;
4828 rtrim = llen-end-1;
4829 }
4830
4831 /* Remove list elements to perform the trim */
4832 for (j = 0; j < ltrim; j++) {
4833 ln = listFirst(list);
4834 listDelNode(list,ln);
4835 }
4836 for (j = 0; j < rtrim; j++) {
4837 ln = listLast(list);
4838 listDelNode(list,ln);
4839 }
4840 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4841 server.dirty++;
4842 addReply(c,shared.ok);
4843 }
4844
4845 static void lremCommand(redisClient *c) {
4846 robj *o;
4847 list *list;
4848 listNode *ln, *next;
4849 int toremove = atoi(c->argv[2]->ptr);
4850 int removed = 0;
4851 int fromtail = 0;
4852
4853 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4854 checkType(c,o,REDIS_LIST)) return;
4855 list = o->ptr;
4856
4857 if (toremove < 0) {
4858 toremove = -toremove;
4859 fromtail = 1;
4860 }
4861 ln = fromtail ? list->tail : list->head;
4862 while (ln) {
4863 robj *ele = listNodeValue(ln);
4864
4865 next = fromtail ? ln->prev : ln->next;
4866 if (equalStringObjects(ele,c->argv[3])) {
4867 listDelNode(list,ln);
4868 server.dirty++;
4869 removed++;
4870 if (toremove && removed == toremove) break;
4871 }
4872 ln = next;
4873 }
4874 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4875 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4876 }
4877
4878 /* This is the semantic of this command:
4879 * RPOPLPUSH srclist dstlist:
4880 * IF LLEN(srclist) > 0
4881 * element = RPOP srclist
4882 * LPUSH dstlist element
4883 * RETURN element
4884 * ELSE
4885 * RETURN nil
4886 * END
4887 * END
4888 *
4889 * The idea is to be able to get an element from a list in a reliable way
4890 * since the element is not just returned but pushed against another list
4891 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4892 */
4893 static void rpoplpushcommand(redisClient *c) {
4894 robj *sobj;
4895 list *srclist;
4896 listNode *ln;
4897
4898 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4899 checkType(c,sobj,REDIS_LIST)) return;
4900 srclist = sobj->ptr;
4901 ln = listLast(srclist);
4902
4903 if (ln == NULL) {
4904 addReply(c,shared.nullbulk);
4905 } else {
4906 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4907 robj *ele = listNodeValue(ln);
4908 list *dstlist;
4909
4910 if (dobj && dobj->type != REDIS_LIST) {
4911 addReply(c,shared.wrongtypeerr);
4912 return;
4913 }
4914
4915 /* Add the element to the target list (unless it's directly
4916 * passed to some BLPOP-ing client */
4917 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4918 if (dobj == NULL) {
4919 /* Create the list if the key does not exist */
4920 dobj = createListObject();
4921 dictAdd(c->db->dict,c->argv[2],dobj);
4922 incrRefCount(c->argv[2]);
4923 }
4924 dstlist = dobj->ptr;
4925 listAddNodeHead(dstlist,ele);
4926 incrRefCount(ele);
4927 }
4928
4929 /* Send the element to the client as reply as well */
4930 addReplyBulk(c,ele);
4931
4932 /* Finally remove the element from the source list */
4933 listDelNode(srclist,ln);
4934 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4935 server.dirty++;
4936 }
4937 }
4938
4939 /* ==================================== Sets ================================ */
4940
4941 static void saddCommand(redisClient *c) {
4942 robj *set;
4943
4944 set = lookupKeyWrite(c->db,c->argv[1]);
4945 if (set == NULL) {
4946 set = createSetObject();
4947 dictAdd(c->db->dict,c->argv[1],set);
4948 incrRefCount(c->argv[1]);
4949 } else {
4950 if (set->type != REDIS_SET) {
4951 addReply(c,shared.wrongtypeerr);
4952 return;
4953 }
4954 }
4955 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4956 incrRefCount(c->argv[2]);
4957 server.dirty++;
4958 addReply(c,shared.cone);
4959 } else {
4960 addReply(c,shared.czero);
4961 }
4962 }
4963
4964 static void sremCommand(redisClient *c) {
4965 robj *set;
4966
4967 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4968 checkType(c,set,REDIS_SET)) return;
4969
4970 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4971 server.dirty++;
4972 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4973 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4974 addReply(c,shared.cone);
4975 } else {
4976 addReply(c,shared.czero);
4977 }
4978 }
4979
4980 static void smoveCommand(redisClient *c) {
4981 robj *srcset, *dstset;
4982
4983 srcset = lookupKeyWrite(c->db,c->argv[1]);
4984 dstset = lookupKeyWrite(c->db,c->argv[2]);
4985
4986 /* If the source key does not exist return 0, if it's of the wrong type
4987 * raise an error */
4988 if (srcset == NULL || srcset->type != REDIS_SET) {
4989 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4990 return;
4991 }
4992 /* Error if the destination key is not a set as well */
4993 if (dstset && dstset->type != REDIS_SET) {
4994 addReply(c,shared.wrongtypeerr);
4995 return;
4996 }
4997 /* Remove the element from the source set */
4998 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4999 /* Key not found in the src set! return zero */
5000 addReply(c,shared.czero);
5001 return;
5002 }
5003 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5004 deleteKey(c->db,c->argv[1]);
5005 server.dirty++;
5006 /* Add the element to the destination set */
5007 if (!dstset) {
5008 dstset = createSetObject();
5009 dictAdd(c->db->dict,c->argv[2],dstset);
5010 incrRefCount(c->argv[2]);
5011 }
5012 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5013 incrRefCount(c->argv[3]);
5014 addReply(c,shared.cone);
5015 }
5016
5017 static void sismemberCommand(redisClient *c) {
5018 robj *set;
5019
5020 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5021 checkType(c,set,REDIS_SET)) return;
5022
5023 if (dictFind(set->ptr,c->argv[2]))
5024 addReply(c,shared.cone);
5025 else
5026 addReply(c,shared.czero);
5027 }
5028
5029 static void scardCommand(redisClient *c) {
5030 robj *o;
5031 dict *s;
5032
5033 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5034 checkType(c,o,REDIS_SET)) return;
5035
5036 s = o->ptr;
5037 addReplyUlong(c,dictSize(s));
5038 }
5039
5040 static void spopCommand(redisClient *c) {
5041 robj *set;
5042 dictEntry *de;
5043
5044 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5045 checkType(c,set,REDIS_SET)) return;
5046
5047 de = dictGetRandomKey(set->ptr);
5048 if (de == NULL) {
5049 addReply(c,shared.nullbulk);
5050 } else {
5051 robj *ele = dictGetEntryKey(de);
5052
5053 addReplyBulk(c,ele);
5054 dictDelete(set->ptr,ele);
5055 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5056 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5057 server.dirty++;
5058 }
5059 }
5060
5061 static void srandmemberCommand(redisClient *c) {
5062 robj *set;
5063 dictEntry *de;
5064
5065 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5066 checkType(c,set,REDIS_SET)) return;
5067
5068 de = dictGetRandomKey(set->ptr);
5069 if (de == NULL) {
5070 addReply(c,shared.nullbulk);
5071 } else {
5072 robj *ele = dictGetEntryKey(de);
5073
5074 addReplyBulk(c,ele);
5075 }
5076 }
5077
5078 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5079 dict **d1 = (void*) s1, **d2 = (void*) s2;
5080
5081 return dictSize(*d1)-dictSize(*d2);
5082 }
5083
5084 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
5085 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5086 dictIterator *di;
5087 dictEntry *de;
5088 robj *lenobj = NULL, *dstset = NULL;
5089 unsigned long j, cardinality = 0;
5090
5091 for (j = 0; j < setsnum; j++) {
5092 robj *setobj;
5093
5094 setobj = dstkey ?
5095 lookupKeyWrite(c->db,setskeys[j]) :
5096 lookupKeyRead(c->db,setskeys[j]);
5097 if (!setobj) {
5098 zfree(dv);
5099 if (dstkey) {
5100 if (deleteKey(c->db,dstkey))
5101 server.dirty++;
5102 addReply(c,shared.czero);
5103 } else {
5104 addReply(c,shared.emptymultibulk);
5105 }
5106 return;
5107 }
5108 if (setobj->type != REDIS_SET) {
5109 zfree(dv);
5110 addReply(c,shared.wrongtypeerr);
5111 return;
5112 }
5113 dv[j] = setobj->ptr;
5114 }
5115 /* Sort sets from the smallest to largest, this will improve our
5116 * algorithm's performace */
5117 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5118
5119 /* The first thing we should output is the total number of elements...
5120 * since this is a multi-bulk write, but at this stage we don't know
5121 * the intersection set size, so we use a trick, append an empty object
5122 * to the output list and save the pointer to later modify it with the
5123 * right length */
5124 if (!dstkey) {
5125 lenobj = createObject(REDIS_STRING,NULL);
5126 addReply(c,lenobj);
5127 decrRefCount(lenobj);
5128 } else {
5129 /* If we have a target key where to store the resulting set
5130 * create this key with an empty set inside */
5131 dstset = createSetObject();
5132 }
5133
5134 /* Iterate all the elements of the first (smallest) set, and test
5135 * the element against all the other sets, if at least one set does
5136 * not include the element it is discarded */
5137 di = dictGetIterator(dv[0]);
5138
5139 while((de = dictNext(di)) != NULL) {
5140 robj *ele;
5141
5142 for (j = 1; j < setsnum; j++)
5143 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5144 if (j != setsnum)
5145 continue; /* at least one set does not contain the member */
5146 ele = dictGetEntryKey(de);
5147 if (!dstkey) {
5148 addReplyBulk(c,ele);
5149 cardinality++;
5150 } else {
5151 dictAdd(dstset->ptr,ele,NULL);
5152 incrRefCount(ele);
5153 }
5154 }
5155 dictReleaseIterator(di);
5156
5157 if (dstkey) {
5158 /* Store the resulting set into the target, if the intersection
5159 * is not an empty set. */
5160 deleteKey(c->db,dstkey);
5161 if (dictSize((dict*)dstset->ptr) > 0) {
5162 dictAdd(c->db->dict,dstkey,dstset);
5163 incrRefCount(dstkey);
5164 addReplyLong(c,dictSize((dict*)dstset->ptr));
5165 } else {
5166 decrRefCount(dstset);
5167 addReply(c,shared.czero);
5168 }
5169 server.dirty++;
5170 } else {
5171 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5172 }
5173 zfree(dv);
5174 }
5175
5176 static void sinterCommand(redisClient *c) {
5177 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5178 }
5179
5180 static void sinterstoreCommand(redisClient *c) {
5181 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5182 }
5183
/* Set operation selectors, passed as the 'op' argument of
 * sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0    /* union of all the operands */
#define REDIS_OP_DIFF 1     /* first operand minus all the others */
#define REDIS_OP_INTER 2    /* intersection */
5187
5188 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5189 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5190 dictIterator *di;
5191 dictEntry *de;
5192 robj *dstset = NULL;
5193 int j, cardinality = 0;
5194
5195 for (j = 0; j < setsnum; j++) {
5196 robj *setobj;
5197
5198 setobj = dstkey ?
5199 lookupKeyWrite(c->db,setskeys[j]) :
5200 lookupKeyRead(c->db,setskeys[j]);
5201 if (!setobj) {
5202 dv[j] = NULL;
5203 continue;
5204 }
5205 if (setobj->type != REDIS_SET) {
5206 zfree(dv);
5207 addReply(c,shared.wrongtypeerr);
5208 return;
5209 }
5210 dv[j] = setobj->ptr;
5211 }
5212
5213 /* We need a temp set object to store our union. If the dstkey
5214 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5215 * this set object will be the resulting object to set into the target key*/
5216 dstset = createSetObject();
5217
5218 /* Iterate all the elements of all the sets, add every element a single
5219 * time to the result set */
5220 for (j = 0; j < setsnum; j++) {
5221 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5222 if (!dv[j]) continue; /* non existing keys are like empty sets */
5223
5224 di = dictGetIterator(dv[j]);
5225
5226 while((de = dictNext(di)) != NULL) {
5227 robj *ele;
5228
5229 /* dictAdd will not add the same element multiple times */
5230 ele = dictGetEntryKey(de);
5231 if (op == REDIS_OP_UNION || j == 0) {
5232 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5233 incrRefCount(ele);
5234 cardinality++;
5235 }
5236 } else if (op == REDIS_OP_DIFF) {
5237 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5238 cardinality--;
5239 }
5240 }
5241 }
5242 dictReleaseIterator(di);
5243
5244 /* result set is empty? Exit asap. */
5245 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5246 }
5247
5248 /* Output the content of the resulting set, if not in STORE mode */
5249 if (!dstkey) {
5250 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5251 di = dictGetIterator(dstset->ptr);
5252 while((de = dictNext(di)) != NULL) {
5253 robj *ele;
5254
5255 ele = dictGetEntryKey(de);
5256 addReplyBulk(c,ele);
5257 }
5258 dictReleaseIterator(di);
5259 decrRefCount(dstset);
5260 } else {
5261 /* If we have a target key where to store the resulting set
5262 * create this key with the result set inside */
5263 deleteKey(c->db,dstkey);
5264 if (dictSize((dict*)dstset->ptr) > 0) {
5265 dictAdd(c->db->dict,dstkey,dstset);
5266 incrRefCount(dstkey);
5267 addReplyLong(c,dictSize((dict*)dstset->ptr));
5268 } else {
5269 decrRefCount(dstset);
5270 addReply(c,shared.czero);
5271 }
5272 server.dirty++;
5273 }
5274 zfree(dv);
5275 }
5276
5277 static void sunionCommand(redisClient *c) {
5278 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5279 }
5280
5281 static void sunionstoreCommand(redisClient *c) {
5282 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5283 }
5284
5285 static void sdiffCommand(redisClient *c) {
5286 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5287 }
5288
5289 static void sdiffstoreCommand(redisClient *c) {
5290 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5291 }
5292
5293 /* ==================================== ZSets =============================== */
5294
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5302
5303 /* This skiplist implementation is almost a C translation of the original
5304 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5305 * Alternative to Balanced Trees", modified in three ways:
5306 * a) this implementation allows for repeated values.
5307 * b) the comparison is not just by key (our 'score') but by satellite data.
5308 * c) there is a back pointer, so it's a doubly linked list with the back
5309 * pointers being only at "level 1". This allows to traverse the list
5310 * from tail to head, useful for ZREVRANGE. */
5311
5312 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5313 zskiplistNode *zn = zmalloc(sizeof(*zn));
5314
5315 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5316 if (level > 0)
5317 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5318 zn->score = score;
5319 zn->obj = obj;
5320 return zn;
5321 }
5322
5323 static zskiplist *zslCreate(void) {
5324 int j;
5325 zskiplist *zsl;
5326
5327 zsl = zmalloc(sizeof(*zsl));
5328 zsl->level = 1;
5329 zsl->length = 0;
5330 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5331 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5332 zsl->header->forward[j] = NULL;
5333
5334 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5335 if (j < ZSKIPLIST_MAXLEVEL-1)
5336 zsl->header->span[j] = 0;
5337 }
5338 zsl->header->backward = NULL;
5339 zsl->tail = NULL;
5340 return zsl;
5341 }
5342
5343 static void zslFreeNode(zskiplistNode *node) {
5344 decrRefCount(node->obj);
5345 zfree(node->forward);
5346 zfree(node->span);
5347 zfree(node);
5348 }
5349
5350 static void zslFree(zskiplist *zsl) {
5351 zskiplistNode *node = zsl->header->forward[0], *next;
5352
5353 zfree(zsl->header->forward);
5354 zfree(zsl->header->span);
5355 zfree(zsl->header);
5356 while(node) {
5357 next = node->forward[0];
5358 zslFreeNode(node);
5359 node = next;
5360 }
5361 zfree(zsl);
5362 }
5363
5364 static int zslRandomLevel(void) {
5365 int level = 1;
5366 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5367 level += 1;
5368 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5369 }
5370
5371 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5372 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5373 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5374 int i, level;
5375
5376 x = zsl->header;
5377 for (i = zsl->level-1; i >= 0; i--) {
5378 /* store rank that is crossed to reach the insert position */
5379 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5380
5381 while (x->forward[i] &&
5382 (x->forward[i]->score < score ||
5383 (x->forward[i]->score == score &&
5384 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5385 rank[i] += i > 0 ? x->span[i-1] : 1;
5386 x = x->forward[i];
5387 }
5388 update[i] = x;
5389 }
5390 /* we assume the key is not already inside, since we allow duplicated
5391 * scores, and the re-insertion of score and redis object should never
5392 * happpen since the caller of zslInsert() should test in the hash table
5393 * if the element is already inside or not. */
5394 level = zslRandomLevel();
5395 if (level > zsl->level) {
5396 for (i = zsl->level; i < level; i++) {
5397 rank[i] = 0;
5398 update[i] = zsl->header;
5399 update[i]->span[i-1] = zsl->length;
5400 }
5401 zsl->level = level;
5402 }
5403 x = zslCreateNode(level,score,obj);
5404 for (i = 0; i < level; i++) {
5405 x->forward[i] = update[i]->forward[i];
5406 update[i]->forward[i] = x;
5407
5408 /* update span covered by update[i] as x is inserted here */
5409 if (i > 0) {
5410 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5411 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5412 }
5413 }
5414
5415 /* increment span for untouched levels */
5416 for (i = level; i < zsl->level; i++) {
5417 update[i]->span[i-1]++;
5418 }
5419
5420 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5421 if (x->forward[0])
5422 x->forward[0]->backward = x;
5423 else
5424 zsl->tail = x;
5425 zsl->length++;
5426 }
5427
5428 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5429 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5430 int i;
5431 for (i = 0; i < zsl->level; i++) {
5432 if (update[i]->forward[i] == x) {
5433 if (i > 0) {
5434 update[i]->span[i-1] += x->span[i-1] - 1;
5435 }
5436 update[i]->forward[i] = x->forward[i];
5437 } else {
5438 /* invariant: i > 0, because update[0]->forward[0]
5439 * is always equal to x */
5440 update[i]->span[i-1] -= 1;
5441 }
5442 }
5443 if (x->forward[0]) {
5444 x->forward[0]->backward = x->backward;
5445 } else {
5446 zsl->tail = x->backward;
5447 }
5448 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5449 zsl->level--;
5450 zsl->length--;
5451 }
5452
5453 /* Delete an element with matching score/object from the skiplist. */
5454 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5455 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5456 int i;
5457
5458 x = zsl->header;
5459 for (i = zsl->level-1; i >= 0; i--) {
5460 while (x->forward[i] &&
5461 (x->forward[i]->score < score ||
5462 (x->forward[i]->score == score &&
5463 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5464 x = x->forward[i];
5465 update[i] = x;
5466 }
5467 /* We may have multiple elements with the same score, what we need
5468 * is to find the element with both the right score and object. */
5469 x = x->forward[0];
5470 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
5471 zslDeleteNode(zsl, x, update);
5472 zslFreeNode(x);
5473 return 1;
5474 } else {
5475 return 0; /* not found */
5476 }
5477 return 0; /* not found */
5478 }
5479
5480 /* Delete all the elements with score between min and max from the skiplist.
5481 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5482 * Note that this function takes the reference to the hash table view of the
5483 * sorted set, in order to remove the elements from the hash table too. */
5484 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5485 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5486 unsigned long removed = 0;
5487 int i;
5488
5489 x = zsl->header;
5490 for (i = zsl->level-1; i >= 0; i--) {
5491 while (x->forward[i] && x->forward[i]->score < min)
5492 x = x->forward[i];
5493 update[i] = x;
5494 }
5495 /* We may have multiple elements with the same score, what we need
5496 * is to find the element with both the right score and object. */
5497 x = x->forward[0];
5498 while (x && x->score <= max) {
5499 zskiplistNode *next = x->forward[0];
5500 zslDeleteNode(zsl, x, update);
5501 dictDelete(dict,x->obj);
5502 zslFreeNode(x);
5503 removed++;
5504 x = next;
5505 }
5506 return removed; /* not found */
5507 }
5508
5509 /* Delete all the elements with rank between start and end from the skiplist.
5510 * Start and end are inclusive. Note that start and end need to be 1-based */
5511 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5512 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5513 unsigned long traversed = 0, removed = 0;
5514 int i;
5515
5516 x = zsl->header;
5517 for (i = zsl->level-1; i >= 0; i--) {
5518 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5519 traversed += i > 0 ? x->span[i-1] : 1;
5520 x = x->forward[i];
5521 }
5522 update[i] = x;
5523 }
5524
5525 traversed++;
5526 x = x->forward[0];
5527 while (x && traversed <= end) {
5528 zskiplistNode *next = x->forward[0];
5529 zslDeleteNode(zsl, x, update);
5530 dictDelete(dict,x->obj);
5531 zslFreeNode(x);
5532 removed++;
5533 traversed++;
5534 x = next;
5535 }
5536 return removed;
5537 }
5538
5539 /* Find the first node having a score equal or greater than the specified one.
5540 * Returns NULL if there is no match. */
5541 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5542 zskiplistNode *x;
5543 int i;
5544
5545 x = zsl->header;
5546 for (i = zsl->level-1; i >= 0; i--) {
5547 while (x->forward[i] && x->forward[i]->score < score)
5548 x = x->forward[i];
5549 }
5550 /* We may have multiple elements with the same score, what we need
5551 * is to find the element with both the right score and object. */
5552 return x->forward[0];
5553 }
5554
5555 /* Find the rank for an element by both score and key.
5556 * Returns 0 when the element cannot be found, rank otherwise.
5557 * Note that the rank is 1-based due to the span of zsl->header to the
5558 * first element. */
5559 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5560 zskiplistNode *x;
5561 unsigned long rank = 0;
5562 int i;
5563
5564 x = zsl->header;
5565 for (i = zsl->level-1; i >= 0; i--) {
5566 while (x->forward[i] &&
5567 (x->forward[i]->score < score ||
5568 (x->forward[i]->score == score &&
5569 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5570 rank += i > 0 ? x->span[i-1] : 1;
5571 x = x->forward[i];
5572 }
5573
5574 /* x might be equal to zsl->header, so test if obj is non-NULL */
5575 if (x->obj && equalStringObjects(x->obj,o)) {
5576 return rank;
5577 }
5578 }
5579 return 0;
5580 }
5581
5582 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5583 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5584 zskiplistNode *x;
5585 unsigned long traversed = 0;
5586 int i;
5587
5588 x = zsl->header;
5589 for (i = zsl->level-1; i >= 0; i--) {
5590 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5591 {
5592 traversed += i > 0 ? x->span[i-1] : 1;
5593 x = x->forward[i];
5594 }
5595 if (traversed == rank) {
5596 return x;
5597 }
5598 }
5599 return NULL;
5600 }
5601
5602 /* The actual Z-commands implementations */
5603
5604 /* This generic command implements both ZADD and ZINCRBY.
5605 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5606 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5607 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5608 robj *zsetobj;
5609 zset *zs;
5610 double *score;
5611
5612 zsetobj = lookupKeyWrite(c->db,key);
5613 if (zsetobj == NULL) {
5614 zsetobj = createZsetObject();
5615 dictAdd(c->db->dict,key,zsetobj);
5616 incrRefCount(key);
5617 } else {
5618 if (zsetobj->type != REDIS_ZSET) {
5619 addReply(c,shared.wrongtypeerr);
5620 return;
5621 }
5622 }
5623 zs = zsetobj->ptr;
5624
5625 /* Ok now since we implement both ZADD and ZINCRBY here the code
5626 * needs to handle the two different conditions. It's all about setting
5627 * '*score', that is, the new score to set, to the right value. */
5628 score = zmalloc(sizeof(double));
5629 if (doincrement) {
5630 dictEntry *de;
5631
5632 /* Read the old score. If the element was not present starts from 0 */
5633 de = dictFind(zs->dict,ele);
5634 if (de) {
5635 double *oldscore = dictGetEntryVal(de);
5636 *score = *oldscore + scoreval;
5637 } else {
5638 *score = scoreval;
5639 }
5640 } else {
5641 *score = scoreval;
5642 }
5643
5644 /* What follows is a simple remove and re-insert operation that is common
5645 * to both ZADD and ZINCRBY... */
5646 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5647 /* case 1: New element */
5648 incrRefCount(ele); /* added to hash */
5649 zslInsert(zs->zsl,*score,ele);
5650 incrRefCount(ele); /* added to skiplist */
5651 server.dirty++;
5652 if (doincrement)
5653 addReplyDouble(c,*score);
5654 else
5655 addReply(c,shared.cone);
5656 } else {
5657 dictEntry *de;
5658 double *oldscore;
5659
5660 /* case 2: Score update operation */
5661 de = dictFind(zs->dict,ele);
5662 redisAssert(de != NULL);
5663 oldscore = dictGetEntryVal(de);
5664 if (*score != *oldscore) {
5665 int deleted;
5666
5667 /* Remove and insert the element in the skip list with new score */
5668 deleted = zslDelete(zs->zsl,*oldscore,ele);
5669 redisAssert(deleted != 0);
5670 zslInsert(zs->zsl,*score,ele);
5671 incrRefCount(ele);
5672 /* Update the score in the hash table */
5673 dictReplace(zs->dict,ele,score);
5674 server.dirty++;
5675 } else {
5676 zfree(score);
5677 }
5678 if (doincrement)
5679 addReplyDouble(c,*score);
5680 else
5681 addReply(c,shared.czero);
5682 }
5683 }
5684
5685 static void zaddCommand(redisClient *c) {
5686 double scoreval;
5687
5688 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5689 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5690 }
5691
5692 static void zincrbyCommand(redisClient *c) {
5693 double scoreval;
5694
5695 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5696 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5697 }
5698
5699 static void zremCommand(redisClient *c) {
5700 robj *zsetobj;
5701 zset *zs;
5702 dictEntry *de;
5703 double *oldscore;
5704 int deleted;
5705
5706 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5707 checkType(c,zsetobj,REDIS_ZSET)) return;
5708
5709 zs = zsetobj->ptr;
5710 de = dictFind(zs->dict,c->argv[2]);
5711 if (de == NULL) {
5712 addReply(c,shared.czero);
5713 return;
5714 }
5715 /* Delete from the skiplist */
5716 oldscore = dictGetEntryVal(de);
5717 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5718 redisAssert(deleted != 0);
5719
5720 /* Delete from the hash table */
5721 dictDelete(zs->dict,c->argv[2]);
5722 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5723 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5724 server.dirty++;
5725 addReply(c,shared.cone);
5726 }
5727
5728 static void zremrangebyscoreCommand(redisClient *c) {
5729 double min;
5730 double max;
5731 long deleted;
5732 robj *zsetobj;
5733 zset *zs;
5734
5735 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5736 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5737
5738 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5739 checkType(c,zsetobj,REDIS_ZSET)) return;
5740
5741 zs = zsetobj->ptr;
5742 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5743 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5744 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5745 server.dirty += deleted;
5746 addReplyLong(c,deleted);
5747 }
5748
5749 static void zremrangebyrankCommand(redisClient *c) {
5750 long start;
5751 long end;
5752 int llen;
5753 long deleted;
5754 robj *zsetobj;
5755 zset *zs;
5756
5757 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5758 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5759
5760 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5761 checkType(c,zsetobj,REDIS_ZSET)) return;
5762 zs = zsetobj->ptr;
5763 llen = zs->zsl->length;
5764
5765 /* convert negative indexes */
5766 if (start < 0) start = llen+start;
5767 if (end < 0) end = llen+end;
5768 if (start < 0) start = 0;
5769 if (end < 0) end = 0;
5770
5771 /* indexes sanity checks */
5772 if (start > end || start >= llen) {
5773 addReply(c,shared.czero);
5774 return;
5775 }
5776 if (end >= llen) end = llen-1;
5777
5778 /* increment start and end because zsl*Rank functions
5779 * use 1-based rank */
5780 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5781 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5782 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5783 server.dirty += deleted;
5784 addReplyLong(c, deleted);
5785 }
5786
5787 typedef struct {
5788 dict *dict;
5789 double weight;
5790 } zsetopsrc;
5791
5792 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5793 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5794 unsigned long size1, size2;
5795 size1 = d1->dict ? dictSize(d1->dict) : 0;
5796 size2 = d2->dict ? dictSize(d2->dict) : 0;
5797 return size1 - size2;
5798 }
5799
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller value for REDIS_AGGR_MIN, keep the
 * larger value for REDIS_AGGR_MAX. Any other aggregate type panics. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5816
5817 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5818 int i, j, zsetnum;
5819 int aggregate = REDIS_AGGR_SUM;
5820 zsetopsrc *src;
5821 robj *dstobj;
5822 zset *dstzset;
5823 dictIterator *di;
5824 dictEntry *de;
5825
5826 /* expect zsetnum input keys to be given */
5827 zsetnum = atoi(c->argv[2]->ptr);
5828 if (zsetnum < 1) {
5829 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5830 return;
5831 }
5832
5833 /* test if the expected number of keys would overflow */
5834 if (3+zsetnum > c->argc) {
5835 addReply(c,shared.syntaxerr);
5836 return;
5837 }
5838
5839 /* read keys to be used for input */
5840 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5841 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5842 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5843 if (!zsetobj) {
5844 src[i].dict = NULL;
5845 } else {
5846 if (zsetobj->type != REDIS_ZSET) {
5847 zfree(src);
5848 addReply(c,shared.wrongtypeerr);
5849 return;
5850 }
5851 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5852 }
5853
5854 /* default all weights to 1 */
5855 src[i].weight = 1.0;
5856 }
5857
5858 /* parse optional extra arguments */
5859 if (j < c->argc) {
5860 int remaining = c->argc - j;
5861
5862 while (remaining) {
5863 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5864 j++; remaining--;
5865 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5866 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5867 return;
5868 }
5869 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5870 j++; remaining--;
5871 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5872 aggregate = REDIS_AGGR_SUM;
5873 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5874 aggregate = REDIS_AGGR_MIN;
5875 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5876 aggregate = REDIS_AGGR_MAX;
5877 } else {
5878 zfree(src);
5879 addReply(c,shared.syntaxerr);
5880 return;
5881 }
5882 j++; remaining--;
5883 } else {
5884 zfree(src);
5885 addReply(c,shared.syntaxerr);
5886 return;
5887 }
5888 }
5889 }
5890
5891 /* sort sets from the smallest to largest, this will improve our
5892 * algorithm's performance */
5893 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5894
5895 dstobj = createZsetObject();
5896 dstzset = dstobj->ptr;
5897
5898 if (op == REDIS_OP_INTER) {
5899 /* skip going over all entries if the smallest zset is NULL or empty */
5900 if (src[0].dict && dictSize(src[0].dict) > 0) {
5901 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5902 * from small to large, all src[i > 0].dict are non-empty too */
5903 di = dictGetIterator(src[0].dict);
5904 while((de = dictNext(di)) != NULL) {
5905 double *score = zmalloc(sizeof(double)), value;
5906 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5907
5908 for (j = 1; j < zsetnum; j++) {
5909 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5910 if (other) {
5911 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5912 zunionInterAggregate(score, value, aggregate);
5913 } else {
5914 break;
5915 }
5916 }
5917
5918 /* skip entry when not present in every source dict */
5919 if (j != zsetnum) {
5920 zfree(score);
5921 } else {
5922 robj *o = dictGetEntryKey(de);
5923 dictAdd(dstzset->dict,o,score);
5924 incrRefCount(o); /* added to dictionary */
5925 zslInsert(dstzset->zsl,*score,o);
5926 incrRefCount(o); /* added to skiplist */
5927 }
5928 }
5929 dictReleaseIterator(di);
5930 }
5931 } else if (op == REDIS_OP_UNION) {
5932 for (i = 0; i < zsetnum; i++) {
5933 if (!src[i].dict) continue;
5934
5935 di = dictGetIterator(src[i].dict);
5936 while((de = dictNext(di)) != NULL) {
5937 /* skip key when already processed */
5938 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5939
5940 double *score = zmalloc(sizeof(double)), value;
5941 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5942
5943 /* because the zsets are sorted by size, its only possible
5944 * for sets at larger indices to hold this entry */
5945 for (j = (i+1); j < zsetnum; j++) {
5946 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5947 if (other) {
5948 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5949 zunionInterAggregate(score, value, aggregate);
5950 }
5951 }
5952
5953 robj *o = dictGetEntryKey(de);
5954 dictAdd(dstzset->dict,o,score);
5955 incrRefCount(o); /* added to dictionary */
5956 zslInsert(dstzset->zsl,*score,o);
5957 incrRefCount(o); /* added to skiplist */
5958 }
5959 dictReleaseIterator(di);
5960 }
5961 } else {
5962 /* unknown operator */
5963 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5964 }
5965
5966 deleteKey(c->db,dstkey);
5967 if (dstzset->zsl->length) {
5968 dictAdd(c->db->dict,dstkey,dstobj);
5969 incrRefCount(dstkey);
5970 addReplyLong(c, dstzset->zsl->length);
5971 server.dirty++;
5972 } else {
5973 decrRefCount(dstobj);
5974 addReply(c, shared.czero);
5975 }
5976 zfree(src);
5977 }
5978
5979 static void zunionCommand(redisClient *c) {
5980 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5981 }
5982
5983 static void zinterCommand(redisClient *c) {
5984 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5985 }
5986
5987 static void zrangeGenericCommand(redisClient *c, int reverse) {
5988 robj *o;
5989 long start;
5990 long end;
5991 int withscores = 0;
5992 int llen;
5993 int rangelen, j;
5994 zset *zsetobj;
5995 zskiplist *zsl;
5996 zskiplistNode *ln;
5997 robj *ele;
5998
5999 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6000 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
6001
6002 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6003 withscores = 1;
6004 } else if (c->argc >= 5) {
6005 addReply(c,shared.syntaxerr);
6006 return;
6007 }
6008
6009 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6010 || checkType(c,o,REDIS_ZSET)) return;
6011 zsetobj = o->ptr;
6012 zsl = zsetobj->zsl;
6013 llen = zsl->length;
6014
6015 /* convert negative indexes */
6016 if (start < 0) start = llen+start;
6017 if (end < 0) end = llen+end;
6018 if (start < 0) start = 0;
6019 if (end < 0) end = 0;
6020
6021 /* indexes sanity checks */
6022 if (start > end || start >= llen) {
6023 /* Out of range start or start > end result in empty list */
6024 addReply(c,shared.emptymultibulk);
6025 return;
6026 }
6027 if (end >= llen) end = llen-1;
6028 rangelen = (end-start)+1;
6029
6030 /* check if starting point is trivial, before searching
6031 * the element in log(N) time */
6032 if (reverse) {
6033 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6034 } else {
6035 ln = start == 0 ?
6036 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6037 }
6038
6039 /* Return the result in form of a multi-bulk reply */
6040 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6041 withscores ? (rangelen*2) : rangelen));
6042 for (j = 0; j < rangelen; j++) {
6043 ele = ln->obj;
6044 addReplyBulk(c,ele);
6045 if (withscores)
6046 addReplyDouble(c,ln->score);
6047 ln = reverse ? ln->backward : ln->forward[0];
6048 }
6049 }
6050
6051 static void zrangeCommand(redisClient *c) {
6052 zrangeGenericCommand(c,0);
6053 }
6054
6055 static void zrevrangeCommand(redisClient *c) {
6056 zrangeGenericCommand(c,1);
6057 }
6058
6059 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6060 * If justcount is non-zero, just the count is returned. */
6061 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6062 robj *o;
6063 double min, max;
6064 int minex = 0, maxex = 0; /* are min or max exclusive? */
6065 int offset = 0, limit = -1;
6066 int withscores = 0;
6067 int badsyntax = 0;
6068
6069 /* Parse the min-max interval. If one of the values is prefixed
6070 * by the "(" character, it's considered "open". For instance
6071 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6072 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6073 if (((char*)c->argv[2]->ptr)[0] == '(') {
6074 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6075 minex = 1;
6076 } else {
6077 min = strtod(c->argv[2]->ptr,NULL);
6078 }
6079 if (((char*)c->argv[3]->ptr)[0] == '(') {
6080 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6081 maxex = 1;
6082 } else {
6083 max = strtod(c->argv[3]->ptr,NULL);
6084 }
6085
6086 /* Parse "WITHSCORES": note that if the command was called with
6087 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6088 * enter the following paths to parse WITHSCORES and LIMIT. */
6089 if (c->argc == 5 || c->argc == 8) {
6090 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6091 withscores = 1;
6092 else
6093 badsyntax = 1;
6094 }
6095 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6096 badsyntax = 1;
6097 if (badsyntax) {
6098 addReplySds(c,
6099 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6100 return;
6101 }
6102
6103 /* Parse "LIMIT" */
6104 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6105 addReply(c,shared.syntaxerr);
6106 return;
6107 } else if (c->argc == (7 + withscores)) {
6108 offset = atoi(c->argv[5]->ptr);
6109 limit = atoi(c->argv[6]->ptr);
6110 if (offset < 0) offset = 0;
6111 }
6112
6113 /* Ok, lookup the key and get the range */
6114 o = lookupKeyRead(c->db,c->argv[1]);
6115 if (o == NULL) {
6116 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6117 } else {
6118 if (o->type != REDIS_ZSET) {
6119 addReply(c,shared.wrongtypeerr);
6120 } else {
6121 zset *zsetobj = o->ptr;
6122 zskiplist *zsl = zsetobj->zsl;
6123 zskiplistNode *ln;
6124 robj *ele, *lenobj = NULL;
6125 unsigned long rangelen = 0;
6126
6127 /* Get the first node with the score >= min, or with
6128 * score > min if 'minex' is true. */
6129 ln = zslFirstWithScore(zsl,min);
6130 while (minex && ln && ln->score == min) ln = ln->forward[0];
6131
6132 if (ln == NULL) {
6133 /* No element matching the speciifed interval */
6134 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6135 return;
6136 }
6137
6138 /* We don't know in advance how many matching elements there
6139 * are in the list, so we push this object that will represent
6140 * the multi-bulk length in the output buffer, and will "fix"
6141 * it later */
6142 if (!justcount) {
6143 lenobj = createObject(REDIS_STRING,NULL);
6144 addReply(c,lenobj);
6145 decrRefCount(lenobj);
6146 }
6147
6148 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6149 if (offset) {
6150 offset--;
6151 ln = ln->forward[0];
6152 continue;
6153 }
6154 if (limit == 0) break;
6155 if (!justcount) {
6156 ele = ln->obj;
6157 addReplyBulk(c,ele);
6158 if (withscores)
6159 addReplyDouble(c,ln->score);
6160 }
6161 ln = ln->forward[0];
6162 rangelen++;
6163 if (limit > 0) limit--;
6164 }
6165 if (justcount) {
6166 addReplyLong(c,(long)rangelen);
6167 } else {
6168 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6169 withscores ? (rangelen*2) : rangelen);
6170 }
6171 }
6172 }
6173 }
6174
6175 static void zrangebyscoreCommand(redisClient *c) {
6176 genericZrangebyscoreCommand(c,0);
6177 }
6178
6179 static void zcountCommand(redisClient *c) {
6180 genericZrangebyscoreCommand(c,1);
6181 }
6182
6183 static void zcardCommand(redisClient *c) {
6184 robj *o;
6185 zset *zs;
6186
6187 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6188 checkType(c,o,REDIS_ZSET)) return;
6189
6190 zs = o->ptr;
6191 addReplyUlong(c,zs->zsl->length);
6192 }
6193
6194 static void zscoreCommand(redisClient *c) {
6195 robj *o;
6196 zset *zs;
6197 dictEntry *de;
6198
6199 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6200 checkType(c,o,REDIS_ZSET)) return;
6201
6202 zs = o->ptr;
6203 de = dictFind(zs->dict,c->argv[2]);
6204 if (!de) {
6205 addReply(c,shared.nullbulk);
6206 } else {
6207 double *score = dictGetEntryVal(de);
6208
6209 addReplyDouble(c,*score);
6210 }
6211 }
6212
6213 static void zrankGenericCommand(redisClient *c, int reverse) {
6214 robj *o;
6215 zset *zs;
6216 zskiplist *zsl;
6217 dictEntry *de;
6218 unsigned long rank;
6219 double *score;
6220
6221 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6222 checkType(c,o,REDIS_ZSET)) return;
6223
6224 zs = o->ptr;
6225 zsl = zs->zsl;
6226 de = dictFind(zs->dict,c->argv[2]);
6227 if (!de) {
6228 addReply(c,shared.nullbulk);
6229 return;
6230 }
6231
6232 score = dictGetEntryVal(de);
6233 rank = zslGetRank(zsl, *score, c->argv[2]);
6234 if (rank) {
6235 if (reverse) {
6236 addReplyLong(c, zsl->length - rank);
6237 } else {
6238 addReplyLong(c, rank-1);
6239 }
6240 } else {
6241 addReply(c,shared.nullbulk);
6242 }
6243 }
6244
6245 static void zrankCommand(redisClient *c) {
6246 zrankGenericCommand(c, 0);
6247 }
6248
6249 static void zrevrankCommand(redisClient *c) {
6250 zrankGenericCommand(c, 1);
6251 }
6252
/* ========================= Hashes utility functions ======================= */

/* Bit flags selecting which part of a hash entry is requested; they are
 * tested with '&' and can be combined with '|'. */
#define REDIS_HASH_KEY (1<<0)
#define REDIS_HASH_VALUE (1<<1)
6256
6257 /* Check the length of a number of objects to see if we need to convert a
6258 * zipmap to a real hash. Note that we only check string encoded objects
6259 * as their string length can be queried in constant time. */
6260 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6261 int i;
6262 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6263
6264 for (i = start; i <= end; i++) {
6265 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6266 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6267 {
6268 convertToRealHash(subject);
6269 return;
6270 }
6271 }
6272 }
6273
6274 /* Encode given objects in-place when the hash uses a dict. */
6275 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6276 if (subject->encoding == REDIS_ENCODING_HT) {
6277 if (o1) *o1 = tryObjectEncoding(*o1);
6278 if (o2) *o2 = tryObjectEncoding(*o2);
6279 }
6280 }
6281
6282 /* Get the value from a hash identified by key. Returns either a string
6283 * object or NULL if the value cannot be found. The refcount of the object
6284 * is always increased by 1 when the value was found. */
6285 static robj *hashGet(robj *o, robj *key) {
6286 robj *value = NULL;
6287 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6288 unsigned char *v;
6289 unsigned int vlen;
6290 key = getDecodedObject(key);
6291 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6292 value = createStringObject((char*)v,vlen);
6293 }
6294 decrRefCount(key);
6295 } else {
6296 dictEntry *de = dictFind(o->ptr,key);
6297 if (de != NULL) {
6298 value = dictGetEntryVal(de);
6299 incrRefCount(value);
6300 }
6301 }
6302 return value;
6303 }
6304
6305 /* Test if the key exists in the given hash. Returns 1 if the key
6306 * exists and 0 when it doesn't. */
6307 static int hashExists(robj *o, robj *key) {
6308 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6309 key = getDecodedObject(key);
6310 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6311 decrRefCount(key);
6312 return 1;
6313 }
6314 decrRefCount(key);
6315 } else {
6316 if (dictFind(o->ptr,key) != NULL) {
6317 return 1;
6318 }
6319 }
6320 return 0;
6321 }
6322
6323 /* Add an element, discard the old if the key already exists.
6324 * Return 0 on insert and 1 on update. */
6325 static int hashSet(robj *o, robj *key, robj *value) {
6326 int update = 0;
6327 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6328 key = getDecodedObject(key);
6329 value = getDecodedObject(value);
6330 o->ptr = zipmapSet(o->ptr,
6331 key->ptr,sdslen(key->ptr),
6332 value->ptr,sdslen(value->ptr), &update);
6333 decrRefCount(key);
6334 decrRefCount(value);
6335
6336 /* Check if the zipmap needs to be upgraded to a real hash table */
6337 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6338 convertToRealHash(o);
6339 } else {
6340 if (dictReplace(o->ptr,key,value)) {
6341 /* Insert */
6342 incrRefCount(key);
6343 } else {
6344 /* Update */
6345 update = 1;
6346 }
6347 incrRefCount(value);
6348 }
6349 return update;
6350 }
6351
6352 /* Delete an element from a hash.
6353 * Return 1 on deleted and 0 on not found. */
6354 static int hashDelete(robj *o, robj *key) {
6355 int deleted = 0;
6356 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6357 key = getDecodedObject(key);
6358 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6359 decrRefCount(key);
6360 } else {
6361 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6362 /* Always check if the dictionary needs a resize after a delete. */
6363 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6364 }
6365 return deleted;
6366 }
6367
6368 /* Return the number of elements in a hash. */
6369 static unsigned long hashLength(robj *o) {
6370 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6371 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6372 }
6373
6374 /* Structure to hold hash iteration abstration. Note that iteration over
6375 * hashes involves both fields and values. Because it is possible that
6376 * not both are required, store pointers in the iterator to avoid
6377 * unnecessary memory allocation for fields/values. */
6378 typedef struct {
6379 int encoding;
6380 unsigned char *zi;
6381 unsigned char *zk, *zv;
6382 unsigned int zklen, zvlen;
6383
6384 dictIterator *di;
6385 dictEntry *de;
6386 } hashIterator;
6387
6388 static hashIterator *hashInitIterator(robj *subject) {
6389 hashIterator *hi = zmalloc(sizeof(hashIterator));
6390 hi->encoding = subject->encoding;
6391 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6392 hi->zi = zipmapRewind(subject->ptr);
6393 } else if (hi->encoding == REDIS_ENCODING_HT) {
6394 hi->di = dictGetIterator(subject->ptr);
6395 } else {
6396 redisAssert(NULL);
6397 }
6398 return hi;
6399 }
6400
6401 static void hashReleaseIterator(hashIterator *hi) {
6402 if (hi->encoding == REDIS_ENCODING_HT) {
6403 dictReleaseIterator(hi->di);
6404 }
6405 zfree(hi);
6406 }
6407
6408 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6409 * could be found and REDIS_ERR when the iterator reaches the end. */
6410 static int hashNext(hashIterator *hi) {
6411 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6412 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6413 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6414 } else {
6415 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6416 }
6417 return REDIS_OK;
6418 }
6419
6420 /* Get key or value object at current iteration position.
6421 * This increases the refcount of the field object by 1. */
6422 static robj *hashCurrent(hashIterator *hi, int what) {
6423 robj *o;
6424 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6425 if (what & REDIS_HASH_KEY) {
6426 o = createStringObject((char*)hi->zk,hi->zklen);
6427 } else {
6428 o = createStringObject((char*)hi->zv,hi->zvlen);
6429 }
6430 } else {
6431 if (what & REDIS_HASH_KEY) {
6432 o = dictGetEntryKey(hi->de);
6433 } else {
6434 o = dictGetEntryVal(hi->de);
6435 }
6436 incrRefCount(o);
6437 }
6438 return o;
6439 }
6440
6441 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6442 robj *o = lookupKeyWrite(c->db,key);
6443 if (o == NULL) {
6444 o = createHashObject();
6445 dictAdd(c->db->dict,key,o);
6446 incrRefCount(key);
6447 } else {
6448 if (o->type != REDIS_HASH) {
6449 addReply(c,shared.wrongtypeerr);
6450 return NULL;
6451 }
6452 }
6453 return o;
6454 }
6455
6456 /* ============================= Hash commands ============================== */
6457 static void hsetCommand(redisClient *c) {
6458 int update;
6459 robj *o;
6460
6461 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6462 hashTryConversion(o,c->argv,2,3);
6463 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6464 update = hashSet(o,c->argv[2],c->argv[3]);
6465 addReply(c, update ? shared.czero : shared.cone);
6466 server.dirty++;
6467 }
6468
6469 static void hsetnxCommand(redisClient *c) {
6470 robj *o;
6471 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6472 hashTryConversion(o,c->argv,2,3);
6473
6474 if (hashExists(o, c->argv[2])) {
6475 addReply(c, shared.czero);
6476 } else {
6477 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6478 hashSet(o,c->argv[2],c->argv[3]);
6479 addReply(c, shared.cone);
6480 server.dirty++;
6481 }
6482 }
6483
6484 static void hmsetCommand(redisClient *c) {
6485 int i;
6486 robj *o;
6487
6488 if ((c->argc % 2) == 1) {
6489 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6490 return;
6491 }
6492
6493 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6494 hashTryConversion(o,c->argv,2,c->argc-1);
6495 for (i = 2; i < c->argc; i += 2) {
6496 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6497 hashSet(o,c->argv[i],c->argv[i+1]);
6498 }
6499 addReply(c, shared.ok);
6500 server.dirty++;
6501 }
6502
6503 static void hincrbyCommand(redisClient *c) {
6504 long long value, incr;
6505 robj *o, *current, *new;
6506
6507 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6508 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6509 if ((current = hashGet(o,c->argv[2])) != NULL) {
6510 if (getLongLongFromObjectOrReply(c,current,&value,
6511 "hash value is not an integer") != REDIS_OK) {
6512 decrRefCount(current);
6513 return;
6514 }
6515 decrRefCount(current);
6516 } else {
6517 value = 0;
6518 }
6519
6520 value += incr;
6521 new = createStringObjectFromLongLong(value);
6522 hashTryObjectEncoding(o,&c->argv[2],NULL);
6523 hashSet(o,c->argv[2],new);
6524 decrRefCount(new);
6525 addReplyLongLong(c,value);
6526 server.dirty++;
6527 }
6528
6529 static void hgetCommand(redisClient *c) {
6530 robj *o, *value;
6531 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6532 checkType(c,o,REDIS_HASH)) return;
6533
6534 if ((value = hashGet(o,c->argv[2])) != NULL) {
6535 addReplyBulk(c,value);
6536 decrRefCount(value);
6537 } else {
6538 addReply(c,shared.nullbulk);
6539 }
6540 }
6541
6542 static void hmgetCommand(redisClient *c) {
6543 int i;
6544 robj *o, *value;
6545 o = lookupKeyRead(c->db,c->argv[1]);
6546 if (o != NULL && o->type != REDIS_HASH) {
6547 addReply(c,shared.wrongtypeerr);
6548 }
6549
6550 /* Note the check for o != NULL happens inside the loop. This is
6551 * done because objects that cannot be found are considered to be
6552 * an empty hash. The reply should then be a series of NULLs. */
6553 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6554 for (i = 2; i < c->argc; i++) {
6555 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6556 addReplyBulk(c,value);
6557 decrRefCount(value);
6558 } else {
6559 addReply(c,shared.nullbulk);
6560 }
6561 }
6562 }
6563
6564 static void hdelCommand(redisClient *c) {
6565 robj *o;
6566 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6567 checkType(c,o,REDIS_HASH)) return;
6568
6569 if (hashDelete(o,c->argv[2])) {
6570 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6571 addReply(c,shared.cone);
6572 server.dirty++;
6573 } else {
6574 addReply(c,shared.czero);
6575 }
6576 }
6577
6578 static void hlenCommand(redisClient *c) {
6579 robj *o;
6580 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6581 checkType(c,o,REDIS_HASH)) return;
6582
6583 addReplyUlong(c,hashLength(o));
6584 }
6585
6586 static void genericHgetallCommand(redisClient *c, int flags) {
6587 robj *o, *lenobj, *obj;
6588 unsigned long count = 0;
6589 hashIterator *hi;
6590
6591 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6592 || checkType(c,o,REDIS_HASH)) return;
6593
6594 lenobj = createObject(REDIS_STRING,NULL);
6595 addReply(c,lenobj);
6596 decrRefCount(lenobj);
6597
6598 hi = hashInitIterator(o);
6599 while (hashNext(hi) != REDIS_ERR) {
6600 if (flags & REDIS_HASH_KEY) {
6601 obj = hashCurrent(hi,REDIS_HASH_KEY);
6602 addReplyBulk(c,obj);
6603 decrRefCount(obj);
6604 count++;
6605 }
6606 if (flags & REDIS_HASH_VALUE) {
6607 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6608 addReplyBulk(c,obj);
6609 decrRefCount(obj);
6610 count++;
6611 }
6612 }
6613 hashReleaseIterator(hi);
6614
6615 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6616 }
6617
6618 static void hkeysCommand(redisClient *c) {
6619 genericHgetallCommand(c,REDIS_HASH_KEY);
6620 }
6621
6622 static void hvalsCommand(redisClient *c) {
6623 genericHgetallCommand(c,REDIS_HASH_VALUE);
6624 }
6625
6626 static void hgetallCommand(redisClient *c) {
6627 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6628 }
6629
6630 static void hexistsCommand(redisClient *c) {
6631 robj *o;
6632 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6633 checkType(c,o,REDIS_HASH)) return;
6634
6635 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6636 }
6637
6638 static void convertToRealHash(robj *o) {
6639 unsigned char *key, *val, *p, *zm = o->ptr;
6640 unsigned int klen, vlen;
6641 dict *dict = dictCreate(&hashDictType,NULL);
6642
6643 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6644 p = zipmapRewind(zm);
6645 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6646 robj *keyobj, *valobj;
6647
6648 keyobj = createStringObject((char*)key,klen);
6649 valobj = createStringObject((char*)val,vlen);
6650 keyobj = tryObjectEncoding(keyobj);
6651 valobj = tryObjectEncoding(valobj);
6652 dictAdd(dict,keyobj,valobj);
6653 }
6654 o->encoding = REDIS_ENCODING_HT;
6655 o->ptr = dict;
6656 zfree(zm);
6657 }
6658
6659 /* ========================= Non type-specific commands ==================== */
6660
6661 static void flushdbCommand(redisClient *c) {
6662 server.dirty += dictSize(c->db->dict);
6663 dictEmpty(c->db->dict);
6664 dictEmpty(c->db->expires);
6665 addReply(c,shared.ok);
6666 }
6667
6668 static void flushallCommand(redisClient *c) {
6669 server.dirty += emptyDb();
6670 addReply(c,shared.ok);
6671 if (server.bgsavechildpid != -1) {
6672 kill(server.bgsavechildpid,SIGKILL);
6673 rdbRemoveTempFile(server.bgsavechildpid);
6674 }
6675 rdbSave(server.dbfilename);
6676 server.dirty++;
6677 }
6678
6679 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6680 redisSortOperation *so = zmalloc(sizeof(*so));
6681 so->type = type;
6682 so->pattern = pattern;
6683 return so;
6684 }
6685
6686 /* Return the value associated to the key with a name obtained
6687 * substituting the first occurrence of '*' in 'pattern' with 'subst'.
6688 * The returned object will always have its refcount increased by 1
6689 * when it is non-NULL. */
6690 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6691 char *p, *f;
6692 sds spat, ssub;
6693 robj keyobj, fieldobj, *o;
6694 int prefixlen, sublen, postfixlen, fieldlen;
6695 /* Exploit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6696 struct {
6697 long len;
6698 long free;
6699 char buf[REDIS_SORTKEY_MAX+1];
6700 } keyname, fieldname;
6701
6702 /* If the pattern is "#" return the substitution object itself in order
6703 * to implement the "SORT ... GET #" feature. */
6704 spat = pattern->ptr;
6705 if (spat[0] == '#' && spat[1] == '\0') {
6706 incrRefCount(subst);
6707 return subst;
6708 }
6709
6710 /* The substitution object may be specially encoded. If so we create
6711 * a decoded object on the fly. Otherwise getDecodedObject will just
6712 * increment the ref count, that we'll decrement later. */
6713 subst = getDecodedObject(subst);
6714
6715 ssub = subst->ptr;
6716 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6717 p = strchr(spat,'*');
6718 if (!p) {
6719 decrRefCount(subst);
6720 return NULL;
6721 }
6722
6723 /* Find out if we're dealing with a hash dereference. */
6724 if ((f = strstr(p+1, "->")) != NULL) {
6725 fieldlen = sdslen(spat)-(f-spat);
6726 /* this also copies \0 character */
6727 memcpy(fieldname.buf,f+2,fieldlen-1);
6728 fieldname.len = fieldlen-2;
6729 } else {
6730 fieldlen = 0;
6731 }
6732
6733 prefixlen = p-spat;
6734 sublen = sdslen(ssub);
6735 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6736 memcpy(keyname.buf,spat,prefixlen);
6737 memcpy(keyname.buf+prefixlen,ssub,sublen);
6738 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6739 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6740 keyname.len = prefixlen+sublen+postfixlen;
6741 decrRefCount(subst);
6742
6743 /* Lookup substituted key */
6744 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6745 o = lookupKeyRead(db,&keyobj);
6746 if (o == NULL) return NULL;
6747
6748 if (fieldlen > 0) {
6749 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6750
6751 /* Retrieve value from hash by the field name. This operation
6752 * already increases the refcount of the returned object. */
6753 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6754 o = hashGet(o, &fieldobj);
6755 } else {
6756 if (o->type != REDIS_STRING) return NULL;
6757
6758 /* Every object that this function returns needs to have its refcount
6759 * increased. sortCommand decreases it again. */
6760 incrRefCount(o);
6761 }
6762
6763 return o;
6764 }
6765
6766 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6767 * the additional parameter is not standard but a BSD-specific we have to
6768 * pass sorting parameters via the global 'server' structure */
6769 static int sortCompare(const void *s1, const void *s2) {
6770 const redisSortObject *so1 = s1, *so2 = s2;
6771 int cmp;
6772
6773 if (!server.sort_alpha) {
6774 /* Numeric sorting. Here it's trivial as we precomputed scores */
6775 if (so1->u.score > so2->u.score) {
6776 cmp = 1;
6777 } else if (so1->u.score < so2->u.score) {
6778 cmp = -1;
6779 } else {
6780 cmp = 0;
6781 }
6782 } else {
6783 /* Alphanumeric sorting */
6784 if (server.sort_bypattern) {
6785 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6786 /* At least one compare object is NULL */
6787 if (so1->u.cmpobj == so2->u.cmpobj)
6788 cmp = 0;
6789 else if (so1->u.cmpobj == NULL)
6790 cmp = -1;
6791 else
6792 cmp = 1;
6793 } else {
6794 /* We have both the objects, use strcoll */
6795 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6796 }
6797 } else {
6798 /* Compare elements directly. */
6799 cmp = compareStringObjects(so1->obj,so2->obj);
6800 }
6801 }
6802 return server.sort_desc ? -cmp : cmp;
6803 }
6804
6805 /* The SORT command is the most complex command in Redis. Warning: this code
6806 * is optimized for speed and a bit less for readability */
6807 static void sortCommand(redisClient *c) {
6808 list *operations;
6809 int outputlen = 0;
6810 int desc = 0, alpha = 0;
6811 int limit_start = 0, limit_count = -1, start, end;
6812 int j, dontsort = 0, vectorlen;
6813 int getop = 0; /* GET operation counter */
6814 robj *sortval, *sortby = NULL, *storekey = NULL;
6815 redisSortObject *vector; /* Resulting vector to sort */
6816
6817 /* Lookup the key to sort. It must be of the right types */
6818 sortval = lookupKeyRead(c->db,c->argv[1]);
6819 if (sortval == NULL) {
6820 addReply(c,shared.emptymultibulk);
6821 return;
6822 }
6823 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6824 sortval->type != REDIS_ZSET)
6825 {
6826 addReply(c,shared.wrongtypeerr);
6827 return;
6828 }
6829
6830 /* Create a list of operations to perform for every sorted element.
6831 * Operations can be GET/DEL/INCR/DECR */
6832 operations = listCreate();
6833 listSetFreeMethod(operations,zfree);
6834 j = 2;
6835
6836 /* Now we need to protect sortval incrementing its count, in the future
6837 * SORT may have options able to overwrite/delete keys during the sorting
6838 * and the sorted key itself may get destroied */
6839 incrRefCount(sortval);
6840
6841 /* The SORT command has an SQL-alike syntax, parse it */
6842 while(j < c->argc) {
6843 int leftargs = c->argc-j-1;
6844 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6845 desc = 0;
6846 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6847 desc = 1;
6848 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6849 alpha = 1;
6850 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6851 limit_start = atoi(c->argv[j+1]->ptr);
6852 limit_count = atoi(c->argv[j+2]->ptr);
6853 j+=2;
6854 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6855 storekey = c->argv[j+1];
6856 j++;
6857 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6858 sortby = c->argv[j+1];
6859 /* If the BY pattern does not contain '*', i.e. it is constant,
6860 * we don't need to sort nor to lookup the weight keys. */
6861 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6862 j++;
6863 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6864 listAddNodeTail(operations,createSortOperation(
6865 REDIS_SORT_GET,c->argv[j+1]));
6866 getop++;
6867 j++;
6868 } else {
6869 decrRefCount(sortval);
6870 listRelease(operations);
6871 addReply(c,shared.syntaxerr);
6872 return;
6873 }
6874 j++;
6875 }
6876
6877 /* Load the sorting vector with all the objects to sort */
6878 switch(sortval->type) {
6879 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6880 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6881 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6882 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6883 }
6884 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6885 j = 0;
6886
6887 if (sortval->type == REDIS_LIST) {
6888 list *list = sortval->ptr;
6889 listNode *ln;
6890 listIter li;
6891
6892 listRewind(list,&li);
6893 while((ln = listNext(&li))) {
6894 robj *ele = ln->value;
6895 vector[j].obj = ele;
6896 vector[j].u.score = 0;
6897 vector[j].u.cmpobj = NULL;
6898 j++;
6899 }
6900 } else {
6901 dict *set;
6902 dictIterator *di;
6903 dictEntry *setele;
6904
6905 if (sortval->type == REDIS_SET) {
6906 set = sortval->ptr;
6907 } else {
6908 zset *zs = sortval->ptr;
6909 set = zs->dict;
6910 }
6911
6912 di = dictGetIterator(set);
6913 while((setele = dictNext(di)) != NULL) {
6914 vector[j].obj = dictGetEntryKey(setele);
6915 vector[j].u.score = 0;
6916 vector[j].u.cmpobj = NULL;
6917 j++;
6918 }
6919 dictReleaseIterator(di);
6920 }
6921 redisAssert(j == vectorlen);
6922
6923 /* Now it's time to load the right scores in the sorting vector */
6924 if (dontsort == 0) {
6925 for (j = 0; j < vectorlen; j++) {
6926 robj *byval;
6927 if (sortby) {
6928 /* lookup value to sort by */
6929 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6930 if (!byval) continue;
6931 } else {
6932 /* use object itself to sort by */
6933 byval = vector[j].obj;
6934 }
6935
6936 if (alpha) {
6937 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6938 } else {
6939 if (byval->encoding == REDIS_ENCODING_RAW) {
6940 vector[j].u.score = strtod(byval->ptr,NULL);
6941 } else if (byval->encoding == REDIS_ENCODING_INT) {
6942 /* Don't need to decode the object if it's
6943 * integer-encoded (the only encoding supported) so
6944 * far. We can just cast it */
6945 vector[j].u.score = (long)byval->ptr;
6946 } else {
6947 redisAssert(1 != 1);
6948 }
6949 }
6950
6951 /* when the object was retrieved using lookupKeyByPattern,
6952 * its refcount needs to be decreased. */
6953 if (sortby) {
6954 decrRefCount(byval);
6955 }
6956 }
6957 }
6958
6959 /* We are ready to sort the vector... perform a bit of sanity check
6960 * on the LIMIT option too. We'll use a partial version of quicksort. */
6961 start = (limit_start < 0) ? 0 : limit_start;
6962 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6963 if (start >= vectorlen) {
6964 start = vectorlen-1;
6965 end = vectorlen-2;
6966 }
6967 if (end >= vectorlen) end = vectorlen-1;
6968
6969 if (dontsort == 0) {
6970 server.sort_desc = desc;
6971 server.sort_alpha = alpha;
6972 server.sort_bypattern = sortby ? 1 : 0;
6973 if (sortby && (start != 0 || end != vectorlen-1))
6974 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6975 else
6976 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6977 }
6978
6979 /* Send command output to the output buffer, performing the specified
6980 * GET/DEL/INCR/DECR operations if any. */
6981 outputlen = getop ? getop*(end-start+1) : end-start+1;
6982 if (storekey == NULL) {
6983 /* STORE option not specified, sent the sorting result to client */
6984 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6985 for (j = start; j <= end; j++) {
6986 listNode *ln;
6987 listIter li;
6988
6989 if (!getop) addReplyBulk(c,vector[j].obj);
6990 listRewind(operations,&li);
6991 while((ln = listNext(&li))) {
6992 redisSortOperation *sop = ln->value;
6993 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6994 vector[j].obj);
6995
6996 if (sop->type == REDIS_SORT_GET) {
6997 if (!val) {
6998 addReply(c,shared.nullbulk);
6999 } else {
7000 addReplyBulk(c,val);
7001 decrRefCount(val);
7002 }
7003 } else {
7004 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7005 }
7006 }
7007 }
7008 } else {
7009 robj *listObject = createListObject();
7010 list *listPtr = (list*) listObject->ptr;
7011
7012 /* STORE option specified, set the sorting result as a List object */
7013 for (j = start; j <= end; j++) {
7014 listNode *ln;
7015 listIter li;
7016
7017 if (!getop) {
7018 listAddNodeTail(listPtr,vector[j].obj);
7019 incrRefCount(vector[j].obj);
7020 }
7021 listRewind(operations,&li);
7022 while((ln = listNext(&li))) {
7023 redisSortOperation *sop = ln->value;
7024 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7025 vector[j].obj);
7026
7027 if (sop->type == REDIS_SORT_GET) {
7028 if (!val) {
7029 listAddNodeTail(listPtr,createStringObject("",0));
7030 } else {
7031 /* We should do a incrRefCount on val because it is
7032 * added to the list, but also a decrRefCount because
7033 * it is returned by lookupKeyByPattern. This results
7034 * in doing nothing at all. */
7035 listAddNodeTail(listPtr,val);
7036 }
7037 } else {
7038 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
7039 }
7040 }
7041 }
7042 if (dictReplace(c->db->dict,storekey,listObject)) {
7043 incrRefCount(storekey);
7044 }
7045 /* Note: we add 1 because the DB is dirty anyway since even if the
7046 * SORT result is empty a new key is set and maybe the old content
7047 * replaced. */
7048 server.dirty += 1+outputlen;
7049 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7050 }
7051
7052 /* Cleanup */
7053 decrRefCount(sortval);
7054 listRelease(operations);
7055 for (j = 0; j < vectorlen; j++) {
7056 if (alpha && vector[j].u.cmpobj)
7057 decrRefCount(vector[j].u.cmpobj);
7058 }
7059 zfree(vector);
7060 }
7061
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2.50K, 100.00M, 4.00G, 1.00T and so forth.
 *
 * 's' must point to a buffer large enough for the result (the caller in
 * this file uses 64 bytes). Values below 1024 are printed as an integer
 * number of bytes; larger values are printed with two decimals using
 * binary (1024-based) units. Everything from 1 TiB upwards is expressed in
 * "T", so the buffer is always written, whatever the value of 'n'. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Without this branch the buffer was left untouched for huge
         * values and the caller printed uninitialized memory. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7082
7083 /* Create the string returned by the INFO command. This is decoupled
7084 * by the INFO command itself as we need to report the same information
7085 * on memory corruption problems. */
7086 static sds genRedisInfoString(void) {
7087 sds info;
7088 time_t uptime = time(NULL)-server.stat_starttime;
7089 int j;
7090 char hmem[64];
7091
7092 bytesToHuman(hmem,zmalloc_used_memory());
7093 info = sdscatprintf(sdsempty(),
7094 "redis_version:%s\r\n"
7095 "arch_bits:%s\r\n"
7096 "multiplexing_api:%s\r\n"
7097 "process_id:%ld\r\n"
7098 "uptime_in_seconds:%ld\r\n"
7099 "uptime_in_days:%ld\r\n"
7100 "connected_clients:%d\r\n"
7101 "connected_slaves:%d\r\n"
7102 "blocked_clients:%d\r\n"
7103 "used_memory:%zu\r\n"
7104 "used_memory_human:%s\r\n"
7105 "changes_since_last_save:%lld\r\n"
7106 "bgsave_in_progress:%d\r\n"
7107 "last_save_time:%ld\r\n"
7108 "bgrewriteaof_in_progress:%d\r\n"
7109 "total_connections_received:%lld\r\n"
7110 "total_commands_processed:%lld\r\n"
7111 "expired_keys:%lld\r\n"
7112 "hash_max_zipmap_entries:%ld\r\n"
7113 "hash_max_zipmap_value:%ld\r\n"
7114 "pubsub_channels:%ld\r\n"
7115 "pubsub_patterns:%u\r\n"
7116 "vm_enabled:%d\r\n"
7117 "role:%s\r\n"
7118 ,REDIS_VERSION,
7119 (sizeof(long) == 8) ? "64" : "32",
7120 aeGetApiName(),
7121 (long) getpid(),
7122 uptime,
7123 uptime/(3600*24),
7124 listLength(server.clients)-listLength(server.slaves),
7125 listLength(server.slaves),
7126 server.blpop_blocked_clients,
7127 zmalloc_used_memory(),
7128 hmem,
7129 server.dirty,
7130 server.bgsavechildpid != -1,
7131 server.lastsave,
7132 server.bgrewritechildpid != -1,
7133 server.stat_numconnections,
7134 server.stat_numcommands,
7135 server.stat_expiredkeys,
7136 server.hash_max_zipmap_entries,
7137 server.hash_max_zipmap_value,
7138 dictSize(server.pubsub_channels),
7139 listLength(server.pubsub_patterns),
7140 server.vm_enabled != 0,
7141 server.masterhost == NULL ? "master" : "slave"
7142 );
7143 if (server.masterhost) {
7144 info = sdscatprintf(info,
7145 "master_host:%s\r\n"
7146 "master_port:%d\r\n"
7147 "master_link_status:%s\r\n"
7148 "master_last_io_seconds_ago:%d\r\n"
7149 ,server.masterhost,
7150 server.masterport,
7151 (server.replstate == REDIS_REPL_CONNECTED) ?
7152 "up" : "down",
7153 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7154 );
7155 }
7156 if (server.vm_enabled) {
7157 lockThreadedIO();
7158 info = sdscatprintf(info,
7159 "vm_conf_max_memory:%llu\r\n"
7160 "vm_conf_page_size:%llu\r\n"
7161 "vm_conf_pages:%llu\r\n"
7162 "vm_stats_used_pages:%llu\r\n"
7163 "vm_stats_swapped_objects:%llu\r\n"
7164 "vm_stats_swappin_count:%llu\r\n"
7165 "vm_stats_swappout_count:%llu\r\n"
7166 "vm_stats_io_newjobs_len:%lu\r\n"
7167 "vm_stats_io_processing_len:%lu\r\n"
7168 "vm_stats_io_processed_len:%lu\r\n"
7169 "vm_stats_io_active_threads:%lu\r\n"
7170 "vm_stats_blocked_clients:%lu\r\n"
7171 ,(unsigned long long) server.vm_max_memory,
7172 (unsigned long long) server.vm_page_size,
7173 (unsigned long long) server.vm_pages,
7174 (unsigned long long) server.vm_stats_used_pages,
7175 (unsigned long long) server.vm_stats_swapped_objects,
7176 (unsigned long long) server.vm_stats_swapins,
7177 (unsigned long long) server.vm_stats_swapouts,
7178 (unsigned long) listLength(server.io_newjobs),
7179 (unsigned long) listLength(server.io_processing),
7180 (unsigned long) listLength(server.io_processed),
7181 (unsigned long) server.io_active_threads,
7182 (unsigned long) server.vm_blocked_clients
7183 );
7184 unlockThreadedIO();
7185 }
7186 for (j = 0; j < server.dbnum; j++) {
7187 long long keys, vkeys;
7188
7189 keys = dictSize(server.db[j].dict);
7190 vkeys = dictSize(server.db[j].expires);
7191 if (keys || vkeys) {
7192 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7193 j, keys, vkeys);
7194 }
7195 }
7196 return info;
7197 }
7198
7199 static void infoCommand(redisClient *c) {
7200 sds info = genRedisInfoString();
7201 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7202 (unsigned long)sdslen(info)));
7203 addReplySds(c,info);
7204 addReply(c,shared.crlf);
7205 }
7206
7207 static void monitorCommand(redisClient *c) {
7208 /* ignore MONITOR if aleady slave or in monitor mode */
7209 if (c->flags & REDIS_SLAVE) return;
7210
7211 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7212 c->slaveseldb = 0;
7213 listAddNodeTail(server.monitors,c);
7214 addReply(c,shared.ok);
7215 }
7216
7217 /* ================================= Expire ================================= */
7218 static int removeExpire(redisDb *db, robj *key) {
7219 if (dictDelete(db->expires,key) == DICT_OK) {
7220 return 1;
7221 } else {
7222 return 0;
7223 }
7224 }
7225
7226 static int setExpire(redisDb *db, robj *key, time_t when) {
7227 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7228 return 0;
7229 } else {
7230 incrRefCount(key);
7231 return 1;
7232 }
7233 }
7234
7235 /* Return the expire time of the specified key, or -1 if no expire
7236 * is associated with this key (i.e. the key is non volatile) */
7237 static time_t getExpire(redisDb *db, robj *key) {
7238 dictEntry *de;
7239
7240 /* No expire? return ASAP */
7241 if (dictSize(db->expires) == 0 ||
7242 (de = dictFind(db->expires,key)) == NULL) return -1;
7243
7244 return (time_t) dictGetEntryVal(de);
7245 }
7246
7247 static int expireIfNeeded(redisDb *db, robj *key) {
7248 time_t when;
7249 dictEntry *de;
7250
7251 /* No expire? return ASAP */
7252 if (dictSize(db->expires) == 0 ||
7253 (de = dictFind(db->expires,key)) == NULL) return 0;
7254
7255 /* Lookup the expire */
7256 when = (time_t) dictGetEntryVal(de);
7257 if (time(NULL) <= when) return 0;
7258
7259 /* Delete the key */
7260 dictDelete(db->expires,key);
7261 server.stat_expiredkeys++;
7262 return dictDelete(db->dict,key) == DICT_OK;
7263 }
7264
7265 static int deleteIfVolatile(redisDb *db, robj *key) {
7266 dictEntry *de;
7267
7268 /* No expire? return ASAP */
7269 if (dictSize(db->expires) == 0 ||
7270 (de = dictFind(db->expires,key)) == NULL) return 0;
7271
7272 /* Delete the key */
7273 server.dirty++;
7274 server.stat_expiredkeys++;
7275 dictDelete(db->expires,key);
7276 return dictDelete(db->dict,key) == DICT_OK;
7277 }
7278
7279 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7280 dictEntry *de;
7281 time_t seconds;
7282
7283 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7284
7285 seconds -= offset;
7286
7287 de = dictFind(c->db->dict,key);
7288 if (de == NULL) {
7289 addReply(c,shared.czero);
7290 return;
7291 }
7292 if (seconds <= 0) {
7293 if (deleteKey(c->db,key)) server.dirty++;
7294 addReply(c, shared.cone);
7295 return;
7296 } else {
7297 time_t when = time(NULL)+seconds;
7298 if (setExpire(c->db,key,when)) {
7299 addReply(c,shared.cone);
7300 server.dirty++;
7301 } else {
7302 addReply(c,shared.czero);
7303 }
7304 return;
7305 }
7306 }
7307
7308 static void expireCommand(redisClient *c) {
7309 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7310 }
7311
7312 static void expireatCommand(redisClient *c) {
7313 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7314 }
7315
7316 static void ttlCommand(redisClient *c) {
7317 time_t expire;
7318 int ttl = -1;
7319
7320 expire = getExpire(c->db,c->argv[1]);
7321 if (expire != -1) {
7322 ttl = (int) (expire-time(NULL));
7323 if (ttl < 0) ttl = -1;
7324 }
7325 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7326 }
7327
7328 /* ================================ MULTI/EXEC ============================== */
7329
7330 /* Client state initialization for MULTI/EXEC */
7331 static void initClientMultiState(redisClient *c) {
7332 c->mstate.commands = NULL;
7333 c->mstate.count = 0;
7334 }
7335
7336 /* Release all the resources associated with MULTI/EXEC state */
7337 static void freeClientMultiState(redisClient *c) {
7338 int j;
7339
7340 for (j = 0; j < c->mstate.count; j++) {
7341 int i;
7342 multiCmd *mc = c->mstate.commands+j;
7343
7344 for (i = 0; i < mc->argc; i++)
7345 decrRefCount(mc->argv[i]);
7346 zfree(mc->argv);
7347 }
7348 zfree(c->mstate.commands);
7349 }
7350
7351 /* Add a new command into the MULTI commands queue */
7352 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7353 multiCmd *mc;
7354 int j;
7355
7356 c->mstate.commands = zrealloc(c->mstate.commands,
7357 sizeof(multiCmd)*(c->mstate.count+1));
7358 mc = c->mstate.commands+c->mstate.count;
7359 mc->cmd = cmd;
7360 mc->argc = c->argc;
7361 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7362 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7363 for (j = 0; j < c->argc; j++)
7364 incrRefCount(mc->argv[j]);
7365 c->mstate.count++;
7366 }
7367
7368 static void multiCommand(redisClient *c) {
7369 c->flags |= REDIS_MULTI;
7370 addReply(c,shared.ok);
7371 }
7372
7373 static void discardCommand(redisClient *c) {
7374 if (!(c->flags & REDIS_MULTI)) {
7375 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7376 return;
7377 }
7378
7379 freeClientMultiState(c);
7380 initClientMultiState(c);
7381 c->flags &= (~REDIS_MULTI);
7382 addReply(c,shared.ok);
7383 }
7384
7385 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7386 * implememntation for more information. */
7387 static void execCommandReplicateMulti(redisClient *c) {
7388 struct redisCommand *cmd;
7389 robj *multistring = createStringObject("MULTI",5);
7390
7391 cmd = lookupCommand("multi");
7392 if (server.appendonly)
7393 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7394 if (listLength(server.slaves))
7395 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7396 decrRefCount(multistring);
7397 }
7398
7399 static void execCommand(redisClient *c) {
7400 int j;
7401 robj **orig_argv;
7402 int orig_argc;
7403
7404 if (!(c->flags & REDIS_MULTI)) {
7405 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7406 return;
7407 }
7408
7409 /* Replicate a MULTI request now that we are sure the block is executed.
7410 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7411 * both the AOF and the replication link will have the same consistency
7412 * and atomicity guarantees. */
7413 execCommandReplicateMulti(c);
7414
7415 /* Exec all the queued commands */
7416 orig_argv = c->argv;
7417 orig_argc = c->argc;
7418 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7419 for (j = 0; j < c->mstate.count; j++) {
7420 c->argc = c->mstate.commands[j].argc;
7421 c->argv = c->mstate.commands[j].argv;
7422 call(c,c->mstate.commands[j].cmd);
7423 }
7424 c->argv = orig_argv;
7425 c->argc = orig_argc;
7426 freeClientMultiState(c);
7427 initClientMultiState(c);
7428 c->flags &= (~REDIS_MULTI);
7429 /* Make sure the EXEC command is always replicated / AOF, since we
7430 * always send the MULTI command (we can't know beforehand if the
7431 * next operations will contain at least a modification to the DB). */
7432 server.dirty++;
7433 }
7434
7435 /* =========================== Blocking Operations ========================= */
7436
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7465
7466 /* Set a client in blocking mode for the specified key, with the specified
7467 * timeout */
7468 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7469 dictEntry *de;
7470 list *l;
7471 int j;
7472
7473 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7474 c->blockingkeysnum = numkeys;
7475 c->blockingto = timeout;
7476 for (j = 0; j < numkeys; j++) {
7477 /* Add the key in the client structure, to map clients -> keys */
7478 c->blockingkeys[j] = keys[j];
7479 incrRefCount(keys[j]);
7480
7481 /* And in the other "side", to map keys -> clients */
7482 de = dictFind(c->db->blockingkeys,keys[j]);
7483 if (de == NULL) {
7484 int retval;
7485
7486 /* For every key we take a list of clients blocked for it */
7487 l = listCreate();
7488 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7489 incrRefCount(keys[j]);
7490 assert(retval == DICT_OK);
7491 } else {
7492 l = dictGetEntryVal(de);
7493 }
7494 listAddNodeTail(l,c);
7495 }
7496 /* Mark the client as a blocked client */
7497 c->flags |= REDIS_BLOCKED;
7498 server.blpop_blocked_clients++;
7499 }
7500
7501 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7502 static void unblockClientWaitingData(redisClient *c) {
7503 dictEntry *de;
7504 list *l;
7505 int j;
7506
7507 assert(c->blockingkeys != NULL);
7508 /* The client may wait for multiple keys, so unblock it for every key. */
7509 for (j = 0; j < c->blockingkeysnum; j++) {
7510 /* Remove this client from the list of clients waiting for this key. */
7511 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7512 assert(de != NULL);
7513 l = dictGetEntryVal(de);
7514 listDelNode(l,listSearchKey(l,c));
7515 /* If the list is empty we need to remove it to avoid wasting memory */
7516 if (listLength(l) == 0)
7517 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7518 decrRefCount(c->blockingkeys[j]);
7519 }
7520 /* Cleanup the client structure */
7521 zfree(c->blockingkeys);
7522 c->blockingkeys = NULL;
7523 c->flags &= (~REDIS_BLOCKED);
7524 server.blpop_blocked_clients--;
7525 /* We want to process data if there is some command waiting
7526 * in the input buffer. Note that this is safe even if
7527 * unblockClientWaitingData() gets called from freeClient() because
7528 * freeClient() will be smart enough to call this function
7529 * *after* c->querybuf was set to NULL. */
7530 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7531 }
7532
7533 /* This should be called from any function PUSHing into lists.
7534 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7535 * 'ele' is the element pushed.
7536 *
7537 * If the function returns 0 there was no client waiting for a list push
7538 * against this key.
7539 *
7540 * If the function returns 1 there was a client waiting for a list push
7541 * against this key, the element was passed to this client thus it's not
7542 * needed to actually add it to the list and the caller should return asap. */
7543 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7544 struct dictEntry *de;
7545 redisClient *receiver;
7546 list *l;
7547 listNode *ln;
7548
7549 de = dictFind(c->db->blockingkeys,key);
7550 if (de == NULL) return 0;
7551 l = dictGetEntryVal(de);
7552 ln = listFirst(l);
7553 assert(ln != NULL);
7554 receiver = ln->value;
7555
7556 addReplySds(receiver,sdsnew("*2\r\n"));
7557 addReplyBulk(receiver,key);
7558 addReplyBulk(receiver,ele);
7559 unblockClientWaitingData(receiver);
7560 return 1;
7561 }
7562
7563 /* Blocking RPOP/LPOP */
7564 static void blockingPopGenericCommand(redisClient *c, int where) {
7565 robj *o;
7566 time_t timeout;
7567 int j;
7568
7569 for (j = 1; j < c->argc-1; j++) {
7570 o = lookupKeyWrite(c->db,c->argv[j]);
7571 if (o != NULL) {
7572 if (o->type != REDIS_LIST) {
7573 addReply(c,shared.wrongtypeerr);
7574 return;
7575 } else {
7576 list *list = o->ptr;
7577 if (listLength(list) != 0) {
7578 /* If the list contains elements fall back to the usual
7579 * non-blocking POP operation */
7580 robj *argv[2], **orig_argv;
7581 int orig_argc;
7582
7583 /* We need to alter the command arguments before to call
7584 * popGenericCommand() as the command takes a single key. */
7585 orig_argv = c->argv;
7586 orig_argc = c->argc;
7587 argv[1] = c->argv[j];
7588 c->argv = argv;
7589 c->argc = 2;
7590
7591 /* Also the return value is different, we need to output
7592 * the multi bulk reply header and the key name. The
7593 * "real" command will add the last element (the value)
7594 * for us. If this souds like an hack to you it's just
7595 * because it is... */
7596 addReplySds(c,sdsnew("*2\r\n"));
7597 addReplyBulk(c,argv[1]);
7598 popGenericCommand(c,where);
7599
7600 /* Fix the client structure with the original stuff */
7601 c->argv = orig_argv;
7602 c->argc = orig_argc;
7603 return;
7604 }
7605 }
7606 }
7607 }
7608 /* If the list is empty or the key does not exists we must block */
7609 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7610 if (timeout > 0) timeout += time(NULL);
7611 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7612 }
7613
7614 static void blpopCommand(redisClient *c) {
7615 blockingPopGenericCommand(c,REDIS_HEAD);
7616 }
7617
7618 static void brpopCommand(redisClient *c) {
7619 blockingPopGenericCommand(c,REDIS_TAIL);
7620 }
7621
7622 /* =============================== Replication ============================= */
7623
7624 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7625 ssize_t nwritten, ret = size;
7626 time_t start = time(NULL);
7627
7628 timeout++;
7629 while(size) {
7630 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7631 nwritten = write(fd,ptr,size);
7632 if (nwritten == -1) return -1;
7633 ptr += nwritten;
7634 size -= nwritten;
7635 }
7636 if ((time(NULL)-start) > timeout) {
7637 errno = ETIMEDOUT;
7638 return -1;
7639 }
7640 }
7641 return ret;
7642 }
7643
7644 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7645 ssize_t nread, totread = 0;
7646 time_t start = time(NULL);
7647
7648 timeout++;
7649 while(size) {
7650 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7651 nread = read(fd,ptr,size);
7652 if (nread == -1) return -1;
7653 ptr += nread;
7654 size -= nread;
7655 totread += nread;
7656 }
7657 if ((time(NULL)-start) > timeout) {
7658 errno = ETIMEDOUT;
7659 return -1;
7660 }
7661 }
7662 return totread;
7663 }
7664
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time through syncRead() (each byte gets its own 'timeout').
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is stripped as well. The buffer is kept null terminated after
 * every stored byte. At most size-1 bytes are stored: when the buffer
 * fills up before a newline is seen the function returns what it has read
 * so far.
 *
 * Returns the number of bytes stored before the newline was found (so a
 * stripped '\r' is still counted), or -1 if syncRead() fails. Nothing is
 * read nor written when 'size' is less than 2. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Reserve one byte for the null terminator. */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored, otherwise a line longer
             * than the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
7685
7686 static void syncCommand(redisClient *c) {
7687 /* ignore SYNC if aleady slave or in monitor mode */
7688 if (c->flags & REDIS_SLAVE) return;
7689
7690 /* SYNC can't be issued when the server has pending data to send to
7691 * the client about already issued commands. We need a fresh reply
7692 * buffer registering the differences between the BGSAVE and the current
7693 * dataset, so that we can copy to other slaves if needed. */
7694 if (listLength(c->reply) != 0) {
7695 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7696 return;
7697 }
7698
7699 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7700 /* Here we need to check if there is a background saving operation
7701 * in progress, or if it is required to start one */
7702 if (server.bgsavechildpid != -1) {
7703 /* Ok a background save is in progress. Let's check if it is a good
7704 * one for replication, i.e. if there is another slave that is
7705 * registering differences since the server forked to save */
7706 redisClient *slave;
7707 listNode *ln;
7708 listIter li;
7709
7710 listRewind(server.slaves,&li);
7711 while((ln = listNext(&li))) {
7712 slave = ln->value;
7713 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7714 }
7715 if (ln) {
7716 /* Perfect, the server is already registering differences for
7717 * another slave. Set the right state, and copy the buffer. */
7718 listRelease(c->reply);
7719 c->reply = listDup(slave->reply);
7720 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7721 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7722 } else {
7723 /* No way, we need to wait for the next BGSAVE in order to
7724 * register differences */
7725 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7726 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7727 }
7728 } else {
7729 /* Ok we don't have a BGSAVE in progress, let's start one */
7730 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7731 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7732 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7733 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7734 return;
7735 }
7736 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7737 }
7738 c->repldbfd = -1;
7739 c->flags |= REDIS_SLAVE;
7740 c->slaveseldb = 0;
7741 listAddNodeTail(server.slaves,c);
7742 return;
7743 }
7744
7745 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7746 redisClient *slave = privdata;
7747 REDIS_NOTUSED(el);
7748 REDIS_NOTUSED(mask);
7749 char buf[REDIS_IOBUF_LEN];
7750 ssize_t nwritten, buflen;
7751
7752 if (slave->repldboff == 0) {
7753 /* Write the bulk write count before to transfer the DB. In theory here
7754 * we don't know how much room there is in the output buffer of the
7755 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7756 * operations) will never be smaller than the few bytes we need. */
7757 sds bulkcount;
7758
7759 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7760 slave->repldbsize);
7761 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7762 {
7763 sdsfree(bulkcount);
7764 freeClient(slave);
7765 return;
7766 }
7767 sdsfree(bulkcount);
7768 }
7769 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7770 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7771 if (buflen <= 0) {
7772 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7773 (buflen == 0) ? "premature EOF" : strerror(errno));
7774 freeClient(slave);
7775 return;
7776 }
7777 if ((nwritten = write(fd,buf,buflen)) == -1) {
7778 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7779 strerror(errno));
7780 freeClient(slave);
7781 return;
7782 }
7783 slave->repldboff += nwritten;
7784 if (slave->repldboff == slave->repldbsize) {
7785 close(slave->repldbfd);
7786 slave->repldbfd = -1;
7787 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7788 slave->replstate = REDIS_REPL_ONLINE;
7789 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7790 sendReplyToClient, slave) == AE_ERR) {
7791 freeClient(slave);
7792 return;
7793 }
7794 addReplySds(slave,sdsempty());
7795 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7796 }
7797 }
7798
7799 /* This function is called at the end of every backgrond saving.
7800 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7801 * otherwise REDIS_ERR is passed to the function.
7802 *
7803 * The goal of this function is to handle slaves waiting for a successful
7804 * background saving in order to perform non-blocking synchronization. */
7805 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7806 listNode *ln;
7807 int startbgsave = 0;
7808 listIter li;
7809
7810 listRewind(server.slaves,&li);
7811 while((ln = listNext(&li))) {
7812 redisClient *slave = ln->value;
7813
7814 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7815 startbgsave = 1;
7816 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7817 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7818 struct redis_stat buf;
7819
7820 if (bgsaveerr != REDIS_OK) {
7821 freeClient(slave);
7822 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7823 continue;
7824 }
7825 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7826 redis_fstat(slave->repldbfd,&buf) == -1) {
7827 freeClient(slave);
7828 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7829 continue;
7830 }
7831 slave->repldboff = 0;
7832 slave->repldbsize = buf.st_size;
7833 slave->replstate = REDIS_REPL_SEND_BULK;
7834 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7835 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7836 freeClient(slave);
7837 continue;
7838 }
7839 }
7840 }
7841 if (startbgsave) {
7842 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7843 listIter li;
7844
7845 listRewind(server.slaves,&li);
7846 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7847 while((ln = listNext(&li))) {
7848 redisClient *slave = ln->value;
7849
7850 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7851 freeClient(slave);
7852 }
7853 }
7854 }
7855 }
7856
7857 static int syncWithMaster(void) {
7858 char buf[1024], tmpfile[256], authcmd[1024];
7859 long dumpsize;
7860 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7861 int dfd, maxtries = 5;
7862
7863 if (fd == -1) {
7864 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7865 strerror(errno));
7866 return REDIS_ERR;
7867 }
7868
7869 /* AUTH with the master if required. */
7870 if(server.masterauth) {
7871 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7872 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7873 close(fd);
7874 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7875 strerror(errno));
7876 return REDIS_ERR;
7877 }
7878 /* Read the AUTH result. */
7879 if (syncReadLine(fd,buf,1024,3600) == -1) {
7880 close(fd);
7881 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7882 strerror(errno));
7883 return REDIS_ERR;
7884 }
7885 if (buf[0] != '+') {
7886 close(fd);
7887 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7888 return REDIS_ERR;
7889 }
7890 }
7891
7892 /* Issue the SYNC command */
7893 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7894 close(fd);
7895 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7896 strerror(errno));
7897 return REDIS_ERR;
7898 }
7899 /* Read the bulk write count */
7900 if (syncReadLine(fd,buf,1024,3600) == -1) {
7901 close(fd);
7902 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7903 strerror(errno));
7904 return REDIS_ERR;
7905 }
7906 if (buf[0] != '$') {
7907 close(fd);
7908 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7909 return REDIS_ERR;
7910 }
7911 dumpsize = strtol(buf+1,NULL,10);
7912 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7913 /* Read the bulk write data on a temp file */
7914 while(maxtries--) {
7915 snprintf(tmpfile,256,
7916 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7917 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7918 if (dfd != -1) break;
7919 sleep(1);
7920 }
7921 if (dfd == -1) {
7922 close(fd);
7923 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7924 return REDIS_ERR;
7925 }
7926 while(dumpsize) {
7927 int nread, nwritten;
7928
7929 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7930 if (nread == -1) {
7931 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7932 strerror(errno));
7933 close(fd);
7934 close(dfd);
7935 return REDIS_ERR;
7936 }
7937 nwritten = write(dfd,buf,nread);
7938 if (nwritten == -1) {
7939 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7940 close(fd);
7941 close(dfd);
7942 return REDIS_ERR;
7943 }
7944 dumpsize -= nread;
7945 }
7946 close(dfd);
7947 if (rename(tmpfile,server.dbfilename) == -1) {
7948 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7949 unlink(tmpfile);
7950 close(fd);
7951 return REDIS_ERR;
7952 }
7953 emptyDb();
7954 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7955 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7956 close(fd);
7957 return REDIS_ERR;
7958 }
7959 server.master = createClient(fd);
7960 server.master->flags |= REDIS_MASTER;
7961 server.master->authenticated = 1;
7962 server.replstate = REDIS_REPL_CONNECTED;
7963 return REDIS_OK;
7964 }
7965
7966 static void slaveofCommand(redisClient *c) {
7967 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7968 !strcasecmp(c->argv[2]->ptr,"one")) {
7969 if (server.masterhost) {
7970 sdsfree(server.masterhost);
7971 server.masterhost = NULL;
7972 if (server.master) freeClient(server.master);
7973 server.replstate = REDIS_REPL_NONE;
7974 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7975 }
7976 } else {
7977 sdsfree(server.masterhost);
7978 server.masterhost = sdsdup(c->argv[1]->ptr);
7979 server.masterport = atoi(c->argv[2]->ptr);
7980 if (server.master) freeClient(server.master);
7981 server.replstate = REDIS_REPL_CONNECT;
7982 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7983 server.masterhost, server.masterport);
7984 }
7985 addReply(c,shared.ok);
7986 }
7987
7988 /* ============================ Maxmemory directive ======================== */
7989
7990 /* Try to free one object form the pre-allocated objects free list.
7991 * This is useful under low mem conditions as by default we take 1 million
7992 * free objects allocated. On success REDIS_OK is returned, otherwise
7993 * REDIS_ERR. */
7994 static int tryFreeOneObjectFromFreelist(void) {
7995 robj *o;
7996
7997 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7998 if (listLength(server.objfreelist)) {
7999 listNode *head = listFirst(server.objfreelist);
8000 o = listNodeValue(head);
8001 listDelNode(server.objfreelist,head);
8002 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8003 zfree(o);
8004 return REDIS_OK;
8005 } else {
8006 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8007 return REDIS_ERR;
8008 }
8009 }
8010
8011 /* This function gets called when 'maxmemory' is set on the config file to limit
8012 * the max memory used by the server, and we are out of memory.
8013 * This function will try to, in order:
8014 *
8015 * - Free objects from the free list
8016 * - Try to remove keys with an EXPIRE set
8017 *
8018 * It is not possible to free enough memory to reach used-memory < maxmemory
8019 * the server will start refusing commands that will enlarge even more the
8020 * memory usage.
8021 */
8022 static void freeMemoryIfNeeded(void) {
8023 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
8024 int j, k, freed = 0;
8025
8026 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8027 for (j = 0; j < server.dbnum; j++) {
8028 int minttl = -1;
8029 robj *minkey = NULL;
8030 struct dictEntry *de;
8031
8032 if (dictSize(server.db[j].expires)) {
8033 freed = 1;
8034 /* From a sample of three keys drop the one nearest to
8035 * the natural expire */
8036 for (k = 0; k < 3; k++) {
8037 time_t t;
8038
8039 de = dictGetRandomKey(server.db[j].expires);
8040 t = (time_t) dictGetEntryVal(de);
8041 if (minttl == -1 || t < minttl) {
8042 minkey = dictGetEntryKey(de);
8043 minttl = t;
8044 }
8045 }
8046 deleteKey(server.db+j,minkey);
8047 }
8048 }
8049 if (!freed) return; /* nothing to free... */
8050 }
8051 }
8052
8053 /* ============================== Append Only file ========================== */
8054
8055 /* Write the append only file buffer on disk.
8056 *
8057 * Since we are required to write the AOF before replying to the client,
8058 * and the only way the client socket can get a write is entering when the
8059 * the event loop, we accumulate all the AOF writes in a memory
8060 * buffer and write it on disk using this function just before entering
8061 * the event loop again. */
8062 static void flushAppendOnlyFile(void) {
8063 time_t now;
8064 ssize_t nwritten;
8065
8066 if (sdslen(server.aofbuf) == 0) return;
8067
8068 /* We want to perform a single write. This should be guaranteed atomic
8069 * at least if the filesystem we are writing is a real physical one.
8070 * While this will save us against the server being killed I don't think
8071 * there is much to do about the whole server stopping for power problems
8072 * or alike */
8073 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8074 if (nwritten != (signed)sdslen(server.aofbuf)) {
8075 /* Ooops, we are in troubles. The best thing to do for now is
8076 * aborting instead of giving the illusion that everything is
8077 * working as expected. */
8078 if (nwritten == -1) {
8079 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8080 } else {
8081 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8082 }
8083 exit(1);
8084 }
8085 sdsfree(server.aofbuf);
8086 server.aofbuf = sdsempty();
8087
8088 /* Fsync if needed */
8089 now = time(NULL);
8090 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8091 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8092 now-server.lastfsync > 1))
8093 {
8094 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8095 * flushing metadata. */
8096 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8097 server.lastfsync = now;
8098 }
8099 }
8100
8101 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8102 sds buf = sdsempty();
8103 int j;
8104 robj *tmpargv[3];
8105
8106 /* The DB this command was targetting is not the same as the last command
8107 * we appendend. To issue a SELECT command is needed. */
8108 if (dictid != server.appendseldb) {
8109 char seldb[64];
8110
8111 snprintf(seldb,sizeof(seldb),"%d",dictid);
8112 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8113 (unsigned long)strlen(seldb),seldb);
8114 server.appendseldb = dictid;
8115 }
8116
8117 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8118 * EXPIREs into EXPIREATs calls */
8119 if (cmd->proc == expireCommand) {
8120 long when;
8121
8122 tmpargv[0] = createStringObject("EXPIREAT",8);
8123 tmpargv[1] = argv[1];
8124 incrRefCount(argv[1]);
8125 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8126 tmpargv[2] = createObject(REDIS_STRING,
8127 sdscatprintf(sdsempty(),"%ld",when));
8128 argv = tmpargv;
8129 }
8130
8131 /* Append the actual command */
8132 buf = sdscatprintf(buf,"*%d\r\n",argc);
8133 for (j = 0; j < argc; j++) {
8134 robj *o = argv[j];
8135
8136 o = getDecodedObject(o);
8137 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8138 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8139 buf = sdscatlen(buf,"\r\n",2);
8140 decrRefCount(o);
8141 }
8142
8143 /* Free the objects from the modified argv for EXPIREAT */
8144 if (cmd->proc == expireCommand) {
8145 for (j = 0; j < 3; j++)
8146 decrRefCount(argv[j]);
8147 }
8148
8149 /* Append to the AOF buffer. This will be flushed on disk just before
8150 * of re-entering the event loop, so before the client will get a
8151 * positive reply about the operation performed. */
8152 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8153
8154 /* If a background append only file rewriting is in progress we want to
8155 * accumulate the differences between the child DB and the current one
8156 * in a buffer, so that when the child process will do its work we
8157 * can append the differences to the new append only file. */
8158 if (server.bgrewritechildpid != -1)
8159 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8160
8161 sdsfree(buf);
8162 }
8163
8164 /* In Redis commands are always executed in the context of a client, so in
8165 * order to load the append only file we need to create a fake client. */
8166 static struct redisClient *createFakeClient(void) {
8167 struct redisClient *c = zmalloc(sizeof(*c));
8168
8169 selectDb(c,0);
8170 c->fd = -1;
8171 c->querybuf = sdsempty();
8172 c->argc = 0;
8173 c->argv = NULL;
8174 c->flags = 0;
8175 /* We set the fake client as a slave waiting for the synchronization
8176 * so that Redis will not try to send replies to this client. */
8177 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8178 c->reply = listCreate();
8179 listSetFreeMethod(c->reply,decrRefCount);
8180 listSetDupMethod(c->reply,dupClientReplyValue);
8181 initClientMultiState(c);
8182 return c;
8183 }
8184
8185 static void freeFakeClient(struct redisClient *c) {
8186 sdsfree(c->querybuf);
8187 listRelease(c->reply);
8188 freeClientMultiState(c);
8189 zfree(c);
8190 }
8191
8192 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8193 * error (the append only file is zero-length) REDIS_ERR is returned. On
8194 * fatal error an error message is logged and the program exists. */
8195 int loadAppendOnlyFile(char *filename) {
8196 struct redisClient *fakeClient;
8197 FILE *fp = fopen(filename,"r");
8198 struct redis_stat sb;
8199 unsigned long long loadedkeys = 0;
8200 int appendonly = server.appendonly;
8201
8202 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8203 return REDIS_ERR;
8204
8205 if (fp == NULL) {
8206 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8207 exit(1);
8208 }
8209
8210 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8211 * to the same file we're about to read. */
8212 server.appendonly = 0;
8213
8214 fakeClient = createFakeClient();
8215 while(1) {
8216 int argc, j;
8217 unsigned long len;
8218 robj **argv;
8219 char buf[128];
8220 sds argsds;
8221 struct redisCommand *cmd;
8222
8223 if (fgets(buf,sizeof(buf),fp) == NULL) {
8224 if (feof(fp))
8225 break;
8226 else
8227 goto readerr;
8228 }
8229 if (buf[0] != '*') goto fmterr;
8230 argc = atoi(buf+1);
8231 argv = zmalloc(sizeof(robj*)*argc);
8232 for (j = 0; j < argc; j++) {
8233 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8234 if (buf[0] != '$') goto fmterr;
8235 len = strtol(buf+1,NULL,10);
8236 argsds = sdsnewlen(NULL,len);
8237 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8238 argv[j] = createObject(REDIS_STRING,argsds);
8239 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8240 }
8241
8242 /* Command lookup */
8243 cmd = lookupCommand(argv[0]->ptr);
8244 if (!cmd) {
8245 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8246 exit(1);
8247 }
8248 /* Try object encoding */
8249 if (cmd->flags & REDIS_CMD_BULK)
8250 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8251 /* Run the command in the context of a fake client */
8252 fakeClient->argc = argc;
8253 fakeClient->argv = argv;
8254 cmd->proc(fakeClient);
8255 /* Discard the reply objects list from the fake client */
8256 while(listLength(fakeClient->reply))
8257 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8258 /* Clean up, ready for the next command */
8259 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8260 zfree(argv);
8261 /* Handle swapping while loading big datasets when VM is on */
8262 loadedkeys++;
8263 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8264 while (zmalloc_used_memory() > server.vm_max_memory) {
8265 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8266 }
8267 }
8268 }
8269
8270 /* This point can only be reached when EOF is reached without errors.
8271 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8272 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8273
8274 fclose(fp);
8275 freeFakeClient(fakeClient);
8276 server.appendonly = appendonly;
8277 return REDIS_OK;
8278
8279 readerr:
8280 if (feof(fp)) {
8281 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8282 } else {
8283 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8284 }
8285 exit(1);
8286 fmterr:
8287 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8288 exit(1);
8289 }
8290
8291 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8292 static int fwriteBulkObject(FILE *fp, robj *obj) {
8293 char buf[128];
8294 int decrrc = 0;
8295
8296 /* Avoid the incr/decr ref count business if possible to help
8297 * copy-on-write (we are often in a child process when this function
8298 * is called).
8299 * Also makes sure that key objects don't get incrRefCount-ed when VM
8300 * is enabled */
8301 if (obj->encoding != REDIS_ENCODING_RAW) {
8302 obj = getDecodedObject(obj);
8303 decrrc = 1;
8304 }
8305 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8306 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8307 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8308 goto err;
8309 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8310 if (decrrc) decrRefCount(obj);
8311 return 1;
8312 err:
8313 if (decrrc) decrRefCount(obj);
8314 return 0;
8315 }
8316
/* Write the binary-safe string 's' of 'len' bytes to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * The payload write is skipped when 'len' is 0, so an empty string is
 * emitted as "$0\r\n\r\n".
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the matching conversion is %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8328
/* Emit the double 'd', formatted with "%.17g", on 'fp' as a bulk:
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    int paylen;

    /* The payload buffer already carries the trailing CRLF, which must not
     * be counted in the bulk length. */
    paylen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,(size_t)paylen,1,fp) == 0 ? 0 : 1;
}
8339
/* Emit the long 'l', formatted in decimal, on 'fp' as a bulk:
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    int paylen;

    /* The payload buffer already carries the trailing CRLF, which must not
     * be counted in the bulk length. */
    paylen = snprintf(payload,sizeof(payload),"%ld\r\n",l);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,(size_t)paylen,1,fp) == 0 ? 0 : 1;
}
8350
8351 /* Write a sequence of commands able to fully rebuild the dataset into
8352 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8353 static int rewriteAppendOnlyFile(char *filename) {
8354 dictIterator *di = NULL;
8355 dictEntry *de;
8356 FILE *fp;
8357 char tmpfile[256];
8358 int j;
8359 time_t now = time(NULL);
8360
8361 /* Note that we have to use a different temp name here compared to the
8362 * one used by rewriteAppendOnlyFileBackground() function. */
8363 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8364 fp = fopen(tmpfile,"w");
8365 if (!fp) {
8366 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8367 return REDIS_ERR;
8368 }
8369 for (j = 0; j < server.dbnum; j++) {
8370 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8371 redisDb *db = server.db+j;
8372 dict *d = db->dict;
8373 if (dictSize(d) == 0) continue;
8374 di = dictGetIterator(d);
8375 if (!di) {
8376 fclose(fp);
8377 return REDIS_ERR;
8378 }
8379
8380 /* SELECT the new DB */
8381 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8382 if (fwriteBulkLong(fp,j) == 0) goto werr;
8383
8384 /* Iterate this DB writing every entry */
8385 while((de = dictNext(di)) != NULL) {
8386 robj *key, *o;
8387 time_t expiretime;
8388 int swapped;
8389
8390 key = dictGetEntryKey(de);
8391 /* If the value for this key is swapped, load a preview in memory.
8392 * We use a "swapped" flag to remember if we need to free the
8393 * value object instead to just increment the ref count anyway
8394 * in order to avoid copy-on-write of pages if we are forked() */
8395 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8396 key->storage == REDIS_VM_SWAPPING) {
8397 o = dictGetEntryVal(de);
8398 swapped = 0;
8399 } else {
8400 o = vmPreviewObject(key);
8401 swapped = 1;
8402 }
8403 expiretime = getExpire(db,key);
8404
8405 /* Save the key and associated value */
8406 if (o->type == REDIS_STRING) {
8407 /* Emit a SET command */
8408 char cmd[]="*3\r\n$3\r\nSET\r\n";
8409 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8410 /* Key and value */
8411 if (fwriteBulkObject(fp,key) == 0) goto werr;
8412 if (fwriteBulkObject(fp,o) == 0) goto werr;
8413 } else if (o->type == REDIS_LIST) {
8414 /* Emit the RPUSHes needed to rebuild the list */
8415 list *list = o->ptr;
8416 listNode *ln;
8417 listIter li;
8418
8419 listRewind(list,&li);
8420 while((ln = listNext(&li))) {
8421 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8422 robj *eleobj = listNodeValue(ln);
8423
8424 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8425 if (fwriteBulkObject(fp,key) == 0) goto werr;
8426 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8427 }
8428 } else if (o->type == REDIS_SET) {
8429 /* Emit the SADDs needed to rebuild the set */
8430 dict *set = o->ptr;
8431 dictIterator *di = dictGetIterator(set);
8432 dictEntry *de;
8433
8434 while((de = dictNext(di)) != NULL) {
8435 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8436 robj *eleobj = dictGetEntryKey(de);
8437
8438 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8439 if (fwriteBulkObject(fp,key) == 0) goto werr;
8440 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8441 }
8442 dictReleaseIterator(di);
8443 } else if (o->type == REDIS_ZSET) {
8444 /* Emit the ZADDs needed to rebuild the sorted set */
8445 zset *zs = o->ptr;
8446 dictIterator *di = dictGetIterator(zs->dict);
8447 dictEntry *de;
8448
8449 while((de = dictNext(di)) != NULL) {
8450 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8451 robj *eleobj = dictGetEntryKey(de);
8452 double *score = dictGetEntryVal(de);
8453
8454 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8455 if (fwriteBulkObject(fp,key) == 0) goto werr;
8456 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8457 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8458 }
8459 dictReleaseIterator(di);
8460 } else if (o->type == REDIS_HASH) {
8461 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8462
8463 /* Emit the HSETs needed to rebuild the hash */
8464 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8465 unsigned char *p = zipmapRewind(o->ptr);
8466 unsigned char *field, *val;
8467 unsigned int flen, vlen;
8468
8469 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8470 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8471 if (fwriteBulkObject(fp,key) == 0) goto werr;
8472 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8473 return -1;
8474 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8475 return -1;
8476 }
8477 } else {
8478 dictIterator *di = dictGetIterator(o->ptr);
8479 dictEntry *de;
8480
8481 while((de = dictNext(di)) != NULL) {
8482 robj *field = dictGetEntryKey(de);
8483 robj *val = dictGetEntryVal(de);
8484
8485 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8486 if (fwriteBulkObject(fp,key) == 0) goto werr;
8487 if (fwriteBulkObject(fp,field) == -1) return -1;
8488 if (fwriteBulkObject(fp,val) == -1) return -1;
8489 }
8490 dictReleaseIterator(di);
8491 }
8492 } else {
8493 redisPanic("Unknown object type");
8494 }
8495 /* Save the expire time */
8496 if (expiretime != -1) {
8497 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8498 /* If this key is already expired skip it */
8499 if (expiretime < now) continue;
8500 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8501 if (fwriteBulkObject(fp,key) == 0) goto werr;
8502 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8503 }
8504 if (swapped) decrRefCount(o);
8505 }
8506 dictReleaseIterator(di);
8507 }
8508
8509 /* Make sure data will not remain on the OS's output buffers */
8510 fflush(fp);
8511 fsync(fileno(fp));
8512 fclose(fp);
8513
8514 /* Use RENAME to make sure the DB file is changed atomically only
8515 * if the generate DB file is ok. */
8516 if (rename(tmpfile,filename) == -1) {
8517 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8518 unlink(tmpfile);
8519 return REDIS_ERR;
8520 }
8521 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8522 return REDIS_OK;
8523
8524 werr:
8525 fclose(fp);
8526 unlink(tmpfile);
8527 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8528 if (di) dictReleaseIterator(di);
8529 return REDIS_ERR;
8530 }
8531
8532 /* This is how rewriting of the append only file in background works:
8533 *
8534 * 1) The user calls BGREWRITEAOF
8535 * 2) Redis calls this function, that forks():
8536 * 2a) the child rewrite the append only file in a temp file.
8537 * 2b) the parent accumulates differences in server.bgrewritebuf.
8538 * 3) When the child finished '2a' exists.
8539 * 4) The parent will trap the exit code, if it's OK, will append the
8540 * data accumulated into server.bgrewritebuf into the temp file, and
8541 * finally will rename(2) the temp file in the actual file name.
8542 * The the new file is reopened as the new append only file. Profit!
8543 */
8544 static int rewriteAppendOnlyFileBackground(void) {
8545 pid_t childpid;
8546
8547 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8548 if (server.vm_enabled) waitEmptyIOJobsQueue();
8549 if ((childpid = fork()) == 0) {
8550 /* Child */
8551 char tmpfile[256];
8552
8553 if (server.vm_enabled) vmReopenSwapFile();
8554 close(server.fd);
8555 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8556 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8557 _exit(0);
8558 } else {
8559 _exit(1);
8560 }
8561 } else {
8562 /* Parent */
8563 if (childpid == -1) {
8564 redisLog(REDIS_WARNING,
8565 "Can't rewrite append only file in background: fork: %s",
8566 strerror(errno));
8567 return REDIS_ERR;
8568 }
8569 redisLog(REDIS_NOTICE,
8570 "Background append only file rewriting started by pid %d",childpid);
8571 server.bgrewritechildpid = childpid;
8572 updateDictResizePolicy();
8573 /* We set appendseldb to -1 in order to force the next call to the
8574 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8575 * accumulated by the parent into server.bgrewritebuf will start
8576 * with a SELECT statement and it will be safe to merge. */
8577 server.appendseldb = -1;
8578 return REDIS_OK;
8579 }
8580 return REDIS_OK; /* unreached */
8581 }
8582
8583 static void bgrewriteaofCommand(redisClient *c) {
8584 if (server.bgrewritechildpid != -1) {
8585 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8586 return;
8587 }
8588 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8589 char *status = "+Background append only file rewriting started\r\n";
8590 addReplySds(c,sdsnew(status));
8591 } else {
8592 addReply(c,shared.err);
8593 }
8594 }
8595
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" associated with the
 * rewrite child 'childpid'. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8602
8603 /* Virtual Memory is composed mainly of two subsystems:
8604 * - Blocking Virtual Memory
8605 * - Threaded Virtual Memory I/O
8606 * The two parts are not fully decoupled, but functions are split among two
8607 * different sections of the source code (delimited by comments) in order to
8608 * make more clear what functionality is about the blocking VM and what about
8609 * the threaded (not blocking) VM.
8610 *
8611 * Redis VM design:
8612 *
8613 * Redis VM is a blocking VM (one that blocks reading swapped values from
8614 * disk into memory when a value swapped out is needed in memory) that is made
8615 * unblocking by trying to examine the command argument vector in order to
8616 * load in background values that will likely be needed in order to exec
8617 * the command. The command is executed only once all the relevant keys
8618 * are loaded into memory.
8619 *
8620 * This is basically almost as simple as a blocking VM, but almost as
8621 * parallel as a fully non-blocking VM.
8622 */
8623
8624 /* =================== Virtual Memory - Blocking Side ====================== */
8625
8626 static void vmInit(void) {
8627 off_t totsize;
8628 int pipefds[2];
8629 size_t stacksize;
8630 struct flock fl;
8631
8632 if (server.vm_max_threads != 0)
8633 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8634
8635 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8636 /* Try to open the old swap file, otherwise create it */
8637 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8638 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8639 }
8640 if (server.vm_fp == NULL) {
8641 redisLog(REDIS_WARNING,
8642 "Can't open the swap file: %s. Exiting.",
8643 strerror(errno));
8644 exit(1);
8645 }
8646 server.vm_fd = fileno(server.vm_fp);
8647 /* Lock the swap file for writing, this is useful in order to avoid
8648 * another instance to use the same swap file for a config error. */
8649 fl.l_type = F_WRLCK;
8650 fl.l_whence = SEEK_SET;
8651 fl.l_start = fl.l_len = 0;
8652 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8653 redisLog(REDIS_WARNING,
8654 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8655 exit(1);
8656 }
8657 /* Initialize */
8658 server.vm_next_page = 0;
8659 server.vm_near_pages = 0;
8660 server.vm_stats_used_pages = 0;
8661 server.vm_stats_swapped_objects = 0;
8662 server.vm_stats_swapouts = 0;
8663 server.vm_stats_swapins = 0;
8664 totsize = server.vm_pages*server.vm_page_size;
8665 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8666 if (ftruncate(server.vm_fd,totsize) == -1) {
8667 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8668 strerror(errno));
8669 exit(1);
8670 } else {
8671 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8672 }
8673 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8674 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8675 (long long) (server.vm_pages+7)/8, server.vm_pages);
8676 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8677
8678 /* Initialize threaded I/O (used by Virtual Memory) */
8679 server.io_newjobs = listCreate();
8680 server.io_processing = listCreate();
8681 server.io_processed = listCreate();
8682 server.io_ready_clients = listCreate();
8683 pthread_mutex_init(&server.io_mutex,NULL);
8684 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8685 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8686 server.io_active_threads = 0;
8687 if (pipe(pipefds) == -1) {
8688 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8689 ,strerror(errno));
8690 exit(1);
8691 }
8692 server.io_ready_pipe_read = pipefds[0];
8693 server.io_ready_pipe_write = pipefds[1];
8694 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8695 /* LZF requires a lot of stack */
8696 pthread_attr_init(&server.io_threads_attr);
8697 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8698 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8699 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8700 /* Listen for events in the threaded I/O pipe */
8701 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8702 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8703 oom("creating file event");
8704 }
8705
8706 /* Mark the page as used */
8707 static void vmMarkPageUsed(off_t page) {
8708 off_t byte = page/8;
8709 int bit = page&7;
8710 redisAssert(vmFreePage(page) == 1);
8711 server.vm_bitmap[byte] |= 1<<bit;
8712 }
8713
8714 /* Mark N contiguous pages as used, with 'page' being the first. */
8715 static void vmMarkPagesUsed(off_t page, off_t count) {
8716 off_t j;
8717
8718 for (j = 0; j < count; j++)
8719 vmMarkPageUsed(page+j);
8720 server.vm_stats_used_pages += count;
8721 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8722 (long long)count, (long long)page);
8723 }
8724
8725 /* Mark the page as free */
8726 static void vmMarkPageFree(off_t page) {
8727 off_t byte = page/8;
8728 int bit = page&7;
8729 redisAssert(vmFreePage(page) == 0);
8730 server.vm_bitmap[byte] &= ~(1<<bit);
8731 }
8732
8733 /* Mark N contiguous pages as free, with 'page' being the first. */
8734 static void vmMarkPagesFree(off_t page, off_t count) {
8735 off_t j;
8736
8737 for (j = 0; j < count; j++)
8738 vmMarkPageFree(page+j);
8739 server.vm_stats_used_pages -= count;
8740 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8741 (long long)count, (long long)page);
8742 }
8743
8744 /* Test if the page is free */
8745 static int vmFreePage(off_t page) {
8746 off_t byte = page/8;
8747 int bit = page&7;
8748 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8749 }
8750
8751 /* Find N contiguous free pages storing the first page of the cluster in *first.
8752 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8753 * REDIS_ERR is returned.
8754 *
8755 * This function uses a simple algorithm: we try to allocate
8756 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8757 * again from the start of the swap file searching for free spaces.
8758 *
8759 * If it looks pretty clear that there are no free pages near our offset
8760 * we try to find less populated places doing a forward jump of
8761 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8762 * without hurry, and then we jump again and so forth...
8763 *
8764 * This function can be improved using a free list to avoid to guess
8765 * too much, since we could collect data about freed pages.
8766 *
8767 * note: I implemented this function just after watching an episode of
8768 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8769 */
8770 static int vmFindContiguousPages(off_t *first, off_t n) {
8771 off_t base, offset = 0, since_jump = 0, numfree = 0;
8772
8773 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8774 server.vm_near_pages = 0;
8775 server.vm_next_page = 0;
8776 }
8777 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8778 base = server.vm_next_page;
8779
8780 while(offset < server.vm_pages) {
8781 off_t this = base+offset;
8782
8783 /* If we overflow, restart from page zero */
8784 if (this >= server.vm_pages) {
8785 this -= server.vm_pages;
8786 if (this == 0) {
8787 /* Just overflowed, what we found on tail is no longer
8788 * interesting, as it's no longer contiguous. */
8789 numfree = 0;
8790 }
8791 }
8792 if (vmFreePage(this)) {
8793 /* This is a free page */
8794 numfree++;
8795 /* Already got N free pages? Return to the caller, with success */
8796 if (numfree == n) {
8797 *first = this-(n-1);
8798 server.vm_next_page = this+1;
8799 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8800 return REDIS_OK;
8801 }
8802 } else {
8803 /* The current one is not a free page */
8804 numfree = 0;
8805 }
8806
8807 /* Fast-forward if the current page is not free and we already
8808 * searched enough near this place. */
8809 since_jump++;
8810 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8811 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8812 since_jump = 0;
8813 /* Note that even if we rewind after the jump, we are don't need
8814 * to make sure numfree is set to zero as we only jump *if* it
8815 * is set to zero. */
8816 } else {
8817 /* Otherwise just check the next page */
8818 offset++;
8819 }
8820 }
8821 return REDIS_ERR;
8822 }
8823
8824 /* Write the specified object at the specified page of the swap file */
8825 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8826 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8827 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8828 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8829 redisLog(REDIS_WARNING,
8830 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8831 strerror(errno));
8832 return REDIS_ERR;
8833 }
8834 rdbSaveObject(server.vm_fp,o);
8835 fflush(server.vm_fp);
8836 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8837 return REDIS_OK;
8838 }
8839
8840 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8841 * needed to later retrieve the object into the key object.
8842 * If we can't find enough contiguous empty pages to swap the object on disk
8843 * REDIS_ERR is returned. */
8844 static int vmSwapObjectBlocking(robj *key, robj *val) {
8845 off_t pages = rdbSavedObjectPages(val,NULL);
8846 off_t page;
8847
8848 assert(key->storage == REDIS_VM_MEMORY);
8849 assert(key->refcount == 1);
8850 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8851 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8852 key->vm.page = page;
8853 key->vm.usedpages = pages;
8854 key->storage = REDIS_VM_SWAPPED;
8855 key->vtype = val->type;
8856 decrRefCount(val); /* Deallocate the object from memory. */
8857 vmMarkPagesUsed(page,pages);
8858 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8859 (unsigned char*) key->ptr,
8860 (unsigned long long) page, (unsigned long long) pages);
8861 server.vm_stats_swapped_objects++;
8862 server.vm_stats_swapouts++;
8863 return REDIS_OK;
8864 }
8865
8866 static robj *vmReadObjectFromSwap(off_t page, int type) {
8867 robj *o;
8868
8869 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8870 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8871 redisLog(REDIS_WARNING,
8872 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8873 strerror(errno));
8874 _exit(1);
8875 }
8876 o = rdbLoadObject(type,server.vm_fp);
8877 if (o == NULL) {
8878 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8879 _exit(1);
8880 }
8881 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8882 return o;
8883 }
8884
8885 /* Load the value object relative to the 'key' object from swap to memory.
8886 * The newly allocated object is returned.
8887 *
8888 * If preview is true the unserialized object is returned to the caller but
8889 * no changes are made to the key object, nor the pages are marked as freed */
8890 static robj *vmGenericLoadObject(robj *key, int preview) {
8891 robj *val;
8892
8893 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8894 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8895 if (!preview) {
8896 key->storage = REDIS_VM_MEMORY;
8897 key->vm.atime = server.unixtime;
8898 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8899 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8900 (unsigned char*) key->ptr);
8901 server.vm_stats_swapped_objects--;
8902 } else {
8903 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8904 (unsigned char*) key->ptr);
8905 }
8906 server.vm_stats_swapins++;
8907 return val;
8908 }
8909
8910 /* Plain object loading, from swap to memory */
8911 static robj *vmLoadObject(robj *key) {
8912 /* If we are loading the object in background, stop it, we
8913 * need to load this object synchronously ASAP. */
8914 if (key->storage == REDIS_VM_LOADING)
8915 vmCancelThreadedIOJob(key);
8916 return vmGenericLoadObject(key,0);
8917 }
8918
8919 /* Just load the value on disk, without to modify the key.
8920 * This is useful when we want to perform some operation on the value
8921 * without to really bring it from swap to memory, like while saving the
8922 * dataset or rewriting the append only log. */
8923 static robj *vmPreviewObject(robj *key) {
8924 return vmGenericLoadObject(key,1);
8925 }
8926
8927 /* How a good candidate is this object for swapping?
8928 * The better candidate it is, the greater the returned value.
8929 *
8930 * Currently we try to perform a fast estimation of the object size in
8931 * memory, and combine it with aging informations.
8932 *
8933 * Basically swappability = idle-time * log(estimated size)
8934 *
8935 * Bigger objects are preferred over smaller objects, but not
8936 * proportionally, this is why we use the logarithm. This algorithm is
8937 * just a first try and will probably be tuned later. */
8938 static double computeObjectSwappability(robj *o) {
8939 time_t age = server.unixtime - o->vm.atime;
8940 long asize = 0;
8941 list *l;
8942 dict *d;
8943 struct dictEntry *de;
8944 int z;
8945
8946 if (age <= 0) return 0;
8947 switch(o->type) {
8948 case REDIS_STRING:
8949 if (o->encoding != REDIS_ENCODING_RAW) {
8950 asize = sizeof(*o);
8951 } else {
8952 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8953 }
8954 break;
8955 case REDIS_LIST:
8956 l = o->ptr;
8957 listNode *ln = listFirst(l);
8958
8959 asize = sizeof(list);
8960 if (ln) {
8961 robj *ele = ln->value;
8962 long elesize;
8963
8964 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8965 (sizeof(*o)+sdslen(ele->ptr)) :
8966 sizeof(*o);
8967 asize += (sizeof(listNode)+elesize)*listLength(l);
8968 }
8969 break;
8970 case REDIS_SET:
8971 case REDIS_ZSET:
8972 z = (o->type == REDIS_ZSET);
8973 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8974
8975 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8976 if (z) asize += sizeof(zset)-sizeof(dict);
8977 if (dictSize(d)) {
8978 long elesize;
8979 robj *ele;
8980
8981 de = dictGetRandomKey(d);
8982 ele = dictGetEntryKey(de);
8983 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8984 (sizeof(*o)+sdslen(ele->ptr)) :
8985 sizeof(*o);
8986 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8987 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8988 }
8989 break;
8990 case REDIS_HASH:
8991 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8992 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8993 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8994 unsigned int klen, vlen;
8995 unsigned char *key, *val;
8996
8997 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8998 klen = 0;
8999 vlen = 0;
9000 }
9001 asize = len*(klen+vlen+3);
9002 } else if (o->encoding == REDIS_ENCODING_HT) {
9003 d = o->ptr;
9004 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9005 if (dictSize(d)) {
9006 long elesize;
9007 robj *ele;
9008
9009 de = dictGetRandomKey(d);
9010 ele = dictGetEntryKey(de);
9011 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9012 (sizeof(*o)+sdslen(ele->ptr)) :
9013 sizeof(*o);
9014 ele = dictGetEntryVal(de);
9015 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9016 (sizeof(*o)+sdslen(ele->ptr)) :
9017 sizeof(*o);
9018 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9019 }
9020 }
9021 break;
9022 }
9023 return (double)age*log(1+asize);
9024 }
9025
9026 /* Try to swap an object that's a good candidate for swapping.
9027 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
9028 * to swap any object at all.
9029 *
9030 * If 'usethreaded' is true, Redis will try to swap the object in background
9031 * using I/O threads. */
9032 static int vmSwapOneObject(int usethreads) {
9033 int j, i;
9034 struct dictEntry *best = NULL;
9035 double best_swappability = 0;
9036 redisDb *best_db = NULL;
9037 robj *key, *val;
9038
9039 for (j = 0; j < server.dbnum; j++) {
9040 redisDb *db = server.db+j;
9041 /* Why maxtries is set to 100?
9042 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9043 * are swappable objects */
9044 int maxtries = 100;
9045
9046 if (dictSize(db->dict) == 0) continue;
9047 for (i = 0; i < 5; i++) {
9048 dictEntry *de;
9049 double swappability;
9050
9051 if (maxtries) maxtries--;
9052 de = dictGetRandomKey(db->dict);
9053 key = dictGetEntryKey(de);
9054 val = dictGetEntryVal(de);
9055 /* Only swap objects that are currently in memory.
9056 *
9057 * Also don't swap shared objects if threaded VM is on, as we
9058 * try to ensure that the main thread does not touch the
9059 * object while the I/O thread is using it, but we can't
9060 * control other keys without adding additional mutex. */
9061 if (key->storage != REDIS_VM_MEMORY ||
9062 (server.vm_max_threads != 0 && val->refcount != 1)) {
9063 if (maxtries) i--; /* don't count this try */
9064 continue;
9065 }
9066 swappability = computeObjectSwappability(val);
9067 if (!best || swappability > best_swappability) {
9068 best = de;
9069 best_swappability = swappability;
9070 best_db = db;
9071 }
9072 }
9073 }
9074 if (best == NULL) return REDIS_ERR;
9075 key = dictGetEntryKey(best);
9076 val = dictGetEntryVal(best);
9077
9078 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9079 key->ptr, best_swappability);
9080
9081 /* Unshare the key if needed */
9082 if (key->refcount > 1) {
9083 robj *newkey = dupStringObject(key);
9084 decrRefCount(key);
9085 key = dictGetEntryKey(best) = newkey;
9086 }
9087 /* Swap it */
9088 if (usethreads) {
9089 vmSwapObjectThreaded(key,val,best_db);
9090 return REDIS_OK;
9091 } else {
9092 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9093 dictGetEntryVal(best) = NULL;
9094 return REDIS_OK;
9095 } else {
9096 return REDIS_ERR;
9097 }
9098 }
9099 }
9100
/* Swap out one object synchronously: vmSwapOneObject() without threads.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9104
/* Swap out one object using the I/O threads: vmSwapOneObject() with
 * threads. Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9108
9109 /* Return true if it's safe to swap out objects in a given moment.
9110 * Basically we don't want to swap objects out while there is a BGSAVE
9111 * or a BGAEOREWRITE running in backgroud. */
9112 static int vmCanSwapOut(void) {
9113 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9114 }
9115
9116 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9117 * and was deleted. Otherwise 0 is returned. */
9118 static int deleteIfSwapped(redisDb *db, robj *key) {
9119 dictEntry *de;
9120 robj *foundkey;
9121
9122 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9123 foundkey = dictGetEntryKey(de);
9124 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9125 deleteKey(db,key);
9126 return 1;
9127 }
9128
9129 /* =================== Virtual Memory - Threaded I/O ======================= */
9130
9131 static void freeIOJob(iojob *j) {
9132 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9133 j->type == REDIS_IOJOB_DO_SWAP ||
9134 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9135 decrRefCount(j->val);
9136 /* We don't decrRefCount the j->key field as we did't incremented
9137 * the count creating IO Jobs. This is because the key field here is
9138 * just used as an indentifier and if a key is removed the Job should
9139 * never be touched again. */
9140 zfree(j);
9141 }
9142
9143 /* Every time a thread finished a Job, it writes a byte into the write side
9144 * of an unix pipe in order to "awake" the main thread, and this function
9145 * is called. */
9146 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9147 int mask)
9148 {
9149 char buf[1];
9150 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9151 REDIS_NOTUSED(el);
9152 REDIS_NOTUSED(mask);
9153 REDIS_NOTUSED(privdata);
9154
9155 /* For every byte we read in the read side of the pipe, there is one
9156 * I/O job completed to process. */
9157 while((retval = read(fd,buf,1)) == 1) {
9158 iojob *j;
9159 listNode *ln;
9160 robj *key;
9161 struct dictEntry *de;
9162
9163 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9164
9165 /* Get the processed element (the oldest one) */
9166 lockThreadedIO();
9167 assert(listLength(server.io_processed) != 0);
9168 if (toprocess == -1) {
9169 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9170 if (toprocess <= 0) toprocess = 1;
9171 }
9172 ln = listFirst(server.io_processed);
9173 j = ln->value;
9174 listDelNode(server.io_processed,ln);
9175 unlockThreadedIO();
9176 /* If this job is marked as canceled, just ignore it */
9177 if (j->canceled) {
9178 freeIOJob(j);
9179 continue;
9180 }
9181 /* Post process it in the main thread, as there are things we
9182 * can do just here to avoid race conditions and/or invasive locks */
9183 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9184 de = dictFind(j->db->dict,j->key);
9185 assert(de != NULL);
9186 key = dictGetEntryKey(de);
9187 if (j->type == REDIS_IOJOB_LOAD) {
9188 redisDb *db;
9189
9190 /* Key loaded, bring it at home */
9191 key->storage = REDIS_VM_MEMORY;
9192 key->vm.atime = server.unixtime;
9193 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9194 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9195 (unsigned char*) key->ptr);
9196 server.vm_stats_swapped_objects--;
9197 server.vm_stats_swapins++;
9198 dictGetEntryVal(de) = j->val;
9199 incrRefCount(j->val);
9200 db = j->db;
9201 freeIOJob(j);
9202 /* Handle clients waiting for this key to be loaded. */
9203 handleClientsBlockedOnSwappedKey(db,key);
9204 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9205 /* Now we know the amount of pages required to swap this object.
9206 * Let's find some space for it, and queue this task again
9207 * rebranded as REDIS_IOJOB_DO_SWAP. */
9208 if (!vmCanSwapOut() ||
9209 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9210 {
9211 /* Ooops... no space or we can't swap as there is
9212 * a fork()ed Redis trying to save stuff on disk. */
9213 freeIOJob(j);
9214 key->storage = REDIS_VM_MEMORY; /* undo operation */
9215 } else {
9216 /* Note that we need to mark this pages as used now,
9217 * if the job will be canceled, we'll mark them as freed
9218 * again. */
9219 vmMarkPagesUsed(j->page,j->pages);
9220 j->type = REDIS_IOJOB_DO_SWAP;
9221 lockThreadedIO();
9222 queueIOJob(j);
9223 unlockThreadedIO();
9224 }
9225 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9226 robj *val;
9227
9228 /* Key swapped. We can finally free some memory. */
9229 if (key->storage != REDIS_VM_SWAPPING) {
9230 printf("key->storage: %d\n",key->storage);
9231 printf("key->name: %s\n",(char*)key->ptr);
9232 printf("key->refcount: %d\n",key->refcount);
9233 printf("val: %p\n",(void*)j->val);
9234 printf("val->type: %d\n",j->val->type);
9235 printf("val->ptr: %s\n",(char*)j->val->ptr);
9236 }
9237 redisAssert(key->storage == REDIS_VM_SWAPPING);
9238 val = dictGetEntryVal(de);
9239 key->vm.page = j->page;
9240 key->vm.usedpages = j->pages;
9241 key->storage = REDIS_VM_SWAPPED;
9242 key->vtype = j->val->type;
9243 decrRefCount(val); /* Deallocate the object from memory. */
9244 dictGetEntryVal(de) = NULL;
9245 redisLog(REDIS_DEBUG,
9246 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9247 (unsigned char*) key->ptr,
9248 (unsigned long long) j->page, (unsigned long long) j->pages);
9249 server.vm_stats_swapped_objects++;
9250 server.vm_stats_swapouts++;
9251 freeIOJob(j);
9252 /* Put a few more swap requests in queue if we are still
9253 * out of memory */
9254 if (trytoswap && vmCanSwapOut() &&
9255 zmalloc_used_memory() > server.vm_max_memory)
9256 {
9257 int more = 1;
9258 while(more) {
9259 lockThreadedIO();
9260 more = listLength(server.io_newjobs) <
9261 (unsigned) server.vm_max_threads;
9262 unlockThreadedIO();
9263 /* Don't waste CPU time if swappable objects are rare. */
9264 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9265 trytoswap = 0;
9266 break;
9267 }
9268 }
9269 }
9270 }
9271 processed++;
9272 if (processed == toprocess) return;
9273 }
9274 if (retval < 0 && errno != EAGAIN) {
9275 redisLog(REDIS_WARNING,
9276 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9277 strerror(errno));
9278 }
9279 }
9280
9281 static void lockThreadedIO(void) {
9282 pthread_mutex_lock(&server.io_mutex);
9283 }
9284
9285 static void unlockThreadedIO(void) {
9286 pthread_mutex_unlock(&server.io_mutex);
9287 }
9288
9289 /* Remove the specified object from the threaded I/O queue if still not
9290 * processed, otherwise make sure to flag it as canceled. */
9291 static void vmCancelThreadedIOJob(robj *o) {
9292 list *lists[3] = {
9293 server.io_newjobs, /* 0 */
9294 server.io_processing, /* 1 */
9295 server.io_processed /* 2 */
9296 };
9297 int i;
9298
9299 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9300 again:
9301 lockThreadedIO();
9302 /* Search for a matching key in one of the queues */
9303 for (i = 0; i < 3; i++) {
9304 listNode *ln;
9305 listIter li;
9306
9307 listRewind(lists[i],&li);
9308 while ((ln = listNext(&li)) != NULL) {
9309 iojob *job = ln->value;
9310
9311 if (job->canceled) continue; /* Skip this, already canceled. */
9312 if (job->key == o) {
9313 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9314 (void*)job, (char*)o->ptr, job->type, i);
9315 /* Mark the pages as free since the swap didn't happened
9316 * or happened but is now discarded. */
9317 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9318 vmMarkPagesFree(job->page,job->pages);
9319 /* Cancel the job. It depends on the list the job is
9320 * living in. */
9321 switch(i) {
9322 case 0: /* io_newjobs */
9323 /* If the job was yet not processed the best thing to do
9324 * is to remove it from the queue at all */
9325 freeIOJob(job);
9326 listDelNode(lists[i],ln);
9327 break;
9328 case 1: /* io_processing */
9329 /* Oh Shi- the thread is messing with the Job:
9330 *
9331 * Probably it's accessing the object if this is a
9332 * PREPARE_SWAP or DO_SWAP job.
9333 * If it's a LOAD job it may be reading from disk and
9334 * if we don't wait for the job to terminate before to
9335 * cancel it, maybe in a few microseconds data can be
9336 * corrupted in this pages. So the short story is:
9337 *
9338 * Better to wait for the job to move into the
9339 * next queue (processed)... */
9340
9341 /* We try again and again until the job is completed. */
9342 unlockThreadedIO();
9343 /* But let's wait some time for the I/O thread
9344 * to finish with this job. After all this condition
9345 * should be very rare. */
9346 usleep(1);
9347 goto again;
9348 case 2: /* io_processed */
9349 /* The job was already processed, that's easy...
9350 * just mark it as canceled so that we'll ignore it
9351 * when processing completed jobs. */
9352 job->canceled = 1;
9353 break;
9354 }
9355 /* Finally we have to adjust the storage type of the object
9356 * in order to "UNDO" the operaiton. */
9357 if (o->storage == REDIS_VM_LOADING)
9358 o->storage = REDIS_VM_SWAPPED;
9359 else if (o->storage == REDIS_VM_SWAPPING)
9360 o->storage = REDIS_VM_MEMORY;
9361 unlockThreadedIO();
9362 return;
9363 }
9364 }
9365 }
9366 unlockThreadedIO();
9367 assert(1 != 1); /* We should never reach this */
9368 }
9369
9370 static void *IOThreadEntryPoint(void *arg) {
9371 iojob *j;
9372 listNode *ln;
9373 REDIS_NOTUSED(arg);
9374
9375 pthread_detach(pthread_self());
9376 while(1) {
9377 /* Get a new job to process */
9378 lockThreadedIO();
9379 if (listLength(server.io_newjobs) == 0) {
9380 /* No new jobs in queue, exit. */
9381 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9382 (long) pthread_self());
9383 server.io_active_threads--;
9384 unlockThreadedIO();
9385 return NULL;
9386 }
9387 ln = listFirst(server.io_newjobs);
9388 j = ln->value;
9389 listDelNode(server.io_newjobs,ln);
9390 /* Add the job in the processing queue */
9391 j->thread = pthread_self();
9392 listAddNodeTail(server.io_processing,j);
9393 ln = listLast(server.io_processing); /* We use ln later to remove it */
9394 unlockThreadedIO();
9395 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9396 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9397
9398 /* Process the Job */
9399 if (j->type == REDIS_IOJOB_LOAD) {
9400 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9401 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9402 FILE *fp = fopen("/dev/null","w+");
9403 j->pages = rdbSavedObjectPages(j->val,fp);
9404 fclose(fp);
9405 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9406 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9407 j->canceled = 1;
9408 }
9409
9410 /* Done: insert the job into the processed queue */
9411 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9412 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9413 lockThreadedIO();
9414 listDelNode(server.io_processing,ln);
9415 listAddNodeTail(server.io_processed,j);
9416 unlockThreadedIO();
9417
9418 /* Signal the main thread there is new stuff to process */
9419 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9420 }
9421 return NULL; /* never reached */
9422 }
9423
9424 static void spawnIOThread(void) {
9425 pthread_t thread;
9426 sigset_t mask, omask;
9427 int err;
9428
9429 sigemptyset(&mask);
9430 sigaddset(&mask,SIGCHLD);
9431 sigaddset(&mask,SIGHUP);
9432 sigaddset(&mask,SIGPIPE);
9433 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9434 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9435 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9436 strerror(err));
9437 usleep(1000000);
9438 }
9439 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9440 server.io_active_threads++;
9441 }
9442
9443 /* We need to wait for the last thread to exit before we are able to
9444 * fork() in order to BGSAVE or BGREWRITEAOF. */
9445 static void waitEmptyIOJobsQueue(void) {
9446 while(1) {
9447 int io_processed_len;
9448
9449 lockThreadedIO();
9450 if (listLength(server.io_newjobs) == 0 &&
9451 listLength(server.io_processing) == 0 &&
9452 server.io_active_threads == 0)
9453 {
9454 unlockThreadedIO();
9455 return;
9456 }
9457 /* While waiting for empty jobs queue condition we post-process some
9458 * finshed job, as I/O threads may be hanging trying to write against
9459 * the io_ready_pipe_write FD but there are so much pending jobs that
9460 * it's blocking. */
9461 io_processed_len = listLength(server.io_processed);
9462 unlockThreadedIO();
9463 if (io_processed_len) {
9464 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9465 usleep(1000); /* 1 millisecond */
9466 } else {
9467 usleep(10000); /* 10 milliseconds */
9468 }
9469 }
9470 }
9471
9472 static void vmReopenSwapFile(void) {
9473 /* Note: we don't close the old one as we are in the child process
9474 * and don't want to mess at all with the original file object. */
9475 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9476 if (server.vm_fp == NULL) {
9477 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9478 server.vm_swap_file);
9479 _exit(1);
9480 }
9481 server.vm_fd = fileno(server.vm_fp);
9482 }
9483
9484 /* This function must be called while with threaded IO locked */
9485 static void queueIOJob(iojob *j) {
9486 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9487 (void*)j, j->type, (char*)j->key->ptr);
9488 listAddNodeTail(server.io_newjobs,j);
9489 if (server.io_active_threads < server.vm_max_threads)
9490 spawnIOThread();
9491 }
9492
9493 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9494 iojob *j;
9495
9496 assert(key->storage == REDIS_VM_MEMORY);
9497 assert(key->refcount == 1);
9498
9499 j = zmalloc(sizeof(*j));
9500 j->type = REDIS_IOJOB_PREPARE_SWAP;
9501 j->db = db;
9502 j->key = key;
9503 j->val = val;
9504 incrRefCount(val);
9505 j->canceled = 0;
9506 j->thread = (pthread_t) -1;
9507 key->storage = REDIS_VM_SWAPPING;
9508
9509 lockThreadedIO();
9510 queueIOJob(j);
9511 unlockThreadedIO();
9512 return REDIS_OK;
9513 }
9514
9515 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9516
9517 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9518 * If there is not already a job loading the key, it is craeted.
9519 * The key is added to the io_keys list in the client structure, and also
9520 * in the hash table mapping swapped keys to waiting clients, that is,
9521 * server.io_waited_keys. */
9522 static int waitForSwappedKey(redisClient *c, robj *key) {
9523 struct dictEntry *de;
9524 robj *o;
9525 list *l;
9526
9527 /* If the key does not exist or is already in RAM we don't need to
9528 * block the client at all. */
9529 de = dictFind(c->db->dict,key);
9530 if (de == NULL) return 0;
9531 o = dictGetEntryKey(de);
9532 if (o->storage == REDIS_VM_MEMORY) {
9533 return 0;
9534 } else if (o->storage == REDIS_VM_SWAPPING) {
9535 /* We were swapping the key, undo it! */
9536 vmCancelThreadedIOJob(o);
9537 return 0;
9538 }
9539
9540 /* OK: the key is either swapped, or being loaded just now. */
9541
9542 /* Add the key to the list of keys this client is waiting for.
9543 * This maps clients to keys they are waiting for. */
9544 listAddNodeTail(c->io_keys,key);
9545 incrRefCount(key);
9546
9547 /* Add the client to the swapped keys => clients waiting map. */
9548 de = dictFind(c->db->io_keys,key);
9549 if (de == NULL) {
9550 int retval;
9551
9552 /* For every key we take a list of clients blocked for it */
9553 l = listCreate();
9554 retval = dictAdd(c->db->io_keys,key,l);
9555 incrRefCount(key);
9556 assert(retval == DICT_OK);
9557 } else {
9558 l = dictGetEntryVal(de);
9559 }
9560 listAddNodeTail(l,c);
9561
9562 /* Are we already loading the key from disk? If not create a job */
9563 if (o->storage == REDIS_VM_SWAPPED) {
9564 iojob *j;
9565
9566 o->storage = REDIS_VM_LOADING;
9567 j = zmalloc(sizeof(*j));
9568 j->type = REDIS_IOJOB_LOAD;
9569 j->db = c->db;
9570 j->key = o;
9571 j->key->vtype = o->vtype;
9572 j->page = o->vm.page;
9573 j->val = NULL;
9574 j->canceled = 0;
9575 j->thread = (pthread_t) -1;
9576 lockThreadedIO();
9577 queueIOJob(j);
9578 unlockThreadedIO();
9579 }
9580 return 1;
9581 }
9582
9583 /* Preload keys needed for the ZUNION and ZINTER commands. */
9584 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9585 int i, num;
9586 num = atoi(c->argv[2]->ptr);
9587 for (i = 0; i < num; i++) {
9588 waitForSwappedKey(c,c->argv[3+i]);
9589 }
9590 }
9591
9592 /* Is this client attempting to run a command against swapped keys?
9593 * If so, block it ASAP, load the keys in background, then resume it.
9594 *
9595 * The important idea about this function is that it can fail! If keys will
9596 * still be swapped when the client is resumed, this key lookups will
9597 * just block loading keys from disk. In practical terms this should only
9598 * happen with SORT BY command or if there is a bug in this function.
9599 *
9600 * Return 1 if the client is marked as blocked, 0 if the client can
9601 * continue as the keys it is going to access appear to be in memory. */
9602 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9603 int j, last;
9604
9605 if (cmd->vm_preload_proc != NULL) {
9606 cmd->vm_preload_proc(c);
9607 } else {
9608 if (cmd->vm_firstkey == 0) return 0;
9609 last = cmd->vm_lastkey;
9610 if (last < 0) last = c->argc+last;
9611 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9612 waitForSwappedKey(c,c->argv[j]);
9613 }
9614
9615 /* If the client was blocked for at least one key, mark it as blocked. */
9616 if (listLength(c->io_keys)) {
9617 c->flags |= REDIS_IO_WAIT;
9618 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9619 server.vm_blocked_clients++;
9620 return 1;
9621 } else {
9622 return 0;
9623 }
9624 }
9625
9626 /* Remove the 'key' from the list of blocked keys for a given client.
9627 *
9628 * The function returns 1 when there are no longer blocking keys after
9629 * the current one was removed (and the client can be unblocked). */
9630 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9631 list *l;
9632 listNode *ln;
9633 listIter li;
9634 struct dictEntry *de;
9635
9636 /* Remove the key from the list of keys this client is waiting for. */
9637 listRewind(c->io_keys,&li);
9638 while ((ln = listNext(&li)) != NULL) {
9639 if (equalStringObjects(ln->value,key)) {
9640 listDelNode(c->io_keys,ln);
9641 break;
9642 }
9643 }
9644 assert(ln != NULL);
9645
9646 /* Remove the client form the key => waiting clients map. */
9647 de = dictFind(c->db->io_keys,key);
9648 assert(de != NULL);
9649 l = dictGetEntryVal(de);
9650 ln = listSearchKey(l,c);
9651 assert(ln != NULL);
9652 listDelNode(l,ln);
9653 if (listLength(l) == 0)
9654 dictDelete(c->db->io_keys,key);
9655
9656 return listLength(c->io_keys) == 0;
9657 }
9658
9659 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9660 struct dictEntry *de;
9661 list *l;
9662 listNode *ln;
9663 int len;
9664
9665 de = dictFind(db->io_keys,key);
9666 if (!de) return;
9667
9668 l = dictGetEntryVal(de);
9669 len = listLength(l);
9670 /* Note: we can't use something like while(listLength(l)) as the list
9671 * can be freed by the calling function when we remove the last element. */
9672 while (len--) {
9673 ln = listFirst(l);
9674 redisClient *c = ln->value;
9675
9676 if (dontWaitForSwappedKey(c,key)) {
9677 /* Put the client in the list of clients ready to go as we
9678 * loaded all the keys about it. */
9679 listAddNodeTail(server.io_ready_clients,c);
9680 }
9681 }
9682 }
9683
9684 /* =========================== Remote Configuration ========================= */
9685
9686 static void configSetCommand(redisClient *c) {
9687 robj *o = getDecodedObject(c->argv[3]);
9688 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9689 zfree(server.dbfilename);
9690 server.dbfilename = zstrdup(o->ptr);
9691 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9692 zfree(server.requirepass);
9693 server.requirepass = zstrdup(o->ptr);
9694 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9695 zfree(server.masterauth);
9696 server.masterauth = zstrdup(o->ptr);
9697 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9698 server.maxmemory = strtoll(o->ptr, NULL, 10);
9699 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9700 if (!strcasecmp(o->ptr,"no")) {
9701 server.appendfsync = APPENDFSYNC_NO;
9702 } else if (!strcasecmp(o->ptr,"everysec")) {
9703 server.appendfsync = APPENDFSYNC_EVERYSEC;
9704 } else if (!strcasecmp(o->ptr,"always")) {
9705 server.appendfsync = APPENDFSYNC_ALWAYS;
9706 } else {
9707 goto badfmt;
9708 }
9709 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9710 int vlen, j;
9711 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9712
9713 /* Perform sanity check before setting the new config:
9714 * - Even number of args
9715 * - Seconds >= 1, changes >= 0 */
9716 if (vlen & 1) {
9717 sdsfreesplitres(v,vlen);
9718 goto badfmt;
9719 }
9720 for (j = 0; j < vlen; j++) {
9721 char *eptr;
9722 long val;
9723
9724 val = strtoll(v[j], &eptr, 10);
9725 if (eptr[0] != '\0' ||
9726 ((j & 1) == 0 && val < 1) ||
9727 ((j & 1) == 1 && val < 0)) {
9728 sdsfreesplitres(v,vlen);
9729 goto badfmt;
9730 }
9731 }
9732 /* Finally set the new config */
9733 resetServerSaveParams();
9734 for (j = 0; j < vlen; j += 2) {
9735 time_t seconds;
9736 int changes;
9737
9738 seconds = strtoll(v[j],NULL,10);
9739 changes = strtoll(v[j+1],NULL,10);
9740 appendServerSaveParams(seconds, changes);
9741 }
9742 sdsfreesplitres(v,vlen);
9743 } else {
9744 addReplySds(c,sdscatprintf(sdsempty(),
9745 "-ERR not supported CONFIG parameter %s\r\n",
9746 (char*)c->argv[2]->ptr));
9747 decrRefCount(o);
9748 return;
9749 }
9750 decrRefCount(o);
9751 addReply(c,shared.ok);
9752 return;
9753
9754 badfmt: /* Bad format errors */
9755 addReplySds(c,sdscatprintf(sdsempty(),
9756 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
9757 (char*)o->ptr,
9758 (char*)c->argv[2]->ptr));
9759 decrRefCount(o);
9760 }
9761
9762 static void configGetCommand(redisClient *c) {
9763 robj *o = getDecodedObject(c->argv[2]);
9764 robj *lenobj = createObject(REDIS_STRING,NULL);
9765 char *pattern = o->ptr;
9766 int matches = 0;
9767
9768 addReply(c,lenobj);
9769 decrRefCount(lenobj);
9770
9771 if (stringmatch(pattern,"dbfilename",0)) {
9772 addReplyBulkCString(c,"dbfilename");
9773 addReplyBulkCString(c,server.dbfilename);
9774 matches++;
9775 }
9776 if (stringmatch(pattern,"requirepass",0)) {
9777 addReplyBulkCString(c,"requirepass");
9778 addReplyBulkCString(c,server.requirepass);
9779 matches++;
9780 }
9781 if (stringmatch(pattern,"masterauth",0)) {
9782 addReplyBulkCString(c,"masterauth");
9783 addReplyBulkCString(c,server.masterauth);
9784 matches++;
9785 }
9786 if (stringmatch(pattern,"maxmemory",0)) {
9787 char buf[128];
9788
9789 snprintf(buf,128,"%llu\n",server.maxmemory);
9790 addReplyBulkCString(c,"maxmemory");
9791 addReplyBulkCString(c,buf);
9792 matches++;
9793 }
9794 if (stringmatch(pattern,"appendfsync",0)) {
9795 char *policy;
9796
9797 switch(server.appendfsync) {
9798 case APPENDFSYNC_NO: policy = "no"; break;
9799 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
9800 case APPENDFSYNC_ALWAYS: policy = "always"; break;
9801 default: policy = "unknown"; break; /* too harmless to panic */
9802 }
9803 addReplyBulkCString(c,"appendfsync");
9804 addReplyBulkCString(c,policy);
9805 matches++;
9806 }
9807 if (stringmatch(pattern,"save",0)) {
9808 sds buf = sdsempty();
9809 int j;
9810
9811 for (j = 0; j < server.saveparamslen; j++) {
9812 buf = sdscatprintf(buf,"%ld %d",
9813 server.saveparams[j].seconds,
9814 server.saveparams[j].changes);
9815 if (j != server.saveparamslen-1)
9816 buf = sdscatlen(buf," ",1);
9817 }
9818 addReplyBulkCString(c,"save");
9819 addReplyBulkCString(c,buf);
9820 sdsfree(buf);
9821 matches++;
9822 }
9823 decrRefCount(o);
9824 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9825 }
9826
9827 static void configCommand(redisClient *c) {
9828 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9829 if (c->argc != 4) goto badarity;
9830 configSetCommand(c);
9831 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9832 if (c->argc != 3) goto badarity;
9833 configGetCommand(c);
9834 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9835 if (c->argc != 2) goto badarity;
9836 server.stat_numcommands = 0;
9837 server.stat_numconnections = 0;
9838 server.stat_expiredkeys = 0;
9839 server.stat_starttime = time(NULL);
9840 addReply(c,shared.ok);
9841 } else {
9842 addReplySds(c,sdscatprintf(sdsempty(),
9843 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9844 }
9845 return;
9846
9847 badarity:
9848 addReplySds(c,sdscatprintf(sdsempty(),
9849 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9850 (char*) c->argv[1]->ptr));
9851 }
9852
9853 /* =========================== Pubsub implementation ======================== */
9854
9855 static void freePubsubPattern(void *p) {
9856 pubsubPattern *pat = p;
9857
9858 decrRefCount(pat->pattern);
9859 zfree(pat);
9860 }
9861
9862 static int listMatchPubsubPattern(void *a, void *b) {
9863 pubsubPattern *pa = a, *pb = b;
9864
9865 return (pa->client == pb->client) &&
9866 (equalStringObjects(pa->pattern,pb->pattern));
9867 }
9868
9869 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9870 * 0 if the client was already subscribed to that channel. */
9871 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9872 struct dictEntry *de;
9873 list *clients = NULL;
9874 int retval = 0;
9875
9876 /* Add the channel to the client -> channels hash table */
9877 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9878 retval = 1;
9879 incrRefCount(channel);
9880 /* Add the client to the channel -> list of clients hash table */
9881 de = dictFind(server.pubsub_channels,channel);
9882 if (de == NULL) {
9883 clients = listCreate();
9884 dictAdd(server.pubsub_channels,channel,clients);
9885 incrRefCount(channel);
9886 } else {
9887 clients = dictGetEntryVal(de);
9888 }
9889 listAddNodeTail(clients,c);
9890 }
9891 /* Notify the client */
9892 addReply(c,shared.mbulk3);
9893 addReply(c,shared.subscribebulk);
9894 addReplyBulk(c,channel);
9895 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9896 return retval;
9897 }
9898
9899 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9900 * 0 if the client was not subscribed to the specified channel. */
9901 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9902 struct dictEntry *de;
9903 list *clients;
9904 listNode *ln;
9905 int retval = 0;
9906
9907 /* Remove the channel from the client -> channels hash table */
9908 incrRefCount(channel); /* channel may be just a pointer to the same object
9909 we have in the hash tables. Protect it... */
9910 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9911 retval = 1;
9912 /* Remove the client from the channel -> clients list hash table */
9913 de = dictFind(server.pubsub_channels,channel);
9914 assert(de != NULL);
9915 clients = dictGetEntryVal(de);
9916 ln = listSearchKey(clients,c);
9917 assert(ln != NULL);
9918 listDelNode(clients,ln);
9919 if (listLength(clients) == 0) {
9920 /* Free the list and associated hash entry at all if this was
9921 * the latest client, so that it will be possible to abuse
9922 * Redis PUBSUB creating millions of channels. */
9923 dictDelete(server.pubsub_channels,channel);
9924 }
9925 }
9926 /* Notify the client */
9927 if (notify) {
9928 addReply(c,shared.mbulk3);
9929 addReply(c,shared.unsubscribebulk);
9930 addReplyBulk(c,channel);
9931 addReplyLong(c,dictSize(c->pubsub_channels)+
9932 listLength(c->pubsub_patterns));
9933
9934 }
9935 decrRefCount(channel); /* it is finally safe to release it */
9936 return retval;
9937 }
9938
9939 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9940 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9941 int retval = 0;
9942
9943 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9944 retval = 1;
9945 pubsubPattern *pat;
9946 listAddNodeTail(c->pubsub_patterns,pattern);
9947 incrRefCount(pattern);
9948 pat = zmalloc(sizeof(*pat));
9949 pat->pattern = getDecodedObject(pattern);
9950 pat->client = c;
9951 listAddNodeTail(server.pubsub_patterns,pat);
9952 }
9953 /* Notify the client */
9954 addReply(c,shared.mbulk3);
9955 addReply(c,shared.psubscribebulk);
9956 addReplyBulk(c,pattern);
9957 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9958 return retval;
9959 }
9960
9961 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9962 * 0 if the client was not subscribed to the specified channel. */
9963 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9964 listNode *ln;
9965 pubsubPattern pat;
9966 int retval = 0;
9967
9968 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9969 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9970 retval = 1;
9971 listDelNode(c->pubsub_patterns,ln);
9972 pat.client = c;
9973 pat.pattern = pattern;
9974 ln = listSearchKey(server.pubsub_patterns,&pat);
9975 listDelNode(server.pubsub_patterns,ln);
9976 }
9977 /* Notify the client */
9978 if (notify) {
9979 addReply(c,shared.mbulk3);
9980 addReply(c,shared.punsubscribebulk);
9981 addReplyBulk(c,pattern);
9982 addReplyLong(c,dictSize(c->pubsub_channels)+
9983 listLength(c->pubsub_patterns));
9984 }
9985 decrRefCount(pattern);
9986 return retval;
9987 }
9988
9989 /* Unsubscribe from all the channels. Return the number of channels the
9990 * client was subscribed from. */
9991 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9992 dictIterator *di = dictGetIterator(c->pubsub_channels);
9993 dictEntry *de;
9994 int count = 0;
9995
9996 while((de = dictNext(di)) != NULL) {
9997 robj *channel = dictGetEntryKey(de);
9998
9999 count += pubsubUnsubscribeChannel(c,channel,notify);
10000 }
10001 dictReleaseIterator(di);
10002 return count;
10003 }
10004
10005 /* Unsubscribe from all the patterns. Return the number of patterns the
10006 * client was subscribed from. */
10007 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10008 listNode *ln;
10009 listIter li;
10010 int count = 0;
10011
10012 listRewind(c->pubsub_patterns,&li);
10013 while ((ln = listNext(&li)) != NULL) {
10014 robj *pattern = ln->value;
10015
10016 count += pubsubUnsubscribePattern(c,pattern,notify);
10017 }
10018 return count;
10019 }
10020
10021 /* Publish a message */
10022 static int pubsubPublishMessage(robj *channel, robj *message) {
10023 int receivers = 0;
10024 struct dictEntry *de;
10025 listNode *ln;
10026 listIter li;
10027
10028 /* Send to clients listening for that channel */
10029 de = dictFind(server.pubsub_channels,channel);
10030 if (de) {
10031 list *list = dictGetEntryVal(de);
10032 listNode *ln;
10033 listIter li;
10034
10035 listRewind(list,&li);
10036 while ((ln = listNext(&li)) != NULL) {
10037 redisClient *c = ln->value;
10038
10039 addReply(c,shared.mbulk3);
10040 addReply(c,shared.messagebulk);
10041 addReplyBulk(c,channel);
10042 addReplyBulk(c,message);
10043 receivers++;
10044 }
10045 }
10046 /* Send to clients listening to matching channels */
10047 if (listLength(server.pubsub_patterns)) {
10048 listRewind(server.pubsub_patterns,&li);
10049 channel = getDecodedObject(channel);
10050 while ((ln = listNext(&li)) != NULL) {
10051 pubsubPattern *pat = ln->value;
10052
10053 if (stringmatchlen((char*)pat->pattern->ptr,
10054 sdslen(pat->pattern->ptr),
10055 (char*)channel->ptr,
10056 sdslen(channel->ptr),0)) {
10057 addReply(pat->client,shared.mbulk4);
10058 addReply(pat->client,shared.pmessagebulk);
10059 addReplyBulk(pat->client,pat->pattern);
10060 addReplyBulk(pat->client,channel);
10061 addReplyBulk(pat->client,message);
10062 receivers++;
10063 }
10064 }
10065 decrRefCount(channel);
10066 }
10067 return receivers;
10068 }
10069
10070 static void subscribeCommand(redisClient *c) {
10071 int j;
10072
10073 for (j = 1; j < c->argc; j++)
10074 pubsubSubscribeChannel(c,c->argv[j]);
10075 }
10076
10077 static void unsubscribeCommand(redisClient *c) {
10078 if (c->argc == 1) {
10079 pubsubUnsubscribeAllChannels(c,1);
10080 return;
10081 } else {
10082 int j;
10083
10084 for (j = 1; j < c->argc; j++)
10085 pubsubUnsubscribeChannel(c,c->argv[j],1);
10086 }
10087 }
10088
10089 static void psubscribeCommand(redisClient *c) {
10090 int j;
10091
10092 for (j = 1; j < c->argc; j++)
10093 pubsubSubscribePattern(c,c->argv[j]);
10094 }
10095
10096 static void punsubscribeCommand(redisClient *c) {
10097 if (c->argc == 1) {
10098 pubsubUnsubscribeAllPatterns(c,1);
10099 return;
10100 } else {
10101 int j;
10102
10103 for (j = 1; j < c->argc; j++)
10104 pubsubUnsubscribePattern(c,c->argv[j],1);
10105 }
10106 }
10107
10108 static void publishCommand(redisClient *c) {
10109 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10110 addReplyLong(c,receivers);
10111 }
10112
10113 /* ================================= Debugging ============================== */
10114
10115 static void debugCommand(redisClient *c) {
10116 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10117 *((char*)-1) = 'x';
10118 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10119 if (rdbSave(server.dbfilename) != REDIS_OK) {
10120 addReply(c,shared.err);
10121 return;
10122 }
10123 emptyDb();
10124 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10125 addReply(c,shared.err);
10126 return;
10127 }
10128 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10129 addReply(c,shared.ok);
10130 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10131 emptyDb();
10132 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10133 addReply(c,shared.err);
10134 return;
10135 }
10136 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10137 addReply(c,shared.ok);
10138 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10139 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10140 robj *key, *val;
10141
10142 if (!de) {
10143 addReply(c,shared.nokeyerr);
10144 return;
10145 }
10146 key = dictGetEntryKey(de);
10147 val = dictGetEntryVal(de);
10148 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10149 key->storage == REDIS_VM_SWAPPING)) {
10150 char *strenc;
10151 char buf[128];
10152
10153 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10154 strenc = strencoding[val->encoding];
10155 } else {
10156 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10157 strenc = buf;
10158 }
10159 addReplySds(c,sdscatprintf(sdsempty(),
10160 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10161 "encoding:%s serializedlength:%lld\r\n",
10162 (void*)key, key->refcount, (void*)val, val->refcount,
10163 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10164 } else {
10165 addReplySds(c,sdscatprintf(sdsempty(),
10166 "+Key at:%p refcount:%d, value swapped at: page %llu "
10167 "using %llu pages\r\n",
10168 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10169 (unsigned long long) key->vm.usedpages));
10170 }
10171 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10172 lookupKeyRead(c->db,c->argv[2]);
10173 addReply(c,shared.ok);
10174 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10175 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10176 robj *key, *val;
10177
10178 if (!server.vm_enabled) {
10179 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10180 return;
10181 }
10182 if (!de) {
10183 addReply(c,shared.nokeyerr);
10184 return;
10185 }
10186 key = dictGetEntryKey(de);
10187 val = dictGetEntryVal(de);
10188 /* If the key is shared we want to create a copy */
10189 if (key->refcount > 1) {
10190 robj *newkey = dupStringObject(key);
10191 decrRefCount(key);
10192 key = dictGetEntryKey(de) = newkey;
10193 }
10194 /* Swap it */
10195 if (key->storage != REDIS_VM_MEMORY) {
10196 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10197 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10198 dictGetEntryVal(de) = NULL;
10199 addReply(c,shared.ok);
10200 } else {
10201 addReply(c,shared.err);
10202 }
10203 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10204 long keys, j;
10205 robj *key, *val;
10206 char buf[128];
10207
10208 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10209 return;
10210 for (j = 0; j < keys; j++) {
10211 snprintf(buf,sizeof(buf),"key:%lu",j);
10212 key = createStringObject(buf,strlen(buf));
10213 if (lookupKeyRead(c->db,key) != NULL) {
10214 decrRefCount(key);
10215 continue;
10216 }
10217 snprintf(buf,sizeof(buf),"value:%lu",j);
10218 val = createStringObject(buf,strlen(buf));
10219 dictAdd(c->db->dict,key,val);
10220 }
10221 addReply(c,shared.ok);
10222 } else {
10223 addReplySds(c,sdsnew(
10224 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10225 }
10226 }
10227
10228 static void _redisAssert(char *estr, char *file, int line) {
10229 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10230 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
10231 #ifdef HAVE_BACKTRACE
10232 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10233 *((char*)-1) = 'x';
10234 #endif
10235 }
10236
10237 static void _redisPanic(char *msg, char *file, int line) {
10238 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10239 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10240 #ifdef HAVE_BACKTRACE
10241 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10242 *((char*)-1) = 'x';
10243 #endif
10244 }
10245
10246 /* =================================== Main! ================================ */
10247
10248 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content converted to
 * an integer with atoi(). Returns -1 when the file can't be opened or no
 * line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) != NULL) value = atoi(buf);
    fclose(fp);
    return value;
}
10262
10263 void linuxOvercommitMemoryWarning(void) {
10264 if (linuxOvercommitMemoryValue() == 0) {
10265 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10266 }
10267 }
10268 #endif /* __linux__ */
10269
10270 static void daemonize(void) {
10271 int fd;
10272 FILE *fp;
10273
10274 if (fork() != 0) exit(0); /* parent exits */
10275 setsid(); /* create a new session */
10276
10277 /* Every output goes to /dev/null. If Redis is daemonized but
10278 * the 'logfile' is set to 'stdout' in the configuration file
10279 * it will not log at all. */
10280 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10281 dup2(fd, STDIN_FILENO);
10282 dup2(fd, STDOUT_FILENO);
10283 dup2(fd, STDERR_FILENO);
10284 if (fd > STDERR_FILENO) close(fd);
10285 }
10286 /* Try to write the pid file */
10287 fp = fopen(server.pidfile,"w");
10288 if (fp) {
10289 fprintf(fp,"%d\n",getpid());
10290 fclose(fp);
10291 }
10292 }
10293
10294 static void version() {
10295 printf("Redis server version %s\n", REDIS_VERSION);
10296 exit(0);
10297 }
10298
/* Print the command line usage on standard error and exit with status 1. */
static void usage(void) {
    FILE *out = stderr;

    fprintf(out,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(out," ./redis-server - (read config from stdin)\n");
    exit(1);
}
10304
10305 int main(int argc, char **argv) {
10306 time_t start;
10307
10308 initServerConfig();
10309 if (argc == 2) {
10310 if (strcmp(argv[1], "-v") == 0 ||
10311 strcmp(argv[1], "--version") == 0) version();
10312 if (strcmp(argv[1], "--help") == 0) usage();
10313 resetServerSaveParams();
10314 loadServerConfig(argv[1]);
10315 } else if ((argc > 2)) {
10316 usage();
10317 } else {
10318 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10319 }
10320 if (server.daemonize) daemonize();
10321 initServer();
10322 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10323 #ifdef __linux__
10324 linuxOvercommitMemoryWarning();
10325 #endif
10326 start = time(NULL);
10327 if (server.appendonly) {
10328 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10329 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10330 } else {
10331 if (rdbLoad(server.dbfilename) == REDIS_OK)
10332 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10333 }
10334 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10335 aeSetBeforeSleepProc(server.el,beforeSleep);
10336 aeMain(server.el);
10337 aeDeleteEventLoop(server.el);
10338 return 0;
10339 }
10340
10341 /* ============================= Backtrace support ========================= */
10342
10343 #ifdef HAVE_BACKTRACE
10344 static char *findFuncName(void *pointer, unsigned long *offset);
10345
10346 static void *getMcontextEip(ucontext_t *uc) {
10347 #if defined(__FreeBSD__)
10348 return (void*) uc->uc_mcontext.mc_eip;
10349 #elif defined(__dietlibc__)
10350 return (void*) uc->uc_mcontext.eip;
10351 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10352 #if __x86_64__
10353 return (void*) uc->uc_mcontext->__ss.__rip;
10354 #else
10355 return (void*) uc->uc_mcontext->__ss.__eip;
10356 #endif
10357 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10358 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10359 return (void*) uc->uc_mcontext->__ss.__rip;
10360 #else
10361 return (void*) uc->uc_mcontext->__ss.__eip;
10362 #endif
10363 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10364 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10365 #elif defined(__ia64__) /* Linux IA64 */
10366 return (void*) uc->uc_mcontext.sc_ip;
10367 #else
10368 return NULL;
10369 #endif
10370 }
10371
10372 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10373 void *trace[100];
10374 char **messages = NULL;
10375 int i, trace_size = 0;
10376 unsigned long offset=0;
10377 ucontext_t *uc = (ucontext_t*) secret;
10378 sds infostring;
10379 REDIS_NOTUSED(info);
10380
10381 redisLog(REDIS_WARNING,
10382 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10383 infostring = genRedisInfoString();
10384 redisLog(REDIS_WARNING, "%s",infostring);
10385 /* It's not safe to sdsfree() the returned string under memory
10386 * corruption conditions. Let it leak as we are going to abort */
10387
10388 trace_size = backtrace(trace, 100);
10389 /* overwrite sigaction with caller's address */
10390 if (getMcontextEip(uc) != NULL) {
10391 trace[1] = getMcontextEip(uc);
10392 }
10393 messages = backtrace_symbols(trace, trace_size);
10394
10395 for (i=1; i<trace_size; ++i) {
10396 char *fn = findFuncName(trace[i], &offset), *p;
10397
10398 p = strchr(messages[i],'+');
10399 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10400 redisLog(REDIS_WARNING,"%s", messages[i]);
10401 } else {
10402 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10403 }
10404 }
10405 /* free(messages); Don't call free() with possibly corrupted memory. */
10406 _exit(0);
10407 }
10408
10409 static void setupSigSegvAction(void) {
10410 struct sigaction act;
10411
10412 sigemptyset (&act.sa_mask);
10413 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10414 * is used. Otherwise, sa_handler is used */
10415 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10416 act.sa_sigaction = segvHandler;
10417 sigaction (SIGSEGV, &act, NULL);
10418 sigaction (SIGBUS, &act, NULL);
10419 sigaction (SIGFPE, &act, NULL);
10420 sigaction (SIGILL, &act, NULL);
10421 sigaction (SIGBUS, &act, NULL);
10422 return;
10423 }
10424
10425 #include "staticsymbols.h"
10426 /* This function try to convert a pointer into a function name. It's used in
10427 * oreder to provide a backtrace under segmentation fault that's able to
10428 * display functions declared as static (otherwise the backtrace is useless). */
10429 static char *findFuncName(void *pointer, unsigned long *offset){
10430 int i, ret = -1;
10431 unsigned long off, minoff = 0;
10432
10433 /* Try to match against the Symbol with the smallest offset */
10434 for (i=0; symsTable[i].pointer; i++) {
10435 unsigned long lp = (unsigned long) pointer;
10436
10437 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10438 off=lp-symsTable[i].pointer;
10439 if (ret < 0 || off < minoff) {
10440 minoff=off;
10441 ret=i;
10442 }
10443 }
10444 }
10445 if (ret == -1) return NULL;
10446 *offset = minoff;
10447 return symsTable[ret].name;
10448 }
10449 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no crash handler to install. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
10452 #endif /* HAVE_BACKTRACE */
10453
10454
10455
10456 /* The End */
10457
10458
10459