]> git.saurik.com Git - redis.git/blob - redis.c
SETEX implemented
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.10"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130
/* Printable names of the object encodings, indexed by the REDIS_ENCODING_*
 * value of the object. */
static char *strencoding[] = {
    "raw",       /* REDIS_ENCODING_RAW    (0) */
    "int",       /* REDIS_ENCODING_INT    (1) */
    "zipmap",    /* REDIS_ENCODING_ZIPMAP (2) */
    "hashtable"  /* REDIS_ENCODING_HT     (3) */
};
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * accordingly to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about this magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* We can print the stacktrace, so our assert is defined this way.
 *
 * Both macros expand to a single, fully parenthesized void expression: they
 * report the failure through _redisAssert() / _redisPanic() (passing the
 * stringified argument, the file name and the line number) and then terminate
 * the process with _exit(1). Keeping the expansion parenthesized guarantees
 * the comma operator cannot be split apart by the surrounding expression. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
243
244 /*================================= Data types ============================== */
245
/* A redis object, that is a type able to hold a string / list / set */

/* The VM object structure: swap related bookkeeping embedded in every robj.
 * This declares the type only; no object of this type is defined here. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};

/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;
    unsigned char type;
    unsigned char encoding;
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype;    /* If this object is a key, and value is swapped
                             * out, this is the type of the swapped out
                             * object. */
    int refcount;
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead.
     * For this reason 'vm' must remain the last member. */
    struct redisObjectVM vm;
} robj;
271
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable (not a pointer) to initialize, _ptr the value to
 * store in its 'ptr' field. The object is set up as a raw encoded string with
 * a reference count of 1; 'storage' is only written when VM is enabled.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement even inside an
 * unbraced if/else. Arguments are parenthesized so that any expression can be
 * passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
283
284 typedef struct redisDb {
285 dict *dict; /* The keyspace for this DB */
286 dict *expires; /* Timeout of keys with a timeout set */
287 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
288 dict *io_keys; /* Keys with clients waiting for VM I/O */
289 int id;
290 } redisDb;
291
292 /* Client MULTI/EXEC state */
293 typedef struct multiCmd {
294 robj **argv;
295 int argc;
296 struct redisCommand *cmd;
297 } multiCmd;
298
299 typedef struct multiState {
300 multiCmd *commands; /* Array of MULTI commands */
301 int count; /* Total number of MULTI commands */
302 } multiState;
303
304 /* With multiplexing we need to take per-clinet state.
305 * Clients are taken in a liked list. */
306 typedef struct redisClient {
307 int fd;
308 redisDb *db;
309 int dictid;
310 sds querybuf;
311 robj **argv, **mbargv;
312 int argc, mbargc;
313 int bulklen; /* bulk read len. -1 if not in bulk read mode */
314 int multibulk; /* multi bulk command format active */
315 list *reply;
316 int sentlen;
317 time_t lastinteraction; /* time of the last interaction, used for timeout */
318 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
319 int slaveseldb; /* slave selected db, if this client is a slave */
320 int authenticated; /* when requirepass is non-NULL */
321 int replstate; /* replication state if this is a slave */
322 int repldbfd; /* replication DB file descriptor */
323 long repldboff; /* replication DB file offset */
324 off_t repldbsize; /* replication DB file size */
325 multiState mstate; /* MULTI/EXEC state */
326 robj **blockingkeys; /* The key we are waiting to terminate a blocking
327 * operation such as BLPOP. Otherwise NULL. */
328 int blockingkeysnum; /* Number of blocking keys */
329 time_t blockingto; /* Blocking operation timeout. If UNIX current time
330 * is >= blockingto then the operation timed out. */
331 list *io_keys; /* Keys this client is waiting to be loaded from the
332 * swap file in order to continue. */
333 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
334 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
335 } redisClient;
336
/* A (seconds, changes) pair. An array of these is referenced by the
 * 'saveparams' field of the server state.
 * NOTE(review): presumably a "save after N seconds if at least M changes"
 * rule -- confirm against the code consuming server.saveparams. */
struct saveparam {
    time_t seconds;
    int changes;
};
341
342 /* Global server state structure */
343 struct redisServer {
344 int port;
345 int fd;
346 redisDb *db;
347 long long dirty; /* changes to DB from the last save */
348 list *clients;
349 list *slaves, *monitors;
350 char neterr[ANET_ERR_LEN];
351 aeEventLoop *el;
352 int cronloops; /* number of times the cron function run */
353 list *objfreelist; /* A list of freed objects to avoid malloc() */
354 time_t lastsave; /* Unix time of last save succeeede */
355 /* Fields used only for stats */
356 time_t stat_starttime; /* server start time */
357 long long stat_numcommands; /* number of processed commands */
358 long long stat_numconnections; /* number of connections received */
359 long long stat_expiredkeys; /* number of expired keys */
360 /* Configuration */
361 int verbosity;
362 int glueoutputbuf;
363 int maxidletime;
364 int dbnum;
365 int daemonize;
366 int appendonly;
367 int appendfsync;
368 time_t lastfsync;
369 int appendfd;
370 int appendseldb;
371 char *pidfile;
372 pid_t bgsavechildpid;
373 pid_t bgrewritechildpid;
374 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
375 struct saveparam *saveparams;
376 int saveparamslen;
377 char *logfile;
378 char *bindaddr;
379 char *dbfilename;
380 char *appendfilename;
381 char *requirepass;
382 int rdbcompression;
383 int activerehashing;
384 /* Replication related */
385 int isslave;
386 char *masterauth;
387 char *masterhost;
388 int masterport;
389 redisClient *master; /* client that is master for this slave */
390 int replstate;
391 unsigned int maxclients;
392 unsigned long long maxmemory;
393 unsigned int blpop_blocked_clients;
394 unsigned int vm_blocked_clients;
395 /* Sort parameters - qsort_r() is only available under BSD so we
396 * have to take this state global, in order to pass it to sortCompare() */
397 int sort_desc;
398 int sort_alpha;
399 int sort_bypattern;
400 /* Virtual memory configuration */
401 int vm_enabled;
402 char *vm_swap_file;
403 off_t vm_page_size;
404 off_t vm_pages;
405 unsigned long long vm_max_memory;
406 /* Hashes config */
407 size_t hash_max_zipmap_entries;
408 size_t hash_max_zipmap_value;
409 /* Virtual memory state */
410 FILE *vm_fp;
411 int vm_fd;
412 off_t vm_next_page; /* Next probably empty page */
413 off_t vm_near_pages; /* Number of pages allocated sequentially */
414 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
415 time_t unixtime; /* Unix time sampled every second. */
416 /* Virtual memory I/O threads stuff */
417 /* An I/O thread process an element taken from the io_jobs queue and
418 * put the result of the operation in the io_done list. While the
419 * job is being processed, it's put on io_processing queue. */
420 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
421 list *io_processing; /* List of VM I/O jobs being processed */
422 list *io_processed; /* List of VM I/O jobs already processed */
423 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
424 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
425 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
426 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
427 pthread_attr_t io_threads_attr; /* attributes for threads creation */
428 int io_active_threads; /* Number of running I/O threads */
429 int vm_max_threads; /* Max number of I/O threads running at the same time */
430 /* Our main thread is blocked on the event loop, locking for sockets ready
431 * to be read or written, so when a threaded I/O operation is ready to be
432 * processed by the main thread, the I/O thread will use a unix pipe to
433 * awake the main thread. The followings are the two pipe FDs. */
434 int io_ready_pipe_read;
435 int io_ready_pipe_write;
436 /* Virtual memory stats */
437 unsigned long long vm_stats_used_pages;
438 unsigned long long vm_stats_swapped_objects;
439 unsigned long long vm_stats_swapouts;
440 unsigned long long vm_stats_swapins;
441 /* Pubsub */
442 dict *pubsub_channels; /* Map channels to list of subscribed clients */
443 list *pubsub_patterns; /* A list of pubsub_patterns */
444 /* Misc */
445 FILE *devnull;
446 };
447
448 typedef struct pubsubPattern {
449 redisClient *client;
450 robj *pattern;
451 } pubsubPattern;
452
453 typedef void redisCommandProc(redisClient *c);
454 struct redisCommand {
455 char *name;
456 redisCommandProc *proc;
457 int arity;
458 int flags;
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisCommandProc *vm_preload_proc;
463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey; /* THe last argument that's a key */
466 int vm_keystep; /* The step between first and last key */
467 };
468
/* Pairs a name with an unsigned long 'pointer' value.
 * NOTE(review): presumably a function name / address pair used for symbol
 * lookup -- confirm against the code that fills this table. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
473
474 typedef struct _redisSortObject {
475 robj *obj;
476 union {
477 double score;
478 robj *cmpobj;
479 } u;
480 } redisSortObject;
481
482 typedef struct _redisSortOperation {
483 int type;
484 robj *pattern;
485 } redisSortOperation;
486
487 /* ZSETs use a specialized version of Skiplists */
488
489 typedef struct zskiplistNode {
490 struct zskiplistNode **forward;
491 struct zskiplistNode *backward;
492 unsigned int *span;
493 double score;
494 robj *obj;
495 } zskiplistNode;
496
/* The skiplist itself: header and tail nodes, a length and a level. */
struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
};
typedef struct zskiplist zskiplist;
502
503 typedef struct zset {
504 dict *dict;
505 zskiplist *zsl;
506 } zset;
507
508 /* Our shared "common" objects */
509
510 #define REDIS_SHARED_INTEGERS 10000
511 struct sharedObjectsStruct {
512 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
513 *colon, *nullbulk, *nullmultibulk, *queued,
514 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
515 *outofrangeerr, *plus,
516 *select0, *select1, *select2, *select3, *select4,
517 *select5, *select6, *select7, *select8, *select9,
518 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
519 *mbulk4, *psubscribebulk, *punsubscribebulk,
520 *integers[REDIS_SHARED_INTEGERS];
521 } shared;
522
523 /* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
526
527 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
528
529 /* VM threaded I/O request message */
530 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
533 typedef struct iojob {
534 int type; /* Request type, REDIS_IOJOB_* */
535 redisDb *db;/* Redis database */
536 robj *key; /* This I/O request is about swapping this key */
537 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
538 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
539 off_t page; /* Swap page where to read/write the object */
540 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
541 int canceled; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread; /* ID of the thread processing this entry */
543 } iojob;
544
545 /*================================ Prototypes =============================== */
546
547 static void freeStringObject(robj *o);
548 static void freeListObject(robj *o);
549 static void freeSetObject(robj *o);
550 static void decrRefCount(void *o);
551 static robj *createObject(int type, void *ptr);
552 static void freeClient(redisClient *c);
553 static int rdbLoad(char *filename);
554 static void addReply(redisClient *c, robj *obj);
555 static void addReplySds(redisClient *c, sds s);
556 static void incrRefCount(robj *o);
557 static int rdbSaveBackground(char *filename);
558 static robj *createStringObject(char *ptr, size_t len);
559 static robj *dupStringObject(robj *o);
560 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
561 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
562 static int syncWithMaster(void);
563 static robj *tryObjectEncoding(robj *o);
564 static robj *getDecodedObject(robj *o);
565 static int removeExpire(redisDb *db, robj *key);
566 static int expireIfNeeded(redisDb *db, robj *key);
567 static int deleteIfVolatile(redisDb *db, robj *key);
568 static int deleteIfSwapped(redisDb *db, robj *key);
569 static int deleteKey(redisDb *db, robj *key);
570 static time_t getExpire(redisDb *db, robj *key);
571 static int setExpire(redisDb *db, robj *key, time_t when);
572 static void updateSlavesWaitingBgsave(int bgsaveerr);
573 static void freeMemoryIfNeeded(void);
574 static int processCommand(redisClient *c);
575 static void setupSigSegvAction(void);
576 static void rdbRemoveTempFile(pid_t childpid);
577 static void aofRemoveTempFile(pid_t childpid);
578 static size_t stringObjectLen(robj *o);
579 static void processInputBuffer(redisClient *c);
580 static zskiplist *zslCreate(void);
581 static void zslFree(zskiplist *zsl);
582 static void zslInsert(zskiplist *zsl, double score, robj *obj);
583 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void initClientMultiState(redisClient *c);
585 static void freeClientMultiState(redisClient *c);
586 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
587 static void unblockClientWaitingData(redisClient *c);
588 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
589 static void vmInit(void);
590 static void vmMarkPagesFree(off_t page, off_t count);
591 static robj *vmLoadObject(robj *key);
592 static robj *vmPreviewObject(robj *key);
593 static int vmSwapOneObjectBlocking(void);
594 static int vmSwapOneObjectThreaded(void);
595 static int vmCanSwapOut(void);
596 static int tryFreeOneObjectFromFreelist(void);
597 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
598 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
599 static void vmCancelThreadedIOJob(robj *o);
600 static void lockThreadedIO(void);
601 static void unlockThreadedIO(void);
602 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
603 static void freeIOJob(iojob *j);
604 static void queueIOJob(iojob *j);
605 static int vmWriteObjectOnSwap(robj *o, off_t page);
606 static robj *vmReadObjectFromSwap(off_t page, int type);
607 static void waitEmptyIOJobsQueue(void);
608 static void vmReopenSwapFile(void);
609 static int vmFreePage(off_t page);
610 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
611 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
612 static int dontWaitForSwappedKey(redisClient *c, robj *key);
613 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
614 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
615 static struct redisCommand *lookupCommand(char *name);
616 static void call(redisClient *c, struct redisCommand *cmd);
617 static void resetClient(redisClient *c);
618 static void convertToRealHash(robj *o);
619 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
620 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
621 static void freePubsubPattern(void *p);
622 static int listMatchPubsubPattern(void *a, void *b);
623 static int compareStringObjects(robj *a, robj *b);
/* Declared with an explicit (void) parameter list so this is a real
 * prototype: calls passing arguments are rejected at compile time. */
static void usage(void);
625
626 static void authCommand(redisClient *c);
627 static void pingCommand(redisClient *c);
628 static void echoCommand(redisClient *c);
629 static void setCommand(redisClient *c);
630 static void setnxCommand(redisClient *c);
631 static void setexCommand(redisClient *c);
632 static void getCommand(redisClient *c);
633 static void delCommand(redisClient *c);
634 static void existsCommand(redisClient *c);
635 static void incrCommand(redisClient *c);
636 static void decrCommand(redisClient *c);
637 static void incrbyCommand(redisClient *c);
638 static void decrbyCommand(redisClient *c);
639 static void selectCommand(redisClient *c);
640 static void randomkeyCommand(redisClient *c);
641 static void keysCommand(redisClient *c);
642 static void dbsizeCommand(redisClient *c);
643 static void lastsaveCommand(redisClient *c);
644 static void saveCommand(redisClient *c);
645 static void bgsaveCommand(redisClient *c);
646 static void bgrewriteaofCommand(redisClient *c);
647 static void shutdownCommand(redisClient *c);
648 static void moveCommand(redisClient *c);
649 static void renameCommand(redisClient *c);
650 static void renamenxCommand(redisClient *c);
651 static void lpushCommand(redisClient *c);
652 static void rpushCommand(redisClient *c);
653 static void lpopCommand(redisClient *c);
654 static void rpopCommand(redisClient *c);
655 static void llenCommand(redisClient *c);
656 static void lindexCommand(redisClient *c);
657 static void lrangeCommand(redisClient *c);
658 static void ltrimCommand(redisClient *c);
659 static void typeCommand(redisClient *c);
660 static void lsetCommand(redisClient *c);
661 static void saddCommand(redisClient *c);
662 static void sremCommand(redisClient *c);
663 static void smoveCommand(redisClient *c);
664 static void sismemberCommand(redisClient *c);
665 static void scardCommand(redisClient *c);
666 static void spopCommand(redisClient *c);
667 static void srandmemberCommand(redisClient *c);
668 static void sinterCommand(redisClient *c);
669 static void sinterstoreCommand(redisClient *c);
670 static void sunionCommand(redisClient *c);
671 static void sunionstoreCommand(redisClient *c);
672 static void sdiffCommand(redisClient *c);
673 static void sdiffstoreCommand(redisClient *c);
674 static void syncCommand(redisClient *c);
675 static void flushdbCommand(redisClient *c);
676 static void flushallCommand(redisClient *c);
677 static void sortCommand(redisClient *c);
678 static void lremCommand(redisClient *c);
679 static void rpoplpushcommand(redisClient *c);
680 static void infoCommand(redisClient *c);
681 static void mgetCommand(redisClient *c);
682 static void monitorCommand(redisClient *c);
683 static void expireCommand(redisClient *c);
684 static void expireatCommand(redisClient *c);
685 static void getsetCommand(redisClient *c);
686 static void ttlCommand(redisClient *c);
687 static void slaveofCommand(redisClient *c);
688 static void debugCommand(redisClient *c);
689 static void msetCommand(redisClient *c);
690 static void msetnxCommand(redisClient *c);
691 static void zaddCommand(redisClient *c);
692 static void zincrbyCommand(redisClient *c);
693 static void zrangeCommand(redisClient *c);
694 static void zrangebyscoreCommand(redisClient *c);
695 static void zcountCommand(redisClient *c);
696 static void zrevrangeCommand(redisClient *c);
697 static void zcardCommand(redisClient *c);
698 static void zremCommand(redisClient *c);
699 static void zscoreCommand(redisClient *c);
700 static void zremrangebyscoreCommand(redisClient *c);
701 static void multiCommand(redisClient *c);
702 static void execCommand(redisClient *c);
703 static void discardCommand(redisClient *c);
704 static void blpopCommand(redisClient *c);
705 static void brpopCommand(redisClient *c);
706 static void appendCommand(redisClient *c);
707 static void substrCommand(redisClient *c);
708 static void zrankCommand(redisClient *c);
709 static void zrevrankCommand(redisClient *c);
710 static void hsetCommand(redisClient *c);
711 static void hsetnxCommand(redisClient *c);
712 static void hgetCommand(redisClient *c);
713 static void hmsetCommand(redisClient *c);
714 static void hmgetCommand(redisClient *c);
715 static void hdelCommand(redisClient *c);
716 static void hlenCommand(redisClient *c);
717 static void zremrangebyrankCommand(redisClient *c);
718 static void zunionCommand(redisClient *c);
719 static void zinterCommand(redisClient *c);
720 static void hkeysCommand(redisClient *c);
721 static void hvalsCommand(redisClient *c);
722 static void hgetallCommand(redisClient *c);
723 static void hexistsCommand(redisClient *c);
724 static void configCommand(redisClient *c);
725 static void hincrbyCommand(redisClient *c);
726 static void subscribeCommand(redisClient *c);
727 static void unsubscribeCommand(redisClient *c);
728 static void psubscribeCommand(redisClient *c);
729 static void punsubscribeCommand(redisClient *c);
730 static void publishCommand(redisClient *c);
731
732 /*================================= Globals ================================= */
733
734 /* Global vars */
735 static struct redisServer server; /* server global state */
736 static struct redisCommand cmdTable[] = {
737 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
739 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
740 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
741 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
744 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
748 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
757 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
760 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
761 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
762 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
764 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
765 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
769 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
770 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
771 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
772 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
773 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
774 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
777 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
778 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
781 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
782 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
789 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
793 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
802 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
803 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
804 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
805 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
806 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
807 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
819 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
827 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
832 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
838 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
843 {NULL,NULL,0,0,NULL,0,0,0}
844 };
845
846 /*============================ Utility functions ============================ */
847
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading '^' negates the set, "a-z"
 *           is a range and "\x" inside the brackets takes x literally.
 *           A missing ']' ends the set at the end of the pattern.
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero literals and ranges are compared after
 * tolower(). Returns 1 if the whole string matches the whole pattern,
 * 0 otherwise.
 *
 * Both buffers are treated as length-delimited: no byte past the given
 * lengths is read, and neither length can become negative. In particular
 * a bracket expression or a literal never matches against an empty
 * string. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', looking ahead only inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one byte. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back one byte so that the
                     * common advance after the switch lands exactly on
                     * the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
970
/* Glob-style match between two NUL terminated C strings: measures both
 * with strlen() and delegates to stringmatchlen(). Returns 1 on match,
 * 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
974
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted syntax is an optional '-' sign, a run of decimal digits and
 * an optional, case insensitive unit:
 *
 *   (none) or "b"  -> 1
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * On parsing error, if err is not NULL, *err is set to 1, otherwise it is
 * set to 0. The error cases are:
 *
 *   - unknown unit: the number is still returned, as a count of bytes.
 *   - no digits at all: 0 is returned.
 *   - more digits than the internal buffer can hold: LLONG_MAX is returned.
 *   - the result does not fit a long long: LLONG_MAX (or LLONG_MIN for
 *     negative values) is returned instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    char *end;
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * isdigit() is only defined for unsigned char values and EOF. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }

    /* Copy sign and digits into a buffer so that strtoll() sees only the
     * numeric part. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';

    errno = 0;
    val = strtoll(buf,&end,10);
    if (end == buf) {
        /* Nothing was converted: input like "", "-" or "kb". */
        if (err) *err = 1;
        return 0;
    }
    if (errno == ERANGE) {
        /* strtoll() already saturated to LLONG_MAX / LLONG_MIN. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always >= 1, so these divisions are safe and tell us if the
     * multiplication below would overflow. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1021
1022 static void redisLog(int level, const char *fmt, ...) {
1023 va_list ap;
1024 FILE *fp;
1025
1026 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1027 if (!fp) return;
1028
1029 va_start(ap, fmt);
1030 if (level >= server.verbosity) {
1031 char *c = ".-*#";
1032 char buf[64];
1033 time_t now;
1034
1035 now = time(NULL);
1036 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1037 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1038 vfprintf(fp, fmt, ap);
1039 fprintf(fp,"\n");
1040 fflush(fp);
1041 }
1042 va_end(ap);
1043
1044 if (server.logfile) fclose(fp);
1045 }
1046
1047 /*====================== Hash table type implementation ==================== */
1048
1049 /* This is a hash table type that uses the SDS dynamic strings library as
1050 * keys and Redis objects as values (objects can hold SDS strings,
1051 * lists, sets). */
1052
/* Dictionary destructor for values that only need to be released with
 * zfree(). The private data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1058
1059 static void dictListDestructor(void *privdata, void *val)
1060 {
1061 DICT_NOTUSED(privdata);
1062 listRelease((list*)val);
1063 }
1064
1065 static int sdsDictKeyCompare(void *privdata, const void *key1,
1066 const void *key2)
1067 {
1068 int l1,l2;
1069 DICT_NOTUSED(privdata);
1070
1071 l1 = sdslen((sds)key1);
1072 l2 = sdslen((sds)key2);
1073 if (l1 != l2) return 0;
1074 return memcmp(key1, key2, l1) == 0;
1075 }
1076
/* Dictionary destructor for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since the values of
 * swapped out keys are set to NULL. The private data pointer is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1084
1085 static int dictObjKeyCompare(void *privdata, const void *key1,
1086 const void *key2)
1087 {
1088 const robj *o1 = key1, *o2 = key2;
1089 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1090 }
1091
1092 static unsigned int dictObjHash(const void *key) {
1093 const robj *o = key;
1094 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1095 }
1096
1097 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1098 const void *key2)
1099 {
1100 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1101 int cmp;
1102
1103 if (o1->encoding == REDIS_ENCODING_INT &&
1104 o2->encoding == REDIS_ENCODING_INT &&
1105 o1->ptr == o2->ptr) return 1;
1106
1107 o1 = getDecodedObject(o1);
1108 o2 = getDecodedObject(o2);
1109 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1110 decrRefCount(o1);
1111 decrRefCount(o2);
1112 return cmp;
1113 }
1114
1115 static unsigned int dictEncObjHash(const void *key) {
1116 robj *o = (robj*) key;
1117
1118 if (o->encoding == REDIS_ENCODING_RAW) {
1119 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1120 } else {
1121 if (o->encoding == REDIS_ENCODING_INT) {
1122 char buf[32];
1123 int len;
1124
1125 len = snprintf(buf,32,"%ld",(long)o->ptr);
1126 return dictGenHashFunction((unsigned char*)buf, len);
1127 } else {
1128 unsigned int hash;
1129
1130 o = getDecodedObject(o);
1131 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1132 decrRefCount(o);
1133 return hash;
1134 }
1135 }
1136 }
1137
1138 /* Sets type and expires: keys are redis objects hashed and compared with the
 * dictEncObj* helpers and released with dictRedisObjectDestructor. Values
 * are never duplicated nor destroyed by the dictionary.
 * NOTE(review): the "and expires" part may be stale, since keyptrDictType is
 * the table labelled Db->expires -- confirm. */
1139 static dictType setDictType = {
1140 dictEncObjHash, /* hash function */
1141 NULL, /* key dup */
1142 NULL, /* val dup */
1143 dictEncObjKeyCompare, /* key compare */
1144 dictRedisObjectDestructor, /* key destructor */
1145 NULL /* val destructor */
1146 };
1147
1148 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects handled like in setDictType; values are released
 * with dictVanillaFree, that is, with a plain zfree(). */
1149 static dictType zsetDictType = {
1150 dictEncObjHash, /* hash function */
1151 NULL, /* key dup */
1152 NULL, /* val dup */
1153 dictEncObjKeyCompare, /* key compare */
1154 dictRedisObjectDestructor, /* key destructor */
1155 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1156 };
1157
1158 /* Db->dict: keys are hashed and compared through their sds string
 * (dictObjHash / dictObjKeyCompare). Both keys and values are redis
 * objects released with dictRedisObjectDestructor, which tolerates the
 * NULL values of swapped out keys. */
1159 static dictType dbDictType = {
1160 dictObjHash, /* hash function */
1161 NULL, /* key dup */
1162 NULL, /* val dup */
1163 dictObjKeyCompare, /* key compare */
1164 dictRedisObjectDestructor, /* key destructor */
1165 dictRedisObjectDestructor /* val destructor */
1166 };
1167
1168 /* Db->expires: same key handling as dbDictType, but the dictionary has no
 * value destructor. serverCron() reads the value back by casting it to
 * time_t, so the value is the expire time stored in the pointer itself. */
1169 static dictType keyptrDictType = {
1170 dictObjHash, /* hash function */
1171 NULL, /* key dup */
1172 NULL, /* val dup */
1173 dictObjKeyCompare, /* key compare */
1174 dictRedisObjectDestructor, /* key destructor */
1175 NULL /* val destructor */
1176 };
1177
1178 /* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys and values are both possibly encoded redis objects, released with
 * dictRedisObjectDestructor. */
1179 static dictType hashDictType = {
1180 dictEncObjHash, /* hash function */
1181 NULL, /* key dup */
1182 NULL, /* val dup */
1183 dictEncObjKeyCompare, /* key compare */
1184 dictRedisObjectDestructor, /* key destructor */
1185 dictRedisObjectDestructor /* val destructor */
1186 };
1187
1188 /* Keylist hash table type has unencoded redis objects as keys and
1189 * lists as values. It's used for blocking operations (BLPOP) and to
1190 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * The value lists are released with listRelease() via dictListDestructor. */
1191 static dictType keylistDictType = {
1192 dictObjHash, /* hash function */
1193 NULL, /* key dup */
1194 NULL, /* val dup */
1195 dictObjKeyCompare, /* key compare */
1196 dictRedisObjectDestructor, /* key destructor */
1197 dictListDestructor /* val destructor */
1198 };
1199
1200 static void version();
1201
1202 /* ========================= Random utility functions ======================= */
1203
1204 /* Redis generally does not try to recover from out of memory conditions
1205 * when allocating objects or strings, it is not clear if it will be possible
1206 * to report this condition to the client since the networking layer itself
1207 * is based on heap allocation for send buffers, so we simply abort.
1208 * At least the code will be simpler to read... */
1209 static void oom(const char *msg) {
1210 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1211 sleep(1);
1212 abort();
1213 }
1214
1215 /* ====================== Redis server networking stuff ===================== */
1216 static void closeTimedoutClients(void) {
1217 redisClient *c;
1218 listNode *ln;
1219 time_t now = time(NULL);
1220 listIter li;
1221
1222 listRewind(server.clients,&li);
1223 while ((ln = listNext(&li)) != NULL) {
1224 c = listNodeValue(ln);
1225 if (server.maxidletime &&
1226 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1227 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1228 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1229 listLength(c->pubsub_patterns) == 0 &&
1230 (now - c->lastinteraction > server.maxidletime))
1231 {
1232 redisLog(REDIS_VERBOSE,"Closing idle client");
1233 freeClient(c);
1234 } else if (c->flags & REDIS_BLOCKED) {
1235 if (c->blockingto != 0 && c->blockingto < now) {
1236 addReply(c,shared.nullmultibulk);
1237 unblockClientWaitingData(c);
1238 }
1239 }
1240 }
1241 }
1242
1243 static int htNeedsResize(dict *dict) {
1244 long long size, used;
1245
1246 size = dictSlots(dict);
1247 used = dictSize(dict);
1248 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1249 (used*100/size < REDIS_HT_MINFILL));
1250 }
1251
1252 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1253 * we resize the hash table to save memory */
1254 static void tryResizeHashTables(void) {
1255 int j;
1256
1257 for (j = 0; j < server.dbnum; j++) {
1258 if (htNeedsResize(server.db[j].dict))
1259 dictResize(server.db[j].dict);
1260 if (htNeedsResize(server.db[j].expires))
1261 dictResize(server.db[j].expires);
1262 }
1263 }
1264
1265 /* Our hash table implementation performs rehashing incrementally while
1266 * we write/read from the hash table. Still if the server is idle, the hash
1267 * table will use two tables for a long time. So we try to use 1 millisecond
1268 * of CPU time at every serverCron() loop in order to rehash some key. */
1269 static void incrementallyRehash(void) {
1270 int j;
1271
1272 for (j = 0; j < server.dbnum; j++) {
1273 if (dictIsRehashing(server.db[j].dict)) {
1274 dictRehashMilliseconds(server.db[j].dict,1);
1275 break; /* already used our millisecond for this loop... */
1276 }
1277 }
1278 }
1279
1280 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1281 void backgroundSaveDoneHandler(int statloc) {
1282 int exitcode = WEXITSTATUS(statloc);
1283 int bysignal = WIFSIGNALED(statloc);
1284
1285 if (!bysignal && exitcode == 0) {
1286 redisLog(REDIS_NOTICE,
1287 "Background saving terminated with success");
1288 server.dirty = 0;
1289 server.lastsave = time(NULL);
1290 } else if (!bysignal && exitcode != 0) {
1291 redisLog(REDIS_WARNING, "Background saving error");
1292 } else {
1293 redisLog(REDIS_WARNING,
1294 "Background saving terminated by signal %d", WTERMSIG(statloc));
1295 rdbRemoveTempFile(server.bgsavechildpid);
1296 }
1297 server.bgsavechildpid = -1;
1298 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1299 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1300 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1301 }
1302
1303 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1304 * Handle this. */
1305 void backgroundRewriteDoneHandler(int statloc) {
1306 int exitcode = WEXITSTATUS(statloc);
1307 int bysignal = WIFSIGNALED(statloc);
1308
1309 if (!bysignal && exitcode == 0) {
1310 int fd;
1311 char tmpfile[256];
1312
1313 redisLog(REDIS_NOTICE,
1314 "Background append only file rewriting terminated with success");
1315 /* Now it's time to flush the differences accumulated by the parent */
1316 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1317 fd = open(tmpfile,O_WRONLY|O_APPEND);
1318 if (fd == -1) {
1319 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1320 goto cleanup;
1321 }
1322 /* Flush our data... */
1323 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1324 (signed) sdslen(server.bgrewritebuf)) {
1325 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1326 close(fd);
1327 goto cleanup;
1328 }
1329 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1330 /* Now our work is to rename the temp file into the stable file. And
1331 * switch the file descriptor used by the server for append only. */
1332 if (rename(tmpfile,server.appendfilename) == -1) {
1333 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1334 close(fd);
1335 goto cleanup;
1336 }
1337 /* Mission completed... almost */
1338 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1339 if (server.appendfd != -1) {
1340 /* If append only is actually enabled... */
1341 close(server.appendfd);
1342 server.appendfd = fd;
1343 fsync(fd);
1344 server.appendseldb = -1; /* Make sure it will issue SELECT */
1345 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1346 } else {
1347 /* If append only is disabled we just generate a dump in this
1348 * format. Why not? */
1349 close(fd);
1350 }
1351 } else if (!bysignal && exitcode != 0) {
1352 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1353 } else {
1354 redisLog(REDIS_WARNING,
1355 "Background append only file rewriting terminated by signal %d",
1356 WTERMSIG(statloc));
1357 }
1358 cleanup:
1359 sdsfree(server.bgrewritebuf);
1360 server.bgrewritebuf = sdsempty();
1361 aofRemoveTempFile(server.bgrewritechildpid);
1362 server.bgrewritechildpid = -1;
1363 }
1364
1365 /* This function is called once a background process of some kind terminates,
1366 * as we want to avoid resizing the hash tables when there is a child in order
1367 * to play well with copy-on-write (otherwise when a resize happens lots of
1368 * memory pages are copied). The goal of this function is to update the ability
1369 * for dict.c to resize the hash tables accordingly to the fact we have o not
1370 * running childs. */
1371 static void updateDictResizePolicy(void) {
1372 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1373 dictEnableResize();
1374 else
1375 dictDisableResize();
1376 }
1377
1378 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1379 int j, loops = server.cronloops++;
1380 REDIS_NOTUSED(eventLoop);
1381 REDIS_NOTUSED(id);
1382 REDIS_NOTUSED(clientData);
1383
1384 /* We take a cached value of the unix time in the global state because
1385 * with virtual memory and aging there is to store the current time
1386 * in objects at every object access, and accuracy is not needed.
1387 * To access a global var is faster than calling time(NULL) */
1388 server.unixtime = time(NULL);
1389
1390 /* Show some info about non-empty databases */
1391 for (j = 0; j < server.dbnum; j++) {
1392 long long size, used, vkeys;
1393
1394 size = dictSlots(server.db[j].dict);
1395 used = dictSize(server.db[j].dict);
1396 vkeys = dictSize(server.db[j].expires);
1397 if (!(loops % 50) && (used || vkeys)) {
1398 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1399 /* dictPrintStats(server.dict); */
1400 }
1401 }
1402
1403 /* We don't want to resize the hash tables while a bacground saving
1404 * is in progress: the saving child is created using fork() that is
1405 * implemented with a copy-on-write semantic in most modern systems, so
1406 * if we resize the HT while there is the saving child at work actually
1407 * a lot of memory movements in the parent will cause a lot of pages
1408 * copied. */
1409 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1410 if (!(loops % 10)) tryResizeHashTables();
1411 if (server.activerehashing) incrementallyRehash();
1412 }
1413
1414 /* Show information about connected clients */
1415 if (!(loops % 50)) {
1416 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1417 listLength(server.clients)-listLength(server.slaves),
1418 listLength(server.slaves),
1419 zmalloc_used_memory());
1420 }
1421
1422 /* Close connections of timedout clients */
1423 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1424 closeTimedoutClients();
1425
1426 /* Check if a background saving or AOF rewrite in progress terminated */
1427 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1428 int statloc;
1429 pid_t pid;
1430
1431 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1432 if (pid == server.bgsavechildpid) {
1433 backgroundSaveDoneHandler(statloc);
1434 } else {
1435 backgroundRewriteDoneHandler(statloc);
1436 }
1437 updateDictResizePolicy();
1438 }
1439 } else {
1440 /* If there is not a background saving in progress check if
1441 * we have to save now */
1442 time_t now = time(NULL);
1443 for (j = 0; j < server.saveparamslen; j++) {
1444 struct saveparam *sp = server.saveparams+j;
1445
1446 if (server.dirty >= sp->changes &&
1447 now-server.lastsave > sp->seconds) {
1448 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1449 sp->changes, sp->seconds);
1450 rdbSaveBackground(server.dbfilename);
1451 break;
1452 }
1453 }
1454 }
1455
1456 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1457 * will use few CPU cycles if there are few expiring keys, otherwise
1458 * it will get more aggressive to avoid that too much memory is used by
1459 * keys that can be removed from the keyspace. */
1460 for (j = 0; j < server.dbnum; j++) {
1461 int expired;
1462 redisDb *db = server.db+j;
1463
1464 /* Continue to expire if at the end of the cycle more than 25%
1465 * of the keys were expired. */
1466 do {
1467 long num = dictSize(db->expires);
1468 time_t now = time(NULL);
1469
1470 expired = 0;
1471 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1472 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1473 while (num--) {
1474 dictEntry *de;
1475 time_t t;
1476
1477 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1478 t = (time_t) dictGetEntryVal(de);
1479 if (now > t) {
1480 deleteKey(db,dictGetEntryKey(de));
1481 expired++;
1482 server.stat_expiredkeys++;
1483 }
1484 }
1485 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1486 }
1487
1488 /* Swap a few keys on disk if we are over the memory limit and VM
1489 * is enbled. Try to free objects from the free list first. */
1490 if (vmCanSwapOut()) {
1491 while (server.vm_enabled && zmalloc_used_memory() >
1492 server.vm_max_memory)
1493 {
1494 int retval;
1495
1496 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1497 retval = (server.vm_max_threads == 0) ?
1498 vmSwapOneObjectBlocking() :
1499 vmSwapOneObjectThreaded();
1500 if (retval == REDIS_ERR && !(loops % 300) &&
1501 zmalloc_used_memory() >
1502 (server.vm_max_memory+server.vm_max_memory/10))
1503 {
1504 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1505 }
1506 /* Note that when using threade I/O we free just one object,
1507 * because anyway when the I/O thread in charge to swap this
1508 * object out will finish, the handler of completed jobs
1509 * will try to swap more objects if we are still out of memory. */
1510 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1511 }
1512 }
1513
1514 /* Check if we should connect to a MASTER */
1515 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1516 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1517 if (syncWithMaster() == REDIS_OK) {
1518 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1519 }
1520 }
1521 return 100;
1522 }
1523
1524 /* This function gets called every time Redis is entering the
1525 * main loop of the event driven library, that is, before to sleep
1526 * for ready file descriptors. */
1527 static void beforeSleep(struct aeEventLoop *eventLoop) {
1528 REDIS_NOTUSED(eventLoop);
1529
1530 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1531 listIter li;
1532 listNode *ln;
1533
1534 listRewind(server.io_ready_clients,&li);
1535 while((ln = listNext(&li))) {
1536 redisClient *c = ln->value;
1537 struct redisCommand *cmd;
1538
1539 /* Resume the client. */
1540 listDelNode(server.io_ready_clients,ln);
1541 c->flags &= (~REDIS_IO_WAIT);
1542 server.vm_blocked_clients--;
1543 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1544 readQueryFromClient, c);
1545 cmd = lookupCommand(c->argv[0]->ptr);
1546 assert(cmd != NULL);
1547 call(c,cmd);
1548 resetClient(c);
1549 /* There may be more data to process in the input buffer. */
1550 if (c->querybuf && sdslen(c->querybuf) > 0)
1551 processInputBuffer(c);
1552 }
1553 }
1554 }
1555
1556 static void createSharedObjects(void) {
1557 int j;
1558
1559 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1560 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1561 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1562 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1563 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1564 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1565 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1566 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1567 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1568 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1569 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1570 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1571 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1572 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1573 "-ERR no such key\r\n"));
1574 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1575 "-ERR syntax error\r\n"));
1576 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1577 "-ERR source and destination objects are the same\r\n"));
1578 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1579 "-ERR index out of range\r\n"));
1580 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1581 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1582 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1583 shared.select0 = createStringObject("select 0\r\n",10);
1584 shared.select1 = createStringObject("select 1\r\n",10);
1585 shared.select2 = createStringObject("select 2\r\n",10);
1586 shared.select3 = createStringObject("select 3\r\n",10);
1587 shared.select4 = createStringObject("select 4\r\n",10);
1588 shared.select5 = createStringObject("select 5\r\n",10);
1589 shared.select6 = createStringObject("select 6\r\n",10);
1590 shared.select7 = createStringObject("select 7\r\n",10);
1591 shared.select8 = createStringObject("select 8\r\n",10);
1592 shared.select9 = createStringObject("select 9\r\n",10);
1593 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1594 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1595 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1596 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1597 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1598 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1599 shared.mbulk3 = createStringObject("*3\r\n",4);
1600 shared.mbulk4 = createStringObject("*4\r\n",4);
1601 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1602 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1603 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1604 }
1605 }
1606
1607 static void appendServerSaveParams(time_t seconds, int changes) {
1608 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1609 server.saveparams[server.saveparamslen].seconds = seconds;
1610 server.saveparams[server.saveparamslen].changes = changes;
1611 server.saveparamslen++;
1612 }
1613
1614 static void resetServerSaveParams() {
1615 zfree(server.saveparams);
1616 server.saveparams = NULL;
1617 server.saveparamslen = 0;
1618 }
1619
1620 static void initServerConfig() {
1621 server.dbnum = REDIS_DEFAULT_DBNUM;
1622 server.port = REDIS_SERVERPORT;
1623 server.verbosity = REDIS_VERBOSE;
1624 server.maxidletime = REDIS_MAXIDLETIME;
1625 server.saveparams = NULL;
1626 server.logfile = NULL; /* NULL = log on standard output */
1627 server.bindaddr = NULL;
1628 server.glueoutputbuf = 1;
1629 server.daemonize = 0;
1630 server.appendonly = 0;
1631 server.appendfsync = APPENDFSYNC_ALWAYS;
1632 server.lastfsync = time(NULL);
1633 server.appendfd = -1;
1634 server.appendseldb = -1; /* Make sure the first time will not match */
1635 server.pidfile = zstrdup("/var/run/redis.pid");
1636 server.dbfilename = zstrdup("dump.rdb");
1637 server.appendfilename = zstrdup("appendonly.aof");
1638 server.requirepass = NULL;
1639 server.rdbcompression = 1;
1640 server.activerehashing = 1;
1641 server.maxclients = 0;
1642 server.blpop_blocked_clients = 0;
1643 server.maxmemory = 0;
1644 server.vm_enabled = 0;
1645 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1646 server.vm_page_size = 256; /* 256 bytes per page */
1647 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1648 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1649 server.vm_max_threads = 4;
1650 server.vm_blocked_clients = 0;
1651 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1652 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1653
1654 resetServerSaveParams();
1655
1656 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1657 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1658 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1659 /* Replication related */
1660 server.isslave = 0;
1661 server.masterauth = NULL;
1662 server.masterhost = NULL;
1663 server.masterport = 6379;
1664 server.master = NULL;
1665 server.replstate = REDIS_REPL_NONE;
1666
1667 /* Double constants initialization */
1668 R_Zero = 0.0;
1669 R_PosInf = 1.0/R_Zero;
1670 R_NegInf = -1.0/R_Zero;
1671 R_Nan = R_Zero/R_Zero;
1672 }
1673
1674 static void initServer() {
1675 int j;
1676
1677 signal(SIGHUP, SIG_IGN);
1678 signal(SIGPIPE, SIG_IGN);
1679 setupSigSegvAction();
1680
1681 server.devnull = fopen("/dev/null","w");
1682 if (server.devnull == NULL) {
1683 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1684 exit(1);
1685 }
1686 server.clients = listCreate();
1687 server.slaves = listCreate();
1688 server.monitors = listCreate();
1689 server.objfreelist = listCreate();
1690 createSharedObjects();
1691 server.el = aeCreateEventLoop();
1692 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1693 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1694 if (server.fd == -1) {
1695 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1696 exit(1);
1697 }
1698 for (j = 0; j < server.dbnum; j++) {
1699 server.db[j].dict = dictCreate(&dbDictType,NULL);
1700 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1701 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1702 if (server.vm_enabled)
1703 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1704 server.db[j].id = j;
1705 }
1706 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1707 server.pubsub_patterns = listCreate();
1708 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1709 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1710 server.cronloops = 0;
1711 server.bgsavechildpid = -1;
1712 server.bgrewritechildpid = -1;
1713 server.bgrewritebuf = sdsempty();
1714 server.lastsave = time(NULL);
1715 server.dirty = 0;
1716 server.stat_numcommands = 0;
1717 server.stat_numconnections = 0;
1718 server.stat_expiredkeys = 0;
1719 server.stat_starttime = time(NULL);
1720 server.unixtime = time(NULL);
1721 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1722 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1723 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1724
1725 if (server.appendonly) {
1726 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1727 if (server.appendfd == -1) {
1728 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1729 strerror(errno));
1730 exit(1);
1731 }
1732 }
1733
1734 if (server.vm_enabled) vmInit();
1735 }
1736
1737 /* Empty the whole database */
1738 static long long emptyDb() {
1739 int j;
1740 long long removed = 0;
1741
1742 for (j = 0; j < server.dbnum; j++) {
1743 removed += dictSize(server.db[j].dict);
1744 dictEmpty(server.db[j].dict);
1745 dictEmpty(server.db[j].expires);
1746 }
1747 return removed;
1748 }
1749
/* Translate a boolean configuration value into an integer.
 *
 * Returns 1 if 's' is "yes", 0 if it is "no" (both compared ignoring
 * case), and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1755
1756 /* I agree, this is a very rudimental way to load a configuration...
1757 will improve later if the config gets more complex */
1758 static void loadServerConfig(char *filename) {
1759 FILE *fp;
1760 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1761 int linenum = 0;
1762 sds line = NULL;
1763
1764 if (filename[0] == '-' && filename[1] == '\0')
1765 fp = stdin;
1766 else {
1767 if ((fp = fopen(filename,"r")) == NULL) {
1768 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1769 exit(1);
1770 }
1771 }
1772
1773 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1774 sds *argv;
1775 int argc, j;
1776
1777 linenum++;
1778 line = sdsnew(buf);
1779 line = sdstrim(line," \t\r\n");
1780
1781 /* Skip comments and blank lines*/
1782 if (line[0] == '#' || line[0] == '\0') {
1783 sdsfree(line);
1784 continue;
1785 }
1786
1787 /* Split into arguments */
1788 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1789 sdstolower(argv[0]);
1790
1791 /* Execute config directives */
1792 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1793 server.maxidletime = atoi(argv[1]);
1794 if (server.maxidletime < 0) {
1795 err = "Invalid timeout value"; goto loaderr;
1796 }
1797 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1798 server.port = atoi(argv[1]);
1799 if (server.port < 1 || server.port > 65535) {
1800 err = "Invalid port"; goto loaderr;
1801 }
1802 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1803 server.bindaddr = zstrdup(argv[1]);
1804 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1805 int seconds = atoi(argv[1]);
1806 int changes = atoi(argv[2]);
1807 if (seconds < 1 || changes < 0) {
1808 err = "Invalid save parameters"; goto loaderr;
1809 }
1810 appendServerSaveParams(seconds,changes);
1811 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1812 if (chdir(argv[1]) == -1) {
1813 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1814 argv[1], strerror(errno));
1815 exit(1);
1816 }
1817 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1818 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1819 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1820 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1821 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1822 else {
1823 err = "Invalid log level. Must be one of debug, notice, warning";
1824 goto loaderr;
1825 }
1826 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1827 FILE *logfp;
1828
1829 server.logfile = zstrdup(argv[1]);
1830 if (!strcasecmp(server.logfile,"stdout")) {
1831 zfree(server.logfile);
1832 server.logfile = NULL;
1833 }
1834 if (server.logfile) {
1835 /* Test if we are able to open the file. The server will not
1836 * be able to abort just for this problem later... */
1837 logfp = fopen(server.logfile,"a");
1838 if (logfp == NULL) {
1839 err = sdscatprintf(sdsempty(),
1840 "Can't open the log file: %s", strerror(errno));
1841 goto loaderr;
1842 }
1843 fclose(logfp);
1844 }
1845 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1846 server.dbnum = atoi(argv[1]);
1847 if (server.dbnum < 1) {
1848 err = "Invalid number of databases"; goto loaderr;
1849 }
1850 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1851 loadServerConfig(argv[1]);
1852 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1853 server.maxclients = atoi(argv[1]);
1854 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1855 server.maxmemory = memtoll(argv[1],NULL);
1856 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1857 server.masterhost = sdsnew(argv[1]);
1858 server.masterport = atoi(argv[2]);
1859 server.replstate = REDIS_REPL_CONNECT;
1860 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1861 server.masterauth = zstrdup(argv[1]);
1862 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1863 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1864 err = "argument must be 'yes' or 'no'"; goto loaderr;
1865 }
1866 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1867 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1868 err = "argument must be 'yes' or 'no'"; goto loaderr;
1869 }
1870 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1871 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1872 err = "argument must be 'yes' or 'no'"; goto loaderr;
1873 }
1874 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1875 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1876 err = "argument must be 'yes' or 'no'"; goto loaderr;
1877 }
1878 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1879 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1880 err = "argument must be 'yes' or 'no'"; goto loaderr;
1881 }
1882 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1883 if (!strcasecmp(argv[1],"no")) {
1884 server.appendfsync = APPENDFSYNC_NO;
1885 } else if (!strcasecmp(argv[1],"always")) {
1886 server.appendfsync = APPENDFSYNC_ALWAYS;
1887 } else if (!strcasecmp(argv[1],"everysec")) {
1888 server.appendfsync = APPENDFSYNC_EVERYSEC;
1889 } else {
1890 err = "argument must be 'no', 'always' or 'everysec'";
1891 goto loaderr;
1892 }
1893 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1894 server.requirepass = zstrdup(argv[1]);
1895 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1896 zfree(server.pidfile);
1897 server.pidfile = zstrdup(argv[1]);
1898 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1899 zfree(server.dbfilename);
1900 server.dbfilename = zstrdup(argv[1]);
1901 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1902 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1903 err = "argument must be 'yes' or 'no'"; goto loaderr;
1904 }
1905 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1906 zfree(server.vm_swap_file);
1907 server.vm_swap_file = zstrdup(argv[1]);
1908 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1909 server.vm_max_memory = memtoll(argv[1],NULL);
1910 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1911 server.vm_page_size = memtoll(argv[1], NULL);
1912 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1913 server.vm_pages = memtoll(argv[1], NULL);
1914 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1915 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1916 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1917 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1918 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1919 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1920 } else {
1921 err = "Bad directive or wrong number of arguments"; goto loaderr;
1922 }
1923 for (j = 0; j < argc; j++)
1924 sdsfree(argv[j]);
1925 zfree(argv);
1926 sdsfree(line);
1927 }
1928 if (fp != stdin) fclose(fp);
1929 return;
1930
1931 loaderr:
1932 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1933 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1934 fprintf(stderr, ">>> '%s'\n", line);
1935 fprintf(stderr, "%s\n", err);
1936 exit(1);
1937 }
1938
1939 static void freeClientArgv(redisClient *c) {
1940 int j;
1941
1942 for (j = 0; j < c->argc; j++)
1943 decrRefCount(c->argv[j]);
1944 for (j = 0; j < c->mbargc; j++)
1945 decrRefCount(c->mbargv[j]);
1946 c->argc = 0;
1947 c->mbargc = 0;
1948 }
1949
1950 static void freeClient(redisClient *c) {
1951 listNode *ln;
1952
1953 /* Note that if the client we are freeing is blocked into a blocking
1954 * call, we have to set querybuf to NULL *before* to call
1955 * unblockClientWaitingData() to avoid processInputBuffer() will get
1956 * called. Also it is important to remove the file events after
1957 * this, because this call adds the READABLE event. */
1958 sdsfree(c->querybuf);
1959 c->querybuf = NULL;
1960 if (c->flags & REDIS_BLOCKED)
1961 unblockClientWaitingData(c);
1962
1963 /* Unsubscribe from all the pubsub channels */
1964 pubsubUnsubscribeAllChannels(c,0);
1965 pubsubUnsubscribeAllPatterns(c,0);
1966 dictRelease(c->pubsub_channels);
1967 listRelease(c->pubsub_patterns);
1968 /* Obvious cleanup */
1969 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1970 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1971 listRelease(c->reply);
1972 freeClientArgv(c);
1973 close(c->fd);
1974 /* Remove from the list of clients */
1975 ln = listSearchKey(server.clients,c);
1976 redisAssert(ln != NULL);
1977 listDelNode(server.clients,ln);
1978 /* Remove from the list of clients waiting for swapped keys */
1979 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1980 ln = listSearchKey(server.io_ready_clients,c);
1981 if (ln) {
1982 listDelNode(server.io_ready_clients,ln);
1983 server.vm_blocked_clients--;
1984 }
1985 }
1986 while (server.vm_enabled && listLength(c->io_keys)) {
1987 ln = listFirst(c->io_keys);
1988 dontWaitForSwappedKey(c,ln->value);
1989 }
1990 listRelease(c->io_keys);
1991 /* Master/slave cleanup */
1992 if (c->flags & REDIS_SLAVE) {
1993 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1994 close(c->repldbfd);
1995 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1996 ln = listSearchKey(l,c);
1997 redisAssert(ln != NULL);
1998 listDelNode(l,ln);
1999 }
2000 if (c->flags & REDIS_MASTER) {
2001 server.master = NULL;
2002 server.replstate = REDIS_REPL_CONNECT;
2003 }
2004 /* Release memory */
2005 zfree(c->argv);
2006 zfree(c->mbargv);
2007 freeClientMultiState(c);
2008 zfree(c);
2009 }
2010
2011 #define GLUEREPLY_UP_TO (1024)
2012 static void glueReplyBuffersIfNeeded(redisClient *c) {
2013 int copylen = 0;
2014 char buf[GLUEREPLY_UP_TO];
2015 listNode *ln;
2016 listIter li;
2017 robj *o;
2018
2019 listRewind(c->reply,&li);
2020 while((ln = listNext(&li))) {
2021 int objlen;
2022
2023 o = ln->value;
2024 objlen = sdslen(o->ptr);
2025 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2026 memcpy(buf+copylen,o->ptr,objlen);
2027 copylen += objlen;
2028 listDelNode(c->reply,ln);
2029 } else {
2030 if (copylen == 0) return;
2031 break;
2032 }
2033 }
2034 /* Now the output buffer is empty, add the new single element */
2035 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2036 listAddNodeHead(c->reply,o);
2037 }
2038
2039 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2040 redisClient *c = privdata;
2041 int nwritten = 0, totwritten = 0, objlen;
2042 robj *o;
2043 REDIS_NOTUSED(el);
2044 REDIS_NOTUSED(mask);
2045
2046 /* Use writev() if we have enough buffers to send */
2047 if (!server.glueoutputbuf &&
2048 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2049 !(c->flags & REDIS_MASTER))
2050 {
2051 sendReplyToClientWritev(el, fd, privdata, mask);
2052 return;
2053 }
2054
2055 while(listLength(c->reply)) {
2056 if (server.glueoutputbuf && listLength(c->reply) > 1)
2057 glueReplyBuffersIfNeeded(c);
2058
2059 o = listNodeValue(listFirst(c->reply));
2060 objlen = sdslen(o->ptr);
2061
2062 if (objlen == 0) {
2063 listDelNode(c->reply,listFirst(c->reply));
2064 continue;
2065 }
2066
2067 if (c->flags & REDIS_MASTER) {
2068 /* Don't reply to a master */
2069 nwritten = objlen - c->sentlen;
2070 } else {
2071 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2072 if (nwritten <= 0) break;
2073 }
2074 c->sentlen += nwritten;
2075 totwritten += nwritten;
2076 /* If we fully sent the object on head go to the next one */
2077 if (c->sentlen == objlen) {
2078 listDelNode(c->reply,listFirst(c->reply));
2079 c->sentlen = 0;
2080 }
2081 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2082 * bytes, in a single threaded server it's a good idea to serve
2083 * other clients as well, even if a very large request comes from
2084 * super fast link that is always able to accept data (in real world
2085 * scenario think about 'KEYS *' against the loopback interfae) */
2086 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2087 }
2088 if (nwritten == -1) {
2089 if (errno == EAGAIN) {
2090 nwritten = 0;
2091 } else {
2092 redisLog(REDIS_VERBOSE,
2093 "Error writing to client: %s", strerror(errno));
2094 freeClient(c);
2095 return;
2096 }
2097 }
2098 if (totwritten > 0) c->lastinteraction = time(NULL);
2099 if (listLength(c->reply) == 0) {
2100 c->sentlen = 0;
2101 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2102 }
2103 }
2104
2105 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2106 {
2107 redisClient *c = privdata;
2108 int nwritten = 0, totwritten = 0, objlen, willwrite;
2109 robj *o;
2110 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2111 int offset, ion = 0;
2112 REDIS_NOTUSED(el);
2113 REDIS_NOTUSED(mask);
2114
2115 listNode *node;
2116 while (listLength(c->reply)) {
2117 offset = c->sentlen;
2118 ion = 0;
2119 willwrite = 0;
2120
2121 /* fill-in the iov[] array */
2122 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2123 o = listNodeValue(node);
2124 objlen = sdslen(o->ptr);
2125
2126 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2127 break;
2128
2129 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2130 break; /* no more iovecs */
2131
2132 iov[ion].iov_base = ((char*)o->ptr) + offset;
2133 iov[ion].iov_len = objlen - offset;
2134 willwrite += objlen - offset;
2135 offset = 0; /* just for the first item */
2136 ion++;
2137 }
2138
2139 if(willwrite == 0)
2140 break;
2141
2142 /* write all collected blocks at once */
2143 if((nwritten = writev(fd, iov, ion)) < 0) {
2144 if (errno != EAGAIN) {
2145 redisLog(REDIS_VERBOSE,
2146 "Error writing to client: %s", strerror(errno));
2147 freeClient(c);
2148 return;
2149 }
2150 break;
2151 }
2152
2153 totwritten += nwritten;
2154 offset = c->sentlen;
2155
2156 /* remove written robjs from c->reply */
2157 while (nwritten && listLength(c->reply)) {
2158 o = listNodeValue(listFirst(c->reply));
2159 objlen = sdslen(o->ptr);
2160
2161 if(nwritten >= objlen - offset) {
2162 listDelNode(c->reply, listFirst(c->reply));
2163 nwritten -= objlen - offset;
2164 c->sentlen = 0;
2165 } else {
2166 /* partial write */
2167 c->sentlen += nwritten;
2168 break;
2169 }
2170 offset = 0;
2171 }
2172 }
2173
2174 if (totwritten > 0)
2175 c->lastinteraction = time(NULL);
2176
2177 if (listLength(c->reply) == 0) {
2178 c->sentlen = 0;
2179 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2180 }
2181 }
2182
2183 static struct redisCommand *lookupCommand(char *name) {
2184 int j = 0;
2185 while(cmdTable[j].name != NULL) {
2186 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2187 j++;
2188 }
2189 return NULL;
2190 }
2191
2192 /* resetClient prepare the client to process the next command */
2193 static void resetClient(redisClient *c) {
2194 freeClientArgv(c);
2195 c->bulklen = -1;
2196 c->multibulk = 0;
2197 }
2198
2199 /* Call() is the core of Redis execution of a command */
2200 static void call(redisClient *c, struct redisCommand *cmd) {
2201 long long dirty;
2202
2203 dirty = server.dirty;
2204 cmd->proc(c);
2205 dirty = server.dirty-dirty;
2206
2207 if (server.appendonly && dirty)
2208 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2209 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2210 listLength(server.slaves))
2211 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2212 if (listLength(server.monitors))
2213 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2214 server.stat_numcommands++;
2215 }
2216
2217 /* If this function gets called we already read a whole
2218 * command, argments are in the client argv/argc fields.
2219 * processCommand() execute the command or prepare the
2220 * server for a bulk read from the client.
2221 *
2222 * If 1 is returned the client is still alive and valid and
2223 * and other operations can be performed by the caller. Otherwise
2224 * if 0 is returned the client was destroied (i.e. after QUIT). */
2225 static int processCommand(redisClient *c) {
2226 struct redisCommand *cmd;
2227
2228 /* Free some memory if needed (maxmemory setting) */
2229 if (server.maxmemory) freeMemoryIfNeeded();
2230
2231 /* Handle the multi bulk command type. This is an alternative protocol
2232 * supported by Redis in order to receive commands that are composed of
2233 * multiple binary-safe "bulk" arguments. The latency of processing is
2234 * a bit higher but this allows things like multi-sets, so if this
2235 * protocol is used only for MSET and similar commands this is a big win. */
2236 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2237 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2238 if (c->multibulk <= 0) {
2239 resetClient(c);
2240 return 1;
2241 } else {
2242 decrRefCount(c->argv[c->argc-1]);
2243 c->argc--;
2244 return 1;
2245 }
2246 } else if (c->multibulk) {
2247 if (c->bulklen == -1) {
2248 if (((char*)c->argv[0]->ptr)[0] != '$') {
2249 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2250 resetClient(c);
2251 return 1;
2252 } else {
2253 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2254 decrRefCount(c->argv[0]);
2255 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2256 c->argc--;
2257 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2258 resetClient(c);
2259 return 1;
2260 }
2261 c->argc--;
2262 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2263 return 1;
2264 }
2265 } else {
2266 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2267 c->mbargv[c->mbargc] = c->argv[0];
2268 c->mbargc++;
2269 c->argc--;
2270 c->multibulk--;
2271 if (c->multibulk == 0) {
2272 robj **auxargv;
2273 int auxargc;
2274
2275 /* Here we need to swap the multi-bulk argc/argv with the
2276 * normal argc/argv of the client structure. */
2277 auxargv = c->argv;
2278 c->argv = c->mbargv;
2279 c->mbargv = auxargv;
2280
2281 auxargc = c->argc;
2282 c->argc = c->mbargc;
2283 c->mbargc = auxargc;
2284
2285 /* We need to set bulklen to something different than -1
2286 * in order for the code below to process the command without
2287 * to try to read the last argument of a bulk command as
2288 * a special argument. */
2289 c->bulklen = 0;
2290 /* continue below and process the command */
2291 } else {
2292 c->bulklen = -1;
2293 return 1;
2294 }
2295 }
2296 }
2297 /* -- end of multi bulk commands processing -- */
2298
2299 /* The QUIT command is handled as a special case. Normal command
2300 * procs are unable to close the client connection safely */
2301 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2302 freeClient(c);
2303 return 0;
2304 }
2305
2306 /* Now lookup the command and check ASAP about trivial error conditions
2307 * such wrong arity, bad command name and so forth. */
2308 cmd = lookupCommand(c->argv[0]->ptr);
2309 if (!cmd) {
2310 addReplySds(c,
2311 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2312 (char*)c->argv[0]->ptr));
2313 resetClient(c);
2314 return 1;
2315 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2316 (c->argc < -cmd->arity)) {
2317 addReplySds(c,
2318 sdscatprintf(sdsempty(),
2319 "-ERR wrong number of arguments for '%s' command\r\n",
2320 cmd->name));
2321 resetClient(c);
2322 return 1;
2323 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2324 /* This is a bulk command, we have to read the last argument yet. */
2325 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2326
2327 decrRefCount(c->argv[c->argc-1]);
2328 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2329 c->argc--;
2330 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2331 resetClient(c);
2332 return 1;
2333 }
2334 c->argc--;
2335 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2336 /* It is possible that the bulk read is already in the
2337 * buffer. Check this condition and handle it accordingly.
2338 * This is just a fast path, alternative to call processInputBuffer().
2339 * It's a good idea since the code is small and this condition
2340 * happens most of the times. */
2341 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2342 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2343 c->argc++;
2344 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2345 } else {
2346 /* Otherwise return... there is to read the last argument
2347 * from the socket. */
2348 return 1;
2349 }
2350 }
2351 /* Let's try to encode the bulk object to save space. */
2352 if (cmd->flags & REDIS_CMD_BULK)
2353 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2354
2355 /* Check if the user is authenticated */
2356 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2357 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2358 resetClient(c);
2359 return 1;
2360 }
2361
2362 /* Handle the maxmemory directive */
2363 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2364 zmalloc_used_memory() > server.maxmemory)
2365 {
2366 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2367 resetClient(c);
2368 return 1;
2369 }
2370
2371 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2372 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2373 &&
2374 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2375 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2376 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2377 resetClient(c);
2378 return 1;
2379 }
2380
2381 /* Exec the command */
2382 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2383 queueMultiCommand(c,cmd);
2384 addReply(c,shared.queued);
2385 } else {
2386 if (server.vm_enabled && server.vm_max_threads > 0 &&
2387 blockClientOnSwappedKeys(cmd,c)) return 1;
2388 call(c,cmd);
2389 }
2390
2391 /* Prepare the client for the next command */
2392 resetClient(c);
2393 return 1;
2394 }
2395
2396 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2397 listNode *ln;
2398 listIter li;
2399 int outc = 0, j;
2400 robj **outv;
2401 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2402 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2403 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2404 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2405 robj *lenobj;
2406
2407 if (argc <= REDIS_STATIC_ARGS) {
2408 outv = static_outv;
2409 } else {
2410 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2411 }
2412
2413 lenobj = createObject(REDIS_STRING,
2414 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2415 lenobj->refcount = 0;
2416 outv[outc++] = lenobj;
2417 for (j = 0; j < argc; j++) {
2418 lenobj = createObject(REDIS_STRING,
2419 sdscatprintf(sdsempty(),"$%lu\r\n",
2420 (unsigned long) stringObjectLen(argv[j])));
2421 lenobj->refcount = 0;
2422 outv[outc++] = lenobj;
2423 outv[outc++] = argv[j];
2424 outv[outc++] = shared.crlf;
2425 }
2426
2427 /* Increment all the refcounts at start and decrement at end in order to
2428 * be sure to free objects if there is no slave in a replication state
2429 * able to be feed with commands */
2430 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2431 listRewind(slaves,&li);
2432 while((ln = listNext(&li))) {
2433 redisClient *slave = ln->value;
2434
2435 /* Don't feed slaves that are still waiting for BGSAVE to start */
2436 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2437
2438 /* Feed all the other slaves, MONITORs and so on */
2439 if (slave->slaveseldb != dictid) {
2440 robj *selectcmd;
2441
2442 switch(dictid) {
2443 case 0: selectcmd = shared.select0; break;
2444 case 1: selectcmd = shared.select1; break;
2445 case 2: selectcmd = shared.select2; break;
2446 case 3: selectcmd = shared.select3; break;
2447 case 4: selectcmd = shared.select4; break;
2448 case 5: selectcmd = shared.select5; break;
2449 case 6: selectcmd = shared.select6; break;
2450 case 7: selectcmd = shared.select7; break;
2451 case 8: selectcmd = shared.select8; break;
2452 case 9: selectcmd = shared.select9; break;
2453 default:
2454 selectcmd = createObject(REDIS_STRING,
2455 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2456 selectcmd->refcount = 0;
2457 break;
2458 }
2459 addReply(slave,selectcmd);
2460 slave->slaveseldb = dictid;
2461 }
2462 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2463 }
2464 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2465 if (outv != static_outv) zfree(outv);
2466 }
2467
2468 static void processInputBuffer(redisClient *c) {
2469 again:
2470 /* Before to process the input buffer, make sure the client is not
2471 * waitig for a blocking operation such as BLPOP. Note that the first
2472 * iteration the client is never blocked, otherwise the processInputBuffer
2473 * would not be called at all, but after the execution of the first commands
2474 * in the input buffer the client may be blocked, and the "goto again"
2475 * will try to reiterate. The following line will make it return asap. */
2476 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2477 if (c->bulklen == -1) {
2478 /* Read the first line of the query */
2479 char *p = strchr(c->querybuf,'\n');
2480 size_t querylen;
2481
2482 if (p) {
2483 sds query, *argv;
2484 int argc, j;
2485
2486 query = c->querybuf;
2487 c->querybuf = sdsempty();
2488 querylen = 1+(p-(query));
2489 if (sdslen(query) > querylen) {
2490 /* leave data after the first line of the query in the buffer */
2491 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2492 }
2493 *p = '\0'; /* remove "\n" */
2494 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2495 sdsupdatelen(query);
2496
2497 /* Now we can split the query in arguments */
2498 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2499 sdsfree(query);
2500
2501 if (c->argv) zfree(c->argv);
2502 c->argv = zmalloc(sizeof(robj*)*argc);
2503
2504 for (j = 0; j < argc; j++) {
2505 if (sdslen(argv[j])) {
2506 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2507 c->argc++;
2508 } else {
2509 sdsfree(argv[j]);
2510 }
2511 }
2512 zfree(argv);
2513 if (c->argc) {
2514 /* Execute the command. If the client is still valid
2515 * after processCommand() return and there is something
2516 * on the query buffer try to process the next command. */
2517 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2518 } else {
2519 /* Nothing to process, argc == 0. Just process the query
2520 * buffer if it's not empty or return to the caller */
2521 if (sdslen(c->querybuf)) goto again;
2522 }
2523 return;
2524 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2525 redisLog(REDIS_VERBOSE, "Client protocol error");
2526 freeClient(c);
2527 return;
2528 }
2529 } else {
2530 /* Bulk read handling. Note that if we are at this point
2531 the client already sent a command terminated with a newline,
2532 we are reading the bulk data that is actually the last
2533 argument of the command. */
2534 int qbl = sdslen(c->querybuf);
2535
2536 if (c->bulklen <= qbl) {
2537 /* Copy everything but the final CRLF as final argument */
2538 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2539 c->argc++;
2540 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2541 /* Process the command. If the client is still valid after
2542 * the processing and there is more data in the buffer
2543 * try to parse it. */
2544 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2545 return;
2546 }
2547 }
2548 }
2549
2550 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2551 redisClient *c = (redisClient*) privdata;
2552 char buf[REDIS_IOBUF_LEN];
2553 int nread;
2554 REDIS_NOTUSED(el);
2555 REDIS_NOTUSED(mask);
2556
2557 nread = read(fd, buf, REDIS_IOBUF_LEN);
2558 if (nread == -1) {
2559 if (errno == EAGAIN) {
2560 nread = 0;
2561 } else {
2562 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2563 freeClient(c);
2564 return;
2565 }
2566 } else if (nread == 0) {
2567 redisLog(REDIS_VERBOSE, "Client closed connection");
2568 freeClient(c);
2569 return;
2570 }
2571 if (nread) {
2572 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2573 c->lastinteraction = time(NULL);
2574 } else {
2575 return;
2576 }
2577 processInputBuffer(c);
2578 }
2579
2580 static int selectDb(redisClient *c, int id) {
2581 if (id < 0 || id >= server.dbnum)
2582 return REDIS_ERR;
2583 c->db = &server.db[id];
2584 return REDIS_OK;
2585 }
2586
2587 static void *dupClientReplyValue(void *o) {
2588 incrRefCount((robj*)o);
2589 return o;
2590 }
2591
/* Match method for lists of string objects: returns 1 when
 * compareStringObjects() reports the two objects as equal, 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int differ = compareStringObjects(a,b);

    return !differ;
}
2595
2596 static redisClient *createClient(int fd) {
2597 redisClient *c = zmalloc(sizeof(*c));
2598
2599 anetNonBlock(NULL,fd);
2600 anetTcpNoDelay(NULL,fd);
2601 if (!c) return NULL;
2602 selectDb(c,0);
2603 c->fd = fd;
2604 c->querybuf = sdsempty();
2605 c->argc = 0;
2606 c->argv = NULL;
2607 c->bulklen = -1;
2608 c->multibulk = 0;
2609 c->mbargc = 0;
2610 c->mbargv = NULL;
2611 c->sentlen = 0;
2612 c->flags = 0;
2613 c->lastinteraction = time(NULL);
2614 c->authenticated = 0;
2615 c->replstate = REDIS_REPL_NONE;
2616 c->reply = listCreate();
2617 listSetFreeMethod(c->reply,decrRefCount);
2618 listSetDupMethod(c->reply,dupClientReplyValue);
2619 c->blockingkeys = NULL;
2620 c->blockingkeysnum = 0;
2621 c->io_keys = listCreate();
2622 listSetFreeMethod(c->io_keys,decrRefCount);
2623 c->pubsub_channels = dictCreate(&setDictType,NULL);
2624 c->pubsub_patterns = listCreate();
2625 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2626 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2627 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2628 readQueryFromClient, c) == AE_ERR) {
2629 freeClient(c);
2630 return NULL;
2631 }
2632 listAddNodeTail(server.clients,c);
2633 initClientMultiState(c);
2634 return c;
2635 }
2636
2637 static void addReply(redisClient *c, robj *obj) {
2638 if (listLength(c->reply) == 0 &&
2639 (c->replstate == REDIS_REPL_NONE ||
2640 c->replstate == REDIS_REPL_ONLINE) &&
2641 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2642 sendReplyToClient, c) == AE_ERR) return;
2643
2644 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2645 obj = dupStringObject(obj);
2646 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2647 }
2648 listAddNodeTail(c->reply,getDecodedObject(obj));
2649 }
2650
2651 static void addReplySds(redisClient *c, sds s) {
2652 robj *o = createObject(REDIS_STRING,s);
2653 addReply(c,o);
2654 decrRefCount(o);
2655 }
2656
2657 static void addReplyDouble(redisClient *c, double d) {
2658 char buf[128];
2659
2660 snprintf(buf,sizeof(buf),"%.17g",d);
2661 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2662 (unsigned long) strlen(buf),buf));
2663 }
2664
2665 static void addReplyLong(redisClient *c, long l) {
2666 char buf[128];
2667 size_t len;
2668
2669 if (l == 0) {
2670 addReply(c,shared.czero);
2671 return;
2672 } else if (l == 1) {
2673 addReply(c,shared.cone);
2674 return;
2675 }
2676 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2677 addReplySds(c,sdsnewlen(buf,len));
2678 }
2679
2680 static void addReplyLongLong(redisClient *c, long long ll) {
2681 char buf[128];
2682 size_t len;
2683
2684 if (ll == 0) {
2685 addReply(c,shared.czero);
2686 return;
2687 } else if (ll == 1) {
2688 addReply(c,shared.cone);
2689 return;
2690 }
2691 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2692 addReplySds(c,sdsnewlen(buf,len));
2693 }
2694
2695 static void addReplyUlong(redisClient *c, unsigned long ul) {
2696 char buf[128];
2697 size_t len;
2698
2699 if (ul == 0) {
2700 addReply(c,shared.czero);
2701 return;
2702 } else if (ul == 1) {
2703 addReply(c,shared.cone);
2704 return;
2705 }
2706 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2707 addReplySds(c,sdsnewlen(buf,len));
2708 }
2709
2710 static void addReplyBulkLen(redisClient *c, robj *obj) {
2711 size_t len;
2712
2713 if (obj->encoding == REDIS_ENCODING_RAW) {
2714 len = sdslen(obj->ptr);
2715 } else {
2716 long n = (long)obj->ptr;
2717
2718 /* Compute how many bytes will take this integer as a radix 10 string */
2719 len = 1;
2720 if (n < 0) {
2721 len++;
2722 n = -n;
2723 }
2724 while((n = n/10) != 0) {
2725 len++;
2726 }
2727 }
2728 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2729 }
2730
2731 static void addReplyBulk(redisClient *c, robj *obj) {
2732 addReplyBulkLen(c,obj);
2733 addReply(c,obj);
2734 addReply(c,shared.crlf);
2735 }
2736
2737 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2738 static void addReplyBulkCString(redisClient *c, char *s) {
2739 if (s == NULL) {
2740 addReply(c,shared.nullbulk);
2741 } else {
2742 robj *o = createStringObject(s,strlen(s));
2743 addReplyBulk(c,o);
2744 decrRefCount(o);
2745 }
2746 }
2747
2748 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2749 int cport, cfd;
2750 char cip[128];
2751 redisClient *c;
2752 REDIS_NOTUSED(el);
2753 REDIS_NOTUSED(mask);
2754 REDIS_NOTUSED(privdata);
2755
2756 cfd = anetAccept(server.neterr, fd, cip, &cport);
2757 if (cfd == AE_ERR) {
2758 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2759 return;
2760 }
2761 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2762 if ((c = createClient(cfd)) == NULL) {
2763 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2764 close(cfd); /* May be already closed, just ingore errors */
2765 return;
2766 }
2767 /* If maxclient directive is set and this is one client more... close the
2768 * connection. Note that we create the client instead to check before
2769 * for this condition, since now the socket is already set in nonblocking
2770 * mode and we can send an error for free using the Kernel I/O */
2771 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2772 char *err = "-ERR max number of clients reached\r\n";
2773
2774 /* That's a best effort error message, don't check write errors */
2775 if (write(c->fd,err,strlen(err)) == -1) {
2776 /* Nothing to do, Just to avoid the warning... */
2777 }
2778 freeClient(c);
2779 return;
2780 }
2781 server.stat_numconnections++;
2782 }
2783
2784 /* ======================= Redis objects implementation ===================== */
2785
2786 static robj *createObject(int type, void *ptr) {
2787 robj *o;
2788
2789 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2790 if (listLength(server.objfreelist)) {
2791 listNode *head = listFirst(server.objfreelist);
2792 o = listNodeValue(head);
2793 listDelNode(server.objfreelist,head);
2794 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2795 } else {
2796 if (server.vm_enabled) {
2797 pthread_mutex_unlock(&server.obj_freelist_mutex);
2798 o = zmalloc(sizeof(*o));
2799 } else {
2800 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2801 }
2802 }
2803 o->type = type;
2804 o->encoding = REDIS_ENCODING_RAW;
2805 o->ptr = ptr;
2806 o->refcount = 1;
2807 if (server.vm_enabled) {
2808 /* Note that this code may run in the context of an I/O thread
2809 * and accessing to server.unixtime in theory is an error
2810 * (no locks). But in practice this is safe, and even if we read
2811 * garbage Redis will not fail, as it's just a statistical info */
2812 o->vm.atime = server.unixtime;
2813 o->storage = REDIS_VM_MEMORY;
2814 }
2815 return o;
2816 }
2817
2818 static robj *createStringObject(char *ptr, size_t len) {
2819 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2820 }
2821
2822 static robj *createStringObjectFromLongLong(long long value) {
2823 robj *o;
2824 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2825 incrRefCount(shared.integers[value]);
2826 o = shared.integers[value];
2827 } else {
2828 o = createObject(REDIS_STRING, NULL);
2829 if (value >= LONG_MIN && value <= LONG_MAX) {
2830 o->encoding = REDIS_ENCODING_INT;
2831 o->ptr = (void*)((long)value);
2832 } else {
2833 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2834 }
2835 }
2836 return o;
2837 }
2838
2839 static robj *dupStringObject(robj *o) {
2840 assert(o->encoding == REDIS_ENCODING_RAW);
2841 return createStringObject(o->ptr,sdslen(o->ptr));
2842 }
2843
2844 static robj *createListObject(void) {
2845 list *l = listCreate();
2846
2847 listSetFreeMethod(l,decrRefCount);
2848 return createObject(REDIS_LIST,l);
2849 }
2850
2851 static robj *createSetObject(void) {
2852 dict *d = dictCreate(&setDictType,NULL);
2853 return createObject(REDIS_SET,d);
2854 }
2855
2856 static robj *createHashObject(void) {
2857 /* All the Hashes start as zipmaps. Will be automatically converted
2858 * into hash tables if there are enough elements or big elements
2859 * inside. */
2860 unsigned char *zm = zipmapNew();
2861 robj *o = createObject(REDIS_HASH,zm);
2862 o->encoding = REDIS_ENCODING_ZIPMAP;
2863 return o;
2864 }
2865
2866 static robj *createZsetObject(void) {
2867 zset *zs = zmalloc(sizeof(*zs));
2868
2869 zs->dict = dictCreate(&zsetDictType,NULL);
2870 zs->zsl = zslCreate();
2871 return createObject(REDIS_ZSET,zs);
2872 }
2873
2874 static void freeStringObject(robj *o) {
2875 if (o->encoding == REDIS_ENCODING_RAW) {
2876 sdsfree(o->ptr);
2877 }
2878 }
2879
2880 static void freeListObject(robj *o) {
2881 listRelease((list*) o->ptr);
2882 }
2883
2884 static void freeSetObject(robj *o) {
2885 dictRelease((dict*) o->ptr);
2886 }
2887
2888 static void freeZsetObject(robj *o) {
2889 zset *zs = o->ptr;
2890
2891 dictRelease(zs->dict);
2892 zslFree(zs->zsl);
2893 zfree(zs);
2894 }
2895
2896 static void freeHashObject(robj *o) {
2897 switch (o->encoding) {
2898 case REDIS_ENCODING_HT:
2899 dictRelease((dict*) o->ptr);
2900 break;
2901 case REDIS_ENCODING_ZIPMAP:
2902 zfree(o->ptr);
2903 break;
2904 default:
2905 redisPanic("Unknown hash encoding type");
2906 break;
2907 }
2908 }
2909
2910 static void incrRefCount(robj *o) {
2911 o->refcount++;
2912 }
2913
2914 static void decrRefCount(void *obj) {
2915 robj *o = obj;
2916
2917 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2918 /* Object is a key of a swapped out value, or in the process of being
2919 * loaded. */
2920 if (server.vm_enabled &&
2921 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2922 {
2923 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2924 redisAssert(o->type == REDIS_STRING);
2925 freeStringObject(o);
2926 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2927 pthread_mutex_lock(&server.obj_freelist_mutex);
2928 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2929 !listAddNodeHead(server.objfreelist,o))
2930 zfree(o);
2931 pthread_mutex_unlock(&server.obj_freelist_mutex);
2932 server.vm_stats_swapped_objects--;
2933 return;
2934 }
2935 /* Object is in memory, or in the process of being swapped out. */
2936 if (--(o->refcount) == 0) {
2937 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2938 vmCancelThreadedIOJob(obj);
2939 switch(o->type) {
2940 case REDIS_STRING: freeStringObject(o); break;
2941 case REDIS_LIST: freeListObject(o); break;
2942 case REDIS_SET: freeSetObject(o); break;
2943 case REDIS_ZSET: freeZsetObject(o); break;
2944 case REDIS_HASH: freeHashObject(o); break;
2945 default: redisPanic("Unknown object type"); break;
2946 }
2947 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2948 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2949 !listAddNodeHead(server.objfreelist,o))
2950 zfree(o);
2951 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2952 }
2953 }
2954
2955 static robj *lookupKey(redisDb *db, robj *key) {
2956 dictEntry *de = dictFind(db->dict,key);
2957 if (de) {
2958 robj *key = dictGetEntryKey(de);
2959 robj *val = dictGetEntryVal(de);
2960
2961 if (server.vm_enabled) {
2962 if (key->storage == REDIS_VM_MEMORY ||
2963 key->storage == REDIS_VM_SWAPPING)
2964 {
2965 /* If we were swapping the object out, stop it, this key
2966 * was requested. */
2967 if (key->storage == REDIS_VM_SWAPPING)
2968 vmCancelThreadedIOJob(key);
2969 /* Update the access time of the key for the aging algorithm. */
2970 key->vm.atime = server.unixtime;
2971 } else {
2972 int notify = (key->storage == REDIS_VM_LOADING);
2973
2974 /* Our value was swapped on disk. Bring it at home. */
2975 redisAssert(val == NULL);
2976 val = vmLoadObject(key);
2977 dictGetEntryVal(de) = val;
2978
2979 /* Clients blocked by the VM subsystem may be waiting for
2980 * this key... */
2981 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2982 }
2983 }
2984 return val;
2985 } else {
2986 return NULL;
2987 }
2988 }
2989
2990 static robj *lookupKeyRead(redisDb *db, robj *key) {
2991 expireIfNeeded(db,key);
2992 return lookupKey(db,key);
2993 }
2994
2995 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2996 deleteIfVolatile(db,key);
2997 return lookupKey(db,key);
2998 }
2999
3000 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3001 robj *o = lookupKeyRead(c->db, key);
3002 if (!o) addReply(c,reply);
3003 return o;
3004 }
3005
3006 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3007 robj *o = lookupKeyWrite(c->db, key);
3008 if (!o) addReply(c,reply);
3009 return o;
3010 }
3011
3012 static int checkType(redisClient *c, robj *o, int type) {
3013 if (o->type != type) {
3014 addReply(c,shared.wrongtypeerr);
3015 return 1;
3016 }
3017 return 0;
3018 }
3019
3020 static int deleteKey(redisDb *db, robj *key) {
3021 int retval;
3022
3023 /* We need to protect key from destruction: after the first dictDelete()
3024 * it may happen that 'key' is no longer valid if we don't increment
3025 * it's count. This may happen when we get the object reference directly
3026 * from the hash table with dictRandomKey() or dict iterators */
3027 incrRefCount(key);
3028 if (dictSize(db->expires)) dictDelete(db->expires,key);
3029 retval = dictDelete(db->dict,key);
3030 decrRefCount(key);
3031
3032 return retval == DICT_OK;
3033 }
3034
3035 /* Check if the nul-terminated string 's' can be represented by a long
3036 * (that is, is a number that fits into long without any other space or
3037 * character before or after the digits).
3038 *
3039 * If so, the function returns REDIS_OK and *longval is set to the value
3040 * of the number. Otherwise REDIS_ERR is returned */
3041 static int isStringRepresentableAsLong(sds s, long *longval) {
3042 char buf[32], *endptr;
3043 long value;
3044 int slen;
3045
3046 value = strtol(s, &endptr, 10);
3047 if (endptr[0] != '\0') return REDIS_ERR;
3048 slen = snprintf(buf,32,"%ld",value);
3049
3050 /* If the number converted back into a string is not identical
3051 * then it's not possible to encode the string as integer */
3052 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3053 if (longval) *longval = value;
3054 return REDIS_OK;
3055 }
3056
3057 /* Try to encode a string object in order to save space */
3058 static robj *tryObjectEncoding(robj *o) {
3059 long value;
3060 sds s = o->ptr;
3061
3062 if (o->encoding != REDIS_ENCODING_RAW)
3063 return o; /* Already encoded */
3064
3065 /* It's not safe to encode shared objects: shared objects can be shared
3066 * everywhere in the "object space" of Redis. Encoded objects can only
3067 * appear as "values" (and not, for instance, as keys) */
3068 if (o->refcount > 1) return o;
3069
3070 /* Currently we try to encode only strings */
3071 redisAssert(o->type == REDIS_STRING);
3072
3073 /* Check if we can represent this string as a long integer */
3074 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3075
3076 /* Ok, this object can be encoded */
3077 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3078 decrRefCount(o);
3079 incrRefCount(shared.integers[value]);
3080 return shared.integers[value];
3081 } else {
3082 o->encoding = REDIS_ENCODING_INT;
3083 sdsfree(o->ptr);
3084 o->ptr = (void*) value;
3085 return o;
3086 }
3087 }
3088
3089 /* Get a decoded version of an encoded object (returned as a new object).
3090 * If the object is already raw-encoded just increment the ref count. */
3091 static robj *getDecodedObject(robj *o) {
3092 robj *dec;
3093
3094 if (o->encoding == REDIS_ENCODING_RAW) {
3095 incrRefCount(o);
3096 return o;
3097 }
3098 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3099 char buf[32];
3100
3101 snprintf(buf,32,"%ld",(long)o->ptr);
3102 dec = createStringObject(buf,strlen(buf));
3103 return dec;
3104 } else {
3105 redisPanic("Unknown encoding type");
3106 }
3107 }
3108
3109 /* Compare two string objects via strcmp() or alike.
3110 * Note that the objects may be integer-encoded. In such a case we
3111 * use snprintf() to get a string representation of the numbers on the stack
3112 * and compare the strings, it's much faster than calling getDecodedObject().
3113 *
3114 * Important note: if objects are not integer encoded, but binary-safe strings,
3115 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3116 * binary safe. */
3117 static int compareStringObjects(robj *a, robj *b) {
3118 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3119 char bufa[128], bufb[128], *astr, *bstr;
3120 int bothsds = 1;
3121
3122 if (a == b) return 0;
3123 if (a->encoding != REDIS_ENCODING_RAW) {
3124 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3125 astr = bufa;
3126 bothsds = 0;
3127 } else {
3128 astr = a->ptr;
3129 }
3130 if (b->encoding != REDIS_ENCODING_RAW) {
3131 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3132 bstr = bufb;
3133 bothsds = 0;
3134 } else {
3135 bstr = b->ptr;
3136 }
3137 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3138 }
3139
3140 static size_t stringObjectLen(robj *o) {
3141 redisAssert(o->type == REDIS_STRING);
3142 if (o->encoding == REDIS_ENCODING_RAW) {
3143 return sdslen(o->ptr);
3144 } else {
3145 char buf[32];
3146
3147 return snprintf(buf,32,"%ld",(long)o->ptr);
3148 }
3149 }
3150
3151 static int getDoubleFromObject(robj *o, double *target) {
3152 double value;
3153 char *eptr;
3154
3155 if (o == NULL) {
3156 value = 0;
3157 } else {
3158 redisAssert(o->type == REDIS_STRING);
3159 if (o->encoding == REDIS_ENCODING_RAW) {
3160 value = strtod(o->ptr, &eptr);
3161 if (eptr[0] != '\0') return REDIS_ERR;
3162 } else if (o->encoding == REDIS_ENCODING_INT) {
3163 value = (long)o->ptr;
3164 } else {
3165 redisAssert(1 != 1);
3166 }
3167 }
3168
3169 *target = value;
3170 return REDIS_OK;
3171 }
3172
3173 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3174 double value;
3175 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3176 if (msg != NULL) {
3177 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3178 } else {
3179 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3180 }
3181 return REDIS_ERR;
3182 }
3183
3184 *target = value;
3185 return REDIS_OK;
3186 }
3187
3188 static int getLongLongFromObject(robj *o, long long *target) {
3189 long long value;
3190 char *eptr;
3191
3192 if (o == NULL) {
3193 value = 0;
3194 } else {
3195 redisAssert(o->type == REDIS_STRING);
3196 if (o->encoding == REDIS_ENCODING_RAW) {
3197 value = strtoll(o->ptr, &eptr, 10);
3198 if (eptr[0] != '\0') return REDIS_ERR;
3199 } else if (o->encoding == REDIS_ENCODING_INT) {
3200 value = (long)o->ptr;
3201 } else {
3202 redisAssert(1 != 1);
3203 }
3204 }
3205
3206 *target = value;
3207 return REDIS_OK;
3208 }
3209
3210 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3211 long long value;
3212 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3213 if (msg != NULL) {
3214 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3215 } else {
3216 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3217 }
3218 return REDIS_ERR;
3219 }
3220
3221 *target = value;
3222 return REDIS_OK;
3223 }
3224
3225 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3226 long long value;
3227
3228 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3229 if (value < LONG_MIN || value > LONG_MAX) {
3230 if (msg != NULL) {
3231 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3232 } else {
3233 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3234 }
3235 return REDIS_ERR;
3236 }
3237
3238 *target = value;
3239 return REDIS_OK;
3240 }
3241
3242 /*============================ RDB saving/loading =========================== */
3243
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3248
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3254
3255 /* check rdbLoadLen() comments for more info */
3256 static int rdbSaveLen(FILE *fp, uint32_t len) {
3257 unsigned char buf[2];
3258
3259 if (len < (1<<6)) {
3260 /* Save a 6 bit len */
3261 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3262 if (fwrite(buf,1,1,fp) == 0) return -1;
3263 } else if (len < (1<<14)) {
3264 /* Save a 14 bit len */
3265 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3266 buf[1] = len&0xFF;
3267 if (fwrite(buf,2,1,fp) == 0) return -1;
3268 } else {
3269 /* Save a 32 bit len */
3270 buf[0] = (REDIS_RDB_32BITLEN<<6);
3271 if (fwrite(buf,1,1,fp) == 0) return -1;
3272 len = htonl(len);
3273 if (fwrite(&len,4,1,fp) == 0) return -1;
3274 }
3275 return 0;
3276 }
3277
3278 /* String objects in the form "2391" "-100" without any space and with a
3279 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3280 * encoded as integers to save space */
3281 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3282 long long value;
3283 char *endptr, buf[32];
3284
3285 /* Check if it's possible to encode this value as a number */
3286 value = strtoll(s, &endptr, 10);
3287 if (endptr[0] != '\0') return 0;
3288 snprintf(buf,32,"%lld",value);
3289
3290 /* If the number converted back into a string is not identical
3291 * then it's not possible to encode the string as integer */
3292 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3293
3294 /* Finally check if it fits in our ranges */
3295 if (value >= -(1<<7) && value <= (1<<7)-1) {
3296 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3297 enc[1] = value&0xFF;
3298 return 2;
3299 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3300 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3301 enc[1] = value&0xFF;
3302 enc[2] = (value>>8)&0xFF;
3303 return 3;
3304 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3305 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3306 enc[1] = value&0xFF;
3307 enc[2] = (value>>8)&0xFF;
3308 enc[3] = (value>>16)&0xFF;
3309 enc[4] = (value>>24)&0xFF;
3310 return 5;
3311 } else {
3312 return 0;
3313 }
3314 }
3315
3316 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3317 size_t comprlen, outlen;
3318 unsigned char byte;
3319 void *out;
3320
3321 /* We require at least four bytes compression for this to be worth it */
3322 if (len <= 4) return 0;
3323 outlen = len-4;
3324 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3325 comprlen = lzf_compress(s, len, out, outlen);
3326 if (comprlen == 0) {
3327 zfree(out);
3328 return 0;
3329 }
3330 /* Data compressed! Let's save it on disk */
3331 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3332 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3333 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3334 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3335 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3336 zfree(out);
3337 return comprlen;
3338
3339 writeerr:
3340 zfree(out);
3341 return -1;
3342 }
3343
3344 /* Save a string objet as [len][data] on disk. If the object is a string
3345 * representation of an integer value we try to safe it in a special form */
3346 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3347 int enclen;
3348
3349 /* Try integer encoding */
3350 if (len <= 11) {
3351 unsigned char buf[5];
3352 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3353 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3354 return 0;
3355 }
3356 }
3357
3358 /* Try LZF compression - under 20 bytes it's unable to compress even
3359 * aaaaaaaaaaaaaaaaaa so skip it */
3360 if (server.rdbcompression && len > 20) {
3361 int retval;
3362
3363 retval = rdbSaveLzfStringObject(fp,s,len);
3364 if (retval == -1) return -1;
3365 if (retval > 0) return 0;
3366 /* retval == 0 means data can't be compressed, save the old way */
3367 }
3368
3369 /* Store verbatim */
3370 if (rdbSaveLen(fp,len) == -1) return -1;
3371 if (len && fwrite(s,len,1,fp) == 0) return -1;
3372 return 0;
3373 }
3374
3375 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3376 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3377 int retval;
3378
3379 /* Avoid incr/decr ref count business when possible.
3380 * This plays well with copy-on-write given that we are probably
3381 * in a child process (BGSAVE). Also this makes sure key objects
3382 * of swapped objects are not incRefCount-ed (an assert does not allow
3383 * this in order to avoid bugs) */
3384 if (obj->encoding != REDIS_ENCODING_RAW) {
3385 obj = getDecodedObject(obj);
3386 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3387 decrRefCount(obj);
3388 } else {
3389 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3390 }
3391 return retval;
3392 }
3393
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values in order to
 * specify the following conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int outlen = 1; /* the special values take just the prefix byte */

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)out+1;

        snprintf(repr,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(repr);
        outlen = out[0]+1;
    }
    return (fwrite(out,outlen,1,fp) == 0) ? -1 : 0;
}
3420
3421 /* Save a Redis object. */
3422 static int rdbSaveObject(FILE *fp, robj *o) {
3423 if (o->type == REDIS_STRING) {
3424 /* Save a string value */
3425 if (rdbSaveStringObject(fp,o) == -1) return -1;
3426 } else if (o->type == REDIS_LIST) {
3427 /* Save a list value */
3428 list *list = o->ptr;
3429 listIter li;
3430 listNode *ln;
3431
3432 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3433 listRewind(list,&li);
3434 while((ln = listNext(&li))) {
3435 robj *eleobj = listNodeValue(ln);
3436
3437 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3438 }
3439 } else if (o->type == REDIS_SET) {
3440 /* Save a set value */
3441 dict *set = o->ptr;
3442 dictIterator *di = dictGetIterator(set);
3443 dictEntry *de;
3444
3445 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3446 while((de = dictNext(di)) != NULL) {
3447 robj *eleobj = dictGetEntryKey(de);
3448
3449 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3450 }
3451 dictReleaseIterator(di);
3452 } else if (o->type == REDIS_ZSET) {
3453 /* Save a set value */
3454 zset *zs = o->ptr;
3455 dictIterator *di = dictGetIterator(zs->dict);
3456 dictEntry *de;
3457
3458 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3459 while((de = dictNext(di)) != NULL) {
3460 robj *eleobj = dictGetEntryKey(de);
3461 double *score = dictGetEntryVal(de);
3462
3463 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3464 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3465 }
3466 dictReleaseIterator(di);
3467 } else if (o->type == REDIS_HASH) {
3468 /* Save a hash value */
3469 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3470 unsigned char *p = zipmapRewind(o->ptr);
3471 unsigned int count = zipmapLen(o->ptr);
3472 unsigned char *key, *val;
3473 unsigned int klen, vlen;
3474
3475 if (rdbSaveLen(fp,count) == -1) return -1;
3476 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3477 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3478 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3479 }
3480 } else {
3481 dictIterator *di = dictGetIterator(o->ptr);
3482 dictEntry *de;
3483
3484 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3485 while((de = dictNext(di)) != NULL) {
3486 robj *key = dictGetEntryKey(de);
3487 robj *val = dictGetEntryVal(de);
3488
3489 if (rdbSaveStringObject(fp,key) == -1) return -1;
3490 if (rdbSaveStringObject(fp,val) == -1) return -1;
3491 }
3492 dictReleaseIterator(di);
3493 }
3494 } else {
3495 redisPanic("Unknown object type");
3496 }
3497 return 0;
3498 }
3499
3500 /* Return the length the object will have on disk if saved with
3501 * the rdbSaveObject() function. Currently we use a trick to get
3502 * this length with very little changes to the code. In the future
3503 * we could switch to a faster solution. */
3504 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3505 if (fp == NULL) fp = server.devnull;
3506 rewind(fp);
3507 assert(rdbSaveObject(fp,o) != 1);
3508 return ftello(fp);
3509 }
3510
3511 /* Return the number of pages required to save this object in the swap file */
3512 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3513 off_t bytes = rdbSavedObjectLen(o,fp);
3514
3515 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3516 }
3517
3518 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3519 static int rdbSave(char *filename) {
3520 dictIterator *di = NULL;
3521 dictEntry *de;
3522 FILE *fp;
3523 char tmpfile[256];
3524 int j;
3525 time_t now = time(NULL);
3526
3527 /* Wait for I/O therads to terminate, just in case this is a
3528 * foreground-saving, to avoid seeking the swap file descriptor at the
3529 * same time. */
3530 if (server.vm_enabled)
3531 waitEmptyIOJobsQueue();
3532
3533 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3534 fp = fopen(tmpfile,"w");
3535 if (!fp) {
3536 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3537 return REDIS_ERR;
3538 }
3539 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3540 for (j = 0; j < server.dbnum; j++) {
3541 redisDb *db = server.db+j;
3542 dict *d = db->dict;
3543 if (dictSize(d) == 0) continue;
3544 di = dictGetIterator(d);
3545 if (!di) {
3546 fclose(fp);
3547 return REDIS_ERR;
3548 }
3549
3550 /* Write the SELECT DB opcode */
3551 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3552 if (rdbSaveLen(fp,j) == -1) goto werr;
3553
3554 /* Iterate this DB writing every entry */
3555 while((de = dictNext(di)) != NULL) {
3556 robj *key = dictGetEntryKey(de);
3557 robj *o = dictGetEntryVal(de);
3558 time_t expiretime = getExpire(db,key);
3559
3560 /* Save the expire time */
3561 if (expiretime != -1) {
3562 /* If this key is already expired skip it */
3563 if (expiretime < now) continue;
3564 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3565 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3566 }
3567 /* Save the key and associated value. This requires special
3568 * handling if the value is swapped out. */
3569 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3570 key->storage == REDIS_VM_SWAPPING) {
3571 /* Save type, key, value */
3572 if (rdbSaveType(fp,o->type) == -1) goto werr;
3573 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3574 if (rdbSaveObject(fp,o) == -1) goto werr;
3575 } else {
3576 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3577 robj *po;
3578 /* Get a preview of the object in memory */
3579 po = vmPreviewObject(key);
3580 /* Save type, key, value */
3581 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3582 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3583 if (rdbSaveObject(fp,po) == -1) goto werr;
3584 /* Remove the loaded object from memory */
3585 decrRefCount(po);
3586 }
3587 }
3588 dictReleaseIterator(di);
3589 }
3590 /* EOF opcode */
3591 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3592
3593 /* Make sure data will not remain on the OS's output buffers */
3594 fflush(fp);
3595 fsync(fileno(fp));
3596 fclose(fp);
3597
3598 /* Use RENAME to make sure the DB file is changed atomically only
3599 * if the generate DB file is ok. */
3600 if (rename(tmpfile,filename) == -1) {
3601 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3602 unlink(tmpfile);
3603 return REDIS_ERR;
3604 }
3605 redisLog(REDIS_NOTICE,"DB saved on disk");
3606 server.dirty = 0;
3607 server.lastsave = time(NULL);
3608 return REDIS_OK;
3609
3610 werr:
3611 fclose(fp);
3612 unlink(tmpfile);
3613 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3614 if (di) dictReleaseIterator(di);
3615 return REDIS_ERR;
3616 }
3617
3618 static int rdbSaveBackground(char *filename) {
3619 pid_t childpid;
3620
3621 if (server.bgsavechildpid != -1) return REDIS_ERR;
3622 if (server.vm_enabled) waitEmptyIOJobsQueue();
3623 if ((childpid = fork()) == 0) {
3624 /* Child */
3625 if (server.vm_enabled) vmReopenSwapFile();
3626 close(server.fd);
3627 if (rdbSave(filename) == REDIS_OK) {
3628 _exit(0);
3629 } else {
3630 _exit(1);
3631 }
3632 } else {
3633 /* Parent */
3634 if (childpid == -1) {
3635 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3636 strerror(errno));
3637 return REDIS_ERR;
3638 }
3639 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3640 server.bgsavechildpid = childpid;
3641 updateDictResizePolicy();
3642 return REDIS_OK;
3643 }
3644 return REDIS_OK; /* unreached */
3645 }
3646
/* Remove the temporary dump file "temp-<pid>.rdb" that belongs to the
 * saving process 'childpid'. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3653
/* Read a one byte type/opcode from 'fp'. Returns the byte as a value in
 * the 0-255 range, or -1 if it was not possible to read it. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 1) ? (int) byte : -1;
}
3659
/* Read a time value stored as a raw 4 byte int32_t (host byte order) from
 * 'fp'. Returns -1 on a short read; note that a stored value of -1 can't
 * be told apart from the error condition. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    if (fread(&secs,sizeof(secs),1,fp) != 1) return -1;
    return (time_t) secs;
}
3665
3666 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3667 * of this file for a description of how this are stored on disk.
3668 *
3669 * isencoded is set to 1 if the readed length is not actually a length but
3670 * an "encoding type", check the above comments for more info */
3671 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3672 unsigned char buf[2];
3673 uint32_t len;
3674 int type;
3675
3676 if (isencoded) *isencoded = 0;
3677 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3678 type = (buf[0]&0xC0)>>6;
3679 if (type == REDIS_RDB_6BITLEN) {
3680 /* Read a 6 bit len */
3681 return buf[0]&0x3F;
3682 } else if (type == REDIS_RDB_ENCVAL) {
3683 /* Read a 6 bit len encoding type */
3684 if (isencoded) *isencoded = 1;
3685 return buf[0]&0x3F;
3686 } else if (type == REDIS_RDB_14BITLEN) {
3687 /* Read a 14 bit len */
3688 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3689 return ((buf[0]&0x3F)<<8)|buf[1];
3690 } else {
3691 /* Read a 32 bit len */
3692 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3693 return ntohl(len);
3694 }
3695 }
3696
3697 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3698 unsigned char enc[4];
3699 long long val;
3700
3701 if (enctype == REDIS_RDB_ENC_INT8) {
3702 if (fread(enc,1,1,fp) == 0) return NULL;
3703 val = (signed char)enc[0];
3704 } else if (enctype == REDIS_RDB_ENC_INT16) {
3705 uint16_t v;
3706 if (fread(enc,2,1,fp) == 0) return NULL;
3707 v = enc[0]|(enc[1]<<8);
3708 val = (int16_t)v;
3709 } else if (enctype == REDIS_RDB_ENC_INT32) {
3710 uint32_t v;
3711 if (fread(enc,4,1,fp) == 0) return NULL;
3712 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3713 val = (int32_t)v;
3714 } else {
3715 val = 0; /* anti-warning */
3716 redisPanic("Unknown RDB integer encoding type");
3717 }
3718 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3719 }
3720
3721 static robj *rdbLoadLzfStringObject(FILE*fp) {
3722 unsigned int len, clen;
3723 unsigned char *c = NULL;
3724 sds val = NULL;
3725
3726 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3727 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3728 if ((c = zmalloc(clen)) == NULL) goto err;
3729 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3730 if (fread(c,clen,1,fp) == 0) goto err;
3731 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3732 zfree(c);
3733 return createObject(REDIS_STRING,val);
3734 err:
3735 zfree(c);
3736 sdsfree(val);
3737 return NULL;
3738 }
3739
3740 static robj *rdbLoadStringObject(FILE*fp) {
3741 int isencoded;
3742 uint32_t len;
3743 sds val;
3744
3745 len = rdbLoadLen(fp,&isencoded);
3746 if (isencoded) {
3747 switch(len) {
3748 case REDIS_RDB_ENC_INT8:
3749 case REDIS_RDB_ENC_INT16:
3750 case REDIS_RDB_ENC_INT32:
3751 return rdbLoadIntegerObject(fp,len);
3752 case REDIS_RDB_ENC_LZF:
3753 return rdbLoadLzfStringObject(fp);
3754 default:
3755 redisPanic("Unknown RDB encoding type");
3756 }
3757 }
3758
3759 if (len == REDIS_RDB_LENERR) return NULL;
3760 val = sdsnewlen(NULL,len);
3761 if (len && fread(val,len,1,fp) == 0) {
3762 sdsfree(val);
3763 return NULL;
3764 }
3765 return createObject(REDIS_STRING,val);
3766 }
3767
3768 /* For information about double serialization check rdbSaveDoubleValue() */
3769 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3770 char buf[128];
3771 unsigned char len;
3772
3773 if (fread(&len,1,1,fp) == 0) return -1;
3774 switch(len) {
3775 case 255: *val = R_NegInf; return 0;
3776 case 254: *val = R_PosInf; return 0;
3777 case 253: *val = R_Nan; return 0;
3778 default:
3779 if (fread(buf,len,1,fp) == 0) return -1;
3780 buf[len] = '\0';
3781 sscanf(buf, "%lg", val);
3782 return 0;
3783 }
3784 }
3785
3786 /* Load a Redis object of the specified type from the specified file.
3787 * On success a newly allocated object is returned, otherwise NULL. */
3788 static robj *rdbLoadObject(int type, FILE *fp) {
3789 robj *o;
3790
3791 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3792 if (type == REDIS_STRING) {
3793 /* Read string value */
3794 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3795 o = tryObjectEncoding(o);
3796 } else if (type == REDIS_LIST || type == REDIS_SET) {
3797 /* Read list/set value */
3798 uint32_t listlen;
3799
3800 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3801 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3802 /* It's faster to expand the dict to the right size asap in order
3803 * to avoid rehashing */
3804 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3805 dictExpand(o->ptr,listlen);
3806 /* Load every single element of the list/set */
3807 while(listlen--) {
3808 robj *ele;
3809
3810 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3811 ele = tryObjectEncoding(ele);
3812 if (type == REDIS_LIST) {
3813 listAddNodeTail((list*)o->ptr,ele);
3814 } else {
3815 dictAdd((dict*)o->ptr,ele,NULL);
3816 }
3817 }
3818 } else if (type == REDIS_ZSET) {
3819 /* Read list/set value */
3820 size_t zsetlen;
3821 zset *zs;
3822
3823 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3824 o = createZsetObject();
3825 zs = o->ptr;
3826 /* Load every single element of the list/set */
3827 while(zsetlen--) {
3828 robj *ele;
3829 double *score = zmalloc(sizeof(double));
3830
3831 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3832 ele = tryObjectEncoding(ele);
3833 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3834 dictAdd(zs->dict,ele,score);
3835 zslInsert(zs->zsl,*score,ele);
3836 incrRefCount(ele); /* added to skiplist */
3837 }
3838 } else if (type == REDIS_HASH) {
3839 size_t hashlen;
3840
3841 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3842 o = createHashObject();
3843 /* Too many entries? Use an hash table. */
3844 if (hashlen > server.hash_max_zipmap_entries)
3845 convertToRealHash(o);
3846 /* Load every key/value, then set it into the zipmap or hash
3847 * table, as needed. */
3848 while(hashlen--) {
3849 robj *key, *val;
3850
3851 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3852 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3853 /* If we are using a zipmap and there are too big values
3854 * the object is converted to real hash table encoding. */
3855 if (o->encoding != REDIS_ENCODING_HT &&
3856 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3857 sdslen(val->ptr) > server.hash_max_zipmap_value))
3858 {
3859 convertToRealHash(o);
3860 }
3861
3862 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3863 unsigned char *zm = o->ptr;
3864
3865 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3866 val->ptr,sdslen(val->ptr),NULL);
3867 o->ptr = zm;
3868 decrRefCount(key);
3869 decrRefCount(val);
3870 } else {
3871 key = tryObjectEncoding(key);
3872 val = tryObjectEncoding(val);
3873 dictAdd((dict*)o->ptr,key,val);
3874 }
3875 }
3876 } else {
3877 redisPanic("Unknown object type");
3878 }
3879 return o;
3880 }
3881
3882 static int rdbLoad(char *filename) {
3883 FILE *fp;
3884 robj *keyobj = NULL;
3885 uint32_t dbid;
3886 int type, retval, rdbver;
3887 dict *d = server.db[0].dict;
3888 redisDb *db = server.db+0;
3889 char buf[1024];
3890 time_t expiretime = -1, now = time(NULL);
3891 long long loadedkeys = 0;
3892
3893 fp = fopen(filename,"r");
3894 if (!fp) return REDIS_ERR;
3895 if (fread(buf,9,1,fp) == 0) goto eoferr;
3896 buf[9] = '\0';
3897 if (memcmp(buf,"REDIS",5) != 0) {
3898 fclose(fp);
3899 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3900 return REDIS_ERR;
3901 }
3902 rdbver = atoi(buf+5);
3903 if (rdbver != 1) {
3904 fclose(fp);
3905 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3906 return REDIS_ERR;
3907 }
3908 while(1) {
3909 robj *o;
3910
3911 /* Read type. */
3912 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3913 if (type == REDIS_EXPIRETIME) {
3914 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3915 /* We read the time so we need to read the object type again */
3916 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3917 }
3918 if (type == REDIS_EOF) break;
3919 /* Handle SELECT DB opcode as a special case */
3920 if (type == REDIS_SELECTDB) {
3921 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3922 goto eoferr;
3923 if (dbid >= (unsigned)server.dbnum) {
3924 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3925 exit(1);
3926 }
3927 db = server.db+dbid;
3928 d = db->dict;
3929 continue;
3930 }
3931 /* Read key */
3932 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3933 /* Read value */
3934 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3935 /* Add the new object in the hash table */
3936 retval = dictAdd(d,keyobj,o);
3937 if (retval == DICT_ERR) {
3938 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3939 exit(1);
3940 }
3941 /* Set the expire time if needed */
3942 if (expiretime != -1) {
3943 setExpire(db,keyobj,expiretime);
3944 /* Delete this key if already expired */
3945 if (expiretime < now) deleteKey(db,keyobj);
3946 expiretime = -1;
3947 }
3948 keyobj = o = NULL;
3949 /* Handle swapping while loading big datasets when VM is on */
3950 loadedkeys++;
3951 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3952 while (zmalloc_used_memory() > server.vm_max_memory) {
3953 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3954 }
3955 }
3956 }
3957 fclose(fp);
3958 return REDIS_OK;
3959
3960 eoferr: /* unexpected end of file is handled here with a fatal exit */
3961 if (keyobj) decrRefCount(keyobj);
3962 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3963 exit(1);
3964 return REDIS_ERR; /* Just to avoid warning */
3965 }
3966
3967 /*================================== Commands =============================== */
3968
3969 static void authCommand(redisClient *c) {
3970 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3971 c->authenticated = 1;
3972 addReply(c,shared.ok);
3973 } else {
3974 c->authenticated = 0;
3975 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3976 }
3977 }
3978
3979 static void pingCommand(redisClient *c) {
3980 addReply(c,shared.pong);
3981 }
3982
3983 static void echoCommand(redisClient *c) {
3984 addReplyBulk(c,c->argv[1]);
3985 }
3986
3987 /*=================================== Strings =============================== */
3988
3989 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
3990 int retval;
3991 long seconds;
3992
3993 if (expire) {
3994 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
3995 return;
3996 if (seconds <= 0) {
3997 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
3998 return;
3999 }
4000 }
4001
4002 if (nx) deleteIfVolatile(c->db,key);
4003 retval = dictAdd(c->db->dict,key,val);
4004 if (retval == DICT_ERR) {
4005 if (!nx) {
4006 /* If the key is about a swapped value, we want a new key object
4007 * to overwrite the old. So we delete the old key in the database.
4008 * This will also make sure that swap pages about the old object
4009 * will be marked as free. */
4010 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4011 incrRefCount(key);
4012 dictReplace(c->db->dict,key,val);
4013 incrRefCount(val);
4014 } else {
4015 addReply(c,shared.czero);
4016 return;
4017 }
4018 } else {
4019 incrRefCount(key);
4020 incrRefCount(val);
4021 }
4022 server.dirty++;
4023 removeExpire(c->db,key);
4024 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4025 addReply(c, nx ? shared.cone : shared.ok);
4026 }
4027
4028 static void setCommand(redisClient *c) {
4029 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4030 }
4031
4032 static void setnxCommand(redisClient *c) {
4033 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4034 }
4035
4036 static void setexCommand(redisClient *c) {
4037 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4038 }
4039
4040 static int getGenericCommand(redisClient *c) {
4041 robj *o;
4042
4043 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4044 return REDIS_OK;
4045
4046 if (o->type != REDIS_STRING) {
4047 addReply(c,shared.wrongtypeerr);
4048 return REDIS_ERR;
4049 } else {
4050 addReplyBulk(c,o);
4051 return REDIS_OK;
4052 }
4053 }
4054
4055 static void getCommand(redisClient *c) {
4056 getGenericCommand(c);
4057 }
4058
4059 static void getsetCommand(redisClient *c) {
4060 if (getGenericCommand(c) == REDIS_ERR) return;
4061 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4062 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4063 } else {
4064 incrRefCount(c->argv[1]);
4065 }
4066 incrRefCount(c->argv[2]);
4067 server.dirty++;
4068 removeExpire(c->db,c->argv[1]);
4069 }
4070
4071 static void mgetCommand(redisClient *c) {
4072 int j;
4073
4074 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4075 for (j = 1; j < c->argc; j++) {
4076 robj *o = lookupKeyRead(c->db,c->argv[j]);
4077 if (o == NULL) {
4078 addReply(c,shared.nullbulk);
4079 } else {
4080 if (o->type != REDIS_STRING) {
4081 addReply(c,shared.nullbulk);
4082 } else {
4083 addReplyBulk(c,o);
4084 }
4085 }
4086 }
4087 }
4088
4089 static void msetGenericCommand(redisClient *c, int nx) {
4090 int j, busykeys = 0;
4091
4092 if ((c->argc % 2) == 0) {
4093 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4094 return;
4095 }
4096 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4097 * set nothing at all if at least one already key exists. */
4098 if (nx) {
4099 for (j = 1; j < c->argc; j += 2) {
4100 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4101 busykeys++;
4102 }
4103 }
4104 }
4105 if (busykeys) {
4106 addReply(c, shared.czero);
4107 return;
4108 }
4109
4110 for (j = 1; j < c->argc; j += 2) {
4111 int retval;
4112
4113 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4114 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4115 if (retval == DICT_ERR) {
4116 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4117 incrRefCount(c->argv[j+1]);
4118 } else {
4119 incrRefCount(c->argv[j]);
4120 incrRefCount(c->argv[j+1]);
4121 }
4122 removeExpire(c->db,c->argv[j]);
4123 }
4124 server.dirty += (c->argc-1)/2;
4125 addReply(c, nx ? shared.cone : shared.ok);
4126 }
4127
4128 static void msetCommand(redisClient *c) {
4129 msetGenericCommand(c,0);
4130 }
4131
4132 static void msetnxCommand(redisClient *c) {
4133 msetGenericCommand(c,1);
4134 }
4135
4136 static void incrDecrCommand(redisClient *c, long long incr) {
4137 long long value;
4138 int retval;
4139 robj *o;
4140
4141 o = lookupKeyWrite(c->db,c->argv[1]);
4142
4143 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4144
4145 value += incr;
4146 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4147 o = tryObjectEncoding(o);
4148 retval = dictAdd(c->db->dict,c->argv[1],o);
4149 if (retval == DICT_ERR) {
4150 dictReplace(c->db->dict,c->argv[1],o);
4151 removeExpire(c->db,c->argv[1]);
4152 } else {
4153 incrRefCount(c->argv[1]);
4154 }
4155 server.dirty++;
4156 addReply(c,shared.colon);
4157 addReply(c,o);
4158 addReply(c,shared.crlf);
4159 }
4160
4161 static void incrCommand(redisClient *c) {
4162 incrDecrCommand(c,1);
4163 }
4164
4165 static void decrCommand(redisClient *c) {
4166 incrDecrCommand(c,-1);
4167 }
4168
4169 static void incrbyCommand(redisClient *c) {
4170 long long incr;
4171
4172 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4173 incrDecrCommand(c,incr);
4174 }
4175
4176 static void decrbyCommand(redisClient *c) {
4177 long long incr;
4178
4179 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4180 incrDecrCommand(c,-incr);
4181 }
4182
4183 static void appendCommand(redisClient *c) {
4184 int retval;
4185 size_t totlen;
4186 robj *o;
4187
4188 o = lookupKeyWrite(c->db,c->argv[1]);
4189 if (o == NULL) {
4190 /* Create the key */
4191 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4192 incrRefCount(c->argv[1]);
4193 incrRefCount(c->argv[2]);
4194 totlen = stringObjectLen(c->argv[2]);
4195 } else {
4196 dictEntry *de;
4197
4198 de = dictFind(c->db->dict,c->argv[1]);
4199 assert(de != NULL);
4200
4201 o = dictGetEntryVal(de);
4202 if (o->type != REDIS_STRING) {
4203 addReply(c,shared.wrongtypeerr);
4204 return;
4205 }
4206 /* If the object is specially encoded or shared we have to make
4207 * a copy */
4208 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4209 robj *decoded = getDecodedObject(o);
4210
4211 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4212 decrRefCount(decoded);
4213 dictReplace(c->db->dict,c->argv[1],o);
4214 }
4215 /* APPEND! */
4216 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4217 o->ptr = sdscatlen(o->ptr,
4218 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4219 } else {
4220 o->ptr = sdscatprintf(o->ptr, "%ld",
4221 (unsigned long) c->argv[2]->ptr);
4222 }
4223 totlen = sdslen(o->ptr);
4224 }
4225 server.dirty++;
4226 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4227 }
4228
4229 static void substrCommand(redisClient *c) {
4230 robj *o;
4231 long start = atoi(c->argv[2]->ptr);
4232 long end = atoi(c->argv[3]->ptr);
4233 size_t rangelen, strlen;
4234 sds range;
4235
4236 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4237 checkType(c,o,REDIS_STRING)) return;
4238
4239 o = getDecodedObject(o);
4240 strlen = sdslen(o->ptr);
4241
4242 /* convert negative indexes */
4243 if (start < 0) start = strlen+start;
4244 if (end < 0) end = strlen+end;
4245 if (start < 0) start = 0;
4246 if (end < 0) end = 0;
4247
4248 /* indexes sanity checks */
4249 if (start > end || (size_t)start >= strlen) {
4250 /* Out of range start or start > end result in null reply */
4251 addReply(c,shared.nullbulk);
4252 decrRefCount(o);
4253 return;
4254 }
4255 if ((size_t)end >= strlen) end = strlen-1;
4256 rangelen = (end-start)+1;
4257
4258 /* Return the result */
4259 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4260 range = sdsnewlen((char*)o->ptr+start,rangelen);
4261 addReplySds(c,range);
4262 addReply(c,shared.crlf);
4263 decrRefCount(o);
4264 }
4265
4266 /* ========================= Type agnostic commands ========================= */
4267
4268 static void delCommand(redisClient *c) {
4269 int deleted = 0, j;
4270
4271 for (j = 1; j < c->argc; j++) {
4272 if (deleteKey(c->db,c->argv[j])) {
4273 server.dirty++;
4274 deleted++;
4275 }
4276 }
4277 addReplyLong(c,deleted);
4278 }
4279
4280 static void existsCommand(redisClient *c) {
4281 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4282 }
4283
4284 static void selectCommand(redisClient *c) {
4285 int id = atoi(c->argv[1]->ptr);
4286
4287 if (selectDb(c,id) == REDIS_ERR) {
4288 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4289 } else {
4290 addReply(c,shared.ok);
4291 }
4292 }
4293
4294 static void randomkeyCommand(redisClient *c) {
4295 dictEntry *de;
4296 robj *key;
4297
4298 while(1) {
4299 de = dictGetRandomKey(c->db->dict);
4300 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4301 }
4302
4303 if (de == NULL) {
4304 addReply(c,shared.nullbulk);
4305 return;
4306 }
4307
4308 key = dictGetEntryKey(de);
4309 if (server.vm_enabled) {
4310 key = dupStringObject(key);
4311 addReplyBulk(c,key);
4312 decrRefCount(key);
4313 } else {
4314 addReplyBulk(c,key);
4315 }
4316 }
4317
4318 static void keysCommand(redisClient *c) {
4319 dictIterator *di;
4320 dictEntry *de;
4321 sds pattern = c->argv[1]->ptr;
4322 int plen = sdslen(pattern);
4323 unsigned long numkeys = 0;
4324 robj *lenobj = createObject(REDIS_STRING,NULL);
4325
4326 di = dictGetIterator(c->db->dict);
4327 addReply(c,lenobj);
4328 decrRefCount(lenobj);
4329 while((de = dictNext(di)) != NULL) {
4330 robj *keyobj = dictGetEntryKey(de);
4331
4332 sds key = keyobj->ptr;
4333 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4334 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4335 if (expireIfNeeded(c->db,keyobj) == 0) {
4336 addReplyBulk(c,keyobj);
4337 numkeys++;
4338 }
4339 }
4340 }
4341 dictReleaseIterator(di);
4342 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4343 }
4344
4345 static void dbsizeCommand(redisClient *c) {
4346 addReplySds(c,
4347 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4348 }
4349
4350 static void lastsaveCommand(redisClient *c) {
4351 addReplySds(c,
4352 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4353 }
4354
4355 static void typeCommand(redisClient *c) {
4356 robj *o;
4357 char *type;
4358
4359 o = lookupKeyRead(c->db,c->argv[1]);
4360 if (o == NULL) {
4361 type = "+none";
4362 } else {
4363 switch(o->type) {
4364 case REDIS_STRING: type = "+string"; break;
4365 case REDIS_LIST: type = "+list"; break;
4366 case REDIS_SET: type = "+set"; break;
4367 case REDIS_ZSET: type = "+zset"; break;
4368 case REDIS_HASH: type = "+hash"; break;
4369 default: type = "+unknown"; break;
4370 }
4371 }
4372 addReplySds(c,sdsnew(type));
4373 addReply(c,shared.crlf);
4374 }
4375
4376 static void saveCommand(redisClient *c) {
4377 if (server.bgsavechildpid != -1) {
4378 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4379 return;
4380 }
4381 if (rdbSave(server.dbfilename) == REDIS_OK) {
4382 addReply(c,shared.ok);
4383 } else {
4384 addReply(c,shared.err);
4385 }
4386 }
4387
4388 static void bgsaveCommand(redisClient *c) {
4389 if (server.bgsavechildpid != -1) {
4390 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4391 return;
4392 }
4393 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4394 char *status = "+Background saving started\r\n";
4395 addReplySds(c,sdsnew(status));
4396 } else {
4397 addReply(c,shared.err);
4398 }
4399 }
4400
4401 static void shutdownCommand(redisClient *c) {
4402 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4403 /* Kill the saving child if there is a background saving in progress.
4404 We want to avoid race conditions, for instance our saving child may
4405 overwrite the synchronous saving did by SHUTDOWN. */
4406 if (server.bgsavechildpid != -1) {
4407 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4408 kill(server.bgsavechildpid,SIGKILL);
4409 rdbRemoveTempFile(server.bgsavechildpid);
4410 }
4411 if (server.appendonly) {
4412 /* Append only file: fsync() the AOF and exit */
4413 fsync(server.appendfd);
4414 if (server.vm_enabled) unlink(server.vm_swap_file);
4415 exit(0);
4416 } else {
4417 /* Snapshotting. Perform a SYNC SAVE and exit */
4418 if (rdbSave(server.dbfilename) == REDIS_OK) {
4419 if (server.daemonize)
4420 unlink(server.pidfile);
4421 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4422 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4423 if (server.vm_enabled) unlink(server.vm_swap_file);
4424 exit(0);
4425 } else {
4426 /* Ooops.. error saving! The best we can do is to continue
4427 * operating. Note that if there was a background saving process,
4428 * in the next cron() Redis will be notified that the background
4429 * saving aborted, handling special stuff like slaves pending for
4430 * synchronization... */
4431 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4432 addReplySds(c,
4433 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4434 }
4435 }
4436 }
4437
4438 static void renameGenericCommand(redisClient *c, int nx) {
4439 robj *o;
4440
4441 /* To use the same key as src and dst is probably an error */
4442 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4443 addReply(c,shared.sameobjecterr);
4444 return;
4445 }
4446
4447 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4448 return;
4449
4450 incrRefCount(o);
4451 deleteIfVolatile(c->db,c->argv[2]);
4452 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4453 if (nx) {
4454 decrRefCount(o);
4455 addReply(c,shared.czero);
4456 return;
4457 }
4458 dictReplace(c->db->dict,c->argv[2],o);
4459 } else {
4460 incrRefCount(c->argv[2]);
4461 }
4462 deleteKey(c->db,c->argv[1]);
4463 server.dirty++;
4464 addReply(c,nx ? shared.cone : shared.ok);
4465 }
4466
4467 static void renameCommand(redisClient *c) {
4468 renameGenericCommand(c,0);
4469 }
4470
4471 static void renamenxCommand(redisClient *c) {
4472 renameGenericCommand(c,1);
4473 }
4474
4475 static void moveCommand(redisClient *c) {
4476 robj *o;
4477 redisDb *src, *dst;
4478 int srcid;
4479
4480 /* Obtain source and target DB pointers */
4481 src = c->db;
4482 srcid = c->db->id;
4483 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4484 addReply(c,shared.outofrangeerr);
4485 return;
4486 }
4487 dst = c->db;
4488 selectDb(c,srcid); /* Back to the source DB */
4489
4490 /* If the user is moving using as target the same
4491 * DB as the source DB it is probably an error. */
4492 if (src == dst) {
4493 addReply(c,shared.sameobjecterr);
4494 return;
4495 }
4496
4497 /* Check if the element exists and get a reference */
4498 o = lookupKeyWrite(c->db,c->argv[1]);
4499 if (!o) {
4500 addReply(c,shared.czero);
4501 return;
4502 }
4503
4504 /* Try to add the element to the target DB */
4505 deleteIfVolatile(dst,c->argv[1]);
4506 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4507 addReply(c,shared.czero);
4508 return;
4509 }
4510 incrRefCount(c->argv[1]);
4511 incrRefCount(o);
4512
4513 /* OK! key moved, free the entry in the source DB */
4514 deleteKey(src,c->argv[1]);
4515 server.dirty++;
4516 addReply(c,shared.cone);
4517 }
4518
4519 /* =================================== Lists ================================ */
4520 static void pushGenericCommand(redisClient *c, int where) {
4521 robj *lobj;
4522 list *list;
4523
4524 lobj = lookupKeyWrite(c->db,c->argv[1]);
4525 if (lobj == NULL) {
4526 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4527 addReply(c,shared.cone);
4528 return;
4529 }
4530 lobj = createListObject();
4531 list = lobj->ptr;
4532 if (where == REDIS_HEAD) {
4533 listAddNodeHead(list,c->argv[2]);
4534 } else {
4535 listAddNodeTail(list,c->argv[2]);
4536 }
4537 dictAdd(c->db->dict,c->argv[1],lobj);
4538 incrRefCount(c->argv[1]);
4539 incrRefCount(c->argv[2]);
4540 } else {
4541 if (lobj->type != REDIS_LIST) {
4542 addReply(c,shared.wrongtypeerr);
4543 return;
4544 }
4545 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4546 addReply(c,shared.cone);
4547 return;
4548 }
4549 list = lobj->ptr;
4550 if (where == REDIS_HEAD) {
4551 listAddNodeHead(list,c->argv[2]);
4552 } else {
4553 listAddNodeTail(list,c->argv[2]);
4554 }
4555 incrRefCount(c->argv[2]);
4556 }
4557 server.dirty++;
4558 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4559 }
4560
4561 static void lpushCommand(redisClient *c) {
4562 pushGenericCommand(c,REDIS_HEAD);
4563 }
4564
4565 static void rpushCommand(redisClient *c) {
4566 pushGenericCommand(c,REDIS_TAIL);
4567 }
4568
4569 static void llenCommand(redisClient *c) {
4570 robj *o;
4571 list *l;
4572
4573 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4574 checkType(c,o,REDIS_LIST)) return;
4575
4576 l = o->ptr;
4577 addReplyUlong(c,listLength(l));
4578 }
4579
4580 static void lindexCommand(redisClient *c) {
4581 robj *o;
4582 int index = atoi(c->argv[2]->ptr);
4583 list *list;
4584 listNode *ln;
4585
4586 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4587 checkType(c,o,REDIS_LIST)) return;
4588 list = o->ptr;
4589
4590 ln = listIndex(list, index);
4591 if (ln == NULL) {
4592 addReply(c,shared.nullbulk);
4593 } else {
4594 robj *ele = listNodeValue(ln);
4595 addReplyBulk(c,ele);
4596 }
4597 }
4598
4599 static void lsetCommand(redisClient *c) {
4600 robj *o;
4601 int index = atoi(c->argv[2]->ptr);
4602 list *list;
4603 listNode *ln;
4604
4605 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4606 checkType(c,o,REDIS_LIST)) return;
4607 list = o->ptr;
4608
4609 ln = listIndex(list, index);
4610 if (ln == NULL) {
4611 addReply(c,shared.outofrangeerr);
4612 } else {
4613 robj *ele = listNodeValue(ln);
4614
4615 decrRefCount(ele);
4616 listNodeValue(ln) = c->argv[3];
4617 incrRefCount(c->argv[3]);
4618 addReply(c,shared.ok);
4619 server.dirty++;
4620 }
4621 }
4622
4623 static void popGenericCommand(redisClient *c, int where) {
4624 robj *o;
4625 list *list;
4626 listNode *ln;
4627
4628 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4629 checkType(c,o,REDIS_LIST)) return;
4630 list = o->ptr;
4631
4632 if (where == REDIS_HEAD)
4633 ln = listFirst(list);
4634 else
4635 ln = listLast(list);
4636
4637 if (ln == NULL) {
4638 addReply(c,shared.nullbulk);
4639 } else {
4640 robj *ele = listNodeValue(ln);
4641 addReplyBulk(c,ele);
4642 listDelNode(list,ln);
4643 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4644 server.dirty++;
4645 }
4646 }
4647
4648 static void lpopCommand(redisClient *c) {
4649 popGenericCommand(c,REDIS_HEAD);
4650 }
4651
4652 static void rpopCommand(redisClient *c) {
4653 popGenericCommand(c,REDIS_TAIL);
4654 }
4655
4656 static void lrangeCommand(redisClient *c) {
4657 robj *o;
4658 int start = atoi(c->argv[2]->ptr);
4659 int end = atoi(c->argv[3]->ptr);
4660 int llen;
4661 int rangelen, j;
4662 list *list;
4663 listNode *ln;
4664 robj *ele;
4665
4666 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4667 || checkType(c,o,REDIS_LIST)) return;
4668 list = o->ptr;
4669 llen = listLength(list);
4670
4671 /* convert negative indexes */
4672 if (start < 0) start = llen+start;
4673 if (end < 0) end = llen+end;
4674 if (start < 0) start = 0;
4675 if (end < 0) end = 0;
4676
4677 /* indexes sanity checks */
4678 if (start > end || start >= llen) {
4679 /* Out of range start or start > end result in empty list */
4680 addReply(c,shared.emptymultibulk);
4681 return;
4682 }
4683 if (end >= llen) end = llen-1;
4684 rangelen = (end-start)+1;
4685
4686 /* Return the result in form of a multi-bulk reply */
4687 ln = listIndex(list, start);
4688 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4689 for (j = 0; j < rangelen; j++) {
4690 ele = listNodeValue(ln);
4691 addReplyBulk(c,ele);
4692 ln = ln->next;
4693 }
4694 }
4695
4696 static void ltrimCommand(redisClient *c) {
4697 robj *o;
4698 int start = atoi(c->argv[2]->ptr);
4699 int end = atoi(c->argv[3]->ptr);
4700 int llen;
4701 int j, ltrim, rtrim;
4702 list *list;
4703 listNode *ln;
4704
4705 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4706 checkType(c,o,REDIS_LIST)) return;
4707 list = o->ptr;
4708 llen = listLength(list);
4709
4710 /* convert negative indexes */
4711 if (start < 0) start = llen+start;
4712 if (end < 0) end = llen+end;
4713 if (start < 0) start = 0;
4714 if (end < 0) end = 0;
4715
4716 /* indexes sanity checks */
4717 if (start > end || start >= llen) {
4718 /* Out of range start or start > end result in empty list */
4719 ltrim = llen;
4720 rtrim = 0;
4721 } else {
4722 if (end >= llen) end = llen-1;
4723 ltrim = start;
4724 rtrim = llen-end-1;
4725 }
4726
4727 /* Remove list elements to perform the trim */
4728 for (j = 0; j < ltrim; j++) {
4729 ln = listFirst(list);
4730 listDelNode(list,ln);
4731 }
4732 for (j = 0; j < rtrim; j++) {
4733 ln = listLast(list);
4734 listDelNode(list,ln);
4735 }
4736 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4737 server.dirty++;
4738 addReply(c,shared.ok);
4739 }
4740
4741 static void lremCommand(redisClient *c) {
4742 robj *o;
4743 list *list;
4744 listNode *ln, *next;
4745 int toremove = atoi(c->argv[2]->ptr);
4746 int removed = 0;
4747 int fromtail = 0;
4748
4749 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4750 checkType(c,o,REDIS_LIST)) return;
4751 list = o->ptr;
4752
4753 if (toremove < 0) {
4754 toremove = -toremove;
4755 fromtail = 1;
4756 }
4757 ln = fromtail ? list->tail : list->head;
4758 while (ln) {
4759 robj *ele = listNodeValue(ln);
4760
4761 next = fromtail ? ln->prev : ln->next;
4762 if (compareStringObjects(ele,c->argv[3]) == 0) {
4763 listDelNode(list,ln);
4764 server.dirty++;
4765 removed++;
4766 if (toremove && removed == toremove) break;
4767 }
4768 ln = next;
4769 }
4770 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4771 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4772 }
4773
4774 /* This is the semantic of this command:
4775 * RPOPLPUSH srclist dstlist:
4776 * IF LLEN(srclist) > 0
4777 * element = RPOP srclist
4778 * LPUSH dstlist element
4779 * RETURN element
4780 * ELSE
4781 * RETURN nil
4782 * END
4783 * END
4784 *
4785 * The idea is to be able to get an element from a list in a reliable way
4786 * since the element is not just returned but pushed against another list
4787 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4788 */
4789 static void rpoplpushcommand(redisClient *c) {
4790 robj *sobj;
4791 list *srclist;
4792 listNode *ln;
4793
4794 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4795 checkType(c,sobj,REDIS_LIST)) return;
4796 srclist = sobj->ptr;
4797 ln = listLast(srclist);
4798
4799 if (ln == NULL) {
4800 addReply(c,shared.nullbulk);
4801 } else {
4802 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4803 robj *ele = listNodeValue(ln);
4804 list *dstlist;
4805
4806 if (dobj && dobj->type != REDIS_LIST) {
4807 addReply(c,shared.wrongtypeerr);
4808 return;
4809 }
4810
4811 /* Add the element to the target list (unless it's directly
4812 * passed to some BLPOP-ing client */
4813 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4814 if (dobj == NULL) {
4815 /* Create the list if the key does not exist */
4816 dobj = createListObject();
4817 dictAdd(c->db->dict,c->argv[2],dobj);
4818 incrRefCount(c->argv[2]);
4819 }
4820 dstlist = dobj->ptr;
4821 listAddNodeHead(dstlist,ele);
4822 incrRefCount(ele);
4823 }
4824
4825 /* Send the element to the client as reply as well */
4826 addReplyBulk(c,ele);
4827
4828 /* Finally remove the element from the source list */
4829 listDelNode(srclist,ln);
4830 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4831 server.dirty++;
4832 }
4833 }
4834
4835 /* ==================================== Sets ================================ */
4836
4837 static void saddCommand(redisClient *c) {
4838 robj *set;
4839
4840 set = lookupKeyWrite(c->db,c->argv[1]);
4841 if (set == NULL) {
4842 set = createSetObject();
4843 dictAdd(c->db->dict,c->argv[1],set);
4844 incrRefCount(c->argv[1]);
4845 } else {
4846 if (set->type != REDIS_SET) {
4847 addReply(c,shared.wrongtypeerr);
4848 return;
4849 }
4850 }
4851 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4852 incrRefCount(c->argv[2]);
4853 server.dirty++;
4854 addReply(c,shared.cone);
4855 } else {
4856 addReply(c,shared.czero);
4857 }
4858 }
4859
4860 static void sremCommand(redisClient *c) {
4861 robj *set;
4862
4863 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4864 checkType(c,set,REDIS_SET)) return;
4865
4866 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4867 server.dirty++;
4868 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4869 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4870 addReply(c,shared.cone);
4871 } else {
4872 addReply(c,shared.czero);
4873 }
4874 }
4875
4876 static void smoveCommand(redisClient *c) {
4877 robj *srcset, *dstset;
4878
4879 srcset = lookupKeyWrite(c->db,c->argv[1]);
4880 dstset = lookupKeyWrite(c->db,c->argv[2]);
4881
4882 /* If the source key does not exist return 0, if it's of the wrong type
4883 * raise an error */
4884 if (srcset == NULL || srcset->type != REDIS_SET) {
4885 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4886 return;
4887 }
4888 /* Error if the destination key is not a set as well */
4889 if (dstset && dstset->type != REDIS_SET) {
4890 addReply(c,shared.wrongtypeerr);
4891 return;
4892 }
4893 /* Remove the element from the source set */
4894 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4895 /* Key not found in the src set! return zero */
4896 addReply(c,shared.czero);
4897 return;
4898 }
4899 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4900 deleteKey(c->db,c->argv[1]);
4901 server.dirty++;
4902 /* Add the element to the destination set */
4903 if (!dstset) {
4904 dstset = createSetObject();
4905 dictAdd(c->db->dict,c->argv[2],dstset);
4906 incrRefCount(c->argv[2]);
4907 }
4908 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4909 incrRefCount(c->argv[3]);
4910 addReply(c,shared.cone);
4911 }
4912
4913 static void sismemberCommand(redisClient *c) {
4914 robj *set;
4915
4916 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4917 checkType(c,set,REDIS_SET)) return;
4918
4919 if (dictFind(set->ptr,c->argv[2]))
4920 addReply(c,shared.cone);
4921 else
4922 addReply(c,shared.czero);
4923 }
4924
4925 static void scardCommand(redisClient *c) {
4926 robj *o;
4927 dict *s;
4928
4929 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4930 checkType(c,o,REDIS_SET)) return;
4931
4932 s = o->ptr;
4933 addReplyUlong(c,dictSize(s));
4934 }
4935
4936 static void spopCommand(redisClient *c) {
4937 robj *set;
4938 dictEntry *de;
4939
4940 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4941 checkType(c,set,REDIS_SET)) return;
4942
4943 de = dictGetRandomKey(set->ptr);
4944 if (de == NULL) {
4945 addReply(c,shared.nullbulk);
4946 } else {
4947 robj *ele = dictGetEntryKey(de);
4948
4949 addReplyBulk(c,ele);
4950 dictDelete(set->ptr,ele);
4951 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4952 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4953 server.dirty++;
4954 }
4955 }
4956
4957 static void srandmemberCommand(redisClient *c) {
4958 robj *set;
4959 dictEntry *de;
4960
4961 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4962 checkType(c,set,REDIS_SET)) return;
4963
4964 de = dictGetRandomKey(set->ptr);
4965 if (de == NULL) {
4966 addReply(c,shared.nullbulk);
4967 } else {
4968 robj *ele = dictGetEntryKey(de);
4969
4970 addReplyBulk(c,ele);
4971 }
4972 }
4973
4974 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4975 dict **d1 = (void*) s1, **d2 = (void*) s2;
4976
4977 return dictSize(*d1)-dictSize(*d2);
4978 }
4979
4980 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4981 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4982 dictIterator *di;
4983 dictEntry *de;
4984 robj *lenobj = NULL, *dstset = NULL;
4985 unsigned long j, cardinality = 0;
4986
4987 for (j = 0; j < setsnum; j++) {
4988 robj *setobj;
4989
4990 setobj = dstkey ?
4991 lookupKeyWrite(c->db,setskeys[j]) :
4992 lookupKeyRead(c->db,setskeys[j]);
4993 if (!setobj) {
4994 zfree(dv);
4995 if (dstkey) {
4996 if (deleteKey(c->db,dstkey))
4997 server.dirty++;
4998 addReply(c,shared.czero);
4999 } else {
5000 addReply(c,shared.emptymultibulk);
5001 }
5002 return;
5003 }
5004 if (setobj->type != REDIS_SET) {
5005 zfree(dv);
5006 addReply(c,shared.wrongtypeerr);
5007 return;
5008 }
5009 dv[j] = setobj->ptr;
5010 }
5011 /* Sort sets from the smallest to largest, this will improve our
5012 * algorithm's performace */
5013 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5014
5015 /* The first thing we should output is the total number of elements...
5016 * since this is a multi-bulk write, but at this stage we don't know
5017 * the intersection set size, so we use a trick, append an empty object
5018 * to the output list and save the pointer to later modify it with the
5019 * right length */
5020 if (!dstkey) {
5021 lenobj = createObject(REDIS_STRING,NULL);
5022 addReply(c,lenobj);
5023 decrRefCount(lenobj);
5024 } else {
5025 /* If we have a target key where to store the resulting set
5026 * create this key with an empty set inside */
5027 dstset = createSetObject();
5028 }
5029
5030 /* Iterate all the elements of the first (smallest) set, and test
5031 * the element against all the other sets, if at least one set does
5032 * not include the element it is discarded */
5033 di = dictGetIterator(dv[0]);
5034
5035 while((de = dictNext(di)) != NULL) {
5036 robj *ele;
5037
5038 for (j = 1; j < setsnum; j++)
5039 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5040 if (j != setsnum)
5041 continue; /* at least one set does not contain the member */
5042 ele = dictGetEntryKey(de);
5043 if (!dstkey) {
5044 addReplyBulk(c,ele);
5045 cardinality++;
5046 } else {
5047 dictAdd(dstset->ptr,ele,NULL);
5048 incrRefCount(ele);
5049 }
5050 }
5051 dictReleaseIterator(di);
5052
5053 if (dstkey) {
5054 /* Store the resulting set into the target, if the intersection
5055 * is not an empty set. */
5056 deleteKey(c->db,dstkey);
5057 if (dictSize((dict*)dstset->ptr) > 0) {
5058 dictAdd(c->db->dict,dstkey,dstset);
5059 incrRefCount(dstkey);
5060 addReplyLong(c,dictSize((dict*)dstset->ptr));
5061 } else {
5062 decrRefCount(dstset);
5063 addReply(c,shared.czero);
5064 }
5065 server.dirty++;
5066 } else {
5067 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5068 }
5069 zfree(dv);
5070 }
5071
5072 static void sinterCommand(redisClient *c) {
5073 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5074 }
5075
5076 static void sinterstoreCommand(redisClient *c) {
5077 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5078 }
5079
/* Operation selectors. REDIS_OP_UNION and REDIS_OP_DIFF are the values
 * passed as the 'op' argument of sunionDiffGenericCommand().
 * NOTE(review): REDIS_OP_INTER is presumably used by the sorted set
 * union/intersection code -- confirm against zunionInterGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5083
5084 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5085 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5086 dictIterator *di;
5087 dictEntry *de;
5088 robj *dstset = NULL;
5089 int j, cardinality = 0;
5090
5091 for (j = 0; j < setsnum; j++) {
5092 robj *setobj;
5093
5094 setobj = dstkey ?
5095 lookupKeyWrite(c->db,setskeys[j]) :
5096 lookupKeyRead(c->db,setskeys[j]);
5097 if (!setobj) {
5098 dv[j] = NULL;
5099 continue;
5100 }
5101 if (setobj->type != REDIS_SET) {
5102 zfree(dv);
5103 addReply(c,shared.wrongtypeerr);
5104 return;
5105 }
5106 dv[j] = setobj->ptr;
5107 }
5108
5109 /* We need a temp set object to store our union. If the dstkey
5110 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5111 * this set object will be the resulting object to set into the target key*/
5112 dstset = createSetObject();
5113
5114 /* Iterate all the elements of all the sets, add every element a single
5115 * time to the result set */
5116 for (j = 0; j < setsnum; j++) {
5117 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5118 if (!dv[j]) continue; /* non existing keys are like empty sets */
5119
5120 di = dictGetIterator(dv[j]);
5121
5122 while((de = dictNext(di)) != NULL) {
5123 robj *ele;
5124
5125 /* dictAdd will not add the same element multiple times */
5126 ele = dictGetEntryKey(de);
5127 if (op == REDIS_OP_UNION || j == 0) {
5128 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5129 incrRefCount(ele);
5130 cardinality++;
5131 }
5132 } else if (op == REDIS_OP_DIFF) {
5133 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5134 cardinality--;
5135 }
5136 }
5137 }
5138 dictReleaseIterator(di);
5139
5140 /* result set is empty? Exit asap. */
5141 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5142 }
5143
5144 /* Output the content of the resulting set, if not in STORE mode */
5145 if (!dstkey) {
5146 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5147 di = dictGetIterator(dstset->ptr);
5148 while((de = dictNext(di)) != NULL) {
5149 robj *ele;
5150
5151 ele = dictGetEntryKey(de);
5152 addReplyBulk(c,ele);
5153 }
5154 dictReleaseIterator(di);
5155 decrRefCount(dstset);
5156 } else {
5157 /* If we have a target key where to store the resulting set
5158 * create this key with the result set inside */
5159 deleteKey(c->db,dstkey);
5160 if (dictSize((dict*)dstset->ptr) > 0) {
5161 dictAdd(c->db->dict,dstkey,dstset);
5162 incrRefCount(dstkey);
5163 addReplyLong(c,dictSize((dict*)dstset->ptr));
5164 } else {
5165 decrRefCount(dstset);
5166 addReply(c,shared.czero);
5167 }
5168 server.dirty++;
5169 }
5170 zfree(dv);
5171 }
5172
5173 static void sunionCommand(redisClient *c) {
5174 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5175 }
5176
5177 static void sunionstoreCommand(redisClient *c) {
5178 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5179 }
5180
5181 static void sdiffCommand(redisClient *c) {
5182 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5183 }
5184
5185 static void sdiffstoreCommand(redisClient *c) {
5186 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5187 }
5188
5189 /* ==================================== ZSets =============================== */
5190
5191 /* ZSETs are ordered sets using two data structures to hold the same elements
5192 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5193 * data structure.
5194 *
 * The elements are added to a hash table mapping Redis objects to scores.
5196 * At the same time the elements are added to a skip list mapping scores
5197 * to Redis objects (so objects are sorted by scores in this "view"). */
5198
5199 /* This skiplist implementation is almost a C translation of the original
5200 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5201 * Alternative to Balanced Trees", modified in three ways:
5202 * a) this implementation allows for repeated values.
5203 * b) the comparison is not just by key (our 'score') but by satellite data.
5204 * c) there is a back pointer, so it's a doubly linked list with the back
5205 * pointers being only at "level 1". This allows to traverse the list
5206 * from tail to head, useful for ZREVRANGE. */
5207
5208 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5209 zskiplistNode *zn = zmalloc(sizeof(*zn));
5210
5211 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5212 if (level > 0)
5213 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5214 zn->score = score;
5215 zn->obj = obj;
5216 return zn;
5217 }
5218
5219 static zskiplist *zslCreate(void) {
5220 int j;
5221 zskiplist *zsl;
5222
5223 zsl = zmalloc(sizeof(*zsl));
5224 zsl->level = 1;
5225 zsl->length = 0;
5226 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5227 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5228 zsl->header->forward[j] = NULL;
5229
5230 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5231 if (j < ZSKIPLIST_MAXLEVEL-1)
5232 zsl->header->span[j] = 0;
5233 }
5234 zsl->header->backward = NULL;
5235 zsl->tail = NULL;
5236 return zsl;
5237 }
5238
5239 static void zslFreeNode(zskiplistNode *node) {
5240 decrRefCount(node->obj);
5241 zfree(node->forward);
5242 zfree(node->span);
5243 zfree(node);
5244 }
5245
5246 static void zslFree(zskiplist *zsl) {
5247 zskiplistNode *node = zsl->header->forward[0], *next;
5248
5249 zfree(zsl->header->forward);
5250 zfree(zsl->header->span);
5251 zfree(zsl->header);
5252 while(node) {
5253 next = node->forward[0];
5254 zslFreeNode(node);
5255 node = next;
5256 }
5257 zfree(zsl);
5258 }
5259
5260 static int zslRandomLevel(void) {
5261 int level = 1;
5262 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5263 level += 1;
5264 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5265 }
5266
5267 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5268 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5269 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5270 int i, level;
5271
5272 x = zsl->header;
5273 for (i = zsl->level-1; i >= 0; i--) {
5274 /* store rank that is crossed to reach the insert position */
5275 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5276
5277 while (x->forward[i] &&
5278 (x->forward[i]->score < score ||
5279 (x->forward[i]->score == score &&
5280 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5281 rank[i] += i > 0 ? x->span[i-1] : 1;
5282 x = x->forward[i];
5283 }
5284 update[i] = x;
5285 }
5286 /* we assume the key is not already inside, since we allow duplicated
5287 * scores, and the re-insertion of score and redis object should never
5288 * happpen since the caller of zslInsert() should test in the hash table
5289 * if the element is already inside or not. */
5290 level = zslRandomLevel();
5291 if (level > zsl->level) {
5292 for (i = zsl->level; i < level; i++) {
5293 rank[i] = 0;
5294 update[i] = zsl->header;
5295 update[i]->span[i-1] = zsl->length;
5296 }
5297 zsl->level = level;
5298 }
5299 x = zslCreateNode(level,score,obj);
5300 for (i = 0; i < level; i++) {
5301 x->forward[i] = update[i]->forward[i];
5302 update[i]->forward[i] = x;
5303
5304 /* update span covered by update[i] as x is inserted here */
5305 if (i > 0) {
5306 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5307 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5308 }
5309 }
5310
5311 /* increment span for untouched levels */
5312 for (i = level; i < zsl->level; i++) {
5313 update[i]->span[i-1]++;
5314 }
5315
5316 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5317 if (x->forward[0])
5318 x->forward[0]->backward = x;
5319 else
5320 zsl->tail = x;
5321 zsl->length++;
5322 }
5323
5324 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5325 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5326 int i;
5327 for (i = 0; i < zsl->level; i++) {
5328 if (update[i]->forward[i] == x) {
5329 if (i > 0) {
5330 update[i]->span[i-1] += x->span[i-1] - 1;
5331 }
5332 update[i]->forward[i] = x->forward[i];
5333 } else {
5334 /* invariant: i > 0, because update[0]->forward[0]
5335 * is always equal to x */
5336 update[i]->span[i-1] -= 1;
5337 }
5338 }
5339 if (x->forward[0]) {
5340 x->forward[0]->backward = x->backward;
5341 } else {
5342 zsl->tail = x->backward;
5343 }
5344 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5345 zsl->level--;
5346 zsl->length--;
5347 }
5348
5349 /* Delete an element with matching score/object from the skiplist. */
5350 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5351 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5352 int i;
5353
5354 x = zsl->header;
5355 for (i = zsl->level-1; i >= 0; i--) {
5356 while (x->forward[i] &&
5357 (x->forward[i]->score < score ||
5358 (x->forward[i]->score == score &&
5359 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5360 x = x->forward[i];
5361 update[i] = x;
5362 }
5363 /* We may have multiple elements with the same score, what we need
5364 * is to find the element with both the right score and object. */
5365 x = x->forward[0];
5366 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5367 zslDeleteNode(zsl, x, update);
5368 zslFreeNode(x);
5369 return 1;
5370 } else {
5371 return 0; /* not found */
5372 }
5373 return 0; /* not found */
5374 }
5375
5376 /* Delete all the elements with score between min and max from the skiplist.
5377 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5378 * Note that this function takes the reference to the hash table view of the
5379 * sorted set, in order to remove the elements from the hash table too. */
5380 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5381 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5382 unsigned long removed = 0;
5383 int i;
5384
5385 x = zsl->header;
5386 for (i = zsl->level-1; i >= 0; i--) {
5387 while (x->forward[i] && x->forward[i]->score < min)
5388 x = x->forward[i];
5389 update[i] = x;
5390 }
5391 /* We may have multiple elements with the same score, what we need
5392 * is to find the element with both the right score and object. */
5393 x = x->forward[0];
5394 while (x && x->score <= max) {
5395 zskiplistNode *next = x->forward[0];
5396 zslDeleteNode(zsl, x, update);
5397 dictDelete(dict,x->obj);
5398 zslFreeNode(x);
5399 removed++;
5400 x = next;
5401 }
5402 return removed; /* not found */
5403 }
5404
5405 /* Delete all the elements with rank between start and end from the skiplist.
5406 * Start and end are inclusive. Note that start and end need to be 1-based */
5407 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5408 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5409 unsigned long traversed = 0, removed = 0;
5410 int i;
5411
5412 x = zsl->header;
5413 for (i = zsl->level-1; i >= 0; i--) {
5414 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5415 traversed += i > 0 ? x->span[i-1] : 1;
5416 x = x->forward[i];
5417 }
5418 update[i] = x;
5419 }
5420
5421 traversed++;
5422 x = x->forward[0];
5423 while (x && traversed <= end) {
5424 zskiplistNode *next = x->forward[0];
5425 zslDeleteNode(zsl, x, update);
5426 dictDelete(dict,x->obj);
5427 zslFreeNode(x);
5428 removed++;
5429 traversed++;
5430 x = next;
5431 }
5432 return removed;
5433 }
5434
5435 /* Find the first node having a score equal or greater than the specified one.
5436 * Returns NULL if there is no match. */
5437 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5438 zskiplistNode *x;
5439 int i;
5440
5441 x = zsl->header;
5442 for (i = zsl->level-1; i >= 0; i--) {
5443 while (x->forward[i] && x->forward[i]->score < score)
5444 x = x->forward[i];
5445 }
5446 /* We may have multiple elements with the same score, what we need
5447 * is to find the element with both the right score and object. */
5448 return x->forward[0];
5449 }
5450
5451 /* Find the rank for an element by both score and key.
5452 * Returns 0 when the element cannot be found, rank otherwise.
5453 * Note that the rank is 1-based due to the span of zsl->header to the
5454 * first element. */
5455 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5456 zskiplistNode *x;
5457 unsigned long rank = 0;
5458 int i;
5459
5460 x = zsl->header;
5461 for (i = zsl->level-1; i >= 0; i--) {
5462 while (x->forward[i] &&
5463 (x->forward[i]->score < score ||
5464 (x->forward[i]->score == score &&
5465 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5466 rank += i > 0 ? x->span[i-1] : 1;
5467 x = x->forward[i];
5468 }
5469
5470 /* x might be equal to zsl->header, so test if obj is non-NULL */
5471 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5472 return rank;
5473 }
5474 }
5475 return 0;
5476 }
5477
5478 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5479 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5480 zskiplistNode *x;
5481 unsigned long traversed = 0;
5482 int i;
5483
5484 x = zsl->header;
5485 for (i = zsl->level-1; i >= 0; i--) {
5486 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5487 {
5488 traversed += i > 0 ? x->span[i-1] : 1;
5489 x = x->forward[i];
5490 }
5491 if (traversed == rank) {
5492 return x;
5493 }
5494 }
5495 return NULL;
5496 }
5497
5498 /* The actual Z-commands implementations */
5499
5500 /* This generic command implements both ZADD and ZINCRBY.
5501 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5502 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5503 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5504 robj *zsetobj;
5505 zset *zs;
5506 double *score;
5507
5508 zsetobj = lookupKeyWrite(c->db,key);
5509 if (zsetobj == NULL) {
5510 zsetobj = createZsetObject();
5511 dictAdd(c->db->dict,key,zsetobj);
5512 incrRefCount(key);
5513 } else {
5514 if (zsetobj->type != REDIS_ZSET) {
5515 addReply(c,shared.wrongtypeerr);
5516 return;
5517 }
5518 }
5519 zs = zsetobj->ptr;
5520
5521 /* Ok now since we implement both ZADD and ZINCRBY here the code
5522 * needs to handle the two different conditions. It's all about setting
5523 * '*score', that is, the new score to set, to the right value. */
5524 score = zmalloc(sizeof(double));
5525 if (doincrement) {
5526 dictEntry *de;
5527
5528 /* Read the old score. If the element was not present starts from 0 */
5529 de = dictFind(zs->dict,ele);
5530 if (de) {
5531 double *oldscore = dictGetEntryVal(de);
5532 *score = *oldscore + scoreval;
5533 } else {
5534 *score = scoreval;
5535 }
5536 } else {
5537 *score = scoreval;
5538 }
5539
5540 /* What follows is a simple remove and re-insert operation that is common
5541 * to both ZADD and ZINCRBY... */
5542 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5543 /* case 1: New element */
5544 incrRefCount(ele); /* added to hash */
5545 zslInsert(zs->zsl,*score,ele);
5546 incrRefCount(ele); /* added to skiplist */
5547 server.dirty++;
5548 if (doincrement)
5549 addReplyDouble(c,*score);
5550 else
5551 addReply(c,shared.cone);
5552 } else {
5553 dictEntry *de;
5554 double *oldscore;
5555
5556 /* case 2: Score update operation */
5557 de = dictFind(zs->dict,ele);
5558 redisAssert(de != NULL);
5559 oldscore = dictGetEntryVal(de);
5560 if (*score != *oldscore) {
5561 int deleted;
5562
5563 /* Remove and insert the element in the skip list with new score */
5564 deleted = zslDelete(zs->zsl,*oldscore,ele);
5565 redisAssert(deleted != 0);
5566 zslInsert(zs->zsl,*score,ele);
5567 incrRefCount(ele);
5568 /* Update the score in the hash table */
5569 dictReplace(zs->dict,ele,score);
5570 server.dirty++;
5571 } else {
5572 zfree(score);
5573 }
5574 if (doincrement)
5575 addReplyDouble(c,*score);
5576 else
5577 addReply(c,shared.czero);
5578 }
5579 }
5580
5581 static void zaddCommand(redisClient *c) {
5582 double scoreval;
5583
5584 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5585 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5586 }
5587
5588 static void zincrbyCommand(redisClient *c) {
5589 double scoreval;
5590
5591 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5592 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5593 }
5594
5595 static void zremCommand(redisClient *c) {
5596 robj *zsetobj;
5597 zset *zs;
5598 dictEntry *de;
5599 double *oldscore;
5600 int deleted;
5601
5602 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5603 checkType(c,zsetobj,REDIS_ZSET)) return;
5604
5605 zs = zsetobj->ptr;
5606 de = dictFind(zs->dict,c->argv[2]);
5607 if (de == NULL) {
5608 addReply(c,shared.czero);
5609 return;
5610 }
5611 /* Delete from the skiplist */
5612 oldscore = dictGetEntryVal(de);
5613 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5614 redisAssert(deleted != 0);
5615
5616 /* Delete from the hash table */
5617 dictDelete(zs->dict,c->argv[2]);
5618 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5619 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5620 server.dirty++;
5621 addReply(c,shared.cone);
5622 }
5623
5624 static void zremrangebyscoreCommand(redisClient *c) {
5625 double min;
5626 double max;
5627 long deleted;
5628 robj *zsetobj;
5629 zset *zs;
5630
5631 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5632 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5633
5634 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5635 checkType(c,zsetobj,REDIS_ZSET)) return;
5636
5637 zs = zsetobj->ptr;
5638 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5639 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5640 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5641 server.dirty += deleted;
5642 addReplyLong(c,deleted);
5643 }
5644
5645 static void zremrangebyrankCommand(redisClient *c) {
5646 long start;
5647 long end;
5648 int llen;
5649 long deleted;
5650 robj *zsetobj;
5651 zset *zs;
5652
5653 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5654 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5655
5656 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5657 checkType(c,zsetobj,REDIS_ZSET)) return;
5658 zs = zsetobj->ptr;
5659 llen = zs->zsl->length;
5660
5661 /* convert negative indexes */
5662 if (start < 0) start = llen+start;
5663 if (end < 0) end = llen+end;
5664 if (start < 0) start = 0;
5665 if (end < 0) end = 0;
5666
5667 /* indexes sanity checks */
5668 if (start > end || start >= llen) {
5669 addReply(c,shared.czero);
5670 return;
5671 }
5672 if (end >= llen) end = llen-1;
5673
5674 /* increment start and end because zsl*Rank functions
5675 * use 1-based rank */
5676 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5677 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5678 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5679 server.dirty += deleted;
5680 addReplyLong(c, deleted);
5681 }
5682
5683 typedef struct {
5684 dict *dict;
5685 double weight;
5686 } zsetopsrc;
5687
5688 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5689 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5690 unsigned long size1, size2;
5691 size1 = d1->dict ? dictSize(d1->dict) : 0;
5692 size2 = d2->dict ? dictSize(d2->dict) : 0;
5693 return size1 - size2;
5694 }
5695
/* Aggregation modes accepted by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': add it (SUM), or keep
 * the smaller (MIN) or larger (MAX) of the two values. Any other mode ends
 * in redisPanic(). */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
        break;
    }
}
5712
5713 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5714 int i, j, zsetnum;
5715 int aggregate = REDIS_AGGR_SUM;
5716 zsetopsrc *src;
5717 robj *dstobj;
5718 zset *dstzset;
5719 dictIterator *di;
5720 dictEntry *de;
5721
5722 /* expect zsetnum input keys to be given */
5723 zsetnum = atoi(c->argv[2]->ptr);
5724 if (zsetnum < 1) {
5725 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5726 return;
5727 }
5728
5729 /* test if the expected number of keys would overflow */
5730 if (3+zsetnum > c->argc) {
5731 addReply(c,shared.syntaxerr);
5732 return;
5733 }
5734
5735 /* read keys to be used for input */
5736 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5737 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5738 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5739 if (!zsetobj) {
5740 src[i].dict = NULL;
5741 } else {
5742 if (zsetobj->type != REDIS_ZSET) {
5743 zfree(src);
5744 addReply(c,shared.wrongtypeerr);
5745 return;
5746 }
5747 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5748 }
5749
5750 /* default all weights to 1 */
5751 src[i].weight = 1.0;
5752 }
5753
5754 /* parse optional extra arguments */
5755 if (j < c->argc) {
5756 int remaining = c->argc - j;
5757
5758 while (remaining) {
5759 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5760 j++; remaining--;
5761 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5762 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5763 return;
5764 }
5765 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5766 j++; remaining--;
5767 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5768 aggregate = REDIS_AGGR_SUM;
5769 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5770 aggregate = REDIS_AGGR_MIN;
5771 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5772 aggregate = REDIS_AGGR_MAX;
5773 } else {
5774 zfree(src);
5775 addReply(c,shared.syntaxerr);
5776 return;
5777 }
5778 j++; remaining--;
5779 } else {
5780 zfree(src);
5781 addReply(c,shared.syntaxerr);
5782 return;
5783 }
5784 }
5785 }
5786
5787 /* sort sets from the smallest to largest, this will improve our
5788 * algorithm's performance */
5789 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5790
5791 dstobj = createZsetObject();
5792 dstzset = dstobj->ptr;
5793
5794 if (op == REDIS_OP_INTER) {
5795 /* skip going over all entries if the smallest zset is NULL or empty */
5796 if (src[0].dict && dictSize(src[0].dict) > 0) {
5797 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5798 * from small to large, all src[i > 0].dict are non-empty too */
5799 di = dictGetIterator(src[0].dict);
5800 while((de = dictNext(di)) != NULL) {
5801 double *score = zmalloc(sizeof(double)), value;
5802 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5803
5804 for (j = 1; j < zsetnum; j++) {
5805 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5806 if (other) {
5807 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5808 zunionInterAggregate(score, value, aggregate);
5809 } else {
5810 break;
5811 }
5812 }
5813
5814 /* skip entry when not present in every source dict */
5815 if (j != zsetnum) {
5816 zfree(score);
5817 } else {
5818 robj *o = dictGetEntryKey(de);
5819 dictAdd(dstzset->dict,o,score);
5820 incrRefCount(o); /* added to dictionary */
5821 zslInsert(dstzset->zsl,*score,o);
5822 incrRefCount(o); /* added to skiplist */
5823 }
5824 }
5825 dictReleaseIterator(di);
5826 }
5827 } else if (op == REDIS_OP_UNION) {
5828 for (i = 0; i < zsetnum; i++) {
5829 if (!src[i].dict) continue;
5830
5831 di = dictGetIterator(src[i].dict);
5832 while((de = dictNext(di)) != NULL) {
5833 /* skip key when already processed */
5834 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5835
5836 double *score = zmalloc(sizeof(double)), value;
5837 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5838
5839 /* because the zsets are sorted by size, its only possible
5840 * for sets at larger indices to hold this entry */
5841 for (j = (i+1); j < zsetnum; j++) {
5842 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5843 if (other) {
5844 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5845 zunionInterAggregate(score, value, aggregate);
5846 }
5847 }
5848
5849 robj *o = dictGetEntryKey(de);
5850 dictAdd(dstzset->dict,o,score);
5851 incrRefCount(o); /* added to dictionary */
5852 zslInsert(dstzset->zsl,*score,o);
5853 incrRefCount(o); /* added to skiplist */
5854 }
5855 dictReleaseIterator(di);
5856 }
5857 } else {
5858 /* unknown operator */
5859 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5860 }
5861
5862 deleteKey(c->db,dstkey);
5863 if (dstzset->zsl->length) {
5864 dictAdd(c->db->dict,dstkey,dstobj);
5865 incrRefCount(dstkey);
5866 addReplyLong(c, dstzset->zsl->length);
5867 server.dirty++;
5868 } else {
5869 decrRefCount(dstobj);
5870 addReply(c, shared.czero);
5871 }
5872 zfree(src);
5873 }
5874
5875 static void zunionCommand(redisClient *c) {
5876 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5877 }
5878
5879 static void zinterCommand(redisClient *c) {
5880 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5881 }
5882
5883 static void zrangeGenericCommand(redisClient *c, int reverse) {
5884 robj *o;
5885 long start;
5886 long end;
5887 int withscores = 0;
5888 int llen;
5889 int rangelen, j;
5890 zset *zsetobj;
5891 zskiplist *zsl;
5892 zskiplistNode *ln;
5893 robj *ele;
5894
5895 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5896 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5897
5898 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5899 withscores = 1;
5900 } else if (c->argc >= 5) {
5901 addReply(c,shared.syntaxerr);
5902 return;
5903 }
5904
5905 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5906 || checkType(c,o,REDIS_ZSET)) return;
5907 zsetobj = o->ptr;
5908 zsl = zsetobj->zsl;
5909 llen = zsl->length;
5910
5911 /* convert negative indexes */
5912 if (start < 0) start = llen+start;
5913 if (end < 0) end = llen+end;
5914 if (start < 0) start = 0;
5915 if (end < 0) end = 0;
5916
5917 /* indexes sanity checks */
5918 if (start > end || start >= llen) {
5919 /* Out of range start or start > end result in empty list */
5920 addReply(c,shared.emptymultibulk);
5921 return;
5922 }
5923 if (end >= llen) end = llen-1;
5924 rangelen = (end-start)+1;
5925
5926 /* check if starting point is trivial, before searching
5927 * the element in log(N) time */
5928 if (reverse) {
5929 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5930 } else {
5931 ln = start == 0 ?
5932 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5933 }
5934
5935 /* Return the result in form of a multi-bulk reply */
5936 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5937 withscores ? (rangelen*2) : rangelen));
5938 for (j = 0; j < rangelen; j++) {
5939 ele = ln->obj;
5940 addReplyBulk(c,ele);
5941 if (withscores)
5942 addReplyDouble(c,ln->score);
5943 ln = reverse ? ln->backward : ln->forward[0];
5944 }
5945 }
5946
5947 static void zrangeCommand(redisClient *c) {
5948 zrangeGenericCommand(c,0);
5949 }
5950
5951 static void zrevrangeCommand(redisClient *c) {
5952 zrangeGenericCommand(c,1);
5953 }
5954
5955 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5956 * If justcount is non-zero, just the count is returned. */
5957 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5958 robj *o;
5959 double min, max;
5960 int minex = 0, maxex = 0; /* are min or max exclusive? */
5961 int offset = 0, limit = -1;
5962 int withscores = 0;
5963 int badsyntax = 0;
5964
5965 /* Parse the min-max interval. If one of the values is prefixed
5966 * by the "(" character, it's considered "open". For instance
5967 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5968 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5969 if (((char*)c->argv[2]->ptr)[0] == '(') {
5970 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5971 minex = 1;
5972 } else {
5973 min = strtod(c->argv[2]->ptr,NULL);
5974 }
5975 if (((char*)c->argv[3]->ptr)[0] == '(') {
5976 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5977 maxex = 1;
5978 } else {
5979 max = strtod(c->argv[3]->ptr,NULL);
5980 }
5981
5982 /* Parse "WITHSCORES": note that if the command was called with
5983 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5984 * enter the following paths to parse WITHSCORES and LIMIT. */
5985 if (c->argc == 5 || c->argc == 8) {
5986 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5987 withscores = 1;
5988 else
5989 badsyntax = 1;
5990 }
5991 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5992 badsyntax = 1;
5993 if (badsyntax) {
5994 addReplySds(c,
5995 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5996 return;
5997 }
5998
5999 /* Parse "LIMIT" */
6000 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6001 addReply(c,shared.syntaxerr);
6002 return;
6003 } else if (c->argc == (7 + withscores)) {
6004 offset = atoi(c->argv[5]->ptr);
6005 limit = atoi(c->argv[6]->ptr);
6006 if (offset < 0) offset = 0;
6007 }
6008
6009 /* Ok, lookup the key and get the range */
6010 o = lookupKeyRead(c->db,c->argv[1]);
6011 if (o == NULL) {
6012 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6013 } else {
6014 if (o->type != REDIS_ZSET) {
6015 addReply(c,shared.wrongtypeerr);
6016 } else {
6017 zset *zsetobj = o->ptr;
6018 zskiplist *zsl = zsetobj->zsl;
6019 zskiplistNode *ln;
6020 robj *ele, *lenobj = NULL;
6021 unsigned long rangelen = 0;
6022
6023 /* Get the first node with the score >= min, or with
6024 * score > min if 'minex' is true. */
6025 ln = zslFirstWithScore(zsl,min);
6026 while (minex && ln && ln->score == min) ln = ln->forward[0];
6027
6028 if (ln == NULL) {
6029 /* No element matching the speciifed interval */
6030 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6031 return;
6032 }
6033
6034 /* We don't know in advance how many matching elements there
6035 * are in the list, so we push this object that will represent
6036 * the multi-bulk length in the output buffer, and will "fix"
6037 * it later */
6038 if (!justcount) {
6039 lenobj = createObject(REDIS_STRING,NULL);
6040 addReply(c,lenobj);
6041 decrRefCount(lenobj);
6042 }
6043
6044 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6045 if (offset) {
6046 offset--;
6047 ln = ln->forward[0];
6048 continue;
6049 }
6050 if (limit == 0) break;
6051 if (!justcount) {
6052 ele = ln->obj;
6053 addReplyBulk(c,ele);
6054 if (withscores)
6055 addReplyDouble(c,ln->score);
6056 }
6057 ln = ln->forward[0];
6058 rangelen++;
6059 if (limit > 0) limit--;
6060 }
6061 if (justcount) {
6062 addReplyLong(c,(long)rangelen);
6063 } else {
6064 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6065 withscores ? (rangelen*2) : rangelen);
6066 }
6067 }
6068 }
6069 }
6070
6071 static void zrangebyscoreCommand(redisClient *c) {
6072 genericZrangebyscoreCommand(c,0);
6073 }
6074
6075 static void zcountCommand(redisClient *c) {
6076 genericZrangebyscoreCommand(c,1);
6077 }
6078
6079 static void zcardCommand(redisClient *c) {
6080 robj *o;
6081 zset *zs;
6082
6083 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6084 checkType(c,o,REDIS_ZSET)) return;
6085
6086 zs = o->ptr;
6087 addReplyUlong(c,zs->zsl->length);
6088 }
6089
6090 static void zscoreCommand(redisClient *c) {
6091 robj *o;
6092 zset *zs;
6093 dictEntry *de;
6094
6095 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6096 checkType(c,o,REDIS_ZSET)) return;
6097
6098 zs = o->ptr;
6099 de = dictFind(zs->dict,c->argv[2]);
6100 if (!de) {
6101 addReply(c,shared.nullbulk);
6102 } else {
6103 double *score = dictGetEntryVal(de);
6104
6105 addReplyDouble(c,*score);
6106 }
6107 }
6108
6109 static void zrankGenericCommand(redisClient *c, int reverse) {
6110 robj *o;
6111 zset *zs;
6112 zskiplist *zsl;
6113 dictEntry *de;
6114 unsigned long rank;
6115 double *score;
6116
6117 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6118 checkType(c,o,REDIS_ZSET)) return;
6119
6120 zs = o->ptr;
6121 zsl = zs->zsl;
6122 de = dictFind(zs->dict,c->argv[2]);
6123 if (!de) {
6124 addReply(c,shared.nullbulk);
6125 return;
6126 }
6127
6128 score = dictGetEntryVal(de);
6129 rank = zslGetRank(zsl, *score, c->argv[2]);
6130 if (rank) {
6131 if (reverse) {
6132 addReplyLong(c, zsl->length - rank);
6133 } else {
6134 addReplyLong(c, rank-1);
6135 }
6136 } else {
6137 addReply(c,shared.nullbulk);
6138 }
6139 }
6140
6141 static void zrankCommand(redisClient *c) {
6142 zrankGenericCommand(c, 0);
6143 }
6144
6145 static void zrevrankCommand(redisClient *c) {
6146 zrankGenericCommand(c, 1);
6147 }
6148
6149 /* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which part of a hash entry is wanted. They can be
 * OR-ed together (see genericHgetallCommand). */
#define REDIS_HASH_KEY 1    /* the field name */
#define REDIS_HASH_VALUE 2  /* the field value */
6152
6153 /* Check the length of a number of objects to see if we need to convert a
6154 * zipmap to a real hash. Note that we only check string encoded objects
6155 * as their string length can be queried in constant time. */
6156 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6157 int i;
6158 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6159
6160 for (i = start; i <= end; i++) {
6161 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6162 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6163 {
6164 convertToRealHash(subject);
6165 return;
6166 }
6167 }
6168 }
6169
6170 /* Encode given objects in-place when the hash uses a dict. */
6171 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6172 if (subject->encoding == REDIS_ENCODING_HT) {
6173 if (o1) *o1 = tryObjectEncoding(*o1);
6174 if (o2) *o2 = tryObjectEncoding(*o2);
6175 }
6176 }
6177
6178 /* Get the value from a hash identified by key. Returns either a string
6179 * object or NULL if the value cannot be found. The refcount of the object
6180 * is always increased by 1 when the value was found. */
6181 static robj *hashGet(robj *o, robj *key) {
6182 robj *value = NULL;
6183 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6184 unsigned char *v;
6185 unsigned int vlen;
6186 key = getDecodedObject(key);
6187 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6188 value = createStringObject((char*)v,vlen);
6189 }
6190 decrRefCount(key);
6191 } else {
6192 dictEntry *de = dictFind(o->ptr,key);
6193 if (de != NULL) {
6194 value = dictGetEntryVal(de);
6195 incrRefCount(value);
6196 }
6197 }
6198 return value;
6199 }
6200
6201 /* Test if the key exists in the given hash. Returns 1 if the key
6202 * exists and 0 when it doesn't. */
6203 static int hashExists(robj *o, robj *key) {
6204 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6205 key = getDecodedObject(key);
6206 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6207 decrRefCount(key);
6208 return 1;
6209 }
6210 decrRefCount(key);
6211 } else {
6212 if (dictFind(o->ptr,key) != NULL) {
6213 return 1;
6214 }
6215 }
6216 return 0;
6217 }
6218
6219 /* Add an element, discard the old if the key already exists.
6220 * Return 0 on insert and 1 on update. */
6221 static int hashSet(robj *o, robj *key, robj *value) {
6222 int update = 0;
6223 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6224 key = getDecodedObject(key);
6225 value = getDecodedObject(value);
6226 o->ptr = zipmapSet(o->ptr,
6227 key->ptr,sdslen(key->ptr),
6228 value->ptr,sdslen(value->ptr), &update);
6229 decrRefCount(key);
6230 decrRefCount(value);
6231
6232 /* Check if the zipmap needs to be upgraded to a real hash table */
6233 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6234 convertToRealHash(o);
6235 } else {
6236 if (dictReplace(o->ptr,key,value)) {
6237 /* Insert */
6238 incrRefCount(key);
6239 } else {
6240 /* Update */
6241 update = 1;
6242 }
6243 incrRefCount(value);
6244 }
6245 return update;
6246 }
6247
6248 /* Delete an element from a hash.
6249 * Return 1 on deleted and 0 on not found. */
6250 static int hashDelete(robj *o, robj *key) {
6251 int deleted = 0;
6252 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6253 key = getDecodedObject(key);
6254 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6255 decrRefCount(key);
6256 } else {
6257 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6258 /* Always check if the dictionary needs a resize after a delete. */
6259 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6260 }
6261 return deleted;
6262 }
6263
6264 /* Return the number of elements in a hash. */
6265 static unsigned long hashLength(robj *o) {
6266 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6267 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6268 }
6269
6270 /* Structure to hold hash iteration abstration. Note that iteration over
6271 * hashes involves both fields and values. Because it is possible that
6272 * not both are required, store pointers in the iterator to avoid
6273 * unnecessary memory allocation for fields/values. */
6274 typedef struct {
6275 int encoding;
6276 unsigned char *zi;
6277 unsigned char *zk, *zv;
6278 unsigned int zklen, zvlen;
6279
6280 dictIterator *di;
6281 dictEntry *de;
6282 } hashIterator;
6283
6284 static hashIterator *hashInitIterator(robj *subject) {
6285 hashIterator *hi = zmalloc(sizeof(hashIterator));
6286 hi->encoding = subject->encoding;
6287 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6288 hi->zi = zipmapRewind(subject->ptr);
6289 } else if (hi->encoding == REDIS_ENCODING_HT) {
6290 hi->di = dictGetIterator(subject->ptr);
6291 } else {
6292 redisAssert(NULL);
6293 }
6294 return hi;
6295 }
6296
6297 static void hashReleaseIterator(hashIterator *hi) {
6298 if (hi->encoding == REDIS_ENCODING_HT) {
6299 dictReleaseIterator(hi->di);
6300 }
6301 zfree(hi);
6302 }
6303
6304 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6305 * could be found and REDIS_ERR when the iterator reaches the end. */
6306 static int hashNext(hashIterator *hi) {
6307 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6308 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6309 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6310 } else {
6311 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6312 }
6313 return REDIS_OK;
6314 }
6315
6316 /* Get key or value object at current iteration position.
6317 * This increases the refcount of the field object by 1. */
6318 static robj *hashCurrent(hashIterator *hi, int what) {
6319 robj *o;
6320 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6321 if (what & REDIS_HASH_KEY) {
6322 o = createStringObject((char*)hi->zk,hi->zklen);
6323 } else {
6324 o = createStringObject((char*)hi->zv,hi->zvlen);
6325 }
6326 } else {
6327 if (what & REDIS_HASH_KEY) {
6328 o = dictGetEntryKey(hi->de);
6329 } else {
6330 o = dictGetEntryVal(hi->de);
6331 }
6332 incrRefCount(o);
6333 }
6334 return o;
6335 }
6336
6337 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6338 robj *o = lookupKeyWrite(c->db,key);
6339 if (o == NULL) {
6340 o = createHashObject();
6341 dictAdd(c->db->dict,key,o);
6342 incrRefCount(key);
6343 } else {
6344 if (o->type != REDIS_HASH) {
6345 addReply(c,shared.wrongtypeerr);
6346 return NULL;
6347 }
6348 }
6349 return o;
6350 }
6351
6352 /* ============================= Hash commands ============================== */
6353 static void hsetCommand(redisClient *c) {
6354 int update;
6355 robj *o;
6356
6357 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6358 hashTryConversion(o,c->argv,2,3);
6359 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6360 update = hashSet(o,c->argv[2],c->argv[3]);
6361 addReply(c, update ? shared.czero : shared.cone);
6362 server.dirty++;
6363 }
6364
6365 static void hsetnxCommand(redisClient *c) {
6366 robj *o;
6367 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6368 hashTryConversion(o,c->argv,2,3);
6369
6370 if (hashExists(o, c->argv[2])) {
6371 addReply(c, shared.czero);
6372 } else {
6373 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6374 hashSet(o,c->argv[2],c->argv[3]);
6375 addReply(c, shared.cone);
6376 server.dirty++;
6377 }
6378 }
6379
6380 static void hmsetCommand(redisClient *c) {
6381 int i;
6382 robj *o;
6383
6384 if ((c->argc % 2) == 1) {
6385 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6386 return;
6387 }
6388
6389 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6390 hashTryConversion(o,c->argv,2,c->argc-1);
6391 for (i = 2; i < c->argc; i += 2) {
6392 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6393 hashSet(o,c->argv[i],c->argv[i+1]);
6394 }
6395 addReply(c, shared.ok);
6396 server.dirty++;
6397 }
6398
6399 static void hincrbyCommand(redisClient *c) {
6400 long long value, incr;
6401 robj *o, *current, *new;
6402
6403 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6404 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6405 if ((current = hashGet(o,c->argv[2])) != NULL) {
6406 if (current->encoding == REDIS_ENCODING_RAW)
6407 value = strtoll(current->ptr,NULL,10);
6408 else if (current->encoding == REDIS_ENCODING_INT)
6409 value = (long)current->ptr;
6410 else
6411 redisAssert(1 != 1);
6412 decrRefCount(current);
6413 } else {
6414 value = 0;
6415 }
6416
6417 value += incr;
6418 new = createStringObjectFromLongLong(value);
6419 hashTryObjectEncoding(o,&c->argv[2],NULL);
6420 hashSet(o,c->argv[2],new);
6421 decrRefCount(new);
6422 addReplyLongLong(c,value);
6423 server.dirty++;
6424 }
6425
6426 static void hgetCommand(redisClient *c) {
6427 robj *o, *value;
6428 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6429 checkType(c,o,REDIS_HASH)) return;
6430
6431 if ((value = hashGet(o,c->argv[2])) != NULL) {
6432 addReplyBulk(c,value);
6433 decrRefCount(value);
6434 } else {
6435 addReply(c,shared.nullbulk);
6436 }
6437 }
6438
6439 static void hmgetCommand(redisClient *c) {
6440 int i;
6441 robj *o, *value;
6442 o = lookupKeyRead(c->db,c->argv[1]);
6443 if (o != NULL && o->type != REDIS_HASH) {
6444 addReply(c,shared.wrongtypeerr);
6445 }
6446
6447 /* Note the check for o != NULL happens inside the loop. This is
6448 * done because objects that cannot be found are considered to be
6449 * an empty hash. The reply should then be a series of NULLs. */
6450 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6451 for (i = 2; i < c->argc; i++) {
6452 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6453 addReplyBulk(c,value);
6454 decrRefCount(value);
6455 } else {
6456 addReply(c,shared.nullbulk);
6457 }
6458 }
6459 }
6460
6461 static void hdelCommand(redisClient *c) {
6462 robj *o;
6463 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6464 checkType(c,o,REDIS_HASH)) return;
6465
6466 if (hashDelete(o,c->argv[2])) {
6467 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6468 addReply(c,shared.cone);
6469 server.dirty++;
6470 } else {
6471 addReply(c,shared.czero);
6472 }
6473 }
6474
6475 static void hlenCommand(redisClient *c) {
6476 robj *o;
6477 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6478 checkType(c,o,REDIS_HASH)) return;
6479
6480 addReplyUlong(c,hashLength(o));
6481 }
6482
6483 static void genericHgetallCommand(redisClient *c, int flags) {
6484 robj *o, *lenobj, *obj;
6485 unsigned long count = 0;
6486 hashIterator *hi;
6487
6488 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6489 || checkType(c,o,REDIS_HASH)) return;
6490
6491 lenobj = createObject(REDIS_STRING,NULL);
6492 addReply(c,lenobj);
6493 decrRefCount(lenobj);
6494
6495 hi = hashInitIterator(o);
6496 while (hashNext(hi) != REDIS_ERR) {
6497 if (flags & REDIS_HASH_KEY) {
6498 obj = hashCurrent(hi,REDIS_HASH_KEY);
6499 addReplyBulk(c,obj);
6500 decrRefCount(obj);
6501 count++;
6502 }
6503 if (flags & REDIS_HASH_VALUE) {
6504 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6505 addReplyBulk(c,obj);
6506 decrRefCount(obj);
6507 count++;
6508 }
6509 }
6510 hashReleaseIterator(hi);
6511
6512 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6513 }
6514
6515 static void hkeysCommand(redisClient *c) {
6516 genericHgetallCommand(c,REDIS_HASH_KEY);
6517 }
6518
6519 static void hvalsCommand(redisClient *c) {
6520 genericHgetallCommand(c,REDIS_HASH_VALUE);
6521 }
6522
6523 static void hgetallCommand(redisClient *c) {
6524 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6525 }
6526
6527 static void hexistsCommand(redisClient *c) {
6528 robj *o;
6529 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6530 checkType(c,o,REDIS_HASH)) return;
6531
6532 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6533 }
6534
6535 static void convertToRealHash(robj *o) {
6536 unsigned char *key, *val, *p, *zm = o->ptr;
6537 unsigned int klen, vlen;
6538 dict *dict = dictCreate(&hashDictType,NULL);
6539
6540 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6541 p = zipmapRewind(zm);
6542 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6543 robj *keyobj, *valobj;
6544
6545 keyobj = createStringObject((char*)key,klen);
6546 valobj = createStringObject((char*)val,vlen);
6547 keyobj = tryObjectEncoding(keyobj);
6548 valobj = tryObjectEncoding(valobj);
6549 dictAdd(dict,keyobj,valobj);
6550 }
6551 o->encoding = REDIS_ENCODING_HT;
6552 o->ptr = dict;
6553 zfree(zm);
6554 }
6555
6556 /* ========================= Non type-specific commands ==================== */
6557
6558 static void flushdbCommand(redisClient *c) {
6559 server.dirty += dictSize(c->db->dict);
6560 dictEmpty(c->db->dict);
6561 dictEmpty(c->db->expires);
6562 addReply(c,shared.ok);
6563 }
6564
6565 static void flushallCommand(redisClient *c) {
6566 server.dirty += emptyDb();
6567 addReply(c,shared.ok);
6568 if (server.bgsavechildpid != -1) {
6569 kill(server.bgsavechildpid,SIGKILL);
6570 rdbRemoveTempFile(server.bgsavechildpid);
6571 }
6572 rdbSave(server.dbfilename);
6573 server.dirty++;
6574 }
6575
6576 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6577 redisSortOperation *so = zmalloc(sizeof(*so));
6578 so->type = type;
6579 so->pattern = pattern;
6580 return so;
6581 }
6582
6583 /* Return the value associated to the key with a name obtained
6584 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6585 * The returned object will always have its refcount increased by 1
6586 * when it is non-NULL. */
6587 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6588 char *p, *f;
6589 sds spat, ssub;
6590 robj keyobj, fieldobj, *o;
6591 int prefixlen, sublen, postfixlen, fieldlen;
6592 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6593 struct {
6594 long len;
6595 long free;
6596 char buf[REDIS_SORTKEY_MAX+1];
6597 } keyname, fieldname;
6598
6599 /* If the pattern is "#" return the substitution object itself in order
6600 * to implement the "SORT ... GET #" feature. */
6601 spat = pattern->ptr;
6602 if (spat[0] == '#' && spat[1] == '\0') {
6603 incrRefCount(subst);
6604 return subst;
6605 }
6606
6607 /* The substitution object may be specially encoded. If so we create
6608 * a decoded object on the fly. Otherwise getDecodedObject will just
6609 * increment the ref count, that we'll decrement later. */
6610 subst = getDecodedObject(subst);
6611
6612 ssub = subst->ptr;
6613 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6614 p = strchr(spat,'*');
6615 if (!p) {
6616 decrRefCount(subst);
6617 return NULL;
6618 }
6619
6620 /* Find out if we're dealing with a hash dereference. */
6621 if ((f = strstr(p+1, "->")) != NULL) {
6622 fieldlen = sdslen(spat)-(f-spat);
6623 /* this also copies \0 character */
6624 memcpy(fieldname.buf,f+2,fieldlen-1);
6625 fieldname.len = fieldlen-2;
6626 } else {
6627 fieldlen = 0;
6628 }
6629
6630 prefixlen = p-spat;
6631 sublen = sdslen(ssub);
6632 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6633 memcpy(keyname.buf,spat,prefixlen);
6634 memcpy(keyname.buf+prefixlen,ssub,sublen);
6635 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6636 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6637 keyname.len = prefixlen+sublen+postfixlen;
6638 decrRefCount(subst);
6639
6640 /* Lookup substituted key */
6641 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6642 o = lookupKeyRead(db,&keyobj);
6643 if (o == NULL) return NULL;
6644
6645 if (fieldlen > 0) {
6646 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6647
6648 /* Retrieve value from hash by the field name. This operation
6649 * already increases the refcount of the returned object. */
6650 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6651 o = hashGet(o, &fieldobj);
6652 } else {
6653 if (o->type != REDIS_STRING) return NULL;
6654
6655 /* Every object that this function returns needs to have its refcount
6656 * increased. sortCommand decreases it again. */
6657 incrRefCount(o);
6658 }
6659
6660 return o;
6661 }
6662
6663 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6664 * the additional parameter is not standard but a BSD-specific we have to
6665 * pass sorting parameters via the global 'server' structure */
6666 static int sortCompare(const void *s1, const void *s2) {
6667 const redisSortObject *so1 = s1, *so2 = s2;
6668 int cmp;
6669
6670 if (!server.sort_alpha) {
6671 /* Numeric sorting. Here it's trivial as we precomputed scores */
6672 if (so1->u.score > so2->u.score) {
6673 cmp = 1;
6674 } else if (so1->u.score < so2->u.score) {
6675 cmp = -1;
6676 } else {
6677 cmp = 0;
6678 }
6679 } else {
6680 /* Alphanumeric sorting */
6681 if (server.sort_bypattern) {
6682 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6683 /* At least one compare object is NULL */
6684 if (so1->u.cmpobj == so2->u.cmpobj)
6685 cmp = 0;
6686 else if (so1->u.cmpobj == NULL)
6687 cmp = -1;
6688 else
6689 cmp = 1;
6690 } else {
6691 /* We have both the objects, use strcoll */
6692 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6693 }
6694 } else {
6695 /* Compare elements directly. */
6696 cmp = compareStringObjects(so1->obj,so2->obj);
6697 }
6698 }
6699 return server.sort_desc ? -cmp : cmp;
6700 }
6701
/* The SORT command is the most complex command in Redis. Warning: this code
 * is optimized for speed and a bit less for readability.
 *
 * Flow of the function:
 *  1. Look up the key in argv[1]; it must hold a list, set or sorted set.
 *  2. Parse the options ASC, DESC, ALPHA, LIMIT <start> <count>,
 *     STORE <key>, BY <pattern> and GET <pattern>.
 *  3. Load every element into a vector of redisSortObject.
 *  4. Unless sorting is disabled (constant BY pattern), compute the weight
 *     of every element and sort the vector with sortCompare().
 *  5. Either send the selected range to the client, or (STORE) save it as
 *     a list under the destination key and reply with its length. */
static void sortCommand(redisClient *c) {
    list *operations;
    int outputlen = 0;
    int desc = 0, alpha = 0;
    int limit_start = 0, limit_count = -1, start, end;
    int j, dontsort = 0, vectorlen;
    int getop = 0; /* GET operation counter */
    robj *sortval, *sortby = NULL, *storekey = NULL;
    redisSortObject *vector; /* Resulting vector to sort */

    /* Lookup the key to sort. It must be of the right types.
     * A missing key is reported as an empty multi bulk reply. */
    sortval = lookupKeyRead(c->db,c->argv[1]);
    if (sortval == NULL) {
        addReply(c,shared.emptymultibulk);
        return;
    }
    if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
        sortval->type != REDIS_ZSET)
    {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Create a list of operations to perform for every sorted element.
     * Only GET operations are created by the parser below. The list owns
     * its nodes' values and releases them with zfree. */
    operations = listCreate();
    listSetFreeMethod(operations,zfree);
    j = 2;

    /* Now we need to protect sortval incrementing its count, in the future
     * SORT may have options able to overwrite/delete keys during the sorting
     * and the sorted key itself may get destroyed */
    incrRefCount(sortval);

    /* The SORT command has an SQL-alike syntax, parse it */
    while(j < c->argc) {
        /* Number of arguments that follow argv[j]. */
        int leftargs = c->argc-j-1;
        if (!strcasecmp(c->argv[j]->ptr,"asc")) {
            desc = 0;
        } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
            desc = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
            alpha = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
            /* Parsed with atoi(): no validation of the two values is
             * performed here. */
            limit_start = atoi(c->argv[j+1]->ptr);
            limit_count = atoi(c->argv[j+2]->ptr);
            j+=2;
        } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
            storekey = c->argv[j+1];
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
            sortby = c->argv[j+1];
            /* If the BY pattern does not contain '*', i.e. it is constant,
             * we don't need to sort nor to lookup the weight keys. */
            if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
            listAddNodeTail(operations,createSortOperation(
                REDIS_SORT_GET,c->argv[j+1]));
            getop++;
            j++;
        } else {
            /* Unknown option, or an option missing its arguments: undo
             * what was acquired so far and report a syntax error. */
            decrRefCount(sortval);
            listRelease(operations);
            addReply(c,shared.syntaxerr);
            return;
        }
        j++;
    }

    /* Load the sorting vector with all the objects to sort */
    switch(sortval->type) {
    case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
    case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
    case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
    default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
    }
    vector = zmalloc(sizeof(redisSortObject)*vectorlen);
    j = 0;

    /* The vector entries point at the elements of the sorted value; no
     * reference count is incremented while loading them. */
    if (sortval->type == REDIS_LIST) {
        list *list = sortval->ptr;
        listNode *ln;
        listIter li;

        listRewind(list,&li);
        while((ln = listNext(&li))) {
            robj *ele = ln->value;
            vector[j].obj = ele;
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
    } else {
        dict *set;
        dictIterator *di;
        dictEntry *setele;

        /* Sets and sorted sets are both walked through a dict whose keys
         * are the elements. */
        if (sortval->type == REDIS_SET) {
            set = sortval->ptr;
        } else {
            zset *zs = sortval->ptr;
            set = zs->dict;
        }

        di = dictGetIterator(set);
        while((setele = dictNext(di)) != NULL) {
            vector[j].obj = dictGetEntryKey(setele);
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
        dictReleaseIterator(di);
    }
    redisAssert(j == vectorlen);

    /* Now it's time to load the right scores in the sorting vector */
    if (dontsort == 0) {
        for (j = 0; j < vectorlen; j++) {
            robj *byval;
            if (sortby) {
                /* lookup value to sort by; elements without a weight keep
                 * the defaults assigned while loading the vector. */
                byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
                if (!byval) continue;
            } else {
                /* use object itself to sort by */
                byval = vector[j].obj;
            }

            if (alpha) {
                /* Without BY, sortCompare() compares the elements
                 * directly, so no compare object is needed. */
                if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
            } else {
                if (byval->encoding == REDIS_ENCODING_RAW) {
                    vector[j].u.score = strtod(byval->ptr,NULL);
                } else if (byval->encoding == REDIS_ENCODING_INT) {
                    /* Don't need to decode the object if it's
                     * integer-encoded (the only encoding supported) so
                     * far. We can just cast it */
                    vector[j].u.score = (long)byval->ptr;
                } else {
                    redisAssert(1 != 1);
                }
            }

            /* when the object was retrieved using lookupKeyByPattern,
             * its refcount needs to be decreased. */
            if (sortby) {
                decrRefCount(byval);
            }
        }
    }

    /* We are ready to sort the vector... perform a bit of sanity check
     * on the LIMIT option too. We'll use a partial version of quicksort.
     * After the clamping below, [start,end] is an inclusive range inside
     * the vector, or an empty range (end < start). */
    start = (limit_start < 0) ? 0 : limit_start;
    end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
    if (start >= vectorlen) {
        start = vectorlen-1;
        end = vectorlen-2;
    }
    if (end >= vectorlen) end = vectorlen-1;

    if (dontsort == 0) {
        /* sortCompare() reads its parameters from the global server state. */
        server.sort_desc = desc;
        server.sort_alpha = alpha;
        server.sort_bypattern = sortby ? 1 : 0;
        /* The partial sort is used only with BY and a proper sub-range. */
        if (sortby && (start != 0 || end != vectorlen-1))
            pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
        else
            qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
    }

    /* Send command output to the output buffer, performing the specified
     * GET operations if any. With GET patterns every selected element
     * produces one output item per pattern, otherwise exactly one. */
    outputlen = getop ? getop*(end-start+1) : end-start+1;
    if (storekey == NULL) {
        /* STORE option not specified, send the sorting result to client */
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) addReplyBulk(c,vector[j].obj);
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    if (!val) {
                        addReply(c,shared.nullbulk);
                    } else {
                        addReplyBulk(c,val);
                        /* Drop the reference taken by the lookup. */
                        decrRefCount(val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
    } else {
        robj *listObject = createListObject();
        list *listPtr = (list*) listObject->ptr;

        /* STORE option specified, set the sorting result as a List object */
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) {
                listAddNodeTail(listPtr,vector[j].obj);
                incrRefCount(vector[j].obj);
            }
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    if (!val) {
                        /* A missing value is stored as an empty string. */
                        listAddNodeTail(listPtr,createStringObject("",0));
                    } else {
                        /* We should do a incrRefCount on val because it is
                         * added to the list, but also a decrRefCount because
                         * it is returned by lookupKeyByPattern. This results
                         * in doing nothing at all. */
                        listAddNodeTail(listPtr,val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
        /* NOTE(review): presumably dictReplace() returns non-zero when the
         * key was newly added, in which case the dict holds a new reference
         * to storekey -- confirm against the dict implementation. */
        if (dictReplace(c->db->dict,storekey,listObject)) {
            incrRefCount(storekey);
        }
        /* Note: we add 1 because the DB is dirty anyway since even if the
         * SORT result is empty a new key is set and maybe the old content
         * replaced. */
        server.dirty += 1+outputlen;
        addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
    }

    /* Cleanup: release the reference taken on sortval, the operations
     * list, and the compare objects created for ALPHA + BY sorting. */
    decrRefCount(sortval);
    listRelease(operations);
    for (j = 0; j < vectorlen; j++) {
        if (alpha && vector[j].u.cmpobj)
            decrRefCount(vector[j].u.cmpobj);
    }
    zfree(vector);
}
6958
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the destination buffer; it must be large enough for the result
 * (at most a 20 digit number plus the unit suffix and the terminator).
 * 'n' is the amount of bytes to format.
 *
 * Every possible value of 'n' produces output: amounts of 1 TiB and above
 * are printed with the T suffix and amounts of 1 PiB and above with the P
 * suffix, so the buffer is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        sprintf(s,"%.2fK",(double)n/kib);
    } else if (n < gib) {
        sprintf(s,"%.2fM",(double)n/mib);
    } else if (n < tib) {
        sprintf(s,"%.2fG",(double)n/gib);
    } else if (n < pib) {
        sprintf(s,"%.2fT",(double)n/tib);
    } else {
        /* Largest unit: covers everything up to ULLONG_MAX. */
        sprintf(s,"%.2fP",(double)n/pib);
    }
}
6979
/* Create the string returned by the INFO command. This is decoupled
 * from the INFO command itself as we need to report the same information
 * on memory corruption problems.
 *
 * The result is a newly created sds string made of "field:value\r\n"
 * lines: a general section, a replication section when this instance has
 * a master configured, a virtual memory section when VM is enabled, and
 * one "dbN:keys=...,expires=..." line for every non-empty database.
 * The string is returned to the caller, which decides how to dispose of it. */
static sds genRedisInfoString(void) {
    sds info;
    time_t uptime = time(NULL)-server.stat_starttime;
    int j;
    char hmem[64];

    /* Human readable version of the used memory figure. */
    bytesToHuman(hmem,zmalloc_used_memory());
    /* General section. The arguments below must stay in the same order as
     * the conversion specifiers of the format string. */
    info = sdscatprintf(sdsempty(),
        "redis_version:%s\r\n"
        "arch_bits:%s\r\n"
        "multiplexing_api:%s\r\n"
        "process_id:%ld\r\n"
        "uptime_in_seconds:%ld\r\n"
        "uptime_in_days:%ld\r\n"
        "connected_clients:%d\r\n"
        "connected_slaves:%d\r\n"
        "blocked_clients:%d\r\n"
        "used_memory:%zu\r\n"
        "used_memory_human:%s\r\n"
        "changes_since_last_save:%lld\r\n"
        "bgsave_in_progress:%d\r\n"
        "last_save_time:%ld\r\n"
        "bgrewriteaof_in_progress:%d\r\n"
        "total_connections_received:%lld\r\n"
        "total_commands_processed:%lld\r\n"
        "expired_keys:%lld\r\n"
        "hash_max_zipmap_entries:%ld\r\n"
        "hash_max_zipmap_value:%ld\r\n"
        "pubsub_channels:%ld\r\n"
        "pubsub_patterns:%u\r\n"
        "vm_enabled:%d\r\n"
        "role:%s\r\n"
        ,REDIS_VERSION,
        (sizeof(long) == 8) ? "64" : "32",
        aeGetApiName(),
        (long) getpid(),
        uptime,
        uptime/(3600*24),
        /* Slaves are also in the clients list: count them separately. */
        listLength(server.clients)-listLength(server.slaves),
        listLength(server.slaves),
        server.blpop_blocked_clients,
        zmalloc_used_memory(),
        hmem,
        server.dirty,
        server.bgsavechildpid != -1,
        server.lastsave,
        server.bgrewritechildpid != -1,
        server.stat_numconnections,
        server.stat_numcommands,
        server.stat_expiredkeys,
        server.hash_max_zipmap_entries,
        server.hash_max_zipmap_value,
        dictSize(server.pubsub_channels),
        listLength(server.pubsub_patterns),
        server.vm_enabled != 0,
        server.masterhost == NULL ? "master" : "slave"
    );
    /* Replication section, only when a master host is configured. */
    if (server.masterhost) {
        info = sdscatprintf(info,
            "master_host:%s\r\n"
            "master_port:%d\r\n"
            "master_link_status:%s\r\n"
            "master_last_io_seconds_ago:%d\r\n"
            ,server.masterhost,
            server.masterport,
            (server.replstate == REDIS_REPL_CONNECTED) ?
                "up" : "down",
            /* -1 is reported when there is no master client object. */
            server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
        );
    }
    /* Virtual memory section. The figures are read between
     * lockThreadedIO() and unlockThreadedIO(). */
    if (server.vm_enabled) {
        lockThreadedIO();
        info = sdscatprintf(info,
            "vm_conf_max_memory:%llu\r\n"
            "vm_conf_page_size:%llu\r\n"
            "vm_conf_pages:%llu\r\n"
            "vm_stats_used_pages:%llu\r\n"
            "vm_stats_swapped_objects:%llu\r\n"
            "vm_stats_swappin_count:%llu\r\n"
            "vm_stats_swappout_count:%llu\r\n"
            "vm_stats_io_newjobs_len:%lu\r\n"
            "vm_stats_io_processing_len:%lu\r\n"
            "vm_stats_io_processed_len:%lu\r\n"
            "vm_stats_io_active_threads:%lu\r\n"
            "vm_stats_blocked_clients:%lu\r\n"
            ,(unsigned long long) server.vm_max_memory,
            (unsigned long long) server.vm_page_size,
            (unsigned long long) server.vm_pages,
            (unsigned long long) server.vm_stats_used_pages,
            (unsigned long long) server.vm_stats_swapped_objects,
            (unsigned long long) server.vm_stats_swapins,
            (unsigned long long) server.vm_stats_swapouts,
            (unsigned long) listLength(server.io_newjobs),
            (unsigned long) listLength(server.io_processing),
            (unsigned long) listLength(server.io_processed),
            (unsigned long) server.io_active_threads,
            (unsigned long) server.vm_blocked_clients
        );
        unlockThreadedIO();
    }
    /* Per-database key counts; databases with no keys and no expires
     * are omitted. */
    for (j = 0; j < server.dbnum; j++) {
        long long keys, vkeys;

        keys = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (keys || vkeys) {
            info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
                j, keys, vkeys);
        }
    }
    return info;
}
7095
7096 static void infoCommand(redisClient *c) {
7097 sds info = genRedisInfoString();
7098 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7099 (unsigned long)sdslen(info)));
7100 addReplySds(c,info);
7101 addReply(c,shared.crlf);
7102 }
7103
7104 static void monitorCommand(redisClient *c) {
7105 /* ignore MONITOR if aleady slave or in monitor mode */
7106 if (c->flags & REDIS_SLAVE) return;
7107
7108 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7109 c->slaveseldb = 0;
7110 listAddNodeTail(server.monitors,c);
7111 addReply(c,shared.ok);
7112 }
7113
7114 /* ================================= Expire ================================= */
7115 static int removeExpire(redisDb *db, robj *key) {
7116 if (dictDelete(db->expires,key) == DICT_OK) {
7117 return 1;
7118 } else {
7119 return 0;
7120 }
7121 }
7122
7123 static int setExpire(redisDb *db, robj *key, time_t when) {
7124 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7125 return 0;
7126 } else {
7127 incrRefCount(key);
7128 return 1;
7129 }
7130 }
7131
7132 /* Return the expire time of the specified key, or -1 if no expire
7133 * is associated with this key (i.e. the key is non volatile) */
7134 static time_t getExpire(redisDb *db, robj *key) {
7135 dictEntry *de;
7136
7137 /* No expire? return ASAP */
7138 if (dictSize(db->expires) == 0 ||
7139 (de = dictFind(db->expires,key)) == NULL) return -1;
7140
7141 return (time_t) dictGetEntryVal(de);
7142 }
7143
7144 static int expireIfNeeded(redisDb *db, robj *key) {
7145 time_t when;
7146 dictEntry *de;
7147
7148 /* No expire? return ASAP */
7149 if (dictSize(db->expires) == 0 ||
7150 (de = dictFind(db->expires,key)) == NULL) return 0;
7151
7152 /* Lookup the expire */
7153 when = (time_t) dictGetEntryVal(de);
7154 if (time(NULL) <= when) return 0;
7155
7156 /* Delete the key */
7157 dictDelete(db->expires,key);
7158 server.stat_expiredkeys++;
7159 return dictDelete(db->dict,key) == DICT_OK;
7160 }
7161
7162 static int deleteIfVolatile(redisDb *db, robj *key) {
7163 dictEntry *de;
7164
7165 /* No expire? return ASAP */
7166 if (dictSize(db->expires) == 0 ||
7167 (de = dictFind(db->expires,key)) == NULL) return 0;
7168
7169 /* Delete the key */
7170 server.dirty++;
7171 server.stat_expiredkeys++;
7172 dictDelete(db->expires,key);
7173 return dictDelete(db->dict,key) == DICT_OK;
7174 }
7175
7176 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7177 dictEntry *de;
7178 time_t seconds;
7179
7180 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7181
7182 seconds -= offset;
7183
7184 de = dictFind(c->db->dict,key);
7185 if (de == NULL) {
7186 addReply(c,shared.czero);
7187 return;
7188 }
7189 if (seconds <= 0) {
7190 if (deleteKey(c->db,key)) server.dirty++;
7191 addReply(c, shared.cone);
7192 return;
7193 } else {
7194 time_t when = time(NULL)+seconds;
7195 if (setExpire(c->db,key,when)) {
7196 addReply(c,shared.cone);
7197 server.dirty++;
7198 } else {
7199 addReply(c,shared.czero);
7200 }
7201 return;
7202 }
7203 }
7204
7205 static void expireCommand(redisClient *c) {
7206 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7207 }
7208
7209 static void expireatCommand(redisClient *c) {
7210 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7211 }
7212
7213 static void ttlCommand(redisClient *c) {
7214 time_t expire;
7215 int ttl = -1;
7216
7217 expire = getExpire(c->db,c->argv[1]);
7218 if (expire != -1) {
7219 ttl = (int) (expire-time(NULL));
7220 if (ttl < 0) ttl = -1;
7221 }
7222 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7223 }
7224
7225 /* ================================ MULTI/EXEC ============================== */
7226
7227 /* Client state initialization for MULTI/EXEC */
7228 static void initClientMultiState(redisClient *c) {
7229 c->mstate.commands = NULL;
7230 c->mstate.count = 0;
7231 }
7232
7233 /* Release all the resources associated with MULTI/EXEC state */
7234 static void freeClientMultiState(redisClient *c) {
7235 int j;
7236
7237 for (j = 0; j < c->mstate.count; j++) {
7238 int i;
7239 multiCmd *mc = c->mstate.commands+j;
7240
7241 for (i = 0; i < mc->argc; i++)
7242 decrRefCount(mc->argv[i]);
7243 zfree(mc->argv);
7244 }
7245 zfree(c->mstate.commands);
7246 }
7247
7248 /* Add a new command into the MULTI commands queue */
7249 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7250 multiCmd *mc;
7251 int j;
7252
7253 c->mstate.commands = zrealloc(c->mstate.commands,
7254 sizeof(multiCmd)*(c->mstate.count+1));
7255 mc = c->mstate.commands+c->mstate.count;
7256 mc->cmd = cmd;
7257 mc->argc = c->argc;
7258 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7259 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7260 for (j = 0; j < c->argc; j++)
7261 incrRefCount(mc->argv[j]);
7262 c->mstate.count++;
7263 }
7264
7265 static void multiCommand(redisClient *c) {
7266 c->flags |= REDIS_MULTI;
7267 addReply(c,shared.ok);
7268 }
7269
7270 static void discardCommand(redisClient *c) {
7271 if (!(c->flags & REDIS_MULTI)) {
7272 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7273 return;
7274 }
7275
7276 freeClientMultiState(c);
7277 initClientMultiState(c);
7278 c->flags &= (~REDIS_MULTI);
7279 addReply(c,shared.ok);
7280 }
7281
7282 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7283 * implememntation for more information. */
7284 static void execCommandReplicateMulti(redisClient *c) {
7285 struct redisCommand *cmd;
7286 robj *multistring = createStringObject("MULTI",5);
7287
7288 cmd = lookupCommand("multi");
7289 if (server.appendonly)
7290 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7291 if (listLength(server.slaves))
7292 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7293 decrRefCount(multistring);
7294 }
7295
7296 static void execCommand(redisClient *c) {
7297 int j;
7298 robj **orig_argv;
7299 int orig_argc;
7300
7301 if (!(c->flags & REDIS_MULTI)) {
7302 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7303 return;
7304 }
7305
7306 /* Replicate a MULTI request now that we are sure the block is executed.
7307 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7308 * both the AOF and the replication link will have the same consistency
7309 * and atomicity guarantees. */
7310 execCommandReplicateMulti(c);
7311
7312 /* Exec all the queued commands */
7313 orig_argv = c->argv;
7314 orig_argc = c->argc;
7315 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7316 for (j = 0; j < c->mstate.count; j++) {
7317 c->argc = c->mstate.commands[j].argc;
7318 c->argv = c->mstate.commands[j].argv;
7319 call(c,c->mstate.commands[j].cmd);
7320 }
7321 c->argv = orig_argv;
7322 c->argc = orig_argc;
7323 freeClientMultiState(c);
7324 initClientMultiState(c);
7325 c->flags &= (~REDIS_MULTI);
7326 /* Make sure the EXEC command is always replicated / AOF, since we
7327 * always send the MULTI command (we can't know beforehand if the
7328 * next operations will contain at least a modification to the DB). */
7329 server.dirty++;
7330 }
7331
7332 /* =========================== Blocking Operations ========================= */
7333
7334 /* Currently Redis blocking operations support is limited to list POP ops,
7335 * so the current implementation is not fully generic, but it is also not
7336 * completely specific so it will not require a rewrite to support new
7337 * kind of blocking operations in the future.
7338 *
7339 * Still it's important to note that list blocking operations can be already
7340 * used as a notification mechanism in order to implement other blocking
7341 * operations at application level, so there must be a very strong evidence
7342 * of usefulness and generality before new blocking operations are implemented.
7343 *
7344 * This is how the current blocking POP works, we use BLPOP as example:
7345 * - If the user calls BLPOP and the key exists and contains a non empty list
7346 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7347 * if there is not to block.
7348 * - If instead BLPOP is called and the key does not exists or the list is
7349 * empty we need to block. In order to do so we remove the notification for
7350 * new data to read in the client socket (so that we'll not serve new
7351 * requests if the blocking request is not served). Also we put the client
7352 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
7353 * blocking for this keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7358 *
7359 * The above comment and the source code should be enough in order to understand
7360 * the implementation and modify / fix it later.
7361 */
7362
7363 /* Set a client in blocking mode for the specified key, with the specified
7364 * timeout */
7365 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7366 dictEntry *de;
7367 list *l;
7368 int j;
7369
7370 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7371 c->blockingkeysnum = numkeys;
7372 c->blockingto = timeout;
7373 for (j = 0; j < numkeys; j++) {
7374 /* Add the key in the client structure, to map clients -> keys */
7375 c->blockingkeys[j] = keys[j];
7376 incrRefCount(keys[j]);
7377
7378 /* And in the other "side", to map keys -> clients */
7379 de = dictFind(c->db->blockingkeys,keys[j]);
7380 if (de == NULL) {
7381 int retval;
7382
7383 /* For every key we take a list of clients blocked for it */
7384 l = listCreate();
7385 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7386 incrRefCount(keys[j]);
7387 assert(retval == DICT_OK);
7388 } else {
7389 l = dictGetEntryVal(de);
7390 }
7391 listAddNodeTail(l,c);
7392 }
7393 /* Mark the client as a blocked client */
7394 c->flags |= REDIS_BLOCKED;
7395 server.blpop_blocked_clients++;
7396 }
7397
7398 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7399 static void unblockClientWaitingData(redisClient *c) {
7400 dictEntry *de;
7401 list *l;
7402 int j;
7403
7404 assert(c->blockingkeys != NULL);
7405 /* The client may wait for multiple keys, so unblock it for every key. */
7406 for (j = 0; j < c->blockingkeysnum; j++) {
7407 /* Remove this client from the list of clients waiting for this key. */
7408 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7409 assert(de != NULL);
7410 l = dictGetEntryVal(de);
7411 listDelNode(l,listSearchKey(l,c));
7412 /* If the list is empty we need to remove it to avoid wasting memory */
7413 if (listLength(l) == 0)
7414 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7415 decrRefCount(c->blockingkeys[j]);
7416 }
7417 /* Cleanup the client structure */
7418 zfree(c->blockingkeys);
7419 c->blockingkeys = NULL;
7420 c->flags &= (~REDIS_BLOCKED);
7421 server.blpop_blocked_clients--;
7422 /* We want to process data if there is some command waiting
7423 * in the input buffer. Note that this is safe even if
7424 * unblockClientWaitingData() gets called from freeClient() because
7425 * freeClient() will be smart enough to call this function
7426 * *after* c->querybuf was set to NULL. */
7427 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7428 }
7429
7430 /* This should be called from any function PUSHing into lists.
7431 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7432 * 'ele' is the element pushed.
7433 *
7434 * If the function returns 0 there was no client waiting for a list push
7435 * against this key.
7436 *
7437 * If the function returns 1 there was a client waiting for a list push
7438 * against this key, the element was passed to this client thus it's not
7439 * needed to actually add it to the list and the caller should return asap. */
7440 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7441 struct dictEntry *de;
7442 redisClient *receiver;
7443 list *l;
7444 listNode *ln;
7445
7446 de = dictFind(c->db->blockingkeys,key);
7447 if (de == NULL) return 0;
7448 l = dictGetEntryVal(de);
7449 ln = listFirst(l);
7450 assert(ln != NULL);
7451 receiver = ln->value;
7452
7453 addReplySds(receiver,sdsnew("*2\r\n"));
7454 addReplyBulk(receiver,key);
7455 addReplyBulk(receiver,ele);
7456 unblockClientWaitingData(receiver);
7457 return 1;
7458 }
7459
7460 /* Blocking RPOP/LPOP */
7461 static void blockingPopGenericCommand(redisClient *c, int where) {
7462 robj *o;
7463 time_t timeout;
7464 int j;
7465
7466 for (j = 1; j < c->argc-1; j++) {
7467 o = lookupKeyWrite(c->db,c->argv[j]);
7468 if (o != NULL) {
7469 if (o->type != REDIS_LIST) {
7470 addReply(c,shared.wrongtypeerr);
7471 return;
7472 } else {
7473 list *list = o->ptr;
7474 if (listLength(list) != 0) {
7475 /* If the list contains elements fall back to the usual
7476 * non-blocking POP operation */
7477 robj *argv[2], **orig_argv;
7478 int orig_argc;
7479
7480 /* We need to alter the command arguments before to call
7481 * popGenericCommand() as the command takes a single key. */
7482 orig_argv = c->argv;
7483 orig_argc = c->argc;
7484 argv[1] = c->argv[j];
7485 c->argv = argv;
7486 c->argc = 2;
7487
7488 /* Also the return value is different, we need to output
7489 * the multi bulk reply header and the key name. The
7490 * "real" command will add the last element (the value)
7491 * for us. If this souds like an hack to you it's just
7492 * because it is... */
7493 addReplySds(c,sdsnew("*2\r\n"));
7494 addReplyBulk(c,argv[1]);
7495 popGenericCommand(c,where);
7496
7497 /* Fix the client structure with the original stuff */
7498 c->argv = orig_argv;
7499 c->argc = orig_argc;
7500 return;
7501 }
7502 }
7503 }
7504 }
7505 /* If the list is empty or the key does not exists we must block */
7506 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7507 if (timeout > 0) timeout += time(NULL);
7508 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7509 }
7510
7511 static void blpopCommand(redisClient *c) {
7512 blockingPopGenericCommand(c,REDIS_HEAD);
7513 }
7514
7515 static void brpopCommand(redisClient *c) {
7516 blockingPopGenericCommand(c,REDIS_TAIL);
7517 }
7518
7519 /* =============================== Replication ============================= */
7520
7521 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7522 ssize_t nwritten, ret = size;
7523 time_t start = time(NULL);
7524
7525 timeout++;
7526 while(size) {
7527 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7528 nwritten = write(fd,ptr,size);
7529 if (nwritten == -1) return -1;
7530 ptr += nwritten;
7531 size -= nwritten;
7532 }
7533 if ((time(NULL)-start) > timeout) {
7534 errno = ETIMEDOUT;
7535 return -1;
7536 }
7537 }
7538 return ret;
7539 }
7540
7541 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7542 ssize_t nread, totread = 0;
7543 time_t start = time(NULL);
7544
7545 timeout++;
7546 while(size) {
7547 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7548 nread = read(fd,ptr,size);
7549 if (nread == -1) return -1;
7550 ptr += nread;
7551 size -= nread;
7552 totread += nread;
7553 }
7554 if ((time(NULL)-start) > timeout) {
7555 errno = ETIMEDOUT;
7556 return -1;
7557 }
7558 }
7559 return totread;
7560 }
7561
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte
 * at a time through syncRead() (each byte gets its own 'timeout').
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is replaced by the terminator. The buffer is always null
 * terminated, and at most size-1 characters are stored: a longer line is
 * truncated instead of being written past the end of the buffer.
 *
 * Returns the number of characters stored before the newline (a trailing
 * '\r' is included in the count), or -1 if syncRead() fails or 'size' is
 * not positive (errno is set to EINVAL in the latter case). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        /* Not even room for the terminator. */
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve one byte for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            size--; /* One less byte available in the buffer. */
        }
    }
    return nread;
}
7582
7583 static void syncCommand(redisClient *c) {
7584 /* ignore SYNC if aleady slave or in monitor mode */
7585 if (c->flags & REDIS_SLAVE) return;
7586
7587 /* SYNC can't be issued when the server has pending data to send to
7588 * the client about already issued commands. We need a fresh reply
7589 * buffer registering the differences between the BGSAVE and the current
7590 * dataset, so that we can copy to other slaves if needed. */
7591 if (listLength(c->reply) != 0) {
7592 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7593 return;
7594 }
7595
7596 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7597 /* Here we need to check if there is a background saving operation
7598 * in progress, or if it is required to start one */
7599 if (server.bgsavechildpid != -1) {
7600 /* Ok a background save is in progress. Let's check if it is a good
7601 * one for replication, i.e. if there is another slave that is
7602 * registering differences since the server forked to save */
7603 redisClient *slave;
7604 listNode *ln;
7605 listIter li;
7606
7607 listRewind(server.slaves,&li);
7608 while((ln = listNext(&li))) {
7609 slave = ln->value;
7610 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7611 }
7612 if (ln) {
7613 /* Perfect, the server is already registering differences for
7614 * another slave. Set the right state, and copy the buffer. */
7615 listRelease(c->reply);
7616 c->reply = listDup(slave->reply);
7617 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7618 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7619 } else {
7620 /* No way, we need to wait for the next BGSAVE in order to
7621 * register differences */
7622 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7623 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7624 }
7625 } else {
7626 /* Ok we don't have a BGSAVE in progress, let's start one */
7627 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7628 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7629 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7630 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7631 return;
7632 }
7633 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7634 }
7635 c->repldbfd = -1;
7636 c->flags |= REDIS_SLAVE;
7637 c->slaveseldb = 0;
7638 listAddNodeTail(server.slaves,c);
7639 return;
7640 }
7641
7642 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7643 redisClient *slave = privdata;
7644 REDIS_NOTUSED(el);
7645 REDIS_NOTUSED(mask);
7646 char buf[REDIS_IOBUF_LEN];
7647 ssize_t nwritten, buflen;
7648
7649 if (slave->repldboff == 0) {
7650 /* Write the bulk write count before to transfer the DB. In theory here
7651 * we don't know how much room there is in the output buffer of the
7652 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7653 * operations) will never be smaller than the few bytes we need. */
7654 sds bulkcount;
7655
7656 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7657 slave->repldbsize);
7658 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7659 {
7660 sdsfree(bulkcount);
7661 freeClient(slave);
7662 return;
7663 }
7664 sdsfree(bulkcount);
7665 }
7666 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7667 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7668 if (buflen <= 0) {
7669 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7670 (buflen == 0) ? "premature EOF" : strerror(errno));
7671 freeClient(slave);
7672 return;
7673 }
7674 if ((nwritten = write(fd,buf,buflen)) == -1) {
7675 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7676 strerror(errno));
7677 freeClient(slave);
7678 return;
7679 }
7680 slave->repldboff += nwritten;
7681 if (slave->repldboff == slave->repldbsize) {
7682 close(slave->repldbfd);
7683 slave->repldbfd = -1;
7684 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7685 slave->replstate = REDIS_REPL_ONLINE;
7686 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7687 sendReplyToClient, slave) == AE_ERR) {
7688 freeClient(slave);
7689 return;
7690 }
7691 addReplySds(slave,sdsempty());
7692 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7693 }
7694 }
7695
7696 /* This function is called at the end of every backgrond saving.
7697 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7698 * otherwise REDIS_ERR is passed to the function.
7699 *
7700 * The goal of this function is to handle slaves waiting for a successful
7701 * background saving in order to perform non-blocking synchronization. */
7702 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7703 listNode *ln;
7704 int startbgsave = 0;
7705 listIter li;
7706
7707 listRewind(server.slaves,&li);
7708 while((ln = listNext(&li))) {
7709 redisClient *slave = ln->value;
7710
7711 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7712 startbgsave = 1;
7713 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7714 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7715 struct redis_stat buf;
7716
7717 if (bgsaveerr != REDIS_OK) {
7718 freeClient(slave);
7719 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7720 continue;
7721 }
7722 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7723 redis_fstat(slave->repldbfd,&buf) == -1) {
7724 freeClient(slave);
7725 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7726 continue;
7727 }
7728 slave->repldboff = 0;
7729 slave->repldbsize = buf.st_size;
7730 slave->replstate = REDIS_REPL_SEND_BULK;
7731 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7732 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7733 freeClient(slave);
7734 continue;
7735 }
7736 }
7737 }
7738 if (startbgsave) {
7739 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7740 listIter li;
7741
7742 listRewind(server.slaves,&li);
7743 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7744 while((ln = listNext(&li))) {
7745 redisClient *slave = ln->value;
7746
7747 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7748 freeClient(slave);
7749 }
7750 }
7751 }
7752 }
7753
7754 static int syncWithMaster(void) {
7755 char buf[1024], tmpfile[256], authcmd[1024];
7756 long dumpsize;
7757 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7758 int dfd, maxtries = 5;
7759
7760 if (fd == -1) {
7761 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7762 strerror(errno));
7763 return REDIS_ERR;
7764 }
7765
7766 /* AUTH with the master if required. */
7767 if(server.masterauth) {
7768 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7769 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7770 close(fd);
7771 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7772 strerror(errno));
7773 return REDIS_ERR;
7774 }
7775 /* Read the AUTH result. */
7776 if (syncReadLine(fd,buf,1024,3600) == -1) {
7777 close(fd);
7778 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7779 strerror(errno));
7780 return REDIS_ERR;
7781 }
7782 if (buf[0] != '+') {
7783 close(fd);
7784 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7785 return REDIS_ERR;
7786 }
7787 }
7788
7789 /* Issue the SYNC command */
7790 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7791 close(fd);
7792 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7793 strerror(errno));
7794 return REDIS_ERR;
7795 }
7796 /* Read the bulk write count */
7797 if (syncReadLine(fd,buf,1024,3600) == -1) {
7798 close(fd);
7799 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7800 strerror(errno));
7801 return REDIS_ERR;
7802 }
7803 if (buf[0] != '$') {
7804 close(fd);
7805 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7806 return REDIS_ERR;
7807 }
7808 dumpsize = strtol(buf+1,NULL,10);
7809 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7810 /* Read the bulk write data on a temp file */
7811 while(maxtries--) {
7812 snprintf(tmpfile,256,
7813 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7814 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7815 if (dfd != -1) break;
7816 sleep(1);
7817 }
7818 if (dfd == -1) {
7819 close(fd);
7820 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7821 return REDIS_ERR;
7822 }
7823 while(dumpsize) {
7824 int nread, nwritten;
7825
7826 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7827 if (nread == -1) {
7828 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7829 strerror(errno));
7830 close(fd);
7831 close(dfd);
7832 return REDIS_ERR;
7833 }
7834 nwritten = write(dfd,buf,nread);
7835 if (nwritten == -1) {
7836 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7837 close(fd);
7838 close(dfd);
7839 return REDIS_ERR;
7840 }
7841 dumpsize -= nread;
7842 }
7843 close(dfd);
7844 if (rename(tmpfile,server.dbfilename) == -1) {
7845 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7846 unlink(tmpfile);
7847 close(fd);
7848 return REDIS_ERR;
7849 }
7850 emptyDb();
7851 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7852 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7853 close(fd);
7854 return REDIS_ERR;
7855 }
7856 server.master = createClient(fd);
7857 server.master->flags |= REDIS_MASTER;
7858 server.master->authenticated = 1;
7859 server.replstate = REDIS_REPL_CONNECTED;
7860 return REDIS_OK;
7861 }
7862
7863 static void slaveofCommand(redisClient *c) {
7864 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7865 !strcasecmp(c->argv[2]->ptr,"one")) {
7866 if (server.masterhost) {
7867 sdsfree(server.masterhost);
7868 server.masterhost = NULL;
7869 if (server.master) freeClient(server.master);
7870 server.replstate = REDIS_REPL_NONE;
7871 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7872 }
7873 } else {
7874 sdsfree(server.masterhost);
7875 server.masterhost = sdsdup(c->argv[1]->ptr);
7876 server.masterport = atoi(c->argv[2]->ptr);
7877 if (server.master) freeClient(server.master);
7878 server.replstate = REDIS_REPL_CONNECT;
7879 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7880 server.masterhost, server.masterport);
7881 }
7882 addReply(c,shared.ok);
7883 }
7884
7885 /* ============================ Maxmemory directive ======================== */
7886
7887 /* Try to free one object form the pre-allocated objects free list.
7888 * This is useful under low mem conditions as by default we take 1 million
7889 * free objects allocated. On success REDIS_OK is returned, otherwise
7890 * REDIS_ERR. */
7891 static int tryFreeOneObjectFromFreelist(void) {
7892 robj *o;
7893
7894 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7895 if (listLength(server.objfreelist)) {
7896 listNode *head = listFirst(server.objfreelist);
7897 o = listNodeValue(head);
7898 listDelNode(server.objfreelist,head);
7899 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7900 zfree(o);
7901 return REDIS_OK;
7902 } else {
7903 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7904 return REDIS_ERR;
7905 }
7906 }
7907
7908 /* This function gets called when 'maxmemory' is set on the config file to limit
7909 * the max memory used by the server, and we are out of memory.
7910 * This function will try to, in order:
7911 *
7912 * - Free objects from the free list
7913 * - Try to remove keys with an EXPIRE set
7914 *
7915 * It is not possible to free enough memory to reach used-memory < maxmemory
7916 * the server will start refusing commands that will enlarge even more the
7917 * memory usage.
7918 */
7919 static void freeMemoryIfNeeded(void) {
7920 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7921 int j, k, freed = 0;
7922
7923 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7924 for (j = 0; j < server.dbnum; j++) {
7925 int minttl = -1;
7926 robj *minkey = NULL;
7927 struct dictEntry *de;
7928
7929 if (dictSize(server.db[j].expires)) {
7930 freed = 1;
7931 /* From a sample of three keys drop the one nearest to
7932 * the natural expire */
7933 for (k = 0; k < 3; k++) {
7934 time_t t;
7935
7936 de = dictGetRandomKey(server.db[j].expires);
7937 t = (time_t) dictGetEntryVal(de);
7938 if (minttl == -1 || t < minttl) {
7939 minkey = dictGetEntryKey(de);
7940 minttl = t;
7941 }
7942 }
7943 deleteKey(server.db+j,minkey);
7944 }
7945 }
7946 if (!freed) return; /* nothing to free... */
7947 }
7948 }
7949
7950 /* ============================== Append Only file ========================== */
7951
7952 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7953 sds buf = sdsempty();
7954 int j;
7955 ssize_t nwritten;
7956 time_t now;
7957 robj *tmpargv[3];
7958
7959 /* The DB this command was targetting is not the same as the last command
7960 * we appendend. To issue a SELECT command is needed. */
7961 if (dictid != server.appendseldb) {
7962 char seldb[64];
7963
7964 snprintf(seldb,sizeof(seldb),"%d",dictid);
7965 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7966 (unsigned long)strlen(seldb),seldb);
7967 server.appendseldb = dictid;
7968 }
7969
7970 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7971 * EXPIREs into EXPIREATs calls */
7972 if (cmd->proc == expireCommand) {
7973 long when;
7974
7975 tmpargv[0] = createStringObject("EXPIREAT",8);
7976 tmpargv[1] = argv[1];
7977 incrRefCount(argv[1]);
7978 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7979 tmpargv[2] = createObject(REDIS_STRING,
7980 sdscatprintf(sdsempty(),"%ld",when));
7981 argv = tmpargv;
7982 }
7983
7984 /* Append the actual command */
7985 buf = sdscatprintf(buf,"*%d\r\n",argc);
7986 for (j = 0; j < argc; j++) {
7987 robj *o = argv[j];
7988
7989 o = getDecodedObject(o);
7990 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7991 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7992 buf = sdscatlen(buf,"\r\n",2);
7993 decrRefCount(o);
7994 }
7995
7996 /* Free the objects from the modified argv for EXPIREAT */
7997 if (cmd->proc == expireCommand) {
7998 for (j = 0; j < 3; j++)
7999 decrRefCount(argv[j]);
8000 }
8001
8002 /* We want to perform a single write. This should be guaranteed atomic
8003 * at least if the filesystem we are writing is a real physical one.
8004 * While this will save us against the server being killed I don't think
8005 * there is much to do about the whole server stopping for power problems
8006 * or alike */
8007 nwritten = write(server.appendfd,buf,sdslen(buf));
8008 if (nwritten != (signed)sdslen(buf)) {
8009 /* Ooops, we are in troubles. The best thing to do for now is
8010 * to simply exit instead to give the illusion that everything is
8011 * working as expected. */
8012 if (nwritten == -1) {
8013 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8014 } else {
8015 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8016 }
8017 exit(1);
8018 }
8019 /* If a background append only file rewriting is in progress we want to
8020 * accumulate the differences between the child DB and the current one
8021 * in a buffer, so that when the child process will do its work we
8022 * can append the differences to the new append only file. */
8023 if (server.bgrewritechildpid != -1)
8024 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8025
8026 sdsfree(buf);
8027 now = time(NULL);
8028 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8029 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8030 now-server.lastfsync > 1))
8031 {
8032 fsync(server.appendfd); /* Let's try to get this data on the disk */
8033 server.lastfsync = now;
8034 }
8035 }
8036
8037 /* In Redis commands are always executed in the context of a client, so in
8038 * order to load the append only file we need to create a fake client. */
8039 static struct redisClient *createFakeClient(void) {
8040 struct redisClient *c = zmalloc(sizeof(*c));
8041
8042 selectDb(c,0);
8043 c->fd = -1;
8044 c->querybuf = sdsempty();
8045 c->argc = 0;
8046 c->argv = NULL;
8047 c->flags = 0;
8048 /* We set the fake client as a slave waiting for the synchronization
8049 * so that Redis will not try to send replies to this client. */
8050 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8051 c->reply = listCreate();
8052 listSetFreeMethod(c->reply,decrRefCount);
8053 listSetDupMethod(c->reply,dupClientReplyValue);
8054 return c;
8055 }
8056
8057 static void freeFakeClient(struct redisClient *c) {
8058 sdsfree(c->querybuf);
8059 listRelease(c->reply);
8060 zfree(c);
8061 }
8062
8063 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8064 * error (the append only file is zero-length) REDIS_ERR is returned. On
8065 * fatal error an error message is logged and the program exists. */
8066 int loadAppendOnlyFile(char *filename) {
8067 struct redisClient *fakeClient;
8068 FILE *fp = fopen(filename,"r");
8069 struct redis_stat sb;
8070 unsigned long long loadedkeys = 0;
8071
8072 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8073 return REDIS_ERR;
8074
8075 if (fp == NULL) {
8076 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8077 exit(1);
8078 }
8079
8080 fakeClient = createFakeClient();
8081 while(1) {
8082 int argc, j;
8083 unsigned long len;
8084 robj **argv;
8085 char buf[128];
8086 sds argsds;
8087 struct redisCommand *cmd;
8088
8089 if (fgets(buf,sizeof(buf),fp) == NULL) {
8090 if (feof(fp))
8091 break;
8092 else
8093 goto readerr;
8094 }
8095 if (buf[0] != '*') goto fmterr;
8096 argc = atoi(buf+1);
8097 argv = zmalloc(sizeof(robj*)*argc);
8098 for (j = 0; j < argc; j++) {
8099 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8100 if (buf[0] != '$') goto fmterr;
8101 len = strtol(buf+1,NULL,10);
8102 argsds = sdsnewlen(NULL,len);
8103 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8104 argv[j] = createObject(REDIS_STRING,argsds);
8105 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8106 }
8107
8108 /* Command lookup */
8109 cmd = lookupCommand(argv[0]->ptr);
8110 if (!cmd) {
8111 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8112 exit(1);
8113 }
8114 /* Try object encoding */
8115 if (cmd->flags & REDIS_CMD_BULK)
8116 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8117 /* Run the command in the context of a fake client */
8118 fakeClient->argc = argc;
8119 fakeClient->argv = argv;
8120 cmd->proc(fakeClient);
8121 /* Discard the reply objects list from the fake client */
8122 while(listLength(fakeClient->reply))
8123 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8124 /* Clean up, ready for the next command */
8125 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8126 zfree(argv);
8127 /* Handle swapping while loading big datasets when VM is on */
8128 loadedkeys++;
8129 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8130 while (zmalloc_used_memory() > server.vm_max_memory) {
8131 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8132 }
8133 }
8134 }
8135 fclose(fp);
8136 freeFakeClient(fakeClient);
8137 return REDIS_OK;
8138
8139 readerr:
8140 if (feof(fp)) {
8141 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8142 } else {
8143 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8144 }
8145 exit(1);
8146 fmterr:
8147 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8148 exit(1);
8149 }
8150
8151 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8152 static int fwriteBulkObject(FILE *fp, robj *obj) {
8153 char buf[128];
8154 int decrrc = 0;
8155
8156 /* Avoid the incr/decr ref count business if possible to help
8157 * copy-on-write (we are often in a child process when this function
8158 * is called).
8159 * Also makes sure that key objects don't get incrRefCount-ed when VM
8160 * is enabled */
8161 if (obj->encoding != REDIS_ENCODING_RAW) {
8162 obj = getDecodedObject(obj);
8163 decrrc = 1;
8164 }
8165 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8166 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8167 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8168 goto err;
8169 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8170 if (decrrc) decrRefCount(obj);
8171 return 1;
8172 err:
8173 if (decrrc) decrRefCount(obj);
8174 return 0;
8175 }
8176
/* Write binary-safe string into a file in the bulkformat
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it is not required to be nul
 * terminated, and is not accessed when 'len' is zero).
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: the conversion must be %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8188
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the value formatted with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8199
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the advertised count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8210
8211 /* Write a sequence of commands able to fully rebuild the dataset into
8212 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8213 static int rewriteAppendOnlyFile(char *filename) {
8214 dictIterator *di = NULL;
8215 dictEntry *de;
8216 FILE *fp;
8217 char tmpfile[256];
8218 int j;
8219 time_t now = time(NULL);
8220
8221 /* Note that we have to use a different temp name here compared to the
8222 * one used by rewriteAppendOnlyFileBackground() function. */
8223 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8224 fp = fopen(tmpfile,"w");
8225 if (!fp) {
8226 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8227 return REDIS_ERR;
8228 }
8229 for (j = 0; j < server.dbnum; j++) {
8230 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8231 redisDb *db = server.db+j;
8232 dict *d = db->dict;
8233 if (dictSize(d) == 0) continue;
8234 di = dictGetIterator(d);
8235 if (!di) {
8236 fclose(fp);
8237 return REDIS_ERR;
8238 }
8239
8240 /* SELECT the new DB */
8241 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8242 if (fwriteBulkLong(fp,j) == 0) goto werr;
8243
8244 /* Iterate this DB writing every entry */
8245 while((de = dictNext(di)) != NULL) {
8246 robj *key, *o;
8247 time_t expiretime;
8248 int swapped;
8249
8250 key = dictGetEntryKey(de);
8251 /* If the value for this key is swapped, load a preview in memory.
8252 * We use a "swapped" flag to remember if we need to free the
8253 * value object instead to just increment the ref count anyway
8254 * in order to avoid copy-on-write of pages if we are forked() */
8255 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8256 key->storage == REDIS_VM_SWAPPING) {
8257 o = dictGetEntryVal(de);
8258 swapped = 0;
8259 } else {
8260 o = vmPreviewObject(key);
8261 swapped = 1;
8262 }
8263 expiretime = getExpire(db,key);
8264
8265 /* Save the key and associated value */
8266 if (o->type == REDIS_STRING) {
8267 /* Emit a SET command */
8268 char cmd[]="*3\r\n$3\r\nSET\r\n";
8269 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8270 /* Key and value */
8271 if (fwriteBulkObject(fp,key) == 0) goto werr;
8272 if (fwriteBulkObject(fp,o) == 0) goto werr;
8273 } else if (o->type == REDIS_LIST) {
8274 /* Emit the RPUSHes needed to rebuild the list */
8275 list *list = o->ptr;
8276 listNode *ln;
8277 listIter li;
8278
8279 listRewind(list,&li);
8280 while((ln = listNext(&li))) {
8281 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8282 robj *eleobj = listNodeValue(ln);
8283
8284 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8285 if (fwriteBulkObject(fp,key) == 0) goto werr;
8286 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8287 }
8288 } else if (o->type == REDIS_SET) {
8289 /* Emit the SADDs needed to rebuild the set */
8290 dict *set = o->ptr;
8291 dictIterator *di = dictGetIterator(set);
8292 dictEntry *de;
8293
8294 while((de = dictNext(di)) != NULL) {
8295 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8296 robj *eleobj = dictGetEntryKey(de);
8297
8298 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8299 if (fwriteBulkObject(fp,key) == 0) goto werr;
8300 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8301 }
8302 dictReleaseIterator(di);
8303 } else if (o->type == REDIS_ZSET) {
8304 /* Emit the ZADDs needed to rebuild the sorted set */
8305 zset *zs = o->ptr;
8306 dictIterator *di = dictGetIterator(zs->dict);
8307 dictEntry *de;
8308
8309 while((de = dictNext(di)) != NULL) {
8310 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8311 robj *eleobj = dictGetEntryKey(de);
8312 double *score = dictGetEntryVal(de);
8313
8314 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8315 if (fwriteBulkObject(fp,key) == 0) goto werr;
8316 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8317 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8318 }
8319 dictReleaseIterator(di);
8320 } else if (o->type == REDIS_HASH) {
8321 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8322
8323 /* Emit the HSETs needed to rebuild the hash */
8324 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8325 unsigned char *p = zipmapRewind(o->ptr);
8326 unsigned char *field, *val;
8327 unsigned int flen, vlen;
8328
8329 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8330 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8331 if (fwriteBulkObject(fp,key) == 0) goto werr;
8332 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8333 return -1;
8334 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8335 return -1;
8336 }
8337 } else {
8338 dictIterator *di = dictGetIterator(o->ptr);
8339 dictEntry *de;
8340
8341 while((de = dictNext(di)) != NULL) {
8342 robj *field = dictGetEntryKey(de);
8343 robj *val = dictGetEntryVal(de);
8344
8345 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8346 if (fwriteBulkObject(fp,key) == 0) goto werr;
8347 if (fwriteBulkObject(fp,field) == -1) return -1;
8348 if (fwriteBulkObject(fp,val) == -1) return -1;
8349 }
8350 dictReleaseIterator(di);
8351 }
8352 } else {
8353 redisPanic("Unknown object type");
8354 }
8355 /* Save the expire time */
8356 if (expiretime != -1) {
8357 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8358 /* If this key is already expired skip it */
8359 if (expiretime < now) continue;
8360 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8361 if (fwriteBulkObject(fp,key) == 0) goto werr;
8362 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8363 }
8364 if (swapped) decrRefCount(o);
8365 }
8366 dictReleaseIterator(di);
8367 }
8368
8369 /* Make sure data will not remain on the OS's output buffers */
8370 fflush(fp);
8371 fsync(fileno(fp));
8372 fclose(fp);
8373
8374 /* Use RENAME to make sure the DB file is changed atomically only
8375 * if the generate DB file is ok. */
8376 if (rename(tmpfile,filename) == -1) {
8377 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8378 unlink(tmpfile);
8379 return REDIS_ERR;
8380 }
8381 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8382 return REDIS_OK;
8383
8384 werr:
8385 fclose(fp);
8386 unlink(tmpfile);
8387 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8388 if (di) dictReleaseIterator(di);
8389 return REDIS_ERR;
8390 }
8391
8392 /* This is how rewriting of the append only file in background works:
8393 *
8394 * 1) The user calls BGREWRITEAOF
8395 * 2) Redis calls this function, that forks():
8396 * 2a) the child rewrite the append only file in a temp file.
8397 * 2b) the parent accumulates differences in server.bgrewritebuf.
8398 * 3) When the child finished '2a' exists.
8399 * 4) The parent will trap the exit code, if it's OK, will append the
8400 * data accumulated into server.bgrewritebuf into the temp file, and
8401 * finally will rename(2) the temp file in the actual file name.
8402 * The the new file is reopened as the new append only file. Profit!
8403 */
8404 static int rewriteAppendOnlyFileBackground(void) {
8405 pid_t childpid;
8406
8407 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8408 if (server.vm_enabled) waitEmptyIOJobsQueue();
8409 if ((childpid = fork()) == 0) {
8410 /* Child */
8411 char tmpfile[256];
8412
8413 if (server.vm_enabled) vmReopenSwapFile();
8414 close(server.fd);
8415 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8416 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8417 _exit(0);
8418 } else {
8419 _exit(1);
8420 }
8421 } else {
8422 /* Parent */
8423 if (childpid == -1) {
8424 redisLog(REDIS_WARNING,
8425 "Can't rewrite append only file in background: fork: %s",
8426 strerror(errno));
8427 return REDIS_ERR;
8428 }
8429 redisLog(REDIS_NOTICE,
8430 "Background append only file rewriting started by pid %d",childpid);
8431 server.bgrewritechildpid = childpid;
8432 updateDictResizePolicy();
8433 /* We set appendseldb to -1 in order to force the next call to the
8434 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8435 * accumulated by the parent into server.bgrewritebuf will start
8436 * with a SELECT statement and it will be safe to merge. */
8437 server.appendseldb = -1;
8438 return REDIS_OK;
8439 }
8440 return REDIS_OK; /* unreached */
8441 }
8442
8443 static void bgrewriteaofCommand(redisClient *c) {
8444 if (server.bgrewritechildpid != -1) {
8445 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8446 return;
8447 }
8448 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8449 char *status = "+Background append only file rewriting started\r\n";
8450 addReplySds(c,sdsnew(status));
8451 } else {
8452 addReply(c,shared.err);
8453 }
8454 }
8455
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" of the background
 * rewrite child with the specified pid. The result of unlink() is not
 * checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
8462
8463 /* Virtual Memory is composed mainly of two subsystems:
8464 * - Blocking Virtual Memory
8465 * - Threaded Virtual Memory I/O
8466 * The two parts are not fully decoupled, but functions are split among two
8467 * different sections of the source code (delimited by comments) in order to
8468 * make more clear what functionality is about the blocking VM and what about
8469 * the threaded (not blocking) VM.
8470 *
8471 * Redis VM design:
8472 *
8473 * Redis VM is a blocking VM (one that blocks reading swapped values from
8474 * disk into memory when a value swapped out is needed in memory) that is made
8475 * unblocking by trying to examine the command argument vector in order to
8476 * load in background values that will likely be needed in order to exec
8477 * the command. The command is executed only once all the relevant keys
8478 * are loaded into memory.
8479 *
8480 * This basically is almost as simple of a blocking VM, but almost as parallel
8481 * as a fully non-blocking VM.
8482 */
8483
8484 /* =================== Virtual Memory - Blocking Side ====================== */
8485
8486 /* substitute the first occurrence of '%p' with the process pid in the
8487 * swap file name. */
8488 static void expandVmSwapFilename(void) {
8489 char *p = strstr(server.vm_swap_file,"%p");
8490 sds new;
8491
8492 if (!p) return;
8493 new = sdsempty();
8494 *p = '\0';
8495 new = sdscat(new,server.vm_swap_file);
8496 new = sdscatprintf(new,"%ld",(long) getpid());
8497 new = sdscat(new,p+2);
8498 zfree(server.vm_swap_file);
8499 server.vm_swap_file = new;
8500 }
8501
8502 static void vmInit(void) {
8503 off_t totsize;
8504 int pipefds[2];
8505 size_t stacksize;
8506
8507 if (server.vm_max_threads != 0)
8508 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8509
8510 expandVmSwapFilename();
8511 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8512 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8513 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8514 }
8515 if (server.vm_fp == NULL) {
8516 redisLog(REDIS_WARNING,
8517 "Impossible to open the swap file: %s. Exiting.",
8518 strerror(errno));
8519 exit(1);
8520 }
8521 server.vm_fd = fileno(server.vm_fp);
8522 server.vm_next_page = 0;
8523 server.vm_near_pages = 0;
8524 server.vm_stats_used_pages = 0;
8525 server.vm_stats_swapped_objects = 0;
8526 server.vm_stats_swapouts = 0;
8527 server.vm_stats_swapins = 0;
8528 totsize = server.vm_pages*server.vm_page_size;
8529 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8530 if (ftruncate(server.vm_fd,totsize) == -1) {
8531 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8532 strerror(errno));
8533 exit(1);
8534 } else {
8535 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8536 }
8537 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8538 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8539 (long long) (server.vm_pages+7)/8, server.vm_pages);
8540 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8541
8542 /* Initialize threaded I/O (used by Virtual Memory) */
8543 server.io_newjobs = listCreate();
8544 server.io_processing = listCreate();
8545 server.io_processed = listCreate();
8546 server.io_ready_clients = listCreate();
8547 pthread_mutex_init(&server.io_mutex,NULL);
8548 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8549 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8550 server.io_active_threads = 0;
8551 if (pipe(pipefds) == -1) {
8552 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8553 ,strerror(errno));
8554 exit(1);
8555 }
8556 server.io_ready_pipe_read = pipefds[0];
8557 server.io_ready_pipe_write = pipefds[1];
8558 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8559 /* LZF requires a lot of stack */
8560 pthread_attr_init(&server.io_threads_attr);
8561 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8562 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8563 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8564 /* Listen for events in the threaded I/O pipe */
8565 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8566 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8567 oom("creating file event");
8568 }
8569
8570 /* Mark the page as used */
8571 static void vmMarkPageUsed(off_t page) {
8572 off_t byte = page/8;
8573 int bit = page&7;
8574 redisAssert(vmFreePage(page) == 1);
8575 server.vm_bitmap[byte] |= 1<<bit;
8576 }
8577
8578 /* Mark N contiguous pages as used, with 'page' being the first. */
8579 static void vmMarkPagesUsed(off_t page, off_t count) {
8580 off_t j;
8581
8582 for (j = 0; j < count; j++)
8583 vmMarkPageUsed(page+j);
8584 server.vm_stats_used_pages += count;
8585 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8586 (long long)count, (long long)page);
8587 }
8588
8589 /* Mark the page as free */
8590 static void vmMarkPageFree(off_t page) {
8591 off_t byte = page/8;
8592 int bit = page&7;
8593 redisAssert(vmFreePage(page) == 0);
8594 server.vm_bitmap[byte] &= ~(1<<bit);
8595 }
8596
8597 /* Mark N contiguous pages as free, with 'page' being the first. */
8598 static void vmMarkPagesFree(off_t page, off_t count) {
8599 off_t j;
8600
8601 for (j = 0; j < count; j++)
8602 vmMarkPageFree(page+j);
8603 server.vm_stats_used_pages -= count;
8604 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8605 (long long)count, (long long)page);
8606 }
8607
8608 /* Test if the page is free */
8609 static int vmFreePage(off_t page) {
8610 off_t byte = page/8;
8611 int bit = page&7;
8612 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8613 }
8614
8615 /* Find N contiguous free pages storing the first page of the cluster in *first.
8616 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8617 * REDIS_ERR is returned.
8618 *
8619 * This function uses a simple algorithm: we try to allocate
8620 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8621 * again from the start of the swap file searching for free spaces.
8622 *
8623 * If it looks pretty clear that there are no free pages near our offset
8624 * we try to find less populated places doing a forward jump of
8625 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8626 * without hurry, and then we jump again and so forth...
8627 *
8628 * This function can be improved using a free list to avoid to guess
8629 * too much, since we could collect data about freed pages.
8630 *
8631 * note: I implemented this function just after watching an episode of
8632 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8633 */
8634 static int vmFindContiguousPages(off_t *first, off_t n) {
8635 off_t base, offset = 0, since_jump = 0, numfree = 0;
8636
8637 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8638 server.vm_near_pages = 0;
8639 server.vm_next_page = 0;
8640 }
8641 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8642 base = server.vm_next_page;
8643
8644 while(offset < server.vm_pages) {
8645 off_t this = base+offset;
8646
8647 /* If we overflow, restart from page zero */
8648 if (this >= server.vm_pages) {
8649 this -= server.vm_pages;
8650 if (this == 0) {
8651 /* Just overflowed, what we found on tail is no longer
8652 * interesting, as it's no longer contiguous. */
8653 numfree = 0;
8654 }
8655 }
8656 if (vmFreePage(this)) {
8657 /* This is a free page */
8658 numfree++;
8659 /* Already got N free pages? Return to the caller, with success */
8660 if (numfree == n) {
8661 *first = this-(n-1);
8662 server.vm_next_page = this+1;
8663 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8664 return REDIS_OK;
8665 }
8666 } else {
8667 /* The current one is not a free page */
8668 numfree = 0;
8669 }
8670
8671 /* Fast-forward if the current page is not free and we already
8672 * searched enough near this place. */
8673 since_jump++;
8674 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8675 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8676 since_jump = 0;
8677 /* Note that even if we rewind after the jump, we are don't need
8678 * to make sure numfree is set to zero as we only jump *if* it
8679 * is set to zero. */
8680 } else {
8681 /* Otherwise just check the next page */
8682 offset++;
8683 }
8684 }
8685 return REDIS_ERR;
8686 }
8687
8688 /* Write the specified object at the specified page of the swap file */
8689 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8690 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8691 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8692 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8693 redisLog(REDIS_WARNING,
8694 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8695 strerror(errno));
8696 return REDIS_ERR;
8697 }
8698 rdbSaveObject(server.vm_fp,o);
8699 fflush(server.vm_fp);
8700 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8701 return REDIS_OK;
8702 }
8703
8704 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8705 * needed to later retrieve the object into the key object.
8706 * If we can't find enough contiguous empty pages to swap the object on disk
8707 * REDIS_ERR is returned. */
8708 static int vmSwapObjectBlocking(robj *key, robj *val) {
8709 off_t pages = rdbSavedObjectPages(val,NULL);
8710 off_t page;
8711
8712 assert(key->storage == REDIS_VM_MEMORY);
8713 assert(key->refcount == 1);
8714 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8715 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8716 key->vm.page = page;
8717 key->vm.usedpages = pages;
8718 key->storage = REDIS_VM_SWAPPED;
8719 key->vtype = val->type;
8720 decrRefCount(val); /* Deallocate the object from memory. */
8721 vmMarkPagesUsed(page,pages);
8722 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8723 (unsigned char*) key->ptr,
8724 (unsigned long long) page, (unsigned long long) pages);
8725 server.vm_stats_swapped_objects++;
8726 server.vm_stats_swapouts++;
8727 return REDIS_OK;
8728 }
8729
8730 static robj *vmReadObjectFromSwap(off_t page, int type) {
8731 robj *o;
8732
8733 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8734 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8735 redisLog(REDIS_WARNING,
8736 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8737 strerror(errno));
8738 _exit(1);
8739 }
8740 o = rdbLoadObject(type,server.vm_fp);
8741 if (o == NULL) {
8742 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8743 _exit(1);
8744 }
8745 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8746 return o;
8747 }
8748
8749 /* Load the value object relative to the 'key' object from swap to memory.
8750 * The newly allocated object is returned.
8751 *
8752 * If preview is true the unserialized object is returned to the caller but
8753 * no changes are made to the key object, nor the pages are marked as freed */
8754 static robj *vmGenericLoadObject(robj *key, int preview) {
8755 robj *val;
8756
8757 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8758 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8759 if (!preview) {
8760 key->storage = REDIS_VM_MEMORY;
8761 key->vm.atime = server.unixtime;
8762 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8763 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8764 (unsigned char*) key->ptr);
8765 server.vm_stats_swapped_objects--;
8766 } else {
8767 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8768 (unsigned char*) key->ptr);
8769 }
8770 server.vm_stats_swapins++;
8771 return val;
8772 }
8773
8774 /* Plain object loading, from swap to memory */
8775 static robj *vmLoadObject(robj *key) {
8776 /* If we are loading the object in background, stop it, we
8777 * need to load this object synchronously ASAP. */
8778 if (key->storage == REDIS_VM_LOADING)
8779 vmCancelThreadedIOJob(key);
8780 return vmGenericLoadObject(key,0);
8781 }
8782
8783 /* Just load the value on disk, without to modify the key.
8784 * This is useful when we want to perform some operation on the value
8785 * without to really bring it from swap to memory, like while saving the
8786 * dataset or rewriting the append only log. */
8787 static robj *vmPreviewObject(robj *key) {
8788 return vmGenericLoadObject(key,1);
8789 }
8790
8791 /* How a good candidate is this object for swapping?
8792 * The better candidate it is, the greater the returned value.
8793 *
8794 * Currently we try to perform a fast estimation of the object size in
8795 * memory, and combine it with aging informations.
8796 *
8797 * Basically swappability = idle-time * log(estimated size)
8798 *
8799 * Bigger objects are preferred over smaller objects, but not
8800 * proportionally, this is why we use the logarithm. This algorithm is
8801 * just a first try and will probably be tuned later. */
8802 static double computeObjectSwappability(robj *o) {
8803 time_t age = server.unixtime - o->vm.atime;
8804 long asize = 0;
8805 list *l;
8806 dict *d;
8807 struct dictEntry *de;
8808 int z;
8809
8810 if (age <= 0) return 0;
8811 switch(o->type) {
8812 case REDIS_STRING:
8813 if (o->encoding != REDIS_ENCODING_RAW) {
8814 asize = sizeof(*o);
8815 } else {
8816 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8817 }
8818 break;
8819 case REDIS_LIST:
8820 l = o->ptr;
8821 listNode *ln = listFirst(l);
8822
8823 asize = sizeof(list);
8824 if (ln) {
8825 robj *ele = ln->value;
8826 long elesize;
8827
8828 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8829 (sizeof(*o)+sdslen(ele->ptr)) :
8830 sizeof(*o);
8831 asize += (sizeof(listNode)+elesize)*listLength(l);
8832 }
8833 break;
8834 case REDIS_SET:
8835 case REDIS_ZSET:
8836 z = (o->type == REDIS_ZSET);
8837 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8838
8839 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8840 if (z) asize += sizeof(zset)-sizeof(dict);
8841 if (dictSize(d)) {
8842 long elesize;
8843 robj *ele;
8844
8845 de = dictGetRandomKey(d);
8846 ele = dictGetEntryKey(de);
8847 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8848 (sizeof(*o)+sdslen(ele->ptr)) :
8849 sizeof(*o);
8850 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8851 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8852 }
8853 break;
8854 case REDIS_HASH:
8855 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8856 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8857 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8858 unsigned int klen, vlen;
8859 unsigned char *key, *val;
8860
8861 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8862 klen = 0;
8863 vlen = 0;
8864 }
8865 asize = len*(klen+vlen+3);
8866 } else if (o->encoding == REDIS_ENCODING_HT) {
8867 d = o->ptr;
8868 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8869 if (dictSize(d)) {
8870 long elesize;
8871 robj *ele;
8872
8873 de = dictGetRandomKey(d);
8874 ele = dictGetEntryKey(de);
8875 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8876 (sizeof(*o)+sdslen(ele->ptr)) :
8877 sizeof(*o);
8878 ele = dictGetEntryVal(de);
8879 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8880 (sizeof(*o)+sdslen(ele->ptr)) :
8881 sizeof(*o);
8882 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8883 }
8884 }
8885 break;
8886 }
8887 return (double)age*log(1+asize);
8888 }
8889
8890 /* Try to swap an object that's a good candidate for swapping.
8891 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8892 * to swap any object at all.
8893 *
8894 * If 'usethreaded' is true, Redis will try to swap the object in background
8895 * using I/O threads. */
8896 static int vmSwapOneObject(int usethreads) {
8897 int j, i;
8898 struct dictEntry *best = NULL;
8899 double best_swappability = 0;
8900 redisDb *best_db = NULL;
8901 robj *key, *val;
8902
8903 for (j = 0; j < server.dbnum; j++) {
8904 redisDb *db = server.db+j;
8905 /* Why maxtries is set to 100?
8906 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8907 * are swappable objects */
8908 int maxtries = 100;
8909
8910 if (dictSize(db->dict) == 0) continue;
8911 for (i = 0; i < 5; i++) {
8912 dictEntry *de;
8913 double swappability;
8914
8915 if (maxtries) maxtries--;
8916 de = dictGetRandomKey(db->dict);
8917 key = dictGetEntryKey(de);
8918 val = dictGetEntryVal(de);
8919 /* Only swap objects that are currently in memory.
8920 *
8921 * Also don't swap shared objects if threaded VM is on, as we
8922 * try to ensure that the main thread does not touch the
8923 * object while the I/O thread is using it, but we can't
8924 * control other keys without adding additional mutex. */
8925 if (key->storage != REDIS_VM_MEMORY ||
8926 (server.vm_max_threads != 0 && val->refcount != 1)) {
8927 if (maxtries) i--; /* don't count this try */
8928 continue;
8929 }
8930 swappability = computeObjectSwappability(val);
8931 if (!best || swappability > best_swappability) {
8932 best = de;
8933 best_swappability = swappability;
8934 best_db = db;
8935 }
8936 }
8937 }
8938 if (best == NULL) return REDIS_ERR;
8939 key = dictGetEntryKey(best);
8940 val = dictGetEntryVal(best);
8941
8942 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8943 key->ptr, best_swappability);
8944
8945 /* Unshare the key if needed */
8946 if (key->refcount > 1) {
8947 robj *newkey = dupStringObject(key);
8948 decrRefCount(key);
8949 key = dictGetEntryKey(best) = newkey;
8950 }
8951 /* Swap it */
8952 if (usethreads) {
8953 vmSwapObjectThreaded(key,val,best_db);
8954 return REDIS_OK;
8955 } else {
8956 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8957 dictGetEntryVal(best) = NULL;
8958 return REDIS_OK;
8959 } else {
8960 return REDIS_ERR;
8961 }
8962 }
8963 }
8964
/* Swap out one object synchronously. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8968
/* Swap out one object in background using the I/O threads.
 * See vmSwapOneObject() for the return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8972
8973 /* Return true if it's safe to swap out objects in a given moment.
8974 * Basically we don't want to swap objects out while there is a BGSAVE
8975 * or a BGAEOREWRITE running in backgroud. */
8976 static int vmCanSwapOut(void) {
8977 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8978 }
8979
8980 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8981 * and was deleted. Otherwise 0 is returned. */
8982 static int deleteIfSwapped(redisDb *db, robj *key) {
8983 dictEntry *de;
8984 robj *foundkey;
8985
8986 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8987 foundkey = dictGetEntryKey(de);
8988 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8989 deleteKey(db,key);
8990 return 1;
8991 }
8992
8993 /* =================== Virtual Memory - Threaded I/O ======================= */
8994
8995 static void freeIOJob(iojob *j) {
8996 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8997 j->type == REDIS_IOJOB_DO_SWAP ||
8998 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8999 decrRefCount(j->val);
9000 /* We don't decrRefCount the j->key field as we did't incremented
9001 * the count creating IO Jobs. This is because the key field here is
9002 * just used as an indentifier and if a key is removed the Job should
9003 * never be touched again. */
9004 zfree(j);
9005 }
9006
9007 /* Every time a thread finished a Job, it writes a byte into the write side
9008 * of an unix pipe in order to "awake" the main thread, and this function
9009 * is called. */
9010 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9011 int mask)
9012 {
9013 char buf[1];
9014 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9015 REDIS_NOTUSED(el);
9016 REDIS_NOTUSED(mask);
9017 REDIS_NOTUSED(privdata);
9018
9019 /* For every byte we read in the read side of the pipe, there is one
9020 * I/O job completed to process. */
9021 while((retval = read(fd,buf,1)) == 1) {
9022 iojob *j;
9023 listNode *ln;
9024 robj *key;
9025 struct dictEntry *de;
9026
9027 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9028
9029 /* Get the processed element (the oldest one) */
9030 lockThreadedIO();
9031 assert(listLength(server.io_processed) != 0);
9032 if (toprocess == -1) {
9033 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9034 if (toprocess <= 0) toprocess = 1;
9035 }
9036 ln = listFirst(server.io_processed);
9037 j = ln->value;
9038 listDelNode(server.io_processed,ln);
9039 unlockThreadedIO();
9040 /* If this job is marked as canceled, just ignore it */
9041 if (j->canceled) {
9042 freeIOJob(j);
9043 continue;
9044 }
9045 /* Post process it in the main thread, as there are things we
9046 * can do just here to avoid race conditions and/or invasive locks */
9047 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9048 de = dictFind(j->db->dict,j->key);
9049 assert(de != NULL);
9050 key = dictGetEntryKey(de);
9051 if (j->type == REDIS_IOJOB_LOAD) {
9052 redisDb *db;
9053
9054 /* Key loaded, bring it at home */
9055 key->storage = REDIS_VM_MEMORY;
9056 key->vm.atime = server.unixtime;
9057 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9058 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9059 (unsigned char*) key->ptr);
9060 server.vm_stats_swapped_objects--;
9061 server.vm_stats_swapins++;
9062 dictGetEntryVal(de) = j->val;
9063 incrRefCount(j->val);
9064 db = j->db;
9065 freeIOJob(j);
9066 /* Handle clients waiting for this key to be loaded. */
9067 handleClientsBlockedOnSwappedKey(db,key);
9068 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9069 /* Now we know the amount of pages required to swap this object.
9070 * Let's find some space for it, and queue this task again
9071 * rebranded as REDIS_IOJOB_DO_SWAP. */
9072 if (!vmCanSwapOut() ||
9073 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9074 {
9075 /* Ooops... no space or we can't swap as there is
9076 * a fork()ed Redis trying to save stuff on disk. */
9077 freeIOJob(j);
9078 key->storage = REDIS_VM_MEMORY; /* undo operation */
9079 } else {
9080 /* Note that we need to mark this pages as used now,
9081 * if the job will be canceled, we'll mark them as freed
9082 * again. */
9083 vmMarkPagesUsed(j->page,j->pages);
9084 j->type = REDIS_IOJOB_DO_SWAP;
9085 lockThreadedIO();
9086 queueIOJob(j);
9087 unlockThreadedIO();
9088 }
9089 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9090 robj *val;
9091
9092 /* Key swapped. We can finally free some memory. */
9093 if (key->storage != REDIS_VM_SWAPPING) {
9094 printf("key->storage: %d\n",key->storage);
9095 printf("key->name: %s\n",(char*)key->ptr);
9096 printf("key->refcount: %d\n",key->refcount);
9097 printf("val: %p\n",(void*)j->val);
9098 printf("val->type: %d\n",j->val->type);
9099 printf("val->ptr: %s\n",(char*)j->val->ptr);
9100 }
9101 redisAssert(key->storage == REDIS_VM_SWAPPING);
9102 val = dictGetEntryVal(de);
9103 key->vm.page = j->page;
9104 key->vm.usedpages = j->pages;
9105 key->storage = REDIS_VM_SWAPPED;
9106 key->vtype = j->val->type;
9107 decrRefCount(val); /* Deallocate the object from memory. */
9108 dictGetEntryVal(de) = NULL;
9109 redisLog(REDIS_DEBUG,
9110 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9111 (unsigned char*) key->ptr,
9112 (unsigned long long) j->page, (unsigned long long) j->pages);
9113 server.vm_stats_swapped_objects++;
9114 server.vm_stats_swapouts++;
9115 freeIOJob(j);
9116 /* Put a few more swap requests in queue if we are still
9117 * out of memory */
9118 if (trytoswap && vmCanSwapOut() &&
9119 zmalloc_used_memory() > server.vm_max_memory)
9120 {
9121 int more = 1;
9122 while(more) {
9123 lockThreadedIO();
9124 more = listLength(server.io_newjobs) <
9125 (unsigned) server.vm_max_threads;
9126 unlockThreadedIO();
9127 /* Don't waste CPU time if swappable objects are rare. */
9128 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9129 trytoswap = 0;
9130 break;
9131 }
9132 }
9133 }
9134 }
9135 processed++;
9136 if (processed == toprocess) return;
9137 }
9138 if (retval < 0 && errno != EAGAIN) {
9139 redisLog(REDIS_WARNING,
9140 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9141 strerror(errno));
9142 }
9143 }
9144
9145 static void lockThreadedIO(void) {
9146 pthread_mutex_lock(&server.io_mutex);
9147 }
9148
9149 static void unlockThreadedIO(void) {
9150 pthread_mutex_unlock(&server.io_mutex);
9151 }
9152
9153 /* Remove the specified object from the threaded I/O queue if still not
9154 * processed, otherwise make sure to flag it as canceled. */
9155 static void vmCancelThreadedIOJob(robj *o) {
9156 list *lists[3] = {
9157 server.io_newjobs, /* 0 */
9158 server.io_processing, /* 1 */
9159 server.io_processed /* 2 */
9160 };
9161 int i;
9162
9163 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9164 again:
9165 lockThreadedIO();
9166 /* Search for a matching key in one of the queues */
9167 for (i = 0; i < 3; i++) {
9168 listNode *ln;
9169 listIter li;
9170
9171 listRewind(lists[i],&li);
9172 while ((ln = listNext(&li)) != NULL) {
9173 iojob *job = ln->value;
9174
9175 if (job->canceled) continue; /* Skip this, already canceled. */
9176 if (job->key == o) {
9177 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9178 (void*)job, (char*)o->ptr, job->type, i);
9179 /* Mark the pages as free since the swap didn't happened
9180 * or happened but is now discarded. */
9181 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9182 vmMarkPagesFree(job->page,job->pages);
9183 /* Cancel the job. It depends on the list the job is
9184 * living in. */
9185 switch(i) {
9186 case 0: /* io_newjobs */
9187 /* If the job was yet not processed the best thing to do
9188 * is to remove it from the queue at all */
9189 freeIOJob(job);
9190 listDelNode(lists[i],ln);
9191 break;
9192 case 1: /* io_processing */
9193 /* Oh Shi- the thread is messing with the Job:
9194 *
9195 * Probably it's accessing the object if this is a
9196 * PREPARE_SWAP or DO_SWAP job.
9197 * If it's a LOAD job it may be reading from disk and
9198 * if we don't wait for the job to terminate before to
9199 * cancel it, maybe in a few microseconds data can be
9200 * corrupted in this pages. So the short story is:
9201 *
9202 * Better to wait for the job to move into the
9203 * next queue (processed)... */
9204
9205 /* We try again and again until the job is completed. */
9206 unlockThreadedIO();
9207 /* But let's wait some time for the I/O thread
9208 * to finish with this job. After all this condition
9209 * should be very rare. */
9210 usleep(1);
9211 goto again;
9212 case 2: /* io_processed */
9213 /* The job was already processed, that's easy...
9214 * just mark it as canceled so that we'll ignore it
9215 * when processing completed jobs. */
9216 job->canceled = 1;
9217 break;
9218 }
9219 /* Finally we have to adjust the storage type of the object
9220 * in order to "UNDO" the operaiton. */
9221 if (o->storage == REDIS_VM_LOADING)
9222 o->storage = REDIS_VM_SWAPPED;
9223 else if (o->storage == REDIS_VM_SWAPPING)
9224 o->storage = REDIS_VM_MEMORY;
9225 unlockThreadedIO();
9226 return;
9227 }
9228 }
9229 }
9230 unlockThreadedIO();
9231 assert(1 != 1); /* We should never reach this */
9232 }
9233
9234 static void *IOThreadEntryPoint(void *arg) {
9235 iojob *j;
9236 listNode *ln;
9237 REDIS_NOTUSED(arg);
9238
9239 pthread_detach(pthread_self());
9240 while(1) {
9241 /* Get a new job to process */
9242 lockThreadedIO();
9243 if (listLength(server.io_newjobs) == 0) {
9244 /* No new jobs in queue, exit. */
9245 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9246 (long) pthread_self());
9247 server.io_active_threads--;
9248 unlockThreadedIO();
9249 return NULL;
9250 }
9251 ln = listFirst(server.io_newjobs);
9252 j = ln->value;
9253 listDelNode(server.io_newjobs,ln);
9254 /* Add the job in the processing queue */
9255 j->thread = pthread_self();
9256 listAddNodeTail(server.io_processing,j);
9257 ln = listLast(server.io_processing); /* We use ln later to remove it */
9258 unlockThreadedIO();
9259 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9260 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9261
9262 /* Process the Job */
9263 if (j->type == REDIS_IOJOB_LOAD) {
9264 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9265 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9266 FILE *fp = fopen("/dev/null","w+");
9267 j->pages = rdbSavedObjectPages(j->val,fp);
9268 fclose(fp);
9269 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9270 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9271 j->canceled = 1;
9272 }
9273
9274 /* Done: insert the job into the processed queue */
9275 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9276 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9277 lockThreadedIO();
9278 listDelNode(server.io_processing,ln);
9279 listAddNodeTail(server.io_processed,j);
9280 unlockThreadedIO();
9281
9282 /* Signal the main thread there is new stuff to process */
9283 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9284 }
9285 return NULL; /* never reached */
9286 }
9287
9288 static void spawnIOThread(void) {
9289 pthread_t thread;
9290 sigset_t mask, omask;
9291 int err;
9292
9293 sigemptyset(&mask);
9294 sigaddset(&mask,SIGCHLD);
9295 sigaddset(&mask,SIGHUP);
9296 sigaddset(&mask,SIGPIPE);
9297 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9298 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9299 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9300 strerror(err));
9301 usleep(1000000);
9302 }
9303 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9304 server.io_active_threads++;
9305 }
9306
9307 /* We need to wait for the last thread to exit before we are able to
9308 * fork() in order to BGSAVE or BGREWRITEAOF. */
9309 static void waitEmptyIOJobsQueue(void) {
9310 while(1) {
9311 int io_processed_len;
9312
9313 lockThreadedIO();
9314 if (listLength(server.io_newjobs) == 0 &&
9315 listLength(server.io_processing) == 0 &&
9316 server.io_active_threads == 0)
9317 {
9318 unlockThreadedIO();
9319 return;
9320 }
9321 /* While waiting for empty jobs queue condition we post-process some
9322 * finshed job, as I/O threads may be hanging trying to write against
9323 * the io_ready_pipe_write FD but there are so much pending jobs that
9324 * it's blocking. */
9325 io_processed_len = listLength(server.io_processed);
9326 unlockThreadedIO();
9327 if (io_processed_len) {
9328 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9329 usleep(1000); /* 1 millisecond */
9330 } else {
9331 usleep(10000); /* 10 milliseconds */
9332 }
9333 }
9334 }
9335
9336 static void vmReopenSwapFile(void) {
9337 /* Note: we don't close the old one as we are in the child process
9338 * and don't want to mess at all with the original file object. */
9339 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9340 if (server.vm_fp == NULL) {
9341 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9342 server.vm_swap_file);
9343 _exit(1);
9344 }
9345 server.vm_fd = fileno(server.vm_fp);
9346 }
9347
9348 /* This function must be called while with threaded IO locked */
9349 static void queueIOJob(iojob *j) {
9350 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9351 (void*)j, j->type, (char*)j->key->ptr);
9352 listAddNodeTail(server.io_newjobs,j);
9353 if (server.io_active_threads < server.vm_max_threads)
9354 spawnIOThread();
9355 }
9356
9357 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9358 iojob *j;
9359
9360 assert(key->storage == REDIS_VM_MEMORY);
9361 assert(key->refcount == 1);
9362
9363 j = zmalloc(sizeof(*j));
9364 j->type = REDIS_IOJOB_PREPARE_SWAP;
9365 j->db = db;
9366 j->key = key;
9367 j->val = val;
9368 incrRefCount(val);
9369 j->canceled = 0;
9370 j->thread = (pthread_t) -1;
9371 key->storage = REDIS_VM_SWAPPING;
9372
9373 lockThreadedIO();
9374 queueIOJob(j);
9375 unlockThreadedIO();
9376 return REDIS_OK;
9377 }
9378
9379 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9380
9381 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9382 * If there is not already a job loading the key, it is craeted.
9383 * The key is added to the io_keys list in the client structure, and also
9384 * in the hash table mapping swapped keys to waiting clients, that is,
9385 * server.io_waited_keys. */
9386 static int waitForSwappedKey(redisClient *c, robj *key) {
9387 struct dictEntry *de;
9388 robj *o;
9389 list *l;
9390
9391 /* If the key does not exist or is already in RAM we don't need to
9392 * block the client at all. */
9393 de = dictFind(c->db->dict,key);
9394 if (de == NULL) return 0;
9395 o = dictGetEntryKey(de);
9396 if (o->storage == REDIS_VM_MEMORY) {
9397 return 0;
9398 } else if (o->storage == REDIS_VM_SWAPPING) {
9399 /* We were swapping the key, undo it! */
9400 vmCancelThreadedIOJob(o);
9401 return 0;
9402 }
9403
9404 /* OK: the key is either swapped, or being loaded just now. */
9405
9406 /* Add the key to the list of keys this client is waiting for.
9407 * This maps clients to keys they are waiting for. */
9408 listAddNodeTail(c->io_keys,key);
9409 incrRefCount(key);
9410
9411 /* Add the client to the swapped keys => clients waiting map. */
9412 de = dictFind(c->db->io_keys,key);
9413 if (de == NULL) {
9414 int retval;
9415
9416 /* For every key we take a list of clients blocked for it */
9417 l = listCreate();
9418 retval = dictAdd(c->db->io_keys,key,l);
9419 incrRefCount(key);
9420 assert(retval == DICT_OK);
9421 } else {
9422 l = dictGetEntryVal(de);
9423 }
9424 listAddNodeTail(l,c);
9425
9426 /* Are we already loading the key from disk? If not create a job */
9427 if (o->storage == REDIS_VM_SWAPPED) {
9428 iojob *j;
9429
9430 o->storage = REDIS_VM_LOADING;
9431 j = zmalloc(sizeof(*j));
9432 j->type = REDIS_IOJOB_LOAD;
9433 j->db = c->db;
9434 j->key = o;
9435 j->key->vtype = o->vtype;
9436 j->page = o->vm.page;
9437 j->val = NULL;
9438 j->canceled = 0;
9439 j->thread = (pthread_t) -1;
9440 lockThreadedIO();
9441 queueIOJob(j);
9442 unlockThreadedIO();
9443 }
9444 return 1;
9445 }
9446
9447 /* Preload keys needed for the ZUNION and ZINTER commands. */
9448 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9449 int i, num;
9450 num = atoi(c->argv[2]->ptr);
9451 for (i = 0; i < num; i++) {
9452 waitForSwappedKey(c,c->argv[3+i]);
9453 }
9454 }
9455
9456 /* Is this client attempting to run a command against swapped keys?
9457 * If so, block it ASAP, load the keys in background, then resume it.
9458 *
9459 * The important idea about this function is that it can fail! If keys will
9460 * still be swapped when the client is resumed, this key lookups will
9461 * just block loading keys from disk. In practical terms this should only
9462 * happen with SORT BY command or if there is a bug in this function.
9463 *
9464 * Return 1 if the client is marked as blocked, 0 if the client can
9465 * continue as the keys it is going to access appear to be in memory. */
9466 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9467 int j, last;
9468
9469 if (cmd->vm_preload_proc != NULL) {
9470 cmd->vm_preload_proc(c);
9471 } else {
9472 if (cmd->vm_firstkey == 0) return 0;
9473 last = cmd->vm_lastkey;
9474 if (last < 0) last = c->argc+last;
9475 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9476 waitForSwappedKey(c,c->argv[j]);
9477 }
9478
9479 /* If the client was blocked for at least one key, mark it as blocked. */
9480 if (listLength(c->io_keys)) {
9481 c->flags |= REDIS_IO_WAIT;
9482 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9483 server.vm_blocked_clients++;
9484 return 1;
9485 } else {
9486 return 0;
9487 }
9488 }
9489
9490 /* Remove the 'key' from the list of blocked keys for a given client.
9491 *
9492 * The function returns 1 when there are no longer blocking keys after
9493 * the current one was removed (and the client can be unblocked). */
9494 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9495 list *l;
9496 listNode *ln;
9497 listIter li;
9498 struct dictEntry *de;
9499
9500 /* Remove the key from the list of keys this client is waiting for. */
9501 listRewind(c->io_keys,&li);
9502 while ((ln = listNext(&li)) != NULL) {
9503 if (compareStringObjects(ln->value,key) == 0) {
9504 listDelNode(c->io_keys,ln);
9505 break;
9506 }
9507 }
9508 assert(ln != NULL);
9509
9510 /* Remove the client form the key => waiting clients map. */
9511 de = dictFind(c->db->io_keys,key);
9512 assert(de != NULL);
9513 l = dictGetEntryVal(de);
9514 ln = listSearchKey(l,c);
9515 assert(ln != NULL);
9516 listDelNode(l,ln);
9517 if (listLength(l) == 0)
9518 dictDelete(c->db->io_keys,key);
9519
9520 return listLength(c->io_keys) == 0;
9521 }
9522
9523 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9524 struct dictEntry *de;
9525 list *l;
9526 listNode *ln;
9527 int len;
9528
9529 de = dictFind(db->io_keys,key);
9530 if (!de) return;
9531
9532 l = dictGetEntryVal(de);
9533 len = listLength(l);
9534 /* Note: we can't use something like while(listLength(l)) as the list
9535 * can be freed by the calling function when we remove the last element. */
9536 while (len--) {
9537 ln = listFirst(l);
9538 redisClient *c = ln->value;
9539
9540 if (dontWaitForSwappedKey(c,key)) {
9541 /* Put the client in the list of clients ready to go as we
9542 * loaded all the keys about it. */
9543 listAddNodeTail(server.io_ready_clients,c);
9544 }
9545 }
9546 }
9547
9548 /* =========================== Remote Configuration ========================= */
9549
9550 static void configSetCommand(redisClient *c) {
9551 robj *o = getDecodedObject(c->argv[3]);
9552 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9553 zfree(server.dbfilename);
9554 server.dbfilename = zstrdup(o->ptr);
9555 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9556 zfree(server.requirepass);
9557 server.requirepass = zstrdup(o->ptr);
9558 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9559 zfree(server.masterauth);
9560 server.masterauth = zstrdup(o->ptr);
9561 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9562 server.maxmemory = strtoll(o->ptr, NULL, 10);
9563 } else {
9564 addReplySds(c,sdscatprintf(sdsempty(),
9565 "-ERR not supported CONFIG parameter %s\r\n",
9566 (char*)c->argv[2]->ptr));
9567 decrRefCount(o);
9568 return;
9569 }
9570 decrRefCount(o);
9571 addReply(c,shared.ok);
9572 }
9573
9574 static void configGetCommand(redisClient *c) {
9575 robj *o = getDecodedObject(c->argv[2]);
9576 robj *lenobj = createObject(REDIS_STRING,NULL);
9577 char *pattern = o->ptr;
9578 int matches = 0;
9579
9580 addReply(c,lenobj);
9581 decrRefCount(lenobj);
9582
9583 if (stringmatch(pattern,"dbfilename",0)) {
9584 addReplyBulkCString(c,"dbfilename");
9585 addReplyBulkCString(c,server.dbfilename);
9586 matches++;
9587 }
9588 if (stringmatch(pattern,"requirepass",0)) {
9589 addReplyBulkCString(c,"requirepass");
9590 addReplyBulkCString(c,server.requirepass);
9591 matches++;
9592 }
9593 if (stringmatch(pattern,"masterauth",0)) {
9594 addReplyBulkCString(c,"masterauth");
9595 addReplyBulkCString(c,server.masterauth);
9596 matches++;
9597 }
9598 if (stringmatch(pattern,"maxmemory",0)) {
9599 char buf[128];
9600
9601 snprintf(buf,128,"%llu\n",server.maxmemory);
9602 addReplyBulkCString(c,"maxmemory");
9603 addReplyBulkCString(c,buf);
9604 matches++;
9605 }
9606 decrRefCount(o);
9607 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9608 }
9609
9610 static void configCommand(redisClient *c) {
9611 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9612 if (c->argc != 4) goto badarity;
9613 configSetCommand(c);
9614 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9615 if (c->argc != 3) goto badarity;
9616 configGetCommand(c);
9617 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9618 if (c->argc != 2) goto badarity;
9619 server.stat_numcommands = 0;
9620 server.stat_numconnections = 0;
9621 server.stat_expiredkeys = 0;
9622 server.stat_starttime = time(NULL);
9623 addReply(c,shared.ok);
9624 } else {
9625 addReplySds(c,sdscatprintf(sdsempty(),
9626 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9627 }
9628 return;
9629
9630 badarity:
9631 addReplySds(c,sdscatprintf(sdsempty(),
9632 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9633 (char*) c->argv[1]->ptr));
9634 }
9635
9636 /* =========================== Pubsub implementation ======================== */
9637
9638 static void freePubsubPattern(void *p) {
9639 pubsubPattern *pat = p;
9640
9641 decrRefCount(pat->pattern);
9642 zfree(pat);
9643 }
9644
9645 static int listMatchPubsubPattern(void *a, void *b) {
9646 pubsubPattern *pa = a, *pb = b;
9647
9648 return (pa->client == pb->client) &&
9649 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9650 }
9651
9652 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9653 * 0 if the client was already subscribed to that channel. */
9654 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9655 struct dictEntry *de;
9656 list *clients = NULL;
9657 int retval = 0;
9658
9659 /* Add the channel to the client -> channels hash table */
9660 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9661 retval = 1;
9662 incrRefCount(channel);
9663 /* Add the client to the channel -> list of clients hash table */
9664 de = dictFind(server.pubsub_channels,channel);
9665 if (de == NULL) {
9666 clients = listCreate();
9667 dictAdd(server.pubsub_channels,channel,clients);
9668 incrRefCount(channel);
9669 } else {
9670 clients = dictGetEntryVal(de);
9671 }
9672 listAddNodeTail(clients,c);
9673 }
9674 /* Notify the client */
9675 addReply(c,shared.mbulk3);
9676 addReply(c,shared.subscribebulk);
9677 addReplyBulk(c,channel);
9678 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9679 return retval;
9680 }
9681
9682 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9683 * 0 if the client was not subscribed to the specified channel. */
9684 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9685 struct dictEntry *de;
9686 list *clients;
9687 listNode *ln;
9688 int retval = 0;
9689
9690 /* Remove the channel from the client -> channels hash table */
9691 incrRefCount(channel); /* channel may be just a pointer to the same object
9692 we have in the hash tables. Protect it... */
9693 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9694 retval = 1;
9695 /* Remove the client from the channel -> clients list hash table */
9696 de = dictFind(server.pubsub_channels,channel);
9697 assert(de != NULL);
9698 clients = dictGetEntryVal(de);
9699 ln = listSearchKey(clients,c);
9700 assert(ln != NULL);
9701 listDelNode(clients,ln);
9702 if (listLength(clients) == 0) {
9703 /* Free the list and associated hash entry at all if this was
9704 * the latest client, so that it will be possible to abuse
9705 * Redis PUBSUB creating millions of channels. */
9706 dictDelete(server.pubsub_channels,channel);
9707 }
9708 }
9709 /* Notify the client */
9710 if (notify) {
9711 addReply(c,shared.mbulk3);
9712 addReply(c,shared.unsubscribebulk);
9713 addReplyBulk(c,channel);
9714 addReplyLong(c,dictSize(c->pubsub_channels)+
9715 listLength(c->pubsub_patterns));
9716
9717 }
9718 decrRefCount(channel); /* it is finally safe to release it */
9719 return retval;
9720 }
9721
9722 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9723 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9724 int retval = 0;
9725
9726 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9727 retval = 1;
9728 pubsubPattern *pat;
9729 listAddNodeTail(c->pubsub_patterns,pattern);
9730 incrRefCount(pattern);
9731 pat = zmalloc(sizeof(*pat));
9732 pat->pattern = getDecodedObject(pattern);
9733 pat->client = c;
9734 listAddNodeTail(server.pubsub_patterns,pat);
9735 }
9736 /* Notify the client */
9737 addReply(c,shared.mbulk3);
9738 addReply(c,shared.psubscribebulk);
9739 addReplyBulk(c,pattern);
9740 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9741 return retval;
9742 }
9743
9744 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9745 * 0 if the client was not subscribed to the specified channel. */
9746 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9747 listNode *ln;
9748 pubsubPattern pat;
9749 int retval = 0;
9750
9751 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9752 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9753 retval = 1;
9754 listDelNode(c->pubsub_patterns,ln);
9755 pat.client = c;
9756 pat.pattern = pattern;
9757 ln = listSearchKey(server.pubsub_patterns,&pat);
9758 listDelNode(server.pubsub_patterns,ln);
9759 }
9760 /* Notify the client */
9761 if (notify) {
9762 addReply(c,shared.mbulk3);
9763 addReply(c,shared.punsubscribebulk);
9764 addReplyBulk(c,pattern);
9765 addReplyLong(c,dictSize(c->pubsub_channels)+
9766 listLength(c->pubsub_patterns));
9767 }
9768 decrRefCount(pattern);
9769 return retval;
9770 }
9771
9772 /* Unsubscribe from all the channels. Return the number of channels the
9773 * client was subscribed from. */
9774 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9775 dictIterator *di = dictGetIterator(c->pubsub_channels);
9776 dictEntry *de;
9777 int count = 0;
9778
9779 while((de = dictNext(di)) != NULL) {
9780 robj *channel = dictGetEntryKey(de);
9781
9782 count += pubsubUnsubscribeChannel(c,channel,notify);
9783 }
9784 dictReleaseIterator(di);
9785 return count;
9786 }
9787
9788 /* Unsubscribe from all the patterns. Return the number of patterns the
9789 * client was subscribed from. */
9790 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9791 listNode *ln;
9792 listIter li;
9793 int count = 0;
9794
9795 listRewind(c->pubsub_patterns,&li);
9796 while ((ln = listNext(&li)) != NULL) {
9797 robj *pattern = ln->value;
9798
9799 count += pubsubUnsubscribePattern(c,pattern,notify);
9800 }
9801 return count;
9802 }
9803
9804 /* Publish a message */
9805 static int pubsubPublishMessage(robj *channel, robj *message) {
9806 int receivers = 0;
9807 struct dictEntry *de;
9808 listNode *ln;
9809 listIter li;
9810
9811 /* Send to clients listening for that channel */
9812 de = dictFind(server.pubsub_channels,channel);
9813 if (de) {
9814 list *list = dictGetEntryVal(de);
9815 listNode *ln;
9816 listIter li;
9817
9818 listRewind(list,&li);
9819 while ((ln = listNext(&li)) != NULL) {
9820 redisClient *c = ln->value;
9821
9822 addReply(c,shared.mbulk3);
9823 addReply(c,shared.messagebulk);
9824 addReplyBulk(c,channel);
9825 addReplyBulk(c,message);
9826 receivers++;
9827 }
9828 }
9829 /* Send to clients listening to matching channels */
9830 if (listLength(server.pubsub_patterns)) {
9831 listRewind(server.pubsub_patterns,&li);
9832 channel = getDecodedObject(channel);
9833 while ((ln = listNext(&li)) != NULL) {
9834 pubsubPattern *pat = ln->value;
9835
9836 if (stringmatchlen((char*)pat->pattern->ptr,
9837 sdslen(pat->pattern->ptr),
9838 (char*)channel->ptr,
9839 sdslen(channel->ptr),0)) {
9840 addReply(pat->client,shared.mbulk4);
9841 addReply(pat->client,shared.pmessagebulk);
9842 addReplyBulk(pat->client,pat->pattern);
9843 addReplyBulk(pat->client,channel);
9844 addReplyBulk(pat->client,message);
9845 receivers++;
9846 }
9847 }
9848 decrRefCount(channel);
9849 }
9850 return receivers;
9851 }
9852
9853 static void subscribeCommand(redisClient *c) {
9854 int j;
9855
9856 for (j = 1; j < c->argc; j++)
9857 pubsubSubscribeChannel(c,c->argv[j]);
9858 }
9859
9860 static void unsubscribeCommand(redisClient *c) {
9861 if (c->argc == 1) {
9862 pubsubUnsubscribeAllChannels(c,1);
9863 return;
9864 } else {
9865 int j;
9866
9867 for (j = 1; j < c->argc; j++)
9868 pubsubUnsubscribeChannel(c,c->argv[j],1);
9869 }
9870 }
9871
9872 static void psubscribeCommand(redisClient *c) {
9873 int j;
9874
9875 for (j = 1; j < c->argc; j++)
9876 pubsubSubscribePattern(c,c->argv[j]);
9877 }
9878
9879 static void punsubscribeCommand(redisClient *c) {
9880 if (c->argc == 1) {
9881 pubsubUnsubscribeAllPatterns(c,1);
9882 return;
9883 } else {
9884 int j;
9885
9886 for (j = 1; j < c->argc; j++)
9887 pubsubUnsubscribePattern(c,c->argv[j],1);
9888 }
9889 }
9890
9891 static void publishCommand(redisClient *c) {
9892 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9893 addReplyLong(c,receivers);
9894 }
9895
9896 /* ================================= Debugging ============================== */
9897
9898 static void debugCommand(redisClient *c) {
9899 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9900 *((char*)-1) = 'x';
9901 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9902 if (rdbSave(server.dbfilename) != REDIS_OK) {
9903 addReply(c,shared.err);
9904 return;
9905 }
9906 emptyDb();
9907 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9908 addReply(c,shared.err);
9909 return;
9910 }
9911 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9912 addReply(c,shared.ok);
9913 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9914 emptyDb();
9915 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9916 addReply(c,shared.err);
9917 return;
9918 }
9919 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9920 addReply(c,shared.ok);
9921 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9922 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9923 robj *key, *val;
9924
9925 if (!de) {
9926 addReply(c,shared.nokeyerr);
9927 return;
9928 }
9929 key = dictGetEntryKey(de);
9930 val = dictGetEntryVal(de);
9931 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9932 key->storage == REDIS_VM_SWAPPING)) {
9933 char *strenc;
9934 char buf[128];
9935
9936 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9937 strenc = strencoding[val->encoding];
9938 } else {
9939 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9940 strenc = buf;
9941 }
9942 addReplySds(c,sdscatprintf(sdsempty(),
9943 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9944 "encoding:%s serializedlength:%lld\r\n",
9945 (void*)key, key->refcount, (void*)val, val->refcount,
9946 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9947 } else {
9948 addReplySds(c,sdscatprintf(sdsempty(),
9949 "+Key at:%p refcount:%d, value swapped at: page %llu "
9950 "using %llu pages\r\n",
9951 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9952 (unsigned long long) key->vm.usedpages));
9953 }
9954 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9955 lookupKeyRead(c->db,c->argv[2]);
9956 addReply(c,shared.ok);
9957 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9958 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9959 robj *key, *val;
9960
9961 if (!server.vm_enabled) {
9962 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9963 return;
9964 }
9965 if (!de) {
9966 addReply(c,shared.nokeyerr);
9967 return;
9968 }
9969 key = dictGetEntryKey(de);
9970 val = dictGetEntryVal(de);
9971 /* If the key is shared we want to create a copy */
9972 if (key->refcount > 1) {
9973 robj *newkey = dupStringObject(key);
9974 decrRefCount(key);
9975 key = dictGetEntryKey(de) = newkey;
9976 }
9977 /* Swap it */
9978 if (key->storage != REDIS_VM_MEMORY) {
9979 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9980 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9981 dictGetEntryVal(de) = NULL;
9982 addReply(c,shared.ok);
9983 } else {
9984 addReply(c,shared.err);
9985 }
9986 } else {
9987 addReplySds(c,sdsnew(
9988 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9989 }
9990 }
9991
9992 static void _redisAssert(char *estr, char *file, int line) {
9993 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9994 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9995 #ifdef HAVE_BACKTRACE
9996 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9997 *((char*)-1) = 'x';
9998 #endif
9999 }
10000
10001 static void _redisPanic(char *msg, char *file, int line) {
10002 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10003 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10004 #ifdef HAVE_BACKTRACE
10005 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10006 *((char*)-1) = 'x';
10007 #endif
10008 }
10009
10010 /* =================================== Main! ================================ */
10011
10012 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its first line converted
 * with atoi(). Returns -1 if the file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;

    got = fgets(line,64,procfile);
    fclose(procfile);
    if (got == NULL) return -1;

    return atoi(line);
}
10026
10027 void linuxOvercommitMemoryWarning(void) {
10028 if (linuxOvercommitMemoryValue() == 0) {
10029 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10030 }
10031 }
10032 #endif /* __linux__ */
10033
10034 static void daemonize(void) {
10035 int fd;
10036 FILE *fp;
10037
10038 if (fork() != 0) exit(0); /* parent exits */
10039 setsid(); /* create a new session */
10040
10041 /* Every output goes to /dev/null. If Redis is daemonized but
10042 * the 'logfile' is set to 'stdout' in the configuration file
10043 * it will not log at all. */
10044 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10045 dup2(fd, STDIN_FILENO);
10046 dup2(fd, STDOUT_FILENO);
10047 dup2(fd, STDERR_FILENO);
10048 if (fd > STDERR_FILENO) close(fd);
10049 }
10050 /* Try to write the pid file */
10051 fp = fopen(server.pidfile,"w");
10052 if (fp) {
10053 fprintf(fp,"%d\n",getpid());
10054 fclose(fp);
10055 }
10056 }
10057
10058 static void version() {
10059 printf("Redis server version %s\n", REDIS_VERSION);
10060 exit(0);
10061 }
10062
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10068
10069 int main(int argc, char **argv) {
10070 time_t start;
10071
10072 initServerConfig();
10073 if (argc == 2) {
10074 if (strcmp(argv[1], "-v") == 0 ||
10075 strcmp(argv[1], "--version") == 0) version();
10076 if (strcmp(argv[1], "--help") == 0) usage();
10077 resetServerSaveParams();
10078 loadServerConfig(argv[1]);
10079 } else if ((argc > 2)) {
10080 usage();
10081 } else {
10082 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10083 }
10084 if (server.daemonize) daemonize();
10085 initServer();
10086 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10087 #ifdef __linux__
10088 linuxOvercommitMemoryWarning();
10089 #endif
10090 start = time(NULL);
10091 if (server.appendonly) {
10092 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10093 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10094 } else {
10095 if (rdbLoad(server.dbfilename) == REDIS_OK)
10096 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10097 }
10098 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10099 aeSetBeforeSleepProc(server.el,beforeSleep);
10100 aeMain(server.el);
10101 aeDeleteEventLoop(server.el);
10102 return 0;
10103 }
10104
10105 /* ============================= Backtrace support ========================= */
10106
10107 #ifdef HAVE_BACKTRACE
10108 static char *findFuncName(void *pointer, unsigned long *offset);
10109
/* Return the program counter saved in the signal context 'uc', or NULL on
 * platforms this function knows nothing about.
 *
 * On Linux x86 the general purpose registers are read by numeric index
 * rather than by name: the i386 name REG_EIP (index 14) does not exist on
 * x86-64, where the program counter is REG_RIP (index 16). */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__)
    return (void*) uc->uc_mcontext.gregs[14]; /* Linux 32 bit: REG_EIP */
#elif defined(__X86_64__) || defined(__x86_64__)
    return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 bit: REG_RIP */
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
#endif
}
10135
10136 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10137 void *trace[100];
10138 char **messages = NULL;
10139 int i, trace_size = 0;
10140 unsigned long offset=0;
10141 ucontext_t *uc = (ucontext_t*) secret;
10142 sds infostring;
10143 REDIS_NOTUSED(info);
10144
10145 redisLog(REDIS_WARNING,
10146 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10147 infostring = genRedisInfoString();
10148 redisLog(REDIS_WARNING, "%s",infostring);
10149 /* It's not safe to sdsfree() the returned string under memory
10150 * corruption conditions. Let it leak as we are going to abort */
10151
10152 trace_size = backtrace(trace, 100);
10153 /* overwrite sigaction with caller's address */
10154 if (getMcontextEip(uc) != NULL) {
10155 trace[1] = getMcontextEip(uc);
10156 }
10157 messages = backtrace_symbols(trace, trace_size);
10158
10159 for (i=1; i<trace_size; ++i) {
10160 char *fn = findFuncName(trace[i], &offset), *p;
10161
10162 p = strchr(messages[i],'+');
10163 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10164 redisLog(REDIS_WARNING,"%s", messages[i]);
10165 } else {
10166 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10167 }
10168 }
10169 /* free(messages); Don't call free() with possibly corrupted memory. */
10170 _exit(0);
10171 }
10172
10173 static void setupSigSegvAction(void) {
10174 struct sigaction act;
10175
10176 sigemptyset (&act.sa_mask);
10177 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10178 * is used. Otherwise, sa_handler is used */
10179 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10180 act.sa_sigaction = segvHandler;
10181 sigaction (SIGSEGV, &act, NULL);
10182 sigaction (SIGBUS, &act, NULL);
10183 sigaction (SIGFPE, &act, NULL);
10184 sigaction (SIGILL, &act, NULL);
10185 sigaction (SIGBUS, &act, NULL);
10186 return;
10187 }
10188
10189 #include "staticsymbols.h"
10190 /* This function try to convert a pointer into a function name. It's used in
10191 * oreder to provide a backtrace under segmentation fault that's able to
10192 * display functions declared as static (otherwise the backtrace is useless). */
10193 static char *findFuncName(void *pointer, unsigned long *offset){
10194 int i, ret = -1;
10195 unsigned long off, minoff = 0;
10196
10197 /* Try to match against the Symbol with the smallest offset */
10198 for (i=0; symsTable[i].pointer; i++) {
10199 unsigned long lp = (unsigned long) pointer;
10200
10201 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10202 off=lp-symsTable[i].pointer;
10203 if (ret < 0 || off < minoff) {
10204 minoff=off;
10205 ret=i;
10206 }
10207 }
10208 }
10209 if (ret == -1) return NULL;
10210 *offset = minoff;
10211 return symsTable[ret].name;
10212 }
10213 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no crash handler to install. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
10216 #endif /* HAVE_BACKTRACE */
10217
10218
10219
10220 /* The End */
10221
10222
10223