]> git.saurik.com Git - redis.git/blob - redis.c
redis-cli INFO output format is now raw again
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.10" /* Version string of this server source */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes: generic success / failure return values */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration (compile-time defaults and limits) */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (60*5 = 300; presumably seconds - confirm) */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64) /* 1024*64 = 65536 */
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MiB) */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags. The values are distinct bits, so they can be OR-ed together. */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short, these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kinds of objects, like Strings and Hashes, can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
130 /* Printable encoding names: entry i is the name of the REDIS_ENCODING_* constant whose value is i, so keep both in the same order. */
131 static char* strencoding[] = {
132 "raw", "int", "zipmap", "hashtable"
133 };
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. Storing 32 bit lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bit
148 * number specifies the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte; most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX /* NOTE(review): presumably the value returned when a length cannot be read - confirm */
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
166
167 /* Virtual memory: values for the object's 'storage' field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4) /* 4 MiB */
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags. The values are distinct bits, combined in redisClient.flags. */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of the master.
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it.
202 * These values continue the numbering of the slave-side states above (3..6). */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff: which end of a list an operation refers to */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels, in increasing order of severity */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro: evaluates V and discards it, to silence "unused" warnings */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines: possible values of the appendfsync setting */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults (cf. the hash_max_zipmap_* server fields) */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
238 /* We can print the stacktrace, so our assert is defined this way. redisAssert(_e) evaluates _e once and, when it is false, calls _redisAssert() with the stringified expression, file and line, then _exit(1). redisPanic(_e) unconditionally calls _redisPanic() with the stringified argument, file and line, then _exit(1). Both expand to one fully parenthesized expression, so they behave the same as a statement or inside a larger expression. */
239 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
240 #define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
241 static void _redisAssert(char *estr, char *file, int line);
242 static void _redisPanic(char *msg, char *file, int line);
243
244 /*================================= Data types ============================== */
245
246 /* A redis object, that is a type able to hold a string / list / set */
247
248 /* The VM object structure: swap-file bookkeeping embedded in every redisObject as its 'vm' member */
249 struct redisObjectVM {
250 off_t page; /* the page at which the object is stored on disk */
251 off_t usedpages; /* number of pages used on disk */
252 time_t atime; /* Last access time */
253 } vm; /* NOTE(review): this declarator also defines a file-scope variable named 'vm'; nothing in the visible code uses it - confirm whether it is intentional */
254
255 /* The actual Redis Object: a typed, reference-counted wrapper around a value pointer */
256 typedef struct redisObject {
257 void *ptr; /* the value itself; its interpretation depends on type and encoding */
258 unsigned char type; /* one of the REDIS_STRING ... REDIS_HASH object types */
259 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
260 unsigned char storage; /* If this object is a key, where is the value?
261 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
262 unsigned char vtype; /* If this object is a key, and value is swapped out,
263 * this is the type of the swapped out object. */
264 int refcount; /* reference count (see incrRefCount / decrRefCount) */
265 /* VM fields, these are only allocated if VM is active, otherwise the
266 * object allocation function will just allocate
267 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
268 * Redis without VM active will not have any overhead.
269 * For that reason this must stay the last member. */
269 struct redisObjectVM vm;
270 } robj;
271
272 /* Macro used to initialize a Redis string object allocated on the stack. _var is the robj variable itself (not a pointer) and _ptr is the value stored in its ptr field.
273 * Note that this macro is kept near the structure definition to make sure
274 * we'll update it when the structure is changed, to avoid bugs like
275 * bug #85 introduced exactly in this way. The expansion is a single do/while(0) statement with NO trailing semicolon (the caller writes it), so it is safe as the body of an if/else; both arguments are parenthesized. The storage field is only set when VM is enabled. */
276 #define initStaticStringObject(_var,_ptr) do { \
277 (_var).refcount = 1; \
278 (_var).type = REDIS_STRING; \
279 (_var).encoding = REDIS_ENCODING_RAW; \
280 (_var).ptr = (_ptr); \
281 if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
282 } while(0)
283 /* One Redis database: the keyspace plus the per-database side tables keyed by the same keys. */
284 typedef struct redisDb {
285 dict *dict; /* The keyspace for this DB */
286 dict *expires; /* Timeout of keys with a timeout set */
287 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
288 dict *io_keys; /* Keys with clients waiting for VM I/O */
289 int id; /* NOTE(review): presumably the database number - confirm */
290 } redisDb;
291
292 /* Client MULTI/EXEC state: one queued command (its arguments and the command table entry) */
293 typedef struct multiCmd {
294 robj **argv; /* argument vector of the queued command */
295 int argc; /* number of entries in argv */
296 struct redisCommand *cmd; /* command table entry for this command */
297 } multiCmd;
298 /* The commands queued by one client (stored in redisClient.mstate). */
299 typedef struct multiState {
300 multiCmd *commands; /* Array of MULTI commands */
301 int count; /* Total number of MULTI commands */
302 } multiState;
303
304 /* With multiplexing we need to keep per-client state.
305 * Clients are kept in a linked list. */
306 typedef struct redisClient {
307 int fd; /* NOTE(review): presumably the client socket descriptor - confirm */
308 redisDb *db;
309 int dictid;
310 sds querybuf;
311 robj **argv, **mbargv;
312 int argc, mbargc;
313 int bulklen; /* bulk read len. -1 if not in bulk read mode */
314 int multibulk; /* multi bulk command format active */
315 list *reply;
316 int sentlen;
317 time_t lastinteraction; /* time of the last interaction, used for timeout */
318 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
319 int slaveseldb; /* slave selected db, if this client is a slave */
320 int authenticated; /* when requirepass is non-NULL */
321 int replstate; /* replication state if this is a slave */
322 int repldbfd; /* replication DB file descriptor */
323 long repldboff; /* replication DB file offset (NOTE(review): long, whereas repldbsize is off_t) */
324 off_t repldbsize; /* replication DB file size */
325 multiState mstate; /* MULTI/EXEC state */
326 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
327 * operation such as BLPOP. Otherwise NULL. */
328 int blockingkeysnum; /* Number of blocking keys */
329 time_t blockingto; /* Blocking operation timeout. If UNIX current time
330 * is >= blockingto then the operation timed out. */
331 list *io_keys; /* Keys this client is waiting to be loaded from the
332 * swap file in order to continue. */
333 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
334 list *pubsub_patterns; /* patterns a client is interested in (presumably PSUBSCRIBE - confirm) */
335 } redisClient;
336 /* One entry of the server's saveparams array. NOTE(review): presumably "save when at least 'changes' changes happened within 'seconds' seconds" - confirm against the code that reads it. */
337 struct saveparam {
338 time_t seconds;
339 int changes;
340 };
341
342 /* Global server state structure. A single instance, 'server', is defined in the Globals section. */
343 struct redisServer {
344 int port;
345 int fd;
346 redisDb *db;
347 long long dirty; /* changes to DB from the last save */
348 list *clients;
349 list *slaves, *monitors;
350 char neterr[ANET_ERR_LEN];
351 aeEventLoop *el;
352 int cronloops; /* number of times the cron function run */
353 list *objfreelist; /* A list of freed objects to avoid malloc() */
354 time_t lastsave; /* Unix time of last successful save */
355 /* Fields used only for stats */
356 time_t stat_starttime; /* server start time */
357 long long stat_numcommands; /* number of processed commands */
358 long long stat_numconnections; /* number of connections received */
359 long long stat_expiredkeys; /* number of expired keys */
360 /* Configuration */
361 int verbosity;
362 int glueoutputbuf;
363 int maxidletime;
364 int dbnum;
365 int daemonize;
366 int appendonly;
367 int appendfsync; /* one of the APPENDFSYNC_* values */
368 time_t lastfsync;
369 int appendfd;
370 int appendseldb;
371 char *pidfile;
372 pid_t bgsavechildpid;
373 pid_t bgrewritechildpid;
374 sds bgrewritebuf; /* buffer kept by the parent during append only rewrite */
375 struct saveparam *saveparams;
376 int saveparamslen;
377 char *logfile;
378 char *bindaddr;
379 char *dbfilename;
380 char *appendfilename;
381 char *requirepass;
382 int rdbcompression;
383 int activerehashing;
384 /* Replication related */
385 int isslave;
386 char *masterauth;
387 char *masterhost;
388 int masterport;
389 redisClient *master; /* client that is master for this slave */
390 int replstate; /* one of the slave-side REDIS_REPL_* states */
391 unsigned int maxclients;
392 unsigned long long maxmemory;
393 unsigned int blpop_blocked_clients;
394 unsigned int vm_blocked_clients;
395 /* Sort parameters - qsort_r() is only available under BSD so we
396 * have to keep this state global, in order to pass it to sortCompare() */
397 int sort_desc;
398 int sort_alpha;
399 int sort_bypattern;
400 /* Virtual memory configuration */
401 int vm_enabled;
402 char *vm_swap_file;
403 off_t vm_page_size;
404 off_t vm_pages;
405 unsigned long long vm_max_memory;
406 /* Hashes config */
407 size_t hash_max_zipmap_entries;
408 size_t hash_max_zipmap_value;
409 /* Virtual memory state */
410 FILE *vm_fp;
411 int vm_fd;
412 off_t vm_next_page; /* Next probably empty page */
413 off_t vm_near_pages; /* Number of pages allocated sequentially */
414 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
415 time_t unixtime; /* Unix time sampled every second. */
416 /* Virtual memory I/O threads stuff */
417 /* An I/O thread processes an element taken from the io_newjobs queue and
418 * puts the result of the operation in the io_processed list. While the
419 * job is being processed, it's put on the io_processing queue. */
420 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
421 list *io_processing; /* List of VM I/O jobs being processed */
422 list *io_processed; /* List of VM I/O jobs already processed */
423 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
424 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
425 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
426 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
427 pthread_attr_t io_threads_attr; /* attributes for threads creation */
428 int io_active_threads; /* Number of running I/O threads */
429 int vm_max_threads; /* Max number of I/O threads running at the same time */
430 /* Our main thread is blocked on the event loop, looking for sockets ready
431 * to be read or written, so when a threaded I/O operation is ready to be
432 * processed by the main thread, the I/O thread will use a unix pipe to
433 * awake the main thread. The following are the two pipe FDs. */
434 int io_ready_pipe_read;
435 int io_ready_pipe_write;
436 /* Virtual memory stats */
437 unsigned long long vm_stats_used_pages;
438 unsigned long long vm_stats_swapped_objects;
439 unsigned long long vm_stats_swapouts;
440 unsigned long long vm_stats_swapins;
441 /* Pubsub */
442 dict *pubsub_channels; /* Map channels to list of subscribed clients */
443 list *pubsub_patterns; /* A list of pubsub_patterns */
444 /* Misc */
445 FILE *devnull;
446 };
447 /* Pairs a client with one pattern object; the pubsub_patterns lists are documented as holding these. */
448 typedef struct pubsubPattern {
449 redisClient *client;
450 robj *pattern;
451 } pubsubPattern;
452 /* Signature shared by every command implementation: it receives the client and returns nothing. One redisCommand describes one cmdTable entry; cmdTable uses positional initializers, so the member order below must not change. */
453 typedef void redisCommandProc(redisClient *c);
454 struct redisCommand {
455 char *name; /* command name as listed in cmdTable, e.g. "get" */
456 redisCommandProc *proc; /* implementation of the command */
457 int arity; /* NOTE(review): cmdTable also uses negative values (e.g. -2); presumably "at least N arguments" - confirm */
458 int flags; /* OR-ed REDIS_CMD_* flags */
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisCommandProc *vm_preload_proc;
463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey; /* The last argument that's a key */
466 int vm_keystep; /* The step between first and last key */
467 };
468 /* Associates a function name with an address stored as an integer. NOTE(review): presumably used to symbolize addresses in stack traces - confirm. */
469 struct redisFunctionSym {
470 char *name;
471 unsigned long pointer;
472 };
473 /* An element being sorted together with its sort key; the key is either a numeric score or an object to compare, sharing storage in the union. */
474 typedef struct _redisSortObject {
475 robj *obj;
476 union {
477 double score;
478 robj *cmpobj;
479 } u;
480 } redisSortObject;
481 /* A sort operation and the pattern it applies to. NOTE(review): 'type' is presumably one of the REDIS_SORT_* values - confirm. */
482 typedef struct _redisSortOperation {
483 int type;
484 robj *pattern;
485 } redisSortOperation;
486
487 /* ZSETs use a specialized version of Skiplists */
488 /* A skiplist node: 'forward' and 'span' are pointers (presumably per-level arrays - confirm), plus a single backward link, the score and the stored object. */
489 typedef struct zskiplistNode {
490 struct zskiplistNode **forward;
491 struct zskiplistNode *backward;
492 unsigned int *span;
493 double score;
494 robj *obj;
495 } zskiplistNode;
496 /* The skiplist itself: pointers to the header and tail nodes, the element count and the current level. */
497 typedef struct zskiplist {
498 struct zskiplistNode *header, *tail;
499 unsigned long length;
500 int level; /* NOTE(review): presumably bounded by ZSKIPLIST_MAXLEVEL - confirm */
501 } zskiplist;
502 /* A sorted set is backed by both a hash table and a skiplist. */
503 typedef struct zset {
504 dict *dict;
505 zskiplist *zsl;
506 } zset;
507
508 /* Our shared "common" objects */
509 /* NOTE(review): presumably created once at startup and reused instead of allocating a new object each time - confirm against the initialization code. */
510 #define REDIS_SHARED_INTEGERS 10000 /* number of entries in shared.integers[] */
511 struct sharedObjectsStruct {
512 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
513 *colon, *nullbulk, *nullmultibulk, *queued,
514 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
515 *outofrangeerr, *plus,
516 *select0, *select1, *select2, *select3, *select4,
517 *select5, *select6, *select7, *select8, *select9,
518 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
519 *mbulk4, *psubscribebulk, *punsubscribebulk,
520 *integers[REDIS_SHARED_INTEGERS];
521 } shared; /* the single file-scope instance */
522
523 /* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
526
527 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
528
529 /* VM threaded I/O request message: the job types, then the job record passed between the main thread and the I/O threads */
530 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
533 typedef struct iojob {
534 int type; /* Request type, REDIS_IOJOB_* */
535 redisDb *db;/* Redis database */
536 robj *key; /* This I/O request is about swapping this key */
537 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
538 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
539 off_t page; /* Swap page where to read/write the object */
540 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
541 int canceled; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread; /* ID of the thread processing this entry */
543 } iojob;
544
545 /*================================ Prototypes =============================== */
546 /* Forward declarations of the file-local (static) helper functions. */
547 static void freeStringObject(robj *o);
548 static void freeListObject(robj *o);
549 static void freeSetObject(robj *o);
550 static void decrRefCount(void *o); /* takes void*, unlike incrRefCount(robj*) below */
551 static robj *createObject(int type, void *ptr);
552 static void freeClient(redisClient *c);
553 static int rdbLoad(char *filename);
554 static void addReply(redisClient *c, robj *obj);
555 static void addReplySds(redisClient *c, sds s);
556 static void incrRefCount(robj *o);
557 static int rdbSaveBackground(char *filename);
558 static robj *createStringObject(char *ptr, size_t len);
559 static robj *dupStringObject(robj *o);
560 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
561 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
562 static int syncWithMaster(void);
563 static robj *tryObjectEncoding(robj *o);
564 static robj *getDecodedObject(robj *o);
565 static int removeExpire(redisDb *db, robj *key);
566 static int expireIfNeeded(redisDb *db, robj *key);
567 static int deleteIfVolatile(redisDb *db, robj *key);
568 static int deleteIfSwapped(redisDb *db, robj *key);
569 static int deleteKey(redisDb *db, robj *key);
570 static time_t getExpire(redisDb *db, robj *key);
571 static int setExpire(redisDb *db, robj *key, time_t when);
572 static void updateSlavesWaitingBgsave(int bgsaveerr);
573 static void freeMemoryIfNeeded(void);
574 static int processCommand(redisClient *c);
575 static void setupSigSegvAction(void);
576 static void rdbRemoveTempFile(pid_t childpid);
577 static void aofRemoveTempFile(pid_t childpid);
578 static size_t stringObjectLen(robj *o);
579 static void processInputBuffer(redisClient *c);
580 static zskiplist *zslCreate(void);
581 static void zslFree(zskiplist *zsl);
582 static void zslInsert(zskiplist *zsl, double score, robj *obj);
583 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void initClientMultiState(redisClient *c);
585 static void freeClientMultiState(redisClient *c);
586 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
587 static void unblockClientWaitingData(redisClient *c);
588 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
589 static void vmInit(void);
590 static void vmMarkPagesFree(off_t page, off_t count);
591 static robj *vmLoadObject(robj *key);
592 static robj *vmPreviewObject(robj *key);
593 static int vmSwapOneObjectBlocking(void);
594 static int vmSwapOneObjectThreaded(void);
595 static int vmCanSwapOut(void);
596 static int tryFreeOneObjectFromFreelist(void);
597 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
598 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
599 static void vmCancelThreadedIOJob(robj *o);
600 static void lockThreadedIO(void);
601 static void unlockThreadedIO(void);
602 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
603 static void freeIOJob(iojob *j);
604 static void queueIOJob(iojob *j);
605 static int vmWriteObjectOnSwap(robj *o, off_t page);
606 static robj *vmReadObjectFromSwap(off_t page, int type);
607 static void waitEmptyIOJobsQueue(void);
608 static void vmReopenSwapFile(void);
609 static int vmFreePage(off_t page);
610 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
611 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
612 static int dontWaitForSwappedKey(redisClient *c, robj *key);
613 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
614 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
615 static struct redisCommand *lookupCommand(char *name);
616 static void call(redisClient *c, struct redisCommand *cmd);
617 static void resetClient(redisClient *c);
618 static void convertToRealHash(robj *o);
619 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
620 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
621 static void freePubsubPattern(void *p);
622 static int listMatchPubsubPattern(void *a, void *b);
623 static int compareStringObjects(robj *a, robj *b);
624 static void usage(); /* NOTE(review): empty parentheses declare no prototype in C; the other parameterless functions here use (void) */
625 static int rewriteAppendOnlyFileBackground(void);
626 /* Forward declarations of the command implementations. Each one matches the redisCommandProc signature (it receives the client) so it can be stored in cmdTable. */
627 static void authCommand(redisClient *c);
628 static void pingCommand(redisClient *c);
629 static void echoCommand(redisClient *c);
630 static void setCommand(redisClient *c);
631 static void setnxCommand(redisClient *c);
632 static void setexCommand(redisClient *c);
633 static void getCommand(redisClient *c);
634 static void delCommand(redisClient *c);
635 static void existsCommand(redisClient *c);
636 static void incrCommand(redisClient *c);
637 static void decrCommand(redisClient *c);
638 static void incrbyCommand(redisClient *c);
639 static void decrbyCommand(redisClient *c);
640 static void selectCommand(redisClient *c);
641 static void randomkeyCommand(redisClient *c);
642 static void keysCommand(redisClient *c);
643 static void dbsizeCommand(redisClient *c);
644 static void lastsaveCommand(redisClient *c);
645 static void saveCommand(redisClient *c);
646 static void bgsaveCommand(redisClient *c);
647 static void bgrewriteaofCommand(redisClient *c);
648 static void shutdownCommand(redisClient *c);
649 static void moveCommand(redisClient *c);
650 static void renameCommand(redisClient *c);
651 static void renamenxCommand(redisClient *c);
652 static void lpushCommand(redisClient *c);
653 static void rpushCommand(redisClient *c);
654 static void lpopCommand(redisClient *c);
655 static void rpopCommand(redisClient *c);
656 static void llenCommand(redisClient *c);
657 static void lindexCommand(redisClient *c);
658 static void lrangeCommand(redisClient *c);
659 static void ltrimCommand(redisClient *c);
660 static void typeCommand(redisClient *c);
661 static void lsetCommand(redisClient *c);
662 static void saddCommand(redisClient *c);
663 static void sremCommand(redisClient *c);
664 static void smoveCommand(redisClient *c);
665 static void sismemberCommand(redisClient *c);
666 static void scardCommand(redisClient *c);
667 static void spopCommand(redisClient *c);
668 static void srandmemberCommand(redisClient *c);
669 static void sinterCommand(redisClient *c); /* cmdTable also registers this function for "smembers" */
670 static void sinterstoreCommand(redisClient *c);
671 static void sunionCommand(redisClient *c);
672 static void sunionstoreCommand(redisClient *c);
673 static void sdiffCommand(redisClient *c);
674 static void sdiffstoreCommand(redisClient *c);
675 static void syncCommand(redisClient *c);
676 static void flushdbCommand(redisClient *c);
677 static void flushallCommand(redisClient *c);
678 static void sortCommand(redisClient *c);
679 static void lremCommand(redisClient *c);
680 static void rpoplpushcommand(redisClient *c); /* spelled with a lowercase 'c' in "command", unlike its siblings; cmdTable uses this exact spelling */
681 static void infoCommand(redisClient *c);
682 static void mgetCommand(redisClient *c);
683 static void monitorCommand(redisClient *c);
684 static void expireCommand(redisClient *c);
685 static void expireatCommand(redisClient *c);
686 static void getsetCommand(redisClient *c);
687 static void ttlCommand(redisClient *c);
688 static void slaveofCommand(redisClient *c);
689 static void debugCommand(redisClient *c);
690 static void msetCommand(redisClient *c);
691 static void msetnxCommand(redisClient *c);
692 static void zaddCommand(redisClient *c);
693 static void zincrbyCommand(redisClient *c);
694 static void zrangeCommand(redisClient *c);
695 static void zrangebyscoreCommand(redisClient *c);
696 static void zcountCommand(redisClient *c);
697 static void zrevrangeCommand(redisClient *c);
698 static void zcardCommand(redisClient *c);
699 static void zremCommand(redisClient *c);
700 static void zscoreCommand(redisClient *c);
701 static void zremrangebyscoreCommand(redisClient *c);
702 static void multiCommand(redisClient *c);
703 static void execCommand(redisClient *c);
704 static void discardCommand(redisClient *c);
705 static void blpopCommand(redisClient *c);
706 static void brpopCommand(redisClient *c);
707 static void appendCommand(redisClient *c);
708 static void substrCommand(redisClient *c);
709 static void zrankCommand(redisClient *c);
710 static void zrevrankCommand(redisClient *c);
711 static void hsetCommand(redisClient *c);
712 static void hsetnxCommand(redisClient *c);
713 static void hgetCommand(redisClient *c);
714 static void hmsetCommand(redisClient *c);
715 static void hmgetCommand(redisClient *c);
716 static void hdelCommand(redisClient *c);
717 static void hlenCommand(redisClient *c);
718 static void zremrangebyrankCommand(redisClient *c);
719 static void zunionCommand(redisClient *c);
720 static void zinterCommand(redisClient *c);
721 static void hkeysCommand(redisClient *c);
722 static void hvalsCommand(redisClient *c);
723 static void hgetallCommand(redisClient *c);
724 static void hexistsCommand(redisClient *c);
725 static void configCommand(redisClient *c);
726 static void hincrbyCommand(redisClient *c);
727 static void subscribeCommand(redisClient *c);
728 static void unsubscribeCommand(redisClient *c);
729 static void psubscribeCommand(redisClient *c);
730 static void punsubscribeCommand(redisClient *c);
731 static void publishCommand(redisClient *c);
732
733 /*================================= Globals ================================= */
734
735 /* Global vars */
736 static struct redisServer server; /* server global state; file-local, read directly by macros such as initStaticStringObject */
737 static struct redisCommand cmdTable[] = {
738 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
739 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
740 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
741 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
742 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
745 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
748 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
749 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
758 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
761 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
762 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
763 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
764 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
765 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
766 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
770 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
771 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
772 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
773 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
774 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
775 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
777 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
778 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
779 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
782 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
783 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
791 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
794 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
796 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
798 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
803 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
804 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
805 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
806 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
807 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
808 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
820 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
826 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
828 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
833 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
839 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
844 {NULL,NULL,0,0,NULL,0,0,0}
845 };
846
847 /*============================ Utility functions ============================ */
848
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax: '*' (any run of characters, possibly empty), '?' (any
 * single character), '[...]' character classes with '^' negation, 'a-z'
 * ranges and backslash escapes, and '\' to take the next pattern character
 * literally. When 'nocase' is non-zero, comparisons are case-insensitive.
 *
 * Returns 1 if the first 'stringLen' bytes of 'string' match the first
 * 'patternLen' bytes of 'pattern', 0 otherwise.
 *
 * Neither buffer is read past its stated length, and a pattern element
 * that needs to consume a character never matches once the string is
 * exhausted. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', looking only inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes exactly one character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Trailing stars match the empty remainder of the string. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
971
/* Convenience wrapper around stringmatchlen() for NUL-terminated C strings:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
975
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional,
 * case-insensitive unit: "b" (or nothing) = 1, "k" = 1000, "kb" = 1024,
 * "m" = 1000^2, "mb" = 1024^2, "g" = 1000^3, "gb" = 1024^3.
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
 * otherwise. Errors are:
 *   - an unknown unit: the numeric part is still returned, unscaled;
 *   - a numeric part too long for the internal buffer: LLONG_MAX is returned;
 *   - a result that does not fit in a long long: LLONG_MAX or LLONG_MIN
 *     is returned depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* The cast keeps isdigit() defined for bytes with the high bit set. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* 'mul' is always positive: refuse products that would overflow
     * instead of relying on undefined signed overflow. */
    if (val > 0 && val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < 0 && val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1022
1023 static void redisLog(int level, const char *fmt, ...) {
1024 va_list ap;
1025 FILE *fp;
1026
1027 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1028 if (!fp) return;
1029
1030 va_start(ap, fmt);
1031 if (level >= server.verbosity) {
1032 char *c = ".-*#";
1033 char buf[64];
1034 time_t now;
1035
1036 now = time(NULL);
1037 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1038 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1039 vfprintf(fp, fmt, ap);
1040 fprintf(fp,"\n");
1041 fflush(fp);
1042 }
1043 va_end(ap);
1044
1045 if (server.logfile) fclose(fp);
1046 }
1047
1048 /*====================== Hash table type implementation ==================== */
1049
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1053
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1059
1060 static void dictListDestructor(void *privdata, void *val)
1061 {
1062 DICT_NOTUSED(privdata);
1063 listRelease((list*)val);
1064 }
1065
1066 static int sdsDictKeyCompare(void *privdata, const void *key1,
1067 const void *key2)
1068 {
1069 int l1,l2;
1070 DICT_NOTUSED(privdata);
1071
1072 l1 = sdslen((sds)key1);
1073 l2 = sdslen((sds)key2);
1074 if (l1 != l2) return 0;
1075 return memcmp(key1, key2, l1) == 0;
1076 }
1077
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since values of swapped
 * out keys are set to NULL. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1085
1086 static int dictObjKeyCompare(void *privdata, const void *key1,
1087 const void *key2)
1088 {
1089 const robj *o1 = key1, *o2 = key2;
1090 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1091 }
1092
1093 static unsigned int dictObjHash(const void *key) {
1094 const robj *o = key;
1095 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1096 }
1097
1098 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1099 const void *key2)
1100 {
1101 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1102 int cmp;
1103
1104 if (o1->encoding == REDIS_ENCODING_INT &&
1105 o2->encoding == REDIS_ENCODING_INT &&
1106 o1->ptr == o2->ptr) return 1;
1107
1108 o1 = getDecodedObject(o1);
1109 o2 = getDecodedObject(o2);
1110 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1111 decrRefCount(o1);
1112 decrRefCount(o2);
1113 return cmp;
1114 }
1115
1116 static unsigned int dictEncObjHash(const void *key) {
1117 robj *o = (robj*) key;
1118
1119 if (o->encoding == REDIS_ENCODING_RAW) {
1120 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1121 } else {
1122 if (o->encoding == REDIS_ENCODING_INT) {
1123 char buf[32];
1124 int len;
1125
1126 len = snprintf(buf,32,"%ld",(long)o->ptr);
1127 return dictGenHashFunction((unsigned char*)buf, len);
1128 } else {
1129 unsigned int hash;
1130
1131 o = getDecodedObject(o);
1132 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1133 decrRefCount(o);
1134 return hash;
1135 }
1136 }
1137 }
1138
1139 /* Sets type and expires */
1140 static dictType setDictType = {
1141 dictEncObjHash, /* hash function */
1142 NULL, /* key dup */
1143 NULL, /* val dup */
1144 dictEncObjKeyCompare, /* key compare */
1145 dictRedisObjectDestructor, /* key destructor */
1146 NULL /* val destructor */
1147 };
1148
1149 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1150 static dictType zsetDictType = {
1151 dictEncObjHash, /* hash function */
1152 NULL, /* key dup */
1153 NULL, /* val dup */
1154 dictEncObjKeyCompare, /* key compare */
1155 dictRedisObjectDestructor, /* key destructor */
1156 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1157 };
1158
1159 /* Db->dict */
1160 static dictType dbDictType = {
1161 dictObjHash, /* hash function */
1162 NULL, /* key dup */
1163 NULL, /* val dup */
1164 dictObjKeyCompare, /* key compare */
1165 dictRedisObjectDestructor, /* key destructor */
1166 dictRedisObjectDestructor /* val destructor */
1167 };
1168
1169 /* Db->expires */
1170 static dictType keyptrDictType = {
1171 dictObjHash, /* hash function */
1172 NULL, /* key dup */
1173 NULL, /* val dup */
1174 dictObjKeyCompare, /* key compare */
1175 dictRedisObjectDestructor, /* key destructor */
1176 NULL /* val destructor */
1177 };
1178
1179 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1180 static dictType hashDictType = {
1181 dictEncObjHash, /* hash function */
1182 NULL, /* key dup */
1183 NULL, /* val dup */
1184 dictEncObjKeyCompare, /* key compare */
1185 dictRedisObjectDestructor, /* key destructor */
1186 dictRedisObjectDestructor /* val destructor */
1187 };
1188
1189 /* Keylist hash table type has unencoded redis objects as keys and
1190 * lists as values. It's used for blocking operations (BLPOP) and to
1191 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1192 static dictType keylistDictType = {
1193 dictObjHash, /* hash function */
1194 NULL, /* key dup */
1195 NULL, /* val dup */
1196 dictObjKeyCompare, /* key compare */
1197 dictRedisObjectDestructor, /* key destructor */
1198 dictListDestructor /* val destructor */
1199 };
1200
1201 static void version();
1202
1203 /* ========================= Random utility functions ======================= */
1204
1205 /* Redis generally does not try to recover from out of memory conditions
1206 * when allocating objects or strings, it is not clear if it will be possible
1207 * to report this condition to the client since the networking layer itself
1208 * is based on heap allocation for send buffers, so we simply abort.
1209 * At least the code will be simpler to read... */
1210 static void oom(const char *msg) {
1211 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1212 sleep(1);
1213 abort();
1214 }
1215
1216 /* ====================== Redis server networking stuff ===================== */
1217 static void closeTimedoutClients(void) {
1218 redisClient *c;
1219 listNode *ln;
1220 time_t now = time(NULL);
1221 listIter li;
1222
1223 listRewind(server.clients,&li);
1224 while ((ln = listNext(&li)) != NULL) {
1225 c = listNodeValue(ln);
1226 if (server.maxidletime &&
1227 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1228 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1229 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1230 listLength(c->pubsub_patterns) == 0 &&
1231 (now - c->lastinteraction > server.maxidletime))
1232 {
1233 redisLog(REDIS_VERBOSE,"Closing idle client");
1234 freeClient(c);
1235 } else if (c->flags & REDIS_BLOCKED) {
1236 if (c->blockingto != 0 && c->blockingto < now) {
1237 addReply(c,shared.nullmultibulk);
1238 unblockClientWaitingData(c);
1239 }
1240 }
1241 }
1242 }
1243
1244 static int htNeedsResize(dict *dict) {
1245 long long size, used;
1246
1247 size = dictSlots(dict);
1248 used = dictSize(dict);
1249 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1250 (used*100/size < REDIS_HT_MINFILL));
1251 }
1252
1253 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1254 * we resize the hash table to save memory */
1255 static void tryResizeHashTables(void) {
1256 int j;
1257
1258 for (j = 0; j < server.dbnum; j++) {
1259 if (htNeedsResize(server.db[j].dict))
1260 dictResize(server.db[j].dict);
1261 if (htNeedsResize(server.db[j].expires))
1262 dictResize(server.db[j].expires);
1263 }
1264 }
1265
1266 /* Our hash table implementation performs rehashing incrementally while
1267 * we write/read from the hash table. Still if the server is idle, the hash
1268 * table will use two tables for a long time. So we try to use 1 millisecond
1269 * of CPU time at every serverCron() loop in order to rehash some key. */
1270 static void incrementallyRehash(void) {
1271 int j;
1272
1273 for (j = 0; j < server.dbnum; j++) {
1274 if (dictIsRehashing(server.db[j].dict)) {
1275 dictRehashMilliseconds(server.db[j].dict,1);
1276 break; /* already used our millisecond for this loop... */
1277 }
1278 }
1279 }
1280
1281 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1282 void backgroundSaveDoneHandler(int statloc) {
1283 int exitcode = WEXITSTATUS(statloc);
1284 int bysignal = WIFSIGNALED(statloc);
1285
1286 if (!bysignal && exitcode == 0) {
1287 redisLog(REDIS_NOTICE,
1288 "Background saving terminated with success");
1289 server.dirty = 0;
1290 server.lastsave = time(NULL);
1291 } else if (!bysignal && exitcode != 0) {
1292 redisLog(REDIS_WARNING, "Background saving error");
1293 } else {
1294 redisLog(REDIS_WARNING,
1295 "Background saving terminated by signal %d", WTERMSIG(statloc));
1296 rdbRemoveTempFile(server.bgsavechildpid);
1297 }
1298 server.bgsavechildpid = -1;
1299 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1300 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1301 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1302 }
1303
1304 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1305 * Handle this. */
1306 void backgroundRewriteDoneHandler(int statloc) {
1307 int exitcode = WEXITSTATUS(statloc);
1308 int bysignal = WIFSIGNALED(statloc);
1309
1310 if (!bysignal && exitcode == 0) {
1311 int fd;
1312 char tmpfile[256];
1313
1314 redisLog(REDIS_NOTICE,
1315 "Background append only file rewriting terminated with success");
1316 /* Now it's time to flush the differences accumulated by the parent */
1317 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1318 fd = open(tmpfile,O_WRONLY|O_APPEND);
1319 if (fd == -1) {
1320 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1321 goto cleanup;
1322 }
1323 /* Flush our data... */
1324 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1325 (signed) sdslen(server.bgrewritebuf)) {
1326 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1327 close(fd);
1328 goto cleanup;
1329 }
1330 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1331 /* Now our work is to rename the temp file into the stable file. And
1332 * switch the file descriptor used by the server for append only. */
1333 if (rename(tmpfile,server.appendfilename) == -1) {
1334 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1335 close(fd);
1336 goto cleanup;
1337 }
1338 /* Mission completed... almost */
1339 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1340 if (server.appendfd != -1) {
1341 /* If append only is actually enabled... */
1342 close(server.appendfd);
1343 server.appendfd = fd;
1344 fsync(fd);
1345 server.appendseldb = -1; /* Make sure it will issue SELECT */
1346 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1347 } else {
1348 /* If append only is disabled we just generate a dump in this
1349 * format. Why not? */
1350 close(fd);
1351 }
1352 } else if (!bysignal && exitcode != 0) {
1353 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1354 } else {
1355 redisLog(REDIS_WARNING,
1356 "Background append only file rewriting terminated by signal %d",
1357 WTERMSIG(statloc));
1358 }
1359 cleanup:
1360 sdsfree(server.bgrewritebuf);
1361 server.bgrewritebuf = sdsempty();
1362 aofRemoveTempFile(server.bgrewritechildpid);
1363 server.bgrewritechildpid = -1;
1364 }
1365
1366 /* This function is called once a background process of some kind terminates,
1367 * as we want to avoid resizing the hash tables when there is a child in order
1368 * to play well with copy-on-write (otherwise when a resize happens lots of
1369 * memory pages are copied). The goal of this function is to update the ability
1370 * for dict.c to resize the hash tables accordingly to the fact we have o not
1371 * running childs. */
1372 static void updateDictResizePolicy(void) {
1373 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1374 dictEnableResize();
1375 else
1376 dictDisableResize();
1377 }
1378
1379 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1380 int j, loops = server.cronloops++;
1381 REDIS_NOTUSED(eventLoop);
1382 REDIS_NOTUSED(id);
1383 REDIS_NOTUSED(clientData);
1384
1385 /* We take a cached value of the unix time in the global state because
1386 * with virtual memory and aging there is to store the current time
1387 * in objects at every object access, and accuracy is not needed.
1388 * To access a global var is faster than calling time(NULL) */
1389 server.unixtime = time(NULL);
1390
1391 /* Show some info about non-empty databases */
1392 for (j = 0; j < server.dbnum; j++) {
1393 long long size, used, vkeys;
1394
1395 size = dictSlots(server.db[j].dict);
1396 used = dictSize(server.db[j].dict);
1397 vkeys = dictSize(server.db[j].expires);
1398 if (!(loops % 50) && (used || vkeys)) {
1399 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1400 /* dictPrintStats(server.dict); */
1401 }
1402 }
1403
1404 /* We don't want to resize the hash tables while a bacground saving
1405 * is in progress: the saving child is created using fork() that is
1406 * implemented with a copy-on-write semantic in most modern systems, so
1407 * if we resize the HT while there is the saving child at work actually
1408 * a lot of memory movements in the parent will cause a lot of pages
1409 * copied. */
1410 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1411 if (!(loops % 10)) tryResizeHashTables();
1412 if (server.activerehashing) incrementallyRehash();
1413 }
1414
1415 /* Show information about connected clients */
1416 if (!(loops % 50)) {
1417 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1418 listLength(server.clients)-listLength(server.slaves),
1419 listLength(server.slaves),
1420 zmalloc_used_memory());
1421 }
1422
1423 /* Close connections of timedout clients */
1424 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1425 closeTimedoutClients();
1426
1427 /* Check if a background saving or AOF rewrite in progress terminated */
1428 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1429 int statloc;
1430 pid_t pid;
1431
1432 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1433 if (pid == server.bgsavechildpid) {
1434 backgroundSaveDoneHandler(statloc);
1435 } else {
1436 backgroundRewriteDoneHandler(statloc);
1437 }
1438 updateDictResizePolicy();
1439 }
1440 } else {
1441 /* If there is not a background saving in progress check if
1442 * we have to save now */
1443 time_t now = time(NULL);
1444 for (j = 0; j < server.saveparamslen; j++) {
1445 struct saveparam *sp = server.saveparams+j;
1446
1447 if (server.dirty >= sp->changes &&
1448 now-server.lastsave > sp->seconds) {
1449 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1450 sp->changes, sp->seconds);
1451 rdbSaveBackground(server.dbfilename);
1452 break;
1453 }
1454 }
1455 }
1456
1457 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1458 * will use few CPU cycles if there are few expiring keys, otherwise
1459 * it will get more aggressive to avoid that too much memory is used by
1460 * keys that can be removed from the keyspace. */
1461 for (j = 0; j < server.dbnum; j++) {
1462 int expired;
1463 redisDb *db = server.db+j;
1464
1465 /* Continue to expire if at the end of the cycle more than 25%
1466 * of the keys were expired. */
1467 do {
1468 long num = dictSize(db->expires);
1469 time_t now = time(NULL);
1470
1471 expired = 0;
1472 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1473 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1474 while (num--) {
1475 dictEntry *de;
1476 time_t t;
1477
1478 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1479 t = (time_t) dictGetEntryVal(de);
1480 if (now > t) {
1481 deleteKey(db,dictGetEntryKey(de));
1482 expired++;
1483 server.stat_expiredkeys++;
1484 }
1485 }
1486 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1487 }
1488
1489 /* Swap a few keys on disk if we are over the memory limit and VM
1490 * is enbled. Try to free objects from the free list first. */
1491 if (vmCanSwapOut()) {
1492 while (server.vm_enabled && zmalloc_used_memory() >
1493 server.vm_max_memory)
1494 {
1495 int retval;
1496
1497 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1498 retval = (server.vm_max_threads == 0) ?
1499 vmSwapOneObjectBlocking() :
1500 vmSwapOneObjectThreaded();
1501 if (retval == REDIS_ERR && !(loops % 300) &&
1502 zmalloc_used_memory() >
1503 (server.vm_max_memory+server.vm_max_memory/10))
1504 {
1505 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1506 }
1507 /* Note that when using threade I/O we free just one object,
1508 * because anyway when the I/O thread in charge to swap this
1509 * object out will finish, the handler of completed jobs
1510 * will try to swap more objects if we are still out of memory. */
1511 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1512 }
1513 }
1514
1515 /* Check if we should connect to a MASTER */
1516 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1517 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1518 if (syncWithMaster() == REDIS_OK) {
1519 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1520 if (server.appendonly) rewriteAppendOnlyFileBackground();
1521 }
1522 }
1523 return 100;
1524 }
1525
1526 /* This function gets called every time Redis is entering the
1527 * main loop of the event driven library, that is, before to sleep
1528 * for ready file descriptors. */
1529 static void beforeSleep(struct aeEventLoop *eventLoop) {
1530 REDIS_NOTUSED(eventLoop);
1531
1532 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1533 listIter li;
1534 listNode *ln;
1535
1536 listRewind(server.io_ready_clients,&li);
1537 while((ln = listNext(&li))) {
1538 redisClient *c = ln->value;
1539 struct redisCommand *cmd;
1540
1541 /* Resume the client. */
1542 listDelNode(server.io_ready_clients,ln);
1543 c->flags &= (~REDIS_IO_WAIT);
1544 server.vm_blocked_clients--;
1545 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1546 readQueryFromClient, c);
1547 cmd = lookupCommand(c->argv[0]->ptr);
1548 assert(cmd != NULL);
1549 call(c,cmd);
1550 resetClient(c);
1551 /* There may be more data to process in the input buffer. */
1552 if (c->querybuf && sdslen(c->querybuf) > 0)
1553 processInputBuffer(c);
1554 }
1555 }
1556 }
1557
1558 static void createSharedObjects(void) {
1559 int j;
1560
1561 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1562 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1563 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1564 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1565 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1566 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1567 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1568 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1569 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1570 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1571 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1572 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1573 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1574 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1575 "-ERR no such key\r\n"));
1576 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1577 "-ERR syntax error\r\n"));
1578 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1579 "-ERR source and destination objects are the same\r\n"));
1580 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1581 "-ERR index out of range\r\n"));
1582 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1583 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1584 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1585 shared.select0 = createStringObject("select 0\r\n",10);
1586 shared.select1 = createStringObject("select 1\r\n",10);
1587 shared.select2 = createStringObject("select 2\r\n",10);
1588 shared.select3 = createStringObject("select 3\r\n",10);
1589 shared.select4 = createStringObject("select 4\r\n",10);
1590 shared.select5 = createStringObject("select 5\r\n",10);
1591 shared.select6 = createStringObject("select 6\r\n",10);
1592 shared.select7 = createStringObject("select 7\r\n",10);
1593 shared.select8 = createStringObject("select 8\r\n",10);
1594 shared.select9 = createStringObject("select 9\r\n",10);
1595 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1596 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1597 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1598 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1599 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1600 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1601 shared.mbulk3 = createStringObject("*3\r\n",4);
1602 shared.mbulk4 = createStringObject("*4\r\n",4);
1603 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1604 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1605 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1606 }
1607 }
1608
1609 static void appendServerSaveParams(time_t seconds, int changes) {
1610 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1611 server.saveparams[server.saveparamslen].seconds = seconds;
1612 server.saveparams[server.saveparamslen].changes = changes;
1613 server.saveparamslen++;
1614 }
1615
1616 static void resetServerSaveParams() {
1617 zfree(server.saveparams);
1618 server.saveparams = NULL;
1619 server.saveparamslen = 0;
1620 }
1621
1622 static void initServerConfig() {
1623 server.dbnum = REDIS_DEFAULT_DBNUM;
1624 server.port = REDIS_SERVERPORT;
1625 server.verbosity = REDIS_VERBOSE;
1626 server.maxidletime = REDIS_MAXIDLETIME;
1627 server.saveparams = NULL;
1628 server.logfile = NULL; /* NULL = log on standard output */
1629 server.bindaddr = NULL;
1630 server.glueoutputbuf = 1;
1631 server.daemonize = 0;
1632 server.appendonly = 0;
1633 server.appendfsync = APPENDFSYNC_ALWAYS;
1634 server.lastfsync = time(NULL);
1635 server.appendfd = -1;
1636 server.appendseldb = -1; /* Make sure the first time will not match */
1637 server.pidfile = zstrdup("/var/run/redis.pid");
1638 server.dbfilename = zstrdup("dump.rdb");
1639 server.appendfilename = zstrdup("appendonly.aof");
1640 server.requirepass = NULL;
1641 server.rdbcompression = 1;
1642 server.activerehashing = 1;
1643 server.maxclients = 0;
1644 server.blpop_blocked_clients = 0;
1645 server.maxmemory = 0;
1646 server.vm_enabled = 0;
1647 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1648 server.vm_page_size = 256; /* 256 bytes per page */
1649 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1650 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1651 server.vm_max_threads = 4;
1652 server.vm_blocked_clients = 0;
1653 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1654 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1655
1656 resetServerSaveParams();
1657
1658 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1659 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1660 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1661 /* Replication related */
1662 server.isslave = 0;
1663 server.masterauth = NULL;
1664 server.masterhost = NULL;
1665 server.masterport = 6379;
1666 server.master = NULL;
1667 server.replstate = REDIS_REPL_NONE;
1668
1669 /* Double constants initialization */
1670 R_Zero = 0.0;
1671 R_PosInf = 1.0/R_Zero;
1672 R_NegInf = -1.0/R_Zero;
1673 R_Nan = R_Zero/R_Zero;
1674 }
1675
1676 static void initServer() {
1677 int j;
1678
1679 signal(SIGHUP, SIG_IGN);
1680 signal(SIGPIPE, SIG_IGN);
1681 setupSigSegvAction();
1682
1683 server.devnull = fopen("/dev/null","w");
1684 if (server.devnull == NULL) {
1685 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1686 exit(1);
1687 }
1688 server.clients = listCreate();
1689 server.slaves = listCreate();
1690 server.monitors = listCreate();
1691 server.objfreelist = listCreate();
1692 createSharedObjects();
1693 server.el = aeCreateEventLoop();
1694 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1695 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1696 if (server.fd == -1) {
1697 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1698 exit(1);
1699 }
1700 for (j = 0; j < server.dbnum; j++) {
1701 server.db[j].dict = dictCreate(&dbDictType,NULL);
1702 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1703 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1704 if (server.vm_enabled)
1705 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1706 server.db[j].id = j;
1707 }
1708 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1709 server.pubsub_patterns = listCreate();
1710 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1711 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1712 server.cronloops = 0;
1713 server.bgsavechildpid = -1;
1714 server.bgrewritechildpid = -1;
1715 server.bgrewritebuf = sdsempty();
1716 server.lastsave = time(NULL);
1717 server.dirty = 0;
1718 server.stat_numcommands = 0;
1719 server.stat_numconnections = 0;
1720 server.stat_expiredkeys = 0;
1721 server.stat_starttime = time(NULL);
1722 server.unixtime = time(NULL);
1723 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1724 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1725 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1726
1727 if (server.appendonly) {
1728 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1729 if (server.appendfd == -1) {
1730 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1731 strerror(errno));
1732 exit(1);
1733 }
1734 }
1735
1736 if (server.vm_enabled) vmInit();
1737 }
1738
1739 /* Empty the whole database */
1740 static long long emptyDb() {
1741 int j;
1742 long long removed = 0;
1743
1744 for (j = 0; j < server.dbnum; j++) {
1745 removed += dictSize(server.db[j].dict);
1746 dictEmpty(server.db[j].dict);
1747 dictEmpty(server.db[j].expires);
1748 }
1749 return removed;
1750 }
1751
/* Translate a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Every other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1757
1758 /* I agree, this is a very rudimental way to load a configuration...
1759 will improve later if the config gets more complex */
1760 static void loadServerConfig(char *filename) {
1761 FILE *fp;
1762 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1763 int linenum = 0;
1764 sds line = NULL;
1765
1766 if (filename[0] == '-' && filename[1] == '\0')
1767 fp = stdin;
1768 else {
1769 if ((fp = fopen(filename,"r")) == NULL) {
1770 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1771 exit(1);
1772 }
1773 }
1774
1775 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1776 sds *argv;
1777 int argc, j;
1778
1779 linenum++;
1780 line = sdsnew(buf);
1781 line = sdstrim(line," \t\r\n");
1782
1783 /* Skip comments and blank lines*/
1784 if (line[0] == '#' || line[0] == '\0') {
1785 sdsfree(line);
1786 continue;
1787 }
1788
1789 /* Split into arguments */
1790 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1791 sdstolower(argv[0]);
1792
1793 /* Execute config directives */
1794 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1795 server.maxidletime = atoi(argv[1]);
1796 if (server.maxidletime < 0) {
1797 err = "Invalid timeout value"; goto loaderr;
1798 }
1799 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1800 server.port = atoi(argv[1]);
1801 if (server.port < 1 || server.port > 65535) {
1802 err = "Invalid port"; goto loaderr;
1803 }
1804 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1805 server.bindaddr = zstrdup(argv[1]);
1806 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1807 int seconds = atoi(argv[1]);
1808 int changes = atoi(argv[2]);
1809 if (seconds < 1 || changes < 0) {
1810 err = "Invalid save parameters"; goto loaderr;
1811 }
1812 appendServerSaveParams(seconds,changes);
1813 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1814 if (chdir(argv[1]) == -1) {
1815 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1816 argv[1], strerror(errno));
1817 exit(1);
1818 }
1819 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1820 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1821 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1822 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1823 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1824 else {
1825 err = "Invalid log level. Must be one of debug, notice, warning";
1826 goto loaderr;
1827 }
1828 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1829 FILE *logfp;
1830
1831 server.logfile = zstrdup(argv[1]);
1832 if (!strcasecmp(server.logfile,"stdout")) {
1833 zfree(server.logfile);
1834 server.logfile = NULL;
1835 }
1836 if (server.logfile) {
1837 /* Test if we are able to open the file. The server will not
1838 * be able to abort just for this problem later... */
1839 logfp = fopen(server.logfile,"a");
1840 if (logfp == NULL) {
1841 err = sdscatprintf(sdsempty(),
1842 "Can't open the log file: %s", strerror(errno));
1843 goto loaderr;
1844 }
1845 fclose(logfp);
1846 }
1847 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1848 server.dbnum = atoi(argv[1]);
1849 if (server.dbnum < 1) {
1850 err = "Invalid number of databases"; goto loaderr;
1851 }
1852 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1853 loadServerConfig(argv[1]);
1854 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1855 server.maxclients = atoi(argv[1]);
1856 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1857 server.maxmemory = memtoll(argv[1],NULL);
1858 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1859 server.masterhost = sdsnew(argv[1]);
1860 server.masterport = atoi(argv[2]);
1861 server.replstate = REDIS_REPL_CONNECT;
1862 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1863 server.masterauth = zstrdup(argv[1]);
1864 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1865 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1866 err = "argument must be 'yes' or 'no'"; goto loaderr;
1867 }
1868 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1869 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1870 err = "argument must be 'yes' or 'no'"; goto loaderr;
1871 }
1872 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1873 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1874 err = "argument must be 'yes' or 'no'"; goto loaderr;
1875 }
1876 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1877 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1878 err = "argument must be 'yes' or 'no'"; goto loaderr;
1879 }
1880 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1881 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1882 err = "argument must be 'yes' or 'no'"; goto loaderr;
1883 }
1884 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1885 if (!strcasecmp(argv[1],"no")) {
1886 server.appendfsync = APPENDFSYNC_NO;
1887 } else if (!strcasecmp(argv[1],"always")) {
1888 server.appendfsync = APPENDFSYNC_ALWAYS;
1889 } else if (!strcasecmp(argv[1],"everysec")) {
1890 server.appendfsync = APPENDFSYNC_EVERYSEC;
1891 } else {
1892 err = "argument must be 'no', 'always' or 'everysec'";
1893 goto loaderr;
1894 }
1895 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1896 server.requirepass = zstrdup(argv[1]);
1897 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1898 zfree(server.pidfile);
1899 server.pidfile = zstrdup(argv[1]);
1900 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1901 zfree(server.dbfilename);
1902 server.dbfilename = zstrdup(argv[1]);
1903 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1904 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1905 err = "argument must be 'yes' or 'no'"; goto loaderr;
1906 }
1907 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1908 zfree(server.vm_swap_file);
1909 server.vm_swap_file = zstrdup(argv[1]);
1910 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1911 server.vm_max_memory = memtoll(argv[1],NULL);
1912 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1913 server.vm_page_size = memtoll(argv[1], NULL);
1914 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1915 server.vm_pages = memtoll(argv[1], NULL);
1916 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1917 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1918 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1919 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1920 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1921 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1922 } else {
1923 err = "Bad directive or wrong number of arguments"; goto loaderr;
1924 }
1925 for (j = 0; j < argc; j++)
1926 sdsfree(argv[j]);
1927 zfree(argv);
1928 sdsfree(line);
1929 }
1930 if (fp != stdin) fclose(fp);
1931 return;
1932
1933 loaderr:
1934 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1935 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1936 fprintf(stderr, ">>> '%s'\n", line);
1937 fprintf(stderr, "%s\n", err);
1938 exit(1);
1939 }
1940
1941 static void freeClientArgv(redisClient *c) {
1942 int j;
1943
1944 for (j = 0; j < c->argc; j++)
1945 decrRefCount(c->argv[j]);
1946 for (j = 0; j < c->mbargc; j++)
1947 decrRefCount(c->mbargv[j]);
1948 c->argc = 0;
1949 c->mbargc = 0;
1950 }
1951
1952 static void freeClient(redisClient *c) {
1953 listNode *ln;
1954
1955 /* Note that if the client we are freeing is blocked into a blocking
1956 * call, we have to set querybuf to NULL *before* to call
1957 * unblockClientWaitingData() to avoid processInputBuffer() will get
1958 * called. Also it is important to remove the file events after
1959 * this, because this call adds the READABLE event. */
1960 sdsfree(c->querybuf);
1961 c->querybuf = NULL;
1962 if (c->flags & REDIS_BLOCKED)
1963 unblockClientWaitingData(c);
1964
1965 /* Unsubscribe from all the pubsub channels */
1966 pubsubUnsubscribeAllChannels(c,0);
1967 pubsubUnsubscribeAllPatterns(c,0);
1968 dictRelease(c->pubsub_channels);
1969 listRelease(c->pubsub_patterns);
1970 /* Obvious cleanup */
1971 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1972 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1973 listRelease(c->reply);
1974 freeClientArgv(c);
1975 close(c->fd);
1976 /* Remove from the list of clients */
1977 ln = listSearchKey(server.clients,c);
1978 redisAssert(ln != NULL);
1979 listDelNode(server.clients,ln);
1980 /* Remove from the list of clients waiting for swapped keys */
1981 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1982 ln = listSearchKey(server.io_ready_clients,c);
1983 if (ln) {
1984 listDelNode(server.io_ready_clients,ln);
1985 server.vm_blocked_clients--;
1986 }
1987 }
1988 while (server.vm_enabled && listLength(c->io_keys)) {
1989 ln = listFirst(c->io_keys);
1990 dontWaitForSwappedKey(c,ln->value);
1991 }
1992 listRelease(c->io_keys);
1993 /* Master/slave cleanup */
1994 if (c->flags & REDIS_SLAVE) {
1995 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1996 close(c->repldbfd);
1997 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1998 ln = listSearchKey(l,c);
1999 redisAssert(ln != NULL);
2000 listDelNode(l,ln);
2001 }
2002 if (c->flags & REDIS_MASTER) {
2003 server.master = NULL;
2004 server.replstate = REDIS_REPL_CONNECT;
2005 }
2006 /* Release memory */
2007 zfree(c->argv);
2008 zfree(c->mbargv);
2009 freeClientMultiState(c);
2010 zfree(c);
2011 }
2012
2013 #define GLUEREPLY_UP_TO (1024)
2014 static void glueReplyBuffersIfNeeded(redisClient *c) {
2015 int copylen = 0;
2016 char buf[GLUEREPLY_UP_TO];
2017 listNode *ln;
2018 listIter li;
2019 robj *o;
2020
2021 listRewind(c->reply,&li);
2022 while((ln = listNext(&li))) {
2023 int objlen;
2024
2025 o = ln->value;
2026 objlen = sdslen(o->ptr);
2027 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2028 memcpy(buf+copylen,o->ptr,objlen);
2029 copylen += objlen;
2030 listDelNode(c->reply,ln);
2031 } else {
2032 if (copylen == 0) return;
2033 break;
2034 }
2035 }
2036 /* Now the output buffer is empty, add the new single element */
2037 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2038 listAddNodeHead(c->reply,o);
2039 }
2040
2041 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2042 redisClient *c = privdata;
2043 int nwritten = 0, totwritten = 0, objlen;
2044 robj *o;
2045 REDIS_NOTUSED(el);
2046 REDIS_NOTUSED(mask);
2047
2048 /* Use writev() if we have enough buffers to send */
2049 if (!server.glueoutputbuf &&
2050 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2051 !(c->flags & REDIS_MASTER))
2052 {
2053 sendReplyToClientWritev(el, fd, privdata, mask);
2054 return;
2055 }
2056
2057 while(listLength(c->reply)) {
2058 if (server.glueoutputbuf && listLength(c->reply) > 1)
2059 glueReplyBuffersIfNeeded(c);
2060
2061 o = listNodeValue(listFirst(c->reply));
2062 objlen = sdslen(o->ptr);
2063
2064 if (objlen == 0) {
2065 listDelNode(c->reply,listFirst(c->reply));
2066 continue;
2067 }
2068
2069 if (c->flags & REDIS_MASTER) {
2070 /* Don't reply to a master */
2071 nwritten = objlen - c->sentlen;
2072 } else {
2073 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2074 if (nwritten <= 0) break;
2075 }
2076 c->sentlen += nwritten;
2077 totwritten += nwritten;
2078 /* If we fully sent the object on head go to the next one */
2079 if (c->sentlen == objlen) {
2080 listDelNode(c->reply,listFirst(c->reply));
2081 c->sentlen = 0;
2082 }
2083 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2084 * bytes, in a single threaded server it's a good idea to serve
2085 * other clients as well, even if a very large request comes from
2086 * super fast link that is always able to accept data (in real world
2087 * scenario think about 'KEYS *' against the loopback interfae) */
2088 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2089 }
2090 if (nwritten == -1) {
2091 if (errno == EAGAIN) {
2092 nwritten = 0;
2093 } else {
2094 redisLog(REDIS_VERBOSE,
2095 "Error writing to client: %s", strerror(errno));
2096 freeClient(c);
2097 return;
2098 }
2099 }
2100 if (totwritten > 0) c->lastinteraction = time(NULL);
2101 if (listLength(c->reply) == 0) {
2102 c->sentlen = 0;
2103 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2104 }
2105 }
2106
2107 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2108 {
2109 redisClient *c = privdata;
2110 int nwritten = 0, totwritten = 0, objlen, willwrite;
2111 robj *o;
2112 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2113 int offset, ion = 0;
2114 REDIS_NOTUSED(el);
2115 REDIS_NOTUSED(mask);
2116
2117 listNode *node;
2118 while (listLength(c->reply)) {
2119 offset = c->sentlen;
2120 ion = 0;
2121 willwrite = 0;
2122
2123 /* fill-in the iov[] array */
2124 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2125 o = listNodeValue(node);
2126 objlen = sdslen(o->ptr);
2127
2128 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2129 break;
2130
2131 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2132 break; /* no more iovecs */
2133
2134 iov[ion].iov_base = ((char*)o->ptr) + offset;
2135 iov[ion].iov_len = objlen - offset;
2136 willwrite += objlen - offset;
2137 offset = 0; /* just for the first item */
2138 ion++;
2139 }
2140
2141 if(willwrite == 0)
2142 break;
2143
2144 /* write all collected blocks at once */
2145 if((nwritten = writev(fd, iov, ion)) < 0) {
2146 if (errno != EAGAIN) {
2147 redisLog(REDIS_VERBOSE,
2148 "Error writing to client: %s", strerror(errno));
2149 freeClient(c);
2150 return;
2151 }
2152 break;
2153 }
2154
2155 totwritten += nwritten;
2156 offset = c->sentlen;
2157
2158 /* remove written robjs from c->reply */
2159 while (nwritten && listLength(c->reply)) {
2160 o = listNodeValue(listFirst(c->reply));
2161 objlen = sdslen(o->ptr);
2162
2163 if(nwritten >= objlen - offset) {
2164 listDelNode(c->reply, listFirst(c->reply));
2165 nwritten -= objlen - offset;
2166 c->sentlen = 0;
2167 } else {
2168 /* partial write */
2169 c->sentlen += nwritten;
2170 break;
2171 }
2172 offset = 0;
2173 }
2174 }
2175
2176 if (totwritten > 0)
2177 c->lastinteraction = time(NULL);
2178
2179 if (listLength(c->reply) == 0) {
2180 c->sentlen = 0;
2181 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2182 }
2183 }
2184
2185 static struct redisCommand *lookupCommand(char *name) {
2186 int j = 0;
2187 while(cmdTable[j].name != NULL) {
2188 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2189 j++;
2190 }
2191 return NULL;
2192 }
2193
2194 /* resetClient prepare the client to process the next command */
2195 static void resetClient(redisClient *c) {
2196 freeClientArgv(c);
2197 c->bulklen = -1;
2198 c->multibulk = 0;
2199 }
2200
2201 /* Call() is the core of Redis execution of a command */
2202 static void call(redisClient *c, struct redisCommand *cmd) {
2203 long long dirty;
2204
2205 dirty = server.dirty;
2206 cmd->proc(c);
2207 dirty = server.dirty-dirty;
2208
2209 if (server.appendonly && dirty)
2210 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2211 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2212 listLength(server.slaves))
2213 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2214 if (listLength(server.monitors))
2215 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2216 server.stat_numcommands++;
2217 }
2218
2219 /* If this function gets called we already read a whole
2220 * command, argments are in the client argv/argc fields.
2221 * processCommand() execute the command or prepare the
2222 * server for a bulk read from the client.
2223 *
2224 * If 1 is returned the client is still alive and valid and
2225 * and other operations can be performed by the caller. Otherwise
2226 * if 0 is returned the client was destroied (i.e. after QUIT). */
2227 static int processCommand(redisClient *c) {
2228 struct redisCommand *cmd;
2229
2230 /* Free some memory if needed (maxmemory setting) */
2231 if (server.maxmemory) freeMemoryIfNeeded();
2232
2233 /* Handle the multi bulk command type. This is an alternative protocol
2234 * supported by Redis in order to receive commands that are composed of
2235 * multiple binary-safe "bulk" arguments. The latency of processing is
2236 * a bit higher but this allows things like multi-sets, so if this
2237 * protocol is used only for MSET and similar commands this is a big win. */
2238 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2239 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2240 if (c->multibulk <= 0) {
2241 resetClient(c);
2242 return 1;
2243 } else {
2244 decrRefCount(c->argv[c->argc-1]);
2245 c->argc--;
2246 return 1;
2247 }
2248 } else if (c->multibulk) {
2249 if (c->bulklen == -1) {
2250 if (((char*)c->argv[0]->ptr)[0] != '$') {
2251 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2252 resetClient(c);
2253 return 1;
2254 } else {
2255 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2256 decrRefCount(c->argv[0]);
2257 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2258 c->argc--;
2259 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2260 resetClient(c);
2261 return 1;
2262 }
2263 c->argc--;
2264 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2265 return 1;
2266 }
2267 } else {
2268 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2269 c->mbargv[c->mbargc] = c->argv[0];
2270 c->mbargc++;
2271 c->argc--;
2272 c->multibulk--;
2273 if (c->multibulk == 0) {
2274 robj **auxargv;
2275 int auxargc;
2276
2277 /* Here we need to swap the multi-bulk argc/argv with the
2278 * normal argc/argv of the client structure. */
2279 auxargv = c->argv;
2280 c->argv = c->mbargv;
2281 c->mbargv = auxargv;
2282
2283 auxargc = c->argc;
2284 c->argc = c->mbargc;
2285 c->mbargc = auxargc;
2286
2287 /* We need to set bulklen to something different than -1
2288 * in order for the code below to process the command without
2289 * to try to read the last argument of a bulk command as
2290 * a special argument. */
2291 c->bulklen = 0;
2292 /* continue below and process the command */
2293 } else {
2294 c->bulklen = -1;
2295 return 1;
2296 }
2297 }
2298 }
2299 /* -- end of multi bulk commands processing -- */
2300
2301 /* The QUIT command is handled as a special case. Normal command
2302 * procs are unable to close the client connection safely */
2303 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2304 freeClient(c);
2305 return 0;
2306 }
2307
2308 /* Now lookup the command and check ASAP about trivial error conditions
2309 * such wrong arity, bad command name and so forth. */
2310 cmd = lookupCommand(c->argv[0]->ptr);
2311 if (!cmd) {
2312 addReplySds(c,
2313 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2314 (char*)c->argv[0]->ptr));
2315 resetClient(c);
2316 return 1;
2317 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2318 (c->argc < -cmd->arity)) {
2319 addReplySds(c,
2320 sdscatprintf(sdsempty(),
2321 "-ERR wrong number of arguments for '%s' command\r\n",
2322 cmd->name));
2323 resetClient(c);
2324 return 1;
2325 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2326 /* This is a bulk command, we have to read the last argument yet. */
2327 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2328
2329 decrRefCount(c->argv[c->argc-1]);
2330 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2331 c->argc--;
2332 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2333 resetClient(c);
2334 return 1;
2335 }
2336 c->argc--;
2337 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2338 /* It is possible that the bulk read is already in the
2339 * buffer. Check this condition and handle it accordingly.
2340 * This is just a fast path, alternative to call processInputBuffer().
2341 * It's a good idea since the code is small and this condition
2342 * happens most of the times. */
2343 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2344 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2345 c->argc++;
2346 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2347 } else {
2348 /* Otherwise return... there is to read the last argument
2349 * from the socket. */
2350 return 1;
2351 }
2352 }
2353 /* Let's try to encode the bulk object to save space. */
2354 if (cmd->flags & REDIS_CMD_BULK)
2355 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2356
2357 /* Check if the user is authenticated */
2358 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2359 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2360 resetClient(c);
2361 return 1;
2362 }
2363
2364 /* Handle the maxmemory directive */
2365 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2366 zmalloc_used_memory() > server.maxmemory)
2367 {
2368 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2369 resetClient(c);
2370 return 1;
2371 }
2372
2373 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2374 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2375 &&
2376 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2377 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2378 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2379 resetClient(c);
2380 return 1;
2381 }
2382
2383 /* Exec the command */
2384 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2385 queueMultiCommand(c,cmd);
2386 addReply(c,shared.queued);
2387 } else {
2388 if (server.vm_enabled && server.vm_max_threads > 0 &&
2389 blockClientOnSwappedKeys(cmd,c)) return 1;
2390 call(c,cmd);
2391 }
2392
2393 /* Prepare the client for the next command */
2394 resetClient(c);
2395 return 1;
2396 }
2397
2398 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2399 listNode *ln;
2400 listIter li;
2401 int outc = 0, j;
2402 robj **outv;
2403 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2404 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2405 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2406 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2407 robj *lenobj;
2408
2409 if (argc <= REDIS_STATIC_ARGS) {
2410 outv = static_outv;
2411 } else {
2412 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2413 }
2414
2415 lenobj = createObject(REDIS_STRING,
2416 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2417 lenobj->refcount = 0;
2418 outv[outc++] = lenobj;
2419 for (j = 0; j < argc; j++) {
2420 lenobj = createObject(REDIS_STRING,
2421 sdscatprintf(sdsempty(),"$%lu\r\n",
2422 (unsigned long) stringObjectLen(argv[j])));
2423 lenobj->refcount = 0;
2424 outv[outc++] = lenobj;
2425 outv[outc++] = argv[j];
2426 outv[outc++] = shared.crlf;
2427 }
2428
2429 /* Increment all the refcounts at start and decrement at end in order to
2430 * be sure to free objects if there is no slave in a replication state
2431 * able to be feed with commands */
2432 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2433 listRewind(slaves,&li);
2434 while((ln = listNext(&li))) {
2435 redisClient *slave = ln->value;
2436
2437 /* Don't feed slaves that are still waiting for BGSAVE to start */
2438 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2439
2440 /* Feed all the other slaves, MONITORs and so on */
2441 if (slave->slaveseldb != dictid) {
2442 robj *selectcmd;
2443
2444 switch(dictid) {
2445 case 0: selectcmd = shared.select0; break;
2446 case 1: selectcmd = shared.select1; break;
2447 case 2: selectcmd = shared.select2; break;
2448 case 3: selectcmd = shared.select3; break;
2449 case 4: selectcmd = shared.select4; break;
2450 case 5: selectcmd = shared.select5; break;
2451 case 6: selectcmd = shared.select6; break;
2452 case 7: selectcmd = shared.select7; break;
2453 case 8: selectcmd = shared.select8; break;
2454 case 9: selectcmd = shared.select9; break;
2455 default:
2456 selectcmd = createObject(REDIS_STRING,
2457 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2458 selectcmd->refcount = 0;
2459 break;
2460 }
2461 addReply(slave,selectcmd);
2462 slave->slaveseldb = dictid;
2463 }
2464 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2465 }
2466 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2467 if (outv != static_outv) zfree(outv);
2468 }
2469
2470 static void processInputBuffer(redisClient *c) {
2471 again:
2472 /* Before to process the input buffer, make sure the client is not
2473 * waitig for a blocking operation such as BLPOP. Note that the first
2474 * iteration the client is never blocked, otherwise the processInputBuffer
2475 * would not be called at all, but after the execution of the first commands
2476 * in the input buffer the client may be blocked, and the "goto again"
2477 * will try to reiterate. The following line will make it return asap. */
2478 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2479 if (c->bulklen == -1) {
2480 /* Read the first line of the query */
2481 char *p = strchr(c->querybuf,'\n');
2482 size_t querylen;
2483
2484 if (p) {
2485 sds query, *argv;
2486 int argc, j;
2487
2488 query = c->querybuf;
2489 c->querybuf = sdsempty();
2490 querylen = 1+(p-(query));
2491 if (sdslen(query) > querylen) {
2492 /* leave data after the first line of the query in the buffer */
2493 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2494 }
2495 *p = '\0'; /* remove "\n" */
2496 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2497 sdsupdatelen(query);
2498
2499 /* Now we can split the query in arguments */
2500 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2501 sdsfree(query);
2502
2503 if (c->argv) zfree(c->argv);
2504 c->argv = zmalloc(sizeof(robj*)*argc);
2505
2506 for (j = 0; j < argc; j++) {
2507 if (sdslen(argv[j])) {
2508 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2509 c->argc++;
2510 } else {
2511 sdsfree(argv[j]);
2512 }
2513 }
2514 zfree(argv);
2515 if (c->argc) {
2516 /* Execute the command. If the client is still valid
2517 * after processCommand() return and there is something
2518 * on the query buffer try to process the next command. */
2519 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2520 } else {
2521 /* Nothing to process, argc == 0. Just process the query
2522 * buffer if it's not empty or return to the caller */
2523 if (sdslen(c->querybuf)) goto again;
2524 }
2525 return;
2526 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2527 redisLog(REDIS_VERBOSE, "Client protocol error");
2528 freeClient(c);
2529 return;
2530 }
2531 } else {
2532 /* Bulk read handling. Note that if we are at this point
2533 the client already sent a command terminated with a newline,
2534 we are reading the bulk data that is actually the last
2535 argument of the command. */
2536 int qbl = sdslen(c->querybuf);
2537
2538 if (c->bulklen <= qbl) {
2539 /* Copy everything but the final CRLF as final argument */
2540 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2541 c->argc++;
2542 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2543 /* Process the command. If the client is still valid after
2544 * the processing and there is more data in the buffer
2545 * try to parse it. */
2546 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2547 return;
2548 }
2549 }
2550 }
2551
2552 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2553 redisClient *c = (redisClient*) privdata;
2554 char buf[REDIS_IOBUF_LEN];
2555 int nread;
2556 REDIS_NOTUSED(el);
2557 REDIS_NOTUSED(mask);
2558
2559 nread = read(fd, buf, REDIS_IOBUF_LEN);
2560 if (nread == -1) {
2561 if (errno == EAGAIN) {
2562 nread = 0;
2563 } else {
2564 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2565 freeClient(c);
2566 return;
2567 }
2568 } else if (nread == 0) {
2569 redisLog(REDIS_VERBOSE, "Client closed connection");
2570 freeClient(c);
2571 return;
2572 }
2573 if (nread) {
2574 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2575 c->lastinteraction = time(NULL);
2576 } else {
2577 return;
2578 }
2579 processInputBuffer(c);
2580 }
2581
2582 static int selectDb(redisClient *c, int id) {
2583 if (id < 0 || id >= server.dbnum)
2584 return REDIS_ERR;
2585 c->db = &server.db[id];
2586 return REDIS_OK;
2587 }
2588
2589 static void *dupClientReplyValue(void *o) {
2590 incrRefCount((robj*)o);
2591 return o;
2592 }
2593
/* List match method: returns non-zero when the two string objects compare
 * equal according to compareStringObjects(), zero otherwise. */
static int listMatchObjects(void *a, void *b) {
    return !compareStringObjects(a,b);
}
2597
2598 static redisClient *createClient(int fd) {
2599 redisClient *c = zmalloc(sizeof(*c));
2600
2601 anetNonBlock(NULL,fd);
2602 anetTcpNoDelay(NULL,fd);
2603 if (!c) return NULL;
2604 selectDb(c,0);
2605 c->fd = fd;
2606 c->querybuf = sdsempty();
2607 c->argc = 0;
2608 c->argv = NULL;
2609 c->bulklen = -1;
2610 c->multibulk = 0;
2611 c->mbargc = 0;
2612 c->mbargv = NULL;
2613 c->sentlen = 0;
2614 c->flags = 0;
2615 c->lastinteraction = time(NULL);
2616 c->authenticated = 0;
2617 c->replstate = REDIS_REPL_NONE;
2618 c->reply = listCreate();
2619 listSetFreeMethod(c->reply,decrRefCount);
2620 listSetDupMethod(c->reply,dupClientReplyValue);
2621 c->blockingkeys = NULL;
2622 c->blockingkeysnum = 0;
2623 c->io_keys = listCreate();
2624 listSetFreeMethod(c->io_keys,decrRefCount);
2625 c->pubsub_channels = dictCreate(&setDictType,NULL);
2626 c->pubsub_patterns = listCreate();
2627 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2628 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2629 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2630 readQueryFromClient, c) == AE_ERR) {
2631 freeClient(c);
2632 return NULL;
2633 }
2634 listAddNodeTail(server.clients,c);
2635 initClientMultiState(c);
2636 return c;
2637 }
2638
2639 static void addReply(redisClient *c, robj *obj) {
2640 if (listLength(c->reply) == 0 &&
2641 (c->replstate == REDIS_REPL_NONE ||
2642 c->replstate == REDIS_REPL_ONLINE) &&
2643 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2644 sendReplyToClient, c) == AE_ERR) return;
2645
2646 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2647 obj = dupStringObject(obj);
2648 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2649 }
2650 listAddNodeTail(c->reply,getDecodedObject(obj));
2651 }
2652
2653 static void addReplySds(redisClient *c, sds s) {
2654 robj *o = createObject(REDIS_STRING,s);
2655 addReply(c,o);
2656 decrRefCount(o);
2657 }
2658
2659 static void addReplyDouble(redisClient *c, double d) {
2660 char buf[128];
2661
2662 snprintf(buf,sizeof(buf),"%.17g",d);
2663 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2664 (unsigned long) strlen(buf),buf));
2665 }
2666
2667 static void addReplyLong(redisClient *c, long l) {
2668 char buf[128];
2669 size_t len;
2670
2671 if (l == 0) {
2672 addReply(c,shared.czero);
2673 return;
2674 } else if (l == 1) {
2675 addReply(c,shared.cone);
2676 return;
2677 }
2678 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2679 addReplySds(c,sdsnewlen(buf,len));
2680 }
2681
2682 static void addReplyLongLong(redisClient *c, long long ll) {
2683 char buf[128];
2684 size_t len;
2685
2686 if (ll == 0) {
2687 addReply(c,shared.czero);
2688 return;
2689 } else if (ll == 1) {
2690 addReply(c,shared.cone);
2691 return;
2692 }
2693 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2694 addReplySds(c,sdsnewlen(buf,len));
2695 }
2696
2697 static void addReplyUlong(redisClient *c, unsigned long ul) {
2698 char buf[128];
2699 size_t len;
2700
2701 if (ul == 0) {
2702 addReply(c,shared.czero);
2703 return;
2704 } else if (ul == 1) {
2705 addReply(c,shared.cone);
2706 return;
2707 }
2708 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2709 addReplySds(c,sdsnewlen(buf,len));
2710 }
2711
2712 static void addReplyBulkLen(redisClient *c, robj *obj) {
2713 size_t len;
2714
2715 if (obj->encoding == REDIS_ENCODING_RAW) {
2716 len = sdslen(obj->ptr);
2717 } else {
2718 long n = (long)obj->ptr;
2719
2720 /* Compute how many bytes will take this integer as a radix 10 string */
2721 len = 1;
2722 if (n < 0) {
2723 len++;
2724 n = -n;
2725 }
2726 while((n = n/10) != 0) {
2727 len++;
2728 }
2729 }
2730 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2731 }
2732
2733 static void addReplyBulk(redisClient *c, robj *obj) {
2734 addReplyBulkLen(c,obj);
2735 addReply(c,obj);
2736 addReply(c,shared.crlf);
2737 }
2738
2739 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2740 static void addReplyBulkCString(redisClient *c, char *s) {
2741 if (s == NULL) {
2742 addReply(c,shared.nullbulk);
2743 } else {
2744 robj *o = createStringObject(s,strlen(s));
2745 addReplyBulk(c,o);
2746 decrRefCount(o);
2747 }
2748 }
2749
2750 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2751 int cport, cfd;
2752 char cip[128];
2753 redisClient *c;
2754 REDIS_NOTUSED(el);
2755 REDIS_NOTUSED(mask);
2756 REDIS_NOTUSED(privdata);
2757
2758 cfd = anetAccept(server.neterr, fd, cip, &cport);
2759 if (cfd == AE_ERR) {
2760 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2761 return;
2762 }
2763 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2764 if ((c = createClient(cfd)) == NULL) {
2765 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2766 close(cfd); /* May be already closed, just ingore errors */
2767 return;
2768 }
2769 /* If maxclient directive is set and this is one client more... close the
2770 * connection. Note that we create the client instead to check before
2771 * for this condition, since now the socket is already set in nonblocking
2772 * mode and we can send an error for free using the Kernel I/O */
2773 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2774 char *err = "-ERR max number of clients reached\r\n";
2775
2776 /* That's a best effort error message, don't check write errors */
2777 if (write(c->fd,err,strlen(err)) == -1) {
2778 /* Nothing to do, Just to avoid the warning... */
2779 }
2780 freeClient(c);
2781 return;
2782 }
2783 server.stat_numconnections++;
2784 }
2785
2786 /* ======================= Redis objects implementation ===================== */
2787
2788 static robj *createObject(int type, void *ptr) {
2789 robj *o;
2790
2791 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2792 if (listLength(server.objfreelist)) {
2793 listNode *head = listFirst(server.objfreelist);
2794 o = listNodeValue(head);
2795 listDelNode(server.objfreelist,head);
2796 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2797 } else {
2798 if (server.vm_enabled) {
2799 pthread_mutex_unlock(&server.obj_freelist_mutex);
2800 o = zmalloc(sizeof(*o));
2801 } else {
2802 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2803 }
2804 }
2805 o->type = type;
2806 o->encoding = REDIS_ENCODING_RAW;
2807 o->ptr = ptr;
2808 o->refcount = 1;
2809 if (server.vm_enabled) {
2810 /* Note that this code may run in the context of an I/O thread
2811 * and accessing to server.unixtime in theory is an error
2812 * (no locks). But in practice this is safe, and even if we read
2813 * garbage Redis will not fail, as it's just a statistical info */
2814 o->vm.atime = server.unixtime;
2815 o->storage = REDIS_VM_MEMORY;
2816 }
2817 return o;
2818 }
2819
2820 static robj *createStringObject(char *ptr, size_t len) {
2821 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2822 }
2823
2824 static robj *createStringObjectFromLongLong(long long value) {
2825 robj *o;
2826 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2827 incrRefCount(shared.integers[value]);
2828 o = shared.integers[value];
2829 } else {
2830 o = createObject(REDIS_STRING, NULL);
2831 if (value >= LONG_MIN && value <= LONG_MAX) {
2832 o->encoding = REDIS_ENCODING_INT;
2833 o->ptr = (void*)((long)value);
2834 } else {
2835 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2836 }
2837 }
2838 return o;
2839 }
2840
2841 static robj *dupStringObject(robj *o) {
2842 assert(o->encoding == REDIS_ENCODING_RAW);
2843 return createStringObject(o->ptr,sdslen(o->ptr));
2844 }
2845
2846 static robj *createListObject(void) {
2847 list *l = listCreate();
2848
2849 listSetFreeMethod(l,decrRefCount);
2850 return createObject(REDIS_LIST,l);
2851 }
2852
2853 static robj *createSetObject(void) {
2854 dict *d = dictCreate(&setDictType,NULL);
2855 return createObject(REDIS_SET,d);
2856 }
2857
2858 static robj *createHashObject(void) {
2859 /* All the Hashes start as zipmaps. Will be automatically converted
2860 * into hash tables if there are enough elements or big elements
2861 * inside. */
2862 unsigned char *zm = zipmapNew();
2863 robj *o = createObject(REDIS_HASH,zm);
2864 o->encoding = REDIS_ENCODING_ZIPMAP;
2865 return o;
2866 }
2867
2868 static robj *createZsetObject(void) {
2869 zset *zs = zmalloc(sizeof(*zs));
2870
2871 zs->dict = dictCreate(&zsetDictType,NULL);
2872 zs->zsl = zslCreate();
2873 return createObject(REDIS_ZSET,zs);
2874 }
2875
2876 static void freeStringObject(robj *o) {
2877 if (o->encoding == REDIS_ENCODING_RAW) {
2878 sdsfree(o->ptr);
2879 }
2880 }
2881
2882 static void freeListObject(robj *o) {
2883 listRelease((list*) o->ptr);
2884 }
2885
2886 static void freeSetObject(robj *o) {
2887 dictRelease((dict*) o->ptr);
2888 }
2889
2890 static void freeZsetObject(robj *o) {
2891 zset *zs = o->ptr;
2892
2893 dictRelease(zs->dict);
2894 zslFree(zs->zsl);
2895 zfree(zs);
2896 }
2897
2898 static void freeHashObject(robj *o) {
2899 switch (o->encoding) {
2900 case REDIS_ENCODING_HT:
2901 dictRelease((dict*) o->ptr);
2902 break;
2903 case REDIS_ENCODING_ZIPMAP:
2904 zfree(o->ptr);
2905 break;
2906 default:
2907 redisPanic("Unknown hash encoding type");
2908 break;
2909 }
2910 }
2911
2912 static void incrRefCount(robj *o) {
2913 o->refcount++;
2914 }
2915
2916 static void decrRefCount(void *obj) {
2917 robj *o = obj;
2918
2919 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2920 /* Object is a key of a swapped out value, or in the process of being
2921 * loaded. */
2922 if (server.vm_enabled &&
2923 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2924 {
2925 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2926 redisAssert(o->type == REDIS_STRING);
2927 freeStringObject(o);
2928 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2929 pthread_mutex_lock(&server.obj_freelist_mutex);
2930 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2931 !listAddNodeHead(server.objfreelist,o))
2932 zfree(o);
2933 pthread_mutex_unlock(&server.obj_freelist_mutex);
2934 server.vm_stats_swapped_objects--;
2935 return;
2936 }
2937 /* Object is in memory, or in the process of being swapped out. */
2938 if (--(o->refcount) == 0) {
2939 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2940 vmCancelThreadedIOJob(obj);
2941 switch(o->type) {
2942 case REDIS_STRING: freeStringObject(o); break;
2943 case REDIS_LIST: freeListObject(o); break;
2944 case REDIS_SET: freeSetObject(o); break;
2945 case REDIS_ZSET: freeZsetObject(o); break;
2946 case REDIS_HASH: freeHashObject(o); break;
2947 default: redisPanic("Unknown object type"); break;
2948 }
2949 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2950 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2951 !listAddNodeHead(server.objfreelist,o))
2952 zfree(o);
2953 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2954 }
2955 }
2956
2957 static robj *lookupKey(redisDb *db, robj *key) {
2958 dictEntry *de = dictFind(db->dict,key);
2959 if (de) {
2960 robj *key = dictGetEntryKey(de);
2961 robj *val = dictGetEntryVal(de);
2962
2963 if (server.vm_enabled) {
2964 if (key->storage == REDIS_VM_MEMORY ||
2965 key->storage == REDIS_VM_SWAPPING)
2966 {
2967 /* If we were swapping the object out, stop it, this key
2968 * was requested. */
2969 if (key->storage == REDIS_VM_SWAPPING)
2970 vmCancelThreadedIOJob(key);
2971 /* Update the access time of the key for the aging algorithm. */
2972 key->vm.atime = server.unixtime;
2973 } else {
2974 int notify = (key->storage == REDIS_VM_LOADING);
2975
2976 /* Our value was swapped on disk. Bring it at home. */
2977 redisAssert(val == NULL);
2978 val = vmLoadObject(key);
2979 dictGetEntryVal(de) = val;
2980
2981 /* Clients blocked by the VM subsystem may be waiting for
2982 * this key... */
2983 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2984 }
2985 }
2986 return val;
2987 } else {
2988 return NULL;
2989 }
2990 }
2991
2992 static robj *lookupKeyRead(redisDb *db, robj *key) {
2993 expireIfNeeded(db,key);
2994 return lookupKey(db,key);
2995 }
2996
2997 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2998 deleteIfVolatile(db,key);
2999 return lookupKey(db,key);
3000 }
3001
3002 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3003 robj *o = lookupKeyRead(c->db, key);
3004 if (!o) addReply(c,reply);
3005 return o;
3006 }
3007
3008 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3009 robj *o = lookupKeyWrite(c->db, key);
3010 if (!o) addReply(c,reply);
3011 return o;
3012 }
3013
3014 static int checkType(redisClient *c, robj *o, int type) {
3015 if (o->type != type) {
3016 addReply(c,shared.wrongtypeerr);
3017 return 1;
3018 }
3019 return 0;
3020 }
3021
3022 static int deleteKey(redisDb *db, robj *key) {
3023 int retval;
3024
3025 /* We need to protect key from destruction: after the first dictDelete()
3026 * it may happen that 'key' is no longer valid if we don't increment
3027 * it's count. This may happen when we get the object reference directly
3028 * from the hash table with dictRandomKey() or dict iterators */
3029 incrRefCount(key);
3030 if (dictSize(db->expires)) dictDelete(db->expires,key);
3031 retval = dictDelete(db->dict,key);
3032 decrRefCount(key);
3033
3034 return retval == DICT_OK;
3035 }
3036
3037 /* Check if the nul-terminated string 's' can be represented by a long
3038 * (that is, is a number that fits into long without any other space or
3039 * character before or after the digits).
3040 *
3041 * If so, the function returns REDIS_OK and *longval is set to the value
3042 * of the number. Otherwise REDIS_ERR is returned */
3043 static int isStringRepresentableAsLong(sds s, long *longval) {
3044 char buf[32], *endptr;
3045 long value;
3046 int slen;
3047
3048 value = strtol(s, &endptr, 10);
3049 if (endptr[0] != '\0') return REDIS_ERR;
3050 slen = snprintf(buf,32,"%ld",value);
3051
3052 /* If the number converted back into a string is not identical
3053 * then it's not possible to encode the string as integer */
3054 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3055 if (longval) *longval = value;
3056 return REDIS_OK;
3057 }
3058
3059 /* Try to encode a string object in order to save space */
3060 static robj *tryObjectEncoding(robj *o) {
3061 long value;
3062 sds s = o->ptr;
3063
3064 if (o->encoding != REDIS_ENCODING_RAW)
3065 return o; /* Already encoded */
3066
3067 /* It's not safe to encode shared objects: shared objects can be shared
3068 * everywhere in the "object space" of Redis. Encoded objects can only
3069 * appear as "values" (and not, for instance, as keys) */
3070 if (o->refcount > 1) return o;
3071
3072 /* Currently we try to encode only strings */
3073 redisAssert(o->type == REDIS_STRING);
3074
3075 /* Check if we can represent this string as a long integer */
3076 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3077
3078 /* Ok, this object can be encoded */
3079 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3080 decrRefCount(o);
3081 incrRefCount(shared.integers[value]);
3082 return shared.integers[value];
3083 } else {
3084 o->encoding = REDIS_ENCODING_INT;
3085 sdsfree(o->ptr);
3086 o->ptr = (void*) value;
3087 return o;
3088 }
3089 }
3090
3091 /* Get a decoded version of an encoded object (returned as a new object).
3092 * If the object is already raw-encoded just increment the ref count. */
3093 static robj *getDecodedObject(robj *o) {
3094 robj *dec;
3095
3096 if (o->encoding == REDIS_ENCODING_RAW) {
3097 incrRefCount(o);
3098 return o;
3099 }
3100 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3101 char buf[32];
3102
3103 snprintf(buf,32,"%ld",(long)o->ptr);
3104 dec = createStringObject(buf,strlen(buf));
3105 return dec;
3106 } else {
3107 redisPanic("Unknown encoding type");
3108 }
3109 }
3110
3111 /* Compare two string objects via strcmp() or alike.
3112 * Note that the objects may be integer-encoded. In such a case we
3113 * use snprintf() to get a string representation of the numbers on the stack
3114 * and compare the strings, it's much faster than calling getDecodedObject().
3115 *
3116 * Important note: if objects are not integer encoded, but binary-safe strings,
3117 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3118 * binary safe. */
3119 static int compareStringObjects(robj *a, robj *b) {
3120 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3121 char bufa[128], bufb[128], *astr, *bstr;
3122 int bothsds = 1;
3123
3124 if (a == b) return 0;
3125 if (a->encoding != REDIS_ENCODING_RAW) {
3126 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3127 astr = bufa;
3128 bothsds = 0;
3129 } else {
3130 astr = a->ptr;
3131 }
3132 if (b->encoding != REDIS_ENCODING_RAW) {
3133 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3134 bstr = bufb;
3135 bothsds = 0;
3136 } else {
3137 bstr = b->ptr;
3138 }
3139 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3140 }
3141
3142 static size_t stringObjectLen(robj *o) {
3143 redisAssert(o->type == REDIS_STRING);
3144 if (o->encoding == REDIS_ENCODING_RAW) {
3145 return sdslen(o->ptr);
3146 } else {
3147 char buf[32];
3148
3149 return snprintf(buf,32,"%ld",(long)o->ptr);
3150 }
3151 }
3152
3153 static int getDoubleFromObject(robj *o, double *target) {
3154 double value;
3155 char *eptr;
3156
3157 if (o == NULL) {
3158 value = 0;
3159 } else {
3160 redisAssert(o->type == REDIS_STRING);
3161 if (o->encoding == REDIS_ENCODING_RAW) {
3162 value = strtod(o->ptr, &eptr);
3163 if (eptr[0] != '\0') return REDIS_ERR;
3164 } else if (o->encoding == REDIS_ENCODING_INT) {
3165 value = (long)o->ptr;
3166 } else {
3167 redisAssert(1 != 1);
3168 }
3169 }
3170
3171 *target = value;
3172 return REDIS_OK;
3173 }
3174
3175 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3176 double value;
3177 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3178 if (msg != NULL) {
3179 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3180 } else {
3181 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3182 }
3183 return REDIS_ERR;
3184 }
3185
3186 *target = value;
3187 return REDIS_OK;
3188 }
3189
3190 static int getLongLongFromObject(robj *o, long long *target) {
3191 long long value;
3192 char *eptr;
3193
3194 if (o == NULL) {
3195 value = 0;
3196 } else {
3197 redisAssert(o->type == REDIS_STRING);
3198 if (o->encoding == REDIS_ENCODING_RAW) {
3199 value = strtoll(o->ptr, &eptr, 10);
3200 if (eptr[0] != '\0') return REDIS_ERR;
3201 } else if (o->encoding == REDIS_ENCODING_INT) {
3202 value = (long)o->ptr;
3203 } else {
3204 redisAssert(1 != 1);
3205 }
3206 }
3207
3208 *target = value;
3209 return REDIS_OK;
3210 }
3211
3212 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3213 long long value;
3214 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3215 if (msg != NULL) {
3216 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3217 } else {
3218 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3219 }
3220 return REDIS_ERR;
3221 }
3222
3223 *target = value;
3224 return REDIS_OK;
3225 }
3226
3227 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3228 long long value;
3229
3230 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3231 if (value < LONG_MIN || value > LONG_MAX) {
3232 if (msg != NULL) {
3233 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3234 } else {
3235 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3236 }
3237 return REDIS_ERR;
3238 }
3239
3240 *target = value;
3241 return REDIS_OK;
3242 }
3243
3244 /*============================ RDB saving/loading =========================== */
3245
/* Write the one-byte 'type' marker to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3250
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3256
3257 /* check rdbLoadLen() comments for more info */
3258 static int rdbSaveLen(FILE *fp, uint32_t len) {
3259 unsigned char buf[2];
3260
3261 if (len < (1<<6)) {
3262 /* Save a 6 bit len */
3263 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3264 if (fwrite(buf,1,1,fp) == 0) return -1;
3265 } else if (len < (1<<14)) {
3266 /* Save a 14 bit len */
3267 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3268 buf[1] = len&0xFF;
3269 if (fwrite(buf,2,1,fp) == 0) return -1;
3270 } else {
3271 /* Save a 32 bit len */
3272 buf[0] = (REDIS_RDB_32BITLEN<<6);
3273 if (fwrite(buf,1,1,fp) == 0) return -1;
3274 len = htonl(len);
3275 if (fwrite(&len,4,1,fp) == 0) return -1;
3276 }
3277 return 0;
3278 }
3279
3280 /* String objects in the form "2391" "-100" without any space and with a
3281 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3282 * encoded as integers to save space */
3283 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3284 long long value;
3285 char *endptr, buf[32];
3286
3287 /* Check if it's possible to encode this value as a number */
3288 value = strtoll(s, &endptr, 10);
3289 if (endptr[0] != '\0') return 0;
3290 snprintf(buf,32,"%lld",value);
3291
3292 /* If the number converted back into a string is not identical
3293 * then it's not possible to encode the string as integer */
3294 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3295
3296 /* Finally check if it fits in our ranges */
3297 if (value >= -(1<<7) && value <= (1<<7)-1) {
3298 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3299 enc[1] = value&0xFF;
3300 return 2;
3301 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3302 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3303 enc[1] = value&0xFF;
3304 enc[2] = (value>>8)&0xFF;
3305 return 3;
3306 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3307 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3308 enc[1] = value&0xFF;
3309 enc[2] = (value>>8)&0xFF;
3310 enc[3] = (value>>16)&0xFF;
3311 enc[4] = (value>>24)&0xFF;
3312 return 5;
3313 } else {
3314 return 0;
3315 }
3316 }
3317
3318 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3319 size_t comprlen, outlen;
3320 unsigned char byte;
3321 void *out;
3322
3323 /* We require at least four bytes compression for this to be worth it */
3324 if (len <= 4) return 0;
3325 outlen = len-4;
3326 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3327 comprlen = lzf_compress(s, len, out, outlen);
3328 if (comprlen == 0) {
3329 zfree(out);
3330 return 0;
3331 }
3332 /* Data compressed! Let's save it on disk */
3333 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3334 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3335 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3336 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3337 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3338 zfree(out);
3339 return comprlen;
3340
3341 writeerr:
3342 zfree(out);
3343 return -1;
3344 }
3345
3346 /* Save a string objet as [len][data] on disk. If the object is a string
3347 * representation of an integer value we try to safe it in a special form */
3348 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3349 int enclen;
3350
3351 /* Try integer encoding */
3352 if (len <= 11) {
3353 unsigned char buf[5];
3354 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3355 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3356 return 0;
3357 }
3358 }
3359
3360 /* Try LZF compression - under 20 bytes it's unable to compress even
3361 * aaaaaaaaaaaaaaaaaa so skip it */
3362 if (server.rdbcompression && len > 20) {
3363 int retval;
3364
3365 retval = rdbSaveLzfStringObject(fp,s,len);
3366 if (retval == -1) return -1;
3367 if (retval > 0) return 0;
3368 /* retval == 0 means data can't be compressed, save the old way */
3369 }
3370
3371 /* Store verbatim */
3372 if (rdbSaveLen(fp,len) == -1) return -1;
3373 if (len && fwrite(s,len,1,fp) == 0) return -1;
3374 return 0;
3375 }
3376
3377 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3378 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3379 int retval;
3380
3381 /* Avoid incr/decr ref count business when possible.
3382 * This plays well with copy-on-write given that we are probably
3383 * in a child process (BGSAVE). Also this makes sure key objects
3384 * of swapped objects are not incRefCount-ed (an assert does not allow
3385 * this in order to avoid bugs) */
3386 if (obj->encoding != REDIS_ENCODING_RAW) {
3387 obj = getDecodedObject(obj);
3388 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3389 decrRefCount(obj);
3390 } else {
3391 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3392 }
3393 return retval;
3394 }
3395
/* Save a double value. Doubles are stored as their "%.17g" string
 * representation, prefixed by an unsigned 8 bit integer holding the length
 * of that string. Three values of the prefix are reserved and are written
 * alone, with no string following:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total = out[0]+1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3422
3423 /* Save a Redis object. */
3424 static int rdbSaveObject(FILE *fp, robj *o) {
3425 if (o->type == REDIS_STRING) {
3426 /* Save a string value */
3427 if (rdbSaveStringObject(fp,o) == -1) return -1;
3428 } else if (o->type == REDIS_LIST) {
3429 /* Save a list value */
3430 list *list = o->ptr;
3431 listIter li;
3432 listNode *ln;
3433
3434 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3435 listRewind(list,&li);
3436 while((ln = listNext(&li))) {
3437 robj *eleobj = listNodeValue(ln);
3438
3439 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3440 }
3441 } else if (o->type == REDIS_SET) {
3442 /* Save a set value */
3443 dict *set = o->ptr;
3444 dictIterator *di = dictGetIterator(set);
3445 dictEntry *de;
3446
3447 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3448 while((de = dictNext(di)) != NULL) {
3449 robj *eleobj = dictGetEntryKey(de);
3450
3451 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3452 }
3453 dictReleaseIterator(di);
3454 } else if (o->type == REDIS_ZSET) {
3455 /* Save a set value */
3456 zset *zs = o->ptr;
3457 dictIterator *di = dictGetIterator(zs->dict);
3458 dictEntry *de;
3459
3460 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3461 while((de = dictNext(di)) != NULL) {
3462 robj *eleobj = dictGetEntryKey(de);
3463 double *score = dictGetEntryVal(de);
3464
3465 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3466 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3467 }
3468 dictReleaseIterator(di);
3469 } else if (o->type == REDIS_HASH) {
3470 /* Save a hash value */
3471 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3472 unsigned char *p = zipmapRewind(o->ptr);
3473 unsigned int count = zipmapLen(o->ptr);
3474 unsigned char *key, *val;
3475 unsigned int klen, vlen;
3476
3477 if (rdbSaveLen(fp,count) == -1) return -1;
3478 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3479 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3480 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3481 }
3482 } else {
3483 dictIterator *di = dictGetIterator(o->ptr);
3484 dictEntry *de;
3485
3486 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3487 while((de = dictNext(di)) != NULL) {
3488 robj *key = dictGetEntryKey(de);
3489 robj *val = dictGetEntryVal(de);
3490
3491 if (rdbSaveStringObject(fp,key) == -1) return -1;
3492 if (rdbSaveStringObject(fp,val) == -1) return -1;
3493 }
3494 dictReleaseIterator(di);
3495 }
3496 } else {
3497 redisPanic("Unknown object type");
3498 }
3499 return 0;
3500 }
3501
3502 /* Return the length the object will have on disk if saved with
3503 * the rdbSaveObject() function. Currently we use a trick to get
3504 * this length with very little changes to the code. In the future
3505 * we could switch to a faster solution. */
3506 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3507 if (fp == NULL) fp = server.devnull;
3508 rewind(fp);
3509 assert(rdbSaveObject(fp,o) != 1);
3510 return ftello(fp);
3511 }
3512
3513 /* Return the number of pages required to save this object in the swap file */
3514 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3515 off_t bytes = rdbSavedObjectLen(o,fp);
3516
3517 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3518 }
3519
3520 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3521 static int rdbSave(char *filename) {
3522 dictIterator *di = NULL;
3523 dictEntry *de;
3524 FILE *fp;
3525 char tmpfile[256];
3526 int j;
3527 time_t now = time(NULL);
3528
3529 /* Wait for I/O therads to terminate, just in case this is a
3530 * foreground-saving, to avoid seeking the swap file descriptor at the
3531 * same time. */
3532 if (server.vm_enabled)
3533 waitEmptyIOJobsQueue();
3534
3535 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3536 fp = fopen(tmpfile,"w");
3537 if (!fp) {
3538 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3539 return REDIS_ERR;
3540 }
3541 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3542 for (j = 0; j < server.dbnum; j++) {
3543 redisDb *db = server.db+j;
3544 dict *d = db->dict;
3545 if (dictSize(d) == 0) continue;
3546 di = dictGetIterator(d);
3547 if (!di) {
3548 fclose(fp);
3549 return REDIS_ERR;
3550 }
3551
3552 /* Write the SELECT DB opcode */
3553 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3554 if (rdbSaveLen(fp,j) == -1) goto werr;
3555
3556 /* Iterate this DB writing every entry */
3557 while((de = dictNext(di)) != NULL) {
3558 robj *key = dictGetEntryKey(de);
3559 robj *o = dictGetEntryVal(de);
3560 time_t expiretime = getExpire(db,key);
3561
3562 /* Save the expire time */
3563 if (expiretime != -1) {
3564 /* If this key is already expired skip it */
3565 if (expiretime < now) continue;
3566 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3567 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3568 }
3569 /* Save the key and associated value. This requires special
3570 * handling if the value is swapped out. */
3571 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3572 key->storage == REDIS_VM_SWAPPING) {
3573 /* Save type, key, value */
3574 if (rdbSaveType(fp,o->type) == -1) goto werr;
3575 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3576 if (rdbSaveObject(fp,o) == -1) goto werr;
3577 } else {
3578 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3579 robj *po;
3580 /* Get a preview of the object in memory */
3581 po = vmPreviewObject(key);
3582 /* Save type, key, value */
3583 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3584 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3585 if (rdbSaveObject(fp,po) == -1) goto werr;
3586 /* Remove the loaded object from memory */
3587 decrRefCount(po);
3588 }
3589 }
3590 dictReleaseIterator(di);
3591 }
3592 /* EOF opcode */
3593 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3594
3595 /* Make sure data will not remain on the OS's output buffers */
3596 fflush(fp);
3597 fsync(fileno(fp));
3598 fclose(fp);
3599
3600 /* Use RENAME to make sure the DB file is changed atomically only
3601 * if the generate DB file is ok. */
3602 if (rename(tmpfile,filename) == -1) {
3603 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3604 unlink(tmpfile);
3605 return REDIS_ERR;
3606 }
3607 redisLog(REDIS_NOTICE,"DB saved on disk");
3608 server.dirty = 0;
3609 server.lastsave = time(NULL);
3610 return REDIS_OK;
3611
3612 werr:
3613 fclose(fp);
3614 unlink(tmpfile);
3615 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3616 if (di) dictReleaseIterator(di);
3617 return REDIS_ERR;
3618 }
3619
3620 static int rdbSaveBackground(char *filename) {
3621 pid_t childpid;
3622
3623 if (server.bgsavechildpid != -1) return REDIS_ERR;
3624 if (server.vm_enabled) waitEmptyIOJobsQueue();
3625 if ((childpid = fork()) == 0) {
3626 /* Child */
3627 if (server.vm_enabled) vmReopenSwapFile();
3628 close(server.fd);
3629 if (rdbSave(filename) == REDIS_OK) {
3630 _exit(0);
3631 } else {
3632 _exit(1);
3633 }
3634 } else {
3635 /* Parent */
3636 if (childpid == -1) {
3637 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3638 strerror(errno));
3639 return REDIS_ERR;
3640 }
3641 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3642 server.bgsavechildpid = childpid;
3643 updateDictResizePolicy();
3644 return REDIS_OK;
3645 }
3646 return REDIS_OK; /* unreached */
3647 }
3648
/* Unlink the "temp-<pid>.rdb" file associated with the given child pid
 * (the same name pattern rdbSave() uses for its temp file). */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int)childpid);
    unlink(path);
}
3655
/* Read a one byte type/opcode from 'fp'.
 * Returns the byte value, or -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3661
/* Read a 4 byte time value from 'fp' as stored in memory (no byte order
 * conversion is performed). Returns -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3667
3668 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3669 * of this file for a description of how this are stored on disk.
3670 *
3671 * isencoded is set to 1 if the readed length is not actually a length but
3672 * an "encoding type", check the above comments for more info */
3673 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3674 unsigned char buf[2];
3675 uint32_t len;
3676 int type;
3677
3678 if (isencoded) *isencoded = 0;
3679 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3680 type = (buf[0]&0xC0)>>6;
3681 if (type == REDIS_RDB_6BITLEN) {
3682 /* Read a 6 bit len */
3683 return buf[0]&0x3F;
3684 } else if (type == REDIS_RDB_ENCVAL) {
3685 /* Read a 6 bit len encoding type */
3686 if (isencoded) *isencoded = 1;
3687 return buf[0]&0x3F;
3688 } else if (type == REDIS_RDB_14BITLEN) {
3689 /* Read a 14 bit len */
3690 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3691 return ((buf[0]&0x3F)<<8)|buf[1];
3692 } else {
3693 /* Read a 32 bit len */
3694 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3695 return ntohl(len);
3696 }
3697 }
3698
3699 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3700 unsigned char enc[4];
3701 long long val;
3702
3703 if (enctype == REDIS_RDB_ENC_INT8) {
3704 if (fread(enc,1,1,fp) == 0) return NULL;
3705 val = (signed char)enc[0];
3706 } else if (enctype == REDIS_RDB_ENC_INT16) {
3707 uint16_t v;
3708 if (fread(enc,2,1,fp) == 0) return NULL;
3709 v = enc[0]|(enc[1]<<8);
3710 val = (int16_t)v;
3711 } else if (enctype == REDIS_RDB_ENC_INT32) {
3712 uint32_t v;
3713 if (fread(enc,4,1,fp) == 0) return NULL;
3714 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3715 val = (int32_t)v;
3716 } else {
3717 val = 0; /* anti-warning */
3718 redisPanic("Unknown RDB integer encoding type");
3719 }
3720 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3721 }
3722
3723 static robj *rdbLoadLzfStringObject(FILE*fp) {
3724 unsigned int len, clen;
3725 unsigned char *c = NULL;
3726 sds val = NULL;
3727
3728 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3729 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3730 if ((c = zmalloc(clen)) == NULL) goto err;
3731 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3732 if (fread(c,clen,1,fp) == 0) goto err;
3733 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3734 zfree(c);
3735 return createObject(REDIS_STRING,val);
3736 err:
3737 zfree(c);
3738 sdsfree(val);
3739 return NULL;
3740 }
3741
3742 static robj *rdbLoadStringObject(FILE*fp) {
3743 int isencoded;
3744 uint32_t len;
3745 sds val;
3746
3747 len = rdbLoadLen(fp,&isencoded);
3748 if (isencoded) {
3749 switch(len) {
3750 case REDIS_RDB_ENC_INT8:
3751 case REDIS_RDB_ENC_INT16:
3752 case REDIS_RDB_ENC_INT32:
3753 return rdbLoadIntegerObject(fp,len);
3754 case REDIS_RDB_ENC_LZF:
3755 return rdbLoadLzfStringObject(fp);
3756 default:
3757 redisPanic("Unknown RDB encoding type");
3758 }
3759 }
3760
3761 if (len == REDIS_RDB_LENERR) return NULL;
3762 val = sdsnewlen(NULL,len);
3763 if (len && fread(val,len,1,fp) == 0) {
3764 sdsfree(val);
3765 return NULL;
3766 }
3767 return createObject(REDIS_STRING,val);
3768 }
3769
3770 /* For information about double serialization check rdbSaveDoubleValue() */
3771 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3772 char buf[128];
3773 unsigned char len;
3774
3775 if (fread(&len,1,1,fp) == 0) return -1;
3776 switch(len) {
3777 case 255: *val = R_NegInf; return 0;
3778 case 254: *val = R_PosInf; return 0;
3779 case 253: *val = R_Nan; return 0;
3780 default:
3781 if (fread(buf,len,1,fp) == 0) return -1;
3782 buf[len] = '\0';
3783 sscanf(buf, "%lg", val);
3784 return 0;
3785 }
3786 }
3787
3788 /* Load a Redis object of the specified type from the specified file.
3789 * On success a newly allocated object is returned, otherwise NULL. */
3790 static robj *rdbLoadObject(int type, FILE *fp) {
3791 robj *o;
3792
3793 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3794 if (type == REDIS_STRING) {
3795 /* Read string value */
3796 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3797 o = tryObjectEncoding(o);
3798 } else if (type == REDIS_LIST || type == REDIS_SET) {
3799 /* Read list/set value */
3800 uint32_t listlen;
3801
3802 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3803 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3804 /* It's faster to expand the dict to the right size asap in order
3805 * to avoid rehashing */
3806 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3807 dictExpand(o->ptr,listlen);
3808 /* Load every single element of the list/set */
3809 while(listlen--) {
3810 robj *ele;
3811
3812 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3813 ele = tryObjectEncoding(ele);
3814 if (type == REDIS_LIST) {
3815 listAddNodeTail((list*)o->ptr,ele);
3816 } else {
3817 dictAdd((dict*)o->ptr,ele,NULL);
3818 }
3819 }
3820 } else if (type == REDIS_ZSET) {
3821 /* Read list/set value */
3822 size_t zsetlen;
3823 zset *zs;
3824
3825 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3826 o = createZsetObject();
3827 zs = o->ptr;
3828 /* Load every single element of the list/set */
3829 while(zsetlen--) {
3830 robj *ele;
3831 double *score = zmalloc(sizeof(double));
3832
3833 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3834 ele = tryObjectEncoding(ele);
3835 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3836 dictAdd(zs->dict,ele,score);
3837 zslInsert(zs->zsl,*score,ele);
3838 incrRefCount(ele); /* added to skiplist */
3839 }
3840 } else if (type == REDIS_HASH) {
3841 size_t hashlen;
3842
3843 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3844 o = createHashObject();
3845 /* Too many entries? Use an hash table. */
3846 if (hashlen > server.hash_max_zipmap_entries)
3847 convertToRealHash(o);
3848 /* Load every key/value, then set it into the zipmap or hash
3849 * table, as needed. */
3850 while(hashlen--) {
3851 robj *key, *val;
3852
3853 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3854 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3855 /* If we are using a zipmap and there are too big values
3856 * the object is converted to real hash table encoding. */
3857 if (o->encoding != REDIS_ENCODING_HT &&
3858 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3859 sdslen(val->ptr) > server.hash_max_zipmap_value))
3860 {
3861 convertToRealHash(o);
3862 }
3863
3864 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3865 unsigned char *zm = o->ptr;
3866
3867 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3868 val->ptr,sdslen(val->ptr),NULL);
3869 o->ptr = zm;
3870 decrRefCount(key);
3871 decrRefCount(val);
3872 } else {
3873 key = tryObjectEncoding(key);
3874 val = tryObjectEncoding(val);
3875 dictAdd((dict*)o->ptr,key,val);
3876 }
3877 }
3878 } else {
3879 redisPanic("Unknown object type");
3880 }
3881 return o;
3882 }
3883
3884 static int rdbLoad(char *filename) {
3885 FILE *fp;
3886 robj *keyobj = NULL;
3887 uint32_t dbid;
3888 int type, retval, rdbver;
3889 dict *d = server.db[0].dict;
3890 redisDb *db = server.db+0;
3891 char buf[1024];
3892 time_t expiretime = -1, now = time(NULL);
3893 long long loadedkeys = 0;
3894
3895 fp = fopen(filename,"r");
3896 if (!fp) return REDIS_ERR;
3897 if (fread(buf,9,1,fp) == 0) goto eoferr;
3898 buf[9] = '\0';
3899 if (memcmp(buf,"REDIS",5) != 0) {
3900 fclose(fp);
3901 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3902 return REDIS_ERR;
3903 }
3904 rdbver = atoi(buf+5);
3905 if (rdbver != 1) {
3906 fclose(fp);
3907 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3908 return REDIS_ERR;
3909 }
3910 while(1) {
3911 robj *o;
3912
3913 /* Read type. */
3914 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3915 if (type == REDIS_EXPIRETIME) {
3916 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3917 /* We read the time so we need to read the object type again */
3918 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3919 }
3920 if (type == REDIS_EOF) break;
3921 /* Handle SELECT DB opcode as a special case */
3922 if (type == REDIS_SELECTDB) {
3923 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3924 goto eoferr;
3925 if (dbid >= (unsigned)server.dbnum) {
3926 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3927 exit(1);
3928 }
3929 db = server.db+dbid;
3930 d = db->dict;
3931 continue;
3932 }
3933 /* Read key */
3934 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3935 /* Read value */
3936 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3937 /* Add the new object in the hash table */
3938 retval = dictAdd(d,keyobj,o);
3939 if (retval == DICT_ERR) {
3940 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3941 exit(1);
3942 }
3943 /* Set the expire time if needed */
3944 if (expiretime != -1) {
3945 setExpire(db,keyobj,expiretime);
3946 /* Delete this key if already expired */
3947 if (expiretime < now) deleteKey(db,keyobj);
3948 expiretime = -1;
3949 }
3950 keyobj = o = NULL;
3951 /* Handle swapping while loading big datasets when VM is on */
3952 loadedkeys++;
3953 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3954 while (zmalloc_used_memory() > server.vm_max_memory) {
3955 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3956 }
3957 }
3958 }
3959 fclose(fp);
3960 return REDIS_OK;
3961
3962 eoferr: /* unexpected end of file is handled here with a fatal exit */
3963 if (keyobj) decrRefCount(keyobj);
3964 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3965 exit(1);
3966 return REDIS_ERR; /* Just to avoid warning */
3967 }
3968
3969 /*================================== Commands =============================== */
3970
3971 static void authCommand(redisClient *c) {
3972 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3973 c->authenticated = 1;
3974 addReply(c,shared.ok);
3975 } else {
3976 c->authenticated = 0;
3977 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3978 }
3979 }
3980
3981 static void pingCommand(redisClient *c) {
3982 addReply(c,shared.pong);
3983 }
3984
3985 static void echoCommand(redisClient *c) {
3986 addReplyBulk(c,c->argv[1]);
3987 }
3988
3989 /*=================================== Strings =============================== */
3990
3991 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
3992 int retval;
3993 long seconds;
3994
3995 if (expire) {
3996 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
3997 return;
3998 if (seconds <= 0) {
3999 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4000 return;
4001 }
4002 }
4003
4004 if (nx) deleteIfVolatile(c->db,key);
4005 retval = dictAdd(c->db->dict,key,val);
4006 if (retval == DICT_ERR) {
4007 if (!nx) {
4008 /* If the key is about a swapped value, we want a new key object
4009 * to overwrite the old. So we delete the old key in the database.
4010 * This will also make sure that swap pages about the old object
4011 * will be marked as free. */
4012 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4013 incrRefCount(key);
4014 dictReplace(c->db->dict,key,val);
4015 incrRefCount(val);
4016 } else {
4017 addReply(c,shared.czero);
4018 return;
4019 }
4020 } else {
4021 incrRefCount(key);
4022 incrRefCount(val);
4023 }
4024 server.dirty++;
4025 removeExpire(c->db,key);
4026 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4027 addReply(c, nx ? shared.cone : shared.ok);
4028 }
4029
4030 static void setCommand(redisClient *c) {
4031 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4032 }
4033
4034 static void setnxCommand(redisClient *c) {
4035 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4036 }
4037
4038 static void setexCommand(redisClient *c) {
4039 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4040 }
4041
4042 static int getGenericCommand(redisClient *c) {
4043 robj *o;
4044
4045 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4046 return REDIS_OK;
4047
4048 if (o->type != REDIS_STRING) {
4049 addReply(c,shared.wrongtypeerr);
4050 return REDIS_ERR;
4051 } else {
4052 addReplyBulk(c,o);
4053 return REDIS_OK;
4054 }
4055 }
4056
4057 static void getCommand(redisClient *c) {
4058 getGenericCommand(c);
4059 }
4060
4061 static void getsetCommand(redisClient *c) {
4062 if (getGenericCommand(c) == REDIS_ERR) return;
4063 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4064 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4065 } else {
4066 incrRefCount(c->argv[1]);
4067 }
4068 incrRefCount(c->argv[2]);
4069 server.dirty++;
4070 removeExpire(c->db,c->argv[1]);
4071 }
4072
4073 static void mgetCommand(redisClient *c) {
4074 int j;
4075
4076 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4077 for (j = 1; j < c->argc; j++) {
4078 robj *o = lookupKeyRead(c->db,c->argv[j]);
4079 if (o == NULL) {
4080 addReply(c,shared.nullbulk);
4081 } else {
4082 if (o->type != REDIS_STRING) {
4083 addReply(c,shared.nullbulk);
4084 } else {
4085 addReplyBulk(c,o);
4086 }
4087 }
4088 }
4089 }
4090
4091 static void msetGenericCommand(redisClient *c, int nx) {
4092 int j, busykeys = 0;
4093
4094 if ((c->argc % 2) == 0) {
4095 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4096 return;
4097 }
4098 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4099 * set nothing at all if at least one already key exists. */
4100 if (nx) {
4101 for (j = 1; j < c->argc; j += 2) {
4102 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4103 busykeys++;
4104 }
4105 }
4106 }
4107 if (busykeys) {
4108 addReply(c, shared.czero);
4109 return;
4110 }
4111
4112 for (j = 1; j < c->argc; j += 2) {
4113 int retval;
4114
4115 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4116 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4117 if (retval == DICT_ERR) {
4118 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4119 incrRefCount(c->argv[j+1]);
4120 } else {
4121 incrRefCount(c->argv[j]);
4122 incrRefCount(c->argv[j+1]);
4123 }
4124 removeExpire(c->db,c->argv[j]);
4125 }
4126 server.dirty += (c->argc-1)/2;
4127 addReply(c, nx ? shared.cone : shared.ok);
4128 }
4129
4130 static void msetCommand(redisClient *c) {
4131 msetGenericCommand(c,0);
4132 }
4133
4134 static void msetnxCommand(redisClient *c) {
4135 msetGenericCommand(c,1);
4136 }
4137
4138 static void incrDecrCommand(redisClient *c, long long incr) {
4139 long long value;
4140 int retval;
4141 robj *o;
4142
4143 o = lookupKeyWrite(c->db,c->argv[1]);
4144
4145 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4146
4147 value += incr;
4148 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4149 o = tryObjectEncoding(o);
4150 retval = dictAdd(c->db->dict,c->argv[1],o);
4151 if (retval == DICT_ERR) {
4152 dictReplace(c->db->dict,c->argv[1],o);
4153 removeExpire(c->db,c->argv[1]);
4154 } else {
4155 incrRefCount(c->argv[1]);
4156 }
4157 server.dirty++;
4158 addReply(c,shared.colon);
4159 addReply(c,o);
4160 addReply(c,shared.crlf);
4161 }
4162
4163 static void incrCommand(redisClient *c) {
4164 incrDecrCommand(c,1);
4165 }
4166
4167 static void decrCommand(redisClient *c) {
4168 incrDecrCommand(c,-1);
4169 }
4170
4171 static void incrbyCommand(redisClient *c) {
4172 long long incr;
4173
4174 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4175 incrDecrCommand(c,incr);
4176 }
4177
4178 static void decrbyCommand(redisClient *c) {
4179 long long incr;
4180
4181 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4182 incrDecrCommand(c,-incr);
4183 }
4184
4185 static void appendCommand(redisClient *c) {
4186 int retval;
4187 size_t totlen;
4188 robj *o;
4189
4190 o = lookupKeyWrite(c->db,c->argv[1]);
4191 if (o == NULL) {
4192 /* Create the key */
4193 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4194 incrRefCount(c->argv[1]);
4195 incrRefCount(c->argv[2]);
4196 totlen = stringObjectLen(c->argv[2]);
4197 } else {
4198 dictEntry *de;
4199
4200 de = dictFind(c->db->dict,c->argv[1]);
4201 assert(de != NULL);
4202
4203 o = dictGetEntryVal(de);
4204 if (o->type != REDIS_STRING) {
4205 addReply(c,shared.wrongtypeerr);
4206 return;
4207 }
4208 /* If the object is specially encoded or shared we have to make
4209 * a copy */
4210 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4211 robj *decoded = getDecodedObject(o);
4212
4213 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4214 decrRefCount(decoded);
4215 dictReplace(c->db->dict,c->argv[1],o);
4216 }
4217 /* APPEND! */
4218 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4219 o->ptr = sdscatlen(o->ptr,
4220 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4221 } else {
4222 o->ptr = sdscatprintf(o->ptr, "%ld",
4223 (unsigned long) c->argv[2]->ptr);
4224 }
4225 totlen = sdslen(o->ptr);
4226 }
4227 server.dirty++;
4228 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4229 }
4230
4231 static void substrCommand(redisClient *c) {
4232 robj *o;
4233 long start = atoi(c->argv[2]->ptr);
4234 long end = atoi(c->argv[3]->ptr);
4235 size_t rangelen, strlen;
4236 sds range;
4237
4238 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4239 checkType(c,o,REDIS_STRING)) return;
4240
4241 o = getDecodedObject(o);
4242 strlen = sdslen(o->ptr);
4243
4244 /* convert negative indexes */
4245 if (start < 0) start = strlen+start;
4246 if (end < 0) end = strlen+end;
4247 if (start < 0) start = 0;
4248 if (end < 0) end = 0;
4249
4250 /* indexes sanity checks */
4251 if (start > end || (size_t)start >= strlen) {
4252 /* Out of range start or start > end result in null reply */
4253 addReply(c,shared.nullbulk);
4254 decrRefCount(o);
4255 return;
4256 }
4257 if ((size_t)end >= strlen) end = strlen-1;
4258 rangelen = (end-start)+1;
4259
4260 /* Return the result */
4261 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4262 range = sdsnewlen((char*)o->ptr+start,rangelen);
4263 addReplySds(c,range);
4264 addReply(c,shared.crlf);
4265 decrRefCount(o);
4266 }
4267
4268 /* ========================= Type agnostic commands ========================= */
4269
4270 static void delCommand(redisClient *c) {
4271 int deleted = 0, j;
4272
4273 for (j = 1; j < c->argc; j++) {
4274 if (deleteKey(c->db,c->argv[j])) {
4275 server.dirty++;
4276 deleted++;
4277 }
4278 }
4279 addReplyLong(c,deleted);
4280 }
4281
4282 static void existsCommand(redisClient *c) {
4283 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4284 }
4285
4286 static void selectCommand(redisClient *c) {
4287 int id = atoi(c->argv[1]->ptr);
4288
4289 if (selectDb(c,id) == REDIS_ERR) {
4290 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4291 } else {
4292 addReply(c,shared.ok);
4293 }
4294 }
4295
4296 static void randomkeyCommand(redisClient *c) {
4297 dictEntry *de;
4298 robj *key;
4299
4300 while(1) {
4301 de = dictGetRandomKey(c->db->dict);
4302 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4303 }
4304
4305 if (de == NULL) {
4306 addReply(c,shared.nullbulk);
4307 return;
4308 }
4309
4310 key = dictGetEntryKey(de);
4311 if (server.vm_enabled) {
4312 key = dupStringObject(key);
4313 addReplyBulk(c,key);
4314 decrRefCount(key);
4315 } else {
4316 addReplyBulk(c,key);
4317 }
4318 }
4319
4320 static void keysCommand(redisClient *c) {
4321 dictIterator *di;
4322 dictEntry *de;
4323 sds pattern = c->argv[1]->ptr;
4324 int plen = sdslen(pattern);
4325 unsigned long numkeys = 0;
4326 robj *lenobj = createObject(REDIS_STRING,NULL);
4327
4328 di = dictGetIterator(c->db->dict);
4329 addReply(c,lenobj);
4330 decrRefCount(lenobj);
4331 while((de = dictNext(di)) != NULL) {
4332 robj *keyobj = dictGetEntryKey(de);
4333
4334 sds key = keyobj->ptr;
4335 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4336 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4337 if (expireIfNeeded(c->db,keyobj) == 0) {
4338 addReplyBulk(c,keyobj);
4339 numkeys++;
4340 }
4341 }
4342 }
4343 dictReleaseIterator(di);
4344 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4345 }
4346
4347 static void dbsizeCommand(redisClient *c) {
4348 addReplySds(c,
4349 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4350 }
4351
4352 static void lastsaveCommand(redisClient *c) {
4353 addReplySds(c,
4354 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4355 }
4356
4357 static void typeCommand(redisClient *c) {
4358 robj *o;
4359 char *type;
4360
4361 o = lookupKeyRead(c->db,c->argv[1]);
4362 if (o == NULL) {
4363 type = "+none";
4364 } else {
4365 switch(o->type) {
4366 case REDIS_STRING: type = "+string"; break;
4367 case REDIS_LIST: type = "+list"; break;
4368 case REDIS_SET: type = "+set"; break;
4369 case REDIS_ZSET: type = "+zset"; break;
4370 case REDIS_HASH: type = "+hash"; break;
4371 default: type = "+unknown"; break;
4372 }
4373 }
4374 addReplySds(c,sdsnew(type));
4375 addReply(c,shared.crlf);
4376 }
4377
4378 static void saveCommand(redisClient *c) {
4379 if (server.bgsavechildpid != -1) {
4380 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4381 return;
4382 }
4383 if (rdbSave(server.dbfilename) == REDIS_OK) {
4384 addReply(c,shared.ok);
4385 } else {
4386 addReply(c,shared.err);
4387 }
4388 }
4389
4390 static void bgsaveCommand(redisClient *c) {
4391 if (server.bgsavechildpid != -1) {
4392 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4393 return;
4394 }
4395 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4396 char *status = "+Background saving started\r\n";
4397 addReplySds(c,sdsnew(status));
4398 } else {
4399 addReply(c,shared.err);
4400 }
4401 }
4402
4403 static void shutdownCommand(redisClient *c) {
4404 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4405 /* Kill the saving child if there is a background saving in progress.
4406 We want to avoid race conditions, for instance our saving child may
4407 overwrite the synchronous saving did by SHUTDOWN. */
4408 if (server.bgsavechildpid != -1) {
4409 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4410 kill(server.bgsavechildpid,SIGKILL);
4411 rdbRemoveTempFile(server.bgsavechildpid);
4412 }
4413 if (server.appendonly) {
4414 /* Append only file: fsync() the AOF and exit */
4415 fsync(server.appendfd);
4416 if (server.vm_enabled) unlink(server.vm_swap_file);
4417 exit(0);
4418 } else {
4419 /* Snapshotting. Perform a SYNC SAVE and exit */
4420 if (rdbSave(server.dbfilename) == REDIS_OK) {
4421 if (server.daemonize)
4422 unlink(server.pidfile);
4423 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4424 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4425 if (server.vm_enabled) unlink(server.vm_swap_file);
4426 exit(0);
4427 } else {
4428 /* Ooops.. error saving! The best we can do is to continue
4429 * operating. Note that if there was a background saving process,
4430 * in the next cron() Redis will be notified that the background
4431 * saving aborted, handling special stuff like slaves pending for
4432 * synchronization... */
4433 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4434 addReplySds(c,
4435 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4436 }
4437 }
4438 }
4439
4440 static void renameGenericCommand(redisClient *c, int nx) {
4441 robj *o;
4442
4443 /* To use the same key as src and dst is probably an error */
4444 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4445 addReply(c,shared.sameobjecterr);
4446 return;
4447 }
4448
4449 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4450 return;
4451
4452 incrRefCount(o);
4453 deleteIfVolatile(c->db,c->argv[2]);
4454 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4455 if (nx) {
4456 decrRefCount(o);
4457 addReply(c,shared.czero);
4458 return;
4459 }
4460 dictReplace(c->db->dict,c->argv[2],o);
4461 } else {
4462 incrRefCount(c->argv[2]);
4463 }
4464 deleteKey(c->db,c->argv[1]);
4465 server.dirty++;
4466 addReply(c,nx ? shared.cone : shared.ok);
4467 }
4468
4469 static void renameCommand(redisClient *c) {
4470 renameGenericCommand(c,0);
4471 }
4472
4473 static void renamenxCommand(redisClient *c) {
4474 renameGenericCommand(c,1);
4475 }
4476
4477 static void moveCommand(redisClient *c) {
4478 robj *o;
4479 redisDb *src, *dst;
4480 int srcid;
4481
4482 /* Obtain source and target DB pointers */
4483 src = c->db;
4484 srcid = c->db->id;
4485 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4486 addReply(c,shared.outofrangeerr);
4487 return;
4488 }
4489 dst = c->db;
4490 selectDb(c,srcid); /* Back to the source DB */
4491
4492 /* If the user is moving using as target the same
4493 * DB as the source DB it is probably an error. */
4494 if (src == dst) {
4495 addReply(c,shared.sameobjecterr);
4496 return;
4497 }
4498
4499 /* Check if the element exists and get a reference */
4500 o = lookupKeyWrite(c->db,c->argv[1]);
4501 if (!o) {
4502 addReply(c,shared.czero);
4503 return;
4504 }
4505
4506 /* Try to add the element to the target DB */
4507 deleteIfVolatile(dst,c->argv[1]);
4508 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4509 addReply(c,shared.czero);
4510 return;
4511 }
4512 incrRefCount(c->argv[1]);
4513 incrRefCount(o);
4514
4515 /* OK! key moved, free the entry in the source DB */
4516 deleteKey(src,c->argv[1]);
4517 server.dirty++;
4518 addReply(c,shared.cone);
4519 }
4520
4521 /* =================================== Lists ================================ */
4522 static void pushGenericCommand(redisClient *c, int where) {
4523 robj *lobj;
4524 list *list;
4525
4526 lobj = lookupKeyWrite(c->db,c->argv[1]);
4527 if (lobj == NULL) {
4528 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4529 addReply(c,shared.cone);
4530 return;
4531 }
4532 lobj = createListObject();
4533 list = lobj->ptr;
4534 if (where == REDIS_HEAD) {
4535 listAddNodeHead(list,c->argv[2]);
4536 } else {
4537 listAddNodeTail(list,c->argv[2]);
4538 }
4539 dictAdd(c->db->dict,c->argv[1],lobj);
4540 incrRefCount(c->argv[1]);
4541 incrRefCount(c->argv[2]);
4542 } else {
4543 if (lobj->type != REDIS_LIST) {
4544 addReply(c,shared.wrongtypeerr);
4545 return;
4546 }
4547 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4548 addReply(c,shared.cone);
4549 return;
4550 }
4551 list = lobj->ptr;
4552 if (where == REDIS_HEAD) {
4553 listAddNodeHead(list,c->argv[2]);
4554 } else {
4555 listAddNodeTail(list,c->argv[2]);
4556 }
4557 incrRefCount(c->argv[2]);
4558 }
4559 server.dirty++;
4560 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4561 }
4562
4563 static void lpushCommand(redisClient *c) {
4564 pushGenericCommand(c,REDIS_HEAD);
4565 }
4566
4567 static void rpushCommand(redisClient *c) {
4568 pushGenericCommand(c,REDIS_TAIL);
4569 }
4570
4571 static void llenCommand(redisClient *c) {
4572 robj *o;
4573 list *l;
4574
4575 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4576 checkType(c,o,REDIS_LIST)) return;
4577
4578 l = o->ptr;
4579 addReplyUlong(c,listLength(l));
4580 }
4581
4582 static void lindexCommand(redisClient *c) {
4583 robj *o;
4584 int index = atoi(c->argv[2]->ptr);
4585 list *list;
4586 listNode *ln;
4587
4588 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4589 checkType(c,o,REDIS_LIST)) return;
4590 list = o->ptr;
4591
4592 ln = listIndex(list, index);
4593 if (ln == NULL) {
4594 addReply(c,shared.nullbulk);
4595 } else {
4596 robj *ele = listNodeValue(ln);
4597 addReplyBulk(c,ele);
4598 }
4599 }
4600
4601 static void lsetCommand(redisClient *c) {
4602 robj *o;
4603 int index = atoi(c->argv[2]->ptr);
4604 list *list;
4605 listNode *ln;
4606
4607 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4608 checkType(c,o,REDIS_LIST)) return;
4609 list = o->ptr;
4610
4611 ln = listIndex(list, index);
4612 if (ln == NULL) {
4613 addReply(c,shared.outofrangeerr);
4614 } else {
4615 robj *ele = listNodeValue(ln);
4616
4617 decrRefCount(ele);
4618 listNodeValue(ln) = c->argv[3];
4619 incrRefCount(c->argv[3]);
4620 addReply(c,shared.ok);
4621 server.dirty++;
4622 }
4623 }
4624
4625 static void popGenericCommand(redisClient *c, int where) {
4626 robj *o;
4627 list *list;
4628 listNode *ln;
4629
4630 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4631 checkType(c,o,REDIS_LIST)) return;
4632 list = o->ptr;
4633
4634 if (where == REDIS_HEAD)
4635 ln = listFirst(list);
4636 else
4637 ln = listLast(list);
4638
4639 if (ln == NULL) {
4640 addReply(c,shared.nullbulk);
4641 } else {
4642 robj *ele = listNodeValue(ln);
4643 addReplyBulk(c,ele);
4644 listDelNode(list,ln);
4645 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4646 server.dirty++;
4647 }
4648 }
4649
4650 static void lpopCommand(redisClient *c) {
4651 popGenericCommand(c,REDIS_HEAD);
4652 }
4653
4654 static void rpopCommand(redisClient *c) {
4655 popGenericCommand(c,REDIS_TAIL);
4656 }
4657
4658 static void lrangeCommand(redisClient *c) {
4659 robj *o;
4660 int start = atoi(c->argv[2]->ptr);
4661 int end = atoi(c->argv[3]->ptr);
4662 int llen;
4663 int rangelen, j;
4664 list *list;
4665 listNode *ln;
4666 robj *ele;
4667
4668 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4669 || checkType(c,o,REDIS_LIST)) return;
4670 list = o->ptr;
4671 llen = listLength(list);
4672
4673 /* convert negative indexes */
4674 if (start < 0) start = llen+start;
4675 if (end < 0) end = llen+end;
4676 if (start < 0) start = 0;
4677 if (end < 0) end = 0;
4678
4679 /* indexes sanity checks */
4680 if (start > end || start >= llen) {
4681 /* Out of range start or start > end result in empty list */
4682 addReply(c,shared.emptymultibulk);
4683 return;
4684 }
4685 if (end >= llen) end = llen-1;
4686 rangelen = (end-start)+1;
4687
4688 /* Return the result in form of a multi-bulk reply */
4689 ln = listIndex(list, start);
4690 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4691 for (j = 0; j < rangelen; j++) {
4692 ele = listNodeValue(ln);
4693 addReplyBulk(c,ele);
4694 ln = ln->next;
4695 }
4696 }
4697
4698 static void ltrimCommand(redisClient *c) {
4699 robj *o;
4700 int start = atoi(c->argv[2]->ptr);
4701 int end = atoi(c->argv[3]->ptr);
4702 int llen;
4703 int j, ltrim, rtrim;
4704 list *list;
4705 listNode *ln;
4706
4707 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4708 checkType(c,o,REDIS_LIST)) return;
4709 list = o->ptr;
4710 llen = listLength(list);
4711
4712 /* convert negative indexes */
4713 if (start < 0) start = llen+start;
4714 if (end < 0) end = llen+end;
4715 if (start < 0) start = 0;
4716 if (end < 0) end = 0;
4717
4718 /* indexes sanity checks */
4719 if (start > end || start >= llen) {
4720 /* Out of range start or start > end result in empty list */
4721 ltrim = llen;
4722 rtrim = 0;
4723 } else {
4724 if (end >= llen) end = llen-1;
4725 ltrim = start;
4726 rtrim = llen-end-1;
4727 }
4728
4729 /* Remove list elements to perform the trim */
4730 for (j = 0; j < ltrim; j++) {
4731 ln = listFirst(list);
4732 listDelNode(list,ln);
4733 }
4734 for (j = 0; j < rtrim; j++) {
4735 ln = listLast(list);
4736 listDelNode(list,ln);
4737 }
4738 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4739 server.dirty++;
4740 addReply(c,shared.ok);
4741 }
4742
4743 static void lremCommand(redisClient *c) {
4744 robj *o;
4745 list *list;
4746 listNode *ln, *next;
4747 int toremove = atoi(c->argv[2]->ptr);
4748 int removed = 0;
4749 int fromtail = 0;
4750
4751 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4752 checkType(c,o,REDIS_LIST)) return;
4753 list = o->ptr;
4754
4755 if (toremove < 0) {
4756 toremove = -toremove;
4757 fromtail = 1;
4758 }
4759 ln = fromtail ? list->tail : list->head;
4760 while (ln) {
4761 robj *ele = listNodeValue(ln);
4762
4763 next = fromtail ? ln->prev : ln->next;
4764 if (compareStringObjects(ele,c->argv[3]) == 0) {
4765 listDelNode(list,ln);
4766 server.dirty++;
4767 removed++;
4768 if (toremove && removed == toremove) break;
4769 }
4770 ln = next;
4771 }
4772 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4773 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4774 }
4775
4776 /* This is the semantic of this command:
4777 * RPOPLPUSH srclist dstlist:
4778 * IF LLEN(srclist) > 0
4779 * element = RPOP srclist
4780 * LPUSH dstlist element
4781 * RETURN element
4782 * ELSE
4783 * RETURN nil
4784 * END
4785 * END
4786 *
4787 * The idea is to be able to get an element from a list in a reliable way
4788 * since the element is not just returned but pushed against another list
4789 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4790 */
4791 static void rpoplpushcommand(redisClient *c) {
4792 robj *sobj;
4793 list *srclist;
4794 listNode *ln;
4795
4796 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4797 checkType(c,sobj,REDIS_LIST)) return;
4798 srclist = sobj->ptr;
4799 ln = listLast(srclist);
4800
4801 if (ln == NULL) {
4802 addReply(c,shared.nullbulk);
4803 } else {
4804 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4805 robj *ele = listNodeValue(ln);
4806 list *dstlist;
4807
4808 if (dobj && dobj->type != REDIS_LIST) {
4809 addReply(c,shared.wrongtypeerr);
4810 return;
4811 }
4812
4813 /* Add the element to the target list (unless it's directly
4814 * passed to some BLPOP-ing client */
4815 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4816 if (dobj == NULL) {
4817 /* Create the list if the key does not exist */
4818 dobj = createListObject();
4819 dictAdd(c->db->dict,c->argv[2],dobj);
4820 incrRefCount(c->argv[2]);
4821 }
4822 dstlist = dobj->ptr;
4823 listAddNodeHead(dstlist,ele);
4824 incrRefCount(ele);
4825 }
4826
4827 /* Send the element to the client as reply as well */
4828 addReplyBulk(c,ele);
4829
4830 /* Finally remove the element from the source list */
4831 listDelNode(srclist,ln);
4832 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4833 server.dirty++;
4834 }
4835 }
4836
4837 /* ==================================== Sets ================================ */
4838
4839 static void saddCommand(redisClient *c) {
4840 robj *set;
4841
4842 set = lookupKeyWrite(c->db,c->argv[1]);
4843 if (set == NULL) {
4844 set = createSetObject();
4845 dictAdd(c->db->dict,c->argv[1],set);
4846 incrRefCount(c->argv[1]);
4847 } else {
4848 if (set->type != REDIS_SET) {
4849 addReply(c,shared.wrongtypeerr);
4850 return;
4851 }
4852 }
4853 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4854 incrRefCount(c->argv[2]);
4855 server.dirty++;
4856 addReply(c,shared.cone);
4857 } else {
4858 addReply(c,shared.czero);
4859 }
4860 }
4861
4862 static void sremCommand(redisClient *c) {
4863 robj *set;
4864
4865 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4866 checkType(c,set,REDIS_SET)) return;
4867
4868 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4869 server.dirty++;
4870 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4871 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4872 addReply(c,shared.cone);
4873 } else {
4874 addReply(c,shared.czero);
4875 }
4876 }
4877
4878 static void smoveCommand(redisClient *c) {
4879 robj *srcset, *dstset;
4880
4881 srcset = lookupKeyWrite(c->db,c->argv[1]);
4882 dstset = lookupKeyWrite(c->db,c->argv[2]);
4883
4884 /* If the source key does not exist return 0, if it's of the wrong type
4885 * raise an error */
4886 if (srcset == NULL || srcset->type != REDIS_SET) {
4887 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4888 return;
4889 }
4890 /* Error if the destination key is not a set as well */
4891 if (dstset && dstset->type != REDIS_SET) {
4892 addReply(c,shared.wrongtypeerr);
4893 return;
4894 }
4895 /* Remove the element from the source set */
4896 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4897 /* Key not found in the src set! return zero */
4898 addReply(c,shared.czero);
4899 return;
4900 }
4901 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4902 deleteKey(c->db,c->argv[1]);
4903 server.dirty++;
4904 /* Add the element to the destination set */
4905 if (!dstset) {
4906 dstset = createSetObject();
4907 dictAdd(c->db->dict,c->argv[2],dstset);
4908 incrRefCount(c->argv[2]);
4909 }
4910 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4911 incrRefCount(c->argv[3]);
4912 addReply(c,shared.cone);
4913 }
4914
4915 static void sismemberCommand(redisClient *c) {
4916 robj *set;
4917
4918 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4919 checkType(c,set,REDIS_SET)) return;
4920
4921 if (dictFind(set->ptr,c->argv[2]))
4922 addReply(c,shared.cone);
4923 else
4924 addReply(c,shared.czero);
4925 }
4926
4927 static void scardCommand(redisClient *c) {
4928 robj *o;
4929 dict *s;
4930
4931 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4932 checkType(c,o,REDIS_SET)) return;
4933
4934 s = o->ptr;
4935 addReplyUlong(c,dictSize(s));
4936 }
4937
4938 static void spopCommand(redisClient *c) {
4939 robj *set;
4940 dictEntry *de;
4941
4942 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4943 checkType(c,set,REDIS_SET)) return;
4944
4945 de = dictGetRandomKey(set->ptr);
4946 if (de == NULL) {
4947 addReply(c,shared.nullbulk);
4948 } else {
4949 robj *ele = dictGetEntryKey(de);
4950
4951 addReplyBulk(c,ele);
4952 dictDelete(set->ptr,ele);
4953 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4954 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4955 server.dirty++;
4956 }
4957 }
4958
4959 static void srandmemberCommand(redisClient *c) {
4960 robj *set;
4961 dictEntry *de;
4962
4963 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4964 checkType(c,set,REDIS_SET)) return;
4965
4966 de = dictGetRandomKey(set->ptr);
4967 if (de == NULL) {
4968 addReply(c,shared.nullbulk);
4969 } else {
4970 robj *ele = dictGetEntryKey(de);
4971
4972 addReplyBulk(c,ele);
4973 }
4974 }
4975
4976 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4977 dict **d1 = (void*) s1, **d2 = (void*) s2;
4978
4979 return dictSize(*d1)-dictSize(*d2);
4980 }
4981
4982 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4983 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4984 dictIterator *di;
4985 dictEntry *de;
4986 robj *lenobj = NULL, *dstset = NULL;
4987 unsigned long j, cardinality = 0;
4988
4989 for (j = 0; j < setsnum; j++) {
4990 robj *setobj;
4991
4992 setobj = dstkey ?
4993 lookupKeyWrite(c->db,setskeys[j]) :
4994 lookupKeyRead(c->db,setskeys[j]);
4995 if (!setobj) {
4996 zfree(dv);
4997 if (dstkey) {
4998 if (deleteKey(c->db,dstkey))
4999 server.dirty++;
5000 addReply(c,shared.czero);
5001 } else {
5002 addReply(c,shared.emptymultibulk);
5003 }
5004 return;
5005 }
5006 if (setobj->type != REDIS_SET) {
5007 zfree(dv);
5008 addReply(c,shared.wrongtypeerr);
5009 return;
5010 }
5011 dv[j] = setobj->ptr;
5012 }
5013 /* Sort sets from the smallest to largest, this will improve our
5014 * algorithm's performace */
5015 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5016
5017 /* The first thing we should output is the total number of elements...
5018 * since this is a multi-bulk write, but at this stage we don't know
5019 * the intersection set size, so we use a trick, append an empty object
5020 * to the output list and save the pointer to later modify it with the
5021 * right length */
5022 if (!dstkey) {
5023 lenobj = createObject(REDIS_STRING,NULL);
5024 addReply(c,lenobj);
5025 decrRefCount(lenobj);
5026 } else {
5027 /* If we have a target key where to store the resulting set
5028 * create this key with an empty set inside */
5029 dstset = createSetObject();
5030 }
5031
5032 /* Iterate all the elements of the first (smallest) set, and test
5033 * the element against all the other sets, if at least one set does
5034 * not include the element it is discarded */
5035 di = dictGetIterator(dv[0]);
5036
5037 while((de = dictNext(di)) != NULL) {
5038 robj *ele;
5039
5040 for (j = 1; j < setsnum; j++)
5041 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5042 if (j != setsnum)
5043 continue; /* at least one set does not contain the member */
5044 ele = dictGetEntryKey(de);
5045 if (!dstkey) {
5046 addReplyBulk(c,ele);
5047 cardinality++;
5048 } else {
5049 dictAdd(dstset->ptr,ele,NULL);
5050 incrRefCount(ele);
5051 }
5052 }
5053 dictReleaseIterator(di);
5054
5055 if (dstkey) {
5056 /* Store the resulting set into the target, if the intersection
5057 * is not an empty set. */
5058 deleteKey(c->db,dstkey);
5059 if (dictSize((dict*)dstset->ptr) > 0) {
5060 dictAdd(c->db->dict,dstkey,dstset);
5061 incrRefCount(dstkey);
5062 addReplyLong(c,dictSize((dict*)dstset->ptr));
5063 } else {
5064 decrRefCount(dstset);
5065 addReply(c,shared.czero);
5066 }
5067 server.dirty++;
5068 } else {
5069 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5070 }
5071 zfree(dv);
5072 }
5073
5074 static void sinterCommand(redisClient *c) {
5075 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5076 }
5077
5078 static void sinterstoreCommand(redisClient *c) {
5079 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5080 }
5081
/* Operation selectors; sunionDiffGenericCommand() takes one as 'op'. */
#define REDIS_OP_UNION  0
#define REDIS_OP_DIFF   1
#define REDIS_OP_INTER  2
5085
5086 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5087 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5088 dictIterator *di;
5089 dictEntry *de;
5090 robj *dstset = NULL;
5091 int j, cardinality = 0;
5092
5093 for (j = 0; j < setsnum; j++) {
5094 robj *setobj;
5095
5096 setobj = dstkey ?
5097 lookupKeyWrite(c->db,setskeys[j]) :
5098 lookupKeyRead(c->db,setskeys[j]);
5099 if (!setobj) {
5100 dv[j] = NULL;
5101 continue;
5102 }
5103 if (setobj->type != REDIS_SET) {
5104 zfree(dv);
5105 addReply(c,shared.wrongtypeerr);
5106 return;
5107 }
5108 dv[j] = setobj->ptr;
5109 }
5110
5111 /* We need a temp set object to store our union. If the dstkey
5112 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5113 * this set object will be the resulting object to set into the target key*/
5114 dstset = createSetObject();
5115
5116 /* Iterate all the elements of all the sets, add every element a single
5117 * time to the result set */
5118 for (j = 0; j < setsnum; j++) {
5119 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5120 if (!dv[j]) continue; /* non existing keys are like empty sets */
5121
5122 di = dictGetIterator(dv[j]);
5123
5124 while((de = dictNext(di)) != NULL) {
5125 robj *ele;
5126
5127 /* dictAdd will not add the same element multiple times */
5128 ele = dictGetEntryKey(de);
5129 if (op == REDIS_OP_UNION || j == 0) {
5130 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5131 incrRefCount(ele);
5132 cardinality++;
5133 }
5134 } else if (op == REDIS_OP_DIFF) {
5135 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5136 cardinality--;
5137 }
5138 }
5139 }
5140 dictReleaseIterator(di);
5141
5142 /* result set is empty? Exit asap. */
5143 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5144 }
5145
5146 /* Output the content of the resulting set, if not in STORE mode */
5147 if (!dstkey) {
5148 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5149 di = dictGetIterator(dstset->ptr);
5150 while((de = dictNext(di)) != NULL) {
5151 robj *ele;
5152
5153 ele = dictGetEntryKey(de);
5154 addReplyBulk(c,ele);
5155 }
5156 dictReleaseIterator(di);
5157 decrRefCount(dstset);
5158 } else {
5159 /* If we have a target key where to store the resulting set
5160 * create this key with the result set inside */
5161 deleteKey(c->db,dstkey);
5162 if (dictSize((dict*)dstset->ptr) > 0) {
5163 dictAdd(c->db->dict,dstkey,dstset);
5164 incrRefCount(dstkey);
5165 addReplyLong(c,dictSize((dict*)dstset->ptr));
5166 } else {
5167 decrRefCount(dstset);
5168 addReply(c,shared.czero);
5169 }
5170 server.dirty++;
5171 }
5172 zfree(dv);
5173 }
5174
5175 static void sunionCommand(redisClient *c) {
5176 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5177 }
5178
5179 static void sunionstoreCommand(redisClient *c) {
5180 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5181 }
5182
5183 static void sdiffCommand(redisClient *c) {
5184 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5185 }
5186
5187 static void sdiffstoreCommand(redisClient *c) {
5188 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5189 }
5190
5191 /* ==================================== ZSets =============================== */
5192
5193 /* ZSETs are ordered sets using two data structures to hold the same elements
5194 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5195 * data structure.
5196 *
5197 * The elements are added to a hash table mapping Redis objects to scores.
5198 * At the same time the elements are added to a skip list mapping scores
5199 * to Redis objects (so objects are sorted by scores in this "view"). */
5200
5201 /* This skiplist implementation is almost a C translation of the original
5202 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5203 * Alternative to Balanced Trees", modified in three ways:
5204 * a) this implementation allows for repeated values.
5205 * b) the comparison is not just by key (our 'score') but by satellite data.
5206 * c) there is a back pointer, so it's a doubly linked list with the back
5207 * pointers being only at "level 1". This allows to traverse the list
5208 * from tail to head, useful for ZREVRANGE. */
5209
5210 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5211 zskiplistNode *zn = zmalloc(sizeof(*zn));
5212
5213 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5214 if (level > 0)
5215 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5216 zn->score = score;
5217 zn->obj = obj;
5218 return zn;
5219 }
5220
5221 static zskiplist *zslCreate(void) {
5222 int j;
5223 zskiplist *zsl;
5224
5225 zsl = zmalloc(sizeof(*zsl));
5226 zsl->level = 1;
5227 zsl->length = 0;
5228 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5229 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5230 zsl->header->forward[j] = NULL;
5231
5232 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5233 if (j < ZSKIPLIST_MAXLEVEL-1)
5234 zsl->header->span[j] = 0;
5235 }
5236 zsl->header->backward = NULL;
5237 zsl->tail = NULL;
5238 return zsl;
5239 }
5240
5241 static void zslFreeNode(zskiplistNode *node) {
5242 decrRefCount(node->obj);
5243 zfree(node->forward);
5244 zfree(node->span);
5245 zfree(node);
5246 }
5247
5248 static void zslFree(zskiplist *zsl) {
5249 zskiplistNode *node = zsl->header->forward[0], *next;
5250
5251 zfree(zsl->header->forward);
5252 zfree(zsl->header->span);
5253 zfree(zsl->header);
5254 while(node) {
5255 next = node->forward[0];
5256 zslFreeNode(node);
5257 node = next;
5258 }
5259 zfree(zsl);
5260 }
5261
5262 static int zslRandomLevel(void) {
5263 int level = 1;
5264 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5265 level += 1;
5266 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5267 }
5268
5269 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5270 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5271 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5272 int i, level;
5273
5274 x = zsl->header;
5275 for (i = zsl->level-1; i >= 0; i--) {
5276 /* store rank that is crossed to reach the insert position */
5277 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5278
5279 while (x->forward[i] &&
5280 (x->forward[i]->score < score ||
5281 (x->forward[i]->score == score &&
5282 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5283 rank[i] += i > 0 ? x->span[i-1] : 1;
5284 x = x->forward[i];
5285 }
5286 update[i] = x;
5287 }
5288 /* we assume the key is not already inside, since we allow duplicated
5289 * scores, and the re-insertion of score and redis object should never
5290 * happpen since the caller of zslInsert() should test in the hash table
5291 * if the element is already inside or not. */
5292 level = zslRandomLevel();
5293 if (level > zsl->level) {
5294 for (i = zsl->level; i < level; i++) {
5295 rank[i] = 0;
5296 update[i] = zsl->header;
5297 update[i]->span[i-1] = zsl->length;
5298 }
5299 zsl->level = level;
5300 }
5301 x = zslCreateNode(level,score,obj);
5302 for (i = 0; i < level; i++) {
5303 x->forward[i] = update[i]->forward[i];
5304 update[i]->forward[i] = x;
5305
5306 /* update span covered by update[i] as x is inserted here */
5307 if (i > 0) {
5308 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5309 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5310 }
5311 }
5312
5313 /* increment span for untouched levels */
5314 for (i = level; i < zsl->level; i++) {
5315 update[i]->span[i-1]++;
5316 }
5317
5318 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5319 if (x->forward[0])
5320 x->forward[0]->backward = x;
5321 else
5322 zsl->tail = x;
5323 zsl->length++;
5324 }
5325
5326 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5327 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5328 int i;
5329 for (i = 0; i < zsl->level; i++) {
5330 if (update[i]->forward[i] == x) {
5331 if (i > 0) {
5332 update[i]->span[i-1] += x->span[i-1] - 1;
5333 }
5334 update[i]->forward[i] = x->forward[i];
5335 } else {
5336 /* invariant: i > 0, because update[0]->forward[0]
5337 * is always equal to x */
5338 update[i]->span[i-1] -= 1;
5339 }
5340 }
5341 if (x->forward[0]) {
5342 x->forward[0]->backward = x->backward;
5343 } else {
5344 zsl->tail = x->backward;
5345 }
5346 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5347 zsl->level--;
5348 zsl->length--;
5349 }
5350
5351 /* Delete an element with matching score/object from the skiplist. */
5352 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5353 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5354 int i;
5355
5356 x = zsl->header;
5357 for (i = zsl->level-1; i >= 0; i--) {
5358 while (x->forward[i] &&
5359 (x->forward[i]->score < score ||
5360 (x->forward[i]->score == score &&
5361 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5362 x = x->forward[i];
5363 update[i] = x;
5364 }
5365 /* We may have multiple elements with the same score, what we need
5366 * is to find the element with both the right score and object. */
5367 x = x->forward[0];
5368 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5369 zslDeleteNode(zsl, x, update);
5370 zslFreeNode(x);
5371 return 1;
5372 } else {
5373 return 0; /* not found */
5374 }
5375 return 0; /* not found */
5376 }
5377
5378 /* Delete all the elements with score between min and max from the skiplist.
5379 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5380 * Note that this function takes the reference to the hash table view of the
5381 * sorted set, in order to remove the elements from the hash table too. */
5382 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5383 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5384 unsigned long removed = 0;
5385 int i;
5386
5387 x = zsl->header;
5388 for (i = zsl->level-1; i >= 0; i--) {
5389 while (x->forward[i] && x->forward[i]->score < min)
5390 x = x->forward[i];
5391 update[i] = x;
5392 }
5393 /* We may have multiple elements with the same score, what we need
5394 * is to find the element with both the right score and object. */
5395 x = x->forward[0];
5396 while (x && x->score <= max) {
5397 zskiplistNode *next = x->forward[0];
5398 zslDeleteNode(zsl, x, update);
5399 dictDelete(dict,x->obj);
5400 zslFreeNode(x);
5401 removed++;
5402 x = next;
5403 }
5404 return removed; /* not found */
5405 }
5406
5407 /* Delete all the elements with rank between start and end from the skiplist.
5408 * Start and end are inclusive. Note that start and end need to be 1-based */
5409 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5410 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5411 unsigned long traversed = 0, removed = 0;
5412 int i;
5413
5414 x = zsl->header;
5415 for (i = zsl->level-1; i >= 0; i--) {
5416 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5417 traversed += i > 0 ? x->span[i-1] : 1;
5418 x = x->forward[i];
5419 }
5420 update[i] = x;
5421 }
5422
5423 traversed++;
5424 x = x->forward[0];
5425 while (x && traversed <= end) {
5426 zskiplistNode *next = x->forward[0];
5427 zslDeleteNode(zsl, x, update);
5428 dictDelete(dict,x->obj);
5429 zslFreeNode(x);
5430 removed++;
5431 traversed++;
5432 x = next;
5433 }
5434 return removed;
5435 }
5436
5437 /* Find the first node having a score equal or greater than the specified one.
5438 * Returns NULL if there is no match. */
5439 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5440 zskiplistNode *x;
5441 int i;
5442
5443 x = zsl->header;
5444 for (i = zsl->level-1; i >= 0; i--) {
5445 while (x->forward[i] && x->forward[i]->score < score)
5446 x = x->forward[i];
5447 }
5448 /* We may have multiple elements with the same score, what we need
5449 * is to find the element with both the right score and object. */
5450 return x->forward[0];
5451 }
5452
5453 /* Find the rank for an element by both score and key.
5454 * Returns 0 when the element cannot be found, rank otherwise.
5455 * Note that the rank is 1-based due to the span of zsl->header to the
5456 * first element. */
5457 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5458 zskiplistNode *x;
5459 unsigned long rank = 0;
5460 int i;
5461
5462 x = zsl->header;
5463 for (i = zsl->level-1; i >= 0; i--) {
5464 while (x->forward[i] &&
5465 (x->forward[i]->score < score ||
5466 (x->forward[i]->score == score &&
5467 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5468 rank += i > 0 ? x->span[i-1] : 1;
5469 x = x->forward[i];
5470 }
5471
5472 /* x might be equal to zsl->header, so test if obj is non-NULL */
5473 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5474 return rank;
5475 }
5476 }
5477 return 0;
5478 }
5479
5480 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5481 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5482 zskiplistNode *x;
5483 unsigned long traversed = 0;
5484 int i;
5485
5486 x = zsl->header;
5487 for (i = zsl->level-1; i >= 0; i--) {
5488 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5489 {
5490 traversed += i > 0 ? x->span[i-1] : 1;
5491 x = x->forward[i];
5492 }
5493 if (traversed == rank) {
5494 return x;
5495 }
5496 }
5497 return NULL;
5498 }
5499
5500 /* The actual Z-commands implementations */
5501
5502 /* This generic command implements both ZADD and ZINCRBY.
5503 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5504 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5505 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5506 robj *zsetobj;
5507 zset *zs;
5508 double *score;
5509
5510 zsetobj = lookupKeyWrite(c->db,key);
5511 if (zsetobj == NULL) {
5512 zsetobj = createZsetObject();
5513 dictAdd(c->db->dict,key,zsetobj);
5514 incrRefCount(key);
5515 } else {
5516 if (zsetobj->type != REDIS_ZSET) {
5517 addReply(c,shared.wrongtypeerr);
5518 return;
5519 }
5520 }
5521 zs = zsetobj->ptr;
5522
5523 /* Ok now since we implement both ZADD and ZINCRBY here the code
5524 * needs to handle the two different conditions. It's all about setting
5525 * '*score', that is, the new score to set, to the right value. */
5526 score = zmalloc(sizeof(double));
5527 if (doincrement) {
5528 dictEntry *de;
5529
5530 /* Read the old score. If the element was not present starts from 0 */
5531 de = dictFind(zs->dict,ele);
5532 if (de) {
5533 double *oldscore = dictGetEntryVal(de);
5534 *score = *oldscore + scoreval;
5535 } else {
5536 *score = scoreval;
5537 }
5538 } else {
5539 *score = scoreval;
5540 }
5541
5542 /* What follows is a simple remove and re-insert operation that is common
5543 * to both ZADD and ZINCRBY... */
5544 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5545 /* case 1: New element */
5546 incrRefCount(ele); /* added to hash */
5547 zslInsert(zs->zsl,*score,ele);
5548 incrRefCount(ele); /* added to skiplist */
5549 server.dirty++;
5550 if (doincrement)
5551 addReplyDouble(c,*score);
5552 else
5553 addReply(c,shared.cone);
5554 } else {
5555 dictEntry *de;
5556 double *oldscore;
5557
5558 /* case 2: Score update operation */
5559 de = dictFind(zs->dict,ele);
5560 redisAssert(de != NULL);
5561 oldscore = dictGetEntryVal(de);
5562 if (*score != *oldscore) {
5563 int deleted;
5564
5565 /* Remove and insert the element in the skip list with new score */
5566 deleted = zslDelete(zs->zsl,*oldscore,ele);
5567 redisAssert(deleted != 0);
5568 zslInsert(zs->zsl,*score,ele);
5569 incrRefCount(ele);
5570 /* Update the score in the hash table */
5571 dictReplace(zs->dict,ele,score);
5572 server.dirty++;
5573 } else {
5574 zfree(score);
5575 }
5576 if (doincrement)
5577 addReplyDouble(c,*score);
5578 else
5579 addReply(c,shared.czero);
5580 }
5581 }
5582
5583 static void zaddCommand(redisClient *c) {
5584 double scoreval;
5585
5586 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5587 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5588 }
5589
5590 static void zincrbyCommand(redisClient *c) {
5591 double scoreval;
5592
5593 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5594 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5595 }
5596
5597 static void zremCommand(redisClient *c) {
5598 robj *zsetobj;
5599 zset *zs;
5600 dictEntry *de;
5601 double *oldscore;
5602 int deleted;
5603
5604 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5605 checkType(c,zsetobj,REDIS_ZSET)) return;
5606
5607 zs = zsetobj->ptr;
5608 de = dictFind(zs->dict,c->argv[2]);
5609 if (de == NULL) {
5610 addReply(c,shared.czero);
5611 return;
5612 }
5613 /* Delete from the skiplist */
5614 oldscore = dictGetEntryVal(de);
5615 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5616 redisAssert(deleted != 0);
5617
5618 /* Delete from the hash table */
5619 dictDelete(zs->dict,c->argv[2]);
5620 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5621 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5622 server.dirty++;
5623 addReply(c,shared.cone);
5624 }
5625
5626 static void zremrangebyscoreCommand(redisClient *c) {
5627 double min;
5628 double max;
5629 long deleted;
5630 robj *zsetobj;
5631 zset *zs;
5632
5633 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5634 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5635
5636 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5637 checkType(c,zsetobj,REDIS_ZSET)) return;
5638
5639 zs = zsetobj->ptr;
5640 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5641 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5642 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5643 server.dirty += deleted;
5644 addReplyLong(c,deleted);
5645 }
5646
5647 static void zremrangebyrankCommand(redisClient *c) {
5648 long start;
5649 long end;
5650 int llen;
5651 long deleted;
5652 robj *zsetobj;
5653 zset *zs;
5654
5655 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5656 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5657
5658 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5659 checkType(c,zsetobj,REDIS_ZSET)) return;
5660 zs = zsetobj->ptr;
5661 llen = zs->zsl->length;
5662
5663 /* convert negative indexes */
5664 if (start < 0) start = llen+start;
5665 if (end < 0) end = llen+end;
5666 if (start < 0) start = 0;
5667 if (end < 0) end = 0;
5668
5669 /* indexes sanity checks */
5670 if (start > end || start >= llen) {
5671 addReply(c,shared.czero);
5672 return;
5673 }
5674 if (end >= llen) end = llen-1;
5675
5676 /* increment start and end because zsl*Rank functions
5677 * use 1-based rank */
5678 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5679 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5680 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5681 server.dirty += deleted;
5682 addReplyLong(c, deleted);
5683 }
5684
5685 typedef struct {
5686 dict *dict;
5687 double weight;
5688 } zsetopsrc;
5689
5690 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5691 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5692 unsigned long size1, size2;
5693 size1 = d1->dict ? dictSize(d1->dict) : 0;
5694 size2 = d2->dict ? dictSize(d2->dict) : 0;
5695 return size1 - size2;
5696 }
5697
/* Aggregation modes understood by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1    /* add the scores together */
#define REDIS_AGGR_MIN 2    /* keep the smallest score */
#define REDIS_AGGR_MAX 3    /* keep the largest score */
5701
5702 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5703 if (aggregate == REDIS_AGGR_SUM) {
5704 *target = *target + val;
5705 } else if (aggregate == REDIS_AGGR_MIN) {
5706 *target = val < *target ? val : *target;
5707 } else if (aggregate == REDIS_AGGR_MAX) {
5708 *target = val > *target ? val : *target;
5709 } else {
5710 /* safety net */
5711 redisPanic("Unknown ZUNION/INTER aggregate type");
5712 }
5713 }
5714
5715 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5716 int i, j, zsetnum;
5717 int aggregate = REDIS_AGGR_SUM;
5718 zsetopsrc *src;
5719 robj *dstobj;
5720 zset *dstzset;
5721 dictIterator *di;
5722 dictEntry *de;
5723
5724 /* expect zsetnum input keys to be given */
5725 zsetnum = atoi(c->argv[2]->ptr);
5726 if (zsetnum < 1) {
5727 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5728 return;
5729 }
5730
5731 /* test if the expected number of keys would overflow */
5732 if (3+zsetnum > c->argc) {
5733 addReply(c,shared.syntaxerr);
5734 return;
5735 }
5736
5737 /* read keys to be used for input */
5738 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5739 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5740 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5741 if (!zsetobj) {
5742 src[i].dict = NULL;
5743 } else {
5744 if (zsetobj->type != REDIS_ZSET) {
5745 zfree(src);
5746 addReply(c,shared.wrongtypeerr);
5747 return;
5748 }
5749 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5750 }
5751
5752 /* default all weights to 1 */
5753 src[i].weight = 1.0;
5754 }
5755
5756 /* parse optional extra arguments */
5757 if (j < c->argc) {
5758 int remaining = c->argc - j;
5759
5760 while (remaining) {
5761 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5762 j++; remaining--;
5763 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5764 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5765 return;
5766 }
5767 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5768 j++; remaining--;
5769 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5770 aggregate = REDIS_AGGR_SUM;
5771 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5772 aggregate = REDIS_AGGR_MIN;
5773 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5774 aggregate = REDIS_AGGR_MAX;
5775 } else {
5776 zfree(src);
5777 addReply(c,shared.syntaxerr);
5778 return;
5779 }
5780 j++; remaining--;
5781 } else {
5782 zfree(src);
5783 addReply(c,shared.syntaxerr);
5784 return;
5785 }
5786 }
5787 }
5788
5789 /* sort sets from the smallest to largest, this will improve our
5790 * algorithm's performance */
5791 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5792
5793 dstobj = createZsetObject();
5794 dstzset = dstobj->ptr;
5795
5796 if (op == REDIS_OP_INTER) {
5797 /* skip going over all entries if the smallest zset is NULL or empty */
5798 if (src[0].dict && dictSize(src[0].dict) > 0) {
5799 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5800 * from small to large, all src[i > 0].dict are non-empty too */
5801 di = dictGetIterator(src[0].dict);
5802 while((de = dictNext(di)) != NULL) {
5803 double *score = zmalloc(sizeof(double)), value;
5804 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5805
5806 for (j = 1; j < zsetnum; j++) {
5807 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5808 if (other) {
5809 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5810 zunionInterAggregate(score, value, aggregate);
5811 } else {
5812 break;
5813 }
5814 }
5815
5816 /* skip entry when not present in every source dict */
5817 if (j != zsetnum) {
5818 zfree(score);
5819 } else {
5820 robj *o = dictGetEntryKey(de);
5821 dictAdd(dstzset->dict,o,score);
5822 incrRefCount(o); /* added to dictionary */
5823 zslInsert(dstzset->zsl,*score,o);
5824 incrRefCount(o); /* added to skiplist */
5825 }
5826 }
5827 dictReleaseIterator(di);
5828 }
5829 } else if (op == REDIS_OP_UNION) {
5830 for (i = 0; i < zsetnum; i++) {
5831 if (!src[i].dict) continue;
5832
5833 di = dictGetIterator(src[i].dict);
5834 while((de = dictNext(di)) != NULL) {
5835 /* skip key when already processed */
5836 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5837
5838 double *score = zmalloc(sizeof(double)), value;
5839 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5840
5841 /* because the zsets are sorted by size, its only possible
5842 * for sets at larger indices to hold this entry */
5843 for (j = (i+1); j < zsetnum; j++) {
5844 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5845 if (other) {
5846 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5847 zunionInterAggregate(score, value, aggregate);
5848 }
5849 }
5850
5851 robj *o = dictGetEntryKey(de);
5852 dictAdd(dstzset->dict,o,score);
5853 incrRefCount(o); /* added to dictionary */
5854 zslInsert(dstzset->zsl,*score,o);
5855 incrRefCount(o); /* added to skiplist */
5856 }
5857 dictReleaseIterator(di);
5858 }
5859 } else {
5860 /* unknown operator */
5861 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5862 }
5863
5864 deleteKey(c->db,dstkey);
5865 if (dstzset->zsl->length) {
5866 dictAdd(c->db->dict,dstkey,dstobj);
5867 incrRefCount(dstkey);
5868 addReplyLong(c, dstzset->zsl->length);
5869 server.dirty++;
5870 } else {
5871 decrRefCount(dstobj);
5872 addReply(c, shared.czero);
5873 }
5874 zfree(src);
5875 }
5876
5877 static void zunionCommand(redisClient *c) {
5878 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5879 }
5880
5881 static void zinterCommand(redisClient *c) {
5882 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5883 }
5884
5885 static void zrangeGenericCommand(redisClient *c, int reverse) {
5886 robj *o;
5887 long start;
5888 long end;
5889 int withscores = 0;
5890 int llen;
5891 int rangelen, j;
5892 zset *zsetobj;
5893 zskiplist *zsl;
5894 zskiplistNode *ln;
5895 robj *ele;
5896
5897 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5898 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5899
5900 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5901 withscores = 1;
5902 } else if (c->argc >= 5) {
5903 addReply(c,shared.syntaxerr);
5904 return;
5905 }
5906
5907 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5908 || checkType(c,o,REDIS_ZSET)) return;
5909 zsetobj = o->ptr;
5910 zsl = zsetobj->zsl;
5911 llen = zsl->length;
5912
5913 /* convert negative indexes */
5914 if (start < 0) start = llen+start;
5915 if (end < 0) end = llen+end;
5916 if (start < 0) start = 0;
5917 if (end < 0) end = 0;
5918
5919 /* indexes sanity checks */
5920 if (start > end || start >= llen) {
5921 /* Out of range start or start > end result in empty list */
5922 addReply(c,shared.emptymultibulk);
5923 return;
5924 }
5925 if (end >= llen) end = llen-1;
5926 rangelen = (end-start)+1;
5927
5928 /* check if starting point is trivial, before searching
5929 * the element in log(N) time */
5930 if (reverse) {
5931 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5932 } else {
5933 ln = start == 0 ?
5934 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5935 }
5936
5937 /* Return the result in form of a multi-bulk reply */
5938 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5939 withscores ? (rangelen*2) : rangelen));
5940 for (j = 0; j < rangelen; j++) {
5941 ele = ln->obj;
5942 addReplyBulk(c,ele);
5943 if (withscores)
5944 addReplyDouble(c,ln->score);
5945 ln = reverse ? ln->backward : ln->forward[0];
5946 }
5947 }
5948
5949 static void zrangeCommand(redisClient *c) {
5950 zrangeGenericCommand(c,0);
5951 }
5952
5953 static void zrevrangeCommand(redisClient *c) {
5954 zrangeGenericCommand(c,1);
5955 }
5956
5957 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5958 * If justcount is non-zero, just the count is returned. */
5959 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5960 robj *o;
5961 double min, max;
5962 int minex = 0, maxex = 0; /* are min or max exclusive? */
5963 int offset = 0, limit = -1;
5964 int withscores = 0;
5965 int badsyntax = 0;
5966
5967 /* Parse the min-max interval. If one of the values is prefixed
5968 * by the "(" character, it's considered "open". For instance
5969 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5970 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5971 if (((char*)c->argv[2]->ptr)[0] == '(') {
5972 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5973 minex = 1;
5974 } else {
5975 min = strtod(c->argv[2]->ptr,NULL);
5976 }
5977 if (((char*)c->argv[3]->ptr)[0] == '(') {
5978 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5979 maxex = 1;
5980 } else {
5981 max = strtod(c->argv[3]->ptr,NULL);
5982 }
5983
5984 /* Parse "WITHSCORES": note that if the command was called with
5985 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5986 * enter the following paths to parse WITHSCORES and LIMIT. */
5987 if (c->argc == 5 || c->argc == 8) {
5988 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5989 withscores = 1;
5990 else
5991 badsyntax = 1;
5992 }
5993 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5994 badsyntax = 1;
5995 if (badsyntax) {
5996 addReplySds(c,
5997 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5998 return;
5999 }
6000
6001 /* Parse "LIMIT" */
6002 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6003 addReply(c,shared.syntaxerr);
6004 return;
6005 } else if (c->argc == (7 + withscores)) {
6006 offset = atoi(c->argv[5]->ptr);
6007 limit = atoi(c->argv[6]->ptr);
6008 if (offset < 0) offset = 0;
6009 }
6010
6011 /* Ok, lookup the key and get the range */
6012 o = lookupKeyRead(c->db,c->argv[1]);
6013 if (o == NULL) {
6014 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6015 } else {
6016 if (o->type != REDIS_ZSET) {
6017 addReply(c,shared.wrongtypeerr);
6018 } else {
6019 zset *zsetobj = o->ptr;
6020 zskiplist *zsl = zsetobj->zsl;
6021 zskiplistNode *ln;
6022 robj *ele, *lenobj = NULL;
6023 unsigned long rangelen = 0;
6024
6025 /* Get the first node with the score >= min, or with
6026 * score > min if 'minex' is true. */
6027 ln = zslFirstWithScore(zsl,min);
6028 while (minex && ln && ln->score == min) ln = ln->forward[0];
6029
6030 if (ln == NULL) {
6031 /* No element matching the speciifed interval */
6032 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6033 return;
6034 }
6035
6036 /* We don't know in advance how many matching elements there
6037 * are in the list, so we push this object that will represent
6038 * the multi-bulk length in the output buffer, and will "fix"
6039 * it later */
6040 if (!justcount) {
6041 lenobj = createObject(REDIS_STRING,NULL);
6042 addReply(c,lenobj);
6043 decrRefCount(lenobj);
6044 }
6045
6046 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6047 if (offset) {
6048 offset--;
6049 ln = ln->forward[0];
6050 continue;
6051 }
6052 if (limit == 0) break;
6053 if (!justcount) {
6054 ele = ln->obj;
6055 addReplyBulk(c,ele);
6056 if (withscores)
6057 addReplyDouble(c,ln->score);
6058 }
6059 ln = ln->forward[0];
6060 rangelen++;
6061 if (limit > 0) limit--;
6062 }
6063 if (justcount) {
6064 addReplyLong(c,(long)rangelen);
6065 } else {
6066 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6067 withscores ? (rangelen*2) : rangelen);
6068 }
6069 }
6070 }
6071 }
6072
6073 static void zrangebyscoreCommand(redisClient *c) {
6074 genericZrangebyscoreCommand(c,0);
6075 }
6076
6077 static void zcountCommand(redisClient *c) {
6078 genericZrangebyscoreCommand(c,1);
6079 }
6080
6081 static void zcardCommand(redisClient *c) {
6082 robj *o;
6083 zset *zs;
6084
6085 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6086 checkType(c,o,REDIS_ZSET)) return;
6087
6088 zs = o->ptr;
6089 addReplyUlong(c,zs->zsl->length);
6090 }
6091
6092 static void zscoreCommand(redisClient *c) {
6093 robj *o;
6094 zset *zs;
6095 dictEntry *de;
6096
6097 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6098 checkType(c,o,REDIS_ZSET)) return;
6099
6100 zs = o->ptr;
6101 de = dictFind(zs->dict,c->argv[2]);
6102 if (!de) {
6103 addReply(c,shared.nullbulk);
6104 } else {
6105 double *score = dictGetEntryVal(de);
6106
6107 addReplyDouble(c,*score);
6108 }
6109 }
6110
6111 static void zrankGenericCommand(redisClient *c, int reverse) {
6112 robj *o;
6113 zset *zs;
6114 zskiplist *zsl;
6115 dictEntry *de;
6116 unsigned long rank;
6117 double *score;
6118
6119 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6120 checkType(c,o,REDIS_ZSET)) return;
6121
6122 zs = o->ptr;
6123 zsl = zs->zsl;
6124 de = dictFind(zs->dict,c->argv[2]);
6125 if (!de) {
6126 addReply(c,shared.nullbulk);
6127 return;
6128 }
6129
6130 score = dictGetEntryVal(de);
6131 rank = zslGetRank(zsl, *score, c->argv[2]);
6132 if (rank) {
6133 if (reverse) {
6134 addReplyLong(c, zsl->length - rank);
6135 } else {
6136 addReplyLong(c, rank-1);
6137 }
6138 } else {
6139 addReply(c,shared.nullbulk);
6140 }
6141 }
6142
6143 static void zrankCommand(redisClient *c) {
6144 zrankGenericCommand(c, 0);
6145 }
6146
6147 static void zrevrankCommand(redisClient *c) {
6148 zrankGenericCommand(c, 1);
6149 }
6150
6151 /* ========================= Hashes utility functions ======================= */
/* Selectors for the part of a hash entry to work on. They are tested with
 * '&' by their users and may be OR-ed together to request both parts. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
6154
6155 /* Check the length of a number of objects to see if we need to convert a
6156 * zipmap to a real hash. Note that we only check string encoded objects
6157 * as their string length can be queried in constant time. */
6158 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6159 int i;
6160 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6161
6162 for (i = start; i <= end; i++) {
6163 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6164 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6165 {
6166 convertToRealHash(subject);
6167 return;
6168 }
6169 }
6170 }
6171
6172 /* Encode given objects in-place when the hash uses a dict. */
6173 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6174 if (subject->encoding == REDIS_ENCODING_HT) {
6175 if (o1) *o1 = tryObjectEncoding(*o1);
6176 if (o2) *o2 = tryObjectEncoding(*o2);
6177 }
6178 }
6179
6180 /* Get the value from a hash identified by key. Returns either a string
6181 * object or NULL if the value cannot be found. The refcount of the object
6182 * is always increased by 1 when the value was found. */
6183 static robj *hashGet(robj *o, robj *key) {
6184 robj *value = NULL;
6185 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6186 unsigned char *v;
6187 unsigned int vlen;
6188 key = getDecodedObject(key);
6189 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6190 value = createStringObject((char*)v,vlen);
6191 }
6192 decrRefCount(key);
6193 } else {
6194 dictEntry *de = dictFind(o->ptr,key);
6195 if (de != NULL) {
6196 value = dictGetEntryVal(de);
6197 incrRefCount(value);
6198 }
6199 }
6200 return value;
6201 }
6202
6203 /* Test if the key exists in the given hash. Returns 1 if the key
6204 * exists and 0 when it doesn't. */
6205 static int hashExists(robj *o, robj *key) {
6206 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6207 key = getDecodedObject(key);
6208 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6209 decrRefCount(key);
6210 return 1;
6211 }
6212 decrRefCount(key);
6213 } else {
6214 if (dictFind(o->ptr,key) != NULL) {
6215 return 1;
6216 }
6217 }
6218 return 0;
6219 }
6220
6221 /* Add an element, discard the old if the key already exists.
6222 * Return 0 on insert and 1 on update. */
6223 static int hashSet(robj *o, robj *key, robj *value) {
6224 int update = 0;
6225 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6226 key = getDecodedObject(key);
6227 value = getDecodedObject(value);
6228 o->ptr = zipmapSet(o->ptr,
6229 key->ptr,sdslen(key->ptr),
6230 value->ptr,sdslen(value->ptr), &update);
6231 decrRefCount(key);
6232 decrRefCount(value);
6233
6234 /* Check if the zipmap needs to be upgraded to a real hash table */
6235 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6236 convertToRealHash(o);
6237 } else {
6238 if (dictReplace(o->ptr,key,value)) {
6239 /* Insert */
6240 incrRefCount(key);
6241 } else {
6242 /* Update */
6243 update = 1;
6244 }
6245 incrRefCount(value);
6246 }
6247 return update;
6248 }
6249
6250 /* Delete an element from a hash.
6251 * Return 1 on deleted and 0 on not found. */
6252 static int hashDelete(robj *o, robj *key) {
6253 int deleted = 0;
6254 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6255 key = getDecodedObject(key);
6256 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6257 decrRefCount(key);
6258 } else {
6259 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6260 /* Always check if the dictionary needs a resize after a delete. */
6261 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6262 }
6263 return deleted;
6264 }
6265
6266 /* Return the number of elements in a hash. */
6267 static unsigned long hashLength(robj *o) {
6268 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6269 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6270 }
6271
6272 /* Structure to hold hash iteration abstration. Note that iteration over
6273 * hashes involves both fields and values. Because it is possible that
6274 * not both are required, store pointers in the iterator to avoid
6275 * unnecessary memory allocation for fields/values. */
6276 typedef struct {
6277 int encoding;
6278 unsigned char *zi;
6279 unsigned char *zk, *zv;
6280 unsigned int zklen, zvlen;
6281
6282 dictIterator *di;
6283 dictEntry *de;
6284 } hashIterator;
6285
6286 static hashIterator *hashInitIterator(robj *subject) {
6287 hashIterator *hi = zmalloc(sizeof(hashIterator));
6288 hi->encoding = subject->encoding;
6289 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6290 hi->zi = zipmapRewind(subject->ptr);
6291 } else if (hi->encoding == REDIS_ENCODING_HT) {
6292 hi->di = dictGetIterator(subject->ptr);
6293 } else {
6294 redisAssert(NULL);
6295 }
6296 return hi;
6297 }
6298
6299 static void hashReleaseIterator(hashIterator *hi) {
6300 if (hi->encoding == REDIS_ENCODING_HT) {
6301 dictReleaseIterator(hi->di);
6302 }
6303 zfree(hi);
6304 }
6305
6306 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6307 * could be found and REDIS_ERR when the iterator reaches the end. */
6308 static int hashNext(hashIterator *hi) {
6309 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6310 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6311 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6312 } else {
6313 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6314 }
6315 return REDIS_OK;
6316 }
6317
6318 /* Get key or value object at current iteration position.
6319 * This increases the refcount of the field object by 1. */
6320 static robj *hashCurrent(hashIterator *hi, int what) {
6321 robj *o;
6322 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6323 if (what & REDIS_HASH_KEY) {
6324 o = createStringObject((char*)hi->zk,hi->zklen);
6325 } else {
6326 o = createStringObject((char*)hi->zv,hi->zvlen);
6327 }
6328 } else {
6329 if (what & REDIS_HASH_KEY) {
6330 o = dictGetEntryKey(hi->de);
6331 } else {
6332 o = dictGetEntryVal(hi->de);
6333 }
6334 incrRefCount(o);
6335 }
6336 return o;
6337 }
6338
6339 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6340 robj *o = lookupKeyWrite(c->db,key);
6341 if (o == NULL) {
6342 o = createHashObject();
6343 dictAdd(c->db->dict,key,o);
6344 incrRefCount(key);
6345 } else {
6346 if (o->type != REDIS_HASH) {
6347 addReply(c,shared.wrongtypeerr);
6348 return NULL;
6349 }
6350 }
6351 return o;
6352 }
6353
6354 /* ============================= Hash commands ============================== */
6355 static void hsetCommand(redisClient *c) {
6356 int update;
6357 robj *o;
6358
6359 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6360 hashTryConversion(o,c->argv,2,3);
6361 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6362 update = hashSet(o,c->argv[2],c->argv[3]);
6363 addReply(c, update ? shared.czero : shared.cone);
6364 server.dirty++;
6365 }
6366
6367 static void hsetnxCommand(redisClient *c) {
6368 robj *o;
6369 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6370 hashTryConversion(o,c->argv,2,3);
6371
6372 if (hashExists(o, c->argv[2])) {
6373 addReply(c, shared.czero);
6374 } else {
6375 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6376 hashSet(o,c->argv[2],c->argv[3]);
6377 addReply(c, shared.cone);
6378 server.dirty++;
6379 }
6380 }
6381
6382 static void hmsetCommand(redisClient *c) {
6383 int i;
6384 robj *o;
6385
6386 if ((c->argc % 2) == 1) {
6387 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6388 return;
6389 }
6390
6391 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6392 hashTryConversion(o,c->argv,2,c->argc-1);
6393 for (i = 2; i < c->argc; i += 2) {
6394 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6395 hashSet(o,c->argv[i],c->argv[i+1]);
6396 }
6397 addReply(c, shared.ok);
6398 server.dirty++;
6399 }
6400
6401 static void hincrbyCommand(redisClient *c) {
6402 long long value, incr;
6403 robj *o, *current, *new;
6404
6405 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6406 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6407 if ((current = hashGet(o,c->argv[2])) != NULL) {
6408 if (current->encoding == REDIS_ENCODING_RAW)
6409 value = strtoll(current->ptr,NULL,10);
6410 else if (current->encoding == REDIS_ENCODING_INT)
6411 value = (long)current->ptr;
6412 else
6413 redisAssert(1 != 1);
6414 decrRefCount(current);
6415 } else {
6416 value = 0;
6417 }
6418
6419 value += incr;
6420 new = createStringObjectFromLongLong(value);
6421 hashTryObjectEncoding(o,&c->argv[2],NULL);
6422 hashSet(o,c->argv[2],new);
6423 decrRefCount(new);
6424 addReplyLongLong(c,value);
6425 server.dirty++;
6426 }
6427
6428 static void hgetCommand(redisClient *c) {
6429 robj *o, *value;
6430 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6431 checkType(c,o,REDIS_HASH)) return;
6432
6433 if ((value = hashGet(o,c->argv[2])) != NULL) {
6434 addReplyBulk(c,value);
6435 decrRefCount(value);
6436 } else {
6437 addReply(c,shared.nullbulk);
6438 }
6439 }
6440
6441 static void hmgetCommand(redisClient *c) {
6442 int i;
6443 robj *o, *value;
6444 o = lookupKeyRead(c->db,c->argv[1]);
6445 if (o != NULL && o->type != REDIS_HASH) {
6446 addReply(c,shared.wrongtypeerr);
6447 }
6448
6449 /* Note the check for o != NULL happens inside the loop. This is
6450 * done because objects that cannot be found are considered to be
6451 * an empty hash. The reply should then be a series of NULLs. */
6452 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6453 for (i = 2; i < c->argc; i++) {
6454 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6455 addReplyBulk(c,value);
6456 decrRefCount(value);
6457 } else {
6458 addReply(c,shared.nullbulk);
6459 }
6460 }
6461 }
6462
6463 static void hdelCommand(redisClient *c) {
6464 robj *o;
6465 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6466 checkType(c,o,REDIS_HASH)) return;
6467
6468 if (hashDelete(o,c->argv[2])) {
6469 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6470 addReply(c,shared.cone);
6471 server.dirty++;
6472 } else {
6473 addReply(c,shared.czero);
6474 }
6475 }
6476
6477 static void hlenCommand(redisClient *c) {
6478 robj *o;
6479 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6480 checkType(c,o,REDIS_HASH)) return;
6481
6482 addReplyUlong(c,hashLength(o));
6483 }
6484
6485 static void genericHgetallCommand(redisClient *c, int flags) {
6486 robj *o, *lenobj, *obj;
6487 unsigned long count = 0;
6488 hashIterator *hi;
6489
6490 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6491 || checkType(c,o,REDIS_HASH)) return;
6492
6493 lenobj = createObject(REDIS_STRING,NULL);
6494 addReply(c,lenobj);
6495 decrRefCount(lenobj);
6496
6497 hi = hashInitIterator(o);
6498 while (hashNext(hi) != REDIS_ERR) {
6499 if (flags & REDIS_HASH_KEY) {
6500 obj = hashCurrent(hi,REDIS_HASH_KEY);
6501 addReplyBulk(c,obj);
6502 decrRefCount(obj);
6503 count++;
6504 }
6505 if (flags & REDIS_HASH_VALUE) {
6506 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6507 addReplyBulk(c,obj);
6508 decrRefCount(obj);
6509 count++;
6510 }
6511 }
6512 hashReleaseIterator(hi);
6513
6514 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6515 }
6516
6517 static void hkeysCommand(redisClient *c) {
6518 genericHgetallCommand(c,REDIS_HASH_KEY);
6519 }
6520
6521 static void hvalsCommand(redisClient *c) {
6522 genericHgetallCommand(c,REDIS_HASH_VALUE);
6523 }
6524
6525 static void hgetallCommand(redisClient *c) {
6526 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6527 }
6528
6529 static void hexistsCommand(redisClient *c) {
6530 robj *o;
6531 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6532 checkType(c,o,REDIS_HASH)) return;
6533
6534 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6535 }
6536
6537 static void convertToRealHash(robj *o) {
6538 unsigned char *key, *val, *p, *zm = o->ptr;
6539 unsigned int klen, vlen;
6540 dict *dict = dictCreate(&hashDictType,NULL);
6541
6542 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6543 p = zipmapRewind(zm);
6544 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6545 robj *keyobj, *valobj;
6546
6547 keyobj = createStringObject((char*)key,klen);
6548 valobj = createStringObject((char*)val,vlen);
6549 keyobj = tryObjectEncoding(keyobj);
6550 valobj = tryObjectEncoding(valobj);
6551 dictAdd(dict,keyobj,valobj);
6552 }
6553 o->encoding = REDIS_ENCODING_HT;
6554 o->ptr = dict;
6555 zfree(zm);
6556 }
6557
6558 /* ========================= Non type-specific commands ==================== */
6559
6560 static void flushdbCommand(redisClient *c) {
6561 server.dirty += dictSize(c->db->dict);
6562 dictEmpty(c->db->dict);
6563 dictEmpty(c->db->expires);
6564 addReply(c,shared.ok);
6565 }
6566
6567 static void flushallCommand(redisClient *c) {
6568 server.dirty += emptyDb();
6569 addReply(c,shared.ok);
6570 if (server.bgsavechildpid != -1) {
6571 kill(server.bgsavechildpid,SIGKILL);
6572 rdbRemoveTempFile(server.bgsavechildpid);
6573 }
6574 rdbSave(server.dbfilename);
6575 server.dirty++;
6576 }
6577
6578 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6579 redisSortOperation *so = zmalloc(sizeof(*so));
6580 so->type = type;
6581 so->pattern = pattern;
6582 return so;
6583 }
6584
6585 /* Return the value associated to the key with a name obtained
6586 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6587 * The returned object will always have its refcount increased by 1
6588 * when it is non-NULL. */
6589 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6590 char *p, *f;
6591 sds spat, ssub;
6592 robj keyobj, fieldobj, *o;
6593 int prefixlen, sublen, postfixlen, fieldlen;
6594 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6595 struct {
6596 long len;
6597 long free;
6598 char buf[REDIS_SORTKEY_MAX+1];
6599 } keyname, fieldname;
6600
6601 /* If the pattern is "#" return the substitution object itself in order
6602 * to implement the "SORT ... GET #" feature. */
6603 spat = pattern->ptr;
6604 if (spat[0] == '#' && spat[1] == '\0') {
6605 incrRefCount(subst);
6606 return subst;
6607 }
6608
6609 /* The substitution object may be specially encoded. If so we create
6610 * a decoded object on the fly. Otherwise getDecodedObject will just
6611 * increment the ref count, that we'll decrement later. */
6612 subst = getDecodedObject(subst);
6613
6614 ssub = subst->ptr;
6615 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6616 p = strchr(spat,'*');
6617 if (!p) {
6618 decrRefCount(subst);
6619 return NULL;
6620 }
6621
6622 /* Find out if we're dealing with a hash dereference. */
6623 if ((f = strstr(p+1, "->")) != NULL) {
6624 fieldlen = sdslen(spat)-(f-spat);
6625 /* this also copies \0 character */
6626 memcpy(fieldname.buf,f+2,fieldlen-1);
6627 fieldname.len = fieldlen-2;
6628 } else {
6629 fieldlen = 0;
6630 }
6631
6632 prefixlen = p-spat;
6633 sublen = sdslen(ssub);
6634 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6635 memcpy(keyname.buf,spat,prefixlen);
6636 memcpy(keyname.buf+prefixlen,ssub,sublen);
6637 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6638 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6639 keyname.len = prefixlen+sublen+postfixlen;
6640 decrRefCount(subst);
6641
6642 /* Lookup substituted key */
6643 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6644 o = lookupKeyRead(db,&keyobj);
6645 if (o == NULL) return NULL;
6646
6647 if (fieldlen > 0) {
6648 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6649
6650 /* Retrieve value from hash by the field name. This operation
6651 * already increases the refcount of the returned object. */
6652 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6653 o = hashGet(o, &fieldobj);
6654 } else {
6655 if (o->type != REDIS_STRING) return NULL;
6656
6657 /* Every object that this function returns needs to have its refcount
6658 * increased. sortCommand decreases it again. */
6659 incrRefCount(o);
6660 }
6661
6662 return o;
6663 }
6664
6665 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6666 * the additional parameter is not standard but a BSD-specific we have to
6667 * pass sorting parameters via the global 'server' structure */
6668 static int sortCompare(const void *s1, const void *s2) {
6669 const redisSortObject *so1 = s1, *so2 = s2;
6670 int cmp;
6671
6672 if (!server.sort_alpha) {
6673 /* Numeric sorting. Here it's trivial as we precomputed scores */
6674 if (so1->u.score > so2->u.score) {
6675 cmp = 1;
6676 } else if (so1->u.score < so2->u.score) {
6677 cmp = -1;
6678 } else {
6679 cmp = 0;
6680 }
6681 } else {
6682 /* Alphanumeric sorting */
6683 if (server.sort_bypattern) {
6684 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6685 /* At least one compare object is NULL */
6686 if (so1->u.cmpobj == so2->u.cmpobj)
6687 cmp = 0;
6688 else if (so1->u.cmpobj == NULL)
6689 cmp = -1;
6690 else
6691 cmp = 1;
6692 } else {
6693 /* We have both the objects, use strcoll */
6694 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6695 }
6696 } else {
6697 /* Compare elements directly. */
6698 cmp = compareStringObjects(so1->obj,so2->obj);
6699 }
6700 }
6701 return server.sort_desc ? -cmp : cmp;
6702 }
6703
6704 /* The SORT command is the most complex command in Redis. Warning: this code
6705 * is optimized for speed and a bit less for readability */
6706 static void sortCommand(redisClient *c) {
6707 list *operations;
6708 int outputlen = 0;
6709 int desc = 0, alpha = 0;
6710 int limit_start = 0, limit_count = -1, start, end;
6711 int j, dontsort = 0, vectorlen;
6712 int getop = 0; /* GET operation counter */
6713 robj *sortval, *sortby = NULL, *storekey = NULL;
6714 redisSortObject *vector; /* Resulting vector to sort */
6715
6716 /* Lookup the key to sort. It must be of the right types */
6717 sortval = lookupKeyRead(c->db,c->argv[1]);
6718 if (sortval == NULL) {
6719 addReply(c,shared.emptymultibulk);
6720 return;
6721 }
6722 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6723 sortval->type != REDIS_ZSET)
6724 {
6725 addReply(c,shared.wrongtypeerr);
6726 return;
6727 }
6728
6729 /* Create a list of operations to perform for every sorted element.
6730 * Operations can be GET/DEL/INCR/DECR */
6731 operations = listCreate();
6732 listSetFreeMethod(operations,zfree);
6733 j = 2;
6734
6735 /* Now we need to protect sortval incrementing its count, in the future
6736 * SORT may have options able to overwrite/delete keys during the sorting
6737 * and the sorted key itself may get destroied */
6738 incrRefCount(sortval);
6739
6740 /* The SORT command has an SQL-alike syntax, parse it */
6741 while(j < c->argc) {
6742 int leftargs = c->argc-j-1;
6743 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6744 desc = 0;
6745 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6746 desc = 1;
6747 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6748 alpha = 1;
6749 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6750 limit_start = atoi(c->argv[j+1]->ptr);
6751 limit_count = atoi(c->argv[j+2]->ptr);
6752 j+=2;
6753 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6754 storekey = c->argv[j+1];
6755 j++;
6756 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6757 sortby = c->argv[j+1];
6758 /* If the BY pattern does not contain '*', i.e. it is constant,
6759 * we don't need to sort nor to lookup the weight keys. */
6760 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6761 j++;
6762 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6763 listAddNodeTail(operations,createSortOperation(
6764 REDIS_SORT_GET,c->argv[j+1]));
6765 getop++;
6766 j++;
6767 } else {
6768 decrRefCount(sortval);
6769 listRelease(operations);
6770 addReply(c,shared.syntaxerr);
6771 return;
6772 }
6773 j++;
6774 }
6775
6776 /* Load the sorting vector with all the objects to sort */
6777 switch(sortval->type) {
6778 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6779 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6780 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6781 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6782 }
6783 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6784 j = 0;
6785
6786 if (sortval->type == REDIS_LIST) {
6787 list *list = sortval->ptr;
6788 listNode *ln;
6789 listIter li;
6790
6791 listRewind(list,&li);
6792 while((ln = listNext(&li))) {
6793 robj *ele = ln->value;
6794 vector[j].obj = ele;
6795 vector[j].u.score = 0;
6796 vector[j].u.cmpobj = NULL;
6797 j++;
6798 }
6799 } else {
6800 dict *set;
6801 dictIterator *di;
6802 dictEntry *setele;
6803
6804 if (sortval->type == REDIS_SET) {
6805 set = sortval->ptr;
6806 } else {
6807 zset *zs = sortval->ptr;
6808 set = zs->dict;
6809 }
6810
6811 di = dictGetIterator(set);
6812 while((setele = dictNext(di)) != NULL) {
6813 vector[j].obj = dictGetEntryKey(setele);
6814 vector[j].u.score = 0;
6815 vector[j].u.cmpobj = NULL;
6816 j++;
6817 }
6818 dictReleaseIterator(di);
6819 }
6820 redisAssert(j == vectorlen);
6821
6822 /* Now it's time to load the right scores in the sorting vector */
6823 if (dontsort == 0) {
6824 for (j = 0; j < vectorlen; j++) {
6825 robj *byval;
6826 if (sortby) {
6827 /* lookup value to sort by */
6828 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6829 if (!byval) continue;
6830 } else {
6831 /* use object itself to sort by */
6832 byval = vector[j].obj;
6833 }
6834
6835 if (alpha) {
6836 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6837 } else {
6838 if (byval->encoding == REDIS_ENCODING_RAW) {
6839 vector[j].u.score = strtod(byval->ptr,NULL);
6840 } else if (byval->encoding == REDIS_ENCODING_INT) {
6841 /* Don't need to decode the object if it's
6842 * integer-encoded (the only encoding supported) so
6843 * far. We can just cast it */
6844 vector[j].u.score = (long)byval->ptr;
6845 } else {
6846 redisAssert(1 != 1);
6847 }
6848 }
6849
6850 /* when the object was retrieved using lookupKeyByPattern,
6851 * its refcount needs to be decreased. */
6852 if (sortby) {
6853 decrRefCount(byval);
6854 }
6855 }
6856 }
6857
6858 /* We are ready to sort the vector... perform a bit of sanity check
6859 * on the LIMIT option too. We'll use a partial version of quicksort. */
6860 start = (limit_start < 0) ? 0 : limit_start;
6861 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6862 if (start >= vectorlen) {
6863 start = vectorlen-1;
6864 end = vectorlen-2;
6865 }
6866 if (end >= vectorlen) end = vectorlen-1;
6867
6868 if (dontsort == 0) {
6869 server.sort_desc = desc;
6870 server.sort_alpha = alpha;
6871 server.sort_bypattern = sortby ? 1 : 0;
6872 if (sortby && (start != 0 || end != vectorlen-1))
6873 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6874 else
6875 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6876 }
6877
6878 /* Send command output to the output buffer, performing the specified
6879 * GET/DEL/INCR/DECR operations if any. */
6880 outputlen = getop ? getop*(end-start+1) : end-start+1;
6881 if (storekey == NULL) {
6882 /* STORE option not specified, sent the sorting result to client */
6883 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6884 for (j = start; j <= end; j++) {
6885 listNode *ln;
6886 listIter li;
6887
6888 if (!getop) addReplyBulk(c,vector[j].obj);
6889 listRewind(operations,&li);
6890 while((ln = listNext(&li))) {
6891 redisSortOperation *sop = ln->value;
6892 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6893 vector[j].obj);
6894
6895 if (sop->type == REDIS_SORT_GET) {
6896 if (!val) {
6897 addReply(c,shared.nullbulk);
6898 } else {
6899 addReplyBulk(c,val);
6900 decrRefCount(val);
6901 }
6902 } else {
6903 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6904 }
6905 }
6906 }
6907 } else {
6908 robj *listObject = createListObject();
6909 list *listPtr = (list*) listObject->ptr;
6910
6911 /* STORE option specified, set the sorting result as a List object */
6912 for (j = start; j <= end; j++) {
6913 listNode *ln;
6914 listIter li;
6915
6916 if (!getop) {
6917 listAddNodeTail(listPtr,vector[j].obj);
6918 incrRefCount(vector[j].obj);
6919 }
6920 listRewind(operations,&li);
6921 while((ln = listNext(&li))) {
6922 redisSortOperation *sop = ln->value;
6923 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6924 vector[j].obj);
6925
6926 if (sop->type == REDIS_SORT_GET) {
6927 if (!val) {
6928 listAddNodeTail(listPtr,createStringObject("",0));
6929 } else {
6930 /* We should do a incrRefCount on val because it is
6931 * added to the list, but also a decrRefCount because
6932 * it is returned by lookupKeyByPattern. This results
6933 * in doing nothing at all. */
6934 listAddNodeTail(listPtr,val);
6935 }
6936 } else {
6937 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6938 }
6939 }
6940 }
6941 if (dictReplace(c->db->dict,storekey,listObject)) {
6942 incrRefCount(storekey);
6943 }
6944 /* Note: we add 1 because the DB is dirty anyway since even if the
6945 * SORT result is empty a new key is set and maybe the old content
6946 * replaced. */
6947 server.dirty += 1+outputlen;
6948 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6949 }
6950
6951 /* Cleanup */
6952 decrRefCount(sortval);
6953 listRelease(operations);
6954 for (j = 0; j < vectorlen; j++) {
6955 if (alpha && vector[j].u.cmpobj)
6956 decrRefCount(vector[j].u.cmpobj);
6957 }
6958 zfree(vector);
6959 }
6960
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the caller in
 * this file passes 64 bytes). Values below 1024 are printed as an exact
 * byte count; larger values are printed with two decimals using the K, M,
 * G, T and P binary units. Anything too large for P falls back to an exact
 * byte count, so 's' is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Beyond the largest unit: just output the number of bytes. */
        sprintf(s,"%lluB",n);
    }
}
6981
6982 /* Create the string returned by the INFO command. This is decoupled
6983 * by the INFO command itself as we need to report the same information
6984 * on memory corruption problems. */
6985 static sds genRedisInfoString(void) {
6986 sds info;
6987 time_t uptime = time(NULL)-server.stat_starttime;
6988 int j;
6989 char hmem[64];
6990
6991 bytesToHuman(hmem,zmalloc_used_memory());
6992 info = sdscatprintf(sdsempty(),
6993 "redis_version:%s\r\n"
6994 "arch_bits:%s\r\n"
6995 "multiplexing_api:%s\r\n"
6996 "process_id:%ld\r\n"
6997 "uptime_in_seconds:%ld\r\n"
6998 "uptime_in_days:%ld\r\n"
6999 "connected_clients:%d\r\n"
7000 "connected_slaves:%d\r\n"
7001 "blocked_clients:%d\r\n"
7002 "used_memory:%zu\r\n"
7003 "used_memory_human:%s\r\n"
7004 "changes_since_last_save:%lld\r\n"
7005 "bgsave_in_progress:%d\r\n"
7006 "last_save_time:%ld\r\n"
7007 "bgrewriteaof_in_progress:%d\r\n"
7008 "total_connections_received:%lld\r\n"
7009 "total_commands_processed:%lld\r\n"
7010 "expired_keys:%lld\r\n"
7011 "hash_max_zipmap_entries:%ld\r\n"
7012 "hash_max_zipmap_value:%ld\r\n"
7013 "pubsub_channels:%ld\r\n"
7014 "pubsub_patterns:%u\r\n"
7015 "vm_enabled:%d\r\n"
7016 "role:%s\r\n"
7017 ,REDIS_VERSION,
7018 (sizeof(long) == 8) ? "64" : "32",
7019 aeGetApiName(),
7020 (long) getpid(),
7021 uptime,
7022 uptime/(3600*24),
7023 listLength(server.clients)-listLength(server.slaves),
7024 listLength(server.slaves),
7025 server.blpop_blocked_clients,
7026 zmalloc_used_memory(),
7027 hmem,
7028 server.dirty,
7029 server.bgsavechildpid != -1,
7030 server.lastsave,
7031 server.bgrewritechildpid != -1,
7032 server.stat_numconnections,
7033 server.stat_numcommands,
7034 server.stat_expiredkeys,
7035 server.hash_max_zipmap_entries,
7036 server.hash_max_zipmap_value,
7037 dictSize(server.pubsub_channels),
7038 listLength(server.pubsub_patterns),
7039 server.vm_enabled != 0,
7040 server.masterhost == NULL ? "master" : "slave"
7041 );
7042 if (server.masterhost) {
7043 info = sdscatprintf(info,
7044 "master_host:%s\r\n"
7045 "master_port:%d\r\n"
7046 "master_link_status:%s\r\n"
7047 "master_last_io_seconds_ago:%d\r\n"
7048 ,server.masterhost,
7049 server.masterport,
7050 (server.replstate == REDIS_REPL_CONNECTED) ?
7051 "up" : "down",
7052 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7053 );
7054 }
7055 if (server.vm_enabled) {
7056 lockThreadedIO();
7057 info = sdscatprintf(info,
7058 "vm_conf_max_memory:%llu\r\n"
7059 "vm_conf_page_size:%llu\r\n"
7060 "vm_conf_pages:%llu\r\n"
7061 "vm_stats_used_pages:%llu\r\n"
7062 "vm_stats_swapped_objects:%llu\r\n"
7063 "vm_stats_swappin_count:%llu\r\n"
7064 "vm_stats_swappout_count:%llu\r\n"
7065 "vm_stats_io_newjobs_len:%lu\r\n"
7066 "vm_stats_io_processing_len:%lu\r\n"
7067 "vm_stats_io_processed_len:%lu\r\n"
7068 "vm_stats_io_active_threads:%lu\r\n"
7069 "vm_stats_blocked_clients:%lu\r\n"
7070 ,(unsigned long long) server.vm_max_memory,
7071 (unsigned long long) server.vm_page_size,
7072 (unsigned long long) server.vm_pages,
7073 (unsigned long long) server.vm_stats_used_pages,
7074 (unsigned long long) server.vm_stats_swapped_objects,
7075 (unsigned long long) server.vm_stats_swapins,
7076 (unsigned long long) server.vm_stats_swapouts,
7077 (unsigned long) listLength(server.io_newjobs),
7078 (unsigned long) listLength(server.io_processing),
7079 (unsigned long) listLength(server.io_processed),
7080 (unsigned long) server.io_active_threads,
7081 (unsigned long) server.vm_blocked_clients
7082 );
7083 unlockThreadedIO();
7084 }
7085 for (j = 0; j < server.dbnum; j++) {
7086 long long keys, vkeys;
7087
7088 keys = dictSize(server.db[j].dict);
7089 vkeys = dictSize(server.db[j].expires);
7090 if (keys || vkeys) {
7091 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7092 j, keys, vkeys);
7093 }
7094 }
7095 return info;
7096 }
7097
7098 static void infoCommand(redisClient *c) {
7099 sds info = genRedisInfoString();
7100 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7101 (unsigned long)sdslen(info)));
7102 addReplySds(c,info);
7103 addReply(c,shared.crlf);
7104 }
7105
7106 static void monitorCommand(redisClient *c) {
7107 /* ignore MONITOR if aleady slave or in monitor mode */
7108 if (c->flags & REDIS_SLAVE) return;
7109
7110 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7111 c->slaveseldb = 0;
7112 listAddNodeTail(server.monitors,c);
7113 addReply(c,shared.ok);
7114 }
7115
7116 /* ================================= Expire ================================= */
7117 static int removeExpire(redisDb *db, robj *key) {
7118 if (dictDelete(db->expires,key) == DICT_OK) {
7119 return 1;
7120 } else {
7121 return 0;
7122 }
7123 }
7124
7125 static int setExpire(redisDb *db, robj *key, time_t when) {
7126 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7127 return 0;
7128 } else {
7129 incrRefCount(key);
7130 return 1;
7131 }
7132 }
7133
7134 /* Return the expire time of the specified key, or -1 if no expire
7135 * is associated with this key (i.e. the key is non volatile) */
7136 static time_t getExpire(redisDb *db, robj *key) {
7137 dictEntry *de;
7138
7139 /* No expire? return ASAP */
7140 if (dictSize(db->expires) == 0 ||
7141 (de = dictFind(db->expires,key)) == NULL) return -1;
7142
7143 return (time_t) dictGetEntryVal(de);
7144 }
7145
7146 static int expireIfNeeded(redisDb *db, robj *key) {
7147 time_t when;
7148 dictEntry *de;
7149
7150 /* No expire? return ASAP */
7151 if (dictSize(db->expires) == 0 ||
7152 (de = dictFind(db->expires,key)) == NULL) return 0;
7153
7154 /* Lookup the expire */
7155 when = (time_t) dictGetEntryVal(de);
7156 if (time(NULL) <= when) return 0;
7157
7158 /* Delete the key */
7159 dictDelete(db->expires,key);
7160 server.stat_expiredkeys++;
7161 return dictDelete(db->dict,key) == DICT_OK;
7162 }
7163
7164 static int deleteIfVolatile(redisDb *db, robj *key) {
7165 dictEntry *de;
7166
7167 /* No expire? return ASAP */
7168 if (dictSize(db->expires) == 0 ||
7169 (de = dictFind(db->expires,key)) == NULL) return 0;
7170
7171 /* Delete the key */
7172 server.dirty++;
7173 server.stat_expiredkeys++;
7174 dictDelete(db->expires,key);
7175 return dictDelete(db->dict,key) == DICT_OK;
7176 }
7177
7178 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7179 dictEntry *de;
7180 time_t seconds;
7181
7182 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7183
7184 seconds -= offset;
7185
7186 de = dictFind(c->db->dict,key);
7187 if (de == NULL) {
7188 addReply(c,shared.czero);
7189 return;
7190 }
7191 if (seconds <= 0) {
7192 if (deleteKey(c->db,key)) server.dirty++;
7193 addReply(c, shared.cone);
7194 return;
7195 } else {
7196 time_t when = time(NULL)+seconds;
7197 if (setExpire(c->db,key,when)) {
7198 addReply(c,shared.cone);
7199 server.dirty++;
7200 } else {
7201 addReply(c,shared.czero);
7202 }
7203 return;
7204 }
7205 }
7206
7207 static void expireCommand(redisClient *c) {
7208 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7209 }
7210
7211 static void expireatCommand(redisClient *c) {
7212 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7213 }
7214
7215 static void ttlCommand(redisClient *c) {
7216 time_t expire;
7217 int ttl = -1;
7218
7219 expire = getExpire(c->db,c->argv[1]);
7220 if (expire != -1) {
7221 ttl = (int) (expire-time(NULL));
7222 if (ttl < 0) ttl = -1;
7223 }
7224 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7225 }
7226
7227 /* ================================ MULTI/EXEC ============================== */
7228
7229 /* Client state initialization for MULTI/EXEC */
7230 static void initClientMultiState(redisClient *c) {
7231 c->mstate.commands = NULL;
7232 c->mstate.count = 0;
7233 }
7234
7235 /* Release all the resources associated with MULTI/EXEC state */
7236 static void freeClientMultiState(redisClient *c) {
7237 int j;
7238
7239 for (j = 0; j < c->mstate.count; j++) {
7240 int i;
7241 multiCmd *mc = c->mstate.commands+j;
7242
7243 for (i = 0; i < mc->argc; i++)
7244 decrRefCount(mc->argv[i]);
7245 zfree(mc->argv);
7246 }
7247 zfree(c->mstate.commands);
7248 }
7249
7250 /* Add a new command into the MULTI commands queue */
7251 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7252 multiCmd *mc;
7253 int j;
7254
7255 c->mstate.commands = zrealloc(c->mstate.commands,
7256 sizeof(multiCmd)*(c->mstate.count+1));
7257 mc = c->mstate.commands+c->mstate.count;
7258 mc->cmd = cmd;
7259 mc->argc = c->argc;
7260 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7261 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7262 for (j = 0; j < c->argc; j++)
7263 incrRefCount(mc->argv[j]);
7264 c->mstate.count++;
7265 }
7266
7267 static void multiCommand(redisClient *c) {
7268 c->flags |= REDIS_MULTI;
7269 addReply(c,shared.ok);
7270 }
7271
7272 static void discardCommand(redisClient *c) {
7273 if (!(c->flags & REDIS_MULTI)) {
7274 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7275 return;
7276 }
7277
7278 freeClientMultiState(c);
7279 initClientMultiState(c);
7280 c->flags &= (~REDIS_MULTI);
7281 addReply(c,shared.ok);
7282 }
7283
7284 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7285 * implememntation for more information. */
7286 static void execCommandReplicateMulti(redisClient *c) {
7287 struct redisCommand *cmd;
7288 robj *multistring = createStringObject("MULTI",5);
7289
7290 cmd = lookupCommand("multi");
7291 if (server.appendonly)
7292 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7293 if (listLength(server.slaves))
7294 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7295 decrRefCount(multistring);
7296 }
7297
7298 static void execCommand(redisClient *c) {
7299 int j;
7300 robj **orig_argv;
7301 int orig_argc;
7302
7303 if (!(c->flags & REDIS_MULTI)) {
7304 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7305 return;
7306 }
7307
7308 /* Replicate a MULTI request now that we are sure the block is executed.
7309 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7310 * both the AOF and the replication link will have the same consistency
7311 * and atomicity guarantees. */
7312 execCommandReplicateMulti(c);
7313
7314 /* Exec all the queued commands */
7315 orig_argv = c->argv;
7316 orig_argc = c->argc;
7317 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7318 for (j = 0; j < c->mstate.count; j++) {
7319 c->argc = c->mstate.commands[j].argc;
7320 c->argv = c->mstate.commands[j].argv;
7321 call(c,c->mstate.commands[j].cmd);
7322 }
7323 c->argv = orig_argv;
7324 c->argc = orig_argc;
7325 freeClientMultiState(c);
7326 initClientMultiState(c);
7327 c->flags &= (~REDIS_MULTI);
7328 /* Make sure the EXEC command is always replicated / AOF, since we
7329 * always send the MULTI command (we can't know beforehand if the
7330 * next operations will contain at least a modification to the DB). */
7331 server.dirty++;
7332 }
7333
7334 /* =========================== Blocking Operations ========================= */
7335
7336 /* Currently Redis blocking operations support is limited to list POP ops,
7337 * so the current implementation is not fully generic, but it is also not
7338 * completely specific so it will not require a rewrite to support new
7339 * kind of blocking operations in the future.
7340 *
7341 * Still it's important to note that list blocking operations can be already
7342 * used as a notification mechanism in order to implement other blocking
7343 * operations at application level, so there must be a very strong evidence
7344 * of usefulness and generality before new blocking operations are implemented.
7345 *
7346 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7360 *
7361 * The above comment and the source code should be enough in order to understand
7362 * the implementation and modify / fix it later.
7363 */
7364
7365 /* Set a client in blocking mode for the specified key, with the specified
7366 * timeout */
7367 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7368 dictEntry *de;
7369 list *l;
7370 int j;
7371
7372 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7373 c->blockingkeysnum = numkeys;
7374 c->blockingto = timeout;
7375 for (j = 0; j < numkeys; j++) {
7376 /* Add the key in the client structure, to map clients -> keys */
7377 c->blockingkeys[j] = keys[j];
7378 incrRefCount(keys[j]);
7379
7380 /* And in the other "side", to map keys -> clients */
7381 de = dictFind(c->db->blockingkeys,keys[j]);
7382 if (de == NULL) {
7383 int retval;
7384
7385 /* For every key we take a list of clients blocked for it */
7386 l = listCreate();
7387 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7388 incrRefCount(keys[j]);
7389 assert(retval == DICT_OK);
7390 } else {
7391 l = dictGetEntryVal(de);
7392 }
7393 listAddNodeTail(l,c);
7394 }
7395 /* Mark the client as a blocked client */
7396 c->flags |= REDIS_BLOCKED;
7397 server.blpop_blocked_clients++;
7398 }
7399
7400 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7401 static void unblockClientWaitingData(redisClient *c) {
7402 dictEntry *de;
7403 list *l;
7404 int j;
7405
7406 assert(c->blockingkeys != NULL);
7407 /* The client may wait for multiple keys, so unblock it for every key. */
7408 for (j = 0; j < c->blockingkeysnum; j++) {
7409 /* Remove this client from the list of clients waiting for this key. */
7410 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7411 assert(de != NULL);
7412 l = dictGetEntryVal(de);
7413 listDelNode(l,listSearchKey(l,c));
7414 /* If the list is empty we need to remove it to avoid wasting memory */
7415 if (listLength(l) == 0)
7416 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7417 decrRefCount(c->blockingkeys[j]);
7418 }
7419 /* Cleanup the client structure */
7420 zfree(c->blockingkeys);
7421 c->blockingkeys = NULL;
7422 c->flags &= (~REDIS_BLOCKED);
7423 server.blpop_blocked_clients--;
7424 /* We want to process data if there is some command waiting
7425 * in the input buffer. Note that this is safe even if
7426 * unblockClientWaitingData() gets called from freeClient() because
7427 * freeClient() will be smart enough to call this function
7428 * *after* c->querybuf was set to NULL. */
7429 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7430 }
7431
7432 /* This should be called from any function PUSHing into lists.
7433 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7434 * 'ele' is the element pushed.
7435 *
7436 * If the function returns 0 there was no client waiting for a list push
7437 * against this key.
7438 *
7439 * If the function returns 1 there was a client waiting for a list push
7440 * against this key, the element was passed to this client thus it's not
7441 * needed to actually add it to the list and the caller should return asap. */
7442 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7443 struct dictEntry *de;
7444 redisClient *receiver;
7445 list *l;
7446 listNode *ln;
7447
7448 de = dictFind(c->db->blockingkeys,key);
7449 if (de == NULL) return 0;
7450 l = dictGetEntryVal(de);
7451 ln = listFirst(l);
7452 assert(ln != NULL);
7453 receiver = ln->value;
7454
7455 addReplySds(receiver,sdsnew("*2\r\n"));
7456 addReplyBulk(receiver,key);
7457 addReplyBulk(receiver,ele);
7458 unblockClientWaitingData(receiver);
7459 return 1;
7460 }
7461
7462 /* Blocking RPOP/LPOP */
7463 static void blockingPopGenericCommand(redisClient *c, int where) {
7464 robj *o;
7465 time_t timeout;
7466 int j;
7467
7468 for (j = 1; j < c->argc-1; j++) {
7469 o = lookupKeyWrite(c->db,c->argv[j]);
7470 if (o != NULL) {
7471 if (o->type != REDIS_LIST) {
7472 addReply(c,shared.wrongtypeerr);
7473 return;
7474 } else {
7475 list *list = o->ptr;
7476 if (listLength(list) != 0) {
7477 /* If the list contains elements fall back to the usual
7478 * non-blocking POP operation */
7479 robj *argv[2], **orig_argv;
7480 int orig_argc;
7481
7482 /* We need to alter the command arguments before to call
7483 * popGenericCommand() as the command takes a single key. */
7484 orig_argv = c->argv;
7485 orig_argc = c->argc;
7486 argv[1] = c->argv[j];
7487 c->argv = argv;
7488 c->argc = 2;
7489
7490 /* Also the return value is different, we need to output
7491 * the multi bulk reply header and the key name. The
7492 * "real" command will add the last element (the value)
7493 * for us. If this souds like an hack to you it's just
7494 * because it is... */
7495 addReplySds(c,sdsnew("*2\r\n"));
7496 addReplyBulk(c,argv[1]);
7497 popGenericCommand(c,where);
7498
7499 /* Fix the client structure with the original stuff */
7500 c->argv = orig_argv;
7501 c->argc = orig_argc;
7502 return;
7503 }
7504 }
7505 }
7506 }
7507 /* If the list is empty or the key does not exists we must block */
7508 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7509 if (timeout > 0) timeout += time(NULL);
7510 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7511 }
7512
7513 static void blpopCommand(redisClient *c) {
7514 blockingPopGenericCommand(c,REDIS_HEAD);
7515 }
7516
7517 static void brpopCommand(redisClient *c) {
7518 blockingPopGenericCommand(c,REDIS_TAIL);
7519 }
7520
7521 /* =============================== Replication ============================= */
7522
7523 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7524 ssize_t nwritten, ret = size;
7525 time_t start = time(NULL);
7526
7527 timeout++;
7528 while(size) {
7529 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7530 nwritten = write(fd,ptr,size);
7531 if (nwritten == -1) return -1;
7532 ptr += nwritten;
7533 size -= nwritten;
7534 }
7535 if ((time(NULL)-start) > timeout) {
7536 errno = ETIMEDOUT;
7537 return -1;
7538 }
7539 }
7540 return ret;
7541 }
7542
7543 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7544 ssize_t nread, totread = 0;
7545 time_t start = time(NULL);
7546
7547 timeout++;
7548 while(size) {
7549 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7550 nread = read(fd,ptr,size);
7551 if (nread == -1) return -1;
7552 ptr += nread;
7553 size -= nread;
7554 totread += nread;
7555 }
7556 if ((time(NULL)-start) > timeout) {
7557 errno = ETIMEDOUT;
7558 return -1;
7559 }
7560 }
7561 return totread;
7562 }
7563
/* Read a line from 'fd' one byte at a time, storing it NUL-terminated into
 * the 'size' bytes buffer pointed to by 'ptr'.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is overwritten with a NUL terminator (but is still included in the
 * returned count). Reading also stops once 'size'-1 bytes were stored, so
 * the buffer is never overrun by a long line.
 *
 * Returns the number of bytes stored before the terminator, or -1 if
 * syncRead() fails ('timeout' applies to every single byte read). Nothing is
 * written to the buffer when 'size' is less than 2. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the NUL terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--; /* One less byte available in the buffer */
    }
    return nread;
}
7584
7585 static void syncCommand(redisClient *c) {
7586 /* ignore SYNC if aleady slave or in monitor mode */
7587 if (c->flags & REDIS_SLAVE) return;
7588
7589 /* SYNC can't be issued when the server has pending data to send to
7590 * the client about already issued commands. We need a fresh reply
7591 * buffer registering the differences between the BGSAVE and the current
7592 * dataset, so that we can copy to other slaves if needed. */
7593 if (listLength(c->reply) != 0) {
7594 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7595 return;
7596 }
7597
7598 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7599 /* Here we need to check if there is a background saving operation
7600 * in progress, or if it is required to start one */
7601 if (server.bgsavechildpid != -1) {
7602 /* Ok a background save is in progress. Let's check if it is a good
7603 * one for replication, i.e. if there is another slave that is
7604 * registering differences since the server forked to save */
7605 redisClient *slave;
7606 listNode *ln;
7607 listIter li;
7608
7609 listRewind(server.slaves,&li);
7610 while((ln = listNext(&li))) {
7611 slave = ln->value;
7612 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7613 }
7614 if (ln) {
7615 /* Perfect, the server is already registering differences for
7616 * another slave. Set the right state, and copy the buffer. */
7617 listRelease(c->reply);
7618 c->reply = listDup(slave->reply);
7619 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7620 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7621 } else {
7622 /* No way, we need to wait for the next BGSAVE in order to
7623 * register differences */
7624 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7625 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7626 }
7627 } else {
7628 /* Ok we don't have a BGSAVE in progress, let's start one */
7629 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7630 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7631 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7632 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7633 return;
7634 }
7635 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7636 }
7637 c->repldbfd = -1;
7638 c->flags |= REDIS_SLAVE;
7639 c->slaveseldb = 0;
7640 listAddNodeTail(server.slaves,c);
7641 return;
7642 }
7643
7644 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7645 redisClient *slave = privdata;
7646 REDIS_NOTUSED(el);
7647 REDIS_NOTUSED(mask);
7648 char buf[REDIS_IOBUF_LEN];
7649 ssize_t nwritten, buflen;
7650
7651 if (slave->repldboff == 0) {
7652 /* Write the bulk write count before to transfer the DB. In theory here
7653 * we don't know how much room there is in the output buffer of the
7654 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7655 * operations) will never be smaller than the few bytes we need. */
7656 sds bulkcount;
7657
7658 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7659 slave->repldbsize);
7660 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7661 {
7662 sdsfree(bulkcount);
7663 freeClient(slave);
7664 return;
7665 }
7666 sdsfree(bulkcount);
7667 }
7668 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7669 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7670 if (buflen <= 0) {
7671 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7672 (buflen == 0) ? "premature EOF" : strerror(errno));
7673 freeClient(slave);
7674 return;
7675 }
7676 if ((nwritten = write(fd,buf,buflen)) == -1) {
7677 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7678 strerror(errno));
7679 freeClient(slave);
7680 return;
7681 }
7682 slave->repldboff += nwritten;
7683 if (slave->repldboff == slave->repldbsize) {
7684 close(slave->repldbfd);
7685 slave->repldbfd = -1;
7686 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7687 slave->replstate = REDIS_REPL_ONLINE;
7688 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7689 sendReplyToClient, slave) == AE_ERR) {
7690 freeClient(slave);
7691 return;
7692 }
7693 addReplySds(slave,sdsempty());
7694 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7695 }
7696 }
7697
7698 /* This function is called at the end of every backgrond saving.
7699 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7700 * otherwise REDIS_ERR is passed to the function.
7701 *
7702 * The goal of this function is to handle slaves waiting for a successful
7703 * background saving in order to perform non-blocking synchronization. */
7704 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7705 listNode *ln;
7706 int startbgsave = 0;
7707 listIter li;
7708
7709 listRewind(server.slaves,&li);
7710 while((ln = listNext(&li))) {
7711 redisClient *slave = ln->value;
7712
7713 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7714 startbgsave = 1;
7715 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7716 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7717 struct redis_stat buf;
7718
7719 if (bgsaveerr != REDIS_OK) {
7720 freeClient(slave);
7721 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7722 continue;
7723 }
7724 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7725 redis_fstat(slave->repldbfd,&buf) == -1) {
7726 freeClient(slave);
7727 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7728 continue;
7729 }
7730 slave->repldboff = 0;
7731 slave->repldbsize = buf.st_size;
7732 slave->replstate = REDIS_REPL_SEND_BULK;
7733 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7734 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7735 freeClient(slave);
7736 continue;
7737 }
7738 }
7739 }
7740 if (startbgsave) {
7741 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7742 listIter li;
7743
7744 listRewind(server.slaves,&li);
7745 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7746 while((ln = listNext(&li))) {
7747 redisClient *slave = ln->value;
7748
7749 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7750 freeClient(slave);
7751 }
7752 }
7753 }
7754 }
7755
7756 static int syncWithMaster(void) {
7757 char buf[1024], tmpfile[256], authcmd[1024];
7758 long dumpsize;
7759 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7760 int dfd, maxtries = 5;
7761
7762 if (fd == -1) {
7763 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7764 strerror(errno));
7765 return REDIS_ERR;
7766 }
7767
7768 /* AUTH with the master if required. */
7769 if(server.masterauth) {
7770 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7771 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7772 close(fd);
7773 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7774 strerror(errno));
7775 return REDIS_ERR;
7776 }
7777 /* Read the AUTH result. */
7778 if (syncReadLine(fd,buf,1024,3600) == -1) {
7779 close(fd);
7780 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7781 strerror(errno));
7782 return REDIS_ERR;
7783 }
7784 if (buf[0] != '+') {
7785 close(fd);
7786 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7787 return REDIS_ERR;
7788 }
7789 }
7790
7791 /* Issue the SYNC command */
7792 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7793 close(fd);
7794 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7795 strerror(errno));
7796 return REDIS_ERR;
7797 }
7798 /* Read the bulk write count */
7799 if (syncReadLine(fd,buf,1024,3600) == -1) {
7800 close(fd);
7801 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7802 strerror(errno));
7803 return REDIS_ERR;
7804 }
7805 if (buf[0] != '$') {
7806 close(fd);
7807 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7808 return REDIS_ERR;
7809 }
7810 dumpsize = strtol(buf+1,NULL,10);
7811 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7812 /* Read the bulk write data on a temp file */
7813 while(maxtries--) {
7814 snprintf(tmpfile,256,
7815 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7816 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7817 if (dfd != -1) break;
7818 sleep(1);
7819 }
7820 if (dfd == -1) {
7821 close(fd);
7822 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7823 return REDIS_ERR;
7824 }
7825 while(dumpsize) {
7826 int nread, nwritten;
7827
7828 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7829 if (nread == -1) {
7830 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7831 strerror(errno));
7832 close(fd);
7833 close(dfd);
7834 return REDIS_ERR;
7835 }
7836 nwritten = write(dfd,buf,nread);
7837 if (nwritten == -1) {
7838 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7839 close(fd);
7840 close(dfd);
7841 return REDIS_ERR;
7842 }
7843 dumpsize -= nread;
7844 }
7845 close(dfd);
7846 if (rename(tmpfile,server.dbfilename) == -1) {
7847 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7848 unlink(tmpfile);
7849 close(fd);
7850 return REDIS_ERR;
7851 }
7852 emptyDb();
7853 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7854 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7855 close(fd);
7856 return REDIS_ERR;
7857 }
7858 server.master = createClient(fd);
7859 server.master->flags |= REDIS_MASTER;
7860 server.master->authenticated = 1;
7861 server.replstate = REDIS_REPL_CONNECTED;
7862 return REDIS_OK;
7863 }
7864
7865 static void slaveofCommand(redisClient *c) {
7866 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7867 !strcasecmp(c->argv[2]->ptr,"one")) {
7868 if (server.masterhost) {
7869 sdsfree(server.masterhost);
7870 server.masterhost = NULL;
7871 if (server.master) freeClient(server.master);
7872 server.replstate = REDIS_REPL_NONE;
7873 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7874 }
7875 } else {
7876 sdsfree(server.masterhost);
7877 server.masterhost = sdsdup(c->argv[1]->ptr);
7878 server.masterport = atoi(c->argv[2]->ptr);
7879 if (server.master) freeClient(server.master);
7880 server.replstate = REDIS_REPL_CONNECT;
7881 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7882 server.masterhost, server.masterport);
7883 }
7884 addReply(c,shared.ok);
7885 }
7886
7887 /* ============================ Maxmemory directive ======================== */
7888
7889 /* Try to free one object form the pre-allocated objects free list.
7890 * This is useful under low mem conditions as by default we take 1 million
7891 * free objects allocated. On success REDIS_OK is returned, otherwise
7892 * REDIS_ERR. */
7893 static int tryFreeOneObjectFromFreelist(void) {
7894 robj *o;
7895
7896 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7897 if (listLength(server.objfreelist)) {
7898 listNode *head = listFirst(server.objfreelist);
7899 o = listNodeValue(head);
7900 listDelNode(server.objfreelist,head);
7901 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7902 zfree(o);
7903 return REDIS_OK;
7904 } else {
7905 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7906 return REDIS_ERR;
7907 }
7908 }
7909
7910 /* This function gets called when 'maxmemory' is set on the config file to limit
7911 * the max memory used by the server, and we are out of memory.
7912 * This function will try to, in order:
7913 *
7914 * - Free objects from the free list
7915 * - Try to remove keys with an EXPIRE set
7916 *
7917 * It is not possible to free enough memory to reach used-memory < maxmemory
7918 * the server will start refusing commands that will enlarge even more the
7919 * memory usage.
7920 */
7921 static void freeMemoryIfNeeded(void) {
7922 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7923 int j, k, freed = 0;
7924
7925 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7926 for (j = 0; j < server.dbnum; j++) {
7927 int minttl = -1;
7928 robj *minkey = NULL;
7929 struct dictEntry *de;
7930
7931 if (dictSize(server.db[j].expires)) {
7932 freed = 1;
7933 /* From a sample of three keys drop the one nearest to
7934 * the natural expire */
7935 for (k = 0; k < 3; k++) {
7936 time_t t;
7937
7938 de = dictGetRandomKey(server.db[j].expires);
7939 t = (time_t) dictGetEntryVal(de);
7940 if (minttl == -1 || t < minttl) {
7941 minkey = dictGetEntryKey(de);
7942 minttl = t;
7943 }
7944 }
7945 deleteKey(server.db+j,minkey);
7946 }
7947 }
7948 if (!freed) return; /* nothing to free... */
7949 }
7950 }
7951
7952 /* ============================== Append Only file ========================== */
7953
7954 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7955 sds buf = sdsempty();
7956 int j;
7957 ssize_t nwritten;
7958 time_t now;
7959 robj *tmpargv[3];
7960
7961 /* The DB this command was targetting is not the same as the last command
7962 * we appendend. To issue a SELECT command is needed. */
7963 if (dictid != server.appendseldb) {
7964 char seldb[64];
7965
7966 snprintf(seldb,sizeof(seldb),"%d",dictid);
7967 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7968 (unsigned long)strlen(seldb),seldb);
7969 server.appendseldb = dictid;
7970 }
7971
7972 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7973 * EXPIREs into EXPIREATs calls */
7974 if (cmd->proc == expireCommand) {
7975 long when;
7976
7977 tmpargv[0] = createStringObject("EXPIREAT",8);
7978 tmpargv[1] = argv[1];
7979 incrRefCount(argv[1]);
7980 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7981 tmpargv[2] = createObject(REDIS_STRING,
7982 sdscatprintf(sdsempty(),"%ld",when));
7983 argv = tmpargv;
7984 }
7985
7986 /* Append the actual command */
7987 buf = sdscatprintf(buf,"*%d\r\n",argc);
7988 for (j = 0; j < argc; j++) {
7989 robj *o = argv[j];
7990
7991 o = getDecodedObject(o);
7992 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7993 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7994 buf = sdscatlen(buf,"\r\n",2);
7995 decrRefCount(o);
7996 }
7997
7998 /* Free the objects from the modified argv for EXPIREAT */
7999 if (cmd->proc == expireCommand) {
8000 for (j = 0; j < 3; j++)
8001 decrRefCount(argv[j]);
8002 }
8003
8004 /* We want to perform a single write. This should be guaranteed atomic
8005 * at least if the filesystem we are writing is a real physical one.
8006 * While this will save us against the server being killed I don't think
8007 * there is much to do about the whole server stopping for power problems
8008 * or alike */
8009 nwritten = write(server.appendfd,buf,sdslen(buf));
8010 if (nwritten != (signed)sdslen(buf)) {
8011 /* Ooops, we are in troubles. The best thing to do for now is
8012 * to simply exit instead to give the illusion that everything is
8013 * working as expected. */
8014 if (nwritten == -1) {
8015 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8016 } else {
8017 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8018 }
8019 exit(1);
8020 }
8021 /* If a background append only file rewriting is in progress we want to
8022 * accumulate the differences between the child DB and the current one
8023 * in a buffer, so that when the child process will do its work we
8024 * can append the differences to the new append only file. */
8025 if (server.bgrewritechildpid != -1)
8026 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8027
8028 sdsfree(buf);
8029 now = time(NULL);
8030 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8031 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8032 now-server.lastfsync > 1))
8033 {
8034 fsync(server.appendfd); /* Let's try to get this data on the disk */
8035 server.lastfsync = now;
8036 }
8037 }
8038
8039 /* In Redis commands are always executed in the context of a client, so in
8040 * order to load the append only file we need to create a fake client. */
8041 static struct redisClient *createFakeClient(void) {
8042 struct redisClient *c = zmalloc(sizeof(*c));
8043
8044 selectDb(c,0);
8045 c->fd = -1;
8046 c->querybuf = sdsempty();
8047 c->argc = 0;
8048 c->argv = NULL;
8049 c->flags = 0;
8050 /* We set the fake client as a slave waiting for the synchronization
8051 * so that Redis will not try to send replies to this client. */
8052 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8053 c->reply = listCreate();
8054 listSetFreeMethod(c->reply,decrRefCount);
8055 listSetDupMethod(c->reply,dupClientReplyValue);
8056 return c;
8057 }
8058
8059 static void freeFakeClient(struct redisClient *c) {
8060 sdsfree(c->querybuf);
8061 listRelease(c->reply);
8062 zfree(c);
8063 }
8064
8065 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8066 * error (the append only file is zero-length) REDIS_ERR is returned. On
8067 * fatal error an error message is logged and the program exists. */
8068 int loadAppendOnlyFile(char *filename) {
8069 struct redisClient *fakeClient;
8070 FILE *fp = fopen(filename,"r");
8071 struct redis_stat sb;
8072 unsigned long long loadedkeys = 0;
8073
8074 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8075 return REDIS_ERR;
8076
8077 if (fp == NULL) {
8078 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8079 exit(1);
8080 }
8081
8082 fakeClient = createFakeClient();
8083 while(1) {
8084 int argc, j;
8085 unsigned long len;
8086 robj **argv;
8087 char buf[128];
8088 sds argsds;
8089 struct redisCommand *cmd;
8090
8091 if (fgets(buf,sizeof(buf),fp) == NULL) {
8092 if (feof(fp))
8093 break;
8094 else
8095 goto readerr;
8096 }
8097 if (buf[0] != '*') goto fmterr;
8098 argc = atoi(buf+1);
8099 argv = zmalloc(sizeof(robj*)*argc);
8100 for (j = 0; j < argc; j++) {
8101 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8102 if (buf[0] != '$') goto fmterr;
8103 len = strtol(buf+1,NULL,10);
8104 argsds = sdsnewlen(NULL,len);
8105 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8106 argv[j] = createObject(REDIS_STRING,argsds);
8107 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8108 }
8109
8110 /* Command lookup */
8111 cmd = lookupCommand(argv[0]->ptr);
8112 if (!cmd) {
8113 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8114 exit(1);
8115 }
8116 /* Try object encoding */
8117 if (cmd->flags & REDIS_CMD_BULK)
8118 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8119 /* Run the command in the context of a fake client */
8120 fakeClient->argc = argc;
8121 fakeClient->argv = argv;
8122 cmd->proc(fakeClient);
8123 /* Discard the reply objects list from the fake client */
8124 while(listLength(fakeClient->reply))
8125 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8126 /* Clean up, ready for the next command */
8127 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8128 zfree(argv);
8129 /* Handle swapping while loading big datasets when VM is on */
8130 loadedkeys++;
8131 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8132 while (zmalloc_used_memory() > server.vm_max_memory) {
8133 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8134 }
8135 }
8136 }
8137 fclose(fp);
8138 freeFakeClient(fakeClient);
8139 return REDIS_OK;
8140
8141 readerr:
8142 if (feof(fp)) {
8143 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8144 } else {
8145 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8146 }
8147 exit(1);
8148 fmterr:
8149 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8150 exit(1);
8151 }
8152
8153 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8154 static int fwriteBulkObject(FILE *fp, robj *obj) {
8155 char buf[128];
8156 int decrrc = 0;
8157
8158 /* Avoid the incr/decr ref count business if possible to help
8159 * copy-on-write (we are often in a child process when this function
8160 * is called).
8161 * Also makes sure that key objects don't get incrRefCount-ed when VM
8162 * is enabled */
8163 if (obj->encoding != REDIS_ENCODING_RAW) {
8164 obj = getDecodedObject(obj);
8165 decrrc = 1;
8166 }
8167 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8168 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8169 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8170 goto err;
8171 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8172 if (decrrc) decrRefCount(obj);
8173 return 1;
8174 err:
8175 if (decrrc) decrRefCount(obj);
8176 return 0;
8177 }
8178
/* Write the binary-safe string 's' of 'len' bytes to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so it must be formatted with %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0 even on success: skip it. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8190
/* Write the double 'd', formatted with "%.17g", to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t payloadlen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8201
/* Write the long 'l', in decimal, to 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t payloadlen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8212
8213 /* Write a sequence of commands able to fully rebuild the dataset into
8214 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8215 static int rewriteAppendOnlyFile(char *filename) {
8216 dictIterator *di = NULL;
8217 dictEntry *de;
8218 FILE *fp;
8219 char tmpfile[256];
8220 int j;
8221 time_t now = time(NULL);
8222
8223 /* Note that we have to use a different temp name here compared to the
8224 * one used by rewriteAppendOnlyFileBackground() function. */
8225 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8226 fp = fopen(tmpfile,"w");
8227 if (!fp) {
8228 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8229 return REDIS_ERR;
8230 }
8231 for (j = 0; j < server.dbnum; j++) {
8232 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8233 redisDb *db = server.db+j;
8234 dict *d = db->dict;
8235 if (dictSize(d) == 0) continue;
8236 di = dictGetIterator(d);
8237 if (!di) {
8238 fclose(fp);
8239 return REDIS_ERR;
8240 }
8241
8242 /* SELECT the new DB */
8243 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8244 if (fwriteBulkLong(fp,j) == 0) goto werr;
8245
8246 /* Iterate this DB writing every entry */
8247 while((de = dictNext(di)) != NULL) {
8248 robj *key, *o;
8249 time_t expiretime;
8250 int swapped;
8251
8252 key = dictGetEntryKey(de);
8253 /* If the value for this key is swapped, load a preview in memory.
8254 * We use a "swapped" flag to remember if we need to free the
8255 * value object instead to just increment the ref count anyway
8256 * in order to avoid copy-on-write of pages if we are forked() */
8257 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8258 key->storage == REDIS_VM_SWAPPING) {
8259 o = dictGetEntryVal(de);
8260 swapped = 0;
8261 } else {
8262 o = vmPreviewObject(key);
8263 swapped = 1;
8264 }
8265 expiretime = getExpire(db,key);
8266
8267 /* Save the key and associated value */
8268 if (o->type == REDIS_STRING) {
8269 /* Emit a SET command */
8270 char cmd[]="*3\r\n$3\r\nSET\r\n";
8271 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8272 /* Key and value */
8273 if (fwriteBulkObject(fp,key) == 0) goto werr;
8274 if (fwriteBulkObject(fp,o) == 0) goto werr;
8275 } else if (o->type == REDIS_LIST) {
8276 /* Emit the RPUSHes needed to rebuild the list */
8277 list *list = o->ptr;
8278 listNode *ln;
8279 listIter li;
8280
8281 listRewind(list,&li);
8282 while((ln = listNext(&li))) {
8283 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8284 robj *eleobj = listNodeValue(ln);
8285
8286 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8287 if (fwriteBulkObject(fp,key) == 0) goto werr;
8288 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8289 }
8290 } else if (o->type == REDIS_SET) {
8291 /* Emit the SADDs needed to rebuild the set */
8292 dict *set = o->ptr;
8293 dictIterator *di = dictGetIterator(set);
8294 dictEntry *de;
8295
8296 while((de = dictNext(di)) != NULL) {
8297 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8298 robj *eleobj = dictGetEntryKey(de);
8299
8300 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8301 if (fwriteBulkObject(fp,key) == 0) goto werr;
8302 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8303 }
8304 dictReleaseIterator(di);
8305 } else if (o->type == REDIS_ZSET) {
8306 /* Emit the ZADDs needed to rebuild the sorted set */
8307 zset *zs = o->ptr;
8308 dictIterator *di = dictGetIterator(zs->dict);
8309 dictEntry *de;
8310
8311 while((de = dictNext(di)) != NULL) {
8312 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8313 robj *eleobj = dictGetEntryKey(de);
8314 double *score = dictGetEntryVal(de);
8315
8316 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8317 if (fwriteBulkObject(fp,key) == 0) goto werr;
8318 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8319 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8320 }
8321 dictReleaseIterator(di);
8322 } else if (o->type == REDIS_HASH) {
8323 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8324
8325 /* Emit the HSETs needed to rebuild the hash */
8326 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8327 unsigned char *p = zipmapRewind(o->ptr);
8328 unsigned char *field, *val;
8329 unsigned int flen, vlen;
8330
8331 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8332 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8333 if (fwriteBulkObject(fp,key) == 0) goto werr;
8334 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8335 return -1;
8336 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8337 return -1;
8338 }
8339 } else {
8340 dictIterator *di = dictGetIterator(o->ptr);
8341 dictEntry *de;
8342
8343 while((de = dictNext(di)) != NULL) {
8344 robj *field = dictGetEntryKey(de);
8345 robj *val = dictGetEntryVal(de);
8346
8347 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8348 if (fwriteBulkObject(fp,key) == 0) goto werr;
8349 if (fwriteBulkObject(fp,field) == -1) return -1;
8350 if (fwriteBulkObject(fp,val) == -1) return -1;
8351 }
8352 dictReleaseIterator(di);
8353 }
8354 } else {
8355 redisPanic("Unknown object type");
8356 }
8357 /* Save the expire time */
8358 if (expiretime != -1) {
8359 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8360 /* If this key is already expired skip it */
8361 if (expiretime < now) continue;
8362 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8363 if (fwriteBulkObject(fp,key) == 0) goto werr;
8364 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8365 }
8366 if (swapped) decrRefCount(o);
8367 }
8368 dictReleaseIterator(di);
8369 }
8370
8371 /* Make sure data will not remain on the OS's output buffers */
8372 fflush(fp);
8373 fsync(fileno(fp));
8374 fclose(fp);
8375
8376 /* Use RENAME to make sure the DB file is changed atomically only
8377 * if the generate DB file is ok. */
8378 if (rename(tmpfile,filename) == -1) {
8379 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8380 unlink(tmpfile);
8381 return REDIS_ERR;
8382 }
8383 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8384 return REDIS_OK;
8385
8386 werr:
8387 fclose(fp);
8388 unlink(tmpfile);
8389 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8390 if (di) dictReleaseIterator(di);
8391 return REDIS_ERR;
8392 }
8393
8394 /* This is how rewriting of the append only file in background works:
8395 *
8396 * 1) The user calls BGREWRITEAOF
8397 * 2) Redis calls this function, that forks():
8398 * 2a) the child rewrite the append only file in a temp file.
8399 * 2b) the parent accumulates differences in server.bgrewritebuf.
8400 * 3) When the child finished '2a' exists.
8401 * 4) The parent will trap the exit code, if it's OK, will append the
8402 * data accumulated into server.bgrewritebuf into the temp file, and
8403 * finally will rename(2) the temp file in the actual file name.
8404 * The the new file is reopened as the new append only file. Profit!
8405 */
8406 static int rewriteAppendOnlyFileBackground(void) {
8407 pid_t childpid;
8408
8409 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8410 if (server.vm_enabled) waitEmptyIOJobsQueue();
8411 if ((childpid = fork()) == 0) {
8412 /* Child */
8413 char tmpfile[256];
8414
8415 if (server.vm_enabled) vmReopenSwapFile();
8416 close(server.fd);
8417 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8418 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8419 _exit(0);
8420 } else {
8421 _exit(1);
8422 }
8423 } else {
8424 /* Parent */
8425 if (childpid == -1) {
8426 redisLog(REDIS_WARNING,
8427 "Can't rewrite append only file in background: fork: %s",
8428 strerror(errno));
8429 return REDIS_ERR;
8430 }
8431 redisLog(REDIS_NOTICE,
8432 "Background append only file rewriting started by pid %d",childpid);
8433 server.bgrewritechildpid = childpid;
8434 updateDictResizePolicy();
8435 /* We set appendseldb to -1 in order to force the next call to the
8436 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8437 * accumulated by the parent into server.bgrewritebuf will start
8438 * with a SELECT statement and it will be safe to merge. */
8439 server.appendseldb = -1;
8440 return REDIS_OK;
8441 }
8442 return REDIS_OK; /* unreached */
8443 }
8444
8445 static void bgrewriteaofCommand(redisClient *c) {
8446 if (server.bgrewritechildpid != -1) {
8447 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8448 return;
8449 }
8450 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8451 char *status = "+Background append only file rewriting started\r\n";
8452 addReplySds(c,sdsnew(status));
8453 } else {
8454 addReply(c,shared.err);
8455 }
8456 }
8457
/* Remove the temp file used by the background rewrite child with pid
 * 'childpid'. The name is built with the same pattern used by
 * rewriteAppendOnlyFileBackground(). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8464
8465 /* Virtual Memory is composed mainly of two subsystems:
8466 * - Blocking Virtual Memory
8467 * - Threaded Virtual Memory I/O
8468 * The two parts are not fully decoupled, but functions are split among two
8469 * different sections of the source code (delimited by comments) in order to
8470 * make more clear what functionality is about the blocking VM and what about
8471 * the threaded (not blocking) VM.
8472 *
8473 * Redis VM design:
8474 *
8475 * Redis VM is a blocking VM (one that blocks reading swapped values from
8476 * disk into memory when a value swapped out is needed in memory) that is made
8477 * unblocking by trying to examine the command argument vector in order to
8478 * load in background values that will likely be needed in order to exec
8479 * the command. The command is executed only once all the relevant keys
8480 * are loaded into memory.
8481 *
8482 * This basically is almost as simple as a blocking VM, but almost as parallel
8483 * as a fully non-blocking VM.
8484 */
8485
8486 /* =================== Virtual Memory - Blocking Side ====================== */
8487
8488 /* substitute the first occurrence of '%p' with the process pid in the
8489 * swap file name. */
8490 static void expandVmSwapFilename(void) {
8491 char *p = strstr(server.vm_swap_file,"%p");
8492 sds new;
8493
8494 if (!p) return;
8495 new = sdsempty();
8496 *p = '\0';
8497 new = sdscat(new,server.vm_swap_file);
8498 new = sdscatprintf(new,"%ld",(long) getpid());
8499 new = sdscat(new,p+2);
8500 zfree(server.vm_swap_file);
8501 server.vm_swap_file = new;
8502 }
8503
8504 static void vmInit(void) {
8505 off_t totsize;
8506 int pipefds[2];
8507 size_t stacksize;
8508
8509 if (server.vm_max_threads != 0)
8510 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8511
8512 expandVmSwapFilename();
8513 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8514 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8515 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8516 }
8517 if (server.vm_fp == NULL) {
8518 redisLog(REDIS_WARNING,
8519 "Impossible to open the swap file: %s. Exiting.",
8520 strerror(errno));
8521 exit(1);
8522 }
8523 server.vm_fd = fileno(server.vm_fp);
8524 server.vm_next_page = 0;
8525 server.vm_near_pages = 0;
8526 server.vm_stats_used_pages = 0;
8527 server.vm_stats_swapped_objects = 0;
8528 server.vm_stats_swapouts = 0;
8529 server.vm_stats_swapins = 0;
8530 totsize = server.vm_pages*server.vm_page_size;
8531 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8532 if (ftruncate(server.vm_fd,totsize) == -1) {
8533 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8534 strerror(errno));
8535 exit(1);
8536 } else {
8537 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8538 }
8539 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8540 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8541 (long long) (server.vm_pages+7)/8, server.vm_pages);
8542 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8543
8544 /* Initialize threaded I/O (used by Virtual Memory) */
8545 server.io_newjobs = listCreate();
8546 server.io_processing = listCreate();
8547 server.io_processed = listCreate();
8548 server.io_ready_clients = listCreate();
8549 pthread_mutex_init(&server.io_mutex,NULL);
8550 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8551 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8552 server.io_active_threads = 0;
8553 if (pipe(pipefds) == -1) {
8554 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8555 ,strerror(errno));
8556 exit(1);
8557 }
8558 server.io_ready_pipe_read = pipefds[0];
8559 server.io_ready_pipe_write = pipefds[1];
8560 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8561 /* LZF requires a lot of stack */
8562 pthread_attr_init(&server.io_threads_attr);
8563 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8564 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8565 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8566 /* Listen for events in the threaded I/O pipe */
8567 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8568 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8569 oom("creating file event");
8570 }
8571
8572 /* Mark the page as used */
8573 static void vmMarkPageUsed(off_t page) {
8574 off_t byte = page/8;
8575 int bit = page&7;
8576 redisAssert(vmFreePage(page) == 1);
8577 server.vm_bitmap[byte] |= 1<<bit;
8578 }
8579
8580 /* Mark N contiguous pages as used, with 'page' being the first. */
8581 static void vmMarkPagesUsed(off_t page, off_t count) {
8582 off_t j;
8583
8584 for (j = 0; j < count; j++)
8585 vmMarkPageUsed(page+j);
8586 server.vm_stats_used_pages += count;
8587 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8588 (long long)count, (long long)page);
8589 }
8590
8591 /* Mark the page as free */
8592 static void vmMarkPageFree(off_t page) {
8593 off_t byte = page/8;
8594 int bit = page&7;
8595 redisAssert(vmFreePage(page) == 0);
8596 server.vm_bitmap[byte] &= ~(1<<bit);
8597 }
8598
8599 /* Mark N contiguous pages as free, with 'page' being the first. */
8600 static void vmMarkPagesFree(off_t page, off_t count) {
8601 off_t j;
8602
8603 for (j = 0; j < count; j++)
8604 vmMarkPageFree(page+j);
8605 server.vm_stats_used_pages -= count;
8606 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8607 (long long)count, (long long)page);
8608 }
8609
8610 /* Test if the page is free */
8611 static int vmFreePage(off_t page) {
8612 off_t byte = page/8;
8613 int bit = page&7;
8614 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8615 }
8616
8617 /* Find N contiguous free pages storing the first page of the cluster in *first.
8618 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8619 * REDIS_ERR is returned.
8620 *
8621 * This function uses a simple algorithm: we try to allocate
8622 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8623 * again from the start of the swap file searching for free spaces.
8624 *
8625 * If it looks pretty clear that there are no free pages near our offset
8626 * we try to find less populated places doing a forward jump of
8627 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8628 * without hurry, and then we jump again and so forth...
8629 *
8630 * This function can be improved using a free list to avoid to guess
8631 * too much, since we could collect data about freed pages.
8632 *
8633 * note: I implemented this function just after watching an episode of
8634 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8635 */
8636 static int vmFindContiguousPages(off_t *first, off_t n) {
8637 off_t base, offset = 0, since_jump = 0, numfree = 0;
8638
8639 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8640 server.vm_near_pages = 0;
8641 server.vm_next_page = 0;
8642 }
8643 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8644 base = server.vm_next_page;
8645
8646 while(offset < server.vm_pages) {
8647 off_t this = base+offset;
8648
8649 /* If we overflow, restart from page zero */
8650 if (this >= server.vm_pages) {
8651 this -= server.vm_pages;
8652 if (this == 0) {
8653 /* Just overflowed, what we found on tail is no longer
8654 * interesting, as it's no longer contiguous. */
8655 numfree = 0;
8656 }
8657 }
8658 if (vmFreePage(this)) {
8659 /* This is a free page */
8660 numfree++;
8661 /* Already got N free pages? Return to the caller, with success */
8662 if (numfree == n) {
8663 *first = this-(n-1);
8664 server.vm_next_page = this+1;
8665 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8666 return REDIS_OK;
8667 }
8668 } else {
8669 /* The current one is not a free page */
8670 numfree = 0;
8671 }
8672
8673 /* Fast-forward if the current page is not free and we already
8674 * searched enough near this place. */
8675 since_jump++;
8676 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8677 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8678 since_jump = 0;
8679 /* Note that even if we rewind after the jump, we are don't need
8680 * to make sure numfree is set to zero as we only jump *if* it
8681 * is set to zero. */
8682 } else {
8683 /* Otherwise just check the next page */
8684 offset++;
8685 }
8686 }
8687 return REDIS_ERR;
8688 }
8689
8690 /* Write the specified object at the specified page of the swap file */
8691 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8692 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8693 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8694 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8695 redisLog(REDIS_WARNING,
8696 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8697 strerror(errno));
8698 return REDIS_ERR;
8699 }
8700 rdbSaveObject(server.vm_fp,o);
8701 fflush(server.vm_fp);
8702 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8703 return REDIS_OK;
8704 }
8705
8706 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8707 * needed to later retrieve the object into the key object.
8708 * If we can't find enough contiguous empty pages to swap the object on disk
8709 * REDIS_ERR is returned. */
8710 static int vmSwapObjectBlocking(robj *key, robj *val) {
8711 off_t pages = rdbSavedObjectPages(val,NULL);
8712 off_t page;
8713
8714 assert(key->storage == REDIS_VM_MEMORY);
8715 assert(key->refcount == 1);
8716 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8717 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8718 key->vm.page = page;
8719 key->vm.usedpages = pages;
8720 key->storage = REDIS_VM_SWAPPED;
8721 key->vtype = val->type;
8722 decrRefCount(val); /* Deallocate the object from memory. */
8723 vmMarkPagesUsed(page,pages);
8724 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8725 (unsigned char*) key->ptr,
8726 (unsigned long long) page, (unsigned long long) pages);
8727 server.vm_stats_swapped_objects++;
8728 server.vm_stats_swapouts++;
8729 return REDIS_OK;
8730 }
8731
8732 static robj *vmReadObjectFromSwap(off_t page, int type) {
8733 robj *o;
8734
8735 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8736 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8737 redisLog(REDIS_WARNING,
8738 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8739 strerror(errno));
8740 _exit(1);
8741 }
8742 o = rdbLoadObject(type,server.vm_fp);
8743 if (o == NULL) {
8744 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8745 _exit(1);
8746 }
8747 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8748 return o;
8749 }
8750
8751 /* Load the value object relative to the 'key' object from swap to memory.
8752 * The newly allocated object is returned.
8753 *
8754 * If preview is true the unserialized object is returned to the caller but
8755 * no changes are made to the key object, nor the pages are marked as freed */
8756 static robj *vmGenericLoadObject(robj *key, int preview) {
8757 robj *val;
8758
8759 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8760 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8761 if (!preview) {
8762 key->storage = REDIS_VM_MEMORY;
8763 key->vm.atime = server.unixtime;
8764 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8765 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8766 (unsigned char*) key->ptr);
8767 server.vm_stats_swapped_objects--;
8768 } else {
8769 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8770 (unsigned char*) key->ptr);
8771 }
8772 server.vm_stats_swapins++;
8773 return val;
8774 }
8775
8776 /* Plain object loading, from swap to memory */
8777 static robj *vmLoadObject(robj *key) {
8778 /* If we are loading the object in background, stop it, we
8779 * need to load this object synchronously ASAP. */
8780 if (key->storage == REDIS_VM_LOADING)
8781 vmCancelThreadedIOJob(key);
8782 return vmGenericLoadObject(key,0);
8783 }
8784
8785 /* Just load the value on disk, without to modify the key.
8786 * This is useful when we want to perform some operation on the value
8787 * without to really bring it from swap to memory, like while saving the
8788 * dataset or rewriting the append only log. */
8789 static robj *vmPreviewObject(robj *key) {
8790 return vmGenericLoadObject(key,1);
8791 }
8792
8793 /* How a good candidate is this object for swapping?
8794 * The better candidate it is, the greater the returned value.
8795 *
8796 * Currently we try to perform a fast estimation of the object size in
8797 * memory, and combine it with aging informations.
8798 *
8799 * Basically swappability = idle-time * log(estimated size)
8800 *
8801 * Bigger objects are preferred over smaller objects, but not
8802 * proportionally, this is why we use the logarithm. This algorithm is
8803 * just a first try and will probably be tuned later. */
8804 static double computeObjectSwappability(robj *o) {
8805 time_t age = server.unixtime - o->vm.atime;
8806 long asize = 0;
8807 list *l;
8808 dict *d;
8809 struct dictEntry *de;
8810 int z;
8811
8812 if (age <= 0) return 0;
8813 switch(o->type) {
8814 case REDIS_STRING:
8815 if (o->encoding != REDIS_ENCODING_RAW) {
8816 asize = sizeof(*o);
8817 } else {
8818 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8819 }
8820 break;
8821 case REDIS_LIST:
8822 l = o->ptr;
8823 listNode *ln = listFirst(l);
8824
8825 asize = sizeof(list);
8826 if (ln) {
8827 robj *ele = ln->value;
8828 long elesize;
8829
8830 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8831 (sizeof(*o)+sdslen(ele->ptr)) :
8832 sizeof(*o);
8833 asize += (sizeof(listNode)+elesize)*listLength(l);
8834 }
8835 break;
8836 case REDIS_SET:
8837 case REDIS_ZSET:
8838 z = (o->type == REDIS_ZSET);
8839 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8840
8841 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8842 if (z) asize += sizeof(zset)-sizeof(dict);
8843 if (dictSize(d)) {
8844 long elesize;
8845 robj *ele;
8846
8847 de = dictGetRandomKey(d);
8848 ele = dictGetEntryKey(de);
8849 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8850 (sizeof(*o)+sdslen(ele->ptr)) :
8851 sizeof(*o);
8852 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8853 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8854 }
8855 break;
8856 case REDIS_HASH:
8857 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8858 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8859 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8860 unsigned int klen, vlen;
8861 unsigned char *key, *val;
8862
8863 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8864 klen = 0;
8865 vlen = 0;
8866 }
8867 asize = len*(klen+vlen+3);
8868 } else if (o->encoding == REDIS_ENCODING_HT) {
8869 d = o->ptr;
8870 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8871 if (dictSize(d)) {
8872 long elesize;
8873 robj *ele;
8874
8875 de = dictGetRandomKey(d);
8876 ele = dictGetEntryKey(de);
8877 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8878 (sizeof(*o)+sdslen(ele->ptr)) :
8879 sizeof(*o);
8880 ele = dictGetEntryVal(de);
8881 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8882 (sizeof(*o)+sdslen(ele->ptr)) :
8883 sizeof(*o);
8884 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8885 }
8886 }
8887 break;
8888 }
8889 return (double)age*log(1+asize);
8890 }
8891
8892 /* Try to swap an object that's a good candidate for swapping.
8893 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8894 * to swap any object at all.
8895 *
8896 * If 'usethreaded' is true, Redis will try to swap the object in background
8897 * using I/O threads. */
8898 static int vmSwapOneObject(int usethreads) {
8899 int j, i;
8900 struct dictEntry *best = NULL;
8901 double best_swappability = 0;
8902 redisDb *best_db = NULL;
8903 robj *key, *val;
8904
8905 for (j = 0; j < server.dbnum; j++) {
8906 redisDb *db = server.db+j;
8907 /* Why maxtries is set to 100?
8908 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8909 * are swappable objects */
8910 int maxtries = 100;
8911
8912 if (dictSize(db->dict) == 0) continue;
8913 for (i = 0; i < 5; i++) {
8914 dictEntry *de;
8915 double swappability;
8916
8917 if (maxtries) maxtries--;
8918 de = dictGetRandomKey(db->dict);
8919 key = dictGetEntryKey(de);
8920 val = dictGetEntryVal(de);
8921 /* Only swap objects that are currently in memory.
8922 *
8923 * Also don't swap shared objects if threaded VM is on, as we
8924 * try to ensure that the main thread does not touch the
8925 * object while the I/O thread is using it, but we can't
8926 * control other keys without adding additional mutex. */
8927 if (key->storage != REDIS_VM_MEMORY ||
8928 (server.vm_max_threads != 0 && val->refcount != 1)) {
8929 if (maxtries) i--; /* don't count this try */
8930 continue;
8931 }
8932 swappability = computeObjectSwappability(val);
8933 if (!best || swappability > best_swappability) {
8934 best = de;
8935 best_swappability = swappability;
8936 best_db = db;
8937 }
8938 }
8939 }
8940 if (best == NULL) return REDIS_ERR;
8941 key = dictGetEntryKey(best);
8942 val = dictGetEntryVal(best);
8943
8944 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8945 key->ptr, best_swappability);
8946
8947 /* Unshare the key if needed */
8948 if (key->refcount > 1) {
8949 robj *newkey = dupStringObject(key);
8950 decrRefCount(key);
8951 key = dictGetEntryKey(best) = newkey;
8952 }
8953 /* Swap it */
8954 if (usethreads) {
8955 vmSwapObjectThreaded(key,val,best_db);
8956 return REDIS_OK;
8957 } else {
8958 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8959 dictGetEntryVal(best) = NULL;
8960 return REDIS_OK;
8961 } else {
8962 return REDIS_ERR;
8963 }
8964 }
8965 }
8966
/* Swap out one object synchronously. See vmSwapOneObject() for the
 * meaning of the return value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8970
/* Swap out one object using the I/O threads. See vmSwapOneObject() for the
 * meaning of the return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8974
8975 /* Return true if it's safe to swap out objects in a given moment.
8976 * Basically we don't want to swap objects out while there is a BGSAVE
8977 * or a BGAEOREWRITE running in backgroud. */
8978 static int vmCanSwapOut(void) {
8979 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8980 }
8981
8982 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8983 * and was deleted. Otherwise 0 is returned. */
8984 static int deleteIfSwapped(redisDb *db, robj *key) {
8985 dictEntry *de;
8986 robj *foundkey;
8987
8988 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8989 foundkey = dictGetEntryKey(de);
8990 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8991 deleteKey(db,key);
8992 return 1;
8993 }
8994
8995 /* =================== Virtual Memory - Threaded I/O ======================= */
8996
8997 static void freeIOJob(iojob *j) {
8998 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8999 j->type == REDIS_IOJOB_DO_SWAP ||
9000 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9001 decrRefCount(j->val);
9002 /* We don't decrRefCount the j->key field as we did't incremented
9003 * the count creating IO Jobs. This is because the key field here is
9004 * just used as an indentifier and if a key is removed the Job should
9005 * never be touched again. */
9006 zfree(j);
9007 }
9008
9009 /* Every time a thread finished a Job, it writes a byte into the write side
9010 * of an unix pipe in order to "awake" the main thread, and this function
9011 * is called. */
9012 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9013 int mask)
9014 {
9015 char buf[1];
9016 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9017 REDIS_NOTUSED(el);
9018 REDIS_NOTUSED(mask);
9019 REDIS_NOTUSED(privdata);
9020
9021 /* For every byte we read in the read side of the pipe, there is one
9022 * I/O job completed to process. */
9023 while((retval = read(fd,buf,1)) == 1) {
9024 iojob *j;
9025 listNode *ln;
9026 robj *key;
9027 struct dictEntry *de;
9028
9029 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9030
9031 /* Get the processed element (the oldest one) */
9032 lockThreadedIO();
9033 assert(listLength(server.io_processed) != 0);
9034 if (toprocess == -1) {
9035 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9036 if (toprocess <= 0) toprocess = 1;
9037 }
9038 ln = listFirst(server.io_processed);
9039 j = ln->value;
9040 listDelNode(server.io_processed,ln);
9041 unlockThreadedIO();
9042 /* If this job is marked as canceled, just ignore it */
9043 if (j->canceled) {
9044 freeIOJob(j);
9045 continue;
9046 }
9047 /* Post process it in the main thread, as there are things we
9048 * can do just here to avoid race conditions and/or invasive locks */
9049 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9050 de = dictFind(j->db->dict,j->key);
9051 assert(de != NULL);
9052 key = dictGetEntryKey(de);
9053 if (j->type == REDIS_IOJOB_LOAD) {
9054 redisDb *db;
9055
9056 /* Key loaded, bring it at home */
9057 key->storage = REDIS_VM_MEMORY;
9058 key->vm.atime = server.unixtime;
9059 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9060 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9061 (unsigned char*) key->ptr);
9062 server.vm_stats_swapped_objects--;
9063 server.vm_stats_swapins++;
9064 dictGetEntryVal(de) = j->val;
9065 incrRefCount(j->val);
9066 db = j->db;
9067 freeIOJob(j);
9068 /* Handle clients waiting for this key to be loaded. */
9069 handleClientsBlockedOnSwappedKey(db,key);
9070 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9071 /* Now we know the amount of pages required to swap this object.
9072 * Let's find some space for it, and queue this task again
9073 * rebranded as REDIS_IOJOB_DO_SWAP. */
9074 if (!vmCanSwapOut() ||
9075 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9076 {
9077 /* Ooops... no space or we can't swap as there is
9078 * a fork()ed Redis trying to save stuff on disk. */
9079 freeIOJob(j);
9080 key->storage = REDIS_VM_MEMORY; /* undo operation */
9081 } else {
9082 /* Note that we need to mark this pages as used now,
9083 * if the job will be canceled, we'll mark them as freed
9084 * again. */
9085 vmMarkPagesUsed(j->page,j->pages);
9086 j->type = REDIS_IOJOB_DO_SWAP;
9087 lockThreadedIO();
9088 queueIOJob(j);
9089 unlockThreadedIO();
9090 }
9091 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9092 robj *val;
9093
9094 /* Key swapped. We can finally free some memory. */
9095 if (key->storage != REDIS_VM_SWAPPING) {
9096 printf("key->storage: %d\n",key->storage);
9097 printf("key->name: %s\n",(char*)key->ptr);
9098 printf("key->refcount: %d\n",key->refcount);
9099 printf("val: %p\n",(void*)j->val);
9100 printf("val->type: %d\n",j->val->type);
9101 printf("val->ptr: %s\n",(char*)j->val->ptr);
9102 }
9103 redisAssert(key->storage == REDIS_VM_SWAPPING);
9104 val = dictGetEntryVal(de);
9105 key->vm.page = j->page;
9106 key->vm.usedpages = j->pages;
9107 key->storage = REDIS_VM_SWAPPED;
9108 key->vtype = j->val->type;
9109 decrRefCount(val); /* Deallocate the object from memory. */
9110 dictGetEntryVal(de) = NULL;
9111 redisLog(REDIS_DEBUG,
9112 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9113 (unsigned char*) key->ptr,
9114 (unsigned long long) j->page, (unsigned long long) j->pages);
9115 server.vm_stats_swapped_objects++;
9116 server.vm_stats_swapouts++;
9117 freeIOJob(j);
9118 /* Put a few more swap requests in queue if we are still
9119 * out of memory */
9120 if (trytoswap && vmCanSwapOut() &&
9121 zmalloc_used_memory() > server.vm_max_memory)
9122 {
9123 int more = 1;
9124 while(more) {
9125 lockThreadedIO();
9126 more = listLength(server.io_newjobs) <
9127 (unsigned) server.vm_max_threads;
9128 unlockThreadedIO();
9129 /* Don't waste CPU time if swappable objects are rare. */
9130 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9131 trytoswap = 0;
9132 break;
9133 }
9134 }
9135 }
9136 }
9137 processed++;
9138 if (processed == toprocess) return;
9139 }
9140 if (retval < 0 && errno != EAGAIN) {
9141 redisLog(REDIS_WARNING,
9142 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9143 strerror(errno));
9144 }
9145 }
9146
9147 static void lockThreadedIO(void) {
9148 pthread_mutex_lock(&server.io_mutex);
9149 }
9150
9151 static void unlockThreadedIO(void) {
9152 pthread_mutex_unlock(&server.io_mutex);
9153 }
9154
9155 /* Remove the specified object from the threaded I/O queue if still not
9156 * processed, otherwise make sure to flag it as canceled. */
9157 static void vmCancelThreadedIOJob(robj *o) {
9158 list *lists[3] = {
9159 server.io_newjobs, /* 0 */
9160 server.io_processing, /* 1 */
9161 server.io_processed /* 2 */
9162 };
9163 int i;
9164
9165 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9166 again:
9167 lockThreadedIO();
9168 /* Search for a matching key in one of the queues */
9169 for (i = 0; i < 3; i++) {
9170 listNode *ln;
9171 listIter li;
9172
9173 listRewind(lists[i],&li);
9174 while ((ln = listNext(&li)) != NULL) {
9175 iojob *job = ln->value;
9176
9177 if (job->canceled) continue; /* Skip this, already canceled. */
9178 if (job->key == o) {
9179 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9180 (void*)job, (char*)o->ptr, job->type, i);
9181 /* Mark the pages as free since the swap didn't happened
9182 * or happened but is now discarded. */
9183 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9184 vmMarkPagesFree(job->page,job->pages);
9185 /* Cancel the job. It depends on the list the job is
9186 * living in. */
9187 switch(i) {
9188 case 0: /* io_newjobs */
9189 /* If the job was yet not processed the best thing to do
9190 * is to remove it from the queue at all */
9191 freeIOJob(job);
9192 listDelNode(lists[i],ln);
9193 break;
9194 case 1: /* io_processing */
9195 /* Oh Shi- the thread is messing with the Job:
9196 *
9197 * Probably it's accessing the object if this is a
9198 * PREPARE_SWAP or DO_SWAP job.
9199 * If it's a LOAD job it may be reading from disk and
9200 * if we don't wait for the job to terminate before to
9201 * cancel it, maybe in a few microseconds data can be
9202 * corrupted in this pages. So the short story is:
9203 *
9204 * Better to wait for the job to move into the
9205 * next queue (processed)... */
9206
9207 /* We try again and again until the job is completed. */
9208 unlockThreadedIO();
9209 /* But let's wait some time for the I/O thread
9210 * to finish with this job. After all this condition
9211 * should be very rare. */
9212 usleep(1);
9213 goto again;
9214 case 2: /* io_processed */
9215 /* The job was already processed, that's easy...
9216 * just mark it as canceled so that we'll ignore it
9217 * when processing completed jobs. */
9218 job->canceled = 1;
9219 break;
9220 }
9221 /* Finally we have to adjust the storage type of the object
9222 * in order to "UNDO" the operaiton. */
9223 if (o->storage == REDIS_VM_LOADING)
9224 o->storage = REDIS_VM_SWAPPED;
9225 else if (o->storage == REDIS_VM_SWAPPING)
9226 o->storage = REDIS_VM_MEMORY;
9227 unlockThreadedIO();
9228 return;
9229 }
9230 }
9231 }
9232 unlockThreadedIO();
9233 assert(1 != 1); /* We should never reach this */
9234 }
9235
9236 static void *IOThreadEntryPoint(void *arg) {
9237 iojob *j;
9238 listNode *ln;
9239 REDIS_NOTUSED(arg);
9240
9241 pthread_detach(pthread_self());
9242 while(1) {
9243 /* Get a new job to process */
9244 lockThreadedIO();
9245 if (listLength(server.io_newjobs) == 0) {
9246 /* No new jobs in queue, exit. */
9247 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9248 (long) pthread_self());
9249 server.io_active_threads--;
9250 unlockThreadedIO();
9251 return NULL;
9252 }
9253 ln = listFirst(server.io_newjobs);
9254 j = ln->value;
9255 listDelNode(server.io_newjobs,ln);
9256 /* Add the job in the processing queue */
9257 j->thread = pthread_self();
9258 listAddNodeTail(server.io_processing,j);
9259 ln = listLast(server.io_processing); /* We use ln later to remove it */
9260 unlockThreadedIO();
9261 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9262 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9263
9264 /* Process the Job */
9265 if (j->type == REDIS_IOJOB_LOAD) {
9266 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9267 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9268 FILE *fp = fopen("/dev/null","w+");
9269 j->pages = rdbSavedObjectPages(j->val,fp);
9270 fclose(fp);
9271 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9272 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9273 j->canceled = 1;
9274 }
9275
9276 /* Done: insert the job into the processed queue */
9277 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9278 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9279 lockThreadedIO();
9280 listDelNode(server.io_processing,ln);
9281 listAddNodeTail(server.io_processed,j);
9282 unlockThreadedIO();
9283
9284 /* Signal the main thread there is new stuff to process */
9285 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9286 }
9287 return NULL; /* never reached */
9288 }
9289
9290 static void spawnIOThread(void) {
9291 pthread_t thread;
9292 sigset_t mask, omask;
9293 int err;
9294
9295 sigemptyset(&mask);
9296 sigaddset(&mask,SIGCHLD);
9297 sigaddset(&mask,SIGHUP);
9298 sigaddset(&mask,SIGPIPE);
9299 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9300 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9301 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9302 strerror(err));
9303 usleep(1000000);
9304 }
9305 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9306 server.io_active_threads++;
9307 }
9308
9309 /* We need to wait for the last thread to exit before we are able to
9310 * fork() in order to BGSAVE or BGREWRITEAOF. */
9311 static void waitEmptyIOJobsQueue(void) {
9312 while(1) {
9313 int io_processed_len;
9314
9315 lockThreadedIO();
9316 if (listLength(server.io_newjobs) == 0 &&
9317 listLength(server.io_processing) == 0 &&
9318 server.io_active_threads == 0)
9319 {
9320 unlockThreadedIO();
9321 return;
9322 }
9323 /* While waiting for empty jobs queue condition we post-process some
9324 * finshed job, as I/O threads may be hanging trying to write against
9325 * the io_ready_pipe_write FD but there are so much pending jobs that
9326 * it's blocking. */
9327 io_processed_len = listLength(server.io_processed);
9328 unlockThreadedIO();
9329 if (io_processed_len) {
9330 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9331 usleep(1000); /* 1 millisecond */
9332 } else {
9333 usleep(10000); /* 10 milliseconds */
9334 }
9335 }
9336 }
9337
9338 static void vmReopenSwapFile(void) {
9339 /* Note: we don't close the old one as we are in the child process
9340 * and don't want to mess at all with the original file object. */
9341 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9342 if (server.vm_fp == NULL) {
9343 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9344 server.vm_swap_file);
9345 _exit(1);
9346 }
9347 server.vm_fd = fileno(server.vm_fp);
9348 }
9349
9350 /* This function must be called while with threaded IO locked */
9351 static void queueIOJob(iojob *j) {
9352 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9353 (void*)j, j->type, (char*)j->key->ptr);
9354 listAddNodeTail(server.io_newjobs,j);
9355 if (server.io_active_threads < server.vm_max_threads)
9356 spawnIOThread();
9357 }
9358
9359 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9360 iojob *j;
9361
9362 assert(key->storage == REDIS_VM_MEMORY);
9363 assert(key->refcount == 1);
9364
9365 j = zmalloc(sizeof(*j));
9366 j->type = REDIS_IOJOB_PREPARE_SWAP;
9367 j->db = db;
9368 j->key = key;
9369 j->val = val;
9370 incrRefCount(val);
9371 j->canceled = 0;
9372 j->thread = (pthread_t) -1;
9373 key->storage = REDIS_VM_SWAPPING;
9374
9375 lockThreadedIO();
9376 queueIOJob(j);
9377 unlockThreadedIO();
9378 return REDIS_OK;
9379 }
9380
9381 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9382
9383 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9384 * If there is not already a job loading the key, it is craeted.
9385 * The key is added to the io_keys list in the client structure, and also
9386 * in the hash table mapping swapped keys to waiting clients, that is,
9387 * server.io_waited_keys. */
9388 static int waitForSwappedKey(redisClient *c, robj *key) {
9389 struct dictEntry *de;
9390 robj *o;
9391 list *l;
9392
9393 /* If the key does not exist or is already in RAM we don't need to
9394 * block the client at all. */
9395 de = dictFind(c->db->dict,key);
9396 if (de == NULL) return 0;
9397 o = dictGetEntryKey(de);
9398 if (o->storage == REDIS_VM_MEMORY) {
9399 return 0;
9400 } else if (o->storage == REDIS_VM_SWAPPING) {
9401 /* We were swapping the key, undo it! */
9402 vmCancelThreadedIOJob(o);
9403 return 0;
9404 }
9405
9406 /* OK: the key is either swapped, or being loaded just now. */
9407
9408 /* Add the key to the list of keys this client is waiting for.
9409 * This maps clients to keys they are waiting for. */
9410 listAddNodeTail(c->io_keys,key);
9411 incrRefCount(key);
9412
9413 /* Add the client to the swapped keys => clients waiting map. */
9414 de = dictFind(c->db->io_keys,key);
9415 if (de == NULL) {
9416 int retval;
9417
9418 /* For every key we take a list of clients blocked for it */
9419 l = listCreate();
9420 retval = dictAdd(c->db->io_keys,key,l);
9421 incrRefCount(key);
9422 assert(retval == DICT_OK);
9423 } else {
9424 l = dictGetEntryVal(de);
9425 }
9426 listAddNodeTail(l,c);
9427
9428 /* Are we already loading the key from disk? If not create a job */
9429 if (o->storage == REDIS_VM_SWAPPED) {
9430 iojob *j;
9431
9432 o->storage = REDIS_VM_LOADING;
9433 j = zmalloc(sizeof(*j));
9434 j->type = REDIS_IOJOB_LOAD;
9435 j->db = c->db;
9436 j->key = o;
9437 j->key->vtype = o->vtype;
9438 j->page = o->vm.page;
9439 j->val = NULL;
9440 j->canceled = 0;
9441 j->thread = (pthread_t) -1;
9442 lockThreadedIO();
9443 queueIOJob(j);
9444 unlockThreadedIO();
9445 }
9446 return 1;
9447 }
9448
9449 /* Preload keys needed for the ZUNION and ZINTER commands. */
9450 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9451 int i, num;
9452 num = atoi(c->argv[2]->ptr);
9453 for (i = 0; i < num; i++) {
9454 waitForSwappedKey(c,c->argv[3+i]);
9455 }
9456 }
9457
9458 /* Is this client attempting to run a command against swapped keys?
9459 * If so, block it ASAP, load the keys in background, then resume it.
9460 *
9461 * The important idea about this function is that it can fail! If keys will
9462 * still be swapped when the client is resumed, this key lookups will
9463 * just block loading keys from disk. In practical terms this should only
9464 * happen with SORT BY command or if there is a bug in this function.
9465 *
9466 * Return 1 if the client is marked as blocked, 0 if the client can
9467 * continue as the keys it is going to access appear to be in memory. */
9468 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9469 int j, last;
9470
9471 if (cmd->vm_preload_proc != NULL) {
9472 cmd->vm_preload_proc(c);
9473 } else {
9474 if (cmd->vm_firstkey == 0) return 0;
9475 last = cmd->vm_lastkey;
9476 if (last < 0) last = c->argc+last;
9477 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9478 waitForSwappedKey(c,c->argv[j]);
9479 }
9480
9481 /* If the client was blocked for at least one key, mark it as blocked. */
9482 if (listLength(c->io_keys)) {
9483 c->flags |= REDIS_IO_WAIT;
9484 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9485 server.vm_blocked_clients++;
9486 return 1;
9487 } else {
9488 return 0;
9489 }
9490 }
9491
9492 /* Remove the 'key' from the list of blocked keys for a given client.
9493 *
9494 * The function returns 1 when there are no longer blocking keys after
9495 * the current one was removed (and the client can be unblocked). */
9496 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9497 list *l;
9498 listNode *ln;
9499 listIter li;
9500 struct dictEntry *de;
9501
9502 /* Remove the key from the list of keys this client is waiting for. */
9503 listRewind(c->io_keys,&li);
9504 while ((ln = listNext(&li)) != NULL) {
9505 if (compareStringObjects(ln->value,key) == 0) {
9506 listDelNode(c->io_keys,ln);
9507 break;
9508 }
9509 }
9510 assert(ln != NULL);
9511
9512 /* Remove the client form the key => waiting clients map. */
9513 de = dictFind(c->db->io_keys,key);
9514 assert(de != NULL);
9515 l = dictGetEntryVal(de);
9516 ln = listSearchKey(l,c);
9517 assert(ln != NULL);
9518 listDelNode(l,ln);
9519 if (listLength(l) == 0)
9520 dictDelete(c->db->io_keys,key);
9521
9522 return listLength(c->io_keys) == 0;
9523 }
9524
9525 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9526 struct dictEntry *de;
9527 list *l;
9528 listNode *ln;
9529 int len;
9530
9531 de = dictFind(db->io_keys,key);
9532 if (!de) return;
9533
9534 l = dictGetEntryVal(de);
9535 len = listLength(l);
9536 /* Note: we can't use something like while(listLength(l)) as the list
9537 * can be freed by the calling function when we remove the last element. */
9538 while (len--) {
9539 ln = listFirst(l);
9540 redisClient *c = ln->value;
9541
9542 if (dontWaitForSwappedKey(c,key)) {
9543 /* Put the client in the list of clients ready to go as we
9544 * loaded all the keys about it. */
9545 listAddNodeTail(server.io_ready_clients,c);
9546 }
9547 }
9548 }
9549
9550 /* =========================== Remote Configuration ========================= */
9551
9552 static void configSetCommand(redisClient *c) {
9553 robj *o = getDecodedObject(c->argv[3]);
9554 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9555 zfree(server.dbfilename);
9556 server.dbfilename = zstrdup(o->ptr);
9557 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9558 zfree(server.requirepass);
9559 server.requirepass = zstrdup(o->ptr);
9560 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9561 zfree(server.masterauth);
9562 server.masterauth = zstrdup(o->ptr);
9563 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9564 server.maxmemory = strtoll(o->ptr, NULL, 10);
9565 } else {
9566 addReplySds(c,sdscatprintf(sdsempty(),
9567 "-ERR not supported CONFIG parameter %s\r\n",
9568 (char*)c->argv[2]->ptr));
9569 decrRefCount(o);
9570 return;
9571 }
9572 decrRefCount(o);
9573 addReply(c,shared.ok);
9574 }
9575
9576 static void configGetCommand(redisClient *c) {
9577 robj *o = getDecodedObject(c->argv[2]);
9578 robj *lenobj = createObject(REDIS_STRING,NULL);
9579 char *pattern = o->ptr;
9580 int matches = 0;
9581
9582 addReply(c,lenobj);
9583 decrRefCount(lenobj);
9584
9585 if (stringmatch(pattern,"dbfilename",0)) {
9586 addReplyBulkCString(c,"dbfilename");
9587 addReplyBulkCString(c,server.dbfilename);
9588 matches++;
9589 }
9590 if (stringmatch(pattern,"requirepass",0)) {
9591 addReplyBulkCString(c,"requirepass");
9592 addReplyBulkCString(c,server.requirepass);
9593 matches++;
9594 }
9595 if (stringmatch(pattern,"masterauth",0)) {
9596 addReplyBulkCString(c,"masterauth");
9597 addReplyBulkCString(c,server.masterauth);
9598 matches++;
9599 }
9600 if (stringmatch(pattern,"maxmemory",0)) {
9601 char buf[128];
9602
9603 snprintf(buf,128,"%llu\n",server.maxmemory);
9604 addReplyBulkCString(c,"maxmemory");
9605 addReplyBulkCString(c,buf);
9606 matches++;
9607 }
9608 decrRefCount(o);
9609 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9610 }
9611
9612 static void configCommand(redisClient *c) {
9613 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9614 if (c->argc != 4) goto badarity;
9615 configSetCommand(c);
9616 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9617 if (c->argc != 3) goto badarity;
9618 configGetCommand(c);
9619 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9620 if (c->argc != 2) goto badarity;
9621 server.stat_numcommands = 0;
9622 server.stat_numconnections = 0;
9623 server.stat_expiredkeys = 0;
9624 server.stat_starttime = time(NULL);
9625 addReply(c,shared.ok);
9626 } else {
9627 addReplySds(c,sdscatprintf(sdsempty(),
9628 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9629 }
9630 return;
9631
9632 badarity:
9633 addReplySds(c,sdscatprintf(sdsempty(),
9634 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9635 (char*) c->argv[1]->ptr));
9636 }
9637
9638 /* =========================== Pubsub implementation ======================== */
9639
9640 static void freePubsubPattern(void *p) {
9641 pubsubPattern *pat = p;
9642
9643 decrRefCount(pat->pattern);
9644 zfree(pat);
9645 }
9646
9647 static int listMatchPubsubPattern(void *a, void *b) {
9648 pubsubPattern *pa = a, *pb = b;
9649
9650 return (pa->client == pb->client) &&
9651 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9652 }
9653
9654 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9655 * 0 if the client was already subscribed to that channel. */
9656 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9657 struct dictEntry *de;
9658 list *clients = NULL;
9659 int retval = 0;
9660
9661 /* Add the channel to the client -> channels hash table */
9662 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9663 retval = 1;
9664 incrRefCount(channel);
9665 /* Add the client to the channel -> list of clients hash table */
9666 de = dictFind(server.pubsub_channels,channel);
9667 if (de == NULL) {
9668 clients = listCreate();
9669 dictAdd(server.pubsub_channels,channel,clients);
9670 incrRefCount(channel);
9671 } else {
9672 clients = dictGetEntryVal(de);
9673 }
9674 listAddNodeTail(clients,c);
9675 }
9676 /* Notify the client */
9677 addReply(c,shared.mbulk3);
9678 addReply(c,shared.subscribebulk);
9679 addReplyBulk(c,channel);
9680 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9681 return retval;
9682 }
9683
9684 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9685 * 0 if the client was not subscribed to the specified channel. */
9686 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9687 struct dictEntry *de;
9688 list *clients;
9689 listNode *ln;
9690 int retval = 0;
9691
9692 /* Remove the channel from the client -> channels hash table */
9693 incrRefCount(channel); /* channel may be just a pointer to the same object
9694 we have in the hash tables. Protect it... */
9695 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9696 retval = 1;
9697 /* Remove the client from the channel -> clients list hash table */
9698 de = dictFind(server.pubsub_channels,channel);
9699 assert(de != NULL);
9700 clients = dictGetEntryVal(de);
9701 ln = listSearchKey(clients,c);
9702 assert(ln != NULL);
9703 listDelNode(clients,ln);
9704 if (listLength(clients) == 0) {
9705 /* Free the list and associated hash entry at all if this was
9706 * the latest client, so that it will be possible to abuse
9707 * Redis PUBSUB creating millions of channels. */
9708 dictDelete(server.pubsub_channels,channel);
9709 }
9710 }
9711 /* Notify the client */
9712 if (notify) {
9713 addReply(c,shared.mbulk3);
9714 addReply(c,shared.unsubscribebulk);
9715 addReplyBulk(c,channel);
9716 addReplyLong(c,dictSize(c->pubsub_channels)+
9717 listLength(c->pubsub_patterns));
9718
9719 }
9720 decrRefCount(channel); /* it is finally safe to release it */
9721 return retval;
9722 }
9723
9724 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9725 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9726 int retval = 0;
9727
9728 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9729 retval = 1;
9730 pubsubPattern *pat;
9731 listAddNodeTail(c->pubsub_patterns,pattern);
9732 incrRefCount(pattern);
9733 pat = zmalloc(sizeof(*pat));
9734 pat->pattern = getDecodedObject(pattern);
9735 pat->client = c;
9736 listAddNodeTail(server.pubsub_patterns,pat);
9737 }
9738 /* Notify the client */
9739 addReply(c,shared.mbulk3);
9740 addReply(c,shared.psubscribebulk);
9741 addReplyBulk(c,pattern);
9742 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9743 return retval;
9744 }
9745
9746 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9747 * 0 if the client was not subscribed to the specified channel. */
9748 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9749 listNode *ln;
9750 pubsubPattern pat;
9751 int retval = 0;
9752
9753 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9754 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9755 retval = 1;
9756 listDelNode(c->pubsub_patterns,ln);
9757 pat.client = c;
9758 pat.pattern = pattern;
9759 ln = listSearchKey(server.pubsub_patterns,&pat);
9760 listDelNode(server.pubsub_patterns,ln);
9761 }
9762 /* Notify the client */
9763 if (notify) {
9764 addReply(c,shared.mbulk3);
9765 addReply(c,shared.punsubscribebulk);
9766 addReplyBulk(c,pattern);
9767 addReplyLong(c,dictSize(c->pubsub_channels)+
9768 listLength(c->pubsub_patterns));
9769 }
9770 decrRefCount(pattern);
9771 return retval;
9772 }
9773
9774 /* Unsubscribe from all the channels. Return the number of channels the
9775 * client was subscribed from. */
9776 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9777 dictIterator *di = dictGetIterator(c->pubsub_channels);
9778 dictEntry *de;
9779 int count = 0;
9780
9781 while((de = dictNext(di)) != NULL) {
9782 robj *channel = dictGetEntryKey(de);
9783
9784 count += pubsubUnsubscribeChannel(c,channel,notify);
9785 }
9786 dictReleaseIterator(di);
9787 return count;
9788 }
9789
9790 /* Unsubscribe from all the patterns. Return the number of patterns the
9791 * client was subscribed from. */
9792 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9793 listNode *ln;
9794 listIter li;
9795 int count = 0;
9796
9797 listRewind(c->pubsub_patterns,&li);
9798 while ((ln = listNext(&li)) != NULL) {
9799 robj *pattern = ln->value;
9800
9801 count += pubsubUnsubscribePattern(c,pattern,notify);
9802 }
9803 return count;
9804 }
9805
9806 /* Publish a message */
9807 static int pubsubPublishMessage(robj *channel, robj *message) {
9808 int receivers = 0;
9809 struct dictEntry *de;
9810 listNode *ln;
9811 listIter li;
9812
9813 /* Send to clients listening for that channel */
9814 de = dictFind(server.pubsub_channels,channel);
9815 if (de) {
9816 list *list = dictGetEntryVal(de);
9817 listNode *ln;
9818 listIter li;
9819
9820 listRewind(list,&li);
9821 while ((ln = listNext(&li)) != NULL) {
9822 redisClient *c = ln->value;
9823
9824 addReply(c,shared.mbulk3);
9825 addReply(c,shared.messagebulk);
9826 addReplyBulk(c,channel);
9827 addReplyBulk(c,message);
9828 receivers++;
9829 }
9830 }
9831 /* Send to clients listening to matching channels */
9832 if (listLength(server.pubsub_patterns)) {
9833 listRewind(server.pubsub_patterns,&li);
9834 channel = getDecodedObject(channel);
9835 while ((ln = listNext(&li)) != NULL) {
9836 pubsubPattern *pat = ln->value;
9837
9838 if (stringmatchlen((char*)pat->pattern->ptr,
9839 sdslen(pat->pattern->ptr),
9840 (char*)channel->ptr,
9841 sdslen(channel->ptr),0)) {
9842 addReply(pat->client,shared.mbulk4);
9843 addReply(pat->client,shared.pmessagebulk);
9844 addReplyBulk(pat->client,pat->pattern);
9845 addReplyBulk(pat->client,channel);
9846 addReplyBulk(pat->client,message);
9847 receivers++;
9848 }
9849 }
9850 decrRefCount(channel);
9851 }
9852 return receivers;
9853 }
9854
9855 static void subscribeCommand(redisClient *c) {
9856 int j;
9857
9858 for (j = 1; j < c->argc; j++)
9859 pubsubSubscribeChannel(c,c->argv[j]);
9860 }
9861
9862 static void unsubscribeCommand(redisClient *c) {
9863 if (c->argc == 1) {
9864 pubsubUnsubscribeAllChannels(c,1);
9865 return;
9866 } else {
9867 int j;
9868
9869 for (j = 1; j < c->argc; j++)
9870 pubsubUnsubscribeChannel(c,c->argv[j],1);
9871 }
9872 }
9873
9874 static void psubscribeCommand(redisClient *c) {
9875 int j;
9876
9877 for (j = 1; j < c->argc; j++)
9878 pubsubSubscribePattern(c,c->argv[j]);
9879 }
9880
9881 static void punsubscribeCommand(redisClient *c) {
9882 if (c->argc == 1) {
9883 pubsubUnsubscribeAllPatterns(c,1);
9884 return;
9885 } else {
9886 int j;
9887
9888 for (j = 1; j < c->argc; j++)
9889 pubsubUnsubscribePattern(c,c->argv[j],1);
9890 }
9891 }
9892
9893 static void publishCommand(redisClient *c) {
9894 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9895 addReplyLong(c,receivers);
9896 }
9897
9898 /* ================================= Debugging ============================== */
9899
9900 static void debugCommand(redisClient *c) {
9901 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9902 *((char*)-1) = 'x';
9903 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9904 if (rdbSave(server.dbfilename) != REDIS_OK) {
9905 addReply(c,shared.err);
9906 return;
9907 }
9908 emptyDb();
9909 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9910 addReply(c,shared.err);
9911 return;
9912 }
9913 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9914 addReply(c,shared.ok);
9915 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9916 emptyDb();
9917 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9918 addReply(c,shared.err);
9919 return;
9920 }
9921 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9922 addReply(c,shared.ok);
9923 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9924 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9925 robj *key, *val;
9926
9927 if (!de) {
9928 addReply(c,shared.nokeyerr);
9929 return;
9930 }
9931 key = dictGetEntryKey(de);
9932 val = dictGetEntryVal(de);
9933 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9934 key->storage == REDIS_VM_SWAPPING)) {
9935 char *strenc;
9936 char buf[128];
9937
9938 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9939 strenc = strencoding[val->encoding];
9940 } else {
9941 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9942 strenc = buf;
9943 }
9944 addReplySds(c,sdscatprintf(sdsempty(),
9945 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9946 "encoding:%s serializedlength:%lld\r\n",
9947 (void*)key, key->refcount, (void*)val, val->refcount,
9948 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9949 } else {
9950 addReplySds(c,sdscatprintf(sdsempty(),
9951 "+Key at:%p refcount:%d, value swapped at: page %llu "
9952 "using %llu pages\r\n",
9953 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9954 (unsigned long long) key->vm.usedpages));
9955 }
9956 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9957 lookupKeyRead(c->db,c->argv[2]);
9958 addReply(c,shared.ok);
9959 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9960 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9961 robj *key, *val;
9962
9963 if (!server.vm_enabled) {
9964 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9965 return;
9966 }
9967 if (!de) {
9968 addReply(c,shared.nokeyerr);
9969 return;
9970 }
9971 key = dictGetEntryKey(de);
9972 val = dictGetEntryVal(de);
9973 /* If the key is shared we want to create a copy */
9974 if (key->refcount > 1) {
9975 robj *newkey = dupStringObject(key);
9976 decrRefCount(key);
9977 key = dictGetEntryKey(de) = newkey;
9978 }
9979 /* Swap it */
9980 if (key->storage != REDIS_VM_MEMORY) {
9981 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9982 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9983 dictGetEntryVal(de) = NULL;
9984 addReply(c,shared.ok);
9985 } else {
9986 addReply(c,shared.err);
9987 }
9988 } else {
9989 addReplySds(c,sdsnew(
9990 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9991 }
9992 }
9993
9994 static void _redisAssert(char *estr, char *file, int line) {
9995 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9996 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9997 #ifdef HAVE_BACKTRACE
9998 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9999 *((char*)-1) = 'x';
10000 #endif
10001 }
10002
10003 static void _redisPanic(char *msg, char *file, int line) {
10004 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10005 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10006 #ifdef HAVE_BACKTRACE
10007 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10008 *((char*)-1) = 'x';
10009 #endif
10010 }
10011
10012 /* =================================== Main! ================================ */
10013
10014 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file cannot be opened or
 * nothing can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10028
10029 void linuxOvercommitMemoryWarning(void) {
10030 if (linuxOvercommitMemoryValue() == 0) {
10031 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10032 }
10033 }
10034 #endif /* __linux__ */
10035
10036 static void daemonize(void) {
10037 int fd;
10038 FILE *fp;
10039
10040 if (fork() != 0) exit(0); /* parent exits */
10041 setsid(); /* create a new session */
10042
10043 /* Every output goes to /dev/null. If Redis is daemonized but
10044 * the 'logfile' is set to 'stdout' in the configuration file
10045 * it will not log at all. */
10046 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10047 dup2(fd, STDIN_FILENO);
10048 dup2(fd, STDOUT_FILENO);
10049 dup2(fd, STDERR_FILENO);
10050 if (fd > STDERR_FILENO) close(fd);
10051 }
10052 /* Try to write the pid file */
10053 fp = fopen(server.pidfile,"w");
10054 if (fp) {
10055 fprintf(fp,"%d\n",getpid());
10056 fclose(fp);
10057 }
10058 }
10059
10060 static void version() {
10061 printf("Redis server version %s\n", REDIS_VERSION);
10062 exit(0);
10063 }
10064
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10070
10071 int main(int argc, char **argv) {
10072 time_t start;
10073
10074 initServerConfig();
10075 if (argc == 2) {
10076 if (strcmp(argv[1], "-v") == 0 ||
10077 strcmp(argv[1], "--version") == 0) version();
10078 if (strcmp(argv[1], "--help") == 0) usage();
10079 resetServerSaveParams();
10080 loadServerConfig(argv[1]);
10081 } else if ((argc > 2)) {
10082 usage();
10083 } else {
10084 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10085 }
10086 if (server.daemonize) daemonize();
10087 initServer();
10088 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10089 #ifdef __linux__
10090 linuxOvercommitMemoryWarning();
10091 #endif
10092 start = time(NULL);
10093 if (server.appendonly) {
10094 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10095 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10096 } else {
10097 if (rdbLoad(server.dbfilename) == REDIS_OK)
10098 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10099 }
10100 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10101 aeSetBeforeSleepProc(server.el,beforeSleep);
10102 aeMain(server.el);
10103 aeDeleteEventLoop(server.el);
10104 return 0;
10105 }
10106
10107 /* ============================= Backtrace support ========================= */
10108
10109 #ifdef HAVE_BACKTRACE
10110 static char *findFuncName(void *pointer, unsigned long *offset);
10111
10112 static void *getMcontextEip(ucontext_t *uc) {
10113 #if defined(__FreeBSD__)
10114 return (void*) uc->uc_mcontext.mc_eip;
10115 #elif defined(__dietlibc__)
10116 return (void*) uc->uc_mcontext.eip;
10117 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10118 #if __x86_64__
10119 return (void*) uc->uc_mcontext->__ss.__rip;
10120 #else
10121 return (void*) uc->uc_mcontext->__ss.__eip;
10122 #endif
10123 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10124 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10125 return (void*) uc->uc_mcontext->__ss.__rip;
10126 #else
10127 return (void*) uc->uc_mcontext->__ss.__eip;
10128 #endif
10129 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10130 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10131 #elif defined(__ia64__) /* Linux IA64 */
10132 return (void*) uc->uc_mcontext.sc_ip;
10133 #else
10134 return NULL;
10135 #endif
10136 }
10137
10138 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10139 void *trace[100];
10140 char **messages = NULL;
10141 int i, trace_size = 0;
10142 unsigned long offset=0;
10143 ucontext_t *uc = (ucontext_t*) secret;
10144 sds infostring;
10145 REDIS_NOTUSED(info);
10146
10147 redisLog(REDIS_WARNING,
10148 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10149 infostring = genRedisInfoString();
10150 redisLog(REDIS_WARNING, "%s",infostring);
10151 /* It's not safe to sdsfree() the returned string under memory
10152 * corruption conditions. Let it leak as we are going to abort */
10153
10154 trace_size = backtrace(trace, 100);
10155 /* overwrite sigaction with caller's address */
10156 if (getMcontextEip(uc) != NULL) {
10157 trace[1] = getMcontextEip(uc);
10158 }
10159 messages = backtrace_symbols(trace, trace_size);
10160
10161 for (i=1; i<trace_size; ++i) {
10162 char *fn = findFuncName(trace[i], &offset), *p;
10163
10164 p = strchr(messages[i],'+');
10165 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10166 redisLog(REDIS_WARNING,"%s", messages[i]);
10167 } else {
10168 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10169 }
10170 }
10171 /* free(messages); Don't call free() with possibly corrupted memory. */
10172 _exit(0);
10173 }
10174
10175 static void setupSigSegvAction(void) {
10176 struct sigaction act;
10177
10178 sigemptyset (&act.sa_mask);
10179 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10180 * is used. Otherwise, sa_handler is used */
10181 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10182 act.sa_sigaction = segvHandler;
10183 sigaction (SIGSEGV, &act, NULL);
10184 sigaction (SIGBUS, &act, NULL);
10185 sigaction (SIGFPE, &act, NULL);
10186 sigaction (SIGILL, &act, NULL);
10187 sigaction (SIGBUS, &act, NULL);
10188 return;
10189 }
10190
10191 #include "staticsymbols.h"
10192 /* This function try to convert a pointer into a function name. It's used in
10193 * oreder to provide a backtrace under segmentation fault that's able to
10194 * display functions declared as static (otherwise the backtrace is useless). */
10195 static char *findFuncName(void *pointer, unsigned long *offset){
10196 int i, ret = -1;
10197 unsigned long off, minoff = 0;
10198
10199 /* Try to match against the Symbol with the smallest offset */
10200 for (i=0; symsTable[i].pointer; i++) {
10201 unsigned long lp = (unsigned long) pointer;
10202
10203 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10204 off=lp-symsTable[i].pointer;
10205 if (ret < 0 || off < minoff) {
10206 minoff=off;
10207 ret=i;
10208 }
10209 }
10210 }
10211 if (ret == -1) return NULL;
10212 *offset = minoff;
10213 return symsTable[ret].name;
10214 }
10215 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no handler to install. */
static void setupSigSegvAction(void) {
    /* intentionally empty */
}
10218 #endif /* HAVE_BACKTRACE */
10219
10220
10221
10222 /* The End */
10223
10224
10225