]> git.saurik.com Git - redis.git/blob - redis.c
Sharing of small integer objects: may save a lot of memory with datasets having many...
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Object encodings. Some kinds of objects, like Strings and Hashes, can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130
/* Printable names of the object encodings. The position of every string
 * is the numeric value of the REDIS_ENCODING_* constant it describes. */
static char* strencoding[] = {
    "raw",          /* REDIS_ENCODING_RAW */
    "int",          /* REDIS_ENCODING_INT */
    "zipmap",       /* REDIS_ENCODING_ZIPMAP */
    "hashtable"     /* REDIS_ENCODING_HT */
};
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
166
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* We are able to print the stack trace, so the assertion is routed through
 * our own reporting function: when the expression is false _redisAssert()
 * receives the expression text, the file name and the line number, and the
 * process is then terminated with _exit(1). */
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
static void _redisAssert(char *estr, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure: where an object is stored in the swap file and
 * when it was last accessed.
 * NOTE(review): the trailing declarator also defines an object named 'vm'
 * of this type; it is kept as-is, confirm whether it is intended. */
struct redisObjectVM {
    off_t page;         /* The page at which the object is stored on disk */
    off_t usedpages;    /* Number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type;
257 unsigned char encoding;
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85, introduced exactly in this way.
 *
 * _var is the object (an lvalue) to fill in, _ptr is the value stored in
 * its 'ptr' field. The 'storage' field is only written when
 * server.vm_enabled is true.
 *
 * Both arguments are parenthesized in the expansion, so expressions such
 * as *p can be passed, and the expansion does not end with a semicolon,
 * so "initStaticStringObject(o,p);" is exactly one statement and can be
 * used as the body of an unbraced if/else. _var is expanded several
 * times: do not pass an expression with side effects. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* One save point: a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c);
452 struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* A name paired with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
/* The skiplist itself: header and tail node pointers, a length and a level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects */
507
508 #define REDIS_SHARED_INTEGERS 10000
509 struct sharedObjectsStruct {
510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
511 *colon, *nullbulk, *nullmultibulk, *queued,
512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
514 *select0, *select1, *select2, *select3, *select4,
515 *select5, *select6, *select7, *select8, *select9,
516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
518 } shared;
519
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
525
526 /* VM threaded I/O request message */
527 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
530 typedef struct iojob {
531 int type; /* Request type, REDIS_IOJOB_* */
532 redisDb *db;/* Redis database */
533 robj *key; /* This I/O request is about swapping this key */
534 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
535 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540 } iojob;
541
542 /*================================ Prototypes =============================== */
543
544 static void freeStringObject(robj *o);
545 static void freeListObject(robj *o);
546 static void freeSetObject(robj *o);
547 static void decrRefCount(void *o);
548 static robj *createObject(int type, void *ptr);
549 static void freeClient(redisClient *c);
550 static int rdbLoad(char *filename);
551 static void addReply(redisClient *c, robj *obj);
552 static void addReplySds(redisClient *c, sds s);
553 static void incrRefCount(robj *o);
554 static int rdbSaveBackground(char *filename);
555 static robj *createStringObject(char *ptr, size_t len);
556 static robj *dupStringObject(robj *o);
557 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
558 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
559 static int syncWithMaster(void);
560 static robj *tryObjectEncoding(robj *o);
561 static robj *getDecodedObject(robj *o);
562 static int removeExpire(redisDb *db, robj *key);
563 static int expireIfNeeded(redisDb *db, robj *key);
564 static int deleteIfVolatile(redisDb *db, robj *key);
565 static int deleteIfSwapped(redisDb *db, robj *key);
566 static int deleteKey(redisDb *db, robj *key);
567 static time_t getExpire(redisDb *db, robj *key);
568 static int setExpire(redisDb *db, robj *key, time_t when);
569 static void updateSlavesWaitingBgsave(int bgsaveerr);
570 static void freeMemoryIfNeeded(void);
571 static int processCommand(redisClient *c);
572 static void setupSigSegvAction(void);
573 static void rdbRemoveTempFile(pid_t childpid);
574 static void aofRemoveTempFile(pid_t childpid);
575 static size_t stringObjectLen(robj *o);
576 static void processInputBuffer(redisClient *c);
577 static zskiplist *zslCreate(void);
578 static void zslFree(zskiplist *zsl);
579 static void zslInsert(zskiplist *zsl, double score, robj *obj);
580 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
581 static void initClientMultiState(redisClient *c);
582 static void freeClientMultiState(redisClient *c);
583 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
584 static void unblockClientWaitingData(redisClient *c);
585 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
586 static void vmInit(void);
587 static void vmMarkPagesFree(off_t page, off_t count);
588 static robj *vmLoadObject(robj *key);
589 static robj *vmPreviewObject(robj *key);
590 static int vmSwapOneObjectBlocking(void);
591 static int vmSwapOneObjectThreaded(void);
592 static int vmCanSwapOut(void);
593 static int tryFreeOneObjectFromFreelist(void);
594 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596 static void vmCancelThreadedIOJob(robj *o);
597 static void lockThreadedIO(void);
598 static void unlockThreadedIO(void);
599 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600 static void freeIOJob(iojob *j);
601 static void queueIOJob(iojob *j);
602 static int vmWriteObjectOnSwap(robj *o, off_t page);
603 static robj *vmReadObjectFromSwap(off_t page, int type);
604 static void waitEmptyIOJobsQueue(void);
605 static void vmReopenSwapFile(void);
606 static int vmFreePage(off_t page);
607 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
608 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609 static int dontWaitForSwappedKey(redisClient *c, robj *key);
610 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612 static struct redisCommand *lookupCommand(char *name);
613 static void call(redisClient *c, struct redisCommand *cmd);
614 static void resetClient(redisClient *c);
615 static void convertToRealHash(robj *o);
616 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618 static void freePubsubPattern(void *p);
619 static int listMatchPubsubPattern(void *a, void *b);
620 static int compareStringObjects(robj *a, robj *b);
/* Declared with (void): in C an empty parameter list is not a prototype
 * and would leave calls to usage() without argument checking. */
static void usage(void);
622
623 static void authCommand(redisClient *c);
624 static void pingCommand(redisClient *c);
625 static void echoCommand(redisClient *c);
626 static void setCommand(redisClient *c);
627 static void setnxCommand(redisClient *c);
628 static void getCommand(redisClient *c);
629 static void delCommand(redisClient *c);
630 static void existsCommand(redisClient *c);
631 static void incrCommand(redisClient *c);
632 static void decrCommand(redisClient *c);
633 static void incrbyCommand(redisClient *c);
634 static void decrbyCommand(redisClient *c);
635 static void selectCommand(redisClient *c);
636 static void randomkeyCommand(redisClient *c);
637 static void keysCommand(redisClient *c);
638 static void dbsizeCommand(redisClient *c);
639 static void lastsaveCommand(redisClient *c);
640 static void saveCommand(redisClient *c);
641 static void bgsaveCommand(redisClient *c);
642 static void bgrewriteaofCommand(redisClient *c);
643 static void shutdownCommand(redisClient *c);
644 static void moveCommand(redisClient *c);
645 static void renameCommand(redisClient *c);
646 static void renamenxCommand(redisClient *c);
647 static void lpushCommand(redisClient *c);
648 static void rpushCommand(redisClient *c);
649 static void lpopCommand(redisClient *c);
650 static void rpopCommand(redisClient *c);
651 static void llenCommand(redisClient *c);
652 static void lindexCommand(redisClient *c);
653 static void lrangeCommand(redisClient *c);
654 static void ltrimCommand(redisClient *c);
655 static void typeCommand(redisClient *c);
656 static void lsetCommand(redisClient *c);
657 static void saddCommand(redisClient *c);
658 static void sremCommand(redisClient *c);
659 static void smoveCommand(redisClient *c);
660 static void sismemberCommand(redisClient *c);
661 static void scardCommand(redisClient *c);
662 static void spopCommand(redisClient *c);
663 static void srandmemberCommand(redisClient *c);
664 static void sinterCommand(redisClient *c);
665 static void sinterstoreCommand(redisClient *c);
666 static void sunionCommand(redisClient *c);
667 static void sunionstoreCommand(redisClient *c);
668 static void sdiffCommand(redisClient *c);
669 static void sdiffstoreCommand(redisClient *c);
670 static void syncCommand(redisClient *c);
671 static void flushdbCommand(redisClient *c);
672 static void flushallCommand(redisClient *c);
673 static void sortCommand(redisClient *c);
674 static void lremCommand(redisClient *c);
675 static void rpoplpushcommand(redisClient *c);
676 static void infoCommand(redisClient *c);
677 static void mgetCommand(redisClient *c);
678 static void monitorCommand(redisClient *c);
679 static void expireCommand(redisClient *c);
680 static void expireatCommand(redisClient *c);
681 static void getsetCommand(redisClient *c);
682 static void ttlCommand(redisClient *c);
683 static void slaveofCommand(redisClient *c);
684 static void debugCommand(redisClient *c);
685 static void msetCommand(redisClient *c);
686 static void msetnxCommand(redisClient *c);
687 static void zaddCommand(redisClient *c);
688 static void zincrbyCommand(redisClient *c);
689 static void zrangeCommand(redisClient *c);
690 static void zrangebyscoreCommand(redisClient *c);
691 static void zcountCommand(redisClient *c);
692 static void zrevrangeCommand(redisClient *c);
693 static void zcardCommand(redisClient *c);
694 static void zremCommand(redisClient *c);
695 static void zscoreCommand(redisClient *c);
696 static void zremrangebyscoreCommand(redisClient *c);
697 static void multiCommand(redisClient *c);
698 static void execCommand(redisClient *c);
699 static void discardCommand(redisClient *c);
700 static void blpopCommand(redisClient *c);
701 static void brpopCommand(redisClient *c);
702 static void appendCommand(redisClient *c);
703 static void substrCommand(redisClient *c);
704 static void zrankCommand(redisClient *c);
705 static void zrevrankCommand(redisClient *c);
706 static void hsetCommand(redisClient *c);
707 static void hgetCommand(redisClient *c);
708 static void hdelCommand(redisClient *c);
709 static void hlenCommand(redisClient *c);
710 static void zremrangebyrankCommand(redisClient *c);
711 static void zunionCommand(redisClient *c);
712 static void zinterCommand(redisClient *c);
713 static void hkeysCommand(redisClient *c);
714 static void hvalsCommand(redisClient *c);
715 static void hgetallCommand(redisClient *c);
716 static void hexistsCommand(redisClient *c);
717 static void configCommand(redisClient *c);
718 static void hincrbyCommand(redisClient *c);
719 static void subscribeCommand(redisClient *c);
720 static void unsubscribeCommand(redisClient *c);
721 static void psubscribeCommand(redisClient *c);
722 static void punsubscribeCommand(redisClient *c);
723 static void publishCommand(redisClient *c);
724
725 /*================================= Globals ================================= */
726
727 /* Global vars */
728 static struct redisServer server; /* server global state */
729 static struct redisCommand cmdTable[] = {
730 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
731 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
732 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
733 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
734 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
736 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
738 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
739 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
740 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
752 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
753 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
754 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
755 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
756 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
761 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
762 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
770 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
773 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
774 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
781 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
784 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
785 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
791 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
795 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
796 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
808 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
814 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
816 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
821 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
827 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
832 {NULL,NULL,0,0,NULL,0,0,0}
833 };
834
835 /*============================ Utility functions ============================ */
836
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of bytes (including the empty one)
 *   ?        matches exactly one byte
 *   [abc]    matches one byte out of the listed set; a leading '^' negates
 *            the set, 'a-z' denotes a range and '\' quotes the next byte
 *   \x       matches the byte x literally
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             when non-zero, bytes are compared after tolower().
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer is read past its stated length, and any construct that has
 * to consume a byte fails cleanly when the subject is already exhausted. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*': they are equivalent to a single one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one subject byte. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a subject byte to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: trailing '*' can still match nothing. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
959
/* Convenience wrapper around stringmatchlen() for NUL-terminated inputs:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
963
964 static void redisLog(int level, const char *fmt, ...) {
965 va_list ap;
966 FILE *fp;
967
968 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
969 if (!fp) return;
970
971 va_start(ap, fmt);
972 if (level >= server.verbosity) {
973 char *c = ".-*#";
974 char buf[64];
975 time_t now;
976
977 now = time(NULL);
978 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
979 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
980 vfprintf(fp, fmt, ap);
981 fprintf(fp,"\n");
982 fflush(fp);
983 }
984 va_end(ap);
985
986 if (server.logfile) fclose(fp);
987 }
988
989 /*====================== Hash table type implementation ==================== */
990
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
994
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1000
1001 static void dictListDestructor(void *privdata, void *val)
1002 {
1003 DICT_NOTUSED(privdata);
1004 listRelease((list*)val);
1005 }
1006
1007 static int sdsDictKeyCompare(void *privdata, const void *key1,
1008 const void *key2)
1009 {
1010 int l1,l2;
1011 DICT_NOTUSED(privdata);
1012
1013 l1 = sdslen((sds)key1);
1014 l2 = sdslen((sds)key2);
1015 if (l1 != l2) return 0;
1016 return memcmp(key1, key2, l1) == 0;
1017 }
1018
/* Dict destructor callback for redis objects: drops one reference.
 * A NULL value is skipped, since values of swapped out keys are set
 * to NULL. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
1026
1027 static int dictObjKeyCompare(void *privdata, const void *key1,
1028 const void *key2)
1029 {
1030 const robj *o1 = key1, *o2 = key2;
1031 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1032 }
1033
1034 static unsigned int dictObjHash(const void *key) {
1035 const robj *o = key;
1036 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1037 }
1038
1039 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1040 const void *key2)
1041 {
1042 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1043 int cmp;
1044
1045 if (o1->encoding == REDIS_ENCODING_INT &&
1046 o2->encoding == REDIS_ENCODING_INT &&
1047 o1->ptr == o2->ptr) return 1;
1048
1049 o1 = getDecodedObject(o1);
1050 o2 = getDecodedObject(o2);
1051 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1052 decrRefCount(o1);
1053 decrRefCount(o2);
1054 return cmp;
1055 }
1056
1057 static unsigned int dictEncObjHash(const void *key) {
1058 robj *o = (robj*) key;
1059
1060 if (o->encoding == REDIS_ENCODING_RAW) {
1061 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1062 } else {
1063 if (o->encoding == REDIS_ENCODING_INT) {
1064 char buf[32];
1065 int len;
1066
1067 len = snprintf(buf,32,"%ld",(long)o->ptr);
1068 return dictGenHashFunction((unsigned char*)buf, len);
1069 } else {
1070 unsigned int hash;
1071
1072 o = getDecodedObject(o);
1073 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1074 decrRefCount(o);
1075 return hash;
1076 }
1077 }
1078 }
1079
1080 /* Sets type and expires */
1081 static dictType setDictType = {
1082 dictEncObjHash, /* hash function */
1083 NULL, /* key dup */
1084 NULL, /* val dup */
1085 dictEncObjKeyCompare, /* key compare */
1086 dictRedisObjectDestructor, /* key destructor */
1087 NULL /* val destructor */
1088 };
1089
1090 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1091 static dictType zsetDictType = {
1092 dictEncObjHash, /* hash function */
1093 NULL, /* key dup */
1094 NULL, /* val dup */
1095 dictEncObjKeyCompare, /* key compare */
1096 dictRedisObjectDestructor, /* key destructor */
1097 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1098 };
1099
1100 /* Db->dict */
1101 static dictType dbDictType = {
1102 dictObjHash, /* hash function */
1103 NULL, /* key dup */
1104 NULL, /* val dup */
1105 dictObjKeyCompare, /* key compare */
1106 dictRedisObjectDestructor, /* key destructor */
1107 dictRedisObjectDestructor /* val destructor */
1108 };
1109
1110 /* Db->expires */
1111 static dictType keyptrDictType = {
1112 dictObjHash, /* hash function */
1113 NULL, /* key dup */
1114 NULL, /* val dup */
1115 dictObjKeyCompare, /* key compare */
1116 dictRedisObjectDestructor, /* key destructor */
1117 NULL /* val destructor */
1118 };
1119
1120 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1121 static dictType hashDictType = {
1122 dictEncObjHash, /* hash function */
1123 NULL, /* key dup */
1124 NULL, /* val dup */
1125 dictEncObjKeyCompare, /* key compare */
1126 dictRedisObjectDestructor, /* key destructor */
1127 dictRedisObjectDestructor /* val destructor */
1128 };
1129
1130 /* Keylist hash table type has unencoded redis objects as keys and
1131 * lists as values. It's used for blocking operations (BLPOP) and to
1132 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1133 static dictType keylistDictType = {
1134 dictObjHash, /* hash function */
1135 NULL, /* key dup */
1136 NULL, /* val dup */
1137 dictObjKeyCompare, /* key compare */
1138 dictRedisObjectDestructor, /* key destructor */
1139 dictListDestructor /* val destructor */
1140 };
1141
1142 static void version();
1143
1144 /* ========================= Random utility functions ======================= */
1145
1146 /* Redis generally does not try to recover from out of memory conditions
1147 * when allocating objects or strings, it is not clear if it will be possible
1148 * to report this condition to the client since the networking layer itself
1149 * is based on heap allocation for send buffers, so we simply abort.
1150 * At least the code will be simpler to read... */
1151 static void oom(const char *msg) {
1152 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1153 sleep(1);
1154 abort();
1155 }
1156
1157 /* ====================== Redis server networking stuff ===================== */
1158 static void closeTimedoutClients(void) {
1159 redisClient *c;
1160 listNode *ln;
1161 time_t now = time(NULL);
1162 listIter li;
1163
1164 listRewind(server.clients,&li);
1165 while ((ln = listNext(&li)) != NULL) {
1166 c = listNodeValue(ln);
1167 if (server.maxidletime &&
1168 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1169 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1170 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1171 listLength(c->pubsub_patterns) == 0 &&
1172 (now - c->lastinteraction > server.maxidletime))
1173 {
1174 redisLog(REDIS_VERBOSE,"Closing idle client");
1175 freeClient(c);
1176 } else if (c->flags & REDIS_BLOCKED) {
1177 if (c->blockingto != 0 && c->blockingto < now) {
1178 addReply(c,shared.nullmultibulk);
1179 unblockClientWaitingData(c);
1180 }
1181 }
1182 }
1183 }
1184
1185 static int htNeedsResize(dict *dict) {
1186 long long size, used;
1187
1188 size = dictSlots(dict);
1189 used = dictSize(dict);
1190 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1191 (used*100/size < REDIS_HT_MINFILL));
1192 }
1193
1194 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1195 * we resize the hash table to save memory */
1196 static void tryResizeHashTables(void) {
1197 int j;
1198
1199 for (j = 0; j < server.dbnum; j++) {
1200 if (htNeedsResize(server.db[j].dict)) {
1201 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1202 dictResize(server.db[j].dict);
1203 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1204 }
1205 if (htNeedsResize(server.db[j].expires))
1206 dictResize(server.db[j].expires);
1207 }
1208 }
1209
1210 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1211 void backgroundSaveDoneHandler(int statloc) {
1212 int exitcode = WEXITSTATUS(statloc);
1213 int bysignal = WIFSIGNALED(statloc);
1214
1215 if (!bysignal && exitcode == 0) {
1216 redisLog(REDIS_NOTICE,
1217 "Background saving terminated with success");
1218 server.dirty = 0;
1219 server.lastsave = time(NULL);
1220 } else if (!bysignal && exitcode != 0) {
1221 redisLog(REDIS_WARNING, "Background saving error");
1222 } else {
1223 redisLog(REDIS_WARNING,
1224 "Background saving terminated by signal %d", WTERMSIG(statloc));
1225 rdbRemoveTempFile(server.bgsavechildpid);
1226 }
1227 server.bgsavechildpid = -1;
1228 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1229 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1230 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1231 }
1232
1233 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1234 * Handle this. */
1235 void backgroundRewriteDoneHandler(int statloc) {
1236 int exitcode = WEXITSTATUS(statloc);
1237 int bysignal = WIFSIGNALED(statloc);
1238
1239 if (!bysignal && exitcode == 0) {
1240 int fd;
1241 char tmpfile[256];
1242
1243 redisLog(REDIS_NOTICE,
1244 "Background append only file rewriting terminated with success");
1245 /* Now it's time to flush the differences accumulated by the parent */
1246 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1247 fd = open(tmpfile,O_WRONLY|O_APPEND);
1248 if (fd == -1) {
1249 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1250 goto cleanup;
1251 }
1252 /* Flush our data... */
1253 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1254 (signed) sdslen(server.bgrewritebuf)) {
1255 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1256 close(fd);
1257 goto cleanup;
1258 }
1259 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1260 /* Now our work is to rename the temp file into the stable file. And
1261 * switch the file descriptor used by the server for append only. */
1262 if (rename(tmpfile,server.appendfilename) == -1) {
1263 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1264 close(fd);
1265 goto cleanup;
1266 }
1267 /* Mission completed... almost */
1268 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1269 if (server.appendfd != -1) {
1270 /* If append only is actually enabled... */
1271 close(server.appendfd);
1272 server.appendfd = fd;
1273 fsync(fd);
1274 server.appendseldb = -1; /* Make sure it will issue SELECT */
1275 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1276 } else {
1277 /* If append only is disabled we just generate a dump in this
1278 * format. Why not? */
1279 close(fd);
1280 }
1281 } else if (!bysignal && exitcode != 0) {
1282 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1283 } else {
1284 redisLog(REDIS_WARNING,
1285 "Background append only file rewriting terminated by signal %d",
1286 WTERMSIG(statloc));
1287 }
1288 cleanup:
1289 sdsfree(server.bgrewritebuf);
1290 server.bgrewritebuf = sdsempty();
1291 aofRemoveTempFile(server.bgrewritechildpid);
1292 server.bgrewritechildpid = -1;
1293 }
1294
1295 /* This function is called once a background process of some kind terminates,
1296 * as we want to avoid resizing the hash tables when there is a child in order
1297 * to play well with copy-on-write (otherwise when a resize happens lots of
1298 * memory pages are copied). The goal of this function is to update the ability
1299 * for dict.c to resize the hash tables accordingly to the fact we have o not
1300 * running childs. */
1301 static void updateDictResizePolicy(void) {
1302 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1303 dictEnableResize();
1304 else
1305 dictDisableResize();
1306 }
1307
1308 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1309 int j, loops = server.cronloops++;
1310 REDIS_NOTUSED(eventLoop);
1311 REDIS_NOTUSED(id);
1312 REDIS_NOTUSED(clientData);
1313
1314 /* We take a cached value of the unix time in the global state because
1315 * with virtual memory and aging there is to store the current time
1316 * in objects at every object access, and accuracy is not needed.
1317 * To access a global var is faster than calling time(NULL) */
1318 server.unixtime = time(NULL);
1319
1320 /* Show some info about non-empty databases */
1321 for (j = 0; j < server.dbnum; j++) {
1322 long long size, used, vkeys;
1323
1324 size = dictSlots(server.db[j].dict);
1325 used = dictSize(server.db[j].dict);
1326 vkeys = dictSize(server.db[j].expires);
1327 if (!(loops % 50) && (used || vkeys)) {
1328 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1329 /* dictPrintStats(server.dict); */
1330 }
1331 }
1332
1333 /* We don't want to resize the hash tables while a bacground saving
1334 * is in progress: the saving child is created using fork() that is
1335 * implemented with a copy-on-write semantic in most modern systems, so
1336 * if we resize the HT while there is the saving child at work actually
1337 * a lot of memory movements in the parent will cause a lot of pages
1338 * copied. */
1339 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1340 !(loops % 10))
1341 {
1342 tryResizeHashTables();
1343 }
1344
1345 /* Show information about connected clients */
1346 if (!(loops % 50)) {
1347 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1348 listLength(server.clients)-listLength(server.slaves),
1349 listLength(server.slaves),
1350 zmalloc_used_memory());
1351 }
1352
1353 /* Close connections of timedout clients */
1354 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1355 closeTimedoutClients();
1356
1357 /* Check if a background saving or AOF rewrite in progress terminated */
1358 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1359 int statloc;
1360 pid_t pid;
1361
1362 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1363 if (pid == server.bgsavechildpid) {
1364 backgroundSaveDoneHandler(statloc);
1365 } else {
1366 backgroundRewriteDoneHandler(statloc);
1367 }
1368 updateDictResizePolicy();
1369 }
1370 } else {
1371 /* If there is not a background saving in progress check if
1372 * we have to save now */
1373 time_t now = time(NULL);
1374 for (j = 0; j < server.saveparamslen; j++) {
1375 struct saveparam *sp = server.saveparams+j;
1376
1377 if (server.dirty >= sp->changes &&
1378 now-server.lastsave > sp->seconds) {
1379 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1380 sp->changes, sp->seconds);
1381 rdbSaveBackground(server.dbfilename);
1382 break;
1383 }
1384 }
1385 }
1386
1387 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1388 * will use few CPU cycles if there are few expiring keys, otherwise
1389 * it will get more aggressive to avoid that too much memory is used by
1390 * keys that can be removed from the keyspace. */
1391 for (j = 0; j < server.dbnum; j++) {
1392 int expired;
1393 redisDb *db = server.db+j;
1394
1395 /* Continue to expire if at the end of the cycle more than 25%
1396 * of the keys were expired. */
1397 do {
1398 long num = dictSize(db->expires);
1399 time_t now = time(NULL);
1400
1401 expired = 0;
1402 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1403 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1404 while (num--) {
1405 dictEntry *de;
1406 time_t t;
1407
1408 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1409 t = (time_t) dictGetEntryVal(de);
1410 if (now > t) {
1411 deleteKey(db,dictGetEntryKey(de));
1412 expired++;
1413 server.stat_expiredkeys++;
1414 }
1415 }
1416 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1417 }
1418
1419 /* Swap a few keys on disk if we are over the memory limit and VM
1420 * is enbled. Try to free objects from the free list first. */
1421 if (vmCanSwapOut()) {
1422 while (server.vm_enabled && zmalloc_used_memory() >
1423 server.vm_max_memory)
1424 {
1425 int retval;
1426
1427 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1428 retval = (server.vm_max_threads == 0) ?
1429 vmSwapOneObjectBlocking() :
1430 vmSwapOneObjectThreaded();
1431 if (retval == REDIS_ERR && !(loops % 300) &&
1432 zmalloc_used_memory() >
1433 (server.vm_max_memory+server.vm_max_memory/10))
1434 {
1435 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1436 }
1437 /* Note that when using threade I/O we free just one object,
1438 * because anyway when the I/O thread in charge to swap this
1439 * object out will finish, the handler of completed jobs
1440 * will try to swap more objects if we are still out of memory. */
1441 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1442 }
1443 }
1444
1445 /* Check if we should connect to a MASTER */
1446 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1447 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1448 if (syncWithMaster() == REDIS_OK) {
1449 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1450 }
1451 }
1452 return 100;
1453 }
1454
1455 /* This function gets called every time Redis is entering the
1456 * main loop of the event driven library, that is, before to sleep
1457 * for ready file descriptors. */
1458 static void beforeSleep(struct aeEventLoop *eventLoop) {
1459 REDIS_NOTUSED(eventLoop);
1460
1461 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1462 listIter li;
1463 listNode *ln;
1464
1465 listRewind(server.io_ready_clients,&li);
1466 while((ln = listNext(&li))) {
1467 redisClient *c = ln->value;
1468 struct redisCommand *cmd;
1469
1470 /* Resume the client. */
1471 listDelNode(server.io_ready_clients,ln);
1472 c->flags &= (~REDIS_IO_WAIT);
1473 server.vm_blocked_clients--;
1474 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1475 readQueryFromClient, c);
1476 cmd = lookupCommand(c->argv[0]->ptr);
1477 assert(cmd != NULL);
1478 call(c,cmd);
1479 resetClient(c);
1480 /* There may be more data to process in the input buffer. */
1481 if (c->querybuf && sdslen(c->querybuf) > 0)
1482 processInputBuffer(c);
1483 }
1484 }
1485 }
1486
1487 static void createSharedObjects(void) {
1488 int j;
1489
1490 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1491 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1492 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1493 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1494 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1495 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1496 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1497 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1498 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1499 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1500 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1501 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1502 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1503 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1504 "-ERR no such key\r\n"));
1505 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1506 "-ERR syntax error\r\n"));
1507 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1508 "-ERR source and destination objects are the same\r\n"));
1509 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1510 "-ERR index out of range\r\n"));
1511 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1512 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1513 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1514 shared.select0 = createStringObject("select 0\r\n",10);
1515 shared.select1 = createStringObject("select 1\r\n",10);
1516 shared.select2 = createStringObject("select 2\r\n",10);
1517 shared.select3 = createStringObject("select 3\r\n",10);
1518 shared.select4 = createStringObject("select 4\r\n",10);
1519 shared.select5 = createStringObject("select 5\r\n",10);
1520 shared.select6 = createStringObject("select 6\r\n",10);
1521 shared.select7 = createStringObject("select 7\r\n",10);
1522 shared.select8 = createStringObject("select 8\r\n",10);
1523 shared.select9 = createStringObject("select 9\r\n",10);
1524 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1525 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1526 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1527 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1528 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1529 shared.mbulk3 = createStringObject("*3\r\n",4);
1530 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1531 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1532 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1533 }
1534 }
1535
1536 static void appendServerSaveParams(time_t seconds, int changes) {
1537 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1538 server.saveparams[server.saveparamslen].seconds = seconds;
1539 server.saveparams[server.saveparamslen].changes = changes;
1540 server.saveparamslen++;
1541 }
1542
1543 static void resetServerSaveParams() {
1544 zfree(server.saveparams);
1545 server.saveparams = NULL;
1546 server.saveparamslen = 0;
1547 }
1548
1549 static void initServerConfig() {
1550 server.dbnum = REDIS_DEFAULT_DBNUM;
1551 server.port = REDIS_SERVERPORT;
1552 server.verbosity = REDIS_VERBOSE;
1553 server.maxidletime = REDIS_MAXIDLETIME;
1554 server.saveparams = NULL;
1555 server.logfile = NULL; /* NULL = log on standard output */
1556 server.bindaddr = NULL;
1557 server.glueoutputbuf = 1;
1558 server.daemonize = 0;
1559 server.appendonly = 0;
1560 server.appendfsync = APPENDFSYNC_ALWAYS;
1561 server.lastfsync = time(NULL);
1562 server.appendfd = -1;
1563 server.appendseldb = -1; /* Make sure the first time will not match */
1564 server.pidfile = zstrdup("/var/run/redis.pid");
1565 server.dbfilename = zstrdup("dump.rdb");
1566 server.appendfilename = zstrdup("appendonly.aof");
1567 server.requirepass = NULL;
1568 server.shareobjects = 0;
1569 server.rdbcompression = 1;
1570 server.maxclients = 0;
1571 server.blpop_blocked_clients = 0;
1572 server.maxmemory = 0;
1573 server.vm_enabled = 0;
1574 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1575 server.vm_page_size = 256; /* 256 bytes per page */
1576 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1577 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1578 server.vm_max_threads = 4;
1579 server.vm_blocked_clients = 0;
1580 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1581 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1582
1583 resetServerSaveParams();
1584
1585 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1586 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1587 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1588 /* Replication related */
1589 server.isslave = 0;
1590 server.masterauth = NULL;
1591 server.masterhost = NULL;
1592 server.masterport = 6379;
1593 server.master = NULL;
1594 server.replstate = REDIS_REPL_NONE;
1595
1596 /* Double constants initialization */
1597 R_Zero = 0.0;
1598 R_PosInf = 1.0/R_Zero;
1599 R_NegInf = -1.0/R_Zero;
1600 R_Nan = R_Zero/R_Zero;
1601 }
1602
1603 static void initServer() {
1604 int j;
1605
1606 signal(SIGHUP, SIG_IGN);
1607 signal(SIGPIPE, SIG_IGN);
1608 setupSigSegvAction();
1609
1610 server.devnull = fopen("/dev/null","w");
1611 if (server.devnull == NULL) {
1612 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1613 exit(1);
1614 }
1615 server.clients = listCreate();
1616 server.slaves = listCreate();
1617 server.monitors = listCreate();
1618 server.objfreelist = listCreate();
1619 createSharedObjects();
1620 server.el = aeCreateEventLoop();
1621 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1622 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1623 if (server.fd == -1) {
1624 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1625 exit(1);
1626 }
1627 for (j = 0; j < server.dbnum; j++) {
1628 server.db[j].dict = dictCreate(&dbDictType,NULL);
1629 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1630 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1631 if (server.vm_enabled)
1632 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1633 server.db[j].id = j;
1634 }
1635 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1636 server.pubsub_patterns = listCreate();
1637 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1638 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1639 server.cronloops = 0;
1640 server.bgsavechildpid = -1;
1641 server.bgrewritechildpid = -1;
1642 server.bgrewritebuf = sdsempty();
1643 server.lastsave = time(NULL);
1644 server.dirty = 0;
1645 server.stat_numcommands = 0;
1646 server.stat_numconnections = 0;
1647 server.stat_expiredkeys = 0;
1648 server.stat_starttime = time(NULL);
1649 server.unixtime = time(NULL);
1650 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1651 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1652 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1653
1654 if (server.appendonly) {
1655 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1656 if (server.appendfd == -1) {
1657 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1658 strerror(errno));
1659 exit(1);
1660 }
1661 }
1662
1663 if (server.vm_enabled) vmInit();
1664 }
1665
1666 /* Empty the whole database */
1667 static long long emptyDb() {
1668 int j;
1669 long long removed = 0;
1670
1671 for (j = 0; j < server.dbnum; j++) {
1672 removed += dictSize(server.db[j].dict);
1673 dictEmpty(server.db[j].dict);
1674 dictEmpty(server.db[j].expires);
1675 }
1676 return removed;
1677 }
1678
/* Translate a yes/no configuration argument into an integer.
 * The comparison is case insensitive: "yes" gives 1, "no" gives 0 and
 * every other string gives -1 so that the caller can report an error. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1684
1685 /* I agree, this is a very rudimental way to load a configuration...
1686 will improve later if the config gets more complex */
1687 static void loadServerConfig(char *filename) {
1688 FILE *fp;
1689 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1690 int linenum = 0;
1691 sds line = NULL;
1692 char *errormsg = "Fatal error, can't open config file '%s'";
1693 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1694 sprintf(errorbuf, errormsg, filename);
1695
1696 if (filename[0] == '-' && filename[1] == '\0')
1697 fp = stdin;
1698 else {
1699 if ((fp = fopen(filename,"r")) == NULL) {
1700 redisLog(REDIS_WARNING, errorbuf);
1701 exit(1);
1702 }
1703 }
1704
1705 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1706 sds *argv;
1707 int argc, j;
1708
1709 linenum++;
1710 line = sdsnew(buf);
1711 line = sdstrim(line," \t\r\n");
1712
1713 /* Skip comments and blank lines*/
1714 if (line[0] == '#' || line[0] == '\0') {
1715 sdsfree(line);
1716 continue;
1717 }
1718
1719 /* Split into arguments */
1720 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1721 sdstolower(argv[0]);
1722
1723 /* Execute config directives */
1724 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1725 server.maxidletime = atoi(argv[1]);
1726 if (server.maxidletime < 0) {
1727 err = "Invalid timeout value"; goto loaderr;
1728 }
1729 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1730 server.port = atoi(argv[1]);
1731 if (server.port < 1 || server.port > 65535) {
1732 err = "Invalid port"; goto loaderr;
1733 }
1734 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1735 server.bindaddr = zstrdup(argv[1]);
1736 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1737 int seconds = atoi(argv[1]);
1738 int changes = atoi(argv[2]);
1739 if (seconds < 1 || changes < 0) {
1740 err = "Invalid save parameters"; goto loaderr;
1741 }
1742 appendServerSaveParams(seconds,changes);
1743 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1744 if (chdir(argv[1]) == -1) {
1745 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1746 argv[1], strerror(errno));
1747 exit(1);
1748 }
1749 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1750 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1751 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1752 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1753 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1754 else {
1755 err = "Invalid log level. Must be one of debug, notice, warning";
1756 goto loaderr;
1757 }
1758 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1759 FILE *logfp;
1760
1761 server.logfile = zstrdup(argv[1]);
1762 if (!strcasecmp(server.logfile,"stdout")) {
1763 zfree(server.logfile);
1764 server.logfile = NULL;
1765 }
1766 if (server.logfile) {
1767 /* Test if we are able to open the file. The server will not
1768 * be able to abort just for this problem later... */
1769 logfp = fopen(server.logfile,"a");
1770 if (logfp == NULL) {
1771 err = sdscatprintf(sdsempty(),
1772 "Can't open the log file: %s", strerror(errno));
1773 goto loaderr;
1774 }
1775 fclose(logfp);
1776 }
1777 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1778 server.dbnum = atoi(argv[1]);
1779 if (server.dbnum < 1) {
1780 err = "Invalid number of databases"; goto loaderr;
1781 }
1782 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1783 loadServerConfig(argv[1]);
1784 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1785 server.maxclients = atoi(argv[1]);
1786 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1787 server.maxmemory = strtoll(argv[1], NULL, 10);
1788 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1789 server.masterhost = sdsnew(argv[1]);
1790 server.masterport = atoi(argv[2]);
1791 server.replstate = REDIS_REPL_CONNECT;
1792 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1793 server.masterauth = zstrdup(argv[1]);
1794 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1795 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1796 err = "argument must be 'yes' or 'no'"; goto loaderr;
1797 }
1798 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1799 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1800 err = "argument must be 'yes' or 'no'"; goto loaderr;
1801 }
1802 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1803 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1804 err = "argument must be 'yes' or 'no'"; goto loaderr;
1805 }
1806 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1807 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1808 err = "argument must be 'yes' or 'no'"; goto loaderr;
1809 }
1810 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1811 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1812 err = "argument must be 'yes' or 'no'"; goto loaderr;
1813 }
1814 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1815 if (!strcasecmp(argv[1],"no")) {
1816 server.appendfsync = APPENDFSYNC_NO;
1817 } else if (!strcasecmp(argv[1],"always")) {
1818 server.appendfsync = APPENDFSYNC_ALWAYS;
1819 } else if (!strcasecmp(argv[1],"everysec")) {
1820 server.appendfsync = APPENDFSYNC_EVERYSEC;
1821 } else {
1822 err = "argument must be 'no', 'always' or 'everysec'";
1823 goto loaderr;
1824 }
1825 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1826 server.requirepass = zstrdup(argv[1]);
1827 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1828 zfree(server.pidfile);
1829 server.pidfile = zstrdup(argv[1]);
1830 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1831 zfree(server.dbfilename);
1832 server.dbfilename = zstrdup(argv[1]);
1833 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1834 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1835 err = "argument must be 'yes' or 'no'"; goto loaderr;
1836 }
1837 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1838 zfree(server.vm_swap_file);
1839 server.vm_swap_file = zstrdup(argv[1]);
1840 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1841 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1842 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1843 server.vm_page_size = strtoll(argv[1], NULL, 10);
1844 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1845 server.vm_pages = strtoll(argv[1], NULL, 10);
1846 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1847 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1848 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1849 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1850 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1851 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1852 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1853 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1854 } else {
1855 err = "Bad directive or wrong number of arguments"; goto loaderr;
1856 }
1857 for (j = 0; j < argc; j++)
1858 sdsfree(argv[j]);
1859 zfree(argv);
1860 sdsfree(line);
1861 }
1862 if (fp != stdin) fclose(fp);
1863 return;
1864
1865 loaderr:
1866 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1867 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1868 fprintf(stderr, ">>> '%s'\n", line);
1869 fprintf(stderr, "%s\n", err);
1870 exit(1);
1871 }
1872
1873 static void freeClientArgv(redisClient *c) {
1874 int j;
1875
1876 for (j = 0; j < c->argc; j++)
1877 decrRefCount(c->argv[j]);
1878 for (j = 0; j < c->mbargc; j++)
1879 decrRefCount(c->mbargv[j]);
1880 c->argc = 0;
1881 c->mbargc = 0;
1882 }
1883
1884 static void freeClient(redisClient *c) {
1885 listNode *ln;
1886
1887 /* Note that if the client we are freeing is blocked into a blocking
1888 * call, we have to set querybuf to NULL *before* to call
1889 * unblockClientWaitingData() to avoid processInputBuffer() will get
1890 * called. Also it is important to remove the file events after
1891 * this, because this call adds the READABLE event. */
1892 sdsfree(c->querybuf);
1893 c->querybuf = NULL;
1894 if (c->flags & REDIS_BLOCKED)
1895 unblockClientWaitingData(c);
1896
1897 /* Unsubscribe from all the pubsub channels */
1898 pubsubUnsubscribeAllChannels(c,0);
1899 pubsubUnsubscribeAllPatterns(c,0);
1900 dictRelease(c->pubsub_channels);
1901 listRelease(c->pubsub_patterns);
1902 /* Obvious cleanup */
1903 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1904 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1905 listRelease(c->reply);
1906 freeClientArgv(c);
1907 close(c->fd);
1908 /* Remove from the list of clients */
1909 ln = listSearchKey(server.clients,c);
1910 redisAssert(ln != NULL);
1911 listDelNode(server.clients,ln);
1912 /* Remove from the list of clients waiting for swapped keys */
1913 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1914 ln = listSearchKey(server.io_ready_clients,c);
1915 if (ln) {
1916 listDelNode(server.io_ready_clients,ln);
1917 server.vm_blocked_clients--;
1918 }
1919 }
1920 while (server.vm_enabled && listLength(c->io_keys)) {
1921 ln = listFirst(c->io_keys);
1922 dontWaitForSwappedKey(c,ln->value);
1923 }
1924 listRelease(c->io_keys);
1925 /* Master/slave cleanup */
1926 if (c->flags & REDIS_SLAVE) {
1927 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1928 close(c->repldbfd);
1929 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1930 ln = listSearchKey(l,c);
1931 redisAssert(ln != NULL);
1932 listDelNode(l,ln);
1933 }
1934 if (c->flags & REDIS_MASTER) {
1935 server.master = NULL;
1936 server.replstate = REDIS_REPL_CONNECT;
1937 }
1938 /* Release memory */
1939 zfree(c->argv);
1940 zfree(c->mbargv);
1941 freeClientMultiState(c);
1942 zfree(c);
1943 }
1944
1945 #define GLUEREPLY_UP_TO (1024)
1946 static void glueReplyBuffersIfNeeded(redisClient *c) {
1947 int copylen = 0;
1948 char buf[GLUEREPLY_UP_TO];
1949 listNode *ln;
1950 listIter li;
1951 robj *o;
1952
1953 listRewind(c->reply,&li);
1954 while((ln = listNext(&li))) {
1955 int objlen;
1956
1957 o = ln->value;
1958 objlen = sdslen(o->ptr);
1959 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1960 memcpy(buf+copylen,o->ptr,objlen);
1961 copylen += objlen;
1962 listDelNode(c->reply,ln);
1963 } else {
1964 if (copylen == 0) return;
1965 break;
1966 }
1967 }
1968 /* Now the output buffer is empty, add the new single element */
1969 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1970 listAddNodeHead(c->reply,o);
1971 }
1972
1973 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1974 redisClient *c = privdata;
1975 int nwritten = 0, totwritten = 0, objlen;
1976 robj *o;
1977 REDIS_NOTUSED(el);
1978 REDIS_NOTUSED(mask);
1979
1980 /* Use writev() if we have enough buffers to send */
1981 if (!server.glueoutputbuf &&
1982 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1983 !(c->flags & REDIS_MASTER))
1984 {
1985 sendReplyToClientWritev(el, fd, privdata, mask);
1986 return;
1987 }
1988
1989 while(listLength(c->reply)) {
1990 if (server.glueoutputbuf && listLength(c->reply) > 1)
1991 glueReplyBuffersIfNeeded(c);
1992
1993 o = listNodeValue(listFirst(c->reply));
1994 objlen = sdslen(o->ptr);
1995
1996 if (objlen == 0) {
1997 listDelNode(c->reply,listFirst(c->reply));
1998 continue;
1999 }
2000
2001 if (c->flags & REDIS_MASTER) {
2002 /* Don't reply to a master */
2003 nwritten = objlen - c->sentlen;
2004 } else {
2005 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2006 if (nwritten <= 0) break;
2007 }
2008 c->sentlen += nwritten;
2009 totwritten += nwritten;
2010 /* If we fully sent the object on head go to the next one */
2011 if (c->sentlen == objlen) {
2012 listDelNode(c->reply,listFirst(c->reply));
2013 c->sentlen = 0;
2014 }
2015 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2016 * bytes, in a single threaded server it's a good idea to serve
2017 * other clients as well, even if a very large request comes from
2018 * super fast link that is always able to accept data (in real world
2019 * scenario think about 'KEYS *' against the loopback interfae) */
2020 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2021 }
2022 if (nwritten == -1) {
2023 if (errno == EAGAIN) {
2024 nwritten = 0;
2025 } else {
2026 redisLog(REDIS_VERBOSE,
2027 "Error writing to client: %s", strerror(errno));
2028 freeClient(c);
2029 return;
2030 }
2031 }
2032 if (totwritten > 0) c->lastinteraction = time(NULL);
2033 if (listLength(c->reply) == 0) {
2034 c->sentlen = 0;
2035 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2036 }
2037 }
2038
2039 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2040 {
2041 redisClient *c = privdata;
2042 int nwritten = 0, totwritten = 0, objlen, willwrite;
2043 robj *o;
2044 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2045 int offset, ion = 0;
2046 REDIS_NOTUSED(el);
2047 REDIS_NOTUSED(mask);
2048
2049 listNode *node;
2050 while (listLength(c->reply)) {
2051 offset = c->sentlen;
2052 ion = 0;
2053 willwrite = 0;
2054
2055 /* fill-in the iov[] array */
2056 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2057 o = listNodeValue(node);
2058 objlen = sdslen(o->ptr);
2059
2060 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2061 break;
2062
2063 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2064 break; /* no more iovecs */
2065
2066 iov[ion].iov_base = ((char*)o->ptr) + offset;
2067 iov[ion].iov_len = objlen - offset;
2068 willwrite += objlen - offset;
2069 offset = 0; /* just for the first item */
2070 ion++;
2071 }
2072
2073 if(willwrite == 0)
2074 break;
2075
2076 /* write all collected blocks at once */
2077 if((nwritten = writev(fd, iov, ion)) < 0) {
2078 if (errno != EAGAIN) {
2079 redisLog(REDIS_VERBOSE,
2080 "Error writing to client: %s", strerror(errno));
2081 freeClient(c);
2082 return;
2083 }
2084 break;
2085 }
2086
2087 totwritten += nwritten;
2088 offset = c->sentlen;
2089
2090 /* remove written robjs from c->reply */
2091 while (nwritten && listLength(c->reply)) {
2092 o = listNodeValue(listFirst(c->reply));
2093 objlen = sdslen(o->ptr);
2094
2095 if(nwritten >= objlen - offset) {
2096 listDelNode(c->reply, listFirst(c->reply));
2097 nwritten -= objlen - offset;
2098 c->sentlen = 0;
2099 } else {
2100 /* partial write */
2101 c->sentlen += nwritten;
2102 break;
2103 }
2104 offset = 0;
2105 }
2106 }
2107
2108 if (totwritten > 0)
2109 c->lastinteraction = time(NULL);
2110
2111 if (listLength(c->reply) == 0) {
2112 c->sentlen = 0;
2113 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2114 }
2115 }
2116
2117 static struct redisCommand *lookupCommand(char *name) {
2118 int j = 0;
2119 while(cmdTable[j].name != NULL) {
2120 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2121 j++;
2122 }
2123 return NULL;
2124 }
2125
2126 /* resetClient prepare the client to process the next command */
2127 static void resetClient(redisClient *c) {
2128 freeClientArgv(c);
2129 c->bulklen = -1;
2130 c->multibulk = 0;
2131 }
2132
2133 /* Call() is the core of Redis execution of a command */
2134 static void call(redisClient *c, struct redisCommand *cmd) {
2135 long long dirty;
2136
2137 dirty = server.dirty;
2138 cmd->proc(c);
2139 dirty = server.dirty-dirty;
2140
2141 if (server.appendonly && dirty)
2142 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2143 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2144 listLength(server.slaves))
2145 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2146 if (listLength(server.monitors))
2147 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2148 server.stat_numcommands++;
2149 }
2150
2151 /* If this function gets called we already read a whole
2152 * command, argments are in the client argv/argc fields.
2153 * processCommand() execute the command or prepare the
2154 * server for a bulk read from the client.
2155 *
2156 * If 1 is returned the client is still alive and valid and
2157 * and other operations can be performed by the caller. Otherwise
2158 * if 0 is returned the client was destroied (i.e. after QUIT). */
2159 static int processCommand(redisClient *c) {
2160 struct redisCommand *cmd;
2161
2162 /* Free some memory if needed (maxmemory setting) */
2163 if (server.maxmemory) freeMemoryIfNeeded();
2164
2165 /* Handle the multi bulk command type. This is an alternative protocol
2166 * supported by Redis in order to receive commands that are composed of
2167 * multiple binary-safe "bulk" arguments. The latency of processing is
2168 * a bit higher but this allows things like multi-sets, so if this
2169 * protocol is used only for MSET and similar commands this is a big win. */
2170 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2171 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2172 if (c->multibulk <= 0) {
2173 resetClient(c);
2174 return 1;
2175 } else {
2176 decrRefCount(c->argv[c->argc-1]);
2177 c->argc--;
2178 return 1;
2179 }
2180 } else if (c->multibulk) {
2181 if (c->bulklen == -1) {
2182 if (((char*)c->argv[0]->ptr)[0] != '$') {
2183 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2184 resetClient(c);
2185 return 1;
2186 } else {
2187 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2188 decrRefCount(c->argv[0]);
2189 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2190 c->argc--;
2191 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2192 resetClient(c);
2193 return 1;
2194 }
2195 c->argc--;
2196 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2197 return 1;
2198 }
2199 } else {
2200 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2201 c->mbargv[c->mbargc] = c->argv[0];
2202 c->mbargc++;
2203 c->argc--;
2204 c->multibulk--;
2205 if (c->multibulk == 0) {
2206 robj **auxargv;
2207 int auxargc;
2208
2209 /* Here we need to swap the multi-bulk argc/argv with the
2210 * normal argc/argv of the client structure. */
2211 auxargv = c->argv;
2212 c->argv = c->mbargv;
2213 c->mbargv = auxargv;
2214
2215 auxargc = c->argc;
2216 c->argc = c->mbargc;
2217 c->mbargc = auxargc;
2218
2219 /* We need to set bulklen to something different than -1
2220 * in order for the code below to process the command without
2221 * to try to read the last argument of a bulk command as
2222 * a special argument. */
2223 c->bulklen = 0;
2224 /* continue below and process the command */
2225 } else {
2226 c->bulklen = -1;
2227 return 1;
2228 }
2229 }
2230 }
2231 /* -- end of multi bulk commands processing -- */
2232
2233 /* The QUIT command is handled as a special case. Normal command
2234 * procs are unable to close the client connection safely */
2235 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2236 freeClient(c);
2237 return 0;
2238 }
2239
2240 /* Now lookup the command and check ASAP about trivial error conditions
2241 * such wrong arity, bad command name and so forth. */
2242 cmd = lookupCommand(c->argv[0]->ptr);
2243 if (!cmd) {
2244 addReplySds(c,
2245 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2246 (char*)c->argv[0]->ptr));
2247 resetClient(c);
2248 return 1;
2249 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2250 (c->argc < -cmd->arity)) {
2251 addReplySds(c,
2252 sdscatprintf(sdsempty(),
2253 "-ERR wrong number of arguments for '%s' command\r\n",
2254 cmd->name));
2255 resetClient(c);
2256 return 1;
2257 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2258 /* This is a bulk command, we have to read the last argument yet. */
2259 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2260
2261 decrRefCount(c->argv[c->argc-1]);
2262 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2263 c->argc--;
2264 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2265 resetClient(c);
2266 return 1;
2267 }
2268 c->argc--;
2269 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2270 /* It is possible that the bulk read is already in the
2271 * buffer. Check this condition and handle it accordingly.
2272 * This is just a fast path, alternative to call processInputBuffer().
2273 * It's a good idea since the code is small and this condition
2274 * happens most of the times. */
2275 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2276 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2277 c->argc++;
2278 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2279 } else {
2280 /* Otherwise return... there is to read the last argument
2281 * from the socket. */
2282 return 1;
2283 }
2284 }
2285 /* Let's try to encode the bulk object to save space. */
2286 if (cmd->flags & REDIS_CMD_BULK)
2287 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2288
2289 /* Check if the user is authenticated */
2290 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2291 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2292 resetClient(c);
2293 return 1;
2294 }
2295
2296 /* Handle the maxmemory directive */
2297 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2298 zmalloc_used_memory() > server.maxmemory)
2299 {
2300 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2301 resetClient(c);
2302 return 1;
2303 }
2304
2305 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2306 if (dictSize(c->pubsub_channels) > 0 &&
2307 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2308 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2309 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2310 resetClient(c);
2311 return 1;
2312 }
2313
2314 /* Exec the command */
2315 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2316 queueMultiCommand(c,cmd);
2317 addReply(c,shared.queued);
2318 } else {
2319 if (server.vm_enabled && server.vm_max_threads > 0 &&
2320 blockClientOnSwappedKeys(cmd,c)) return 1;
2321 call(c,cmd);
2322 }
2323
2324 /* Prepare the client for the next command */
2325 resetClient(c);
2326 return 1;
2327 }
2328
2329 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2330 listNode *ln;
2331 listIter li;
2332 int outc = 0, j;
2333 robj **outv;
2334 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2335 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2336 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2337 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2338 robj *lenobj;
2339
2340 if (argc <= REDIS_STATIC_ARGS) {
2341 outv = static_outv;
2342 } else {
2343 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2344 }
2345
2346 lenobj = createObject(REDIS_STRING,
2347 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2348 lenobj->refcount = 0;
2349 outv[outc++] = lenobj;
2350 for (j = 0; j < argc; j++) {
2351 lenobj = createObject(REDIS_STRING,
2352 sdscatprintf(sdsempty(),"$%lu\r\n",
2353 (unsigned long) stringObjectLen(argv[j])));
2354 lenobj->refcount = 0;
2355 outv[outc++] = lenobj;
2356 outv[outc++] = argv[j];
2357 outv[outc++] = shared.crlf;
2358 }
2359
2360 /* Increment all the refcounts at start and decrement at end in order to
2361 * be sure to free objects if there is no slave in a replication state
2362 * able to be feed with commands */
2363 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2364 listRewind(slaves,&li);
2365 while((ln = listNext(&li))) {
2366 redisClient *slave = ln->value;
2367
2368 /* Don't feed slaves that are still waiting for BGSAVE to start */
2369 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2370
2371 /* Feed all the other slaves, MONITORs and so on */
2372 if (slave->slaveseldb != dictid) {
2373 robj *selectcmd;
2374
2375 switch(dictid) {
2376 case 0: selectcmd = shared.select0; break;
2377 case 1: selectcmd = shared.select1; break;
2378 case 2: selectcmd = shared.select2; break;
2379 case 3: selectcmd = shared.select3; break;
2380 case 4: selectcmd = shared.select4; break;
2381 case 5: selectcmd = shared.select5; break;
2382 case 6: selectcmd = shared.select6; break;
2383 case 7: selectcmd = shared.select7; break;
2384 case 8: selectcmd = shared.select8; break;
2385 case 9: selectcmd = shared.select9; break;
2386 default:
2387 selectcmd = createObject(REDIS_STRING,
2388 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2389 selectcmd->refcount = 0;
2390 break;
2391 }
2392 addReply(slave,selectcmd);
2393 slave->slaveseldb = dictid;
2394 }
2395 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2396 }
2397 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2398 if (outv != static_outv) zfree(outv);
2399 }
2400
2401 static void processInputBuffer(redisClient *c) {
2402 again:
2403 /* Before to process the input buffer, make sure the client is not
2404 * waitig for a blocking operation such as BLPOP. Note that the first
2405 * iteration the client is never blocked, otherwise the processInputBuffer
2406 * would not be called at all, but after the execution of the first commands
2407 * in the input buffer the client may be blocked, and the "goto again"
2408 * will try to reiterate. The following line will make it return asap. */
2409 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2410 if (c->bulklen == -1) {
2411 /* Read the first line of the query */
2412 char *p = strchr(c->querybuf,'\n');
2413 size_t querylen;
2414
2415 if (p) {
2416 sds query, *argv;
2417 int argc, j;
2418
2419 query = c->querybuf;
2420 c->querybuf = sdsempty();
2421 querylen = 1+(p-(query));
2422 if (sdslen(query) > querylen) {
2423 /* leave data after the first line of the query in the buffer */
2424 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2425 }
2426 *p = '\0'; /* remove "\n" */
2427 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2428 sdsupdatelen(query);
2429
2430 /* Now we can split the query in arguments */
2431 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2432 sdsfree(query);
2433
2434 if (c->argv) zfree(c->argv);
2435 c->argv = zmalloc(sizeof(robj*)*argc);
2436
2437 for (j = 0; j < argc; j++) {
2438 if (sdslen(argv[j])) {
2439 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2440 c->argc++;
2441 } else {
2442 sdsfree(argv[j]);
2443 }
2444 }
2445 zfree(argv);
2446 if (c->argc) {
2447 /* Execute the command. If the client is still valid
2448 * after processCommand() return and there is something
2449 * on the query buffer try to process the next command. */
2450 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2451 } else {
2452 /* Nothing to process, argc == 0. Just process the query
2453 * buffer if it's not empty or return to the caller */
2454 if (sdslen(c->querybuf)) goto again;
2455 }
2456 return;
2457 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2458 redisLog(REDIS_VERBOSE, "Client protocol error");
2459 freeClient(c);
2460 return;
2461 }
2462 } else {
2463 /* Bulk read handling. Note that if we are at this point
2464 the client already sent a command terminated with a newline,
2465 we are reading the bulk data that is actually the last
2466 argument of the command. */
2467 int qbl = sdslen(c->querybuf);
2468
2469 if (c->bulklen <= qbl) {
2470 /* Copy everything but the final CRLF as final argument */
2471 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2472 c->argc++;
2473 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2474 /* Process the command. If the client is still valid after
2475 * the processing and there is more data in the buffer
2476 * try to parse it. */
2477 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2478 return;
2479 }
2480 }
2481 }
2482
2483 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2484 redisClient *c = (redisClient*) privdata;
2485 char buf[REDIS_IOBUF_LEN];
2486 int nread;
2487 REDIS_NOTUSED(el);
2488 REDIS_NOTUSED(mask);
2489
2490 nread = read(fd, buf, REDIS_IOBUF_LEN);
2491 if (nread == -1) {
2492 if (errno == EAGAIN) {
2493 nread = 0;
2494 } else {
2495 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2496 freeClient(c);
2497 return;
2498 }
2499 } else if (nread == 0) {
2500 redisLog(REDIS_VERBOSE, "Client closed connection");
2501 freeClient(c);
2502 return;
2503 }
2504 if (nread) {
2505 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2506 c->lastinteraction = time(NULL);
2507 } else {
2508 return;
2509 }
2510 processInputBuffer(c);
2511 }
2512
2513 static int selectDb(redisClient *c, int id) {
2514 if (id < 0 || id >= server.dbnum)
2515 return REDIS_ERR;
2516 c->db = &server.db[id];
2517 return REDIS_OK;
2518 }
2519
2520 static void *dupClientReplyValue(void *o) {
2521 incrRefCount((robj*)o);
2522 return o;
2523 }
2524
/* List match method: returns 1 when compareStringObjects() reports the two
 * objects as equal, 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    if (compareStringObjects(a,b) != 0) return 0;
    return 1;
}
2528
2529 static redisClient *createClient(int fd) {
2530 redisClient *c = zmalloc(sizeof(*c));
2531
2532 anetNonBlock(NULL,fd);
2533 anetTcpNoDelay(NULL,fd);
2534 if (!c) return NULL;
2535 selectDb(c,0);
2536 c->fd = fd;
2537 c->querybuf = sdsempty();
2538 c->argc = 0;
2539 c->argv = NULL;
2540 c->bulklen = -1;
2541 c->multibulk = 0;
2542 c->mbargc = 0;
2543 c->mbargv = NULL;
2544 c->sentlen = 0;
2545 c->flags = 0;
2546 c->lastinteraction = time(NULL);
2547 c->authenticated = 0;
2548 c->replstate = REDIS_REPL_NONE;
2549 c->reply = listCreate();
2550 listSetFreeMethod(c->reply,decrRefCount);
2551 listSetDupMethod(c->reply,dupClientReplyValue);
2552 c->blockingkeys = NULL;
2553 c->blockingkeysnum = 0;
2554 c->io_keys = listCreate();
2555 listSetFreeMethod(c->io_keys,decrRefCount);
2556 c->pubsub_channels = dictCreate(&setDictType,NULL);
2557 c->pubsub_patterns = listCreate();
2558 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2559 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2560 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2561 readQueryFromClient, c) == AE_ERR) {
2562 freeClient(c);
2563 return NULL;
2564 }
2565 listAddNodeTail(server.clients,c);
2566 initClientMultiState(c);
2567 return c;
2568 }
2569
2570 static void addReply(redisClient *c, robj *obj) {
2571 if (listLength(c->reply) == 0 &&
2572 (c->replstate == REDIS_REPL_NONE ||
2573 c->replstate == REDIS_REPL_ONLINE) &&
2574 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2575 sendReplyToClient, c) == AE_ERR) return;
2576
2577 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2578 obj = dupStringObject(obj);
2579 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2580 }
2581 listAddNodeTail(c->reply,getDecodedObject(obj));
2582 }
2583
2584 static void addReplySds(redisClient *c, sds s) {
2585 robj *o = createObject(REDIS_STRING,s);
2586 addReply(c,o);
2587 decrRefCount(o);
2588 }
2589
2590 static void addReplyDouble(redisClient *c, double d) {
2591 char buf[128];
2592
2593 snprintf(buf,sizeof(buf),"%.17g",d);
2594 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2595 (unsigned long) strlen(buf),buf));
2596 }
2597
2598 static void addReplyLong(redisClient *c, long l) {
2599 char buf[128];
2600 size_t len;
2601
2602 if (l == 0) {
2603 addReply(c,shared.czero);
2604 return;
2605 } else if (l == 1) {
2606 addReply(c,shared.cone);
2607 return;
2608 }
2609 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2610 addReplySds(c,sdsnewlen(buf,len));
2611 }
2612
2613 static void addReplyLongLong(redisClient *c, long long ll) {
2614 char buf[128];
2615 size_t len;
2616
2617 if (ll == 0) {
2618 addReply(c,shared.czero);
2619 return;
2620 } else if (ll == 1) {
2621 addReply(c,shared.cone);
2622 return;
2623 }
2624 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2625 addReplySds(c,sdsnewlen(buf,len));
2626 }
2627
2628 static void addReplyUlong(redisClient *c, unsigned long ul) {
2629 char buf[128];
2630 size_t len;
2631
2632 if (ul == 0) {
2633 addReply(c,shared.czero);
2634 return;
2635 } else if (ul == 1) {
2636 addReply(c,shared.cone);
2637 return;
2638 }
2639 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2640 addReplySds(c,sdsnewlen(buf,len));
2641 }
2642
2643 static void addReplyBulkLen(redisClient *c, robj *obj) {
2644 size_t len;
2645
2646 if (obj->encoding == REDIS_ENCODING_RAW) {
2647 len = sdslen(obj->ptr);
2648 } else {
2649 long n = (long)obj->ptr;
2650
2651 /* Compute how many bytes will take this integer as a radix 10 string */
2652 len = 1;
2653 if (n < 0) {
2654 len++;
2655 n = -n;
2656 }
2657 while((n = n/10) != 0) {
2658 len++;
2659 }
2660 }
2661 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2662 }
2663
2664 static void addReplyBulk(redisClient *c, robj *obj) {
2665 addReplyBulkLen(c,obj);
2666 addReply(c,obj);
2667 addReply(c,shared.crlf);
2668 }
2669
2670 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2671 static void addReplyBulkCString(redisClient *c, char *s) {
2672 if (s == NULL) {
2673 addReply(c,shared.nullbulk);
2674 } else {
2675 robj *o = createStringObject(s,strlen(s));
2676 addReplyBulk(c,o);
2677 decrRefCount(o);
2678 }
2679 }
2680
2681 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2682 int cport, cfd;
2683 char cip[128];
2684 redisClient *c;
2685 REDIS_NOTUSED(el);
2686 REDIS_NOTUSED(mask);
2687 REDIS_NOTUSED(privdata);
2688
2689 cfd = anetAccept(server.neterr, fd, cip, &cport);
2690 if (cfd == AE_ERR) {
2691 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2692 return;
2693 }
2694 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2695 if ((c = createClient(cfd)) == NULL) {
2696 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2697 close(cfd); /* May be already closed, just ingore errors */
2698 return;
2699 }
2700 /* If maxclient directive is set and this is one client more... close the
2701 * connection. Note that we create the client instead to check before
2702 * for this condition, since now the socket is already set in nonblocking
2703 * mode and we can send an error for free using the Kernel I/O */
2704 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2705 char *err = "-ERR max number of clients reached\r\n";
2706
2707 /* That's a best effort error message, don't check write errors */
2708 if (write(c->fd,err,strlen(err)) == -1) {
2709 /* Nothing to do, Just to avoid the warning... */
2710 }
2711 freeClient(c);
2712 return;
2713 }
2714 server.stat_numconnections++;
2715 }
2716
2717 /* ======================= Redis objects implementation ===================== */
2718
2719 static robj *createObject(int type, void *ptr) {
2720 robj *o;
2721
2722 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2723 if (listLength(server.objfreelist)) {
2724 listNode *head = listFirst(server.objfreelist);
2725 o = listNodeValue(head);
2726 listDelNode(server.objfreelist,head);
2727 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2728 } else {
2729 if (server.vm_enabled) {
2730 pthread_mutex_unlock(&server.obj_freelist_mutex);
2731 o = zmalloc(sizeof(*o));
2732 } else {
2733 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2734 }
2735 }
2736 o->type = type;
2737 o->encoding = REDIS_ENCODING_RAW;
2738 o->ptr = ptr;
2739 o->refcount = 1;
2740 if (server.vm_enabled) {
2741 /* Note that this code may run in the context of an I/O thread
2742 * and accessing to server.unixtime in theory is an error
2743 * (no locks). But in practice this is safe, and even if we read
2744 * garbage Redis will not fail, as it's just a statistical info */
2745 o->vm.atime = server.unixtime;
2746 o->storage = REDIS_VM_MEMORY;
2747 }
2748 return o;
2749 }
2750
2751 static robj *createStringObject(char *ptr, size_t len) {
2752 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2753 }
2754
2755 static robj *dupStringObject(robj *o) {
2756 assert(o->encoding == REDIS_ENCODING_RAW);
2757 return createStringObject(o->ptr,sdslen(o->ptr));
2758 }
2759
2760 static robj *createListObject(void) {
2761 list *l = listCreate();
2762
2763 listSetFreeMethod(l,decrRefCount);
2764 return createObject(REDIS_LIST,l);
2765 }
2766
2767 static robj *createSetObject(void) {
2768 dict *d = dictCreate(&setDictType,NULL);
2769 return createObject(REDIS_SET,d);
2770 }
2771
2772 static robj *createHashObject(void) {
2773 /* All the Hashes start as zipmaps. Will be automatically converted
2774 * into hash tables if there are enough elements or big elements
2775 * inside. */
2776 unsigned char *zm = zipmapNew();
2777 robj *o = createObject(REDIS_HASH,zm);
2778 o->encoding = REDIS_ENCODING_ZIPMAP;
2779 return o;
2780 }
2781
2782 static robj *createZsetObject(void) {
2783 zset *zs = zmalloc(sizeof(*zs));
2784
2785 zs->dict = dictCreate(&zsetDictType,NULL);
2786 zs->zsl = zslCreate();
2787 return createObject(REDIS_ZSET,zs);
2788 }
2789
2790 static void freeStringObject(robj *o) {
2791 if (o->encoding == REDIS_ENCODING_RAW) {
2792 sdsfree(o->ptr);
2793 }
2794 }
2795
2796 static void freeListObject(robj *o) {
2797 listRelease((list*) o->ptr);
2798 }
2799
2800 static void freeSetObject(robj *o) {
2801 dictRelease((dict*) o->ptr);
2802 }
2803
2804 static void freeZsetObject(robj *o) {
2805 zset *zs = o->ptr;
2806
2807 dictRelease(zs->dict);
2808 zslFree(zs->zsl);
2809 zfree(zs);
2810 }
2811
2812 static void freeHashObject(robj *o) {
2813 switch (o->encoding) {
2814 case REDIS_ENCODING_HT:
2815 dictRelease((dict*) o->ptr);
2816 break;
2817 case REDIS_ENCODING_ZIPMAP:
2818 zfree(o->ptr);
2819 break;
2820 default:
2821 redisAssert(0);
2822 break;
2823 }
2824 }
2825
2826 static void incrRefCount(robj *o) {
2827 o->refcount++;
2828 }
2829
2830 static void decrRefCount(void *obj) {
2831 robj *o = obj;
2832
2833 /* Object is a key of a swapped out value, or in the process of being
2834 * loaded. */
2835 if (server.vm_enabled &&
2836 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2837 {
2838 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2839 redisAssert(o->type == REDIS_STRING);
2840 freeStringObject(o);
2841 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2842 pthread_mutex_lock(&server.obj_freelist_mutex);
2843 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2844 !listAddNodeHead(server.objfreelist,o))
2845 zfree(o);
2846 pthread_mutex_unlock(&server.obj_freelist_mutex);
2847 server.vm_stats_swapped_objects--;
2848 return;
2849 }
2850 /* Object is in memory, or in the process of being swapped out. */
2851 if (--(o->refcount) == 0) {
2852 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2853 vmCancelThreadedIOJob(obj);
2854 switch(o->type) {
2855 case REDIS_STRING: freeStringObject(o); break;
2856 case REDIS_LIST: freeListObject(o); break;
2857 case REDIS_SET: freeSetObject(o); break;
2858 case REDIS_ZSET: freeZsetObject(o); break;
2859 case REDIS_HASH: freeHashObject(o); break;
2860 default: redisAssert(0); break;
2861 }
2862 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2863 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2864 !listAddNodeHead(server.objfreelist,o))
2865 zfree(o);
2866 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2867 }
2868 }
2869
2870 static robj *lookupKey(redisDb *db, robj *key) {
2871 dictEntry *de = dictFind(db->dict,key);
2872 if (de) {
2873 robj *key = dictGetEntryKey(de);
2874 robj *val = dictGetEntryVal(de);
2875
2876 if (server.vm_enabled) {
2877 if (key->storage == REDIS_VM_MEMORY ||
2878 key->storage == REDIS_VM_SWAPPING)
2879 {
2880 /* If we were swapping the object out, stop it, this key
2881 * was requested. */
2882 if (key->storage == REDIS_VM_SWAPPING)
2883 vmCancelThreadedIOJob(key);
2884 /* Update the access time of the key for the aging algorithm. */
2885 key->vm.atime = server.unixtime;
2886 } else {
2887 int notify = (key->storage == REDIS_VM_LOADING);
2888
2889 /* Our value was swapped on disk. Bring it at home. */
2890 redisAssert(val == NULL);
2891 val = vmLoadObject(key);
2892 dictGetEntryVal(de) = val;
2893
2894 /* Clients blocked by the VM subsystem may be waiting for
2895 * this key... */
2896 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2897 }
2898 }
2899 return val;
2900 } else {
2901 return NULL;
2902 }
2903 }
2904
2905 static robj *lookupKeyRead(redisDb *db, robj *key) {
2906 expireIfNeeded(db,key);
2907 return lookupKey(db,key);
2908 }
2909
2910 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2911 deleteIfVolatile(db,key);
2912 return lookupKey(db,key);
2913 }
2914
2915 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2916 robj *o = lookupKeyRead(c->db, key);
2917 if (!o) addReply(c,reply);
2918 return o;
2919 }
2920
2921 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2922 robj *o = lookupKeyWrite(c->db, key);
2923 if (!o) addReply(c,reply);
2924 return o;
2925 }
2926
2927 static int checkType(redisClient *c, robj *o, int type) {
2928 if (o->type != type) {
2929 addReply(c,shared.wrongtypeerr);
2930 return 1;
2931 }
2932 return 0;
2933 }
2934
2935 static int deleteKey(redisDb *db, robj *key) {
2936 int retval;
2937
2938 /* We need to protect key from destruction: after the first dictDelete()
2939 * it may happen that 'key' is no longer valid if we don't increment
2940 * it's count. This may happen when we get the object reference directly
2941 * from the hash table with dictRandomKey() or dict iterators */
2942 incrRefCount(key);
2943 if (dictSize(db->expires)) dictDelete(db->expires,key);
2944 retval = dictDelete(db->dict,key);
2945 decrRefCount(key);
2946
2947 return retval == DICT_OK;
2948 }
2949
2950 /* Check if the nul-terminated string 's' can be represented by a long
2951 * (that is, is a number that fits into long without any other space or
2952 * character before or after the digits).
2953 *
2954 * If so, the function returns REDIS_OK and *longval is set to the value
2955 * of the number. Otherwise REDIS_ERR is returned */
2956 static int isStringRepresentableAsLong(sds s, long *longval) {
2957 char buf[32], *endptr;
2958 long value;
2959 int slen;
2960
2961 value = strtol(s, &endptr, 10);
2962 if (endptr[0] != '\0') return REDIS_ERR;
2963 slen = snprintf(buf,32,"%ld",value);
2964
2965 /* If the number converted back into a string is not identical
2966 * then it's not possible to encode the string as integer */
2967 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2968 if (longval) *longval = value;
2969 return REDIS_OK;
2970 }
2971
2972 /* Try to encode a string object in order to save space */
2973 static robj *tryObjectEncoding(robj *o) {
2974 long value;
2975 sds s = o->ptr;
2976
2977 if (o->encoding != REDIS_ENCODING_RAW)
2978 return o; /* Already encoded */
2979
2980 /* It's not safe to encode shared objects: shared objects can be shared
2981 * everywhere in the "object space" of Redis. Encoded objects can only
2982 * appear as "values" (and not, for instance, as keys) */
2983 if (o->refcount > 1) return o;
2984
2985 /* Currently we try to encode only strings */
2986 redisAssert(o->type == REDIS_STRING);
2987
2988 /* Check if we can represent this string as a long integer */
2989 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
2990
2991 /* Ok, this object can be encoded */
2992 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2993 decrRefCount(o);
2994 incrRefCount(shared.integers[value]);
2995 return shared.integers[value];
2996 } else {
2997 o->encoding = REDIS_ENCODING_INT;
2998 sdsfree(o->ptr);
2999 o->ptr = (void*) value;
3000 return o;
3001 }
3002 }
3003
3004 /* Get a decoded version of an encoded object (returned as a new object).
3005 * If the object is already raw-encoded just increment the ref count. */
3006 static robj *getDecodedObject(robj *o) {
3007 robj *dec;
3008
3009 if (o->encoding == REDIS_ENCODING_RAW) {
3010 incrRefCount(o);
3011 return o;
3012 }
3013 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3014 char buf[32];
3015
3016 snprintf(buf,32,"%ld",(long)o->ptr);
3017 dec = createStringObject(buf,strlen(buf));
3018 return dec;
3019 } else {
3020 redisAssert(1 != 1);
3021 }
3022 }
3023
3024 /* Compare two string objects via strcmp() or alike.
3025 * Note that the objects may be integer-encoded. In such a case we
3026 * use snprintf() to get a string representation of the numbers on the stack
3027 * and compare the strings, it's much faster than calling getDecodedObject().
3028 *
3029 * Important note: if objects are not integer encoded, but binary-safe strings,
3030 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3031 * binary safe. */
3032 static int compareStringObjects(robj *a, robj *b) {
3033 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3034 char bufa[128], bufb[128], *astr, *bstr;
3035 int bothsds = 1;
3036
3037 if (a == b) return 0;
3038 if (a->encoding != REDIS_ENCODING_RAW) {
3039 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3040 astr = bufa;
3041 bothsds = 0;
3042 } else {
3043 astr = a->ptr;
3044 }
3045 if (b->encoding != REDIS_ENCODING_RAW) {
3046 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3047 bstr = bufb;
3048 bothsds = 0;
3049 } else {
3050 bstr = b->ptr;
3051 }
3052 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3053 }
3054
3055 static size_t stringObjectLen(robj *o) {
3056 redisAssert(o->type == REDIS_STRING);
3057 if (o->encoding == REDIS_ENCODING_RAW) {
3058 return sdslen(o->ptr);
3059 } else {
3060 char buf[32];
3061
3062 return snprintf(buf,32,"%ld",(long)o->ptr);
3063 }
3064 }
3065
3066 /*============================ RDB saving/loading =========================== */
3067
/* Write the one-byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,sizeof(type),1,fp);

    return (written == 0) ? -1 : 0;
}
3072
/* Write the time 't' to 'fp' as a 4-byte int32_t in host byte order.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,sizeof(stamp),1,fp) == 0) ? -1 : 0;
}
3078
3079 /* check rdbLoadLen() comments for more info */
3080 static int rdbSaveLen(FILE *fp, uint32_t len) {
3081 unsigned char buf[2];
3082
3083 if (len < (1<<6)) {
3084 /* Save a 6 bit len */
3085 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3086 if (fwrite(buf,1,1,fp) == 0) return -1;
3087 } else if (len < (1<<14)) {
3088 /* Save a 14 bit len */
3089 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3090 buf[1] = len&0xFF;
3091 if (fwrite(buf,2,1,fp) == 0) return -1;
3092 } else {
3093 /* Save a 32 bit len */
3094 buf[0] = (REDIS_RDB_32BITLEN<<6);
3095 if (fwrite(buf,1,1,fp) == 0) return -1;
3096 len = htonl(len);
3097 if (fwrite(&len,4,1,fp) == 0) return -1;
3098 }
3099 return 0;
3100 }
3101
3102 /* String objects in the form "2391" "-100" without any space and with a
3103 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3104 * encoded as integers to save space */
3105 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3106 long long value;
3107 char *endptr, buf[32];
3108
3109 /* Check if it's possible to encode this value as a number */
3110 value = strtoll(s, &endptr, 10);
3111 if (endptr[0] != '\0') return 0;
3112 snprintf(buf,32,"%lld",value);
3113
3114 /* If the number converted back into a string is not identical
3115 * then it's not possible to encode the string as integer */
3116 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3117
3118 /* Finally check if it fits in our ranges */
3119 if (value >= -(1<<7) && value <= (1<<7)-1) {
3120 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3121 enc[1] = value&0xFF;
3122 return 2;
3123 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3124 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3125 enc[1] = value&0xFF;
3126 enc[2] = (value>>8)&0xFF;
3127 return 3;
3128 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3129 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3130 enc[1] = value&0xFF;
3131 enc[2] = (value>>8)&0xFF;
3132 enc[3] = (value>>16)&0xFF;
3133 enc[4] = (value>>24)&0xFF;
3134 return 5;
3135 } else {
3136 return 0;
3137 }
3138 }
3139
3140 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3141 size_t comprlen, outlen;
3142 unsigned char byte;
3143 void *out;
3144
3145 /* We require at least four bytes compression for this to be worth it */
3146 if (len <= 4) return 0;
3147 outlen = len-4;
3148 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3149 comprlen = lzf_compress(s, len, out, outlen);
3150 if (comprlen == 0) {
3151 zfree(out);
3152 return 0;
3153 }
3154 /* Data compressed! Let's save it on disk */
3155 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3156 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3157 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3158 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3159 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3160 zfree(out);
3161 return comprlen;
3162
3163 writeerr:
3164 zfree(out);
3165 return -1;
3166 }
3167
3168 /* Save a string objet as [len][data] on disk. If the object is a string
3169 * representation of an integer value we try to safe it in a special form */
3170 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3171 int enclen;
3172
3173 /* Try integer encoding */
3174 if (len <= 11) {
3175 unsigned char buf[5];
3176 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3177 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3178 return 0;
3179 }
3180 }
3181
3182 /* Try LZF compression - under 20 bytes it's unable to compress even
3183 * aaaaaaaaaaaaaaaaaa so skip it */
3184 if (server.rdbcompression && len > 20) {
3185 int retval;
3186
3187 retval = rdbSaveLzfStringObject(fp,s,len);
3188 if (retval == -1) return -1;
3189 if (retval > 0) return 0;
3190 /* retval == 0 means data can't be compressed, save the old way */
3191 }
3192
3193 /* Store verbatim */
3194 if (rdbSaveLen(fp,len) == -1) return -1;
3195 if (len && fwrite(s,len,1,fp) == 0) return -1;
3196 return 0;
3197 }
3198
3199 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3200 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3201 int retval;
3202
3203 /* Avoid incr/decr ref count business when possible.
3204 * This plays well with copy-on-write given that we are probably
3205 * in a child process (BGSAVE). Also this makes sure key objects
3206 * of swapped objects are not incRefCount-ed (an assert does not allow
3207 * this in order to avoid bugs) */
3208 if (obj->encoding != REDIS_ENCODING_RAW) {
3209 obj = getDecodedObject(obj);
3210 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3211 decrRefCount(obj);
3212 } else {
3213 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3214 }
3215 return retval;
3216 }
3217
/* Save a double value. Doubles are saved as strings ("%.17g") prefixed by
 * an unsigned 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions (in which case no string follows):
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    size_t total;

    if (isnan(val)) {
        out[0] = 253;
        total = 1;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
        total = 1;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total = out[0]+1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3244
3245 /* Save a Redis object. */
3246 static int rdbSaveObject(FILE *fp, robj *o) {
3247 if (o->type == REDIS_STRING) {
3248 /* Save a string value */
3249 if (rdbSaveStringObject(fp,o) == -1) return -1;
3250 } else if (o->type == REDIS_LIST) {
3251 /* Save a list value */
3252 list *list = o->ptr;
3253 listIter li;
3254 listNode *ln;
3255
3256 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3257 listRewind(list,&li);
3258 while((ln = listNext(&li))) {
3259 robj *eleobj = listNodeValue(ln);
3260
3261 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3262 }
3263 } else if (o->type == REDIS_SET) {
3264 /* Save a set value */
3265 dict *set = o->ptr;
3266 dictIterator *di = dictGetIterator(set);
3267 dictEntry *de;
3268
3269 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3270 while((de = dictNext(di)) != NULL) {
3271 robj *eleobj = dictGetEntryKey(de);
3272
3273 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3274 }
3275 dictReleaseIterator(di);
3276 } else if (o->type == REDIS_ZSET) {
3277 /* Save a set value */
3278 zset *zs = o->ptr;
3279 dictIterator *di = dictGetIterator(zs->dict);
3280 dictEntry *de;
3281
3282 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3283 while((de = dictNext(di)) != NULL) {
3284 robj *eleobj = dictGetEntryKey(de);
3285 double *score = dictGetEntryVal(de);
3286
3287 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3288 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3289 }
3290 dictReleaseIterator(di);
3291 } else if (o->type == REDIS_HASH) {
3292 /* Save a hash value */
3293 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3294 unsigned char *p = zipmapRewind(o->ptr);
3295 unsigned int count = zipmapLen(o->ptr);
3296 unsigned char *key, *val;
3297 unsigned int klen, vlen;
3298
3299 if (rdbSaveLen(fp,count) == -1) return -1;
3300 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3301 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3302 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3303 }
3304 } else {
3305 dictIterator *di = dictGetIterator(o->ptr);
3306 dictEntry *de;
3307
3308 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3309 while((de = dictNext(di)) != NULL) {
3310 robj *key = dictGetEntryKey(de);
3311 robj *val = dictGetEntryVal(de);
3312
3313 if (rdbSaveStringObject(fp,key) == -1) return -1;
3314 if (rdbSaveStringObject(fp,val) == -1) return -1;
3315 }
3316 dictReleaseIterator(di);
3317 }
3318 } else {
3319 redisAssert(0);
3320 }
3321 return 0;
3322 }
3323
3324 /* Return the length the object will have on disk if saved with
3325 * the rdbSaveObject() function. Currently we use a trick to get
3326 * this length with very little changes to the code. In the future
3327 * we could switch to a faster solution. */
3328 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3329 if (fp == NULL) fp = server.devnull;
3330 rewind(fp);
3331 assert(rdbSaveObject(fp,o) != 1);
3332 return ftello(fp);
3333 }
3334
3335 /* Return the number of pages required to save this object in the swap file */
3336 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3337 off_t bytes = rdbSavedObjectLen(o,fp);
3338
3339 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3340 }
3341
3342 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3343 static int rdbSave(char *filename) {
3344 dictIterator *di = NULL;
3345 dictEntry *de;
3346 FILE *fp;
3347 char tmpfile[256];
3348 int j;
3349 time_t now = time(NULL);
3350
3351 /* Wait for I/O therads to terminate, just in case this is a
3352 * foreground-saving, to avoid seeking the swap file descriptor at the
3353 * same time. */
3354 if (server.vm_enabled)
3355 waitEmptyIOJobsQueue();
3356
3357 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3358 fp = fopen(tmpfile,"w");
3359 if (!fp) {
3360 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3361 return REDIS_ERR;
3362 }
3363 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3364 for (j = 0; j < server.dbnum; j++) {
3365 redisDb *db = server.db+j;
3366 dict *d = db->dict;
3367 if (dictSize(d) == 0) continue;
3368 di = dictGetIterator(d);
3369 if (!di) {
3370 fclose(fp);
3371 return REDIS_ERR;
3372 }
3373
3374 /* Write the SELECT DB opcode */
3375 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3376 if (rdbSaveLen(fp,j) == -1) goto werr;
3377
3378 /* Iterate this DB writing every entry */
3379 while((de = dictNext(di)) != NULL) {
3380 robj *key = dictGetEntryKey(de);
3381 robj *o = dictGetEntryVal(de);
3382 time_t expiretime = getExpire(db,key);
3383
3384 /* Save the expire time */
3385 if (expiretime != -1) {
3386 /* If this key is already expired skip it */
3387 if (expiretime < now) continue;
3388 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3389 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3390 }
3391 /* Save the key and associated value. This requires special
3392 * handling if the value is swapped out. */
3393 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3394 key->storage == REDIS_VM_SWAPPING) {
3395 /* Save type, key, value */
3396 if (rdbSaveType(fp,o->type) == -1) goto werr;
3397 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3398 if (rdbSaveObject(fp,o) == -1) goto werr;
3399 } else {
3400 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3401 robj *po;
3402 /* Get a preview of the object in memory */
3403 po = vmPreviewObject(key);
3404 /* Save type, key, value */
3405 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3406 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3407 if (rdbSaveObject(fp,po) == -1) goto werr;
3408 /* Remove the loaded object from memory */
3409 decrRefCount(po);
3410 }
3411 }
3412 dictReleaseIterator(di);
3413 }
3414 /* EOF opcode */
3415 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3416
3417 /* Make sure data will not remain on the OS's output buffers */
3418 fflush(fp);
3419 fsync(fileno(fp));
3420 fclose(fp);
3421
3422 /* Use RENAME to make sure the DB file is changed atomically only
3423 * if the generate DB file is ok. */
3424 if (rename(tmpfile,filename) == -1) {
3425 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3426 unlink(tmpfile);
3427 return REDIS_ERR;
3428 }
3429 redisLog(REDIS_NOTICE,"DB saved on disk");
3430 server.dirty = 0;
3431 server.lastsave = time(NULL);
3432 return REDIS_OK;
3433
3434 werr:
3435 fclose(fp);
3436 unlink(tmpfile);
3437 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3438 if (di) dictReleaseIterator(di);
3439 return REDIS_ERR;
3440 }
3441
3442 static int rdbSaveBackground(char *filename) {
3443 pid_t childpid;
3444
3445 if (server.bgsavechildpid != -1) return REDIS_ERR;
3446 if (server.vm_enabled) waitEmptyIOJobsQueue();
3447 if ((childpid = fork()) == 0) {
3448 /* Child */
3449 if (server.vm_enabled) vmReopenSwapFile();
3450 close(server.fd);
3451 if (rdbSave(filename) == REDIS_OK) {
3452 _exit(0);
3453 } else {
3454 _exit(1);
3455 }
3456 } else {
3457 /* Parent */
3458 if (childpid == -1) {
3459 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3460 strerror(errno));
3461 return REDIS_ERR;
3462 }
3463 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3464 server.bgsavechildpid = childpid;
3465 updateDictResizePolicy();
3466 return REDIS_OK;
3467 }
3468 return REDIS_OK; /* unreached */
3469 }
3470
/* Remove the temp dump file "temp-<childpid>.rdb" (the name rdbSave() uses
 * for the process with that pid). Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3477
/* Read a one-byte type from 'fp'. Returns the byte value (0-255), or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,sizeof(byte),1,fp) == 0) ? -1 : byte;
}
3483
/* Read a time saved by rdbSaveTime() (a 4-byte int32_t in host byte order).
 * Returns the time, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) == 0) return -1;
    return (time_t) raw;
}
3489
3490 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3491 * of this file for a description of how this are stored on disk.
3492 *
3493 * isencoded is set to 1 if the readed length is not actually a length but
3494 * an "encoding type", check the above comments for more info */
3495 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3496 unsigned char buf[2];
3497 uint32_t len;
3498 int type;
3499
3500 if (isencoded) *isencoded = 0;
3501 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3502 type = (buf[0]&0xC0)>>6;
3503 if (type == REDIS_RDB_6BITLEN) {
3504 /* Read a 6 bit len */
3505 return buf[0]&0x3F;
3506 } else if (type == REDIS_RDB_ENCVAL) {
3507 /* Read a 6 bit len encoding type */
3508 if (isencoded) *isencoded = 1;
3509 return buf[0]&0x3F;
3510 } else if (type == REDIS_RDB_14BITLEN) {
3511 /* Read a 14 bit len */
3512 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3513 return ((buf[0]&0x3F)<<8)|buf[1];
3514 } else {
3515 /* Read a 32 bit len */
3516 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3517 return ntohl(len);
3518 }
3519 }
3520
3521 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3522 unsigned char enc[4];
3523 long long val;
3524
3525 if (enctype == REDIS_RDB_ENC_INT8) {
3526 if (fread(enc,1,1,fp) == 0) return NULL;
3527 val = (signed char)enc[0];
3528 } else if (enctype == REDIS_RDB_ENC_INT16) {
3529 uint16_t v;
3530 if (fread(enc,2,1,fp) == 0) return NULL;
3531 v = enc[0]|(enc[1]<<8);
3532 val = (int16_t)v;
3533 } else if (enctype == REDIS_RDB_ENC_INT32) {
3534 uint32_t v;
3535 if (fread(enc,4,1,fp) == 0) return NULL;
3536 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3537 val = (int32_t)v;
3538 } else {
3539 val = 0; /* anti-warning */
3540 redisAssert(0);
3541 }
3542 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3543 }
3544
3545 static robj *rdbLoadLzfStringObject(FILE*fp) {
3546 unsigned int len, clen;
3547 unsigned char *c = NULL;
3548 sds val = NULL;
3549
3550 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3551 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3552 if ((c = zmalloc(clen)) == NULL) goto err;
3553 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3554 if (fread(c,clen,1,fp) == 0) goto err;
3555 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3556 zfree(c);
3557 return createObject(REDIS_STRING,val);
3558 err:
3559 zfree(c);
3560 sdsfree(val);
3561 return NULL;
3562 }
3563
3564 static robj *rdbLoadStringObject(FILE*fp) {
3565 int isencoded;
3566 uint32_t len;
3567 sds val;
3568
3569 len = rdbLoadLen(fp,&isencoded);
3570 if (isencoded) {
3571 switch(len) {
3572 case REDIS_RDB_ENC_INT8:
3573 case REDIS_RDB_ENC_INT16:
3574 case REDIS_RDB_ENC_INT32:
3575 return rdbLoadIntegerObject(fp,len);
3576 case REDIS_RDB_ENC_LZF:
3577 return rdbLoadLzfStringObject(fp);
3578 default:
3579 redisAssert(0);
3580 }
3581 }
3582
3583 if (len == REDIS_RDB_LENERR) return NULL;
3584 val = sdsnewlen(NULL,len);
3585 if (len && fread(val,len,1,fp) == 0) {
3586 sdsfree(val);
3587 return NULL;
3588 }
3589 return createObject(REDIS_STRING,val);
3590 }
3591
3592 /* For information about double serialization check rdbSaveDoubleValue() */
3593 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3594 char buf[128];
3595 unsigned char len;
3596
3597 if (fread(&len,1,1,fp) == 0) return -1;
3598 switch(len) {
3599 case 255: *val = R_NegInf; return 0;
3600 case 254: *val = R_PosInf; return 0;
3601 case 253: *val = R_Nan; return 0;
3602 default:
3603 if (fread(buf,len,1,fp) == 0) return -1;
3604 buf[len] = '\0';
3605 sscanf(buf, "%lg", val);
3606 return 0;
3607 }
3608 }
3609
3610 /* Load a Redis object of the specified type from the specified file.
3611 * On success a newly allocated object is returned, otherwise NULL. */
3612 static robj *rdbLoadObject(int type, FILE *fp) {
3613 robj *o;
3614
3615 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3616 if (type == REDIS_STRING) {
3617 /* Read string value */
3618 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3619 o = tryObjectEncoding(o);
3620 } else if (type == REDIS_LIST || type == REDIS_SET) {
3621 /* Read list/set value */
3622 uint32_t listlen;
3623
3624 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3625 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3626 /* It's faster to expand the dict to the right size asap in order
3627 * to avoid rehashing */
3628 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3629 dictExpand(o->ptr,listlen);
3630 /* Load every single element of the list/set */
3631 while(listlen--) {
3632 robj *ele;
3633
3634 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3635 ele = tryObjectEncoding(ele);
3636 if (type == REDIS_LIST) {
3637 listAddNodeTail((list*)o->ptr,ele);
3638 } else {
3639 dictAdd((dict*)o->ptr,ele,NULL);
3640 }
3641 }
3642 } else if (type == REDIS_ZSET) {
3643 /* Read list/set value */
3644 size_t zsetlen;
3645 zset *zs;
3646
3647 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3648 o = createZsetObject();
3649 zs = o->ptr;
3650 /* Load every single element of the list/set */
3651 while(zsetlen--) {
3652 robj *ele;
3653 double *score = zmalloc(sizeof(double));
3654
3655 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3656 ele = tryObjectEncoding(ele);
3657 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3658 dictAdd(zs->dict,ele,score);
3659 zslInsert(zs->zsl,*score,ele);
3660 incrRefCount(ele); /* added to skiplist */
3661 }
3662 } else if (type == REDIS_HASH) {
3663 size_t hashlen;
3664
3665 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3666 o = createHashObject();
3667 /* Too many entries? Use an hash table. */
3668 if (hashlen > server.hash_max_zipmap_entries)
3669 convertToRealHash(o);
3670 /* Load every key/value, then set it into the zipmap or hash
3671 * table, as needed. */
3672 while(hashlen--) {
3673 robj *key, *val;
3674
3675 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3676 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3677 /* If we are using a zipmap and there are too big values
3678 * the object is converted to real hash table encoding. */
3679 if (o->encoding != REDIS_ENCODING_HT &&
3680 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3681 sdslen(val->ptr) > server.hash_max_zipmap_value))
3682 {
3683 convertToRealHash(o);
3684 }
3685
3686 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3687 unsigned char *zm = o->ptr;
3688
3689 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3690 val->ptr,sdslen(val->ptr),NULL);
3691 o->ptr = zm;
3692 decrRefCount(key);
3693 decrRefCount(val);
3694 } else {
3695 key = tryObjectEncoding(key);
3696 val = tryObjectEncoding(val);
3697 dictAdd((dict*)o->ptr,key,val);
3698 }
3699 }
3700 } else {
3701 redisAssert(0);
3702 }
3703 return o;
3704 }
3705
3706 static int rdbLoad(char *filename) {
3707 FILE *fp;
3708 robj *keyobj = NULL;
3709 uint32_t dbid;
3710 int type, retval, rdbver;
3711 dict *d = server.db[0].dict;
3712 redisDb *db = server.db+0;
3713 char buf[1024];
3714 time_t expiretime = -1, now = time(NULL);
3715 long long loadedkeys = 0;
3716
3717 fp = fopen(filename,"r");
3718 if (!fp) return REDIS_ERR;
3719 if (fread(buf,9,1,fp) == 0) goto eoferr;
3720 buf[9] = '\0';
3721 if (memcmp(buf,"REDIS",5) != 0) {
3722 fclose(fp);
3723 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3724 return REDIS_ERR;
3725 }
3726 rdbver = atoi(buf+5);
3727 if (rdbver != 1) {
3728 fclose(fp);
3729 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3730 return REDIS_ERR;
3731 }
3732 while(1) {
3733 robj *o;
3734
3735 /* Read type. */
3736 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3737 if (type == REDIS_EXPIRETIME) {
3738 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3739 /* We read the time so we need to read the object type again */
3740 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3741 }
3742 if (type == REDIS_EOF) break;
3743 /* Handle SELECT DB opcode as a special case */
3744 if (type == REDIS_SELECTDB) {
3745 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3746 goto eoferr;
3747 if (dbid >= (unsigned)server.dbnum) {
3748 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3749 exit(1);
3750 }
3751 db = server.db+dbid;
3752 d = db->dict;
3753 continue;
3754 }
3755 /* Read key */
3756 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3757 /* Read value */
3758 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3759 /* Add the new object in the hash table */
3760 retval = dictAdd(d,keyobj,o);
3761 if (retval == DICT_ERR) {
3762 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3763 exit(1);
3764 }
3765 /* Set the expire time if needed */
3766 if (expiretime != -1) {
3767 setExpire(db,keyobj,expiretime);
3768 /* Delete this key if already expired */
3769 if (expiretime < now) deleteKey(db,keyobj);
3770 expiretime = -1;
3771 }
3772 keyobj = o = NULL;
3773 /* Handle swapping while loading big datasets when VM is on */
3774 loadedkeys++;
3775 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3776 while (zmalloc_used_memory() > server.vm_max_memory) {
3777 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3778 }
3779 }
3780 }
3781 fclose(fp);
3782 return REDIS_OK;
3783
3784 eoferr: /* unexpected end of file is handled here with a fatal exit */
3785 if (keyobj) decrRefCount(keyobj);
3786 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3787 exit(1);
3788 return REDIS_ERR; /* Just to avoid warning */
3789 }
3790
3791 /*================================== Commands =============================== */
3792
3793 static void authCommand(redisClient *c) {
3794 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3795 c->authenticated = 1;
3796 addReply(c,shared.ok);
3797 } else {
3798 c->authenticated = 0;
3799 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3800 }
3801 }
3802
3803 static void pingCommand(redisClient *c) {
3804 addReply(c,shared.pong);
3805 }
3806
3807 static void echoCommand(redisClient *c) {
3808 addReplyBulk(c,c->argv[1]);
3809 }
3810
3811 /*=================================== Strings =============================== */
3812
3813 static void setGenericCommand(redisClient *c, int nx) {
3814 int retval;
3815
3816 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3817 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3818 if (retval == DICT_ERR) {
3819 if (!nx) {
3820 /* If the key is about a swapped value, we want a new key object
3821 * to overwrite the old. So we delete the old key in the database.
3822 * This will also make sure that swap pages about the old object
3823 * will be marked as free. */
3824 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3825 incrRefCount(c->argv[1]);
3826 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3827 incrRefCount(c->argv[2]);
3828 } else {
3829 addReply(c,shared.czero);
3830 return;
3831 }
3832 } else {
3833 incrRefCount(c->argv[1]);
3834 incrRefCount(c->argv[2]);
3835 }
3836 server.dirty++;
3837 removeExpire(c->db,c->argv[1]);
3838 addReply(c, nx ? shared.cone : shared.ok);
3839 }
3840
3841 static void setCommand(redisClient *c) {
3842 setGenericCommand(c,0);
3843 }
3844
3845 static void setnxCommand(redisClient *c) {
3846 setGenericCommand(c,1);
3847 }
3848
3849 static int getGenericCommand(redisClient *c) {
3850 robj *o;
3851
3852 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3853 return REDIS_OK;
3854
3855 if (o->type != REDIS_STRING) {
3856 addReply(c,shared.wrongtypeerr);
3857 return REDIS_ERR;
3858 } else {
3859 addReplyBulk(c,o);
3860 return REDIS_OK;
3861 }
3862 }
3863
3864 static void getCommand(redisClient *c) {
3865 getGenericCommand(c);
3866 }
3867
3868 static void getsetCommand(redisClient *c) {
3869 if (getGenericCommand(c) == REDIS_ERR) return;
3870 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3871 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3872 } else {
3873 incrRefCount(c->argv[1]);
3874 }
3875 incrRefCount(c->argv[2]);
3876 server.dirty++;
3877 removeExpire(c->db,c->argv[1]);
3878 }
3879
3880 static void mgetCommand(redisClient *c) {
3881 int j;
3882
3883 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3884 for (j = 1; j < c->argc; j++) {
3885 robj *o = lookupKeyRead(c->db,c->argv[j]);
3886 if (o == NULL) {
3887 addReply(c,shared.nullbulk);
3888 } else {
3889 if (o->type != REDIS_STRING) {
3890 addReply(c,shared.nullbulk);
3891 } else {
3892 addReplyBulk(c,o);
3893 }
3894 }
3895 }
3896 }
3897
3898 static void msetGenericCommand(redisClient *c, int nx) {
3899 int j, busykeys = 0;
3900
3901 if ((c->argc % 2) == 0) {
3902 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3903 return;
3904 }
3905 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3906 * set nothing at all if at least one already key exists. */
3907 if (nx) {
3908 for (j = 1; j < c->argc; j += 2) {
3909 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3910 busykeys++;
3911 }
3912 }
3913 }
3914 if (busykeys) {
3915 addReply(c, shared.czero);
3916 return;
3917 }
3918
3919 for (j = 1; j < c->argc; j += 2) {
3920 int retval;
3921
3922 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
3923 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3924 if (retval == DICT_ERR) {
3925 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3926 incrRefCount(c->argv[j+1]);
3927 } else {
3928 incrRefCount(c->argv[j]);
3929 incrRefCount(c->argv[j+1]);
3930 }
3931 removeExpire(c->db,c->argv[j]);
3932 }
3933 server.dirty += (c->argc-1)/2;
3934 addReply(c, nx ? shared.cone : shared.ok);
3935 }
3936
3937 static void msetCommand(redisClient *c) {
3938 msetGenericCommand(c,0);
3939 }
3940
3941 static void msetnxCommand(redisClient *c) {
3942 msetGenericCommand(c,1);
3943 }
3944
3945 static void incrDecrCommand(redisClient *c, long long incr) {
3946 long long value;
3947 int retval;
3948 robj *o;
3949
3950 o = lookupKeyWrite(c->db,c->argv[1]);
3951 if (o == NULL) {
3952 value = 0;
3953 } else {
3954 if (o->type != REDIS_STRING) {
3955 value = 0;
3956 } else {
3957 char *eptr;
3958
3959 if (o->encoding == REDIS_ENCODING_RAW)
3960 value = strtoll(o->ptr, &eptr, 10);
3961 else if (o->encoding == REDIS_ENCODING_INT)
3962 value = (long)o->ptr;
3963 else
3964 redisAssert(1 != 1);
3965 }
3966 }
3967
3968 value += incr;
3969 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3970 o = tryObjectEncoding(o);
3971 retval = dictAdd(c->db->dict,c->argv[1],o);
3972 if (retval == DICT_ERR) {
3973 dictReplace(c->db->dict,c->argv[1],o);
3974 removeExpire(c->db,c->argv[1]);
3975 } else {
3976 incrRefCount(c->argv[1]);
3977 }
3978 server.dirty++;
3979 addReply(c,shared.colon);
3980 addReply(c,o);
3981 addReply(c,shared.crlf);
3982 }
3983
3984 static void incrCommand(redisClient *c) {
3985 incrDecrCommand(c,1);
3986 }
3987
3988 static void decrCommand(redisClient *c) {
3989 incrDecrCommand(c,-1);
3990 }
3991
3992 static void incrbyCommand(redisClient *c) {
3993 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3994 incrDecrCommand(c,incr);
3995 }
3996
3997 static void decrbyCommand(redisClient *c) {
3998 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3999 incrDecrCommand(c,-incr);
4000 }
4001
4002 static void appendCommand(redisClient *c) {
4003 int retval;
4004 size_t totlen;
4005 robj *o;
4006
4007 o = lookupKeyWrite(c->db,c->argv[1]);
4008 if (o == NULL) {
4009 /* Create the key */
4010 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4011 incrRefCount(c->argv[1]);
4012 incrRefCount(c->argv[2]);
4013 totlen = stringObjectLen(c->argv[2]);
4014 } else {
4015 dictEntry *de;
4016
4017 de = dictFind(c->db->dict,c->argv[1]);
4018 assert(de != NULL);
4019
4020 o = dictGetEntryVal(de);
4021 if (o->type != REDIS_STRING) {
4022 addReply(c,shared.wrongtypeerr);
4023 return;
4024 }
4025 /* If the object is specially encoded or shared we have to make
4026 * a copy */
4027 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4028 robj *decoded = getDecodedObject(o);
4029
4030 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4031 decrRefCount(decoded);
4032 dictReplace(c->db->dict,c->argv[1],o);
4033 }
4034 /* APPEND! */
4035 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4036 o->ptr = sdscatlen(o->ptr,
4037 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4038 } else {
4039 o->ptr = sdscatprintf(o->ptr, "%ld",
4040 (unsigned long) c->argv[2]->ptr);
4041 }
4042 totlen = sdslen(o->ptr);
4043 }
4044 server.dirty++;
4045 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4046 }
4047
4048 static void substrCommand(redisClient *c) {
4049 robj *o;
4050 long start = atoi(c->argv[2]->ptr);
4051 long end = atoi(c->argv[3]->ptr);
4052 size_t rangelen, strlen;
4053 sds range;
4054
4055 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4056 checkType(c,o,REDIS_STRING)) return;
4057
4058 o = getDecodedObject(o);
4059 strlen = sdslen(o->ptr);
4060
4061 /* convert negative indexes */
4062 if (start < 0) start = strlen+start;
4063 if (end < 0) end = strlen+end;
4064 if (start < 0) start = 0;
4065 if (end < 0) end = 0;
4066
4067 /* indexes sanity checks */
4068 if (start > end || (size_t)start >= strlen) {
4069 /* Out of range start or start > end result in null reply */
4070 addReply(c,shared.nullbulk);
4071 decrRefCount(o);
4072 return;
4073 }
4074 if ((size_t)end >= strlen) end = strlen-1;
4075 rangelen = (end-start)+1;
4076
4077 /* Return the result */
4078 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4079 range = sdsnewlen((char*)o->ptr+start,rangelen);
4080 addReplySds(c,range);
4081 addReply(c,shared.crlf);
4082 decrRefCount(o);
4083 }
4084
4085 /* ========================= Type agnostic commands ========================= */
4086
4087 static void delCommand(redisClient *c) {
4088 int deleted = 0, j;
4089
4090 for (j = 1; j < c->argc; j++) {
4091 if (deleteKey(c->db,c->argv[j])) {
4092 server.dirty++;
4093 deleted++;
4094 }
4095 }
4096 addReplyLong(c,deleted);
4097 }
4098
4099 static void existsCommand(redisClient *c) {
4100 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4101 }
4102
4103 static void selectCommand(redisClient *c) {
4104 int id = atoi(c->argv[1]->ptr);
4105
4106 if (selectDb(c,id) == REDIS_ERR) {
4107 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4108 } else {
4109 addReply(c,shared.ok);
4110 }
4111 }
4112
4113 static void randomkeyCommand(redisClient *c) {
4114 dictEntry *de;
4115
4116 while(1) {
4117 de = dictGetRandomKey(c->db->dict);
4118 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4119 }
4120 if (de == NULL) {
4121 addReply(c,shared.plus);
4122 addReply(c,shared.crlf);
4123 } else {
4124 addReply(c,shared.plus);
4125 addReply(c,dictGetEntryKey(de));
4126 addReply(c,shared.crlf);
4127 }
4128 }
4129
4130 static void keysCommand(redisClient *c) {
4131 dictIterator *di;
4132 dictEntry *de;
4133 sds pattern = c->argv[1]->ptr;
4134 int plen = sdslen(pattern);
4135 unsigned long numkeys = 0;
4136 robj *lenobj = createObject(REDIS_STRING,NULL);
4137
4138 di = dictGetIterator(c->db->dict);
4139 addReply(c,lenobj);
4140 decrRefCount(lenobj);
4141 while((de = dictNext(di)) != NULL) {
4142 robj *keyobj = dictGetEntryKey(de);
4143
4144 sds key = keyobj->ptr;
4145 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4146 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4147 if (expireIfNeeded(c->db,keyobj) == 0) {
4148 addReplyBulk(c,keyobj);
4149 numkeys++;
4150 }
4151 }
4152 }
4153 dictReleaseIterator(di);
4154 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4155 }
4156
4157 static void dbsizeCommand(redisClient *c) {
4158 addReplySds(c,
4159 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4160 }
4161
4162 static void lastsaveCommand(redisClient *c) {
4163 addReplySds(c,
4164 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4165 }
4166
4167 static void typeCommand(redisClient *c) {
4168 robj *o;
4169 char *type;
4170
4171 o = lookupKeyRead(c->db,c->argv[1]);
4172 if (o == NULL) {
4173 type = "+none";
4174 } else {
4175 switch(o->type) {
4176 case REDIS_STRING: type = "+string"; break;
4177 case REDIS_LIST: type = "+list"; break;
4178 case REDIS_SET: type = "+set"; break;
4179 case REDIS_ZSET: type = "+zset"; break;
4180 case REDIS_HASH: type = "+hash"; break;
4181 default: type = "+unknown"; break;
4182 }
4183 }
4184 addReplySds(c,sdsnew(type));
4185 addReply(c,shared.crlf);
4186 }
4187
4188 static void saveCommand(redisClient *c) {
4189 if (server.bgsavechildpid != -1) {
4190 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4191 return;
4192 }
4193 if (rdbSave(server.dbfilename) == REDIS_OK) {
4194 addReply(c,shared.ok);
4195 } else {
4196 addReply(c,shared.err);
4197 }
4198 }
4199
4200 static void bgsaveCommand(redisClient *c) {
4201 if (server.bgsavechildpid != -1) {
4202 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4203 return;
4204 }
4205 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4206 char *status = "+Background saving started\r\n";
4207 addReplySds(c,sdsnew(status));
4208 } else {
4209 addReply(c,shared.err);
4210 }
4211 }
4212
4213 static void shutdownCommand(redisClient *c) {
4214 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4215 /* Kill the saving child if there is a background saving in progress.
4216 We want to avoid race conditions, for instance our saving child may
4217 overwrite the synchronous saving did by SHUTDOWN. */
4218 if (server.bgsavechildpid != -1) {
4219 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4220 kill(server.bgsavechildpid,SIGKILL);
4221 rdbRemoveTempFile(server.bgsavechildpid);
4222 }
4223 if (server.appendonly) {
4224 /* Append only file: fsync() the AOF and exit */
4225 fsync(server.appendfd);
4226 if (server.vm_enabled) unlink(server.vm_swap_file);
4227 exit(0);
4228 } else {
4229 /* Snapshotting. Perform a SYNC SAVE and exit */
4230 if (rdbSave(server.dbfilename) == REDIS_OK) {
4231 if (server.daemonize)
4232 unlink(server.pidfile);
4233 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4234 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4235 if (server.vm_enabled) unlink(server.vm_swap_file);
4236 exit(0);
4237 } else {
4238 /* Ooops.. error saving! The best we can do is to continue
4239 * operating. Note that if there was a background saving process,
4240 * in the next cron() Redis will be notified that the background
4241 * saving aborted, handling special stuff like slaves pending for
4242 * synchronization... */
4243 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4244 addReplySds(c,
4245 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4246 }
4247 }
4248 }
4249
4250 static void renameGenericCommand(redisClient *c, int nx) {
4251 robj *o;
4252
4253 /* To use the same key as src and dst is probably an error */
4254 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4255 addReply(c,shared.sameobjecterr);
4256 return;
4257 }
4258
4259 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4260 return;
4261
4262 incrRefCount(o);
4263 deleteIfVolatile(c->db,c->argv[2]);
4264 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4265 if (nx) {
4266 decrRefCount(o);
4267 addReply(c,shared.czero);
4268 return;
4269 }
4270 dictReplace(c->db->dict,c->argv[2],o);
4271 } else {
4272 incrRefCount(c->argv[2]);
4273 }
4274 deleteKey(c->db,c->argv[1]);
4275 server.dirty++;
4276 addReply(c,nx ? shared.cone : shared.ok);
4277 }
4278
4279 static void renameCommand(redisClient *c) {
4280 renameGenericCommand(c,0);
4281 }
4282
4283 static void renamenxCommand(redisClient *c) {
4284 renameGenericCommand(c,1);
4285 }
4286
4287 static void moveCommand(redisClient *c) {
4288 robj *o;
4289 redisDb *src, *dst;
4290 int srcid;
4291
4292 /* Obtain source and target DB pointers */
4293 src = c->db;
4294 srcid = c->db->id;
4295 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4296 addReply(c,shared.outofrangeerr);
4297 return;
4298 }
4299 dst = c->db;
4300 selectDb(c,srcid); /* Back to the source DB */
4301
4302 /* If the user is moving using as target the same
4303 * DB as the source DB it is probably an error. */
4304 if (src == dst) {
4305 addReply(c,shared.sameobjecterr);
4306 return;
4307 }
4308
4309 /* Check if the element exists and get a reference */
4310 o = lookupKeyWrite(c->db,c->argv[1]);
4311 if (!o) {
4312 addReply(c,shared.czero);
4313 return;
4314 }
4315
4316 /* Try to add the element to the target DB */
4317 deleteIfVolatile(dst,c->argv[1]);
4318 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4319 addReply(c,shared.czero);
4320 return;
4321 }
4322 incrRefCount(c->argv[1]);
4323 incrRefCount(o);
4324
4325 /* OK! key moved, free the entry in the source DB */
4326 deleteKey(src,c->argv[1]);
4327 server.dirty++;
4328 addReply(c,shared.cone);
4329 }
4330
4331 /* =================================== Lists ================================ */
4332 static void pushGenericCommand(redisClient *c, int where) {
4333 robj *lobj;
4334 list *list;
4335
4336 lobj = lookupKeyWrite(c->db,c->argv[1]);
4337 if (lobj == NULL) {
4338 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4339 addReply(c,shared.cone);
4340 return;
4341 }
4342 lobj = createListObject();
4343 list = lobj->ptr;
4344 if (where == REDIS_HEAD) {
4345 listAddNodeHead(list,c->argv[2]);
4346 } else {
4347 listAddNodeTail(list,c->argv[2]);
4348 }
4349 dictAdd(c->db->dict,c->argv[1],lobj);
4350 incrRefCount(c->argv[1]);
4351 incrRefCount(c->argv[2]);
4352 } else {
4353 if (lobj->type != REDIS_LIST) {
4354 addReply(c,shared.wrongtypeerr);
4355 return;
4356 }
4357 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4358 addReply(c,shared.cone);
4359 return;
4360 }
4361 list = lobj->ptr;
4362 if (where == REDIS_HEAD) {
4363 listAddNodeHead(list,c->argv[2]);
4364 } else {
4365 listAddNodeTail(list,c->argv[2]);
4366 }
4367 incrRefCount(c->argv[2]);
4368 }
4369 server.dirty++;
4370 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4371 }
4372
4373 static void lpushCommand(redisClient *c) {
4374 pushGenericCommand(c,REDIS_HEAD);
4375 }
4376
4377 static void rpushCommand(redisClient *c) {
4378 pushGenericCommand(c,REDIS_TAIL);
4379 }
4380
4381 static void llenCommand(redisClient *c) {
4382 robj *o;
4383 list *l;
4384
4385 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4386 checkType(c,o,REDIS_LIST)) return;
4387
4388 l = o->ptr;
4389 addReplyUlong(c,listLength(l));
4390 }
4391
4392 static void lindexCommand(redisClient *c) {
4393 robj *o;
4394 int index = atoi(c->argv[2]->ptr);
4395 list *list;
4396 listNode *ln;
4397
4398 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4399 checkType(c,o,REDIS_LIST)) return;
4400 list = o->ptr;
4401
4402 ln = listIndex(list, index);
4403 if (ln == NULL) {
4404 addReply(c,shared.nullbulk);
4405 } else {
4406 robj *ele = listNodeValue(ln);
4407 addReplyBulk(c,ele);
4408 }
4409 }
4410
4411 static void lsetCommand(redisClient *c) {
4412 robj *o;
4413 int index = atoi(c->argv[2]->ptr);
4414 list *list;
4415 listNode *ln;
4416
4417 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4418 checkType(c,o,REDIS_LIST)) return;
4419 list = o->ptr;
4420
4421 ln = listIndex(list, index);
4422 if (ln == NULL) {
4423 addReply(c,shared.outofrangeerr);
4424 } else {
4425 robj *ele = listNodeValue(ln);
4426
4427 decrRefCount(ele);
4428 listNodeValue(ln) = c->argv[3];
4429 incrRefCount(c->argv[3]);
4430 addReply(c,shared.ok);
4431 server.dirty++;
4432 }
4433 }
4434
4435 static void popGenericCommand(redisClient *c, int where) {
4436 robj *o;
4437 list *list;
4438 listNode *ln;
4439
4440 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4441 checkType(c,o,REDIS_LIST)) return;
4442 list = o->ptr;
4443
4444 if (where == REDIS_HEAD)
4445 ln = listFirst(list);
4446 else
4447 ln = listLast(list);
4448
4449 if (ln == NULL) {
4450 addReply(c,shared.nullbulk);
4451 } else {
4452 robj *ele = listNodeValue(ln);
4453 addReplyBulk(c,ele);
4454 listDelNode(list,ln);
4455 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4456 server.dirty++;
4457 }
4458 }
4459
4460 static void lpopCommand(redisClient *c) {
4461 popGenericCommand(c,REDIS_HEAD);
4462 }
4463
4464 static void rpopCommand(redisClient *c) {
4465 popGenericCommand(c,REDIS_TAIL);
4466 }
4467
4468 static void lrangeCommand(redisClient *c) {
4469 robj *o;
4470 int start = atoi(c->argv[2]->ptr);
4471 int end = atoi(c->argv[3]->ptr);
4472 int llen;
4473 int rangelen, j;
4474 list *list;
4475 listNode *ln;
4476 robj *ele;
4477
4478 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4479 checkType(c,o,REDIS_LIST)) return;
4480 list = o->ptr;
4481 llen = listLength(list);
4482
4483 /* convert negative indexes */
4484 if (start < 0) start = llen+start;
4485 if (end < 0) end = llen+end;
4486 if (start < 0) start = 0;
4487 if (end < 0) end = 0;
4488
4489 /* indexes sanity checks */
4490 if (start > end || start >= llen) {
4491 /* Out of range start or start > end result in empty list */
4492 addReply(c,shared.emptymultibulk);
4493 return;
4494 }
4495 if (end >= llen) end = llen-1;
4496 rangelen = (end-start)+1;
4497
4498 /* Return the result in form of a multi-bulk reply */
4499 ln = listIndex(list, start);
4500 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4501 for (j = 0; j < rangelen; j++) {
4502 ele = listNodeValue(ln);
4503 addReplyBulk(c,ele);
4504 ln = ln->next;
4505 }
4506 }
4507
4508 static void ltrimCommand(redisClient *c) {
4509 robj *o;
4510 int start = atoi(c->argv[2]->ptr);
4511 int end = atoi(c->argv[3]->ptr);
4512 int llen;
4513 int j, ltrim, rtrim;
4514 list *list;
4515 listNode *ln;
4516
4517 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4518 checkType(c,o,REDIS_LIST)) return;
4519 list = o->ptr;
4520 llen = listLength(list);
4521
4522 /* convert negative indexes */
4523 if (start < 0) start = llen+start;
4524 if (end < 0) end = llen+end;
4525 if (start < 0) start = 0;
4526 if (end < 0) end = 0;
4527
4528 /* indexes sanity checks */
4529 if (start > end || start >= llen) {
4530 /* Out of range start or start > end result in empty list */
4531 ltrim = llen;
4532 rtrim = 0;
4533 } else {
4534 if (end >= llen) end = llen-1;
4535 ltrim = start;
4536 rtrim = llen-end-1;
4537 }
4538
4539 /* Remove list elements to perform the trim */
4540 for (j = 0; j < ltrim; j++) {
4541 ln = listFirst(list);
4542 listDelNode(list,ln);
4543 }
4544 for (j = 0; j < rtrim; j++) {
4545 ln = listLast(list);
4546 listDelNode(list,ln);
4547 }
4548 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4549 server.dirty++;
4550 addReply(c,shared.ok);
4551 }
4552
4553 static void lremCommand(redisClient *c) {
4554 robj *o;
4555 list *list;
4556 listNode *ln, *next;
4557 int toremove = atoi(c->argv[2]->ptr);
4558 int removed = 0;
4559 int fromtail = 0;
4560
4561 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4562 checkType(c,o,REDIS_LIST)) return;
4563 list = o->ptr;
4564
4565 if (toremove < 0) {
4566 toremove = -toremove;
4567 fromtail = 1;
4568 }
4569 ln = fromtail ? list->tail : list->head;
4570 while (ln) {
4571 robj *ele = listNodeValue(ln);
4572
4573 next = fromtail ? ln->prev : ln->next;
4574 if (compareStringObjects(ele,c->argv[3]) == 0) {
4575 listDelNode(list,ln);
4576 server.dirty++;
4577 removed++;
4578 if (toremove && removed == toremove) break;
4579 }
4580 ln = next;
4581 }
4582 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4583 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4584 }
4585
4586 /* This is the semantic of this command:
4587 * RPOPLPUSH srclist dstlist:
4588 * IF LLEN(srclist) > 0
4589 * element = RPOP srclist
4590 * LPUSH dstlist element
4591 * RETURN element
4592 * ELSE
4593 * RETURN nil
4594 * END
4595 * END
4596 *
4597 * The idea is to be able to get an element from a list in a reliable way
4598 * since the element is not just returned but pushed against another list
4599 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4600 */
4601 static void rpoplpushcommand(redisClient *c) {
4602 robj *sobj;
4603 list *srclist;
4604 listNode *ln;
4605
4606 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4607 checkType(c,sobj,REDIS_LIST)) return;
4608 srclist = sobj->ptr;
4609 ln = listLast(srclist);
4610
4611 if (ln == NULL) {
4612 addReply(c,shared.nullbulk);
4613 } else {
4614 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4615 robj *ele = listNodeValue(ln);
4616 list *dstlist;
4617
4618 if (dobj && dobj->type != REDIS_LIST) {
4619 addReply(c,shared.wrongtypeerr);
4620 return;
4621 }
4622
4623 /* Add the element to the target list (unless it's directly
4624 * passed to some BLPOP-ing client */
4625 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4626 if (dobj == NULL) {
4627 /* Create the list if the key does not exist */
4628 dobj = createListObject();
4629 dictAdd(c->db->dict,c->argv[2],dobj);
4630 incrRefCount(c->argv[2]);
4631 }
4632 dstlist = dobj->ptr;
4633 listAddNodeHead(dstlist,ele);
4634 incrRefCount(ele);
4635 }
4636
4637 /* Send the element to the client as reply as well */
4638 addReplyBulk(c,ele);
4639
4640 /* Finally remove the element from the source list */
4641 listDelNode(srclist,ln);
4642 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4643 server.dirty++;
4644 }
4645 }
4646
4647 /* ==================================== Sets ================================ */
4648
4649 static void saddCommand(redisClient *c) {
4650 robj *set;
4651
4652 set = lookupKeyWrite(c->db,c->argv[1]);
4653 if (set == NULL) {
4654 set = createSetObject();
4655 dictAdd(c->db->dict,c->argv[1],set);
4656 incrRefCount(c->argv[1]);
4657 } else {
4658 if (set->type != REDIS_SET) {
4659 addReply(c,shared.wrongtypeerr);
4660 return;
4661 }
4662 }
4663 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4664 incrRefCount(c->argv[2]);
4665 server.dirty++;
4666 addReply(c,shared.cone);
4667 } else {
4668 addReply(c,shared.czero);
4669 }
4670 }
4671
4672 static void sremCommand(redisClient *c) {
4673 robj *set;
4674
4675 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4676 checkType(c,set,REDIS_SET)) return;
4677
4678 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4679 server.dirty++;
4680 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4681 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4682 addReply(c,shared.cone);
4683 } else {
4684 addReply(c,shared.czero);
4685 }
4686 }
4687
4688 static void smoveCommand(redisClient *c) {
4689 robj *srcset, *dstset;
4690
4691 srcset = lookupKeyWrite(c->db,c->argv[1]);
4692 dstset = lookupKeyWrite(c->db,c->argv[2]);
4693
4694 /* If the source key does not exist return 0, if it's of the wrong type
4695 * raise an error */
4696 if (srcset == NULL || srcset->type != REDIS_SET) {
4697 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4698 return;
4699 }
4700 /* Error if the destination key is not a set as well */
4701 if (dstset && dstset->type != REDIS_SET) {
4702 addReply(c,shared.wrongtypeerr);
4703 return;
4704 }
4705 /* Remove the element from the source set */
4706 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4707 /* Key not found in the src set! return zero */
4708 addReply(c,shared.czero);
4709 return;
4710 }
4711 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4712 deleteKey(c->db,c->argv[1]);
4713 server.dirty++;
4714 /* Add the element to the destination set */
4715 if (!dstset) {
4716 dstset = createSetObject();
4717 dictAdd(c->db->dict,c->argv[2],dstset);
4718 incrRefCount(c->argv[2]);
4719 }
4720 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4721 incrRefCount(c->argv[3]);
4722 addReply(c,shared.cone);
4723 }
4724
4725 static void sismemberCommand(redisClient *c) {
4726 robj *set;
4727
4728 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4729 checkType(c,set,REDIS_SET)) return;
4730
4731 if (dictFind(set->ptr,c->argv[2]))
4732 addReply(c,shared.cone);
4733 else
4734 addReply(c,shared.czero);
4735 }
4736
4737 static void scardCommand(redisClient *c) {
4738 robj *o;
4739 dict *s;
4740
4741 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4742 checkType(c,o,REDIS_SET)) return;
4743
4744 s = o->ptr;
4745 addReplyUlong(c,dictSize(s));
4746 }
4747
4748 static void spopCommand(redisClient *c) {
4749 robj *set;
4750 dictEntry *de;
4751
4752 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4753 checkType(c,set,REDIS_SET)) return;
4754
4755 de = dictGetRandomKey(set->ptr);
4756 if (de == NULL) {
4757 addReply(c,shared.nullbulk);
4758 } else {
4759 robj *ele = dictGetEntryKey(de);
4760
4761 addReplyBulk(c,ele);
4762 dictDelete(set->ptr,ele);
4763 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4764 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4765 server.dirty++;
4766 }
4767 }
4768
4769 static void srandmemberCommand(redisClient *c) {
4770 robj *set;
4771 dictEntry *de;
4772
4773 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4774 checkType(c,set,REDIS_SET)) return;
4775
4776 de = dictGetRandomKey(set->ptr);
4777 if (de == NULL) {
4778 addReply(c,shared.nullbulk);
4779 } else {
4780 robj *ele = dictGetEntryKey(de);
4781
4782 addReplyBulk(c,ele);
4783 }
4784 }
4785
4786 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4787 dict **d1 = (void*) s1, **d2 = (void*) s2;
4788
4789 return dictSize(*d1)-dictSize(*d2);
4790 }
4791
4792 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4793 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4794 dictIterator *di;
4795 dictEntry *de;
4796 robj *lenobj = NULL, *dstset = NULL;
4797 unsigned long j, cardinality = 0;
4798
4799 for (j = 0; j < setsnum; j++) {
4800 robj *setobj;
4801
4802 setobj = dstkey ?
4803 lookupKeyWrite(c->db,setskeys[j]) :
4804 lookupKeyRead(c->db,setskeys[j]);
4805 if (!setobj) {
4806 zfree(dv);
4807 if (dstkey) {
4808 if (deleteKey(c->db,dstkey))
4809 server.dirty++;
4810 addReply(c,shared.czero);
4811 } else {
4812 addReply(c,shared.nullmultibulk);
4813 }
4814 return;
4815 }
4816 if (setobj->type != REDIS_SET) {
4817 zfree(dv);
4818 addReply(c,shared.wrongtypeerr);
4819 return;
4820 }
4821 dv[j] = setobj->ptr;
4822 }
4823 /* Sort sets from the smallest to largest, this will improve our
4824 * algorithm's performace */
4825 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4826
4827 /* The first thing we should output is the total number of elements...
4828 * since this is a multi-bulk write, but at this stage we don't know
4829 * the intersection set size, so we use a trick, append an empty object
4830 * to the output list and save the pointer to later modify it with the
4831 * right length */
4832 if (!dstkey) {
4833 lenobj = createObject(REDIS_STRING,NULL);
4834 addReply(c,lenobj);
4835 decrRefCount(lenobj);
4836 } else {
4837 /* If we have a target key where to store the resulting set
4838 * create this key with an empty set inside */
4839 dstset = createSetObject();
4840 }
4841
4842 /* Iterate all the elements of the first (smallest) set, and test
4843 * the element against all the other sets, if at least one set does
4844 * not include the element it is discarded */
4845 di = dictGetIterator(dv[0]);
4846
4847 while((de = dictNext(di)) != NULL) {
4848 robj *ele;
4849
4850 for (j = 1; j < setsnum; j++)
4851 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4852 if (j != setsnum)
4853 continue; /* at least one set does not contain the member */
4854 ele = dictGetEntryKey(de);
4855 if (!dstkey) {
4856 addReplyBulk(c,ele);
4857 cardinality++;
4858 } else {
4859 dictAdd(dstset->ptr,ele,NULL);
4860 incrRefCount(ele);
4861 }
4862 }
4863 dictReleaseIterator(di);
4864
4865 if (dstkey) {
4866 /* Store the resulting set into the target, if the intersection
4867 * is not an empty set. */
4868 deleteKey(c->db,dstkey);
4869 if (dictSize((dict*)dstset->ptr) > 0) {
4870 dictAdd(c->db->dict,dstkey,dstset);
4871 incrRefCount(dstkey);
4872 addReplyLong(c,dictSize((dict*)dstset->ptr));
4873 } else {
4874 decrRefCount(dstset);
4875 addReply(c,shared.czero);
4876 }
4877 server.dirty++;
4878 } else {
4879 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4880 }
4881 zfree(dv);
4882 }
4883
4884 static void sinterCommand(redisClient *c) {
4885 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4886 }
4887
4888 static void sinterstoreCommand(redisClient *c) {
4889 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4890 }
4891
/* Operation selectors passed as the 'op' argument of the generic
 * set / sorted set combination commands. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4895
4896 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4897 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4898 dictIterator *di;
4899 dictEntry *de;
4900 robj *dstset = NULL;
4901 int j, cardinality = 0;
4902
4903 for (j = 0; j < setsnum; j++) {
4904 robj *setobj;
4905
4906 setobj = dstkey ?
4907 lookupKeyWrite(c->db,setskeys[j]) :
4908 lookupKeyRead(c->db,setskeys[j]);
4909 if (!setobj) {
4910 dv[j] = NULL;
4911 continue;
4912 }
4913 if (setobj->type != REDIS_SET) {
4914 zfree(dv);
4915 addReply(c,shared.wrongtypeerr);
4916 return;
4917 }
4918 dv[j] = setobj->ptr;
4919 }
4920
4921 /* We need a temp set object to store our union. If the dstkey
4922 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4923 * this set object will be the resulting object to set into the target key*/
4924 dstset = createSetObject();
4925
4926 /* Iterate all the elements of all the sets, add every element a single
4927 * time to the result set */
4928 for (j = 0; j < setsnum; j++) {
4929 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4930 if (!dv[j]) continue; /* non existing keys are like empty sets */
4931
4932 di = dictGetIterator(dv[j]);
4933
4934 while((de = dictNext(di)) != NULL) {
4935 robj *ele;
4936
4937 /* dictAdd will not add the same element multiple times */
4938 ele = dictGetEntryKey(de);
4939 if (op == REDIS_OP_UNION || j == 0) {
4940 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4941 incrRefCount(ele);
4942 cardinality++;
4943 }
4944 } else if (op == REDIS_OP_DIFF) {
4945 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4946 cardinality--;
4947 }
4948 }
4949 }
4950 dictReleaseIterator(di);
4951
4952 /* result set is empty? Exit asap. */
4953 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4954 }
4955
4956 /* Output the content of the resulting set, if not in STORE mode */
4957 if (!dstkey) {
4958 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4959 di = dictGetIterator(dstset->ptr);
4960 while((de = dictNext(di)) != NULL) {
4961 robj *ele;
4962
4963 ele = dictGetEntryKey(de);
4964 addReplyBulk(c,ele);
4965 }
4966 dictReleaseIterator(di);
4967 decrRefCount(dstset);
4968 } else {
4969 /* If we have a target key where to store the resulting set
4970 * create this key with the result set inside */
4971 deleteKey(c->db,dstkey);
4972 if (dictSize((dict*)dstset->ptr) > 0) {
4973 dictAdd(c->db->dict,dstkey,dstset);
4974 incrRefCount(dstkey);
4975 addReplyLong(c,dictSize((dict*)dstset->ptr));
4976 } else {
4977 decrRefCount(dstset);
4978 addReply(c,shared.czero);
4979 }
4980 server.dirty++;
4981 }
4982 zfree(dv);
4983 }
4984
4985 static void sunionCommand(redisClient *c) {
4986 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4987 }
4988
4989 static void sunionstoreCommand(redisClient *c) {
4990 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4991 }
4992
4993 static void sdiffCommand(redisClient *c) {
4994 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4995 }
4996
4997 static void sdiffstoreCommand(redisClient *c) {
4998 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4999 }
5000
5001 /* ==================================== ZSets =============================== */
5002
5003 /* ZSETs are ordered sets using two data structures to hold the same elements
5004 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5005 * data structure.
5006 *
5007 * The elements are added to an hash table mapping Redis objects to scores.
5008 * At the same time the elements are added to a skip list mapping scores
5009 * to Redis objects (so objects are sorted by scores in this "view"). */
5010
5011 /* This skiplist implementation is almost a C translation of the original
5012 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5013 * Alternative to Balanced Trees", modified in three ways:
5014 * a) this implementation allows for repeated values.
5015 * b) the comparison is not just by key (our 'score') but by satellite data.
5016 * c) there is a back pointer, so it's a doubly linked list with the back
5017 * pointers being only at "level 1". This allows to traverse the list
5018 * from tail to head, useful for ZREVRANGE. */
5019
5020 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5021 zskiplistNode *zn = zmalloc(sizeof(*zn));
5022
5023 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5024 if (level > 0)
5025 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5026 zn->score = score;
5027 zn->obj = obj;
5028 return zn;
5029 }
5030
5031 static zskiplist *zslCreate(void) {
5032 int j;
5033 zskiplist *zsl;
5034
5035 zsl = zmalloc(sizeof(*zsl));
5036 zsl->level = 1;
5037 zsl->length = 0;
5038 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5039 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5040 zsl->header->forward[j] = NULL;
5041
5042 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5043 if (j < ZSKIPLIST_MAXLEVEL-1)
5044 zsl->header->span[j] = 0;
5045 }
5046 zsl->header->backward = NULL;
5047 zsl->tail = NULL;
5048 return zsl;
5049 }
5050
5051 static void zslFreeNode(zskiplistNode *node) {
5052 decrRefCount(node->obj);
5053 zfree(node->forward);
5054 zfree(node->span);
5055 zfree(node);
5056 }
5057
5058 static void zslFree(zskiplist *zsl) {
5059 zskiplistNode *node = zsl->header->forward[0], *next;
5060
5061 zfree(zsl->header->forward);
5062 zfree(zsl->header->span);
5063 zfree(zsl->header);
5064 while(node) {
5065 next = node->forward[0];
5066 zslFreeNode(node);
5067 node = next;
5068 }
5069 zfree(zsl);
5070 }
5071
5072 static int zslRandomLevel(void) {
5073 int level = 1;
5074 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5075 level += 1;
5076 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5077 }
5078
5079 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5080 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5081 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5082 int i, level;
5083
5084 x = zsl->header;
5085 for (i = zsl->level-1; i >= 0; i--) {
5086 /* store rank that is crossed to reach the insert position */
5087 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5088
5089 while (x->forward[i] &&
5090 (x->forward[i]->score < score ||
5091 (x->forward[i]->score == score &&
5092 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5093 rank[i] += i > 0 ? x->span[i-1] : 1;
5094 x = x->forward[i];
5095 }
5096 update[i] = x;
5097 }
5098 /* we assume the key is not already inside, since we allow duplicated
5099 * scores, and the re-insertion of score and redis object should never
5100 * happpen since the caller of zslInsert() should test in the hash table
5101 * if the element is already inside or not. */
5102 level = zslRandomLevel();
5103 if (level > zsl->level) {
5104 for (i = zsl->level; i < level; i++) {
5105 rank[i] = 0;
5106 update[i] = zsl->header;
5107 update[i]->span[i-1] = zsl->length;
5108 }
5109 zsl->level = level;
5110 }
5111 x = zslCreateNode(level,score,obj);
5112 for (i = 0; i < level; i++) {
5113 x->forward[i] = update[i]->forward[i];
5114 update[i]->forward[i] = x;
5115
5116 /* update span covered by update[i] as x is inserted here */
5117 if (i > 0) {
5118 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5119 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5120 }
5121 }
5122
5123 /* increment span for untouched levels */
5124 for (i = level; i < zsl->level; i++) {
5125 update[i]->span[i-1]++;
5126 }
5127
5128 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5129 if (x->forward[0])
5130 x->forward[0]->backward = x;
5131 else
5132 zsl->tail = x;
5133 zsl->length++;
5134 }
5135
5136 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5137 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5138 int i;
5139 for (i = 0; i < zsl->level; i++) {
5140 if (update[i]->forward[i] == x) {
5141 if (i > 0) {
5142 update[i]->span[i-1] += x->span[i-1] - 1;
5143 }
5144 update[i]->forward[i] = x->forward[i];
5145 } else {
5146 /* invariant: i > 0, because update[0]->forward[0]
5147 * is always equal to x */
5148 update[i]->span[i-1] -= 1;
5149 }
5150 }
5151 if (x->forward[0]) {
5152 x->forward[0]->backward = x->backward;
5153 } else {
5154 zsl->tail = x->backward;
5155 }
5156 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5157 zsl->level--;
5158 zsl->length--;
5159 }
5160
5161 /* Delete an element with matching score/object from the skiplist. */
5162 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5163 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5164 int i;
5165
5166 x = zsl->header;
5167 for (i = zsl->level-1; i >= 0; i--) {
5168 while (x->forward[i] &&
5169 (x->forward[i]->score < score ||
5170 (x->forward[i]->score == score &&
5171 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5172 x = x->forward[i];
5173 update[i] = x;
5174 }
5175 /* We may have multiple elements with the same score, what we need
5176 * is to find the element with both the right score and object. */
5177 x = x->forward[0];
5178 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5179 zslDeleteNode(zsl, x, update);
5180 zslFreeNode(x);
5181 return 1;
5182 } else {
5183 return 0; /* not found */
5184 }
5185 return 0; /* not found */
5186 }
5187
5188 /* Delete all the elements with score between min and max from the skiplist.
5189 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5190 * Note that this function takes the reference to the hash table view of the
5191 * sorted set, in order to remove the elements from the hash table too. */
5192 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5193 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5194 unsigned long removed = 0;
5195 int i;
5196
5197 x = zsl->header;
5198 for (i = zsl->level-1; i >= 0; i--) {
5199 while (x->forward[i] && x->forward[i]->score < min)
5200 x = x->forward[i];
5201 update[i] = x;
5202 }
5203 /* We may have multiple elements with the same score, what we need
5204 * is to find the element with both the right score and object. */
5205 x = x->forward[0];
5206 while (x && x->score <= max) {
5207 zskiplistNode *next = x->forward[0];
5208 zslDeleteNode(zsl, x, update);
5209 dictDelete(dict,x->obj);
5210 zslFreeNode(x);
5211 removed++;
5212 x = next;
5213 }
5214 return removed; /* not found */
5215 }
5216
5217 /* Delete all the elements with rank between start and end from the skiplist.
5218 * Start and end are inclusive. Note that start and end need to be 1-based */
5219 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5220 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5221 unsigned long traversed = 0, removed = 0;
5222 int i;
5223
5224 x = zsl->header;
5225 for (i = zsl->level-1; i >= 0; i--) {
5226 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5227 traversed += i > 0 ? x->span[i-1] : 1;
5228 x = x->forward[i];
5229 }
5230 update[i] = x;
5231 }
5232
5233 traversed++;
5234 x = x->forward[0];
5235 while (x && traversed <= end) {
5236 zskiplistNode *next = x->forward[0];
5237 zslDeleteNode(zsl, x, update);
5238 dictDelete(dict,x->obj);
5239 zslFreeNode(x);
5240 removed++;
5241 traversed++;
5242 x = next;
5243 }
5244 return removed;
5245 }
5246
5247 /* Find the first node having a score equal or greater than the specified one.
5248 * Returns NULL if there is no match. */
5249 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5250 zskiplistNode *x;
5251 int i;
5252
5253 x = zsl->header;
5254 for (i = zsl->level-1; i >= 0; i--) {
5255 while (x->forward[i] && x->forward[i]->score < score)
5256 x = x->forward[i];
5257 }
5258 /* We may have multiple elements with the same score, what we need
5259 * is to find the element with both the right score and object. */
5260 return x->forward[0];
5261 }
5262
5263 /* Find the rank for an element by both score and key.
5264 * Returns 0 when the element cannot be found, rank otherwise.
5265 * Note that the rank is 1-based due to the span of zsl->header to the
5266 * first element. */
5267 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5268 zskiplistNode *x;
5269 unsigned long rank = 0;
5270 int i;
5271
5272 x = zsl->header;
5273 for (i = zsl->level-1; i >= 0; i--) {
5274 while (x->forward[i] &&
5275 (x->forward[i]->score < score ||
5276 (x->forward[i]->score == score &&
5277 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5278 rank += i > 0 ? x->span[i-1] : 1;
5279 x = x->forward[i];
5280 }
5281
5282 /* x might be equal to zsl->header, so test if obj is non-NULL */
5283 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5284 return rank;
5285 }
5286 }
5287 return 0;
5288 }
5289
5290 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5291 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5292 zskiplistNode *x;
5293 unsigned long traversed = 0;
5294 int i;
5295
5296 x = zsl->header;
5297 for (i = zsl->level-1; i >= 0; i--) {
5298 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5299 {
5300 traversed += i > 0 ? x->span[i-1] : 1;
5301 x = x->forward[i];
5302 }
5303 if (traversed == rank) {
5304 return x;
5305 }
5306 }
5307 return NULL;
5308 }
5309
5310 /* The actual Z-commands implementations */
5311
5312 /* This generic command implements both ZADD and ZINCRBY.
5313 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5314 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5315 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5316 robj *zsetobj;
5317 zset *zs;
5318 double *score;
5319
5320 zsetobj = lookupKeyWrite(c->db,key);
5321 if (zsetobj == NULL) {
5322 zsetobj = createZsetObject();
5323 dictAdd(c->db->dict,key,zsetobj);
5324 incrRefCount(key);
5325 } else {
5326 if (zsetobj->type != REDIS_ZSET) {
5327 addReply(c,shared.wrongtypeerr);
5328 return;
5329 }
5330 }
5331 zs = zsetobj->ptr;
5332
5333 /* Ok now since we implement both ZADD and ZINCRBY here the code
5334 * needs to handle the two different conditions. It's all about setting
5335 * '*score', that is, the new score to set, to the right value. */
5336 score = zmalloc(sizeof(double));
5337 if (doincrement) {
5338 dictEntry *de;
5339
5340 /* Read the old score. If the element was not present starts from 0 */
5341 de = dictFind(zs->dict,ele);
5342 if (de) {
5343 double *oldscore = dictGetEntryVal(de);
5344 *score = *oldscore + scoreval;
5345 } else {
5346 *score = scoreval;
5347 }
5348 } else {
5349 *score = scoreval;
5350 }
5351
5352 /* What follows is a simple remove and re-insert operation that is common
5353 * to both ZADD and ZINCRBY... */
5354 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5355 /* case 1: New element */
5356 incrRefCount(ele); /* added to hash */
5357 zslInsert(zs->zsl,*score,ele);
5358 incrRefCount(ele); /* added to skiplist */
5359 server.dirty++;
5360 if (doincrement)
5361 addReplyDouble(c,*score);
5362 else
5363 addReply(c,shared.cone);
5364 } else {
5365 dictEntry *de;
5366 double *oldscore;
5367
5368 /* case 2: Score update operation */
5369 de = dictFind(zs->dict,ele);
5370 redisAssert(de != NULL);
5371 oldscore = dictGetEntryVal(de);
5372 if (*score != *oldscore) {
5373 int deleted;
5374
5375 /* Remove and insert the element in the skip list with new score */
5376 deleted = zslDelete(zs->zsl,*oldscore,ele);
5377 redisAssert(deleted != 0);
5378 zslInsert(zs->zsl,*score,ele);
5379 incrRefCount(ele);
5380 /* Update the score in the hash table */
5381 dictReplace(zs->dict,ele,score);
5382 server.dirty++;
5383 } else {
5384 zfree(score);
5385 }
5386 if (doincrement)
5387 addReplyDouble(c,*score);
5388 else
5389 addReply(c,shared.czero);
5390 }
5391 }
5392
5393 static void zaddCommand(redisClient *c) {
5394 double scoreval;
5395
5396 scoreval = strtod(c->argv[2]->ptr,NULL);
5397 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5398 }
5399
5400 static void zincrbyCommand(redisClient *c) {
5401 double scoreval;
5402
5403 scoreval = strtod(c->argv[2]->ptr,NULL);
5404 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5405 }
5406
5407 static void zremCommand(redisClient *c) {
5408 robj *zsetobj;
5409 zset *zs;
5410 dictEntry *de;
5411 double *oldscore;
5412 int deleted;
5413
5414 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5415 checkType(c,zsetobj,REDIS_ZSET)) return;
5416
5417 zs = zsetobj->ptr;
5418 de = dictFind(zs->dict,c->argv[2]);
5419 if (de == NULL) {
5420 addReply(c,shared.czero);
5421 return;
5422 }
5423 /* Delete from the skiplist */
5424 oldscore = dictGetEntryVal(de);
5425 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5426 redisAssert(deleted != 0);
5427
5428 /* Delete from the hash table */
5429 dictDelete(zs->dict,c->argv[2]);
5430 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5431 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5432 server.dirty++;
5433 addReply(c,shared.cone);
5434 }
5435
5436 static void zremrangebyscoreCommand(redisClient *c) {
5437 double min = strtod(c->argv[2]->ptr,NULL);
5438 double max = strtod(c->argv[3]->ptr,NULL);
5439 long deleted;
5440 robj *zsetobj;
5441 zset *zs;
5442
5443 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5444 checkType(c,zsetobj,REDIS_ZSET)) return;
5445
5446 zs = zsetobj->ptr;
5447 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5448 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5449 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5450 server.dirty += deleted;
5451 addReplyLong(c,deleted);
5452 }
5453
5454 static void zremrangebyrankCommand(redisClient *c) {
5455 int start = atoi(c->argv[2]->ptr);
5456 int end = atoi(c->argv[3]->ptr);
5457 int llen;
5458 long deleted;
5459 robj *zsetobj;
5460 zset *zs;
5461
5462 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5463 checkType(c,zsetobj,REDIS_ZSET)) return;
5464 zs = zsetobj->ptr;
5465 llen = zs->zsl->length;
5466
5467 /* convert negative indexes */
5468 if (start < 0) start = llen+start;
5469 if (end < 0) end = llen+end;
5470 if (start < 0) start = 0;
5471 if (end < 0) end = 0;
5472
5473 /* indexes sanity checks */
5474 if (start > end || start >= llen) {
5475 addReply(c,shared.czero);
5476 return;
5477 }
5478 if (end >= llen) end = llen-1;
5479
5480 /* increment start and end because zsl*Rank functions
5481 * use 1-based rank */
5482 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5483 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5484 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5485 server.dirty += deleted;
5486 addReplyLong(c, deleted);
5487 }
5488
5489 typedef struct {
5490 dict *dict;
5491 double weight;
5492 } zsetopsrc;
5493
5494 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5495 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5496 unsigned long size1, size2;
5497 size1 = d1->dict ? dictSize(d1->dict) : 0;
5498 size2 = d2->dict ? dictSize(d2->dict) : 0;
5499 return size1 - size2;
5500 }
5501
/* How scores of the same element coming from different inputs are
 * combined by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5505
5506 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5507 if (aggregate == REDIS_AGGR_SUM) {
5508 *target = *target + val;
5509 } else if (aggregate == REDIS_AGGR_MIN) {
5510 *target = val < *target ? val : *target;
5511 } else if (aggregate == REDIS_AGGR_MAX) {
5512 *target = val > *target ? val : *target;
5513 } else {
5514 /* safety net */
5515 redisAssert(0 != 0);
5516 }
5517 }
5518
5519 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5520 int i, j, zsetnum;
5521 int aggregate = REDIS_AGGR_SUM;
5522 zsetopsrc *src;
5523 robj *dstobj;
5524 zset *dstzset;
5525 dictIterator *di;
5526 dictEntry *de;
5527
5528 /* expect zsetnum input keys to be given */
5529 zsetnum = atoi(c->argv[2]->ptr);
5530 if (zsetnum < 1) {
5531 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5532 return;
5533 }
5534
5535 /* test if the expected number of keys would overflow */
5536 if (3+zsetnum > c->argc) {
5537 addReply(c,shared.syntaxerr);
5538 return;
5539 }
5540
5541 /* read keys to be used for input */
5542 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5543 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5544 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5545 if (!zsetobj) {
5546 src[i].dict = NULL;
5547 } else {
5548 if (zsetobj->type != REDIS_ZSET) {
5549 zfree(src);
5550 addReply(c,shared.wrongtypeerr);
5551 return;
5552 }
5553 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5554 }
5555
5556 /* default all weights to 1 */
5557 src[i].weight = 1.0;
5558 }
5559
5560 /* parse optional extra arguments */
5561 if (j < c->argc) {
5562 int remaining = c->argc - j;
5563
5564 while (remaining) {
5565 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5566 j++; remaining--;
5567 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5568 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5569 }
5570 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5571 j++; remaining--;
5572 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5573 aggregate = REDIS_AGGR_SUM;
5574 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5575 aggregate = REDIS_AGGR_MIN;
5576 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5577 aggregate = REDIS_AGGR_MAX;
5578 } else {
5579 zfree(src);
5580 addReply(c,shared.syntaxerr);
5581 return;
5582 }
5583 j++; remaining--;
5584 } else {
5585 zfree(src);
5586 addReply(c,shared.syntaxerr);
5587 return;
5588 }
5589 }
5590 }
5591
5592 /* sort sets from the smallest to largest, this will improve our
5593 * algorithm's performance */
5594 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5595
5596 dstobj = createZsetObject();
5597 dstzset = dstobj->ptr;
5598
5599 if (op == REDIS_OP_INTER) {
5600 /* skip going over all entries if the smallest zset is NULL or empty */
5601 if (src[0].dict && dictSize(src[0].dict) > 0) {
5602 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5603 * from small to large, all src[i > 0].dict are non-empty too */
5604 di = dictGetIterator(src[0].dict);
5605 while((de = dictNext(di)) != NULL) {
5606 double *score = zmalloc(sizeof(double)), value;
5607 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5608
5609 for (j = 1; j < zsetnum; j++) {
5610 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5611 if (other) {
5612 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5613 zunionInterAggregate(score, value, aggregate);
5614 } else {
5615 break;
5616 }
5617 }
5618
5619 /* skip entry when not present in every source dict */
5620 if (j != zsetnum) {
5621 zfree(score);
5622 } else {
5623 robj *o = dictGetEntryKey(de);
5624 dictAdd(dstzset->dict,o,score);
5625 incrRefCount(o); /* added to dictionary */
5626 zslInsert(dstzset->zsl,*score,o);
5627 incrRefCount(o); /* added to skiplist */
5628 }
5629 }
5630 dictReleaseIterator(di);
5631 }
5632 } else if (op == REDIS_OP_UNION) {
5633 for (i = 0; i < zsetnum; i++) {
5634 if (!src[i].dict) continue;
5635
5636 di = dictGetIterator(src[i].dict);
5637 while((de = dictNext(di)) != NULL) {
5638 /* skip key when already processed */
5639 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5640
5641 double *score = zmalloc(sizeof(double)), value;
5642 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5643
5644 /* because the zsets are sorted by size, its only possible
5645 * for sets at larger indices to hold this entry */
5646 for (j = (i+1); j < zsetnum; j++) {
5647 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5648 if (other) {
5649 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5650 zunionInterAggregate(score, value, aggregate);
5651 }
5652 }
5653
5654 robj *o = dictGetEntryKey(de);
5655 dictAdd(dstzset->dict,o,score);
5656 incrRefCount(o); /* added to dictionary */
5657 zslInsert(dstzset->zsl,*score,o);
5658 incrRefCount(o); /* added to skiplist */
5659 }
5660 dictReleaseIterator(di);
5661 }
5662 } else {
5663 /* unknown operator */
5664 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5665 }
5666
5667 deleteKey(c->db,dstkey);
5668 if (dstzset->zsl->length) {
5669 dictAdd(c->db->dict,dstkey,dstobj);
5670 incrRefCount(dstkey);
5671 addReplyLong(c, dstzset->zsl->length);
5672 server.dirty++;
5673 } else {
5674 decrRefCount(dstobj);
5675 addReply(c, shared.czero);
5676 }
5677 zfree(src);
5678 }
5679
5680 static void zunionCommand(redisClient *c) {
5681 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5682 }
5683
5684 static void zinterCommand(redisClient *c) {
5685 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5686 }
5687
5688 static void zrangeGenericCommand(redisClient *c, int reverse) {
5689 robj *o;
5690 int start = atoi(c->argv[2]->ptr);
5691 int end = atoi(c->argv[3]->ptr);
5692 int withscores = 0;
5693 int llen;
5694 int rangelen, j;
5695 zset *zsetobj;
5696 zskiplist *zsl;
5697 zskiplistNode *ln;
5698 robj *ele;
5699
5700 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5701 withscores = 1;
5702 } else if (c->argc >= 5) {
5703 addReply(c,shared.syntaxerr);
5704 return;
5705 }
5706
5707 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5708 checkType(c,o,REDIS_ZSET)) return;
5709 zsetobj = o->ptr;
5710 zsl = zsetobj->zsl;
5711 llen = zsl->length;
5712
5713 /* convert negative indexes */
5714 if (start < 0) start = llen+start;
5715 if (end < 0) end = llen+end;
5716 if (start < 0) start = 0;
5717 if (end < 0) end = 0;
5718
5719 /* indexes sanity checks */
5720 if (start > end || start >= llen) {
5721 /* Out of range start or start > end result in empty list */
5722 addReply(c,shared.emptymultibulk);
5723 return;
5724 }
5725 if (end >= llen) end = llen-1;
5726 rangelen = (end-start)+1;
5727
5728 /* check if starting point is trivial, before searching
5729 * the element in log(N) time */
5730 if (reverse) {
5731 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5732 } else {
5733 ln = start == 0 ?
5734 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5735 }
5736
5737 /* Return the result in form of a multi-bulk reply */
5738 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5739 withscores ? (rangelen*2) : rangelen));
5740 for (j = 0; j < rangelen; j++) {
5741 ele = ln->obj;
5742 addReplyBulk(c,ele);
5743 if (withscores)
5744 addReplyDouble(c,ln->score);
5745 ln = reverse ? ln->backward : ln->forward[0];
5746 }
5747 }
5748
5749 static void zrangeCommand(redisClient *c) {
5750 zrangeGenericCommand(c,0);
5751 }
5752
5753 static void zrevrangeCommand(redisClient *c) {
5754 zrangeGenericCommand(c,1);
5755 }
5756
5757 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5758 * If justcount is non-zero, just the count is returned. */
5759 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5760 robj *o;
5761 double min, max;
5762 int minex = 0, maxex = 0; /* are min or max exclusive? */
5763 int offset = 0, limit = -1;
5764 int withscores = 0;
5765 int badsyntax = 0;
5766
5767 /* Parse the min-max interval. If one of the values is prefixed
5768 * by the "(" character, it's considered "open". For instance
5769 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5770 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5771 if (((char*)c->argv[2]->ptr)[0] == '(') {
5772 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5773 minex = 1;
5774 } else {
5775 min = strtod(c->argv[2]->ptr,NULL);
5776 }
5777 if (((char*)c->argv[3]->ptr)[0] == '(') {
5778 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5779 maxex = 1;
5780 } else {
5781 max = strtod(c->argv[3]->ptr,NULL);
5782 }
5783
5784 /* Parse "WITHSCORES": note that if the command was called with
5785 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5786 * enter the following paths to parse WITHSCORES and LIMIT. */
5787 if (c->argc == 5 || c->argc == 8) {
5788 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5789 withscores = 1;
5790 else
5791 badsyntax = 1;
5792 }
5793 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5794 badsyntax = 1;
5795 if (badsyntax) {
5796 addReplySds(c,
5797 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5798 return;
5799 }
5800
5801 /* Parse "LIMIT" */
5802 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5803 addReply(c,shared.syntaxerr);
5804 return;
5805 } else if (c->argc == (7 + withscores)) {
5806 offset = atoi(c->argv[5]->ptr);
5807 limit = atoi(c->argv[6]->ptr);
5808 if (offset < 0) offset = 0;
5809 }
5810
5811 /* Ok, lookup the key and get the range */
5812 o = lookupKeyRead(c->db,c->argv[1]);
5813 if (o == NULL) {
5814 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5815 } else {
5816 if (o->type != REDIS_ZSET) {
5817 addReply(c,shared.wrongtypeerr);
5818 } else {
5819 zset *zsetobj = o->ptr;
5820 zskiplist *zsl = zsetobj->zsl;
5821 zskiplistNode *ln;
5822 robj *ele, *lenobj = NULL;
5823 unsigned long rangelen = 0;
5824
5825 /* Get the first node with the score >= min, or with
5826 * score > min if 'minex' is true. */
5827 ln = zslFirstWithScore(zsl,min);
5828 while (minex && ln && ln->score == min) ln = ln->forward[0];
5829
5830 if (ln == NULL) {
5831 /* No element matching the speciifed interval */
5832 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5833 return;
5834 }
5835
5836 /* We don't know in advance how many matching elements there
5837 * are in the list, so we push this object that will represent
5838 * the multi-bulk length in the output buffer, and will "fix"
5839 * it later */
5840 if (!justcount) {
5841 lenobj = createObject(REDIS_STRING,NULL);
5842 addReply(c,lenobj);
5843 decrRefCount(lenobj);
5844 }
5845
5846 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5847 if (offset) {
5848 offset--;
5849 ln = ln->forward[0];
5850 continue;
5851 }
5852 if (limit == 0) break;
5853 if (!justcount) {
5854 ele = ln->obj;
5855 addReplyBulk(c,ele);
5856 if (withscores)
5857 addReplyDouble(c,ln->score);
5858 }
5859 ln = ln->forward[0];
5860 rangelen++;
5861 if (limit > 0) limit--;
5862 }
5863 if (justcount) {
5864 addReplyLong(c,(long)rangelen);
5865 } else {
5866 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5867 withscores ? (rangelen*2) : rangelen);
5868 }
5869 }
5870 }
5871 }
5872
5873 static void zrangebyscoreCommand(redisClient *c) {
5874 genericZrangebyscoreCommand(c,0);
5875 }
5876
5877 static void zcountCommand(redisClient *c) {
5878 genericZrangebyscoreCommand(c,1);
5879 }
5880
5881 static void zcardCommand(redisClient *c) {
5882 robj *o;
5883 zset *zs;
5884
5885 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5886 checkType(c,o,REDIS_ZSET)) return;
5887
5888 zs = o->ptr;
5889 addReplyUlong(c,zs->zsl->length);
5890 }
5891
5892 static void zscoreCommand(redisClient *c) {
5893 robj *o;
5894 zset *zs;
5895 dictEntry *de;
5896
5897 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5898 checkType(c,o,REDIS_ZSET)) return;
5899
5900 zs = o->ptr;
5901 de = dictFind(zs->dict,c->argv[2]);
5902 if (!de) {
5903 addReply(c,shared.nullbulk);
5904 } else {
5905 double *score = dictGetEntryVal(de);
5906
5907 addReplyDouble(c,*score);
5908 }
5909 }
5910
5911 static void zrankGenericCommand(redisClient *c, int reverse) {
5912 robj *o;
5913 zset *zs;
5914 zskiplist *zsl;
5915 dictEntry *de;
5916 unsigned long rank;
5917 double *score;
5918
5919 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5920 checkType(c,o,REDIS_ZSET)) return;
5921
5922 zs = o->ptr;
5923 zsl = zs->zsl;
5924 de = dictFind(zs->dict,c->argv[2]);
5925 if (!de) {
5926 addReply(c,shared.nullbulk);
5927 return;
5928 }
5929
5930 score = dictGetEntryVal(de);
5931 rank = zslGetRank(zsl, *score, c->argv[2]);
5932 if (rank) {
5933 if (reverse) {
5934 addReplyLong(c, zsl->length - rank);
5935 } else {
5936 addReplyLong(c, rank-1);
5937 }
5938 } else {
5939 addReply(c,shared.nullbulk);
5940 }
5941 }
5942
5943 static void zrankCommand(redisClient *c) {
5944 zrankGenericCommand(c, 0);
5945 }
5946
5947 static void zrevrankCommand(redisClient *c) {
5948 zrankGenericCommand(c, 1);
5949 }
5950
5951 /* =================================== Hashes =============================== */
5952 static void hsetCommand(redisClient *c) {
5953 int update = 0;
5954 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5955
5956 if (o == NULL) {
5957 o = createHashObject();
5958 dictAdd(c->db->dict,c->argv[1],o);
5959 incrRefCount(c->argv[1]);
5960 } else {
5961 if (o->type != REDIS_HASH) {
5962 addReply(c,shared.wrongtypeerr);
5963 return;
5964 }
5965 }
5966 /* We want to convert the zipmap into an hash table right now if the
5967 * entry to be added is too big. Note that we check if the object
5968 * is integer encoded before to try fetching the length in the test below.
5969 * This is because integers are small, but currently stringObjectLen()
5970 * performs a slow conversion: not worth it. */
5971 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5972 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5973 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5974 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5975 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5976 {
5977 convertToRealHash(o);
5978 }
5979
5980 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5981 unsigned char *zm = o->ptr;
5982 robj *valobj = getDecodedObject(c->argv[3]);
5983
5984 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5985 valobj->ptr,sdslen(valobj->ptr),&update);
5986 decrRefCount(valobj);
5987 o->ptr = zm;
5988
5989 /* And here there is the second check for hash conversion. */
5990 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
5991 convertToRealHash(o);
5992 } else {
5993 c->argv[2] = tryObjectEncoding(c->argv[2]);
5994 /* note that c->argv[3] is already encoded, as the latest arg
5995 * of a bulk command is always integer encoded if possible. */
5996 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5997 incrRefCount(c->argv[2]);
5998 } else {
5999 update = 1;
6000 }
6001 incrRefCount(c->argv[3]);
6002 }
6003 server.dirty++;
6004 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6005 }
6006
6007 static void hincrbyCommand(redisClient *c) {
6008 long long value = 0, incr = 0;
6009 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6010
6011 if (o == NULL) {
6012 o = createHashObject();
6013 dictAdd(c->db->dict,c->argv[1],o);
6014 incrRefCount(c->argv[1]);
6015 } else {
6016 if (o->type != REDIS_HASH) {
6017 addReply(c,shared.wrongtypeerr);
6018 return;
6019 }
6020 }
6021
6022 incr = strtoll(c->argv[3]->ptr, NULL, 10);
6023 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6024 unsigned char *zm = o->ptr;
6025 unsigned char *zval;
6026 unsigned int zvlen;
6027
6028 /* Find value if already present in hash */
6029 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6030 &zval,&zvlen)) {
6031 /* strtoll needs the char* to have a trailing \0, but
6032 * the zipmap doesn't include them. */
6033 sds szval = sdsnewlen(zval, zvlen);
6034 value = strtoll(szval,NULL,10);
6035 sdsfree(szval);
6036 }
6037
6038 value += incr;
6039 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6040 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6041 (unsigned char*)svalue,sdslen(svalue),NULL);
6042 sdsfree(svalue);
6043 o->ptr = zm;
6044
6045 /* Check if the zipmap needs to be converted. */
6046 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6047 convertToRealHash(o);
6048 } else {
6049 robj *hval;
6050 dictEntry *de;
6051
6052 /* Find value if already present in hash */
6053 de = dictFind(o->ptr,c->argv[2]);
6054 if (de != NULL) {
6055 hval = dictGetEntryVal(de);
6056 if (hval->encoding == REDIS_ENCODING_RAW)
6057 value = strtoll(hval->ptr,NULL,10);
6058 else if (hval->encoding == REDIS_ENCODING_INT)
6059 value = (long)hval->ptr;
6060 else
6061 redisAssert(1 != 1);
6062 }
6063
6064 value += incr;
6065 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6066 hval = tryObjectEncoding(hval);
6067 if (dictReplace(o->ptr,c->argv[2],hval)) {
6068 incrRefCount(c->argv[2]);
6069 }
6070 }
6071
6072 server.dirty++;
6073 addReplyLongLong(c, value);
6074 }
6075
6076 static void hgetCommand(redisClient *c) {
6077 robj *o;
6078
6079 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6080 checkType(c,o,REDIS_HASH)) return;
6081
6082 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6083 unsigned char *zm = o->ptr;
6084 unsigned char *val;
6085 unsigned int vlen;
6086 robj *field;
6087
6088 field = getDecodedObject(c->argv[2]);
6089 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6090 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6091 addReplySds(c,sdsnewlen(val,vlen));
6092 addReply(c,shared.crlf);
6093 decrRefCount(field);
6094 return;
6095 } else {
6096 addReply(c,shared.nullbulk);
6097 decrRefCount(field);
6098 return;
6099 }
6100 } else {
6101 struct dictEntry *de;
6102
6103 de = dictFind(o->ptr,c->argv[2]);
6104 if (de == NULL) {
6105 addReply(c,shared.nullbulk);
6106 } else {
6107 robj *e = dictGetEntryVal(de);
6108
6109 addReplyBulk(c,e);
6110 }
6111 }
6112 }
6113
6114 static void hdelCommand(redisClient *c) {
6115 robj *o;
6116 int deleted = 0;
6117
6118 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6119 checkType(c,o,REDIS_HASH)) return;
6120
6121 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6122 robj *field = getDecodedObject(c->argv[2]);
6123
6124 o->ptr = zipmapDel((unsigned char*) o->ptr,
6125 (unsigned char*) field->ptr,
6126 sdslen(field->ptr), &deleted);
6127 decrRefCount(field);
6128 if (zipmapLen((unsigned char*) o->ptr) == 0)
6129 deleteKey(c->db,c->argv[1]);
6130 } else {
6131 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6132 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6133 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6134 }
6135 if (deleted) server.dirty++;
6136 addReply(c,deleted ? shared.cone : shared.czero);
6137 }
6138
6139 static void hlenCommand(redisClient *c) {
6140 robj *o;
6141 unsigned long len;
6142
6143 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6144 checkType(c,o,REDIS_HASH)) return;
6145
6146 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6147 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6148 addReplyUlong(c,len);
6149 }
6150
6151 #define REDIS_GETALL_KEYS 1
6152 #define REDIS_GETALL_VALS 2
6153 static void genericHgetallCommand(redisClient *c, int flags) {
6154 robj *o, *lenobj;
6155 unsigned long count = 0;
6156
6157 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6158 || checkType(c,o,REDIS_HASH)) return;
6159
6160 lenobj = createObject(REDIS_STRING,NULL);
6161 addReply(c,lenobj);
6162 decrRefCount(lenobj);
6163
6164 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6165 unsigned char *p = zipmapRewind(o->ptr);
6166 unsigned char *field, *val;
6167 unsigned int flen, vlen;
6168
6169 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6170 robj *aux;
6171
6172 if (flags & REDIS_GETALL_KEYS) {
6173 aux = createStringObject((char*)field,flen);
6174 addReplyBulk(c,aux);
6175 decrRefCount(aux);
6176 count++;
6177 }
6178 if (flags & REDIS_GETALL_VALS) {
6179 aux = createStringObject((char*)val,vlen);
6180 addReplyBulk(c,aux);
6181 decrRefCount(aux);
6182 count++;
6183 }
6184 }
6185 } else {
6186 dictIterator *di = dictGetIterator(o->ptr);
6187 dictEntry *de;
6188
6189 while((de = dictNext(di)) != NULL) {
6190 robj *fieldobj = dictGetEntryKey(de);
6191 robj *valobj = dictGetEntryVal(de);
6192
6193 if (flags & REDIS_GETALL_KEYS) {
6194 addReplyBulk(c,fieldobj);
6195 count++;
6196 }
6197 if (flags & REDIS_GETALL_VALS) {
6198 addReplyBulk(c,valobj);
6199 count++;
6200 }
6201 }
6202 dictReleaseIterator(di);
6203 }
6204 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6205 }
6206
6207 static void hkeysCommand(redisClient *c) {
6208 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6209 }
6210
6211 static void hvalsCommand(redisClient *c) {
6212 genericHgetallCommand(c,REDIS_GETALL_VALS);
6213 }
6214
6215 static void hgetallCommand(redisClient *c) {
6216 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6217 }
6218
6219 static void hexistsCommand(redisClient *c) {
6220 robj *o;
6221 int exists = 0;
6222
6223 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6224 checkType(c,o,REDIS_HASH)) return;
6225
6226 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6227 robj *field;
6228 unsigned char *zm = o->ptr;
6229
6230 field = getDecodedObject(c->argv[2]);
6231 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6232 decrRefCount(field);
6233 } else {
6234 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6235 }
6236 addReply(c,exists ? shared.cone : shared.czero);
6237 }
6238
6239 static void convertToRealHash(robj *o) {
6240 unsigned char *key, *val, *p, *zm = o->ptr;
6241 unsigned int klen, vlen;
6242 dict *dict = dictCreate(&hashDictType,NULL);
6243
6244 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6245 p = zipmapRewind(zm);
6246 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6247 robj *keyobj, *valobj;
6248
6249 keyobj = createStringObject((char*)key,klen);
6250 valobj = createStringObject((char*)val,vlen);
6251 keyobj = tryObjectEncoding(keyobj);
6252 valobj = tryObjectEncoding(valobj);
6253 dictAdd(dict,keyobj,valobj);
6254 }
6255 o->encoding = REDIS_ENCODING_HT;
6256 o->ptr = dict;
6257 zfree(zm);
6258 }
6259
6260 /* ========================= Non type-specific commands ==================== */
6261
6262 static void flushdbCommand(redisClient *c) {
6263 server.dirty += dictSize(c->db->dict);
6264 dictEmpty(c->db->dict);
6265 dictEmpty(c->db->expires);
6266 addReply(c,shared.ok);
6267 }
6268
6269 static void flushallCommand(redisClient *c) {
6270 server.dirty += emptyDb();
6271 addReply(c,shared.ok);
6272 if (server.bgsavechildpid != -1) {
6273 kill(server.bgsavechildpid,SIGKILL);
6274 rdbRemoveTempFile(server.bgsavechildpid);
6275 }
6276 rdbSave(server.dbfilename);
6277 server.dirty++;
6278 }
6279
6280 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6281 redisSortOperation *so = zmalloc(sizeof(*so));
6282 so->type = type;
6283 so->pattern = pattern;
6284 return so;
6285 }
6286
6287 /* Return the value associated to the key with a name obtained
6288 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6289 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6290 char *p;
6291 sds spat, ssub;
6292 robj keyobj;
6293 int prefixlen, sublen, postfixlen;
6294 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6295 struct {
6296 long len;
6297 long free;
6298 char buf[REDIS_SORTKEY_MAX+1];
6299 } keyname;
6300
6301 /* If the pattern is "#" return the substitution object itself in order
6302 * to implement the "SORT ... GET #" feature. */
6303 spat = pattern->ptr;
6304 if (spat[0] == '#' && spat[1] == '\0') {
6305 return subst;
6306 }
6307
6308 /* The substitution object may be specially encoded. If so we create
6309 * a decoded object on the fly. Otherwise getDecodedObject will just
6310 * increment the ref count, that we'll decrement later. */
6311 subst = getDecodedObject(subst);
6312
6313 ssub = subst->ptr;
6314 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6315 p = strchr(spat,'*');
6316 if (!p) {
6317 decrRefCount(subst);
6318 return NULL;
6319 }
6320
6321 prefixlen = p-spat;
6322 sublen = sdslen(ssub);
6323 postfixlen = sdslen(spat)-(prefixlen+1);
6324 memcpy(keyname.buf,spat,prefixlen);
6325 memcpy(keyname.buf+prefixlen,ssub,sublen);
6326 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6327 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6328 keyname.len = prefixlen+sublen+postfixlen;
6329
6330 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6331 decrRefCount(subst);
6332
6333 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6334 return lookupKeyRead(db,&keyobj);
6335 }
6336
6337 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6338 * the additional parameter is not standard but a BSD-specific we have to
6339 * pass sorting parameters via the global 'server' structure */
6340 static int sortCompare(const void *s1, const void *s2) {
6341 const redisSortObject *so1 = s1, *so2 = s2;
6342 int cmp;
6343
6344 if (!server.sort_alpha) {
6345 /* Numeric sorting. Here it's trivial as we precomputed scores */
6346 if (so1->u.score > so2->u.score) {
6347 cmp = 1;
6348 } else if (so1->u.score < so2->u.score) {
6349 cmp = -1;
6350 } else {
6351 cmp = 0;
6352 }
6353 } else {
6354 /* Alphanumeric sorting */
6355 if (server.sort_bypattern) {
6356 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6357 /* At least one compare object is NULL */
6358 if (so1->u.cmpobj == so2->u.cmpobj)
6359 cmp = 0;
6360 else if (so1->u.cmpobj == NULL)
6361 cmp = -1;
6362 else
6363 cmp = 1;
6364 } else {
6365 /* We have both the objects, use strcoll */
6366 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6367 }
6368 } else {
6369 /* Compare elements directly */
6370 robj *dec1, *dec2;
6371
6372 dec1 = getDecodedObject(so1->obj);
6373 dec2 = getDecodedObject(so2->obj);
6374 cmp = strcoll(dec1->ptr,dec2->ptr);
6375 decrRefCount(dec1);
6376 decrRefCount(dec2);
6377 }
6378 }
6379 return server.sort_desc ? -cmp : cmp;
6380 }
6381
6382 /* The SORT command is the most complex command in Redis. Warning: this code
6383 * is optimized for speed and a bit less for readability */
6384 static void sortCommand(redisClient *c) {
6385 list *operations;
6386 int outputlen = 0;
6387 int desc = 0, alpha = 0;
6388 int limit_start = 0, limit_count = -1, start, end;
6389 int j, dontsort = 0, vectorlen;
6390 int getop = 0; /* GET operation counter */
6391 robj *sortval, *sortby = NULL, *storekey = NULL;
6392 redisSortObject *vector; /* Resulting vector to sort */
6393
6394 /* Lookup the key to sort. It must be of the right types */
6395 sortval = lookupKeyRead(c->db,c->argv[1]);
6396 if (sortval == NULL) {
6397 addReply(c,shared.nullmultibulk);
6398 return;
6399 }
6400 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6401 sortval->type != REDIS_ZSET)
6402 {
6403 addReply(c,shared.wrongtypeerr);
6404 return;
6405 }
6406
6407 /* Create a list of operations to perform for every sorted element.
6408 * Operations can be GET/DEL/INCR/DECR */
6409 operations = listCreate();
6410 listSetFreeMethod(operations,zfree);
6411 j = 2;
6412
6413 /* Now we need to protect sortval incrementing its count, in the future
6414 * SORT may have options able to overwrite/delete keys during the sorting
6415 * and the sorted key itself may get destroied */
6416 incrRefCount(sortval);
6417
6418 /* The SORT command has an SQL-alike syntax, parse it */
6419 while(j < c->argc) {
6420 int leftargs = c->argc-j-1;
6421 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6422 desc = 0;
6423 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6424 desc = 1;
6425 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6426 alpha = 1;
6427 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6428 limit_start = atoi(c->argv[j+1]->ptr);
6429 limit_count = atoi(c->argv[j+2]->ptr);
6430 j+=2;
6431 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6432 storekey = c->argv[j+1];
6433 j++;
6434 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6435 sortby = c->argv[j+1];
6436 /* If the BY pattern does not contain '*', i.e. it is constant,
6437 * we don't need to sort nor to lookup the weight keys. */
6438 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6439 j++;
6440 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6441 listAddNodeTail(operations,createSortOperation(
6442 REDIS_SORT_GET,c->argv[j+1]));
6443 getop++;
6444 j++;
6445 } else {
6446 decrRefCount(sortval);
6447 listRelease(operations);
6448 addReply(c,shared.syntaxerr);
6449 return;
6450 }
6451 j++;
6452 }
6453
6454 /* Load the sorting vector with all the objects to sort */
6455 switch(sortval->type) {
6456 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6457 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6458 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6459 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6460 }
6461 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6462 j = 0;
6463
6464 if (sortval->type == REDIS_LIST) {
6465 list *list = sortval->ptr;
6466 listNode *ln;
6467 listIter li;
6468
6469 listRewind(list,&li);
6470 while((ln = listNext(&li))) {
6471 robj *ele = ln->value;
6472 vector[j].obj = ele;
6473 vector[j].u.score = 0;
6474 vector[j].u.cmpobj = NULL;
6475 j++;
6476 }
6477 } else {
6478 dict *set;
6479 dictIterator *di;
6480 dictEntry *setele;
6481
6482 if (sortval->type == REDIS_SET) {
6483 set = sortval->ptr;
6484 } else {
6485 zset *zs = sortval->ptr;
6486 set = zs->dict;
6487 }
6488
6489 di = dictGetIterator(set);
6490 while((setele = dictNext(di)) != NULL) {
6491 vector[j].obj = dictGetEntryKey(setele);
6492 vector[j].u.score = 0;
6493 vector[j].u.cmpobj = NULL;
6494 j++;
6495 }
6496 dictReleaseIterator(di);
6497 }
6498 redisAssert(j == vectorlen);
6499
6500 /* Now it's time to load the right scores in the sorting vector */
6501 if (dontsort == 0) {
6502 for (j = 0; j < vectorlen; j++) {
6503 if (sortby) {
6504 robj *byval;
6505
6506 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6507 if (!byval || byval->type != REDIS_STRING) continue;
6508 if (alpha) {
6509 vector[j].u.cmpobj = getDecodedObject(byval);
6510 } else {
6511 if (byval->encoding == REDIS_ENCODING_RAW) {
6512 vector[j].u.score = strtod(byval->ptr,NULL);
6513 } else {
6514 /* Don't need to decode the object if it's
6515 * integer-encoded (the only encoding supported) so
6516 * far. We can just cast it */
6517 if (byval->encoding == REDIS_ENCODING_INT) {
6518 vector[j].u.score = (long)byval->ptr;
6519 } else
6520 redisAssert(1 != 1);
6521 }
6522 }
6523 } else {
6524 if (!alpha) {
6525 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6526 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6527 else {
6528 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6529 vector[j].u.score = (long) vector[j].obj->ptr;
6530 else
6531 redisAssert(1 != 1);
6532 }
6533 }
6534 }
6535 }
6536 }
6537
6538 /* We are ready to sort the vector... perform a bit of sanity check
6539 * on the LIMIT option too. We'll use a partial version of quicksort. */
6540 start = (limit_start < 0) ? 0 : limit_start;
6541 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6542 if (start >= vectorlen) {
6543 start = vectorlen-1;
6544 end = vectorlen-2;
6545 }
6546 if (end >= vectorlen) end = vectorlen-1;
6547
6548 if (dontsort == 0) {
6549 server.sort_desc = desc;
6550 server.sort_alpha = alpha;
6551 server.sort_bypattern = sortby ? 1 : 0;
6552 if (sortby && (start != 0 || end != vectorlen-1))
6553 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6554 else
6555 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6556 }
6557
6558 /* Send command output to the output buffer, performing the specified
6559 * GET/DEL/INCR/DECR operations if any. */
6560 outputlen = getop ? getop*(end-start+1) : end-start+1;
6561 if (storekey == NULL) {
6562 /* STORE option not specified, sent the sorting result to client */
6563 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6564 for (j = start; j <= end; j++) {
6565 listNode *ln;
6566 listIter li;
6567
6568 if (!getop) addReplyBulk(c,vector[j].obj);
6569 listRewind(operations,&li);
6570 while((ln = listNext(&li))) {
6571 redisSortOperation *sop = ln->value;
6572 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6573 vector[j].obj);
6574
6575 if (sop->type == REDIS_SORT_GET) {
6576 if (!val || val->type != REDIS_STRING) {
6577 addReply(c,shared.nullbulk);
6578 } else {
6579 addReplyBulk(c,val);
6580 }
6581 } else {
6582 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6583 }
6584 }
6585 }
6586 } else {
6587 robj *listObject = createListObject();
6588 list *listPtr = (list*) listObject->ptr;
6589
6590 /* STORE option specified, set the sorting result as a List object */
6591 for (j = start; j <= end; j++) {
6592 listNode *ln;
6593 listIter li;
6594
6595 if (!getop) {
6596 listAddNodeTail(listPtr,vector[j].obj);
6597 incrRefCount(vector[j].obj);
6598 }
6599 listRewind(operations,&li);
6600 while((ln = listNext(&li))) {
6601 redisSortOperation *sop = ln->value;
6602 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6603 vector[j].obj);
6604
6605 if (sop->type == REDIS_SORT_GET) {
6606 if (!val || val->type != REDIS_STRING) {
6607 listAddNodeTail(listPtr,createStringObject("",0));
6608 } else {
6609 listAddNodeTail(listPtr,val);
6610 incrRefCount(val);
6611 }
6612 } else {
6613 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6614 }
6615 }
6616 }
6617 if (dictReplace(c->db->dict,storekey,listObject)) {
6618 incrRefCount(storekey);
6619 }
6620 /* Note: we add 1 because the DB is dirty anyway since even if the
6621 * SORT result is empty a new key is set and maybe the old content
6622 * replaced. */
6623 server.dirty += 1+outputlen;
6624 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6625 }
6626
6627 /* Cleanup */
6628 decrRefCount(sortval);
6629 listRelease(operations);
6630 for (j = 0; j < vectorlen; j++) {
6631 if (sortby && alpha && vector[j].u.cmpobj)
6632 decrRefCount(vector[j].u.cmpobj);
6633 }
6634 zfree(vector);
6635 }
6636
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer and is always written: values below 1024 are
 * printed as an integer followed by 'B', larger values are printed with
 * two decimals and a K, M, G, T or P suffix (powers of 1024), and values
 * too big even for P fall back to the plain byte count. The caller has to
 * provide room for at least 22 bytes (20 digits, the suffix and the
 * terminator). */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
        return;
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Never leave the buffer untouched: the caller prints it */
        sprintf(s,"%lluB",n);
    }
}
6657
6658 /* Create the string returned by the INFO command. This is decoupled
6659 * by the INFO command itself as we need to report the same information
6660 * on memory corruption problems. */
6661 static sds genRedisInfoString(void) {
6662 sds info;
6663 time_t uptime = time(NULL)-server.stat_starttime;
6664 int j;
6665 char hmem[64];
6666
6667 bytesToHuman(hmem,zmalloc_used_memory());
6668 info = sdscatprintf(sdsempty(),
6669 "redis_version:%s\r\n"
6670 "arch_bits:%s\r\n"
6671 "multiplexing_api:%s\r\n"
6672 "process_id:%ld\r\n"
6673 "uptime_in_seconds:%ld\r\n"
6674 "uptime_in_days:%ld\r\n"
6675 "connected_clients:%d\r\n"
6676 "connected_slaves:%d\r\n"
6677 "blocked_clients:%d\r\n"
6678 "used_memory:%zu\r\n"
6679 "used_memory_human:%s\r\n"
6680 "changes_since_last_save:%lld\r\n"
6681 "bgsave_in_progress:%d\r\n"
6682 "last_save_time:%ld\r\n"
6683 "bgrewriteaof_in_progress:%d\r\n"
6684 "total_connections_received:%lld\r\n"
6685 "total_commands_processed:%lld\r\n"
6686 "expired_keys:%lld\r\n"
6687 "hash_max_zipmap_entries:%ld\r\n"
6688 "hash_max_zipmap_value:%ld\r\n"
6689 "pubsub_channels:%ld\r\n"
6690 "pubsub_patterns:%u\r\n"
6691 "vm_enabled:%d\r\n"
6692 "role:%s\r\n"
6693 ,REDIS_VERSION,
6694 (sizeof(long) == 8) ? "64" : "32",
6695 aeGetApiName(),
6696 (long) getpid(),
6697 uptime,
6698 uptime/(3600*24),
6699 listLength(server.clients)-listLength(server.slaves),
6700 listLength(server.slaves),
6701 server.blpop_blocked_clients,
6702 zmalloc_used_memory(),
6703 hmem,
6704 server.dirty,
6705 server.bgsavechildpid != -1,
6706 server.lastsave,
6707 server.bgrewritechildpid != -1,
6708 server.stat_numconnections,
6709 server.stat_numcommands,
6710 server.stat_expiredkeys,
6711 server.hash_max_zipmap_entries,
6712 server.hash_max_zipmap_value,
6713 dictSize(server.pubsub_channels),
6714 listLength(server.pubsub_patterns),
6715 server.vm_enabled != 0,
6716 server.masterhost == NULL ? "master" : "slave"
6717 );
6718 if (server.masterhost) {
6719 info = sdscatprintf(info,
6720 "master_host:%s\r\n"
6721 "master_port:%d\r\n"
6722 "master_link_status:%s\r\n"
6723 "master_last_io_seconds_ago:%d\r\n"
6724 ,server.masterhost,
6725 server.masterport,
6726 (server.replstate == REDIS_REPL_CONNECTED) ?
6727 "up" : "down",
6728 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6729 );
6730 }
6731 if (server.vm_enabled) {
6732 lockThreadedIO();
6733 info = sdscatprintf(info,
6734 "vm_conf_max_memory:%llu\r\n"
6735 "vm_conf_page_size:%llu\r\n"
6736 "vm_conf_pages:%llu\r\n"
6737 "vm_stats_used_pages:%llu\r\n"
6738 "vm_stats_swapped_objects:%llu\r\n"
6739 "vm_stats_swappin_count:%llu\r\n"
6740 "vm_stats_swappout_count:%llu\r\n"
6741 "vm_stats_io_newjobs_len:%lu\r\n"
6742 "vm_stats_io_processing_len:%lu\r\n"
6743 "vm_stats_io_processed_len:%lu\r\n"
6744 "vm_stats_io_active_threads:%lu\r\n"
6745 "vm_stats_blocked_clients:%lu\r\n"
6746 ,(unsigned long long) server.vm_max_memory,
6747 (unsigned long long) server.vm_page_size,
6748 (unsigned long long) server.vm_pages,
6749 (unsigned long long) server.vm_stats_used_pages,
6750 (unsigned long long) server.vm_stats_swapped_objects,
6751 (unsigned long long) server.vm_stats_swapins,
6752 (unsigned long long) server.vm_stats_swapouts,
6753 (unsigned long) listLength(server.io_newjobs),
6754 (unsigned long) listLength(server.io_processing),
6755 (unsigned long) listLength(server.io_processed),
6756 (unsigned long) server.io_active_threads,
6757 (unsigned long) server.vm_blocked_clients
6758 );
6759 unlockThreadedIO();
6760 }
6761 for (j = 0; j < server.dbnum; j++) {
6762 long long keys, vkeys;
6763
6764 keys = dictSize(server.db[j].dict);
6765 vkeys = dictSize(server.db[j].expires);
6766 if (keys || vkeys) {
6767 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6768 j, keys, vkeys);
6769 }
6770 }
6771 return info;
6772 }
6773
6774 static void infoCommand(redisClient *c) {
6775 sds info = genRedisInfoString();
6776 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6777 (unsigned long)sdslen(info)));
6778 addReplySds(c,info);
6779 addReply(c,shared.crlf);
6780 }
6781
6782 static void monitorCommand(redisClient *c) {
6783 /* ignore MONITOR if aleady slave or in monitor mode */
6784 if (c->flags & REDIS_SLAVE) return;
6785
6786 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6787 c->slaveseldb = 0;
6788 listAddNodeTail(server.monitors,c);
6789 addReply(c,shared.ok);
6790 }
6791
6792 /* ================================= Expire ================================= */
6793 static int removeExpire(redisDb *db, robj *key) {
6794 if (dictDelete(db->expires,key) == DICT_OK) {
6795 return 1;
6796 } else {
6797 return 0;
6798 }
6799 }
6800
6801 static int setExpire(redisDb *db, robj *key, time_t when) {
6802 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6803 return 0;
6804 } else {
6805 incrRefCount(key);
6806 return 1;
6807 }
6808 }
6809
6810 /* Return the expire time of the specified key, or -1 if no expire
6811 * is associated with this key (i.e. the key is non volatile) */
6812 static time_t getExpire(redisDb *db, robj *key) {
6813 dictEntry *de;
6814
6815 /* No expire? return ASAP */
6816 if (dictSize(db->expires) == 0 ||
6817 (de = dictFind(db->expires,key)) == NULL) return -1;
6818
6819 return (time_t) dictGetEntryVal(de);
6820 }
6821
6822 static int expireIfNeeded(redisDb *db, robj *key) {
6823 time_t when;
6824 dictEntry *de;
6825
6826 /* No expire? return ASAP */
6827 if (dictSize(db->expires) == 0 ||
6828 (de = dictFind(db->expires,key)) == NULL) return 0;
6829
6830 /* Lookup the expire */
6831 when = (time_t) dictGetEntryVal(de);
6832 if (time(NULL) <= when) return 0;
6833
6834 /* Delete the key */
6835 dictDelete(db->expires,key);
6836 server.stat_expiredkeys++;
6837 return dictDelete(db->dict,key) == DICT_OK;
6838 }
6839
6840 static int deleteIfVolatile(redisDb *db, robj *key) {
6841 dictEntry *de;
6842
6843 /* No expire? return ASAP */
6844 if (dictSize(db->expires) == 0 ||
6845 (de = dictFind(db->expires,key)) == NULL) return 0;
6846
6847 /* Delete the key */
6848 server.dirty++;
6849 server.stat_expiredkeys++;
6850 dictDelete(db->expires,key);
6851 return dictDelete(db->dict,key) == DICT_OK;
6852 }
6853
6854 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6855 dictEntry *de;
6856
6857 de = dictFind(c->db->dict,key);
6858 if (de == NULL) {
6859 addReply(c,shared.czero);
6860 return;
6861 }
6862 if (seconds < 0) {
6863 if (deleteKey(c->db,key)) server.dirty++;
6864 addReply(c, shared.cone);
6865 return;
6866 } else {
6867 time_t when = time(NULL)+seconds;
6868 if (setExpire(c->db,key,when)) {
6869 addReply(c,shared.cone);
6870 server.dirty++;
6871 } else {
6872 addReply(c,shared.czero);
6873 }
6874 return;
6875 }
6876 }
6877
6878 static void expireCommand(redisClient *c) {
6879 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6880 }
6881
6882 static void expireatCommand(redisClient *c) {
6883 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6884 }
6885
6886 static void ttlCommand(redisClient *c) {
6887 time_t expire;
6888 int ttl = -1;
6889
6890 expire = getExpire(c->db,c->argv[1]);
6891 if (expire != -1) {
6892 ttl = (int) (expire-time(NULL));
6893 if (ttl < 0) ttl = -1;
6894 }
6895 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6896 }
6897
6898 /* ================================ MULTI/EXEC ============================== */
6899
6900 /* Client state initialization for MULTI/EXEC */
6901 static void initClientMultiState(redisClient *c) {
6902 c->mstate.commands = NULL;
6903 c->mstate.count = 0;
6904 }
6905
6906 /* Release all the resources associated with MULTI/EXEC state */
6907 static void freeClientMultiState(redisClient *c) {
6908 int j;
6909
6910 for (j = 0; j < c->mstate.count; j++) {
6911 int i;
6912 multiCmd *mc = c->mstate.commands+j;
6913
6914 for (i = 0; i < mc->argc; i++)
6915 decrRefCount(mc->argv[i]);
6916 zfree(mc->argv);
6917 }
6918 zfree(c->mstate.commands);
6919 }
6920
6921 /* Add a new command into the MULTI commands queue */
6922 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6923 multiCmd *mc;
6924 int j;
6925
6926 c->mstate.commands = zrealloc(c->mstate.commands,
6927 sizeof(multiCmd)*(c->mstate.count+1));
6928 mc = c->mstate.commands+c->mstate.count;
6929 mc->cmd = cmd;
6930 mc->argc = c->argc;
6931 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6932 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6933 for (j = 0; j < c->argc; j++)
6934 incrRefCount(mc->argv[j]);
6935 c->mstate.count++;
6936 }
6937
6938 static void multiCommand(redisClient *c) {
6939 c->flags |= REDIS_MULTI;
6940 addReply(c,shared.ok);
6941 }
6942
6943 static void discardCommand(redisClient *c) {
6944 if (!(c->flags & REDIS_MULTI)) {
6945 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6946 return;
6947 }
6948
6949 freeClientMultiState(c);
6950 initClientMultiState(c);
6951 c->flags &= (~REDIS_MULTI);
6952 addReply(c,shared.ok);
6953 }
6954
6955 static void execCommand(redisClient *c) {
6956 int j;
6957 robj **orig_argv;
6958 int orig_argc;
6959
6960 if (!(c->flags & REDIS_MULTI)) {
6961 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6962 return;
6963 }
6964
6965 orig_argv = c->argv;
6966 orig_argc = c->argc;
6967 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6968 for (j = 0; j < c->mstate.count; j++) {
6969 c->argc = c->mstate.commands[j].argc;
6970 c->argv = c->mstate.commands[j].argv;
6971 call(c,c->mstate.commands[j].cmd);
6972 }
6973 c->argv = orig_argv;
6974 c->argc = orig_argc;
6975 freeClientMultiState(c);
6976 initClientMultiState(c);
6977 c->flags &= (~REDIS_MULTI);
6978 }
6979
6980 /* =========================== Blocking Operations ========================= */
6981
6982 /* Currently Redis blocking operations support is limited to list POP ops,
6983 * so the current implementation is not fully generic, but it is also not
6984 * completely specific so it will not require a rewrite to support new
6985 * kind of blocking operations in the future.
6986 *
6987 * Still it's important to note that list blocking operations can be already
6988 * used as a notification mechanism in order to implement other blocking
6989 * operations at application level, so there must be a very strong evidence
6990 * of usefulness and generality before new blocking operations are implemented.
6991 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7006 *
7007 * The above comment and the source code should be enough in order to understand
7008 * the implementation and modify / fix it later.
7009 */
7010
7011 /* Set a client in blocking mode for the specified key, with the specified
7012 * timeout */
7013 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7014 dictEntry *de;
7015 list *l;
7016 int j;
7017
7018 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7019 c->blockingkeysnum = numkeys;
7020 c->blockingto = timeout;
7021 for (j = 0; j < numkeys; j++) {
7022 /* Add the key in the client structure, to map clients -> keys */
7023 c->blockingkeys[j] = keys[j];
7024 incrRefCount(keys[j]);
7025
7026 /* And in the other "side", to map keys -> clients */
7027 de = dictFind(c->db->blockingkeys,keys[j]);
7028 if (de == NULL) {
7029 int retval;
7030
7031 /* For every key we take a list of clients blocked for it */
7032 l = listCreate();
7033 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7034 incrRefCount(keys[j]);
7035 assert(retval == DICT_OK);
7036 } else {
7037 l = dictGetEntryVal(de);
7038 }
7039 listAddNodeTail(l,c);
7040 }
7041 /* Mark the client as a blocked client */
7042 c->flags |= REDIS_BLOCKED;
7043 server.blpop_blocked_clients++;
7044 }
7045
7046 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7047 static void unblockClientWaitingData(redisClient *c) {
7048 dictEntry *de;
7049 list *l;
7050 int j;
7051
7052 assert(c->blockingkeys != NULL);
7053 /* The client may wait for multiple keys, so unblock it for every key. */
7054 for (j = 0; j < c->blockingkeysnum; j++) {
7055 /* Remove this client from the list of clients waiting for this key. */
7056 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7057 assert(de != NULL);
7058 l = dictGetEntryVal(de);
7059 listDelNode(l,listSearchKey(l,c));
7060 /* If the list is empty we need to remove it to avoid wasting memory */
7061 if (listLength(l) == 0)
7062 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7063 decrRefCount(c->blockingkeys[j]);
7064 }
7065 /* Cleanup the client structure */
7066 zfree(c->blockingkeys);
7067 c->blockingkeys = NULL;
7068 c->flags &= (~REDIS_BLOCKED);
7069 server.blpop_blocked_clients--;
7070 /* We want to process data if there is some command waiting
7071 * in the input buffer. Note that this is safe even if
7072 * unblockClientWaitingData() gets called from freeClient() because
7073 * freeClient() will be smart enough to call this function
7074 * *after* c->querybuf was set to NULL. */
7075 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7076 }
7077
7078 /* This should be called from any function PUSHing into lists.
7079 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7080 * 'ele' is the element pushed.
7081 *
7082 * If the function returns 0 there was no client waiting for a list push
7083 * against this key.
7084 *
7085 * If the function returns 1 there was a client waiting for a list push
7086 * against this key, the element was passed to this client thus it's not
7087 * needed to actually add it to the list and the caller should return asap. */
7088 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7089 struct dictEntry *de;
7090 redisClient *receiver;
7091 list *l;
7092 listNode *ln;
7093
7094 de = dictFind(c->db->blockingkeys,key);
7095 if (de == NULL) return 0;
7096 l = dictGetEntryVal(de);
7097 ln = listFirst(l);
7098 assert(ln != NULL);
7099 receiver = ln->value;
7100
7101 addReplySds(receiver,sdsnew("*2\r\n"));
7102 addReplyBulk(receiver,key);
7103 addReplyBulk(receiver,ele);
7104 unblockClientWaitingData(receiver);
7105 return 1;
7106 }
7107
7108 /* Blocking RPOP/LPOP */
7109 static void blockingPopGenericCommand(redisClient *c, int where) {
7110 robj *o;
7111 time_t timeout;
7112 int j;
7113
7114 for (j = 1; j < c->argc-1; j++) {
7115 o = lookupKeyWrite(c->db,c->argv[j]);
7116 if (o != NULL) {
7117 if (o->type != REDIS_LIST) {
7118 addReply(c,shared.wrongtypeerr);
7119 return;
7120 } else {
7121 list *list = o->ptr;
7122 if (listLength(list) != 0) {
7123 /* If the list contains elements fall back to the usual
7124 * non-blocking POP operation */
7125 robj *argv[2], **orig_argv;
7126 int orig_argc;
7127
7128 /* We need to alter the command arguments before to call
7129 * popGenericCommand() as the command takes a single key. */
7130 orig_argv = c->argv;
7131 orig_argc = c->argc;
7132 argv[1] = c->argv[j];
7133 c->argv = argv;
7134 c->argc = 2;
7135
7136 /* Also the return value is different, we need to output
7137 * the multi bulk reply header and the key name. The
7138 * "real" command will add the last element (the value)
7139 * for us. If this souds like an hack to you it's just
7140 * because it is... */
7141 addReplySds(c,sdsnew("*2\r\n"));
7142 addReplyBulk(c,argv[1]);
7143 popGenericCommand(c,where);
7144
7145 /* Fix the client structure with the original stuff */
7146 c->argv = orig_argv;
7147 c->argc = orig_argc;
7148 return;
7149 }
7150 }
7151 }
7152 }
7153 /* If the list is empty or the key does not exists we must block */
7154 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7155 if (timeout > 0) timeout += time(NULL);
7156 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7157 }
7158
7159 static void blpopCommand(redisClient *c) {
7160 blockingPopGenericCommand(c,REDIS_HEAD);
7161 }
7162
7163 static void brpopCommand(redisClient *c) {
7164 blockingPopGenericCommand(c,REDIS_TAIL);
7165 }
7166
7167 /* =============================== Replication ============================= */
7168
7169 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7170 ssize_t nwritten, ret = size;
7171 time_t start = time(NULL);
7172
7173 timeout++;
7174 while(size) {
7175 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7176 nwritten = write(fd,ptr,size);
7177 if (nwritten == -1) return -1;
7178 ptr += nwritten;
7179 size -= nwritten;
7180 }
7181 if ((time(NULL)-start) > timeout) {
7182 errno = ETIMEDOUT;
7183 return -1;
7184 }
7185 }
7186 return ret;
7187 }
7188
7189 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7190 ssize_t nread, totread = 0;
7191 time_t start = time(NULL);
7192
7193 timeout++;
7194 while(size) {
7195 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7196 nread = read(fd,ptr,size);
7197 if (nread == -1) return -1;
7198 ptr += nread;
7199 size -= nread;
7200 totread += nread;
7201 }
7202 if ((time(NULL)-start) > timeout) {
7203 errno = ETIMEDOUT;
7204 return -1;
7205 }
7206 }
7207 return totread;
7208 }
7209
/* Read a line from 'fd' into 'ptr', one byte at a time via syncRead(), with
 * 'timeout' passed to every single byte read.
 *
 * 'size' is the size of the buffer pointed by 'ptr': at most size-1 bytes
 * are stored, so that there is always room for the null terminator. The
 * reading stops at the first '\n', which is not stored; a '\r' right before
 * it is replaced with a null terminator too.
 *
 * Returns the number of bytes stored in the buffer (a stripped '\r' is still
 * counted), or -1 if syncRead() failed. If the buffer fills up before a
 * newline is found, what was read so far is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte less of room available: without this the loop would
         * happily write past the end of the buffer on overlong lines. */
        size--;
    }
    return nread;
}
7230
7231 static void syncCommand(redisClient *c) {
7232 /* ignore SYNC if aleady slave or in monitor mode */
7233 if (c->flags & REDIS_SLAVE) return;
7234
7235 /* SYNC can't be issued when the server has pending data to send to
7236 * the client about already issued commands. We need a fresh reply
7237 * buffer registering the differences between the BGSAVE and the current
7238 * dataset, so that we can copy to other slaves if needed. */
7239 if (listLength(c->reply) != 0) {
7240 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7241 return;
7242 }
7243
7244 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7245 /* Here we need to check if there is a background saving operation
7246 * in progress, or if it is required to start one */
7247 if (server.bgsavechildpid != -1) {
7248 /* Ok a background save is in progress. Let's check if it is a good
7249 * one for replication, i.e. if there is another slave that is
7250 * registering differences since the server forked to save */
7251 redisClient *slave;
7252 listNode *ln;
7253 listIter li;
7254
7255 listRewind(server.slaves,&li);
7256 while((ln = listNext(&li))) {
7257 slave = ln->value;
7258 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7259 }
7260 if (ln) {
7261 /* Perfect, the server is already registering differences for
7262 * another slave. Set the right state, and copy the buffer. */
7263 listRelease(c->reply);
7264 c->reply = listDup(slave->reply);
7265 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7266 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7267 } else {
7268 /* No way, we need to wait for the next BGSAVE in order to
7269 * register differences */
7270 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7271 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7272 }
7273 } else {
7274 /* Ok we don't have a BGSAVE in progress, let's start one */
7275 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7276 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7277 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7278 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7279 return;
7280 }
7281 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7282 }
7283 c->repldbfd = -1;
7284 c->flags |= REDIS_SLAVE;
7285 c->slaveseldb = 0;
7286 listAddNodeTail(server.slaves,c);
7287 return;
7288 }
7289
7290 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7291 redisClient *slave = privdata;
7292 REDIS_NOTUSED(el);
7293 REDIS_NOTUSED(mask);
7294 char buf[REDIS_IOBUF_LEN];
7295 ssize_t nwritten, buflen;
7296
7297 if (slave->repldboff == 0) {
7298 /* Write the bulk write count before to transfer the DB. In theory here
7299 * we don't know how much room there is in the output buffer of the
7300 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7301 * operations) will never be smaller than the few bytes we need. */
7302 sds bulkcount;
7303
7304 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7305 slave->repldbsize);
7306 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7307 {
7308 sdsfree(bulkcount);
7309 freeClient(slave);
7310 return;
7311 }
7312 sdsfree(bulkcount);
7313 }
7314 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7315 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7316 if (buflen <= 0) {
7317 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7318 (buflen == 0) ? "premature EOF" : strerror(errno));
7319 freeClient(slave);
7320 return;
7321 }
7322 if ((nwritten = write(fd,buf,buflen)) == -1) {
7323 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7324 strerror(errno));
7325 freeClient(slave);
7326 return;
7327 }
7328 slave->repldboff += nwritten;
7329 if (slave->repldboff == slave->repldbsize) {
7330 close(slave->repldbfd);
7331 slave->repldbfd = -1;
7332 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7333 slave->replstate = REDIS_REPL_ONLINE;
7334 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7335 sendReplyToClient, slave) == AE_ERR) {
7336 freeClient(slave);
7337 return;
7338 }
7339 addReplySds(slave,sdsempty());
7340 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7341 }
7342 }
7343
7344 /* This function is called at the end of every backgrond saving.
7345 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7346 * otherwise REDIS_ERR is passed to the function.
7347 *
7348 * The goal of this function is to handle slaves waiting for a successful
7349 * background saving in order to perform non-blocking synchronization. */
7350 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7351 listNode *ln;
7352 int startbgsave = 0;
7353 listIter li;
7354
7355 listRewind(server.slaves,&li);
7356 while((ln = listNext(&li))) {
7357 redisClient *slave = ln->value;
7358
7359 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7360 startbgsave = 1;
7361 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7362 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7363 struct redis_stat buf;
7364
7365 if (bgsaveerr != REDIS_OK) {
7366 freeClient(slave);
7367 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7368 continue;
7369 }
7370 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7371 redis_fstat(slave->repldbfd,&buf) == -1) {
7372 freeClient(slave);
7373 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7374 continue;
7375 }
7376 slave->repldboff = 0;
7377 slave->repldbsize = buf.st_size;
7378 slave->replstate = REDIS_REPL_SEND_BULK;
7379 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7380 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7381 freeClient(slave);
7382 continue;
7383 }
7384 }
7385 }
7386 if (startbgsave) {
7387 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7388 listIter li;
7389
7390 listRewind(server.slaves,&li);
7391 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7392 while((ln = listNext(&li))) {
7393 redisClient *slave = ln->value;
7394
7395 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7396 freeClient(slave);
7397 }
7398 }
7399 }
7400 }
7401
7402 static int syncWithMaster(void) {
7403 char buf[1024], tmpfile[256], authcmd[1024];
7404 long dumpsize;
7405 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7406 int dfd, maxtries = 5;
7407
7408 if (fd == -1) {
7409 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7410 strerror(errno));
7411 return REDIS_ERR;
7412 }
7413
7414 /* AUTH with the master if required. */
7415 if(server.masterauth) {
7416 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7417 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7418 close(fd);
7419 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7420 strerror(errno));
7421 return REDIS_ERR;
7422 }
7423 /* Read the AUTH result. */
7424 if (syncReadLine(fd,buf,1024,3600) == -1) {
7425 close(fd);
7426 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7427 strerror(errno));
7428 return REDIS_ERR;
7429 }
7430 if (buf[0] != '+') {
7431 close(fd);
7432 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7433 return REDIS_ERR;
7434 }
7435 }
7436
7437 /* Issue the SYNC command */
7438 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7439 close(fd);
7440 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7441 strerror(errno));
7442 return REDIS_ERR;
7443 }
7444 /* Read the bulk write count */
7445 if (syncReadLine(fd,buf,1024,3600) == -1) {
7446 close(fd);
7447 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7448 strerror(errno));
7449 return REDIS_ERR;
7450 }
7451 if (buf[0] != '$') {
7452 close(fd);
7453 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7454 return REDIS_ERR;
7455 }
7456 dumpsize = strtol(buf+1,NULL,10);
7457 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7458 /* Read the bulk write data on a temp file */
7459 while(maxtries--) {
7460 snprintf(tmpfile,256,
7461 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7462 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7463 if (dfd != -1) break;
7464 sleep(1);
7465 }
7466 if (dfd == -1) {
7467 close(fd);
7468 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7469 return REDIS_ERR;
7470 }
7471 while(dumpsize) {
7472 int nread, nwritten;
7473
7474 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7475 if (nread == -1) {
7476 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7477 strerror(errno));
7478 close(fd);
7479 close(dfd);
7480 return REDIS_ERR;
7481 }
7482 nwritten = write(dfd,buf,nread);
7483 if (nwritten == -1) {
7484 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7485 close(fd);
7486 close(dfd);
7487 return REDIS_ERR;
7488 }
7489 dumpsize -= nread;
7490 }
7491 close(dfd);
7492 if (rename(tmpfile,server.dbfilename) == -1) {
7493 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7494 unlink(tmpfile);
7495 close(fd);
7496 return REDIS_ERR;
7497 }
7498 emptyDb();
7499 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7500 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7501 close(fd);
7502 return REDIS_ERR;
7503 }
7504 server.master = createClient(fd);
7505 server.master->flags |= REDIS_MASTER;
7506 server.master->authenticated = 1;
7507 server.replstate = REDIS_REPL_CONNECTED;
7508 return REDIS_OK;
7509 }
7510
7511 static void slaveofCommand(redisClient *c) {
7512 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7513 !strcasecmp(c->argv[2]->ptr,"one")) {
7514 if (server.masterhost) {
7515 sdsfree(server.masterhost);
7516 server.masterhost = NULL;
7517 if (server.master) freeClient(server.master);
7518 server.replstate = REDIS_REPL_NONE;
7519 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7520 }
7521 } else {
7522 sdsfree(server.masterhost);
7523 server.masterhost = sdsdup(c->argv[1]->ptr);
7524 server.masterport = atoi(c->argv[2]->ptr);
7525 if (server.master) freeClient(server.master);
7526 server.replstate = REDIS_REPL_CONNECT;
7527 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7528 server.masterhost, server.masterport);
7529 }
7530 addReply(c,shared.ok);
7531 }
7532
7533 /* ============================ Maxmemory directive ======================== */
7534
7535 /* Try to free one object form the pre-allocated objects free list.
7536 * This is useful under low mem conditions as by default we take 1 million
7537 * free objects allocated. On success REDIS_OK is returned, otherwise
7538 * REDIS_ERR. */
7539 static int tryFreeOneObjectFromFreelist(void) {
7540 robj *o;
7541
7542 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7543 if (listLength(server.objfreelist)) {
7544 listNode *head = listFirst(server.objfreelist);
7545 o = listNodeValue(head);
7546 listDelNode(server.objfreelist,head);
7547 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7548 zfree(o);
7549 return REDIS_OK;
7550 } else {
7551 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7552 return REDIS_ERR;
7553 }
7554 }
7555
7556 /* This function gets called when 'maxmemory' is set on the config file to limit
7557 * the max memory used by the server, and we are out of memory.
7558 * This function will try to, in order:
7559 *
7560 * - Free objects from the free list
7561 * - Try to remove keys with an EXPIRE set
7562 *
7563 * It is not possible to free enough memory to reach used-memory < maxmemory
7564 * the server will start refusing commands that will enlarge even more the
7565 * memory usage.
7566 */
7567 static void freeMemoryIfNeeded(void) {
7568 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7569 int j, k, freed = 0;
7570
7571 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7572 for (j = 0; j < server.dbnum; j++) {
7573 int minttl = -1;
7574 robj *minkey = NULL;
7575 struct dictEntry *de;
7576
7577 if (dictSize(server.db[j].expires)) {
7578 freed = 1;
7579 /* From a sample of three keys drop the one nearest to
7580 * the natural expire */
7581 for (k = 0; k < 3; k++) {
7582 time_t t;
7583
7584 de = dictGetRandomKey(server.db[j].expires);
7585 t = (time_t) dictGetEntryVal(de);
7586 if (minttl == -1 || t < minttl) {
7587 minkey = dictGetEntryKey(de);
7588 minttl = t;
7589 }
7590 }
7591 deleteKey(server.db+j,minkey);
7592 }
7593 }
7594 if (!freed) return; /* nothing to free... */
7595 }
7596 }
7597
7598 /* ============================== Append Only file ========================== */
7599
7600 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7601 sds buf = sdsempty();
7602 int j;
7603 ssize_t nwritten;
7604 time_t now;
7605 robj *tmpargv[3];
7606
7607 /* The DB this command was targetting is not the same as the last command
7608 * we appendend. To issue a SELECT command is needed. */
7609 if (dictid != server.appendseldb) {
7610 char seldb[64];
7611
7612 snprintf(seldb,sizeof(seldb),"%d",dictid);
7613 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7614 (unsigned long)strlen(seldb),seldb);
7615 server.appendseldb = dictid;
7616 }
7617
7618 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7619 * EXPIREs into EXPIREATs calls */
7620 if (cmd->proc == expireCommand) {
7621 long when;
7622
7623 tmpargv[0] = createStringObject("EXPIREAT",8);
7624 tmpargv[1] = argv[1];
7625 incrRefCount(argv[1]);
7626 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7627 tmpargv[2] = createObject(REDIS_STRING,
7628 sdscatprintf(sdsempty(),"%ld",when));
7629 argv = tmpargv;
7630 }
7631
7632 /* Append the actual command */
7633 buf = sdscatprintf(buf,"*%d\r\n",argc);
7634 for (j = 0; j < argc; j++) {
7635 robj *o = argv[j];
7636
7637 o = getDecodedObject(o);
7638 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7639 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7640 buf = sdscatlen(buf,"\r\n",2);
7641 decrRefCount(o);
7642 }
7643
7644 /* Free the objects from the modified argv for EXPIREAT */
7645 if (cmd->proc == expireCommand) {
7646 for (j = 0; j < 3; j++)
7647 decrRefCount(argv[j]);
7648 }
7649
7650 /* We want to perform a single write. This should be guaranteed atomic
7651 * at least if the filesystem we are writing is a real physical one.
7652 * While this will save us against the server being killed I don't think
7653 * there is much to do about the whole server stopping for power problems
7654 * or alike */
7655 nwritten = write(server.appendfd,buf,sdslen(buf));
7656 if (nwritten != (signed)sdslen(buf)) {
7657 /* Ooops, we are in troubles. The best thing to do for now is
7658 * to simply exit instead to give the illusion that everything is
7659 * working as expected. */
7660 if (nwritten == -1) {
7661 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7662 } else {
7663 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7664 }
7665 exit(1);
7666 }
7667 /* If a background append only file rewriting is in progress we want to
7668 * accumulate the differences between the child DB and the current one
7669 * in a buffer, so that when the child process will do its work we
7670 * can append the differences to the new append only file. */
7671 if (server.bgrewritechildpid != -1)
7672 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7673
7674 sdsfree(buf);
7675 now = time(NULL);
7676 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7677 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7678 now-server.lastfsync > 1))
7679 {
7680 fsync(server.appendfd); /* Let's try to get this data on the disk */
7681 server.lastfsync = now;
7682 }
7683 }
7684
7685 /* In Redis commands are always executed in the context of a client, so in
7686 * order to load the append only file we need to create a fake client. */
7687 static struct redisClient *createFakeClient(void) {
7688 struct redisClient *c = zmalloc(sizeof(*c));
7689
7690 selectDb(c,0);
7691 c->fd = -1;
7692 c->querybuf = sdsempty();
7693 c->argc = 0;
7694 c->argv = NULL;
7695 c->flags = 0;
7696 /* We set the fake client as a slave waiting for the synchronization
7697 * so that Redis will not try to send replies to this client. */
7698 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7699 c->reply = listCreate();
7700 listSetFreeMethod(c->reply,decrRefCount);
7701 listSetDupMethod(c->reply,dupClientReplyValue);
7702 return c;
7703 }
7704
7705 static void freeFakeClient(struct redisClient *c) {
7706 sdsfree(c->querybuf);
7707 listRelease(c->reply);
7708 zfree(c);
7709 }
7710
7711 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7712 * error (the append only file is zero-length) REDIS_ERR is returned. On
7713 * fatal error an error message is logged and the program exists. */
7714 int loadAppendOnlyFile(char *filename) {
7715 struct redisClient *fakeClient;
7716 FILE *fp = fopen(filename,"r");
7717 struct redis_stat sb;
7718 unsigned long long loadedkeys = 0;
7719
7720 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7721 return REDIS_ERR;
7722
7723 if (fp == NULL) {
7724 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7725 exit(1);
7726 }
7727
7728 fakeClient = createFakeClient();
7729 while(1) {
7730 int argc, j;
7731 unsigned long len;
7732 robj **argv;
7733 char buf[128];
7734 sds argsds;
7735 struct redisCommand *cmd;
7736
7737 if (fgets(buf,sizeof(buf),fp) == NULL) {
7738 if (feof(fp))
7739 break;
7740 else
7741 goto readerr;
7742 }
7743 if (buf[0] != '*') goto fmterr;
7744 argc = atoi(buf+1);
7745 argv = zmalloc(sizeof(robj*)*argc);
7746 for (j = 0; j < argc; j++) {
7747 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7748 if (buf[0] != '$') goto fmterr;
7749 len = strtol(buf+1,NULL,10);
7750 argsds = sdsnewlen(NULL,len);
7751 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7752 argv[j] = createObject(REDIS_STRING,argsds);
7753 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7754 }
7755
7756 /* Command lookup */
7757 cmd = lookupCommand(argv[0]->ptr);
7758 if (!cmd) {
7759 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7760 exit(1);
7761 }
7762 /* Try object encoding */
7763 if (cmd->flags & REDIS_CMD_BULK)
7764 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
7765 /* Run the command in the context of a fake client */
7766 fakeClient->argc = argc;
7767 fakeClient->argv = argv;
7768 cmd->proc(fakeClient);
7769 /* Discard the reply objects list from the fake client */
7770 while(listLength(fakeClient->reply))
7771 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7772 /* Clean up, ready for the next command */
7773 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7774 zfree(argv);
7775 /* Handle swapping while loading big datasets when VM is on */
7776 loadedkeys++;
7777 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7778 while (zmalloc_used_memory() > server.vm_max_memory) {
7779 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7780 }
7781 }
7782 }
7783 fclose(fp);
7784 freeFakeClient(fakeClient);
7785 return REDIS_OK;
7786
7787 readerr:
7788 if (feof(fp)) {
7789 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7790 } else {
7791 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7792 }
7793 exit(1);
7794 fmterr:
7795 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7796 exit(1);
7797 }
7798
7799 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7800 static int fwriteBulkObject(FILE *fp, robj *obj) {
7801 char buf[128];
7802 int decrrc = 0;
7803
7804 /* Avoid the incr/decr ref count business if possible to help
7805 * copy-on-write (we are often in a child process when this function
7806 * is called).
7807 * Also makes sure that key objects don't get incrRefCount-ed when VM
7808 * is enabled */
7809 if (obj->encoding != REDIS_ENCODING_RAW) {
7810 obj = getDecodedObject(obj);
7811 decrrc = 1;
7812 }
7813 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7814 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7815 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7816 goto err;
7817 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7818 if (decrrc) decrRefCount(obj);
7819 return 1;
7820 err:
7821 if (decrrc) decrRefCount(obj);
7822 return 0;
7823 }
7824
/* Write the binary-safe string 's' of 'len' bytes to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. An empty string (len == 0) is
 * emitted as "$0\r\n\r\n". */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it must be formatted with %lu and not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0 even on success: skip it. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7836
/* Write the double 'd' to 'fp' as a bulk string
 * $<count>\r\n<payload>\r\n
 * where the payload is the "%.17g" representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128];
    char header[128];
    size_t plen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the announced length: hence the -2 below. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7847
/* Write the long 'l' to 'fp' as a bulk string
 * $<count>\r\n<payload>\r\n
 * where the payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128];
    char header[128];
    size_t plen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the announced length: hence the -2 below. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7858
7859 /* Write a sequence of commands able to fully rebuild the dataset into
7860 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7861 static int rewriteAppendOnlyFile(char *filename) {
7862 dictIterator *di = NULL;
7863 dictEntry *de;
7864 FILE *fp;
7865 char tmpfile[256];
7866 int j;
7867 time_t now = time(NULL);
7868
7869 /* Note that we have to use a different temp name here compared to the
7870 * one used by rewriteAppendOnlyFileBackground() function. */
7871 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7872 fp = fopen(tmpfile,"w");
7873 if (!fp) {
7874 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7875 return REDIS_ERR;
7876 }
7877 for (j = 0; j < server.dbnum; j++) {
7878 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7879 redisDb *db = server.db+j;
7880 dict *d = db->dict;
7881 if (dictSize(d) == 0) continue;
7882 di = dictGetIterator(d);
7883 if (!di) {
7884 fclose(fp);
7885 return REDIS_ERR;
7886 }
7887
7888 /* SELECT the new DB */
7889 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7890 if (fwriteBulkLong(fp,j) == 0) goto werr;
7891
7892 /* Iterate this DB writing every entry */
7893 while((de = dictNext(di)) != NULL) {
7894 robj *key, *o;
7895 time_t expiretime;
7896 int swapped;
7897
7898 key = dictGetEntryKey(de);
7899 /* If the value for this key is swapped, load a preview in memory.
7900 * We use a "swapped" flag to remember if we need to free the
7901 * value object instead to just increment the ref count anyway
7902 * in order to avoid copy-on-write of pages if we are forked() */
7903 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7904 key->storage == REDIS_VM_SWAPPING) {
7905 o = dictGetEntryVal(de);
7906 swapped = 0;
7907 } else {
7908 o = vmPreviewObject(key);
7909 swapped = 1;
7910 }
7911 expiretime = getExpire(db,key);
7912
7913 /* Save the key and associated value */
7914 if (o->type == REDIS_STRING) {
7915 /* Emit a SET command */
7916 char cmd[]="*3\r\n$3\r\nSET\r\n";
7917 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7918 /* Key and value */
7919 if (fwriteBulkObject(fp,key) == 0) goto werr;
7920 if (fwriteBulkObject(fp,o) == 0) goto werr;
7921 } else if (o->type == REDIS_LIST) {
7922 /* Emit the RPUSHes needed to rebuild the list */
7923 list *list = o->ptr;
7924 listNode *ln;
7925 listIter li;
7926
7927 listRewind(list,&li);
7928 while((ln = listNext(&li))) {
7929 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7930 robj *eleobj = listNodeValue(ln);
7931
7932 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7933 if (fwriteBulkObject(fp,key) == 0) goto werr;
7934 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7935 }
7936 } else if (o->type == REDIS_SET) {
7937 /* Emit the SADDs needed to rebuild the set */
7938 dict *set = o->ptr;
7939 dictIterator *di = dictGetIterator(set);
7940 dictEntry *de;
7941
7942 while((de = dictNext(di)) != NULL) {
7943 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7944 robj *eleobj = dictGetEntryKey(de);
7945
7946 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7947 if (fwriteBulkObject(fp,key) == 0) goto werr;
7948 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7949 }
7950 dictReleaseIterator(di);
7951 } else if (o->type == REDIS_ZSET) {
7952 /* Emit the ZADDs needed to rebuild the sorted set */
7953 zset *zs = o->ptr;
7954 dictIterator *di = dictGetIterator(zs->dict);
7955 dictEntry *de;
7956
7957 while((de = dictNext(di)) != NULL) {
7958 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7959 robj *eleobj = dictGetEntryKey(de);
7960 double *score = dictGetEntryVal(de);
7961
7962 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7963 if (fwriteBulkObject(fp,key) == 0) goto werr;
7964 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7965 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7966 }
7967 dictReleaseIterator(di);
7968 } else if (o->type == REDIS_HASH) {
7969 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7970
7971 /* Emit the HSETs needed to rebuild the hash */
7972 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7973 unsigned char *p = zipmapRewind(o->ptr);
7974 unsigned char *field, *val;
7975 unsigned int flen, vlen;
7976
7977 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7978 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7979 if (fwriteBulkObject(fp,key) == 0) goto werr;
7980 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7981 return -1;
7982 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7983 return -1;
7984 }
7985 } else {
7986 dictIterator *di = dictGetIterator(o->ptr);
7987 dictEntry *de;
7988
7989 while((de = dictNext(di)) != NULL) {
7990 robj *field = dictGetEntryKey(de);
7991 robj *val = dictGetEntryVal(de);
7992
7993 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7994 if (fwriteBulkObject(fp,key) == 0) goto werr;
7995 if (fwriteBulkObject(fp,field) == -1) return -1;
7996 if (fwriteBulkObject(fp,val) == -1) return -1;
7997 }
7998 dictReleaseIterator(di);
7999 }
8000 } else {
8001 redisAssert(0);
8002 }
8003 /* Save the expire time */
8004 if (expiretime != -1) {
8005 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8006 /* If this key is already expired skip it */
8007 if (expiretime < now) continue;
8008 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8009 if (fwriteBulkObject(fp,key) == 0) goto werr;
8010 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8011 }
8012 if (swapped) decrRefCount(o);
8013 }
8014 dictReleaseIterator(di);
8015 }
8016
8017 /* Make sure data will not remain on the OS's output buffers */
8018 fflush(fp);
8019 fsync(fileno(fp));
8020 fclose(fp);
8021
8022 /* Use RENAME to make sure the DB file is changed atomically only
8023 * if the generate DB file is ok. */
8024 if (rename(tmpfile,filename) == -1) {
8025 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8026 unlink(tmpfile);
8027 return REDIS_ERR;
8028 }
8029 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8030 return REDIS_OK;
8031
8032 werr:
8033 fclose(fp);
8034 unlink(tmpfile);
8035 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8036 if (di) dictReleaseIterator(di);
8037 return REDIS_ERR;
8038 }
8039
8040 /* This is how rewriting of the append only file in background works:
8041 *
8042 * 1) The user calls BGREWRITEAOF
8043 * 2) Redis calls this function, that forks():
8044 * 2a) the child rewrite the append only file in a temp file.
8045 * 2b) the parent accumulates differences in server.bgrewritebuf.
8046 * 3) When the child finished '2a' exists.
8047 * 4) The parent will trap the exit code, if it's OK, will append the
8048 * data accumulated into server.bgrewritebuf into the temp file, and
8049 * finally will rename(2) the temp file in the actual file name.
8050 * The the new file is reopened as the new append only file. Profit!
8051 */
8052 static int rewriteAppendOnlyFileBackground(void) {
8053 pid_t childpid;
8054
8055 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8056 if (server.vm_enabled) waitEmptyIOJobsQueue();
8057 if ((childpid = fork()) == 0) {
8058 /* Child */
8059 char tmpfile[256];
8060
8061 if (server.vm_enabled) vmReopenSwapFile();
8062 close(server.fd);
8063 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8064 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8065 _exit(0);
8066 } else {
8067 _exit(1);
8068 }
8069 } else {
8070 /* Parent */
8071 if (childpid == -1) {
8072 redisLog(REDIS_WARNING,
8073 "Can't rewrite append only file in background: fork: %s",
8074 strerror(errno));
8075 return REDIS_ERR;
8076 }
8077 redisLog(REDIS_NOTICE,
8078 "Background append only file rewriting started by pid %d",childpid);
8079 server.bgrewritechildpid = childpid;
8080 updateDictResizePolicy();
8081 /* We set appendseldb to -1 in order to force the next call to the
8082 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8083 * accumulated by the parent into server.bgrewritebuf will start
8084 * with a SELECT statement and it will be safe to merge. */
8085 server.appendseldb = -1;
8086 return REDIS_OK;
8087 }
8088 return REDIS_OK; /* unreached */
8089 }
8090
8091 static void bgrewriteaofCommand(redisClient *c) {
8092 if (server.bgrewritechildpid != -1) {
8093 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8094 return;
8095 }
8096 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8097 char *status = "+Background append only file rewriting started\r\n";
8098 addReplySds(c,sdsnew(status));
8099 } else {
8100 addReply(c,shared.err);
8101 }
8102 }
8103
/* Remove the temp file used by the background rewrite performed by the
 * child having pid 'childpid'. The result of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8110
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8131
8132 /* =================== Virtual Memory - Blocking Side ====================== */
8133
8134 /* substitute the first occurrence of '%p' with the process pid in the
8135 * swap file name. */
8136 static void expandVmSwapFilename(void) {
8137 char *p = strstr(server.vm_swap_file,"%p");
8138 sds new;
8139
8140 if (!p) return;
8141 new = sdsempty();
8142 *p = '\0';
8143 new = sdscat(new,server.vm_swap_file);
8144 new = sdscatprintf(new,"%ld",(long) getpid());
8145 new = sdscat(new,p+2);
8146 zfree(server.vm_swap_file);
8147 server.vm_swap_file = new;
8148 }
8149
8150 static void vmInit(void) {
8151 off_t totsize;
8152 int pipefds[2];
8153 size_t stacksize;
8154
8155 if (server.vm_max_threads != 0)
8156 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8157
8158 expandVmSwapFilename();
8159 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8160 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8161 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8162 }
8163 if (server.vm_fp == NULL) {
8164 redisLog(REDIS_WARNING,
8165 "Impossible to open the swap file: %s. Exiting.",
8166 strerror(errno));
8167 exit(1);
8168 }
8169 server.vm_fd = fileno(server.vm_fp);
8170 server.vm_next_page = 0;
8171 server.vm_near_pages = 0;
8172 server.vm_stats_used_pages = 0;
8173 server.vm_stats_swapped_objects = 0;
8174 server.vm_stats_swapouts = 0;
8175 server.vm_stats_swapins = 0;
8176 totsize = server.vm_pages*server.vm_page_size;
8177 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8178 if (ftruncate(server.vm_fd,totsize) == -1) {
8179 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8180 strerror(errno));
8181 exit(1);
8182 } else {
8183 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8184 }
8185 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8186 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8187 (long long) (server.vm_pages+7)/8, server.vm_pages);
8188 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8189
8190 /* Initialize threaded I/O (used by Virtual Memory) */
8191 server.io_newjobs = listCreate();
8192 server.io_processing = listCreate();
8193 server.io_processed = listCreate();
8194 server.io_ready_clients = listCreate();
8195 pthread_mutex_init(&server.io_mutex,NULL);
8196 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8197 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8198 server.io_active_threads = 0;
8199 if (pipe(pipefds) == -1) {
8200 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8201 ,strerror(errno));
8202 exit(1);
8203 }
8204 server.io_ready_pipe_read = pipefds[0];
8205 server.io_ready_pipe_write = pipefds[1];
8206 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8207 /* LZF requires a lot of stack */
8208 pthread_attr_init(&server.io_threads_attr);
8209 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8210 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8211 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8212 /* Listen for events in the threaded I/O pipe */
8213 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8214 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8215 oom("creating file event");
8216 }
8217
8218 /* Mark the page as used */
8219 static void vmMarkPageUsed(off_t page) {
8220 off_t byte = page/8;
8221 int bit = page&7;
8222 redisAssert(vmFreePage(page) == 1);
8223 server.vm_bitmap[byte] |= 1<<bit;
8224 }
8225
8226 /* Mark N contiguous pages as used, with 'page' being the first. */
8227 static void vmMarkPagesUsed(off_t page, off_t count) {
8228 off_t j;
8229
8230 for (j = 0; j < count; j++)
8231 vmMarkPageUsed(page+j);
8232 server.vm_stats_used_pages += count;
8233 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8234 (long long)count, (long long)page);
8235 }
8236
8237 /* Mark the page as free */
8238 static void vmMarkPageFree(off_t page) {
8239 off_t byte = page/8;
8240 int bit = page&7;
8241 redisAssert(vmFreePage(page) == 0);
8242 server.vm_bitmap[byte] &= ~(1<<bit);
8243 }
8244
8245 /* Mark N contiguous pages as free, with 'page' being the first. */
8246 static void vmMarkPagesFree(off_t page, off_t count) {
8247 off_t j;
8248
8249 for (j = 0; j < count; j++)
8250 vmMarkPageFree(page+j);
8251 server.vm_stats_used_pages -= count;
8252 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8253 (long long)count, (long long)page);
8254 }
8255
8256 /* Test if the page is free */
8257 static int vmFreePage(off_t page) {
8258 off_t byte = page/8;
8259 int bit = page&7;
8260 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8261 }
8262
8263 /* Find N contiguous free pages storing the first page of the cluster in *first.
8264 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8265 * REDIS_ERR is returned.
8266 *
8267 * This function uses a simple algorithm: we try to allocate
8268 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8269 * again from the start of the swap file searching for free spaces.
8270 *
8271 * If it looks pretty clear that there are no free pages near our offset
8272 * we try to find less populated places doing a forward jump of
8273 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8274 * without hurry, and then we jump again and so forth...
8275 *
8276 * This function can be improved using a free list to avoid to guess
8277 * too much, since we could collect data about freed pages.
8278 *
8279 * note: I implemented this function just after watching an episode of
8280 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8281 */
8282 static int vmFindContiguousPages(off_t *first, off_t n) {
8283 off_t base, offset = 0, since_jump = 0, numfree = 0;
8284
8285 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8286 server.vm_near_pages = 0;
8287 server.vm_next_page = 0;
8288 }
8289 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8290 base = server.vm_next_page;
8291
8292 while(offset < server.vm_pages) {
8293 off_t this = base+offset;
8294
8295 /* If we overflow, restart from page zero */
8296 if (this >= server.vm_pages) {
8297 this -= server.vm_pages;
8298 if (this == 0) {
8299 /* Just overflowed, what we found on tail is no longer
8300 * interesting, as it's no longer contiguous. */
8301 numfree = 0;
8302 }
8303 }
8304 if (vmFreePage(this)) {
8305 /* This is a free page */
8306 numfree++;
8307 /* Already got N free pages? Return to the caller, with success */
8308 if (numfree == n) {
8309 *first = this-(n-1);
8310 server.vm_next_page = this+1;
8311 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8312 return REDIS_OK;
8313 }
8314 } else {
8315 /* The current one is not a free page */
8316 numfree = 0;
8317 }
8318
8319 /* Fast-forward if the current page is not free and we already
8320 * searched enough near this place. */
8321 since_jump++;
8322 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8323 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8324 since_jump = 0;
8325 /* Note that even if we rewind after the jump, we are don't need
8326 * to make sure numfree is set to zero as we only jump *if* it
8327 * is set to zero. */
8328 } else {
8329 /* Otherwise just check the next page */
8330 offset++;
8331 }
8332 }
8333 return REDIS_ERR;
8334 }
8335
8336 /* Write the specified object at the specified page of the swap file */
8337 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8338 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8339 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8340 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8341 redisLog(REDIS_WARNING,
8342 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8343 strerror(errno));
8344 return REDIS_ERR;
8345 }
8346 rdbSaveObject(server.vm_fp,o);
8347 fflush(server.vm_fp);
8348 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8349 return REDIS_OK;
8350 }
8351
8352 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8353 * needed to later retrieve the object into the key object.
8354 * If we can't find enough contiguous empty pages to swap the object on disk
8355 * REDIS_ERR is returned. */
8356 static int vmSwapObjectBlocking(robj *key, robj *val) {
8357 off_t pages = rdbSavedObjectPages(val,NULL);
8358 off_t page;
8359
8360 assert(key->storage == REDIS_VM_MEMORY);
8361 assert(key->refcount == 1);
8362 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8363 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8364 key->vm.page = page;
8365 key->vm.usedpages = pages;
8366 key->storage = REDIS_VM_SWAPPED;
8367 key->vtype = val->type;
8368 decrRefCount(val); /* Deallocate the object from memory. */
8369 vmMarkPagesUsed(page,pages);
8370 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8371 (unsigned char*) key->ptr,
8372 (unsigned long long) page, (unsigned long long) pages);
8373 server.vm_stats_swapped_objects++;
8374 server.vm_stats_swapouts++;
8375 return REDIS_OK;
8376 }
8377
8378 static robj *vmReadObjectFromSwap(off_t page, int type) {
8379 robj *o;
8380
8381 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8382 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8383 redisLog(REDIS_WARNING,
8384 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8385 strerror(errno));
8386 _exit(1);
8387 }
8388 o = rdbLoadObject(type,server.vm_fp);
8389 if (o == NULL) {
8390 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8391 _exit(1);
8392 }
8393 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8394 return o;
8395 }
8396
8397 /* Load the value object relative to the 'key' object from swap to memory.
8398 * The newly allocated object is returned.
8399 *
8400 * If preview is true the unserialized object is returned to the caller but
8401 * no changes are made to the key object, nor the pages are marked as freed */
8402 static robj *vmGenericLoadObject(robj *key, int preview) {
8403 robj *val;
8404
8405 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8406 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8407 if (!preview) {
8408 key->storage = REDIS_VM_MEMORY;
8409 key->vm.atime = server.unixtime;
8410 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8411 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8412 (unsigned char*) key->ptr);
8413 server.vm_stats_swapped_objects--;
8414 } else {
8415 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8416 (unsigned char*) key->ptr);
8417 }
8418 server.vm_stats_swapins++;
8419 return val;
8420 }
8421
8422 /* Plain object loading, from swap to memory */
8423 static robj *vmLoadObject(robj *key) {
8424 /* If we are loading the object in background, stop it, we
8425 * need to load this object synchronously ASAP. */
8426 if (key->storage == REDIS_VM_LOADING)
8427 vmCancelThreadedIOJob(key);
8428 return vmGenericLoadObject(key,0);
8429 }
8430
8431 /* Just load the value on disk, without to modify the key.
8432 * This is useful when we want to perform some operation on the value
8433 * without to really bring it from swap to memory, like while saving the
8434 * dataset or rewriting the append only log. */
8435 static robj *vmPreviewObject(robj *key) {
8436 return vmGenericLoadObject(key,1);
8437 }
8438
8439 /* How a good candidate is this object for swapping?
8440 * The better candidate it is, the greater the returned value.
8441 *
8442 * Currently we try to perform a fast estimation of the object size in
8443 * memory, and combine it with aging informations.
8444 *
8445 * Basically swappability = idle-time * log(estimated size)
8446 *
8447 * Bigger objects are preferred over smaller objects, but not
8448 * proportionally, this is why we use the logarithm. This algorithm is
8449 * just a first try and will probably be tuned later. */
8450 static double computeObjectSwappability(robj *o) {
8451 time_t age = server.unixtime - o->vm.atime;
8452 long asize = 0;
8453 list *l;
8454 dict *d;
8455 struct dictEntry *de;
8456 int z;
8457
8458 if (age <= 0) return 0;
8459 switch(o->type) {
8460 case REDIS_STRING:
8461 if (o->encoding != REDIS_ENCODING_RAW) {
8462 asize = sizeof(*o);
8463 } else {
8464 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8465 }
8466 break;
8467 case REDIS_LIST:
8468 l = o->ptr;
8469 listNode *ln = listFirst(l);
8470
8471 asize = sizeof(list);
8472 if (ln) {
8473 robj *ele = ln->value;
8474 long elesize;
8475
8476 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8477 (sizeof(*o)+sdslen(ele->ptr)) :
8478 sizeof(*o);
8479 asize += (sizeof(listNode)+elesize)*listLength(l);
8480 }
8481 break;
8482 case REDIS_SET:
8483 case REDIS_ZSET:
8484 z = (o->type == REDIS_ZSET);
8485 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8486
8487 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8488 if (z) asize += sizeof(zset)-sizeof(dict);
8489 if (dictSize(d)) {
8490 long elesize;
8491 robj *ele;
8492
8493 de = dictGetRandomKey(d);
8494 ele = dictGetEntryKey(de);
8495 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8496 (sizeof(*o)+sdslen(ele->ptr)) :
8497 sizeof(*o);
8498 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8499 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8500 }
8501 break;
8502 case REDIS_HASH:
8503 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8504 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8505 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8506 unsigned int klen, vlen;
8507 unsigned char *key, *val;
8508
8509 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8510 klen = 0;
8511 vlen = 0;
8512 }
8513 asize = len*(klen+vlen+3);
8514 } else if (o->encoding == REDIS_ENCODING_HT) {
8515 d = o->ptr;
8516 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8517 if (dictSize(d)) {
8518 long elesize;
8519 robj *ele;
8520
8521 de = dictGetRandomKey(d);
8522 ele = dictGetEntryKey(de);
8523 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8524 (sizeof(*o)+sdslen(ele->ptr)) :
8525 sizeof(*o);
8526 ele = dictGetEntryVal(de);
8527 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8528 (sizeof(*o)+sdslen(ele->ptr)) :
8529 sizeof(*o);
8530 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8531 }
8532 }
8533 break;
8534 }
8535 return (double)age*log(1+asize);
8536 }
8537
8538 /* Try to swap an object that's a good candidate for swapping.
8539 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8540 * to swap any object at all.
8541 *
8542 * If 'usethreaded' is true, Redis will try to swap the object in background
8543 * using I/O threads. */
8544 static int vmSwapOneObject(int usethreads) {
8545 int j, i;
8546 struct dictEntry *best = NULL;
8547 double best_swappability = 0;
8548 redisDb *best_db = NULL;
8549 robj *key, *val;
8550
8551 for (j = 0; j < server.dbnum; j++) {
8552 redisDb *db = server.db+j;
8553 /* Why maxtries is set to 100?
8554 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8555 * are swappable objects */
8556 int maxtries = 100;
8557
8558 if (dictSize(db->dict) == 0) continue;
8559 for (i = 0; i < 5; i++) {
8560 dictEntry *de;
8561 double swappability;
8562
8563 if (maxtries) maxtries--;
8564 de = dictGetRandomKey(db->dict);
8565 key = dictGetEntryKey(de);
8566 val = dictGetEntryVal(de);
8567 /* Only swap objects that are currently in memory.
8568 *
8569 * Also don't swap shared objects if threaded VM is on, as we
8570 * try to ensure that the main thread does not touch the
8571 * object while the I/O thread is using it, but we can't
8572 * control other keys without adding additional mutex. */
8573 if (key->storage != REDIS_VM_MEMORY ||
8574 (server.vm_max_threads != 0 && val->refcount != 1)) {
8575 if (maxtries) i--; /* don't count this try */
8576 continue;
8577 }
8578 swappability = computeObjectSwappability(val);
8579 if (!best || swappability > best_swappability) {
8580 best = de;
8581 best_swappability = swappability;
8582 best_db = db;
8583 }
8584 }
8585 }
8586 if (best == NULL) return REDIS_ERR;
8587 key = dictGetEntryKey(best);
8588 val = dictGetEntryVal(best);
8589
8590 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8591 key->ptr, best_swappability);
8592
8593 /* Unshare the key if needed */
8594 if (key->refcount > 1) {
8595 robj *newkey = dupStringObject(key);
8596 decrRefCount(key);
8597 key = dictGetEntryKey(best) = newkey;
8598 }
8599 /* Swap it */
8600 if (usethreads) {
8601 vmSwapObjectThreaded(key,val,best_db);
8602 return REDIS_OK;
8603 } else {
8604 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8605 dictGetEntryVal(best) = NULL;
8606 return REDIS_OK;
8607 } else {
8608 return REDIS_ERR;
8609 }
8610 }
8611 }
8612
/* Swap out one object synchronously: shorthand for vmSwapOneObject(0).
 * Returns whatever vmSwapOneObject() returns (REDIS_OK or REDIS_ERR).
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype and calls with arguments are rejected at compile time. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8616
/* Swap out one object using the I/O threads: shorthand for
 * vmSwapOneObject(1). Returns whatever vmSwapOneObject() returns
 * (REDIS_OK or REDIS_ERR).
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype and calls with arguments are rejected at compile time. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8620
8621 /* Return true if it's safe to swap out objects in a given moment.
8622 * Basically we don't want to swap objects out while there is a BGSAVE
8623 * or a BGAEOREWRITE running in backgroud. */
8624 static int vmCanSwapOut(void) {
8625 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8626 }
8627
8628 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8629 * and was deleted. Otherwise 0 is returned. */
8630 static int deleteIfSwapped(redisDb *db, robj *key) {
8631 dictEntry *de;
8632 robj *foundkey;
8633
8634 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8635 foundkey = dictGetEntryKey(de);
8636 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8637 deleteKey(db,key);
8638 return 1;
8639 }
8640
8641 /* =================== Virtual Memory - Threaded I/O ======================= */
8642
8643 static void freeIOJob(iojob *j) {
8644 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8645 j->type == REDIS_IOJOB_DO_SWAP ||
8646 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8647 decrRefCount(j->val);
8648 /* We don't decrRefCount the j->key field as we did't incremented
8649 * the count creating IO Jobs. This is because the key field here is
8650 * just used as an indentifier and if a key is removed the Job should
8651 * never be touched again. */
8652 zfree(j);
8653 }
8654
8655 /* Every time a thread finished a Job, it writes a byte into the write side
8656 * of an unix pipe in order to "awake" the main thread, and this function
8657 * is called. */
8658 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8659 int mask)
8660 {
8661 char buf[1];
8662 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8663 REDIS_NOTUSED(el);
8664 REDIS_NOTUSED(mask);
8665 REDIS_NOTUSED(privdata);
8666
8667 /* For every byte we read in the read side of the pipe, there is one
8668 * I/O job completed to process. */
8669 while((retval = read(fd,buf,1)) == 1) {
8670 iojob *j;
8671 listNode *ln;
8672 robj *key;
8673 struct dictEntry *de;
8674
8675 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8676
8677 /* Get the processed element (the oldest one) */
8678 lockThreadedIO();
8679 assert(listLength(server.io_processed) != 0);
8680 if (toprocess == -1) {
8681 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8682 if (toprocess <= 0) toprocess = 1;
8683 }
8684 ln = listFirst(server.io_processed);
8685 j = ln->value;
8686 listDelNode(server.io_processed,ln);
8687 unlockThreadedIO();
8688 /* If this job is marked as canceled, just ignore it */
8689 if (j->canceled) {
8690 freeIOJob(j);
8691 continue;
8692 }
8693 /* Post process it in the main thread, as there are things we
8694 * can do just here to avoid race conditions and/or invasive locks */
8695 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8696 de = dictFind(j->db->dict,j->key);
8697 assert(de != NULL);
8698 key = dictGetEntryKey(de);
8699 if (j->type == REDIS_IOJOB_LOAD) {
8700 redisDb *db;
8701
8702 /* Key loaded, bring it at home */
8703 key->storage = REDIS_VM_MEMORY;
8704 key->vm.atime = server.unixtime;
8705 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8706 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8707 (unsigned char*) key->ptr);
8708 server.vm_stats_swapped_objects--;
8709 server.vm_stats_swapins++;
8710 dictGetEntryVal(de) = j->val;
8711 incrRefCount(j->val);
8712 db = j->db;
8713 freeIOJob(j);
8714 /* Handle clients waiting for this key to be loaded. */
8715 handleClientsBlockedOnSwappedKey(db,key);
8716 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8717 /* Now we know the amount of pages required to swap this object.
8718 * Let's find some space for it, and queue this task again
8719 * rebranded as REDIS_IOJOB_DO_SWAP. */
8720 if (!vmCanSwapOut() ||
8721 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8722 {
8723 /* Ooops... no space or we can't swap as there is
8724 * a fork()ed Redis trying to save stuff on disk. */
8725 freeIOJob(j);
8726 key->storage = REDIS_VM_MEMORY; /* undo operation */
8727 } else {
8728 /* Note that we need to mark this pages as used now,
8729 * if the job will be canceled, we'll mark them as freed
8730 * again. */
8731 vmMarkPagesUsed(j->page,j->pages);
8732 j->type = REDIS_IOJOB_DO_SWAP;
8733 lockThreadedIO();
8734 queueIOJob(j);
8735 unlockThreadedIO();
8736 }
8737 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8738 robj *val;
8739
8740 /* Key swapped. We can finally free some memory. */
8741 if (key->storage != REDIS_VM_SWAPPING) {
8742 printf("key->storage: %d\n",key->storage);
8743 printf("key->name: %s\n",(char*)key->ptr);
8744 printf("key->refcount: %d\n",key->refcount);
8745 printf("val: %p\n",(void*)j->val);
8746 printf("val->type: %d\n",j->val->type);
8747 printf("val->ptr: %s\n",(char*)j->val->ptr);
8748 }
8749 redisAssert(key->storage == REDIS_VM_SWAPPING);
8750 val = dictGetEntryVal(de);
8751 key->vm.page = j->page;
8752 key->vm.usedpages = j->pages;
8753 key->storage = REDIS_VM_SWAPPED;
8754 key->vtype = j->val->type;
8755 decrRefCount(val); /* Deallocate the object from memory. */
8756 dictGetEntryVal(de) = NULL;
8757 redisLog(REDIS_DEBUG,
8758 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8759 (unsigned char*) key->ptr,
8760 (unsigned long long) j->page, (unsigned long long) j->pages);
8761 server.vm_stats_swapped_objects++;
8762 server.vm_stats_swapouts++;
8763 freeIOJob(j);
8764 /* Put a few more swap requests in queue if we are still
8765 * out of memory */
8766 if (trytoswap && vmCanSwapOut() &&
8767 zmalloc_used_memory() > server.vm_max_memory)
8768 {
8769 int more = 1;
8770 while(more) {
8771 lockThreadedIO();
8772 more = listLength(server.io_newjobs) <
8773 (unsigned) server.vm_max_threads;
8774 unlockThreadedIO();
8775 /* Don't waste CPU time if swappable objects are rare. */
8776 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8777 trytoswap = 0;
8778 break;
8779 }
8780 }
8781 }
8782 }
8783 processed++;
8784 if (processed == toprocess) return;
8785 }
8786 if (retval < 0 && errno != EAGAIN) {
8787 redisLog(REDIS_WARNING,
8788 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8789 strerror(errno));
8790 }
8791 }
8792
8793 static void lockThreadedIO(void) {
8794 pthread_mutex_lock(&server.io_mutex);
8795 }
8796
8797 static void unlockThreadedIO(void) {
8798 pthread_mutex_unlock(&server.io_mutex);
8799 }
8800
8801 /* Remove the specified object from the threaded I/O queue if still not
8802 * processed, otherwise make sure to flag it as canceled. */
8803 static void vmCancelThreadedIOJob(robj *o) {
8804 list *lists[3] = {
8805 server.io_newjobs, /* 0 */
8806 server.io_processing, /* 1 */
8807 server.io_processed /* 2 */
8808 };
8809 int i;
8810
8811 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8812 again:
8813 lockThreadedIO();
8814 /* Search for a matching key in one of the queues */
8815 for (i = 0; i < 3; i++) {
8816 listNode *ln;
8817 listIter li;
8818
8819 listRewind(lists[i],&li);
8820 while ((ln = listNext(&li)) != NULL) {
8821 iojob *job = ln->value;
8822
8823 if (job->canceled) continue; /* Skip this, already canceled. */
8824 if (job->key == o) {
8825 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8826 (void*)job, (char*)o->ptr, job->type, i);
8827 /* Mark the pages as free since the swap didn't happened
8828 * or happened but is now discarded. */
8829 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8830 vmMarkPagesFree(job->page,job->pages);
8831 /* Cancel the job. It depends on the list the job is
8832 * living in. */
8833 switch(i) {
8834 case 0: /* io_newjobs */
8835 /* If the job was yet not processed the best thing to do
8836 * is to remove it from the queue at all */
8837 freeIOJob(job);
8838 listDelNode(lists[i],ln);
8839 break;
8840 case 1: /* io_processing */
8841 /* Oh Shi- the thread is messing with the Job:
8842 *
8843 * Probably it's accessing the object if this is a
8844 * PREPARE_SWAP or DO_SWAP job.
8845 * If it's a LOAD job it may be reading from disk and
8846 * if we don't wait for the job to terminate before to
8847 * cancel it, maybe in a few microseconds data can be
8848 * corrupted in this pages. So the short story is:
8849 *
8850 * Better to wait for the job to move into the
8851 * next queue (processed)... */
8852
8853 /* We try again and again until the job is completed. */
8854 unlockThreadedIO();
8855 /* But let's wait some time for the I/O thread
8856 * to finish with this job. After all this condition
8857 * should be very rare. */
8858 usleep(1);
8859 goto again;
8860 case 2: /* io_processed */
8861 /* The job was already processed, that's easy...
8862 * just mark it as canceled so that we'll ignore it
8863 * when processing completed jobs. */
8864 job->canceled = 1;
8865 break;
8866 }
8867 /* Finally we have to adjust the storage type of the object
8868 * in order to "UNDO" the operaiton. */
8869 if (o->storage == REDIS_VM_LOADING)
8870 o->storage = REDIS_VM_SWAPPED;
8871 else if (o->storage == REDIS_VM_SWAPPING)
8872 o->storage = REDIS_VM_MEMORY;
8873 unlockThreadedIO();
8874 return;
8875 }
8876 }
8877 }
8878 unlockThreadedIO();
8879 assert(1 != 1); /* We should never reach this */
8880 }
8881
8882 static void *IOThreadEntryPoint(void *arg) {
8883 iojob *j;
8884 listNode *ln;
8885 REDIS_NOTUSED(arg);
8886
8887 pthread_detach(pthread_self());
8888 while(1) {
8889 /* Get a new job to process */
8890 lockThreadedIO();
8891 if (listLength(server.io_newjobs) == 0) {
8892 /* No new jobs in queue, exit. */
8893 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8894 (long) pthread_self());
8895 server.io_active_threads--;
8896 unlockThreadedIO();
8897 return NULL;
8898 }
8899 ln = listFirst(server.io_newjobs);
8900 j = ln->value;
8901 listDelNode(server.io_newjobs,ln);
8902 /* Add the job in the processing queue */
8903 j->thread = pthread_self();
8904 listAddNodeTail(server.io_processing,j);
8905 ln = listLast(server.io_processing); /* We use ln later to remove it */
8906 unlockThreadedIO();
8907 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8908 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8909
8910 /* Process the Job */
8911 if (j->type == REDIS_IOJOB_LOAD) {
8912 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8913 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8914 FILE *fp = fopen("/dev/null","w+");
8915 j->pages = rdbSavedObjectPages(j->val,fp);
8916 fclose(fp);
8917 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8918 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8919 j->canceled = 1;
8920 }
8921
8922 /* Done: insert the job into the processed queue */
8923 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8924 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8925 lockThreadedIO();
8926 listDelNode(server.io_processing,ln);
8927 listAddNodeTail(server.io_processed,j);
8928 unlockThreadedIO();
8929
8930 /* Signal the main thread there is new stuff to process */
8931 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8932 }
8933 return NULL; /* never reached */
8934 }
8935
8936 static void spawnIOThread(void) {
8937 pthread_t thread;
8938 sigset_t mask, omask;
8939 int err;
8940
8941 sigemptyset(&mask);
8942 sigaddset(&mask,SIGCHLD);
8943 sigaddset(&mask,SIGHUP);
8944 sigaddset(&mask,SIGPIPE);
8945 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8946 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8947 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8948 strerror(err));
8949 usleep(1000000);
8950 }
8951 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8952 server.io_active_threads++;
8953 }
8954
8955 /* We need to wait for the last thread to exit before we are able to
8956 * fork() in order to BGSAVE or BGREWRITEAOF. */
8957 static void waitEmptyIOJobsQueue(void) {
8958 while(1) {
8959 int io_processed_len;
8960
8961 lockThreadedIO();
8962 if (listLength(server.io_newjobs) == 0 &&
8963 listLength(server.io_processing) == 0 &&
8964 server.io_active_threads == 0)
8965 {
8966 unlockThreadedIO();
8967 return;
8968 }
8969 /* While waiting for empty jobs queue condition we post-process some
8970 * finshed job, as I/O threads may be hanging trying to write against
8971 * the io_ready_pipe_write FD but there are so much pending jobs that
8972 * it's blocking. */
8973 io_processed_len = listLength(server.io_processed);
8974 unlockThreadedIO();
8975 if (io_processed_len) {
8976 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8977 usleep(1000); /* 1 millisecond */
8978 } else {
8979 usleep(10000); /* 10 milliseconds */
8980 }
8981 }
8982 }
8983
8984 static void vmReopenSwapFile(void) {
8985 /* Note: we don't close the old one as we are in the child process
8986 * and don't want to mess at all with the original file object. */
8987 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8988 if (server.vm_fp == NULL) {
8989 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8990 server.vm_swap_file);
8991 _exit(1);
8992 }
8993 server.vm_fd = fileno(server.vm_fp);
8994 }
8995
8996 /* This function must be called while with threaded IO locked */
8997 static void queueIOJob(iojob *j) {
8998 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8999 (void*)j, j->type, (char*)j->key->ptr);
9000 listAddNodeTail(server.io_newjobs,j);
9001 if (server.io_active_threads < server.vm_max_threads)
9002 spawnIOThread();
9003 }
9004
9005 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9006 iojob *j;
9007
9008 assert(key->storage == REDIS_VM_MEMORY);
9009 assert(key->refcount == 1);
9010
9011 j = zmalloc(sizeof(*j));
9012 j->type = REDIS_IOJOB_PREPARE_SWAP;
9013 j->db = db;
9014 j->key = key;
9015 j->val = val;
9016 incrRefCount(val);
9017 j->canceled = 0;
9018 j->thread = (pthread_t) -1;
9019 key->storage = REDIS_VM_SWAPPING;
9020
9021 lockThreadedIO();
9022 queueIOJob(j);
9023 unlockThreadedIO();
9024 return REDIS_OK;
9025 }
9026
9027 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9028
9029 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9030 * If there is not already a job loading the key, it is craeted.
9031 * The key is added to the io_keys list in the client structure, and also
9032 * in the hash table mapping swapped keys to waiting clients, that is,
9033 * server.io_waited_keys. */
9034 static int waitForSwappedKey(redisClient *c, robj *key) {
9035 struct dictEntry *de;
9036 robj *o;
9037 list *l;
9038
9039 /* If the key does not exist or is already in RAM we don't need to
9040 * block the client at all. */
9041 de = dictFind(c->db->dict,key);
9042 if (de == NULL) return 0;
9043 o = dictGetEntryKey(de);
9044 if (o->storage == REDIS_VM_MEMORY) {
9045 return 0;
9046 } else if (o->storage == REDIS_VM_SWAPPING) {
9047 /* We were swapping the key, undo it! */
9048 vmCancelThreadedIOJob(o);
9049 return 0;
9050 }
9051
9052 /* OK: the key is either swapped, or being loaded just now. */
9053
9054 /* Add the key to the list of keys this client is waiting for.
9055 * This maps clients to keys they are waiting for. */
9056 listAddNodeTail(c->io_keys,key);
9057 incrRefCount(key);
9058
9059 /* Add the client to the swapped keys => clients waiting map. */
9060 de = dictFind(c->db->io_keys,key);
9061 if (de == NULL) {
9062 int retval;
9063
9064 /* For every key we take a list of clients blocked for it */
9065 l = listCreate();
9066 retval = dictAdd(c->db->io_keys,key,l);
9067 incrRefCount(key);
9068 assert(retval == DICT_OK);
9069 } else {
9070 l = dictGetEntryVal(de);
9071 }
9072 listAddNodeTail(l,c);
9073
9074 /* Are we already loading the key from disk? If not create a job */
9075 if (o->storage == REDIS_VM_SWAPPED) {
9076 iojob *j;
9077
9078 o->storage = REDIS_VM_LOADING;
9079 j = zmalloc(sizeof(*j));
9080 j->type = REDIS_IOJOB_LOAD;
9081 j->db = c->db;
9082 j->key = o;
9083 j->key->vtype = o->vtype;
9084 j->page = o->vm.page;
9085 j->val = NULL;
9086 j->canceled = 0;
9087 j->thread = (pthread_t) -1;
9088 lockThreadedIO();
9089 queueIOJob(j);
9090 unlockThreadedIO();
9091 }
9092 return 1;
9093 }
9094
9095 /* Preload keys needed for the ZUNION and ZINTER commands. */
9096 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9097 int i, num;
9098 num = atoi(c->argv[2]->ptr);
9099 for (i = 0; i < num; i++) {
9100 waitForSwappedKey(c,c->argv[3+i]);
9101 }
9102 }
9103
9104 /* Is this client attempting to run a command against swapped keys?
9105 * If so, block it ASAP, load the keys in background, then resume it.
9106 *
9107 * The important idea about this function is that it can fail! If keys will
9108 * still be swapped when the client is resumed, this key lookups will
9109 * just block loading keys from disk. In practical terms this should only
9110 * happen with SORT BY command or if there is a bug in this function.
9111 *
9112 * Return 1 if the client is marked as blocked, 0 if the client can
9113 * continue as the keys it is going to access appear to be in memory. */
9114 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9115 int j, last;
9116
9117 if (cmd->vm_preload_proc != NULL) {
9118 cmd->vm_preload_proc(c);
9119 } else {
9120 if (cmd->vm_firstkey == 0) return 0;
9121 last = cmd->vm_lastkey;
9122 if (last < 0) last = c->argc+last;
9123 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9124 waitForSwappedKey(c,c->argv[j]);
9125 }
9126
9127 /* If the client was blocked for at least one key, mark it as blocked. */
9128 if (listLength(c->io_keys)) {
9129 c->flags |= REDIS_IO_WAIT;
9130 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9131 server.vm_blocked_clients++;
9132 return 1;
9133 } else {
9134 return 0;
9135 }
9136 }
9137
9138 /* Remove the 'key' from the list of blocked keys for a given client.
9139 *
9140 * The function returns 1 when there are no longer blocking keys after
9141 * the current one was removed (and the client can be unblocked). */
9142 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9143 list *l;
9144 listNode *ln;
9145 listIter li;
9146 struct dictEntry *de;
9147
9148 /* Remove the key from the list of keys this client is waiting for. */
9149 listRewind(c->io_keys,&li);
9150 while ((ln = listNext(&li)) != NULL) {
9151 if (compareStringObjects(ln->value,key) == 0) {
9152 listDelNode(c->io_keys,ln);
9153 break;
9154 }
9155 }
9156 assert(ln != NULL);
9157
9158 /* Remove the client form the key => waiting clients map. */
9159 de = dictFind(c->db->io_keys,key);
9160 assert(de != NULL);
9161 l = dictGetEntryVal(de);
9162 ln = listSearchKey(l,c);
9163 assert(ln != NULL);
9164 listDelNode(l,ln);
9165 if (listLength(l) == 0)
9166 dictDelete(c->db->io_keys,key);
9167
9168 return listLength(c->io_keys) == 0;
9169 }
9170
9171 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9172 struct dictEntry *de;
9173 list *l;
9174 listNode *ln;
9175 int len;
9176
9177 de = dictFind(db->io_keys,key);
9178 if (!de) return;
9179
9180 l = dictGetEntryVal(de);
9181 len = listLength(l);
9182 /* Note: we can't use something like while(listLength(l)) as the list
9183 * can be freed by the calling function when we remove the last element. */
9184 while (len--) {
9185 ln = listFirst(l);
9186 redisClient *c = ln->value;
9187
9188 if (dontWaitForSwappedKey(c,key)) {
9189 /* Put the client in the list of clients ready to go as we
9190 * loaded all the keys about it. */
9191 listAddNodeTail(server.io_ready_clients,c);
9192 }
9193 }
9194 }
9195
9196 /* =========================== Remote Configuration ========================= */
9197
9198 static void configSetCommand(redisClient *c) {
9199 robj *o = getDecodedObject(c->argv[3]);
9200 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9201 zfree(server.dbfilename);
9202 server.dbfilename = zstrdup(o->ptr);
9203 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9204 zfree(server.requirepass);
9205 server.requirepass = zstrdup(o->ptr);
9206 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9207 zfree(server.masterauth);
9208 server.masterauth = zstrdup(o->ptr);
9209 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9210 server.maxmemory = strtoll(o->ptr, NULL, 10);
9211 } else {
9212 addReplySds(c,sdscatprintf(sdsempty(),
9213 "-ERR not supported CONFIG parameter %s\r\n",
9214 (char*)c->argv[2]->ptr));
9215 decrRefCount(o);
9216 return;
9217 }
9218 decrRefCount(o);
9219 addReply(c,shared.ok);
9220 }
9221
9222 static void configGetCommand(redisClient *c) {
9223 robj *o = getDecodedObject(c->argv[2]);
9224 robj *lenobj = createObject(REDIS_STRING,NULL);
9225 char *pattern = o->ptr;
9226 int matches = 0;
9227
9228 addReply(c,lenobj);
9229 decrRefCount(lenobj);
9230
9231 if (stringmatch(pattern,"dbfilename",0)) {
9232 addReplyBulkCString(c,"dbfilename");
9233 addReplyBulkCString(c,server.dbfilename);
9234 matches++;
9235 }
9236 if (stringmatch(pattern,"requirepass",0)) {
9237 addReplyBulkCString(c,"requirepass");
9238 addReplyBulkCString(c,server.requirepass);
9239 matches++;
9240 }
9241 if (stringmatch(pattern,"masterauth",0)) {
9242 addReplyBulkCString(c,"masterauth");
9243 addReplyBulkCString(c,server.masterauth);
9244 matches++;
9245 }
9246 if (stringmatch(pattern,"maxmemory",0)) {
9247 char buf[128];
9248
9249 snprintf(buf,128,"%llu\n",server.maxmemory);
9250 addReplyBulkCString(c,"maxmemory");
9251 addReplyBulkCString(c,buf);
9252 matches++;
9253 }
9254 decrRefCount(o);
9255 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9256 }
9257
9258 static void configCommand(redisClient *c) {
9259 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9260 if (c->argc != 4) goto badarity;
9261 configSetCommand(c);
9262 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9263 if (c->argc != 3) goto badarity;
9264 configGetCommand(c);
9265 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9266 if (c->argc != 2) goto badarity;
9267 server.stat_numcommands = 0;
9268 server.stat_numconnections = 0;
9269 server.stat_expiredkeys = 0;
9270 server.stat_starttime = time(NULL);
9271 addReply(c,shared.ok);
9272 } else {
9273 addReplySds(c,sdscatprintf(sdsempty(),
9274 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9275 }
9276 return;
9277
9278 badarity:
9279 addReplySds(c,sdscatprintf(sdsempty(),
9280 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9281 (char*) c->argv[1]->ptr));
9282 }
9283
9284 /* =========================== Pubsub implementation ======================== */
9285
9286 static void freePubsubPattern(void *p) {
9287 pubsubPattern *pat = p;
9288
9289 decrRefCount(pat->pattern);
9290 zfree(pat);
9291 }
9292
9293 static int listMatchPubsubPattern(void *a, void *b) {
9294 pubsubPattern *pa = a, *pb = b;
9295
9296 return (pa->client == pb->client) &&
9297 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9298 }
9299
9300 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9301 * 0 if the client was already subscribed to that channel. */
9302 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9303 struct dictEntry *de;
9304 list *clients = NULL;
9305 int retval = 0;
9306
9307 /* Add the channel to the client -> channels hash table */
9308 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9309 retval = 1;
9310 incrRefCount(channel);
9311 /* Add the client to the channel -> list of clients hash table */
9312 de = dictFind(server.pubsub_channels,channel);
9313 if (de == NULL) {
9314 clients = listCreate();
9315 dictAdd(server.pubsub_channels,channel,clients);
9316 incrRefCount(channel);
9317 } else {
9318 clients = dictGetEntryVal(de);
9319 }
9320 listAddNodeTail(clients,c);
9321 }
9322 /* Notify the client */
9323 addReply(c,shared.mbulk3);
9324 addReply(c,shared.subscribebulk);
9325 addReplyBulk(c,channel);
9326 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9327 return retval;
9328 }
9329
9330 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9331 * 0 if the client was not subscribed to the specified channel. */
9332 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9333 struct dictEntry *de;
9334 list *clients;
9335 listNode *ln;
9336 int retval = 0;
9337
9338 /* Remove the channel from the client -> channels hash table */
9339 incrRefCount(channel); /* channel may be just a pointer to the same object
9340 we have in the hash tables. Protect it... */
9341 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9342 retval = 1;
9343 /* Remove the client from the channel -> clients list hash table */
9344 de = dictFind(server.pubsub_channels,channel);
9345 assert(de != NULL);
9346 clients = dictGetEntryVal(de);
9347 ln = listSearchKey(clients,c);
9348 assert(ln != NULL);
9349 listDelNode(clients,ln);
9350 if (listLength(clients) == 0) {
9351 /* Free the list and associated hash entry at all if this was
9352 * the latest client, so that it will be possible to abuse
9353 * Redis PUBSUB creating millions of channels. */
9354 dictDelete(server.pubsub_channels,channel);
9355 }
9356 }
9357 /* Notify the client */
9358 if (notify) {
9359 addReply(c,shared.mbulk3);
9360 addReply(c,shared.unsubscribebulk);
9361 addReplyBulk(c,channel);
9362 addReplyLong(c,dictSize(c->pubsub_channels)+
9363 listLength(c->pubsub_patterns));
9364
9365 }
9366 decrRefCount(channel); /* it is finally safe to release it */
9367 return retval;
9368 }
9369
9370 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9371 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9372 int retval = 0;
9373
9374 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9375 retval = 1;
9376 pubsubPattern *pat;
9377 listAddNodeTail(c->pubsub_patterns,pattern);
9378 incrRefCount(pattern);
9379 pat = zmalloc(sizeof(*pat));
9380 pat->pattern = getDecodedObject(pattern);
9381 pat->client = c;
9382 listAddNodeTail(server.pubsub_patterns,pat);
9383 }
9384 /* Notify the client */
9385 addReply(c,shared.mbulk3);
9386 addReply(c,shared.psubscribebulk);
9387 addReplyBulk(c,pattern);
9388 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9389 return retval;
9390 }
9391
9392 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9393 * 0 if the client was not subscribed to the specified channel. */
9394 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9395 listNode *ln;
9396 pubsubPattern pat;
9397 int retval = 0;
9398
9399 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9400 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9401 retval = 1;
9402 listDelNode(c->pubsub_patterns,ln);
9403 pat.client = c;
9404 pat.pattern = pattern;
9405 ln = listSearchKey(server.pubsub_patterns,&pat);
9406 listDelNode(server.pubsub_patterns,ln);
9407 }
9408 /* Notify the client */
9409 if (notify) {
9410 addReply(c,shared.mbulk3);
9411 addReply(c,shared.punsubscribebulk);
9412 addReplyBulk(c,pattern);
9413 addReplyLong(c,dictSize(c->pubsub_channels)+
9414 listLength(c->pubsub_patterns));
9415 }
9416 decrRefCount(pattern);
9417 return retval;
9418 }
9419
9420 /* Unsubscribe from all the channels. Return the number of channels the
9421 * client was subscribed from. */
9422 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9423 dictIterator *di = dictGetIterator(c->pubsub_channels);
9424 dictEntry *de;
9425 int count = 0;
9426
9427 while((de = dictNext(di)) != NULL) {
9428 robj *channel = dictGetEntryKey(de);
9429
9430 count += pubsubUnsubscribeChannel(c,channel,notify);
9431 }
9432 dictReleaseIterator(di);
9433 return count;
9434 }
9435
9436 /* Unsubscribe from all the patterns. Return the number of patterns the
9437 * client was subscribed from. */
9438 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9439 listNode *ln;
9440 listIter li;
9441 int count = 0;
9442
9443 listRewind(c->pubsub_patterns,&li);
9444 while ((ln = listNext(&li)) != NULL) {
9445 robj *pattern = ln->value;
9446
9447 count += pubsubUnsubscribePattern(c,pattern,notify);
9448 }
9449 return count;
9450 }
9451
9452 /* Publish a message */
9453 static int pubsubPublishMessage(robj *channel, robj *message) {
9454 int receivers = 0;
9455 struct dictEntry *de;
9456 listNode *ln;
9457 listIter li;
9458
9459 /* Send to clients listening for that channel */
9460 de = dictFind(server.pubsub_channels,channel);
9461 if (de) {
9462 list *list = dictGetEntryVal(de);
9463 listNode *ln;
9464 listIter li;
9465
9466 listRewind(list,&li);
9467 while ((ln = listNext(&li)) != NULL) {
9468 redisClient *c = ln->value;
9469
9470 addReply(c,shared.mbulk3);
9471 addReply(c,shared.messagebulk);
9472 addReplyBulk(c,channel);
9473 addReplyBulk(c,message);
9474 receivers++;
9475 }
9476 }
9477 /* Send to clients listening to matching channels */
9478 if (listLength(server.pubsub_patterns)) {
9479 listRewind(server.pubsub_patterns,&li);
9480 channel = getDecodedObject(channel);
9481 while ((ln = listNext(&li)) != NULL) {
9482 pubsubPattern *pat = ln->value;
9483
9484 if (stringmatchlen((char*)pat->pattern->ptr,
9485 sdslen(pat->pattern->ptr),
9486 (char*)channel->ptr,
9487 sdslen(channel->ptr),0)) {
9488 addReply(pat->client,shared.mbulk3);
9489 addReply(pat->client,shared.messagebulk);
9490 addReplyBulk(pat->client,channel);
9491 addReplyBulk(pat->client,message);
9492 receivers++;
9493 }
9494 }
9495 decrRefCount(channel);
9496 }
9497 return receivers;
9498 }
9499
9500 static void subscribeCommand(redisClient *c) {
9501 int j;
9502
9503 for (j = 1; j < c->argc; j++)
9504 pubsubSubscribeChannel(c,c->argv[j]);
9505 }
9506
9507 static void unsubscribeCommand(redisClient *c) {
9508 if (c->argc == 1) {
9509 pubsubUnsubscribeAllChannels(c,1);
9510 return;
9511 } else {
9512 int j;
9513
9514 for (j = 1; j < c->argc; j++)
9515 pubsubUnsubscribeChannel(c,c->argv[j],1);
9516 }
9517 }
9518
9519 static void psubscribeCommand(redisClient *c) {
9520 int j;
9521
9522 for (j = 1; j < c->argc; j++)
9523 pubsubSubscribePattern(c,c->argv[j]);
9524 }
9525
9526 static void punsubscribeCommand(redisClient *c) {
9527 if (c->argc == 1) {
9528 pubsubUnsubscribeAllPatterns(c,1);
9529 return;
9530 } else {
9531 int j;
9532
9533 for (j = 1; j < c->argc; j++)
9534 pubsubUnsubscribePattern(c,c->argv[j],1);
9535 }
9536 }
9537
9538 static void publishCommand(redisClient *c) {
9539 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9540 addReplyLong(c,receivers);
9541 }
9542
9543 /* ================================= Debugging ============================== */
9544
9545 static void debugCommand(redisClient *c) {
9546 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9547 *((char*)-1) = 'x';
9548 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9549 if (rdbSave(server.dbfilename) != REDIS_OK) {
9550 addReply(c,shared.err);
9551 return;
9552 }
9553 emptyDb();
9554 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9555 addReply(c,shared.err);
9556 return;
9557 }
9558 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9559 addReply(c,shared.ok);
9560 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9561 emptyDb();
9562 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9563 addReply(c,shared.err);
9564 return;
9565 }
9566 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9567 addReply(c,shared.ok);
9568 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9569 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9570 robj *key, *val;
9571
9572 if (!de) {
9573 addReply(c,shared.nokeyerr);
9574 return;
9575 }
9576 key = dictGetEntryKey(de);
9577 val = dictGetEntryVal(de);
9578 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9579 key->storage == REDIS_VM_SWAPPING)) {
9580 char *strenc;
9581 char buf[128];
9582
9583 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9584 strenc = strencoding[val->encoding];
9585 } else {
9586 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9587 strenc = buf;
9588 }
9589 addReplySds(c,sdscatprintf(sdsempty(),
9590 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9591 "encoding:%s serializedlength:%lld\r\n",
9592 (void*)key, key->refcount, (void*)val, val->refcount,
9593 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9594 } else {
9595 addReplySds(c,sdscatprintf(sdsempty(),
9596 "+Key at:%p refcount:%d, value swapped at: page %llu "
9597 "using %llu pages\r\n",
9598 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9599 (unsigned long long) key->vm.usedpages));
9600 }
9601 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9602 lookupKeyRead(c->db,c->argv[2]);
9603 addReply(c,shared.ok);
9604 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9605 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9606 robj *key, *val;
9607
9608 if (!server.vm_enabled) {
9609 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9610 return;
9611 }
9612 if (!de) {
9613 addReply(c,shared.nokeyerr);
9614 return;
9615 }
9616 key = dictGetEntryKey(de);
9617 val = dictGetEntryVal(de);
9618 /* If the key is shared we want to create a copy */
9619 if (key->refcount > 1) {
9620 robj *newkey = dupStringObject(key);
9621 decrRefCount(key);
9622 key = dictGetEntryKey(de) = newkey;
9623 }
9624 /* Swap it */
9625 if (key->storage != REDIS_VM_MEMORY) {
9626 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9627 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9628 dictGetEntryVal(de) = NULL;
9629 addReply(c,shared.ok);
9630 } else {
9631 addReply(c,shared.err);
9632 }
9633 } else {
9634 addReplySds(c,sdsnew(
9635 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9636 }
9637 }
9638
9639 static void _redisAssert(char *estr, char *file, int line) {
9640 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9641 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9642 #ifdef HAVE_BACKTRACE
9643 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9644 *((char*)-1) = 'x';
9645 #endif
9646 }
9647
9648 /* =================================== Main! ================================ */
9649
9650 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 when the file cannot be opened or
 * nothing can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;
    if (fgets(line,sizeof(line),procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
9664
9665 void linuxOvercommitMemoryWarning(void) {
9666 if (linuxOvercommitMemoryValue() == 0) {
9667 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9668 }
9669 }
9670 #endif /* __linux__ */
9671
9672 static void daemonize(void) {
9673 int fd;
9674 FILE *fp;
9675
9676 if (fork() != 0) exit(0); /* parent exits */
9677 setsid(); /* create a new session */
9678
9679 /* Every output goes to /dev/null. If Redis is daemonized but
9680 * the 'logfile' is set to 'stdout' in the configuration file
9681 * it will not log at all. */
9682 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9683 dup2(fd, STDIN_FILENO);
9684 dup2(fd, STDOUT_FILENO);
9685 dup2(fd, STDERR_FILENO);
9686 if (fd > STDERR_FILENO) close(fd);
9687 }
9688 /* Try to write the pid file */
9689 fp = fopen(server.pidfile,"w");
9690 if (fp) {
9691 fprintf(fp,"%d\n",getpid());
9692 fclose(fp);
9693 }
9694 }
9695
9696 static void version() {
9697 printf("Redis server version %s\n", REDIS_VERSION);
9698 exit(0);
9699 }
9700
/* Print the command line synopsis on stderr and terminate with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9706
9707 int main(int argc, char **argv) {
9708 time_t start;
9709
9710 initServerConfig();
9711 if (argc == 2) {
9712 if (strcmp(argv[1], "-v") == 0 ||
9713 strcmp(argv[1], "--version") == 0) version();
9714 if (strcmp(argv[1], "--help") == 0) usage();
9715 resetServerSaveParams();
9716 loadServerConfig(argv[1]);
9717 } else if ((argc > 2)) {
9718 usage();
9719 } else {
9720 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9721 }
9722 if (server.daemonize) daemonize();
9723 initServer();
9724 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9725 #ifdef __linux__
9726 linuxOvercommitMemoryWarning();
9727 #endif
9728 start = time(NULL);
9729 if (server.appendonly) {
9730 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9731 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9732 } else {
9733 if (rdbLoad(server.dbfilename) == REDIS_OK)
9734 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9735 }
9736 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9737 aeSetBeforeSleepProc(server.el,beforeSleep);
9738 aeMain(server.el);
9739 aeDeleteEventLoop(server.el);
9740 return 0;
9741 }
9742
9743 /* ============================= Backtrace support ========================= */
9744
9745 #ifdef HAVE_BACKTRACE
9746 static char *findFuncName(void *pointer, unsigned long *offset);
9747
9748 static void *getMcontextEip(ucontext_t *uc) {
9749 #if defined(__FreeBSD__)
9750 return (void*) uc->uc_mcontext.mc_eip;
9751 #elif defined(__dietlibc__)
9752 return (void*) uc->uc_mcontext.eip;
9753 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9754 #if __x86_64__
9755 return (void*) uc->uc_mcontext->__ss.__rip;
9756 #else
9757 return (void*) uc->uc_mcontext->__ss.__eip;
9758 #endif
9759 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9760 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9761 return (void*) uc->uc_mcontext->__ss.__rip;
9762 #else
9763 return (void*) uc->uc_mcontext->__ss.__eip;
9764 #endif
9765 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9766 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9767 #elif defined(__ia64__) /* Linux IA64 */
9768 return (void*) uc->uc_mcontext.sc_ip;
9769 #else
9770 return NULL;
9771 #endif
9772 }
9773
9774 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9775 void *trace[100];
9776 char **messages = NULL;
9777 int i, trace_size = 0;
9778 unsigned long offset=0;
9779 ucontext_t *uc = (ucontext_t*) secret;
9780 sds infostring;
9781 REDIS_NOTUSED(info);
9782
9783 redisLog(REDIS_WARNING,
9784 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9785 infostring = genRedisInfoString();
9786 redisLog(REDIS_WARNING, "%s",infostring);
9787 /* It's not safe to sdsfree() the returned string under memory
9788 * corruption conditions. Let it leak as we are going to abort */
9789
9790 trace_size = backtrace(trace, 100);
9791 /* overwrite sigaction with caller's address */
9792 if (getMcontextEip(uc) != NULL) {
9793 trace[1] = getMcontextEip(uc);
9794 }
9795 messages = backtrace_symbols(trace, trace_size);
9796
9797 for (i=1; i<trace_size; ++i) {
9798 char *fn = findFuncName(trace[i], &offset), *p;
9799
9800 p = strchr(messages[i],'+');
9801 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9802 redisLog(REDIS_WARNING,"%s", messages[i]);
9803 } else {
9804 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9805 }
9806 }
9807 /* free(messages); Don't call free() with possibly corrupted memory. */
9808 _exit(0);
9809 }
9810
9811 static void setupSigSegvAction(void) {
9812 struct sigaction act;
9813
9814 sigemptyset (&act.sa_mask);
9815 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9816 * is used. Otherwise, sa_handler is used */
9817 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9818 act.sa_sigaction = segvHandler;
9819 sigaction (SIGSEGV, &act, NULL);
9820 sigaction (SIGBUS, &act, NULL);
9821 sigaction (SIGFPE, &act, NULL);
9822 sigaction (SIGILL, &act, NULL);
9823 sigaction (SIGBUS, &act, NULL);
9824 return;
9825 }
9826
9827 #include "staticsymbols.h"
9828 /* This function try to convert a pointer into a function name. It's used in
9829 * oreder to provide a backtrace under segmentation fault that's able to
9830 * display functions declared as static (otherwise the backtrace is useless). */
9831 static char *findFuncName(void *pointer, unsigned long *offset){
9832 int i, ret = -1;
9833 unsigned long off, minoff = 0;
9834
9835 /* Try to match against the Symbol with the smallest offset */
9836 for (i=0; symsTable[i].pointer; i++) {
9837 unsigned long lp = (unsigned long) pointer;
9838
9839 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9840 off=lp-symsTable[i].pointer;
9841 if (ret < 0 || off < minoff) {
9842 minoff=off;
9843 ret=i;
9844 }
9845 }
9846 }
9847 if (ret == -1) return NULL;
9848 *offset = minoff;
9849 return symsTable[ret].name;
9850 }
9851 #else /* HAVE_BACKTRACE */
/* Built when HAVE_BACKTRACE is not defined: no handler is installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
9854 #endif /* HAVE_BACKTRACE */
9855
9856
9857
9858 /* The End */
9859
9860
9861