]> git.saurik.com Git - redis.git/blob - redis.c
55e188010eda9d7f9cbd35e94392da3956d05e28
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Redis version string. */
#define REDIS_VERSION   "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
/* Generic status codes */
#define REDIS_OK    0
#define REDIS_ERR   -1
83
/* Static server configuration */
#define REDIS_SERVERPORT                6379        /* TCP port */
#define REDIS_MAXIDLETIME               (60*5)      /* default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               8
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000     /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME             60          /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10          /* try to expire 10 keys/loop */
#define REDIS_MAX_WRITE_PER_EVENT       (1024*64)
#define REDIS_REQUEST_MAX_SIZE          (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
#define REDIS_WRITEV_THRESHOLD          3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT        256

/* Hash table parameters */
#define REDIS_HT_MINFILL                10          /* Minimal hash table fill 10% */
105
/* Command flags (bit values, combined with | in the command table) */
#define REDIS_CMD_BULK              1   /* Bulk write command */
#define REDIS_CMD_INLINE            2   /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied under low memory conditions. */
#define REDIS_CMD_DENYOOM           4
#define REDIS_CMD_FORCE_REPLICATION 8   /* Force replication even if dirty is 0 */
115
/* Object types */
#define REDIS_STRING    0
#define REDIS_LIST      1
#define REDIS_SET       2
#define REDIS_ZSET      3
#define REDIS_HASH      4
122
/* Object encodings. Some kinds of objects, such as Strings and Hashes, can be
 * internally represented in more than one way; the 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* Encoded as a hash table */

/* Printable encoding names, listed in the same order as the
 * REDIS_ENCODING_* values above. */
static char* strencoding[] = {
    "raw",
    "int",
    "zipmap",
    "hashtable"
};
134
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME    253
#define REDIS_SELECTDB      254
#define REDIS_EOF           255
139
/* Defines related to the dump file format. Storing a 32 bit length for short
 * keys would waste a lot of space, so the two most significant bits of the
 * first byte tell how the length is encoded:
 *
 * 00|000000 => the length is the remaining 6 bits of this byte
 * 01|000000 00000000 => the length is 14 bits: the 6 bits of this byte plus
 *                       the 8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit length follows
 * 11|000000 => a specially encoded object follows. The six bits number
 *              specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte: most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN   0
#define REDIS_RDB_14BITLEN  1
#define REDIS_RDB_32BITLEN  2
#define REDIS_RDB_ENCVAL    3
#define REDIS_RDB_LENERR    UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8  0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF   3   /* string compressed with FASTLZ */
166
/* Virtual memory: where the value of an object currently is
 * (values of the object 'storage' field). */
#define REDIS_VM_MEMORY     0   /* The object is on memory */
#define REDIS_VM_SWAPPED    1   /* The object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
/* Client flags (bit values) */
#define REDIS_SLAVE     1   /* This client is a slave server */
#define REDIS_MASTER    2   /* This client is a master server */
#define REDIS_MONITOR   4   /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI     8   /* This client is in a MULTI context */
#define REDIS_BLOCKED   16  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT   32  /* The client is waiting for Virtual Memory I/O */
192
/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */
206
/* List related stuff */
#define REDIS_HEAD  0
#define REDIS_TAIL  1

/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* Log levels */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3
222
/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused variable or parameter counts as used. The argument is
 * parenthesized so that the cast applies to the whole expression
 * (e.g. REDIS_NOTUSED(a + b)) rather than only to its first operand. */
#define REDIS_NOTUSED(V) ((void) (V))
225
/* Skiplist parameters */
#define ZSKIPLIST_MAXLEVEL  32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P         0.25    /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
237
/* Custom assert (defined this way, per the original note, because we can
 * print the stack trace): when the expression _e is false, it is reported
 * through _redisAssert() together with the source file and line, and the
 * process is then terminated with _exit(1). */
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
static void _redisAssert(char *estr, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure: swap file location and access time of an object.
 *
 * NOTE(review): the trailing declarator also defines a global variable named
 * 'vm' of this type. It looks unintentional, but it is kept because its
 * users (if any) are not visible from this part of the file - confirm before
 * removing it. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type;
257 unsigned char encoding;
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 * _var is the object itself (not a pointer to it), _ptr the value stored in
 * its 'ptr' field. The 'storage' field is set only when VM is enabled; the
 * 'vtype' and 'vm' fields are never touched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement and is safe in
 * an unbraced if/else. Both arguments are parenthesized so that expressions
 * such as *objptr can be passed as _var. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* One entry of the server 'saveparams' array: a number of seconds paired
 * with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c);
452 struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* A name paired with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
495 typedef struct zskiplist {
496 struct zskiplistNode *header, *tail;
497 unsigned long length;
498 int level;
499 } zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects */
507
508 #define REDIS_SHARED_INTEGERS 10000
509 struct sharedObjectsStruct {
510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
511 *colon, *nullbulk, *nullmultibulk, *queued,
512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
514 *select0, *select1, *select2, *select3, *select4,
515 *select5, *select6, *select7, *select8, *select9,
516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
518 } shared;
519
/* Global vars that are actually used as constants. The following double
 * values are used for the on-disk serialization of doubles, and are
 * initialized at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
525
526 /* VM threaded I/O request message */
527 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
530 typedef struct iojob {
531 int type; /* Request type, REDIS_IOJOB_* */
532 redisDb *db;/* Redis database */
533 robj *key; /* This I/O request is about swapping this key */
534 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
535 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540 } iojob;
541
542 /*================================ Prototypes =============================== */
543
544 static void freeStringObject(robj *o);
545 static void freeListObject(robj *o);
546 static void freeSetObject(robj *o);
547 static void decrRefCount(void *o);
548 static robj *createObject(int type, void *ptr);
549 static void freeClient(redisClient *c);
550 static int rdbLoad(char *filename);
551 static void addReply(redisClient *c, robj *obj);
552 static void addReplySds(redisClient *c, sds s);
553 static void incrRefCount(robj *o);
554 static int rdbSaveBackground(char *filename);
555 static robj *createStringObject(char *ptr, size_t len);
556 static robj *dupStringObject(robj *o);
557 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
558 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
559 static int syncWithMaster(void);
560 static robj *tryObjectEncoding(robj *o);
561 static robj *getDecodedObject(robj *o);
562 static int removeExpire(redisDb *db, robj *key);
563 static int expireIfNeeded(redisDb *db, robj *key);
564 static int deleteIfVolatile(redisDb *db, robj *key);
565 static int deleteIfSwapped(redisDb *db, robj *key);
566 static int deleteKey(redisDb *db, robj *key);
567 static time_t getExpire(redisDb *db, robj *key);
568 static int setExpire(redisDb *db, robj *key, time_t when);
569 static void updateSlavesWaitingBgsave(int bgsaveerr);
570 static void freeMemoryIfNeeded(void);
571 static int processCommand(redisClient *c);
572 static void setupSigSegvAction(void);
573 static void rdbRemoveTempFile(pid_t childpid);
574 static void aofRemoveTempFile(pid_t childpid);
575 static size_t stringObjectLen(robj *o);
576 static void processInputBuffer(redisClient *c);
577 static zskiplist *zslCreate(void);
578 static void zslFree(zskiplist *zsl);
579 static void zslInsert(zskiplist *zsl, double score, robj *obj);
580 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
581 static void initClientMultiState(redisClient *c);
582 static void freeClientMultiState(redisClient *c);
583 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
584 static void unblockClientWaitingData(redisClient *c);
585 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
586 static void vmInit(void);
587 static void vmMarkPagesFree(off_t page, off_t count);
588 static robj *vmLoadObject(robj *key);
589 static robj *vmPreviewObject(robj *key);
590 static int vmSwapOneObjectBlocking(void);
591 static int vmSwapOneObjectThreaded(void);
592 static int vmCanSwapOut(void);
593 static int tryFreeOneObjectFromFreelist(void);
594 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596 static void vmCancelThreadedIOJob(robj *o);
597 static void lockThreadedIO(void);
598 static void unlockThreadedIO(void);
599 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600 static void freeIOJob(iojob *j);
601 static void queueIOJob(iojob *j);
602 static int vmWriteObjectOnSwap(robj *o, off_t page);
603 static robj *vmReadObjectFromSwap(off_t page, int type);
604 static void waitEmptyIOJobsQueue(void);
605 static void vmReopenSwapFile(void);
606 static int vmFreePage(off_t page);
607 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
608 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609 static int dontWaitForSwappedKey(redisClient *c, robj *key);
610 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612 static struct redisCommand *lookupCommand(char *name);
613 static void call(redisClient *c, struct redisCommand *cmd);
614 static void resetClient(redisClient *c);
615 static void convertToRealHash(robj *o);
616 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618 static void freePubsubPattern(void *p);
619 static int listMatchPubsubPattern(void *a, void *b);
620 static int compareStringObjects(robj *a, robj *b);
621 static void usage();
622
623 static void authCommand(redisClient *c);
624 static void pingCommand(redisClient *c);
625 static void echoCommand(redisClient *c);
626 static void setCommand(redisClient *c);
627 static void setnxCommand(redisClient *c);
628 static void getCommand(redisClient *c);
629 static void delCommand(redisClient *c);
630 static void existsCommand(redisClient *c);
631 static void incrCommand(redisClient *c);
632 static void decrCommand(redisClient *c);
633 static void incrbyCommand(redisClient *c);
634 static void decrbyCommand(redisClient *c);
635 static void selectCommand(redisClient *c);
636 static void randomkeyCommand(redisClient *c);
637 static void keysCommand(redisClient *c);
638 static void dbsizeCommand(redisClient *c);
639 static void lastsaveCommand(redisClient *c);
640 static void saveCommand(redisClient *c);
641 static void bgsaveCommand(redisClient *c);
642 static void bgrewriteaofCommand(redisClient *c);
643 static void shutdownCommand(redisClient *c);
644 static void moveCommand(redisClient *c);
645 static void renameCommand(redisClient *c);
646 static void renamenxCommand(redisClient *c);
647 static void lpushCommand(redisClient *c);
648 static void rpushCommand(redisClient *c);
649 static void lpopCommand(redisClient *c);
650 static void rpopCommand(redisClient *c);
651 static void llenCommand(redisClient *c);
652 static void lindexCommand(redisClient *c);
653 static void lrangeCommand(redisClient *c);
654 static void ltrimCommand(redisClient *c);
655 static void typeCommand(redisClient *c);
656 static void lsetCommand(redisClient *c);
657 static void saddCommand(redisClient *c);
658 static void sremCommand(redisClient *c);
659 static void smoveCommand(redisClient *c);
660 static void sismemberCommand(redisClient *c);
661 static void scardCommand(redisClient *c);
662 static void spopCommand(redisClient *c);
663 static void srandmemberCommand(redisClient *c);
664 static void sinterCommand(redisClient *c);
665 static void sinterstoreCommand(redisClient *c);
666 static void sunionCommand(redisClient *c);
667 static void sunionstoreCommand(redisClient *c);
668 static void sdiffCommand(redisClient *c);
669 static void sdiffstoreCommand(redisClient *c);
670 static void syncCommand(redisClient *c);
671 static void flushdbCommand(redisClient *c);
672 static void flushallCommand(redisClient *c);
673 static void sortCommand(redisClient *c);
674 static void lremCommand(redisClient *c);
675 static void rpoplpushcommand(redisClient *c);
676 static void infoCommand(redisClient *c);
677 static void mgetCommand(redisClient *c);
678 static void monitorCommand(redisClient *c);
679 static void expireCommand(redisClient *c);
680 static void expireatCommand(redisClient *c);
681 static void getsetCommand(redisClient *c);
682 static void ttlCommand(redisClient *c);
683 static void slaveofCommand(redisClient *c);
684 static void debugCommand(redisClient *c);
685 static void msetCommand(redisClient *c);
686 static void msetnxCommand(redisClient *c);
687 static void zaddCommand(redisClient *c);
688 static void zincrbyCommand(redisClient *c);
689 static void zrangeCommand(redisClient *c);
690 static void zrangebyscoreCommand(redisClient *c);
691 static void zcountCommand(redisClient *c);
692 static void zrevrangeCommand(redisClient *c);
693 static void zcardCommand(redisClient *c);
694 static void zremCommand(redisClient *c);
695 static void zscoreCommand(redisClient *c);
696 static void zremrangebyscoreCommand(redisClient *c);
697 static void multiCommand(redisClient *c);
698 static void execCommand(redisClient *c);
699 static void discardCommand(redisClient *c);
700 static void blpopCommand(redisClient *c);
701 static void brpopCommand(redisClient *c);
702 static void appendCommand(redisClient *c);
703 static void substrCommand(redisClient *c);
704 static void zrankCommand(redisClient *c);
705 static void zrevrankCommand(redisClient *c);
706 static void hsetCommand(redisClient *c);
707 static void hgetCommand(redisClient *c);
708 static void hmsetCommand(redisClient *c);
709 static void hmgetCommand(redisClient *c);
710 static void hdelCommand(redisClient *c);
711 static void hlenCommand(redisClient *c);
712 static void zremrangebyrankCommand(redisClient *c);
713 static void zunionCommand(redisClient *c);
714 static void zinterCommand(redisClient *c);
715 static void hkeysCommand(redisClient *c);
716 static void hvalsCommand(redisClient *c);
717 static void hgetallCommand(redisClient *c);
718 static void hexistsCommand(redisClient *c);
719 static void configCommand(redisClient *c);
720 static void hincrbyCommand(redisClient *c);
721 static void subscribeCommand(redisClient *c);
722 static void unsubscribeCommand(redisClient *c);
723 static void psubscribeCommand(redisClient *c);
724 static void punsubscribeCommand(redisClient *c);
725 static void publishCommand(redisClient *c);
726
727 /*================================= Globals ================================= */
728
729 /* Global vars */
730 static struct redisServer server; /* server global state */
731 static struct redisCommand cmdTable[] = {
732 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
734 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
735 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
738 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
739 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
742 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
754 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
755 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
758 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
767 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
768 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
775 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
776 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
799 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
800 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
812 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
820 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
831 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
836 {NULL,NULL,0,0,NULL,0,0,0}
837 };
838
839 /*============================ Utility functions ============================ */
840
/* Glob-style pattern matching over explicit-length buffers.
 *
 * Supported syntax:
 *   *       matches any run of characters (including none)
 *   ?       matches exactly one character
 *   [abc]   matches one character from the set; [^abc] negates the set,
 *           [a-z] is a range, and a backslash escapes the next set member
 *   \x      matches the character x literally
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             non-zero to compare case-insensitively.
 *
 * Returns 1 when the whole subject matches the whole pattern, 0 otherwise.
 *
 * Neither buffer is read beyond its stated length, so they do not need to be
 * NUL terminated. Bytes are compared as unsigned values, which keeps range
 * matching and tolower() well defined for bytes >= 0x80 regardless of the
 * platform's char signedness. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse consecutive stars; only look at pattern[1] when it
             * is still inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star matches everything left */
            /* Try to match the rest of the pattern at every suffix. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen <= 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one subject character. */
            if (stringLen <= 0)
                return 0; /* no match */

            pattern++;
            patternLen--;
            not = (patternLen > 0 && pattern[0] == '^');
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands exactly on the end
                     * of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped member; compared case-sensitively. A
                     * trailing backslash is handled as a literal below. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern byte, in
             * which case it stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen <= 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: any stars left in the pattern match the
             * empty remainder. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
963
/* Glob-style match of two NUL terminated C strings: measures both with
 * strlen() and delegates to stringmatchlen(). Returns what it returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
967
968 static void redisLog(int level, const char *fmt, ...) {
969 va_list ap;
970 FILE *fp;
971
972 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
973 if (!fp) return;
974
975 va_start(ap, fmt);
976 if (level >= server.verbosity) {
977 char *c = ".-*#";
978 char buf[64];
979 time_t now;
980
981 now = time(NULL);
982 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
983 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
984 vfprintf(fp, fmt, ap);
985 fprintf(fp,"\n");
986 fflush(fp);
987 }
988 va_end(ap);
989
990 if (server.logfile) fclose(fp);
991 }
992
993 /*====================== Hash table type implementation ==================== */
994
995 /* This is a hash table type that uses the SDS dynamic strings library as
996 * keys and redis objects as values (objects can hold SDS strings,
997 * lists, sets). */
998
/* Dict value destructor that simply releases the value with zfree().
 * The dict private data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1004
1005 static void dictListDestructor(void *privdata, void *val)
1006 {
1007 DICT_NOTUSED(privdata);
1008 listRelease((list*)val);
1009 }
1010
1011 static int sdsDictKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013 {
1014 int l1,l2;
1015 DICT_NOTUSED(privdata);
1016
1017 l1 = sdslen((sds)key1);
1018 l2 = sdslen((sds)key2);
1019 if (l1 != l2) return 0;
1020 return memcmp(key1, key2, l1) == 0;
1021 }
1022
/* Dict destructor for redis objects: drops one reference from the object.
 * NULL is accepted and ignored, because the values of swapped out keys are
 * set to NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
1030
1031 static int dictObjKeyCompare(void *privdata, const void *key1,
1032 const void *key2)
1033 {
1034 const robj *o1 = key1, *o2 = key2;
1035 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1036 }
1037
1038 static unsigned int dictObjHash(const void *key) {
1039 const robj *o = key;
1040 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1041 }
1042
1043 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1044 const void *key2)
1045 {
1046 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1047 int cmp;
1048
1049 if (o1->encoding == REDIS_ENCODING_INT &&
1050 o2->encoding == REDIS_ENCODING_INT &&
1051 o1->ptr == o2->ptr) return 1;
1052
1053 o1 = getDecodedObject(o1);
1054 o2 = getDecodedObject(o2);
1055 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1056 decrRefCount(o1);
1057 decrRefCount(o2);
1058 return cmp;
1059 }
1060
1061 static unsigned int dictEncObjHash(const void *key) {
1062 robj *o = (robj*) key;
1063
1064 if (o->encoding == REDIS_ENCODING_RAW) {
1065 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1066 } else {
1067 if (o->encoding == REDIS_ENCODING_INT) {
1068 char buf[32];
1069 int len;
1070
1071 len = snprintf(buf,32,"%ld",(long)o->ptr);
1072 return dictGenHashFunction((unsigned char*)buf, len);
1073 } else {
1074 unsigned int hash;
1075
1076 o = getDecodedObject(o);
1077 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1078 decrRefCount(o);
1079 return hash;
1080 }
1081 }
1082 }
1083
1084 /* Sets type and expires */
1085 static dictType setDictType = {
1086 dictEncObjHash, /* hash function */
1087 NULL, /* key dup */
1088 NULL, /* val dup */
1089 dictEncObjKeyCompare, /* key compare */
1090 dictRedisObjectDestructor, /* key destructor */
1091 NULL /* val destructor */
1092 };
1093
1094 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1095 static dictType zsetDictType = {
1096 dictEncObjHash, /* hash function */
1097 NULL, /* key dup */
1098 NULL, /* val dup */
1099 dictEncObjKeyCompare, /* key compare */
1100 dictRedisObjectDestructor, /* key destructor */
1101 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1102 };
1103
1104 /* Db->dict */
1105 static dictType dbDictType = {
1106 dictObjHash, /* hash function */
1107 NULL, /* key dup */
1108 NULL, /* val dup */
1109 dictObjKeyCompare, /* key compare */
1110 dictRedisObjectDestructor, /* key destructor */
1111 dictRedisObjectDestructor /* val destructor */
1112 };
1113
1114 /* Db->expires */
1115 static dictType keyptrDictType = {
1116 dictObjHash, /* hash function */
1117 NULL, /* key dup */
1118 NULL, /* val dup */
1119 dictObjKeyCompare, /* key compare */
1120 dictRedisObjectDestructor, /* key destructor */
1121 NULL /* val destructor */
1122 };
1123
1124 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1125 static dictType hashDictType = {
1126 dictEncObjHash, /* hash function */
1127 NULL, /* key dup */
1128 NULL, /* val dup */
1129 dictEncObjKeyCompare, /* key compare */
1130 dictRedisObjectDestructor, /* key destructor */
1131 dictRedisObjectDestructor /* val destructor */
1132 };
1133
1134 /* Keylist hash table type has unencoded redis objects as keys and
1135 * lists as values. It's used for blocking operations (BLPOP) and to
1136 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1137 static dictType keylistDictType = {
1138 dictObjHash, /* hash function */
1139 NULL, /* key dup */
1140 NULL, /* val dup */
1141 dictObjKeyCompare, /* key compare */
1142 dictRedisObjectDestructor, /* key destructor */
1143 dictListDestructor /* val destructor */
1144 };
1145
/* Forward declaration. An empty parameter list "()" does not form a
 * prototype in C, so "(void)" is used to let the compiler reject calls
 * that pass arguments. */
static void version(void);
1147
1148 /* ========================= Random utility functions ======================= */
1149
1150 /* Redis generally does not try to recover from out of memory conditions
1151 * when allocating objects or strings, it is not clear if it will be possible
1152 * to report this condition to the client since the networking layer itself
1153 * is based on heap allocation for send buffers, so we simply abort.
1154 * At least the code will be simpler to read... */
1155 static void oom(const char *msg) {
1156 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1157 sleep(1);
1158 abort();
1159 }
1160
1161 /* ====================== Redis server networking stuff ===================== */
1162 static void closeTimedoutClients(void) {
1163 redisClient *c;
1164 listNode *ln;
1165 time_t now = time(NULL);
1166 listIter li;
1167
1168 listRewind(server.clients,&li);
1169 while ((ln = listNext(&li)) != NULL) {
1170 c = listNodeValue(ln);
1171 if (server.maxidletime &&
1172 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1173 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1174 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1175 listLength(c->pubsub_patterns) == 0 &&
1176 (now - c->lastinteraction > server.maxidletime))
1177 {
1178 redisLog(REDIS_VERBOSE,"Closing idle client");
1179 freeClient(c);
1180 } else if (c->flags & REDIS_BLOCKED) {
1181 if (c->blockingto != 0 && c->blockingto < now) {
1182 addReply(c,shared.nullmultibulk);
1183 unblockClientWaitingData(c);
1184 }
1185 }
1186 }
1187 }
1188
1189 static int htNeedsResize(dict *dict) {
1190 long long size, used;
1191
1192 size = dictSlots(dict);
1193 used = dictSize(dict);
1194 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1195 (used*100/size < REDIS_HT_MINFILL));
1196 }
1197
1198 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1199 * we resize the hash table to save memory */
1200 static void tryResizeHashTables(void) {
1201 int j;
1202
1203 for (j = 0; j < server.dbnum; j++) {
1204 if (htNeedsResize(server.db[j].dict))
1205 dictResize(server.db[j].dict);
1206 if (htNeedsResize(server.db[j].expires))
1207 dictResize(server.db[j].expires);
1208 }
1209 }
1210
1211 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1212 void backgroundSaveDoneHandler(int statloc) {
1213 int exitcode = WEXITSTATUS(statloc);
1214 int bysignal = WIFSIGNALED(statloc);
1215
1216 if (!bysignal && exitcode == 0) {
1217 redisLog(REDIS_NOTICE,
1218 "Background saving terminated with success");
1219 server.dirty = 0;
1220 server.lastsave = time(NULL);
1221 } else if (!bysignal && exitcode != 0) {
1222 redisLog(REDIS_WARNING, "Background saving error");
1223 } else {
1224 redisLog(REDIS_WARNING,
1225 "Background saving terminated by signal %d", WTERMSIG(statloc));
1226 rdbRemoveTempFile(server.bgsavechildpid);
1227 }
1228 server.bgsavechildpid = -1;
1229 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1230 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1231 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1232 }
1233
1234 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1235 * Handle this. */
1236 void backgroundRewriteDoneHandler(int statloc) {
1237 int exitcode = WEXITSTATUS(statloc);
1238 int bysignal = WIFSIGNALED(statloc);
1239
1240 if (!bysignal && exitcode == 0) {
1241 int fd;
1242 char tmpfile[256];
1243
1244 redisLog(REDIS_NOTICE,
1245 "Background append only file rewriting terminated with success");
1246 /* Now it's time to flush the differences accumulated by the parent */
1247 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1248 fd = open(tmpfile,O_WRONLY|O_APPEND);
1249 if (fd == -1) {
1250 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1251 goto cleanup;
1252 }
1253 /* Flush our data... */
1254 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1255 (signed) sdslen(server.bgrewritebuf)) {
1256 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1257 close(fd);
1258 goto cleanup;
1259 }
1260 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1261 /* Now our work is to rename the temp file into the stable file. And
1262 * switch the file descriptor used by the server for append only. */
1263 if (rename(tmpfile,server.appendfilename) == -1) {
1264 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1265 close(fd);
1266 goto cleanup;
1267 }
1268 /* Mission completed... almost */
1269 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1270 if (server.appendfd != -1) {
1271 /* If append only is actually enabled... */
1272 close(server.appendfd);
1273 server.appendfd = fd;
1274 fsync(fd);
1275 server.appendseldb = -1; /* Make sure it will issue SELECT */
1276 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1277 } else {
1278 /* If append only is disabled we just generate a dump in this
1279 * format. Why not? */
1280 close(fd);
1281 }
1282 } else if (!bysignal && exitcode != 0) {
1283 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1284 } else {
1285 redisLog(REDIS_WARNING,
1286 "Background append only file rewriting terminated by signal %d",
1287 WTERMSIG(statloc));
1288 }
1289 cleanup:
1290 sdsfree(server.bgrewritebuf);
1291 server.bgrewritebuf = sdsempty();
1292 aofRemoveTempFile(server.bgrewritechildpid);
1293 server.bgrewritechildpid = -1;
1294 }
1295
1296 /* This function is called once a background process of some kind terminates,
1297 * as we want to avoid resizing the hash tables when there is a child in order
1298 * to play well with copy-on-write (otherwise when a resize happens lots of
1299 * memory pages are copied). The goal of this function is to update the ability
1300 * for dict.c to resize the hash tables accordingly to the fact we have o not
1301 * running childs. */
1302 static void updateDictResizePolicy(void) {
1303 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1304 dictEnableResize();
1305 else
1306 dictDisableResize();
1307 }
1308
1309 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1310 int j, loops = server.cronloops++;
1311 REDIS_NOTUSED(eventLoop);
1312 REDIS_NOTUSED(id);
1313 REDIS_NOTUSED(clientData);
1314
1315 /* We take a cached value of the unix time in the global state because
1316 * with virtual memory and aging there is to store the current time
1317 * in objects at every object access, and accuracy is not needed.
1318 * To access a global var is faster than calling time(NULL) */
1319 server.unixtime = time(NULL);
1320
1321 /* Show some info about non-empty databases */
1322 for (j = 0; j < server.dbnum; j++) {
1323 long long size, used, vkeys;
1324
1325 size = dictSlots(server.db[j].dict);
1326 used = dictSize(server.db[j].dict);
1327 vkeys = dictSize(server.db[j].expires);
1328 if (!(loops % 50) && (used || vkeys)) {
1329 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1330 /* dictPrintStats(server.dict); */
1331 }
1332 }
1333
1334 /* We don't want to resize the hash tables while a bacground saving
1335 * is in progress: the saving child is created using fork() that is
1336 * implemented with a copy-on-write semantic in most modern systems, so
1337 * if we resize the HT while there is the saving child at work actually
1338 * a lot of memory movements in the parent will cause a lot of pages
1339 * copied. */
1340 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1341 !(loops % 10))
1342 {
1343 tryResizeHashTables();
1344 }
1345
1346 /* Show information about connected clients */
1347 if (!(loops % 50)) {
1348 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1349 listLength(server.clients)-listLength(server.slaves),
1350 listLength(server.slaves),
1351 zmalloc_used_memory());
1352 }
1353
1354 /* Close connections of timedout clients */
1355 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1356 closeTimedoutClients();
1357
1358 /* Check if a background saving or AOF rewrite in progress terminated */
1359 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1360 int statloc;
1361 pid_t pid;
1362
1363 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1364 if (pid == server.bgsavechildpid) {
1365 backgroundSaveDoneHandler(statloc);
1366 } else {
1367 backgroundRewriteDoneHandler(statloc);
1368 }
1369 updateDictResizePolicy();
1370 }
1371 } else {
1372 /* If there is not a background saving in progress check if
1373 * we have to save now */
1374 time_t now = time(NULL);
1375 for (j = 0; j < server.saveparamslen; j++) {
1376 struct saveparam *sp = server.saveparams+j;
1377
1378 if (server.dirty >= sp->changes &&
1379 now-server.lastsave > sp->seconds) {
1380 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1381 sp->changes, sp->seconds);
1382 rdbSaveBackground(server.dbfilename);
1383 break;
1384 }
1385 }
1386 }
1387
1388 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1389 * will use few CPU cycles if there are few expiring keys, otherwise
1390 * it will get more aggressive to avoid that too much memory is used by
1391 * keys that can be removed from the keyspace. */
1392 for (j = 0; j < server.dbnum; j++) {
1393 int expired;
1394 redisDb *db = server.db+j;
1395
1396 /* Continue to expire if at the end of the cycle more than 25%
1397 * of the keys were expired. */
1398 do {
1399 long num = dictSize(db->expires);
1400 time_t now = time(NULL);
1401
1402 expired = 0;
1403 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1404 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1405 while (num--) {
1406 dictEntry *de;
1407 time_t t;
1408
1409 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1410 t = (time_t) dictGetEntryVal(de);
1411 if (now > t) {
1412 deleteKey(db,dictGetEntryKey(de));
1413 expired++;
1414 server.stat_expiredkeys++;
1415 }
1416 }
1417 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1418 }
1419
1420 /* Swap a few keys on disk if we are over the memory limit and VM
1421 * is enbled. Try to free objects from the free list first. */
1422 if (vmCanSwapOut()) {
1423 while (server.vm_enabled && zmalloc_used_memory() >
1424 server.vm_max_memory)
1425 {
1426 int retval;
1427
1428 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1429 retval = (server.vm_max_threads == 0) ?
1430 vmSwapOneObjectBlocking() :
1431 vmSwapOneObjectThreaded();
1432 if (retval == REDIS_ERR && !(loops % 300) &&
1433 zmalloc_used_memory() >
1434 (server.vm_max_memory+server.vm_max_memory/10))
1435 {
1436 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1437 }
1438 /* Note that when using threade I/O we free just one object,
1439 * because anyway when the I/O thread in charge to swap this
1440 * object out will finish, the handler of completed jobs
1441 * will try to swap more objects if we are still out of memory. */
1442 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1443 }
1444 }
1445
1446 /* Check if we should connect to a MASTER */
1447 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1448 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1449 if (syncWithMaster() == REDIS_OK) {
1450 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1451 }
1452 }
1453 return 100;
1454 }
1455
1456 /* This function gets called every time Redis is entering the
1457 * main loop of the event driven library, that is, before to sleep
1458 * for ready file descriptors. */
1459 static void beforeSleep(struct aeEventLoop *eventLoop) {
1460 REDIS_NOTUSED(eventLoop);
1461
1462 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1463 listIter li;
1464 listNode *ln;
1465
1466 listRewind(server.io_ready_clients,&li);
1467 while((ln = listNext(&li))) {
1468 redisClient *c = ln->value;
1469 struct redisCommand *cmd;
1470
1471 /* Resume the client. */
1472 listDelNode(server.io_ready_clients,ln);
1473 c->flags &= (~REDIS_IO_WAIT);
1474 server.vm_blocked_clients--;
1475 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1476 readQueryFromClient, c);
1477 cmd = lookupCommand(c->argv[0]->ptr);
1478 assert(cmd != NULL);
1479 call(c,cmd);
1480 resetClient(c);
1481 /* There may be more data to process in the input buffer. */
1482 if (c->querybuf && sdslen(c->querybuf) > 0)
1483 processInputBuffer(c);
1484 }
1485 }
1486 }
1487
1488 static void createSharedObjects(void) {
1489 int j;
1490
1491 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1492 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1493 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1494 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1495 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1496 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1497 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1498 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1499 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1500 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1501 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1502 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1503 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1504 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1505 "-ERR no such key\r\n"));
1506 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1507 "-ERR syntax error\r\n"));
1508 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1509 "-ERR source and destination objects are the same\r\n"));
1510 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1511 "-ERR index out of range\r\n"));
1512 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1513 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1514 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1515 shared.select0 = createStringObject("select 0\r\n",10);
1516 shared.select1 = createStringObject("select 1\r\n",10);
1517 shared.select2 = createStringObject("select 2\r\n",10);
1518 shared.select3 = createStringObject("select 3\r\n",10);
1519 shared.select4 = createStringObject("select 4\r\n",10);
1520 shared.select5 = createStringObject("select 5\r\n",10);
1521 shared.select6 = createStringObject("select 6\r\n",10);
1522 shared.select7 = createStringObject("select 7\r\n",10);
1523 shared.select8 = createStringObject("select 8\r\n",10);
1524 shared.select9 = createStringObject("select 9\r\n",10);
1525 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1526 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1527 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1528 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1529 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1530 shared.mbulk3 = createStringObject("*3\r\n",4);
1531 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1532 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1533 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1534 }
1535 }
1536
1537 static void appendServerSaveParams(time_t seconds, int changes) {
1538 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1539 server.saveparams[server.saveparamslen].seconds = seconds;
1540 server.saveparams[server.saveparamslen].changes = changes;
1541 server.saveparamslen++;
1542 }
1543
1544 static void resetServerSaveParams() {
1545 zfree(server.saveparams);
1546 server.saveparams = NULL;
1547 server.saveparamslen = 0;
1548 }
1549
1550 static void initServerConfig() {
1551 server.dbnum = REDIS_DEFAULT_DBNUM;
1552 server.port = REDIS_SERVERPORT;
1553 server.verbosity = REDIS_VERBOSE;
1554 server.maxidletime = REDIS_MAXIDLETIME;
1555 server.saveparams = NULL;
1556 server.logfile = NULL; /* NULL = log on standard output */
1557 server.bindaddr = NULL;
1558 server.glueoutputbuf = 1;
1559 server.daemonize = 0;
1560 server.appendonly = 0;
1561 server.appendfsync = APPENDFSYNC_ALWAYS;
1562 server.lastfsync = time(NULL);
1563 server.appendfd = -1;
1564 server.appendseldb = -1; /* Make sure the first time will not match */
1565 server.pidfile = zstrdup("/var/run/redis.pid");
1566 server.dbfilename = zstrdup("dump.rdb");
1567 server.appendfilename = zstrdup("appendonly.aof");
1568 server.requirepass = NULL;
1569 server.shareobjects = 0;
1570 server.rdbcompression = 1;
1571 server.maxclients = 0;
1572 server.blpop_blocked_clients = 0;
1573 server.maxmemory = 0;
1574 server.vm_enabled = 0;
1575 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1576 server.vm_page_size = 256; /* 256 bytes per page */
1577 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1578 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1579 server.vm_max_threads = 4;
1580 server.vm_blocked_clients = 0;
1581 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1582 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1583
1584 resetServerSaveParams();
1585
1586 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1587 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1588 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1589 /* Replication related */
1590 server.isslave = 0;
1591 server.masterauth = NULL;
1592 server.masterhost = NULL;
1593 server.masterport = 6379;
1594 server.master = NULL;
1595 server.replstate = REDIS_REPL_NONE;
1596
1597 /* Double constants initialization */
1598 R_Zero = 0.0;
1599 R_PosInf = 1.0/R_Zero;
1600 R_NegInf = -1.0/R_Zero;
1601 R_Nan = R_Zero/R_Zero;
1602 }
1603
1604 static void initServer() {
1605 int j;
1606
1607 signal(SIGHUP, SIG_IGN);
1608 signal(SIGPIPE, SIG_IGN);
1609 setupSigSegvAction();
1610
1611 server.devnull = fopen("/dev/null","w");
1612 if (server.devnull == NULL) {
1613 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1614 exit(1);
1615 }
1616 server.clients = listCreate();
1617 server.slaves = listCreate();
1618 server.monitors = listCreate();
1619 server.objfreelist = listCreate();
1620 createSharedObjects();
1621 server.el = aeCreateEventLoop();
1622 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1623 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1624 if (server.fd == -1) {
1625 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1626 exit(1);
1627 }
1628 for (j = 0; j < server.dbnum; j++) {
1629 server.db[j].dict = dictCreate(&dbDictType,NULL);
1630 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1631 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1632 if (server.vm_enabled)
1633 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1634 server.db[j].id = j;
1635 }
1636 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1637 server.pubsub_patterns = listCreate();
1638 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1639 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1640 server.cronloops = 0;
1641 server.bgsavechildpid = -1;
1642 server.bgrewritechildpid = -1;
1643 server.bgrewritebuf = sdsempty();
1644 server.lastsave = time(NULL);
1645 server.dirty = 0;
1646 server.stat_numcommands = 0;
1647 server.stat_numconnections = 0;
1648 server.stat_expiredkeys = 0;
1649 server.stat_starttime = time(NULL);
1650 server.unixtime = time(NULL);
1651 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1652 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1653 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1654
1655 if (server.appendonly) {
1656 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1657 if (server.appendfd == -1) {
1658 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1659 strerror(errno));
1660 exit(1);
1661 }
1662 }
1663
1664 if (server.vm_enabled) vmInit();
1665 }
1666
1667 /* Empty the whole database */
1668 static long long emptyDb() {
1669 int j;
1670 long long removed = 0;
1671
1672 for (j = 0; j < server.dbnum; j++) {
1673 removed += dictSize(server.db[j].dict);
1674 dictEmpty(server.db[j].dict);
1675 dictEmpty(server.db[j].expires);
1676 }
1677 return removed;
1678 }
1679
/* Convert a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1685
1686 /* I agree, this is a very rudimental way to load a configuration...
1687 will improve later if the config gets more complex */
1688 static void loadServerConfig(char *filename) {
1689 FILE *fp;
1690 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1691 int linenum = 0;
1692 sds line = NULL;
1693
1694 if (filename[0] == '-' && filename[1] == '\0')
1695 fp = stdin;
1696 else {
1697 if ((fp = fopen(filename,"r")) == NULL) {
1698 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1699 exit(1);
1700 }
1701 }
1702
1703 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1704 sds *argv;
1705 int argc, j;
1706
1707 linenum++;
1708 line = sdsnew(buf);
1709 line = sdstrim(line," \t\r\n");
1710
1711 /* Skip comments and blank lines*/
1712 if (line[0] == '#' || line[0] == '\0') {
1713 sdsfree(line);
1714 continue;
1715 }
1716
1717 /* Split into arguments */
1718 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1719 sdstolower(argv[0]);
1720
1721 /* Execute config directives */
1722 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1723 server.maxidletime = atoi(argv[1]);
1724 if (server.maxidletime < 0) {
1725 err = "Invalid timeout value"; goto loaderr;
1726 }
1727 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1728 server.port = atoi(argv[1]);
1729 if (server.port < 1 || server.port > 65535) {
1730 err = "Invalid port"; goto loaderr;
1731 }
1732 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1733 server.bindaddr = zstrdup(argv[1]);
1734 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1735 int seconds = atoi(argv[1]);
1736 int changes = atoi(argv[2]);
1737 if (seconds < 1 || changes < 0) {
1738 err = "Invalid save parameters"; goto loaderr;
1739 }
1740 appendServerSaveParams(seconds,changes);
1741 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1742 if (chdir(argv[1]) == -1) {
1743 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1744 argv[1], strerror(errno));
1745 exit(1);
1746 }
1747 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1748 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1749 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1750 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1751 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1752 else {
1753 err = "Invalid log level. Must be one of debug, notice, warning";
1754 goto loaderr;
1755 }
1756 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1757 FILE *logfp;
1758
1759 server.logfile = zstrdup(argv[1]);
1760 if (!strcasecmp(server.logfile,"stdout")) {
1761 zfree(server.logfile);
1762 server.logfile = NULL;
1763 }
1764 if (server.logfile) {
1765 /* Test if we are able to open the file. The server will not
1766 * be able to abort just for this problem later... */
1767 logfp = fopen(server.logfile,"a");
1768 if (logfp == NULL) {
1769 err = sdscatprintf(sdsempty(),
1770 "Can't open the log file: %s", strerror(errno));
1771 goto loaderr;
1772 }
1773 fclose(logfp);
1774 }
1775 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1776 server.dbnum = atoi(argv[1]);
1777 if (server.dbnum < 1) {
1778 err = "Invalid number of databases"; goto loaderr;
1779 }
1780 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1781 loadServerConfig(argv[1]);
1782 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1783 server.maxclients = atoi(argv[1]);
1784 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1785 server.maxmemory = strtoll(argv[1], NULL, 10);
1786 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1787 server.masterhost = sdsnew(argv[1]);
1788 server.masterport = atoi(argv[2]);
1789 server.replstate = REDIS_REPL_CONNECT;
1790 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1791 server.masterauth = zstrdup(argv[1]);
1792 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1793 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1794 err = "argument must be 'yes' or 'no'"; goto loaderr;
1795 }
1796 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1797 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1798 err = "argument must be 'yes' or 'no'"; goto loaderr;
1799 }
1800 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1801 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1802 err = "argument must be 'yes' or 'no'"; goto loaderr;
1803 }
1804 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1805 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1806 err = "argument must be 'yes' or 'no'"; goto loaderr;
1807 }
1808 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1809 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1810 err = "argument must be 'yes' or 'no'"; goto loaderr;
1811 }
1812 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1813 if (!strcasecmp(argv[1],"no")) {
1814 server.appendfsync = APPENDFSYNC_NO;
1815 } else if (!strcasecmp(argv[1],"always")) {
1816 server.appendfsync = APPENDFSYNC_ALWAYS;
1817 } else if (!strcasecmp(argv[1],"everysec")) {
1818 server.appendfsync = APPENDFSYNC_EVERYSEC;
1819 } else {
1820 err = "argument must be 'no', 'always' or 'everysec'";
1821 goto loaderr;
1822 }
1823 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1824 server.requirepass = zstrdup(argv[1]);
1825 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1826 zfree(server.pidfile);
1827 server.pidfile = zstrdup(argv[1]);
1828 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1829 zfree(server.dbfilename);
1830 server.dbfilename = zstrdup(argv[1]);
1831 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1832 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1833 err = "argument must be 'yes' or 'no'"; goto loaderr;
1834 }
1835 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1836 zfree(server.vm_swap_file);
1837 server.vm_swap_file = zstrdup(argv[1]);
1838 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1839 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1840 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1841 server.vm_page_size = strtoll(argv[1], NULL, 10);
1842 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1843 server.vm_pages = strtoll(argv[1], NULL, 10);
1844 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1845 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1846 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1847 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1848 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1849 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1850 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1851 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1852 } else {
1853 err = "Bad directive or wrong number of arguments"; goto loaderr;
1854 }
1855 for (j = 0; j < argc; j++)
1856 sdsfree(argv[j]);
1857 zfree(argv);
1858 sdsfree(line);
1859 }
1860 if (fp != stdin) fclose(fp);
1861 return;
1862
1863 loaderr:
1864 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1865 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1866 fprintf(stderr, ">>> '%s'\n", line);
1867 fprintf(stderr, "%s\n", err);
1868 exit(1);
1869 }
1870
1871 static void freeClientArgv(redisClient *c) {
1872 int j;
1873
1874 for (j = 0; j < c->argc; j++)
1875 decrRefCount(c->argv[j]);
1876 for (j = 0; j < c->mbargc; j++)
1877 decrRefCount(c->mbargv[j]);
1878 c->argc = 0;
1879 c->mbargc = 0;
1880 }
1881
1882 static void freeClient(redisClient *c) {
1883 listNode *ln;
1884
1885 /* Note that if the client we are freeing is blocked into a blocking
1886 * call, we have to set querybuf to NULL *before* to call
1887 * unblockClientWaitingData() to avoid processInputBuffer() will get
1888 * called. Also it is important to remove the file events after
1889 * this, because this call adds the READABLE event. */
1890 sdsfree(c->querybuf);
1891 c->querybuf = NULL;
1892 if (c->flags & REDIS_BLOCKED)
1893 unblockClientWaitingData(c);
1894
1895 /* Unsubscribe from all the pubsub channels */
1896 pubsubUnsubscribeAllChannels(c,0);
1897 pubsubUnsubscribeAllPatterns(c,0);
1898 dictRelease(c->pubsub_channels);
1899 listRelease(c->pubsub_patterns);
1900 /* Obvious cleanup */
1901 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1902 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1903 listRelease(c->reply);
1904 freeClientArgv(c);
1905 close(c->fd);
1906 /* Remove from the list of clients */
1907 ln = listSearchKey(server.clients,c);
1908 redisAssert(ln != NULL);
1909 listDelNode(server.clients,ln);
1910 /* Remove from the list of clients waiting for swapped keys */
1911 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1912 ln = listSearchKey(server.io_ready_clients,c);
1913 if (ln) {
1914 listDelNode(server.io_ready_clients,ln);
1915 server.vm_blocked_clients--;
1916 }
1917 }
1918 while (server.vm_enabled && listLength(c->io_keys)) {
1919 ln = listFirst(c->io_keys);
1920 dontWaitForSwappedKey(c,ln->value);
1921 }
1922 listRelease(c->io_keys);
1923 /* Master/slave cleanup */
1924 if (c->flags & REDIS_SLAVE) {
1925 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1926 close(c->repldbfd);
1927 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1928 ln = listSearchKey(l,c);
1929 redisAssert(ln != NULL);
1930 listDelNode(l,ln);
1931 }
1932 if (c->flags & REDIS_MASTER) {
1933 server.master = NULL;
1934 server.replstate = REDIS_REPL_CONNECT;
1935 }
1936 /* Release memory */
1937 zfree(c->argv);
1938 zfree(c->mbargv);
1939 freeClientMultiState(c);
1940 zfree(c);
1941 }
1942
1943 #define GLUEREPLY_UP_TO (1024)
1944 static void glueReplyBuffersIfNeeded(redisClient *c) {
1945 int copylen = 0;
1946 char buf[GLUEREPLY_UP_TO];
1947 listNode *ln;
1948 listIter li;
1949 robj *o;
1950
1951 listRewind(c->reply,&li);
1952 while((ln = listNext(&li))) {
1953 int objlen;
1954
1955 o = ln->value;
1956 objlen = sdslen(o->ptr);
1957 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1958 memcpy(buf+copylen,o->ptr,objlen);
1959 copylen += objlen;
1960 listDelNode(c->reply,ln);
1961 } else {
1962 if (copylen == 0) return;
1963 break;
1964 }
1965 }
1966 /* Now the output buffer is empty, add the new single element */
1967 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1968 listAddNodeHead(c->reply,o);
1969 }
1970
1971 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1972 redisClient *c = privdata;
1973 int nwritten = 0, totwritten = 0, objlen;
1974 robj *o;
1975 REDIS_NOTUSED(el);
1976 REDIS_NOTUSED(mask);
1977
1978 /* Use writev() if we have enough buffers to send */
1979 if (!server.glueoutputbuf &&
1980 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1981 !(c->flags & REDIS_MASTER))
1982 {
1983 sendReplyToClientWritev(el, fd, privdata, mask);
1984 return;
1985 }
1986
1987 while(listLength(c->reply)) {
1988 if (server.glueoutputbuf && listLength(c->reply) > 1)
1989 glueReplyBuffersIfNeeded(c);
1990
1991 o = listNodeValue(listFirst(c->reply));
1992 objlen = sdslen(o->ptr);
1993
1994 if (objlen == 0) {
1995 listDelNode(c->reply,listFirst(c->reply));
1996 continue;
1997 }
1998
1999 if (c->flags & REDIS_MASTER) {
2000 /* Don't reply to a master */
2001 nwritten = objlen - c->sentlen;
2002 } else {
2003 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2004 if (nwritten <= 0) break;
2005 }
2006 c->sentlen += nwritten;
2007 totwritten += nwritten;
2008 /* If we fully sent the object on head go to the next one */
2009 if (c->sentlen == objlen) {
2010 listDelNode(c->reply,listFirst(c->reply));
2011 c->sentlen = 0;
2012 }
2013 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2014 * bytes, in a single threaded server it's a good idea to serve
2015 * other clients as well, even if a very large request comes from
2016 * super fast link that is always able to accept data (in real world
2017 * scenario think about 'KEYS *' against the loopback interfae) */
2018 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2019 }
2020 if (nwritten == -1) {
2021 if (errno == EAGAIN) {
2022 nwritten = 0;
2023 } else {
2024 redisLog(REDIS_VERBOSE,
2025 "Error writing to client: %s", strerror(errno));
2026 freeClient(c);
2027 return;
2028 }
2029 }
2030 if (totwritten > 0) c->lastinteraction = time(NULL);
2031 if (listLength(c->reply) == 0) {
2032 c->sentlen = 0;
2033 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2034 }
2035 }
2036
2037 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2038 {
2039 redisClient *c = privdata;
2040 int nwritten = 0, totwritten = 0, objlen, willwrite;
2041 robj *o;
2042 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2043 int offset, ion = 0;
2044 REDIS_NOTUSED(el);
2045 REDIS_NOTUSED(mask);
2046
2047 listNode *node;
2048 while (listLength(c->reply)) {
2049 offset = c->sentlen;
2050 ion = 0;
2051 willwrite = 0;
2052
2053 /* fill-in the iov[] array */
2054 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2055 o = listNodeValue(node);
2056 objlen = sdslen(o->ptr);
2057
2058 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2059 break;
2060
2061 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2062 break; /* no more iovecs */
2063
2064 iov[ion].iov_base = ((char*)o->ptr) + offset;
2065 iov[ion].iov_len = objlen - offset;
2066 willwrite += objlen - offset;
2067 offset = 0; /* just for the first item */
2068 ion++;
2069 }
2070
2071 if(willwrite == 0)
2072 break;
2073
2074 /* write all collected blocks at once */
2075 if((nwritten = writev(fd, iov, ion)) < 0) {
2076 if (errno != EAGAIN) {
2077 redisLog(REDIS_VERBOSE,
2078 "Error writing to client: %s", strerror(errno));
2079 freeClient(c);
2080 return;
2081 }
2082 break;
2083 }
2084
2085 totwritten += nwritten;
2086 offset = c->sentlen;
2087
2088 /* remove written robjs from c->reply */
2089 while (nwritten && listLength(c->reply)) {
2090 o = listNodeValue(listFirst(c->reply));
2091 objlen = sdslen(o->ptr);
2092
2093 if(nwritten >= objlen - offset) {
2094 listDelNode(c->reply, listFirst(c->reply));
2095 nwritten -= objlen - offset;
2096 c->sentlen = 0;
2097 } else {
2098 /* partial write */
2099 c->sentlen += nwritten;
2100 break;
2101 }
2102 offset = 0;
2103 }
2104 }
2105
2106 if (totwritten > 0)
2107 c->lastinteraction = time(NULL);
2108
2109 if (listLength(c->reply) == 0) {
2110 c->sentlen = 0;
2111 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2112 }
2113 }
2114
2115 static struct redisCommand *lookupCommand(char *name) {
2116 int j = 0;
2117 while(cmdTable[j].name != NULL) {
2118 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2119 j++;
2120 }
2121 return NULL;
2122 }
2123
2124 /* resetClient prepare the client to process the next command */
2125 static void resetClient(redisClient *c) {
2126 freeClientArgv(c);
2127 c->bulklen = -1;
2128 c->multibulk = 0;
2129 }
2130
2131 /* Call() is the core of Redis execution of a command */
2132 static void call(redisClient *c, struct redisCommand *cmd) {
2133 long long dirty;
2134
2135 dirty = server.dirty;
2136 cmd->proc(c);
2137 dirty = server.dirty-dirty;
2138
2139 if (server.appendonly && dirty)
2140 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2141 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2142 listLength(server.slaves))
2143 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2144 if (listLength(server.monitors))
2145 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2146 server.stat_numcommands++;
2147 }
2148
2149 /* If this function gets called we already read a whole
2150 * command, argments are in the client argv/argc fields.
2151 * processCommand() execute the command or prepare the
2152 * server for a bulk read from the client.
2153 *
2154 * If 1 is returned the client is still alive and valid and
2155 * and other operations can be performed by the caller. Otherwise
2156 * if 0 is returned the client was destroied (i.e. after QUIT). */
2157 static int processCommand(redisClient *c) {
2158 struct redisCommand *cmd;
2159
2160 /* Free some memory if needed (maxmemory setting) */
2161 if (server.maxmemory) freeMemoryIfNeeded();
2162
2163 /* Handle the multi bulk command type. This is an alternative protocol
2164 * supported by Redis in order to receive commands that are composed of
2165 * multiple binary-safe "bulk" arguments. The latency of processing is
2166 * a bit higher but this allows things like multi-sets, so if this
2167 * protocol is used only for MSET and similar commands this is a big win. */
2168 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2169 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2170 if (c->multibulk <= 0) {
2171 resetClient(c);
2172 return 1;
2173 } else {
2174 decrRefCount(c->argv[c->argc-1]);
2175 c->argc--;
2176 return 1;
2177 }
2178 } else if (c->multibulk) {
2179 if (c->bulklen == -1) {
2180 if (((char*)c->argv[0]->ptr)[0] != '$') {
2181 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2182 resetClient(c);
2183 return 1;
2184 } else {
2185 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2186 decrRefCount(c->argv[0]);
2187 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2188 c->argc--;
2189 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2190 resetClient(c);
2191 return 1;
2192 }
2193 c->argc--;
2194 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2195 return 1;
2196 }
2197 } else {
2198 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2199 c->mbargv[c->mbargc] = c->argv[0];
2200 c->mbargc++;
2201 c->argc--;
2202 c->multibulk--;
2203 if (c->multibulk == 0) {
2204 robj **auxargv;
2205 int auxargc;
2206
2207 /* Here we need to swap the multi-bulk argc/argv with the
2208 * normal argc/argv of the client structure. */
2209 auxargv = c->argv;
2210 c->argv = c->mbargv;
2211 c->mbargv = auxargv;
2212
2213 auxargc = c->argc;
2214 c->argc = c->mbargc;
2215 c->mbargc = auxargc;
2216
2217 /* We need to set bulklen to something different than -1
2218 * in order for the code below to process the command without
2219 * to try to read the last argument of a bulk command as
2220 * a special argument. */
2221 c->bulklen = 0;
2222 /* continue below and process the command */
2223 } else {
2224 c->bulklen = -1;
2225 return 1;
2226 }
2227 }
2228 }
2229 /* -- end of multi bulk commands processing -- */
2230
2231 /* The QUIT command is handled as a special case. Normal command
2232 * procs are unable to close the client connection safely */
2233 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2234 freeClient(c);
2235 return 0;
2236 }
2237
2238 /* Now lookup the command and check ASAP about trivial error conditions
2239 * such wrong arity, bad command name and so forth. */
2240 cmd = lookupCommand(c->argv[0]->ptr);
2241 if (!cmd) {
2242 addReplySds(c,
2243 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2244 (char*)c->argv[0]->ptr));
2245 resetClient(c);
2246 return 1;
2247 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2248 (c->argc < -cmd->arity)) {
2249 addReplySds(c,
2250 sdscatprintf(sdsempty(),
2251 "-ERR wrong number of arguments for '%s' command\r\n",
2252 cmd->name));
2253 resetClient(c);
2254 return 1;
2255 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2256 /* This is a bulk command, we have to read the last argument yet. */
2257 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2258
2259 decrRefCount(c->argv[c->argc-1]);
2260 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2261 c->argc--;
2262 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2263 resetClient(c);
2264 return 1;
2265 }
2266 c->argc--;
2267 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2268 /* It is possible that the bulk read is already in the
2269 * buffer. Check this condition and handle it accordingly.
2270 * This is just a fast path, alternative to call processInputBuffer().
2271 * It's a good idea since the code is small and this condition
2272 * happens most of the times. */
2273 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2274 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2275 c->argc++;
2276 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2277 } else {
2278 /* Otherwise return... there is to read the last argument
2279 * from the socket. */
2280 return 1;
2281 }
2282 }
2283 /* Let's try to encode the bulk object to save space. */
2284 if (cmd->flags & REDIS_CMD_BULK)
2285 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2286
2287 /* Check if the user is authenticated */
2288 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2289 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2290 resetClient(c);
2291 return 1;
2292 }
2293
2294 /* Handle the maxmemory directive */
2295 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2296 zmalloc_used_memory() > server.maxmemory)
2297 {
2298 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2299 resetClient(c);
2300 return 1;
2301 }
2302
2303 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2304 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2305 &&
2306 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2307 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2308 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2309 resetClient(c);
2310 return 1;
2311 }
2312
2313 /* Exec the command */
2314 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2315 queueMultiCommand(c,cmd);
2316 addReply(c,shared.queued);
2317 } else {
2318 if (server.vm_enabled && server.vm_max_threads > 0 &&
2319 blockClientOnSwappedKeys(cmd,c)) return 1;
2320 call(c,cmd);
2321 }
2322
2323 /* Prepare the client for the next command */
2324 resetClient(c);
2325 return 1;
2326 }
2327
2328 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2329 listNode *ln;
2330 listIter li;
2331 int outc = 0, j;
2332 robj **outv;
2333 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2334 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2335 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2336 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2337 robj *lenobj;
2338
2339 if (argc <= REDIS_STATIC_ARGS) {
2340 outv = static_outv;
2341 } else {
2342 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2343 }
2344
2345 lenobj = createObject(REDIS_STRING,
2346 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2347 lenobj->refcount = 0;
2348 outv[outc++] = lenobj;
2349 for (j = 0; j < argc; j++) {
2350 lenobj = createObject(REDIS_STRING,
2351 sdscatprintf(sdsempty(),"$%lu\r\n",
2352 (unsigned long) stringObjectLen(argv[j])));
2353 lenobj->refcount = 0;
2354 outv[outc++] = lenobj;
2355 outv[outc++] = argv[j];
2356 outv[outc++] = shared.crlf;
2357 }
2358
2359 /* Increment all the refcounts at start and decrement at end in order to
2360 * be sure to free objects if there is no slave in a replication state
2361 * able to be feed with commands */
2362 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2363 listRewind(slaves,&li);
2364 while((ln = listNext(&li))) {
2365 redisClient *slave = ln->value;
2366
2367 /* Don't feed slaves that are still waiting for BGSAVE to start */
2368 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2369
2370 /* Feed all the other slaves, MONITORs and so on */
2371 if (slave->slaveseldb != dictid) {
2372 robj *selectcmd;
2373
2374 switch(dictid) {
2375 case 0: selectcmd = shared.select0; break;
2376 case 1: selectcmd = shared.select1; break;
2377 case 2: selectcmd = shared.select2; break;
2378 case 3: selectcmd = shared.select3; break;
2379 case 4: selectcmd = shared.select4; break;
2380 case 5: selectcmd = shared.select5; break;
2381 case 6: selectcmd = shared.select6; break;
2382 case 7: selectcmd = shared.select7; break;
2383 case 8: selectcmd = shared.select8; break;
2384 case 9: selectcmd = shared.select9; break;
2385 default:
2386 selectcmd = createObject(REDIS_STRING,
2387 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2388 selectcmd->refcount = 0;
2389 break;
2390 }
2391 addReply(slave,selectcmd);
2392 slave->slaveseldb = dictid;
2393 }
2394 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2395 }
2396 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2397 if (outv != static_outv) zfree(outv);
2398 }
2399
2400 static void processInputBuffer(redisClient *c) {
2401 again:
2402 /* Before to process the input buffer, make sure the client is not
2403 * waitig for a blocking operation such as BLPOP. Note that the first
2404 * iteration the client is never blocked, otherwise the processInputBuffer
2405 * would not be called at all, but after the execution of the first commands
2406 * in the input buffer the client may be blocked, and the "goto again"
2407 * will try to reiterate. The following line will make it return asap. */
2408 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2409 if (c->bulklen == -1) {
2410 /* Read the first line of the query */
2411 char *p = strchr(c->querybuf,'\n');
2412 size_t querylen;
2413
2414 if (p) {
2415 sds query, *argv;
2416 int argc, j;
2417
2418 query = c->querybuf;
2419 c->querybuf = sdsempty();
2420 querylen = 1+(p-(query));
2421 if (sdslen(query) > querylen) {
2422 /* leave data after the first line of the query in the buffer */
2423 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2424 }
2425 *p = '\0'; /* remove "\n" */
2426 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2427 sdsupdatelen(query);
2428
2429 /* Now we can split the query in arguments */
2430 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2431 sdsfree(query);
2432
2433 if (c->argv) zfree(c->argv);
2434 c->argv = zmalloc(sizeof(robj*)*argc);
2435
2436 for (j = 0; j < argc; j++) {
2437 if (sdslen(argv[j])) {
2438 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2439 c->argc++;
2440 } else {
2441 sdsfree(argv[j]);
2442 }
2443 }
2444 zfree(argv);
2445 if (c->argc) {
2446 /* Execute the command. If the client is still valid
2447 * after processCommand() return and there is something
2448 * on the query buffer try to process the next command. */
2449 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2450 } else {
2451 /* Nothing to process, argc == 0. Just process the query
2452 * buffer if it's not empty or return to the caller */
2453 if (sdslen(c->querybuf)) goto again;
2454 }
2455 return;
2456 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2457 redisLog(REDIS_VERBOSE, "Client protocol error");
2458 freeClient(c);
2459 return;
2460 }
2461 } else {
2462 /* Bulk read handling. Note that if we are at this point
2463 the client already sent a command terminated with a newline,
2464 we are reading the bulk data that is actually the last
2465 argument of the command. */
2466 int qbl = sdslen(c->querybuf);
2467
2468 if (c->bulklen <= qbl) {
2469 /* Copy everything but the final CRLF as final argument */
2470 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2471 c->argc++;
2472 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2473 /* Process the command. If the client is still valid after
2474 * the processing and there is more data in the buffer
2475 * try to parse it. */
2476 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2477 return;
2478 }
2479 }
2480 }
2481
2482 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2483 redisClient *c = (redisClient*) privdata;
2484 char buf[REDIS_IOBUF_LEN];
2485 int nread;
2486 REDIS_NOTUSED(el);
2487 REDIS_NOTUSED(mask);
2488
2489 nread = read(fd, buf, REDIS_IOBUF_LEN);
2490 if (nread == -1) {
2491 if (errno == EAGAIN) {
2492 nread = 0;
2493 } else {
2494 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2495 freeClient(c);
2496 return;
2497 }
2498 } else if (nread == 0) {
2499 redisLog(REDIS_VERBOSE, "Client closed connection");
2500 freeClient(c);
2501 return;
2502 }
2503 if (nread) {
2504 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2505 c->lastinteraction = time(NULL);
2506 } else {
2507 return;
2508 }
2509 processInputBuffer(c);
2510 }
2511
2512 static int selectDb(redisClient *c, int id) {
2513 if (id < 0 || id >= server.dbnum)
2514 return REDIS_ERR;
2515 c->db = &server.db[id];
2516 return REDIS_OK;
2517 }
2518
2519 static void *dupClientReplyValue(void *o) {
2520 incrRefCount((robj*)o);
2521 return o;
2522 }
2523
/* List match method: returns 1 when compareStringObjects() reports the two
 * objects as equal (result 0), and 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    return !compareStringObjects(a,b);
}
2527
2528 static redisClient *createClient(int fd) {
2529 redisClient *c = zmalloc(sizeof(*c));
2530
2531 anetNonBlock(NULL,fd);
2532 anetTcpNoDelay(NULL,fd);
2533 if (!c) return NULL;
2534 selectDb(c,0);
2535 c->fd = fd;
2536 c->querybuf = sdsempty();
2537 c->argc = 0;
2538 c->argv = NULL;
2539 c->bulklen = -1;
2540 c->multibulk = 0;
2541 c->mbargc = 0;
2542 c->mbargv = NULL;
2543 c->sentlen = 0;
2544 c->flags = 0;
2545 c->lastinteraction = time(NULL);
2546 c->authenticated = 0;
2547 c->replstate = REDIS_REPL_NONE;
2548 c->reply = listCreate();
2549 listSetFreeMethod(c->reply,decrRefCount);
2550 listSetDupMethod(c->reply,dupClientReplyValue);
2551 c->blockingkeys = NULL;
2552 c->blockingkeysnum = 0;
2553 c->io_keys = listCreate();
2554 listSetFreeMethod(c->io_keys,decrRefCount);
2555 c->pubsub_channels = dictCreate(&setDictType,NULL);
2556 c->pubsub_patterns = listCreate();
2557 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2558 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2559 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2560 readQueryFromClient, c) == AE_ERR) {
2561 freeClient(c);
2562 return NULL;
2563 }
2564 listAddNodeTail(server.clients,c);
2565 initClientMultiState(c);
2566 return c;
2567 }
2568
2569 static void addReply(redisClient *c, robj *obj) {
2570 if (listLength(c->reply) == 0 &&
2571 (c->replstate == REDIS_REPL_NONE ||
2572 c->replstate == REDIS_REPL_ONLINE) &&
2573 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2574 sendReplyToClient, c) == AE_ERR) return;
2575
2576 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2577 obj = dupStringObject(obj);
2578 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2579 }
2580 listAddNodeTail(c->reply,getDecodedObject(obj));
2581 }
2582
2583 static void addReplySds(redisClient *c, sds s) {
2584 robj *o = createObject(REDIS_STRING,s);
2585 addReply(c,o);
2586 decrRefCount(o);
2587 }
2588
2589 static void addReplyDouble(redisClient *c, double d) {
2590 char buf[128];
2591
2592 snprintf(buf,sizeof(buf),"%.17g",d);
2593 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2594 (unsigned long) strlen(buf),buf));
2595 }
2596
2597 static void addReplyLong(redisClient *c, long l) {
2598 char buf[128];
2599 size_t len;
2600
2601 if (l == 0) {
2602 addReply(c,shared.czero);
2603 return;
2604 } else if (l == 1) {
2605 addReply(c,shared.cone);
2606 return;
2607 }
2608 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2609 addReplySds(c,sdsnewlen(buf,len));
2610 }
2611
2612 static void addReplyLongLong(redisClient *c, long long ll) {
2613 char buf[128];
2614 size_t len;
2615
2616 if (ll == 0) {
2617 addReply(c,shared.czero);
2618 return;
2619 } else if (ll == 1) {
2620 addReply(c,shared.cone);
2621 return;
2622 }
2623 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2624 addReplySds(c,sdsnewlen(buf,len));
2625 }
2626
2627 static void addReplyUlong(redisClient *c, unsigned long ul) {
2628 char buf[128];
2629 size_t len;
2630
2631 if (ul == 0) {
2632 addReply(c,shared.czero);
2633 return;
2634 } else if (ul == 1) {
2635 addReply(c,shared.cone);
2636 return;
2637 }
2638 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2639 addReplySds(c,sdsnewlen(buf,len));
2640 }
2641
2642 static void addReplyBulkLen(redisClient *c, robj *obj) {
2643 size_t len;
2644
2645 if (obj->encoding == REDIS_ENCODING_RAW) {
2646 len = sdslen(obj->ptr);
2647 } else {
2648 long n = (long)obj->ptr;
2649
2650 /* Compute how many bytes will take this integer as a radix 10 string */
2651 len = 1;
2652 if (n < 0) {
2653 len++;
2654 n = -n;
2655 }
2656 while((n = n/10) != 0) {
2657 len++;
2658 }
2659 }
2660 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2661 }
2662
2663 static void addReplyBulk(redisClient *c, robj *obj) {
2664 addReplyBulkLen(c,obj);
2665 addReply(c,obj);
2666 addReply(c,shared.crlf);
2667 }
2668
2669 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2670 static void addReplyBulkCString(redisClient *c, char *s) {
2671 if (s == NULL) {
2672 addReply(c,shared.nullbulk);
2673 } else {
2674 robj *o = createStringObject(s,strlen(s));
2675 addReplyBulk(c,o);
2676 decrRefCount(o);
2677 }
2678 }
2679
2680 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2681 int cport, cfd;
2682 char cip[128];
2683 redisClient *c;
2684 REDIS_NOTUSED(el);
2685 REDIS_NOTUSED(mask);
2686 REDIS_NOTUSED(privdata);
2687
2688 cfd = anetAccept(server.neterr, fd, cip, &cport);
2689 if (cfd == AE_ERR) {
2690 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2691 return;
2692 }
2693 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2694 if ((c = createClient(cfd)) == NULL) {
2695 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2696 close(cfd); /* May be already closed, just ingore errors */
2697 return;
2698 }
2699 /* If maxclient directive is set and this is one client more... close the
2700 * connection. Note that we create the client instead to check before
2701 * for this condition, since now the socket is already set in nonblocking
2702 * mode and we can send an error for free using the Kernel I/O */
2703 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2704 char *err = "-ERR max number of clients reached\r\n";
2705
2706 /* That's a best effort error message, don't check write errors */
2707 if (write(c->fd,err,strlen(err)) == -1) {
2708 /* Nothing to do, Just to avoid the warning... */
2709 }
2710 freeClient(c);
2711 return;
2712 }
2713 server.stat_numconnections++;
2714 }
2715
2716 /* ======================= Redis objects implementation ===================== */
2717
2718 static robj *createObject(int type, void *ptr) {
2719 robj *o;
2720
2721 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2722 if (listLength(server.objfreelist)) {
2723 listNode *head = listFirst(server.objfreelist);
2724 o = listNodeValue(head);
2725 listDelNode(server.objfreelist,head);
2726 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2727 } else {
2728 if (server.vm_enabled) {
2729 pthread_mutex_unlock(&server.obj_freelist_mutex);
2730 o = zmalloc(sizeof(*o));
2731 } else {
2732 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2733 }
2734 }
2735 o->type = type;
2736 o->encoding = REDIS_ENCODING_RAW;
2737 o->ptr = ptr;
2738 o->refcount = 1;
2739 if (server.vm_enabled) {
2740 /* Note that this code may run in the context of an I/O thread
2741 * and accessing to server.unixtime in theory is an error
2742 * (no locks). But in practice this is safe, and even if we read
2743 * garbage Redis will not fail, as it's just a statistical info */
2744 o->vm.atime = server.unixtime;
2745 o->storage = REDIS_VM_MEMORY;
2746 }
2747 return o;
2748 }
2749
2750 static robj *createStringObject(char *ptr, size_t len) {
2751 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2752 }
2753
2754 static robj *dupStringObject(robj *o) {
2755 assert(o->encoding == REDIS_ENCODING_RAW);
2756 return createStringObject(o->ptr,sdslen(o->ptr));
2757 }
2758
2759 static robj *createListObject(void) {
2760 list *l = listCreate();
2761
2762 listSetFreeMethod(l,decrRefCount);
2763 return createObject(REDIS_LIST,l);
2764 }
2765
2766 static robj *createSetObject(void) {
2767 dict *d = dictCreate(&setDictType,NULL);
2768 return createObject(REDIS_SET,d);
2769 }
2770
2771 static robj *createHashObject(void) {
2772 /* All the Hashes start as zipmaps. Will be automatically converted
2773 * into hash tables if there are enough elements or big elements
2774 * inside. */
2775 unsigned char *zm = zipmapNew();
2776 robj *o = createObject(REDIS_HASH,zm);
2777 o->encoding = REDIS_ENCODING_ZIPMAP;
2778 return o;
2779 }
2780
2781 static robj *createZsetObject(void) {
2782 zset *zs = zmalloc(sizeof(*zs));
2783
2784 zs->dict = dictCreate(&zsetDictType,NULL);
2785 zs->zsl = zslCreate();
2786 return createObject(REDIS_ZSET,zs);
2787 }
2788
2789 static void freeStringObject(robj *o) {
2790 if (o->encoding == REDIS_ENCODING_RAW) {
2791 sdsfree(o->ptr);
2792 }
2793 }
2794
2795 static void freeListObject(robj *o) {
2796 listRelease((list*) o->ptr);
2797 }
2798
2799 static void freeSetObject(robj *o) {
2800 dictRelease((dict*) o->ptr);
2801 }
2802
2803 static void freeZsetObject(robj *o) {
2804 zset *zs = o->ptr;
2805
2806 dictRelease(zs->dict);
2807 zslFree(zs->zsl);
2808 zfree(zs);
2809 }
2810
2811 static void freeHashObject(robj *o) {
2812 switch (o->encoding) {
2813 case REDIS_ENCODING_HT:
2814 dictRelease((dict*) o->ptr);
2815 break;
2816 case REDIS_ENCODING_ZIPMAP:
2817 zfree(o->ptr);
2818 break;
2819 default:
2820 redisAssert(0);
2821 break;
2822 }
2823 }
2824
2825 static void incrRefCount(robj *o) {
2826 o->refcount++;
2827 }
2828
2829 static void decrRefCount(void *obj) {
2830 robj *o = obj;
2831
2832 /* Object is a key of a swapped out value, or in the process of being
2833 * loaded. */
2834 if (server.vm_enabled &&
2835 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2836 {
2837 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2838 redisAssert(o->type == REDIS_STRING);
2839 freeStringObject(o);
2840 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2841 pthread_mutex_lock(&server.obj_freelist_mutex);
2842 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2843 !listAddNodeHead(server.objfreelist,o))
2844 zfree(o);
2845 pthread_mutex_unlock(&server.obj_freelist_mutex);
2846 server.vm_stats_swapped_objects--;
2847 return;
2848 }
2849 /* Object is in memory, or in the process of being swapped out. */
2850 if (--(o->refcount) == 0) {
2851 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2852 vmCancelThreadedIOJob(obj);
2853 switch(o->type) {
2854 case REDIS_STRING: freeStringObject(o); break;
2855 case REDIS_LIST: freeListObject(o); break;
2856 case REDIS_SET: freeSetObject(o); break;
2857 case REDIS_ZSET: freeZsetObject(o); break;
2858 case REDIS_HASH: freeHashObject(o); break;
2859 default: redisAssert(0); break;
2860 }
2861 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2862 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2863 !listAddNodeHead(server.objfreelist,o))
2864 zfree(o);
2865 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2866 }
2867 }
2868
2869 static robj *lookupKey(redisDb *db, robj *key) {
2870 dictEntry *de = dictFind(db->dict,key);
2871 if (de) {
2872 robj *key = dictGetEntryKey(de);
2873 robj *val = dictGetEntryVal(de);
2874
2875 if (server.vm_enabled) {
2876 if (key->storage == REDIS_VM_MEMORY ||
2877 key->storage == REDIS_VM_SWAPPING)
2878 {
2879 /* If we were swapping the object out, stop it, this key
2880 * was requested. */
2881 if (key->storage == REDIS_VM_SWAPPING)
2882 vmCancelThreadedIOJob(key);
2883 /* Update the access time of the key for the aging algorithm. */
2884 key->vm.atime = server.unixtime;
2885 } else {
2886 int notify = (key->storage == REDIS_VM_LOADING);
2887
2888 /* Our value was swapped on disk. Bring it at home. */
2889 redisAssert(val == NULL);
2890 val = vmLoadObject(key);
2891 dictGetEntryVal(de) = val;
2892
2893 /* Clients blocked by the VM subsystem may be waiting for
2894 * this key... */
2895 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2896 }
2897 }
2898 return val;
2899 } else {
2900 return NULL;
2901 }
2902 }
2903
2904 static robj *lookupKeyRead(redisDb *db, robj *key) {
2905 expireIfNeeded(db,key);
2906 return lookupKey(db,key);
2907 }
2908
2909 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2910 deleteIfVolatile(db,key);
2911 return lookupKey(db,key);
2912 }
2913
2914 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2915 robj *o = lookupKeyRead(c->db, key);
2916 if (!o) addReply(c,reply);
2917 return o;
2918 }
2919
2920 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2921 robj *o = lookupKeyWrite(c->db, key);
2922 if (!o) addReply(c,reply);
2923 return o;
2924 }
2925
2926 static int checkType(redisClient *c, robj *o, int type) {
2927 if (o->type != type) {
2928 addReply(c,shared.wrongtypeerr);
2929 return 1;
2930 }
2931 return 0;
2932 }
2933
2934 static int deleteKey(redisDb *db, robj *key) {
2935 int retval;
2936
2937 /* We need to protect key from destruction: after the first dictDelete()
2938 * it may happen that 'key' is no longer valid if we don't increment
2939 * it's count. This may happen when we get the object reference directly
2940 * from the hash table with dictRandomKey() or dict iterators */
2941 incrRefCount(key);
2942 if (dictSize(db->expires)) dictDelete(db->expires,key);
2943 retval = dictDelete(db->dict,key);
2944 decrRefCount(key);
2945
2946 return retval == DICT_OK;
2947 }
2948
2949 /* Check if the nul-terminated string 's' can be represented by a long
2950 * (that is, is a number that fits into long without any other space or
2951 * character before or after the digits).
2952 *
2953 * If so, the function returns REDIS_OK and *longval is set to the value
2954 * of the number. Otherwise REDIS_ERR is returned */
2955 static int isStringRepresentableAsLong(sds s, long *longval) {
2956 char buf[32], *endptr;
2957 long value;
2958 int slen;
2959
2960 value = strtol(s, &endptr, 10);
2961 if (endptr[0] != '\0') return REDIS_ERR;
2962 slen = snprintf(buf,32,"%ld",value);
2963
2964 /* If the number converted back into a string is not identical
2965 * then it's not possible to encode the string as integer */
2966 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2967 if (longval) *longval = value;
2968 return REDIS_OK;
2969 }
2970
2971 /* Try to encode a string object in order to save space */
2972 static robj *tryObjectEncoding(robj *o) {
2973 long value;
2974 sds s = o->ptr;
2975
2976 if (o->encoding != REDIS_ENCODING_RAW)
2977 return o; /* Already encoded */
2978
2979 /* It's not safe to encode shared objects: shared objects can be shared
2980 * everywhere in the "object space" of Redis. Encoded objects can only
2981 * appear as "values" (and not, for instance, as keys) */
2982 if (o->refcount > 1) return o;
2983
2984 /* Currently we try to encode only strings */
2985 redisAssert(o->type == REDIS_STRING);
2986
2987 /* Check if we can represent this string as a long integer */
2988 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
2989
2990 /* Ok, this object can be encoded */
2991 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2992 decrRefCount(o);
2993 incrRefCount(shared.integers[value]);
2994 return shared.integers[value];
2995 } else {
2996 o->encoding = REDIS_ENCODING_INT;
2997 sdsfree(o->ptr);
2998 o->ptr = (void*) value;
2999 return o;
3000 }
3001 }
3002
3003 /* Get a decoded version of an encoded object (returned as a new object).
3004 * If the object is already raw-encoded just increment the ref count. */
3005 static robj *getDecodedObject(robj *o) {
3006 robj *dec;
3007
3008 if (o->encoding == REDIS_ENCODING_RAW) {
3009 incrRefCount(o);
3010 return o;
3011 }
3012 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3013 char buf[32];
3014
3015 snprintf(buf,32,"%ld",(long)o->ptr);
3016 dec = createStringObject(buf,strlen(buf));
3017 return dec;
3018 } else {
3019 redisAssert(1 != 1);
3020 }
3021 }
3022
3023 /* Compare two string objects via strcmp() or alike.
3024 * Note that the objects may be integer-encoded. In such a case we
3025 * use snprintf() to get a string representation of the numbers on the stack
3026 * and compare the strings, it's much faster than calling getDecodedObject().
3027 *
3028 * Important note: if objects are not integer encoded, but binary-safe strings,
3029 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3030 * binary safe. */
3031 static int compareStringObjects(robj *a, robj *b) {
3032 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3033 char bufa[128], bufb[128], *astr, *bstr;
3034 int bothsds = 1;
3035
3036 if (a == b) return 0;
3037 if (a->encoding != REDIS_ENCODING_RAW) {
3038 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3039 astr = bufa;
3040 bothsds = 0;
3041 } else {
3042 astr = a->ptr;
3043 }
3044 if (b->encoding != REDIS_ENCODING_RAW) {
3045 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3046 bstr = bufb;
3047 bothsds = 0;
3048 } else {
3049 bstr = b->ptr;
3050 }
3051 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3052 }
3053
3054 static size_t stringObjectLen(robj *o) {
3055 redisAssert(o->type == REDIS_STRING);
3056 if (o->encoding == REDIS_ENCODING_RAW) {
3057 return sdslen(o->ptr);
3058 } else {
3059 char buf[32];
3060
3061 return snprintf(buf,32,"%ld",(long)o->ptr);
3062 }
3063 }
3064
3065 static int getDoubleFromObject(redisClient *c, robj *o, double *value) {
3066 double parsedValue;
3067 char *eptr = NULL;
3068
3069 if (o && o->type != REDIS_STRING) {
3070 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3071 return REDIS_ERR;
3072 }
3073
3074 if (o == NULL)
3075 parsedValue = 0;
3076 else if (o->encoding == REDIS_ENCODING_RAW)
3077 parsedValue = strtod(o->ptr, &eptr);
3078 else if (o->encoding == REDIS_ENCODING_INT)
3079 parsedValue = (long)o->ptr;
3080 else
3081 redisAssert(1 != 1);
3082
3083 if (eptr != NULL && *eptr != '\0') {
3084 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3085 return REDIS_ERR;
3086 }
3087
3088 *value = parsedValue;
3089
3090 return REDIS_OK;
3091 }
3092
3093 static int getLongLongFromObject(redisClient *c, robj *o, long long *value) {
3094 long long parsedValue;
3095 char *eptr = NULL;
3096
3097 if (o && o->type != REDIS_STRING) {
3098 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3099 return REDIS_ERR;
3100 }
3101
3102 if (o == NULL)
3103 parsedValue = 0;
3104 else if (o->encoding == REDIS_ENCODING_RAW)
3105 parsedValue = strtoll(o->ptr, &eptr, 10);
3106 else if (o->encoding == REDIS_ENCODING_INT)
3107 parsedValue = (long)o->ptr;
3108 else
3109 redisAssert(1 != 1);
3110
3111 if (eptr != NULL && *eptr != '\0') {
3112 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3113 return REDIS_ERR;
3114 }
3115
3116 *value = parsedValue;
3117
3118 return REDIS_OK;
3119 }
3120
3121 static int getLongFromObject(redisClient *c, robj *o, long *value) {
3122 long long actualValue;
3123
3124 if (getLongLongFromObject(c, o, &actualValue) != REDIS_OK) return REDIS_ERR;
3125
3126 if (actualValue < LONG_MIN || actualValue > LONG_MAX) {
3127 addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
3128 return REDIS_ERR;
3129 }
3130
3131 *value = actualValue;
3132
3133 return REDIS_OK;
3134 }
3135
3136 /*============================ RDB saving/loading =========================== */
3137
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,sizeof(type),1,fp);

    return written ? 0 : -1;
}
3142
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,sizeof(t32),1,fp) == 0) ? -1 : 0;
}
3148
3149 /* check rdbLoadLen() comments for more info */
3150 static int rdbSaveLen(FILE *fp, uint32_t len) {
3151 unsigned char buf[2];
3152
3153 if (len < (1<<6)) {
3154 /* Save a 6 bit len */
3155 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3156 if (fwrite(buf,1,1,fp) == 0) return -1;
3157 } else if (len < (1<<14)) {
3158 /* Save a 14 bit len */
3159 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3160 buf[1] = len&0xFF;
3161 if (fwrite(buf,2,1,fp) == 0) return -1;
3162 } else {
3163 /* Save a 32 bit len */
3164 buf[0] = (REDIS_RDB_32BITLEN<<6);
3165 if (fwrite(buf,1,1,fp) == 0) return -1;
3166 len = htonl(len);
3167 if (fwrite(&len,4,1,fp) == 0) return -1;
3168 }
3169 return 0;
3170 }
3171
3172 /* String objects in the form "2391" "-100" without any space and with a
3173 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3174 * encoded as integers to save space */
3175 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3176 long long value;
3177 char *endptr, buf[32];
3178
3179 /* Check if it's possible to encode this value as a number */
3180 value = strtoll(s, &endptr, 10);
3181 if (endptr[0] != '\0') return 0;
3182 snprintf(buf,32,"%lld",value);
3183
3184 /* If the number converted back into a string is not identical
3185 * then it's not possible to encode the string as integer */
3186 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3187
3188 /* Finally check if it fits in our ranges */
3189 if (value >= -(1<<7) && value <= (1<<7)-1) {
3190 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3191 enc[1] = value&0xFF;
3192 return 2;
3193 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3194 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3195 enc[1] = value&0xFF;
3196 enc[2] = (value>>8)&0xFF;
3197 return 3;
3198 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3199 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3200 enc[1] = value&0xFF;
3201 enc[2] = (value>>8)&0xFF;
3202 enc[3] = (value>>16)&0xFF;
3203 enc[4] = (value>>24)&0xFF;
3204 return 5;
3205 } else {
3206 return 0;
3207 }
3208 }
3209
3210 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3211 size_t comprlen, outlen;
3212 unsigned char byte;
3213 void *out;
3214
3215 /* We require at least four bytes compression for this to be worth it */
3216 if (len <= 4) return 0;
3217 outlen = len-4;
3218 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3219 comprlen = lzf_compress(s, len, out, outlen);
3220 if (comprlen == 0) {
3221 zfree(out);
3222 return 0;
3223 }
3224 /* Data compressed! Let's save it on disk */
3225 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3226 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3227 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3228 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3229 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3230 zfree(out);
3231 return comprlen;
3232
3233 writeerr:
3234 zfree(out);
3235 return -1;
3236 }
3237
3238 /* Save a string objet as [len][data] on disk. If the object is a string
3239 * representation of an integer value we try to safe it in a special form */
3240 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3241 int enclen;
3242
3243 /* Try integer encoding */
3244 if (len <= 11) {
3245 unsigned char buf[5];
3246 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3247 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3248 return 0;
3249 }
3250 }
3251
3252 /* Try LZF compression - under 20 bytes it's unable to compress even
3253 * aaaaaaaaaaaaaaaaaa so skip it */
3254 if (server.rdbcompression && len > 20) {
3255 int retval;
3256
3257 retval = rdbSaveLzfStringObject(fp,s,len);
3258 if (retval == -1) return -1;
3259 if (retval > 0) return 0;
3260 /* retval == 0 means data can't be compressed, save the old way */
3261 }
3262
3263 /* Store verbatim */
3264 if (rdbSaveLen(fp,len) == -1) return -1;
3265 if (len && fwrite(s,len,1,fp) == 0) return -1;
3266 return 0;
3267 }
3268
3269 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3270 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3271 int retval;
3272
3273 /* Avoid incr/decr ref count business when possible.
3274 * This plays well with copy-on-write given that we are probably
3275 * in a child process (BGSAVE). Also this makes sure key objects
3276 * of swapped objects are not incRefCount-ed (an assert does not allow
3277 * this in order to avoid bugs) */
3278 if (obj->encoding != REDIS_ENCODING_RAW) {
3279 obj = getDecodedObject(obj);
3280 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3281 decrRefCount(obj);
3282 } else {
3283 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3284 }
3285 return retval;
3286 }
3287
/* Save a double value. Doubles are saved as strings ("%.17g") prefixed by
 * an unsigned 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the special values take the prefix byte only */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *digits = (char*)buf+1;

        snprintf(digits,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(digits);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3314
3315 /* Save a Redis object. */
3316 static int rdbSaveObject(FILE *fp, robj *o) {
3317 if (o->type == REDIS_STRING) {
3318 /* Save a string value */
3319 if (rdbSaveStringObject(fp,o) == -1) return -1;
3320 } else if (o->type == REDIS_LIST) {
3321 /* Save a list value */
3322 list *list = o->ptr;
3323 listIter li;
3324 listNode *ln;
3325
3326 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3327 listRewind(list,&li);
3328 while((ln = listNext(&li))) {
3329 robj *eleobj = listNodeValue(ln);
3330
3331 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3332 }
3333 } else if (o->type == REDIS_SET) {
3334 /* Save a set value */
3335 dict *set = o->ptr;
3336 dictIterator *di = dictGetIterator(set);
3337 dictEntry *de;
3338
3339 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3340 while((de = dictNext(di)) != NULL) {
3341 robj *eleobj = dictGetEntryKey(de);
3342
3343 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3344 }
3345 dictReleaseIterator(di);
3346 } else if (o->type == REDIS_ZSET) {
3347 /* Save a set value */
3348 zset *zs = o->ptr;
3349 dictIterator *di = dictGetIterator(zs->dict);
3350 dictEntry *de;
3351
3352 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3353 while((de = dictNext(di)) != NULL) {
3354 robj *eleobj = dictGetEntryKey(de);
3355 double *score = dictGetEntryVal(de);
3356
3357 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3358 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3359 }
3360 dictReleaseIterator(di);
3361 } else if (o->type == REDIS_HASH) {
3362 /* Save a hash value */
3363 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3364 unsigned char *p = zipmapRewind(o->ptr);
3365 unsigned int count = zipmapLen(o->ptr);
3366 unsigned char *key, *val;
3367 unsigned int klen, vlen;
3368
3369 if (rdbSaveLen(fp,count) == -1) return -1;
3370 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3371 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3372 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3373 }
3374 } else {
3375 dictIterator *di = dictGetIterator(o->ptr);
3376 dictEntry *de;
3377
3378 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3379 while((de = dictNext(di)) != NULL) {
3380 robj *key = dictGetEntryKey(de);
3381 robj *val = dictGetEntryVal(de);
3382
3383 if (rdbSaveStringObject(fp,key) == -1) return -1;
3384 if (rdbSaveStringObject(fp,val) == -1) return -1;
3385 }
3386 dictReleaseIterator(di);
3387 }
3388 } else {
3389 redisAssert(0);
3390 }
3391 return 0;
3392 }
3393
3394 /* Return the length the object will have on disk if saved with
3395 * the rdbSaveObject() function. Currently we use a trick to get
3396 * this length with very little changes to the code. In the future
3397 * we could switch to a faster solution. */
3398 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3399 if (fp == NULL) fp = server.devnull;
3400 rewind(fp);
3401 assert(rdbSaveObject(fp,o) != 1);
3402 return ftello(fp);
3403 }
3404
3405 /* Return the number of pages required to save this object in the swap file */
3406 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3407 off_t bytes = rdbSavedObjectLen(o,fp);
3408
3409 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3410 }
3411
3412 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3413 static int rdbSave(char *filename) {
3414 dictIterator *di = NULL;
3415 dictEntry *de;
3416 FILE *fp;
3417 char tmpfile[256];
3418 int j;
3419 time_t now = time(NULL);
3420
3421 /* Wait for I/O therads to terminate, just in case this is a
3422 * foreground-saving, to avoid seeking the swap file descriptor at the
3423 * same time. */
3424 if (server.vm_enabled)
3425 waitEmptyIOJobsQueue();
3426
3427 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3428 fp = fopen(tmpfile,"w");
3429 if (!fp) {
3430 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3431 return REDIS_ERR;
3432 }
3433 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3434 for (j = 0; j < server.dbnum; j++) {
3435 redisDb *db = server.db+j;
3436 dict *d = db->dict;
3437 if (dictSize(d) == 0) continue;
3438 di = dictGetIterator(d);
3439 if (!di) {
3440 fclose(fp);
3441 return REDIS_ERR;
3442 }
3443
3444 /* Write the SELECT DB opcode */
3445 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3446 if (rdbSaveLen(fp,j) == -1) goto werr;
3447
3448 /* Iterate this DB writing every entry */
3449 while((de = dictNext(di)) != NULL) {
3450 robj *key = dictGetEntryKey(de);
3451 robj *o = dictGetEntryVal(de);
3452 time_t expiretime = getExpire(db,key);
3453
3454 /* Save the expire time */
3455 if (expiretime != -1) {
3456 /* If this key is already expired skip it */
3457 if (expiretime < now) continue;
3458 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3459 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3460 }
3461 /* Save the key and associated value. This requires special
3462 * handling if the value is swapped out. */
3463 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3464 key->storage == REDIS_VM_SWAPPING) {
3465 /* Save type, key, value */
3466 if (rdbSaveType(fp,o->type) == -1) goto werr;
3467 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3468 if (rdbSaveObject(fp,o) == -1) goto werr;
3469 } else {
3470 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3471 robj *po;
3472 /* Get a preview of the object in memory */
3473 po = vmPreviewObject(key);
3474 /* Save type, key, value */
3475 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3476 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3477 if (rdbSaveObject(fp,po) == -1) goto werr;
3478 /* Remove the loaded object from memory */
3479 decrRefCount(po);
3480 }
3481 }
3482 dictReleaseIterator(di);
3483 }
3484 /* EOF opcode */
3485 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3486
3487 /* Make sure data will not remain on the OS's output buffers */
3488 fflush(fp);
3489 fsync(fileno(fp));
3490 fclose(fp);
3491
3492 /* Use RENAME to make sure the DB file is changed atomically only
3493 * if the generate DB file is ok. */
3494 if (rename(tmpfile,filename) == -1) {
3495 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3496 unlink(tmpfile);
3497 return REDIS_ERR;
3498 }
3499 redisLog(REDIS_NOTICE,"DB saved on disk");
3500 server.dirty = 0;
3501 server.lastsave = time(NULL);
3502 return REDIS_OK;
3503
3504 werr:
3505 fclose(fp);
3506 unlink(tmpfile);
3507 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3508 if (di) dictReleaseIterator(di);
3509 return REDIS_ERR;
3510 }
3511
3512 static int rdbSaveBackground(char *filename) {
3513 pid_t childpid;
3514
3515 if (server.bgsavechildpid != -1) return REDIS_ERR;
3516 if (server.vm_enabled) waitEmptyIOJobsQueue();
3517 if ((childpid = fork()) == 0) {
3518 /* Child */
3519 if (server.vm_enabled) vmReopenSwapFile();
3520 close(server.fd);
3521 if (rdbSave(filename) == REDIS_OK) {
3522 _exit(0);
3523 } else {
3524 _exit(1);
3525 }
3526 } else {
3527 /* Parent */
3528 if (childpid == -1) {
3529 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3530 strerror(errno));
3531 return REDIS_ERR;
3532 }
3533 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3534 server.bgsavechildpid = childpid;
3535 updateDictResizePolicy();
3536 return REDIS_OK;
3537 }
3538 return REDIS_OK; /* unreached */
3539 }
3540
/* Remove the temporary dump file "temp-<pid>.rdb" belonging to the process
 * 'childpid'. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3547
/* Read a one byte type from 'fp'.
 * Returns the byte as a value in the range 0..255, or -1 if it can't be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,sizeof(byte),1,fp) == 1) ? byte : -1;
}
3553
/* Read a time stored as a 32 bit signed integer (in the byte order of the
 * host) from 'fp'. Returns -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    if (fread(&stored,sizeof(stored),1,fp) != 1) return -1;
    return (time_t) stored;
}
3559
3560 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3561 * of this file for a description of how this are stored on disk.
3562 *
3563 * isencoded is set to 1 if the readed length is not actually a length but
3564 * an "encoding type", check the above comments for more info */
3565 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3566 unsigned char buf[2];
3567 uint32_t len;
3568 int type;
3569
3570 if (isencoded) *isencoded = 0;
3571 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3572 type = (buf[0]&0xC0)>>6;
3573 if (type == REDIS_RDB_6BITLEN) {
3574 /* Read a 6 bit len */
3575 return buf[0]&0x3F;
3576 } else if (type == REDIS_RDB_ENCVAL) {
3577 /* Read a 6 bit len encoding type */
3578 if (isencoded) *isencoded = 1;
3579 return buf[0]&0x3F;
3580 } else if (type == REDIS_RDB_14BITLEN) {
3581 /* Read a 14 bit len */
3582 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3583 return ((buf[0]&0x3F)<<8)|buf[1];
3584 } else {
3585 /* Read a 32 bit len */
3586 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3587 return ntohl(len);
3588 }
3589 }
3590
3591 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3592 unsigned char enc[4];
3593 long long val;
3594
3595 if (enctype == REDIS_RDB_ENC_INT8) {
3596 if (fread(enc,1,1,fp) == 0) return NULL;
3597 val = (signed char)enc[0];
3598 } else if (enctype == REDIS_RDB_ENC_INT16) {
3599 uint16_t v;
3600 if (fread(enc,2,1,fp) == 0) return NULL;
3601 v = enc[0]|(enc[1]<<8);
3602 val = (int16_t)v;
3603 } else if (enctype == REDIS_RDB_ENC_INT32) {
3604 uint32_t v;
3605 if (fread(enc,4,1,fp) == 0) return NULL;
3606 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3607 val = (int32_t)v;
3608 } else {
3609 val = 0; /* anti-warning */
3610 redisAssert(0);
3611 }
3612 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3613 }
3614
3615 static robj *rdbLoadLzfStringObject(FILE*fp) {
3616 unsigned int len, clen;
3617 unsigned char *c = NULL;
3618 sds val = NULL;
3619
3620 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3621 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3622 if ((c = zmalloc(clen)) == NULL) goto err;
3623 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3624 if (fread(c,clen,1,fp) == 0) goto err;
3625 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3626 zfree(c);
3627 return createObject(REDIS_STRING,val);
3628 err:
3629 zfree(c);
3630 sdsfree(val);
3631 return NULL;
3632 }
3633
3634 static robj *rdbLoadStringObject(FILE*fp) {
3635 int isencoded;
3636 uint32_t len;
3637 sds val;
3638
3639 len = rdbLoadLen(fp,&isencoded);
3640 if (isencoded) {
3641 switch(len) {
3642 case REDIS_RDB_ENC_INT8:
3643 case REDIS_RDB_ENC_INT16:
3644 case REDIS_RDB_ENC_INT32:
3645 return rdbLoadIntegerObject(fp,len);
3646 case REDIS_RDB_ENC_LZF:
3647 return rdbLoadLzfStringObject(fp);
3648 default:
3649 redisAssert(0);
3650 }
3651 }
3652
3653 if (len == REDIS_RDB_LENERR) return NULL;
3654 val = sdsnewlen(NULL,len);
3655 if (len && fread(val,len,1,fp) == 0) {
3656 sdsfree(val);
3657 return NULL;
3658 }
3659 return createObject(REDIS_STRING,val);
3660 }
3661
3662 /* For information about double serialization check rdbSaveDoubleValue() */
3663 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3664 char buf[128];
3665 unsigned char len;
3666
3667 if (fread(&len,1,1,fp) == 0) return -1;
3668 switch(len) {
3669 case 255: *val = R_NegInf; return 0;
3670 case 254: *val = R_PosInf; return 0;
3671 case 253: *val = R_Nan; return 0;
3672 default:
3673 if (fread(buf,len,1,fp) == 0) return -1;
3674 buf[len] = '\0';
3675 sscanf(buf, "%lg", val);
3676 return 0;
3677 }
3678 }
3679
3680 /* Load a Redis object of the specified type from the specified file.
3681 * On success a newly allocated object is returned, otherwise NULL. */
3682 static robj *rdbLoadObject(int type, FILE *fp) {
3683 robj *o;
3684
3685 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3686 if (type == REDIS_STRING) {
3687 /* Read string value */
3688 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3689 o = tryObjectEncoding(o);
3690 } else if (type == REDIS_LIST || type == REDIS_SET) {
3691 /* Read list/set value */
3692 uint32_t listlen;
3693
3694 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3695 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3696 /* It's faster to expand the dict to the right size asap in order
3697 * to avoid rehashing */
3698 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3699 dictExpand(o->ptr,listlen);
3700 /* Load every single element of the list/set */
3701 while(listlen--) {
3702 robj *ele;
3703
3704 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3705 ele = tryObjectEncoding(ele);
3706 if (type == REDIS_LIST) {
3707 listAddNodeTail((list*)o->ptr,ele);
3708 } else {
3709 dictAdd((dict*)o->ptr,ele,NULL);
3710 }
3711 }
3712 } else if (type == REDIS_ZSET) {
3713 /* Read list/set value */
3714 size_t zsetlen;
3715 zset *zs;
3716
3717 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3718 o = createZsetObject();
3719 zs = o->ptr;
3720 /* Load every single element of the list/set */
3721 while(zsetlen--) {
3722 robj *ele;
3723 double *score = zmalloc(sizeof(double));
3724
3725 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3726 ele = tryObjectEncoding(ele);
3727 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3728 dictAdd(zs->dict,ele,score);
3729 zslInsert(zs->zsl,*score,ele);
3730 incrRefCount(ele); /* added to skiplist */
3731 }
3732 } else if (type == REDIS_HASH) {
3733 size_t hashlen;
3734
3735 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3736 o = createHashObject();
3737 /* Too many entries? Use an hash table. */
3738 if (hashlen > server.hash_max_zipmap_entries)
3739 convertToRealHash(o);
3740 /* Load every key/value, then set it into the zipmap or hash
3741 * table, as needed. */
3742 while(hashlen--) {
3743 robj *key, *val;
3744
3745 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3746 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3747 /* If we are using a zipmap and there are too big values
3748 * the object is converted to real hash table encoding. */
3749 if (o->encoding != REDIS_ENCODING_HT &&
3750 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3751 sdslen(val->ptr) > server.hash_max_zipmap_value))
3752 {
3753 convertToRealHash(o);
3754 }
3755
3756 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3757 unsigned char *zm = o->ptr;
3758
3759 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3760 val->ptr,sdslen(val->ptr),NULL);
3761 o->ptr = zm;
3762 decrRefCount(key);
3763 decrRefCount(val);
3764 } else {
3765 key = tryObjectEncoding(key);
3766 val = tryObjectEncoding(val);
3767 dictAdd((dict*)o->ptr,key,val);
3768 }
3769 }
3770 } else {
3771 redisAssert(0);
3772 }
3773 return o;
3774 }
3775
3776 static int rdbLoad(char *filename) {
3777 FILE *fp;
3778 robj *keyobj = NULL;
3779 uint32_t dbid;
3780 int type, retval, rdbver;
3781 dict *d = server.db[0].dict;
3782 redisDb *db = server.db+0;
3783 char buf[1024];
3784 time_t expiretime = -1, now = time(NULL);
3785 long long loadedkeys = 0;
3786
3787 fp = fopen(filename,"r");
3788 if (!fp) return REDIS_ERR;
3789 if (fread(buf,9,1,fp) == 0) goto eoferr;
3790 buf[9] = '\0';
3791 if (memcmp(buf,"REDIS",5) != 0) {
3792 fclose(fp);
3793 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3794 return REDIS_ERR;
3795 }
3796 rdbver = atoi(buf+5);
3797 if (rdbver != 1) {
3798 fclose(fp);
3799 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3800 return REDIS_ERR;
3801 }
3802 while(1) {
3803 robj *o;
3804
3805 /* Read type. */
3806 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3807 if (type == REDIS_EXPIRETIME) {
3808 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3809 /* We read the time so we need to read the object type again */
3810 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3811 }
3812 if (type == REDIS_EOF) break;
3813 /* Handle SELECT DB opcode as a special case */
3814 if (type == REDIS_SELECTDB) {
3815 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3816 goto eoferr;
3817 if (dbid >= (unsigned)server.dbnum) {
3818 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3819 exit(1);
3820 }
3821 db = server.db+dbid;
3822 d = db->dict;
3823 continue;
3824 }
3825 /* Read key */
3826 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3827 /* Read value */
3828 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3829 /* Add the new object in the hash table */
3830 retval = dictAdd(d,keyobj,o);
3831 if (retval == DICT_ERR) {
3832 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3833 exit(1);
3834 }
3835 /* Set the expire time if needed */
3836 if (expiretime != -1) {
3837 setExpire(db,keyobj,expiretime);
3838 /* Delete this key if already expired */
3839 if (expiretime < now) deleteKey(db,keyobj);
3840 expiretime = -1;
3841 }
3842 keyobj = o = NULL;
3843 /* Handle swapping while loading big datasets when VM is on */
3844 loadedkeys++;
3845 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3846 while (zmalloc_used_memory() > server.vm_max_memory) {
3847 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3848 }
3849 }
3850 }
3851 fclose(fp);
3852 return REDIS_OK;
3853
3854 eoferr: /* unexpected end of file is handled here with a fatal exit */
3855 if (keyobj) decrRefCount(keyobj);
3856 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3857 exit(1);
3858 return REDIS_ERR; /* Just to avoid warning */
3859 }
3860
3861 /*================================== Commands =============================== */
3862
3863 static void authCommand(redisClient *c) {
3864 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3865 c->authenticated = 1;
3866 addReply(c,shared.ok);
3867 } else {
3868 c->authenticated = 0;
3869 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3870 }
3871 }
3872
3873 static void pingCommand(redisClient *c) {
3874 addReply(c,shared.pong);
3875 }
3876
3877 static void echoCommand(redisClient *c) {
3878 addReplyBulk(c,c->argv[1]);
3879 }
3880
3881 /*=================================== Strings =============================== */
3882
3883 static void setGenericCommand(redisClient *c, int nx) {
3884 int retval;
3885
3886 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3887 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3888 if (retval == DICT_ERR) {
3889 if (!nx) {
3890 /* If the key is about a swapped value, we want a new key object
3891 * to overwrite the old. So we delete the old key in the database.
3892 * This will also make sure that swap pages about the old object
3893 * will be marked as free. */
3894 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3895 incrRefCount(c->argv[1]);
3896 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3897 incrRefCount(c->argv[2]);
3898 } else {
3899 addReply(c,shared.czero);
3900 return;
3901 }
3902 } else {
3903 incrRefCount(c->argv[1]);
3904 incrRefCount(c->argv[2]);
3905 }
3906 server.dirty++;
3907 removeExpire(c->db,c->argv[1]);
3908 addReply(c, nx ? shared.cone : shared.ok);
3909 }
3910
3911 static void setCommand(redisClient *c) {
3912 setGenericCommand(c,0);
3913 }
3914
3915 static void setnxCommand(redisClient *c) {
3916 setGenericCommand(c,1);
3917 }
3918
3919 static int getGenericCommand(redisClient *c) {
3920 robj *o;
3921
3922 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3923 return REDIS_OK;
3924
3925 if (o->type != REDIS_STRING) {
3926 addReply(c,shared.wrongtypeerr);
3927 return REDIS_ERR;
3928 } else {
3929 addReplyBulk(c,o);
3930 return REDIS_OK;
3931 }
3932 }
3933
3934 static void getCommand(redisClient *c) {
3935 getGenericCommand(c);
3936 }
3937
3938 static void getsetCommand(redisClient *c) {
3939 if (getGenericCommand(c) == REDIS_ERR) return;
3940 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3941 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3942 } else {
3943 incrRefCount(c->argv[1]);
3944 }
3945 incrRefCount(c->argv[2]);
3946 server.dirty++;
3947 removeExpire(c->db,c->argv[1]);
3948 }
3949
3950 static void mgetCommand(redisClient *c) {
3951 int j;
3952
3953 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3954 for (j = 1; j < c->argc; j++) {
3955 robj *o = lookupKeyRead(c->db,c->argv[j]);
3956 if (o == NULL) {
3957 addReply(c,shared.nullbulk);
3958 } else {
3959 if (o->type != REDIS_STRING) {
3960 addReply(c,shared.nullbulk);
3961 } else {
3962 addReplyBulk(c,o);
3963 }
3964 }
3965 }
3966 }
3967
3968 static void msetGenericCommand(redisClient *c, int nx) {
3969 int j, busykeys = 0;
3970
3971 if ((c->argc % 2) == 0) {
3972 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3973 return;
3974 }
3975 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3976 * set nothing at all if at least one already key exists. */
3977 if (nx) {
3978 for (j = 1; j < c->argc; j += 2) {
3979 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3980 busykeys++;
3981 }
3982 }
3983 }
3984 if (busykeys) {
3985 addReply(c, shared.czero);
3986 return;
3987 }
3988
3989 for (j = 1; j < c->argc; j += 2) {
3990 int retval;
3991
3992 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
3993 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3994 if (retval == DICT_ERR) {
3995 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3996 incrRefCount(c->argv[j+1]);
3997 } else {
3998 incrRefCount(c->argv[j]);
3999 incrRefCount(c->argv[j+1]);
4000 }
4001 removeExpire(c->db,c->argv[j]);
4002 }
4003 server.dirty += (c->argc-1)/2;
4004 addReply(c, nx ? shared.cone : shared.ok);
4005 }
4006
4007 static void msetCommand(redisClient *c) {
4008 msetGenericCommand(c,0);
4009 }
4010
4011 static void msetnxCommand(redisClient *c) {
4012 msetGenericCommand(c,1);
4013 }
4014
4015 static void incrDecrCommand(redisClient *c, long long incr) {
4016 long long value;
4017 int retval;
4018 robj *o;
4019
4020 o = lookupKeyWrite(c->db,c->argv[1]);
4021
4022 if (getLongLongFromObject(c, o, &value) != REDIS_OK) return;
4023
4024 value += incr;
4025 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4026 o = tryObjectEncoding(o);
4027 retval = dictAdd(c->db->dict,c->argv[1],o);
4028 if (retval == DICT_ERR) {
4029 dictReplace(c->db->dict,c->argv[1],o);
4030 removeExpire(c->db,c->argv[1]);
4031 } else {
4032 incrRefCount(c->argv[1]);
4033 }
4034 server.dirty++;
4035 addReply(c,shared.colon);
4036 addReply(c,o);
4037 addReply(c,shared.crlf);
4038 }
4039
4040 static void incrCommand(redisClient *c) {
4041 incrDecrCommand(c,1);
4042 }
4043
4044 static void decrCommand(redisClient *c) {
4045 incrDecrCommand(c,-1);
4046 }
4047
4048 static void incrbyCommand(redisClient *c) {
4049 long long incr;
4050
4051 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4052
4053 incrDecrCommand(c,incr);
4054 }
4055
4056 static void decrbyCommand(redisClient *c) {
4057 long long incr;
4058
4059 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4060
4061 incrDecrCommand(c,-incr);
4062 }
4063
4064 static void appendCommand(redisClient *c) {
4065 int retval;
4066 size_t totlen;
4067 robj *o;
4068
4069 o = lookupKeyWrite(c->db,c->argv[1]);
4070 if (o == NULL) {
4071 /* Create the key */
4072 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4073 incrRefCount(c->argv[1]);
4074 incrRefCount(c->argv[2]);
4075 totlen = stringObjectLen(c->argv[2]);
4076 } else {
4077 dictEntry *de;
4078
4079 de = dictFind(c->db->dict,c->argv[1]);
4080 assert(de != NULL);
4081
4082 o = dictGetEntryVal(de);
4083 if (o->type != REDIS_STRING) {
4084 addReply(c,shared.wrongtypeerr);
4085 return;
4086 }
4087 /* If the object is specially encoded or shared we have to make
4088 * a copy */
4089 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4090 robj *decoded = getDecodedObject(o);
4091
4092 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4093 decrRefCount(decoded);
4094 dictReplace(c->db->dict,c->argv[1],o);
4095 }
4096 /* APPEND! */
4097 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4098 o->ptr = sdscatlen(o->ptr,
4099 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4100 } else {
4101 o->ptr = sdscatprintf(o->ptr, "%ld",
4102 (unsigned long) c->argv[2]->ptr);
4103 }
4104 totlen = sdslen(o->ptr);
4105 }
4106 server.dirty++;
4107 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4108 }
4109
4110 static void substrCommand(redisClient *c) {
4111 robj *o;
4112 long start = atoi(c->argv[2]->ptr);
4113 long end = atoi(c->argv[3]->ptr);
4114 size_t rangelen, strlen;
4115 sds range;
4116
4117 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4118 checkType(c,o,REDIS_STRING)) return;
4119
4120 o = getDecodedObject(o);
4121 strlen = sdslen(o->ptr);
4122
4123 /* convert negative indexes */
4124 if (start < 0) start = strlen+start;
4125 if (end < 0) end = strlen+end;
4126 if (start < 0) start = 0;
4127 if (end < 0) end = 0;
4128
4129 /* indexes sanity checks */
4130 if (start > end || (size_t)start >= strlen) {
4131 /* Out of range start or start > end result in null reply */
4132 addReply(c,shared.nullbulk);
4133 decrRefCount(o);
4134 return;
4135 }
4136 if ((size_t)end >= strlen) end = strlen-1;
4137 rangelen = (end-start)+1;
4138
4139 /* Return the result */
4140 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4141 range = sdsnewlen((char*)o->ptr+start,rangelen);
4142 addReplySds(c,range);
4143 addReply(c,shared.crlf);
4144 decrRefCount(o);
4145 }
4146
4147 /* ========================= Type agnostic commands ========================= */
4148
4149 static void delCommand(redisClient *c) {
4150 int deleted = 0, j;
4151
4152 for (j = 1; j < c->argc; j++) {
4153 if (deleteKey(c->db,c->argv[j])) {
4154 server.dirty++;
4155 deleted++;
4156 }
4157 }
4158 addReplyLong(c,deleted);
4159 }
4160
4161 static void existsCommand(redisClient *c) {
4162 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4163 }
4164
4165 static void selectCommand(redisClient *c) {
4166 int id = atoi(c->argv[1]->ptr);
4167
4168 if (selectDb(c,id) == REDIS_ERR) {
4169 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4170 } else {
4171 addReply(c,shared.ok);
4172 }
4173 }
4174
4175 static void randomkeyCommand(redisClient *c) {
4176 dictEntry *de;
4177
4178 while(1) {
4179 de = dictGetRandomKey(c->db->dict);
4180 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4181 }
4182 if (de == NULL) {
4183 addReply(c,shared.plus);
4184 addReply(c,shared.crlf);
4185 } else {
4186 addReply(c,shared.plus);
4187 addReply(c,dictGetEntryKey(de));
4188 addReply(c,shared.crlf);
4189 }
4190 }
4191
4192 static void keysCommand(redisClient *c) {
4193 dictIterator *di;
4194 dictEntry *de;
4195 sds pattern = c->argv[1]->ptr;
4196 int plen = sdslen(pattern);
4197 unsigned long numkeys = 0;
4198 robj *lenobj = createObject(REDIS_STRING,NULL);
4199
4200 di = dictGetIterator(c->db->dict);
4201 addReply(c,lenobj);
4202 decrRefCount(lenobj);
4203 while((de = dictNext(di)) != NULL) {
4204 robj *keyobj = dictGetEntryKey(de);
4205
4206 sds key = keyobj->ptr;
4207 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4208 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4209 if (expireIfNeeded(c->db,keyobj) == 0) {
4210 addReplyBulk(c,keyobj);
4211 numkeys++;
4212 }
4213 }
4214 }
4215 dictReleaseIterator(di);
4216 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4217 }
4218
4219 static void dbsizeCommand(redisClient *c) {
4220 addReplySds(c,
4221 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4222 }
4223
4224 static void lastsaveCommand(redisClient *c) {
4225 addReplySds(c,
4226 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4227 }
4228
4229 static void typeCommand(redisClient *c) {
4230 robj *o;
4231 char *type;
4232
4233 o = lookupKeyRead(c->db,c->argv[1]);
4234 if (o == NULL) {
4235 type = "+none";
4236 } else {
4237 switch(o->type) {
4238 case REDIS_STRING: type = "+string"; break;
4239 case REDIS_LIST: type = "+list"; break;
4240 case REDIS_SET: type = "+set"; break;
4241 case REDIS_ZSET: type = "+zset"; break;
4242 case REDIS_HASH: type = "+hash"; break;
4243 default: type = "+unknown"; break;
4244 }
4245 }
4246 addReplySds(c,sdsnew(type));
4247 addReply(c,shared.crlf);
4248 }
4249
4250 static void saveCommand(redisClient *c) {
4251 if (server.bgsavechildpid != -1) {
4252 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4253 return;
4254 }
4255 if (rdbSave(server.dbfilename) == REDIS_OK) {
4256 addReply(c,shared.ok);
4257 } else {
4258 addReply(c,shared.err);
4259 }
4260 }
4261
4262 static void bgsaveCommand(redisClient *c) {
4263 if (server.bgsavechildpid != -1) {
4264 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4265 return;
4266 }
4267 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4268 char *status = "+Background saving started\r\n";
4269 addReplySds(c,sdsnew(status));
4270 } else {
4271 addReply(c,shared.err);
4272 }
4273 }
4274
4275 static void shutdownCommand(redisClient *c) {
4276 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4277 /* Kill the saving child if there is a background saving in progress.
4278 We want to avoid race conditions, for instance our saving child may
4279 overwrite the synchronous saving did by SHUTDOWN. */
4280 if (server.bgsavechildpid != -1) {
4281 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4282 kill(server.bgsavechildpid,SIGKILL);
4283 rdbRemoveTempFile(server.bgsavechildpid);
4284 }
4285 if (server.appendonly) {
4286 /* Append only file: fsync() the AOF and exit */
4287 fsync(server.appendfd);
4288 if (server.vm_enabled) unlink(server.vm_swap_file);
4289 exit(0);
4290 } else {
4291 /* Snapshotting. Perform a SYNC SAVE and exit */
4292 if (rdbSave(server.dbfilename) == REDIS_OK) {
4293 if (server.daemonize)
4294 unlink(server.pidfile);
4295 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4296 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4297 if (server.vm_enabled) unlink(server.vm_swap_file);
4298 exit(0);
4299 } else {
4300 /* Ooops.. error saving! The best we can do is to continue
4301 * operating. Note that if there was a background saving process,
4302 * in the next cron() Redis will be notified that the background
4303 * saving aborted, handling special stuff like slaves pending for
4304 * synchronization... */
4305 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4306 addReplySds(c,
4307 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4308 }
4309 }
4310 }
4311
4312 static void renameGenericCommand(redisClient *c, int nx) {
4313 robj *o;
4314
4315 /* To use the same key as src and dst is probably an error */
4316 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4317 addReply(c,shared.sameobjecterr);
4318 return;
4319 }
4320
4321 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4322 return;
4323
4324 incrRefCount(o);
4325 deleteIfVolatile(c->db,c->argv[2]);
4326 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4327 if (nx) {
4328 decrRefCount(o);
4329 addReply(c,shared.czero);
4330 return;
4331 }
4332 dictReplace(c->db->dict,c->argv[2],o);
4333 } else {
4334 incrRefCount(c->argv[2]);
4335 }
4336 deleteKey(c->db,c->argv[1]);
4337 server.dirty++;
4338 addReply(c,nx ? shared.cone : shared.ok);
4339 }
4340
4341 static void renameCommand(redisClient *c) {
4342 renameGenericCommand(c,0);
4343 }
4344
4345 static void renamenxCommand(redisClient *c) {
4346 renameGenericCommand(c,1);
4347 }
4348
4349 static void moveCommand(redisClient *c) {
4350 robj *o;
4351 redisDb *src, *dst;
4352 int srcid;
4353
4354 /* Obtain source and target DB pointers */
4355 src = c->db;
4356 srcid = c->db->id;
4357 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4358 addReply(c,shared.outofrangeerr);
4359 return;
4360 }
4361 dst = c->db;
4362 selectDb(c,srcid); /* Back to the source DB */
4363
4364 /* If the user is moving using as target the same
4365 * DB as the source DB it is probably an error. */
4366 if (src == dst) {
4367 addReply(c,shared.sameobjecterr);
4368 return;
4369 }
4370
4371 /* Check if the element exists and get a reference */
4372 o = lookupKeyWrite(c->db,c->argv[1]);
4373 if (!o) {
4374 addReply(c,shared.czero);
4375 return;
4376 }
4377
4378 /* Try to add the element to the target DB */
4379 deleteIfVolatile(dst,c->argv[1]);
4380 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4381 addReply(c,shared.czero);
4382 return;
4383 }
4384 incrRefCount(c->argv[1]);
4385 incrRefCount(o);
4386
4387 /* OK! key moved, free the entry in the source DB */
4388 deleteKey(src,c->argv[1]);
4389 server.dirty++;
4390 addReply(c,shared.cone);
4391 }
4392
4393 /* =================================== Lists ================================ */
4394 static void pushGenericCommand(redisClient *c, int where) {
4395 robj *lobj;
4396 list *list;
4397
4398 lobj = lookupKeyWrite(c->db,c->argv[1]);
4399 if (lobj == NULL) {
4400 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4401 addReply(c,shared.cone);
4402 return;
4403 }
4404 lobj = createListObject();
4405 list = lobj->ptr;
4406 if (where == REDIS_HEAD) {
4407 listAddNodeHead(list,c->argv[2]);
4408 } else {
4409 listAddNodeTail(list,c->argv[2]);
4410 }
4411 dictAdd(c->db->dict,c->argv[1],lobj);
4412 incrRefCount(c->argv[1]);
4413 incrRefCount(c->argv[2]);
4414 } else {
4415 if (lobj->type != REDIS_LIST) {
4416 addReply(c,shared.wrongtypeerr);
4417 return;
4418 }
4419 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4420 addReply(c,shared.cone);
4421 return;
4422 }
4423 list = lobj->ptr;
4424 if (where == REDIS_HEAD) {
4425 listAddNodeHead(list,c->argv[2]);
4426 } else {
4427 listAddNodeTail(list,c->argv[2]);
4428 }
4429 incrRefCount(c->argv[2]);
4430 }
4431 server.dirty++;
4432 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4433 }
4434
4435 static void lpushCommand(redisClient *c) {
4436 pushGenericCommand(c,REDIS_HEAD);
4437 }
4438
4439 static void rpushCommand(redisClient *c) {
4440 pushGenericCommand(c,REDIS_TAIL);
4441 }
4442
4443 static void llenCommand(redisClient *c) {
4444 robj *o;
4445 list *l;
4446
4447 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4448 checkType(c,o,REDIS_LIST)) return;
4449
4450 l = o->ptr;
4451 addReplyUlong(c,listLength(l));
4452 }
4453
4454 static void lindexCommand(redisClient *c) {
4455 robj *o;
4456 int index = atoi(c->argv[2]->ptr);
4457 list *list;
4458 listNode *ln;
4459
4460 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4461 checkType(c,o,REDIS_LIST)) return;
4462 list = o->ptr;
4463
4464 ln = listIndex(list, index);
4465 if (ln == NULL) {
4466 addReply(c,shared.nullbulk);
4467 } else {
4468 robj *ele = listNodeValue(ln);
4469 addReplyBulk(c,ele);
4470 }
4471 }
4472
4473 static void lsetCommand(redisClient *c) {
4474 robj *o;
4475 int index = atoi(c->argv[2]->ptr);
4476 list *list;
4477 listNode *ln;
4478
4479 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4480 checkType(c,o,REDIS_LIST)) return;
4481 list = o->ptr;
4482
4483 ln = listIndex(list, index);
4484 if (ln == NULL) {
4485 addReply(c,shared.outofrangeerr);
4486 } else {
4487 robj *ele = listNodeValue(ln);
4488
4489 decrRefCount(ele);
4490 listNodeValue(ln) = c->argv[3];
4491 incrRefCount(c->argv[3]);
4492 addReply(c,shared.ok);
4493 server.dirty++;
4494 }
4495 }
4496
4497 static void popGenericCommand(redisClient *c, int where) {
4498 robj *o;
4499 list *list;
4500 listNode *ln;
4501
4502 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4503 checkType(c,o,REDIS_LIST)) return;
4504 list = o->ptr;
4505
4506 if (where == REDIS_HEAD)
4507 ln = listFirst(list);
4508 else
4509 ln = listLast(list);
4510
4511 if (ln == NULL) {
4512 addReply(c,shared.nullbulk);
4513 } else {
4514 robj *ele = listNodeValue(ln);
4515 addReplyBulk(c,ele);
4516 listDelNode(list,ln);
4517 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4518 server.dirty++;
4519 }
4520 }
4521
4522 static void lpopCommand(redisClient *c) {
4523 popGenericCommand(c,REDIS_HEAD);
4524 }
4525
4526 static void rpopCommand(redisClient *c) {
4527 popGenericCommand(c,REDIS_TAIL);
4528 }
4529
4530 static void lrangeCommand(redisClient *c) {
4531 robj *o;
4532 int start = atoi(c->argv[2]->ptr);
4533 int end = atoi(c->argv[3]->ptr);
4534 int llen;
4535 int rangelen, j;
4536 list *list;
4537 listNode *ln;
4538 robj *ele;
4539
4540 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4541 || checkType(c,o,REDIS_LIST)) return;
4542 list = o->ptr;
4543 llen = listLength(list);
4544
4545 /* convert negative indexes */
4546 if (start < 0) start = llen+start;
4547 if (end < 0) end = llen+end;
4548 if (start < 0) start = 0;
4549 if (end < 0) end = 0;
4550
4551 /* indexes sanity checks */
4552 if (start > end || start >= llen) {
4553 /* Out of range start or start > end result in empty list */
4554 addReply(c,shared.emptymultibulk);
4555 return;
4556 }
4557 if (end >= llen) end = llen-1;
4558 rangelen = (end-start)+1;
4559
4560 /* Return the result in form of a multi-bulk reply */
4561 ln = listIndex(list, start);
4562 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4563 for (j = 0; j < rangelen; j++) {
4564 ele = listNodeValue(ln);
4565 addReplyBulk(c,ele);
4566 ln = ln->next;
4567 }
4568 }
4569
4570 static void ltrimCommand(redisClient *c) {
4571 robj *o;
4572 int start = atoi(c->argv[2]->ptr);
4573 int end = atoi(c->argv[3]->ptr);
4574 int llen;
4575 int j, ltrim, rtrim;
4576 list *list;
4577 listNode *ln;
4578
4579 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4580 checkType(c,o,REDIS_LIST)) return;
4581 list = o->ptr;
4582 llen = listLength(list);
4583
4584 /* convert negative indexes */
4585 if (start < 0) start = llen+start;
4586 if (end < 0) end = llen+end;
4587 if (start < 0) start = 0;
4588 if (end < 0) end = 0;
4589
4590 /* indexes sanity checks */
4591 if (start > end || start >= llen) {
4592 /* Out of range start or start > end result in empty list */
4593 ltrim = llen;
4594 rtrim = 0;
4595 } else {
4596 if (end >= llen) end = llen-1;
4597 ltrim = start;
4598 rtrim = llen-end-1;
4599 }
4600
4601 /* Remove list elements to perform the trim */
4602 for (j = 0; j < ltrim; j++) {
4603 ln = listFirst(list);
4604 listDelNode(list,ln);
4605 }
4606 for (j = 0; j < rtrim; j++) {
4607 ln = listLast(list);
4608 listDelNode(list,ln);
4609 }
4610 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4611 server.dirty++;
4612 addReply(c,shared.ok);
4613 }
4614
4615 static void lremCommand(redisClient *c) {
4616 robj *o;
4617 list *list;
4618 listNode *ln, *next;
4619 int toremove = atoi(c->argv[2]->ptr);
4620 int removed = 0;
4621 int fromtail = 0;
4622
4623 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4624 checkType(c,o,REDIS_LIST)) return;
4625 list = o->ptr;
4626
4627 if (toremove < 0) {
4628 toremove = -toremove;
4629 fromtail = 1;
4630 }
4631 ln = fromtail ? list->tail : list->head;
4632 while (ln) {
4633 robj *ele = listNodeValue(ln);
4634
4635 next = fromtail ? ln->prev : ln->next;
4636 if (compareStringObjects(ele,c->argv[3]) == 0) {
4637 listDelNode(list,ln);
4638 server.dirty++;
4639 removed++;
4640 if (toremove && removed == toremove) break;
4641 }
4642 ln = next;
4643 }
4644 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4645 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4646 }
4647
4648 /* This is the semantic of this command:
4649 * RPOPLPUSH srclist dstlist:
4650 * IF LLEN(srclist) > 0
4651 * element = RPOP srclist
4652 * LPUSH dstlist element
4653 * RETURN element
4654 * ELSE
4655 * RETURN nil
4656 * END
4657 * END
4658 *
4659 * The idea is to be able to get an element from a list in a reliable way
4660 * since the element is not just returned but pushed against another list
4661 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4662 */
4663 static void rpoplpushcommand(redisClient *c) {
4664 robj *sobj;
4665 list *srclist;
4666 listNode *ln;
4667
4668 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4669 checkType(c,sobj,REDIS_LIST)) return;
4670 srclist = sobj->ptr;
4671 ln = listLast(srclist);
4672
4673 if (ln == NULL) {
4674 addReply(c,shared.nullbulk);
4675 } else {
4676 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4677 robj *ele = listNodeValue(ln);
4678 list *dstlist;
4679
4680 if (dobj && dobj->type != REDIS_LIST) {
4681 addReply(c,shared.wrongtypeerr);
4682 return;
4683 }
4684
4685 /* Add the element to the target list (unless it's directly
4686 * passed to some BLPOP-ing client */
4687 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4688 if (dobj == NULL) {
4689 /* Create the list if the key does not exist */
4690 dobj = createListObject();
4691 dictAdd(c->db->dict,c->argv[2],dobj);
4692 incrRefCount(c->argv[2]);
4693 }
4694 dstlist = dobj->ptr;
4695 listAddNodeHead(dstlist,ele);
4696 incrRefCount(ele);
4697 }
4698
4699 /* Send the element to the client as reply as well */
4700 addReplyBulk(c,ele);
4701
4702 /* Finally remove the element from the source list */
4703 listDelNode(srclist,ln);
4704 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4705 server.dirty++;
4706 }
4707 }
4708
4709 /* ==================================== Sets ================================ */
4710
4711 static void saddCommand(redisClient *c) {
4712 robj *set;
4713
4714 set = lookupKeyWrite(c->db,c->argv[1]);
4715 if (set == NULL) {
4716 set = createSetObject();
4717 dictAdd(c->db->dict,c->argv[1],set);
4718 incrRefCount(c->argv[1]);
4719 } else {
4720 if (set->type != REDIS_SET) {
4721 addReply(c,shared.wrongtypeerr);
4722 return;
4723 }
4724 }
4725 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4726 incrRefCount(c->argv[2]);
4727 server.dirty++;
4728 addReply(c,shared.cone);
4729 } else {
4730 addReply(c,shared.czero);
4731 }
4732 }
4733
4734 static void sremCommand(redisClient *c) {
4735 robj *set;
4736
4737 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4738 checkType(c,set,REDIS_SET)) return;
4739
4740 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4741 server.dirty++;
4742 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4743 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4744 addReply(c,shared.cone);
4745 } else {
4746 addReply(c,shared.czero);
4747 }
4748 }
4749
4750 static void smoveCommand(redisClient *c) {
4751 robj *srcset, *dstset;
4752
4753 srcset = lookupKeyWrite(c->db,c->argv[1]);
4754 dstset = lookupKeyWrite(c->db,c->argv[2]);
4755
4756 /* If the source key does not exist return 0, if it's of the wrong type
4757 * raise an error */
4758 if (srcset == NULL || srcset->type != REDIS_SET) {
4759 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4760 return;
4761 }
4762 /* Error if the destination key is not a set as well */
4763 if (dstset && dstset->type != REDIS_SET) {
4764 addReply(c,shared.wrongtypeerr);
4765 return;
4766 }
4767 /* Remove the element from the source set */
4768 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4769 /* Key not found in the src set! return zero */
4770 addReply(c,shared.czero);
4771 return;
4772 }
4773 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4774 deleteKey(c->db,c->argv[1]);
4775 server.dirty++;
4776 /* Add the element to the destination set */
4777 if (!dstset) {
4778 dstset = createSetObject();
4779 dictAdd(c->db->dict,c->argv[2],dstset);
4780 incrRefCount(c->argv[2]);
4781 }
4782 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4783 incrRefCount(c->argv[3]);
4784 addReply(c,shared.cone);
4785 }
4786
4787 static void sismemberCommand(redisClient *c) {
4788 robj *set;
4789
4790 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4791 checkType(c,set,REDIS_SET)) return;
4792
4793 if (dictFind(set->ptr,c->argv[2]))
4794 addReply(c,shared.cone);
4795 else
4796 addReply(c,shared.czero);
4797 }
4798
4799 static void scardCommand(redisClient *c) {
4800 robj *o;
4801 dict *s;
4802
4803 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4804 checkType(c,o,REDIS_SET)) return;
4805
4806 s = o->ptr;
4807 addReplyUlong(c,dictSize(s));
4808 }
4809
4810 static void spopCommand(redisClient *c) {
4811 robj *set;
4812 dictEntry *de;
4813
4814 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4815 checkType(c,set,REDIS_SET)) return;
4816
4817 de = dictGetRandomKey(set->ptr);
4818 if (de == NULL) {
4819 addReply(c,shared.nullbulk);
4820 } else {
4821 robj *ele = dictGetEntryKey(de);
4822
4823 addReplyBulk(c,ele);
4824 dictDelete(set->ptr,ele);
4825 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4826 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4827 server.dirty++;
4828 }
4829 }
4830
4831 static void srandmemberCommand(redisClient *c) {
4832 robj *set;
4833 dictEntry *de;
4834
4835 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4836 checkType(c,set,REDIS_SET)) return;
4837
4838 de = dictGetRandomKey(set->ptr);
4839 if (de == NULL) {
4840 addReply(c,shared.nullbulk);
4841 } else {
4842 robj *ele = dictGetEntryKey(de);
4843
4844 addReplyBulk(c,ele);
4845 }
4846 }
4847
4848 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4849 dict **d1 = (void*) s1, **d2 = (void*) s2;
4850
4851 return dictSize(*d1)-dictSize(*d2);
4852 }
4853
4854 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4855 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4856 dictIterator *di;
4857 dictEntry *de;
4858 robj *lenobj = NULL, *dstset = NULL;
4859 unsigned long j, cardinality = 0;
4860
4861 for (j = 0; j < setsnum; j++) {
4862 robj *setobj;
4863
4864 setobj = dstkey ?
4865 lookupKeyWrite(c->db,setskeys[j]) :
4866 lookupKeyRead(c->db,setskeys[j]);
4867 if (!setobj) {
4868 zfree(dv);
4869 if (dstkey) {
4870 if (deleteKey(c->db,dstkey))
4871 server.dirty++;
4872 addReply(c,shared.czero);
4873 } else {
4874 addReply(c,shared.emptymultibulk);
4875 }
4876 return;
4877 }
4878 if (setobj->type != REDIS_SET) {
4879 zfree(dv);
4880 addReply(c,shared.wrongtypeerr);
4881 return;
4882 }
4883 dv[j] = setobj->ptr;
4884 }
4885 /* Sort sets from the smallest to largest, this will improve our
4886 * algorithm's performace */
4887 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4888
4889 /* The first thing we should output is the total number of elements...
4890 * since this is a multi-bulk write, but at this stage we don't know
4891 * the intersection set size, so we use a trick, append an empty object
4892 * to the output list and save the pointer to later modify it with the
4893 * right length */
4894 if (!dstkey) {
4895 lenobj = createObject(REDIS_STRING,NULL);
4896 addReply(c,lenobj);
4897 decrRefCount(lenobj);
4898 } else {
4899 /* If we have a target key where to store the resulting set
4900 * create this key with an empty set inside */
4901 dstset = createSetObject();
4902 }
4903
4904 /* Iterate all the elements of the first (smallest) set, and test
4905 * the element against all the other sets, if at least one set does
4906 * not include the element it is discarded */
4907 di = dictGetIterator(dv[0]);
4908
4909 while((de = dictNext(di)) != NULL) {
4910 robj *ele;
4911
4912 for (j = 1; j < setsnum; j++)
4913 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4914 if (j != setsnum)
4915 continue; /* at least one set does not contain the member */
4916 ele = dictGetEntryKey(de);
4917 if (!dstkey) {
4918 addReplyBulk(c,ele);
4919 cardinality++;
4920 } else {
4921 dictAdd(dstset->ptr,ele,NULL);
4922 incrRefCount(ele);
4923 }
4924 }
4925 dictReleaseIterator(di);
4926
4927 if (dstkey) {
4928 /* Store the resulting set into the target, if the intersection
4929 * is not an empty set. */
4930 deleteKey(c->db,dstkey);
4931 if (dictSize((dict*)dstset->ptr) > 0) {
4932 dictAdd(c->db->dict,dstkey,dstset);
4933 incrRefCount(dstkey);
4934 addReplyLong(c,dictSize((dict*)dstset->ptr));
4935 } else {
4936 decrRefCount(dstset);
4937 addReply(c,shared.czero);
4938 }
4939 server.dirty++;
4940 } else {
4941 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4942 }
4943 zfree(dv);
4944 }
4945
4946 static void sinterCommand(redisClient *c) {
4947 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4948 }
4949
4950 static void sinterstoreCommand(redisClient *c) {
4951 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4952 }
4953
4954 #define REDIS_OP_UNION 0
4955 #define REDIS_OP_DIFF 1
4956 #define REDIS_OP_INTER 2
4957
4958 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4959 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4960 dictIterator *di;
4961 dictEntry *de;
4962 robj *dstset = NULL;
4963 int j, cardinality = 0;
4964
4965 for (j = 0; j < setsnum; j++) {
4966 robj *setobj;
4967
4968 setobj = dstkey ?
4969 lookupKeyWrite(c->db,setskeys[j]) :
4970 lookupKeyRead(c->db,setskeys[j]);
4971 if (!setobj) {
4972 dv[j] = NULL;
4973 continue;
4974 }
4975 if (setobj->type != REDIS_SET) {
4976 zfree(dv);
4977 addReply(c,shared.wrongtypeerr);
4978 return;
4979 }
4980 dv[j] = setobj->ptr;
4981 }
4982
4983 /* We need a temp set object to store our union. If the dstkey
4984 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4985 * this set object will be the resulting object to set into the target key*/
4986 dstset = createSetObject();
4987
4988 /* Iterate all the elements of all the sets, add every element a single
4989 * time to the result set */
4990 for (j = 0; j < setsnum; j++) {
4991 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4992 if (!dv[j]) continue; /* non existing keys are like empty sets */
4993
4994 di = dictGetIterator(dv[j]);
4995
4996 while((de = dictNext(di)) != NULL) {
4997 robj *ele;
4998
4999 /* dictAdd will not add the same element multiple times */
5000 ele = dictGetEntryKey(de);
5001 if (op == REDIS_OP_UNION || j == 0) {
5002 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5003 incrRefCount(ele);
5004 cardinality++;
5005 }
5006 } else if (op == REDIS_OP_DIFF) {
5007 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5008 cardinality--;
5009 }
5010 }
5011 }
5012 dictReleaseIterator(di);
5013
5014 /* result set is empty? Exit asap. */
5015 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5016 }
5017
5018 /* Output the content of the resulting set, if not in STORE mode */
5019 if (!dstkey) {
5020 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5021 di = dictGetIterator(dstset->ptr);
5022 while((de = dictNext(di)) != NULL) {
5023 robj *ele;
5024
5025 ele = dictGetEntryKey(de);
5026 addReplyBulk(c,ele);
5027 }
5028 dictReleaseIterator(di);
5029 decrRefCount(dstset);
5030 } else {
5031 /* If we have a target key where to store the resulting set
5032 * create this key with the result set inside */
5033 deleteKey(c->db,dstkey);
5034 if (dictSize((dict*)dstset->ptr) > 0) {
5035 dictAdd(c->db->dict,dstkey,dstset);
5036 incrRefCount(dstkey);
5037 addReplyLong(c,dictSize((dict*)dstset->ptr));
5038 } else {
5039 decrRefCount(dstset);
5040 addReply(c,shared.czero);
5041 }
5042 server.dirty++;
5043 }
5044 zfree(dv);
5045 }
5046
5047 static void sunionCommand(redisClient *c) {
5048 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5049 }
5050
5051 static void sunionstoreCommand(redisClient *c) {
5052 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5053 }
5054
5055 static void sdiffCommand(redisClient *c) {
5056 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5057 }
5058
5059 static void sdiffstoreCommand(redisClient *c) {
5060 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5061 }
5062
5063 /* ==================================== ZSets =============================== */
5064
5065 /* ZSETs are ordered sets using two data structures to hold the same elements
5066 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5067 * data structure.
5068 *
5069 * The elements are added to an hash table mapping Redis objects to scores.
5070 * At the same time the elements are added to a skip list mapping scores
5071 * to Redis objects (so objects are sorted by scores in this "view"). */
5072
5073 /* This skiplist implementation is almost a C translation of the original
5074 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5075 * Alternative to Balanced Trees", modified in three ways:
5076 * a) this implementation allows for repeated values.
5077 * b) the comparison is not just by key (our 'score') but by satellite data.
5078 * c) there is a back pointer, so it's a doubly linked list with the back
5079 * pointers being only at "level 1". This allows to traverse the list
5080 * from tail to head, useful for ZREVRANGE. */
5081
5082 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5083 zskiplistNode *zn = zmalloc(sizeof(*zn));
5084
5085 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5086 if (level > 0)
5087 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5088 zn->score = score;
5089 zn->obj = obj;
5090 return zn;
5091 }
5092
5093 static zskiplist *zslCreate(void) {
5094 int j;
5095 zskiplist *zsl;
5096
5097 zsl = zmalloc(sizeof(*zsl));
5098 zsl->level = 1;
5099 zsl->length = 0;
5100 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5101 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5102 zsl->header->forward[j] = NULL;
5103
5104 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5105 if (j < ZSKIPLIST_MAXLEVEL-1)
5106 zsl->header->span[j] = 0;
5107 }
5108 zsl->header->backward = NULL;
5109 zsl->tail = NULL;
5110 return zsl;
5111 }
5112
5113 static void zslFreeNode(zskiplistNode *node) {
5114 decrRefCount(node->obj);
5115 zfree(node->forward);
5116 zfree(node->span);
5117 zfree(node);
5118 }
5119
5120 static void zslFree(zskiplist *zsl) {
5121 zskiplistNode *node = zsl->header->forward[0], *next;
5122
5123 zfree(zsl->header->forward);
5124 zfree(zsl->header->span);
5125 zfree(zsl->header);
5126 while(node) {
5127 next = node->forward[0];
5128 zslFreeNode(node);
5129 node = next;
5130 }
5131 zfree(zsl);
5132 }
5133
5134 static int zslRandomLevel(void) {
5135 int level = 1;
5136 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5137 level += 1;
5138 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5139 }
5140
5141 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5142 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5143 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5144 int i, level;
5145
5146 x = zsl->header;
5147 for (i = zsl->level-1; i >= 0; i--) {
5148 /* store rank that is crossed to reach the insert position */
5149 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5150
5151 while (x->forward[i] &&
5152 (x->forward[i]->score < score ||
5153 (x->forward[i]->score == score &&
5154 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5155 rank[i] += i > 0 ? x->span[i-1] : 1;
5156 x = x->forward[i];
5157 }
5158 update[i] = x;
5159 }
5160 /* we assume the key is not already inside, since we allow duplicated
5161 * scores, and the re-insertion of score and redis object should never
5162 * happpen since the caller of zslInsert() should test in the hash table
5163 * if the element is already inside or not. */
5164 level = zslRandomLevel();
5165 if (level > zsl->level) {
5166 for (i = zsl->level; i < level; i++) {
5167 rank[i] = 0;
5168 update[i] = zsl->header;
5169 update[i]->span[i-1] = zsl->length;
5170 }
5171 zsl->level = level;
5172 }
5173 x = zslCreateNode(level,score,obj);
5174 for (i = 0; i < level; i++) {
5175 x->forward[i] = update[i]->forward[i];
5176 update[i]->forward[i] = x;
5177
5178 /* update span covered by update[i] as x is inserted here */
5179 if (i > 0) {
5180 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5181 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5182 }
5183 }
5184
5185 /* increment span for untouched levels */
5186 for (i = level; i < zsl->level; i++) {
5187 update[i]->span[i-1]++;
5188 }
5189
5190 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5191 if (x->forward[0])
5192 x->forward[0]->backward = x;
5193 else
5194 zsl->tail = x;
5195 zsl->length++;
5196 }
5197
5198 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5199 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5200 int i;
5201 for (i = 0; i < zsl->level; i++) {
5202 if (update[i]->forward[i] == x) {
5203 if (i > 0) {
5204 update[i]->span[i-1] += x->span[i-1] - 1;
5205 }
5206 update[i]->forward[i] = x->forward[i];
5207 } else {
5208 /* invariant: i > 0, because update[0]->forward[0]
5209 * is always equal to x */
5210 update[i]->span[i-1] -= 1;
5211 }
5212 }
5213 if (x->forward[0]) {
5214 x->forward[0]->backward = x->backward;
5215 } else {
5216 zsl->tail = x->backward;
5217 }
5218 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5219 zsl->level--;
5220 zsl->length--;
5221 }
5222
5223 /* Delete an element with matching score/object from the skiplist. */
5224 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5225 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5226 int i;
5227
5228 x = zsl->header;
5229 for (i = zsl->level-1; i >= 0; i--) {
5230 while (x->forward[i] &&
5231 (x->forward[i]->score < score ||
5232 (x->forward[i]->score == score &&
5233 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5234 x = x->forward[i];
5235 update[i] = x;
5236 }
5237 /* We may have multiple elements with the same score, what we need
5238 * is to find the element with both the right score and object. */
5239 x = x->forward[0];
5240 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5241 zslDeleteNode(zsl, x, update);
5242 zslFreeNode(x);
5243 return 1;
5244 } else {
5245 return 0; /* not found */
5246 }
5247 return 0; /* not found */
5248 }
5249
5250 /* Delete all the elements with score between min and max from the skiplist.
5251 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5252 * Note that this function takes the reference to the hash table view of the
5253 * sorted set, in order to remove the elements from the hash table too. */
5254 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5255 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5256 unsigned long removed = 0;
5257 int i;
5258
5259 x = zsl->header;
5260 for (i = zsl->level-1; i >= 0; i--) {
5261 while (x->forward[i] && x->forward[i]->score < min)
5262 x = x->forward[i];
5263 update[i] = x;
5264 }
5265 /* We may have multiple elements with the same score, what we need
5266 * is to find the element with both the right score and object. */
5267 x = x->forward[0];
5268 while (x && x->score <= max) {
5269 zskiplistNode *next = x->forward[0];
5270 zslDeleteNode(zsl, x, update);
5271 dictDelete(dict,x->obj);
5272 zslFreeNode(x);
5273 removed++;
5274 x = next;
5275 }
5276 return removed; /* not found */
5277 }
5278
5279 /* Delete all the elements with rank between start and end from the skiplist.
5280 * Start and end are inclusive. Note that start and end need to be 1-based */
5281 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5282 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5283 unsigned long traversed = 0, removed = 0;
5284 int i;
5285
5286 x = zsl->header;
5287 for (i = zsl->level-1; i >= 0; i--) {
5288 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5289 traversed += i > 0 ? x->span[i-1] : 1;
5290 x = x->forward[i];
5291 }
5292 update[i] = x;
5293 }
5294
5295 traversed++;
5296 x = x->forward[0];
5297 while (x && traversed <= end) {
5298 zskiplistNode *next = x->forward[0];
5299 zslDeleteNode(zsl, x, update);
5300 dictDelete(dict,x->obj);
5301 zslFreeNode(x);
5302 removed++;
5303 traversed++;
5304 x = next;
5305 }
5306 return removed;
5307 }
5308
5309 /* Find the first node having a score equal or greater than the specified one.
5310 * Returns NULL if there is no match. */
5311 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5312 zskiplistNode *x;
5313 int i;
5314
5315 x = zsl->header;
5316 for (i = zsl->level-1; i >= 0; i--) {
5317 while (x->forward[i] && x->forward[i]->score < score)
5318 x = x->forward[i];
5319 }
5320 /* We may have multiple elements with the same score, what we need
5321 * is to find the element with both the right score and object. */
5322 return x->forward[0];
5323 }
5324
5325 /* Find the rank for an element by both score and key.
5326 * Returns 0 when the element cannot be found, rank otherwise.
5327 * Note that the rank is 1-based due to the span of zsl->header to the
5328 * first element. */
5329 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5330 zskiplistNode *x;
5331 unsigned long rank = 0;
5332 int i;
5333
5334 x = zsl->header;
5335 for (i = zsl->level-1; i >= 0; i--) {
5336 while (x->forward[i] &&
5337 (x->forward[i]->score < score ||
5338 (x->forward[i]->score == score &&
5339 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5340 rank += i > 0 ? x->span[i-1] : 1;
5341 x = x->forward[i];
5342 }
5343
5344 /* x might be equal to zsl->header, so test if obj is non-NULL */
5345 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5346 return rank;
5347 }
5348 }
5349 return 0;
5350 }
5351
5352 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5353 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5354 zskiplistNode *x;
5355 unsigned long traversed = 0;
5356 int i;
5357
5358 x = zsl->header;
5359 for (i = zsl->level-1; i >= 0; i--) {
5360 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5361 {
5362 traversed += i > 0 ? x->span[i-1] : 1;
5363 x = x->forward[i];
5364 }
5365 if (traversed == rank) {
5366 return x;
5367 }
5368 }
5369 return NULL;
5370 }
5371
5372 /* The actual Z-commands implementations */
5373
5374 /* This generic command implements both ZADD and ZINCRBY.
5375 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5376 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5377 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5378 robj *zsetobj;
5379 zset *zs;
5380 double *score;
5381
5382 zsetobj = lookupKeyWrite(c->db,key);
5383 if (zsetobj == NULL) {
5384 zsetobj = createZsetObject();
5385 dictAdd(c->db->dict,key,zsetobj);
5386 incrRefCount(key);
5387 } else {
5388 if (zsetobj->type != REDIS_ZSET) {
5389 addReply(c,shared.wrongtypeerr);
5390 return;
5391 }
5392 }
5393 zs = zsetobj->ptr;
5394
5395 /* Ok now since we implement both ZADD and ZINCRBY here the code
5396 * needs to handle the two different conditions. It's all about setting
5397 * '*score', that is, the new score to set, to the right value. */
5398 score = zmalloc(sizeof(double));
5399 if (doincrement) {
5400 dictEntry *de;
5401
5402 /* Read the old score. If the element was not present starts from 0 */
5403 de = dictFind(zs->dict,ele);
5404 if (de) {
5405 double *oldscore = dictGetEntryVal(de);
5406 *score = *oldscore + scoreval;
5407 } else {
5408 *score = scoreval;
5409 }
5410 } else {
5411 *score = scoreval;
5412 }
5413
5414 /* What follows is a simple remove and re-insert operation that is common
5415 * to both ZADD and ZINCRBY... */
5416 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5417 /* case 1: New element */
5418 incrRefCount(ele); /* added to hash */
5419 zslInsert(zs->zsl,*score,ele);
5420 incrRefCount(ele); /* added to skiplist */
5421 server.dirty++;
5422 if (doincrement)
5423 addReplyDouble(c,*score);
5424 else
5425 addReply(c,shared.cone);
5426 } else {
5427 dictEntry *de;
5428 double *oldscore;
5429
5430 /* case 2: Score update operation */
5431 de = dictFind(zs->dict,ele);
5432 redisAssert(de != NULL);
5433 oldscore = dictGetEntryVal(de);
5434 if (*score != *oldscore) {
5435 int deleted;
5436
5437 /* Remove and insert the element in the skip list with new score */
5438 deleted = zslDelete(zs->zsl,*oldscore,ele);
5439 redisAssert(deleted != 0);
5440 zslInsert(zs->zsl,*score,ele);
5441 incrRefCount(ele);
5442 /* Update the score in the hash table */
5443 dictReplace(zs->dict,ele,score);
5444 server.dirty++;
5445 } else {
5446 zfree(score);
5447 }
5448 if (doincrement)
5449 addReplyDouble(c,*score);
5450 else
5451 addReply(c,shared.czero);
5452 }
5453 }
5454
5455 static void zaddCommand(redisClient *c) {
5456 double scoreval;
5457
5458 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5459
5460 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5461 }
5462
5463 static void zincrbyCommand(redisClient *c) {
5464 double scoreval;
5465
5466 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5467
5468 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5469 }
5470
5471 static void zremCommand(redisClient *c) {
5472 robj *zsetobj;
5473 zset *zs;
5474 dictEntry *de;
5475 double *oldscore;
5476 int deleted;
5477
5478 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5479 checkType(c,zsetobj,REDIS_ZSET)) return;
5480
5481 zs = zsetobj->ptr;
5482 de = dictFind(zs->dict,c->argv[2]);
5483 if (de == NULL) {
5484 addReply(c,shared.czero);
5485 return;
5486 }
5487 /* Delete from the skiplist */
5488 oldscore = dictGetEntryVal(de);
5489 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5490 redisAssert(deleted != 0);
5491
5492 /* Delete from the hash table */
5493 dictDelete(zs->dict,c->argv[2]);
5494 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5495 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5496 server.dirty++;
5497 addReply(c,shared.cone);
5498 }
5499
5500 static void zremrangebyscoreCommand(redisClient *c) {
5501 double min;
5502 double max;
5503 long deleted;
5504 robj *zsetobj;
5505 zset *zs;
5506
5507 if ((getDoubleFromObject(c, c->argv[2], &min) != REDIS_OK) ||
5508 (getDoubleFromObject(c, c->argv[3], &max) != REDIS_OK)) return;
5509
5510 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5511 checkType(c,zsetobj,REDIS_ZSET)) return;
5512
5513 zs = zsetobj->ptr;
5514 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5515 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5516 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5517 server.dirty += deleted;
5518 addReplyLong(c,deleted);
5519 }
5520
5521 static void zremrangebyrankCommand(redisClient *c) {
5522 long start;
5523 long end;
5524 int llen;
5525 long deleted;
5526 robj *zsetobj;
5527 zset *zs;
5528
5529 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5530 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5531
5532 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5533 checkType(c,zsetobj,REDIS_ZSET)) return;
5534 zs = zsetobj->ptr;
5535 llen = zs->zsl->length;
5536
5537 /* convert negative indexes */
5538 if (start < 0) start = llen+start;
5539 if (end < 0) end = llen+end;
5540 if (start < 0) start = 0;
5541 if (end < 0) end = 0;
5542
5543 /* indexes sanity checks */
5544 if (start > end || start >= llen) {
5545 addReply(c,shared.czero);
5546 return;
5547 }
5548 if (end >= llen) end = llen-1;
5549
5550 /* increment start and end because zsl*Rank functions
5551 * use 1-based rank */
5552 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5553 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5554 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5555 server.dirty += deleted;
5556 addReplyLong(c, deleted);
5557 }
5558
5559 typedef struct {
5560 dict *dict;
5561 double weight;
5562 } zsetopsrc;
5563
5564 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5565 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5566 unsigned long size1, size2;
5567 size1 = d1->dict ? dictSize(d1->dict) : 0;
5568 size2 = d2->dict ? dictSize(d2->dict) : 0;
5569 return size1 - size2;
5570 }
5571
/* How scores of the same element coming from different inputs are merged. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Merge 'val' into '*target' according to 'aggregate': add it, or keep the
 * smaller / larger of the two values. Any other 'aggregate' value trips an
 * assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
    }
}
5588
5589 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5590 int i, j, zsetnum;
5591 int aggregate = REDIS_AGGR_SUM;
5592 zsetopsrc *src;
5593 robj *dstobj;
5594 zset *dstzset;
5595 dictIterator *di;
5596 dictEntry *de;
5597
5598 /* expect zsetnum input keys to be given */
5599 zsetnum = atoi(c->argv[2]->ptr);
5600 if (zsetnum < 1) {
5601 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5602 return;
5603 }
5604
5605 /* test if the expected number of keys would overflow */
5606 if (3+zsetnum > c->argc) {
5607 addReply(c,shared.syntaxerr);
5608 return;
5609 }
5610
5611 /* read keys to be used for input */
5612 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5613 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5614 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5615 if (!zsetobj) {
5616 src[i].dict = NULL;
5617 } else {
5618 if (zsetobj->type != REDIS_ZSET) {
5619 zfree(src);
5620 addReply(c,shared.wrongtypeerr);
5621 return;
5622 }
5623 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5624 }
5625
5626 /* default all weights to 1 */
5627 src[i].weight = 1.0;
5628 }
5629
5630 /* parse optional extra arguments */
5631 if (j < c->argc) {
5632 int remaining = c->argc - j;
5633
5634 while (remaining) {
5635 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5636 j++; remaining--;
5637 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5638 if (getDoubleFromObject(c, c->argv[j], &src[i].weight) != REDIS_OK)
5639 return;
5640 }
5641 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5642 j++; remaining--;
5643 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5644 aggregate = REDIS_AGGR_SUM;
5645 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5646 aggregate = REDIS_AGGR_MIN;
5647 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5648 aggregate = REDIS_AGGR_MAX;
5649 } else {
5650 zfree(src);
5651 addReply(c,shared.syntaxerr);
5652 return;
5653 }
5654 j++; remaining--;
5655 } else {
5656 zfree(src);
5657 addReply(c,shared.syntaxerr);
5658 return;
5659 }
5660 }
5661 }
5662
5663 /* sort sets from the smallest to largest, this will improve our
5664 * algorithm's performance */
5665 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5666
5667 dstobj = createZsetObject();
5668 dstzset = dstobj->ptr;
5669
5670 if (op == REDIS_OP_INTER) {
5671 /* skip going over all entries if the smallest zset is NULL or empty */
5672 if (src[0].dict && dictSize(src[0].dict) > 0) {
5673 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5674 * from small to large, all src[i > 0].dict are non-empty too */
5675 di = dictGetIterator(src[0].dict);
5676 while((de = dictNext(di)) != NULL) {
5677 double *score = zmalloc(sizeof(double)), value;
5678 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5679
5680 for (j = 1; j < zsetnum; j++) {
5681 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5682 if (other) {
5683 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5684 zunionInterAggregate(score, value, aggregate);
5685 } else {
5686 break;
5687 }
5688 }
5689
5690 /* skip entry when not present in every source dict */
5691 if (j != zsetnum) {
5692 zfree(score);
5693 } else {
5694 robj *o = dictGetEntryKey(de);
5695 dictAdd(dstzset->dict,o,score);
5696 incrRefCount(o); /* added to dictionary */
5697 zslInsert(dstzset->zsl,*score,o);
5698 incrRefCount(o); /* added to skiplist */
5699 }
5700 }
5701 dictReleaseIterator(di);
5702 }
5703 } else if (op == REDIS_OP_UNION) {
5704 for (i = 0; i < zsetnum; i++) {
5705 if (!src[i].dict) continue;
5706
5707 di = dictGetIterator(src[i].dict);
5708 while((de = dictNext(di)) != NULL) {
5709 /* skip key when already processed */
5710 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5711
5712 double *score = zmalloc(sizeof(double)), value;
5713 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5714
5715 /* because the zsets are sorted by size, its only possible
5716 * for sets at larger indices to hold this entry */
5717 for (j = (i+1); j < zsetnum; j++) {
5718 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5719 if (other) {
5720 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5721 zunionInterAggregate(score, value, aggregate);
5722 }
5723 }
5724
5725 robj *o = dictGetEntryKey(de);
5726 dictAdd(dstzset->dict,o,score);
5727 incrRefCount(o); /* added to dictionary */
5728 zslInsert(dstzset->zsl,*score,o);
5729 incrRefCount(o); /* added to skiplist */
5730 }
5731 dictReleaseIterator(di);
5732 }
5733 } else {
5734 /* unknown operator */
5735 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5736 }
5737
5738 deleteKey(c->db,dstkey);
5739 if (dstzset->zsl->length) {
5740 dictAdd(c->db->dict,dstkey,dstobj);
5741 incrRefCount(dstkey);
5742 addReplyLong(c, dstzset->zsl->length);
5743 server.dirty++;
5744 } else {
5745 decrRefCount(dstobj);
5746 addReply(c, shared.czero);
5747 }
5748 zfree(src);
5749 }
5750
5751 static void zunionCommand(redisClient *c) {
5752 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5753 }
5754
5755 static void zinterCommand(redisClient *c) {
5756 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5757 }
5758
5759 static void zrangeGenericCommand(redisClient *c, int reverse) {
5760 robj *o;
5761 long start;
5762 long end;
5763 int withscores = 0;
5764 int llen;
5765 int rangelen, j;
5766 zset *zsetobj;
5767 zskiplist *zsl;
5768 zskiplistNode *ln;
5769 robj *ele;
5770
5771 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5772 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5773
5774 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5775 withscores = 1;
5776 } else if (c->argc >= 5) {
5777 addReply(c,shared.syntaxerr);
5778 return;
5779 }
5780
5781 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5782 || checkType(c,o,REDIS_ZSET)) return;
5783 zsetobj = o->ptr;
5784 zsl = zsetobj->zsl;
5785 llen = zsl->length;
5786
5787 /* convert negative indexes */
5788 if (start < 0) start = llen+start;
5789 if (end < 0) end = llen+end;
5790 if (start < 0) start = 0;
5791 if (end < 0) end = 0;
5792
5793 /* indexes sanity checks */
5794 if (start > end || start >= llen) {
5795 /* Out of range start or start > end result in empty list */
5796 addReply(c,shared.emptymultibulk);
5797 return;
5798 }
5799 if (end >= llen) end = llen-1;
5800 rangelen = (end-start)+1;
5801
5802 /* check if starting point is trivial, before searching
5803 * the element in log(N) time */
5804 if (reverse) {
5805 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5806 } else {
5807 ln = start == 0 ?
5808 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5809 }
5810
5811 /* Return the result in form of a multi-bulk reply */
5812 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5813 withscores ? (rangelen*2) : rangelen));
5814 for (j = 0; j < rangelen; j++) {
5815 ele = ln->obj;
5816 addReplyBulk(c,ele);
5817 if (withscores)
5818 addReplyDouble(c,ln->score);
5819 ln = reverse ? ln->backward : ln->forward[0];
5820 }
5821 }
5822
5823 static void zrangeCommand(redisClient *c) {
5824 zrangeGenericCommand(c,0);
5825 }
5826
5827 static void zrevrangeCommand(redisClient *c) {
5828 zrangeGenericCommand(c,1);
5829 }
5830
5831 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5832 * If justcount is non-zero, just the count is returned. */
5833 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5834 robj *o;
5835 double min, max;
5836 int minex = 0, maxex = 0; /* are min or max exclusive? */
5837 int offset = 0, limit = -1;
5838 int withscores = 0;
5839 int badsyntax = 0;
5840
5841 /* Parse the min-max interval. If one of the values is prefixed
5842 * by the "(" character, it's considered "open". For instance
5843 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5844 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5845 if (((char*)c->argv[2]->ptr)[0] == '(') {
5846 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5847 minex = 1;
5848 } else {
5849 min = strtod(c->argv[2]->ptr,NULL);
5850 }
5851 if (((char*)c->argv[3]->ptr)[0] == '(') {
5852 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5853 maxex = 1;
5854 } else {
5855 max = strtod(c->argv[3]->ptr,NULL);
5856 }
5857
5858 /* Parse "WITHSCORES": note that if the command was called with
5859 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5860 * enter the following paths to parse WITHSCORES and LIMIT. */
5861 if (c->argc == 5 || c->argc == 8) {
5862 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5863 withscores = 1;
5864 else
5865 badsyntax = 1;
5866 }
5867 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5868 badsyntax = 1;
5869 if (badsyntax) {
5870 addReplySds(c,
5871 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5872 return;
5873 }
5874
5875 /* Parse "LIMIT" */
5876 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5877 addReply(c,shared.syntaxerr);
5878 return;
5879 } else if (c->argc == (7 + withscores)) {
5880 offset = atoi(c->argv[5]->ptr);
5881 limit = atoi(c->argv[6]->ptr);
5882 if (offset < 0) offset = 0;
5883 }
5884
5885 /* Ok, lookup the key and get the range */
5886 o = lookupKeyRead(c->db,c->argv[1]);
5887 if (o == NULL) {
5888 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5889 } else {
5890 if (o->type != REDIS_ZSET) {
5891 addReply(c,shared.wrongtypeerr);
5892 } else {
5893 zset *zsetobj = o->ptr;
5894 zskiplist *zsl = zsetobj->zsl;
5895 zskiplistNode *ln;
5896 robj *ele, *lenobj = NULL;
5897 unsigned long rangelen = 0;
5898
5899 /* Get the first node with the score >= min, or with
5900 * score > min if 'minex' is true. */
5901 ln = zslFirstWithScore(zsl,min);
5902 while (minex && ln && ln->score == min) ln = ln->forward[0];
5903
5904 if (ln == NULL) {
5905 /* No element matching the speciifed interval */
5906 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5907 return;
5908 }
5909
5910 /* We don't know in advance how many matching elements there
5911 * are in the list, so we push this object that will represent
5912 * the multi-bulk length in the output buffer, and will "fix"
5913 * it later */
5914 if (!justcount) {
5915 lenobj = createObject(REDIS_STRING,NULL);
5916 addReply(c,lenobj);
5917 decrRefCount(lenobj);
5918 }
5919
5920 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5921 if (offset) {
5922 offset--;
5923 ln = ln->forward[0];
5924 continue;
5925 }
5926 if (limit == 0) break;
5927 if (!justcount) {
5928 ele = ln->obj;
5929 addReplyBulk(c,ele);
5930 if (withscores)
5931 addReplyDouble(c,ln->score);
5932 }
5933 ln = ln->forward[0];
5934 rangelen++;
5935 if (limit > 0) limit--;
5936 }
5937 if (justcount) {
5938 addReplyLong(c,(long)rangelen);
5939 } else {
5940 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5941 withscores ? (rangelen*2) : rangelen);
5942 }
5943 }
5944 }
5945 }
5946
5947 static void zrangebyscoreCommand(redisClient *c) {
5948 genericZrangebyscoreCommand(c,0);
5949 }
5950
5951 static void zcountCommand(redisClient *c) {
5952 genericZrangebyscoreCommand(c,1);
5953 }
5954
5955 static void zcardCommand(redisClient *c) {
5956 robj *o;
5957 zset *zs;
5958
5959 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5960 checkType(c,o,REDIS_ZSET)) return;
5961
5962 zs = o->ptr;
5963 addReplyUlong(c,zs->zsl->length);
5964 }
5965
5966 static void zscoreCommand(redisClient *c) {
5967 robj *o;
5968 zset *zs;
5969 dictEntry *de;
5970
5971 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5972 checkType(c,o,REDIS_ZSET)) return;
5973
5974 zs = o->ptr;
5975 de = dictFind(zs->dict,c->argv[2]);
5976 if (!de) {
5977 addReply(c,shared.nullbulk);
5978 } else {
5979 double *score = dictGetEntryVal(de);
5980
5981 addReplyDouble(c,*score);
5982 }
5983 }
5984
5985 static void zrankGenericCommand(redisClient *c, int reverse) {
5986 robj *o;
5987 zset *zs;
5988 zskiplist *zsl;
5989 dictEntry *de;
5990 unsigned long rank;
5991 double *score;
5992
5993 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5994 checkType(c,o,REDIS_ZSET)) return;
5995
5996 zs = o->ptr;
5997 zsl = zs->zsl;
5998 de = dictFind(zs->dict,c->argv[2]);
5999 if (!de) {
6000 addReply(c,shared.nullbulk);
6001 return;
6002 }
6003
6004 score = dictGetEntryVal(de);
6005 rank = zslGetRank(zsl, *score, c->argv[2]);
6006 if (rank) {
6007 if (reverse) {
6008 addReplyLong(c, zsl->length - rank);
6009 } else {
6010 addReplyLong(c, rank-1);
6011 }
6012 } else {
6013 addReply(c,shared.nullbulk);
6014 }
6015 }
6016
6017 static void zrankCommand(redisClient *c) {
6018 zrankGenericCommand(c, 0);
6019 }
6020
6021 static void zrevrankCommand(redisClient *c) {
6022 zrankGenericCommand(c, 1);
6023 }
6024
6025 /* =================================== Hashes =============================== */
6026 static void hsetCommand(redisClient *c) {
6027 int update = 0;
6028 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6029
6030 if (o == NULL) {
6031 o = createHashObject();
6032 dictAdd(c->db->dict,c->argv[1],o);
6033 incrRefCount(c->argv[1]);
6034 } else {
6035 if (o->type != REDIS_HASH) {
6036 addReply(c,shared.wrongtypeerr);
6037 return;
6038 }
6039 }
6040 /* We want to convert the zipmap into an hash table right now if the
6041 * entry to be added is too big. Note that we check if the object
6042 * is integer encoded before to try fetching the length in the test below.
6043 * This is because integers are small, but currently stringObjectLen()
6044 * performs a slow conversion: not worth it. */
6045 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
6046 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
6047 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
6048 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
6049 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
6050 {
6051 convertToRealHash(o);
6052 }
6053
6054 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6055 unsigned char *zm = o->ptr;
6056 robj *valobj = getDecodedObject(c->argv[3]);
6057
6058 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6059 valobj->ptr,sdslen(valobj->ptr),&update);
6060 decrRefCount(valobj);
6061 o->ptr = zm;
6062
6063 /* And here there is the second check for hash conversion. */
6064 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6065 convertToRealHash(o);
6066 } else {
6067 c->argv[2] = tryObjectEncoding(c->argv[2]);
6068 /* note that c->argv[3] is already encoded, as the latest arg
6069 * of a bulk command is always integer encoded if possible. */
6070 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
6071 incrRefCount(c->argv[2]);
6072 } else {
6073 update = 1;
6074 }
6075 incrRefCount(c->argv[3]);
6076 }
6077 server.dirty++;
6078 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6079 }
6080
6081 static void hmsetCommand(redisClient *c) {
6082 int i;
6083 robj *o, *key, *val;
6084
6085 if ((c->argc % 2) == 1) {
6086 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6087 return;
6088 }
6089
6090 if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
6091 o = createHashObject();
6092 dictAdd(c->db->dict,c->argv[1],o);
6093 incrRefCount(c->argv[1]);
6094 } else {
6095 if (o->type != REDIS_HASH) {
6096 addReply(c,shared.wrongtypeerr);
6097 return;
6098 }
6099 }
6100
6101 /* We want to convert the zipmap into an hash table right now if the
6102 * entry to be added is too big. */
6103 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6104 for (i = 2; i < c->argc; i+=2) {
6105 if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
6106 sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
6107 (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
6108 sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
6109 convertToRealHash(o);
6110 break;
6111 }
6112 }
6113 }
6114
6115 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6116 unsigned char *zm = o->ptr;
6117
6118 for (i = 2; i < c->argc; i+=2) {
6119 key = getDecodedObject(c->argv[i]);
6120 val = getDecodedObject(c->argv[i+1]);
6121 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
6122 val->ptr,sdslen(val->ptr),NULL);
6123 decrRefCount(key);
6124 decrRefCount(val);
6125 o->ptr = zm;
6126 }
6127
6128 /* And here there is the second check for hash conversion. */
6129 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6130 convertToRealHash(o);
6131 } else {
6132 for (i = 2; i < c->argc; i+=2) {
6133 key = tryObjectEncoding(c->argv[i]);
6134 val = tryObjectEncoding(c->argv[i+1]);
6135 if (dictReplace(o->ptr,key,val)) {
6136 incrRefCount(key);
6137 }
6138 incrRefCount(val);
6139 }
6140 }
6141
6142 addReply(c, shared.ok);
6143 }
6144
6145 static void hincrbyCommand(redisClient *c) {
6146 long long value = 0, incr = 0;
6147 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6148
6149 if (o == NULL) {
6150 o = createHashObject();
6151 dictAdd(c->db->dict,c->argv[1],o);
6152 incrRefCount(c->argv[1]);
6153 } else {
6154 if (o->type != REDIS_HASH) {
6155 addReply(c,shared.wrongtypeerr);
6156 return;
6157 }
6158 }
6159
6160 if (getLongLongFromObject(c, c->argv[3], &incr) != REDIS_OK) return;
6161
6162 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6163 unsigned char *zm = o->ptr;
6164 unsigned char *zval;
6165 unsigned int zvlen;
6166
6167 /* Find value if already present in hash */
6168 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6169 &zval,&zvlen)) {
6170 /* strtoll needs the char* to have a trailing \0, but
6171 * the zipmap doesn't include them. */
6172 sds szval = sdsnewlen(zval, zvlen);
6173 value = strtoll(szval,NULL,10);
6174 sdsfree(szval);
6175 }
6176
6177 value += incr;
6178 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6179 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6180 (unsigned char*)svalue,sdslen(svalue),NULL);
6181 sdsfree(svalue);
6182 o->ptr = zm;
6183
6184 /* Check if the zipmap needs to be converted. */
6185 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6186 convertToRealHash(o);
6187 } else {
6188 robj *hval;
6189 dictEntry *de;
6190
6191 /* Find value if already present in hash */
6192 de = dictFind(o->ptr,c->argv[2]);
6193 if (de != NULL) {
6194 hval = dictGetEntryVal(de);
6195 if (hval->encoding == REDIS_ENCODING_RAW)
6196 value = strtoll(hval->ptr,NULL,10);
6197 else if (hval->encoding == REDIS_ENCODING_INT)
6198 value = (long)hval->ptr;
6199 else
6200 redisAssert(1 != 1);
6201 }
6202
6203 value += incr;
6204 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6205 hval = tryObjectEncoding(hval);
6206 if (dictReplace(o->ptr,c->argv[2],hval)) {
6207 incrRefCount(c->argv[2]);
6208 }
6209 }
6210
6211 server.dirty++;
6212 addReplyLongLong(c, value);
6213 }
6214
6215 static void hgetCommand(redisClient *c) {
6216 robj *o;
6217
6218 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6219 checkType(c,o,REDIS_HASH)) return;
6220
6221 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6222 unsigned char *zm = o->ptr;
6223 unsigned char *val;
6224 unsigned int vlen;
6225 robj *field;
6226
6227 field = getDecodedObject(c->argv[2]);
6228 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6229 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6230 addReplySds(c,sdsnewlen(val,vlen));
6231 addReply(c,shared.crlf);
6232 decrRefCount(field);
6233 return;
6234 } else {
6235 addReply(c,shared.nullbulk);
6236 decrRefCount(field);
6237 return;
6238 }
6239 } else {
6240 struct dictEntry *de;
6241
6242 de = dictFind(o->ptr,c->argv[2]);
6243 if (de == NULL) {
6244 addReply(c,shared.nullbulk);
6245 } else {
6246 robj *e = dictGetEntryVal(de);
6247
6248 addReplyBulk(c,e);
6249 }
6250 }
6251 }
6252
6253 static void hmgetCommand(redisClient *c) {
6254 int i;
6255
6256 robj *o = lookupKeyRead(c->db, c->argv[1]);
6257 if (o == NULL) {
6258 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6259 for (i = 2; i < c->argc; i++) {
6260 addReply(c,shared.nullbulk);
6261 }
6262 return;
6263 } else {
6264 if (o->type != REDIS_HASH) {
6265 addReply(c,shared.wrongtypeerr);
6266 return;
6267 }
6268 }
6269
6270 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6271 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6272 unsigned char *zm = o->ptr;
6273 unsigned char *v;
6274 unsigned int vlen;
6275 robj *field;
6276
6277 for (i = 2; i < c->argc; i++) {
6278 field = getDecodedObject(c->argv[i]);
6279 if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&v,&vlen)) {
6280 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6281 addReplySds(c,sdsnewlen(v,vlen));
6282 addReply(c,shared.crlf);
6283 } else {
6284 addReply(c,shared.nullbulk);
6285 }
6286 decrRefCount(field);
6287 }
6288 } else {
6289 dictEntry *de;
6290
6291 for (i = 2; i < c->argc; i++) {
6292 de = dictFind(o->ptr,c->argv[i]);
6293 if (de != NULL) {
6294 addReplyBulk(c,(robj*)dictGetEntryVal(de));
6295 } else {
6296 addReply(c,shared.nullbulk);
6297 }
6298 }
6299 }
6300 }
6301
6302 static void hdelCommand(redisClient *c) {
6303 robj *o;
6304 int deleted = 0;
6305
6306 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6307 checkType(c,o,REDIS_HASH)) return;
6308
6309 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6310 robj *field = getDecodedObject(c->argv[2]);
6311
6312 o->ptr = zipmapDel((unsigned char*) o->ptr,
6313 (unsigned char*) field->ptr,
6314 sdslen(field->ptr), &deleted);
6315 decrRefCount(field);
6316 if (zipmapLen((unsigned char*) o->ptr) == 0)
6317 deleteKey(c->db,c->argv[1]);
6318 } else {
6319 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6320 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6321 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6322 }
6323 if (deleted) server.dirty++;
6324 addReply(c,deleted ? shared.cone : shared.czero);
6325 }
6326
6327 static void hlenCommand(redisClient *c) {
6328 robj *o;
6329 unsigned long len;
6330
6331 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6332 checkType(c,o,REDIS_HASH)) return;
6333
6334 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6335 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6336 addReplyUlong(c,len);
6337 }
6338
6339 #define REDIS_GETALL_KEYS 1
6340 #define REDIS_GETALL_VALS 2
6341 static void genericHgetallCommand(redisClient *c, int flags) {
6342 robj *o, *lenobj;
6343 unsigned long count = 0;
6344
6345 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6346 || checkType(c,o,REDIS_HASH)) return;
6347
6348 lenobj = createObject(REDIS_STRING,NULL);
6349 addReply(c,lenobj);
6350 decrRefCount(lenobj);
6351
6352 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6353 unsigned char *p = zipmapRewind(o->ptr);
6354 unsigned char *field, *val;
6355 unsigned int flen, vlen;
6356
6357 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6358 robj *aux;
6359
6360 if (flags & REDIS_GETALL_KEYS) {
6361 aux = createStringObject((char*)field,flen);
6362 addReplyBulk(c,aux);
6363 decrRefCount(aux);
6364 count++;
6365 }
6366 if (flags & REDIS_GETALL_VALS) {
6367 aux = createStringObject((char*)val,vlen);
6368 addReplyBulk(c,aux);
6369 decrRefCount(aux);
6370 count++;
6371 }
6372 }
6373 } else {
6374 dictIterator *di = dictGetIterator(o->ptr);
6375 dictEntry *de;
6376
6377 while((de = dictNext(di)) != NULL) {
6378 robj *fieldobj = dictGetEntryKey(de);
6379 robj *valobj = dictGetEntryVal(de);
6380
6381 if (flags & REDIS_GETALL_KEYS) {
6382 addReplyBulk(c,fieldobj);
6383 count++;
6384 }
6385 if (flags & REDIS_GETALL_VALS) {
6386 addReplyBulk(c,valobj);
6387 count++;
6388 }
6389 }
6390 dictReleaseIterator(di);
6391 }
6392 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6393 }
6394
6395 static void hkeysCommand(redisClient *c) {
6396 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6397 }
6398
6399 static void hvalsCommand(redisClient *c) {
6400 genericHgetallCommand(c,REDIS_GETALL_VALS);
6401 }
6402
6403 static void hgetallCommand(redisClient *c) {
6404 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6405 }
6406
6407 static void hexistsCommand(redisClient *c) {
6408 robj *o;
6409 int exists = 0;
6410
6411 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6412 checkType(c,o,REDIS_HASH)) return;
6413
6414 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6415 robj *field;
6416 unsigned char *zm = o->ptr;
6417
6418 field = getDecodedObject(c->argv[2]);
6419 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6420 decrRefCount(field);
6421 } else {
6422 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6423 }
6424 addReply(c,exists ? shared.cone : shared.czero);
6425 }
6426
6427 static void convertToRealHash(robj *o) {
6428 unsigned char *key, *val, *p, *zm = o->ptr;
6429 unsigned int klen, vlen;
6430 dict *dict = dictCreate(&hashDictType,NULL);
6431
6432 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6433 p = zipmapRewind(zm);
6434 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6435 robj *keyobj, *valobj;
6436
6437 keyobj = createStringObject((char*)key,klen);
6438 valobj = createStringObject((char*)val,vlen);
6439 keyobj = tryObjectEncoding(keyobj);
6440 valobj = tryObjectEncoding(valobj);
6441 dictAdd(dict,keyobj,valobj);
6442 }
6443 o->encoding = REDIS_ENCODING_HT;
6444 o->ptr = dict;
6445 zfree(zm);
6446 }
6447
6448 /* ========================= Non type-specific commands ==================== */
6449
6450 static void flushdbCommand(redisClient *c) {
6451 server.dirty += dictSize(c->db->dict);
6452 dictEmpty(c->db->dict);
6453 dictEmpty(c->db->expires);
6454 addReply(c,shared.ok);
6455 }
6456
6457 static void flushallCommand(redisClient *c) {
6458 server.dirty += emptyDb();
6459 addReply(c,shared.ok);
6460 if (server.bgsavechildpid != -1) {
6461 kill(server.bgsavechildpid,SIGKILL);
6462 rdbRemoveTempFile(server.bgsavechildpid);
6463 }
6464 rdbSave(server.dbfilename);
6465 server.dirty++;
6466 }
6467
6468 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6469 redisSortOperation *so = zmalloc(sizeof(*so));
6470 so->type = type;
6471 so->pattern = pattern;
6472 return so;
6473 }
6474
6475 /* Return the value associated to the key with a name obtained
6476 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6477 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6478 char *p;
6479 sds spat, ssub;
6480 robj keyobj;
6481 int prefixlen, sublen, postfixlen;
6482 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6483 struct {
6484 long len;
6485 long free;
6486 char buf[REDIS_SORTKEY_MAX+1];
6487 } keyname;
6488
6489 /* If the pattern is "#" return the substitution object itself in order
6490 * to implement the "SORT ... GET #" feature. */
6491 spat = pattern->ptr;
6492 if (spat[0] == '#' && spat[1] == '\0') {
6493 return subst;
6494 }
6495
6496 /* The substitution object may be specially encoded. If so we create
6497 * a decoded object on the fly. Otherwise getDecodedObject will just
6498 * increment the ref count, that we'll decrement later. */
6499 subst = getDecodedObject(subst);
6500
6501 ssub = subst->ptr;
6502 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6503 p = strchr(spat,'*');
6504 if (!p) {
6505 decrRefCount(subst);
6506 return NULL;
6507 }
6508
6509 prefixlen = p-spat;
6510 sublen = sdslen(ssub);
6511 postfixlen = sdslen(spat)-(prefixlen+1);
6512 memcpy(keyname.buf,spat,prefixlen);
6513 memcpy(keyname.buf+prefixlen,ssub,sublen);
6514 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6515 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6516 keyname.len = prefixlen+sublen+postfixlen;
6517
6518 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6519 decrRefCount(subst);
6520
6521 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6522 return lookupKeyRead(db,&keyobj);
6523 }
6524
6525 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6526 * the additional parameter is not standard but a BSD-specific we have to
6527 * pass sorting parameters via the global 'server' structure */
6528 static int sortCompare(const void *s1, const void *s2) {
6529 const redisSortObject *so1 = s1, *so2 = s2;
6530 int cmp;
6531
6532 if (!server.sort_alpha) {
6533 /* Numeric sorting. Here it's trivial as we precomputed scores */
6534 if (so1->u.score > so2->u.score) {
6535 cmp = 1;
6536 } else if (so1->u.score < so2->u.score) {
6537 cmp = -1;
6538 } else {
6539 cmp = 0;
6540 }
6541 } else {
6542 /* Alphanumeric sorting */
6543 if (server.sort_bypattern) {
6544 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6545 /* At least one compare object is NULL */
6546 if (so1->u.cmpobj == so2->u.cmpobj)
6547 cmp = 0;
6548 else if (so1->u.cmpobj == NULL)
6549 cmp = -1;
6550 else
6551 cmp = 1;
6552 } else {
6553 /* We have both the objects, use strcoll */
6554 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6555 }
6556 } else {
6557 /* Compare elements directly */
6558 robj *dec1, *dec2;
6559
6560 dec1 = getDecodedObject(so1->obj);
6561 dec2 = getDecodedObject(so2->obj);
6562 cmp = strcoll(dec1->ptr,dec2->ptr);
6563 decrRefCount(dec1);
6564 decrRefCount(dec2);
6565 }
6566 }
6567 return server.sort_desc ? -cmp : cmp;
6568 }
6569
6570 /* The SORT command is the most complex command in Redis. Warning: this code
6571 * is optimized for speed and a bit less for readability */
6572 static void sortCommand(redisClient *c) {
6573 list *operations;
6574 int outputlen = 0;
6575 int desc = 0, alpha = 0;
6576 int limit_start = 0, limit_count = -1, start, end;
6577 int j, dontsort = 0, vectorlen;
6578 int getop = 0; /* GET operation counter */
6579 robj *sortval, *sortby = NULL, *storekey = NULL;
6580 redisSortObject *vector; /* Resulting vector to sort */
6581
6582 /* Lookup the key to sort. It must be of the right types */
6583 sortval = lookupKeyRead(c->db,c->argv[1]);
6584 if (sortval == NULL) {
6585 addReply(c,shared.emptymultibulk);
6586 return;
6587 }
6588 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6589 sortval->type != REDIS_ZSET)
6590 {
6591 addReply(c,shared.wrongtypeerr);
6592 return;
6593 }
6594
6595 /* Create a list of operations to perform for every sorted element.
6596 * Operations can be GET/DEL/INCR/DECR */
6597 operations = listCreate();
6598 listSetFreeMethod(operations,zfree);
6599 j = 2;
6600
6601 /* Now we need to protect sortval incrementing its count, in the future
6602 * SORT may have options able to overwrite/delete keys during the sorting
6603 * and the sorted key itself may get destroied */
6604 incrRefCount(sortval);
6605
6606 /* The SORT command has an SQL-alike syntax, parse it */
6607 while(j < c->argc) {
6608 int leftargs = c->argc-j-1;
6609 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6610 desc = 0;
6611 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6612 desc = 1;
6613 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6614 alpha = 1;
6615 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6616 limit_start = atoi(c->argv[j+1]->ptr);
6617 limit_count = atoi(c->argv[j+2]->ptr);
6618 j+=2;
6619 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6620 storekey = c->argv[j+1];
6621 j++;
6622 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6623 sortby = c->argv[j+1];
6624 /* If the BY pattern does not contain '*', i.e. it is constant,
6625 * we don't need to sort nor to lookup the weight keys. */
6626 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6627 j++;
6628 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6629 listAddNodeTail(operations,createSortOperation(
6630 REDIS_SORT_GET,c->argv[j+1]));
6631 getop++;
6632 j++;
6633 } else {
6634 decrRefCount(sortval);
6635 listRelease(operations);
6636 addReply(c,shared.syntaxerr);
6637 return;
6638 }
6639 j++;
6640 }
6641
6642 /* Load the sorting vector with all the objects to sort */
6643 switch(sortval->type) {
6644 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6645 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6646 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6647 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6648 }
6649 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6650 j = 0;
6651
6652 if (sortval->type == REDIS_LIST) {
6653 list *list = sortval->ptr;
6654 listNode *ln;
6655 listIter li;
6656
6657 listRewind(list,&li);
6658 while((ln = listNext(&li))) {
6659 robj *ele = ln->value;
6660 vector[j].obj = ele;
6661 vector[j].u.score = 0;
6662 vector[j].u.cmpobj = NULL;
6663 j++;
6664 }
6665 } else {
6666 dict *set;
6667 dictIterator *di;
6668 dictEntry *setele;
6669
6670 if (sortval->type == REDIS_SET) {
6671 set = sortval->ptr;
6672 } else {
6673 zset *zs = sortval->ptr;
6674 set = zs->dict;
6675 }
6676
6677 di = dictGetIterator(set);
6678 while((setele = dictNext(di)) != NULL) {
6679 vector[j].obj = dictGetEntryKey(setele);
6680 vector[j].u.score = 0;
6681 vector[j].u.cmpobj = NULL;
6682 j++;
6683 }
6684 dictReleaseIterator(di);
6685 }
6686 redisAssert(j == vectorlen);
6687
6688 /* Now it's time to load the right scores in the sorting vector */
6689 if (dontsort == 0) {
6690 for (j = 0; j < vectorlen; j++) {
6691 if (sortby) {
6692 robj *byval;
6693
6694 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6695 if (!byval || byval->type != REDIS_STRING) continue;
6696 if (alpha) {
6697 vector[j].u.cmpobj = getDecodedObject(byval);
6698 } else {
6699 if (byval->encoding == REDIS_ENCODING_RAW) {
6700 vector[j].u.score = strtod(byval->ptr,NULL);
6701 } else {
6702 /* Don't need to decode the object if it's
6703 * integer-encoded (the only encoding supported) so
6704 * far. We can just cast it */
6705 if (byval->encoding == REDIS_ENCODING_INT) {
6706 vector[j].u.score = (long)byval->ptr;
6707 } else
6708 redisAssert(1 != 1);
6709 }
6710 }
6711 } else {
6712 if (!alpha) {
6713 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6714 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6715 else {
6716 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6717 vector[j].u.score = (long) vector[j].obj->ptr;
6718 else
6719 redisAssert(1 != 1);
6720 }
6721 }
6722 }
6723 }
6724 }
6725
6726 /* We are ready to sort the vector... perform a bit of sanity check
6727 * on the LIMIT option too. We'll use a partial version of quicksort. */
6728 start = (limit_start < 0) ? 0 : limit_start;
6729 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6730 if (start >= vectorlen) {
6731 start = vectorlen-1;
6732 end = vectorlen-2;
6733 }
6734 if (end >= vectorlen) end = vectorlen-1;
6735
6736 if (dontsort == 0) {
6737 server.sort_desc = desc;
6738 server.sort_alpha = alpha;
6739 server.sort_bypattern = sortby ? 1 : 0;
6740 if (sortby && (start != 0 || end != vectorlen-1))
6741 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6742 else
6743 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6744 }
6745
6746 /* Send command output to the output buffer, performing the specified
6747 * GET/DEL/INCR/DECR operations if any. */
6748 outputlen = getop ? getop*(end-start+1) : end-start+1;
6749 if (storekey == NULL) {
6750 /* STORE option not specified, sent the sorting result to client */
6751 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6752 for (j = start; j <= end; j++) {
6753 listNode *ln;
6754 listIter li;
6755
6756 if (!getop) addReplyBulk(c,vector[j].obj);
6757 listRewind(operations,&li);
6758 while((ln = listNext(&li))) {
6759 redisSortOperation *sop = ln->value;
6760 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6761 vector[j].obj);
6762
6763 if (sop->type == REDIS_SORT_GET) {
6764 if (!val || val->type != REDIS_STRING) {
6765 addReply(c,shared.nullbulk);
6766 } else {
6767 addReplyBulk(c,val);
6768 }
6769 } else {
6770 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6771 }
6772 }
6773 }
6774 } else {
6775 robj *listObject = createListObject();
6776 list *listPtr = (list*) listObject->ptr;
6777
6778 /* STORE option specified, set the sorting result as a List object */
6779 for (j = start; j <= end; j++) {
6780 listNode *ln;
6781 listIter li;
6782
6783 if (!getop) {
6784 listAddNodeTail(listPtr,vector[j].obj);
6785 incrRefCount(vector[j].obj);
6786 }
6787 listRewind(operations,&li);
6788 while((ln = listNext(&li))) {
6789 redisSortOperation *sop = ln->value;
6790 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6791 vector[j].obj);
6792
6793 if (sop->type == REDIS_SORT_GET) {
6794 if (!val || val->type != REDIS_STRING) {
6795 listAddNodeTail(listPtr,createStringObject("",0));
6796 } else {
6797 listAddNodeTail(listPtr,val);
6798 incrRefCount(val);
6799 }
6800 } else {
6801 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6802 }
6803 }
6804 }
6805 if (dictReplace(c->db->dict,storekey,listObject)) {
6806 incrRefCount(storekey);
6807 }
6808 /* Note: we add 1 because the DB is dirty anyway since even if the
6809 * SORT result is empty a new key is set and maybe the old content
6810 * replaced. */
6811 server.dirty += 1+outputlen;
6812 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6813 }
6814
6815 /* Cleanup */
6816 decrRefCount(sortval);
6817 listRelease(operations);
6818 for (j = 0; j < vectorlen; j++) {
6819 if (sortby && alpha && vector[j].u.cmpobj)
6820 decrRefCount(vector[j].u.cmpobj);
6821 }
6822 zfree(vector);
6823 }
6824
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer, 'n' the amount of bytes. The buffer must
 * have room for the longest output, which is the decimal form of a 64 bit
 * value followed by 'B' and the terminator (22 bytes).
 *
 * A string is written for every value of 'n': amounts below 1024 are
 * printed as exact bytes, larger ones with two decimals in K, M, G, T or P,
 * and anything too big for the P unit is printed as the exact byte count
 * again so the caller never reads an unwritten buffer. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Bigger than any unit handled above: fall back to plain bytes */
        sprintf(s,"%lluB",n);
    }
}
6845
6846 /* Create the string returned by the INFO command. This is decoupled
6847 * by the INFO command itself as we need to report the same information
6848 * on memory corruption problems. */
6849 static sds genRedisInfoString(void) {
6850 sds info;
6851 time_t uptime = time(NULL)-server.stat_starttime;
6852 int j;
6853 char hmem[64];
6854
6855 bytesToHuman(hmem,zmalloc_used_memory());
6856 info = sdscatprintf(sdsempty(),
6857 "redis_version:%s\r\n"
6858 "arch_bits:%s\r\n"
6859 "multiplexing_api:%s\r\n"
6860 "process_id:%ld\r\n"
6861 "uptime_in_seconds:%ld\r\n"
6862 "uptime_in_days:%ld\r\n"
6863 "connected_clients:%d\r\n"
6864 "connected_slaves:%d\r\n"
6865 "blocked_clients:%d\r\n"
6866 "used_memory:%zu\r\n"
6867 "used_memory_human:%s\r\n"
6868 "changes_since_last_save:%lld\r\n"
6869 "bgsave_in_progress:%d\r\n"
6870 "last_save_time:%ld\r\n"
6871 "bgrewriteaof_in_progress:%d\r\n"
6872 "total_connections_received:%lld\r\n"
6873 "total_commands_processed:%lld\r\n"
6874 "expired_keys:%lld\r\n"
6875 "hash_max_zipmap_entries:%ld\r\n"
6876 "hash_max_zipmap_value:%ld\r\n"
6877 "pubsub_channels:%ld\r\n"
6878 "pubsub_patterns:%u\r\n"
6879 "vm_enabled:%d\r\n"
6880 "role:%s\r\n"
6881 ,REDIS_VERSION,
6882 (sizeof(long) == 8) ? "64" : "32",
6883 aeGetApiName(),
6884 (long) getpid(),
6885 uptime,
6886 uptime/(3600*24),
6887 listLength(server.clients)-listLength(server.slaves),
6888 listLength(server.slaves),
6889 server.blpop_blocked_clients,
6890 zmalloc_used_memory(),
6891 hmem,
6892 server.dirty,
6893 server.bgsavechildpid != -1,
6894 server.lastsave,
6895 server.bgrewritechildpid != -1,
6896 server.stat_numconnections,
6897 server.stat_numcommands,
6898 server.stat_expiredkeys,
6899 server.hash_max_zipmap_entries,
6900 server.hash_max_zipmap_value,
6901 dictSize(server.pubsub_channels),
6902 listLength(server.pubsub_patterns),
6903 server.vm_enabled != 0,
6904 server.masterhost == NULL ? "master" : "slave"
6905 );
6906 if (server.masterhost) {
6907 info = sdscatprintf(info,
6908 "master_host:%s\r\n"
6909 "master_port:%d\r\n"
6910 "master_link_status:%s\r\n"
6911 "master_last_io_seconds_ago:%d\r\n"
6912 ,server.masterhost,
6913 server.masterport,
6914 (server.replstate == REDIS_REPL_CONNECTED) ?
6915 "up" : "down",
6916 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6917 );
6918 }
6919 if (server.vm_enabled) {
6920 lockThreadedIO();
6921 info = sdscatprintf(info,
6922 "vm_conf_max_memory:%llu\r\n"
6923 "vm_conf_page_size:%llu\r\n"
6924 "vm_conf_pages:%llu\r\n"
6925 "vm_stats_used_pages:%llu\r\n"
6926 "vm_stats_swapped_objects:%llu\r\n"
6927 "vm_stats_swappin_count:%llu\r\n"
6928 "vm_stats_swappout_count:%llu\r\n"
6929 "vm_stats_io_newjobs_len:%lu\r\n"
6930 "vm_stats_io_processing_len:%lu\r\n"
6931 "vm_stats_io_processed_len:%lu\r\n"
6932 "vm_stats_io_active_threads:%lu\r\n"
6933 "vm_stats_blocked_clients:%lu\r\n"
6934 ,(unsigned long long) server.vm_max_memory,
6935 (unsigned long long) server.vm_page_size,
6936 (unsigned long long) server.vm_pages,
6937 (unsigned long long) server.vm_stats_used_pages,
6938 (unsigned long long) server.vm_stats_swapped_objects,
6939 (unsigned long long) server.vm_stats_swapins,
6940 (unsigned long long) server.vm_stats_swapouts,
6941 (unsigned long) listLength(server.io_newjobs),
6942 (unsigned long) listLength(server.io_processing),
6943 (unsigned long) listLength(server.io_processed),
6944 (unsigned long) server.io_active_threads,
6945 (unsigned long) server.vm_blocked_clients
6946 );
6947 unlockThreadedIO();
6948 }
6949 for (j = 0; j < server.dbnum; j++) {
6950 long long keys, vkeys;
6951
6952 keys = dictSize(server.db[j].dict);
6953 vkeys = dictSize(server.db[j].expires);
6954 if (keys || vkeys) {
6955 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6956 j, keys, vkeys);
6957 }
6958 }
6959 return info;
6960 }
6961
6962 static void infoCommand(redisClient *c) {
6963 sds info = genRedisInfoString();
6964 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6965 (unsigned long)sdslen(info)));
6966 addReplySds(c,info);
6967 addReply(c,shared.crlf);
6968 }
6969
6970 static void monitorCommand(redisClient *c) {
6971 /* ignore MONITOR if aleady slave or in monitor mode */
6972 if (c->flags & REDIS_SLAVE) return;
6973
6974 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6975 c->slaveseldb = 0;
6976 listAddNodeTail(server.monitors,c);
6977 addReply(c,shared.ok);
6978 }
6979
6980 /* ================================= Expire ================================= */
6981 static int removeExpire(redisDb *db, robj *key) {
6982 if (dictDelete(db->expires,key) == DICT_OK) {
6983 return 1;
6984 } else {
6985 return 0;
6986 }
6987 }
6988
6989 static int setExpire(redisDb *db, robj *key, time_t when) {
6990 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6991 return 0;
6992 } else {
6993 incrRefCount(key);
6994 return 1;
6995 }
6996 }
6997
6998 /* Return the expire time of the specified key, or -1 if no expire
6999 * is associated with this key (i.e. the key is non volatile) */
7000 static time_t getExpire(redisDb *db, robj *key) {
7001 dictEntry *de;
7002
7003 /* No expire? return ASAP */
7004 if (dictSize(db->expires) == 0 ||
7005 (de = dictFind(db->expires,key)) == NULL) return -1;
7006
7007 return (time_t) dictGetEntryVal(de);
7008 }
7009
7010 static int expireIfNeeded(redisDb *db, robj *key) {
7011 time_t when;
7012 dictEntry *de;
7013
7014 /* No expire? return ASAP */
7015 if (dictSize(db->expires) == 0 ||
7016 (de = dictFind(db->expires,key)) == NULL) return 0;
7017
7018 /* Lookup the expire */
7019 when = (time_t) dictGetEntryVal(de);
7020 if (time(NULL) <= when) return 0;
7021
7022 /* Delete the key */
7023 dictDelete(db->expires,key);
7024 server.stat_expiredkeys++;
7025 return dictDelete(db->dict,key) == DICT_OK;
7026 }
7027
7028 static int deleteIfVolatile(redisDb *db, robj *key) {
7029 dictEntry *de;
7030
7031 /* No expire? return ASAP */
7032 if (dictSize(db->expires) == 0 ||
7033 (de = dictFind(db->expires,key)) == NULL) return 0;
7034
7035 /* Delete the key */
7036 server.dirty++;
7037 server.stat_expiredkeys++;
7038 dictDelete(db->expires,key);
7039 return dictDelete(db->dict,key) == DICT_OK;
7040 }
7041
7042 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7043 dictEntry *de;
7044 time_t seconds;
7045
7046 if (getLongFromObject(c, param, &seconds) != REDIS_OK) return;
7047
7048 seconds -= offset;
7049
7050 de = dictFind(c->db->dict,key);
7051 if (de == NULL) {
7052 addReply(c,shared.czero);
7053 return;
7054 }
7055 if (seconds < 0) {
7056 if (deleteKey(c->db,key)) server.dirty++;
7057 addReply(c, shared.cone);
7058 return;
7059 } else {
7060 time_t when = time(NULL)+seconds;
7061 if (setExpire(c->db,key,when)) {
7062 addReply(c,shared.cone);
7063 server.dirty++;
7064 } else {
7065 addReply(c,shared.czero);
7066 }
7067 return;
7068 }
7069 }
7070
7071 static void expireCommand(redisClient *c) {
7072 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7073 }
7074
7075 static void expireatCommand(redisClient *c) {
7076 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7077 }
7078
7079 static void ttlCommand(redisClient *c) {
7080 time_t expire;
7081 int ttl = -1;
7082
7083 expire = getExpire(c->db,c->argv[1]);
7084 if (expire != -1) {
7085 ttl = (int) (expire-time(NULL));
7086 if (ttl < 0) ttl = -1;
7087 }
7088 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7089 }
7090
7091 /* ================================ MULTI/EXEC ============================== */
7092
7093 /* Client state initialization for MULTI/EXEC */
7094 static void initClientMultiState(redisClient *c) {
7095 c->mstate.commands = NULL;
7096 c->mstate.count = 0;
7097 }
7098
7099 /* Release all the resources associated with MULTI/EXEC state */
7100 static void freeClientMultiState(redisClient *c) {
7101 int j;
7102
7103 for (j = 0; j < c->mstate.count; j++) {
7104 int i;
7105 multiCmd *mc = c->mstate.commands+j;
7106
7107 for (i = 0; i < mc->argc; i++)
7108 decrRefCount(mc->argv[i]);
7109 zfree(mc->argv);
7110 }
7111 zfree(c->mstate.commands);
7112 }
7113
7114 /* Add a new command into the MULTI commands queue */
7115 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7116 multiCmd *mc;
7117 int j;
7118
7119 c->mstate.commands = zrealloc(c->mstate.commands,
7120 sizeof(multiCmd)*(c->mstate.count+1));
7121 mc = c->mstate.commands+c->mstate.count;
7122 mc->cmd = cmd;
7123 mc->argc = c->argc;
7124 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7125 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7126 for (j = 0; j < c->argc; j++)
7127 incrRefCount(mc->argv[j]);
7128 c->mstate.count++;
7129 }
7130
7131 static void multiCommand(redisClient *c) {
7132 c->flags |= REDIS_MULTI;
7133 addReply(c,shared.ok);
7134 }
7135
7136 static void discardCommand(redisClient *c) {
7137 if (!(c->flags & REDIS_MULTI)) {
7138 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7139 return;
7140 }
7141
7142 freeClientMultiState(c);
7143 initClientMultiState(c);
7144 c->flags &= (~REDIS_MULTI);
7145 addReply(c,shared.ok);
7146 }
7147
7148 static void execCommand(redisClient *c) {
7149 int j;
7150 robj **orig_argv;
7151 int orig_argc;
7152
7153 if (!(c->flags & REDIS_MULTI)) {
7154 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7155 return;
7156 }
7157
7158 orig_argv = c->argv;
7159 orig_argc = c->argc;
7160 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7161 for (j = 0; j < c->mstate.count; j++) {
7162 c->argc = c->mstate.commands[j].argc;
7163 c->argv = c->mstate.commands[j].argv;
7164 call(c,c->mstate.commands[j].cmd);
7165 }
7166 c->argv = orig_argv;
7167 c->argc = orig_argc;
7168 freeClientMultiState(c);
7169 initClientMultiState(c);
7170 c->flags &= (~REDIS_MULTI);
7171 }
7172
7173 /* =========================== Blocking Operations ========================= */
7174
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7203
7204 /* Set a client in blocking mode for the specified key, with the specified
7205 * timeout */
7206 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7207 dictEntry *de;
7208 list *l;
7209 int j;
7210
7211 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7212 c->blockingkeysnum = numkeys;
7213 c->blockingto = timeout;
7214 for (j = 0; j < numkeys; j++) {
7215 /* Add the key in the client structure, to map clients -> keys */
7216 c->blockingkeys[j] = keys[j];
7217 incrRefCount(keys[j]);
7218
7219 /* And in the other "side", to map keys -> clients */
7220 de = dictFind(c->db->blockingkeys,keys[j]);
7221 if (de == NULL) {
7222 int retval;
7223
7224 /* For every key we take a list of clients blocked for it */
7225 l = listCreate();
7226 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7227 incrRefCount(keys[j]);
7228 assert(retval == DICT_OK);
7229 } else {
7230 l = dictGetEntryVal(de);
7231 }
7232 listAddNodeTail(l,c);
7233 }
7234 /* Mark the client as a blocked client */
7235 c->flags |= REDIS_BLOCKED;
7236 server.blpop_blocked_clients++;
7237 }
7238
7239 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7240 static void unblockClientWaitingData(redisClient *c) {
7241 dictEntry *de;
7242 list *l;
7243 int j;
7244
7245 assert(c->blockingkeys != NULL);
7246 /* The client may wait for multiple keys, so unblock it for every key. */
7247 for (j = 0; j < c->blockingkeysnum; j++) {
7248 /* Remove this client from the list of clients waiting for this key. */
7249 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7250 assert(de != NULL);
7251 l = dictGetEntryVal(de);
7252 listDelNode(l,listSearchKey(l,c));
7253 /* If the list is empty we need to remove it to avoid wasting memory */
7254 if (listLength(l) == 0)
7255 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7256 decrRefCount(c->blockingkeys[j]);
7257 }
7258 /* Cleanup the client structure */
7259 zfree(c->blockingkeys);
7260 c->blockingkeys = NULL;
7261 c->flags &= (~REDIS_BLOCKED);
7262 server.blpop_blocked_clients--;
7263 /* We want to process data if there is some command waiting
7264 * in the input buffer. Note that this is safe even if
7265 * unblockClientWaitingData() gets called from freeClient() because
7266 * freeClient() will be smart enough to call this function
7267 * *after* c->querybuf was set to NULL. */
7268 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7269 }
7270
7271 /* This should be called from any function PUSHing into lists.
7272 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7273 * 'ele' is the element pushed.
7274 *
7275 * If the function returns 0 there was no client waiting for a list push
7276 * against this key.
7277 *
7278 * If the function returns 1 there was a client waiting for a list push
7279 * against this key, the element was passed to this client thus it's not
7280 * needed to actually add it to the list and the caller should return asap. */
7281 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7282 struct dictEntry *de;
7283 redisClient *receiver;
7284 list *l;
7285 listNode *ln;
7286
7287 de = dictFind(c->db->blockingkeys,key);
7288 if (de == NULL) return 0;
7289 l = dictGetEntryVal(de);
7290 ln = listFirst(l);
7291 assert(ln != NULL);
7292 receiver = ln->value;
7293
7294 addReplySds(receiver,sdsnew("*2\r\n"));
7295 addReplyBulk(receiver,key);
7296 addReplyBulk(receiver,ele);
7297 unblockClientWaitingData(receiver);
7298 return 1;
7299 }
7300
7301 /* Blocking RPOP/LPOP */
7302 static void blockingPopGenericCommand(redisClient *c, int where) {
7303 robj *o;
7304 time_t timeout;
7305 int j;
7306
7307 for (j = 1; j < c->argc-1; j++) {
7308 o = lookupKeyWrite(c->db,c->argv[j]);
7309 if (o != NULL) {
7310 if (o->type != REDIS_LIST) {
7311 addReply(c,shared.wrongtypeerr);
7312 return;
7313 } else {
7314 list *list = o->ptr;
7315 if (listLength(list) != 0) {
7316 /* If the list contains elements fall back to the usual
7317 * non-blocking POP operation */
7318 robj *argv[2], **orig_argv;
7319 int orig_argc;
7320
7321 /* We need to alter the command arguments before to call
7322 * popGenericCommand() as the command takes a single key. */
7323 orig_argv = c->argv;
7324 orig_argc = c->argc;
7325 argv[1] = c->argv[j];
7326 c->argv = argv;
7327 c->argc = 2;
7328
7329 /* Also the return value is different, we need to output
7330 * the multi bulk reply header and the key name. The
7331 * "real" command will add the last element (the value)
7332 * for us. If this souds like an hack to you it's just
7333 * because it is... */
7334 addReplySds(c,sdsnew("*2\r\n"));
7335 addReplyBulk(c,argv[1]);
7336 popGenericCommand(c,where);
7337
7338 /* Fix the client structure with the original stuff */
7339 c->argv = orig_argv;
7340 c->argc = orig_argc;
7341 return;
7342 }
7343 }
7344 }
7345 }
7346 /* If the list is empty or the key does not exists we must block */
7347 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7348 if (timeout > 0) timeout += time(NULL);
7349 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7350 }
7351
7352 static void blpopCommand(redisClient *c) {
7353 blockingPopGenericCommand(c,REDIS_HEAD);
7354 }
7355
7356 static void brpopCommand(redisClient *c) {
7357 blockingPopGenericCommand(c,REDIS_TAIL);
7358 }
7359
7360 /* =============================== Replication ============================= */
7361
7362 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7363 ssize_t nwritten, ret = size;
7364 time_t start = time(NULL);
7365
7366 timeout++;
7367 while(size) {
7368 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7369 nwritten = write(fd,ptr,size);
7370 if (nwritten == -1) return -1;
7371 ptr += nwritten;
7372 size -= nwritten;
7373 }
7374 if ((time(NULL)-start) > timeout) {
7375 errno = ETIMEDOUT;
7376 return -1;
7377 }
7378 }
7379 return ret;
7380 }
7381
7382 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7383 ssize_t nread, totread = 0;
7384 time_t start = time(NULL);
7385
7386 timeout++;
7387 while(size) {
7388 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7389 nread = read(fd,ptr,size);
7390 if (nread == -1) return -1;
7391 ptr += nread;
7392 size -= nread;
7393 totread += nread;
7394 }
7395 if ((time(NULL)-start) > timeout) {
7396 errno = ETIMEDOUT;
7397 return -1;
7398 }
7399 }
7400 return totread;
7401 }
7402
/* Read a line from 'fd', one byte at a time with syncRead(), into the
 * buffer 'ptr' of 'size' bytes. 'timeout' is applied to every single byte.
 *
 * The line ends at the first '\n', which is not stored; a '\r' right
 * before it is replaced by the terminator. The buffer is always null
 * terminated, and at most 'size'-1 bytes are stored: reading stops there
 * even if the end of the line was not found.
 *
 * Returns the number of bytes stored before the '\n' (a trailing '\r' is
 * counted). Returns -1 if syncRead() fails, or with errno set to EINVAL
 * if 'size' is not positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve the room for the terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* One byte less available: without this the loop would
             * never stop and a long line would overflow the buffer. */
            size--;
        }
    }
    return nread;
}
7423
7424 static void syncCommand(redisClient *c) {
7425 /* ignore SYNC if aleady slave or in monitor mode */
7426 if (c->flags & REDIS_SLAVE) return;
7427
7428 /* SYNC can't be issued when the server has pending data to send to
7429 * the client about already issued commands. We need a fresh reply
7430 * buffer registering the differences between the BGSAVE and the current
7431 * dataset, so that we can copy to other slaves if needed. */
7432 if (listLength(c->reply) != 0) {
7433 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7434 return;
7435 }
7436
7437 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7438 /* Here we need to check if there is a background saving operation
7439 * in progress, or if it is required to start one */
7440 if (server.bgsavechildpid != -1) {
7441 /* Ok a background save is in progress. Let's check if it is a good
7442 * one for replication, i.e. if there is another slave that is
7443 * registering differences since the server forked to save */
7444 redisClient *slave;
7445 listNode *ln;
7446 listIter li;
7447
7448 listRewind(server.slaves,&li);
7449 while((ln = listNext(&li))) {
7450 slave = ln->value;
7451 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7452 }
7453 if (ln) {
7454 /* Perfect, the server is already registering differences for
7455 * another slave. Set the right state, and copy the buffer. */
7456 listRelease(c->reply);
7457 c->reply = listDup(slave->reply);
7458 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7459 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7460 } else {
7461 /* No way, we need to wait for the next BGSAVE in order to
7462 * register differences */
7463 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7464 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7465 }
7466 } else {
7467 /* Ok we don't have a BGSAVE in progress, let's start one */
7468 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7469 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7470 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7471 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7472 return;
7473 }
7474 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7475 }
7476 c->repldbfd = -1;
7477 c->flags |= REDIS_SLAVE;
7478 c->slaveseldb = 0;
7479 listAddNodeTail(server.slaves,c);
7480 return;
7481 }
7482
7483 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7484 redisClient *slave = privdata;
7485 REDIS_NOTUSED(el);
7486 REDIS_NOTUSED(mask);
7487 char buf[REDIS_IOBUF_LEN];
7488 ssize_t nwritten, buflen;
7489
7490 if (slave->repldboff == 0) {
7491 /* Write the bulk write count before to transfer the DB. In theory here
7492 * we don't know how much room there is in the output buffer of the
7493 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7494 * operations) will never be smaller than the few bytes we need. */
7495 sds bulkcount;
7496
7497 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7498 slave->repldbsize);
7499 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7500 {
7501 sdsfree(bulkcount);
7502 freeClient(slave);
7503 return;
7504 }
7505 sdsfree(bulkcount);
7506 }
7507 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7508 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7509 if (buflen <= 0) {
7510 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7511 (buflen == 0) ? "premature EOF" : strerror(errno));
7512 freeClient(slave);
7513 return;
7514 }
7515 if ((nwritten = write(fd,buf,buflen)) == -1) {
7516 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7517 strerror(errno));
7518 freeClient(slave);
7519 return;
7520 }
7521 slave->repldboff += nwritten;
7522 if (slave->repldboff == slave->repldbsize) {
7523 close(slave->repldbfd);
7524 slave->repldbfd = -1;
7525 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7526 slave->replstate = REDIS_REPL_ONLINE;
7527 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7528 sendReplyToClient, slave) == AE_ERR) {
7529 freeClient(slave);
7530 return;
7531 }
7532 addReplySds(slave,sdsempty());
7533 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7534 }
7535 }
7536
7537 /* This function is called at the end of every backgrond saving.
7538 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7539 * otherwise REDIS_ERR is passed to the function.
7540 *
7541 * The goal of this function is to handle slaves waiting for a successful
7542 * background saving in order to perform non-blocking synchronization. */
7543 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7544 listNode *ln;
7545 int startbgsave = 0;
7546 listIter li;
7547
7548 listRewind(server.slaves,&li);
7549 while((ln = listNext(&li))) {
7550 redisClient *slave = ln->value;
7551
7552 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7553 startbgsave = 1;
7554 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7555 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7556 struct redis_stat buf;
7557
7558 if (bgsaveerr != REDIS_OK) {
7559 freeClient(slave);
7560 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7561 continue;
7562 }
7563 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7564 redis_fstat(slave->repldbfd,&buf) == -1) {
7565 freeClient(slave);
7566 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7567 continue;
7568 }
7569 slave->repldboff = 0;
7570 slave->repldbsize = buf.st_size;
7571 slave->replstate = REDIS_REPL_SEND_BULK;
7572 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7573 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7574 freeClient(slave);
7575 continue;
7576 }
7577 }
7578 }
7579 if (startbgsave) {
7580 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7581 listIter li;
7582
7583 listRewind(server.slaves,&li);
7584 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7585 while((ln = listNext(&li))) {
7586 redisClient *slave = ln->value;
7587
7588 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7589 freeClient(slave);
7590 }
7591 }
7592 }
7593 }
7594
7595 static int syncWithMaster(void) {
7596 char buf[1024], tmpfile[256], authcmd[1024];
7597 long dumpsize;
7598 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7599 int dfd, maxtries = 5;
7600
7601 if (fd == -1) {
7602 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7603 strerror(errno));
7604 return REDIS_ERR;
7605 }
7606
7607 /* AUTH with the master if required. */
7608 if(server.masterauth) {
7609 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7610 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7611 close(fd);
7612 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7613 strerror(errno));
7614 return REDIS_ERR;
7615 }
7616 /* Read the AUTH result. */
7617 if (syncReadLine(fd,buf,1024,3600) == -1) {
7618 close(fd);
7619 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7620 strerror(errno));
7621 return REDIS_ERR;
7622 }
7623 if (buf[0] != '+') {
7624 close(fd);
7625 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7626 return REDIS_ERR;
7627 }
7628 }
7629
7630 /* Issue the SYNC command */
7631 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7632 close(fd);
7633 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7634 strerror(errno));
7635 return REDIS_ERR;
7636 }
7637 /* Read the bulk write count */
7638 if (syncReadLine(fd,buf,1024,3600) == -1) {
7639 close(fd);
7640 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7641 strerror(errno));
7642 return REDIS_ERR;
7643 }
7644 if (buf[0] != '$') {
7645 close(fd);
7646 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7647 return REDIS_ERR;
7648 }
7649 dumpsize = strtol(buf+1,NULL,10);
7650 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7651 /* Read the bulk write data on a temp file */
7652 while(maxtries--) {
7653 snprintf(tmpfile,256,
7654 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7655 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7656 if (dfd != -1) break;
7657 sleep(1);
7658 }
7659 if (dfd == -1) {
7660 close(fd);
7661 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7662 return REDIS_ERR;
7663 }
7664 while(dumpsize) {
7665 int nread, nwritten;
7666
7667 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7668 if (nread == -1) {
7669 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7670 strerror(errno));
7671 close(fd);
7672 close(dfd);
7673 return REDIS_ERR;
7674 }
7675 nwritten = write(dfd,buf,nread);
7676 if (nwritten == -1) {
7677 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7678 close(fd);
7679 close(dfd);
7680 return REDIS_ERR;
7681 }
7682 dumpsize -= nread;
7683 }
7684 close(dfd);
7685 if (rename(tmpfile,server.dbfilename) == -1) {
7686 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7687 unlink(tmpfile);
7688 close(fd);
7689 return REDIS_ERR;
7690 }
7691 emptyDb();
7692 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7693 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7694 close(fd);
7695 return REDIS_ERR;
7696 }
7697 server.master = createClient(fd);
7698 server.master->flags |= REDIS_MASTER;
7699 server.master->authenticated = 1;
7700 server.replstate = REDIS_REPL_CONNECTED;
7701 return REDIS_OK;
7702 }
7703
7704 static void slaveofCommand(redisClient *c) {
7705 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7706 !strcasecmp(c->argv[2]->ptr,"one")) {
7707 if (server.masterhost) {
7708 sdsfree(server.masterhost);
7709 server.masterhost = NULL;
7710 if (server.master) freeClient(server.master);
7711 server.replstate = REDIS_REPL_NONE;
7712 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7713 }
7714 } else {
7715 sdsfree(server.masterhost);
7716 server.masterhost = sdsdup(c->argv[1]->ptr);
7717 server.masterport = atoi(c->argv[2]->ptr);
7718 if (server.master) freeClient(server.master);
7719 server.replstate = REDIS_REPL_CONNECT;
7720 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7721 server.masterhost, server.masterport);
7722 }
7723 addReply(c,shared.ok);
7724 }
7725
7726 /* ============================ Maxmemory directive ======================== */
7727
7728 /* Try to free one object form the pre-allocated objects free list.
7729 * This is useful under low mem conditions as by default we take 1 million
7730 * free objects allocated. On success REDIS_OK is returned, otherwise
7731 * REDIS_ERR. */
7732 static int tryFreeOneObjectFromFreelist(void) {
7733 robj *o;
7734
7735 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7736 if (listLength(server.objfreelist)) {
7737 listNode *head = listFirst(server.objfreelist);
7738 o = listNodeValue(head);
7739 listDelNode(server.objfreelist,head);
7740 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7741 zfree(o);
7742 return REDIS_OK;
7743 } else {
7744 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7745 return REDIS_ERR;
7746 }
7747 }
7748
7749 /* This function gets called when 'maxmemory' is set on the config file to limit
7750 * the max memory used by the server, and we are out of memory.
7751 * This function will try to, in order:
7752 *
7753 * - Free objects from the free list
7754 * - Try to remove keys with an EXPIRE set
7755 *
7756 * It is not possible to free enough memory to reach used-memory < maxmemory
7757 * the server will start refusing commands that will enlarge even more the
7758 * memory usage.
7759 */
7760 static void freeMemoryIfNeeded(void) {
7761 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7762 int j, k, freed = 0;
7763
7764 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7765 for (j = 0; j < server.dbnum; j++) {
7766 int minttl = -1;
7767 robj *minkey = NULL;
7768 struct dictEntry *de;
7769
7770 if (dictSize(server.db[j].expires)) {
7771 freed = 1;
7772 /* From a sample of three keys drop the one nearest to
7773 * the natural expire */
7774 for (k = 0; k < 3; k++) {
7775 time_t t;
7776
7777 de = dictGetRandomKey(server.db[j].expires);
7778 t = (time_t) dictGetEntryVal(de);
7779 if (minttl == -1 || t < minttl) {
7780 minkey = dictGetEntryKey(de);
7781 minttl = t;
7782 }
7783 }
7784 deleteKey(server.db+j,minkey);
7785 }
7786 }
7787 if (!freed) return; /* nothing to free... */
7788 }
7789 }
7790
7791 /* ============================== Append Only file ========================== */
7792
7793 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7794 sds buf = sdsempty();
7795 int j;
7796 ssize_t nwritten;
7797 time_t now;
7798 robj *tmpargv[3];
7799
7800 /* The DB this command was targetting is not the same as the last command
7801 * we appendend. To issue a SELECT command is needed. */
7802 if (dictid != server.appendseldb) {
7803 char seldb[64];
7804
7805 snprintf(seldb,sizeof(seldb),"%d",dictid);
7806 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7807 (unsigned long)strlen(seldb),seldb);
7808 server.appendseldb = dictid;
7809 }
7810
7811 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7812 * EXPIREs into EXPIREATs calls */
7813 if (cmd->proc == expireCommand) {
7814 long when;
7815
7816 tmpargv[0] = createStringObject("EXPIREAT",8);
7817 tmpargv[1] = argv[1];
7818 incrRefCount(argv[1]);
7819 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7820 tmpargv[2] = createObject(REDIS_STRING,
7821 sdscatprintf(sdsempty(),"%ld",when));
7822 argv = tmpargv;
7823 }
7824
7825 /* Append the actual command */
7826 buf = sdscatprintf(buf,"*%d\r\n",argc);
7827 for (j = 0; j < argc; j++) {
7828 robj *o = argv[j];
7829
7830 o = getDecodedObject(o);
7831 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7832 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7833 buf = sdscatlen(buf,"\r\n",2);
7834 decrRefCount(o);
7835 }
7836
7837 /* Free the objects from the modified argv for EXPIREAT */
7838 if (cmd->proc == expireCommand) {
7839 for (j = 0; j < 3; j++)
7840 decrRefCount(argv[j]);
7841 }
7842
7843 /* We want to perform a single write. This should be guaranteed atomic
7844 * at least if the filesystem we are writing is a real physical one.
7845 * While this will save us against the server being killed I don't think
7846 * there is much to do about the whole server stopping for power problems
7847 * or alike */
7848 nwritten = write(server.appendfd,buf,sdslen(buf));
7849 if (nwritten != (signed)sdslen(buf)) {
7850 /* Ooops, we are in troubles. The best thing to do for now is
7851 * to simply exit instead to give the illusion that everything is
7852 * working as expected. */
7853 if (nwritten == -1) {
7854 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7855 } else {
7856 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7857 }
7858 exit(1);
7859 }
7860 /* If a background append only file rewriting is in progress we want to
7861 * accumulate the differences between the child DB and the current one
7862 * in a buffer, so that when the child process will do its work we
7863 * can append the differences to the new append only file. */
7864 if (server.bgrewritechildpid != -1)
7865 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7866
7867 sdsfree(buf);
7868 now = time(NULL);
7869 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7870 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7871 now-server.lastfsync > 1))
7872 {
7873 fsync(server.appendfd); /* Let's try to get this data on the disk */
7874 server.lastfsync = now;
7875 }
7876 }
7877
7878 /* In Redis commands are always executed in the context of a client, so in
7879 * order to load the append only file we need to create a fake client. */
7880 static struct redisClient *createFakeClient(void) {
7881 struct redisClient *c = zmalloc(sizeof(*c));
7882
7883 selectDb(c,0);
7884 c->fd = -1;
7885 c->querybuf = sdsempty();
7886 c->argc = 0;
7887 c->argv = NULL;
7888 c->flags = 0;
7889 /* We set the fake client as a slave waiting for the synchronization
7890 * so that Redis will not try to send replies to this client. */
7891 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7892 c->reply = listCreate();
7893 listSetFreeMethod(c->reply,decrRefCount);
7894 listSetDupMethod(c->reply,dupClientReplyValue);
7895 return c;
7896 }
7897
7898 static void freeFakeClient(struct redisClient *c) {
7899 sdsfree(c->querybuf);
7900 listRelease(c->reply);
7901 zfree(c);
7902 }
7903
7904 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7905 * error (the append only file is zero-length) REDIS_ERR is returned. On
7906 * fatal error an error message is logged and the program exists. */
7907 int loadAppendOnlyFile(char *filename) {
7908 struct redisClient *fakeClient;
7909 FILE *fp = fopen(filename,"r");
7910 struct redis_stat sb;
7911 unsigned long long loadedkeys = 0;
7912
7913 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7914 return REDIS_ERR;
7915
7916 if (fp == NULL) {
7917 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7918 exit(1);
7919 }
7920
7921 fakeClient = createFakeClient();
7922 while(1) {
7923 int argc, j;
7924 unsigned long len;
7925 robj **argv;
7926 char buf[128];
7927 sds argsds;
7928 struct redisCommand *cmd;
7929
7930 if (fgets(buf,sizeof(buf),fp) == NULL) {
7931 if (feof(fp))
7932 break;
7933 else
7934 goto readerr;
7935 }
7936 if (buf[0] != '*') goto fmterr;
7937 argc = atoi(buf+1);
7938 argv = zmalloc(sizeof(robj*)*argc);
7939 for (j = 0; j < argc; j++) {
7940 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7941 if (buf[0] != '$') goto fmterr;
7942 len = strtol(buf+1,NULL,10);
7943 argsds = sdsnewlen(NULL,len);
7944 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7945 argv[j] = createObject(REDIS_STRING,argsds);
7946 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7947 }
7948
7949 /* Command lookup */
7950 cmd = lookupCommand(argv[0]->ptr);
7951 if (!cmd) {
7952 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7953 exit(1);
7954 }
7955 /* Try object encoding */
7956 if (cmd->flags & REDIS_CMD_BULK)
7957 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
7958 /* Run the command in the context of a fake client */
7959 fakeClient->argc = argc;
7960 fakeClient->argv = argv;
7961 cmd->proc(fakeClient);
7962 /* Discard the reply objects list from the fake client */
7963 while(listLength(fakeClient->reply))
7964 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7965 /* Clean up, ready for the next command */
7966 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7967 zfree(argv);
7968 /* Handle swapping while loading big datasets when VM is on */
7969 loadedkeys++;
7970 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7971 while (zmalloc_used_memory() > server.vm_max_memory) {
7972 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7973 }
7974 }
7975 }
7976 fclose(fp);
7977 freeFakeClient(fakeClient);
7978 return REDIS_OK;
7979
7980 readerr:
7981 if (feof(fp)) {
7982 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7983 } else {
7984 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7985 }
7986 exit(1);
7987 fmterr:
7988 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7989 exit(1);
7990 }
7991
7992 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7993 static int fwriteBulkObject(FILE *fp, robj *obj) {
7994 char buf[128];
7995 int decrrc = 0;
7996
7997 /* Avoid the incr/decr ref count business if possible to help
7998 * copy-on-write (we are often in a child process when this function
7999 * is called).
8000 * Also makes sure that key objects don't get incrRefCount-ed when VM
8001 * is enabled */
8002 if (obj->encoding != REDIS_ENCODING_RAW) {
8003 obj = getDecodedObject(obj);
8004 decrrc = 1;
8005 }
8006 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8007 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8008 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8009 goto err;
8010 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8011 if (decrrc) decrRefCount(obj);
8012 return 1;
8013 err:
8014 if (decrrc) decrRefCount(obj);
8015 return 0;
8016 }
8017
/* Write the binary-safe string 's' of 'len' bytes to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Nothing is read from 's' when 'len' is zero: only the "$0\r\n" header and
 * the final CRLF are emitted.
 *
 * Returns 1 on success, 0 as soon as one of the writes fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the matching conversion is %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8029
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already carries the trailing CRLF, which is why
     * two bytes are subtracted from the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8040
/* Write the long 'l', formatted in decimal, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already carries the trailing CRLF, which is why
     * two bytes are subtracted from the advertised count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8051
8052 /* Write a sequence of commands able to fully rebuild the dataset into
8053 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8054 static int rewriteAppendOnlyFile(char *filename) {
8055 dictIterator *di = NULL;
8056 dictEntry *de;
8057 FILE *fp;
8058 char tmpfile[256];
8059 int j;
8060 time_t now = time(NULL);
8061
8062 /* Note that we have to use a different temp name here compared to the
8063 * one used by rewriteAppendOnlyFileBackground() function. */
8064 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8065 fp = fopen(tmpfile,"w");
8066 if (!fp) {
8067 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8068 return REDIS_ERR;
8069 }
8070 for (j = 0; j < server.dbnum; j++) {
8071 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8072 redisDb *db = server.db+j;
8073 dict *d = db->dict;
8074 if (dictSize(d) == 0) continue;
8075 di = dictGetIterator(d);
8076 if (!di) {
8077 fclose(fp);
8078 return REDIS_ERR;
8079 }
8080
8081 /* SELECT the new DB */
8082 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8083 if (fwriteBulkLong(fp,j) == 0) goto werr;
8084
8085 /* Iterate this DB writing every entry */
8086 while((de = dictNext(di)) != NULL) {
8087 robj *key, *o;
8088 time_t expiretime;
8089 int swapped;
8090
8091 key = dictGetEntryKey(de);
8092 /* If the value for this key is swapped, load a preview in memory.
8093 * We use a "swapped" flag to remember if we need to free the
8094 * value object instead to just increment the ref count anyway
8095 * in order to avoid copy-on-write of pages if we are forked() */
8096 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8097 key->storage == REDIS_VM_SWAPPING) {
8098 o = dictGetEntryVal(de);
8099 swapped = 0;
8100 } else {
8101 o = vmPreviewObject(key);
8102 swapped = 1;
8103 }
8104 expiretime = getExpire(db,key);
8105
8106 /* Save the key and associated value */
8107 if (o->type == REDIS_STRING) {
8108 /* Emit a SET command */
8109 char cmd[]="*3\r\n$3\r\nSET\r\n";
8110 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8111 /* Key and value */
8112 if (fwriteBulkObject(fp,key) == 0) goto werr;
8113 if (fwriteBulkObject(fp,o) == 0) goto werr;
8114 } else if (o->type == REDIS_LIST) {
8115 /* Emit the RPUSHes needed to rebuild the list */
8116 list *list = o->ptr;
8117 listNode *ln;
8118 listIter li;
8119
8120 listRewind(list,&li);
8121 while((ln = listNext(&li))) {
8122 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8123 robj *eleobj = listNodeValue(ln);
8124
8125 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8126 if (fwriteBulkObject(fp,key) == 0) goto werr;
8127 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8128 }
8129 } else if (o->type == REDIS_SET) {
8130 /* Emit the SADDs needed to rebuild the set */
8131 dict *set = o->ptr;
8132 dictIterator *di = dictGetIterator(set);
8133 dictEntry *de;
8134
8135 while((de = dictNext(di)) != NULL) {
8136 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8137 robj *eleobj = dictGetEntryKey(de);
8138
8139 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8140 if (fwriteBulkObject(fp,key) == 0) goto werr;
8141 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8142 }
8143 dictReleaseIterator(di);
8144 } else if (o->type == REDIS_ZSET) {
8145 /* Emit the ZADDs needed to rebuild the sorted set */
8146 zset *zs = o->ptr;
8147 dictIterator *di = dictGetIterator(zs->dict);
8148 dictEntry *de;
8149
8150 while((de = dictNext(di)) != NULL) {
8151 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8152 robj *eleobj = dictGetEntryKey(de);
8153 double *score = dictGetEntryVal(de);
8154
8155 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8156 if (fwriteBulkObject(fp,key) == 0) goto werr;
8157 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8158 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8159 }
8160 dictReleaseIterator(di);
8161 } else if (o->type == REDIS_HASH) {
8162 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8163
8164 /* Emit the HSETs needed to rebuild the hash */
8165 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8166 unsigned char *p = zipmapRewind(o->ptr);
8167 unsigned char *field, *val;
8168 unsigned int flen, vlen;
8169
8170 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8171 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8172 if (fwriteBulkObject(fp,key) == 0) goto werr;
8173 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8174 return -1;
8175 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8176 return -1;
8177 }
8178 } else {
8179 dictIterator *di = dictGetIterator(o->ptr);
8180 dictEntry *de;
8181
8182 while((de = dictNext(di)) != NULL) {
8183 robj *field = dictGetEntryKey(de);
8184 robj *val = dictGetEntryVal(de);
8185
8186 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8187 if (fwriteBulkObject(fp,key) == 0) goto werr;
8188 if (fwriteBulkObject(fp,field) == -1) return -1;
8189 if (fwriteBulkObject(fp,val) == -1) return -1;
8190 }
8191 dictReleaseIterator(di);
8192 }
8193 } else {
8194 redisAssert(0);
8195 }
8196 /* Save the expire time */
8197 if (expiretime != -1) {
8198 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8199 /* If this key is already expired skip it */
8200 if (expiretime < now) continue;
8201 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8202 if (fwriteBulkObject(fp,key) == 0) goto werr;
8203 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8204 }
8205 if (swapped) decrRefCount(o);
8206 }
8207 dictReleaseIterator(di);
8208 }
8209
8210 /* Make sure data will not remain on the OS's output buffers */
8211 fflush(fp);
8212 fsync(fileno(fp));
8213 fclose(fp);
8214
8215 /* Use RENAME to make sure the DB file is changed atomically only
8216 * if the generate DB file is ok. */
8217 if (rename(tmpfile,filename) == -1) {
8218 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8219 unlink(tmpfile);
8220 return REDIS_ERR;
8221 }
8222 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8223 return REDIS_OK;
8224
8225 werr:
8226 fclose(fp);
8227 unlink(tmpfile);
8228 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8229 if (di) dictReleaseIterator(di);
8230 return REDIS_ERR;
8231 }
8232
8233 /* This is how rewriting of the append only file in background works:
8234 *
8235 * 1) The user calls BGREWRITEAOF
8236 * 2) Redis calls this function, that forks():
8237 * 2a) the child rewrite the append only file in a temp file.
8238 * 2b) the parent accumulates differences in server.bgrewritebuf.
8239 * 3) When the child finished '2a' exists.
8240 * 4) The parent will trap the exit code, if it's OK, will append the
8241 * data accumulated into server.bgrewritebuf into the temp file, and
8242 * finally will rename(2) the temp file in the actual file name.
8243 * The the new file is reopened as the new append only file. Profit!
8244 */
8245 static int rewriteAppendOnlyFileBackground(void) {
8246 pid_t childpid;
8247
8248 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8249 if (server.vm_enabled) waitEmptyIOJobsQueue();
8250 if ((childpid = fork()) == 0) {
8251 /* Child */
8252 char tmpfile[256];
8253
8254 if (server.vm_enabled) vmReopenSwapFile();
8255 close(server.fd);
8256 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8257 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8258 _exit(0);
8259 } else {
8260 _exit(1);
8261 }
8262 } else {
8263 /* Parent */
8264 if (childpid == -1) {
8265 redisLog(REDIS_WARNING,
8266 "Can't rewrite append only file in background: fork: %s",
8267 strerror(errno));
8268 return REDIS_ERR;
8269 }
8270 redisLog(REDIS_NOTICE,
8271 "Background append only file rewriting started by pid %d",childpid);
8272 server.bgrewritechildpid = childpid;
8273 updateDictResizePolicy();
8274 /* We set appendseldb to -1 in order to force the next call to the
8275 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8276 * accumulated by the parent into server.bgrewritebuf will start
8277 * with a SELECT statement and it will be safe to merge. */
8278 server.appendseldb = -1;
8279 return REDIS_OK;
8280 }
8281 return REDIS_OK; /* unreached */
8282 }
8283
8284 static void bgrewriteaofCommand(redisClient *c) {
8285 if (server.bgrewritechildpid != -1) {
8286 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8287 return;
8288 }
8289 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8290 char *status = "+Background append only file rewriting started\r\n";
8291 addReplySds(c,sdsnew(status));
8292 } else {
8293 addReply(c,shared.err);
8294 }
8295 }
8296
/* Remove the temp file used by the background append only file rewrite
 * performed by the child with pid 'childpid'. The result of unlink(2) is
 * not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8303
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8324
8325 /* =================== Virtual Memory - Blocking Side ====================== */
8326
8327 /* substitute the first occurrence of '%p' with the process pid in the
8328 * swap file name. */
8329 static void expandVmSwapFilename(void) {
8330 char *p = strstr(server.vm_swap_file,"%p");
8331 sds new;
8332
8333 if (!p) return;
8334 new = sdsempty();
8335 *p = '\0';
8336 new = sdscat(new,server.vm_swap_file);
8337 new = sdscatprintf(new,"%ld",(long) getpid());
8338 new = sdscat(new,p+2);
8339 zfree(server.vm_swap_file);
8340 server.vm_swap_file = new;
8341 }
8342
8343 static void vmInit(void) {
8344 off_t totsize;
8345 int pipefds[2];
8346 size_t stacksize;
8347
8348 if (server.vm_max_threads != 0)
8349 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8350
8351 expandVmSwapFilename();
8352 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8353 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8354 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8355 }
8356 if (server.vm_fp == NULL) {
8357 redisLog(REDIS_WARNING,
8358 "Impossible to open the swap file: %s. Exiting.",
8359 strerror(errno));
8360 exit(1);
8361 }
8362 server.vm_fd = fileno(server.vm_fp);
8363 server.vm_next_page = 0;
8364 server.vm_near_pages = 0;
8365 server.vm_stats_used_pages = 0;
8366 server.vm_stats_swapped_objects = 0;
8367 server.vm_stats_swapouts = 0;
8368 server.vm_stats_swapins = 0;
8369 totsize = server.vm_pages*server.vm_page_size;
8370 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8371 if (ftruncate(server.vm_fd,totsize) == -1) {
8372 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8373 strerror(errno));
8374 exit(1);
8375 } else {
8376 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8377 }
8378 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8379 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8380 (long long) (server.vm_pages+7)/8, server.vm_pages);
8381 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8382
8383 /* Initialize threaded I/O (used by Virtual Memory) */
8384 server.io_newjobs = listCreate();
8385 server.io_processing = listCreate();
8386 server.io_processed = listCreate();
8387 server.io_ready_clients = listCreate();
8388 pthread_mutex_init(&server.io_mutex,NULL);
8389 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8390 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8391 server.io_active_threads = 0;
8392 if (pipe(pipefds) == -1) {
8393 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8394 ,strerror(errno));
8395 exit(1);
8396 }
8397 server.io_ready_pipe_read = pipefds[0];
8398 server.io_ready_pipe_write = pipefds[1];
8399 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8400 /* LZF requires a lot of stack */
8401 pthread_attr_init(&server.io_threads_attr);
8402 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8403 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8404 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8405 /* Listen for events in the threaded I/O pipe */
8406 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8407 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8408 oom("creating file event");
8409 }
8410
8411 /* Mark the page as used */
8412 static void vmMarkPageUsed(off_t page) {
8413 off_t byte = page/8;
8414 int bit = page&7;
8415 redisAssert(vmFreePage(page) == 1);
8416 server.vm_bitmap[byte] |= 1<<bit;
8417 }
8418
8419 /* Mark N contiguous pages as used, with 'page' being the first. */
8420 static void vmMarkPagesUsed(off_t page, off_t count) {
8421 off_t j;
8422
8423 for (j = 0; j < count; j++)
8424 vmMarkPageUsed(page+j);
8425 server.vm_stats_used_pages += count;
8426 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8427 (long long)count, (long long)page);
8428 }
8429
8430 /* Mark the page as free */
8431 static void vmMarkPageFree(off_t page) {
8432 off_t byte = page/8;
8433 int bit = page&7;
8434 redisAssert(vmFreePage(page) == 0);
8435 server.vm_bitmap[byte] &= ~(1<<bit);
8436 }
8437
8438 /* Mark N contiguous pages as free, with 'page' being the first. */
8439 static void vmMarkPagesFree(off_t page, off_t count) {
8440 off_t j;
8441
8442 for (j = 0; j < count; j++)
8443 vmMarkPageFree(page+j);
8444 server.vm_stats_used_pages -= count;
8445 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8446 (long long)count, (long long)page);
8447 }
8448
8449 /* Test if the page is free */
8450 static int vmFreePage(off_t page) {
8451 off_t byte = page/8;
8452 int bit = page&7;
8453 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8454 }
8455
8456 /* Find N contiguous free pages storing the first page of the cluster in *first.
8457 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8458 * REDIS_ERR is returned.
8459 *
8460 * This function uses a simple algorithm: we try to allocate
8461 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8462 * again from the start of the swap file searching for free spaces.
8463 *
8464 * If it looks pretty clear that there are no free pages near our offset
8465 * we try to find less populated places doing a forward jump of
8466 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8467 * without hurry, and then we jump again and so forth...
8468 *
8469 * This function can be improved using a free list to avoid to guess
8470 * too much, since we could collect data about freed pages.
8471 *
8472 * note: I implemented this function just after watching an episode of
8473 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8474 */
8475 static int vmFindContiguousPages(off_t *first, off_t n) {
8476 off_t base, offset = 0, since_jump = 0, numfree = 0;
8477
8478 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8479 server.vm_near_pages = 0;
8480 server.vm_next_page = 0;
8481 }
8482 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8483 base = server.vm_next_page;
8484
8485 while(offset < server.vm_pages) {
8486 off_t this = base+offset;
8487
8488 /* If we overflow, restart from page zero */
8489 if (this >= server.vm_pages) {
8490 this -= server.vm_pages;
8491 if (this == 0) {
8492 /* Just overflowed, what we found on tail is no longer
8493 * interesting, as it's no longer contiguous. */
8494 numfree = 0;
8495 }
8496 }
8497 if (vmFreePage(this)) {
8498 /* This is a free page */
8499 numfree++;
8500 /* Already got N free pages? Return to the caller, with success */
8501 if (numfree == n) {
8502 *first = this-(n-1);
8503 server.vm_next_page = this+1;
8504 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8505 return REDIS_OK;
8506 }
8507 } else {
8508 /* The current one is not a free page */
8509 numfree = 0;
8510 }
8511
8512 /* Fast-forward if the current page is not free and we already
8513 * searched enough near this place. */
8514 since_jump++;
8515 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8516 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8517 since_jump = 0;
8518 /* Note that even if we rewind after the jump, we are don't need
8519 * to make sure numfree is set to zero as we only jump *if* it
8520 * is set to zero. */
8521 } else {
8522 /* Otherwise just check the next page */
8523 offset++;
8524 }
8525 }
8526 return REDIS_ERR;
8527 }
8528
8529 /* Write the specified object at the specified page of the swap file */
8530 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8531 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8532 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8533 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8534 redisLog(REDIS_WARNING,
8535 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8536 strerror(errno));
8537 return REDIS_ERR;
8538 }
8539 rdbSaveObject(server.vm_fp,o);
8540 fflush(server.vm_fp);
8541 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8542 return REDIS_OK;
8543 }
8544
8545 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8546 * needed to later retrieve the object into the key object.
8547 * If we can't find enough contiguous empty pages to swap the object on disk
8548 * REDIS_ERR is returned. */
8549 static int vmSwapObjectBlocking(robj *key, robj *val) {
8550 off_t pages = rdbSavedObjectPages(val,NULL);
8551 off_t page;
8552
8553 assert(key->storage == REDIS_VM_MEMORY);
8554 assert(key->refcount == 1);
8555 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8556 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8557 key->vm.page = page;
8558 key->vm.usedpages = pages;
8559 key->storage = REDIS_VM_SWAPPED;
8560 key->vtype = val->type;
8561 decrRefCount(val); /* Deallocate the object from memory. */
8562 vmMarkPagesUsed(page,pages);
8563 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8564 (unsigned char*) key->ptr,
8565 (unsigned long long) page, (unsigned long long) pages);
8566 server.vm_stats_swapped_objects++;
8567 server.vm_stats_swapouts++;
8568 return REDIS_OK;
8569 }
8570
8571 static robj *vmReadObjectFromSwap(off_t page, int type) {
8572 robj *o;
8573
8574 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8575 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8576 redisLog(REDIS_WARNING,
8577 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8578 strerror(errno));
8579 _exit(1);
8580 }
8581 o = rdbLoadObject(type,server.vm_fp);
8582 if (o == NULL) {
8583 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8584 _exit(1);
8585 }
8586 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8587 return o;
8588 }
8589
8590 /* Load the value object relative to the 'key' object from swap to memory.
8591 * The newly allocated object is returned.
8592 *
8593 * If preview is true the unserialized object is returned to the caller but
8594 * no changes are made to the key object, nor the pages are marked as freed */
8595 static robj *vmGenericLoadObject(robj *key, int preview) {
8596 robj *val;
8597
8598 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8599 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8600 if (!preview) {
8601 key->storage = REDIS_VM_MEMORY;
8602 key->vm.atime = server.unixtime;
8603 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8604 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8605 (unsigned char*) key->ptr);
8606 server.vm_stats_swapped_objects--;
8607 } else {
8608 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8609 (unsigned char*) key->ptr);
8610 }
8611 server.vm_stats_swapins++;
8612 return val;
8613 }
8614
8615 /* Plain object loading, from swap to memory */
8616 static robj *vmLoadObject(robj *key) {
8617 /* If we are loading the object in background, stop it, we
8618 * need to load this object synchronously ASAP. */
8619 if (key->storage == REDIS_VM_LOADING)
8620 vmCancelThreadedIOJob(key);
8621 return vmGenericLoadObject(key,0);
8622 }
8623
8624 /* Just load the value on disk, without to modify the key.
8625 * This is useful when we want to perform some operation on the value
8626 * without to really bring it from swap to memory, like while saving the
8627 * dataset or rewriting the append only log. */
8628 static robj *vmPreviewObject(robj *key) {
8629 return vmGenericLoadObject(key,1);
8630 }
8631
8632 /* How a good candidate is this object for swapping?
8633 * The better candidate it is, the greater the returned value.
8634 *
8635 * Currently we try to perform a fast estimation of the object size in
8636 * memory, and combine it with aging informations.
8637 *
8638 * Basically swappability = idle-time * log(estimated size)
8639 *
8640 * Bigger objects are preferred over smaller objects, but not
8641 * proportionally, this is why we use the logarithm. This algorithm is
8642 * just a first try and will probably be tuned later. */
8643 static double computeObjectSwappability(robj *o) {
8644 time_t age = server.unixtime - o->vm.atime;
8645 long asize = 0;
8646 list *l;
8647 dict *d;
8648 struct dictEntry *de;
8649 int z;
8650
8651 if (age <= 0) return 0;
8652 switch(o->type) {
8653 case REDIS_STRING:
8654 if (o->encoding != REDIS_ENCODING_RAW) {
8655 asize = sizeof(*o);
8656 } else {
8657 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8658 }
8659 break;
8660 case REDIS_LIST:
8661 l = o->ptr;
8662 listNode *ln = listFirst(l);
8663
8664 asize = sizeof(list);
8665 if (ln) {
8666 robj *ele = ln->value;
8667 long elesize;
8668
8669 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8670 (sizeof(*o)+sdslen(ele->ptr)) :
8671 sizeof(*o);
8672 asize += (sizeof(listNode)+elesize)*listLength(l);
8673 }
8674 break;
8675 case REDIS_SET:
8676 case REDIS_ZSET:
8677 z = (o->type == REDIS_ZSET);
8678 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8679
8680 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8681 if (z) asize += sizeof(zset)-sizeof(dict);
8682 if (dictSize(d)) {
8683 long elesize;
8684 robj *ele;
8685
8686 de = dictGetRandomKey(d);
8687 ele = dictGetEntryKey(de);
8688 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8689 (sizeof(*o)+sdslen(ele->ptr)) :
8690 sizeof(*o);
8691 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8692 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8693 }
8694 break;
8695 case REDIS_HASH:
8696 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8697 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8698 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8699 unsigned int klen, vlen;
8700 unsigned char *key, *val;
8701
8702 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8703 klen = 0;
8704 vlen = 0;
8705 }
8706 asize = len*(klen+vlen+3);
8707 } else if (o->encoding == REDIS_ENCODING_HT) {
8708 d = o->ptr;
8709 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8710 if (dictSize(d)) {
8711 long elesize;
8712 robj *ele;
8713
8714 de = dictGetRandomKey(d);
8715 ele = dictGetEntryKey(de);
8716 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8717 (sizeof(*o)+sdslen(ele->ptr)) :
8718 sizeof(*o);
8719 ele = dictGetEntryVal(de);
8720 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8721 (sizeof(*o)+sdslen(ele->ptr)) :
8722 sizeof(*o);
8723 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8724 }
8725 }
8726 break;
8727 }
8728 return (double)age*log(1+asize);
8729 }
8730
8731 /* Try to swap an object that's a good candidate for swapping.
8732 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8733 * to swap any object at all.
8734 *
8735 * If 'usethreaded' is true, Redis will try to swap the object in background
8736 * using I/O threads. */
8737 static int vmSwapOneObject(int usethreads) {
8738 int j, i;
8739 struct dictEntry *best = NULL;
8740 double best_swappability = 0;
8741 redisDb *best_db = NULL;
8742 robj *key, *val;
8743
8744 for (j = 0; j < server.dbnum; j++) {
8745 redisDb *db = server.db+j;
8746 /* Why maxtries is set to 100?
8747 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8748 * are swappable objects */
8749 int maxtries = 100;
8750
8751 if (dictSize(db->dict) == 0) continue;
8752 for (i = 0; i < 5; i++) {
8753 dictEntry *de;
8754 double swappability;
8755
8756 if (maxtries) maxtries--;
8757 de = dictGetRandomKey(db->dict);
8758 key = dictGetEntryKey(de);
8759 val = dictGetEntryVal(de);
8760 /* Only swap objects that are currently in memory.
8761 *
8762 * Also don't swap shared objects if threaded VM is on, as we
8763 * try to ensure that the main thread does not touch the
8764 * object while the I/O thread is using it, but we can't
8765 * control other keys without adding additional mutex. */
8766 if (key->storage != REDIS_VM_MEMORY ||
8767 (server.vm_max_threads != 0 && val->refcount != 1)) {
8768 if (maxtries) i--; /* don't count this try */
8769 continue;
8770 }
8771 swappability = computeObjectSwappability(val);
8772 if (!best || swappability > best_swappability) {
8773 best = de;
8774 best_swappability = swappability;
8775 best_db = db;
8776 }
8777 }
8778 }
8779 if (best == NULL) return REDIS_ERR;
8780 key = dictGetEntryKey(best);
8781 val = dictGetEntryVal(best);
8782
8783 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8784 key->ptr, best_swappability);
8785
8786 /* Unshare the key if needed */
8787 if (key->refcount > 1) {
8788 robj *newkey = dupStringObject(key);
8789 decrRefCount(key);
8790 key = dictGetEntryKey(best) = newkey;
8791 }
8792 /* Swap it */
8793 if (usethreads) {
8794 vmSwapObjectThreaded(key,val,best_db);
8795 return REDIS_OK;
8796 } else {
8797 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8798 dictGetEntryVal(best) = NULL;
8799 return REDIS_OK;
8800 } else {
8801 return REDIS_ERR;
8802 }
8803 }
8804 }
8805
/* Swap out one object synchronously. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8809
/* Queue the swap of one object as a threaded I/O job. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8813
8814 /* Return true if it's safe to swap out objects in a given moment.
8815 * Basically we don't want to swap objects out while there is a BGSAVE
8816 * or a BGAEOREWRITE running in backgroud. */
8817 static int vmCanSwapOut(void) {
8818 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8819 }
8820
8821 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8822 * and was deleted. Otherwise 0 is returned. */
8823 static int deleteIfSwapped(redisDb *db, robj *key) {
8824 dictEntry *de;
8825 robj *foundkey;
8826
8827 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8828 foundkey = dictGetEntryKey(de);
8829 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8830 deleteKey(db,key);
8831 return 1;
8832 }
8833
8834 /* =================== Virtual Memory - Threaded I/O ======================= */
8835
8836 static void freeIOJob(iojob *j) {
8837 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8838 j->type == REDIS_IOJOB_DO_SWAP ||
8839 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8840 decrRefCount(j->val);
8841 /* We don't decrRefCount the j->key field as we did't incremented
8842 * the count creating IO Jobs. This is because the key field here is
8843 * just used as an indentifier and if a key is removed the Job should
8844 * never be touched again. */
8845 zfree(j);
8846 }
8847
8848 /* Every time a thread finished a Job, it writes a byte into the write side
8849 * of an unix pipe in order to "awake" the main thread, and this function
8850 * is called. */
8851 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8852 int mask)
8853 {
8854 char buf[1];
8855 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8856 REDIS_NOTUSED(el);
8857 REDIS_NOTUSED(mask);
8858 REDIS_NOTUSED(privdata);
8859
8860 /* For every byte we read in the read side of the pipe, there is one
8861 * I/O job completed to process. */
8862 while((retval = read(fd,buf,1)) == 1) {
8863 iojob *j;
8864 listNode *ln;
8865 robj *key;
8866 struct dictEntry *de;
8867
8868 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8869
8870 /* Get the processed element (the oldest one) */
8871 lockThreadedIO();
8872 assert(listLength(server.io_processed) != 0);
8873 if (toprocess == -1) {
8874 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8875 if (toprocess <= 0) toprocess = 1;
8876 }
8877 ln = listFirst(server.io_processed);
8878 j = ln->value;
8879 listDelNode(server.io_processed,ln);
8880 unlockThreadedIO();
8881 /* If this job is marked as canceled, just ignore it */
8882 if (j->canceled) {
8883 freeIOJob(j);
8884 continue;
8885 }
8886 /* Post process it in the main thread, as there are things we
8887 * can do just here to avoid race conditions and/or invasive locks */
8888 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8889 de = dictFind(j->db->dict,j->key);
8890 assert(de != NULL);
8891 key = dictGetEntryKey(de);
8892 if (j->type == REDIS_IOJOB_LOAD) {
8893 redisDb *db;
8894
8895 /* Key loaded, bring it at home */
8896 key->storage = REDIS_VM_MEMORY;
8897 key->vm.atime = server.unixtime;
8898 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8899 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8900 (unsigned char*) key->ptr);
8901 server.vm_stats_swapped_objects--;
8902 server.vm_stats_swapins++;
8903 dictGetEntryVal(de) = j->val;
8904 incrRefCount(j->val);
8905 db = j->db;
8906 freeIOJob(j);
8907 /* Handle clients waiting for this key to be loaded. */
8908 handleClientsBlockedOnSwappedKey(db,key);
8909 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8910 /* Now we know the amount of pages required to swap this object.
8911 * Let's find some space for it, and queue this task again
8912 * rebranded as REDIS_IOJOB_DO_SWAP. */
8913 if (!vmCanSwapOut() ||
8914 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8915 {
8916 /* Ooops... no space or we can't swap as there is
8917 * a fork()ed Redis trying to save stuff on disk. */
8918 freeIOJob(j);
8919 key->storage = REDIS_VM_MEMORY; /* undo operation */
8920 } else {
8921 /* Note that we need to mark this pages as used now,
8922 * if the job will be canceled, we'll mark them as freed
8923 * again. */
8924 vmMarkPagesUsed(j->page,j->pages);
8925 j->type = REDIS_IOJOB_DO_SWAP;
8926 lockThreadedIO();
8927 queueIOJob(j);
8928 unlockThreadedIO();
8929 }
8930 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8931 robj *val;
8932
8933 /* Key swapped. We can finally free some memory. */
8934 if (key->storage != REDIS_VM_SWAPPING) {
8935 printf("key->storage: %d\n",key->storage);
8936 printf("key->name: %s\n",(char*)key->ptr);
8937 printf("key->refcount: %d\n",key->refcount);
8938 printf("val: %p\n",(void*)j->val);
8939 printf("val->type: %d\n",j->val->type);
8940 printf("val->ptr: %s\n",(char*)j->val->ptr);
8941 }
8942 redisAssert(key->storage == REDIS_VM_SWAPPING);
8943 val = dictGetEntryVal(de);
8944 key->vm.page = j->page;
8945 key->vm.usedpages = j->pages;
8946 key->storage = REDIS_VM_SWAPPED;
8947 key->vtype = j->val->type;
8948 decrRefCount(val); /* Deallocate the object from memory. */
8949 dictGetEntryVal(de) = NULL;
8950 redisLog(REDIS_DEBUG,
8951 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8952 (unsigned char*) key->ptr,
8953 (unsigned long long) j->page, (unsigned long long) j->pages);
8954 server.vm_stats_swapped_objects++;
8955 server.vm_stats_swapouts++;
8956 freeIOJob(j);
8957 /* Put a few more swap requests in queue if we are still
8958 * out of memory */
8959 if (trytoswap && vmCanSwapOut() &&
8960 zmalloc_used_memory() > server.vm_max_memory)
8961 {
8962 int more = 1;
8963 while(more) {
8964 lockThreadedIO();
8965 more = listLength(server.io_newjobs) <
8966 (unsigned) server.vm_max_threads;
8967 unlockThreadedIO();
8968 /* Don't waste CPU time if swappable objects are rare. */
8969 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8970 trytoswap = 0;
8971 break;
8972 }
8973 }
8974 }
8975 }
8976 processed++;
8977 if (processed == toprocess) return;
8978 }
8979 if (retval < 0 && errno != EAGAIN) {
8980 redisLog(REDIS_WARNING,
8981 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8982 strerror(errno));
8983 }
8984 }
8985
8986 static void lockThreadedIO(void) {
8987 pthread_mutex_lock(&server.io_mutex);
8988 }
8989
8990 static void unlockThreadedIO(void) {
8991 pthread_mutex_unlock(&server.io_mutex);
8992 }
8993
8994 /* Remove the specified object from the threaded I/O queue if still not
8995 * processed, otherwise make sure to flag it as canceled. */
8996 static void vmCancelThreadedIOJob(robj *o) {
8997 list *lists[3] = {
8998 server.io_newjobs, /* 0 */
8999 server.io_processing, /* 1 */
9000 server.io_processed /* 2 */
9001 };
9002 int i;
9003
9004 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9005 again:
9006 lockThreadedIO();
9007 /* Search for a matching key in one of the queues */
9008 for (i = 0; i < 3; i++) {
9009 listNode *ln;
9010 listIter li;
9011
9012 listRewind(lists[i],&li);
9013 while ((ln = listNext(&li)) != NULL) {
9014 iojob *job = ln->value;
9015
9016 if (job->canceled) continue; /* Skip this, already canceled. */
9017 if (job->key == o) {
9018 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9019 (void*)job, (char*)o->ptr, job->type, i);
9020 /* Mark the pages as free since the swap didn't happened
9021 * or happened but is now discarded. */
9022 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9023 vmMarkPagesFree(job->page,job->pages);
9024 /* Cancel the job. It depends on the list the job is
9025 * living in. */
9026 switch(i) {
9027 case 0: /* io_newjobs */
9028 /* If the job was yet not processed the best thing to do
9029 * is to remove it from the queue at all */
9030 freeIOJob(job);
9031 listDelNode(lists[i],ln);
9032 break;
9033 case 1: /* io_processing */
9034 /* Oh Shi- the thread is messing with the Job:
9035 *
9036 * Probably it's accessing the object if this is a
9037 * PREPARE_SWAP or DO_SWAP job.
9038 * If it's a LOAD job it may be reading from disk and
9039 * if we don't wait for the job to terminate before to
9040 * cancel it, maybe in a few microseconds data can be
9041 * corrupted in this pages. So the short story is:
9042 *
9043 * Better to wait for the job to move into the
9044 * next queue (processed)... */
9045
9046 /* We try again and again until the job is completed. */
9047 unlockThreadedIO();
9048 /* But let's wait some time for the I/O thread
9049 * to finish with this job. After all this condition
9050 * should be very rare. */
9051 usleep(1);
9052 goto again;
9053 case 2: /* io_processed */
9054 /* The job was already processed, that's easy...
9055 * just mark it as canceled so that we'll ignore it
9056 * when processing completed jobs. */
9057 job->canceled = 1;
9058 break;
9059 }
9060 /* Finally we have to adjust the storage type of the object
9061 * in order to "UNDO" the operaiton. */
9062 if (o->storage == REDIS_VM_LOADING)
9063 o->storage = REDIS_VM_SWAPPED;
9064 else if (o->storage == REDIS_VM_SWAPPING)
9065 o->storage = REDIS_VM_MEMORY;
9066 unlockThreadedIO();
9067 return;
9068 }
9069 }
9070 }
9071 unlockThreadedIO();
9072 assert(1 != 1); /* We should never reach this */
9073 }
9074
9075 static void *IOThreadEntryPoint(void *arg) {
9076 iojob *j;
9077 listNode *ln;
9078 REDIS_NOTUSED(arg);
9079
9080 pthread_detach(pthread_self());
9081 while(1) {
9082 /* Get a new job to process */
9083 lockThreadedIO();
9084 if (listLength(server.io_newjobs) == 0) {
9085 /* No new jobs in queue, exit. */
9086 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9087 (long) pthread_self());
9088 server.io_active_threads--;
9089 unlockThreadedIO();
9090 return NULL;
9091 }
9092 ln = listFirst(server.io_newjobs);
9093 j = ln->value;
9094 listDelNode(server.io_newjobs,ln);
9095 /* Add the job in the processing queue */
9096 j->thread = pthread_self();
9097 listAddNodeTail(server.io_processing,j);
9098 ln = listLast(server.io_processing); /* We use ln later to remove it */
9099 unlockThreadedIO();
9100 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9101 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9102
9103 /* Process the Job */
9104 if (j->type == REDIS_IOJOB_LOAD) {
9105 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9106 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9107 FILE *fp = fopen("/dev/null","w+");
9108 j->pages = rdbSavedObjectPages(j->val,fp);
9109 fclose(fp);
9110 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9111 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9112 j->canceled = 1;
9113 }
9114
9115 /* Done: insert the job into the processed queue */
9116 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9117 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9118 lockThreadedIO();
9119 listDelNode(server.io_processing,ln);
9120 listAddNodeTail(server.io_processed,j);
9121 unlockThreadedIO();
9122
9123 /* Signal the main thread there is new stuff to process */
9124 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9125 }
9126 return NULL; /* never reached */
9127 }
9128
9129 static void spawnIOThread(void) {
9130 pthread_t thread;
9131 sigset_t mask, omask;
9132 int err;
9133
9134 sigemptyset(&mask);
9135 sigaddset(&mask,SIGCHLD);
9136 sigaddset(&mask,SIGHUP);
9137 sigaddset(&mask,SIGPIPE);
9138 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9139 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9140 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9141 strerror(err));
9142 usleep(1000000);
9143 }
9144 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9145 server.io_active_threads++;
9146 }
9147
9148 /* We need to wait for the last thread to exit before we are able to
9149 * fork() in order to BGSAVE or BGREWRITEAOF. */
9150 static void waitEmptyIOJobsQueue(void) {
9151 while(1) {
9152 int io_processed_len;
9153
9154 lockThreadedIO();
9155 if (listLength(server.io_newjobs) == 0 &&
9156 listLength(server.io_processing) == 0 &&
9157 server.io_active_threads == 0)
9158 {
9159 unlockThreadedIO();
9160 return;
9161 }
9162 /* While waiting for empty jobs queue condition we post-process some
9163 * finshed job, as I/O threads may be hanging trying to write against
9164 * the io_ready_pipe_write FD but there are so much pending jobs that
9165 * it's blocking. */
9166 io_processed_len = listLength(server.io_processed);
9167 unlockThreadedIO();
9168 if (io_processed_len) {
9169 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9170 usleep(1000); /* 1 millisecond */
9171 } else {
9172 usleep(10000); /* 10 milliseconds */
9173 }
9174 }
9175 }
9176
9177 static void vmReopenSwapFile(void) {
9178 /* Note: we don't close the old one as we are in the child process
9179 * and don't want to mess at all with the original file object. */
9180 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9181 if (server.vm_fp == NULL) {
9182 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9183 server.vm_swap_file);
9184 _exit(1);
9185 }
9186 server.vm_fd = fileno(server.vm_fp);
9187 }
9188
9189 /* This function must be called while with threaded IO locked */
9190 static void queueIOJob(iojob *j) {
9191 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9192 (void*)j, j->type, (char*)j->key->ptr);
9193 listAddNodeTail(server.io_newjobs,j);
9194 if (server.io_active_threads < server.vm_max_threads)
9195 spawnIOThread();
9196 }
9197
9198 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9199 iojob *j;
9200
9201 assert(key->storage == REDIS_VM_MEMORY);
9202 assert(key->refcount == 1);
9203
9204 j = zmalloc(sizeof(*j));
9205 j->type = REDIS_IOJOB_PREPARE_SWAP;
9206 j->db = db;
9207 j->key = key;
9208 j->val = val;
9209 incrRefCount(val);
9210 j->canceled = 0;
9211 j->thread = (pthread_t) -1;
9212 key->storage = REDIS_VM_SWAPPING;
9213
9214 lockThreadedIO();
9215 queueIOJob(j);
9216 unlockThreadedIO();
9217 return REDIS_OK;
9218 }
9219
9220 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9221
9222 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9223 * If there is not already a job loading the key, it is craeted.
9224 * The key is added to the io_keys list in the client structure, and also
9225 * in the hash table mapping swapped keys to waiting clients, that is,
9226 * server.io_waited_keys. */
9227 static int waitForSwappedKey(redisClient *c, robj *key) {
9228 struct dictEntry *de;
9229 robj *o;
9230 list *l;
9231
9232 /* If the key does not exist or is already in RAM we don't need to
9233 * block the client at all. */
9234 de = dictFind(c->db->dict,key);
9235 if (de == NULL) return 0;
9236 o = dictGetEntryKey(de);
9237 if (o->storage == REDIS_VM_MEMORY) {
9238 return 0;
9239 } else if (o->storage == REDIS_VM_SWAPPING) {
9240 /* We were swapping the key, undo it! */
9241 vmCancelThreadedIOJob(o);
9242 return 0;
9243 }
9244
9245 /* OK: the key is either swapped, or being loaded just now. */
9246
9247 /* Add the key to the list of keys this client is waiting for.
9248 * This maps clients to keys they are waiting for. */
9249 listAddNodeTail(c->io_keys,key);
9250 incrRefCount(key);
9251
9252 /* Add the client to the swapped keys => clients waiting map. */
9253 de = dictFind(c->db->io_keys,key);
9254 if (de == NULL) {
9255 int retval;
9256
9257 /* For every key we take a list of clients blocked for it */
9258 l = listCreate();
9259 retval = dictAdd(c->db->io_keys,key,l);
9260 incrRefCount(key);
9261 assert(retval == DICT_OK);
9262 } else {
9263 l = dictGetEntryVal(de);
9264 }
9265 listAddNodeTail(l,c);
9266
9267 /* Are we already loading the key from disk? If not create a job */
9268 if (o->storage == REDIS_VM_SWAPPED) {
9269 iojob *j;
9270
9271 o->storage = REDIS_VM_LOADING;
9272 j = zmalloc(sizeof(*j));
9273 j->type = REDIS_IOJOB_LOAD;
9274 j->db = c->db;
9275 j->key = o;
9276 j->key->vtype = o->vtype;
9277 j->page = o->vm.page;
9278 j->val = NULL;
9279 j->canceled = 0;
9280 j->thread = (pthread_t) -1;
9281 lockThreadedIO();
9282 queueIOJob(j);
9283 unlockThreadedIO();
9284 }
9285 return 1;
9286 }
9287
9288 /* Preload keys needed for the ZUNION and ZINTER commands. */
9289 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9290 int i, num;
9291 num = atoi(c->argv[2]->ptr);
9292 for (i = 0; i < num; i++) {
9293 waitForSwappedKey(c,c->argv[3+i]);
9294 }
9295 }
9296
9297 /* Is this client attempting to run a command against swapped keys?
9298 * If so, block it ASAP, load the keys in background, then resume it.
9299 *
9300 * The important idea about this function is that it can fail! If keys will
9301 * still be swapped when the client is resumed, this key lookups will
9302 * just block loading keys from disk. In practical terms this should only
9303 * happen with SORT BY command or if there is a bug in this function.
9304 *
9305 * Return 1 if the client is marked as blocked, 0 if the client can
9306 * continue as the keys it is going to access appear to be in memory. */
9307 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9308 int j, last;
9309
9310 if (cmd->vm_preload_proc != NULL) {
9311 cmd->vm_preload_proc(c);
9312 } else {
9313 if (cmd->vm_firstkey == 0) return 0;
9314 last = cmd->vm_lastkey;
9315 if (last < 0) last = c->argc+last;
9316 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9317 waitForSwappedKey(c,c->argv[j]);
9318 }
9319
9320 /* If the client was blocked for at least one key, mark it as blocked. */
9321 if (listLength(c->io_keys)) {
9322 c->flags |= REDIS_IO_WAIT;
9323 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9324 server.vm_blocked_clients++;
9325 return 1;
9326 } else {
9327 return 0;
9328 }
9329 }
9330
9331 /* Remove the 'key' from the list of blocked keys for a given client.
9332 *
9333 * The function returns 1 when there are no longer blocking keys after
9334 * the current one was removed (and the client can be unblocked). */
9335 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9336 list *l;
9337 listNode *ln;
9338 listIter li;
9339 struct dictEntry *de;
9340
9341 /* Remove the key from the list of keys this client is waiting for. */
9342 listRewind(c->io_keys,&li);
9343 while ((ln = listNext(&li)) != NULL) {
9344 if (compareStringObjects(ln->value,key) == 0) {
9345 listDelNode(c->io_keys,ln);
9346 break;
9347 }
9348 }
9349 assert(ln != NULL);
9350
9351 /* Remove the client form the key => waiting clients map. */
9352 de = dictFind(c->db->io_keys,key);
9353 assert(de != NULL);
9354 l = dictGetEntryVal(de);
9355 ln = listSearchKey(l,c);
9356 assert(ln != NULL);
9357 listDelNode(l,ln);
9358 if (listLength(l) == 0)
9359 dictDelete(c->db->io_keys,key);
9360
9361 return listLength(c->io_keys) == 0;
9362 }
9363
9364 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9365 struct dictEntry *de;
9366 list *l;
9367 listNode *ln;
9368 int len;
9369
9370 de = dictFind(db->io_keys,key);
9371 if (!de) return;
9372
9373 l = dictGetEntryVal(de);
9374 len = listLength(l);
9375 /* Note: we can't use something like while(listLength(l)) as the list
9376 * can be freed by the calling function when we remove the last element. */
9377 while (len--) {
9378 ln = listFirst(l);
9379 redisClient *c = ln->value;
9380
9381 if (dontWaitForSwappedKey(c,key)) {
9382 /* Put the client in the list of clients ready to go as we
9383 * loaded all the keys about it. */
9384 listAddNodeTail(server.io_ready_clients,c);
9385 }
9386 }
9387 }
9388
9389 /* =========================== Remote Configuration ========================= */
9390
9391 static void configSetCommand(redisClient *c) {
9392 robj *o = getDecodedObject(c->argv[3]);
9393 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9394 zfree(server.dbfilename);
9395 server.dbfilename = zstrdup(o->ptr);
9396 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9397 zfree(server.requirepass);
9398 server.requirepass = zstrdup(o->ptr);
9399 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9400 zfree(server.masterauth);
9401 server.masterauth = zstrdup(o->ptr);
9402 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9403 server.maxmemory = strtoll(o->ptr, NULL, 10);
9404 } else {
9405 addReplySds(c,sdscatprintf(sdsempty(),
9406 "-ERR not supported CONFIG parameter %s\r\n",
9407 (char*)c->argv[2]->ptr));
9408 decrRefCount(o);
9409 return;
9410 }
9411 decrRefCount(o);
9412 addReply(c,shared.ok);
9413 }
9414
9415 static void configGetCommand(redisClient *c) {
9416 robj *o = getDecodedObject(c->argv[2]);
9417 robj *lenobj = createObject(REDIS_STRING,NULL);
9418 char *pattern = o->ptr;
9419 int matches = 0;
9420
9421 addReply(c,lenobj);
9422 decrRefCount(lenobj);
9423
9424 if (stringmatch(pattern,"dbfilename",0)) {
9425 addReplyBulkCString(c,"dbfilename");
9426 addReplyBulkCString(c,server.dbfilename);
9427 matches++;
9428 }
9429 if (stringmatch(pattern,"requirepass",0)) {
9430 addReplyBulkCString(c,"requirepass");
9431 addReplyBulkCString(c,server.requirepass);
9432 matches++;
9433 }
9434 if (stringmatch(pattern,"masterauth",0)) {
9435 addReplyBulkCString(c,"masterauth");
9436 addReplyBulkCString(c,server.masterauth);
9437 matches++;
9438 }
9439 if (stringmatch(pattern,"maxmemory",0)) {
9440 char buf[128];
9441
9442 snprintf(buf,128,"%llu\n",server.maxmemory);
9443 addReplyBulkCString(c,"maxmemory");
9444 addReplyBulkCString(c,buf);
9445 matches++;
9446 }
9447 decrRefCount(o);
9448 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9449 }
9450
9451 static void configCommand(redisClient *c) {
9452 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9453 if (c->argc != 4) goto badarity;
9454 configSetCommand(c);
9455 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9456 if (c->argc != 3) goto badarity;
9457 configGetCommand(c);
9458 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9459 if (c->argc != 2) goto badarity;
9460 server.stat_numcommands = 0;
9461 server.stat_numconnections = 0;
9462 server.stat_expiredkeys = 0;
9463 server.stat_starttime = time(NULL);
9464 addReply(c,shared.ok);
9465 } else {
9466 addReplySds(c,sdscatprintf(sdsempty(),
9467 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9468 }
9469 return;
9470
9471 badarity:
9472 addReplySds(c,sdscatprintf(sdsempty(),
9473 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9474 (char*) c->argv[1]->ptr));
9475 }
9476
9477 /* =========================== Pubsub implementation ======================== */
9478
9479 static void freePubsubPattern(void *p) {
9480 pubsubPattern *pat = p;
9481
9482 decrRefCount(pat->pattern);
9483 zfree(pat);
9484 }
9485
9486 static int listMatchPubsubPattern(void *a, void *b) {
9487 pubsubPattern *pa = a, *pb = b;
9488
9489 return (pa->client == pb->client) &&
9490 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9491 }
9492
9493 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9494 * 0 if the client was already subscribed to that channel. */
9495 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9496 struct dictEntry *de;
9497 list *clients = NULL;
9498 int retval = 0;
9499
9500 /* Add the channel to the client -> channels hash table */
9501 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9502 retval = 1;
9503 incrRefCount(channel);
9504 /* Add the client to the channel -> list of clients hash table */
9505 de = dictFind(server.pubsub_channels,channel);
9506 if (de == NULL) {
9507 clients = listCreate();
9508 dictAdd(server.pubsub_channels,channel,clients);
9509 incrRefCount(channel);
9510 } else {
9511 clients = dictGetEntryVal(de);
9512 }
9513 listAddNodeTail(clients,c);
9514 }
9515 /* Notify the client */
9516 addReply(c,shared.mbulk3);
9517 addReply(c,shared.subscribebulk);
9518 addReplyBulk(c,channel);
9519 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9520 return retval;
9521 }
9522
9523 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9524 * 0 if the client was not subscribed to the specified channel. */
9525 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9526 struct dictEntry *de;
9527 list *clients;
9528 listNode *ln;
9529 int retval = 0;
9530
9531 /* Remove the channel from the client -> channels hash table */
9532 incrRefCount(channel); /* channel may be just a pointer to the same object
9533 we have in the hash tables. Protect it... */
9534 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9535 retval = 1;
9536 /* Remove the client from the channel -> clients list hash table */
9537 de = dictFind(server.pubsub_channels,channel);
9538 assert(de != NULL);
9539 clients = dictGetEntryVal(de);
9540 ln = listSearchKey(clients,c);
9541 assert(ln != NULL);
9542 listDelNode(clients,ln);
9543 if (listLength(clients) == 0) {
9544 /* Free the list and associated hash entry at all if this was
9545 * the latest client, so that it will be possible to abuse
9546 * Redis PUBSUB creating millions of channels. */
9547 dictDelete(server.pubsub_channels,channel);
9548 }
9549 }
9550 /* Notify the client */
9551 if (notify) {
9552 addReply(c,shared.mbulk3);
9553 addReply(c,shared.unsubscribebulk);
9554 addReplyBulk(c,channel);
9555 addReplyLong(c,dictSize(c->pubsub_channels)+
9556 listLength(c->pubsub_patterns));
9557
9558 }
9559 decrRefCount(channel); /* it is finally safe to release it */
9560 return retval;
9561 }
9562
9563 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9564 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9565 int retval = 0;
9566
9567 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9568 retval = 1;
9569 pubsubPattern *pat;
9570 listAddNodeTail(c->pubsub_patterns,pattern);
9571 incrRefCount(pattern);
9572 pat = zmalloc(sizeof(*pat));
9573 pat->pattern = getDecodedObject(pattern);
9574 pat->client = c;
9575 listAddNodeTail(server.pubsub_patterns,pat);
9576 }
9577 /* Notify the client */
9578 addReply(c,shared.mbulk3);
9579 addReply(c,shared.psubscribebulk);
9580 addReplyBulk(c,pattern);
9581 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9582 return retval;
9583 }
9584
9585 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9586 * 0 if the client was not subscribed to the specified channel. */
9587 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9588 listNode *ln;
9589 pubsubPattern pat;
9590 int retval = 0;
9591
9592 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9593 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9594 retval = 1;
9595 listDelNode(c->pubsub_patterns,ln);
9596 pat.client = c;
9597 pat.pattern = pattern;
9598 ln = listSearchKey(server.pubsub_patterns,&pat);
9599 listDelNode(server.pubsub_patterns,ln);
9600 }
9601 /* Notify the client */
9602 if (notify) {
9603 addReply(c,shared.mbulk3);
9604 addReply(c,shared.punsubscribebulk);
9605 addReplyBulk(c,pattern);
9606 addReplyLong(c,dictSize(c->pubsub_channels)+
9607 listLength(c->pubsub_patterns));
9608 }
9609 decrRefCount(pattern);
9610 return retval;
9611 }
9612
9613 /* Unsubscribe from all the channels. Return the number of channels the
9614 * client was subscribed from. */
9615 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9616 dictIterator *di = dictGetIterator(c->pubsub_channels);
9617 dictEntry *de;
9618 int count = 0;
9619
9620 while((de = dictNext(di)) != NULL) {
9621 robj *channel = dictGetEntryKey(de);
9622
9623 count += pubsubUnsubscribeChannel(c,channel,notify);
9624 }
9625 dictReleaseIterator(di);
9626 return count;
9627 }
9628
9629 /* Unsubscribe from all the patterns. Return the number of patterns the
9630 * client was subscribed from. */
9631 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9632 listNode *ln;
9633 listIter li;
9634 int count = 0;
9635
9636 listRewind(c->pubsub_patterns,&li);
9637 while ((ln = listNext(&li)) != NULL) {
9638 robj *pattern = ln->value;
9639
9640 count += pubsubUnsubscribePattern(c,pattern,notify);
9641 }
9642 return count;
9643 }
9644
9645 /* Publish a message */
9646 static int pubsubPublishMessage(robj *channel, robj *message) {
9647 int receivers = 0;
9648 struct dictEntry *de;
9649 listNode *ln;
9650 listIter li;
9651
9652 /* Send to clients listening for that channel */
9653 de = dictFind(server.pubsub_channels,channel);
9654 if (de) {
9655 list *list = dictGetEntryVal(de);
9656 listNode *ln;
9657 listIter li;
9658
9659 listRewind(list,&li);
9660 while ((ln = listNext(&li)) != NULL) {
9661 redisClient *c = ln->value;
9662
9663 addReply(c,shared.mbulk3);
9664 addReply(c,shared.messagebulk);
9665 addReplyBulk(c,channel);
9666 addReplyBulk(c,message);
9667 receivers++;
9668 }
9669 }
9670 /* Send to clients listening to matching channels */
9671 if (listLength(server.pubsub_patterns)) {
9672 listRewind(server.pubsub_patterns,&li);
9673 channel = getDecodedObject(channel);
9674 while ((ln = listNext(&li)) != NULL) {
9675 pubsubPattern *pat = ln->value;
9676
9677 if (stringmatchlen((char*)pat->pattern->ptr,
9678 sdslen(pat->pattern->ptr),
9679 (char*)channel->ptr,
9680 sdslen(channel->ptr),0)) {
9681 addReply(pat->client,shared.mbulk3);
9682 addReply(pat->client,shared.messagebulk);
9683 addReplyBulk(pat->client,channel);
9684 addReplyBulk(pat->client,message);
9685 receivers++;
9686 }
9687 }
9688 decrRefCount(channel);
9689 }
9690 return receivers;
9691 }
9692
9693 static void subscribeCommand(redisClient *c) {
9694 int j;
9695
9696 for (j = 1; j < c->argc; j++)
9697 pubsubSubscribeChannel(c,c->argv[j]);
9698 }
9699
9700 static void unsubscribeCommand(redisClient *c) {
9701 if (c->argc == 1) {
9702 pubsubUnsubscribeAllChannels(c,1);
9703 return;
9704 } else {
9705 int j;
9706
9707 for (j = 1; j < c->argc; j++)
9708 pubsubUnsubscribeChannel(c,c->argv[j],1);
9709 }
9710 }
9711
9712 static void psubscribeCommand(redisClient *c) {
9713 int j;
9714
9715 for (j = 1; j < c->argc; j++)
9716 pubsubSubscribePattern(c,c->argv[j]);
9717 }
9718
9719 static void punsubscribeCommand(redisClient *c) {
9720 if (c->argc == 1) {
9721 pubsubUnsubscribeAllPatterns(c,1);
9722 return;
9723 } else {
9724 int j;
9725
9726 for (j = 1; j < c->argc; j++)
9727 pubsubUnsubscribePattern(c,c->argv[j],1);
9728 }
9729 }
9730
9731 static void publishCommand(redisClient *c) {
9732 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9733 addReplyLong(c,receivers);
9734 }
9735
9736 /* ================================= Debugging ============================== */
9737
9738 static void debugCommand(redisClient *c) {
9739 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9740 *((char*)-1) = 'x';
9741 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9742 if (rdbSave(server.dbfilename) != REDIS_OK) {
9743 addReply(c,shared.err);
9744 return;
9745 }
9746 emptyDb();
9747 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9748 addReply(c,shared.err);
9749 return;
9750 }
9751 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9752 addReply(c,shared.ok);
9753 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9754 emptyDb();
9755 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9756 addReply(c,shared.err);
9757 return;
9758 }
9759 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9760 addReply(c,shared.ok);
9761 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9762 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9763 robj *key, *val;
9764
9765 if (!de) {
9766 addReply(c,shared.nokeyerr);
9767 return;
9768 }
9769 key = dictGetEntryKey(de);
9770 val = dictGetEntryVal(de);
9771 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9772 key->storage == REDIS_VM_SWAPPING)) {
9773 char *strenc;
9774 char buf[128];
9775
9776 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9777 strenc = strencoding[val->encoding];
9778 } else {
9779 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9780 strenc = buf;
9781 }
9782 addReplySds(c,sdscatprintf(sdsempty(),
9783 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9784 "encoding:%s serializedlength:%lld\r\n",
9785 (void*)key, key->refcount, (void*)val, val->refcount,
9786 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9787 } else {
9788 addReplySds(c,sdscatprintf(sdsempty(),
9789 "+Key at:%p refcount:%d, value swapped at: page %llu "
9790 "using %llu pages\r\n",
9791 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9792 (unsigned long long) key->vm.usedpages));
9793 }
9794 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9795 lookupKeyRead(c->db,c->argv[2]);
9796 addReply(c,shared.ok);
9797 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9798 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9799 robj *key, *val;
9800
9801 if (!server.vm_enabled) {
9802 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9803 return;
9804 }
9805 if (!de) {
9806 addReply(c,shared.nokeyerr);
9807 return;
9808 }
9809 key = dictGetEntryKey(de);
9810 val = dictGetEntryVal(de);
9811 /* If the key is shared we want to create a copy */
9812 if (key->refcount > 1) {
9813 robj *newkey = dupStringObject(key);
9814 decrRefCount(key);
9815 key = dictGetEntryKey(de) = newkey;
9816 }
9817 /* Swap it */
9818 if (key->storage != REDIS_VM_MEMORY) {
9819 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9820 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9821 dictGetEntryVal(de) = NULL;
9822 addReply(c,shared.ok);
9823 } else {
9824 addReply(c,shared.err);
9825 }
9826 } else {
9827 addReplySds(c,sdsnew(
9828 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9829 }
9830 }
9831
9832 static void _redisAssert(char *estr, char *file, int line) {
9833 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9834 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9835 #ifdef HAVE_BACKTRACE
9836 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9837 *((char*)-1) = 'x';
9838 #endif
9839 }
9840
9841 /* =================================== Main! ================================ */
9842
9843 #ifdef __linux__
/* Read the current value of /proc/sys/vm/overcommit_memory.
 *
 * Returns the number found in the file, or -1 if the file cannot be opened,
 * cannot be read, or does not start with a number. Parsing uses strtol()
 * instead of atoi() so that unparsable content is reported as an error
 * rather than as 0, which callers would take for the "overcommit disabled"
 * setting. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    char *end;
    long value;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits at all */
    return (int)value;
}
9857
9858 void linuxOvercommitMemoryWarning(void) {
9859 if (linuxOvercommitMemoryValue() == 0) {
9860 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9861 }
9862 }
9863 #endif /* __linux__ */
9864
9865 static void daemonize(void) {
9866 int fd;
9867 FILE *fp;
9868
9869 if (fork() != 0) exit(0); /* parent exits */
9870 setsid(); /* create a new session */
9871
9872 /* Every output goes to /dev/null. If Redis is daemonized but
9873 * the 'logfile' is set to 'stdout' in the configuration file
9874 * it will not log at all. */
9875 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9876 dup2(fd, STDIN_FILENO);
9877 dup2(fd, STDOUT_FILENO);
9878 dup2(fd, STDERR_FILENO);
9879 if (fd > STDERR_FILENO) close(fd);
9880 }
9881 /* Try to write the pid file */
9882 fp = fopen(server.pidfile,"w");
9883 if (fp) {
9884 fprintf(fp,"%d\n",getpid());
9885 fclose(fp);
9886 }
9887 }
9888
9889 static void version() {
9890 printf("Redis server version %s\n", REDIS_VERSION);
9891 exit(0);
9892 }
9893
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. Never returns. */
static void usage() {
    FILE *out = stderr;

    fprintf(out,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(out,"       ./redis-server - (read config from stdin)\n");
    exit(1);
}
9899
9900 int main(int argc, char **argv) {
9901 time_t start;
9902
9903 initServerConfig();
9904 if (argc == 2) {
9905 if (strcmp(argv[1], "-v") == 0 ||
9906 strcmp(argv[1], "--version") == 0) version();
9907 if (strcmp(argv[1], "--help") == 0) usage();
9908 resetServerSaveParams();
9909 loadServerConfig(argv[1]);
9910 } else if ((argc > 2)) {
9911 usage();
9912 } else {
9913 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9914 }
9915 if (server.daemonize) daemonize();
9916 initServer();
9917 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9918 #ifdef __linux__
9919 linuxOvercommitMemoryWarning();
9920 #endif
9921 start = time(NULL);
9922 if (server.appendonly) {
9923 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9924 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9925 } else {
9926 if (rdbLoad(server.dbfilename) == REDIS_OK)
9927 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9928 }
9929 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9930 aeSetBeforeSleepProc(server.el,beforeSleep);
9931 aeMain(server.el);
9932 aeDeleteEventLoop(server.el);
9933 return 0;
9934 }
9935
9936 /* ============================= Backtrace support ========================= */
9937
9938 #ifdef HAVE_BACKTRACE
9939 static char *findFuncName(void *pointer, unsigned long *offset);
9940
9941 static void *getMcontextEip(ucontext_t *uc) {
9942 #if defined(__FreeBSD__)
9943 return (void*) uc->uc_mcontext.mc_eip;
9944 #elif defined(__dietlibc__)
9945 return (void*) uc->uc_mcontext.eip;
9946 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9947 #if __x86_64__
9948 return (void*) uc->uc_mcontext->__ss.__rip;
9949 #else
9950 return (void*) uc->uc_mcontext->__ss.__eip;
9951 #endif
9952 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9953 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9954 return (void*) uc->uc_mcontext->__ss.__rip;
9955 #else
9956 return (void*) uc->uc_mcontext->__ss.__eip;
9957 #endif
9958 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9959 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9960 #elif defined(__ia64__) /* Linux IA64 */
9961 return (void*) uc->uc_mcontext.sc_ip;
9962 #else
9963 return NULL;
9964 #endif
9965 }
9966
9967 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9968 void *trace[100];
9969 char **messages = NULL;
9970 int i, trace_size = 0;
9971 unsigned long offset=0;
9972 ucontext_t *uc = (ucontext_t*) secret;
9973 sds infostring;
9974 REDIS_NOTUSED(info);
9975
9976 redisLog(REDIS_WARNING,
9977 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9978 infostring = genRedisInfoString();
9979 redisLog(REDIS_WARNING, "%s",infostring);
9980 /* It's not safe to sdsfree() the returned string under memory
9981 * corruption conditions. Let it leak as we are going to abort */
9982
9983 trace_size = backtrace(trace, 100);
9984 /* overwrite sigaction with caller's address */
9985 if (getMcontextEip(uc) != NULL) {
9986 trace[1] = getMcontextEip(uc);
9987 }
9988 messages = backtrace_symbols(trace, trace_size);
9989
9990 for (i=1; i<trace_size; ++i) {
9991 char *fn = findFuncName(trace[i], &offset), *p;
9992
9993 p = strchr(messages[i],'+');
9994 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9995 redisLog(REDIS_WARNING,"%s", messages[i]);
9996 } else {
9997 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9998 }
9999 }
10000 /* free(messages); Don't call free() with possibly corrupted memory. */
10001 _exit(0);
10002 }
10003
10004 static void setupSigSegvAction(void) {
10005 struct sigaction act;
10006
10007 sigemptyset (&act.sa_mask);
10008 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10009 * is used. Otherwise, sa_handler is used */
10010 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10011 act.sa_sigaction = segvHandler;
10012 sigaction (SIGSEGV, &act, NULL);
10013 sigaction (SIGBUS, &act, NULL);
10014 sigaction (SIGFPE, &act, NULL);
10015 sigaction (SIGILL, &act, NULL);
10016 sigaction (SIGBUS, &act, NULL);
10017 return;
10018 }
10019
10020 #include "staticsymbols.h"
10021 /* This function try to convert a pointer into a function name. It's used in
10022 * oreder to provide a backtrace under segmentation fault that's able to
10023 * display functions declared as static (otherwise the backtrace is useless). */
10024 static char *findFuncName(void *pointer, unsigned long *offset){
10025 int i, ret = -1;
10026 unsigned long off, minoff = 0;
10027
10028 /* Try to match against the Symbol with the smallest offset */
10029 for (i=0; symsTable[i].pointer; i++) {
10030 unsigned long lp = (unsigned long) pointer;
10031
10032 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10033 off=lp-symsTable[i].pointer;
10034 if (ret < 0 || off < minoff) {
10035 minoff=off;
10036 ret=i;
10037 }
10038 }
10039 }
10040 if (ret == -1) return NULL;
10041 *offset = minoff;
10042 return symsTable[ret].name;
10043 }
10044 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no crash handler to
 * install, so this is a no-op. */
static void setupSigSegvAction(void) {
}
10047 #endif /* HAVE_BACKTRACE */
10048
10049
10050
10051 /* The End */
10052
10053
10054