]> git.saurik.com Git - redis.git/blob - redis.c
ee2a9696ad04da65cee4ea3d3089ffb44ee04331
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8" /* Version string of this Redis release */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes (REDIS_OK = success, REDIS_ERR = failure) */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (60*5 = 300) */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more than this to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MB) */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags (bit values, OR-ed together in redisCommand.flags) */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
130
131 static char* strencoding[] = { /* encoding names, listed in the same order as the REDIS_ENCODING_* values */
132 "raw", "int", "zipmap", "hashtable"
133 };
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specifies the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166
167 /* Virtual memory object->storage field values. */
168 #define REDIS_VM_MEMORY 0 /* The object is in memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags (bit values, OR-ed together in redisClient.flags) */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master.
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels, numbered from the most verbose (DEBUG) to the least (WARNING) */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro: evaluates V and discards the result via a (void) cast */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
238 /* We can print the stacktrace, so our assert is defined this way: when _e is false, _redisAssert() is called with the stringified expression, __FILE__ and __LINE__, and then the process is terminated with _exit(1). When _e is true the macro evaluates to (void)0. */
239 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
240 static void _redisAssert(char *estr, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
246 /* The VM object structure: swap-file location and access time of an object (embedded in struct redisObject as its 'vm' member) */
247 struct redisObjectVM {
248 off_t page; /* the page at which the object is stored on disk */
249 off_t usedpages; /* number of pages used on disk */
250 time_t atime; /* Last access time */
251 } vm; /* NOTE(review): this trailing declarator also defines a file-scope variable named 'vm'; no use of it is visible in this part of the file -- confirm it is intentional */
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type; /* one of the "Object types" defines (REDIS_STRING, ...) -- presumably; confirm against createObject() */
257 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 * _var is the object itself (an lvalue, not a pointer) and _ptr is the value
 * stored in its 'ptr' field. The object gets refcount 1, type REDIS_STRING
 * and encoding REDIS_ENCODING_RAW; the 'storage' field is set to
 * REDIS_VM_MEMORY only when server.vm_enabled is true.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be used
 * safely as the body of an unbraced if/else. Arguments are parenthesized so
 * that arbitrary expressions can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb { /* State of a single database: its keyspace plus the bookkeeping dictionaries below */
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state: one multiCmd per command held in the multiState.commands array */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-client state.
303 * Clients are taken in a linked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
335 struct saveparam { /* One element of redisServer.saveparams; presumably a "save after <seconds> if at least <changes> changes" rule -- confirm against the code that reads it */
336 time_t seconds;
337 int changes;
338 };
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last successful save */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread processes an element taken from the io_jobs queue and
416 * puts the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, looking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The following are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern { /* Pairs a client with one pattern object */
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c); /* Signature shared by every command implementation */
452 struct redisCommand { /* One command table entry (see cmdTable) */
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags; /* OR of REDIS_CMD_* values */
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* The last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
467 struct redisFunctionSym { /* A name paired with an address stored as an integer; presumably used to map addresses to function names -- confirm against its users */
468 char *name;
469 unsigned long pointer;
470 };
471
472 typedef struct _redisSortObject { /* An object plus the key it is compared by: either a numeric score or another object */
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation { /* 'type' is presumably one of the REDIS_SORT_* defines -- confirm against sortCommand() */
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists: a zset pairs a dict with a zskiplist (see struct zset) */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward; /* presumably one forward pointer per level -- confirm against the node allocation code */
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
495 typedef struct zskiplist {
496 struct zskiplistNode *header, *tail;
497 unsigned long length;
498 int level;
499 } zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects: robj pointers grouped in the single global 'shared', including an array of REDIS_SHARED_INTEGERS integer objects */
507
508 #define REDIS_SHARED_INTEGERS 10000
509 struct sharedObjectsStruct {
510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
511 *colon, *nullbulk, *nullmultibulk, *queued,
512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
514 *select0, *select1, *select2, *select3, *select4,
515 *select5, *select6, *select7, *select8, *select9,
516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
518 } shared;
519
520 /* Global vars that are actually used as constants. The following double
521 * values are used for double on-disk serialization, and are initialized
522 * at runtime to avoid strange compiler optimizations. */
523
524 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
525
526 /* VM threaded I/O request message */
527 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
530 typedef struct iojob {
531 int type; /* Request type, REDIS_IOJOB_* */
532 redisDb *db;/* Redis database */
533 robj *key; /* This I/O request is about swapping this key */
534 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
535 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540 } iojob;
541
542 /*================================ Prototypes =============================== */
543 /* Forward declarations of file-local (static) helper functions. */
544 static void freeStringObject(robj *o);
545 static void freeListObject(robj *o);
546 static void freeSetObject(robj *o);
547 static void decrRefCount(void *o); /* note: takes void*, unlike incrRefCount(robj *) */
548 static robj *createObject(int type, void *ptr);
549 static void freeClient(redisClient *c);
550 static int rdbLoad(char *filename);
551 static void addReply(redisClient *c, robj *obj);
552 static void addReplySds(redisClient *c, sds s);
553 static void incrRefCount(robj *o);
554 static int rdbSaveBackground(char *filename);
555 static robj *createStringObject(char *ptr, size_t len);
556 static robj *dupStringObject(robj *o);
557 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
558 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
559 static int syncWithMaster(void);
560 static robj *tryObjectEncoding(robj *o);
561 static robj *getDecodedObject(robj *o);
562 static int removeExpire(redisDb *db, robj *key);
563 static int expireIfNeeded(redisDb *db, robj *key);
564 static int deleteIfVolatile(redisDb *db, robj *key);
565 static int deleteIfSwapped(redisDb *db, robj *key);
566 static int deleteKey(redisDb *db, robj *key);
567 static time_t getExpire(redisDb *db, robj *key);
568 static int setExpire(redisDb *db, robj *key, time_t when);
569 static void updateSlavesWaitingBgsave(int bgsaveerr);
570 static void freeMemoryIfNeeded(void);
571 static int processCommand(redisClient *c);
572 static void setupSigSegvAction(void);
573 static void rdbRemoveTempFile(pid_t childpid);
574 static void aofRemoveTempFile(pid_t childpid);
575 static size_t stringObjectLen(robj *o);
576 static void processInputBuffer(redisClient *c);
577 static zskiplist *zslCreate(void);
578 static void zslFree(zskiplist *zsl);
579 static void zslInsert(zskiplist *zsl, double score, robj *obj);
580 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
581 static void initClientMultiState(redisClient *c);
582 static void freeClientMultiState(redisClient *c);
583 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
584 static void unblockClientWaitingData(redisClient *c);
585 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
586 static void vmInit(void);
587 static void vmMarkPagesFree(off_t page, off_t count);
588 static robj *vmLoadObject(robj *key);
589 static robj *vmPreviewObject(robj *key);
590 static int vmSwapOneObjectBlocking(void);
591 static int vmSwapOneObjectThreaded(void);
592 static int vmCanSwapOut(void);
593 static int tryFreeOneObjectFromFreelist(void);
594 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596 static void vmCancelThreadedIOJob(robj *o);
597 static void lockThreadedIO(void);
598 static void unlockThreadedIO(void);
599 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600 static void freeIOJob(iojob *j);
601 static void queueIOJob(iojob *j);
602 static int vmWriteObjectOnSwap(robj *o, off_t page);
603 static robj *vmReadObjectFromSwap(off_t page, int type);
604 static void waitEmptyIOJobsQueue(void);
605 static void vmReopenSwapFile(void);
606 static int vmFreePage(off_t page);
607 static void zunionInterBlockClientOnSwappedKeys(redisClient *c); /* used as vm_preload_proc for "zunion"/"zinter" in cmdTable */
608 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609 static int dontWaitForSwappedKey(redisClient *c, robj *key);
610 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612 static struct redisCommand *lookupCommand(char *name);
613 static void call(redisClient *c, struct redisCommand *cmd);
614 static void resetClient(redisClient *c);
615 static void convertToRealHash(robj *o);
616 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618 static void freePubsubPattern(void *p);
619 static int listMatchPubsubPattern(void *a, void *b);
620 static int compareStringObjects(robj *a, robj *b);
/* Forward declaration of usage(). Declared with an explicit (void) parameter
 * list so that this is a real prototype: with empty parentheses the compiler
 * would not check the arguments passed at call sites. */
static void usage(void);
622 /* Command implementations. Each one has the redisCommandProc signature (void f(redisClient *c)); the ones visible in cmdTable are referenced there as the entry's 'proc'. */
623 static void authCommand(redisClient *c);
624 static void pingCommand(redisClient *c);
625 static void echoCommand(redisClient *c);
626 static void setCommand(redisClient *c);
627 static void setnxCommand(redisClient *c);
628 static void getCommand(redisClient *c);
629 static void delCommand(redisClient *c);
630 static void existsCommand(redisClient *c);
631 static void incrCommand(redisClient *c);
632 static void decrCommand(redisClient *c);
633 static void incrbyCommand(redisClient *c);
634 static void decrbyCommand(redisClient *c);
635 static void selectCommand(redisClient *c);
636 static void randomkeyCommand(redisClient *c);
637 static void keysCommand(redisClient *c);
638 static void dbsizeCommand(redisClient *c);
639 static void lastsaveCommand(redisClient *c);
640 static void saveCommand(redisClient *c);
641 static void bgsaveCommand(redisClient *c);
642 static void bgrewriteaofCommand(redisClient *c);
643 static void shutdownCommand(redisClient *c);
644 static void moveCommand(redisClient *c);
645 static void renameCommand(redisClient *c);
646 static void renamenxCommand(redisClient *c);
647 static void lpushCommand(redisClient *c);
648 static void rpushCommand(redisClient *c);
649 static void lpopCommand(redisClient *c);
650 static void rpopCommand(redisClient *c);
651 static void llenCommand(redisClient *c);
652 static void lindexCommand(redisClient *c);
653 static void lrangeCommand(redisClient *c);
654 static void ltrimCommand(redisClient *c);
655 static void typeCommand(redisClient *c);
656 static void lsetCommand(redisClient *c);
657 static void saddCommand(redisClient *c);
658 static void sremCommand(redisClient *c);
659 static void smoveCommand(redisClient *c);
660 static void sismemberCommand(redisClient *c);
661 static void scardCommand(redisClient *c);
662 static void spopCommand(redisClient *c);
663 static void srandmemberCommand(redisClient *c);
664 static void sinterCommand(redisClient *c); /* also registered for "smembers" in cmdTable */
665 static void sinterstoreCommand(redisClient *c);
666 static void sunionCommand(redisClient *c);
667 static void sunionstoreCommand(redisClient *c);
668 static void sdiffCommand(redisClient *c);
669 static void sdiffstoreCommand(redisClient *c);
670 static void syncCommand(redisClient *c);
671 static void flushdbCommand(redisClient *c);
672 static void flushallCommand(redisClient *c);
673 static void sortCommand(redisClient *c);
674 static void lremCommand(redisClient *c);
675 static void rpoplpushcommand(redisClient *c); /* NOTE: lowercase 'c' in "command", unlike its siblings; cmdTable refers to this exact spelling */
676 static void infoCommand(redisClient *c);
677 static void mgetCommand(redisClient *c);
678 static void monitorCommand(redisClient *c);
679 static void expireCommand(redisClient *c);
680 static void expireatCommand(redisClient *c);
681 static void getsetCommand(redisClient *c);
682 static void ttlCommand(redisClient *c);
683 static void slaveofCommand(redisClient *c);
684 static void debugCommand(redisClient *c);
685 static void msetCommand(redisClient *c);
686 static void msetnxCommand(redisClient *c);
687 static void zaddCommand(redisClient *c);
688 static void zincrbyCommand(redisClient *c);
689 static void zrangeCommand(redisClient *c);
690 static void zrangebyscoreCommand(redisClient *c);
691 static void zcountCommand(redisClient *c);
692 static void zrevrangeCommand(redisClient *c);
693 static void zcardCommand(redisClient *c);
694 static void zremCommand(redisClient *c);
695 static void zscoreCommand(redisClient *c);
696 static void zremrangebyscoreCommand(redisClient *c);
697 static void multiCommand(redisClient *c);
698 static void execCommand(redisClient *c);
699 static void discardCommand(redisClient *c);
700 static void blpopCommand(redisClient *c);
701 static void brpopCommand(redisClient *c);
702 static void appendCommand(redisClient *c);
703 static void substrCommand(redisClient *c);
704 static void zrankCommand(redisClient *c);
705 static void zrevrankCommand(redisClient *c);
706 static void hsetCommand(redisClient *c);
707 static void hgetCommand(redisClient *c);
708 static void hmsetCommand(redisClient *c);
709 static void hmgetCommand(redisClient *c);
710 static void hdelCommand(redisClient *c);
711 static void hlenCommand(redisClient *c);
712 static void zremrangebyrankCommand(redisClient *c);
713 static void zunionCommand(redisClient *c);
714 static void zinterCommand(redisClient *c);
715 static void hkeysCommand(redisClient *c);
716 static void hvalsCommand(redisClient *c);
717 static void hgetallCommand(redisClient *c);
718 static void hexistsCommand(redisClient *c);
719 static void configCommand(redisClient *c);
720 static void hincrbyCommand(redisClient *c);
721 static void subscribeCommand(redisClient *c);
722 static void unsubscribeCommand(redisClient *c);
723 static void psubscribeCommand(redisClient *c);
724 static void punsubscribeCommand(redisClient *c);
725 static void publishCommand(redisClient *c);
726
727 /*================================= Globals ================================= */
728
729 /* Global vars */
730 static struct redisServer server; /* server global state; has static storage duration, so it starts out zero-initialized */
731 static struct redisCommand cmdTable[] = {
732 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
734 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
735 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
738 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
739 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
742 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
754 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
755 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
758 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
767 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
768 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
775 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
776 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
799 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
800 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
812 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
820 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
831 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
836 {NULL,NULL,0,0,NULL,0,0,0}
837 };
838
839 /*============================ Utility functions ============================ */
840
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *       any run of bytes, including an empty one
 *   ?       exactly one byte
 *   [...]   one byte out of a class: "a-z" is a range, a leading '^'
 *           negates the class, a backslash quotes the next byte. A class
 *           that is not closed by ']' ends with the pattern.
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero, literal bytes, class members and ranges are
 * compared after tolower(); bytes quoted with a backslash inside a class
 * are always compared exactly.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer has to be NUL terminated: no byte at or after
 * pattern[patternLen] / string[stringLen] is ever read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        /* Everything but '*' consumes exactly one byte of the string, so
         * it can't match once the string is exhausted. Bailing out here
         * also keeps stringLen from ever becoming negative. */
        if (pattern[0] != '*' && stringLen <= 0)
            return 0; /* no match */

        switch (pattern[0]) {
        case '*':
            /* Collapse a run of '*' without peeking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * "consume one pattern byte" code after the switch
                     * leaves us exactly at the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Quoted member. A backslash that is the very last
                     * byte of the pattern is handled as a plain member
                     * by the final branch instead. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may still match. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
963
/* Glob-match two NUL terminated strings: measures both with strlen() and
 * hands the work to stringmatchlen(), returning its result unchanged. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
967
968 static void redisLog(int level, const char *fmt, ...) {
969 va_list ap;
970 FILE *fp;
971
972 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
973 if (!fp) return;
974
975 va_start(ap, fmt);
976 if (level >= server.verbosity) {
977 char *c = ".-*#";
978 char buf[64];
979 time_t now;
980
981 now = time(NULL);
982 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
983 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
984 vfprintf(fp, fmt, ap);
985 fprintf(fp,"\n");
986 fflush(fp);
987 }
988 va_end(ap);
989
990 if (server.logfile) fclose(fp);
991 }
992
993 /*====================== Hash table type implementation ==================== */
994
995 /* This is a hash table type that uses the SDS dynamic strings library as
996 * keys and redis objects as values (objects can hold SDS strings,
997 * lists, sets). */
998
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    zfree(val);
    DICT_NOTUSED(privdata);
}
1004
1005 static void dictListDestructor(void *privdata, void *val)
1006 {
1007 DICT_NOTUSED(privdata);
1008 listRelease((list*)val);
1009 }
1010
1011 static int sdsDictKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013 {
1014 int l1,l2;
1015 DICT_NOTUSED(privdata);
1016
1017 l1 = sdslen((sds)key1);
1018 l2 = sdslen((sds)key2);
1019 if (l1 != l2) return 0;
1020 return memcmp(key1, key2, l1) == 0;
1021 }
1022
/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Swapped out keys have their value set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1030
1031 static int dictObjKeyCompare(void *privdata, const void *key1,
1032 const void *key2)
1033 {
1034 const robj *o1 = key1, *o2 = key2;
1035 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1036 }
1037
1038 static unsigned int dictObjHash(const void *key) {
1039 const robj *o = key;
1040 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1041 }
1042
1043 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1044 const void *key2)
1045 {
1046 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1047 int cmp;
1048
1049 if (o1->encoding == REDIS_ENCODING_INT &&
1050 o2->encoding == REDIS_ENCODING_INT &&
1051 o1->ptr == o2->ptr) return 1;
1052
1053 o1 = getDecodedObject(o1);
1054 o2 = getDecodedObject(o2);
1055 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1056 decrRefCount(o1);
1057 decrRefCount(o2);
1058 return cmp;
1059 }
1060
1061 static unsigned int dictEncObjHash(const void *key) {
1062 robj *o = (robj*) key;
1063
1064 if (o->encoding == REDIS_ENCODING_RAW) {
1065 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1066 } else {
1067 if (o->encoding == REDIS_ENCODING_INT) {
1068 char buf[32];
1069 int len;
1070
1071 len = snprintf(buf,32,"%ld",(long)o->ptr);
1072 return dictGenHashFunction((unsigned char*)buf, len);
1073 } else {
1074 unsigned int hash;
1075
1076 o = getDecodedObject(o);
1077 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1078 decrRefCount(o);
1079 return hash;
1080 }
1081 }
1082 }
1083
1084 /* Sets type and expires */
1085 static dictType setDictType = {
1086 dictEncObjHash, /* hash function */
1087 NULL, /* key dup */
1088 NULL, /* val dup */
1089 dictEncObjKeyCompare, /* key compare */
1090 dictRedisObjectDestructor, /* key destructor */
1091 NULL /* val destructor */
1092 };
1093
1094 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1095 static dictType zsetDictType = {
1096 dictEncObjHash, /* hash function */
1097 NULL, /* key dup */
1098 NULL, /* val dup */
1099 dictEncObjKeyCompare, /* key compare */
1100 dictRedisObjectDestructor, /* key destructor */
1101 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1102 };
1103
1104 /* Db->dict */
1105 static dictType dbDictType = {
1106 dictObjHash, /* hash function */
1107 NULL, /* key dup */
1108 NULL, /* val dup */
1109 dictObjKeyCompare, /* key compare */
1110 dictRedisObjectDestructor, /* key destructor */
1111 dictRedisObjectDestructor /* val destructor */
1112 };
1113
1114 /* Db->expires */
1115 static dictType keyptrDictType = {
1116 dictObjHash, /* hash function */
1117 NULL, /* key dup */
1118 NULL, /* val dup */
1119 dictObjKeyCompare, /* key compare */
1120 dictRedisObjectDestructor, /* key destructor */
1121 NULL /* val destructor */
1122 };
1123
1124 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1125 static dictType hashDictType = {
1126 dictEncObjHash, /* hash function */
1127 NULL, /* key dup */
1128 NULL, /* val dup */
1129 dictEncObjKeyCompare, /* key compare */
1130 dictRedisObjectDestructor, /* key destructor */
1131 dictRedisObjectDestructor /* val destructor */
1132 };
1133
1134 /* Keylist hash table type has unencoded redis objects as keys and
1135 * lists as values. It's used for blocking operations (BLPOP) and to
1136 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1137 static dictType keylistDictType = {
1138 dictObjHash, /* hash function */
1139 NULL, /* key dup */
1140 NULL, /* val dup */
1141 dictObjKeyCompare, /* key compare */
1142 dictRedisObjectDestructor, /* key destructor */
1143 dictListDestructor /* val destructor */
1144 };
1145
/* Forward declaration of version(), which is defined elsewhere in this
 * file. Spelled with (void) so that it is a real prototype: an empty
 * parameter list would leave the arguments unchecked. */
static void version(void);
1147
1148 /* ========================= Random utility functions ======================= */
1149
1150 /* Redis generally does not try to recover from out of memory conditions
1151 * when allocating objects or strings, it is not clear if it will be possible
1152 * to report this condition to the client since the networking layer itself
1153 * is based on heap allocation for send buffers, so we simply abort.
1154 * At least the code will be simpler to read... */
1155 static void oom(const char *msg) {
1156 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1157 sleep(1);
1158 abort();
1159 }
1160
1161 /* ====================== Redis server networking stuff ===================== */
1162 static void closeTimedoutClients(void) {
1163 redisClient *c;
1164 listNode *ln;
1165 time_t now = time(NULL);
1166 listIter li;
1167
1168 listRewind(server.clients,&li);
1169 while ((ln = listNext(&li)) != NULL) {
1170 c = listNodeValue(ln);
1171 if (server.maxidletime &&
1172 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1173 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1174 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1175 listLength(c->pubsub_patterns) == 0 &&
1176 (now - c->lastinteraction > server.maxidletime))
1177 {
1178 redisLog(REDIS_VERBOSE,"Closing idle client");
1179 freeClient(c);
1180 } else if (c->flags & REDIS_BLOCKED) {
1181 if (c->blockingto != 0 && c->blockingto < now) {
1182 addReply(c,shared.nullmultibulk);
1183 unblockClientWaitingData(c);
1184 }
1185 }
1186 }
1187 }
1188
1189 static int htNeedsResize(dict *dict) {
1190 long long size, used;
1191
1192 size = dictSlots(dict);
1193 used = dictSize(dict);
1194 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1195 (used*100/size < REDIS_HT_MINFILL));
1196 }
1197
1198 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1199 * we resize the hash table to save memory */
1200 static void tryResizeHashTables(void) {
1201 int j;
1202
1203 for (j = 0; j < server.dbnum; j++) {
1204 if (htNeedsResize(server.db[j].dict)) {
1205 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1206 dictResize(server.db[j].dict);
1207 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1208 }
1209 if (htNeedsResize(server.db[j].expires))
1210 dictResize(server.db[j].expires);
1211 }
1212 }
1213
1214 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1215 void backgroundSaveDoneHandler(int statloc) {
1216 int exitcode = WEXITSTATUS(statloc);
1217 int bysignal = WIFSIGNALED(statloc);
1218
1219 if (!bysignal && exitcode == 0) {
1220 redisLog(REDIS_NOTICE,
1221 "Background saving terminated with success");
1222 server.dirty = 0;
1223 server.lastsave = time(NULL);
1224 } else if (!bysignal && exitcode != 0) {
1225 redisLog(REDIS_WARNING, "Background saving error");
1226 } else {
1227 redisLog(REDIS_WARNING,
1228 "Background saving terminated by signal %d", WTERMSIG(statloc));
1229 rdbRemoveTempFile(server.bgsavechildpid);
1230 }
1231 server.bgsavechildpid = -1;
1232 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1233 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1234 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1235 }
1236
1237 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1238 * Handle this. */
1239 void backgroundRewriteDoneHandler(int statloc) {
1240 int exitcode = WEXITSTATUS(statloc);
1241 int bysignal = WIFSIGNALED(statloc);
1242
1243 if (!bysignal && exitcode == 0) {
1244 int fd;
1245 char tmpfile[256];
1246
1247 redisLog(REDIS_NOTICE,
1248 "Background append only file rewriting terminated with success");
1249 /* Now it's time to flush the differences accumulated by the parent */
1250 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1251 fd = open(tmpfile,O_WRONLY|O_APPEND);
1252 if (fd == -1) {
1253 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1254 goto cleanup;
1255 }
1256 /* Flush our data... */
1257 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1258 (signed) sdslen(server.bgrewritebuf)) {
1259 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1260 close(fd);
1261 goto cleanup;
1262 }
1263 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1264 /* Now our work is to rename the temp file into the stable file. And
1265 * switch the file descriptor used by the server for append only. */
1266 if (rename(tmpfile,server.appendfilename) == -1) {
1267 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1268 close(fd);
1269 goto cleanup;
1270 }
1271 /* Mission completed... almost */
1272 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1273 if (server.appendfd != -1) {
1274 /* If append only is actually enabled... */
1275 close(server.appendfd);
1276 server.appendfd = fd;
1277 fsync(fd);
1278 server.appendseldb = -1; /* Make sure it will issue SELECT */
1279 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1280 } else {
1281 /* If append only is disabled we just generate a dump in this
1282 * format. Why not? */
1283 close(fd);
1284 }
1285 } else if (!bysignal && exitcode != 0) {
1286 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1287 } else {
1288 redisLog(REDIS_WARNING,
1289 "Background append only file rewriting terminated by signal %d",
1290 WTERMSIG(statloc));
1291 }
1292 cleanup:
1293 sdsfree(server.bgrewritebuf);
1294 server.bgrewritebuf = sdsempty();
1295 aofRemoveTempFile(server.bgrewritechildpid);
1296 server.bgrewritechildpid = -1;
1297 }
1298
1299 /* This function is called once a background process of some kind terminates,
1300 * as we want to avoid resizing the hash tables when there is a child in order
1301 * to play well with copy-on-write (otherwise when a resize happens lots of
1302 * memory pages are copied). The goal of this function is to update the ability
1303 * for dict.c to resize the hash tables accordingly to the fact we have o not
1304 * running childs. */
1305 static void updateDictResizePolicy(void) {
1306 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1307 dictEnableResize();
1308 else
1309 dictDisableResize();
1310 }
1311
1312 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1313 int j, loops = server.cronloops++;
1314 REDIS_NOTUSED(eventLoop);
1315 REDIS_NOTUSED(id);
1316 REDIS_NOTUSED(clientData);
1317
1318 /* We take a cached value of the unix time in the global state because
1319 * with virtual memory and aging there is to store the current time
1320 * in objects at every object access, and accuracy is not needed.
1321 * To access a global var is faster than calling time(NULL) */
1322 server.unixtime = time(NULL);
1323
1324 /* Show some info about non-empty databases */
1325 for (j = 0; j < server.dbnum; j++) {
1326 long long size, used, vkeys;
1327
1328 size = dictSlots(server.db[j].dict);
1329 used = dictSize(server.db[j].dict);
1330 vkeys = dictSize(server.db[j].expires);
1331 if (!(loops % 50) && (used || vkeys)) {
1332 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1333 /* dictPrintStats(server.dict); */
1334 }
1335 }
1336
1337 /* We don't want to resize the hash tables while a bacground saving
1338 * is in progress: the saving child is created using fork() that is
1339 * implemented with a copy-on-write semantic in most modern systems, so
1340 * if we resize the HT while there is the saving child at work actually
1341 * a lot of memory movements in the parent will cause a lot of pages
1342 * copied. */
1343 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1344 !(loops % 10))
1345 {
1346 tryResizeHashTables();
1347 }
1348
1349 /* Show information about connected clients */
1350 if (!(loops % 50)) {
1351 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1352 listLength(server.clients)-listLength(server.slaves),
1353 listLength(server.slaves),
1354 zmalloc_used_memory());
1355 }
1356
1357 /* Close connections of timedout clients */
1358 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1359 closeTimedoutClients();
1360
1361 /* Check if a background saving or AOF rewrite in progress terminated */
1362 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1363 int statloc;
1364 pid_t pid;
1365
1366 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1367 if (pid == server.bgsavechildpid) {
1368 backgroundSaveDoneHandler(statloc);
1369 } else {
1370 backgroundRewriteDoneHandler(statloc);
1371 }
1372 updateDictResizePolicy();
1373 }
1374 } else {
1375 /* If there is not a background saving in progress check if
1376 * we have to save now */
1377 time_t now = time(NULL);
1378 for (j = 0; j < server.saveparamslen; j++) {
1379 struct saveparam *sp = server.saveparams+j;
1380
1381 if (server.dirty >= sp->changes &&
1382 now-server.lastsave > sp->seconds) {
1383 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1384 sp->changes, sp->seconds);
1385 rdbSaveBackground(server.dbfilename);
1386 break;
1387 }
1388 }
1389 }
1390
1391 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1392 * will use few CPU cycles if there are few expiring keys, otherwise
1393 * it will get more aggressive to avoid that too much memory is used by
1394 * keys that can be removed from the keyspace. */
1395 for (j = 0; j < server.dbnum; j++) {
1396 int expired;
1397 redisDb *db = server.db+j;
1398
1399 /* Continue to expire if at the end of the cycle more than 25%
1400 * of the keys were expired. */
1401 do {
1402 long num = dictSize(db->expires);
1403 time_t now = time(NULL);
1404
1405 expired = 0;
1406 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1407 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1408 while (num--) {
1409 dictEntry *de;
1410 time_t t;
1411
1412 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1413 t = (time_t) dictGetEntryVal(de);
1414 if (now > t) {
1415 deleteKey(db,dictGetEntryKey(de));
1416 expired++;
1417 server.stat_expiredkeys++;
1418 }
1419 }
1420 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1421 }
1422
1423 /* Swap a few keys on disk if we are over the memory limit and VM
1424 * is enbled. Try to free objects from the free list first. */
1425 if (vmCanSwapOut()) {
1426 while (server.vm_enabled && zmalloc_used_memory() >
1427 server.vm_max_memory)
1428 {
1429 int retval;
1430
1431 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1432 retval = (server.vm_max_threads == 0) ?
1433 vmSwapOneObjectBlocking() :
1434 vmSwapOneObjectThreaded();
1435 if (retval == REDIS_ERR && !(loops % 300) &&
1436 zmalloc_used_memory() >
1437 (server.vm_max_memory+server.vm_max_memory/10))
1438 {
1439 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1440 }
1441 /* Note that when using threade I/O we free just one object,
1442 * because anyway when the I/O thread in charge to swap this
1443 * object out will finish, the handler of completed jobs
1444 * will try to swap more objects if we are still out of memory. */
1445 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1446 }
1447 }
1448
1449 /* Check if we should connect to a MASTER */
1450 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1451 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1452 if (syncWithMaster() == REDIS_OK) {
1453 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1454 }
1455 }
1456 return 100;
1457 }
1458
1459 /* This function gets called every time Redis is entering the
1460 * main loop of the event driven library, that is, before to sleep
1461 * for ready file descriptors. */
1462 static void beforeSleep(struct aeEventLoop *eventLoop) {
1463 REDIS_NOTUSED(eventLoop);
1464
1465 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1466 listIter li;
1467 listNode *ln;
1468
1469 listRewind(server.io_ready_clients,&li);
1470 while((ln = listNext(&li))) {
1471 redisClient *c = ln->value;
1472 struct redisCommand *cmd;
1473
1474 /* Resume the client. */
1475 listDelNode(server.io_ready_clients,ln);
1476 c->flags &= (~REDIS_IO_WAIT);
1477 server.vm_blocked_clients--;
1478 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1479 readQueryFromClient, c);
1480 cmd = lookupCommand(c->argv[0]->ptr);
1481 assert(cmd != NULL);
1482 call(c,cmd);
1483 resetClient(c);
1484 /* There may be more data to process in the input buffer. */
1485 if (c->querybuf && sdslen(c->querybuf) > 0)
1486 processInputBuffer(c);
1487 }
1488 }
1489 }
1490
1491 static void createSharedObjects(void) {
1492 int j;
1493
1494 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1495 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1496 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1497 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1498 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1499 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1500 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1501 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1502 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1503 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1504 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1505 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1506 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1507 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1508 "-ERR no such key\r\n"));
1509 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1510 "-ERR syntax error\r\n"));
1511 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1512 "-ERR source and destination objects are the same\r\n"));
1513 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1514 "-ERR index out of range\r\n"));
1515 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1516 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1517 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1518 shared.select0 = createStringObject("select 0\r\n",10);
1519 shared.select1 = createStringObject("select 1\r\n",10);
1520 shared.select2 = createStringObject("select 2\r\n",10);
1521 shared.select3 = createStringObject("select 3\r\n",10);
1522 shared.select4 = createStringObject("select 4\r\n",10);
1523 shared.select5 = createStringObject("select 5\r\n",10);
1524 shared.select6 = createStringObject("select 6\r\n",10);
1525 shared.select7 = createStringObject("select 7\r\n",10);
1526 shared.select8 = createStringObject("select 8\r\n",10);
1527 shared.select9 = createStringObject("select 9\r\n",10);
1528 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1529 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1530 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1531 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1532 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1533 shared.mbulk3 = createStringObject("*3\r\n",4);
1534 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1535 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1536 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1537 }
1538 }
1539
1540 static void appendServerSaveParams(time_t seconds, int changes) {
1541 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1542 server.saveparams[server.saveparamslen].seconds = seconds;
1543 server.saveparams[server.saveparamslen].changes = changes;
1544 server.saveparamslen++;
1545 }
1546
1547 static void resetServerSaveParams() {
1548 zfree(server.saveparams);
1549 server.saveparams = NULL;
1550 server.saveparamslen = 0;
1551 }
1552
1553 static void initServerConfig() {
1554 server.dbnum = REDIS_DEFAULT_DBNUM;
1555 server.port = REDIS_SERVERPORT;
1556 server.verbosity = REDIS_VERBOSE;
1557 server.maxidletime = REDIS_MAXIDLETIME;
1558 server.saveparams = NULL;
1559 server.logfile = NULL; /* NULL = log on standard output */
1560 server.bindaddr = NULL;
1561 server.glueoutputbuf = 1;
1562 server.daemonize = 0;
1563 server.appendonly = 0;
1564 server.appendfsync = APPENDFSYNC_ALWAYS;
1565 server.lastfsync = time(NULL);
1566 server.appendfd = -1;
1567 server.appendseldb = -1; /* Make sure the first time will not match */
1568 server.pidfile = zstrdup("/var/run/redis.pid");
1569 server.dbfilename = zstrdup("dump.rdb");
1570 server.appendfilename = zstrdup("appendonly.aof");
1571 server.requirepass = NULL;
1572 server.shareobjects = 0;
1573 server.rdbcompression = 1;
1574 server.maxclients = 0;
1575 server.blpop_blocked_clients = 0;
1576 server.maxmemory = 0;
1577 server.vm_enabled = 0;
1578 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1579 server.vm_page_size = 256; /* 256 bytes per page */
1580 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1581 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1582 server.vm_max_threads = 4;
1583 server.vm_blocked_clients = 0;
1584 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1585 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1586
1587 resetServerSaveParams();
1588
1589 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1590 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1591 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1592 /* Replication related */
1593 server.isslave = 0;
1594 server.masterauth = NULL;
1595 server.masterhost = NULL;
1596 server.masterport = 6379;
1597 server.master = NULL;
1598 server.replstate = REDIS_REPL_NONE;
1599
1600 /* Double constants initialization */
1601 R_Zero = 0.0;
1602 R_PosInf = 1.0/R_Zero;
1603 R_NegInf = -1.0/R_Zero;
1604 R_Nan = R_Zero/R_Zero;
1605 }
1606
1607 static void initServer() {
1608 int j;
1609
1610 signal(SIGHUP, SIG_IGN);
1611 signal(SIGPIPE, SIG_IGN);
1612 setupSigSegvAction();
1613
1614 server.devnull = fopen("/dev/null","w");
1615 if (server.devnull == NULL) {
1616 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1617 exit(1);
1618 }
1619 server.clients = listCreate();
1620 server.slaves = listCreate();
1621 server.monitors = listCreate();
1622 server.objfreelist = listCreate();
1623 createSharedObjects();
1624 server.el = aeCreateEventLoop();
1625 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1626 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1627 if (server.fd == -1) {
1628 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1629 exit(1);
1630 }
1631 for (j = 0; j < server.dbnum; j++) {
1632 server.db[j].dict = dictCreate(&dbDictType,NULL);
1633 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1634 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1635 if (server.vm_enabled)
1636 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1637 server.db[j].id = j;
1638 }
1639 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1640 server.pubsub_patterns = listCreate();
1641 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1642 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1643 server.cronloops = 0;
1644 server.bgsavechildpid = -1;
1645 server.bgrewritechildpid = -1;
1646 server.bgrewritebuf = sdsempty();
1647 server.lastsave = time(NULL);
1648 server.dirty = 0;
1649 server.stat_numcommands = 0;
1650 server.stat_numconnections = 0;
1651 server.stat_expiredkeys = 0;
1652 server.stat_starttime = time(NULL);
1653 server.unixtime = time(NULL);
1654 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1655 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1656 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1657
1658 if (server.appendonly) {
1659 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1660 if (server.appendfd == -1) {
1661 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1662 strerror(errno));
1663 exit(1);
1664 }
1665 }
1666
1667 if (server.vm_enabled) vmInit();
1668 }
1669
1670 /* Empty the whole database */
1671 static long long emptyDb() {
1672 int j;
1673 long long removed = 0;
1674
1675 for (j = 0; j < server.dbnum; j++) {
1676 removed += dictSize(server.db[j].dict);
1677 dictEmpty(server.db[j].dict);
1678 dictEmpty(server.db[j].expires);
1679 }
1680 return removed;
1681 }
1682
/* Translate a "yes" / "no" string into an integer.
 * The comparison is case insensitive. Returns 1 for "yes", 0 for "no"
 * and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1688
1689 /* I agree, this is a very rudimental way to load a configuration...
1690 will improve later if the config gets more complex */
1691 static void loadServerConfig(char *filename) {
1692 FILE *fp;
1693 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1694 int linenum = 0;
1695 sds line = NULL;
1696
1697 if (filename[0] == '-' && filename[1] == '\0')
1698 fp = stdin;
1699 else {
1700 if ((fp = fopen(filename,"r")) == NULL) {
1701 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1702 exit(1);
1703 }
1704 }
1705
1706 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1707 sds *argv;
1708 int argc, j;
1709
1710 linenum++;
1711 line = sdsnew(buf);
1712 line = sdstrim(line," \t\r\n");
1713
1714 /* Skip comments and blank lines*/
1715 if (line[0] == '#' || line[0] == '\0') {
1716 sdsfree(line);
1717 continue;
1718 }
1719
1720 /* Split into arguments */
1721 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1722 sdstolower(argv[0]);
1723
1724 /* Execute config directives */
1725 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1726 server.maxidletime = atoi(argv[1]);
1727 if (server.maxidletime < 0) {
1728 err = "Invalid timeout value"; goto loaderr;
1729 }
1730 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1731 server.port = atoi(argv[1]);
1732 if (server.port < 1 || server.port > 65535) {
1733 err = "Invalid port"; goto loaderr;
1734 }
1735 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1736 server.bindaddr = zstrdup(argv[1]);
1737 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1738 int seconds = atoi(argv[1]);
1739 int changes = atoi(argv[2]);
1740 if (seconds < 1 || changes < 0) {
1741 err = "Invalid save parameters"; goto loaderr;
1742 }
1743 appendServerSaveParams(seconds,changes);
1744 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1745 if (chdir(argv[1]) == -1) {
1746 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1747 argv[1], strerror(errno));
1748 exit(1);
1749 }
1750 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1751 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1752 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1753 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1754 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1755 else {
1756 err = "Invalid log level. Must be one of debug, notice, warning";
1757 goto loaderr;
1758 }
1759 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1760 FILE *logfp;
1761
1762 server.logfile = zstrdup(argv[1]);
1763 if (!strcasecmp(server.logfile,"stdout")) {
1764 zfree(server.logfile);
1765 server.logfile = NULL;
1766 }
1767 if (server.logfile) {
1768 /* Test if we are able to open the file. The server will not
1769 * be able to abort just for this problem later... */
1770 logfp = fopen(server.logfile,"a");
1771 if (logfp == NULL) {
1772 err = sdscatprintf(sdsempty(),
1773 "Can't open the log file: %s", strerror(errno));
1774 goto loaderr;
1775 }
1776 fclose(logfp);
1777 }
1778 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1779 server.dbnum = atoi(argv[1]);
1780 if (server.dbnum < 1) {
1781 err = "Invalid number of databases"; goto loaderr;
1782 }
1783 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1784 loadServerConfig(argv[1]);
1785 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1786 server.maxclients = atoi(argv[1]);
1787 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1788 server.maxmemory = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1790 server.masterhost = sdsnew(argv[1]);
1791 server.masterport = atoi(argv[2]);
1792 server.replstate = REDIS_REPL_CONNECT;
1793 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1794 server.masterauth = zstrdup(argv[1]);
1795 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1796 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1797 err = "argument must be 'yes' or 'no'"; goto loaderr;
1798 }
1799 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1800 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1801 err = "argument must be 'yes' or 'no'"; goto loaderr;
1802 }
1803 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1804 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1805 err = "argument must be 'yes' or 'no'"; goto loaderr;
1806 }
1807 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1808 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1809 err = "argument must be 'yes' or 'no'"; goto loaderr;
1810 }
1811 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1812 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1813 err = "argument must be 'yes' or 'no'"; goto loaderr;
1814 }
1815 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1816 if (!strcasecmp(argv[1],"no")) {
1817 server.appendfsync = APPENDFSYNC_NO;
1818 } else if (!strcasecmp(argv[1],"always")) {
1819 server.appendfsync = APPENDFSYNC_ALWAYS;
1820 } else if (!strcasecmp(argv[1],"everysec")) {
1821 server.appendfsync = APPENDFSYNC_EVERYSEC;
1822 } else {
1823 err = "argument must be 'no', 'always' or 'everysec'";
1824 goto loaderr;
1825 }
1826 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1827 server.requirepass = zstrdup(argv[1]);
1828 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1829 zfree(server.pidfile);
1830 server.pidfile = zstrdup(argv[1]);
1831 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1832 zfree(server.dbfilename);
1833 server.dbfilename = zstrdup(argv[1]);
1834 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1835 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1836 err = "argument must be 'yes' or 'no'"; goto loaderr;
1837 }
1838 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1839 zfree(server.vm_swap_file);
1840 server.vm_swap_file = zstrdup(argv[1]);
1841 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1842 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1843 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1844 server.vm_page_size = strtoll(argv[1], NULL, 10);
1845 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1846 server.vm_pages = strtoll(argv[1], NULL, 10);
1847 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1848 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1849 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1850 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1851 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1852 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1853 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1854 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1855 } else {
1856 err = "Bad directive or wrong number of arguments"; goto loaderr;
1857 }
1858 for (j = 0; j < argc; j++)
1859 sdsfree(argv[j]);
1860 zfree(argv);
1861 sdsfree(line);
1862 }
1863 if (fp != stdin) fclose(fp);
1864 return;
1865
1866 loaderr:
1867 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1868 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1869 fprintf(stderr, ">>> '%s'\n", line);
1870 fprintf(stderr, "%s\n", err);
1871 exit(1);
1872 }
1873
1874 static void freeClientArgv(redisClient *c) {
1875 int j;
1876
1877 for (j = 0; j < c->argc; j++)
1878 decrRefCount(c->argv[j]);
1879 for (j = 0; j < c->mbargc; j++)
1880 decrRefCount(c->mbargv[j]);
1881 c->argc = 0;
1882 c->mbargc = 0;
1883 }
1884
1885 static void freeClient(redisClient *c) {
1886 listNode *ln;
1887
1888 /* Note that if the client we are freeing is blocked into a blocking
1889 * call, we have to set querybuf to NULL *before* to call
1890 * unblockClientWaitingData() to avoid processInputBuffer() will get
1891 * called. Also it is important to remove the file events after
1892 * this, because this call adds the READABLE event. */
1893 sdsfree(c->querybuf);
1894 c->querybuf = NULL;
1895 if (c->flags & REDIS_BLOCKED)
1896 unblockClientWaitingData(c);
1897
1898 /* Unsubscribe from all the pubsub channels */
1899 pubsubUnsubscribeAllChannels(c,0);
1900 pubsubUnsubscribeAllPatterns(c,0);
1901 dictRelease(c->pubsub_channels);
1902 listRelease(c->pubsub_patterns);
1903 /* Obvious cleanup */
1904 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1905 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1906 listRelease(c->reply);
1907 freeClientArgv(c);
1908 close(c->fd);
1909 /* Remove from the list of clients */
1910 ln = listSearchKey(server.clients,c);
1911 redisAssert(ln != NULL);
1912 listDelNode(server.clients,ln);
1913 /* Remove from the list of clients waiting for swapped keys */
1914 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1915 ln = listSearchKey(server.io_ready_clients,c);
1916 if (ln) {
1917 listDelNode(server.io_ready_clients,ln);
1918 server.vm_blocked_clients--;
1919 }
1920 }
1921 while (server.vm_enabled && listLength(c->io_keys)) {
1922 ln = listFirst(c->io_keys);
1923 dontWaitForSwappedKey(c,ln->value);
1924 }
1925 listRelease(c->io_keys);
1926 /* Master/slave cleanup */
1927 if (c->flags & REDIS_SLAVE) {
1928 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1929 close(c->repldbfd);
1930 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1931 ln = listSearchKey(l,c);
1932 redisAssert(ln != NULL);
1933 listDelNode(l,ln);
1934 }
1935 if (c->flags & REDIS_MASTER) {
1936 server.master = NULL;
1937 server.replstate = REDIS_REPL_CONNECT;
1938 }
1939 /* Release memory */
1940 zfree(c->argv);
1941 zfree(c->mbargv);
1942 freeClientMultiState(c);
1943 zfree(c);
1944 }
1945
1946 #define GLUEREPLY_UP_TO (1024)
1947 static void glueReplyBuffersIfNeeded(redisClient *c) {
1948 int copylen = 0;
1949 char buf[GLUEREPLY_UP_TO];
1950 listNode *ln;
1951 listIter li;
1952 robj *o;
1953
1954 listRewind(c->reply,&li);
1955 while((ln = listNext(&li))) {
1956 int objlen;
1957
1958 o = ln->value;
1959 objlen = sdslen(o->ptr);
1960 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1961 memcpy(buf+copylen,o->ptr,objlen);
1962 copylen += objlen;
1963 listDelNode(c->reply,ln);
1964 } else {
1965 if (copylen == 0) return;
1966 break;
1967 }
1968 }
1969 /* Now the output buffer is empty, add the new single element */
1970 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1971 listAddNodeHead(c->reply,o);
1972 }
1973
1974 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1975 redisClient *c = privdata;
1976 int nwritten = 0, totwritten = 0, objlen;
1977 robj *o;
1978 REDIS_NOTUSED(el);
1979 REDIS_NOTUSED(mask);
1980
1981 /* Use writev() if we have enough buffers to send */
1982 if (!server.glueoutputbuf &&
1983 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1984 !(c->flags & REDIS_MASTER))
1985 {
1986 sendReplyToClientWritev(el, fd, privdata, mask);
1987 return;
1988 }
1989
1990 while(listLength(c->reply)) {
1991 if (server.glueoutputbuf && listLength(c->reply) > 1)
1992 glueReplyBuffersIfNeeded(c);
1993
1994 o = listNodeValue(listFirst(c->reply));
1995 objlen = sdslen(o->ptr);
1996
1997 if (objlen == 0) {
1998 listDelNode(c->reply,listFirst(c->reply));
1999 continue;
2000 }
2001
2002 if (c->flags & REDIS_MASTER) {
2003 /* Don't reply to a master */
2004 nwritten = objlen - c->sentlen;
2005 } else {
2006 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2007 if (nwritten <= 0) break;
2008 }
2009 c->sentlen += nwritten;
2010 totwritten += nwritten;
2011 /* If we fully sent the object on head go to the next one */
2012 if (c->sentlen == objlen) {
2013 listDelNode(c->reply,listFirst(c->reply));
2014 c->sentlen = 0;
2015 }
2016 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2017 * bytes, in a single threaded server it's a good idea to serve
2018 * other clients as well, even if a very large request comes from
2019 * super fast link that is always able to accept data (in real world
2020 * scenario think about 'KEYS *' against the loopback interfae) */
2021 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2022 }
2023 if (nwritten == -1) {
2024 if (errno == EAGAIN) {
2025 nwritten = 0;
2026 } else {
2027 redisLog(REDIS_VERBOSE,
2028 "Error writing to client: %s", strerror(errno));
2029 freeClient(c);
2030 return;
2031 }
2032 }
2033 if (totwritten > 0) c->lastinteraction = time(NULL);
2034 if (listLength(c->reply) == 0) {
2035 c->sentlen = 0;
2036 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2037 }
2038 }
2039
2040 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2041 {
2042 redisClient *c = privdata;
2043 int nwritten = 0, totwritten = 0, objlen, willwrite;
2044 robj *o;
2045 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2046 int offset, ion = 0;
2047 REDIS_NOTUSED(el);
2048 REDIS_NOTUSED(mask);
2049
2050 listNode *node;
2051 while (listLength(c->reply)) {
2052 offset = c->sentlen;
2053 ion = 0;
2054 willwrite = 0;
2055
2056 /* fill-in the iov[] array */
2057 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2058 o = listNodeValue(node);
2059 objlen = sdslen(o->ptr);
2060
2061 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2062 break;
2063
2064 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2065 break; /* no more iovecs */
2066
2067 iov[ion].iov_base = ((char*)o->ptr) + offset;
2068 iov[ion].iov_len = objlen - offset;
2069 willwrite += objlen - offset;
2070 offset = 0; /* just for the first item */
2071 ion++;
2072 }
2073
2074 if(willwrite == 0)
2075 break;
2076
2077 /* write all collected blocks at once */
2078 if((nwritten = writev(fd, iov, ion)) < 0) {
2079 if (errno != EAGAIN) {
2080 redisLog(REDIS_VERBOSE,
2081 "Error writing to client: %s", strerror(errno));
2082 freeClient(c);
2083 return;
2084 }
2085 break;
2086 }
2087
2088 totwritten += nwritten;
2089 offset = c->sentlen;
2090
2091 /* remove written robjs from c->reply */
2092 while (nwritten && listLength(c->reply)) {
2093 o = listNodeValue(listFirst(c->reply));
2094 objlen = sdslen(o->ptr);
2095
2096 if(nwritten >= objlen - offset) {
2097 listDelNode(c->reply, listFirst(c->reply));
2098 nwritten -= objlen - offset;
2099 c->sentlen = 0;
2100 } else {
2101 /* partial write */
2102 c->sentlen += nwritten;
2103 break;
2104 }
2105 offset = 0;
2106 }
2107 }
2108
2109 if (totwritten > 0)
2110 c->lastinteraction = time(NULL);
2111
2112 if (listLength(c->reply) == 0) {
2113 c->sentlen = 0;
2114 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2115 }
2116 }
2117
2118 static struct redisCommand *lookupCommand(char *name) {
2119 int j = 0;
2120 while(cmdTable[j].name != NULL) {
2121 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2122 j++;
2123 }
2124 return NULL;
2125 }
2126
2127 /* resetClient prepare the client to process the next command */
2128 static void resetClient(redisClient *c) {
2129 freeClientArgv(c);
2130 c->bulklen = -1;
2131 c->multibulk = 0;
2132 }
2133
2134 /* Call() is the core of Redis execution of a command */
2135 static void call(redisClient *c, struct redisCommand *cmd) {
2136 long long dirty;
2137
2138 dirty = server.dirty;
2139 cmd->proc(c);
2140 dirty = server.dirty-dirty;
2141
2142 if (server.appendonly && dirty)
2143 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2144 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2145 listLength(server.slaves))
2146 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2147 if (listLength(server.monitors))
2148 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2149 server.stat_numcommands++;
2150 }
2151
2152 /* If this function gets called we already read a whole
2153 * command, argments are in the client argv/argc fields.
2154 * processCommand() execute the command or prepare the
2155 * server for a bulk read from the client.
2156 *
2157 * If 1 is returned the client is still alive and valid and
2158 * and other operations can be performed by the caller. Otherwise
2159 * if 0 is returned the client was destroied (i.e. after QUIT). */
2160 static int processCommand(redisClient *c) {
2161 struct redisCommand *cmd;
2162
2163 /* Free some memory if needed (maxmemory setting) */
2164 if (server.maxmemory) freeMemoryIfNeeded();
2165
2166 /* Handle the multi bulk command type. This is an alternative protocol
2167 * supported by Redis in order to receive commands that are composed of
2168 * multiple binary-safe "bulk" arguments. The latency of processing is
2169 * a bit higher but this allows things like multi-sets, so if this
2170 * protocol is used only for MSET and similar commands this is a big win. */
2171 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2172 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2173 if (c->multibulk <= 0) {
2174 resetClient(c);
2175 return 1;
2176 } else {
2177 decrRefCount(c->argv[c->argc-1]);
2178 c->argc--;
2179 return 1;
2180 }
2181 } else if (c->multibulk) {
2182 if (c->bulklen == -1) {
2183 if (((char*)c->argv[0]->ptr)[0] != '$') {
2184 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2185 resetClient(c);
2186 return 1;
2187 } else {
2188 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2189 decrRefCount(c->argv[0]);
2190 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2191 c->argc--;
2192 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2193 resetClient(c);
2194 return 1;
2195 }
2196 c->argc--;
2197 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2198 return 1;
2199 }
2200 } else {
2201 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2202 c->mbargv[c->mbargc] = c->argv[0];
2203 c->mbargc++;
2204 c->argc--;
2205 c->multibulk--;
2206 if (c->multibulk == 0) {
2207 robj **auxargv;
2208 int auxargc;
2209
2210 /* Here we need to swap the multi-bulk argc/argv with the
2211 * normal argc/argv of the client structure. */
2212 auxargv = c->argv;
2213 c->argv = c->mbargv;
2214 c->mbargv = auxargv;
2215
2216 auxargc = c->argc;
2217 c->argc = c->mbargc;
2218 c->mbargc = auxargc;
2219
2220 /* We need to set bulklen to something different than -1
2221 * in order for the code below to process the command without
2222 * to try to read the last argument of a bulk command as
2223 * a special argument. */
2224 c->bulklen = 0;
2225 /* continue below and process the command */
2226 } else {
2227 c->bulklen = -1;
2228 return 1;
2229 }
2230 }
2231 }
2232 /* -- end of multi bulk commands processing -- */
2233
2234 /* The QUIT command is handled as a special case. Normal command
2235 * procs are unable to close the client connection safely */
2236 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2237 freeClient(c);
2238 return 0;
2239 }
2240
2241 /* Now lookup the command and check ASAP about trivial error conditions
2242 * such wrong arity, bad command name and so forth. */
2243 cmd = lookupCommand(c->argv[0]->ptr);
2244 if (!cmd) {
2245 addReplySds(c,
2246 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2247 (char*)c->argv[0]->ptr));
2248 resetClient(c);
2249 return 1;
2250 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2251 (c->argc < -cmd->arity)) {
2252 addReplySds(c,
2253 sdscatprintf(sdsempty(),
2254 "-ERR wrong number of arguments for '%s' command\r\n",
2255 cmd->name));
2256 resetClient(c);
2257 return 1;
2258 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2259 /* This is a bulk command, we have to read the last argument yet. */
2260 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2261
2262 decrRefCount(c->argv[c->argc-1]);
2263 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2264 c->argc--;
2265 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2266 resetClient(c);
2267 return 1;
2268 }
2269 c->argc--;
2270 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2271 /* It is possible that the bulk read is already in the
2272 * buffer. Check this condition and handle it accordingly.
2273 * This is just a fast path, alternative to call processInputBuffer().
2274 * It's a good idea since the code is small and this condition
2275 * happens most of the times. */
2276 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2277 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2278 c->argc++;
2279 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2280 } else {
2281 /* Otherwise return... there is to read the last argument
2282 * from the socket. */
2283 return 1;
2284 }
2285 }
2286 /* Let's try to encode the bulk object to save space. */
2287 if (cmd->flags & REDIS_CMD_BULK)
2288 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2289
2290 /* Check if the user is authenticated */
2291 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2292 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2293 resetClient(c);
2294 return 1;
2295 }
2296
2297 /* Handle the maxmemory directive */
2298 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2299 zmalloc_used_memory() > server.maxmemory)
2300 {
2301 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2302 resetClient(c);
2303 return 1;
2304 }
2305
2306 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2307 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2308 &&
2309 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2310 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2311 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2312 resetClient(c);
2313 return 1;
2314 }
2315
2316 /* Exec the command */
2317 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2318 queueMultiCommand(c,cmd);
2319 addReply(c,shared.queued);
2320 } else {
2321 if (server.vm_enabled && server.vm_max_threads > 0 &&
2322 blockClientOnSwappedKeys(cmd,c)) return 1;
2323 call(c,cmd);
2324 }
2325
2326 /* Prepare the client for the next command */
2327 resetClient(c);
2328 return 1;
2329 }
2330
2331 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2332 listNode *ln;
2333 listIter li;
2334 int outc = 0, j;
2335 robj **outv;
2336 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2337 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2338 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2339 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2340 robj *lenobj;
2341
2342 if (argc <= REDIS_STATIC_ARGS) {
2343 outv = static_outv;
2344 } else {
2345 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2346 }
2347
2348 lenobj = createObject(REDIS_STRING,
2349 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2350 lenobj->refcount = 0;
2351 outv[outc++] = lenobj;
2352 for (j = 0; j < argc; j++) {
2353 lenobj = createObject(REDIS_STRING,
2354 sdscatprintf(sdsempty(),"$%lu\r\n",
2355 (unsigned long) stringObjectLen(argv[j])));
2356 lenobj->refcount = 0;
2357 outv[outc++] = lenobj;
2358 outv[outc++] = argv[j];
2359 outv[outc++] = shared.crlf;
2360 }
2361
2362 /* Increment all the refcounts at start and decrement at end in order to
2363 * be sure to free objects if there is no slave in a replication state
2364 * able to be feed with commands */
2365 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2366 listRewind(slaves,&li);
2367 while((ln = listNext(&li))) {
2368 redisClient *slave = ln->value;
2369
2370 /* Don't feed slaves that are still waiting for BGSAVE to start */
2371 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2372
2373 /* Feed all the other slaves, MONITORs and so on */
2374 if (slave->slaveseldb != dictid) {
2375 robj *selectcmd;
2376
2377 switch(dictid) {
2378 case 0: selectcmd = shared.select0; break;
2379 case 1: selectcmd = shared.select1; break;
2380 case 2: selectcmd = shared.select2; break;
2381 case 3: selectcmd = shared.select3; break;
2382 case 4: selectcmd = shared.select4; break;
2383 case 5: selectcmd = shared.select5; break;
2384 case 6: selectcmd = shared.select6; break;
2385 case 7: selectcmd = shared.select7; break;
2386 case 8: selectcmd = shared.select8; break;
2387 case 9: selectcmd = shared.select9; break;
2388 default:
2389 selectcmd = createObject(REDIS_STRING,
2390 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2391 selectcmd->refcount = 0;
2392 break;
2393 }
2394 addReply(slave,selectcmd);
2395 slave->slaveseldb = dictid;
2396 }
2397 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2398 }
2399 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2400 if (outv != static_outv) zfree(outv);
2401 }
2402
2403 static void processInputBuffer(redisClient *c) {
2404 again:
2405 /* Before to process the input buffer, make sure the client is not
2406 * waitig for a blocking operation such as BLPOP. Note that the first
2407 * iteration the client is never blocked, otherwise the processInputBuffer
2408 * would not be called at all, but after the execution of the first commands
2409 * in the input buffer the client may be blocked, and the "goto again"
2410 * will try to reiterate. The following line will make it return asap. */
2411 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2412 if (c->bulklen == -1) {
2413 /* Read the first line of the query */
2414 char *p = strchr(c->querybuf,'\n');
2415 size_t querylen;
2416
2417 if (p) {
2418 sds query, *argv;
2419 int argc, j;
2420
2421 query = c->querybuf;
2422 c->querybuf = sdsempty();
2423 querylen = 1+(p-(query));
2424 if (sdslen(query) > querylen) {
2425 /* leave data after the first line of the query in the buffer */
2426 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2427 }
2428 *p = '\0'; /* remove "\n" */
2429 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2430 sdsupdatelen(query);
2431
2432 /* Now we can split the query in arguments */
2433 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2434 sdsfree(query);
2435
2436 if (c->argv) zfree(c->argv);
2437 c->argv = zmalloc(sizeof(robj*)*argc);
2438
2439 for (j = 0; j < argc; j++) {
2440 if (sdslen(argv[j])) {
2441 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2442 c->argc++;
2443 } else {
2444 sdsfree(argv[j]);
2445 }
2446 }
2447 zfree(argv);
2448 if (c->argc) {
2449 /* Execute the command. If the client is still valid
2450 * after processCommand() return and there is something
2451 * on the query buffer try to process the next command. */
2452 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2453 } else {
2454 /* Nothing to process, argc == 0. Just process the query
2455 * buffer if it's not empty or return to the caller */
2456 if (sdslen(c->querybuf)) goto again;
2457 }
2458 return;
2459 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2460 redisLog(REDIS_VERBOSE, "Client protocol error");
2461 freeClient(c);
2462 return;
2463 }
2464 } else {
2465 /* Bulk read handling. Note that if we are at this point
2466 the client already sent a command terminated with a newline,
2467 we are reading the bulk data that is actually the last
2468 argument of the command. */
2469 int qbl = sdslen(c->querybuf);
2470
2471 if (c->bulklen <= qbl) {
2472 /* Copy everything but the final CRLF as final argument */
2473 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2474 c->argc++;
2475 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2476 /* Process the command. If the client is still valid after
2477 * the processing and there is more data in the buffer
2478 * try to parse it. */
2479 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2480 return;
2481 }
2482 }
2483 }
2484
2485 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2486 redisClient *c = (redisClient*) privdata;
2487 char buf[REDIS_IOBUF_LEN];
2488 int nread;
2489 REDIS_NOTUSED(el);
2490 REDIS_NOTUSED(mask);
2491
2492 nread = read(fd, buf, REDIS_IOBUF_LEN);
2493 if (nread == -1) {
2494 if (errno == EAGAIN) {
2495 nread = 0;
2496 } else {
2497 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2498 freeClient(c);
2499 return;
2500 }
2501 } else if (nread == 0) {
2502 redisLog(REDIS_VERBOSE, "Client closed connection");
2503 freeClient(c);
2504 return;
2505 }
2506 if (nread) {
2507 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2508 c->lastinteraction = time(NULL);
2509 } else {
2510 return;
2511 }
2512 processInputBuffer(c);
2513 }
2514
2515 static int selectDb(redisClient *c, int id) {
2516 if (id < 0 || id >= server.dbnum)
2517 return REDIS_ERR;
2518 c->db = &server.db[id];
2519 return REDIS_OK;
2520 }
2521
2522 static void *dupClientReplyValue(void *o) {
2523 incrRefCount((robj*)o);
2524 return o;
2525 }
2526
/* List match method comparing two objects with compareStringObjects().
 * Returns 1 when the comparison yields zero, otherwise 0. */
static int listMatchObjects(void *a, void *b) {
    return !compareStringObjects(a,b);
}
2530
2531 static redisClient *createClient(int fd) {
2532 redisClient *c = zmalloc(sizeof(*c));
2533
2534 anetNonBlock(NULL,fd);
2535 anetTcpNoDelay(NULL,fd);
2536 if (!c) return NULL;
2537 selectDb(c,0);
2538 c->fd = fd;
2539 c->querybuf = sdsempty();
2540 c->argc = 0;
2541 c->argv = NULL;
2542 c->bulklen = -1;
2543 c->multibulk = 0;
2544 c->mbargc = 0;
2545 c->mbargv = NULL;
2546 c->sentlen = 0;
2547 c->flags = 0;
2548 c->lastinteraction = time(NULL);
2549 c->authenticated = 0;
2550 c->replstate = REDIS_REPL_NONE;
2551 c->reply = listCreate();
2552 listSetFreeMethod(c->reply,decrRefCount);
2553 listSetDupMethod(c->reply,dupClientReplyValue);
2554 c->blockingkeys = NULL;
2555 c->blockingkeysnum = 0;
2556 c->io_keys = listCreate();
2557 listSetFreeMethod(c->io_keys,decrRefCount);
2558 c->pubsub_channels = dictCreate(&setDictType,NULL);
2559 c->pubsub_patterns = listCreate();
2560 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2561 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2562 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2563 readQueryFromClient, c) == AE_ERR) {
2564 freeClient(c);
2565 return NULL;
2566 }
2567 listAddNodeTail(server.clients,c);
2568 initClientMultiState(c);
2569 return c;
2570 }
2571
2572 static void addReply(redisClient *c, robj *obj) {
2573 if (listLength(c->reply) == 0 &&
2574 (c->replstate == REDIS_REPL_NONE ||
2575 c->replstate == REDIS_REPL_ONLINE) &&
2576 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2577 sendReplyToClient, c) == AE_ERR) return;
2578
2579 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2580 obj = dupStringObject(obj);
2581 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2582 }
2583 listAddNodeTail(c->reply,getDecodedObject(obj));
2584 }
2585
2586 static void addReplySds(redisClient *c, sds s) {
2587 robj *o = createObject(REDIS_STRING,s);
2588 addReply(c,o);
2589 decrRefCount(o);
2590 }
2591
2592 static void addReplyDouble(redisClient *c, double d) {
2593 char buf[128];
2594
2595 snprintf(buf,sizeof(buf),"%.17g",d);
2596 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2597 (unsigned long) strlen(buf),buf));
2598 }
2599
2600 static void addReplyLong(redisClient *c, long l) {
2601 char buf[128];
2602 size_t len;
2603
2604 if (l == 0) {
2605 addReply(c,shared.czero);
2606 return;
2607 } else if (l == 1) {
2608 addReply(c,shared.cone);
2609 return;
2610 }
2611 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2612 addReplySds(c,sdsnewlen(buf,len));
2613 }
2614
2615 static void addReplyLongLong(redisClient *c, long long ll) {
2616 char buf[128];
2617 size_t len;
2618
2619 if (ll == 0) {
2620 addReply(c,shared.czero);
2621 return;
2622 } else if (ll == 1) {
2623 addReply(c,shared.cone);
2624 return;
2625 }
2626 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2627 addReplySds(c,sdsnewlen(buf,len));
2628 }
2629
2630 static void addReplyUlong(redisClient *c, unsigned long ul) {
2631 char buf[128];
2632 size_t len;
2633
2634 if (ul == 0) {
2635 addReply(c,shared.czero);
2636 return;
2637 } else if (ul == 1) {
2638 addReply(c,shared.cone);
2639 return;
2640 }
2641 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2642 addReplySds(c,sdsnewlen(buf,len));
2643 }
2644
2645 static void addReplyBulkLen(redisClient *c, robj *obj) {
2646 size_t len;
2647
2648 if (obj->encoding == REDIS_ENCODING_RAW) {
2649 len = sdslen(obj->ptr);
2650 } else {
2651 long n = (long)obj->ptr;
2652
2653 /* Compute how many bytes will take this integer as a radix 10 string */
2654 len = 1;
2655 if (n < 0) {
2656 len++;
2657 n = -n;
2658 }
2659 while((n = n/10) != 0) {
2660 len++;
2661 }
2662 }
2663 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2664 }
2665
2666 static void addReplyBulk(redisClient *c, robj *obj) {
2667 addReplyBulkLen(c,obj);
2668 addReply(c,obj);
2669 addReply(c,shared.crlf);
2670 }
2671
2672 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2673 static void addReplyBulkCString(redisClient *c, char *s) {
2674 if (s == NULL) {
2675 addReply(c,shared.nullbulk);
2676 } else {
2677 robj *o = createStringObject(s,strlen(s));
2678 addReplyBulk(c,o);
2679 decrRefCount(o);
2680 }
2681 }
2682
2683 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2684 int cport, cfd;
2685 char cip[128];
2686 redisClient *c;
2687 REDIS_NOTUSED(el);
2688 REDIS_NOTUSED(mask);
2689 REDIS_NOTUSED(privdata);
2690
2691 cfd = anetAccept(server.neterr, fd, cip, &cport);
2692 if (cfd == AE_ERR) {
2693 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2694 return;
2695 }
2696 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2697 if ((c = createClient(cfd)) == NULL) {
2698 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2699 close(cfd); /* May be already closed, just ingore errors */
2700 return;
2701 }
2702 /* If maxclient directive is set and this is one client more... close the
2703 * connection. Note that we create the client instead to check before
2704 * for this condition, since now the socket is already set in nonblocking
2705 * mode and we can send an error for free using the Kernel I/O */
2706 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2707 char *err = "-ERR max number of clients reached\r\n";
2708
2709 /* That's a best effort error message, don't check write errors */
2710 if (write(c->fd,err,strlen(err)) == -1) {
2711 /* Nothing to do, Just to avoid the warning... */
2712 }
2713 freeClient(c);
2714 return;
2715 }
2716 server.stat_numconnections++;
2717 }
2718
2719 /* ======================= Redis objects implementation ===================== */
2720
2721 static robj *createObject(int type, void *ptr) {
2722 robj *o;
2723
2724 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2725 if (listLength(server.objfreelist)) {
2726 listNode *head = listFirst(server.objfreelist);
2727 o = listNodeValue(head);
2728 listDelNode(server.objfreelist,head);
2729 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2730 } else {
2731 if (server.vm_enabled) {
2732 pthread_mutex_unlock(&server.obj_freelist_mutex);
2733 o = zmalloc(sizeof(*o));
2734 } else {
2735 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2736 }
2737 }
2738 o->type = type;
2739 o->encoding = REDIS_ENCODING_RAW;
2740 o->ptr = ptr;
2741 o->refcount = 1;
2742 if (server.vm_enabled) {
2743 /* Note that this code may run in the context of an I/O thread
2744 * and accessing to server.unixtime in theory is an error
2745 * (no locks). But in practice this is safe, and even if we read
2746 * garbage Redis will not fail, as it's just a statistical info */
2747 o->vm.atime = server.unixtime;
2748 o->storage = REDIS_VM_MEMORY;
2749 }
2750 return o;
2751 }
2752
2753 static robj *createStringObject(char *ptr, size_t len) {
2754 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2755 }
2756
2757 static robj *dupStringObject(robj *o) {
2758 assert(o->encoding == REDIS_ENCODING_RAW);
2759 return createStringObject(o->ptr,sdslen(o->ptr));
2760 }
2761
2762 static robj *createListObject(void) {
2763 list *l = listCreate();
2764
2765 listSetFreeMethod(l,decrRefCount);
2766 return createObject(REDIS_LIST,l);
2767 }
2768
2769 static robj *createSetObject(void) {
2770 dict *d = dictCreate(&setDictType,NULL);
2771 return createObject(REDIS_SET,d);
2772 }
2773
2774 static robj *createHashObject(void) {
2775 /* All the Hashes start as zipmaps. Will be automatically converted
2776 * into hash tables if there are enough elements or big elements
2777 * inside. */
2778 unsigned char *zm = zipmapNew();
2779 robj *o = createObject(REDIS_HASH,zm);
2780 o->encoding = REDIS_ENCODING_ZIPMAP;
2781 return o;
2782 }
2783
2784 static robj *createZsetObject(void) {
2785 zset *zs = zmalloc(sizeof(*zs));
2786
2787 zs->dict = dictCreate(&zsetDictType,NULL);
2788 zs->zsl = zslCreate();
2789 return createObject(REDIS_ZSET,zs);
2790 }
2791
2792 static void freeStringObject(robj *o) {
2793 if (o->encoding == REDIS_ENCODING_RAW) {
2794 sdsfree(o->ptr);
2795 }
2796 }
2797
2798 static void freeListObject(robj *o) {
2799 listRelease((list*) o->ptr);
2800 }
2801
2802 static void freeSetObject(robj *o) {
2803 dictRelease((dict*) o->ptr);
2804 }
2805
2806 static void freeZsetObject(robj *o) {
2807 zset *zs = o->ptr;
2808
2809 dictRelease(zs->dict);
2810 zslFree(zs->zsl);
2811 zfree(zs);
2812 }
2813
2814 static void freeHashObject(robj *o) {
2815 switch (o->encoding) {
2816 case REDIS_ENCODING_HT:
2817 dictRelease((dict*) o->ptr);
2818 break;
2819 case REDIS_ENCODING_ZIPMAP:
2820 zfree(o->ptr);
2821 break;
2822 default:
2823 redisAssert(0);
2824 break;
2825 }
2826 }
2827
2828 static void incrRefCount(robj *o) {
2829 o->refcount++;
2830 }
2831
2832 static void decrRefCount(void *obj) {
2833 robj *o = obj;
2834
2835 /* Object is a key of a swapped out value, or in the process of being
2836 * loaded. */
2837 if (server.vm_enabled &&
2838 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2839 {
2840 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2841 redisAssert(o->type == REDIS_STRING);
2842 freeStringObject(o);
2843 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2844 pthread_mutex_lock(&server.obj_freelist_mutex);
2845 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2846 !listAddNodeHead(server.objfreelist,o))
2847 zfree(o);
2848 pthread_mutex_unlock(&server.obj_freelist_mutex);
2849 server.vm_stats_swapped_objects--;
2850 return;
2851 }
2852 /* Object is in memory, or in the process of being swapped out. */
2853 if (--(o->refcount) == 0) {
2854 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2855 vmCancelThreadedIOJob(obj);
2856 switch(o->type) {
2857 case REDIS_STRING: freeStringObject(o); break;
2858 case REDIS_LIST: freeListObject(o); break;
2859 case REDIS_SET: freeSetObject(o); break;
2860 case REDIS_ZSET: freeZsetObject(o); break;
2861 case REDIS_HASH: freeHashObject(o); break;
2862 default: redisAssert(0); break;
2863 }
2864 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2865 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2866 !listAddNodeHead(server.objfreelist,o))
2867 zfree(o);
2868 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2869 }
2870 }
2871
2872 static robj *lookupKey(redisDb *db, robj *key) {
2873 dictEntry *de = dictFind(db->dict,key);
2874 if (de) {
2875 robj *key = dictGetEntryKey(de);
2876 robj *val = dictGetEntryVal(de);
2877
2878 if (server.vm_enabled) {
2879 if (key->storage == REDIS_VM_MEMORY ||
2880 key->storage == REDIS_VM_SWAPPING)
2881 {
2882 /* If we were swapping the object out, stop it, this key
2883 * was requested. */
2884 if (key->storage == REDIS_VM_SWAPPING)
2885 vmCancelThreadedIOJob(key);
2886 /* Update the access time of the key for the aging algorithm. */
2887 key->vm.atime = server.unixtime;
2888 } else {
2889 int notify = (key->storage == REDIS_VM_LOADING);
2890
2891 /* Our value was swapped on disk. Bring it at home. */
2892 redisAssert(val == NULL);
2893 val = vmLoadObject(key);
2894 dictGetEntryVal(de) = val;
2895
2896 /* Clients blocked by the VM subsystem may be waiting for
2897 * this key... */
2898 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2899 }
2900 }
2901 return val;
2902 } else {
2903 return NULL;
2904 }
2905 }
2906
2907 static robj *lookupKeyRead(redisDb *db, robj *key) {
2908 expireIfNeeded(db,key);
2909 return lookupKey(db,key);
2910 }
2911
2912 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2913 deleteIfVolatile(db,key);
2914 return lookupKey(db,key);
2915 }
2916
2917 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2918 robj *o = lookupKeyRead(c->db, key);
2919 if (!o) addReply(c,reply);
2920 return o;
2921 }
2922
2923 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2924 robj *o = lookupKeyWrite(c->db, key);
2925 if (!o) addReply(c,reply);
2926 return o;
2927 }
2928
2929 static int checkType(redisClient *c, robj *o, int type) {
2930 if (o->type != type) {
2931 addReply(c,shared.wrongtypeerr);
2932 return 1;
2933 }
2934 return 0;
2935 }
2936
2937 static int deleteKey(redisDb *db, robj *key) {
2938 int retval;
2939
2940 /* We need to protect key from destruction: after the first dictDelete()
2941 * it may happen that 'key' is no longer valid if we don't increment
2942 * it's count. This may happen when we get the object reference directly
2943 * from the hash table with dictRandomKey() or dict iterators */
2944 incrRefCount(key);
2945 if (dictSize(db->expires)) dictDelete(db->expires,key);
2946 retval = dictDelete(db->dict,key);
2947 decrRefCount(key);
2948
2949 return retval == DICT_OK;
2950 }
2951
2952 /* Check if the nul-terminated string 's' can be represented by a long
2953 * (that is, is a number that fits into long without any other space or
2954 * character before or after the digits).
2955 *
2956 * If so, the function returns REDIS_OK and *longval is set to the value
2957 * of the number. Otherwise REDIS_ERR is returned */
2958 static int isStringRepresentableAsLong(sds s, long *longval) {
2959 char buf[32], *endptr;
2960 long value;
2961 int slen;
2962
2963 value = strtol(s, &endptr, 10);
2964 if (endptr[0] != '\0') return REDIS_ERR;
2965 slen = snprintf(buf,32,"%ld",value);
2966
2967 /* If the number converted back into a string is not identical
2968 * then it's not possible to encode the string as integer */
2969 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2970 if (longval) *longval = value;
2971 return REDIS_OK;
2972 }
2973
2974 /* Try to encode a string object in order to save space */
2975 static robj *tryObjectEncoding(robj *o) {
2976 long value;
2977 sds s = o->ptr;
2978
2979 if (o->encoding != REDIS_ENCODING_RAW)
2980 return o; /* Already encoded */
2981
2982 /* It's not safe to encode shared objects: shared objects can be shared
2983 * everywhere in the "object space" of Redis. Encoded objects can only
2984 * appear as "values" (and not, for instance, as keys) */
2985 if (o->refcount > 1) return o;
2986
2987 /* Currently we try to encode only strings */
2988 redisAssert(o->type == REDIS_STRING);
2989
2990 /* Check if we can represent this string as a long integer */
2991 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
2992
2993 /* Ok, this object can be encoded */
2994 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2995 decrRefCount(o);
2996 incrRefCount(shared.integers[value]);
2997 return shared.integers[value];
2998 } else {
2999 o->encoding = REDIS_ENCODING_INT;
3000 sdsfree(o->ptr);
3001 o->ptr = (void*) value;
3002 return o;
3003 }
3004 }
3005
3006 /* Get a decoded version of an encoded object (returned as a new object).
3007 * If the object is already raw-encoded just increment the ref count. */
3008 static robj *getDecodedObject(robj *o) {
3009 robj *dec;
3010
3011 if (o->encoding == REDIS_ENCODING_RAW) {
3012 incrRefCount(o);
3013 return o;
3014 }
3015 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3016 char buf[32];
3017
3018 snprintf(buf,32,"%ld",(long)o->ptr);
3019 dec = createStringObject(buf,strlen(buf));
3020 return dec;
3021 } else {
3022 redisAssert(1 != 1);
3023 }
3024 }
3025
3026 /* Compare two string objects via strcmp() or alike.
3027 * Note that the objects may be integer-encoded. In such a case we
3028 * use snprintf() to get a string representation of the numbers on the stack
3029 * and compare the strings, it's much faster than calling getDecodedObject().
3030 *
3031 * Important note: if objects are not integer encoded, but binary-safe strings,
3032 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3033 * binary safe. */
3034 static int compareStringObjects(robj *a, robj *b) {
3035 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3036 char bufa[128], bufb[128], *astr, *bstr;
3037 int bothsds = 1;
3038
3039 if (a == b) return 0;
3040 if (a->encoding != REDIS_ENCODING_RAW) {
3041 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3042 astr = bufa;
3043 bothsds = 0;
3044 } else {
3045 astr = a->ptr;
3046 }
3047 if (b->encoding != REDIS_ENCODING_RAW) {
3048 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3049 bstr = bufb;
3050 bothsds = 0;
3051 } else {
3052 bstr = b->ptr;
3053 }
3054 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3055 }
3056
3057 static size_t stringObjectLen(robj *o) {
3058 redisAssert(o->type == REDIS_STRING);
3059 if (o->encoding == REDIS_ENCODING_RAW) {
3060 return sdslen(o->ptr);
3061 } else {
3062 char buf[32];
3063
3064 return snprintf(buf,32,"%ld",(long)o->ptr);
3065 }
3066 }
3067
3068 static int getDoubleFromObject(redisClient *c, robj *o, double *value) {
3069 double parsedValue;
3070 char *eptr = NULL;
3071
3072 if (o && o->type != REDIS_STRING) {
3073 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3074 return REDIS_ERR;
3075 }
3076
3077 if (o == NULL)
3078 parsedValue = 0;
3079 else if (o->encoding == REDIS_ENCODING_RAW)
3080 parsedValue = strtod(o->ptr, &eptr);
3081 else if (o->encoding == REDIS_ENCODING_INT)
3082 parsedValue = (long)o->ptr;
3083 else
3084 redisAssert(1 != 1);
3085
3086 if (eptr != NULL && *eptr != '\0') {
3087 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3088 return REDIS_ERR;
3089 }
3090
3091 *value = parsedValue;
3092
3093 return REDIS_OK;
3094 }
3095
3096 static int getLongLongFromObject(redisClient *c, robj *o, long long *value) {
3097 long long parsedValue;
3098 char *eptr = NULL;
3099
3100 if (o && o->type != REDIS_STRING) {
3101 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3102 return REDIS_ERR;
3103 }
3104
3105 if (o == NULL)
3106 parsedValue = 0;
3107 else if (o->encoding == REDIS_ENCODING_RAW)
3108 parsedValue = strtoll(o->ptr, &eptr, 10);
3109 else if (o->encoding == REDIS_ENCODING_INT)
3110 parsedValue = (long)o->ptr;
3111 else
3112 redisAssert(1 != 1);
3113
3114 if (eptr != NULL && *eptr != '\0') {
3115 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3116 return REDIS_ERR;
3117 }
3118
3119 *value = parsedValue;
3120
3121 return REDIS_OK;
3122 }
3123
3124 static int getLongFromObject(redisClient *c, robj *o, long *value) {
3125 long long actualValue;
3126
3127 if (getLongLongFromObject(c, o, &actualValue) != REDIS_OK) return REDIS_ERR;
3128
3129 if (actualValue < LONG_MIN || actualValue > LONG_MAX) {
3130 addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
3131 return REDIS_ERR;
3132 }
3133
3134 *value = actualValue;
3135
3136 return REDIS_OK;
3137 }
3138
3139 /*============================ RDB saving/loading =========================== */
3140
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 when the
 * write fails. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3145
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 when the write fails. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,4,1,fp) == 0) ? -1 : 0;
}
3151
3152 /* check rdbLoadLen() comments for more info */
3153 static int rdbSaveLen(FILE *fp, uint32_t len) {
3154 unsigned char buf[2];
3155
3156 if (len < (1<<6)) {
3157 /* Save a 6 bit len */
3158 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3159 if (fwrite(buf,1,1,fp) == 0) return -1;
3160 } else if (len < (1<<14)) {
3161 /* Save a 14 bit len */
3162 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3163 buf[1] = len&0xFF;
3164 if (fwrite(buf,2,1,fp) == 0) return -1;
3165 } else {
3166 /* Save a 32 bit len */
3167 buf[0] = (REDIS_RDB_32BITLEN<<6);
3168 if (fwrite(buf,1,1,fp) == 0) return -1;
3169 len = htonl(len);
3170 if (fwrite(&len,4,1,fp) == 0) return -1;
3171 }
3172 return 0;
3173 }
3174
3175 /* String objects in the form "2391" "-100" without any space and with a
3176 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3177 * encoded as integers to save space */
3178 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3179 long long value;
3180 char *endptr, buf[32];
3181
3182 /* Check if it's possible to encode this value as a number */
3183 value = strtoll(s, &endptr, 10);
3184 if (endptr[0] != '\0') return 0;
3185 snprintf(buf,32,"%lld",value);
3186
3187 /* If the number converted back into a string is not identical
3188 * then it's not possible to encode the string as integer */
3189 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3190
3191 /* Finally check if it fits in our ranges */
3192 if (value >= -(1<<7) && value <= (1<<7)-1) {
3193 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3194 enc[1] = value&0xFF;
3195 return 2;
3196 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3197 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3198 enc[1] = value&0xFF;
3199 enc[2] = (value>>8)&0xFF;
3200 return 3;
3201 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3202 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3203 enc[1] = value&0xFF;
3204 enc[2] = (value>>8)&0xFF;
3205 enc[3] = (value>>16)&0xFF;
3206 enc[4] = (value>>24)&0xFF;
3207 return 5;
3208 } else {
3209 return 0;
3210 }
3211 }
3212
3213 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3214 size_t comprlen, outlen;
3215 unsigned char byte;
3216 void *out;
3217
3218 /* We require at least four bytes compression for this to be worth it */
3219 if (len <= 4) return 0;
3220 outlen = len-4;
3221 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3222 comprlen = lzf_compress(s, len, out, outlen);
3223 if (comprlen == 0) {
3224 zfree(out);
3225 return 0;
3226 }
3227 /* Data compressed! Let's save it on disk */
3228 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3229 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3230 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3231 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3232 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3233 zfree(out);
3234 return comprlen;
3235
3236 writeerr:
3237 zfree(out);
3238 return -1;
3239 }
3240
3241 /* Save a string objet as [len][data] on disk. If the object is a string
3242 * representation of an integer value we try to safe it in a special form */
3243 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3244 int enclen;
3245
3246 /* Try integer encoding */
3247 if (len <= 11) {
3248 unsigned char buf[5];
3249 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3250 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3251 return 0;
3252 }
3253 }
3254
3255 /* Try LZF compression - under 20 bytes it's unable to compress even
3256 * aaaaaaaaaaaaaaaaaa so skip it */
3257 if (server.rdbcompression && len > 20) {
3258 int retval;
3259
3260 retval = rdbSaveLzfStringObject(fp,s,len);
3261 if (retval == -1) return -1;
3262 if (retval > 0) return 0;
3263 /* retval == 0 means data can't be compressed, save the old way */
3264 }
3265
3266 /* Store verbatim */
3267 if (rdbSaveLen(fp,len) == -1) return -1;
3268 if (len && fwrite(s,len,1,fp) == 0) return -1;
3269 return 0;
3270 }
3271
3272 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3273 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3274 int retval;
3275
3276 /* Avoid incr/decr ref count business when possible.
3277 * This plays well with copy-on-write given that we are probably
3278 * in a child process (BGSAVE). Also this makes sure key objects
3279 * of swapped objects are not incRefCount-ed (an assert does not allow
3280 * this in order to avoid bugs) */
3281 if (obj->encoding != REDIS_ENCODING_RAW) {
3282 obj = getDecodedObject(obj);
3283 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3284 decrRefCount(obj);
3285 } else {
3286 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3287 }
3288 return retval;
3289 }
3290
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3317
3318 /* Save a Redis object. */
3319 static int rdbSaveObject(FILE *fp, robj *o) {
3320 if (o->type == REDIS_STRING) {
3321 /* Save a string value */
3322 if (rdbSaveStringObject(fp,o) == -1) return -1;
3323 } else if (o->type == REDIS_LIST) {
3324 /* Save a list value */
3325 list *list = o->ptr;
3326 listIter li;
3327 listNode *ln;
3328
3329 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3330 listRewind(list,&li);
3331 while((ln = listNext(&li))) {
3332 robj *eleobj = listNodeValue(ln);
3333
3334 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3335 }
3336 } else if (o->type == REDIS_SET) {
3337 /* Save a set value */
3338 dict *set = o->ptr;
3339 dictIterator *di = dictGetIterator(set);
3340 dictEntry *de;
3341
3342 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3343 while((de = dictNext(di)) != NULL) {
3344 robj *eleobj = dictGetEntryKey(de);
3345
3346 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3347 }
3348 dictReleaseIterator(di);
3349 } else if (o->type == REDIS_ZSET) {
3350 /* Save a set value */
3351 zset *zs = o->ptr;
3352 dictIterator *di = dictGetIterator(zs->dict);
3353 dictEntry *de;
3354
3355 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3356 while((de = dictNext(di)) != NULL) {
3357 robj *eleobj = dictGetEntryKey(de);
3358 double *score = dictGetEntryVal(de);
3359
3360 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3361 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3362 }
3363 dictReleaseIterator(di);
3364 } else if (o->type == REDIS_HASH) {
3365 /* Save a hash value */
3366 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3367 unsigned char *p = zipmapRewind(o->ptr);
3368 unsigned int count = zipmapLen(o->ptr);
3369 unsigned char *key, *val;
3370 unsigned int klen, vlen;
3371
3372 if (rdbSaveLen(fp,count) == -1) return -1;
3373 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3374 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3375 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3376 }
3377 } else {
3378 dictIterator *di = dictGetIterator(o->ptr);
3379 dictEntry *de;
3380
3381 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3382 while((de = dictNext(di)) != NULL) {
3383 robj *key = dictGetEntryKey(de);
3384 robj *val = dictGetEntryVal(de);
3385
3386 if (rdbSaveStringObject(fp,key) == -1) return -1;
3387 if (rdbSaveStringObject(fp,val) == -1) return -1;
3388 }
3389 dictReleaseIterator(di);
3390 }
3391 } else {
3392 redisAssert(0);
3393 }
3394 return 0;
3395 }
3396
3397 /* Return the length the object will have on disk if saved with
3398 * the rdbSaveObject() function. Currently we use a trick to get
3399 * this length with very little changes to the code. In the future
3400 * we could switch to a faster solution. */
3401 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3402 if (fp == NULL) fp = server.devnull;
3403 rewind(fp);
3404 assert(rdbSaveObject(fp,o) != 1);
3405 return ftello(fp);
3406 }
3407
3408 /* Return the number of pages required to save this object in the swap file */
3409 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3410 off_t bytes = rdbSavedObjectLen(o,fp);
3411
3412 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3413 }
3414
3415 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3416 static int rdbSave(char *filename) {
3417 dictIterator *di = NULL;
3418 dictEntry *de;
3419 FILE *fp;
3420 char tmpfile[256];
3421 int j;
3422 time_t now = time(NULL);
3423
3424 /* Wait for I/O therads to terminate, just in case this is a
3425 * foreground-saving, to avoid seeking the swap file descriptor at the
3426 * same time. */
3427 if (server.vm_enabled)
3428 waitEmptyIOJobsQueue();
3429
3430 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3431 fp = fopen(tmpfile,"w");
3432 if (!fp) {
3433 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3434 return REDIS_ERR;
3435 }
3436 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3437 for (j = 0; j < server.dbnum; j++) {
3438 redisDb *db = server.db+j;
3439 dict *d = db->dict;
3440 if (dictSize(d) == 0) continue;
3441 di = dictGetIterator(d);
3442 if (!di) {
3443 fclose(fp);
3444 return REDIS_ERR;
3445 }
3446
3447 /* Write the SELECT DB opcode */
3448 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3449 if (rdbSaveLen(fp,j) == -1) goto werr;
3450
3451 /* Iterate this DB writing every entry */
3452 while((de = dictNext(di)) != NULL) {
3453 robj *key = dictGetEntryKey(de);
3454 robj *o = dictGetEntryVal(de);
3455 time_t expiretime = getExpire(db,key);
3456
3457 /* Save the expire time */
3458 if (expiretime != -1) {
3459 /* If this key is already expired skip it */
3460 if (expiretime < now) continue;
3461 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3462 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3463 }
3464 /* Save the key and associated value. This requires special
3465 * handling if the value is swapped out. */
3466 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3467 key->storage == REDIS_VM_SWAPPING) {
3468 /* Save type, key, value */
3469 if (rdbSaveType(fp,o->type) == -1) goto werr;
3470 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3471 if (rdbSaveObject(fp,o) == -1) goto werr;
3472 } else {
3473 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3474 robj *po;
3475 /* Get a preview of the object in memory */
3476 po = vmPreviewObject(key);
3477 /* Save type, key, value */
3478 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3479 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3480 if (rdbSaveObject(fp,po) == -1) goto werr;
3481 /* Remove the loaded object from memory */
3482 decrRefCount(po);
3483 }
3484 }
3485 dictReleaseIterator(di);
3486 }
3487 /* EOF opcode */
3488 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3489
3490 /* Make sure data will not remain on the OS's output buffers */
3491 fflush(fp);
3492 fsync(fileno(fp));
3493 fclose(fp);
3494
3495 /* Use RENAME to make sure the DB file is changed atomically only
3496 * if the generate DB file is ok. */
3497 if (rename(tmpfile,filename) == -1) {
3498 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3499 unlink(tmpfile);
3500 return REDIS_ERR;
3501 }
3502 redisLog(REDIS_NOTICE,"DB saved on disk");
3503 server.dirty = 0;
3504 server.lastsave = time(NULL);
3505 return REDIS_OK;
3506
3507 werr:
3508 fclose(fp);
3509 unlink(tmpfile);
3510 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3511 if (di) dictReleaseIterator(di);
3512 return REDIS_ERR;
3513 }
3514
3515 static int rdbSaveBackground(char *filename) {
3516 pid_t childpid;
3517
3518 if (server.bgsavechildpid != -1) return REDIS_ERR;
3519 if (server.vm_enabled) waitEmptyIOJobsQueue();
3520 if ((childpid = fork()) == 0) {
3521 /* Child */
3522 if (server.vm_enabled) vmReopenSwapFile();
3523 close(server.fd);
3524 if (rdbSave(filename) == REDIS_OK) {
3525 _exit(0);
3526 } else {
3527 _exit(1);
3528 }
3529 } else {
3530 /* Parent */
3531 if (childpid == -1) {
3532 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3533 strerror(errno));
3534 return REDIS_ERR;
3535 }
3536 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3537 server.bgsavechildpid = childpid;
3538 updateDictResizePolicy();
3539 return REDIS_OK;
3540 }
3541 return REDIS_OK; /* unreached */
3542 }
3543
/* Remove the temporary dump file named after the process id 'childpid'
 * ("temp-<pid>.rdb", the same name rdbSave() builds from getpid()).
 * Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3550
/* Read one byte from 'fp' and return it as a type value in the range
 * 0-255. Returns -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    if (fread(&type,1,1,fp) != 0) return type;
    return -1;
}
3556
/* Read a 32 bit signed integer (in the byte order of the host) from 'fp'
 * and return it as a time_t. Returns -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,4,1,fp) != 0) return (time_t) t32;
    return -1;
}
3562
3563 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3564 * of this file for a description of how this are stored on disk.
3565 *
3566 * isencoded is set to 1 if the readed length is not actually a length but
3567 * an "encoding type", check the above comments for more info */
3568 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3569 unsigned char buf[2];
3570 uint32_t len;
3571 int type;
3572
3573 if (isencoded) *isencoded = 0;
3574 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3575 type = (buf[0]&0xC0)>>6;
3576 if (type == REDIS_RDB_6BITLEN) {
3577 /* Read a 6 bit len */
3578 return buf[0]&0x3F;
3579 } else if (type == REDIS_RDB_ENCVAL) {
3580 /* Read a 6 bit len encoding type */
3581 if (isencoded) *isencoded = 1;
3582 return buf[0]&0x3F;
3583 } else if (type == REDIS_RDB_14BITLEN) {
3584 /* Read a 14 bit len */
3585 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3586 return ((buf[0]&0x3F)<<8)|buf[1];
3587 } else {
3588 /* Read a 32 bit len */
3589 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3590 return ntohl(len);
3591 }
3592 }
3593
3594 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3595 unsigned char enc[4];
3596 long long val;
3597
3598 if (enctype == REDIS_RDB_ENC_INT8) {
3599 if (fread(enc,1,1,fp) == 0) return NULL;
3600 val = (signed char)enc[0];
3601 } else if (enctype == REDIS_RDB_ENC_INT16) {
3602 uint16_t v;
3603 if (fread(enc,2,1,fp) == 0) return NULL;
3604 v = enc[0]|(enc[1]<<8);
3605 val = (int16_t)v;
3606 } else if (enctype == REDIS_RDB_ENC_INT32) {
3607 uint32_t v;
3608 if (fread(enc,4,1,fp) == 0) return NULL;
3609 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3610 val = (int32_t)v;
3611 } else {
3612 val = 0; /* anti-warning */
3613 redisAssert(0);
3614 }
3615 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3616 }
3617
3618 static robj *rdbLoadLzfStringObject(FILE*fp) {
3619 unsigned int len, clen;
3620 unsigned char *c = NULL;
3621 sds val = NULL;
3622
3623 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3624 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3625 if ((c = zmalloc(clen)) == NULL) goto err;
3626 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3627 if (fread(c,clen,1,fp) == 0) goto err;
3628 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3629 zfree(c);
3630 return createObject(REDIS_STRING,val);
3631 err:
3632 zfree(c);
3633 sdsfree(val);
3634 return NULL;
3635 }
3636
3637 static robj *rdbLoadStringObject(FILE*fp) {
3638 int isencoded;
3639 uint32_t len;
3640 sds val;
3641
3642 len = rdbLoadLen(fp,&isencoded);
3643 if (isencoded) {
3644 switch(len) {
3645 case REDIS_RDB_ENC_INT8:
3646 case REDIS_RDB_ENC_INT16:
3647 case REDIS_RDB_ENC_INT32:
3648 return rdbLoadIntegerObject(fp,len);
3649 case REDIS_RDB_ENC_LZF:
3650 return rdbLoadLzfStringObject(fp);
3651 default:
3652 redisAssert(0);
3653 }
3654 }
3655
3656 if (len == REDIS_RDB_LENERR) return NULL;
3657 val = sdsnewlen(NULL,len);
3658 if (len && fread(val,len,1,fp) == 0) {
3659 sdsfree(val);
3660 return NULL;
3661 }
3662 return createObject(REDIS_STRING,val);
3663 }
3664
3665 /* For information about double serialization check rdbSaveDoubleValue() */
3666 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3667 char buf[128];
3668 unsigned char len;
3669
3670 if (fread(&len,1,1,fp) == 0) return -1;
3671 switch(len) {
3672 case 255: *val = R_NegInf; return 0;
3673 case 254: *val = R_PosInf; return 0;
3674 case 253: *val = R_Nan; return 0;
3675 default:
3676 if (fread(buf,len,1,fp) == 0) return -1;
3677 buf[len] = '\0';
3678 sscanf(buf, "%lg", val);
3679 return 0;
3680 }
3681 }
3682
3683 /* Load a Redis object of the specified type from the specified file.
3684 * On success a newly allocated object is returned, otherwise NULL. */
3685 static robj *rdbLoadObject(int type, FILE *fp) {
3686 robj *o;
3687
3688 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3689 if (type == REDIS_STRING) {
3690 /* Read string value */
3691 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3692 o = tryObjectEncoding(o);
3693 } else if (type == REDIS_LIST || type == REDIS_SET) {
3694 /* Read list/set value */
3695 uint32_t listlen;
3696
3697 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3698 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3699 /* It's faster to expand the dict to the right size asap in order
3700 * to avoid rehashing */
3701 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3702 dictExpand(o->ptr,listlen);
3703 /* Load every single element of the list/set */
3704 while(listlen--) {
3705 robj *ele;
3706
3707 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3708 ele = tryObjectEncoding(ele);
3709 if (type == REDIS_LIST) {
3710 listAddNodeTail((list*)o->ptr,ele);
3711 } else {
3712 dictAdd((dict*)o->ptr,ele,NULL);
3713 }
3714 }
3715 } else if (type == REDIS_ZSET) {
3716 /* Read list/set value */
3717 size_t zsetlen;
3718 zset *zs;
3719
3720 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3721 o = createZsetObject();
3722 zs = o->ptr;
3723 /* Load every single element of the list/set */
3724 while(zsetlen--) {
3725 robj *ele;
3726 double *score = zmalloc(sizeof(double));
3727
3728 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3729 ele = tryObjectEncoding(ele);
3730 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3731 dictAdd(zs->dict,ele,score);
3732 zslInsert(zs->zsl,*score,ele);
3733 incrRefCount(ele); /* added to skiplist */
3734 }
3735 } else if (type == REDIS_HASH) {
3736 size_t hashlen;
3737
3738 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3739 o = createHashObject();
3740 /* Too many entries? Use an hash table. */
3741 if (hashlen > server.hash_max_zipmap_entries)
3742 convertToRealHash(o);
3743 /* Load every key/value, then set it into the zipmap or hash
3744 * table, as needed. */
3745 while(hashlen--) {
3746 robj *key, *val;
3747
3748 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3749 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3750 /* If we are using a zipmap and there are too big values
3751 * the object is converted to real hash table encoding. */
3752 if (o->encoding != REDIS_ENCODING_HT &&
3753 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3754 sdslen(val->ptr) > server.hash_max_zipmap_value))
3755 {
3756 convertToRealHash(o);
3757 }
3758
3759 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3760 unsigned char *zm = o->ptr;
3761
3762 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3763 val->ptr,sdslen(val->ptr),NULL);
3764 o->ptr = zm;
3765 decrRefCount(key);
3766 decrRefCount(val);
3767 } else {
3768 key = tryObjectEncoding(key);
3769 val = tryObjectEncoding(val);
3770 dictAdd((dict*)o->ptr,key,val);
3771 }
3772 }
3773 } else {
3774 redisAssert(0);
3775 }
3776 return o;
3777 }
3778
3779 static int rdbLoad(char *filename) {
3780 FILE *fp;
3781 robj *keyobj = NULL;
3782 uint32_t dbid;
3783 int type, retval, rdbver;
3784 dict *d = server.db[0].dict;
3785 redisDb *db = server.db+0;
3786 char buf[1024];
3787 time_t expiretime = -1, now = time(NULL);
3788 long long loadedkeys = 0;
3789
3790 fp = fopen(filename,"r");
3791 if (!fp) return REDIS_ERR;
3792 if (fread(buf,9,1,fp) == 0) goto eoferr;
3793 buf[9] = '\0';
3794 if (memcmp(buf,"REDIS",5) != 0) {
3795 fclose(fp);
3796 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3797 return REDIS_ERR;
3798 }
3799 rdbver = atoi(buf+5);
3800 if (rdbver != 1) {
3801 fclose(fp);
3802 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3803 return REDIS_ERR;
3804 }
3805 while(1) {
3806 robj *o;
3807
3808 /* Read type. */
3809 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3810 if (type == REDIS_EXPIRETIME) {
3811 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3812 /* We read the time so we need to read the object type again */
3813 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3814 }
3815 if (type == REDIS_EOF) break;
3816 /* Handle SELECT DB opcode as a special case */
3817 if (type == REDIS_SELECTDB) {
3818 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3819 goto eoferr;
3820 if (dbid >= (unsigned)server.dbnum) {
3821 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3822 exit(1);
3823 }
3824 db = server.db+dbid;
3825 d = db->dict;
3826 continue;
3827 }
3828 /* Read key */
3829 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3830 /* Read value */
3831 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3832 /* Add the new object in the hash table */
3833 retval = dictAdd(d,keyobj,o);
3834 if (retval == DICT_ERR) {
3835 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3836 exit(1);
3837 }
3838 /* Set the expire time if needed */
3839 if (expiretime != -1) {
3840 setExpire(db,keyobj,expiretime);
3841 /* Delete this key if already expired */
3842 if (expiretime < now) deleteKey(db,keyobj);
3843 expiretime = -1;
3844 }
3845 keyobj = o = NULL;
3846 /* Handle swapping while loading big datasets when VM is on */
3847 loadedkeys++;
3848 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3849 while (zmalloc_used_memory() > server.vm_max_memory) {
3850 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3851 }
3852 }
3853 }
3854 fclose(fp);
3855 return REDIS_OK;
3856
3857 eoferr: /* unexpected end of file is handled here with a fatal exit */
3858 if (keyobj) decrRefCount(keyobj);
3859 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3860 exit(1);
3861 return REDIS_ERR; /* Just to avoid warning */
3862 }
3863
3864 /*================================== Commands =============================== */
3865
3866 static void authCommand(redisClient *c) {
3867 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3868 c->authenticated = 1;
3869 addReply(c,shared.ok);
3870 } else {
3871 c->authenticated = 0;
3872 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3873 }
3874 }
3875
3876 static void pingCommand(redisClient *c) {
3877 addReply(c,shared.pong);
3878 }
3879
3880 static void echoCommand(redisClient *c) {
3881 addReplyBulk(c,c->argv[1]);
3882 }
3883
3884 /*=================================== Strings =============================== */
3885
3886 static void setGenericCommand(redisClient *c, int nx) {
3887 int retval;
3888
3889 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3890 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3891 if (retval == DICT_ERR) {
3892 if (!nx) {
3893 /* If the key is about a swapped value, we want a new key object
3894 * to overwrite the old. So we delete the old key in the database.
3895 * This will also make sure that swap pages about the old object
3896 * will be marked as free. */
3897 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3898 incrRefCount(c->argv[1]);
3899 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3900 incrRefCount(c->argv[2]);
3901 } else {
3902 addReply(c,shared.czero);
3903 return;
3904 }
3905 } else {
3906 incrRefCount(c->argv[1]);
3907 incrRefCount(c->argv[2]);
3908 }
3909 server.dirty++;
3910 removeExpire(c->db,c->argv[1]);
3911 addReply(c, nx ? shared.cone : shared.ok);
3912 }
3913
3914 static void setCommand(redisClient *c) {
3915 setGenericCommand(c,0);
3916 }
3917
3918 static void setnxCommand(redisClient *c) {
3919 setGenericCommand(c,1);
3920 }
3921
3922 static int getGenericCommand(redisClient *c) {
3923 robj *o;
3924
3925 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3926 return REDIS_OK;
3927
3928 if (o->type != REDIS_STRING) {
3929 addReply(c,shared.wrongtypeerr);
3930 return REDIS_ERR;
3931 } else {
3932 addReplyBulk(c,o);
3933 return REDIS_OK;
3934 }
3935 }
3936
3937 static void getCommand(redisClient *c) {
3938 getGenericCommand(c);
3939 }
3940
3941 static void getsetCommand(redisClient *c) {
3942 if (getGenericCommand(c) == REDIS_ERR) return;
3943 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3944 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3945 } else {
3946 incrRefCount(c->argv[1]);
3947 }
3948 incrRefCount(c->argv[2]);
3949 server.dirty++;
3950 removeExpire(c->db,c->argv[1]);
3951 }
3952
3953 static void mgetCommand(redisClient *c) {
3954 int j;
3955
3956 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3957 for (j = 1; j < c->argc; j++) {
3958 robj *o = lookupKeyRead(c->db,c->argv[j]);
3959 if (o == NULL) {
3960 addReply(c,shared.nullbulk);
3961 } else {
3962 if (o->type != REDIS_STRING) {
3963 addReply(c,shared.nullbulk);
3964 } else {
3965 addReplyBulk(c,o);
3966 }
3967 }
3968 }
3969 }
3970
3971 static void msetGenericCommand(redisClient *c, int nx) {
3972 int j, busykeys = 0;
3973
3974 if ((c->argc % 2) == 0) {
3975 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3976 return;
3977 }
3978 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3979 * set nothing at all if at least one already key exists. */
3980 if (nx) {
3981 for (j = 1; j < c->argc; j += 2) {
3982 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3983 busykeys++;
3984 }
3985 }
3986 }
3987 if (busykeys) {
3988 addReply(c, shared.czero);
3989 return;
3990 }
3991
3992 for (j = 1; j < c->argc; j += 2) {
3993 int retval;
3994
3995 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
3996 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3997 if (retval == DICT_ERR) {
3998 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3999 incrRefCount(c->argv[j+1]);
4000 } else {
4001 incrRefCount(c->argv[j]);
4002 incrRefCount(c->argv[j+1]);
4003 }
4004 removeExpire(c->db,c->argv[j]);
4005 }
4006 server.dirty += (c->argc-1)/2;
4007 addReply(c, nx ? shared.cone : shared.ok);
4008 }
4009
4010 static void msetCommand(redisClient *c) {
4011 msetGenericCommand(c,0);
4012 }
4013
4014 static void msetnxCommand(redisClient *c) {
4015 msetGenericCommand(c,1);
4016 }
4017
4018 static void incrDecrCommand(redisClient *c, long long incr) {
4019 long long value;
4020 int retval;
4021 robj *o;
4022
4023 o = lookupKeyWrite(c->db,c->argv[1]);
4024
4025 if (getLongLongFromObject(c, o, &value) != REDIS_OK) return;
4026
4027 value += incr;
4028 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4029 o = tryObjectEncoding(o);
4030 retval = dictAdd(c->db->dict,c->argv[1],o);
4031 if (retval == DICT_ERR) {
4032 dictReplace(c->db->dict,c->argv[1],o);
4033 removeExpire(c->db,c->argv[1]);
4034 } else {
4035 incrRefCount(c->argv[1]);
4036 }
4037 server.dirty++;
4038 addReply(c,shared.colon);
4039 addReply(c,o);
4040 addReply(c,shared.crlf);
4041 }
4042
4043 static void incrCommand(redisClient *c) {
4044 incrDecrCommand(c,1);
4045 }
4046
4047 static void decrCommand(redisClient *c) {
4048 incrDecrCommand(c,-1);
4049 }
4050
4051 static void incrbyCommand(redisClient *c) {
4052 long long incr;
4053
4054 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4055
4056 incrDecrCommand(c,incr);
4057 }
4058
4059 static void decrbyCommand(redisClient *c) {
4060 long long incr;
4061
4062 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4063
4064 incrDecrCommand(c,-incr);
4065 }
4066
4067 static void appendCommand(redisClient *c) {
4068 int retval;
4069 size_t totlen;
4070 robj *o;
4071
4072 o = lookupKeyWrite(c->db,c->argv[1]);
4073 if (o == NULL) {
4074 /* Create the key */
4075 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4076 incrRefCount(c->argv[1]);
4077 incrRefCount(c->argv[2]);
4078 totlen = stringObjectLen(c->argv[2]);
4079 } else {
4080 dictEntry *de;
4081
4082 de = dictFind(c->db->dict,c->argv[1]);
4083 assert(de != NULL);
4084
4085 o = dictGetEntryVal(de);
4086 if (o->type != REDIS_STRING) {
4087 addReply(c,shared.wrongtypeerr);
4088 return;
4089 }
4090 /* If the object is specially encoded or shared we have to make
4091 * a copy */
4092 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4093 robj *decoded = getDecodedObject(o);
4094
4095 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4096 decrRefCount(decoded);
4097 dictReplace(c->db->dict,c->argv[1],o);
4098 }
4099 /* APPEND! */
4100 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4101 o->ptr = sdscatlen(o->ptr,
4102 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4103 } else {
4104 o->ptr = sdscatprintf(o->ptr, "%ld",
4105 (unsigned long) c->argv[2]->ptr);
4106 }
4107 totlen = sdslen(o->ptr);
4108 }
4109 server.dirty++;
4110 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4111 }
4112
4113 static void substrCommand(redisClient *c) {
4114 robj *o;
4115 long start = atoi(c->argv[2]->ptr);
4116 long end = atoi(c->argv[3]->ptr);
4117 size_t rangelen, strlen;
4118 sds range;
4119
4120 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4121 checkType(c,o,REDIS_STRING)) return;
4122
4123 o = getDecodedObject(o);
4124 strlen = sdslen(o->ptr);
4125
4126 /* convert negative indexes */
4127 if (start < 0) start = strlen+start;
4128 if (end < 0) end = strlen+end;
4129 if (start < 0) start = 0;
4130 if (end < 0) end = 0;
4131
4132 /* indexes sanity checks */
4133 if (start > end || (size_t)start >= strlen) {
4134 /* Out of range start or start > end result in null reply */
4135 addReply(c,shared.nullbulk);
4136 decrRefCount(o);
4137 return;
4138 }
4139 if ((size_t)end >= strlen) end = strlen-1;
4140 rangelen = (end-start)+1;
4141
4142 /* Return the result */
4143 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4144 range = sdsnewlen((char*)o->ptr+start,rangelen);
4145 addReplySds(c,range);
4146 addReply(c,shared.crlf);
4147 decrRefCount(o);
4148 }
4149
4150 /* ========================= Type agnostic commands ========================= */
4151
4152 static void delCommand(redisClient *c) {
4153 int deleted = 0, j;
4154
4155 for (j = 1; j < c->argc; j++) {
4156 if (deleteKey(c->db,c->argv[j])) {
4157 server.dirty++;
4158 deleted++;
4159 }
4160 }
4161 addReplyLong(c,deleted);
4162 }
4163
4164 static void existsCommand(redisClient *c) {
4165 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4166 }
4167
4168 static void selectCommand(redisClient *c) {
4169 int id = atoi(c->argv[1]->ptr);
4170
4171 if (selectDb(c,id) == REDIS_ERR) {
4172 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4173 } else {
4174 addReply(c,shared.ok);
4175 }
4176 }
4177
4178 static void randomkeyCommand(redisClient *c) {
4179 dictEntry *de;
4180
4181 while(1) {
4182 de = dictGetRandomKey(c->db->dict);
4183 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4184 }
4185 if (de == NULL) {
4186 addReply(c,shared.plus);
4187 addReply(c,shared.crlf);
4188 } else {
4189 addReply(c,shared.plus);
4190 addReply(c,dictGetEntryKey(de));
4191 addReply(c,shared.crlf);
4192 }
4193 }
4194
4195 static void keysCommand(redisClient *c) {
4196 dictIterator *di;
4197 dictEntry *de;
4198 sds pattern = c->argv[1]->ptr;
4199 int plen = sdslen(pattern);
4200 unsigned long numkeys = 0;
4201 robj *lenobj = createObject(REDIS_STRING,NULL);
4202
4203 di = dictGetIterator(c->db->dict);
4204 addReply(c,lenobj);
4205 decrRefCount(lenobj);
4206 while((de = dictNext(di)) != NULL) {
4207 robj *keyobj = dictGetEntryKey(de);
4208
4209 sds key = keyobj->ptr;
4210 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4211 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4212 if (expireIfNeeded(c->db,keyobj) == 0) {
4213 addReplyBulk(c,keyobj);
4214 numkeys++;
4215 }
4216 }
4217 }
4218 dictReleaseIterator(di);
4219 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4220 }
4221
4222 static void dbsizeCommand(redisClient *c) {
4223 addReplySds(c,
4224 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4225 }
4226
4227 static void lastsaveCommand(redisClient *c) {
4228 addReplySds(c,
4229 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4230 }
4231
4232 static void typeCommand(redisClient *c) {
4233 robj *o;
4234 char *type;
4235
4236 o = lookupKeyRead(c->db,c->argv[1]);
4237 if (o == NULL) {
4238 type = "+none";
4239 } else {
4240 switch(o->type) {
4241 case REDIS_STRING: type = "+string"; break;
4242 case REDIS_LIST: type = "+list"; break;
4243 case REDIS_SET: type = "+set"; break;
4244 case REDIS_ZSET: type = "+zset"; break;
4245 case REDIS_HASH: type = "+hash"; break;
4246 default: type = "+unknown"; break;
4247 }
4248 }
4249 addReplySds(c,sdsnew(type));
4250 addReply(c,shared.crlf);
4251 }
4252
4253 static void saveCommand(redisClient *c) {
4254 if (server.bgsavechildpid != -1) {
4255 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4256 return;
4257 }
4258 if (rdbSave(server.dbfilename) == REDIS_OK) {
4259 addReply(c,shared.ok);
4260 } else {
4261 addReply(c,shared.err);
4262 }
4263 }
4264
4265 static void bgsaveCommand(redisClient *c) {
4266 if (server.bgsavechildpid != -1) {
4267 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4268 return;
4269 }
4270 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4271 char *status = "+Background saving started\r\n";
4272 addReplySds(c,sdsnew(status));
4273 } else {
4274 addReply(c,shared.err);
4275 }
4276 }
4277
4278 static void shutdownCommand(redisClient *c) {
4279 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4280 /* Kill the saving child if there is a background saving in progress.
4281 We want to avoid race conditions, for instance our saving child may
4282 overwrite the synchronous saving did by SHUTDOWN. */
4283 if (server.bgsavechildpid != -1) {
4284 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4285 kill(server.bgsavechildpid,SIGKILL);
4286 rdbRemoveTempFile(server.bgsavechildpid);
4287 }
4288 if (server.appendonly) {
4289 /* Append only file: fsync() the AOF and exit */
4290 fsync(server.appendfd);
4291 if (server.vm_enabled) unlink(server.vm_swap_file);
4292 exit(0);
4293 } else {
4294 /* Snapshotting. Perform a SYNC SAVE and exit */
4295 if (rdbSave(server.dbfilename) == REDIS_OK) {
4296 if (server.daemonize)
4297 unlink(server.pidfile);
4298 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4299 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4300 if (server.vm_enabled) unlink(server.vm_swap_file);
4301 exit(0);
4302 } else {
4303 /* Ooops.. error saving! The best we can do is to continue
4304 * operating. Note that if there was a background saving process,
4305 * in the next cron() Redis will be notified that the background
4306 * saving aborted, handling special stuff like slaves pending for
4307 * synchronization... */
4308 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4309 addReplySds(c,
4310 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4311 }
4312 }
4313 }
4314
4315 static void renameGenericCommand(redisClient *c, int nx) {
4316 robj *o;
4317
4318 /* To use the same key as src and dst is probably an error */
4319 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4320 addReply(c,shared.sameobjecterr);
4321 return;
4322 }
4323
4324 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4325 return;
4326
4327 incrRefCount(o);
4328 deleteIfVolatile(c->db,c->argv[2]);
4329 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4330 if (nx) {
4331 decrRefCount(o);
4332 addReply(c,shared.czero);
4333 return;
4334 }
4335 dictReplace(c->db->dict,c->argv[2],o);
4336 } else {
4337 incrRefCount(c->argv[2]);
4338 }
4339 deleteKey(c->db,c->argv[1]);
4340 server.dirty++;
4341 addReply(c,nx ? shared.cone : shared.ok);
4342 }
4343
4344 static void renameCommand(redisClient *c) {
4345 renameGenericCommand(c,0);
4346 }
4347
4348 static void renamenxCommand(redisClient *c) {
4349 renameGenericCommand(c,1);
4350 }
4351
4352 static void moveCommand(redisClient *c) {
4353 robj *o;
4354 redisDb *src, *dst;
4355 int srcid;
4356
4357 /* Obtain source and target DB pointers */
4358 src = c->db;
4359 srcid = c->db->id;
4360 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4361 addReply(c,shared.outofrangeerr);
4362 return;
4363 }
4364 dst = c->db;
4365 selectDb(c,srcid); /* Back to the source DB */
4366
4367 /* If the user is moving using as target the same
4368 * DB as the source DB it is probably an error. */
4369 if (src == dst) {
4370 addReply(c,shared.sameobjecterr);
4371 return;
4372 }
4373
4374 /* Check if the element exists and get a reference */
4375 o = lookupKeyWrite(c->db,c->argv[1]);
4376 if (!o) {
4377 addReply(c,shared.czero);
4378 return;
4379 }
4380
4381 /* Try to add the element to the target DB */
4382 deleteIfVolatile(dst,c->argv[1]);
4383 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4384 addReply(c,shared.czero);
4385 return;
4386 }
4387 incrRefCount(c->argv[1]);
4388 incrRefCount(o);
4389
4390 /* OK! key moved, free the entry in the source DB */
4391 deleteKey(src,c->argv[1]);
4392 server.dirty++;
4393 addReply(c,shared.cone);
4394 }
4395
4396 /* =================================== Lists ================================ */
4397 static void pushGenericCommand(redisClient *c, int where) {
4398 robj *lobj;
4399 list *list;
4400
4401 lobj = lookupKeyWrite(c->db,c->argv[1]);
4402 if (lobj == NULL) {
4403 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4404 addReply(c,shared.cone);
4405 return;
4406 }
4407 lobj = createListObject();
4408 list = lobj->ptr;
4409 if (where == REDIS_HEAD) {
4410 listAddNodeHead(list,c->argv[2]);
4411 } else {
4412 listAddNodeTail(list,c->argv[2]);
4413 }
4414 dictAdd(c->db->dict,c->argv[1],lobj);
4415 incrRefCount(c->argv[1]);
4416 incrRefCount(c->argv[2]);
4417 } else {
4418 if (lobj->type != REDIS_LIST) {
4419 addReply(c,shared.wrongtypeerr);
4420 return;
4421 }
4422 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4423 addReply(c,shared.cone);
4424 return;
4425 }
4426 list = lobj->ptr;
4427 if (where == REDIS_HEAD) {
4428 listAddNodeHead(list,c->argv[2]);
4429 } else {
4430 listAddNodeTail(list,c->argv[2]);
4431 }
4432 incrRefCount(c->argv[2]);
4433 }
4434 server.dirty++;
4435 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4436 }
4437
4438 static void lpushCommand(redisClient *c) {
4439 pushGenericCommand(c,REDIS_HEAD);
4440 }
4441
4442 static void rpushCommand(redisClient *c) {
4443 pushGenericCommand(c,REDIS_TAIL);
4444 }
4445
4446 static void llenCommand(redisClient *c) {
4447 robj *o;
4448 list *l;
4449
4450 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4451 checkType(c,o,REDIS_LIST)) return;
4452
4453 l = o->ptr;
4454 addReplyUlong(c,listLength(l));
4455 }
4456
4457 static void lindexCommand(redisClient *c) {
4458 robj *o;
4459 int index = atoi(c->argv[2]->ptr);
4460 list *list;
4461 listNode *ln;
4462
4463 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4464 checkType(c,o,REDIS_LIST)) return;
4465 list = o->ptr;
4466
4467 ln = listIndex(list, index);
4468 if (ln == NULL) {
4469 addReply(c,shared.nullbulk);
4470 } else {
4471 robj *ele = listNodeValue(ln);
4472 addReplyBulk(c,ele);
4473 }
4474 }
4475
4476 static void lsetCommand(redisClient *c) {
4477 robj *o;
4478 int index = atoi(c->argv[2]->ptr);
4479 list *list;
4480 listNode *ln;
4481
4482 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4483 checkType(c,o,REDIS_LIST)) return;
4484 list = o->ptr;
4485
4486 ln = listIndex(list, index);
4487 if (ln == NULL) {
4488 addReply(c,shared.outofrangeerr);
4489 } else {
4490 robj *ele = listNodeValue(ln);
4491
4492 decrRefCount(ele);
4493 listNodeValue(ln) = c->argv[3];
4494 incrRefCount(c->argv[3]);
4495 addReply(c,shared.ok);
4496 server.dirty++;
4497 }
4498 }
4499
4500 static void popGenericCommand(redisClient *c, int where) {
4501 robj *o;
4502 list *list;
4503 listNode *ln;
4504
4505 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4506 checkType(c,o,REDIS_LIST)) return;
4507 list = o->ptr;
4508
4509 if (where == REDIS_HEAD)
4510 ln = listFirst(list);
4511 else
4512 ln = listLast(list);
4513
4514 if (ln == NULL) {
4515 addReply(c,shared.nullbulk);
4516 } else {
4517 robj *ele = listNodeValue(ln);
4518 addReplyBulk(c,ele);
4519 listDelNode(list,ln);
4520 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4521 server.dirty++;
4522 }
4523 }
4524
4525 static void lpopCommand(redisClient *c) {
4526 popGenericCommand(c,REDIS_HEAD);
4527 }
4528
4529 static void rpopCommand(redisClient *c) {
4530 popGenericCommand(c,REDIS_TAIL);
4531 }
4532
4533 static void lrangeCommand(redisClient *c) {
4534 robj *o;
4535 int start = atoi(c->argv[2]->ptr);
4536 int end = atoi(c->argv[3]->ptr);
4537 int llen;
4538 int rangelen, j;
4539 list *list;
4540 listNode *ln;
4541 robj *ele;
4542
4543 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4544 || checkType(c,o,REDIS_LIST)) return;
4545 list = o->ptr;
4546 llen = listLength(list);
4547
4548 /* convert negative indexes */
4549 if (start < 0) start = llen+start;
4550 if (end < 0) end = llen+end;
4551 if (start < 0) start = 0;
4552 if (end < 0) end = 0;
4553
4554 /* indexes sanity checks */
4555 if (start > end || start >= llen) {
4556 /* Out of range start or start > end result in empty list */
4557 addReply(c,shared.emptymultibulk);
4558 return;
4559 }
4560 if (end >= llen) end = llen-1;
4561 rangelen = (end-start)+1;
4562
4563 /* Return the result in form of a multi-bulk reply */
4564 ln = listIndex(list, start);
4565 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4566 for (j = 0; j < rangelen; j++) {
4567 ele = listNodeValue(ln);
4568 addReplyBulk(c,ele);
4569 ln = ln->next;
4570 }
4571 }
4572
4573 static void ltrimCommand(redisClient *c) {
4574 robj *o;
4575 int start = atoi(c->argv[2]->ptr);
4576 int end = atoi(c->argv[3]->ptr);
4577 int llen;
4578 int j, ltrim, rtrim;
4579 list *list;
4580 listNode *ln;
4581
4582 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4583 checkType(c,o,REDIS_LIST)) return;
4584 list = o->ptr;
4585 llen = listLength(list);
4586
4587 /* convert negative indexes */
4588 if (start < 0) start = llen+start;
4589 if (end < 0) end = llen+end;
4590 if (start < 0) start = 0;
4591 if (end < 0) end = 0;
4592
4593 /* indexes sanity checks */
4594 if (start > end || start >= llen) {
4595 /* Out of range start or start > end result in empty list */
4596 ltrim = llen;
4597 rtrim = 0;
4598 } else {
4599 if (end >= llen) end = llen-1;
4600 ltrim = start;
4601 rtrim = llen-end-1;
4602 }
4603
4604 /* Remove list elements to perform the trim */
4605 for (j = 0; j < ltrim; j++) {
4606 ln = listFirst(list);
4607 listDelNode(list,ln);
4608 }
4609 for (j = 0; j < rtrim; j++) {
4610 ln = listLast(list);
4611 listDelNode(list,ln);
4612 }
4613 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4614 server.dirty++;
4615 addReply(c,shared.ok);
4616 }
4617
4618 static void lremCommand(redisClient *c) {
4619 robj *o;
4620 list *list;
4621 listNode *ln, *next;
4622 int toremove = atoi(c->argv[2]->ptr);
4623 int removed = 0;
4624 int fromtail = 0;
4625
4626 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4627 checkType(c,o,REDIS_LIST)) return;
4628 list = o->ptr;
4629
4630 if (toremove < 0) {
4631 toremove = -toremove;
4632 fromtail = 1;
4633 }
4634 ln = fromtail ? list->tail : list->head;
4635 while (ln) {
4636 robj *ele = listNodeValue(ln);
4637
4638 next = fromtail ? ln->prev : ln->next;
4639 if (compareStringObjects(ele,c->argv[3]) == 0) {
4640 listDelNode(list,ln);
4641 server.dirty++;
4642 removed++;
4643 if (toremove && removed == toremove) break;
4644 }
4645 ln = next;
4646 }
4647 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4648 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4649 }
4650
4651 /* This is the semantic of this command:
4652 * RPOPLPUSH srclist dstlist:
4653 * IF LLEN(srclist) > 0
4654 * element = RPOP srclist
4655 * LPUSH dstlist element
4656 * RETURN element
4657 * ELSE
4658 * RETURN nil
4659 * END
4660 * END
4661 *
4662 * The idea is to be able to get an element from a list in a reliable way
4663 * since the element is not just returned but pushed against another list
4664 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4665 */
4666 static void rpoplpushcommand(redisClient *c) {
4667 robj *sobj;
4668 list *srclist;
4669 listNode *ln;
4670
4671 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4672 checkType(c,sobj,REDIS_LIST)) return;
4673 srclist = sobj->ptr;
4674 ln = listLast(srclist);
4675
4676 if (ln == NULL) {
4677 addReply(c,shared.nullbulk);
4678 } else {
4679 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4680 robj *ele = listNodeValue(ln);
4681 list *dstlist;
4682
4683 if (dobj && dobj->type != REDIS_LIST) {
4684 addReply(c,shared.wrongtypeerr);
4685 return;
4686 }
4687
4688 /* Add the element to the target list (unless it's directly
4689 * passed to some BLPOP-ing client */
4690 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4691 if (dobj == NULL) {
4692 /* Create the list if the key does not exist */
4693 dobj = createListObject();
4694 dictAdd(c->db->dict,c->argv[2],dobj);
4695 incrRefCount(c->argv[2]);
4696 }
4697 dstlist = dobj->ptr;
4698 listAddNodeHead(dstlist,ele);
4699 incrRefCount(ele);
4700 }
4701
4702 /* Send the element to the client as reply as well */
4703 addReplyBulk(c,ele);
4704
4705 /* Finally remove the element from the source list */
4706 listDelNode(srclist,ln);
4707 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4708 server.dirty++;
4709 }
4710 }
4711
4712 /* ==================================== Sets ================================ */
4713
4714 static void saddCommand(redisClient *c) {
4715 robj *set;
4716
4717 set = lookupKeyWrite(c->db,c->argv[1]);
4718 if (set == NULL) {
4719 set = createSetObject();
4720 dictAdd(c->db->dict,c->argv[1],set);
4721 incrRefCount(c->argv[1]);
4722 } else {
4723 if (set->type != REDIS_SET) {
4724 addReply(c,shared.wrongtypeerr);
4725 return;
4726 }
4727 }
4728 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4729 incrRefCount(c->argv[2]);
4730 server.dirty++;
4731 addReply(c,shared.cone);
4732 } else {
4733 addReply(c,shared.czero);
4734 }
4735 }
4736
4737 static void sremCommand(redisClient *c) {
4738 robj *set;
4739
4740 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4741 checkType(c,set,REDIS_SET)) return;
4742
4743 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4744 server.dirty++;
4745 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4746 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4747 addReply(c,shared.cone);
4748 } else {
4749 addReply(c,shared.czero);
4750 }
4751 }
4752
4753 static void smoveCommand(redisClient *c) {
4754 robj *srcset, *dstset;
4755
4756 srcset = lookupKeyWrite(c->db,c->argv[1]);
4757 dstset = lookupKeyWrite(c->db,c->argv[2]);
4758
4759 /* If the source key does not exist return 0, if it's of the wrong type
4760 * raise an error */
4761 if (srcset == NULL || srcset->type != REDIS_SET) {
4762 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4763 return;
4764 }
4765 /* Error if the destination key is not a set as well */
4766 if (dstset && dstset->type != REDIS_SET) {
4767 addReply(c,shared.wrongtypeerr);
4768 return;
4769 }
4770 /* Remove the element from the source set */
4771 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4772 /* Key not found in the src set! return zero */
4773 addReply(c,shared.czero);
4774 return;
4775 }
4776 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4777 deleteKey(c->db,c->argv[1]);
4778 server.dirty++;
4779 /* Add the element to the destination set */
4780 if (!dstset) {
4781 dstset = createSetObject();
4782 dictAdd(c->db->dict,c->argv[2],dstset);
4783 incrRefCount(c->argv[2]);
4784 }
4785 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4786 incrRefCount(c->argv[3]);
4787 addReply(c,shared.cone);
4788 }
4789
4790 static void sismemberCommand(redisClient *c) {
4791 robj *set;
4792
4793 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4794 checkType(c,set,REDIS_SET)) return;
4795
4796 if (dictFind(set->ptr,c->argv[2]))
4797 addReply(c,shared.cone);
4798 else
4799 addReply(c,shared.czero);
4800 }
4801
4802 static void scardCommand(redisClient *c) {
4803 robj *o;
4804 dict *s;
4805
4806 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4807 checkType(c,o,REDIS_SET)) return;
4808
4809 s = o->ptr;
4810 addReplyUlong(c,dictSize(s));
4811 }
4812
4813 static void spopCommand(redisClient *c) {
4814 robj *set;
4815 dictEntry *de;
4816
4817 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4818 checkType(c,set,REDIS_SET)) return;
4819
4820 de = dictGetRandomKey(set->ptr);
4821 if (de == NULL) {
4822 addReply(c,shared.nullbulk);
4823 } else {
4824 robj *ele = dictGetEntryKey(de);
4825
4826 addReplyBulk(c,ele);
4827 dictDelete(set->ptr,ele);
4828 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4829 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4830 server.dirty++;
4831 }
4832 }
4833
4834 static void srandmemberCommand(redisClient *c) {
4835 robj *set;
4836 dictEntry *de;
4837
4838 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4839 checkType(c,set,REDIS_SET)) return;
4840
4841 de = dictGetRandomKey(set->ptr);
4842 if (de == NULL) {
4843 addReply(c,shared.nullbulk);
4844 } else {
4845 robj *ele = dictGetEntryKey(de);
4846
4847 addReplyBulk(c,ele);
4848 }
4849 }
4850
4851 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4852 dict **d1 = (void*) s1, **d2 = (void*) s2;
4853
4854 return dictSize(*d1)-dictSize(*d2);
4855 }
4856
4857 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4858 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4859 dictIterator *di;
4860 dictEntry *de;
4861 robj *lenobj = NULL, *dstset = NULL;
4862 unsigned long j, cardinality = 0;
4863
4864 for (j = 0; j < setsnum; j++) {
4865 robj *setobj;
4866
4867 setobj = dstkey ?
4868 lookupKeyWrite(c->db,setskeys[j]) :
4869 lookupKeyRead(c->db,setskeys[j]);
4870 if (!setobj) {
4871 zfree(dv);
4872 if (dstkey) {
4873 if (deleteKey(c->db,dstkey))
4874 server.dirty++;
4875 addReply(c,shared.czero);
4876 } else {
4877 addReply(c,shared.emptymultibulk);
4878 }
4879 return;
4880 }
4881 if (setobj->type != REDIS_SET) {
4882 zfree(dv);
4883 addReply(c,shared.wrongtypeerr);
4884 return;
4885 }
4886 dv[j] = setobj->ptr;
4887 }
4888 /* Sort sets from the smallest to largest, this will improve our
4889 * algorithm's performace */
4890 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4891
4892 /* The first thing we should output is the total number of elements...
4893 * since this is a multi-bulk write, but at this stage we don't know
4894 * the intersection set size, so we use a trick, append an empty object
4895 * to the output list and save the pointer to later modify it with the
4896 * right length */
4897 if (!dstkey) {
4898 lenobj = createObject(REDIS_STRING,NULL);
4899 addReply(c,lenobj);
4900 decrRefCount(lenobj);
4901 } else {
4902 /* If we have a target key where to store the resulting set
4903 * create this key with an empty set inside */
4904 dstset = createSetObject();
4905 }
4906
4907 /* Iterate all the elements of the first (smallest) set, and test
4908 * the element against all the other sets, if at least one set does
4909 * not include the element it is discarded */
4910 di = dictGetIterator(dv[0]);
4911
4912 while((de = dictNext(di)) != NULL) {
4913 robj *ele;
4914
4915 for (j = 1; j < setsnum; j++)
4916 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4917 if (j != setsnum)
4918 continue; /* at least one set does not contain the member */
4919 ele = dictGetEntryKey(de);
4920 if (!dstkey) {
4921 addReplyBulk(c,ele);
4922 cardinality++;
4923 } else {
4924 dictAdd(dstset->ptr,ele,NULL);
4925 incrRefCount(ele);
4926 }
4927 }
4928 dictReleaseIterator(di);
4929
4930 if (dstkey) {
4931 /* Store the resulting set into the target, if the intersection
4932 * is not an empty set. */
4933 deleteKey(c->db,dstkey);
4934 if (dictSize((dict*)dstset->ptr) > 0) {
4935 dictAdd(c->db->dict,dstkey,dstset);
4936 incrRefCount(dstkey);
4937 addReplyLong(c,dictSize((dict*)dstset->ptr));
4938 } else {
4939 decrRefCount(dstset);
4940 addReply(c,shared.czero);
4941 }
4942 server.dirty++;
4943 } else {
4944 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4945 }
4946 zfree(dv);
4947 }
4948
4949 static void sinterCommand(redisClient *c) {
4950 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4951 }
4952
4953 static void sinterstoreCommand(redisClient *c) {
4954 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4955 }
4956
4957 #define REDIS_OP_UNION 0
4958 #define REDIS_OP_DIFF 1
4959 #define REDIS_OP_INTER 2
4960
4961 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4962 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4963 dictIterator *di;
4964 dictEntry *de;
4965 robj *dstset = NULL;
4966 int j, cardinality = 0;
4967
4968 for (j = 0; j < setsnum; j++) {
4969 robj *setobj;
4970
4971 setobj = dstkey ?
4972 lookupKeyWrite(c->db,setskeys[j]) :
4973 lookupKeyRead(c->db,setskeys[j]);
4974 if (!setobj) {
4975 dv[j] = NULL;
4976 continue;
4977 }
4978 if (setobj->type != REDIS_SET) {
4979 zfree(dv);
4980 addReply(c,shared.wrongtypeerr);
4981 return;
4982 }
4983 dv[j] = setobj->ptr;
4984 }
4985
4986 /* We need a temp set object to store our union. If the dstkey
4987 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4988 * this set object will be the resulting object to set into the target key*/
4989 dstset = createSetObject();
4990
4991 /* Iterate all the elements of all the sets, add every element a single
4992 * time to the result set */
4993 for (j = 0; j < setsnum; j++) {
4994 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4995 if (!dv[j]) continue; /* non existing keys are like empty sets */
4996
4997 di = dictGetIterator(dv[j]);
4998
4999 while((de = dictNext(di)) != NULL) {
5000 robj *ele;
5001
5002 /* dictAdd will not add the same element multiple times */
5003 ele = dictGetEntryKey(de);
5004 if (op == REDIS_OP_UNION || j == 0) {
5005 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5006 incrRefCount(ele);
5007 cardinality++;
5008 }
5009 } else if (op == REDIS_OP_DIFF) {
5010 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5011 cardinality--;
5012 }
5013 }
5014 }
5015 dictReleaseIterator(di);
5016
5017 /* result set is empty? Exit asap. */
5018 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5019 }
5020
5021 /* Output the content of the resulting set, if not in STORE mode */
5022 if (!dstkey) {
5023 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5024 di = dictGetIterator(dstset->ptr);
5025 while((de = dictNext(di)) != NULL) {
5026 robj *ele;
5027
5028 ele = dictGetEntryKey(de);
5029 addReplyBulk(c,ele);
5030 }
5031 dictReleaseIterator(di);
5032 decrRefCount(dstset);
5033 } else {
5034 /* If we have a target key where to store the resulting set
5035 * create this key with the result set inside */
5036 deleteKey(c->db,dstkey);
5037 if (dictSize((dict*)dstset->ptr) > 0) {
5038 dictAdd(c->db->dict,dstkey,dstset);
5039 incrRefCount(dstkey);
5040 addReplyLong(c,dictSize((dict*)dstset->ptr));
5041 } else {
5042 decrRefCount(dstset);
5043 addReply(c,shared.czero);
5044 }
5045 server.dirty++;
5046 }
5047 zfree(dv);
5048 }
5049
5050 static void sunionCommand(redisClient *c) {
5051 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5052 }
5053
5054 static void sunionstoreCommand(redisClient *c) {
5055 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5056 }
5057
5058 static void sdiffCommand(redisClient *c) {
5059 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5060 }
5061
5062 static void sdiffstoreCommand(redisClient *c) {
5063 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5064 }
5065
5066 /* ==================================== ZSets =============================== */
5067
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5075
5076 /* This skiplist implementation is almost a C translation of the original
5077 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5078 * Alternative to Balanced Trees", modified in three ways:
5079 * a) this implementation allows for repeated values.
5080 * b) the comparison is not just by key (our 'score') but by satellite data.
5081 * c) there is a back pointer, so it's a doubly linked list with the back
5082 * pointers being only at "level 1". This allows to traverse the list
5083 * from tail to head, useful for ZREVRANGE. */
5084
5085 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5086 zskiplistNode *zn = zmalloc(sizeof(*zn));
5087
5088 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5089 if (level > 0)
5090 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5091 zn->score = score;
5092 zn->obj = obj;
5093 return zn;
5094 }
5095
5096 static zskiplist *zslCreate(void) {
5097 int j;
5098 zskiplist *zsl;
5099
5100 zsl = zmalloc(sizeof(*zsl));
5101 zsl->level = 1;
5102 zsl->length = 0;
5103 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5104 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5105 zsl->header->forward[j] = NULL;
5106
5107 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5108 if (j < ZSKIPLIST_MAXLEVEL-1)
5109 zsl->header->span[j] = 0;
5110 }
5111 zsl->header->backward = NULL;
5112 zsl->tail = NULL;
5113 return zsl;
5114 }
5115
5116 static void zslFreeNode(zskiplistNode *node) {
5117 decrRefCount(node->obj);
5118 zfree(node->forward);
5119 zfree(node->span);
5120 zfree(node);
5121 }
5122
5123 static void zslFree(zskiplist *zsl) {
5124 zskiplistNode *node = zsl->header->forward[0], *next;
5125
5126 zfree(zsl->header->forward);
5127 zfree(zsl->header->span);
5128 zfree(zsl->header);
5129 while(node) {
5130 next = node->forward[0];
5131 zslFreeNode(node);
5132 node = next;
5133 }
5134 zfree(zsl);
5135 }
5136
5137 static int zslRandomLevel(void) {
5138 int level = 1;
5139 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5140 level += 1;
5141 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5142 }
5143
5144 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5145 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5146 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5147 int i, level;
5148
5149 x = zsl->header;
5150 for (i = zsl->level-1; i >= 0; i--) {
5151 /* store rank that is crossed to reach the insert position */
5152 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5153
5154 while (x->forward[i] &&
5155 (x->forward[i]->score < score ||
5156 (x->forward[i]->score == score &&
5157 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5158 rank[i] += i > 0 ? x->span[i-1] : 1;
5159 x = x->forward[i];
5160 }
5161 update[i] = x;
5162 }
5163 /* we assume the key is not already inside, since we allow duplicated
5164 * scores, and the re-insertion of score and redis object should never
5165 * happpen since the caller of zslInsert() should test in the hash table
5166 * if the element is already inside or not. */
5167 level = zslRandomLevel();
5168 if (level > zsl->level) {
5169 for (i = zsl->level; i < level; i++) {
5170 rank[i] = 0;
5171 update[i] = zsl->header;
5172 update[i]->span[i-1] = zsl->length;
5173 }
5174 zsl->level = level;
5175 }
5176 x = zslCreateNode(level,score,obj);
5177 for (i = 0; i < level; i++) {
5178 x->forward[i] = update[i]->forward[i];
5179 update[i]->forward[i] = x;
5180
5181 /* update span covered by update[i] as x is inserted here */
5182 if (i > 0) {
5183 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5184 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5185 }
5186 }
5187
5188 /* increment span for untouched levels */
5189 for (i = level; i < zsl->level; i++) {
5190 update[i]->span[i-1]++;
5191 }
5192
5193 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5194 if (x->forward[0])
5195 x->forward[0]->backward = x;
5196 else
5197 zsl->tail = x;
5198 zsl->length++;
5199 }
5200
5201 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5202 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5203 int i;
5204 for (i = 0; i < zsl->level; i++) {
5205 if (update[i]->forward[i] == x) {
5206 if (i > 0) {
5207 update[i]->span[i-1] += x->span[i-1] - 1;
5208 }
5209 update[i]->forward[i] = x->forward[i];
5210 } else {
5211 /* invariant: i > 0, because update[0]->forward[0]
5212 * is always equal to x */
5213 update[i]->span[i-1] -= 1;
5214 }
5215 }
5216 if (x->forward[0]) {
5217 x->forward[0]->backward = x->backward;
5218 } else {
5219 zsl->tail = x->backward;
5220 }
5221 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5222 zsl->level--;
5223 zsl->length--;
5224 }
5225
5226 /* Delete an element with matching score/object from the skiplist. */
5227 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5228 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5229 int i;
5230
5231 x = zsl->header;
5232 for (i = zsl->level-1; i >= 0; i--) {
5233 while (x->forward[i] &&
5234 (x->forward[i]->score < score ||
5235 (x->forward[i]->score == score &&
5236 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5237 x = x->forward[i];
5238 update[i] = x;
5239 }
5240 /* We may have multiple elements with the same score, what we need
5241 * is to find the element with both the right score and object. */
5242 x = x->forward[0];
5243 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5244 zslDeleteNode(zsl, x, update);
5245 zslFreeNode(x);
5246 return 1;
5247 } else {
5248 return 0; /* not found */
5249 }
5250 return 0; /* not found */
5251 }
5252
5253 /* Delete all the elements with score between min and max from the skiplist.
5254 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5255 * Note that this function takes the reference to the hash table view of the
5256 * sorted set, in order to remove the elements from the hash table too. */
5257 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5258 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5259 unsigned long removed = 0;
5260 int i;
5261
5262 x = zsl->header;
5263 for (i = zsl->level-1; i >= 0; i--) {
5264 while (x->forward[i] && x->forward[i]->score < min)
5265 x = x->forward[i];
5266 update[i] = x;
5267 }
5268 /* We may have multiple elements with the same score, what we need
5269 * is to find the element with both the right score and object. */
5270 x = x->forward[0];
5271 while (x && x->score <= max) {
5272 zskiplistNode *next = x->forward[0];
5273 zslDeleteNode(zsl, x, update);
5274 dictDelete(dict,x->obj);
5275 zslFreeNode(x);
5276 removed++;
5277 x = next;
5278 }
5279 return removed; /* not found */
5280 }
5281
5282 /* Delete all the elements with rank between start and end from the skiplist.
5283 * Start and end are inclusive. Note that start and end need to be 1-based */
5284 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5285 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5286 unsigned long traversed = 0, removed = 0;
5287 int i;
5288
5289 x = zsl->header;
5290 for (i = zsl->level-1; i >= 0; i--) {
5291 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5292 traversed += i > 0 ? x->span[i-1] : 1;
5293 x = x->forward[i];
5294 }
5295 update[i] = x;
5296 }
5297
5298 traversed++;
5299 x = x->forward[0];
5300 while (x && traversed <= end) {
5301 zskiplistNode *next = x->forward[0];
5302 zslDeleteNode(zsl, x, update);
5303 dictDelete(dict,x->obj);
5304 zslFreeNode(x);
5305 removed++;
5306 traversed++;
5307 x = next;
5308 }
5309 return removed;
5310 }
5311
5312 /* Find the first node having a score equal or greater than the specified one.
5313 * Returns NULL if there is no match. */
5314 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5315 zskiplistNode *x;
5316 int i;
5317
5318 x = zsl->header;
5319 for (i = zsl->level-1; i >= 0; i--) {
5320 while (x->forward[i] && x->forward[i]->score < score)
5321 x = x->forward[i];
5322 }
5323 /* We may have multiple elements with the same score, what we need
5324 * is to find the element with both the right score and object. */
5325 return x->forward[0];
5326 }
5327
5328 /* Find the rank for an element by both score and key.
5329 * Returns 0 when the element cannot be found, rank otherwise.
5330 * Note that the rank is 1-based due to the span of zsl->header to the
5331 * first element. */
5332 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5333 zskiplistNode *x;
5334 unsigned long rank = 0;
5335 int i;
5336
5337 x = zsl->header;
5338 for (i = zsl->level-1; i >= 0; i--) {
5339 while (x->forward[i] &&
5340 (x->forward[i]->score < score ||
5341 (x->forward[i]->score == score &&
5342 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5343 rank += i > 0 ? x->span[i-1] : 1;
5344 x = x->forward[i];
5345 }
5346
5347 /* x might be equal to zsl->header, so test if obj is non-NULL */
5348 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5349 return rank;
5350 }
5351 }
5352 return 0;
5353 }
5354
5355 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5356 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5357 zskiplistNode *x;
5358 unsigned long traversed = 0;
5359 int i;
5360
5361 x = zsl->header;
5362 for (i = zsl->level-1; i >= 0; i--) {
5363 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5364 {
5365 traversed += i > 0 ? x->span[i-1] : 1;
5366 x = x->forward[i];
5367 }
5368 if (traversed == rank) {
5369 return x;
5370 }
5371 }
5372 return NULL;
5373 }
5374
5375 /* The actual Z-commands implementations */
5376
5377 /* This generic command implements both ZADD and ZINCRBY.
5378 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5379 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5380 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5381 robj *zsetobj;
5382 zset *zs;
5383 double *score;
5384
5385 zsetobj = lookupKeyWrite(c->db,key);
5386 if (zsetobj == NULL) {
5387 zsetobj = createZsetObject();
5388 dictAdd(c->db->dict,key,zsetobj);
5389 incrRefCount(key);
5390 } else {
5391 if (zsetobj->type != REDIS_ZSET) {
5392 addReply(c,shared.wrongtypeerr);
5393 return;
5394 }
5395 }
5396 zs = zsetobj->ptr;
5397
5398 /* Ok now since we implement both ZADD and ZINCRBY here the code
5399 * needs to handle the two different conditions. It's all about setting
5400 * '*score', that is, the new score to set, to the right value. */
5401 score = zmalloc(sizeof(double));
5402 if (doincrement) {
5403 dictEntry *de;
5404
5405 /* Read the old score. If the element was not present starts from 0 */
5406 de = dictFind(zs->dict,ele);
5407 if (de) {
5408 double *oldscore = dictGetEntryVal(de);
5409 *score = *oldscore + scoreval;
5410 } else {
5411 *score = scoreval;
5412 }
5413 } else {
5414 *score = scoreval;
5415 }
5416
5417 /* What follows is a simple remove and re-insert operation that is common
5418 * to both ZADD and ZINCRBY... */
5419 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5420 /* case 1: New element */
5421 incrRefCount(ele); /* added to hash */
5422 zslInsert(zs->zsl,*score,ele);
5423 incrRefCount(ele); /* added to skiplist */
5424 server.dirty++;
5425 if (doincrement)
5426 addReplyDouble(c,*score);
5427 else
5428 addReply(c,shared.cone);
5429 } else {
5430 dictEntry *de;
5431 double *oldscore;
5432
5433 /* case 2: Score update operation */
5434 de = dictFind(zs->dict,ele);
5435 redisAssert(de != NULL);
5436 oldscore = dictGetEntryVal(de);
5437 if (*score != *oldscore) {
5438 int deleted;
5439
5440 /* Remove and insert the element in the skip list with new score */
5441 deleted = zslDelete(zs->zsl,*oldscore,ele);
5442 redisAssert(deleted != 0);
5443 zslInsert(zs->zsl,*score,ele);
5444 incrRefCount(ele);
5445 /* Update the score in the hash table */
5446 dictReplace(zs->dict,ele,score);
5447 server.dirty++;
5448 } else {
5449 zfree(score);
5450 }
5451 if (doincrement)
5452 addReplyDouble(c,*score);
5453 else
5454 addReply(c,shared.czero);
5455 }
5456 }
5457
5458 static void zaddCommand(redisClient *c) {
5459 double scoreval;
5460
5461 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5462
5463 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5464 }
5465
5466 static void zincrbyCommand(redisClient *c) {
5467 double scoreval;
5468
5469 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5470
5471 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5472 }
5473
5474 static void zremCommand(redisClient *c) {
5475 robj *zsetobj;
5476 zset *zs;
5477 dictEntry *de;
5478 double *oldscore;
5479 int deleted;
5480
5481 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5482 checkType(c,zsetobj,REDIS_ZSET)) return;
5483
5484 zs = zsetobj->ptr;
5485 de = dictFind(zs->dict,c->argv[2]);
5486 if (de == NULL) {
5487 addReply(c,shared.czero);
5488 return;
5489 }
5490 /* Delete from the skiplist */
5491 oldscore = dictGetEntryVal(de);
5492 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5493 redisAssert(deleted != 0);
5494
5495 /* Delete from the hash table */
5496 dictDelete(zs->dict,c->argv[2]);
5497 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5498 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5499 server.dirty++;
5500 addReply(c,shared.cone);
5501 }
5502
5503 static void zremrangebyscoreCommand(redisClient *c) {
5504 double min;
5505 double max;
5506 long deleted;
5507 robj *zsetobj;
5508 zset *zs;
5509
5510 if ((getDoubleFromObject(c, c->argv[2], &min) != REDIS_OK) ||
5511 (getDoubleFromObject(c, c->argv[3], &max) != REDIS_OK)) return;
5512
5513 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5514 checkType(c,zsetobj,REDIS_ZSET)) return;
5515
5516 zs = zsetobj->ptr;
5517 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5518 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5519 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5520 server.dirty += deleted;
5521 addReplyLong(c,deleted);
5522 }
5523
5524 static void zremrangebyrankCommand(redisClient *c) {
5525 long start;
5526 long end;
5527 int llen;
5528 long deleted;
5529 robj *zsetobj;
5530 zset *zs;
5531
5532 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5533 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5534
5535 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5536 checkType(c,zsetobj,REDIS_ZSET)) return;
5537 zs = zsetobj->ptr;
5538 llen = zs->zsl->length;
5539
5540 /* convert negative indexes */
5541 if (start < 0) start = llen+start;
5542 if (end < 0) end = llen+end;
5543 if (start < 0) start = 0;
5544 if (end < 0) end = 0;
5545
5546 /* indexes sanity checks */
5547 if (start > end || start >= llen) {
5548 addReply(c,shared.czero);
5549 return;
5550 }
5551 if (end >= llen) end = llen-1;
5552
5553 /* increment start and end because zsl*Rank functions
5554 * use 1-based rank */
5555 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5556 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5557 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5558 server.dirty += deleted;
5559 addReplyLong(c, deleted);
5560 }
5561
5562 typedef struct {
5563 dict *dict;
5564 double weight;
5565 } zsetopsrc;
5566
5567 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5568 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5569 unsigned long size1, size2;
5570 size1 = d1->dict ? dictSize(d1->dict) : 0;
5571 size2 = d2->dict ? dictSize(d2->dict) : 0;
5572 return size1 - size2;
5573 }
5574
/* How the scores of an element found in several inputs are combined */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': add it
 * (REDIS_AGGR_SUM), keep the smaller of the two (REDIS_AGGR_MIN) or keep
 * the larger of the two (REDIS_AGGR_MAX). Any other value of 'aggregate'
 * trips an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5591
5592 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5593 int i, j, zsetnum;
5594 int aggregate = REDIS_AGGR_SUM;
5595 zsetopsrc *src;
5596 robj *dstobj;
5597 zset *dstzset;
5598 dictIterator *di;
5599 dictEntry *de;
5600
5601 /* expect zsetnum input keys to be given */
5602 zsetnum = atoi(c->argv[2]->ptr);
5603 if (zsetnum < 1) {
5604 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5605 return;
5606 }
5607
5608 /* test if the expected number of keys would overflow */
5609 if (3+zsetnum > c->argc) {
5610 addReply(c,shared.syntaxerr);
5611 return;
5612 }
5613
5614 /* read keys to be used for input */
5615 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5616 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5617 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5618 if (!zsetobj) {
5619 src[i].dict = NULL;
5620 } else {
5621 if (zsetobj->type != REDIS_ZSET) {
5622 zfree(src);
5623 addReply(c,shared.wrongtypeerr);
5624 return;
5625 }
5626 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5627 }
5628
5629 /* default all weights to 1 */
5630 src[i].weight = 1.0;
5631 }
5632
5633 /* parse optional extra arguments */
5634 if (j < c->argc) {
5635 int remaining = c->argc - j;
5636
5637 while (remaining) {
5638 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5639 j++; remaining--;
5640 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5641 if (getDoubleFromObject(c, c->argv[j], &src[i].weight) != REDIS_OK)
5642 return;
5643 }
5644 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5645 j++; remaining--;
5646 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5647 aggregate = REDIS_AGGR_SUM;
5648 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5649 aggregate = REDIS_AGGR_MIN;
5650 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5651 aggregate = REDIS_AGGR_MAX;
5652 } else {
5653 zfree(src);
5654 addReply(c,shared.syntaxerr);
5655 return;
5656 }
5657 j++; remaining--;
5658 } else {
5659 zfree(src);
5660 addReply(c,shared.syntaxerr);
5661 return;
5662 }
5663 }
5664 }
5665
5666 /* sort sets from the smallest to largest, this will improve our
5667 * algorithm's performance */
5668 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5669
5670 dstobj = createZsetObject();
5671 dstzset = dstobj->ptr;
5672
5673 if (op == REDIS_OP_INTER) {
5674 /* skip going over all entries if the smallest zset is NULL or empty */
5675 if (src[0].dict && dictSize(src[0].dict) > 0) {
5676 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5677 * from small to large, all src[i > 0].dict are non-empty too */
5678 di = dictGetIterator(src[0].dict);
5679 while((de = dictNext(di)) != NULL) {
5680 double *score = zmalloc(sizeof(double)), value;
5681 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5682
5683 for (j = 1; j < zsetnum; j++) {
5684 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5685 if (other) {
5686 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5687 zunionInterAggregate(score, value, aggregate);
5688 } else {
5689 break;
5690 }
5691 }
5692
5693 /* skip entry when not present in every source dict */
5694 if (j != zsetnum) {
5695 zfree(score);
5696 } else {
5697 robj *o = dictGetEntryKey(de);
5698 dictAdd(dstzset->dict,o,score);
5699 incrRefCount(o); /* added to dictionary */
5700 zslInsert(dstzset->zsl,*score,o);
5701 incrRefCount(o); /* added to skiplist */
5702 }
5703 }
5704 dictReleaseIterator(di);
5705 }
5706 } else if (op == REDIS_OP_UNION) {
5707 for (i = 0; i < zsetnum; i++) {
5708 if (!src[i].dict) continue;
5709
5710 di = dictGetIterator(src[i].dict);
5711 while((de = dictNext(di)) != NULL) {
5712 /* skip key when already processed */
5713 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5714
5715 double *score = zmalloc(sizeof(double)), value;
5716 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5717
5718 /* because the zsets are sorted by size, its only possible
5719 * for sets at larger indices to hold this entry */
5720 for (j = (i+1); j < zsetnum; j++) {
5721 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5722 if (other) {
5723 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5724 zunionInterAggregate(score, value, aggregate);
5725 }
5726 }
5727
5728 robj *o = dictGetEntryKey(de);
5729 dictAdd(dstzset->dict,o,score);
5730 incrRefCount(o); /* added to dictionary */
5731 zslInsert(dstzset->zsl,*score,o);
5732 incrRefCount(o); /* added to skiplist */
5733 }
5734 dictReleaseIterator(di);
5735 }
5736 } else {
5737 /* unknown operator */
5738 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5739 }
5740
5741 deleteKey(c->db,dstkey);
5742 if (dstzset->zsl->length) {
5743 dictAdd(c->db->dict,dstkey,dstobj);
5744 incrRefCount(dstkey);
5745 addReplyLong(c, dstzset->zsl->length);
5746 server.dirty++;
5747 } else {
5748 decrRefCount(dstobj);
5749 addReply(c, shared.czero);
5750 }
5751 zfree(src);
5752 }
5753
/* Command entry point that runs zunionInterGenericCommand() in
 * REDIS_OP_UNION mode, handing it argv[1] as the key argument. */
static void zunionCommand(redisClient *c) {
    zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
}
5757
/* Command entry point that runs zunionInterGenericCommand() in
 * REDIS_OP_INTER mode, handing it argv[1] as the key argument. */
static void zinterCommand(redisClient *c) {
    zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
}
5761
5762 static void zrangeGenericCommand(redisClient *c, int reverse) {
5763 robj *o;
5764 long start;
5765 long end;
5766 int withscores = 0;
5767 int llen;
5768 int rangelen, j;
5769 zset *zsetobj;
5770 zskiplist *zsl;
5771 zskiplistNode *ln;
5772 robj *ele;
5773
5774 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5775 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5776
5777 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5778 withscores = 1;
5779 } else if (c->argc >= 5) {
5780 addReply(c,shared.syntaxerr);
5781 return;
5782 }
5783
5784 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5785 || checkType(c,o,REDIS_ZSET)) return;
5786 zsetobj = o->ptr;
5787 zsl = zsetobj->zsl;
5788 llen = zsl->length;
5789
5790 /* convert negative indexes */
5791 if (start < 0) start = llen+start;
5792 if (end < 0) end = llen+end;
5793 if (start < 0) start = 0;
5794 if (end < 0) end = 0;
5795
5796 /* indexes sanity checks */
5797 if (start > end || start >= llen) {
5798 /* Out of range start or start > end result in empty list */
5799 addReply(c,shared.emptymultibulk);
5800 return;
5801 }
5802 if (end >= llen) end = llen-1;
5803 rangelen = (end-start)+1;
5804
5805 /* check if starting point is trivial, before searching
5806 * the element in log(N) time */
5807 if (reverse) {
5808 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5809 } else {
5810 ln = start == 0 ?
5811 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5812 }
5813
5814 /* Return the result in form of a multi-bulk reply */
5815 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5816 withscores ? (rangelen*2) : rangelen));
5817 for (j = 0; j < rangelen; j++) {
5818 ele = ln->obj;
5819 addReplyBulk(c,ele);
5820 if (withscores)
5821 addReplyDouble(c,ln->score);
5822 ln = reverse ? ln->backward : ln->forward[0];
5823 }
5824 }
5825
/* Command entry point: zrangeGenericCommand() with 'reverse' set to 0. */
static void zrangeCommand(redisClient *c) {
    zrangeGenericCommand(c,0);
}
5829
/* Command entry point: zrangeGenericCommand() with 'reverse' set to 1. */
static void zrevrangeCommand(redisClient *c) {
    zrangeGenericCommand(c,1);
}
5833
5834 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5835 * If justcount is non-zero, just the count is returned. */
5836 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5837 robj *o;
5838 double min, max;
5839 int minex = 0, maxex = 0; /* are min or max exclusive? */
5840 int offset = 0, limit = -1;
5841 int withscores = 0;
5842 int badsyntax = 0;
5843
5844 /* Parse the min-max interval. If one of the values is prefixed
5845 * by the "(" character, it's considered "open". For instance
5846 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5847 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5848 if (((char*)c->argv[2]->ptr)[0] == '(') {
5849 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5850 minex = 1;
5851 } else {
5852 min = strtod(c->argv[2]->ptr,NULL);
5853 }
5854 if (((char*)c->argv[3]->ptr)[0] == '(') {
5855 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5856 maxex = 1;
5857 } else {
5858 max = strtod(c->argv[3]->ptr,NULL);
5859 }
5860
5861 /* Parse "WITHSCORES": note that if the command was called with
5862 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5863 * enter the following paths to parse WITHSCORES and LIMIT. */
5864 if (c->argc == 5 || c->argc == 8) {
5865 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5866 withscores = 1;
5867 else
5868 badsyntax = 1;
5869 }
5870 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5871 badsyntax = 1;
5872 if (badsyntax) {
5873 addReplySds(c,
5874 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5875 return;
5876 }
5877
5878 /* Parse "LIMIT" */
5879 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5880 addReply(c,shared.syntaxerr);
5881 return;
5882 } else if (c->argc == (7 + withscores)) {
5883 offset = atoi(c->argv[5]->ptr);
5884 limit = atoi(c->argv[6]->ptr);
5885 if (offset < 0) offset = 0;
5886 }
5887
5888 /* Ok, lookup the key and get the range */
5889 o = lookupKeyRead(c->db,c->argv[1]);
5890 if (o == NULL) {
5891 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5892 } else {
5893 if (o->type != REDIS_ZSET) {
5894 addReply(c,shared.wrongtypeerr);
5895 } else {
5896 zset *zsetobj = o->ptr;
5897 zskiplist *zsl = zsetobj->zsl;
5898 zskiplistNode *ln;
5899 robj *ele, *lenobj = NULL;
5900 unsigned long rangelen = 0;
5901
5902 /* Get the first node with the score >= min, or with
5903 * score > min if 'minex' is true. */
5904 ln = zslFirstWithScore(zsl,min);
5905 while (minex && ln && ln->score == min) ln = ln->forward[0];
5906
5907 if (ln == NULL) {
5908 /* No element matching the speciifed interval */
5909 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5910 return;
5911 }
5912
5913 /* We don't know in advance how many matching elements there
5914 * are in the list, so we push this object that will represent
5915 * the multi-bulk length in the output buffer, and will "fix"
5916 * it later */
5917 if (!justcount) {
5918 lenobj = createObject(REDIS_STRING,NULL);
5919 addReply(c,lenobj);
5920 decrRefCount(lenobj);
5921 }
5922
5923 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5924 if (offset) {
5925 offset--;
5926 ln = ln->forward[0];
5927 continue;
5928 }
5929 if (limit == 0) break;
5930 if (!justcount) {
5931 ele = ln->obj;
5932 addReplyBulk(c,ele);
5933 if (withscores)
5934 addReplyDouble(c,ln->score);
5935 }
5936 ln = ln->forward[0];
5937 rangelen++;
5938 if (limit > 0) limit--;
5939 }
5940 if (justcount) {
5941 addReplyLong(c,(long)rangelen);
5942 } else {
5943 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5944 withscores ? (rangelen*2) : rangelen);
5945 }
5946 }
5947 }
5948 }
5949
/* Command entry point: genericZrangebyscoreCommand() with 'justcount' set
 * to 0, so the matching elements are replied. */
static void zrangebyscoreCommand(redisClient *c) {
    genericZrangebyscoreCommand(c,0);
}
5953
/* Command entry point: genericZrangebyscoreCommand() with 'justcount' set
 * to 1, so only the number of matching elements is replied. */
static void zcountCommand(redisClient *c) {
    genericZrangebyscoreCommand(c,1);
}
5957
5958 static void zcardCommand(redisClient *c) {
5959 robj *o;
5960 zset *zs;
5961
5962 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5963 checkType(c,o,REDIS_ZSET)) return;
5964
5965 zs = o->ptr;
5966 addReplyUlong(c,zs->zsl->length);
5967 }
5968
5969 static void zscoreCommand(redisClient *c) {
5970 robj *o;
5971 zset *zs;
5972 dictEntry *de;
5973
5974 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5975 checkType(c,o,REDIS_ZSET)) return;
5976
5977 zs = o->ptr;
5978 de = dictFind(zs->dict,c->argv[2]);
5979 if (!de) {
5980 addReply(c,shared.nullbulk);
5981 } else {
5982 double *score = dictGetEntryVal(de);
5983
5984 addReplyDouble(c,*score);
5985 }
5986 }
5987
5988 static void zrankGenericCommand(redisClient *c, int reverse) {
5989 robj *o;
5990 zset *zs;
5991 zskiplist *zsl;
5992 dictEntry *de;
5993 unsigned long rank;
5994 double *score;
5995
5996 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5997 checkType(c,o,REDIS_ZSET)) return;
5998
5999 zs = o->ptr;
6000 zsl = zs->zsl;
6001 de = dictFind(zs->dict,c->argv[2]);
6002 if (!de) {
6003 addReply(c,shared.nullbulk);
6004 return;
6005 }
6006
6007 score = dictGetEntryVal(de);
6008 rank = zslGetRank(zsl, *score, c->argv[2]);
6009 if (rank) {
6010 if (reverse) {
6011 addReplyLong(c, zsl->length - rank);
6012 } else {
6013 addReplyLong(c, rank-1);
6014 }
6015 } else {
6016 addReply(c,shared.nullbulk);
6017 }
6018 }
6019
/* Command entry point: zrankGenericCommand() with 'reverse' set to 0. */
static void zrankCommand(redisClient *c) {
    zrankGenericCommand(c, 0);
}
6023
/* Command entry point: zrankGenericCommand() with 'reverse' set to 1. */
static void zrevrankCommand(redisClient *c) {
    zrankGenericCommand(c, 1);
}
6027
6028 /* =================================== Hashes =============================== */
6029 static void hsetCommand(redisClient *c) {
6030 int update = 0;
6031 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6032
6033 if (o == NULL) {
6034 o = createHashObject();
6035 dictAdd(c->db->dict,c->argv[1],o);
6036 incrRefCount(c->argv[1]);
6037 } else {
6038 if (o->type != REDIS_HASH) {
6039 addReply(c,shared.wrongtypeerr);
6040 return;
6041 }
6042 }
6043 /* We want to convert the zipmap into an hash table right now if the
6044 * entry to be added is too big. Note that we check if the object
6045 * is integer encoded before to try fetching the length in the test below.
6046 * This is because integers are small, but currently stringObjectLen()
6047 * performs a slow conversion: not worth it. */
6048 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
6049 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
6050 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
6051 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
6052 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
6053 {
6054 convertToRealHash(o);
6055 }
6056
6057 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6058 unsigned char *zm = o->ptr;
6059 robj *valobj = getDecodedObject(c->argv[3]);
6060
6061 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6062 valobj->ptr,sdslen(valobj->ptr),&update);
6063 decrRefCount(valobj);
6064 o->ptr = zm;
6065
6066 /* And here there is the second check for hash conversion. */
6067 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6068 convertToRealHash(o);
6069 } else {
6070 c->argv[2] = tryObjectEncoding(c->argv[2]);
6071 /* note that c->argv[3] is already encoded, as the latest arg
6072 * of a bulk command is always integer encoded if possible. */
6073 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
6074 incrRefCount(c->argv[2]);
6075 } else {
6076 update = 1;
6077 }
6078 incrRefCount(c->argv[3]);
6079 }
6080 server.dirty++;
6081 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6082 }
6083
6084 static void hmsetCommand(redisClient *c) {
6085 int i;
6086 robj *o, *key, *val;
6087
6088 if ((c->argc % 2) == 1) {
6089 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6090 return;
6091 }
6092
6093 if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
6094 o = createHashObject();
6095 dictAdd(c->db->dict,c->argv[1],o);
6096 incrRefCount(c->argv[1]);
6097 } else {
6098 if (o->type != REDIS_HASH) {
6099 addReply(c,shared.wrongtypeerr);
6100 return;
6101 }
6102 }
6103
6104 /* We want to convert the zipmap into an hash table right now if the
6105 * entry to be added is too big. */
6106 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6107 for (i = 2; i < c->argc; i+=2) {
6108 if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
6109 sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
6110 (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
6111 sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
6112 convertToRealHash(o);
6113 break;
6114 }
6115 }
6116 }
6117
6118 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6119 unsigned char *zm = o->ptr;
6120
6121 for (i = 2; i < c->argc; i+=2) {
6122 key = getDecodedObject(c->argv[i]);
6123 val = getDecodedObject(c->argv[i+1]);
6124 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
6125 val->ptr,sdslen(val->ptr),NULL);
6126 decrRefCount(key);
6127 decrRefCount(val);
6128 o->ptr = zm;
6129 }
6130
6131 /* And here there is the second check for hash conversion. */
6132 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6133 convertToRealHash(o);
6134 } else {
6135 for (i = 2; i < c->argc; i+=2) {
6136 key = tryObjectEncoding(c->argv[i]);
6137 val = tryObjectEncoding(c->argv[i+1]);
6138 if (dictReplace(o->ptr,key,val)) {
6139 incrRefCount(key);
6140 }
6141 incrRefCount(val);
6142 }
6143 }
6144
6145 addReply(c, shared.ok);
6146 }
6147
6148 static void hincrbyCommand(redisClient *c) {
6149 long long value = 0, incr = 0;
6150 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6151
6152 if (o == NULL) {
6153 o = createHashObject();
6154 dictAdd(c->db->dict,c->argv[1],o);
6155 incrRefCount(c->argv[1]);
6156 } else {
6157 if (o->type != REDIS_HASH) {
6158 addReply(c,shared.wrongtypeerr);
6159 return;
6160 }
6161 }
6162
6163 if (getLongLongFromObject(c, c->argv[3], &incr) != REDIS_OK) return;
6164
6165 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6166 unsigned char *zm = o->ptr;
6167 unsigned char *zval;
6168 unsigned int zvlen;
6169
6170 /* Find value if already present in hash */
6171 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6172 &zval,&zvlen)) {
6173 /* strtoll needs the char* to have a trailing \0, but
6174 * the zipmap doesn't include them. */
6175 sds szval = sdsnewlen(zval, zvlen);
6176 value = strtoll(szval,NULL,10);
6177 sdsfree(szval);
6178 }
6179
6180 value += incr;
6181 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6182 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6183 (unsigned char*)svalue,sdslen(svalue),NULL);
6184 sdsfree(svalue);
6185 o->ptr = zm;
6186
6187 /* Check if the zipmap needs to be converted. */
6188 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6189 convertToRealHash(o);
6190 } else {
6191 robj *hval;
6192 dictEntry *de;
6193
6194 /* Find value if already present in hash */
6195 de = dictFind(o->ptr,c->argv[2]);
6196 if (de != NULL) {
6197 hval = dictGetEntryVal(de);
6198 if (hval->encoding == REDIS_ENCODING_RAW)
6199 value = strtoll(hval->ptr,NULL,10);
6200 else if (hval->encoding == REDIS_ENCODING_INT)
6201 value = (long)hval->ptr;
6202 else
6203 redisAssert(1 != 1);
6204 }
6205
6206 value += incr;
6207 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6208 hval = tryObjectEncoding(hval);
6209 if (dictReplace(o->ptr,c->argv[2],hval)) {
6210 incrRefCount(c->argv[2]);
6211 }
6212 }
6213
6214 server.dirty++;
6215 addReplyLongLong(c, value);
6216 }
6217
6218 static void hgetCommand(redisClient *c) {
6219 robj *o;
6220
6221 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6222 checkType(c,o,REDIS_HASH)) return;
6223
6224 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6225 unsigned char *zm = o->ptr;
6226 unsigned char *val;
6227 unsigned int vlen;
6228 robj *field;
6229
6230 field = getDecodedObject(c->argv[2]);
6231 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6232 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6233 addReplySds(c,sdsnewlen(val,vlen));
6234 addReply(c,shared.crlf);
6235 decrRefCount(field);
6236 return;
6237 } else {
6238 addReply(c,shared.nullbulk);
6239 decrRefCount(field);
6240 return;
6241 }
6242 } else {
6243 struct dictEntry *de;
6244
6245 de = dictFind(o->ptr,c->argv[2]);
6246 if (de == NULL) {
6247 addReply(c,shared.nullbulk);
6248 } else {
6249 robj *e = dictGetEntryVal(de);
6250
6251 addReplyBulk(c,e);
6252 }
6253 }
6254 }
6255
6256 static void hmgetCommand(redisClient *c) {
6257 int i;
6258
6259 robj *o = lookupKeyRead(c->db, c->argv[1]);
6260 if (o == NULL) {
6261 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6262 for (i = 2; i < c->argc; i++) {
6263 addReply(c,shared.nullbulk);
6264 }
6265 return;
6266 } else {
6267 if (o->type != REDIS_HASH) {
6268 addReply(c,shared.wrongtypeerr);
6269 return;
6270 }
6271 }
6272
6273 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6274 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6275 unsigned char *zm = o->ptr;
6276 unsigned char *v;
6277 unsigned int vlen;
6278 robj *field;
6279
6280 for (i = 2; i < c->argc; i++) {
6281 field = getDecodedObject(c->argv[i]);
6282 if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&v,&vlen)) {
6283 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6284 addReplySds(c,sdsnewlen(v,vlen));
6285 addReply(c,shared.crlf);
6286 } else {
6287 addReply(c,shared.nullbulk);
6288 }
6289 decrRefCount(field);
6290 }
6291 } else {
6292 dictEntry *de;
6293
6294 for (i = 2; i < c->argc; i++) {
6295 de = dictFind(o->ptr,c->argv[i]);
6296 if (de != NULL) {
6297 addReplyBulk(c,(robj*)dictGetEntryVal(de));
6298 } else {
6299 addReply(c,shared.nullbulk);
6300 }
6301 }
6302 }
6303 }
6304
6305 static void hdelCommand(redisClient *c) {
6306 robj *o;
6307 int deleted = 0;
6308
6309 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6310 checkType(c,o,REDIS_HASH)) return;
6311
6312 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6313 robj *field = getDecodedObject(c->argv[2]);
6314
6315 o->ptr = zipmapDel((unsigned char*) o->ptr,
6316 (unsigned char*) field->ptr,
6317 sdslen(field->ptr), &deleted);
6318 decrRefCount(field);
6319 if (zipmapLen((unsigned char*) o->ptr) == 0)
6320 deleteKey(c->db,c->argv[1]);
6321 } else {
6322 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6323 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6324 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6325 }
6326 if (deleted) server.dirty++;
6327 addReply(c,deleted ? shared.cone : shared.czero);
6328 }
6329
6330 static void hlenCommand(redisClient *c) {
6331 robj *o;
6332 unsigned long len;
6333
6334 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6335 checkType(c,o,REDIS_HASH)) return;
6336
6337 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6338 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6339 addReplyUlong(c,len);
6340 }
6341
6342 #define REDIS_GETALL_KEYS 1
6343 #define REDIS_GETALL_VALS 2
6344 static void genericHgetallCommand(redisClient *c, int flags) {
6345 robj *o, *lenobj;
6346 unsigned long count = 0;
6347
6348 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6349 || checkType(c,o,REDIS_HASH)) return;
6350
6351 lenobj = createObject(REDIS_STRING,NULL);
6352 addReply(c,lenobj);
6353 decrRefCount(lenobj);
6354
6355 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6356 unsigned char *p = zipmapRewind(o->ptr);
6357 unsigned char *field, *val;
6358 unsigned int flen, vlen;
6359
6360 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6361 robj *aux;
6362
6363 if (flags & REDIS_GETALL_KEYS) {
6364 aux = createStringObject((char*)field,flen);
6365 addReplyBulk(c,aux);
6366 decrRefCount(aux);
6367 count++;
6368 }
6369 if (flags & REDIS_GETALL_VALS) {
6370 aux = createStringObject((char*)val,vlen);
6371 addReplyBulk(c,aux);
6372 decrRefCount(aux);
6373 count++;
6374 }
6375 }
6376 } else {
6377 dictIterator *di = dictGetIterator(o->ptr);
6378 dictEntry *de;
6379
6380 while((de = dictNext(di)) != NULL) {
6381 robj *fieldobj = dictGetEntryKey(de);
6382 robj *valobj = dictGetEntryVal(de);
6383
6384 if (flags & REDIS_GETALL_KEYS) {
6385 addReplyBulk(c,fieldobj);
6386 count++;
6387 }
6388 if (flags & REDIS_GETALL_VALS) {
6389 addReplyBulk(c,valobj);
6390 count++;
6391 }
6392 }
6393 dictReleaseIterator(di);
6394 }
6395 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6396 }
6397
/* Command entry point: genericHgetallCommand() asking for field names only
 * (REDIS_GETALL_KEYS). */
static void hkeysCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_KEYS);
}
6401
/* Command entry point: genericHgetallCommand() asking for values only
 * (REDIS_GETALL_VALS). */
static void hvalsCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_VALS);
}
6405
/* Command entry point: genericHgetallCommand() asking for both field names
 * and values (REDIS_GETALL_KEYS|REDIS_GETALL_VALS). */
static void hgetallCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
}
6409
6410 static void hexistsCommand(redisClient *c) {
6411 robj *o;
6412 int exists = 0;
6413
6414 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6415 checkType(c,o,REDIS_HASH)) return;
6416
6417 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6418 robj *field;
6419 unsigned char *zm = o->ptr;
6420
6421 field = getDecodedObject(c->argv[2]);
6422 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6423 decrRefCount(field);
6424 } else {
6425 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6426 }
6427 addReply(c,exists ? shared.cone : shared.czero);
6428 }
6429
6430 static void convertToRealHash(robj *o) {
6431 unsigned char *key, *val, *p, *zm = o->ptr;
6432 unsigned int klen, vlen;
6433 dict *dict = dictCreate(&hashDictType,NULL);
6434
6435 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6436 p = zipmapRewind(zm);
6437 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6438 robj *keyobj, *valobj;
6439
6440 keyobj = createStringObject((char*)key,klen);
6441 valobj = createStringObject((char*)val,vlen);
6442 keyobj = tryObjectEncoding(keyobj);
6443 valobj = tryObjectEncoding(valobj);
6444 dictAdd(dict,keyobj,valobj);
6445 }
6446 o->encoding = REDIS_ENCODING_HT;
6447 o->ptr = dict;
6448 zfree(zm);
6449 }
6450
6451 /* ========================= Non type-specific commands ==================== */
6452
6453 static void flushdbCommand(redisClient *c) {
6454 server.dirty += dictSize(c->db->dict);
6455 dictEmpty(c->db->dict);
6456 dictEmpty(c->db->expires);
6457 addReply(c,shared.ok);
6458 }
6459
/* Empties the dataset through emptyDb(), adding its return value to the
 * server dirty counter, and replies OK. If a background save child is
 * recorded in server.bgsavechildpid it is killed and its temp file is
 * removed; the dataset is then saved synchronously with rdbSave(). */
static void flushallCommand(redisClient *c) {
    server.dirty += emptyDb();
    addReply(c,shared.ok);
    if (server.bgsavechildpid != -1) {
        /* A save in progress would dump the old dataset: stop it. */
        kill(server.bgsavechildpid,SIGKILL);
        rdbRemoveTempFile(server.bgsavechildpid);
    }
    rdbSave(server.dbfilename);
    /* NOTE(review): presumably rdbSave() resets the dirty counter, so it is
     * bumped again here to keep the command counted as a change -- confirm
     * against rdbSave(). */
    server.dirty++;
}
6470
6471 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6472 redisSortOperation *so = zmalloc(sizeof(*so));
6473 so->type = type;
6474 so->pattern = pattern;
6475 return so;
6476 }
6477
6478 /* Return the value associated to the key with a name obtained
6479 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6480 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6481 char *p;
6482 sds spat, ssub;
6483 robj keyobj;
6484 int prefixlen, sublen, postfixlen;
6485 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6486 struct {
6487 long len;
6488 long free;
6489 char buf[REDIS_SORTKEY_MAX+1];
6490 } keyname;
6491
6492 /* If the pattern is "#" return the substitution object itself in order
6493 * to implement the "SORT ... GET #" feature. */
6494 spat = pattern->ptr;
6495 if (spat[0] == '#' && spat[1] == '\0') {
6496 return subst;
6497 }
6498
6499 /* The substitution object may be specially encoded. If so we create
6500 * a decoded object on the fly. Otherwise getDecodedObject will just
6501 * increment the ref count, that we'll decrement later. */
6502 subst = getDecodedObject(subst);
6503
6504 ssub = subst->ptr;
6505 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6506 p = strchr(spat,'*');
6507 if (!p) {
6508 decrRefCount(subst);
6509 return NULL;
6510 }
6511
6512 prefixlen = p-spat;
6513 sublen = sdslen(ssub);
6514 postfixlen = sdslen(spat)-(prefixlen+1);
6515 memcpy(keyname.buf,spat,prefixlen);
6516 memcpy(keyname.buf+prefixlen,ssub,sublen);
6517 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6518 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6519 keyname.len = prefixlen+sublen+postfixlen;
6520
6521 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6522 decrRefCount(subst);
6523
6524 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6525 return lookupKeyRead(db,&keyobj);
6526 }
6527
6528 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6529 * the additional parameter is not standard but a BSD-specific we have to
6530 * pass sorting parameters via the global 'server' structure */
6531 static int sortCompare(const void *s1, const void *s2) {
6532 const redisSortObject *so1 = s1, *so2 = s2;
6533 int cmp;
6534
6535 if (!server.sort_alpha) {
6536 /* Numeric sorting. Here it's trivial as we precomputed scores */
6537 if (so1->u.score > so2->u.score) {
6538 cmp = 1;
6539 } else if (so1->u.score < so2->u.score) {
6540 cmp = -1;
6541 } else {
6542 cmp = 0;
6543 }
6544 } else {
6545 /* Alphanumeric sorting */
6546 if (server.sort_bypattern) {
6547 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6548 /* At least one compare object is NULL */
6549 if (so1->u.cmpobj == so2->u.cmpobj)
6550 cmp = 0;
6551 else if (so1->u.cmpobj == NULL)
6552 cmp = -1;
6553 else
6554 cmp = 1;
6555 } else {
6556 /* We have both the objects, use strcoll */
6557 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6558 }
6559 } else {
6560 /* Compare elements directly */
6561 robj *dec1, *dec2;
6562
6563 dec1 = getDecodedObject(so1->obj);
6564 dec2 = getDecodedObject(so2->obj);
6565 cmp = strcoll(dec1->ptr,dec2->ptr);
6566 decrRefCount(dec1);
6567 decrRefCount(dec2);
6568 }
6569 }
6570 return server.sort_desc ? -cmp : cmp;
6571 }
6572
6573 /* The SORT command is the most complex command in Redis. Warning: this code
6574 * is optimized for speed and a bit less for readability */
6575 static void sortCommand(redisClient *c) {
6576 list *operations;
6577 int outputlen = 0;
6578 int desc = 0, alpha = 0;
6579 int limit_start = 0, limit_count = -1, start, end;
6580 int j, dontsort = 0, vectorlen;
6581 int getop = 0; /* GET operation counter */
6582 robj *sortval, *sortby = NULL, *storekey = NULL;
6583 redisSortObject *vector; /* Resulting vector to sort */
6584
6585 /* Lookup the key to sort. It must be of the right types */
6586 sortval = lookupKeyRead(c->db,c->argv[1]);
6587 if (sortval == NULL) {
6588 addReply(c,shared.emptymultibulk);
6589 return;
6590 }
6591 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6592 sortval->type != REDIS_ZSET)
6593 {
6594 addReply(c,shared.wrongtypeerr);
6595 return;
6596 }
6597
6598 /* Create a list of operations to perform for every sorted element.
6599 * Operations can be GET/DEL/INCR/DECR */
6600 operations = listCreate();
6601 listSetFreeMethod(operations,zfree);
6602 j = 2;
6603
6604 /* Now we need to protect sortval incrementing its count, in the future
6605 * SORT may have options able to overwrite/delete keys during the sorting
6606 * and the sorted key itself may get destroied */
6607 incrRefCount(sortval);
6608
6609 /* The SORT command has an SQL-alike syntax, parse it */
6610 while(j < c->argc) {
6611 int leftargs = c->argc-j-1;
6612 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6613 desc = 0;
6614 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6615 desc = 1;
6616 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6617 alpha = 1;
6618 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6619 limit_start = atoi(c->argv[j+1]->ptr);
6620 limit_count = atoi(c->argv[j+2]->ptr);
6621 j+=2;
6622 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6623 storekey = c->argv[j+1];
6624 j++;
6625 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6626 sortby = c->argv[j+1];
6627 /* If the BY pattern does not contain '*', i.e. it is constant,
6628 * we don't need to sort nor to lookup the weight keys. */
6629 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6630 j++;
6631 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6632 listAddNodeTail(operations,createSortOperation(
6633 REDIS_SORT_GET,c->argv[j+1]));
6634 getop++;
6635 j++;
6636 } else {
6637 decrRefCount(sortval);
6638 listRelease(operations);
6639 addReply(c,shared.syntaxerr);
6640 return;
6641 }
6642 j++;
6643 }
6644
6645 /* Load the sorting vector with all the objects to sort */
6646 switch(sortval->type) {
6647 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6648 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6649 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6650 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6651 }
6652 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6653 j = 0;
6654
6655 if (sortval->type == REDIS_LIST) {
6656 list *list = sortval->ptr;
6657 listNode *ln;
6658 listIter li;
6659
6660 listRewind(list,&li);
6661 while((ln = listNext(&li))) {
6662 robj *ele = ln->value;
6663 vector[j].obj = ele;
6664 vector[j].u.score = 0;
6665 vector[j].u.cmpobj = NULL;
6666 j++;
6667 }
6668 } else {
6669 dict *set;
6670 dictIterator *di;
6671 dictEntry *setele;
6672
6673 if (sortval->type == REDIS_SET) {
6674 set = sortval->ptr;
6675 } else {
6676 zset *zs = sortval->ptr;
6677 set = zs->dict;
6678 }
6679
6680 di = dictGetIterator(set);
6681 while((setele = dictNext(di)) != NULL) {
6682 vector[j].obj = dictGetEntryKey(setele);
6683 vector[j].u.score = 0;
6684 vector[j].u.cmpobj = NULL;
6685 j++;
6686 }
6687 dictReleaseIterator(di);
6688 }
6689 redisAssert(j == vectorlen);
6690
6691 /* Now it's time to load the right scores in the sorting vector */
6692 if (dontsort == 0) {
6693 for (j = 0; j < vectorlen; j++) {
6694 if (sortby) {
6695 robj *byval;
6696
6697 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6698 if (!byval || byval->type != REDIS_STRING) continue;
6699 if (alpha) {
6700 vector[j].u.cmpobj = getDecodedObject(byval);
6701 } else {
6702 if (byval->encoding == REDIS_ENCODING_RAW) {
6703 vector[j].u.score = strtod(byval->ptr,NULL);
6704 } else {
6705 /* Don't need to decode the object if it's
6706 * integer-encoded (the only encoding supported) so
6707 * far. We can just cast it */
6708 if (byval->encoding == REDIS_ENCODING_INT) {
6709 vector[j].u.score = (long)byval->ptr;
6710 } else
6711 redisAssert(1 != 1);
6712 }
6713 }
6714 } else {
6715 if (!alpha) {
6716 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6717 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6718 else {
6719 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6720 vector[j].u.score = (long) vector[j].obj->ptr;
6721 else
6722 redisAssert(1 != 1);
6723 }
6724 }
6725 }
6726 }
6727 }
6728
6729 /* We are ready to sort the vector... perform a bit of sanity check
6730 * on the LIMIT option too. We'll use a partial version of quicksort. */
6731 start = (limit_start < 0) ? 0 : limit_start;
6732 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6733 if (start >= vectorlen) {
6734 start = vectorlen-1;
6735 end = vectorlen-2;
6736 }
6737 if (end >= vectorlen) end = vectorlen-1;
6738
6739 if (dontsort == 0) {
6740 server.sort_desc = desc;
6741 server.sort_alpha = alpha;
6742 server.sort_bypattern = sortby ? 1 : 0;
6743 if (sortby && (start != 0 || end != vectorlen-1))
6744 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6745 else
6746 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6747 }
6748
6749 /* Send command output to the output buffer, performing the specified
6750 * GET/DEL/INCR/DECR operations if any. */
6751 outputlen = getop ? getop*(end-start+1) : end-start+1;
6752 if (storekey == NULL) {
6753 /* STORE option not specified, sent the sorting result to client */
6754 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6755 for (j = start; j <= end; j++) {
6756 listNode *ln;
6757 listIter li;
6758
6759 if (!getop) addReplyBulk(c,vector[j].obj);
6760 listRewind(operations,&li);
6761 while((ln = listNext(&li))) {
6762 redisSortOperation *sop = ln->value;
6763 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6764 vector[j].obj);
6765
6766 if (sop->type == REDIS_SORT_GET) {
6767 if (!val || val->type != REDIS_STRING) {
6768 addReply(c,shared.nullbulk);
6769 } else {
6770 addReplyBulk(c,val);
6771 }
6772 } else {
6773 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6774 }
6775 }
6776 }
6777 } else {
6778 robj *listObject = createListObject();
6779 list *listPtr = (list*) listObject->ptr;
6780
6781 /* STORE option specified, set the sorting result as a List object */
6782 for (j = start; j <= end; j++) {
6783 listNode *ln;
6784 listIter li;
6785
6786 if (!getop) {
6787 listAddNodeTail(listPtr,vector[j].obj);
6788 incrRefCount(vector[j].obj);
6789 }
6790 listRewind(operations,&li);
6791 while((ln = listNext(&li))) {
6792 redisSortOperation *sop = ln->value;
6793 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6794 vector[j].obj);
6795
6796 if (sop->type == REDIS_SORT_GET) {
6797 if (!val || val->type != REDIS_STRING) {
6798 listAddNodeTail(listPtr,createStringObject("",0));
6799 } else {
6800 listAddNodeTail(listPtr,val);
6801 incrRefCount(val);
6802 }
6803 } else {
6804 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6805 }
6806 }
6807 }
6808 if (dictReplace(c->db->dict,storekey,listObject)) {
6809 incrRefCount(storekey);
6810 }
6811 /* Note: we add 1 because the DB is dirty anyway since even if the
6812 * SORT result is empty a new key is set and maybe the old content
6813 * replaced. */
6814 server.dirty += 1+outputlen;
6815 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6816 }
6817
6818 /* Cleanup */
6819 decrRefCount(sortval);
6820 listRelease(operations);
6821 for (j = 0; j < vectorlen; j++) {
6822 if (sortby && alpha && vector[j].u.cmpobj)
6823 decrRefCount(vector[j].u.cmpobj);
6824 }
6825 zfree(vector);
6826 }
6827
/* Render a byte count as a short human readable string such as 100B, 4.00K,
 * 100.00M, 2.00G or 1.50T.
 *
 * 's' is the destination buffer. It is filled with sprintf(), so no length
 * is checked here: the caller must supply enough room (the longest possible
 * output, for the largest unsigned long long, is "16777216.00T").
 * 'n' is the amount of bytes to describe.
 *
 * Values below 1024 are printed as a plain integer followed by 'B'; larger
 * values are divided by the matching power of 1024 and printed with two
 * decimals and a K, M, G or T suffix. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte and above. Without this branch nothing was written
         * to 's' and the caller ended up printing an uninitialized buffer. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6848
6849 /* Create the string returned by the INFO command. This is decoupled
6850 * by the INFO command itself as we need to report the same information
6851 * on memory corruption problems. */
6852 static sds genRedisInfoString(void) {
6853 sds info;
6854 time_t uptime = time(NULL)-server.stat_starttime;
6855 int j;
6856 char hmem[64];
6857
6858 bytesToHuman(hmem,zmalloc_used_memory());
6859 info = sdscatprintf(sdsempty(),
6860 "redis_version:%s\r\n"
6861 "arch_bits:%s\r\n"
6862 "multiplexing_api:%s\r\n"
6863 "process_id:%ld\r\n"
6864 "uptime_in_seconds:%ld\r\n"
6865 "uptime_in_days:%ld\r\n"
6866 "connected_clients:%d\r\n"
6867 "connected_slaves:%d\r\n"
6868 "blocked_clients:%d\r\n"
6869 "used_memory:%zu\r\n"
6870 "used_memory_human:%s\r\n"
6871 "changes_since_last_save:%lld\r\n"
6872 "bgsave_in_progress:%d\r\n"
6873 "last_save_time:%ld\r\n"
6874 "bgrewriteaof_in_progress:%d\r\n"
6875 "total_connections_received:%lld\r\n"
6876 "total_commands_processed:%lld\r\n"
6877 "expired_keys:%lld\r\n"
6878 "hash_max_zipmap_entries:%ld\r\n"
6879 "hash_max_zipmap_value:%ld\r\n"
6880 "pubsub_channels:%ld\r\n"
6881 "pubsub_patterns:%u\r\n"
6882 "vm_enabled:%d\r\n"
6883 "role:%s\r\n"
6884 ,REDIS_VERSION,
6885 (sizeof(long) == 8) ? "64" : "32",
6886 aeGetApiName(),
6887 (long) getpid(),
6888 uptime,
6889 uptime/(3600*24),
6890 listLength(server.clients)-listLength(server.slaves),
6891 listLength(server.slaves),
6892 server.blpop_blocked_clients,
6893 zmalloc_used_memory(),
6894 hmem,
6895 server.dirty,
6896 server.bgsavechildpid != -1,
6897 server.lastsave,
6898 server.bgrewritechildpid != -1,
6899 server.stat_numconnections,
6900 server.stat_numcommands,
6901 server.stat_expiredkeys,
6902 server.hash_max_zipmap_entries,
6903 server.hash_max_zipmap_value,
6904 dictSize(server.pubsub_channels),
6905 listLength(server.pubsub_patterns),
6906 server.vm_enabled != 0,
6907 server.masterhost == NULL ? "master" : "slave"
6908 );
6909 if (server.masterhost) {
6910 info = sdscatprintf(info,
6911 "master_host:%s\r\n"
6912 "master_port:%d\r\n"
6913 "master_link_status:%s\r\n"
6914 "master_last_io_seconds_ago:%d\r\n"
6915 ,server.masterhost,
6916 server.masterport,
6917 (server.replstate == REDIS_REPL_CONNECTED) ?
6918 "up" : "down",
6919 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6920 );
6921 }
6922 if (server.vm_enabled) {
6923 lockThreadedIO();
6924 info = sdscatprintf(info,
6925 "vm_conf_max_memory:%llu\r\n"
6926 "vm_conf_page_size:%llu\r\n"
6927 "vm_conf_pages:%llu\r\n"
6928 "vm_stats_used_pages:%llu\r\n"
6929 "vm_stats_swapped_objects:%llu\r\n"
6930 "vm_stats_swappin_count:%llu\r\n"
6931 "vm_stats_swappout_count:%llu\r\n"
6932 "vm_stats_io_newjobs_len:%lu\r\n"
6933 "vm_stats_io_processing_len:%lu\r\n"
6934 "vm_stats_io_processed_len:%lu\r\n"
6935 "vm_stats_io_active_threads:%lu\r\n"
6936 "vm_stats_blocked_clients:%lu\r\n"
6937 ,(unsigned long long) server.vm_max_memory,
6938 (unsigned long long) server.vm_page_size,
6939 (unsigned long long) server.vm_pages,
6940 (unsigned long long) server.vm_stats_used_pages,
6941 (unsigned long long) server.vm_stats_swapped_objects,
6942 (unsigned long long) server.vm_stats_swapins,
6943 (unsigned long long) server.vm_stats_swapouts,
6944 (unsigned long) listLength(server.io_newjobs),
6945 (unsigned long) listLength(server.io_processing),
6946 (unsigned long) listLength(server.io_processed),
6947 (unsigned long) server.io_active_threads,
6948 (unsigned long) server.vm_blocked_clients
6949 );
6950 unlockThreadedIO();
6951 }
6952 for (j = 0; j < server.dbnum; j++) {
6953 long long keys, vkeys;
6954
6955 keys = dictSize(server.db[j].dict);
6956 vkeys = dictSize(server.db[j].expires);
6957 if (keys || vkeys) {
6958 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6959 j, keys, vkeys);
6960 }
6961 }
6962 return info;
6963 }
6964
6965 static void infoCommand(redisClient *c) {
6966 sds info = genRedisInfoString();
6967 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6968 (unsigned long)sdslen(info)));
6969 addReplySds(c,info);
6970 addReply(c,shared.crlf);
6971 }
6972
6973 static void monitorCommand(redisClient *c) {
6974 /* ignore MONITOR if aleady slave or in monitor mode */
6975 if (c->flags & REDIS_SLAVE) return;
6976
6977 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6978 c->slaveseldb = 0;
6979 listAddNodeTail(server.monitors,c);
6980 addReply(c,shared.ok);
6981 }
6982
6983 /* ================================= Expire ================================= */
6984 static int removeExpire(redisDb *db, robj *key) {
6985 if (dictDelete(db->expires,key) == DICT_OK) {
6986 return 1;
6987 } else {
6988 return 0;
6989 }
6990 }
6991
6992 static int setExpire(redisDb *db, robj *key, time_t when) {
6993 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6994 return 0;
6995 } else {
6996 incrRefCount(key);
6997 return 1;
6998 }
6999 }
7000
7001 /* Return the expire time of the specified key, or -1 if no expire
7002 * is associated with this key (i.e. the key is non volatile) */
7003 static time_t getExpire(redisDb *db, robj *key) {
7004 dictEntry *de;
7005
7006 /* No expire? return ASAP */
7007 if (dictSize(db->expires) == 0 ||
7008 (de = dictFind(db->expires,key)) == NULL) return -1;
7009
7010 return (time_t) dictGetEntryVal(de);
7011 }
7012
7013 static int expireIfNeeded(redisDb *db, robj *key) {
7014 time_t when;
7015 dictEntry *de;
7016
7017 /* No expire? return ASAP */
7018 if (dictSize(db->expires) == 0 ||
7019 (de = dictFind(db->expires,key)) == NULL) return 0;
7020
7021 /* Lookup the expire */
7022 when = (time_t) dictGetEntryVal(de);
7023 if (time(NULL) <= when) return 0;
7024
7025 /* Delete the key */
7026 dictDelete(db->expires,key);
7027 server.stat_expiredkeys++;
7028 return dictDelete(db->dict,key) == DICT_OK;
7029 }
7030
7031 static int deleteIfVolatile(redisDb *db, robj *key) {
7032 dictEntry *de;
7033
7034 /* No expire? return ASAP */
7035 if (dictSize(db->expires) == 0 ||
7036 (de = dictFind(db->expires,key)) == NULL) return 0;
7037
7038 /* Delete the key */
7039 server.dirty++;
7040 server.stat_expiredkeys++;
7041 dictDelete(db->expires,key);
7042 return dictDelete(db->dict,key) == DICT_OK;
7043 }
7044
7045 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7046 dictEntry *de;
7047 time_t seconds;
7048
7049 if (getLongFromObject(c, param, &seconds) != REDIS_OK) return;
7050
7051 seconds -= offset;
7052
7053 de = dictFind(c->db->dict,key);
7054 if (de == NULL) {
7055 addReply(c,shared.czero);
7056 return;
7057 }
7058 if (seconds < 0) {
7059 if (deleteKey(c->db,key)) server.dirty++;
7060 addReply(c, shared.cone);
7061 return;
7062 } else {
7063 time_t when = time(NULL)+seconds;
7064 if (setExpire(c->db,key,when)) {
7065 addReply(c,shared.cone);
7066 server.dirty++;
7067 } else {
7068 addReply(c,shared.czero);
7069 }
7070 return;
7071 }
7072 }
7073
7074 static void expireCommand(redisClient *c) {
7075 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7076 }
7077
7078 static void expireatCommand(redisClient *c) {
7079 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7080 }
7081
7082 static void ttlCommand(redisClient *c) {
7083 time_t expire;
7084 int ttl = -1;
7085
7086 expire = getExpire(c->db,c->argv[1]);
7087 if (expire != -1) {
7088 ttl = (int) (expire-time(NULL));
7089 if (ttl < 0) ttl = -1;
7090 }
7091 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7092 }
7093
7094 /* ================================ MULTI/EXEC ============================== */
7095
7096 /* Client state initialization for MULTI/EXEC */
7097 static void initClientMultiState(redisClient *c) {
7098 c->mstate.commands = NULL;
7099 c->mstate.count = 0;
7100 }
7101
7102 /* Release all the resources associated with MULTI/EXEC state */
7103 static void freeClientMultiState(redisClient *c) {
7104 int j;
7105
7106 for (j = 0; j < c->mstate.count; j++) {
7107 int i;
7108 multiCmd *mc = c->mstate.commands+j;
7109
7110 for (i = 0; i < mc->argc; i++)
7111 decrRefCount(mc->argv[i]);
7112 zfree(mc->argv);
7113 }
7114 zfree(c->mstate.commands);
7115 }
7116
7117 /* Add a new command into the MULTI commands queue */
7118 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7119 multiCmd *mc;
7120 int j;
7121
7122 c->mstate.commands = zrealloc(c->mstate.commands,
7123 sizeof(multiCmd)*(c->mstate.count+1));
7124 mc = c->mstate.commands+c->mstate.count;
7125 mc->cmd = cmd;
7126 mc->argc = c->argc;
7127 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7128 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7129 for (j = 0; j < c->argc; j++)
7130 incrRefCount(mc->argv[j]);
7131 c->mstate.count++;
7132 }
7133
7134 static void multiCommand(redisClient *c) {
7135 c->flags |= REDIS_MULTI;
7136 addReply(c,shared.ok);
7137 }
7138
7139 static void discardCommand(redisClient *c) {
7140 if (!(c->flags & REDIS_MULTI)) {
7141 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7142 return;
7143 }
7144
7145 freeClientMultiState(c);
7146 initClientMultiState(c);
7147 c->flags &= (~REDIS_MULTI);
7148 addReply(c,shared.ok);
7149 }
7150
7151 static void execCommand(redisClient *c) {
7152 int j;
7153 robj **orig_argv;
7154 int orig_argc;
7155
7156 if (!(c->flags & REDIS_MULTI)) {
7157 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7158 return;
7159 }
7160
7161 orig_argv = c->argv;
7162 orig_argc = c->argc;
7163 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7164 for (j = 0; j < c->mstate.count; j++) {
7165 c->argc = c->mstate.commands[j].argc;
7166 c->argv = c->mstate.commands[j].argv;
7167 call(c,c->mstate.commands[j].cmd);
7168 }
7169 c->argv = orig_argv;
7170 c->argc = orig_argc;
7171 freeClientMultiState(c);
7172 initClientMultiState(c);
7173 c->flags &= (~REDIS_MULTI);
7174 }
7175
7176 /* =========================== Blocking Operations ========================= */
7177
7178 /* Currently Redis blocking operations support is limited to list POP ops,
7179 * so the current implementation is not fully generic, but it is also not
7180 * completely specific so it will not require a rewrite to support new
7181 * kind of blocking operations in the future.
7182 *
7183 * Still it's important to note that list blocking operations can be already
7184 * used as a notification mechanism in order to implement other blocking
7185 * operations at application level, so there must be a very strong evidence
7186 * of usefulness and generality before new blocking operations are implemented.
7187 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7202 *
7203 * The above comment and the source code should be enough in order to understand
7204 * the implementation and modify / fix it later.
7205 */
7206
7207 /* Set a client in blocking mode for the specified key, with the specified
7208 * timeout */
7209 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7210 dictEntry *de;
7211 list *l;
7212 int j;
7213
7214 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7215 c->blockingkeysnum = numkeys;
7216 c->blockingto = timeout;
7217 for (j = 0; j < numkeys; j++) {
7218 /* Add the key in the client structure, to map clients -> keys */
7219 c->blockingkeys[j] = keys[j];
7220 incrRefCount(keys[j]);
7221
7222 /* And in the other "side", to map keys -> clients */
7223 de = dictFind(c->db->blockingkeys,keys[j]);
7224 if (de == NULL) {
7225 int retval;
7226
7227 /* For every key we take a list of clients blocked for it */
7228 l = listCreate();
7229 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7230 incrRefCount(keys[j]);
7231 assert(retval == DICT_OK);
7232 } else {
7233 l = dictGetEntryVal(de);
7234 }
7235 listAddNodeTail(l,c);
7236 }
7237 /* Mark the client as a blocked client */
7238 c->flags |= REDIS_BLOCKED;
7239 server.blpop_blocked_clients++;
7240 }
7241
7242 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7243 static void unblockClientWaitingData(redisClient *c) {
7244 dictEntry *de;
7245 list *l;
7246 int j;
7247
7248 assert(c->blockingkeys != NULL);
7249 /* The client may wait for multiple keys, so unblock it for every key. */
7250 for (j = 0; j < c->blockingkeysnum; j++) {
7251 /* Remove this client from the list of clients waiting for this key. */
7252 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7253 assert(de != NULL);
7254 l = dictGetEntryVal(de);
7255 listDelNode(l,listSearchKey(l,c));
7256 /* If the list is empty we need to remove it to avoid wasting memory */
7257 if (listLength(l) == 0)
7258 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7259 decrRefCount(c->blockingkeys[j]);
7260 }
7261 /* Cleanup the client structure */
7262 zfree(c->blockingkeys);
7263 c->blockingkeys = NULL;
7264 c->flags &= (~REDIS_BLOCKED);
7265 server.blpop_blocked_clients--;
7266 /* We want to process data if there is some command waiting
7267 * in the input buffer. Note that this is safe even if
7268 * unblockClientWaitingData() gets called from freeClient() because
7269 * freeClient() will be smart enough to call this function
7270 * *after* c->querybuf was set to NULL. */
7271 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7272 }
7273
7274 /* This should be called from any function PUSHing into lists.
7275 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7276 * 'ele' is the element pushed.
7277 *
7278 * If the function returns 0 there was no client waiting for a list push
7279 * against this key.
7280 *
7281 * If the function returns 1 there was a client waiting for a list push
7282 * against this key, the element was passed to this client thus it's not
7283 * needed to actually add it to the list and the caller should return asap. */
7284 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7285 struct dictEntry *de;
7286 redisClient *receiver;
7287 list *l;
7288 listNode *ln;
7289
7290 de = dictFind(c->db->blockingkeys,key);
7291 if (de == NULL) return 0;
7292 l = dictGetEntryVal(de);
7293 ln = listFirst(l);
7294 assert(ln != NULL);
7295 receiver = ln->value;
7296
7297 addReplySds(receiver,sdsnew("*2\r\n"));
7298 addReplyBulk(receiver,key);
7299 addReplyBulk(receiver,ele);
7300 unblockClientWaitingData(receiver);
7301 return 1;
7302 }
7303
7304 /* Blocking RPOP/LPOP */
7305 static void blockingPopGenericCommand(redisClient *c, int where) {
7306 robj *o;
7307 time_t timeout;
7308 int j;
7309
7310 for (j = 1; j < c->argc-1; j++) {
7311 o = lookupKeyWrite(c->db,c->argv[j]);
7312 if (o != NULL) {
7313 if (o->type != REDIS_LIST) {
7314 addReply(c,shared.wrongtypeerr);
7315 return;
7316 } else {
7317 list *list = o->ptr;
7318 if (listLength(list) != 0) {
7319 /* If the list contains elements fall back to the usual
7320 * non-blocking POP operation */
7321 robj *argv[2], **orig_argv;
7322 int orig_argc;
7323
7324 /* We need to alter the command arguments before to call
7325 * popGenericCommand() as the command takes a single key. */
7326 orig_argv = c->argv;
7327 orig_argc = c->argc;
7328 argv[1] = c->argv[j];
7329 c->argv = argv;
7330 c->argc = 2;
7331
7332 /* Also the return value is different, we need to output
7333 * the multi bulk reply header and the key name. The
7334 * "real" command will add the last element (the value)
7335 * for us. If this souds like an hack to you it's just
7336 * because it is... */
7337 addReplySds(c,sdsnew("*2\r\n"));
7338 addReplyBulk(c,argv[1]);
7339 popGenericCommand(c,where);
7340
7341 /* Fix the client structure with the original stuff */
7342 c->argv = orig_argv;
7343 c->argc = orig_argc;
7344 return;
7345 }
7346 }
7347 }
7348 }
7349 /* If the list is empty or the key does not exists we must block */
7350 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7351 if (timeout > 0) timeout += time(NULL);
7352 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7353 }
7354
7355 static void blpopCommand(redisClient *c) {
7356 blockingPopGenericCommand(c,REDIS_HEAD);
7357 }
7358
7359 static void brpopCommand(redisClient *c) {
7360 blockingPopGenericCommand(c,REDIS_TAIL);
7361 }
7362
7363 /* =============================== Replication ============================= */
7364
7365 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7366 ssize_t nwritten, ret = size;
7367 time_t start = time(NULL);
7368
7369 timeout++;
7370 while(size) {
7371 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7372 nwritten = write(fd,ptr,size);
7373 if (nwritten == -1) return -1;
7374 ptr += nwritten;
7375 size -= nwritten;
7376 }
7377 if ((time(NULL)-start) > timeout) {
7378 errno = ETIMEDOUT;
7379 return -1;
7380 }
7381 }
7382 return ret;
7383 }
7384
7385 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7386 ssize_t nread, totread = 0;
7387 time_t start = time(NULL);
7388
7389 timeout++;
7390 while(size) {
7391 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7392 nread = read(fd,ptr,size);
7393 if (nread == -1) return -1;
7394 ptr += nread;
7395 size -= nread;
7396 totread += nread;
7397 }
7398 if ((time(NULL)-start) > timeout) {
7399 errno = ETIMEDOUT;
7400 return -1;
7401 }
7402 }
7403 return totread;
7404 }
7405
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time using syncRead() (so 'timeout' applies to every single byte).
 *
 * Reading stops at the first '\n', that is not stored. A '\r' just before
 * it is replaced by the string terminator. At most size-1 characters are
 * stored, and the buffer is always null terminated: a longer line is
 * truncated and the remaining part is left unread on the descriptor.
 *
 * Returns the number of characters stored (a stripped '\r' is still
 * counted), 0 if 'size' has no room for at least the terminator, or -1 if
 * syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size < 1) return 0; /* No room, not even for the terminator */
    *ptr = '\0';
    size--; /* Reserve one byte for the terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the byte just stored. Without this the loop never
         * stopped and a long line was written past the end of the buffer. */
        size--;
    }
    return nread;
}
7426
7427 static void syncCommand(redisClient *c) {
7428 /* ignore SYNC if aleady slave or in monitor mode */
7429 if (c->flags & REDIS_SLAVE) return;
7430
7431 /* SYNC can't be issued when the server has pending data to send to
7432 * the client about already issued commands. We need a fresh reply
7433 * buffer registering the differences between the BGSAVE and the current
7434 * dataset, so that we can copy to other slaves if needed. */
7435 if (listLength(c->reply) != 0) {
7436 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7437 return;
7438 }
7439
7440 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7441 /* Here we need to check if there is a background saving operation
7442 * in progress, or if it is required to start one */
7443 if (server.bgsavechildpid != -1) {
7444 /* Ok a background save is in progress. Let's check if it is a good
7445 * one for replication, i.e. if there is another slave that is
7446 * registering differences since the server forked to save */
7447 redisClient *slave;
7448 listNode *ln;
7449 listIter li;
7450
7451 listRewind(server.slaves,&li);
7452 while((ln = listNext(&li))) {
7453 slave = ln->value;
7454 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7455 }
7456 if (ln) {
7457 /* Perfect, the server is already registering differences for
7458 * another slave. Set the right state, and copy the buffer. */
7459 listRelease(c->reply);
7460 c->reply = listDup(slave->reply);
7461 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7462 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7463 } else {
7464 /* No way, we need to wait for the next BGSAVE in order to
7465 * register differences */
7466 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7467 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7468 }
7469 } else {
7470 /* Ok we don't have a BGSAVE in progress, let's start one */
7471 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7472 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7473 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7474 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7475 return;
7476 }
7477 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7478 }
7479 c->repldbfd = -1;
7480 c->flags |= REDIS_SLAVE;
7481 c->slaveseldb = 0;
7482 listAddNodeTail(server.slaves,c);
7483 return;
7484 }
7485
7486 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7487 redisClient *slave = privdata;
7488 REDIS_NOTUSED(el);
7489 REDIS_NOTUSED(mask);
7490 char buf[REDIS_IOBUF_LEN];
7491 ssize_t nwritten, buflen;
7492
7493 if (slave->repldboff == 0) {
7494 /* Write the bulk write count before to transfer the DB. In theory here
7495 * we don't know how much room there is in the output buffer of the
7496 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7497 * operations) will never be smaller than the few bytes we need. */
7498 sds bulkcount;
7499
7500 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7501 slave->repldbsize);
7502 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7503 {
7504 sdsfree(bulkcount);
7505 freeClient(slave);
7506 return;
7507 }
7508 sdsfree(bulkcount);
7509 }
7510 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7511 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7512 if (buflen <= 0) {
7513 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7514 (buflen == 0) ? "premature EOF" : strerror(errno));
7515 freeClient(slave);
7516 return;
7517 }
7518 if ((nwritten = write(fd,buf,buflen)) == -1) {
7519 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7520 strerror(errno));
7521 freeClient(slave);
7522 return;
7523 }
7524 slave->repldboff += nwritten;
7525 if (slave->repldboff == slave->repldbsize) {
7526 close(slave->repldbfd);
7527 slave->repldbfd = -1;
7528 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7529 slave->replstate = REDIS_REPL_ONLINE;
7530 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7531 sendReplyToClient, slave) == AE_ERR) {
7532 freeClient(slave);
7533 return;
7534 }
7535 addReplySds(slave,sdsempty());
7536 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7537 }
7538 }
7539
7540 /* This function is called at the end of every backgrond saving.
7541 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7542 * otherwise REDIS_ERR is passed to the function.
7543 *
7544 * The goal of this function is to handle slaves waiting for a successful
7545 * background saving in order to perform non-blocking synchronization. */
7546 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7547 listNode *ln;
7548 int startbgsave = 0;
7549 listIter li;
7550
7551 listRewind(server.slaves,&li);
7552 while((ln = listNext(&li))) {
7553 redisClient *slave = ln->value;
7554
7555 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7556 startbgsave = 1;
7557 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7558 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7559 struct redis_stat buf;
7560
7561 if (bgsaveerr != REDIS_OK) {
7562 freeClient(slave);
7563 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7564 continue;
7565 }
7566 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7567 redis_fstat(slave->repldbfd,&buf) == -1) {
7568 freeClient(slave);
7569 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7570 continue;
7571 }
7572 slave->repldboff = 0;
7573 slave->repldbsize = buf.st_size;
7574 slave->replstate = REDIS_REPL_SEND_BULK;
7575 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7576 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7577 freeClient(slave);
7578 continue;
7579 }
7580 }
7581 }
7582 if (startbgsave) {
7583 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7584 listIter li;
7585
7586 listRewind(server.slaves,&li);
7587 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7588 while((ln = listNext(&li))) {
7589 redisClient *slave = ln->value;
7590
7591 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7592 freeClient(slave);
7593 }
7594 }
7595 }
7596 }
7597
7598 static int syncWithMaster(void) {
7599 char buf[1024], tmpfile[256], authcmd[1024];
7600 long dumpsize;
7601 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7602 int dfd, maxtries = 5;
7603
7604 if (fd == -1) {
7605 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7606 strerror(errno));
7607 return REDIS_ERR;
7608 }
7609
7610 /* AUTH with the master if required. */
7611 if(server.masterauth) {
7612 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7613 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7614 close(fd);
7615 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7616 strerror(errno));
7617 return REDIS_ERR;
7618 }
7619 /* Read the AUTH result. */
7620 if (syncReadLine(fd,buf,1024,3600) == -1) {
7621 close(fd);
7622 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7623 strerror(errno));
7624 return REDIS_ERR;
7625 }
7626 if (buf[0] != '+') {
7627 close(fd);
7628 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7629 return REDIS_ERR;
7630 }
7631 }
7632
7633 /* Issue the SYNC command */
7634 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7635 close(fd);
7636 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7637 strerror(errno));
7638 return REDIS_ERR;
7639 }
7640 /* Read the bulk write count */
7641 if (syncReadLine(fd,buf,1024,3600) == -1) {
7642 close(fd);
7643 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7644 strerror(errno));
7645 return REDIS_ERR;
7646 }
7647 if (buf[0] != '$') {
7648 close(fd);
7649 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7650 return REDIS_ERR;
7651 }
7652 dumpsize = strtol(buf+1,NULL,10);
7653 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7654 /* Read the bulk write data on a temp file */
7655 while(maxtries--) {
7656 snprintf(tmpfile,256,
7657 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7658 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7659 if (dfd != -1) break;
7660 sleep(1);
7661 }
7662 if (dfd == -1) {
7663 close(fd);
7664 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7665 return REDIS_ERR;
7666 }
7667 while(dumpsize) {
7668 int nread, nwritten;
7669
7670 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7671 if (nread == -1) {
7672 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7673 strerror(errno));
7674 close(fd);
7675 close(dfd);
7676 return REDIS_ERR;
7677 }
7678 nwritten = write(dfd,buf,nread);
7679 if (nwritten == -1) {
7680 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7681 close(fd);
7682 close(dfd);
7683 return REDIS_ERR;
7684 }
7685 dumpsize -= nread;
7686 }
7687 close(dfd);
7688 if (rename(tmpfile,server.dbfilename) == -1) {
7689 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7690 unlink(tmpfile);
7691 close(fd);
7692 return REDIS_ERR;
7693 }
7694 emptyDb();
7695 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7696 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7697 close(fd);
7698 return REDIS_ERR;
7699 }
7700 server.master = createClient(fd);
7701 server.master->flags |= REDIS_MASTER;
7702 server.master->authenticated = 1;
7703 server.replstate = REDIS_REPL_CONNECTED;
7704 return REDIS_OK;
7705 }
7706
7707 static void slaveofCommand(redisClient *c) {
7708 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7709 !strcasecmp(c->argv[2]->ptr,"one")) {
7710 if (server.masterhost) {
7711 sdsfree(server.masterhost);
7712 server.masterhost = NULL;
7713 if (server.master) freeClient(server.master);
7714 server.replstate = REDIS_REPL_NONE;
7715 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7716 }
7717 } else {
7718 sdsfree(server.masterhost);
7719 server.masterhost = sdsdup(c->argv[1]->ptr);
7720 server.masterport = atoi(c->argv[2]->ptr);
7721 if (server.master) freeClient(server.master);
7722 server.replstate = REDIS_REPL_CONNECT;
7723 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7724 server.masterhost, server.masterport);
7725 }
7726 addReply(c,shared.ok);
7727 }
7728
7729 /* ============================ Maxmemory directive ======================== */
7730
7731 /* Try to free one object form the pre-allocated objects free list.
7732 * This is useful under low mem conditions as by default we take 1 million
7733 * free objects allocated. On success REDIS_OK is returned, otherwise
7734 * REDIS_ERR. */
7735 static int tryFreeOneObjectFromFreelist(void) {
7736 robj *o;
7737
7738 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7739 if (listLength(server.objfreelist)) {
7740 listNode *head = listFirst(server.objfreelist);
7741 o = listNodeValue(head);
7742 listDelNode(server.objfreelist,head);
7743 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7744 zfree(o);
7745 return REDIS_OK;
7746 } else {
7747 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7748 return REDIS_ERR;
7749 }
7750 }
7751
7752 /* This function gets called when 'maxmemory' is set on the config file to limit
7753 * the max memory used by the server, and we are out of memory.
7754 * This function will try to, in order:
7755 *
7756 * - Free objects from the free list
7757 * - Try to remove keys with an EXPIRE set
7758 *
7759 * It is not possible to free enough memory to reach used-memory < maxmemory
7760 * the server will start refusing commands that will enlarge even more the
7761 * memory usage.
7762 */
7763 static void freeMemoryIfNeeded(void) {
7764 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7765 int j, k, freed = 0;
7766
7767 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7768 for (j = 0; j < server.dbnum; j++) {
7769 int minttl = -1;
7770 robj *minkey = NULL;
7771 struct dictEntry *de;
7772
7773 if (dictSize(server.db[j].expires)) {
7774 freed = 1;
7775 /* From a sample of three keys drop the one nearest to
7776 * the natural expire */
7777 for (k = 0; k < 3; k++) {
7778 time_t t;
7779
7780 de = dictGetRandomKey(server.db[j].expires);
7781 t = (time_t) dictGetEntryVal(de);
7782 if (minttl == -1 || t < minttl) {
7783 minkey = dictGetEntryKey(de);
7784 minttl = t;
7785 }
7786 }
7787 deleteKey(server.db+j,minkey);
7788 }
7789 }
7790 if (!freed) return; /* nothing to free... */
7791 }
7792 }
7793
7794 /* ============================== Append Only file ========================== */
7795
7796 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7797 sds buf = sdsempty();
7798 int j;
7799 ssize_t nwritten;
7800 time_t now;
7801 robj *tmpargv[3];
7802
7803 /* The DB this command was targetting is not the same as the last command
7804 * we appendend. To issue a SELECT command is needed. */
7805 if (dictid != server.appendseldb) {
7806 char seldb[64];
7807
7808 snprintf(seldb,sizeof(seldb),"%d",dictid);
7809 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7810 (unsigned long)strlen(seldb),seldb);
7811 server.appendseldb = dictid;
7812 }
7813
7814 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7815 * EXPIREs into EXPIREATs calls */
7816 if (cmd->proc == expireCommand) {
7817 long when;
7818
7819 tmpargv[0] = createStringObject("EXPIREAT",8);
7820 tmpargv[1] = argv[1];
7821 incrRefCount(argv[1]);
7822 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7823 tmpargv[2] = createObject(REDIS_STRING,
7824 sdscatprintf(sdsempty(),"%ld",when));
7825 argv = tmpargv;
7826 }
7827
7828 /* Append the actual command */
7829 buf = sdscatprintf(buf,"*%d\r\n",argc);
7830 for (j = 0; j < argc; j++) {
7831 robj *o = argv[j];
7832
7833 o = getDecodedObject(o);
7834 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7835 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7836 buf = sdscatlen(buf,"\r\n",2);
7837 decrRefCount(o);
7838 }
7839
7840 /* Free the objects from the modified argv for EXPIREAT */
7841 if (cmd->proc == expireCommand) {
7842 for (j = 0; j < 3; j++)
7843 decrRefCount(argv[j]);
7844 }
7845
7846 /* We want to perform a single write. This should be guaranteed atomic
7847 * at least if the filesystem we are writing is a real physical one.
7848 * While this will save us against the server being killed I don't think
7849 * there is much to do about the whole server stopping for power problems
7850 * or alike */
7851 nwritten = write(server.appendfd,buf,sdslen(buf));
7852 if (nwritten != (signed)sdslen(buf)) {
7853 /* Ooops, we are in troubles. The best thing to do for now is
7854 * to simply exit instead to give the illusion that everything is
7855 * working as expected. */
7856 if (nwritten == -1) {
7857 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7858 } else {
7859 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7860 }
7861 exit(1);
7862 }
7863 /* If a background append only file rewriting is in progress we want to
7864 * accumulate the differences between the child DB and the current one
7865 * in a buffer, so that when the child process will do its work we
7866 * can append the differences to the new append only file. */
7867 if (server.bgrewritechildpid != -1)
7868 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7869
7870 sdsfree(buf);
7871 now = time(NULL);
7872 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7873 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7874 now-server.lastfsync > 1))
7875 {
7876 fsync(server.appendfd); /* Let's try to get this data on the disk */
7877 server.lastfsync = now;
7878 }
7879 }
7880
7881 /* In Redis commands are always executed in the context of a client, so in
7882 * order to load the append only file we need to create a fake client. */
7883 static struct redisClient *createFakeClient(void) {
7884 struct redisClient *c = zmalloc(sizeof(*c));
7885
7886 selectDb(c,0);
7887 c->fd = -1;
7888 c->querybuf = sdsempty();
7889 c->argc = 0;
7890 c->argv = NULL;
7891 c->flags = 0;
7892 /* We set the fake client as a slave waiting for the synchronization
7893 * so that Redis will not try to send replies to this client. */
7894 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7895 c->reply = listCreate();
7896 listSetFreeMethod(c->reply,decrRefCount);
7897 listSetDupMethod(c->reply,dupClientReplyValue);
7898 return c;
7899 }
7900
7901 static void freeFakeClient(struct redisClient *c) {
7902 sdsfree(c->querybuf);
7903 listRelease(c->reply);
7904 zfree(c);
7905 }
7906
7907 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7908 * error (the append only file is zero-length) REDIS_ERR is returned. On
7909 * fatal error an error message is logged and the program exists. */
7910 int loadAppendOnlyFile(char *filename) {
7911 struct redisClient *fakeClient;
7912 FILE *fp = fopen(filename,"r");
7913 struct redis_stat sb;
7914 unsigned long long loadedkeys = 0;
7915
7916 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7917 return REDIS_ERR;
7918
7919 if (fp == NULL) {
7920 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7921 exit(1);
7922 }
7923
7924 fakeClient = createFakeClient();
7925 while(1) {
7926 int argc, j;
7927 unsigned long len;
7928 robj **argv;
7929 char buf[128];
7930 sds argsds;
7931 struct redisCommand *cmd;
7932
7933 if (fgets(buf,sizeof(buf),fp) == NULL) {
7934 if (feof(fp))
7935 break;
7936 else
7937 goto readerr;
7938 }
7939 if (buf[0] != '*') goto fmterr;
7940 argc = atoi(buf+1);
7941 argv = zmalloc(sizeof(robj*)*argc);
7942 for (j = 0; j < argc; j++) {
7943 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7944 if (buf[0] != '$') goto fmterr;
7945 len = strtol(buf+1,NULL,10);
7946 argsds = sdsnewlen(NULL,len);
7947 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7948 argv[j] = createObject(REDIS_STRING,argsds);
7949 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7950 }
7951
7952 /* Command lookup */
7953 cmd = lookupCommand(argv[0]->ptr);
7954 if (!cmd) {
7955 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7956 exit(1);
7957 }
7958 /* Try object encoding */
7959 if (cmd->flags & REDIS_CMD_BULK)
7960 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
7961 /* Run the command in the context of a fake client */
7962 fakeClient->argc = argc;
7963 fakeClient->argv = argv;
7964 cmd->proc(fakeClient);
7965 /* Discard the reply objects list from the fake client */
7966 while(listLength(fakeClient->reply))
7967 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7968 /* Clean up, ready for the next command */
7969 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7970 zfree(argv);
7971 /* Handle swapping while loading big datasets when VM is on */
7972 loadedkeys++;
7973 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7974 while (zmalloc_used_memory() > server.vm_max_memory) {
7975 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7976 }
7977 }
7978 }
7979 fclose(fp);
7980 freeFakeClient(fakeClient);
7981 return REDIS_OK;
7982
7983 readerr:
7984 if (feof(fp)) {
7985 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7986 } else {
7987 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7988 }
7989 exit(1);
7990 fmterr:
7991 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7992 exit(1);
7993 }
7994
7995 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7996 static int fwriteBulkObject(FILE *fp, robj *obj) {
7997 char buf[128];
7998 int decrrc = 0;
7999
8000 /* Avoid the incr/decr ref count business if possible to help
8001 * copy-on-write (we are often in a child process when this function
8002 * is called).
8003 * Also makes sure that key objects don't get incrRefCount-ed when VM
8004 * is enabled */
8005 if (obj->encoding != REDIS_ENCODING_RAW) {
8006 obj = getDecodedObject(obj);
8007 decrrc = 1;
8008 }
8009 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8010 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8011 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8012 goto err;
8013 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8014 if (decrrc) decrRefCount(obj);
8015 return 1;
8016 err:
8017 if (decrrc) decrRefCount(obj);
8018 return 0;
8019 }
8020
/* Write binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes that are written verbatim (they may contain
 * NUL bytes). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it is printed with the matching %lu specifier. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() of zero bytes returns 0, so the payload is skipped if empty. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8032
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The value is formatted with "%.17g". Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is rendered together with its trailing CRLF, that is
     * not part of the announced length (hence the -2). */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8043
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is rendered together with its trailing CRLF, that is
     * not part of the announced length (hence the -2). */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8054
8055 /* Write a sequence of commands able to fully rebuild the dataset into
8056 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8057 static int rewriteAppendOnlyFile(char *filename) {
8058 dictIterator *di = NULL;
8059 dictEntry *de;
8060 FILE *fp;
8061 char tmpfile[256];
8062 int j;
8063 time_t now = time(NULL);
8064
8065 /* Note that we have to use a different temp name here compared to the
8066 * one used by rewriteAppendOnlyFileBackground() function. */
8067 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8068 fp = fopen(tmpfile,"w");
8069 if (!fp) {
8070 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8071 return REDIS_ERR;
8072 }
8073 for (j = 0; j < server.dbnum; j++) {
8074 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8075 redisDb *db = server.db+j;
8076 dict *d = db->dict;
8077 if (dictSize(d) == 0) continue;
8078 di = dictGetIterator(d);
8079 if (!di) {
8080 fclose(fp);
8081 return REDIS_ERR;
8082 }
8083
8084 /* SELECT the new DB */
8085 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8086 if (fwriteBulkLong(fp,j) == 0) goto werr;
8087
8088 /* Iterate this DB writing every entry */
8089 while((de = dictNext(di)) != NULL) {
8090 robj *key, *o;
8091 time_t expiretime;
8092 int swapped;
8093
8094 key = dictGetEntryKey(de);
8095 /* If the value for this key is swapped, load a preview in memory.
8096 * We use a "swapped" flag to remember if we need to free the
8097 * value object instead to just increment the ref count anyway
8098 * in order to avoid copy-on-write of pages if we are forked() */
8099 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8100 key->storage == REDIS_VM_SWAPPING) {
8101 o = dictGetEntryVal(de);
8102 swapped = 0;
8103 } else {
8104 o = vmPreviewObject(key);
8105 swapped = 1;
8106 }
8107 expiretime = getExpire(db,key);
8108
8109 /* Save the key and associated value */
8110 if (o->type == REDIS_STRING) {
8111 /* Emit a SET command */
8112 char cmd[]="*3\r\n$3\r\nSET\r\n";
8113 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8114 /* Key and value */
8115 if (fwriteBulkObject(fp,key) == 0) goto werr;
8116 if (fwriteBulkObject(fp,o) == 0) goto werr;
8117 } else if (o->type == REDIS_LIST) {
8118 /* Emit the RPUSHes needed to rebuild the list */
8119 list *list = o->ptr;
8120 listNode *ln;
8121 listIter li;
8122
8123 listRewind(list,&li);
8124 while((ln = listNext(&li))) {
8125 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8126 robj *eleobj = listNodeValue(ln);
8127
8128 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8129 if (fwriteBulkObject(fp,key) == 0) goto werr;
8130 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8131 }
8132 } else if (o->type == REDIS_SET) {
8133 /* Emit the SADDs needed to rebuild the set */
8134 dict *set = o->ptr;
8135 dictIterator *di = dictGetIterator(set);
8136 dictEntry *de;
8137
8138 while((de = dictNext(di)) != NULL) {
8139 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8140 robj *eleobj = dictGetEntryKey(de);
8141
8142 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8143 if (fwriteBulkObject(fp,key) == 0) goto werr;
8144 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8145 }
8146 dictReleaseIterator(di);
8147 } else if (o->type == REDIS_ZSET) {
8148 /* Emit the ZADDs needed to rebuild the sorted set */
8149 zset *zs = o->ptr;
8150 dictIterator *di = dictGetIterator(zs->dict);
8151 dictEntry *de;
8152
8153 while((de = dictNext(di)) != NULL) {
8154 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8155 robj *eleobj = dictGetEntryKey(de);
8156 double *score = dictGetEntryVal(de);
8157
8158 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8159 if (fwriteBulkObject(fp,key) == 0) goto werr;
8160 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8161 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8162 }
8163 dictReleaseIterator(di);
8164 } else if (o->type == REDIS_HASH) {
8165 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8166
8167 /* Emit the HSETs needed to rebuild the hash */
8168 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8169 unsigned char *p = zipmapRewind(o->ptr);
8170 unsigned char *field, *val;
8171 unsigned int flen, vlen;
8172
8173 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8174 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8175 if (fwriteBulkObject(fp,key) == 0) goto werr;
8176 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8177 return -1;
8178 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8179 return -1;
8180 }
8181 } else {
8182 dictIterator *di = dictGetIterator(o->ptr);
8183 dictEntry *de;
8184
8185 while((de = dictNext(di)) != NULL) {
8186 robj *field = dictGetEntryKey(de);
8187 robj *val = dictGetEntryVal(de);
8188
8189 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8190 if (fwriteBulkObject(fp,key) == 0) goto werr;
8191 if (fwriteBulkObject(fp,field) == -1) return -1;
8192 if (fwriteBulkObject(fp,val) == -1) return -1;
8193 }
8194 dictReleaseIterator(di);
8195 }
8196 } else {
8197 redisAssert(0);
8198 }
8199 /* Save the expire time */
8200 if (expiretime != -1) {
8201 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8202 /* If this key is already expired skip it */
8203 if (expiretime < now) continue;
8204 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8205 if (fwriteBulkObject(fp,key) == 0) goto werr;
8206 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8207 }
8208 if (swapped) decrRefCount(o);
8209 }
8210 dictReleaseIterator(di);
8211 }
8212
8213 /* Make sure data will not remain on the OS's output buffers */
8214 fflush(fp);
8215 fsync(fileno(fp));
8216 fclose(fp);
8217
8218 /* Use RENAME to make sure the DB file is changed atomically only
8219 * if the generate DB file is ok. */
8220 if (rename(tmpfile,filename) == -1) {
8221 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8222 unlink(tmpfile);
8223 return REDIS_ERR;
8224 }
8225 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8226 return REDIS_OK;
8227
8228 werr:
8229 fclose(fp);
8230 unlink(tmpfile);
8231 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8232 if (di) dictReleaseIterator(di);
8233 return REDIS_ERR;
8234 }
8235
8236 /* This is how rewriting of the append only file in background works:
8237 *
8238 * 1) The user calls BGREWRITEAOF
8239 * 2) Redis calls this function, that forks():
8240 * 2a) the child rewrite the append only file in a temp file.
8241 * 2b) the parent accumulates differences in server.bgrewritebuf.
8242 * 3) When the child finished '2a' exists.
8243 * 4) The parent will trap the exit code, if it's OK, will append the
8244 * data accumulated into server.bgrewritebuf into the temp file, and
8245 * finally will rename(2) the temp file in the actual file name.
8246 * The the new file is reopened as the new append only file. Profit!
8247 */
8248 static int rewriteAppendOnlyFileBackground(void) {
8249 pid_t childpid;
8250
8251 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8252 if (server.vm_enabled) waitEmptyIOJobsQueue();
8253 if ((childpid = fork()) == 0) {
8254 /* Child */
8255 char tmpfile[256];
8256
8257 if (server.vm_enabled) vmReopenSwapFile();
8258 close(server.fd);
8259 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8260 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8261 _exit(0);
8262 } else {
8263 _exit(1);
8264 }
8265 } else {
8266 /* Parent */
8267 if (childpid == -1) {
8268 redisLog(REDIS_WARNING,
8269 "Can't rewrite append only file in background: fork: %s",
8270 strerror(errno));
8271 return REDIS_ERR;
8272 }
8273 redisLog(REDIS_NOTICE,
8274 "Background append only file rewriting started by pid %d",childpid);
8275 server.bgrewritechildpid = childpid;
8276 updateDictResizePolicy();
8277 /* We set appendseldb to -1 in order to force the next call to the
8278 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8279 * accumulated by the parent into server.bgrewritebuf will start
8280 * with a SELECT statement and it will be safe to merge. */
8281 server.appendseldb = -1;
8282 return REDIS_OK;
8283 }
8284 return REDIS_OK; /* unreached */
8285 }
8286
8287 static void bgrewriteaofCommand(redisClient *c) {
8288 if (server.bgrewritechildpid != -1) {
8289 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8290 return;
8291 }
8292 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8293 char *status = "+Background append only file rewriting started\r\n";
8294 addReplySds(c,sdsnew(status));
8295 } else {
8296 addReply(c,shared.err);
8297 }
8298 }
8299
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the
 * current directory. The result of unlink(2) is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8306
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8327
8328 /* =================== Virtual Memory - Blocking Side ====================== */
8329
8330 /* substitute the first occurrence of '%p' with the process pid in the
8331 * swap file name. */
8332 static void expandVmSwapFilename(void) {
8333 char *p = strstr(server.vm_swap_file,"%p");
8334 sds new;
8335
8336 if (!p) return;
8337 new = sdsempty();
8338 *p = '\0';
8339 new = sdscat(new,server.vm_swap_file);
8340 new = sdscatprintf(new,"%ld",(long) getpid());
8341 new = sdscat(new,p+2);
8342 zfree(server.vm_swap_file);
8343 server.vm_swap_file = new;
8344 }
8345
8346 static void vmInit(void) {
8347 off_t totsize;
8348 int pipefds[2];
8349 size_t stacksize;
8350
8351 if (server.vm_max_threads != 0)
8352 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8353
8354 expandVmSwapFilename();
8355 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8356 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8357 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8358 }
8359 if (server.vm_fp == NULL) {
8360 redisLog(REDIS_WARNING,
8361 "Impossible to open the swap file: %s. Exiting.",
8362 strerror(errno));
8363 exit(1);
8364 }
8365 server.vm_fd = fileno(server.vm_fp);
8366 server.vm_next_page = 0;
8367 server.vm_near_pages = 0;
8368 server.vm_stats_used_pages = 0;
8369 server.vm_stats_swapped_objects = 0;
8370 server.vm_stats_swapouts = 0;
8371 server.vm_stats_swapins = 0;
8372 totsize = server.vm_pages*server.vm_page_size;
8373 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8374 if (ftruncate(server.vm_fd,totsize) == -1) {
8375 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8376 strerror(errno));
8377 exit(1);
8378 } else {
8379 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8380 }
8381 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8382 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8383 (long long) (server.vm_pages+7)/8, server.vm_pages);
8384 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8385
8386 /* Initialize threaded I/O (used by Virtual Memory) */
8387 server.io_newjobs = listCreate();
8388 server.io_processing = listCreate();
8389 server.io_processed = listCreate();
8390 server.io_ready_clients = listCreate();
8391 pthread_mutex_init(&server.io_mutex,NULL);
8392 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8393 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8394 server.io_active_threads = 0;
8395 if (pipe(pipefds) == -1) {
8396 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8397 ,strerror(errno));
8398 exit(1);
8399 }
8400 server.io_ready_pipe_read = pipefds[0];
8401 server.io_ready_pipe_write = pipefds[1];
8402 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8403 /* LZF requires a lot of stack */
8404 pthread_attr_init(&server.io_threads_attr);
8405 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8406 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8407 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8408 /* Listen for events in the threaded I/O pipe */
8409 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8410 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8411 oom("creating file event");
8412 }
8413
8414 /* Mark the page as used */
8415 static void vmMarkPageUsed(off_t page) {
8416 off_t byte = page/8;
8417 int bit = page&7;
8418 redisAssert(vmFreePage(page) == 1);
8419 server.vm_bitmap[byte] |= 1<<bit;
8420 }
8421
8422 /* Mark N contiguous pages as used, with 'page' being the first. */
8423 static void vmMarkPagesUsed(off_t page, off_t count) {
8424 off_t j;
8425
8426 for (j = 0; j < count; j++)
8427 vmMarkPageUsed(page+j);
8428 server.vm_stats_used_pages += count;
8429 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8430 (long long)count, (long long)page);
8431 }
8432
8433 /* Mark the page as free */
8434 static void vmMarkPageFree(off_t page) {
8435 off_t byte = page/8;
8436 int bit = page&7;
8437 redisAssert(vmFreePage(page) == 0);
8438 server.vm_bitmap[byte] &= ~(1<<bit);
8439 }
8440
8441 /* Mark N contiguous pages as free, with 'page' being the first. */
8442 static void vmMarkPagesFree(off_t page, off_t count) {
8443 off_t j;
8444
8445 for (j = 0; j < count; j++)
8446 vmMarkPageFree(page+j);
8447 server.vm_stats_used_pages -= count;
8448 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8449 (long long)count, (long long)page);
8450 }
8451
8452 /* Test if the page is free */
8453 static int vmFreePage(off_t page) {
8454 off_t byte = page/8;
8455 int bit = page&7;
8456 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8457 }
8458
8459 /* Find N contiguous free pages storing the first page of the cluster in *first.
8460 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8461 * REDIS_ERR is returned.
8462 *
8463 * This function uses a simple algorithm: we try to allocate
8464 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8465 * again from the start of the swap file searching for free spaces.
8466 *
8467 * If it looks pretty clear that there are no free pages near our offset
8468 * we try to find less populated places doing a forward jump of
8469 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8470 * without hurry, and then we jump again and so forth...
8471 *
8472 * This function can be improved using a free list to avoid to guess
8473 * too much, since we could collect data about freed pages.
8474 *
8475 * note: I implemented this function just after watching an episode of
8476 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8477 */
8478 static int vmFindContiguousPages(off_t *first, off_t n) {
8479 off_t base, offset = 0, since_jump = 0, numfree = 0;
8480
8481 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8482 server.vm_near_pages = 0;
8483 server.vm_next_page = 0;
8484 }
8485 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8486 base = server.vm_next_page;
8487
8488 while(offset < server.vm_pages) {
8489 off_t this = base+offset;
8490
8491 /* If we overflow, restart from page zero */
8492 if (this >= server.vm_pages) {
8493 this -= server.vm_pages;
8494 if (this == 0) {
8495 /* Just overflowed, what we found on tail is no longer
8496 * interesting, as it's no longer contiguous. */
8497 numfree = 0;
8498 }
8499 }
8500 if (vmFreePage(this)) {
8501 /* This is a free page */
8502 numfree++;
8503 /* Already got N free pages? Return to the caller, with success */
8504 if (numfree == n) {
8505 *first = this-(n-1);
8506 server.vm_next_page = this+1;
8507 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8508 return REDIS_OK;
8509 }
8510 } else {
8511 /* The current one is not a free page */
8512 numfree = 0;
8513 }
8514
8515 /* Fast-forward if the current page is not free and we already
8516 * searched enough near this place. */
8517 since_jump++;
8518 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8519 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8520 since_jump = 0;
8521 /* Note that even if we rewind after the jump, we are don't need
8522 * to make sure numfree is set to zero as we only jump *if* it
8523 * is set to zero. */
8524 } else {
8525 /* Otherwise just check the next page */
8526 offset++;
8527 }
8528 }
8529 return REDIS_ERR;
8530 }
8531
8532 /* Write the specified object at the specified page of the swap file */
8533 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8534 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8535 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8536 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8537 redisLog(REDIS_WARNING,
8538 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8539 strerror(errno));
8540 return REDIS_ERR;
8541 }
8542 rdbSaveObject(server.vm_fp,o);
8543 fflush(server.vm_fp);
8544 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8545 return REDIS_OK;
8546 }
8547
8548 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8549 * needed to later retrieve the object into the key object.
8550 * If we can't find enough contiguous empty pages to swap the object on disk
8551 * REDIS_ERR is returned. */
8552 static int vmSwapObjectBlocking(robj *key, robj *val) {
8553 off_t pages = rdbSavedObjectPages(val,NULL);
8554 off_t page;
8555
8556 assert(key->storage == REDIS_VM_MEMORY);
8557 assert(key->refcount == 1);
8558 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8559 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8560 key->vm.page = page;
8561 key->vm.usedpages = pages;
8562 key->storage = REDIS_VM_SWAPPED;
8563 key->vtype = val->type;
8564 decrRefCount(val); /* Deallocate the object from memory. */
8565 vmMarkPagesUsed(page,pages);
8566 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8567 (unsigned char*) key->ptr,
8568 (unsigned long long) page, (unsigned long long) pages);
8569 server.vm_stats_swapped_objects++;
8570 server.vm_stats_swapouts++;
8571 return REDIS_OK;
8572 }
8573
8574 static robj *vmReadObjectFromSwap(off_t page, int type) {
8575 robj *o;
8576
8577 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8578 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8579 redisLog(REDIS_WARNING,
8580 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8581 strerror(errno));
8582 _exit(1);
8583 }
8584 o = rdbLoadObject(type,server.vm_fp);
8585 if (o == NULL) {
8586 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8587 _exit(1);
8588 }
8589 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8590 return o;
8591 }
8592
8593 /* Load the value object relative to the 'key' object from swap to memory.
8594 * The newly allocated object is returned.
8595 *
8596 * If preview is true the unserialized object is returned to the caller but
8597 * no changes are made to the key object, nor the pages are marked as freed */
8598 static robj *vmGenericLoadObject(robj *key, int preview) {
8599 robj *val;
8600
8601 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8602 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8603 if (!preview) {
8604 key->storage = REDIS_VM_MEMORY;
8605 key->vm.atime = server.unixtime;
8606 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8607 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8608 (unsigned char*) key->ptr);
8609 server.vm_stats_swapped_objects--;
8610 } else {
8611 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8612 (unsigned char*) key->ptr);
8613 }
8614 server.vm_stats_swapins++;
8615 return val;
8616 }
8617
8618 /* Plain object loading, from swap to memory */
8619 static robj *vmLoadObject(robj *key) {
8620 /* If we are loading the object in background, stop it, we
8621 * need to load this object synchronously ASAP. */
8622 if (key->storage == REDIS_VM_LOADING)
8623 vmCancelThreadedIOJob(key);
8624 return vmGenericLoadObject(key,0);
8625 }
8626
8627 /* Just load the value on disk, without to modify the key.
8628 * This is useful when we want to perform some operation on the value
8629 * without to really bring it from swap to memory, like while saving the
8630 * dataset or rewriting the append only log. */
8631 static robj *vmPreviewObject(robj *key) {
8632 return vmGenericLoadObject(key,1);
8633 }
8634
8635 /* How a good candidate is this object for swapping?
8636 * The better candidate it is, the greater the returned value.
8637 *
8638 * Currently we try to perform a fast estimation of the object size in
8639 * memory, and combine it with aging informations.
8640 *
8641 * Basically swappability = idle-time * log(estimated size)
8642 *
8643 * Bigger objects are preferred over smaller objects, but not
8644 * proportionally, this is why we use the logarithm. This algorithm is
8645 * just a first try and will probably be tuned later. */
8646 static double computeObjectSwappability(robj *o) {
8647 time_t age = server.unixtime - o->vm.atime;
8648 long asize = 0;
8649 list *l;
8650 dict *d;
8651 struct dictEntry *de;
8652 int z;
8653
8654 if (age <= 0) return 0;
8655 switch(o->type) {
8656 case REDIS_STRING:
8657 if (o->encoding != REDIS_ENCODING_RAW) {
8658 asize = sizeof(*o);
8659 } else {
8660 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8661 }
8662 break;
8663 case REDIS_LIST:
8664 l = o->ptr;
8665 listNode *ln = listFirst(l);
8666
8667 asize = sizeof(list);
8668 if (ln) {
8669 robj *ele = ln->value;
8670 long elesize;
8671
8672 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8673 (sizeof(*o)+sdslen(ele->ptr)) :
8674 sizeof(*o);
8675 asize += (sizeof(listNode)+elesize)*listLength(l);
8676 }
8677 break;
8678 case REDIS_SET:
8679 case REDIS_ZSET:
8680 z = (o->type == REDIS_ZSET);
8681 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8682
8683 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8684 if (z) asize += sizeof(zset)-sizeof(dict);
8685 if (dictSize(d)) {
8686 long elesize;
8687 robj *ele;
8688
8689 de = dictGetRandomKey(d);
8690 ele = dictGetEntryKey(de);
8691 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8692 (sizeof(*o)+sdslen(ele->ptr)) :
8693 sizeof(*o);
8694 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8695 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8696 }
8697 break;
8698 case REDIS_HASH:
8699 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8700 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8701 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8702 unsigned int klen, vlen;
8703 unsigned char *key, *val;
8704
8705 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8706 klen = 0;
8707 vlen = 0;
8708 }
8709 asize = len*(klen+vlen+3);
8710 } else if (o->encoding == REDIS_ENCODING_HT) {
8711 d = o->ptr;
8712 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8713 if (dictSize(d)) {
8714 long elesize;
8715 robj *ele;
8716
8717 de = dictGetRandomKey(d);
8718 ele = dictGetEntryKey(de);
8719 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8720 (sizeof(*o)+sdslen(ele->ptr)) :
8721 sizeof(*o);
8722 ele = dictGetEntryVal(de);
8723 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8724 (sizeof(*o)+sdslen(ele->ptr)) :
8725 sizeof(*o);
8726 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8727 }
8728 }
8729 break;
8730 }
8731 return (double)age*log(1+asize);
8732 }
8733
8734 /* Try to swap an object that's a good candidate for swapping.
8735 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8736 * to swap any object at all.
8737 *
8738 * If 'usethreaded' is true, Redis will try to swap the object in background
8739 * using I/O threads. */
8740 static int vmSwapOneObject(int usethreads) {
8741 int j, i;
8742 struct dictEntry *best = NULL;
8743 double best_swappability = 0;
8744 redisDb *best_db = NULL;
8745 robj *key, *val;
8746
8747 for (j = 0; j < server.dbnum; j++) {
8748 redisDb *db = server.db+j;
8749 /* Why maxtries is set to 100?
8750 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8751 * are swappable objects */
8752 int maxtries = 100;
8753
8754 if (dictSize(db->dict) == 0) continue;
8755 for (i = 0; i < 5; i++) {
8756 dictEntry *de;
8757 double swappability;
8758
8759 if (maxtries) maxtries--;
8760 de = dictGetRandomKey(db->dict);
8761 key = dictGetEntryKey(de);
8762 val = dictGetEntryVal(de);
8763 /* Only swap objects that are currently in memory.
8764 *
8765 * Also don't swap shared objects if threaded VM is on, as we
8766 * try to ensure that the main thread does not touch the
8767 * object while the I/O thread is using it, but we can't
8768 * control other keys without adding additional mutex. */
8769 if (key->storage != REDIS_VM_MEMORY ||
8770 (server.vm_max_threads != 0 && val->refcount != 1)) {
8771 if (maxtries) i--; /* don't count this try */
8772 continue;
8773 }
8774 swappability = computeObjectSwappability(val);
8775 if (!best || swappability > best_swappability) {
8776 best = de;
8777 best_swappability = swappability;
8778 best_db = db;
8779 }
8780 }
8781 }
8782 if (best == NULL) return REDIS_ERR;
8783 key = dictGetEntryKey(best);
8784 val = dictGetEntryVal(best);
8785
8786 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8787 key->ptr, best_swappability);
8788
8789 /* Unshare the key if needed */
8790 if (key->refcount > 1) {
8791 robj *newkey = dupStringObject(key);
8792 decrRefCount(key);
8793 key = dictGetEntryKey(best) = newkey;
8794 }
8795 /* Swap it */
8796 if (usethreads) {
8797 vmSwapObjectThreaded(key,val,best_db);
8798 return REDIS_OK;
8799 } else {
8800 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8801 dictGetEntryVal(best) = NULL;
8802 return REDIS_OK;
8803 } else {
8804 return REDIS_ERR;
8805 }
8806 }
8807 }
8808
/* Swap out one object synchronously.
 * Returns what vmSwapOneObject() returns in non threaded mode. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8812
/* Swap out one object using the I/O threads.
 * Returns what vmSwapOneObject() returns in threaded mode. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8816
8817 /* Return true if it's safe to swap out objects in a given moment.
8818 * Basically we don't want to swap objects out while there is a BGSAVE
8819 * or a BGAEOREWRITE running in backgroud. */
8820 static int vmCanSwapOut(void) {
8821 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8822 }
8823
8824 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8825 * and was deleted. Otherwise 0 is returned. */
8826 static int deleteIfSwapped(redisDb *db, robj *key) {
8827 dictEntry *de;
8828 robj *foundkey;
8829
8830 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8831 foundkey = dictGetEntryKey(de);
8832 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8833 deleteKey(db,key);
8834 return 1;
8835 }
8836
8837 /* =================== Virtual Memory - Threaded I/O ======================= */
8838
8839 static void freeIOJob(iojob *j) {
8840 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8841 j->type == REDIS_IOJOB_DO_SWAP ||
8842 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8843 decrRefCount(j->val);
8844 /* We don't decrRefCount the j->key field as we did't incremented
8845 * the count creating IO Jobs. This is because the key field here is
8846 * just used as an indentifier and if a key is removed the Job should
8847 * never be touched again. */
8848 zfree(j);
8849 }
8850
8851 /* Every time a thread finished a Job, it writes a byte into the write side
8852 * of an unix pipe in order to "awake" the main thread, and this function
8853 * is called. */
8854 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8855 int mask)
8856 {
8857 char buf[1];
8858 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8859 REDIS_NOTUSED(el);
8860 REDIS_NOTUSED(mask);
8861 REDIS_NOTUSED(privdata);
8862
8863 /* For every byte we read in the read side of the pipe, there is one
8864 * I/O job completed to process. */
8865 while((retval = read(fd,buf,1)) == 1) {
8866 iojob *j;
8867 listNode *ln;
8868 robj *key;
8869 struct dictEntry *de;
8870
8871 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8872
8873 /* Get the processed element (the oldest one) */
8874 lockThreadedIO();
8875 assert(listLength(server.io_processed) != 0);
8876 if (toprocess == -1) {
8877 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8878 if (toprocess <= 0) toprocess = 1;
8879 }
8880 ln = listFirst(server.io_processed);
8881 j = ln->value;
8882 listDelNode(server.io_processed,ln);
8883 unlockThreadedIO();
8884 /* If this job is marked as canceled, just ignore it */
8885 if (j->canceled) {
8886 freeIOJob(j);
8887 continue;
8888 }
8889 /* Post process it in the main thread, as there are things we
8890 * can do just here to avoid race conditions and/or invasive locks */
8891 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8892 de = dictFind(j->db->dict,j->key);
8893 assert(de != NULL);
8894 key = dictGetEntryKey(de);
8895 if (j->type == REDIS_IOJOB_LOAD) {
8896 redisDb *db;
8897
8898 /* Key loaded, bring it at home */
8899 key->storage = REDIS_VM_MEMORY;
8900 key->vm.atime = server.unixtime;
8901 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8902 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8903 (unsigned char*) key->ptr);
8904 server.vm_stats_swapped_objects--;
8905 server.vm_stats_swapins++;
8906 dictGetEntryVal(de) = j->val;
8907 incrRefCount(j->val);
8908 db = j->db;
8909 freeIOJob(j);
8910 /* Handle clients waiting for this key to be loaded. */
8911 handleClientsBlockedOnSwappedKey(db,key);
8912 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8913 /* Now we know the amount of pages required to swap this object.
8914 * Let's find some space for it, and queue this task again
8915 * rebranded as REDIS_IOJOB_DO_SWAP. */
8916 if (!vmCanSwapOut() ||
8917 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8918 {
8919 /* Ooops... no space or we can't swap as there is
8920 * a fork()ed Redis trying to save stuff on disk. */
8921 freeIOJob(j);
8922 key->storage = REDIS_VM_MEMORY; /* undo operation */
8923 } else {
8924 /* Note that we need to mark this pages as used now,
8925 * if the job will be canceled, we'll mark them as freed
8926 * again. */
8927 vmMarkPagesUsed(j->page,j->pages);
8928 j->type = REDIS_IOJOB_DO_SWAP;
8929 lockThreadedIO();
8930 queueIOJob(j);
8931 unlockThreadedIO();
8932 }
8933 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8934 robj *val;
8935
8936 /* Key swapped. We can finally free some memory. */
8937 if (key->storage != REDIS_VM_SWAPPING) {
8938 printf("key->storage: %d\n",key->storage);
8939 printf("key->name: %s\n",(char*)key->ptr);
8940 printf("key->refcount: %d\n",key->refcount);
8941 printf("val: %p\n",(void*)j->val);
8942 printf("val->type: %d\n",j->val->type);
8943 printf("val->ptr: %s\n",(char*)j->val->ptr);
8944 }
8945 redisAssert(key->storage == REDIS_VM_SWAPPING);
8946 val = dictGetEntryVal(de);
8947 key->vm.page = j->page;
8948 key->vm.usedpages = j->pages;
8949 key->storage = REDIS_VM_SWAPPED;
8950 key->vtype = j->val->type;
8951 decrRefCount(val); /* Deallocate the object from memory. */
8952 dictGetEntryVal(de) = NULL;
8953 redisLog(REDIS_DEBUG,
8954 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8955 (unsigned char*) key->ptr,
8956 (unsigned long long) j->page, (unsigned long long) j->pages);
8957 server.vm_stats_swapped_objects++;
8958 server.vm_stats_swapouts++;
8959 freeIOJob(j);
8960 /* Put a few more swap requests in queue if we are still
8961 * out of memory */
8962 if (trytoswap && vmCanSwapOut() &&
8963 zmalloc_used_memory() > server.vm_max_memory)
8964 {
8965 int more = 1;
8966 while(more) {
8967 lockThreadedIO();
8968 more = listLength(server.io_newjobs) <
8969 (unsigned) server.vm_max_threads;
8970 unlockThreadedIO();
8971 /* Don't waste CPU time if swappable objects are rare. */
8972 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8973 trytoswap = 0;
8974 break;
8975 }
8976 }
8977 }
8978 }
8979 processed++;
8980 if (processed == toprocess) return;
8981 }
8982 if (retval < 0 && errno != EAGAIN) {
8983 redisLog(REDIS_WARNING,
8984 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8985 strerror(errno));
8986 }
8987 }
8988
8989 static void lockThreadedIO(void) {
8990 pthread_mutex_lock(&server.io_mutex);
8991 }
8992
8993 static void unlockThreadedIO(void) {
8994 pthread_mutex_unlock(&server.io_mutex);
8995 }
8996
8997 /* Remove the specified object from the threaded I/O queue if still not
8998 * processed, otherwise make sure to flag it as canceled. */
8999 static void vmCancelThreadedIOJob(robj *o) {
9000 list *lists[3] = {
9001 server.io_newjobs, /* 0 */
9002 server.io_processing, /* 1 */
9003 server.io_processed /* 2 */
9004 };
9005 int i;
9006
9007 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9008 again:
9009 lockThreadedIO();
9010 /* Search for a matching key in one of the queues */
9011 for (i = 0; i < 3; i++) {
9012 listNode *ln;
9013 listIter li;
9014
9015 listRewind(lists[i],&li);
9016 while ((ln = listNext(&li)) != NULL) {
9017 iojob *job = ln->value;
9018
9019 if (job->canceled) continue; /* Skip this, already canceled. */
9020 if (job->key == o) {
9021 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9022 (void*)job, (char*)o->ptr, job->type, i);
9023 /* Mark the pages as free since the swap didn't happened
9024 * or happened but is now discarded. */
9025 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9026 vmMarkPagesFree(job->page,job->pages);
9027 /* Cancel the job. It depends on the list the job is
9028 * living in. */
9029 switch(i) {
9030 case 0: /* io_newjobs */
9031 /* If the job was yet not processed the best thing to do
9032 * is to remove it from the queue at all */
9033 freeIOJob(job);
9034 listDelNode(lists[i],ln);
9035 break;
9036 case 1: /* io_processing */
9037 /* Oh Shi- the thread is messing with the Job:
9038 *
9039 * Probably it's accessing the object if this is a
9040 * PREPARE_SWAP or DO_SWAP job.
9041 * If it's a LOAD job it may be reading from disk and
9042 * if we don't wait for the job to terminate before to
9043 * cancel it, maybe in a few microseconds data can be
9044 * corrupted in this pages. So the short story is:
9045 *
9046 * Better to wait for the job to move into the
9047 * next queue (processed)... */
9048
9049 /* We try again and again until the job is completed. */
9050 unlockThreadedIO();
9051 /* But let's wait some time for the I/O thread
9052 * to finish with this job. After all this condition
9053 * should be very rare. */
9054 usleep(1);
9055 goto again;
9056 case 2: /* io_processed */
9057 /* The job was already processed, that's easy...
9058 * just mark it as canceled so that we'll ignore it
9059 * when processing completed jobs. */
9060 job->canceled = 1;
9061 break;
9062 }
9063 /* Finally we have to adjust the storage type of the object
9064 * in order to "UNDO" the operaiton. */
9065 if (o->storage == REDIS_VM_LOADING)
9066 o->storage = REDIS_VM_SWAPPED;
9067 else if (o->storage == REDIS_VM_SWAPPING)
9068 o->storage = REDIS_VM_MEMORY;
9069 unlockThreadedIO();
9070 return;
9071 }
9072 }
9073 }
9074 unlockThreadedIO();
9075 assert(1 != 1); /* We should never reach this */
9076 }
9077
9078 static void *IOThreadEntryPoint(void *arg) {
9079 iojob *j;
9080 listNode *ln;
9081 REDIS_NOTUSED(arg);
9082
9083 pthread_detach(pthread_self());
9084 while(1) {
9085 /* Get a new job to process */
9086 lockThreadedIO();
9087 if (listLength(server.io_newjobs) == 0) {
9088 /* No new jobs in queue, exit. */
9089 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9090 (long) pthread_self());
9091 server.io_active_threads--;
9092 unlockThreadedIO();
9093 return NULL;
9094 }
9095 ln = listFirst(server.io_newjobs);
9096 j = ln->value;
9097 listDelNode(server.io_newjobs,ln);
9098 /* Add the job in the processing queue */
9099 j->thread = pthread_self();
9100 listAddNodeTail(server.io_processing,j);
9101 ln = listLast(server.io_processing); /* We use ln later to remove it */
9102 unlockThreadedIO();
9103 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9104 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9105
9106 /* Process the Job */
9107 if (j->type == REDIS_IOJOB_LOAD) {
9108 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9109 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9110 FILE *fp = fopen("/dev/null","w+");
9111 j->pages = rdbSavedObjectPages(j->val,fp);
9112 fclose(fp);
9113 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9114 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9115 j->canceled = 1;
9116 }
9117
9118 /* Done: insert the job into the processed queue */
9119 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9120 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9121 lockThreadedIO();
9122 listDelNode(server.io_processing,ln);
9123 listAddNodeTail(server.io_processed,j);
9124 unlockThreadedIO();
9125
9126 /* Signal the main thread there is new stuff to process */
9127 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9128 }
9129 return NULL; /* never reached */
9130 }
9131
9132 static void spawnIOThread(void) {
9133 pthread_t thread;
9134 sigset_t mask, omask;
9135 int err;
9136
9137 sigemptyset(&mask);
9138 sigaddset(&mask,SIGCHLD);
9139 sigaddset(&mask,SIGHUP);
9140 sigaddset(&mask,SIGPIPE);
9141 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9142 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9143 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9144 strerror(err));
9145 usleep(1000000);
9146 }
9147 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9148 server.io_active_threads++;
9149 }
9150
9151 /* We need to wait for the last thread to exit before we are able to
9152 * fork() in order to BGSAVE or BGREWRITEAOF. */
9153 static void waitEmptyIOJobsQueue(void) {
9154 while(1) {
9155 int io_processed_len;
9156
9157 lockThreadedIO();
9158 if (listLength(server.io_newjobs) == 0 &&
9159 listLength(server.io_processing) == 0 &&
9160 server.io_active_threads == 0)
9161 {
9162 unlockThreadedIO();
9163 return;
9164 }
9165 /* While waiting for empty jobs queue condition we post-process some
9166 * finshed job, as I/O threads may be hanging trying to write against
9167 * the io_ready_pipe_write FD but there are so much pending jobs that
9168 * it's blocking. */
9169 io_processed_len = listLength(server.io_processed);
9170 unlockThreadedIO();
9171 if (io_processed_len) {
9172 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9173 usleep(1000); /* 1 millisecond */
9174 } else {
9175 usleep(10000); /* 10 milliseconds */
9176 }
9177 }
9178 }
9179
9180 static void vmReopenSwapFile(void) {
9181 /* Note: we don't close the old one as we are in the child process
9182 * and don't want to mess at all with the original file object. */
9183 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9184 if (server.vm_fp == NULL) {
9185 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9186 server.vm_swap_file);
9187 _exit(1);
9188 }
9189 server.vm_fd = fileno(server.vm_fp);
9190 }
9191
9192 /* This function must be called while with threaded IO locked */
9193 static void queueIOJob(iojob *j) {
9194 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9195 (void*)j, j->type, (char*)j->key->ptr);
9196 listAddNodeTail(server.io_newjobs,j);
9197 if (server.io_active_threads < server.vm_max_threads)
9198 spawnIOThread();
9199 }
9200
9201 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9202 iojob *j;
9203
9204 assert(key->storage == REDIS_VM_MEMORY);
9205 assert(key->refcount == 1);
9206
9207 j = zmalloc(sizeof(*j));
9208 j->type = REDIS_IOJOB_PREPARE_SWAP;
9209 j->db = db;
9210 j->key = key;
9211 j->val = val;
9212 incrRefCount(val);
9213 j->canceled = 0;
9214 j->thread = (pthread_t) -1;
9215 key->storage = REDIS_VM_SWAPPING;
9216
9217 lockThreadedIO();
9218 queueIOJob(j);
9219 unlockThreadedIO();
9220 return REDIS_OK;
9221 }
9222
9223 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9224
9225 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9226 * If there is not already a job loading the key, it is craeted.
9227 * The key is added to the io_keys list in the client structure, and also
9228 * in the hash table mapping swapped keys to waiting clients, that is,
9229 * server.io_waited_keys. */
9230 static int waitForSwappedKey(redisClient *c, robj *key) {
9231 struct dictEntry *de;
9232 robj *o;
9233 list *l;
9234
9235 /* If the key does not exist or is already in RAM we don't need to
9236 * block the client at all. */
9237 de = dictFind(c->db->dict,key);
9238 if (de == NULL) return 0;
9239 o = dictGetEntryKey(de);
9240 if (o->storage == REDIS_VM_MEMORY) {
9241 return 0;
9242 } else if (o->storage == REDIS_VM_SWAPPING) {
9243 /* We were swapping the key, undo it! */
9244 vmCancelThreadedIOJob(o);
9245 return 0;
9246 }
9247
9248 /* OK: the key is either swapped, or being loaded just now. */
9249
9250 /* Add the key to the list of keys this client is waiting for.
9251 * This maps clients to keys they are waiting for. */
9252 listAddNodeTail(c->io_keys,key);
9253 incrRefCount(key);
9254
9255 /* Add the client to the swapped keys => clients waiting map. */
9256 de = dictFind(c->db->io_keys,key);
9257 if (de == NULL) {
9258 int retval;
9259
9260 /* For every key we take a list of clients blocked for it */
9261 l = listCreate();
9262 retval = dictAdd(c->db->io_keys,key,l);
9263 incrRefCount(key);
9264 assert(retval == DICT_OK);
9265 } else {
9266 l = dictGetEntryVal(de);
9267 }
9268 listAddNodeTail(l,c);
9269
9270 /* Are we already loading the key from disk? If not create a job */
9271 if (o->storage == REDIS_VM_SWAPPED) {
9272 iojob *j;
9273
9274 o->storage = REDIS_VM_LOADING;
9275 j = zmalloc(sizeof(*j));
9276 j->type = REDIS_IOJOB_LOAD;
9277 j->db = c->db;
9278 j->key = o;
9279 j->key->vtype = o->vtype;
9280 j->page = o->vm.page;
9281 j->val = NULL;
9282 j->canceled = 0;
9283 j->thread = (pthread_t) -1;
9284 lockThreadedIO();
9285 queueIOJob(j);
9286 unlockThreadedIO();
9287 }
9288 return 1;
9289 }
9290
9291 /* Preload keys needed for the ZUNION and ZINTER commands. */
9292 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9293 int i, num;
9294 num = atoi(c->argv[2]->ptr);
9295 for (i = 0; i < num; i++) {
9296 waitForSwappedKey(c,c->argv[3+i]);
9297 }
9298 }
9299
9300 /* Is this client attempting to run a command against swapped keys?
9301 * If so, block it ASAP, load the keys in background, then resume it.
9302 *
9303 * The important idea about this function is that it can fail! If keys will
9304 * still be swapped when the client is resumed, this key lookups will
9305 * just block loading keys from disk. In practical terms this should only
9306 * happen with SORT BY command or if there is a bug in this function.
9307 *
9308 * Return 1 if the client is marked as blocked, 0 if the client can
9309 * continue as the keys it is going to access appear to be in memory. */
9310 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9311 int j, last;
9312
9313 if (cmd->vm_preload_proc != NULL) {
9314 cmd->vm_preload_proc(c);
9315 } else {
9316 if (cmd->vm_firstkey == 0) return 0;
9317 last = cmd->vm_lastkey;
9318 if (last < 0) last = c->argc+last;
9319 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9320 waitForSwappedKey(c,c->argv[j]);
9321 }
9322
9323 /* If the client was blocked for at least one key, mark it as blocked. */
9324 if (listLength(c->io_keys)) {
9325 c->flags |= REDIS_IO_WAIT;
9326 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9327 server.vm_blocked_clients++;
9328 return 1;
9329 } else {
9330 return 0;
9331 }
9332 }
9333
9334 /* Remove the 'key' from the list of blocked keys for a given client.
9335 *
9336 * The function returns 1 when there are no longer blocking keys after
9337 * the current one was removed (and the client can be unblocked). */
9338 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9339 list *l;
9340 listNode *ln;
9341 listIter li;
9342 struct dictEntry *de;
9343
9344 /* Remove the key from the list of keys this client is waiting for. */
9345 listRewind(c->io_keys,&li);
9346 while ((ln = listNext(&li)) != NULL) {
9347 if (compareStringObjects(ln->value,key) == 0) {
9348 listDelNode(c->io_keys,ln);
9349 break;
9350 }
9351 }
9352 assert(ln != NULL);
9353
9354 /* Remove the client form the key => waiting clients map. */
9355 de = dictFind(c->db->io_keys,key);
9356 assert(de != NULL);
9357 l = dictGetEntryVal(de);
9358 ln = listSearchKey(l,c);
9359 assert(ln != NULL);
9360 listDelNode(l,ln);
9361 if (listLength(l) == 0)
9362 dictDelete(c->db->io_keys,key);
9363
9364 return listLength(c->io_keys) == 0;
9365 }
9366
9367 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9368 struct dictEntry *de;
9369 list *l;
9370 listNode *ln;
9371 int len;
9372
9373 de = dictFind(db->io_keys,key);
9374 if (!de) return;
9375
9376 l = dictGetEntryVal(de);
9377 len = listLength(l);
9378 /* Note: we can't use something like while(listLength(l)) as the list
9379 * can be freed by the calling function when we remove the last element. */
9380 while (len--) {
9381 ln = listFirst(l);
9382 redisClient *c = ln->value;
9383
9384 if (dontWaitForSwappedKey(c,key)) {
9385 /* Put the client in the list of clients ready to go as we
9386 * loaded all the keys about it. */
9387 listAddNodeTail(server.io_ready_clients,c);
9388 }
9389 }
9390 }
9391
9392 /* =========================== Remote Configuration ========================= */
9393
9394 static void configSetCommand(redisClient *c) {
9395 robj *o = getDecodedObject(c->argv[3]);
9396 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9397 zfree(server.dbfilename);
9398 server.dbfilename = zstrdup(o->ptr);
9399 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9400 zfree(server.requirepass);
9401 server.requirepass = zstrdup(o->ptr);
9402 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9403 zfree(server.masterauth);
9404 server.masterauth = zstrdup(o->ptr);
9405 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9406 server.maxmemory = strtoll(o->ptr, NULL, 10);
9407 } else {
9408 addReplySds(c,sdscatprintf(sdsempty(),
9409 "-ERR not supported CONFIG parameter %s\r\n",
9410 (char*)c->argv[2]->ptr));
9411 decrRefCount(o);
9412 return;
9413 }
9414 decrRefCount(o);
9415 addReply(c,shared.ok);
9416 }
9417
9418 static void configGetCommand(redisClient *c) {
9419 robj *o = getDecodedObject(c->argv[2]);
9420 robj *lenobj = createObject(REDIS_STRING,NULL);
9421 char *pattern = o->ptr;
9422 int matches = 0;
9423
9424 addReply(c,lenobj);
9425 decrRefCount(lenobj);
9426
9427 if (stringmatch(pattern,"dbfilename",0)) {
9428 addReplyBulkCString(c,"dbfilename");
9429 addReplyBulkCString(c,server.dbfilename);
9430 matches++;
9431 }
9432 if (stringmatch(pattern,"requirepass",0)) {
9433 addReplyBulkCString(c,"requirepass");
9434 addReplyBulkCString(c,server.requirepass);
9435 matches++;
9436 }
9437 if (stringmatch(pattern,"masterauth",0)) {
9438 addReplyBulkCString(c,"masterauth");
9439 addReplyBulkCString(c,server.masterauth);
9440 matches++;
9441 }
9442 if (stringmatch(pattern,"maxmemory",0)) {
9443 char buf[128];
9444
9445 snprintf(buf,128,"%llu\n",server.maxmemory);
9446 addReplyBulkCString(c,"maxmemory");
9447 addReplyBulkCString(c,buf);
9448 matches++;
9449 }
9450 decrRefCount(o);
9451 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9452 }
9453
9454 static void configCommand(redisClient *c) {
9455 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9456 if (c->argc != 4) goto badarity;
9457 configSetCommand(c);
9458 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9459 if (c->argc != 3) goto badarity;
9460 configGetCommand(c);
9461 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9462 if (c->argc != 2) goto badarity;
9463 server.stat_numcommands = 0;
9464 server.stat_numconnections = 0;
9465 server.stat_expiredkeys = 0;
9466 server.stat_starttime = time(NULL);
9467 addReply(c,shared.ok);
9468 } else {
9469 addReplySds(c,sdscatprintf(sdsempty(),
9470 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9471 }
9472 return;
9473
9474 badarity:
9475 addReplySds(c,sdscatprintf(sdsempty(),
9476 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9477 (char*) c->argv[1]->ptr));
9478 }
9479
9480 /* =========================== Pubsub implementation ======================== */
9481
9482 static void freePubsubPattern(void *p) {
9483 pubsubPattern *pat = p;
9484
9485 decrRefCount(pat->pattern);
9486 zfree(pat);
9487 }
9488
9489 static int listMatchPubsubPattern(void *a, void *b) {
9490 pubsubPattern *pa = a, *pb = b;
9491
9492 return (pa->client == pb->client) &&
9493 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9494 }
9495
9496 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9497 * 0 if the client was already subscribed to that channel. */
9498 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9499 struct dictEntry *de;
9500 list *clients = NULL;
9501 int retval = 0;
9502
9503 /* Add the channel to the client -> channels hash table */
9504 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9505 retval = 1;
9506 incrRefCount(channel);
9507 /* Add the client to the channel -> list of clients hash table */
9508 de = dictFind(server.pubsub_channels,channel);
9509 if (de == NULL) {
9510 clients = listCreate();
9511 dictAdd(server.pubsub_channels,channel,clients);
9512 incrRefCount(channel);
9513 } else {
9514 clients = dictGetEntryVal(de);
9515 }
9516 listAddNodeTail(clients,c);
9517 }
9518 /* Notify the client */
9519 addReply(c,shared.mbulk3);
9520 addReply(c,shared.subscribebulk);
9521 addReplyBulk(c,channel);
9522 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9523 return retval;
9524 }
9525
9526 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9527 * 0 if the client was not subscribed to the specified channel. */
9528 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9529 struct dictEntry *de;
9530 list *clients;
9531 listNode *ln;
9532 int retval = 0;
9533
9534 /* Remove the channel from the client -> channels hash table */
9535 incrRefCount(channel); /* channel may be just a pointer to the same object
9536 we have in the hash tables. Protect it... */
9537 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9538 retval = 1;
9539 /* Remove the client from the channel -> clients list hash table */
9540 de = dictFind(server.pubsub_channels,channel);
9541 assert(de != NULL);
9542 clients = dictGetEntryVal(de);
9543 ln = listSearchKey(clients,c);
9544 assert(ln != NULL);
9545 listDelNode(clients,ln);
9546 if (listLength(clients) == 0) {
9547 /* Free the list and associated hash entry at all if this was
9548 * the latest client, so that it will be possible to abuse
9549 * Redis PUBSUB creating millions of channels. */
9550 dictDelete(server.pubsub_channels,channel);
9551 }
9552 }
9553 /* Notify the client */
9554 if (notify) {
9555 addReply(c,shared.mbulk3);
9556 addReply(c,shared.unsubscribebulk);
9557 addReplyBulk(c,channel);
9558 addReplyLong(c,dictSize(c->pubsub_channels)+
9559 listLength(c->pubsub_patterns));
9560
9561 }
9562 decrRefCount(channel); /* it is finally safe to release it */
9563 return retval;
9564 }
9565
9566 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9567 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9568 int retval = 0;
9569
9570 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9571 retval = 1;
9572 pubsubPattern *pat;
9573 listAddNodeTail(c->pubsub_patterns,pattern);
9574 incrRefCount(pattern);
9575 pat = zmalloc(sizeof(*pat));
9576 pat->pattern = getDecodedObject(pattern);
9577 pat->client = c;
9578 listAddNodeTail(server.pubsub_patterns,pat);
9579 }
9580 /* Notify the client */
9581 addReply(c,shared.mbulk3);
9582 addReply(c,shared.psubscribebulk);
9583 addReplyBulk(c,pattern);
9584 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9585 return retval;
9586 }
9587
9588 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9589 * 0 if the client was not subscribed to the specified channel. */
9590 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9591 listNode *ln;
9592 pubsubPattern pat;
9593 int retval = 0;
9594
9595 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9596 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9597 retval = 1;
9598 listDelNode(c->pubsub_patterns,ln);
9599 pat.client = c;
9600 pat.pattern = pattern;
9601 ln = listSearchKey(server.pubsub_patterns,&pat);
9602 listDelNode(server.pubsub_patterns,ln);
9603 }
9604 /* Notify the client */
9605 if (notify) {
9606 addReply(c,shared.mbulk3);
9607 addReply(c,shared.punsubscribebulk);
9608 addReplyBulk(c,pattern);
9609 addReplyLong(c,dictSize(c->pubsub_channels)+
9610 listLength(c->pubsub_patterns));
9611 }
9612 decrRefCount(pattern);
9613 return retval;
9614 }
9615
9616 /* Unsubscribe from all the channels. Return the number of channels the
9617 * client was subscribed from. */
9618 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9619 dictIterator *di = dictGetIterator(c->pubsub_channels);
9620 dictEntry *de;
9621 int count = 0;
9622
9623 while((de = dictNext(di)) != NULL) {
9624 robj *channel = dictGetEntryKey(de);
9625
9626 count += pubsubUnsubscribeChannel(c,channel,notify);
9627 }
9628 dictReleaseIterator(di);
9629 return count;
9630 }
9631
9632 /* Unsubscribe from all the patterns. Return the number of patterns the
9633 * client was subscribed from. */
9634 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9635 listNode *ln;
9636 listIter li;
9637 int count = 0;
9638
9639 listRewind(c->pubsub_patterns,&li);
9640 while ((ln = listNext(&li)) != NULL) {
9641 robj *pattern = ln->value;
9642
9643 count += pubsubUnsubscribePattern(c,pattern,notify);
9644 }
9645 return count;
9646 }
9647
9648 /* Publish a message */
9649 static int pubsubPublishMessage(robj *channel, robj *message) {
9650 int receivers = 0;
9651 struct dictEntry *de;
9652 listNode *ln;
9653 listIter li;
9654
9655 /* Send to clients listening for that channel */
9656 de = dictFind(server.pubsub_channels,channel);
9657 if (de) {
9658 list *list = dictGetEntryVal(de);
9659 listNode *ln;
9660 listIter li;
9661
9662 listRewind(list,&li);
9663 while ((ln = listNext(&li)) != NULL) {
9664 redisClient *c = ln->value;
9665
9666 addReply(c,shared.mbulk3);
9667 addReply(c,shared.messagebulk);
9668 addReplyBulk(c,channel);
9669 addReplyBulk(c,message);
9670 receivers++;
9671 }
9672 }
9673 /* Send to clients listening to matching channels */
9674 if (listLength(server.pubsub_patterns)) {
9675 listRewind(server.pubsub_patterns,&li);
9676 channel = getDecodedObject(channel);
9677 while ((ln = listNext(&li)) != NULL) {
9678 pubsubPattern *pat = ln->value;
9679
9680 if (stringmatchlen((char*)pat->pattern->ptr,
9681 sdslen(pat->pattern->ptr),
9682 (char*)channel->ptr,
9683 sdslen(channel->ptr),0)) {
9684 addReply(pat->client,shared.mbulk3);
9685 addReply(pat->client,shared.messagebulk);
9686 addReplyBulk(pat->client,channel);
9687 addReplyBulk(pat->client,message);
9688 receivers++;
9689 }
9690 }
9691 decrRefCount(channel);
9692 }
9693 return receivers;
9694 }
9695
9696 static void subscribeCommand(redisClient *c) {
9697 int j;
9698
9699 for (j = 1; j < c->argc; j++)
9700 pubsubSubscribeChannel(c,c->argv[j]);
9701 }
9702
9703 static void unsubscribeCommand(redisClient *c) {
9704 if (c->argc == 1) {
9705 pubsubUnsubscribeAllChannels(c,1);
9706 return;
9707 } else {
9708 int j;
9709
9710 for (j = 1; j < c->argc; j++)
9711 pubsubUnsubscribeChannel(c,c->argv[j],1);
9712 }
9713 }
9714
9715 static void psubscribeCommand(redisClient *c) {
9716 int j;
9717
9718 for (j = 1; j < c->argc; j++)
9719 pubsubSubscribePattern(c,c->argv[j]);
9720 }
9721
9722 static void punsubscribeCommand(redisClient *c) {
9723 if (c->argc == 1) {
9724 pubsubUnsubscribeAllPatterns(c,1);
9725 return;
9726 } else {
9727 int j;
9728
9729 for (j = 1; j < c->argc; j++)
9730 pubsubUnsubscribePattern(c,c->argv[j],1);
9731 }
9732 }
9733
9734 static void publishCommand(redisClient *c) {
9735 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9736 addReplyLong(c,receivers);
9737 }
9738
9739 /* ================================= Debugging ============================== */
9740
9741 static void debugCommand(redisClient *c) {
9742 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9743 *((char*)-1) = 'x';
9744 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9745 if (rdbSave(server.dbfilename) != REDIS_OK) {
9746 addReply(c,shared.err);
9747 return;
9748 }
9749 emptyDb();
9750 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9751 addReply(c,shared.err);
9752 return;
9753 }
9754 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9755 addReply(c,shared.ok);
9756 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9757 emptyDb();
9758 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9759 addReply(c,shared.err);
9760 return;
9761 }
9762 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9763 addReply(c,shared.ok);
9764 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9765 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9766 robj *key, *val;
9767
9768 if (!de) {
9769 addReply(c,shared.nokeyerr);
9770 return;
9771 }
9772 key = dictGetEntryKey(de);
9773 val = dictGetEntryVal(de);
9774 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9775 key->storage == REDIS_VM_SWAPPING)) {
9776 char *strenc;
9777 char buf[128];
9778
9779 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9780 strenc = strencoding[val->encoding];
9781 } else {
9782 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9783 strenc = buf;
9784 }
9785 addReplySds(c,sdscatprintf(sdsempty(),
9786 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9787 "encoding:%s serializedlength:%lld\r\n",
9788 (void*)key, key->refcount, (void*)val, val->refcount,
9789 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9790 } else {
9791 addReplySds(c,sdscatprintf(sdsempty(),
9792 "+Key at:%p refcount:%d, value swapped at: page %llu "
9793 "using %llu pages\r\n",
9794 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9795 (unsigned long long) key->vm.usedpages));
9796 }
9797 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9798 lookupKeyRead(c->db,c->argv[2]);
9799 addReply(c,shared.ok);
9800 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9801 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9802 robj *key, *val;
9803
9804 if (!server.vm_enabled) {
9805 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9806 return;
9807 }
9808 if (!de) {
9809 addReply(c,shared.nokeyerr);
9810 return;
9811 }
9812 key = dictGetEntryKey(de);
9813 val = dictGetEntryVal(de);
9814 /* If the key is shared we want to create a copy */
9815 if (key->refcount > 1) {
9816 robj *newkey = dupStringObject(key);
9817 decrRefCount(key);
9818 key = dictGetEntryKey(de) = newkey;
9819 }
9820 /* Swap it */
9821 if (key->storage != REDIS_VM_MEMORY) {
9822 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9823 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9824 dictGetEntryVal(de) = NULL;
9825 addReply(c,shared.ok);
9826 } else {
9827 addReply(c,shared.err);
9828 }
9829 } else {
9830 addReplySds(c,sdsnew(
9831 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9832 }
9833 }
9834
9835 static void _redisAssert(char *estr, char *file, int line) {
9836 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9837 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9838 #ifdef HAVE_BACKTRACE
9839 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9840 *((char*)-1) = 'x';
9841 #endif
9842 }
9843
9844 /* =================================== Main! ================================ */
9845
9846 #ifdef __linux__
/* Read the current vm.overcommit_memory setting from
 * /proc/sys/vm/overcommit_memory.
 *
 * Returns the integer found in the file, or -1 if the file can't be opened,
 * can't be read, or does not start with a number. The last case matters:
 * a plain atoi() would return 0 for unparsable content, which callers can't
 * tell apart from a real setting of 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
9860
9861 void linuxOvercommitMemoryWarning(void) {
9862 if (linuxOvercommitMemoryValue() == 0) {
9863 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9864 }
9865 }
9866 #endif /* __linux__ */
9867
9868 static void daemonize(void) {
9869 int fd;
9870 FILE *fp;
9871
9872 if (fork() != 0) exit(0); /* parent exits */
9873 setsid(); /* create a new session */
9874
9875 /* Every output goes to /dev/null. If Redis is daemonized but
9876 * the 'logfile' is set to 'stdout' in the configuration file
9877 * it will not log at all. */
9878 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9879 dup2(fd, STDIN_FILENO);
9880 dup2(fd, STDOUT_FILENO);
9881 dup2(fd, STDERR_FILENO);
9882 if (fd > STDERR_FILENO) close(fd);
9883 }
9884 /* Try to write the pid file */
9885 fp = fopen(server.pidfile,"w");
9886 if (fp) {
9887 fprintf(fp,"%d\n",getpid());
9888 fclose(fp);
9889 }
9890 }
9891
9892 static void version() {
9893 printf("Redis server version %s\n", REDIS_VERSION);
9894 exit(0);
9895 }
9896
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. Never returns. */
static void usage() {
    static const char *synopsis[] = {
        "Usage: ./redis-server [/path/to/redis.conf]\n",
        " ./redis-server - (read config from stdin)\n"
    };
    unsigned int j;

    for (j = 0; j < sizeof(synopsis)/sizeof(synopsis[0]); j++)
        fputs(synopsis[j],stderr);
    exit(1);
}
9902
9903 int main(int argc, char **argv) {
9904 time_t start;
9905
9906 initServerConfig();
9907 if (argc == 2) {
9908 if (strcmp(argv[1], "-v") == 0 ||
9909 strcmp(argv[1], "--version") == 0) version();
9910 if (strcmp(argv[1], "--help") == 0) usage();
9911 resetServerSaveParams();
9912 loadServerConfig(argv[1]);
9913 } else if ((argc > 2)) {
9914 usage();
9915 } else {
9916 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9917 }
9918 if (server.daemonize) daemonize();
9919 initServer();
9920 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9921 #ifdef __linux__
9922 linuxOvercommitMemoryWarning();
9923 #endif
9924 start = time(NULL);
9925 if (server.appendonly) {
9926 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9927 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9928 } else {
9929 if (rdbLoad(server.dbfilename) == REDIS_OK)
9930 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9931 }
9932 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9933 aeSetBeforeSleepProc(server.el,beforeSleep);
9934 aeMain(server.el);
9935 aeDeleteEventLoop(server.el);
9936 return 0;
9937 }
9938
9939 /* ============================= Backtrace support ========================= */
9940
9941 #ifdef HAVE_BACKTRACE
9942 static char *findFuncName(void *pointer, unsigned long *offset);
9943
9944 static void *getMcontextEip(ucontext_t *uc) {
9945 #if defined(__FreeBSD__)
9946 return (void*) uc->uc_mcontext.mc_eip;
9947 #elif defined(__dietlibc__)
9948 return (void*) uc->uc_mcontext.eip;
9949 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9950 #if __x86_64__
9951 return (void*) uc->uc_mcontext->__ss.__rip;
9952 #else
9953 return (void*) uc->uc_mcontext->__ss.__eip;
9954 #endif
9955 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9956 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9957 return (void*) uc->uc_mcontext->__ss.__rip;
9958 #else
9959 return (void*) uc->uc_mcontext->__ss.__eip;
9960 #endif
9961 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9962 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9963 #elif defined(__ia64__) /* Linux IA64 */
9964 return (void*) uc->uc_mcontext.sc_ip;
9965 #else
9966 return NULL;
9967 #endif
9968 }
9969
9970 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9971 void *trace[100];
9972 char **messages = NULL;
9973 int i, trace_size = 0;
9974 unsigned long offset=0;
9975 ucontext_t *uc = (ucontext_t*) secret;
9976 sds infostring;
9977 REDIS_NOTUSED(info);
9978
9979 redisLog(REDIS_WARNING,
9980 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9981 infostring = genRedisInfoString();
9982 redisLog(REDIS_WARNING, "%s",infostring);
9983 /* It's not safe to sdsfree() the returned string under memory
9984 * corruption conditions. Let it leak as we are going to abort */
9985
9986 trace_size = backtrace(trace, 100);
9987 /* overwrite sigaction with caller's address */
9988 if (getMcontextEip(uc) != NULL) {
9989 trace[1] = getMcontextEip(uc);
9990 }
9991 messages = backtrace_symbols(trace, trace_size);
9992
9993 for (i=1; i<trace_size; ++i) {
9994 char *fn = findFuncName(trace[i], &offset), *p;
9995
9996 p = strchr(messages[i],'+');
9997 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9998 redisLog(REDIS_WARNING,"%s", messages[i]);
9999 } else {
10000 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10001 }
10002 }
10003 /* free(messages); Don't call free() with possibly corrupted memory. */
10004 _exit(0);
10005 }
10006
10007 static void setupSigSegvAction(void) {
10008 struct sigaction act;
10009
10010 sigemptyset (&act.sa_mask);
10011 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10012 * is used. Otherwise, sa_handler is used */
10013 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10014 act.sa_sigaction = segvHandler;
10015 sigaction (SIGSEGV, &act, NULL);
10016 sigaction (SIGBUS, &act, NULL);
10017 sigaction (SIGFPE, &act, NULL);
10018 sigaction (SIGILL, &act, NULL);
10019 sigaction (SIGBUS, &act, NULL);
10020 return;
10021 }
10022
10023 #include "staticsymbols.h"
10024 /* This function try to convert a pointer into a function name. It's used in
10025 * oreder to provide a backtrace under segmentation fault that's able to
10026 * display functions declared as static (otherwise the backtrace is useless). */
10027 static char *findFuncName(void *pointer, unsigned long *offset){
10028 int i, ret = -1;
10029 unsigned long off, minoff = 0;
10030
10031 /* Try to match against the Symbol with the smallest offset */
10032 for (i=0; symsTable[i].pointer; i++) {
10033 unsigned long lp = (unsigned long) pointer;
10034
10035 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10036 off=lp-symsTable[i].pointer;
10037 if (ret < 0 || off < minoff) {
10038 minoff=off;
10039 ret=i;
10040 }
10041 }
10042 }
10043 if (ret == -1) return NULL;
10044 *offset = minoff;
10045 return symsTable[ret].name;
10046 }
10047 #else /* HAVE_BACKTRACE */
/* Fallback used when backtrace support is not compiled in: no handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* intentionally empty */
}
10050 #endif /* HAVE_BACKTRACE */
10051
10052
10053
10054 /* The End */
10055
10056
10057