]> git.saurik.com Git - redis.git/blob - redis.c
bb54cce02062b714b302d2c689561227cc1a65ba
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this server build. */
#define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
/* Error codes. REDIS_ERR is parenthesized so the negative literal cannot
 * combine with a neighbouring operator at the expansion site. */
#define REDIS_OK    0
#define REDIS_ERR   (-1)
83
/* Static server configuration */
#define REDIS_SERVERPORT            6379    /* TCP port */
#define REDIS_MAXIDLETIME           (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN             1024
#define REDIS_LOADBUF_LEN           1024
#define REDIS_STATIC_ARGS           8
#define REDIS_DEFAULT_DBNUM         16
#define REDIS_CONFIGLINE_MAX        1024
#define REDIS_OBJFREELIST_MAX       1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME         60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10     /* try to expire 10 keys/loop */
#define REDIS_MAX_WRITE_PER_EVENT   (1024*64)
#define REDIS_REQUEST_MAX_SIZE      (1024*1024*256) /* max bytes in inline command */
97
/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD      3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT    256

/* Hash table parameters */
#define REDIS_HT_MINFILL            10      /* Minimal hash table fill 10% */
105
/* Command flags (bit values, combined with bitwise OR in the command table) */
#define REDIS_CMD_BULK      1       /* Bulk write command */
#define REDIS_CMD_INLINE    2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM   4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
/* Object types */
#define REDIS_STRING    0
#define REDIS_LIST      1
#define REDIS_SET       2
#define REDIS_ZSET      3
#define REDIS_HASH      4
122
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* Encoded as a hash table */

/* Printable name of each encoding, indexed by the REDIS_ENCODING_* value. */
static char *strencoding[] = {
    "raw",          /* REDIS_ENCODING_RAW */
    "int",          /* REDIS_ENCODING_INT */
    "zipmap",       /* REDIS_ENCODING_ZIPMAP */
    "hashtable"     /* REDIS_ENCODING_HT */
};
134
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME    253
#define REDIS_SELECTDB      254
#define REDIS_EOF           255

/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN   0
#define REDIS_RDB_14BITLEN  1
#define REDIS_RDB_32BITLEN  2
#define REDIS_RDB_ENCVAL    3
#define REDIS_RDB_LENERR    UINT_MAX
158
/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8  0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF   3   /* string compressed with FASTLZ */
166
/* Virtual memory: where an object currently is (stored in the 'storage'
 * field of a key object). */
#define REDIS_VM_MEMORY     0   /* The object is in memory */
#define REDIS_VM_SWAPPED    1   /* The object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
/* Client flags (bit values) */
#define REDIS_SLAVE     1   /* This client is a slave server */
#define REDIS_MASTER    2   /* This client is a master server */
#define REDIS_MONITOR   4   /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI     8   /* This client is in a MULTI context */
#define REDIS_BLOCKED   16  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT   32  /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6 /* bulk DB already transmitted, receive updates */
206
/* List related stuff */
#define REDIS_HEAD  0
#define REDIS_TAIL  1

/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* Log levels */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3

/* Anti-warning macro: evaluates V and discards the result. The argument is
 * parenthesized so that a full expression can be passed. */
#define REDIS_NOTUSED(V) ((void)(V))

#define ZSKIPLIST_MAXLEVEL  32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P         0.25    /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
237
/* Custom assert (original note: "we can print the stacktrace, so our assert
 * is defined this way").
 *
 * redisAssert(_e) evaluates _e once. When it is false, _redisAssert() is
 * called with the stringified expression, the source file name and the line
 * number, and then the process terminates through _exit(1). The macro is an
 * expression of type void. */
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
static void _redisAssert(char *estr, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure.
 *
 * NOTE(review): the original declaration ended with "} vm;", which also
 * defined a non-static global variable named 'vm'. Only the struct type is
 * kept here; the global looked accidental (robj embeds its own 'vm' member).
 * Confirm nothing outside this section refers to a global 'vm'. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};

/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;
    unsigned char type;
    unsigned char encoding;
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype;    /* If this object is a key, and value is swapped
                             * out, this is the type of the swapped out
                             * object. */
    int refcount;
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead.
     * For this reason 'vm' must stay the last member. */
    struct redisObjectVM vm;
} robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object (an lvalue expression, not a pointer) and _ptr the
 * string pointer to store in it. refcount is set to 1, type to REDIS_STRING,
 * encoding to REDIS_ENCODING_RAW; storage is set to REDIS_VM_MEMORY only
 * when server.vm_enabled is true.
 *
 * The expansion is a single do { } while(0) statement with NO trailing
 * semicolon: the caller writes the ';', which keeps the macro usable as the
 * body of an if/else. Arguments are parenthesized so expressions such as
 * *objptr can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* One save point: a pair of a time span (seconds) and a number of changes.
 * The server keeps an array of these in server.saveparams. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c);
452 struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* A (name, address) pair describing one function symbol; the address is
 * stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
495 typedef struct zskiplist {
496 struct zskiplistNode *header, *tail;
497 unsigned long length;
498 int level;
499 } zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects */
507
508 #define REDIS_SHARED_INTEGERS 10000
509 struct sharedObjectsStruct {
510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
511 *colon, *nullbulk, *nullmultibulk, *queued,
512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
514 *select0, *select1, *select2, *select3, *select4,
515 *select5, *select6, *select7, *select8, *select9,
516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
518 } shared;
519
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
525
526 /* VM threaded I/O request message */
527 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
530 typedef struct iojob {
531 int type; /* Request type, REDIS_IOJOB_* */
532 redisDb *db;/* Redis database */
533 robj *key; /* This I/O request is about swapping this key */
534 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
535 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540 } iojob;
541
542 /*================================ Prototypes =============================== */
543
544 static void freeStringObject(robj *o);
545 static void freeListObject(robj *o);
546 static void freeSetObject(robj *o);
547 static void decrRefCount(void *o);
548 static robj *createObject(int type, void *ptr);
549 static void freeClient(redisClient *c);
550 static int rdbLoad(char *filename);
551 static void addReply(redisClient *c, robj *obj);
552 static void addReplySds(redisClient *c, sds s);
553 static void incrRefCount(robj *o);
554 static int rdbSaveBackground(char *filename);
555 static robj *createStringObject(char *ptr, size_t len);
556 static robj *dupStringObject(robj *o);
557 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
558 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
559 static int syncWithMaster(void);
560 static robj *tryObjectEncoding(robj *o);
561 static robj *getDecodedObject(robj *o);
562 static int removeExpire(redisDb *db, robj *key);
563 static int expireIfNeeded(redisDb *db, robj *key);
564 static int deleteIfVolatile(redisDb *db, robj *key);
565 static int deleteIfSwapped(redisDb *db, robj *key);
566 static int deleteKey(redisDb *db, robj *key);
567 static time_t getExpire(redisDb *db, robj *key);
568 static int setExpire(redisDb *db, robj *key, time_t when);
569 static void updateSlavesWaitingBgsave(int bgsaveerr);
570 static void freeMemoryIfNeeded(void);
571 static int processCommand(redisClient *c);
572 static void setupSigSegvAction(void);
573 static void rdbRemoveTempFile(pid_t childpid);
574 static void aofRemoveTempFile(pid_t childpid);
575 static size_t stringObjectLen(robj *o);
576 static void processInputBuffer(redisClient *c);
577 static zskiplist *zslCreate(void);
578 static void zslFree(zskiplist *zsl);
579 static void zslInsert(zskiplist *zsl, double score, robj *obj);
580 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
581 static void initClientMultiState(redisClient *c);
582 static void freeClientMultiState(redisClient *c);
583 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
584 static void unblockClientWaitingData(redisClient *c);
585 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
586 static void vmInit(void);
587 static void vmMarkPagesFree(off_t page, off_t count);
588 static robj *vmLoadObject(robj *key);
589 static robj *vmPreviewObject(robj *key);
590 static int vmSwapOneObjectBlocking(void);
591 static int vmSwapOneObjectThreaded(void);
592 static int vmCanSwapOut(void);
593 static int tryFreeOneObjectFromFreelist(void);
594 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596 static void vmCancelThreadedIOJob(robj *o);
597 static void lockThreadedIO(void);
598 static void unlockThreadedIO(void);
599 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600 static void freeIOJob(iojob *j);
601 static void queueIOJob(iojob *j);
602 static int vmWriteObjectOnSwap(robj *o, off_t page);
603 static robj *vmReadObjectFromSwap(off_t page, int type);
604 static void waitEmptyIOJobsQueue(void);
605 static void vmReopenSwapFile(void);
606 static int vmFreePage(off_t page);
607 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
608 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609 static int dontWaitForSwappedKey(redisClient *c, robj *key);
610 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612 static struct redisCommand *lookupCommand(char *name);
613 static void call(redisClient *c, struct redisCommand *cmd);
614 static void resetClient(redisClient *c);
615 static void convertToRealHash(robj *o);
616 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618 static void freePubsubPattern(void *p);
619 static int listMatchPubsubPattern(void *a, void *b);
620 static int compareStringObjects(robj *a, robj *b);
621 static void usage();
622
623 static void authCommand(redisClient *c);
624 static void pingCommand(redisClient *c);
625 static void echoCommand(redisClient *c);
626 static void setCommand(redisClient *c);
627 static void setnxCommand(redisClient *c);
628 static void getCommand(redisClient *c);
629 static void delCommand(redisClient *c);
630 static void existsCommand(redisClient *c);
631 static void incrCommand(redisClient *c);
632 static void decrCommand(redisClient *c);
633 static void incrbyCommand(redisClient *c);
634 static void decrbyCommand(redisClient *c);
635 static void selectCommand(redisClient *c);
636 static void randomkeyCommand(redisClient *c);
637 static void keysCommand(redisClient *c);
638 static void dbsizeCommand(redisClient *c);
639 static void lastsaveCommand(redisClient *c);
640 static void saveCommand(redisClient *c);
641 static void bgsaveCommand(redisClient *c);
642 static void bgrewriteaofCommand(redisClient *c);
643 static void shutdownCommand(redisClient *c);
644 static void moveCommand(redisClient *c);
645 static void renameCommand(redisClient *c);
646 static void renamenxCommand(redisClient *c);
647 static void lpushCommand(redisClient *c);
648 static void rpushCommand(redisClient *c);
649 static void lpopCommand(redisClient *c);
650 static void rpopCommand(redisClient *c);
651 static void llenCommand(redisClient *c);
652 static void lindexCommand(redisClient *c);
653 static void lrangeCommand(redisClient *c);
654 static void ltrimCommand(redisClient *c);
655 static void typeCommand(redisClient *c);
656 static void lsetCommand(redisClient *c);
657 static void saddCommand(redisClient *c);
658 static void sremCommand(redisClient *c);
659 static void smoveCommand(redisClient *c);
660 static void sismemberCommand(redisClient *c);
661 static void scardCommand(redisClient *c);
662 static void spopCommand(redisClient *c);
663 static void srandmemberCommand(redisClient *c);
664 static void sinterCommand(redisClient *c);
665 static void sinterstoreCommand(redisClient *c);
666 static void sunionCommand(redisClient *c);
667 static void sunionstoreCommand(redisClient *c);
668 static void sdiffCommand(redisClient *c);
669 static void sdiffstoreCommand(redisClient *c);
670 static void syncCommand(redisClient *c);
671 static void flushdbCommand(redisClient *c);
672 static void flushallCommand(redisClient *c);
673 static void sortCommand(redisClient *c);
674 static void lremCommand(redisClient *c);
675 static void rpoplpushcommand(redisClient *c);
676 static void infoCommand(redisClient *c);
677 static void mgetCommand(redisClient *c);
678 static void monitorCommand(redisClient *c);
679 static void expireCommand(redisClient *c);
680 static void expireatCommand(redisClient *c);
681 static void getsetCommand(redisClient *c);
682 static void ttlCommand(redisClient *c);
683 static void slaveofCommand(redisClient *c);
684 static void debugCommand(redisClient *c);
685 static void msetCommand(redisClient *c);
686 static void msetnxCommand(redisClient *c);
687 static void zaddCommand(redisClient *c);
688 static void zincrbyCommand(redisClient *c);
689 static void zrangeCommand(redisClient *c);
690 static void zrangebyscoreCommand(redisClient *c);
691 static void zcountCommand(redisClient *c);
692 static void zrevrangeCommand(redisClient *c);
693 static void zcardCommand(redisClient *c);
694 static void zremCommand(redisClient *c);
695 static void zscoreCommand(redisClient *c);
696 static void zremrangebyscoreCommand(redisClient *c);
697 static void multiCommand(redisClient *c);
698 static void execCommand(redisClient *c);
699 static void discardCommand(redisClient *c);
700 static void blpopCommand(redisClient *c);
701 static void brpopCommand(redisClient *c);
702 static void appendCommand(redisClient *c);
703 static void substrCommand(redisClient *c);
704 static void zrankCommand(redisClient *c);
705 static void zrevrankCommand(redisClient *c);
706 static void hsetCommand(redisClient *c);
707 static void hgetCommand(redisClient *c);
708 static void hmsetCommand(redisClient *c);
709 static void hmgetCommand(redisClient *c);
710 static void hdelCommand(redisClient *c);
711 static void hlenCommand(redisClient *c);
712 static void zremrangebyrankCommand(redisClient *c);
713 static void zunionCommand(redisClient *c);
714 static void zinterCommand(redisClient *c);
715 static void hkeysCommand(redisClient *c);
716 static void hvalsCommand(redisClient *c);
717 static void hgetallCommand(redisClient *c);
718 static void hexistsCommand(redisClient *c);
719 static void configCommand(redisClient *c);
720 static void hincrbyCommand(redisClient *c);
721 static void subscribeCommand(redisClient *c);
722 static void unsubscribeCommand(redisClient *c);
723 static void psubscribeCommand(redisClient *c);
724 static void punsubscribeCommand(redisClient *c);
725 static void publishCommand(redisClient *c);
726
727 /*================================= Globals ================================= */
728
729 /* Global vars */
730 static struct redisServer server; /* server global state */
731 static struct redisCommand cmdTable[] = {
732 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
734 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
735 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
738 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
739 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
742 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
754 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
755 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
758 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
767 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
768 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
775 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
776 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
799 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
800 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
812 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
820 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
831 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
836 {NULL,NULL,0,0,NULL,0,0,0}
837 };
838
839 /*============================ Utility functions ============================ */
840
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading ^ negates the set, a-z is a
 *           range and a backslash takes the next byte of the set literally
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero, literals and ranges are compared after
 * tolower() (escaped bytes inside a set are always compared exactly).
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer has to be NUL terminated: every read is guarded by the
 * corresponding length, so no byte past patternLen / stringLen is inspected
 * and the lengths never become negative. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen > 0) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars without looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing star swallows everything */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A set always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * pattern++ below lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched as a literal backslash. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match: a literal needs one byte */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing stars may remain. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
963
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are obtained with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
967
968 static void redisLog(int level, const char *fmt, ...) {
969 va_list ap;
970 FILE *fp;
971
972 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
973 if (!fp) return;
974
975 va_start(ap, fmt);
976 if (level >= server.verbosity) {
977 char *c = ".-*#";
978 char buf[64];
979 time_t now;
980
981 now = time(NULL);
982 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
983 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
984 vfprintf(fp, fmt, ap);
985 fprintf(fp,"\n");
986 fflush(fp);
987 }
988 va_end(ap);
989
990 if (server.logfile) fclose(fp);
991 }
992
993 /*====================== Hash table type implementation ==================== */
994
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
998
/* Value destructor that simply releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1004
1005 static void dictListDestructor(void *privdata, void *val)
1006 {
1007 DICT_NOTUSED(privdata);
1008 listRelease((list*)val);
1009 }
1010
1011 static int sdsDictKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013 {
1014 int l1,l2;
1015 DICT_NOTUSED(privdata);
1016
1017 l1 = sdslen((sds)key1);
1018 l2 = sdslen((sds)key2);
1019 if (l1 != l2) return 0;
1020 return memcmp(key1, key2, l1) == 0;
1021 }
1022
/* Destructor for redis objects stored in a dictionary: drops one reference
 * with decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1030
1031 static int dictObjKeyCompare(void *privdata, const void *key1,
1032 const void *key2)
1033 {
1034 const robj *o1 = key1, *o2 = key2;
1035 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1036 }
1037
1038 static unsigned int dictObjHash(const void *key) {
1039 const robj *o = key;
1040 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1041 }
1042
1043 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1044 const void *key2)
1045 {
1046 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1047 int cmp;
1048
1049 if (o1->encoding == REDIS_ENCODING_INT &&
1050 o2->encoding == REDIS_ENCODING_INT &&
1051 o1->ptr == o2->ptr) return 1;
1052
1053 o1 = getDecodedObject(o1);
1054 o2 = getDecodedObject(o2);
1055 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1056 decrRefCount(o1);
1057 decrRefCount(o2);
1058 return cmp;
1059 }
1060
1061 static unsigned int dictEncObjHash(const void *key) {
1062 robj *o = (robj*) key;
1063
1064 if (o->encoding == REDIS_ENCODING_RAW) {
1065 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1066 } else {
1067 if (o->encoding == REDIS_ENCODING_INT) {
1068 char buf[32];
1069 int len;
1070
1071 len = snprintf(buf,32,"%ld",(long)o->ptr);
1072 return dictGenHashFunction((unsigned char*)buf, len);
1073 } else {
1074 unsigned int hash;
1075
1076 o = getDecodedObject(o);
1077 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1078 decrRefCount(o);
1079 return hash;
1080 }
1081 }
1082 }
1083
/* Sets type. Keys are redis objects that may be encoded, hence the
 * dictEncObj* hash/compare helpers; keys are released with
 * dictRedisObjectDestructor and there is no value destructor.
 * NOTE(review): this comment used to read "Sets type and expires", but the
 * db->expires dictionaries created in initServer() use keyptrDictType --
 * confirm whether anything expire-related still uses this type. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1093
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are possibly encoded redis objects; values are released with
 * dictVanillaFree, i.e. zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1103
/* Db->dict: the main dictionary of every database (see initServer()).
 * Keys are hashed and compared as SDS strings through the dictObj* helpers;
 * both keys and values are redis objects released with
 * dictRedisObjectDestructor. */
static dictType dbDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1113
/* Db->expires: per-database dictionary of keys with a timeout (see
 * initServer()). Keys are redis objects released with
 * dictRedisObjectDestructor; values have no destructor (serverCron() reads
 * them back as a time_t stored directly in the value slot). */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1123
/* Hash type hash table (note that small hashes are represented with
 * zipmaps). Keys and values are both redis objects, possibly encoded, and
 * both are released with dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1133
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be
 * loaded. initServer() also uses it for server.pubsub_channels.
 * Values are released with dictListDestructor, i.e. listRelease(). */
static dictType keylistDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictListDestructor         /* val destructor */
};
1145
/* Forward declaration only: version() is defined in another part of the
 * file. */
static void version();
1147
1148 /* ========================= Random utility functions ======================= */
1149
1150 /* Redis generally does not try to recover from out of memory conditions
1151 * when allocating objects or strings, it is not clear if it will be possible
1152 * to report this condition to the client since the networking layer itself
1153 * is based on heap allocation for send buffers, so we simply abort.
1154 * At least the code will be simpler to read... */
1155 static void oom(const char *msg) {
1156 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1157 sleep(1);
1158 abort();
1159 }
1160
1161 /* ====================== Redis server networking stuff ===================== */
1162 static void closeTimedoutClients(void) {
1163 redisClient *c;
1164 listNode *ln;
1165 time_t now = time(NULL);
1166 listIter li;
1167
1168 listRewind(server.clients,&li);
1169 while ((ln = listNext(&li)) != NULL) {
1170 c = listNodeValue(ln);
1171 if (server.maxidletime &&
1172 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1173 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1174 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1175 listLength(c->pubsub_patterns) == 0 &&
1176 (now - c->lastinteraction > server.maxidletime))
1177 {
1178 redisLog(REDIS_VERBOSE,"Closing idle client");
1179 freeClient(c);
1180 } else if (c->flags & REDIS_BLOCKED) {
1181 if (c->blockingto != 0 && c->blockingto < now) {
1182 addReply(c,shared.nullmultibulk);
1183 unblockClientWaitingData(c);
1184 }
1185 }
1186 }
1187 }
1188
1189 static int htNeedsResize(dict *dict) {
1190 long long size, used;
1191
1192 size = dictSlots(dict);
1193 used = dictSize(dict);
1194 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1195 (used*100/size < REDIS_HT_MINFILL));
1196 }
1197
1198 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1199 * we resize the hash table to save memory */
1200 static void tryResizeHashTables(void) {
1201 int j;
1202
1203 for (j = 0; j < server.dbnum; j++) {
1204 if (htNeedsResize(server.db[j].dict)) {
1205 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1206 dictResize(server.db[j].dict);
1207 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1208 }
1209 if (htNeedsResize(server.db[j].expires))
1210 dictResize(server.db[j].expires);
1211 }
1212 }
1213
1214 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1215 void backgroundSaveDoneHandler(int statloc) {
1216 int exitcode = WEXITSTATUS(statloc);
1217 int bysignal = WIFSIGNALED(statloc);
1218
1219 if (!bysignal && exitcode == 0) {
1220 redisLog(REDIS_NOTICE,
1221 "Background saving terminated with success");
1222 server.dirty = 0;
1223 server.lastsave = time(NULL);
1224 } else if (!bysignal && exitcode != 0) {
1225 redisLog(REDIS_WARNING, "Background saving error");
1226 } else {
1227 redisLog(REDIS_WARNING,
1228 "Background saving terminated by signal %d", WTERMSIG(statloc));
1229 rdbRemoveTempFile(server.bgsavechildpid);
1230 }
1231 server.bgsavechildpid = -1;
1232 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1233 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1234 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1235 }
1236
1237 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1238 * Handle this. */
1239 void backgroundRewriteDoneHandler(int statloc) {
1240 int exitcode = WEXITSTATUS(statloc);
1241 int bysignal = WIFSIGNALED(statloc);
1242
1243 if (!bysignal && exitcode == 0) {
1244 int fd;
1245 char tmpfile[256];
1246
1247 redisLog(REDIS_NOTICE,
1248 "Background append only file rewriting terminated with success");
1249 /* Now it's time to flush the differences accumulated by the parent */
1250 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1251 fd = open(tmpfile,O_WRONLY|O_APPEND);
1252 if (fd == -1) {
1253 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1254 goto cleanup;
1255 }
1256 /* Flush our data... */
1257 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1258 (signed) sdslen(server.bgrewritebuf)) {
1259 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1260 close(fd);
1261 goto cleanup;
1262 }
1263 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1264 /* Now our work is to rename the temp file into the stable file. And
1265 * switch the file descriptor used by the server for append only. */
1266 if (rename(tmpfile,server.appendfilename) == -1) {
1267 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1268 close(fd);
1269 goto cleanup;
1270 }
1271 /* Mission completed... almost */
1272 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1273 if (server.appendfd != -1) {
1274 /* If append only is actually enabled... */
1275 close(server.appendfd);
1276 server.appendfd = fd;
1277 fsync(fd);
1278 server.appendseldb = -1; /* Make sure it will issue SELECT */
1279 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1280 } else {
1281 /* If append only is disabled we just generate a dump in this
1282 * format. Why not? */
1283 close(fd);
1284 }
1285 } else if (!bysignal && exitcode != 0) {
1286 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1287 } else {
1288 redisLog(REDIS_WARNING,
1289 "Background append only file rewriting terminated by signal %d",
1290 WTERMSIG(statloc));
1291 }
1292 cleanup:
1293 sdsfree(server.bgrewritebuf);
1294 server.bgrewritebuf = sdsempty();
1295 aofRemoveTempFile(server.bgrewritechildpid);
1296 server.bgrewritechildpid = -1;
1297 }
1298
1299 /* This function is called once a background process of some kind terminates,
1300 * as we want to avoid resizing the hash tables when there is a child in order
1301 * to play well with copy-on-write (otherwise when a resize happens lots of
1302 * memory pages are copied). The goal of this function is to update the ability
1303 * for dict.c to resize the hash tables accordingly to the fact we have o not
1304 * running childs. */
1305 static void updateDictResizePolicy(void) {
1306 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1307 dictEnableResize();
1308 else
1309 dictDisableResize();
1310 }
1311
1312 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1313 int j, loops = server.cronloops++;
1314 REDIS_NOTUSED(eventLoop);
1315 REDIS_NOTUSED(id);
1316 REDIS_NOTUSED(clientData);
1317
1318 /* We take a cached value of the unix time in the global state because
1319 * with virtual memory and aging there is to store the current time
1320 * in objects at every object access, and accuracy is not needed.
1321 * To access a global var is faster than calling time(NULL) */
1322 server.unixtime = time(NULL);
1323
1324 /* Show some info about non-empty databases */
1325 for (j = 0; j < server.dbnum; j++) {
1326 long long size, used, vkeys;
1327
1328 size = dictSlots(server.db[j].dict);
1329 used = dictSize(server.db[j].dict);
1330 vkeys = dictSize(server.db[j].expires);
1331 if (!(loops % 50) && (used || vkeys)) {
1332 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1333 /* dictPrintStats(server.dict); */
1334 }
1335 }
1336
1337 /* We don't want to resize the hash tables while a bacground saving
1338 * is in progress: the saving child is created using fork() that is
1339 * implemented with a copy-on-write semantic in most modern systems, so
1340 * if we resize the HT while there is the saving child at work actually
1341 * a lot of memory movements in the parent will cause a lot of pages
1342 * copied. */
1343 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1344 !(loops % 10))
1345 {
1346 tryResizeHashTables();
1347 }
1348
1349 /* Show information about connected clients */
1350 if (!(loops % 50)) {
1351 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1352 listLength(server.clients)-listLength(server.slaves),
1353 listLength(server.slaves),
1354 zmalloc_used_memory());
1355 }
1356
1357 /* Close connections of timedout clients */
1358 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1359 closeTimedoutClients();
1360
1361 /* Check if a background saving or AOF rewrite in progress terminated */
1362 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1363 int statloc;
1364 pid_t pid;
1365
1366 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1367 if (pid == server.bgsavechildpid) {
1368 backgroundSaveDoneHandler(statloc);
1369 } else {
1370 backgroundRewriteDoneHandler(statloc);
1371 }
1372 updateDictResizePolicy();
1373 }
1374 } else {
1375 /* If there is not a background saving in progress check if
1376 * we have to save now */
1377 time_t now = time(NULL);
1378 for (j = 0; j < server.saveparamslen; j++) {
1379 struct saveparam *sp = server.saveparams+j;
1380
1381 if (server.dirty >= sp->changes &&
1382 now-server.lastsave > sp->seconds) {
1383 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1384 sp->changes, sp->seconds);
1385 rdbSaveBackground(server.dbfilename);
1386 break;
1387 }
1388 }
1389 }
1390
1391 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1392 * will use few CPU cycles if there are few expiring keys, otherwise
1393 * it will get more aggressive to avoid that too much memory is used by
1394 * keys that can be removed from the keyspace. */
1395 for (j = 0; j < server.dbnum; j++) {
1396 int expired;
1397 redisDb *db = server.db+j;
1398
1399 /* Continue to expire if at the end of the cycle more than 25%
1400 * of the keys were expired. */
1401 do {
1402 long num = dictSize(db->expires);
1403 time_t now = time(NULL);
1404
1405 expired = 0;
1406 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1407 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1408 while (num--) {
1409 dictEntry *de;
1410 time_t t;
1411
1412 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1413 t = (time_t) dictGetEntryVal(de);
1414 if (now > t) {
1415 deleteKey(db,dictGetEntryKey(de));
1416 expired++;
1417 server.stat_expiredkeys++;
1418 }
1419 }
1420 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1421 }
1422
1423 /* Swap a few keys on disk if we are over the memory limit and VM
1424 * is enbled. Try to free objects from the free list first. */
1425 if (vmCanSwapOut()) {
1426 while (server.vm_enabled && zmalloc_used_memory() >
1427 server.vm_max_memory)
1428 {
1429 int retval;
1430
1431 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1432 retval = (server.vm_max_threads == 0) ?
1433 vmSwapOneObjectBlocking() :
1434 vmSwapOneObjectThreaded();
1435 if (retval == REDIS_ERR && !(loops % 300) &&
1436 zmalloc_used_memory() >
1437 (server.vm_max_memory+server.vm_max_memory/10))
1438 {
1439 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1440 }
1441 /* Note that when using threade I/O we free just one object,
1442 * because anyway when the I/O thread in charge to swap this
1443 * object out will finish, the handler of completed jobs
1444 * will try to swap more objects if we are still out of memory. */
1445 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1446 }
1447 }
1448
1449 /* Check if we should connect to a MASTER */
1450 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1451 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1452 if (syncWithMaster() == REDIS_OK) {
1453 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1454 }
1455 }
1456 return 100;
1457 }
1458
1459 /* This function gets called every time Redis is entering the
1460 * main loop of the event driven library, that is, before to sleep
1461 * for ready file descriptors. */
1462 static void beforeSleep(struct aeEventLoop *eventLoop) {
1463 REDIS_NOTUSED(eventLoop);
1464
1465 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1466 listIter li;
1467 listNode *ln;
1468
1469 listRewind(server.io_ready_clients,&li);
1470 while((ln = listNext(&li))) {
1471 redisClient *c = ln->value;
1472 struct redisCommand *cmd;
1473
1474 /* Resume the client. */
1475 listDelNode(server.io_ready_clients,ln);
1476 c->flags &= (~REDIS_IO_WAIT);
1477 server.vm_blocked_clients--;
1478 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1479 readQueryFromClient, c);
1480 cmd = lookupCommand(c->argv[0]->ptr);
1481 assert(cmd != NULL);
1482 call(c,cmd);
1483 resetClient(c);
1484 /* There may be more data to process in the input buffer. */
1485 if (c->querybuf && sdslen(c->querybuf) > 0)
1486 processInputBuffer(c);
1487 }
1488 }
1489 }
1490
1491 static void createSharedObjects(void) {
1492 int j;
1493
1494 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1495 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1496 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1497 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1498 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1499 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1500 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1501 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1502 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1503 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1504 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1505 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1506 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1507 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1508 "-ERR no such key\r\n"));
1509 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1510 "-ERR syntax error\r\n"));
1511 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1512 "-ERR source and destination objects are the same\r\n"));
1513 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1514 "-ERR index out of range\r\n"));
1515 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1516 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1517 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1518 shared.select0 = createStringObject("select 0\r\n",10);
1519 shared.select1 = createStringObject("select 1\r\n",10);
1520 shared.select2 = createStringObject("select 2\r\n",10);
1521 shared.select3 = createStringObject("select 3\r\n",10);
1522 shared.select4 = createStringObject("select 4\r\n",10);
1523 shared.select5 = createStringObject("select 5\r\n",10);
1524 shared.select6 = createStringObject("select 6\r\n",10);
1525 shared.select7 = createStringObject("select 7\r\n",10);
1526 shared.select8 = createStringObject("select 8\r\n",10);
1527 shared.select9 = createStringObject("select 9\r\n",10);
1528 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1529 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1530 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1531 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1532 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1533 shared.mbulk3 = createStringObject("*3\r\n",4);
1534 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1535 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1536 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1537 }
1538 }
1539
1540 static void appendServerSaveParams(time_t seconds, int changes) {
1541 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1542 server.saveparams[server.saveparamslen].seconds = seconds;
1543 server.saveparams[server.saveparamslen].changes = changes;
1544 server.saveparamslen++;
1545 }
1546
1547 static void resetServerSaveParams() {
1548 zfree(server.saveparams);
1549 server.saveparams = NULL;
1550 server.saveparamslen = 0;
1551 }
1552
1553 static void initServerConfig() {
1554 server.dbnum = REDIS_DEFAULT_DBNUM;
1555 server.port = REDIS_SERVERPORT;
1556 server.verbosity = REDIS_VERBOSE;
1557 server.maxidletime = REDIS_MAXIDLETIME;
1558 server.saveparams = NULL;
1559 server.logfile = NULL; /* NULL = log on standard output */
1560 server.bindaddr = NULL;
1561 server.glueoutputbuf = 1;
1562 server.daemonize = 0;
1563 server.appendonly = 0;
1564 server.appendfsync = APPENDFSYNC_ALWAYS;
1565 server.lastfsync = time(NULL);
1566 server.appendfd = -1;
1567 server.appendseldb = -1; /* Make sure the first time will not match */
1568 server.pidfile = zstrdup("/var/run/redis.pid");
1569 server.dbfilename = zstrdup("dump.rdb");
1570 server.appendfilename = zstrdup("appendonly.aof");
1571 server.requirepass = NULL;
1572 server.shareobjects = 0;
1573 server.rdbcompression = 1;
1574 server.maxclients = 0;
1575 server.blpop_blocked_clients = 0;
1576 server.maxmemory = 0;
1577 server.vm_enabled = 0;
1578 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1579 server.vm_page_size = 256; /* 256 bytes per page */
1580 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1581 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1582 server.vm_max_threads = 4;
1583 server.vm_blocked_clients = 0;
1584 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1585 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1586
1587 resetServerSaveParams();
1588
1589 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1590 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1591 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1592 /* Replication related */
1593 server.isslave = 0;
1594 server.masterauth = NULL;
1595 server.masterhost = NULL;
1596 server.masterport = 6379;
1597 server.master = NULL;
1598 server.replstate = REDIS_REPL_NONE;
1599
1600 /* Double constants initialization */
1601 R_Zero = 0.0;
1602 R_PosInf = 1.0/R_Zero;
1603 R_NegInf = -1.0/R_Zero;
1604 R_Nan = R_Zero/R_Zero;
1605 }
1606
1607 static void initServer() {
1608 int j;
1609
1610 signal(SIGHUP, SIG_IGN);
1611 signal(SIGPIPE, SIG_IGN);
1612 setupSigSegvAction();
1613
1614 server.devnull = fopen("/dev/null","w");
1615 if (server.devnull == NULL) {
1616 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1617 exit(1);
1618 }
1619 server.clients = listCreate();
1620 server.slaves = listCreate();
1621 server.monitors = listCreate();
1622 server.objfreelist = listCreate();
1623 createSharedObjects();
1624 server.el = aeCreateEventLoop();
1625 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1626 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1627 if (server.fd == -1) {
1628 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1629 exit(1);
1630 }
1631 for (j = 0; j < server.dbnum; j++) {
1632 server.db[j].dict = dictCreate(&dbDictType,NULL);
1633 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1634 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1635 if (server.vm_enabled)
1636 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1637 server.db[j].id = j;
1638 }
1639 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1640 server.pubsub_patterns = listCreate();
1641 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1642 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1643 server.cronloops = 0;
1644 server.bgsavechildpid = -1;
1645 server.bgrewritechildpid = -1;
1646 server.bgrewritebuf = sdsempty();
1647 server.lastsave = time(NULL);
1648 server.dirty = 0;
1649 server.stat_numcommands = 0;
1650 server.stat_numconnections = 0;
1651 server.stat_expiredkeys = 0;
1652 server.stat_starttime = time(NULL);
1653 server.unixtime = time(NULL);
1654 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1655 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1656 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1657
1658 if (server.appendonly) {
1659 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1660 if (server.appendfd == -1) {
1661 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1662 strerror(errno));
1663 exit(1);
1664 }
1665 }
1666
1667 if (server.vm_enabled) vmInit();
1668 }
1669
1670 /* Empty the whole database */
1671 static long long emptyDb() {
1672 int j;
1673 long long removed = 0;
1674
1675 for (j = 0; j < server.dbnum; j++) {
1676 removed += dictSize(server.db[j].dict);
1677 dictEmpty(server.db[j].dict);
1678 dictEmpty(server.db[j].expires);
1679 }
1680 return removed;
1681 }
1682
/* Convert a "yes" / "no" string (case insensitive) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1688
1689 /* I agree, this is a very rudimental way to load a configuration...
1690 will improve later if the config gets more complex */
1691 static void loadServerConfig(char *filename) {
1692 FILE *fp;
1693 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1694 int linenum = 0;
1695 sds line = NULL;
1696
1697 if (filename[0] == '-' && filename[1] == '\0')
1698 fp = stdin;
1699 else {
1700 if ((fp = fopen(filename,"r")) == NULL) {
1701 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1702 exit(1);
1703 }
1704 }
1705
1706 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1707 sds *argv;
1708 int argc, j;
1709
1710 linenum++;
1711 line = sdsnew(buf);
1712 line = sdstrim(line," \t\r\n");
1713
1714 /* Skip comments and blank lines*/
1715 if (line[0] == '#' || line[0] == '\0') {
1716 sdsfree(line);
1717 continue;
1718 }
1719
1720 /* Split into arguments */
1721 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1722 sdstolower(argv[0]);
1723
1724 /* Execute config directives */
1725 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1726 server.maxidletime = atoi(argv[1]);
1727 if (server.maxidletime < 0) {
1728 err = "Invalid timeout value"; goto loaderr;
1729 }
1730 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1731 server.port = atoi(argv[1]);
1732 if (server.port < 1 || server.port > 65535) {
1733 err = "Invalid port"; goto loaderr;
1734 }
1735 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1736 server.bindaddr = zstrdup(argv[1]);
1737 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1738 int seconds = atoi(argv[1]);
1739 int changes = atoi(argv[2]);
1740 if (seconds < 1 || changes < 0) {
1741 err = "Invalid save parameters"; goto loaderr;
1742 }
1743 appendServerSaveParams(seconds,changes);
1744 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1745 if (chdir(argv[1]) == -1) {
1746 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1747 argv[1], strerror(errno));
1748 exit(1);
1749 }
1750 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1751 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1752 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1753 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1754 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1755 else {
1756 err = "Invalid log level. Must be one of debug, notice, warning";
1757 goto loaderr;
1758 }
1759 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1760 FILE *logfp;
1761
1762 server.logfile = zstrdup(argv[1]);
1763 if (!strcasecmp(server.logfile,"stdout")) {
1764 zfree(server.logfile);
1765 server.logfile = NULL;
1766 }
1767 if (server.logfile) {
1768 /* Test if we are able to open the file. The server will not
1769 * be able to abort just for this problem later... */
1770 logfp = fopen(server.logfile,"a");
1771 if (logfp == NULL) {
1772 err = sdscatprintf(sdsempty(),
1773 "Can't open the log file: %s", strerror(errno));
1774 goto loaderr;
1775 }
1776 fclose(logfp);
1777 }
1778 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1779 server.dbnum = atoi(argv[1]);
1780 if (server.dbnum < 1) {
1781 err = "Invalid number of databases"; goto loaderr;
1782 }
1783 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1784 loadServerConfig(argv[1]);
1785 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1786 server.maxclients = atoi(argv[1]);
1787 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1788 server.maxmemory = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1790 server.masterhost = sdsnew(argv[1]);
1791 server.masterport = atoi(argv[2]);
1792 server.replstate = REDIS_REPL_CONNECT;
1793 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1794 server.masterauth = zstrdup(argv[1]);
1795 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1796 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1797 err = "argument must be 'yes' or 'no'"; goto loaderr;
1798 }
1799 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1800 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1801 err = "argument must be 'yes' or 'no'"; goto loaderr;
1802 }
1803 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1804 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1805 err = "argument must be 'yes' or 'no'"; goto loaderr;
1806 }
1807 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1808 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1809 err = "argument must be 'yes' or 'no'"; goto loaderr;
1810 }
1811 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1812 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1813 err = "argument must be 'yes' or 'no'"; goto loaderr;
1814 }
1815 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1816 if (!strcasecmp(argv[1],"no")) {
1817 server.appendfsync = APPENDFSYNC_NO;
1818 } else if (!strcasecmp(argv[1],"always")) {
1819 server.appendfsync = APPENDFSYNC_ALWAYS;
1820 } else if (!strcasecmp(argv[1],"everysec")) {
1821 server.appendfsync = APPENDFSYNC_EVERYSEC;
1822 } else {
1823 err = "argument must be 'no', 'always' or 'everysec'";
1824 goto loaderr;
1825 }
1826 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1827 server.requirepass = zstrdup(argv[1]);
1828 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1829 zfree(server.pidfile);
1830 server.pidfile = zstrdup(argv[1]);
1831 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1832 zfree(server.dbfilename);
1833 server.dbfilename = zstrdup(argv[1]);
1834 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1835 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1836 err = "argument must be 'yes' or 'no'"; goto loaderr;
1837 }
1838 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1839 zfree(server.vm_swap_file);
1840 server.vm_swap_file = zstrdup(argv[1]);
1841 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1842 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1843 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1844 server.vm_page_size = strtoll(argv[1], NULL, 10);
1845 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1846 server.vm_pages = strtoll(argv[1], NULL, 10);
1847 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1848 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1849 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1850 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1851 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1852 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1853 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1854 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1855 } else {
1856 err = "Bad directive or wrong number of arguments"; goto loaderr;
1857 }
1858 for (j = 0; j < argc; j++)
1859 sdsfree(argv[j]);
1860 zfree(argv);
1861 sdsfree(line);
1862 }
1863 if (fp != stdin) fclose(fp);
1864 return;
1865
1866 loaderr:
1867 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1868 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1869 fprintf(stderr, ">>> '%s'\n", line);
1870 fprintf(stderr, "%s\n", err);
1871 exit(1);
1872 }
1873
1874 static void freeClientArgv(redisClient *c) {
1875 int j;
1876
1877 for (j = 0; j < c->argc; j++)
1878 decrRefCount(c->argv[j]);
1879 for (j = 0; j < c->mbargc; j++)
1880 decrRefCount(c->mbargv[j]);
1881 c->argc = 0;
1882 c->mbargc = 0;
1883 }
1884
1885 static void freeClient(redisClient *c) {
1886 listNode *ln;
1887
1888 /* Note that if the client we are freeing is blocked into a blocking
1889 * call, we have to set querybuf to NULL *before* to call
1890 * unblockClientWaitingData() to avoid processInputBuffer() will get
1891 * called. Also it is important to remove the file events after
1892 * this, because this call adds the READABLE event. */
1893 sdsfree(c->querybuf);
1894 c->querybuf = NULL;
1895 if (c->flags & REDIS_BLOCKED)
1896 unblockClientWaitingData(c);
1897
1898 /* Unsubscribe from all the pubsub channels */
1899 pubsubUnsubscribeAllChannels(c,0);
1900 pubsubUnsubscribeAllPatterns(c,0);
1901 dictRelease(c->pubsub_channels);
1902 listRelease(c->pubsub_patterns);
1903 /* Obvious cleanup */
1904 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1905 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1906 listRelease(c->reply);
1907 freeClientArgv(c);
1908 close(c->fd);
1909 /* Remove from the list of clients */
1910 ln = listSearchKey(server.clients,c);
1911 redisAssert(ln != NULL);
1912 listDelNode(server.clients,ln);
1913 /* Remove from the list of clients waiting for swapped keys */
1914 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1915 ln = listSearchKey(server.io_ready_clients,c);
1916 if (ln) {
1917 listDelNode(server.io_ready_clients,ln);
1918 server.vm_blocked_clients--;
1919 }
1920 }
1921 while (server.vm_enabled && listLength(c->io_keys)) {
1922 ln = listFirst(c->io_keys);
1923 dontWaitForSwappedKey(c,ln->value);
1924 }
1925 listRelease(c->io_keys);
1926 /* Master/slave cleanup */
1927 if (c->flags & REDIS_SLAVE) {
1928 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1929 close(c->repldbfd);
1930 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1931 ln = listSearchKey(l,c);
1932 redisAssert(ln != NULL);
1933 listDelNode(l,ln);
1934 }
1935 if (c->flags & REDIS_MASTER) {
1936 server.master = NULL;
1937 server.replstate = REDIS_REPL_CONNECT;
1938 }
1939 /* Release memory */
1940 zfree(c->argv);
1941 zfree(c->mbargv);
1942 freeClientMultiState(c);
1943 zfree(c);
1944 }
1945
1946 #define GLUEREPLY_UP_TO (1024)
1947 static void glueReplyBuffersIfNeeded(redisClient *c) {
1948 int copylen = 0;
1949 char buf[GLUEREPLY_UP_TO];
1950 listNode *ln;
1951 listIter li;
1952 robj *o;
1953
1954 listRewind(c->reply,&li);
1955 while((ln = listNext(&li))) {
1956 int objlen;
1957
1958 o = ln->value;
1959 objlen = sdslen(o->ptr);
1960 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1961 memcpy(buf+copylen,o->ptr,objlen);
1962 copylen += objlen;
1963 listDelNode(c->reply,ln);
1964 } else {
1965 if (copylen == 0) return;
1966 break;
1967 }
1968 }
1969 /* Now the output buffer is empty, add the new single element */
1970 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1971 listAddNodeHead(c->reply,o);
1972 }
1973
1974 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1975 redisClient *c = privdata;
1976 int nwritten = 0, totwritten = 0, objlen;
1977 robj *o;
1978 REDIS_NOTUSED(el);
1979 REDIS_NOTUSED(mask);
1980
1981 /* Use writev() if we have enough buffers to send */
1982 if (!server.glueoutputbuf &&
1983 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1984 !(c->flags & REDIS_MASTER))
1985 {
1986 sendReplyToClientWritev(el, fd, privdata, mask);
1987 return;
1988 }
1989
1990 while(listLength(c->reply)) {
1991 if (server.glueoutputbuf && listLength(c->reply) > 1)
1992 glueReplyBuffersIfNeeded(c);
1993
1994 o = listNodeValue(listFirst(c->reply));
1995 objlen = sdslen(o->ptr);
1996
1997 if (objlen == 0) {
1998 listDelNode(c->reply,listFirst(c->reply));
1999 continue;
2000 }
2001
2002 if (c->flags & REDIS_MASTER) {
2003 /* Don't reply to a master */
2004 nwritten = objlen - c->sentlen;
2005 } else {
2006 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2007 if (nwritten <= 0) break;
2008 }
2009 c->sentlen += nwritten;
2010 totwritten += nwritten;
2011 /* If we fully sent the object on head go to the next one */
2012 if (c->sentlen == objlen) {
2013 listDelNode(c->reply,listFirst(c->reply));
2014 c->sentlen = 0;
2015 }
2016 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2017 * bytes, in a single threaded server it's a good idea to serve
2018 * other clients as well, even if a very large request comes from
2019 * super fast link that is always able to accept data (in real world
2020 * scenario think about 'KEYS *' against the loopback interfae) */
2021 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2022 }
2023 if (nwritten == -1) {
2024 if (errno == EAGAIN) {
2025 nwritten = 0;
2026 } else {
2027 redisLog(REDIS_VERBOSE,
2028 "Error writing to client: %s", strerror(errno));
2029 freeClient(c);
2030 return;
2031 }
2032 }
2033 if (totwritten > 0) c->lastinteraction = time(NULL);
2034 if (listLength(c->reply) == 0) {
2035 c->sentlen = 0;
2036 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2037 }
2038 }
2039
2040 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2041 {
2042 redisClient *c = privdata;
2043 int nwritten = 0, totwritten = 0, objlen, willwrite;
2044 robj *o;
2045 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2046 int offset, ion = 0;
2047 REDIS_NOTUSED(el);
2048 REDIS_NOTUSED(mask);
2049
2050 listNode *node;
2051 while (listLength(c->reply)) {
2052 offset = c->sentlen;
2053 ion = 0;
2054 willwrite = 0;
2055
2056 /* fill-in the iov[] array */
2057 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2058 o = listNodeValue(node);
2059 objlen = sdslen(o->ptr);
2060
2061 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2062 break;
2063
2064 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2065 break; /* no more iovecs */
2066
2067 iov[ion].iov_base = ((char*)o->ptr) + offset;
2068 iov[ion].iov_len = objlen - offset;
2069 willwrite += objlen - offset;
2070 offset = 0; /* just for the first item */
2071 ion++;
2072 }
2073
2074 if(willwrite == 0)
2075 break;
2076
2077 /* write all collected blocks at once */
2078 if((nwritten = writev(fd, iov, ion)) < 0) {
2079 if (errno != EAGAIN) {
2080 redisLog(REDIS_VERBOSE,
2081 "Error writing to client: %s", strerror(errno));
2082 freeClient(c);
2083 return;
2084 }
2085 break;
2086 }
2087
2088 totwritten += nwritten;
2089 offset = c->sentlen;
2090
2091 /* remove written robjs from c->reply */
2092 while (nwritten && listLength(c->reply)) {
2093 o = listNodeValue(listFirst(c->reply));
2094 objlen = sdslen(o->ptr);
2095
2096 if(nwritten >= objlen - offset) {
2097 listDelNode(c->reply, listFirst(c->reply));
2098 nwritten -= objlen - offset;
2099 c->sentlen = 0;
2100 } else {
2101 /* partial write */
2102 c->sentlen += nwritten;
2103 break;
2104 }
2105 offset = 0;
2106 }
2107 }
2108
2109 if (totwritten > 0)
2110 c->lastinteraction = time(NULL);
2111
2112 if (listLength(c->reply) == 0) {
2113 c->sentlen = 0;
2114 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2115 }
2116 }
2117
2118 static struct redisCommand *lookupCommand(char *name) {
2119 int j = 0;
2120 while(cmdTable[j].name != NULL) {
2121 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2122 j++;
2123 }
2124 return NULL;
2125 }
2126
2127 /* resetClient prepare the client to process the next command */
2128 static void resetClient(redisClient *c) {
2129 freeClientArgv(c);
2130 c->bulklen = -1;
2131 c->multibulk = 0;
2132 }
2133
2134 /* Call() is the core of Redis execution of a command */
2135 static void call(redisClient *c, struct redisCommand *cmd) {
2136 long long dirty;
2137
2138 dirty = server.dirty;
2139 cmd->proc(c);
2140 dirty = server.dirty-dirty;
2141
2142 if (server.appendonly && dirty)
2143 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2144 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2145 listLength(server.slaves))
2146 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2147 if (listLength(server.monitors))
2148 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2149 server.stat_numcommands++;
2150 }
2151
2152 /* If this function gets called we already read a whole
2153 * command, argments are in the client argv/argc fields.
2154 * processCommand() execute the command or prepare the
2155 * server for a bulk read from the client.
2156 *
2157 * If 1 is returned the client is still alive and valid and
2158 * and other operations can be performed by the caller. Otherwise
2159 * if 0 is returned the client was destroied (i.e. after QUIT). */
2160 static int processCommand(redisClient *c) {
2161 struct redisCommand *cmd;
2162
2163 /* Free some memory if needed (maxmemory setting) */
2164 if (server.maxmemory) freeMemoryIfNeeded();
2165
2166 /* Handle the multi bulk command type. This is an alternative protocol
2167 * supported by Redis in order to receive commands that are composed of
2168 * multiple binary-safe "bulk" arguments. The latency of processing is
2169 * a bit higher but this allows things like multi-sets, so if this
2170 * protocol is used only for MSET and similar commands this is a big win. */
2171 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2172 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2173 if (c->multibulk <= 0) {
2174 resetClient(c);
2175 return 1;
2176 } else {
2177 decrRefCount(c->argv[c->argc-1]);
2178 c->argc--;
2179 return 1;
2180 }
2181 } else if (c->multibulk) {
2182 if (c->bulklen == -1) {
2183 if (((char*)c->argv[0]->ptr)[0] != '$') {
2184 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2185 resetClient(c);
2186 return 1;
2187 } else {
2188 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2189 decrRefCount(c->argv[0]);
2190 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2191 c->argc--;
2192 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2193 resetClient(c);
2194 return 1;
2195 }
2196 c->argc--;
2197 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2198 return 1;
2199 }
2200 } else {
2201 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2202 c->mbargv[c->mbargc] = c->argv[0];
2203 c->mbargc++;
2204 c->argc--;
2205 c->multibulk--;
2206 if (c->multibulk == 0) {
2207 robj **auxargv;
2208 int auxargc;
2209
2210 /* Here we need to swap the multi-bulk argc/argv with the
2211 * normal argc/argv of the client structure. */
2212 auxargv = c->argv;
2213 c->argv = c->mbargv;
2214 c->mbargv = auxargv;
2215
2216 auxargc = c->argc;
2217 c->argc = c->mbargc;
2218 c->mbargc = auxargc;
2219
2220 /* We need to set bulklen to something different than -1
2221 * in order for the code below to process the command without
2222 * to try to read the last argument of a bulk command as
2223 * a special argument. */
2224 c->bulklen = 0;
2225 /* continue below and process the command */
2226 } else {
2227 c->bulklen = -1;
2228 return 1;
2229 }
2230 }
2231 }
2232 /* -- end of multi bulk commands processing -- */
2233
2234 /* The QUIT command is handled as a special case. Normal command
2235 * procs are unable to close the client connection safely */
2236 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2237 freeClient(c);
2238 return 0;
2239 }
2240
2241 /* Now lookup the command and check ASAP about trivial error conditions
2242 * such wrong arity, bad command name and so forth. */
2243 cmd = lookupCommand(c->argv[0]->ptr);
2244 if (!cmd) {
2245 addReplySds(c,
2246 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2247 (char*)c->argv[0]->ptr));
2248 resetClient(c);
2249 return 1;
2250 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2251 (c->argc < -cmd->arity)) {
2252 addReplySds(c,
2253 sdscatprintf(sdsempty(),
2254 "-ERR wrong number of arguments for '%s' command\r\n",
2255 cmd->name));
2256 resetClient(c);
2257 return 1;
2258 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2259 /* This is a bulk command, we have to read the last argument yet. */
2260 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2261
2262 decrRefCount(c->argv[c->argc-1]);
2263 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2264 c->argc--;
2265 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2266 resetClient(c);
2267 return 1;
2268 }
2269 c->argc--;
2270 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2271 /* It is possible that the bulk read is already in the
2272 * buffer. Check this condition and handle it accordingly.
2273 * This is just a fast path, alternative to call processInputBuffer().
2274 * It's a good idea since the code is small and this condition
2275 * happens most of the times. */
2276 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2277 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2278 c->argc++;
2279 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2280 } else {
2281 /* Otherwise return... there is to read the last argument
2282 * from the socket. */
2283 return 1;
2284 }
2285 }
2286 /* Let's try to encode the bulk object to save space. */
2287 if (cmd->flags & REDIS_CMD_BULK)
2288 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2289
2290 /* Check if the user is authenticated */
2291 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2292 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2293 resetClient(c);
2294 return 1;
2295 }
2296
2297 /* Handle the maxmemory directive */
2298 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2299 zmalloc_used_memory() > server.maxmemory)
2300 {
2301 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2302 resetClient(c);
2303 return 1;
2304 }
2305
2306 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2307 if (dictSize(c->pubsub_channels) > 0 &&
2308 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2309 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2310 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2311 resetClient(c);
2312 return 1;
2313 }
2314
2315 /* Exec the command */
2316 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2317 queueMultiCommand(c,cmd);
2318 addReply(c,shared.queued);
2319 } else {
2320 if (server.vm_enabled && server.vm_max_threads > 0 &&
2321 blockClientOnSwappedKeys(cmd,c)) return 1;
2322 call(c,cmd);
2323 }
2324
2325 /* Prepare the client for the next command */
2326 resetClient(c);
2327 return 1;
2328 }
2329
2330 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2331 listNode *ln;
2332 listIter li;
2333 int outc = 0, j;
2334 robj **outv;
2335 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2336 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2337 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2338 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2339 robj *lenobj;
2340
2341 if (argc <= REDIS_STATIC_ARGS) {
2342 outv = static_outv;
2343 } else {
2344 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2345 }
2346
2347 lenobj = createObject(REDIS_STRING,
2348 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2349 lenobj->refcount = 0;
2350 outv[outc++] = lenobj;
2351 for (j = 0; j < argc; j++) {
2352 lenobj = createObject(REDIS_STRING,
2353 sdscatprintf(sdsempty(),"$%lu\r\n",
2354 (unsigned long) stringObjectLen(argv[j])));
2355 lenobj->refcount = 0;
2356 outv[outc++] = lenobj;
2357 outv[outc++] = argv[j];
2358 outv[outc++] = shared.crlf;
2359 }
2360
2361 /* Increment all the refcounts at start and decrement at end in order to
2362 * be sure to free objects if there is no slave in a replication state
2363 * able to be feed with commands */
2364 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2365 listRewind(slaves,&li);
2366 while((ln = listNext(&li))) {
2367 redisClient *slave = ln->value;
2368
2369 /* Don't feed slaves that are still waiting for BGSAVE to start */
2370 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2371
2372 /* Feed all the other slaves, MONITORs and so on */
2373 if (slave->slaveseldb != dictid) {
2374 robj *selectcmd;
2375
2376 switch(dictid) {
2377 case 0: selectcmd = shared.select0; break;
2378 case 1: selectcmd = shared.select1; break;
2379 case 2: selectcmd = shared.select2; break;
2380 case 3: selectcmd = shared.select3; break;
2381 case 4: selectcmd = shared.select4; break;
2382 case 5: selectcmd = shared.select5; break;
2383 case 6: selectcmd = shared.select6; break;
2384 case 7: selectcmd = shared.select7; break;
2385 case 8: selectcmd = shared.select8; break;
2386 case 9: selectcmd = shared.select9; break;
2387 default:
2388 selectcmd = createObject(REDIS_STRING,
2389 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2390 selectcmd->refcount = 0;
2391 break;
2392 }
2393 addReply(slave,selectcmd);
2394 slave->slaveseldb = dictid;
2395 }
2396 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2397 }
2398 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2399 if (outv != static_outv) zfree(outv);
2400 }
2401
2402 static void processInputBuffer(redisClient *c) {
2403 again:
2404 /* Before to process the input buffer, make sure the client is not
2405 * waitig for a blocking operation such as BLPOP. Note that the first
2406 * iteration the client is never blocked, otherwise the processInputBuffer
2407 * would not be called at all, but after the execution of the first commands
2408 * in the input buffer the client may be blocked, and the "goto again"
2409 * will try to reiterate. The following line will make it return asap. */
2410 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2411 if (c->bulklen == -1) {
2412 /* Read the first line of the query */
2413 char *p = strchr(c->querybuf,'\n');
2414 size_t querylen;
2415
2416 if (p) {
2417 sds query, *argv;
2418 int argc, j;
2419
2420 query = c->querybuf;
2421 c->querybuf = sdsempty();
2422 querylen = 1+(p-(query));
2423 if (sdslen(query) > querylen) {
2424 /* leave data after the first line of the query in the buffer */
2425 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2426 }
2427 *p = '\0'; /* remove "\n" */
2428 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2429 sdsupdatelen(query);
2430
2431 /* Now we can split the query in arguments */
2432 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2433 sdsfree(query);
2434
2435 if (c->argv) zfree(c->argv);
2436 c->argv = zmalloc(sizeof(robj*)*argc);
2437
2438 for (j = 0; j < argc; j++) {
2439 if (sdslen(argv[j])) {
2440 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2441 c->argc++;
2442 } else {
2443 sdsfree(argv[j]);
2444 }
2445 }
2446 zfree(argv);
2447 if (c->argc) {
2448 /* Execute the command. If the client is still valid
2449 * after processCommand() return and there is something
2450 * on the query buffer try to process the next command. */
2451 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2452 } else {
2453 /* Nothing to process, argc == 0. Just process the query
2454 * buffer if it's not empty or return to the caller */
2455 if (sdslen(c->querybuf)) goto again;
2456 }
2457 return;
2458 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2459 redisLog(REDIS_VERBOSE, "Client protocol error");
2460 freeClient(c);
2461 return;
2462 }
2463 } else {
2464 /* Bulk read handling. Note that if we are at this point
2465 the client already sent a command terminated with a newline,
2466 we are reading the bulk data that is actually the last
2467 argument of the command. */
2468 int qbl = sdslen(c->querybuf);
2469
2470 if (c->bulklen <= qbl) {
2471 /* Copy everything but the final CRLF as final argument */
2472 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2473 c->argc++;
2474 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2475 /* Process the command. If the client is still valid after
2476 * the processing and there is more data in the buffer
2477 * try to parse it. */
2478 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2479 return;
2480 }
2481 }
2482 }
2483
2484 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2485 redisClient *c = (redisClient*) privdata;
2486 char buf[REDIS_IOBUF_LEN];
2487 int nread;
2488 REDIS_NOTUSED(el);
2489 REDIS_NOTUSED(mask);
2490
2491 nread = read(fd, buf, REDIS_IOBUF_LEN);
2492 if (nread == -1) {
2493 if (errno == EAGAIN) {
2494 nread = 0;
2495 } else {
2496 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2497 freeClient(c);
2498 return;
2499 }
2500 } else if (nread == 0) {
2501 redisLog(REDIS_VERBOSE, "Client closed connection");
2502 freeClient(c);
2503 return;
2504 }
2505 if (nread) {
2506 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2507 c->lastinteraction = time(NULL);
2508 } else {
2509 return;
2510 }
2511 processInputBuffer(c);
2512 }
2513
2514 static int selectDb(redisClient *c, int id) {
2515 if (id < 0 || id >= server.dbnum)
2516 return REDIS_ERR;
2517 c->db = &server.db[id];
2518 return REDIS_OK;
2519 }
2520
2521 static void *dupClientReplyValue(void *o) {
2522 incrRefCount((robj*)o);
2523 return o;
2524 }
2525
/* List match method: returns 1 if compareStringObjects() reports the two
 * objects as equal (result 0), otherwise 0. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2529
2530 static redisClient *createClient(int fd) {
2531 redisClient *c = zmalloc(sizeof(*c));
2532
2533 anetNonBlock(NULL,fd);
2534 anetTcpNoDelay(NULL,fd);
2535 if (!c) return NULL;
2536 selectDb(c,0);
2537 c->fd = fd;
2538 c->querybuf = sdsempty();
2539 c->argc = 0;
2540 c->argv = NULL;
2541 c->bulklen = -1;
2542 c->multibulk = 0;
2543 c->mbargc = 0;
2544 c->mbargv = NULL;
2545 c->sentlen = 0;
2546 c->flags = 0;
2547 c->lastinteraction = time(NULL);
2548 c->authenticated = 0;
2549 c->replstate = REDIS_REPL_NONE;
2550 c->reply = listCreate();
2551 listSetFreeMethod(c->reply,decrRefCount);
2552 listSetDupMethod(c->reply,dupClientReplyValue);
2553 c->blockingkeys = NULL;
2554 c->blockingkeysnum = 0;
2555 c->io_keys = listCreate();
2556 listSetFreeMethod(c->io_keys,decrRefCount);
2557 c->pubsub_channels = dictCreate(&setDictType,NULL);
2558 c->pubsub_patterns = listCreate();
2559 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2560 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2561 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2562 readQueryFromClient, c) == AE_ERR) {
2563 freeClient(c);
2564 return NULL;
2565 }
2566 listAddNodeTail(server.clients,c);
2567 initClientMultiState(c);
2568 return c;
2569 }
2570
2571 static void addReply(redisClient *c, robj *obj) {
2572 if (listLength(c->reply) == 0 &&
2573 (c->replstate == REDIS_REPL_NONE ||
2574 c->replstate == REDIS_REPL_ONLINE) &&
2575 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2576 sendReplyToClient, c) == AE_ERR) return;
2577
2578 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2579 obj = dupStringObject(obj);
2580 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2581 }
2582 listAddNodeTail(c->reply,getDecodedObject(obj));
2583 }
2584
2585 static void addReplySds(redisClient *c, sds s) {
2586 robj *o = createObject(REDIS_STRING,s);
2587 addReply(c,o);
2588 decrRefCount(o);
2589 }
2590
2591 static void addReplyDouble(redisClient *c, double d) {
2592 char buf[128];
2593
2594 snprintf(buf,sizeof(buf),"%.17g",d);
2595 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2596 (unsigned long) strlen(buf),buf));
2597 }
2598
2599 static void addReplyLong(redisClient *c, long l) {
2600 char buf[128];
2601 size_t len;
2602
2603 if (l == 0) {
2604 addReply(c,shared.czero);
2605 return;
2606 } else if (l == 1) {
2607 addReply(c,shared.cone);
2608 return;
2609 }
2610 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2611 addReplySds(c,sdsnewlen(buf,len));
2612 }
2613
2614 static void addReplyLongLong(redisClient *c, long long ll) {
2615 char buf[128];
2616 size_t len;
2617
2618 if (ll == 0) {
2619 addReply(c,shared.czero);
2620 return;
2621 } else if (ll == 1) {
2622 addReply(c,shared.cone);
2623 return;
2624 }
2625 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2626 addReplySds(c,sdsnewlen(buf,len));
2627 }
2628
2629 static void addReplyUlong(redisClient *c, unsigned long ul) {
2630 char buf[128];
2631 size_t len;
2632
2633 if (ul == 0) {
2634 addReply(c,shared.czero);
2635 return;
2636 } else if (ul == 1) {
2637 addReply(c,shared.cone);
2638 return;
2639 }
2640 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2641 addReplySds(c,sdsnewlen(buf,len));
2642 }
2643
2644 static void addReplyBulkLen(redisClient *c, robj *obj) {
2645 size_t len;
2646
2647 if (obj->encoding == REDIS_ENCODING_RAW) {
2648 len = sdslen(obj->ptr);
2649 } else {
2650 long n = (long)obj->ptr;
2651
2652 /* Compute how many bytes will take this integer as a radix 10 string */
2653 len = 1;
2654 if (n < 0) {
2655 len++;
2656 n = -n;
2657 }
2658 while((n = n/10) != 0) {
2659 len++;
2660 }
2661 }
2662 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2663 }
2664
2665 static void addReplyBulk(redisClient *c, robj *obj) {
2666 addReplyBulkLen(c,obj);
2667 addReply(c,obj);
2668 addReply(c,shared.crlf);
2669 }
2670
2671 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2672 static void addReplyBulkCString(redisClient *c, char *s) {
2673 if (s == NULL) {
2674 addReply(c,shared.nullbulk);
2675 } else {
2676 robj *o = createStringObject(s,strlen(s));
2677 addReplyBulk(c,o);
2678 decrRefCount(o);
2679 }
2680 }
2681
2682 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2683 int cport, cfd;
2684 char cip[128];
2685 redisClient *c;
2686 REDIS_NOTUSED(el);
2687 REDIS_NOTUSED(mask);
2688 REDIS_NOTUSED(privdata);
2689
2690 cfd = anetAccept(server.neterr, fd, cip, &cport);
2691 if (cfd == AE_ERR) {
2692 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2693 return;
2694 }
2695 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2696 if ((c = createClient(cfd)) == NULL) {
2697 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2698 close(cfd); /* May be already closed, just ingore errors */
2699 return;
2700 }
2701 /* If maxclient directive is set and this is one client more... close the
2702 * connection. Note that we create the client instead to check before
2703 * for this condition, since now the socket is already set in nonblocking
2704 * mode and we can send an error for free using the Kernel I/O */
2705 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2706 char *err = "-ERR max number of clients reached\r\n";
2707
2708 /* That's a best effort error message, don't check write errors */
2709 if (write(c->fd,err,strlen(err)) == -1) {
2710 /* Nothing to do, Just to avoid the warning... */
2711 }
2712 freeClient(c);
2713 return;
2714 }
2715 server.stat_numconnections++;
2716 }
2717
2718 /* ======================= Redis objects implementation ===================== */
2719
2720 static robj *createObject(int type, void *ptr) {
2721 robj *o;
2722
2723 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2724 if (listLength(server.objfreelist)) {
2725 listNode *head = listFirst(server.objfreelist);
2726 o = listNodeValue(head);
2727 listDelNode(server.objfreelist,head);
2728 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2729 } else {
2730 if (server.vm_enabled) {
2731 pthread_mutex_unlock(&server.obj_freelist_mutex);
2732 o = zmalloc(sizeof(*o));
2733 } else {
2734 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2735 }
2736 }
2737 o->type = type;
2738 o->encoding = REDIS_ENCODING_RAW;
2739 o->ptr = ptr;
2740 o->refcount = 1;
2741 if (server.vm_enabled) {
2742 /* Note that this code may run in the context of an I/O thread
2743 * and accessing to server.unixtime in theory is an error
2744 * (no locks). But in practice this is safe, and even if we read
2745 * garbage Redis will not fail, as it's just a statistical info */
2746 o->vm.atime = server.unixtime;
2747 o->storage = REDIS_VM_MEMORY;
2748 }
2749 return o;
2750 }
2751
2752 static robj *createStringObject(char *ptr, size_t len) {
2753 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2754 }
2755
2756 static robj *dupStringObject(robj *o) {
2757 assert(o->encoding == REDIS_ENCODING_RAW);
2758 return createStringObject(o->ptr,sdslen(o->ptr));
2759 }
2760
2761 static robj *createListObject(void) {
2762 list *l = listCreate();
2763
2764 listSetFreeMethod(l,decrRefCount);
2765 return createObject(REDIS_LIST,l);
2766 }
2767
2768 static robj *createSetObject(void) {
2769 dict *d = dictCreate(&setDictType,NULL);
2770 return createObject(REDIS_SET,d);
2771 }
2772
2773 static robj *createHashObject(void) {
2774 /* All the Hashes start as zipmaps. Will be automatically converted
2775 * into hash tables if there are enough elements or big elements
2776 * inside. */
2777 unsigned char *zm = zipmapNew();
2778 robj *o = createObject(REDIS_HASH,zm);
2779 o->encoding = REDIS_ENCODING_ZIPMAP;
2780 return o;
2781 }
2782
2783 static robj *createZsetObject(void) {
2784 zset *zs = zmalloc(sizeof(*zs));
2785
2786 zs->dict = dictCreate(&zsetDictType,NULL);
2787 zs->zsl = zslCreate();
2788 return createObject(REDIS_ZSET,zs);
2789 }
2790
2791 static void freeStringObject(robj *o) {
2792 if (o->encoding == REDIS_ENCODING_RAW) {
2793 sdsfree(o->ptr);
2794 }
2795 }
2796
2797 static void freeListObject(robj *o) {
2798 listRelease((list*) o->ptr);
2799 }
2800
2801 static void freeSetObject(robj *o) {
2802 dictRelease((dict*) o->ptr);
2803 }
2804
2805 static void freeZsetObject(robj *o) {
2806 zset *zs = o->ptr;
2807
2808 dictRelease(zs->dict);
2809 zslFree(zs->zsl);
2810 zfree(zs);
2811 }
2812
2813 static void freeHashObject(robj *o) {
2814 switch (o->encoding) {
2815 case REDIS_ENCODING_HT:
2816 dictRelease((dict*) o->ptr);
2817 break;
2818 case REDIS_ENCODING_ZIPMAP:
2819 zfree(o->ptr);
2820 break;
2821 default:
2822 redisAssert(0);
2823 break;
2824 }
2825 }
2826
2827 static void incrRefCount(robj *o) {
2828 o->refcount++;
2829 }
2830
2831 static void decrRefCount(void *obj) {
2832 robj *o = obj;
2833
2834 /* Object is a key of a swapped out value, or in the process of being
2835 * loaded. */
2836 if (server.vm_enabled &&
2837 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2838 {
2839 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2840 redisAssert(o->type == REDIS_STRING);
2841 freeStringObject(o);
2842 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2843 pthread_mutex_lock(&server.obj_freelist_mutex);
2844 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2845 !listAddNodeHead(server.objfreelist,o))
2846 zfree(o);
2847 pthread_mutex_unlock(&server.obj_freelist_mutex);
2848 server.vm_stats_swapped_objects--;
2849 return;
2850 }
2851 /* Object is in memory, or in the process of being swapped out. */
2852 if (--(o->refcount) == 0) {
2853 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2854 vmCancelThreadedIOJob(obj);
2855 switch(o->type) {
2856 case REDIS_STRING: freeStringObject(o); break;
2857 case REDIS_LIST: freeListObject(o); break;
2858 case REDIS_SET: freeSetObject(o); break;
2859 case REDIS_ZSET: freeZsetObject(o); break;
2860 case REDIS_HASH: freeHashObject(o); break;
2861 default: redisAssert(0); break;
2862 }
2863 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2864 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2865 !listAddNodeHead(server.objfreelist,o))
2866 zfree(o);
2867 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2868 }
2869 }
2870
2871 static robj *lookupKey(redisDb *db, robj *key) {
2872 dictEntry *de = dictFind(db->dict,key);
2873 if (de) {
2874 robj *key = dictGetEntryKey(de);
2875 robj *val = dictGetEntryVal(de);
2876
2877 if (server.vm_enabled) {
2878 if (key->storage == REDIS_VM_MEMORY ||
2879 key->storage == REDIS_VM_SWAPPING)
2880 {
2881 /* If we were swapping the object out, stop it, this key
2882 * was requested. */
2883 if (key->storage == REDIS_VM_SWAPPING)
2884 vmCancelThreadedIOJob(key);
2885 /* Update the access time of the key for the aging algorithm. */
2886 key->vm.atime = server.unixtime;
2887 } else {
2888 int notify = (key->storage == REDIS_VM_LOADING);
2889
2890 /* Our value was swapped on disk. Bring it at home. */
2891 redisAssert(val == NULL);
2892 val = vmLoadObject(key);
2893 dictGetEntryVal(de) = val;
2894
2895 /* Clients blocked by the VM subsystem may be waiting for
2896 * this key... */
2897 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2898 }
2899 }
2900 return val;
2901 } else {
2902 return NULL;
2903 }
2904 }
2905
2906 static robj *lookupKeyRead(redisDb *db, robj *key) {
2907 expireIfNeeded(db,key);
2908 return lookupKey(db,key);
2909 }
2910
2911 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2912 deleteIfVolatile(db,key);
2913 return lookupKey(db,key);
2914 }
2915
2916 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2917 robj *o = lookupKeyRead(c->db, key);
2918 if (!o) addReply(c,reply);
2919 return o;
2920 }
2921
2922 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2923 robj *o = lookupKeyWrite(c->db, key);
2924 if (!o) addReply(c,reply);
2925 return o;
2926 }
2927
2928 static int checkType(redisClient *c, robj *o, int type) {
2929 if (o->type != type) {
2930 addReply(c,shared.wrongtypeerr);
2931 return 1;
2932 }
2933 return 0;
2934 }
2935
2936 static int deleteKey(redisDb *db, robj *key) {
2937 int retval;
2938
2939 /* We need to protect key from destruction: after the first dictDelete()
2940 * it may happen that 'key' is no longer valid if we don't increment
2941 * it's count. This may happen when we get the object reference directly
2942 * from the hash table with dictRandomKey() or dict iterators */
2943 incrRefCount(key);
2944 if (dictSize(db->expires)) dictDelete(db->expires,key);
2945 retval = dictDelete(db->dict,key);
2946 decrRefCount(key);
2947
2948 return retval == DICT_OK;
2949 }
2950
2951 /* Check if the nul-terminated string 's' can be represented by a long
2952 * (that is, is a number that fits into long without any other space or
2953 * character before or after the digits).
2954 *
2955 * If so, the function returns REDIS_OK and *longval is set to the value
2956 * of the number. Otherwise REDIS_ERR is returned */
2957 static int isStringRepresentableAsLong(sds s, long *longval) {
2958 char buf[32], *endptr;
2959 long value;
2960 int slen;
2961
2962 value = strtol(s, &endptr, 10);
2963 if (endptr[0] != '\0') return REDIS_ERR;
2964 slen = snprintf(buf,32,"%ld",value);
2965
2966 /* If the number converted back into a string is not identical
2967 * then it's not possible to encode the string as integer */
2968 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2969 if (longval) *longval = value;
2970 return REDIS_OK;
2971 }
2972
2973 /* Try to encode a string object in order to save space */
2974 static robj *tryObjectEncoding(robj *o) {
2975 long value;
2976 sds s = o->ptr;
2977
2978 if (o->encoding != REDIS_ENCODING_RAW)
2979 return o; /* Already encoded */
2980
2981 /* It's not safe to encode shared objects: shared objects can be shared
2982 * everywhere in the "object space" of Redis. Encoded objects can only
2983 * appear as "values" (and not, for instance, as keys) */
2984 if (o->refcount > 1) return o;
2985
2986 /* Currently we try to encode only strings */
2987 redisAssert(o->type == REDIS_STRING);
2988
2989 /* Check if we can represent this string as a long integer */
2990 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
2991
2992 /* Ok, this object can be encoded */
2993 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2994 decrRefCount(o);
2995 incrRefCount(shared.integers[value]);
2996 return shared.integers[value];
2997 } else {
2998 o->encoding = REDIS_ENCODING_INT;
2999 sdsfree(o->ptr);
3000 o->ptr = (void*) value;
3001 return o;
3002 }
3003 }
3004
3005 /* Get a decoded version of an encoded object (returned as a new object).
3006 * If the object is already raw-encoded just increment the ref count. */
3007 static robj *getDecodedObject(robj *o) {
3008 robj *dec;
3009
3010 if (o->encoding == REDIS_ENCODING_RAW) {
3011 incrRefCount(o);
3012 return o;
3013 }
3014 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3015 char buf[32];
3016
3017 snprintf(buf,32,"%ld",(long)o->ptr);
3018 dec = createStringObject(buf,strlen(buf));
3019 return dec;
3020 } else {
3021 redisAssert(1 != 1);
3022 }
3023 }
3024
3025 /* Compare two string objects via strcmp() or alike.
3026 * Note that the objects may be integer-encoded. In such a case we
3027 * use snprintf() to get a string representation of the numbers on the stack
3028 * and compare the strings, it's much faster than calling getDecodedObject().
3029 *
3030 * Important note: if objects are not integer encoded, but binary-safe strings,
3031 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3032 * binary safe. */
3033 static int compareStringObjects(robj *a, robj *b) {
3034 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3035 char bufa[128], bufb[128], *astr, *bstr;
3036 int bothsds = 1;
3037
3038 if (a == b) return 0;
3039 if (a->encoding != REDIS_ENCODING_RAW) {
3040 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3041 astr = bufa;
3042 bothsds = 0;
3043 } else {
3044 astr = a->ptr;
3045 }
3046 if (b->encoding != REDIS_ENCODING_RAW) {
3047 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3048 bstr = bufb;
3049 bothsds = 0;
3050 } else {
3051 bstr = b->ptr;
3052 }
3053 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3054 }
3055
3056 static size_t stringObjectLen(robj *o) {
3057 redisAssert(o->type == REDIS_STRING);
3058 if (o->encoding == REDIS_ENCODING_RAW) {
3059 return sdslen(o->ptr);
3060 } else {
3061 char buf[32];
3062
3063 return snprintf(buf,32,"%ld",(long)o->ptr);
3064 }
3065 }
3066
3067 static int getDoubleFromObject(redisClient *c, robj *o, double *value) {
3068 double parsedValue;
3069 char *eptr = NULL;
3070
3071 if (o && o->type != REDIS_STRING) {
3072 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3073 return REDIS_ERR;
3074 }
3075
3076 if (o == NULL)
3077 parsedValue = 0;
3078 else if (o->encoding == REDIS_ENCODING_RAW)
3079 parsedValue = strtod(o->ptr, &eptr);
3080 else if (o->encoding == REDIS_ENCODING_INT)
3081 parsedValue = (long)o->ptr;
3082 else
3083 redisAssert(1 != 1);
3084
3085 if (eptr != NULL && *eptr != '\0') {
3086 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3087 return REDIS_ERR;
3088 }
3089
3090 *value = parsedValue;
3091
3092 return REDIS_OK;
3093 }
3094
3095 static int getLongLongFromObject(redisClient *c, robj *o, long long *value) {
3096 long long parsedValue;
3097 char *eptr = NULL;
3098
3099 if (o && o->type != REDIS_STRING) {
3100 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3101 return REDIS_ERR;
3102 }
3103
3104 if (o == NULL)
3105 parsedValue = 0;
3106 else if (o->encoding == REDIS_ENCODING_RAW)
3107 parsedValue = strtoll(o->ptr, &eptr, 10);
3108 else if (o->encoding == REDIS_ENCODING_INT)
3109 parsedValue = (long)o->ptr;
3110 else
3111 redisAssert(1 != 1);
3112
3113 if (eptr != NULL && *eptr != '\0') {
3114 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3115 return REDIS_ERR;
3116 }
3117
3118 *value = parsedValue;
3119
3120 return REDIS_OK;
3121 }
3122
3123 static int getLongFromObject(redisClient *c, robj *o, long *value) {
3124 long long actualValue;
3125
3126 if (getLongLongFromObject(c, o, &actualValue) != REDIS_OK) return REDIS_ERR;
3127
3128 if (actualValue < LONG_MIN || actualValue > LONG_MAX) {
3129 addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
3130 return REDIS_ERR;
3131 }
3132
3133 *value = actualValue;
3134
3135 return REDIS_OK;
3136 }
3137
3138 /*============================ RDB saving/loading =========================== */
3139
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3144
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte order
 * (the value is truncated to int32_t). Returns 0 on success, -1 on write
 * error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3150
3151 /* check rdbLoadLen() comments for more info */
3152 static int rdbSaveLen(FILE *fp, uint32_t len) {
3153 unsigned char buf[2];
3154
3155 if (len < (1<<6)) {
3156 /* Save a 6 bit len */
3157 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3158 if (fwrite(buf,1,1,fp) == 0) return -1;
3159 } else if (len < (1<<14)) {
3160 /* Save a 14 bit len */
3161 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3162 buf[1] = len&0xFF;
3163 if (fwrite(buf,2,1,fp) == 0) return -1;
3164 } else {
3165 /* Save a 32 bit len */
3166 buf[0] = (REDIS_RDB_32BITLEN<<6);
3167 if (fwrite(buf,1,1,fp) == 0) return -1;
3168 len = htonl(len);
3169 if (fwrite(&len,4,1,fp) == 0) return -1;
3170 }
3171 return 0;
3172 }
3173
3174 /* String objects in the form "2391" "-100" without any space and with a
3175 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3176 * encoded as integers to save space */
3177 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3178 long long value;
3179 char *endptr, buf[32];
3180
3181 /* Check if it's possible to encode this value as a number */
3182 value = strtoll(s, &endptr, 10);
3183 if (endptr[0] != '\0') return 0;
3184 snprintf(buf,32,"%lld",value);
3185
3186 /* If the number converted back into a string is not identical
3187 * then it's not possible to encode the string as integer */
3188 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3189
3190 /* Finally check if it fits in our ranges */
3191 if (value >= -(1<<7) && value <= (1<<7)-1) {
3192 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3193 enc[1] = value&0xFF;
3194 return 2;
3195 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3196 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3197 enc[1] = value&0xFF;
3198 enc[2] = (value>>8)&0xFF;
3199 return 3;
3200 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3201 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3202 enc[1] = value&0xFF;
3203 enc[2] = (value>>8)&0xFF;
3204 enc[3] = (value>>16)&0xFF;
3205 enc[4] = (value>>24)&0xFF;
3206 return 5;
3207 } else {
3208 return 0;
3209 }
3210 }
3211
3212 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3213 size_t comprlen, outlen;
3214 unsigned char byte;
3215 void *out;
3216
3217 /* We require at least four bytes compression for this to be worth it */
3218 if (len <= 4) return 0;
3219 outlen = len-4;
3220 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3221 comprlen = lzf_compress(s, len, out, outlen);
3222 if (comprlen == 0) {
3223 zfree(out);
3224 return 0;
3225 }
3226 /* Data compressed! Let's save it on disk */
3227 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3228 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3229 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3230 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3231 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3232 zfree(out);
3233 return comprlen;
3234
3235 writeerr:
3236 zfree(out);
3237 return -1;
3238 }
3239
3240 /* Save a string objet as [len][data] on disk. If the object is a string
3241 * representation of an integer value we try to safe it in a special form */
3242 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3243 int enclen;
3244
3245 /* Try integer encoding */
3246 if (len <= 11) {
3247 unsigned char buf[5];
3248 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3249 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3250 return 0;
3251 }
3252 }
3253
3254 /* Try LZF compression - under 20 bytes it's unable to compress even
3255 * aaaaaaaaaaaaaaaaaa so skip it */
3256 if (server.rdbcompression && len > 20) {
3257 int retval;
3258
3259 retval = rdbSaveLzfStringObject(fp,s,len);
3260 if (retval == -1) return -1;
3261 if (retval > 0) return 0;
3262 /* retval == 0 means data can't be compressed, save the old way */
3263 }
3264
3265 /* Store verbatim */
3266 if (rdbSaveLen(fp,len) == -1) return -1;
3267 if (len && fwrite(s,len,1,fp) == 0) return -1;
3268 return 0;
3269 }
3270
3271 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3272 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3273 int retval;
3274
3275 /* Avoid incr/decr ref count business when possible.
3276 * This plays well with copy-on-write given that we are probably
3277 * in a child process (BGSAVE). Also this makes sure key objects
3278 * of swapped objects are not incRefCount-ed (an assert does not allow
3279 * this in order to avoid bugs) */
3280 if (obj->encoding != REDIS_ENCODING_RAW) {
3281 obj = getDecodedObject(obj);
3282 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3283 decrRefCount(obj);
3284 } else {
3285 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3286 }
3287 return retval;
3288 }
3289
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values, written alone, in
 * order to specify the following conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total = 1; /* the prefix byte is always written */

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total += out[0];
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3316
3317 /* Save a Redis object. */
3318 static int rdbSaveObject(FILE *fp, robj *o) {
3319 if (o->type == REDIS_STRING) {
3320 /* Save a string value */
3321 if (rdbSaveStringObject(fp,o) == -1) return -1;
3322 } else if (o->type == REDIS_LIST) {
3323 /* Save a list value */
3324 list *list = o->ptr;
3325 listIter li;
3326 listNode *ln;
3327
3328 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3329 listRewind(list,&li);
3330 while((ln = listNext(&li))) {
3331 robj *eleobj = listNodeValue(ln);
3332
3333 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3334 }
3335 } else if (o->type == REDIS_SET) {
3336 /* Save a set value */
3337 dict *set = o->ptr;
3338 dictIterator *di = dictGetIterator(set);
3339 dictEntry *de;
3340
3341 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3342 while((de = dictNext(di)) != NULL) {
3343 robj *eleobj = dictGetEntryKey(de);
3344
3345 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3346 }
3347 dictReleaseIterator(di);
3348 } else if (o->type == REDIS_ZSET) {
3349 /* Save a set value */
3350 zset *zs = o->ptr;
3351 dictIterator *di = dictGetIterator(zs->dict);
3352 dictEntry *de;
3353
3354 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3355 while((de = dictNext(di)) != NULL) {
3356 robj *eleobj = dictGetEntryKey(de);
3357 double *score = dictGetEntryVal(de);
3358
3359 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3360 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3361 }
3362 dictReleaseIterator(di);
3363 } else if (o->type == REDIS_HASH) {
3364 /* Save a hash value */
3365 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3366 unsigned char *p = zipmapRewind(o->ptr);
3367 unsigned int count = zipmapLen(o->ptr);
3368 unsigned char *key, *val;
3369 unsigned int klen, vlen;
3370
3371 if (rdbSaveLen(fp,count) == -1) return -1;
3372 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3373 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3374 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3375 }
3376 } else {
3377 dictIterator *di = dictGetIterator(o->ptr);
3378 dictEntry *de;
3379
3380 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3381 while((de = dictNext(di)) != NULL) {
3382 robj *key = dictGetEntryKey(de);
3383 robj *val = dictGetEntryVal(de);
3384
3385 if (rdbSaveStringObject(fp,key) == -1) return -1;
3386 if (rdbSaveStringObject(fp,val) == -1) return -1;
3387 }
3388 dictReleaseIterator(di);
3389 }
3390 } else {
3391 redisAssert(0);
3392 }
3393 return 0;
3394 }
3395
3396 /* Return the length the object will have on disk if saved with
3397 * the rdbSaveObject() function. Currently we use a trick to get
3398 * this length with very little changes to the code. In the future
3399 * we could switch to a faster solution. */
3400 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3401 if (fp == NULL) fp = server.devnull;
3402 rewind(fp);
3403 assert(rdbSaveObject(fp,o) != 1);
3404 return ftello(fp);
3405 }
3406
3407 /* Return the number of pages required to save this object in the swap file */
3408 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3409 off_t bytes = rdbSavedObjectLen(o,fp);
3410
3411 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3412 }
3413
3414 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3415 static int rdbSave(char *filename) {
3416 dictIterator *di = NULL;
3417 dictEntry *de;
3418 FILE *fp;
3419 char tmpfile[256];
3420 int j;
3421 time_t now = time(NULL);
3422
3423 /* Wait for I/O therads to terminate, just in case this is a
3424 * foreground-saving, to avoid seeking the swap file descriptor at the
3425 * same time. */
3426 if (server.vm_enabled)
3427 waitEmptyIOJobsQueue();
3428
3429 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3430 fp = fopen(tmpfile,"w");
3431 if (!fp) {
3432 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3433 return REDIS_ERR;
3434 }
3435 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3436 for (j = 0; j < server.dbnum; j++) {
3437 redisDb *db = server.db+j;
3438 dict *d = db->dict;
3439 if (dictSize(d) == 0) continue;
3440 di = dictGetIterator(d);
3441 if (!di) {
3442 fclose(fp);
3443 return REDIS_ERR;
3444 }
3445
3446 /* Write the SELECT DB opcode */
3447 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3448 if (rdbSaveLen(fp,j) == -1) goto werr;
3449
3450 /* Iterate this DB writing every entry */
3451 while((de = dictNext(di)) != NULL) {
3452 robj *key = dictGetEntryKey(de);
3453 robj *o = dictGetEntryVal(de);
3454 time_t expiretime = getExpire(db,key);
3455
3456 /* Save the expire time */
3457 if (expiretime != -1) {
3458 /* If this key is already expired skip it */
3459 if (expiretime < now) continue;
3460 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3461 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3462 }
3463 /* Save the key and associated value. This requires special
3464 * handling if the value is swapped out. */
3465 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3466 key->storage == REDIS_VM_SWAPPING) {
3467 /* Save type, key, value */
3468 if (rdbSaveType(fp,o->type) == -1) goto werr;
3469 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3470 if (rdbSaveObject(fp,o) == -1) goto werr;
3471 } else {
3472 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3473 robj *po;
3474 /* Get a preview of the object in memory */
3475 po = vmPreviewObject(key);
3476 /* Save type, key, value */
3477 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3478 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3479 if (rdbSaveObject(fp,po) == -1) goto werr;
3480 /* Remove the loaded object from memory */
3481 decrRefCount(po);
3482 }
3483 }
3484 dictReleaseIterator(di);
3485 }
3486 /* EOF opcode */
3487 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3488
3489 /* Make sure data will not remain on the OS's output buffers */
3490 fflush(fp);
3491 fsync(fileno(fp));
3492 fclose(fp);
3493
3494 /* Use RENAME to make sure the DB file is changed atomically only
3495 * if the generate DB file is ok. */
3496 if (rename(tmpfile,filename) == -1) {
3497 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3498 unlink(tmpfile);
3499 return REDIS_ERR;
3500 }
3501 redisLog(REDIS_NOTICE,"DB saved on disk");
3502 server.dirty = 0;
3503 server.lastsave = time(NULL);
3504 return REDIS_OK;
3505
3506 werr:
3507 fclose(fp);
3508 unlink(tmpfile);
3509 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3510 if (di) dictReleaseIterator(di);
3511 return REDIS_ERR;
3512 }
3513
3514 static int rdbSaveBackground(char *filename) {
3515 pid_t childpid;
3516
3517 if (server.bgsavechildpid != -1) return REDIS_ERR;
3518 if (server.vm_enabled) waitEmptyIOJobsQueue();
3519 if ((childpid = fork()) == 0) {
3520 /* Child */
3521 if (server.vm_enabled) vmReopenSwapFile();
3522 close(server.fd);
3523 if (rdbSave(filename) == REDIS_OK) {
3524 _exit(0);
3525 } else {
3526 _exit(1);
3527 }
3528 } else {
3529 /* Parent */
3530 if (childpid == -1) {
3531 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3532 strerror(errno));
3533 return REDIS_ERR;
3534 }
3535 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3536 server.bgsavechildpid = childpid;
3537 updateDictResizePolicy();
3538 return REDIS_OK;
3539 }
3540 return REDIS_OK; /* unreached */
3541 }
3542
/* Remove the temporary dump file "temp-<childpid>.rdb" from the current
 * directory. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3549
/* Read a one byte type from 'fp'. Returns the byte as a value in [0,255],
 * or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3555
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in host byte
 * order. Returns the time, or -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 0) return (time_t) stamp;
    return -1;
}
3561
3562 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3563 * of this file for a description of how this are stored on disk.
3564 *
3565 * isencoded is set to 1 if the readed length is not actually a length but
3566 * an "encoding type", check the above comments for more info */
3567 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3568 unsigned char buf[2];
3569 uint32_t len;
3570 int type;
3571
3572 if (isencoded) *isencoded = 0;
3573 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3574 type = (buf[0]&0xC0)>>6;
3575 if (type == REDIS_RDB_6BITLEN) {
3576 /* Read a 6 bit len */
3577 return buf[0]&0x3F;
3578 } else if (type == REDIS_RDB_ENCVAL) {
3579 /* Read a 6 bit len encoding type */
3580 if (isencoded) *isencoded = 1;
3581 return buf[0]&0x3F;
3582 } else if (type == REDIS_RDB_14BITLEN) {
3583 /* Read a 14 bit len */
3584 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3585 return ((buf[0]&0x3F)<<8)|buf[1];
3586 } else {
3587 /* Read a 32 bit len */
3588 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3589 return ntohl(len);
3590 }
3591 }
3592
3593 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3594 unsigned char enc[4];
3595 long long val;
3596
3597 if (enctype == REDIS_RDB_ENC_INT8) {
3598 if (fread(enc,1,1,fp) == 0) return NULL;
3599 val = (signed char)enc[0];
3600 } else if (enctype == REDIS_RDB_ENC_INT16) {
3601 uint16_t v;
3602 if (fread(enc,2,1,fp) == 0) return NULL;
3603 v = enc[0]|(enc[1]<<8);
3604 val = (int16_t)v;
3605 } else if (enctype == REDIS_RDB_ENC_INT32) {
3606 uint32_t v;
3607 if (fread(enc,4,1,fp) == 0) return NULL;
3608 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3609 val = (int32_t)v;
3610 } else {
3611 val = 0; /* anti-warning */
3612 redisAssert(0);
3613 }
3614 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3615 }
3616
3617 static robj *rdbLoadLzfStringObject(FILE*fp) {
3618 unsigned int len, clen;
3619 unsigned char *c = NULL;
3620 sds val = NULL;
3621
3622 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3623 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3624 if ((c = zmalloc(clen)) == NULL) goto err;
3625 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3626 if (fread(c,clen,1,fp) == 0) goto err;
3627 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3628 zfree(c);
3629 return createObject(REDIS_STRING,val);
3630 err:
3631 zfree(c);
3632 sdsfree(val);
3633 return NULL;
3634 }
3635
3636 static robj *rdbLoadStringObject(FILE*fp) {
3637 int isencoded;
3638 uint32_t len;
3639 sds val;
3640
3641 len = rdbLoadLen(fp,&isencoded);
3642 if (isencoded) {
3643 switch(len) {
3644 case REDIS_RDB_ENC_INT8:
3645 case REDIS_RDB_ENC_INT16:
3646 case REDIS_RDB_ENC_INT32:
3647 return rdbLoadIntegerObject(fp,len);
3648 case REDIS_RDB_ENC_LZF:
3649 return rdbLoadLzfStringObject(fp);
3650 default:
3651 redisAssert(0);
3652 }
3653 }
3654
3655 if (len == REDIS_RDB_LENERR) return NULL;
3656 val = sdsnewlen(NULL,len);
3657 if (len && fread(val,len,1,fp) == 0) {
3658 sdsfree(val);
3659 return NULL;
3660 }
3661 return createObject(REDIS_STRING,val);
3662 }
3663
3664 /* For information about double serialization check rdbSaveDoubleValue() */
3665 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3666 char buf[128];
3667 unsigned char len;
3668
3669 if (fread(&len,1,1,fp) == 0) return -1;
3670 switch(len) {
3671 case 255: *val = R_NegInf; return 0;
3672 case 254: *val = R_PosInf; return 0;
3673 case 253: *val = R_Nan; return 0;
3674 default:
3675 if (fread(buf,len,1,fp) == 0) return -1;
3676 buf[len] = '\0';
3677 sscanf(buf, "%lg", val);
3678 return 0;
3679 }
3680 }
3681
3682 /* Load a Redis object of the specified type from the specified file.
3683 * On success a newly allocated object is returned, otherwise NULL. */
3684 static robj *rdbLoadObject(int type, FILE *fp) {
3685 robj *o;
3686
3687 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3688 if (type == REDIS_STRING) {
3689 /* Read string value */
3690 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3691 o = tryObjectEncoding(o);
3692 } else if (type == REDIS_LIST || type == REDIS_SET) {
3693 /* Read list/set value */
3694 uint32_t listlen;
3695
3696 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3697 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3698 /* It's faster to expand the dict to the right size asap in order
3699 * to avoid rehashing */
3700 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3701 dictExpand(o->ptr,listlen);
3702 /* Load every single element of the list/set */
3703 while(listlen--) {
3704 robj *ele;
3705
3706 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3707 ele = tryObjectEncoding(ele);
3708 if (type == REDIS_LIST) {
3709 listAddNodeTail((list*)o->ptr,ele);
3710 } else {
3711 dictAdd((dict*)o->ptr,ele,NULL);
3712 }
3713 }
3714 } else if (type == REDIS_ZSET) {
3715 /* Read list/set value */
3716 size_t zsetlen;
3717 zset *zs;
3718
3719 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3720 o = createZsetObject();
3721 zs = o->ptr;
3722 /* Load every single element of the list/set */
3723 while(zsetlen--) {
3724 robj *ele;
3725 double *score = zmalloc(sizeof(double));
3726
3727 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3728 ele = tryObjectEncoding(ele);
3729 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3730 dictAdd(zs->dict,ele,score);
3731 zslInsert(zs->zsl,*score,ele);
3732 incrRefCount(ele); /* added to skiplist */
3733 }
3734 } else if (type == REDIS_HASH) {
3735 size_t hashlen;
3736
3737 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3738 o = createHashObject();
3739 /* Too many entries? Use an hash table. */
3740 if (hashlen > server.hash_max_zipmap_entries)
3741 convertToRealHash(o);
3742 /* Load every key/value, then set it into the zipmap or hash
3743 * table, as needed. */
3744 while(hashlen--) {
3745 robj *key, *val;
3746
3747 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3748 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3749 /* If we are using a zipmap and there are too big values
3750 * the object is converted to real hash table encoding. */
3751 if (o->encoding != REDIS_ENCODING_HT &&
3752 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3753 sdslen(val->ptr) > server.hash_max_zipmap_value))
3754 {
3755 convertToRealHash(o);
3756 }
3757
3758 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3759 unsigned char *zm = o->ptr;
3760
3761 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3762 val->ptr,sdslen(val->ptr),NULL);
3763 o->ptr = zm;
3764 decrRefCount(key);
3765 decrRefCount(val);
3766 } else {
3767 key = tryObjectEncoding(key);
3768 val = tryObjectEncoding(val);
3769 dictAdd((dict*)o->ptr,key,val);
3770 }
3771 }
3772 } else {
3773 redisAssert(0);
3774 }
3775 return o;
3776 }
3777
3778 static int rdbLoad(char *filename) {
3779 FILE *fp;
3780 robj *keyobj = NULL;
3781 uint32_t dbid;
3782 int type, retval, rdbver;
3783 dict *d = server.db[0].dict;
3784 redisDb *db = server.db+0;
3785 char buf[1024];
3786 time_t expiretime = -1, now = time(NULL);
3787 long long loadedkeys = 0;
3788
3789 fp = fopen(filename,"r");
3790 if (!fp) return REDIS_ERR;
3791 if (fread(buf,9,1,fp) == 0) goto eoferr;
3792 buf[9] = '\0';
3793 if (memcmp(buf,"REDIS",5) != 0) {
3794 fclose(fp);
3795 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3796 return REDIS_ERR;
3797 }
3798 rdbver = atoi(buf+5);
3799 if (rdbver != 1) {
3800 fclose(fp);
3801 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3802 return REDIS_ERR;
3803 }
3804 while(1) {
3805 robj *o;
3806
3807 /* Read type. */
3808 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3809 if (type == REDIS_EXPIRETIME) {
3810 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3811 /* We read the time so we need to read the object type again */
3812 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3813 }
3814 if (type == REDIS_EOF) break;
3815 /* Handle SELECT DB opcode as a special case */
3816 if (type == REDIS_SELECTDB) {
3817 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3818 goto eoferr;
3819 if (dbid >= (unsigned)server.dbnum) {
3820 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3821 exit(1);
3822 }
3823 db = server.db+dbid;
3824 d = db->dict;
3825 continue;
3826 }
3827 /* Read key */
3828 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3829 /* Read value */
3830 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3831 /* Add the new object in the hash table */
3832 retval = dictAdd(d,keyobj,o);
3833 if (retval == DICT_ERR) {
3834 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3835 exit(1);
3836 }
3837 /* Set the expire time if needed */
3838 if (expiretime != -1) {
3839 setExpire(db,keyobj,expiretime);
3840 /* Delete this key if already expired */
3841 if (expiretime < now) deleteKey(db,keyobj);
3842 expiretime = -1;
3843 }
3844 keyobj = o = NULL;
3845 /* Handle swapping while loading big datasets when VM is on */
3846 loadedkeys++;
3847 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3848 while (zmalloc_used_memory() > server.vm_max_memory) {
3849 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3850 }
3851 }
3852 }
3853 fclose(fp);
3854 return REDIS_OK;
3855
3856 eoferr: /* unexpected end of file is handled here with a fatal exit */
3857 if (keyobj) decrRefCount(keyobj);
3858 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3859 exit(1);
3860 return REDIS_ERR; /* Just to avoid warning */
3861 }
3862
3863 /*================================== Commands =============================== */
3864
3865 static void authCommand(redisClient *c) {
3866 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3867 c->authenticated = 1;
3868 addReply(c,shared.ok);
3869 } else {
3870 c->authenticated = 0;
3871 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3872 }
3873 }
3874
3875 static void pingCommand(redisClient *c) {
3876 addReply(c,shared.pong);
3877 }
3878
3879 static void echoCommand(redisClient *c) {
3880 addReplyBulk(c,c->argv[1]);
3881 }
3882
3883 /*=================================== Strings =============================== */
3884
3885 static void setGenericCommand(redisClient *c, int nx) {
3886 int retval;
3887
3888 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3889 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3890 if (retval == DICT_ERR) {
3891 if (!nx) {
3892 /* If the key is about a swapped value, we want a new key object
3893 * to overwrite the old. So we delete the old key in the database.
3894 * This will also make sure that swap pages about the old object
3895 * will be marked as free. */
3896 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3897 incrRefCount(c->argv[1]);
3898 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3899 incrRefCount(c->argv[2]);
3900 } else {
3901 addReply(c,shared.czero);
3902 return;
3903 }
3904 } else {
3905 incrRefCount(c->argv[1]);
3906 incrRefCount(c->argv[2]);
3907 }
3908 server.dirty++;
3909 removeExpire(c->db,c->argv[1]);
3910 addReply(c, nx ? shared.cone : shared.ok);
3911 }
3912
3913 static void setCommand(redisClient *c) {
3914 setGenericCommand(c,0);
3915 }
3916
3917 static void setnxCommand(redisClient *c) {
3918 setGenericCommand(c,1);
3919 }
3920
3921 static int getGenericCommand(redisClient *c) {
3922 robj *o;
3923
3924 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3925 return REDIS_OK;
3926
3927 if (o->type != REDIS_STRING) {
3928 addReply(c,shared.wrongtypeerr);
3929 return REDIS_ERR;
3930 } else {
3931 addReplyBulk(c,o);
3932 return REDIS_OK;
3933 }
3934 }
3935
3936 static void getCommand(redisClient *c) {
3937 getGenericCommand(c);
3938 }
3939
3940 static void getsetCommand(redisClient *c) {
3941 if (getGenericCommand(c) == REDIS_ERR) return;
3942 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3943 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3944 } else {
3945 incrRefCount(c->argv[1]);
3946 }
3947 incrRefCount(c->argv[2]);
3948 server.dirty++;
3949 removeExpire(c->db,c->argv[1]);
3950 }
3951
3952 static void mgetCommand(redisClient *c) {
3953 int j;
3954
3955 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3956 for (j = 1; j < c->argc; j++) {
3957 robj *o = lookupKeyRead(c->db,c->argv[j]);
3958 if (o == NULL) {
3959 addReply(c,shared.nullbulk);
3960 } else {
3961 if (o->type != REDIS_STRING) {
3962 addReply(c,shared.nullbulk);
3963 } else {
3964 addReplyBulk(c,o);
3965 }
3966 }
3967 }
3968 }
3969
3970 static void msetGenericCommand(redisClient *c, int nx) {
3971 int j, busykeys = 0;
3972
3973 if ((c->argc % 2) == 0) {
3974 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3975 return;
3976 }
3977 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3978 * set nothing at all if at least one already key exists. */
3979 if (nx) {
3980 for (j = 1; j < c->argc; j += 2) {
3981 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3982 busykeys++;
3983 }
3984 }
3985 }
3986 if (busykeys) {
3987 addReply(c, shared.czero);
3988 return;
3989 }
3990
3991 for (j = 1; j < c->argc; j += 2) {
3992 int retval;
3993
3994 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
3995 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3996 if (retval == DICT_ERR) {
3997 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3998 incrRefCount(c->argv[j+1]);
3999 } else {
4000 incrRefCount(c->argv[j]);
4001 incrRefCount(c->argv[j+1]);
4002 }
4003 removeExpire(c->db,c->argv[j]);
4004 }
4005 server.dirty += (c->argc-1)/2;
4006 addReply(c, nx ? shared.cone : shared.ok);
4007 }
4008
4009 static void msetCommand(redisClient *c) {
4010 msetGenericCommand(c,0);
4011 }
4012
4013 static void msetnxCommand(redisClient *c) {
4014 msetGenericCommand(c,1);
4015 }
4016
4017 static void incrDecrCommand(redisClient *c, long long incr) {
4018 long long value;
4019 int retval;
4020 robj *o;
4021
4022 o = lookupKeyWrite(c->db,c->argv[1]);
4023
4024 if (getLongLongFromObject(c, o, &value) != REDIS_OK) return;
4025
4026 value += incr;
4027 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4028 o = tryObjectEncoding(o);
4029 retval = dictAdd(c->db->dict,c->argv[1],o);
4030 if (retval == DICT_ERR) {
4031 dictReplace(c->db->dict,c->argv[1],o);
4032 removeExpire(c->db,c->argv[1]);
4033 } else {
4034 incrRefCount(c->argv[1]);
4035 }
4036 server.dirty++;
4037 addReply(c,shared.colon);
4038 addReply(c,o);
4039 addReply(c,shared.crlf);
4040 }
4041
4042 static void incrCommand(redisClient *c) {
4043 incrDecrCommand(c,1);
4044 }
4045
4046 static void decrCommand(redisClient *c) {
4047 incrDecrCommand(c,-1);
4048 }
4049
4050 static void incrbyCommand(redisClient *c) {
4051 long long incr;
4052
4053 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4054
4055 incrDecrCommand(c,incr);
4056 }
4057
4058 static void decrbyCommand(redisClient *c) {
4059 long long incr;
4060
4061 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4062
4063 incrDecrCommand(c,-incr);
4064 }
4065
4066 static void appendCommand(redisClient *c) {
4067 int retval;
4068 size_t totlen;
4069 robj *o;
4070
4071 o = lookupKeyWrite(c->db,c->argv[1]);
4072 if (o == NULL) {
4073 /* Create the key */
4074 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4075 incrRefCount(c->argv[1]);
4076 incrRefCount(c->argv[2]);
4077 totlen = stringObjectLen(c->argv[2]);
4078 } else {
4079 dictEntry *de;
4080
4081 de = dictFind(c->db->dict,c->argv[1]);
4082 assert(de != NULL);
4083
4084 o = dictGetEntryVal(de);
4085 if (o->type != REDIS_STRING) {
4086 addReply(c,shared.wrongtypeerr);
4087 return;
4088 }
4089 /* If the object is specially encoded or shared we have to make
4090 * a copy */
4091 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4092 robj *decoded = getDecodedObject(o);
4093
4094 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4095 decrRefCount(decoded);
4096 dictReplace(c->db->dict,c->argv[1],o);
4097 }
4098 /* APPEND! */
4099 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4100 o->ptr = sdscatlen(o->ptr,
4101 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4102 } else {
4103 o->ptr = sdscatprintf(o->ptr, "%ld",
4104 (unsigned long) c->argv[2]->ptr);
4105 }
4106 totlen = sdslen(o->ptr);
4107 }
4108 server.dirty++;
4109 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4110 }
4111
4112 static void substrCommand(redisClient *c) {
4113 robj *o;
4114 long start = atoi(c->argv[2]->ptr);
4115 long end = atoi(c->argv[3]->ptr);
4116 size_t rangelen, strlen;
4117 sds range;
4118
4119 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4120 checkType(c,o,REDIS_STRING)) return;
4121
4122 o = getDecodedObject(o);
4123 strlen = sdslen(o->ptr);
4124
4125 /* convert negative indexes */
4126 if (start < 0) start = strlen+start;
4127 if (end < 0) end = strlen+end;
4128 if (start < 0) start = 0;
4129 if (end < 0) end = 0;
4130
4131 /* indexes sanity checks */
4132 if (start > end || (size_t)start >= strlen) {
4133 /* Out of range start or start > end result in null reply */
4134 addReply(c,shared.nullbulk);
4135 decrRefCount(o);
4136 return;
4137 }
4138 if ((size_t)end >= strlen) end = strlen-1;
4139 rangelen = (end-start)+1;
4140
4141 /* Return the result */
4142 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4143 range = sdsnewlen((char*)o->ptr+start,rangelen);
4144 addReplySds(c,range);
4145 addReply(c,shared.crlf);
4146 decrRefCount(o);
4147 }
4148
4149 /* ========================= Type agnostic commands ========================= */
4150
4151 static void delCommand(redisClient *c) {
4152 int deleted = 0, j;
4153
4154 for (j = 1; j < c->argc; j++) {
4155 if (deleteKey(c->db,c->argv[j])) {
4156 server.dirty++;
4157 deleted++;
4158 }
4159 }
4160 addReplyLong(c,deleted);
4161 }
4162
4163 static void existsCommand(redisClient *c) {
4164 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4165 }
4166
4167 static void selectCommand(redisClient *c) {
4168 int id = atoi(c->argv[1]->ptr);
4169
4170 if (selectDb(c,id) == REDIS_ERR) {
4171 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4172 } else {
4173 addReply(c,shared.ok);
4174 }
4175 }
4176
4177 static void randomkeyCommand(redisClient *c) {
4178 dictEntry *de;
4179
4180 while(1) {
4181 de = dictGetRandomKey(c->db->dict);
4182 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4183 }
4184 if (de == NULL) {
4185 addReply(c,shared.plus);
4186 addReply(c,shared.crlf);
4187 } else {
4188 addReply(c,shared.plus);
4189 addReply(c,dictGetEntryKey(de));
4190 addReply(c,shared.crlf);
4191 }
4192 }
4193
4194 static void keysCommand(redisClient *c) {
4195 dictIterator *di;
4196 dictEntry *de;
4197 sds pattern = c->argv[1]->ptr;
4198 int plen = sdslen(pattern);
4199 unsigned long numkeys = 0;
4200 robj *lenobj = createObject(REDIS_STRING,NULL);
4201
4202 di = dictGetIterator(c->db->dict);
4203 addReply(c,lenobj);
4204 decrRefCount(lenobj);
4205 while((de = dictNext(di)) != NULL) {
4206 robj *keyobj = dictGetEntryKey(de);
4207
4208 sds key = keyobj->ptr;
4209 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4210 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4211 if (expireIfNeeded(c->db,keyobj) == 0) {
4212 addReplyBulk(c,keyobj);
4213 numkeys++;
4214 }
4215 }
4216 }
4217 dictReleaseIterator(di);
4218 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4219 }
4220
4221 static void dbsizeCommand(redisClient *c) {
4222 addReplySds(c,
4223 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4224 }
4225
4226 static void lastsaveCommand(redisClient *c) {
4227 addReplySds(c,
4228 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4229 }
4230
4231 static void typeCommand(redisClient *c) {
4232 robj *o;
4233 char *type;
4234
4235 o = lookupKeyRead(c->db,c->argv[1]);
4236 if (o == NULL) {
4237 type = "+none";
4238 } else {
4239 switch(o->type) {
4240 case REDIS_STRING: type = "+string"; break;
4241 case REDIS_LIST: type = "+list"; break;
4242 case REDIS_SET: type = "+set"; break;
4243 case REDIS_ZSET: type = "+zset"; break;
4244 case REDIS_HASH: type = "+hash"; break;
4245 default: type = "+unknown"; break;
4246 }
4247 }
4248 addReplySds(c,sdsnew(type));
4249 addReply(c,shared.crlf);
4250 }
4251
4252 static void saveCommand(redisClient *c) {
4253 if (server.bgsavechildpid != -1) {
4254 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4255 return;
4256 }
4257 if (rdbSave(server.dbfilename) == REDIS_OK) {
4258 addReply(c,shared.ok);
4259 } else {
4260 addReply(c,shared.err);
4261 }
4262 }
4263
4264 static void bgsaveCommand(redisClient *c) {
4265 if (server.bgsavechildpid != -1) {
4266 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4267 return;
4268 }
4269 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4270 char *status = "+Background saving started\r\n";
4271 addReplySds(c,sdsnew(status));
4272 } else {
4273 addReply(c,shared.err);
4274 }
4275 }
4276
4277 static void shutdownCommand(redisClient *c) {
4278 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4279 /* Kill the saving child if there is a background saving in progress.
4280 We want to avoid race conditions, for instance our saving child may
4281 overwrite the synchronous saving did by SHUTDOWN. */
4282 if (server.bgsavechildpid != -1) {
4283 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4284 kill(server.bgsavechildpid,SIGKILL);
4285 rdbRemoveTempFile(server.bgsavechildpid);
4286 }
4287 if (server.appendonly) {
4288 /* Append only file: fsync() the AOF and exit */
4289 fsync(server.appendfd);
4290 if (server.vm_enabled) unlink(server.vm_swap_file);
4291 exit(0);
4292 } else {
4293 /* Snapshotting. Perform a SYNC SAVE and exit */
4294 if (rdbSave(server.dbfilename) == REDIS_OK) {
4295 if (server.daemonize)
4296 unlink(server.pidfile);
4297 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4298 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4299 if (server.vm_enabled) unlink(server.vm_swap_file);
4300 exit(0);
4301 } else {
4302 /* Ooops.. error saving! The best we can do is to continue
4303 * operating. Note that if there was a background saving process,
4304 * in the next cron() Redis will be notified that the background
4305 * saving aborted, handling special stuff like slaves pending for
4306 * synchronization... */
4307 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4308 addReplySds(c,
4309 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4310 }
4311 }
4312 }
4313
4314 static void renameGenericCommand(redisClient *c, int nx) {
4315 robj *o;
4316
4317 /* To use the same key as src and dst is probably an error */
4318 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4319 addReply(c,shared.sameobjecterr);
4320 return;
4321 }
4322
4323 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4324 return;
4325
4326 incrRefCount(o);
4327 deleteIfVolatile(c->db,c->argv[2]);
4328 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4329 if (nx) {
4330 decrRefCount(o);
4331 addReply(c,shared.czero);
4332 return;
4333 }
4334 dictReplace(c->db->dict,c->argv[2],o);
4335 } else {
4336 incrRefCount(c->argv[2]);
4337 }
4338 deleteKey(c->db,c->argv[1]);
4339 server.dirty++;
4340 addReply(c,nx ? shared.cone : shared.ok);
4341 }
4342
4343 static void renameCommand(redisClient *c) {
4344 renameGenericCommand(c,0);
4345 }
4346
4347 static void renamenxCommand(redisClient *c) {
4348 renameGenericCommand(c,1);
4349 }
4350
4351 static void moveCommand(redisClient *c) {
4352 robj *o;
4353 redisDb *src, *dst;
4354 int srcid;
4355
4356 /* Obtain source and target DB pointers */
4357 src = c->db;
4358 srcid = c->db->id;
4359 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4360 addReply(c,shared.outofrangeerr);
4361 return;
4362 }
4363 dst = c->db;
4364 selectDb(c,srcid); /* Back to the source DB */
4365
4366 /* If the user is moving using as target the same
4367 * DB as the source DB it is probably an error. */
4368 if (src == dst) {
4369 addReply(c,shared.sameobjecterr);
4370 return;
4371 }
4372
4373 /* Check if the element exists and get a reference */
4374 o = lookupKeyWrite(c->db,c->argv[1]);
4375 if (!o) {
4376 addReply(c,shared.czero);
4377 return;
4378 }
4379
4380 /* Try to add the element to the target DB */
4381 deleteIfVolatile(dst,c->argv[1]);
4382 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4383 addReply(c,shared.czero);
4384 return;
4385 }
4386 incrRefCount(c->argv[1]);
4387 incrRefCount(o);
4388
4389 /* OK! key moved, free the entry in the source DB */
4390 deleteKey(src,c->argv[1]);
4391 server.dirty++;
4392 addReply(c,shared.cone);
4393 }
4394
4395 /* =================================== Lists ================================ */
4396 static void pushGenericCommand(redisClient *c, int where) {
4397 robj *lobj;
4398 list *list;
4399
4400 lobj = lookupKeyWrite(c->db,c->argv[1]);
4401 if (lobj == NULL) {
4402 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4403 addReply(c,shared.cone);
4404 return;
4405 }
4406 lobj = createListObject();
4407 list = lobj->ptr;
4408 if (where == REDIS_HEAD) {
4409 listAddNodeHead(list,c->argv[2]);
4410 } else {
4411 listAddNodeTail(list,c->argv[2]);
4412 }
4413 dictAdd(c->db->dict,c->argv[1],lobj);
4414 incrRefCount(c->argv[1]);
4415 incrRefCount(c->argv[2]);
4416 } else {
4417 if (lobj->type != REDIS_LIST) {
4418 addReply(c,shared.wrongtypeerr);
4419 return;
4420 }
4421 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4422 addReply(c,shared.cone);
4423 return;
4424 }
4425 list = lobj->ptr;
4426 if (where == REDIS_HEAD) {
4427 listAddNodeHead(list,c->argv[2]);
4428 } else {
4429 listAddNodeTail(list,c->argv[2]);
4430 }
4431 incrRefCount(c->argv[2]);
4432 }
4433 server.dirty++;
4434 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4435 }
4436
4437 static void lpushCommand(redisClient *c) {
4438 pushGenericCommand(c,REDIS_HEAD);
4439 }
4440
4441 static void rpushCommand(redisClient *c) {
4442 pushGenericCommand(c,REDIS_TAIL);
4443 }
4444
4445 static void llenCommand(redisClient *c) {
4446 robj *o;
4447 list *l;
4448
4449 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4450 checkType(c,o,REDIS_LIST)) return;
4451
4452 l = o->ptr;
4453 addReplyUlong(c,listLength(l));
4454 }
4455
4456 static void lindexCommand(redisClient *c) {
4457 robj *o;
4458 int index = atoi(c->argv[2]->ptr);
4459 list *list;
4460 listNode *ln;
4461
4462 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4463 checkType(c,o,REDIS_LIST)) return;
4464 list = o->ptr;
4465
4466 ln = listIndex(list, index);
4467 if (ln == NULL) {
4468 addReply(c,shared.nullbulk);
4469 } else {
4470 robj *ele = listNodeValue(ln);
4471 addReplyBulk(c,ele);
4472 }
4473 }
4474
4475 static void lsetCommand(redisClient *c) {
4476 robj *o;
4477 int index = atoi(c->argv[2]->ptr);
4478 list *list;
4479 listNode *ln;
4480
4481 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4482 checkType(c,o,REDIS_LIST)) return;
4483 list = o->ptr;
4484
4485 ln = listIndex(list, index);
4486 if (ln == NULL) {
4487 addReply(c,shared.outofrangeerr);
4488 } else {
4489 robj *ele = listNodeValue(ln);
4490
4491 decrRefCount(ele);
4492 listNodeValue(ln) = c->argv[3];
4493 incrRefCount(c->argv[3]);
4494 addReply(c,shared.ok);
4495 server.dirty++;
4496 }
4497 }
4498
4499 static void popGenericCommand(redisClient *c, int where) {
4500 robj *o;
4501 list *list;
4502 listNode *ln;
4503
4504 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4505 checkType(c,o,REDIS_LIST)) return;
4506 list = o->ptr;
4507
4508 if (where == REDIS_HEAD)
4509 ln = listFirst(list);
4510 else
4511 ln = listLast(list);
4512
4513 if (ln == NULL) {
4514 addReply(c,shared.nullbulk);
4515 } else {
4516 robj *ele = listNodeValue(ln);
4517 addReplyBulk(c,ele);
4518 listDelNode(list,ln);
4519 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4520 server.dirty++;
4521 }
4522 }
4523
4524 static void lpopCommand(redisClient *c) {
4525 popGenericCommand(c,REDIS_HEAD);
4526 }
4527
4528 static void rpopCommand(redisClient *c) {
4529 popGenericCommand(c,REDIS_TAIL);
4530 }
4531
4532 static void lrangeCommand(redisClient *c) {
4533 robj *o;
4534 int start = atoi(c->argv[2]->ptr);
4535 int end = atoi(c->argv[3]->ptr);
4536 int llen;
4537 int rangelen, j;
4538 list *list;
4539 listNode *ln;
4540 robj *ele;
4541
4542 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4543 || checkType(c,o,REDIS_LIST)) return;
4544 list = o->ptr;
4545 llen = listLength(list);
4546
4547 /* convert negative indexes */
4548 if (start < 0) start = llen+start;
4549 if (end < 0) end = llen+end;
4550 if (start < 0) start = 0;
4551 if (end < 0) end = 0;
4552
4553 /* indexes sanity checks */
4554 if (start > end || start >= llen) {
4555 /* Out of range start or start > end result in empty list */
4556 addReply(c,shared.emptymultibulk);
4557 return;
4558 }
4559 if (end >= llen) end = llen-1;
4560 rangelen = (end-start)+1;
4561
4562 /* Return the result in form of a multi-bulk reply */
4563 ln = listIndex(list, start);
4564 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4565 for (j = 0; j < rangelen; j++) {
4566 ele = listNodeValue(ln);
4567 addReplyBulk(c,ele);
4568 ln = ln->next;
4569 }
4570 }
4571
4572 static void ltrimCommand(redisClient *c) {
4573 robj *o;
4574 int start = atoi(c->argv[2]->ptr);
4575 int end = atoi(c->argv[3]->ptr);
4576 int llen;
4577 int j, ltrim, rtrim;
4578 list *list;
4579 listNode *ln;
4580
4581 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4582 checkType(c,o,REDIS_LIST)) return;
4583 list = o->ptr;
4584 llen = listLength(list);
4585
4586 /* convert negative indexes */
4587 if (start < 0) start = llen+start;
4588 if (end < 0) end = llen+end;
4589 if (start < 0) start = 0;
4590 if (end < 0) end = 0;
4591
4592 /* indexes sanity checks */
4593 if (start > end || start >= llen) {
4594 /* Out of range start or start > end result in empty list */
4595 ltrim = llen;
4596 rtrim = 0;
4597 } else {
4598 if (end >= llen) end = llen-1;
4599 ltrim = start;
4600 rtrim = llen-end-1;
4601 }
4602
4603 /* Remove list elements to perform the trim */
4604 for (j = 0; j < ltrim; j++) {
4605 ln = listFirst(list);
4606 listDelNode(list,ln);
4607 }
4608 for (j = 0; j < rtrim; j++) {
4609 ln = listLast(list);
4610 listDelNode(list,ln);
4611 }
4612 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4613 server.dirty++;
4614 addReply(c,shared.ok);
4615 }
4616
4617 static void lremCommand(redisClient *c) {
4618 robj *o;
4619 list *list;
4620 listNode *ln, *next;
4621 int toremove = atoi(c->argv[2]->ptr);
4622 int removed = 0;
4623 int fromtail = 0;
4624
4625 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4626 checkType(c,o,REDIS_LIST)) return;
4627 list = o->ptr;
4628
4629 if (toremove < 0) {
4630 toremove = -toremove;
4631 fromtail = 1;
4632 }
4633 ln = fromtail ? list->tail : list->head;
4634 while (ln) {
4635 robj *ele = listNodeValue(ln);
4636
4637 next = fromtail ? ln->prev : ln->next;
4638 if (compareStringObjects(ele,c->argv[3]) == 0) {
4639 listDelNode(list,ln);
4640 server.dirty++;
4641 removed++;
4642 if (toremove && removed == toremove) break;
4643 }
4644 ln = next;
4645 }
4646 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4647 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4648 }
4649
4650 /* This is the semantic of this command:
4651 * RPOPLPUSH srclist dstlist:
4652 * IF LLEN(srclist) > 0
4653 * element = RPOP srclist
4654 * LPUSH dstlist element
4655 * RETURN element
4656 * ELSE
4657 * RETURN nil
4658 * END
4659 * END
4660 *
4661 * The idea is to be able to get an element from a list in a reliable way
4662 * since the element is not just returned but pushed against another list
4663 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4664 */
4665 static void rpoplpushcommand(redisClient *c) {
4666 robj *sobj;
4667 list *srclist;
4668 listNode *ln;
4669
4670 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4671 checkType(c,sobj,REDIS_LIST)) return;
4672 srclist = sobj->ptr;
4673 ln = listLast(srclist);
4674
4675 if (ln == NULL) {
4676 addReply(c,shared.nullbulk);
4677 } else {
4678 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4679 robj *ele = listNodeValue(ln);
4680 list *dstlist;
4681
4682 if (dobj && dobj->type != REDIS_LIST) {
4683 addReply(c,shared.wrongtypeerr);
4684 return;
4685 }
4686
4687 /* Add the element to the target list (unless it's directly
4688 * passed to some BLPOP-ing client */
4689 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4690 if (dobj == NULL) {
4691 /* Create the list if the key does not exist */
4692 dobj = createListObject();
4693 dictAdd(c->db->dict,c->argv[2],dobj);
4694 incrRefCount(c->argv[2]);
4695 }
4696 dstlist = dobj->ptr;
4697 listAddNodeHead(dstlist,ele);
4698 incrRefCount(ele);
4699 }
4700
4701 /* Send the element to the client as reply as well */
4702 addReplyBulk(c,ele);
4703
4704 /* Finally remove the element from the source list */
4705 listDelNode(srclist,ln);
4706 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4707 server.dirty++;
4708 }
4709 }
4710
4711 /* ==================================== Sets ================================ */
4712
4713 static void saddCommand(redisClient *c) {
4714 robj *set;
4715
4716 set = lookupKeyWrite(c->db,c->argv[1]);
4717 if (set == NULL) {
4718 set = createSetObject();
4719 dictAdd(c->db->dict,c->argv[1],set);
4720 incrRefCount(c->argv[1]);
4721 } else {
4722 if (set->type != REDIS_SET) {
4723 addReply(c,shared.wrongtypeerr);
4724 return;
4725 }
4726 }
4727 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4728 incrRefCount(c->argv[2]);
4729 server.dirty++;
4730 addReply(c,shared.cone);
4731 } else {
4732 addReply(c,shared.czero);
4733 }
4734 }
4735
4736 static void sremCommand(redisClient *c) {
4737 robj *set;
4738
4739 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4740 checkType(c,set,REDIS_SET)) return;
4741
4742 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4743 server.dirty++;
4744 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4745 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4746 addReply(c,shared.cone);
4747 } else {
4748 addReply(c,shared.czero);
4749 }
4750 }
4751
4752 static void smoveCommand(redisClient *c) {
4753 robj *srcset, *dstset;
4754
4755 srcset = lookupKeyWrite(c->db,c->argv[1]);
4756 dstset = lookupKeyWrite(c->db,c->argv[2]);
4757
4758 /* If the source key does not exist return 0, if it's of the wrong type
4759 * raise an error */
4760 if (srcset == NULL || srcset->type != REDIS_SET) {
4761 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4762 return;
4763 }
4764 /* Error if the destination key is not a set as well */
4765 if (dstset && dstset->type != REDIS_SET) {
4766 addReply(c,shared.wrongtypeerr);
4767 return;
4768 }
4769 /* Remove the element from the source set */
4770 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4771 /* Key not found in the src set! return zero */
4772 addReply(c,shared.czero);
4773 return;
4774 }
4775 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4776 deleteKey(c->db,c->argv[1]);
4777 server.dirty++;
4778 /* Add the element to the destination set */
4779 if (!dstset) {
4780 dstset = createSetObject();
4781 dictAdd(c->db->dict,c->argv[2],dstset);
4782 incrRefCount(c->argv[2]);
4783 }
4784 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4785 incrRefCount(c->argv[3]);
4786 addReply(c,shared.cone);
4787 }
4788
4789 static void sismemberCommand(redisClient *c) {
4790 robj *set;
4791
4792 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4793 checkType(c,set,REDIS_SET)) return;
4794
4795 if (dictFind(set->ptr,c->argv[2]))
4796 addReply(c,shared.cone);
4797 else
4798 addReply(c,shared.czero);
4799 }
4800
4801 static void scardCommand(redisClient *c) {
4802 robj *o;
4803 dict *s;
4804
4805 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4806 checkType(c,o,REDIS_SET)) return;
4807
4808 s = o->ptr;
4809 addReplyUlong(c,dictSize(s));
4810 }
4811
4812 static void spopCommand(redisClient *c) {
4813 robj *set;
4814 dictEntry *de;
4815
4816 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4817 checkType(c,set,REDIS_SET)) return;
4818
4819 de = dictGetRandomKey(set->ptr);
4820 if (de == NULL) {
4821 addReply(c,shared.nullbulk);
4822 } else {
4823 robj *ele = dictGetEntryKey(de);
4824
4825 addReplyBulk(c,ele);
4826 dictDelete(set->ptr,ele);
4827 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4828 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4829 server.dirty++;
4830 }
4831 }
4832
4833 static void srandmemberCommand(redisClient *c) {
4834 robj *set;
4835 dictEntry *de;
4836
4837 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4838 checkType(c,set,REDIS_SET)) return;
4839
4840 de = dictGetRandomKey(set->ptr);
4841 if (de == NULL) {
4842 addReply(c,shared.nullbulk);
4843 } else {
4844 robj *ele = dictGetEntryKey(de);
4845
4846 addReplyBulk(c,ele);
4847 }
4848 }
4849
4850 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4851 dict **d1 = (void*) s1, **d2 = (void*) s2;
4852
4853 return dictSize(*d1)-dictSize(*d2);
4854 }
4855
4856 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4857 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4858 dictIterator *di;
4859 dictEntry *de;
4860 robj *lenobj = NULL, *dstset = NULL;
4861 unsigned long j, cardinality = 0;
4862
4863 for (j = 0; j < setsnum; j++) {
4864 robj *setobj;
4865
4866 setobj = dstkey ?
4867 lookupKeyWrite(c->db,setskeys[j]) :
4868 lookupKeyRead(c->db,setskeys[j]);
4869 if (!setobj) {
4870 zfree(dv);
4871 if (dstkey) {
4872 if (deleteKey(c->db,dstkey))
4873 server.dirty++;
4874 addReply(c,shared.czero);
4875 } else {
4876 addReply(c,shared.emptymultibulk);
4877 }
4878 return;
4879 }
4880 if (setobj->type != REDIS_SET) {
4881 zfree(dv);
4882 addReply(c,shared.wrongtypeerr);
4883 return;
4884 }
4885 dv[j] = setobj->ptr;
4886 }
4887 /* Sort sets from the smallest to largest, this will improve our
4888 * algorithm's performace */
4889 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4890
4891 /* The first thing we should output is the total number of elements...
4892 * since this is a multi-bulk write, but at this stage we don't know
4893 * the intersection set size, so we use a trick, append an empty object
4894 * to the output list and save the pointer to later modify it with the
4895 * right length */
4896 if (!dstkey) {
4897 lenobj = createObject(REDIS_STRING,NULL);
4898 addReply(c,lenobj);
4899 decrRefCount(lenobj);
4900 } else {
4901 /* If we have a target key where to store the resulting set
4902 * create this key with an empty set inside */
4903 dstset = createSetObject();
4904 }
4905
4906 /* Iterate all the elements of the first (smallest) set, and test
4907 * the element against all the other sets, if at least one set does
4908 * not include the element it is discarded */
4909 di = dictGetIterator(dv[0]);
4910
4911 while((de = dictNext(di)) != NULL) {
4912 robj *ele;
4913
4914 for (j = 1; j < setsnum; j++)
4915 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4916 if (j != setsnum)
4917 continue; /* at least one set does not contain the member */
4918 ele = dictGetEntryKey(de);
4919 if (!dstkey) {
4920 addReplyBulk(c,ele);
4921 cardinality++;
4922 } else {
4923 dictAdd(dstset->ptr,ele,NULL);
4924 incrRefCount(ele);
4925 }
4926 }
4927 dictReleaseIterator(di);
4928
4929 if (dstkey) {
4930 /* Store the resulting set into the target, if the intersection
4931 * is not an empty set. */
4932 deleteKey(c->db,dstkey);
4933 if (dictSize((dict*)dstset->ptr) > 0) {
4934 dictAdd(c->db->dict,dstkey,dstset);
4935 incrRefCount(dstkey);
4936 addReplyLong(c,dictSize((dict*)dstset->ptr));
4937 } else {
4938 decrRefCount(dstset);
4939 addReply(c,shared.czero);
4940 }
4941 server.dirty++;
4942 } else {
4943 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4944 }
4945 zfree(dv);
4946 }
4947
4948 static void sinterCommand(redisClient *c) {
4949 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4950 }
4951
4952 static void sinterstoreCommand(redisClient *c) {
4953 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4954 }
4955
4956 #define REDIS_OP_UNION 0
4957 #define REDIS_OP_DIFF 1
4958 #define REDIS_OP_INTER 2
4959
4960 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4961 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4962 dictIterator *di;
4963 dictEntry *de;
4964 robj *dstset = NULL;
4965 int j, cardinality = 0;
4966
4967 for (j = 0; j < setsnum; j++) {
4968 robj *setobj;
4969
4970 setobj = dstkey ?
4971 lookupKeyWrite(c->db,setskeys[j]) :
4972 lookupKeyRead(c->db,setskeys[j]);
4973 if (!setobj) {
4974 dv[j] = NULL;
4975 continue;
4976 }
4977 if (setobj->type != REDIS_SET) {
4978 zfree(dv);
4979 addReply(c,shared.wrongtypeerr);
4980 return;
4981 }
4982 dv[j] = setobj->ptr;
4983 }
4984
4985 /* We need a temp set object to store our union. If the dstkey
4986 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4987 * this set object will be the resulting object to set into the target key*/
4988 dstset = createSetObject();
4989
4990 /* Iterate all the elements of all the sets, add every element a single
4991 * time to the result set */
4992 for (j = 0; j < setsnum; j++) {
4993 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4994 if (!dv[j]) continue; /* non existing keys are like empty sets */
4995
4996 di = dictGetIterator(dv[j]);
4997
4998 while((de = dictNext(di)) != NULL) {
4999 robj *ele;
5000
5001 /* dictAdd will not add the same element multiple times */
5002 ele = dictGetEntryKey(de);
5003 if (op == REDIS_OP_UNION || j == 0) {
5004 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5005 incrRefCount(ele);
5006 cardinality++;
5007 }
5008 } else if (op == REDIS_OP_DIFF) {
5009 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5010 cardinality--;
5011 }
5012 }
5013 }
5014 dictReleaseIterator(di);
5015
5016 /* result set is empty? Exit asap. */
5017 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5018 }
5019
5020 /* Output the content of the resulting set, if not in STORE mode */
5021 if (!dstkey) {
5022 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5023 di = dictGetIterator(dstset->ptr);
5024 while((de = dictNext(di)) != NULL) {
5025 robj *ele;
5026
5027 ele = dictGetEntryKey(de);
5028 addReplyBulk(c,ele);
5029 }
5030 dictReleaseIterator(di);
5031 decrRefCount(dstset);
5032 } else {
5033 /* If we have a target key where to store the resulting set
5034 * create this key with the result set inside */
5035 deleteKey(c->db,dstkey);
5036 if (dictSize((dict*)dstset->ptr) > 0) {
5037 dictAdd(c->db->dict,dstkey,dstset);
5038 incrRefCount(dstkey);
5039 addReplyLong(c,dictSize((dict*)dstset->ptr));
5040 } else {
5041 decrRefCount(dstset);
5042 addReply(c,shared.czero);
5043 }
5044 server.dirty++;
5045 }
5046 zfree(dv);
5047 }
5048
5049 static void sunionCommand(redisClient *c) {
5050 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5051 }
5052
5053 static void sunionstoreCommand(redisClient *c) {
5054 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5055 }
5056
5057 static void sdiffCommand(redisClient *c) {
5058 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5059 }
5060
5061 static void sdiffstoreCommand(redisClient *c) {
5062 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5063 }
5064
5065 /* ==================================== ZSets =============================== */
5066
5067 /* ZSETs are ordered sets using two data structures to hold the same elements
5068 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5069 * data structure.
5070 *
5071 * The elements are added to a hash table mapping Redis objects to scores.
5072 * At the same time the elements are added to a skip list mapping scores
5073 * to Redis objects (so objects are sorted by scores in this "view"). */
5074
5075 /* This skiplist implementation is almost a C translation of the original
5076 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5077 * Alternative to Balanced Trees", modified in three ways:
5078 * a) this implementation allows for repeated values.
5079 * b) the comparison is not just by key (our 'score') but by satellite data.
5080 * c) there is a back pointer, so it's a doubly linked list with the back
5081 * pointers being only at "level 1". This allows to traverse the list
5082 * from tail to head, useful for ZREVRANGE. */
5083
5084 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5085 zskiplistNode *zn = zmalloc(sizeof(*zn));
5086
5087 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5088 if (level > 0)
5089 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5090 zn->score = score;
5091 zn->obj = obj;
5092 return zn;
5093 }
5094
5095 static zskiplist *zslCreate(void) {
5096 int j;
5097 zskiplist *zsl;
5098
5099 zsl = zmalloc(sizeof(*zsl));
5100 zsl->level = 1;
5101 zsl->length = 0;
5102 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5103 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5104 zsl->header->forward[j] = NULL;
5105
5106 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5107 if (j < ZSKIPLIST_MAXLEVEL-1)
5108 zsl->header->span[j] = 0;
5109 }
5110 zsl->header->backward = NULL;
5111 zsl->tail = NULL;
5112 return zsl;
5113 }
5114
5115 static void zslFreeNode(zskiplistNode *node) {
5116 decrRefCount(node->obj);
5117 zfree(node->forward);
5118 zfree(node->span);
5119 zfree(node);
5120 }
5121
5122 static void zslFree(zskiplist *zsl) {
5123 zskiplistNode *node = zsl->header->forward[0], *next;
5124
5125 zfree(zsl->header->forward);
5126 zfree(zsl->header->span);
5127 zfree(zsl->header);
5128 while(node) {
5129 next = node->forward[0];
5130 zslFreeNode(node);
5131 node = next;
5132 }
5133 zfree(zsl);
5134 }
5135
5136 static int zslRandomLevel(void) {
5137 int level = 1;
5138 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5139 level += 1;
5140 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5141 }
5142
5143 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5144 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5145 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5146 int i, level;
5147
5148 x = zsl->header;
5149 for (i = zsl->level-1; i >= 0; i--) {
5150 /* store rank that is crossed to reach the insert position */
5151 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5152
5153 while (x->forward[i] &&
5154 (x->forward[i]->score < score ||
5155 (x->forward[i]->score == score &&
5156 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5157 rank[i] += i > 0 ? x->span[i-1] : 1;
5158 x = x->forward[i];
5159 }
5160 update[i] = x;
5161 }
5162 /* we assume the key is not already inside, since we allow duplicated
5163 * scores, and the re-insertion of score and redis object should never
5164 * happpen since the caller of zslInsert() should test in the hash table
5165 * if the element is already inside or not. */
5166 level = zslRandomLevel();
5167 if (level > zsl->level) {
5168 for (i = zsl->level; i < level; i++) {
5169 rank[i] = 0;
5170 update[i] = zsl->header;
5171 update[i]->span[i-1] = zsl->length;
5172 }
5173 zsl->level = level;
5174 }
5175 x = zslCreateNode(level,score,obj);
5176 for (i = 0; i < level; i++) {
5177 x->forward[i] = update[i]->forward[i];
5178 update[i]->forward[i] = x;
5179
5180 /* update span covered by update[i] as x is inserted here */
5181 if (i > 0) {
5182 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5183 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5184 }
5185 }
5186
5187 /* increment span for untouched levels */
5188 for (i = level; i < zsl->level; i++) {
5189 update[i]->span[i-1]++;
5190 }
5191
5192 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5193 if (x->forward[0])
5194 x->forward[0]->backward = x;
5195 else
5196 zsl->tail = x;
5197 zsl->length++;
5198 }
5199
5200 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5201 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5202 int i;
5203 for (i = 0; i < zsl->level; i++) {
5204 if (update[i]->forward[i] == x) {
5205 if (i > 0) {
5206 update[i]->span[i-1] += x->span[i-1] - 1;
5207 }
5208 update[i]->forward[i] = x->forward[i];
5209 } else {
5210 /* invariant: i > 0, because update[0]->forward[0]
5211 * is always equal to x */
5212 update[i]->span[i-1] -= 1;
5213 }
5214 }
5215 if (x->forward[0]) {
5216 x->forward[0]->backward = x->backward;
5217 } else {
5218 zsl->tail = x->backward;
5219 }
5220 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5221 zsl->level--;
5222 zsl->length--;
5223 }
5224
5225 /* Delete an element with matching score/object from the skiplist. */
5226 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5227 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5228 int i;
5229
5230 x = zsl->header;
5231 for (i = zsl->level-1; i >= 0; i--) {
5232 while (x->forward[i] &&
5233 (x->forward[i]->score < score ||
5234 (x->forward[i]->score == score &&
5235 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5236 x = x->forward[i];
5237 update[i] = x;
5238 }
5239 /* We may have multiple elements with the same score, what we need
5240 * is to find the element with both the right score and object. */
5241 x = x->forward[0];
5242 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5243 zslDeleteNode(zsl, x, update);
5244 zslFreeNode(x);
5245 return 1;
5246 } else {
5247 return 0; /* not found */
5248 }
5249 return 0; /* not found */
5250 }
5251
5252 /* Delete all the elements with score between min and max from the skiplist.
5253 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5254 * Note that this function takes the reference to the hash table view of the
5255 * sorted set, in order to remove the elements from the hash table too. */
5256 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5257 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5258 unsigned long removed = 0;
5259 int i;
5260
5261 x = zsl->header;
5262 for (i = zsl->level-1; i >= 0; i--) {
5263 while (x->forward[i] && x->forward[i]->score < min)
5264 x = x->forward[i];
5265 update[i] = x;
5266 }
5267 /* We may have multiple elements with the same score, what we need
5268 * is to find the element with both the right score and object. */
5269 x = x->forward[0];
5270 while (x && x->score <= max) {
5271 zskiplistNode *next = x->forward[0];
5272 zslDeleteNode(zsl, x, update);
5273 dictDelete(dict,x->obj);
5274 zslFreeNode(x);
5275 removed++;
5276 x = next;
5277 }
5278 return removed; /* not found */
5279 }
5280
5281 /* Delete all the elements with rank between start and end from the skiplist.
5282 * Start and end are inclusive. Note that start and end need to be 1-based */
5283 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5284 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5285 unsigned long traversed = 0, removed = 0;
5286 int i;
5287
5288 x = zsl->header;
5289 for (i = zsl->level-1; i >= 0; i--) {
5290 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5291 traversed += i > 0 ? x->span[i-1] : 1;
5292 x = x->forward[i];
5293 }
5294 update[i] = x;
5295 }
5296
5297 traversed++;
5298 x = x->forward[0];
5299 while (x && traversed <= end) {
5300 zskiplistNode *next = x->forward[0];
5301 zslDeleteNode(zsl, x, update);
5302 dictDelete(dict,x->obj);
5303 zslFreeNode(x);
5304 removed++;
5305 traversed++;
5306 x = next;
5307 }
5308 return removed;
5309 }
5310
5311 /* Find the first node having a score equal or greater than the specified one.
5312 * Returns NULL if there is no match. */
5313 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5314 zskiplistNode *x;
5315 int i;
5316
5317 x = zsl->header;
5318 for (i = zsl->level-1; i >= 0; i--) {
5319 while (x->forward[i] && x->forward[i]->score < score)
5320 x = x->forward[i];
5321 }
5322 /* We may have multiple elements with the same score, what we need
5323 * is to find the element with both the right score and object. */
5324 return x->forward[0];
5325 }
5326
5327 /* Find the rank for an element by both score and key.
5328 * Returns 0 when the element cannot be found, rank otherwise.
5329 * Note that the rank is 1-based due to the span of zsl->header to the
5330 * first element. */
5331 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5332 zskiplistNode *x;
5333 unsigned long rank = 0;
5334 int i;
5335
5336 x = zsl->header;
5337 for (i = zsl->level-1; i >= 0; i--) {
5338 while (x->forward[i] &&
5339 (x->forward[i]->score < score ||
5340 (x->forward[i]->score == score &&
5341 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5342 rank += i > 0 ? x->span[i-1] : 1;
5343 x = x->forward[i];
5344 }
5345
5346 /* x might be equal to zsl->header, so test if obj is non-NULL */
5347 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5348 return rank;
5349 }
5350 }
5351 return 0;
5352 }
5353
5354 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5355 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5356 zskiplistNode *x;
5357 unsigned long traversed = 0;
5358 int i;
5359
5360 x = zsl->header;
5361 for (i = zsl->level-1; i >= 0; i--) {
5362 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5363 {
5364 traversed += i > 0 ? x->span[i-1] : 1;
5365 x = x->forward[i];
5366 }
5367 if (traversed == rank) {
5368 return x;
5369 }
5370 }
5371 return NULL;
5372 }
5373
5374 /* The actual Z-commands implementations */
5375
5376 /* This generic command implements both ZADD and ZINCRBY.
5377 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5378 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5379 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5380 robj *zsetobj;
5381 zset *zs;
5382 double *score;
5383
5384 zsetobj = lookupKeyWrite(c->db,key);
5385 if (zsetobj == NULL) {
5386 zsetobj = createZsetObject();
5387 dictAdd(c->db->dict,key,zsetobj);
5388 incrRefCount(key);
5389 } else {
5390 if (zsetobj->type != REDIS_ZSET) {
5391 addReply(c,shared.wrongtypeerr);
5392 return;
5393 }
5394 }
5395 zs = zsetobj->ptr;
5396
5397 /* Ok now since we implement both ZADD and ZINCRBY here the code
5398 * needs to handle the two different conditions. It's all about setting
5399 * '*score', that is, the new score to set, to the right value. */
5400 score = zmalloc(sizeof(double));
5401 if (doincrement) {
5402 dictEntry *de;
5403
5404 /* Read the old score. If the element was not present starts from 0 */
5405 de = dictFind(zs->dict,ele);
5406 if (de) {
5407 double *oldscore = dictGetEntryVal(de);
5408 *score = *oldscore + scoreval;
5409 } else {
5410 *score = scoreval;
5411 }
5412 } else {
5413 *score = scoreval;
5414 }
5415
5416 /* What follows is a simple remove and re-insert operation that is common
5417 * to both ZADD and ZINCRBY... */
5418 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5419 /* case 1: New element */
5420 incrRefCount(ele); /* added to hash */
5421 zslInsert(zs->zsl,*score,ele);
5422 incrRefCount(ele); /* added to skiplist */
5423 server.dirty++;
5424 if (doincrement)
5425 addReplyDouble(c,*score);
5426 else
5427 addReply(c,shared.cone);
5428 } else {
5429 dictEntry *de;
5430 double *oldscore;
5431
5432 /* case 2: Score update operation */
5433 de = dictFind(zs->dict,ele);
5434 redisAssert(de != NULL);
5435 oldscore = dictGetEntryVal(de);
5436 if (*score != *oldscore) {
5437 int deleted;
5438
5439 /* Remove and insert the element in the skip list with new score */
5440 deleted = zslDelete(zs->zsl,*oldscore,ele);
5441 redisAssert(deleted != 0);
5442 zslInsert(zs->zsl,*score,ele);
5443 incrRefCount(ele);
5444 /* Update the score in the hash table */
5445 dictReplace(zs->dict,ele,score);
5446 server.dirty++;
5447 } else {
5448 zfree(score);
5449 }
5450 if (doincrement)
5451 addReplyDouble(c,*score);
5452 else
5453 addReply(c,shared.czero);
5454 }
5455 }
5456
5457 static void zaddCommand(redisClient *c) {
5458 double scoreval;
5459
5460 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5461
5462 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5463 }
5464
5465 static void zincrbyCommand(redisClient *c) {
5466 double scoreval;
5467
5468 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5469
5470 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5471 }
5472
5473 static void zremCommand(redisClient *c) {
5474 robj *zsetobj;
5475 zset *zs;
5476 dictEntry *de;
5477 double *oldscore;
5478 int deleted;
5479
5480 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5481 checkType(c,zsetobj,REDIS_ZSET)) return;
5482
5483 zs = zsetobj->ptr;
5484 de = dictFind(zs->dict,c->argv[2]);
5485 if (de == NULL) {
5486 addReply(c,shared.czero);
5487 return;
5488 }
5489 /* Delete from the skiplist */
5490 oldscore = dictGetEntryVal(de);
5491 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5492 redisAssert(deleted != 0);
5493
5494 /* Delete from the hash table */
5495 dictDelete(zs->dict,c->argv[2]);
5496 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5497 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5498 server.dirty++;
5499 addReply(c,shared.cone);
5500 }
5501
5502 static void zremrangebyscoreCommand(redisClient *c) {
5503 double min;
5504 double max;
5505 long deleted;
5506 robj *zsetobj;
5507 zset *zs;
5508
5509 if ((getDoubleFromObject(c, c->argv[2], &min) != REDIS_OK) ||
5510 (getDoubleFromObject(c, c->argv[3], &max) != REDIS_OK)) return;
5511
5512 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5513 checkType(c,zsetobj,REDIS_ZSET)) return;
5514
5515 zs = zsetobj->ptr;
5516 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5517 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5518 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5519 server.dirty += deleted;
5520 addReplyLong(c,deleted);
5521 }
5522
5523 static void zremrangebyrankCommand(redisClient *c) {
5524 long start;
5525 long end;
5526 int llen;
5527 long deleted;
5528 robj *zsetobj;
5529 zset *zs;
5530
5531 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5532 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5533
5534 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5535 checkType(c,zsetobj,REDIS_ZSET)) return;
5536 zs = zsetobj->ptr;
5537 llen = zs->zsl->length;
5538
5539 /* convert negative indexes */
5540 if (start < 0) start = llen+start;
5541 if (end < 0) end = llen+end;
5542 if (start < 0) start = 0;
5543 if (end < 0) end = 0;
5544
5545 /* indexes sanity checks */
5546 if (start > end || start >= llen) {
5547 addReply(c,shared.czero);
5548 return;
5549 }
5550 if (end >= llen) end = llen-1;
5551
5552 /* increment start and end because zsl*Rank functions
5553 * use 1-based rank */
5554 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5555 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5556 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5557 server.dirty += deleted;
5558 addReplyLong(c, deleted);
5559 }
5560
5561 typedef struct {
5562 dict *dict;
5563 double weight;
5564 } zsetopsrc;
5565
5566 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5567 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5568 unsigned long size1, size2;
5569 size1 = d1->dict ? dictSize(d1->dict) : 0;
5570 size2 = d2->dict ? dictSize(d2->dict) : 0;
5571 return size1 - size2;
5572 }
5573
/* Ways of combining the scores of an element found in several inputs. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': add it (SUM), or
 * keep the smaller (MIN) or the larger (MAX) of the two values. Any other
 * aggregate value trips an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
    }
}
5590
5591 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5592 int i, j, zsetnum;
5593 int aggregate = REDIS_AGGR_SUM;
5594 zsetopsrc *src;
5595 robj *dstobj;
5596 zset *dstzset;
5597 dictIterator *di;
5598 dictEntry *de;
5599
5600 /* expect zsetnum input keys to be given */
5601 zsetnum = atoi(c->argv[2]->ptr);
5602 if (zsetnum < 1) {
5603 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5604 return;
5605 }
5606
5607 /* test if the expected number of keys would overflow */
5608 if (3+zsetnum > c->argc) {
5609 addReply(c,shared.syntaxerr);
5610 return;
5611 }
5612
5613 /* read keys to be used for input */
5614 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5615 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5616 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5617 if (!zsetobj) {
5618 src[i].dict = NULL;
5619 } else {
5620 if (zsetobj->type != REDIS_ZSET) {
5621 zfree(src);
5622 addReply(c,shared.wrongtypeerr);
5623 return;
5624 }
5625 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5626 }
5627
5628 /* default all weights to 1 */
5629 src[i].weight = 1.0;
5630 }
5631
5632 /* parse optional extra arguments */
5633 if (j < c->argc) {
5634 int remaining = c->argc - j;
5635
5636 while (remaining) {
5637 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5638 j++; remaining--;
5639 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5640 if (getDoubleFromObject(c, c->argv[j], &src[i].weight) != REDIS_OK)
5641 return;
5642 }
5643 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5644 j++; remaining--;
5645 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5646 aggregate = REDIS_AGGR_SUM;
5647 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5648 aggregate = REDIS_AGGR_MIN;
5649 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5650 aggregate = REDIS_AGGR_MAX;
5651 } else {
5652 zfree(src);
5653 addReply(c,shared.syntaxerr);
5654 return;
5655 }
5656 j++; remaining--;
5657 } else {
5658 zfree(src);
5659 addReply(c,shared.syntaxerr);
5660 return;
5661 }
5662 }
5663 }
5664
5665 /* sort sets from the smallest to largest, this will improve our
5666 * algorithm's performance */
5667 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5668
5669 dstobj = createZsetObject();
5670 dstzset = dstobj->ptr;
5671
5672 if (op == REDIS_OP_INTER) {
5673 /* skip going over all entries if the smallest zset is NULL or empty */
5674 if (src[0].dict && dictSize(src[0].dict) > 0) {
5675 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5676 * from small to large, all src[i > 0].dict are non-empty too */
5677 di = dictGetIterator(src[0].dict);
5678 while((de = dictNext(di)) != NULL) {
5679 double *score = zmalloc(sizeof(double)), value;
5680 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5681
5682 for (j = 1; j < zsetnum; j++) {
5683 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5684 if (other) {
5685 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5686 zunionInterAggregate(score, value, aggregate);
5687 } else {
5688 break;
5689 }
5690 }
5691
5692 /* skip entry when not present in every source dict */
5693 if (j != zsetnum) {
5694 zfree(score);
5695 } else {
5696 robj *o = dictGetEntryKey(de);
5697 dictAdd(dstzset->dict,o,score);
5698 incrRefCount(o); /* added to dictionary */
5699 zslInsert(dstzset->zsl,*score,o);
5700 incrRefCount(o); /* added to skiplist */
5701 }
5702 }
5703 dictReleaseIterator(di);
5704 }
5705 } else if (op == REDIS_OP_UNION) {
5706 for (i = 0; i < zsetnum; i++) {
5707 if (!src[i].dict) continue;
5708
5709 di = dictGetIterator(src[i].dict);
5710 while((de = dictNext(di)) != NULL) {
5711 /* skip key when already processed */
5712 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5713
5714 double *score = zmalloc(sizeof(double)), value;
5715 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5716
5717 /* because the zsets are sorted by size, its only possible
5718 * for sets at larger indices to hold this entry */
5719 for (j = (i+1); j < zsetnum; j++) {
5720 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5721 if (other) {
5722 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5723 zunionInterAggregate(score, value, aggregate);
5724 }
5725 }
5726
5727 robj *o = dictGetEntryKey(de);
5728 dictAdd(dstzset->dict,o,score);
5729 incrRefCount(o); /* added to dictionary */
5730 zslInsert(dstzset->zsl,*score,o);
5731 incrRefCount(o); /* added to skiplist */
5732 }
5733 dictReleaseIterator(di);
5734 }
5735 } else {
5736 /* unknown operator */
5737 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5738 }
5739
5740 deleteKey(c->db,dstkey);
5741 if (dstzset->zsl->length) {
5742 dictAdd(c->db->dict,dstkey,dstobj);
5743 incrRefCount(dstkey);
5744 addReplyLong(c, dstzset->zsl->length);
5745 server.dirty++;
5746 } else {
5747 decrRefCount(dstobj);
5748 addReply(c, shared.czero);
5749 }
5750 zfree(src);
5751 }
5752
5753 static void zunionCommand(redisClient *c) {
5754 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5755 }
5756
5757 static void zinterCommand(redisClient *c) {
5758 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5759 }
5760
5761 static void zrangeGenericCommand(redisClient *c, int reverse) {
5762 robj *o;
5763 long start;
5764 long end;
5765 int withscores = 0;
5766 int llen;
5767 int rangelen, j;
5768 zset *zsetobj;
5769 zskiplist *zsl;
5770 zskiplistNode *ln;
5771 robj *ele;
5772
5773 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5774 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5775
5776 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5777 withscores = 1;
5778 } else if (c->argc >= 5) {
5779 addReply(c,shared.syntaxerr);
5780 return;
5781 }
5782
5783 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5784 || checkType(c,o,REDIS_ZSET)) return;
5785 zsetobj = o->ptr;
5786 zsl = zsetobj->zsl;
5787 llen = zsl->length;
5788
5789 /* convert negative indexes */
5790 if (start < 0) start = llen+start;
5791 if (end < 0) end = llen+end;
5792 if (start < 0) start = 0;
5793 if (end < 0) end = 0;
5794
5795 /* indexes sanity checks */
5796 if (start > end || start >= llen) {
5797 /* Out of range start or start > end result in empty list */
5798 addReply(c,shared.emptymultibulk);
5799 return;
5800 }
5801 if (end >= llen) end = llen-1;
5802 rangelen = (end-start)+1;
5803
5804 /* check if starting point is trivial, before searching
5805 * the element in log(N) time */
5806 if (reverse) {
5807 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5808 } else {
5809 ln = start == 0 ?
5810 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5811 }
5812
5813 /* Return the result in form of a multi-bulk reply */
5814 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5815 withscores ? (rangelen*2) : rangelen));
5816 for (j = 0; j < rangelen; j++) {
5817 ele = ln->obj;
5818 addReplyBulk(c,ele);
5819 if (withscores)
5820 addReplyDouble(c,ln->score);
5821 ln = reverse ? ln->backward : ln->forward[0];
5822 }
5823 }
5824
5825 static void zrangeCommand(redisClient *c) {
5826 zrangeGenericCommand(c,0);
5827 }
5828
5829 static void zrevrangeCommand(redisClient *c) {
5830 zrangeGenericCommand(c,1);
5831 }
5832
5833 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5834 * If justcount is non-zero, just the count is returned. */
5835 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5836 robj *o;
5837 double min, max;
5838 int minex = 0, maxex = 0; /* are min or max exclusive? */
5839 int offset = 0, limit = -1;
5840 int withscores = 0;
5841 int badsyntax = 0;
5842
5843 /* Parse the min-max interval. If one of the values is prefixed
5844 * by the "(" character, it's considered "open". For instance
5845 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5846 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5847 if (((char*)c->argv[2]->ptr)[0] == '(') {
5848 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5849 minex = 1;
5850 } else {
5851 min = strtod(c->argv[2]->ptr,NULL);
5852 }
5853 if (((char*)c->argv[3]->ptr)[0] == '(') {
5854 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5855 maxex = 1;
5856 } else {
5857 max = strtod(c->argv[3]->ptr,NULL);
5858 }
5859
5860 /* Parse "WITHSCORES": note that if the command was called with
5861 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5862 * enter the following paths to parse WITHSCORES and LIMIT. */
5863 if (c->argc == 5 || c->argc == 8) {
5864 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5865 withscores = 1;
5866 else
5867 badsyntax = 1;
5868 }
5869 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5870 badsyntax = 1;
5871 if (badsyntax) {
5872 addReplySds(c,
5873 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5874 return;
5875 }
5876
5877 /* Parse "LIMIT" */
5878 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5879 addReply(c,shared.syntaxerr);
5880 return;
5881 } else if (c->argc == (7 + withscores)) {
5882 offset = atoi(c->argv[5]->ptr);
5883 limit = atoi(c->argv[6]->ptr);
5884 if (offset < 0) offset = 0;
5885 }
5886
5887 /* Ok, lookup the key and get the range */
5888 o = lookupKeyRead(c->db,c->argv[1]);
5889 if (o == NULL) {
5890 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5891 } else {
5892 if (o->type != REDIS_ZSET) {
5893 addReply(c,shared.wrongtypeerr);
5894 } else {
5895 zset *zsetobj = o->ptr;
5896 zskiplist *zsl = zsetobj->zsl;
5897 zskiplistNode *ln;
5898 robj *ele, *lenobj = NULL;
5899 unsigned long rangelen = 0;
5900
5901 /* Get the first node with the score >= min, or with
5902 * score > min if 'minex' is true. */
5903 ln = zslFirstWithScore(zsl,min);
5904 while (minex && ln && ln->score == min) ln = ln->forward[0];
5905
5906 if (ln == NULL) {
5907 /* No element matching the speciifed interval */
5908 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5909 return;
5910 }
5911
5912 /* We don't know in advance how many matching elements there
5913 * are in the list, so we push this object that will represent
5914 * the multi-bulk length in the output buffer, and will "fix"
5915 * it later */
5916 if (!justcount) {
5917 lenobj = createObject(REDIS_STRING,NULL);
5918 addReply(c,lenobj);
5919 decrRefCount(lenobj);
5920 }
5921
5922 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5923 if (offset) {
5924 offset--;
5925 ln = ln->forward[0];
5926 continue;
5927 }
5928 if (limit == 0) break;
5929 if (!justcount) {
5930 ele = ln->obj;
5931 addReplyBulk(c,ele);
5932 if (withscores)
5933 addReplyDouble(c,ln->score);
5934 }
5935 ln = ln->forward[0];
5936 rangelen++;
5937 if (limit > 0) limit--;
5938 }
5939 if (justcount) {
5940 addReplyLong(c,(long)rangelen);
5941 } else {
5942 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5943 withscores ? (rangelen*2) : rangelen);
5944 }
5945 }
5946 }
5947 }
5948
5949 static void zrangebyscoreCommand(redisClient *c) {
5950 genericZrangebyscoreCommand(c,0);
5951 }
5952
5953 static void zcountCommand(redisClient *c) {
5954 genericZrangebyscoreCommand(c,1);
5955 }
5956
5957 static void zcardCommand(redisClient *c) {
5958 robj *o;
5959 zset *zs;
5960
5961 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5962 checkType(c,o,REDIS_ZSET)) return;
5963
5964 zs = o->ptr;
5965 addReplyUlong(c,zs->zsl->length);
5966 }
5967
5968 static void zscoreCommand(redisClient *c) {
5969 robj *o;
5970 zset *zs;
5971 dictEntry *de;
5972
5973 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5974 checkType(c,o,REDIS_ZSET)) return;
5975
5976 zs = o->ptr;
5977 de = dictFind(zs->dict,c->argv[2]);
5978 if (!de) {
5979 addReply(c,shared.nullbulk);
5980 } else {
5981 double *score = dictGetEntryVal(de);
5982
5983 addReplyDouble(c,*score);
5984 }
5985 }
5986
5987 static void zrankGenericCommand(redisClient *c, int reverse) {
5988 robj *o;
5989 zset *zs;
5990 zskiplist *zsl;
5991 dictEntry *de;
5992 unsigned long rank;
5993 double *score;
5994
5995 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5996 checkType(c,o,REDIS_ZSET)) return;
5997
5998 zs = o->ptr;
5999 zsl = zs->zsl;
6000 de = dictFind(zs->dict,c->argv[2]);
6001 if (!de) {
6002 addReply(c,shared.nullbulk);
6003 return;
6004 }
6005
6006 score = dictGetEntryVal(de);
6007 rank = zslGetRank(zsl, *score, c->argv[2]);
6008 if (rank) {
6009 if (reverse) {
6010 addReplyLong(c, zsl->length - rank);
6011 } else {
6012 addReplyLong(c, rank-1);
6013 }
6014 } else {
6015 addReply(c,shared.nullbulk);
6016 }
6017 }
6018
6019 static void zrankCommand(redisClient *c) {
6020 zrankGenericCommand(c, 0);
6021 }
6022
6023 static void zrevrankCommand(redisClient *c) {
6024 zrankGenericCommand(c, 1);
6025 }
6026
6027 /* =================================== Hashes =============================== */
6028 static void hsetCommand(redisClient *c) {
6029 int update = 0;
6030 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6031
6032 if (o == NULL) {
6033 o = createHashObject();
6034 dictAdd(c->db->dict,c->argv[1],o);
6035 incrRefCount(c->argv[1]);
6036 } else {
6037 if (o->type != REDIS_HASH) {
6038 addReply(c,shared.wrongtypeerr);
6039 return;
6040 }
6041 }
6042 /* We want to convert the zipmap into an hash table right now if the
6043 * entry to be added is too big. Note that we check if the object
6044 * is integer encoded before to try fetching the length in the test below.
6045 * This is because integers are small, but currently stringObjectLen()
6046 * performs a slow conversion: not worth it. */
6047 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
6048 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
6049 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
6050 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
6051 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
6052 {
6053 convertToRealHash(o);
6054 }
6055
6056 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6057 unsigned char *zm = o->ptr;
6058 robj *valobj = getDecodedObject(c->argv[3]);
6059
6060 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6061 valobj->ptr,sdslen(valobj->ptr),&update);
6062 decrRefCount(valobj);
6063 o->ptr = zm;
6064
6065 /* And here there is the second check for hash conversion. */
6066 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6067 convertToRealHash(o);
6068 } else {
6069 c->argv[2] = tryObjectEncoding(c->argv[2]);
6070 /* note that c->argv[3] is already encoded, as the latest arg
6071 * of a bulk command is always integer encoded if possible. */
6072 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
6073 incrRefCount(c->argv[2]);
6074 } else {
6075 update = 1;
6076 }
6077 incrRefCount(c->argv[3]);
6078 }
6079 server.dirty++;
6080 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6081 }
6082
6083 static void hmsetCommand(redisClient *c) {
6084 int i;
6085 robj *o, *key, *val;
6086
6087 if ((c->argc % 2) == 1) {
6088 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6089 return;
6090 }
6091
6092 if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
6093 o = createHashObject();
6094 dictAdd(c->db->dict,c->argv[1],o);
6095 incrRefCount(c->argv[1]);
6096 } else {
6097 if (o->type != REDIS_HASH) {
6098 addReply(c,shared.wrongtypeerr);
6099 return;
6100 }
6101 }
6102
6103 /* We want to convert the zipmap into an hash table right now if the
6104 * entry to be added is too big. */
6105 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6106 for (i = 2; i < c->argc; i+=2) {
6107 if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
6108 sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
6109 (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
6110 sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
6111 convertToRealHash(o);
6112 break;
6113 }
6114 }
6115 }
6116
6117 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6118 unsigned char *zm = o->ptr;
6119
6120 for (i = 2; i < c->argc; i+=2) {
6121 key = getDecodedObject(c->argv[i]);
6122 val = getDecodedObject(c->argv[i+1]);
6123 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
6124 val->ptr,sdslen(val->ptr),NULL);
6125 decrRefCount(key);
6126 decrRefCount(val);
6127 o->ptr = zm;
6128 }
6129
6130 /* And here there is the second check for hash conversion. */
6131 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6132 convertToRealHash(o);
6133 } else {
6134 for (i = 2; i < c->argc; i+=2) {
6135 key = tryObjectEncoding(c->argv[i]);
6136 val = tryObjectEncoding(c->argv[i+1]);
6137 if (dictReplace(o->ptr,key,val)) {
6138 incrRefCount(key);
6139 }
6140 incrRefCount(val);
6141 }
6142 }
6143
6144 addReply(c, shared.ok);
6145 }
6146
6147 static void hincrbyCommand(redisClient *c) {
6148 long long value = 0, incr = 0;
6149 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6150
6151 if (o == NULL) {
6152 o = createHashObject();
6153 dictAdd(c->db->dict,c->argv[1],o);
6154 incrRefCount(c->argv[1]);
6155 } else {
6156 if (o->type != REDIS_HASH) {
6157 addReply(c,shared.wrongtypeerr);
6158 return;
6159 }
6160 }
6161
6162 if (getLongLongFromObject(c, c->argv[3], &incr) != REDIS_OK) return;
6163
6164 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6165 unsigned char *zm = o->ptr;
6166 unsigned char *zval;
6167 unsigned int zvlen;
6168
6169 /* Find value if already present in hash */
6170 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6171 &zval,&zvlen)) {
6172 /* strtoll needs the char* to have a trailing \0, but
6173 * the zipmap doesn't include them. */
6174 sds szval = sdsnewlen(zval, zvlen);
6175 value = strtoll(szval,NULL,10);
6176 sdsfree(szval);
6177 }
6178
6179 value += incr;
6180 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6181 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6182 (unsigned char*)svalue,sdslen(svalue),NULL);
6183 sdsfree(svalue);
6184 o->ptr = zm;
6185
6186 /* Check if the zipmap needs to be converted. */
6187 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6188 convertToRealHash(o);
6189 } else {
6190 robj *hval;
6191 dictEntry *de;
6192
6193 /* Find value if already present in hash */
6194 de = dictFind(o->ptr,c->argv[2]);
6195 if (de != NULL) {
6196 hval = dictGetEntryVal(de);
6197 if (hval->encoding == REDIS_ENCODING_RAW)
6198 value = strtoll(hval->ptr,NULL,10);
6199 else if (hval->encoding == REDIS_ENCODING_INT)
6200 value = (long)hval->ptr;
6201 else
6202 redisAssert(1 != 1);
6203 }
6204
6205 value += incr;
6206 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6207 hval = tryObjectEncoding(hval);
6208 if (dictReplace(o->ptr,c->argv[2],hval)) {
6209 incrRefCount(c->argv[2]);
6210 }
6211 }
6212
6213 server.dirty++;
6214 addReplyLongLong(c, value);
6215 }
6216
6217 static void hgetCommand(redisClient *c) {
6218 robj *o;
6219
6220 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6221 checkType(c,o,REDIS_HASH)) return;
6222
6223 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6224 unsigned char *zm = o->ptr;
6225 unsigned char *val;
6226 unsigned int vlen;
6227 robj *field;
6228
6229 field = getDecodedObject(c->argv[2]);
6230 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6231 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6232 addReplySds(c,sdsnewlen(val,vlen));
6233 addReply(c,shared.crlf);
6234 decrRefCount(field);
6235 return;
6236 } else {
6237 addReply(c,shared.nullbulk);
6238 decrRefCount(field);
6239 return;
6240 }
6241 } else {
6242 struct dictEntry *de;
6243
6244 de = dictFind(o->ptr,c->argv[2]);
6245 if (de == NULL) {
6246 addReply(c,shared.nullbulk);
6247 } else {
6248 robj *e = dictGetEntryVal(de);
6249
6250 addReplyBulk(c,e);
6251 }
6252 }
6253 }
6254
6255 static void hmgetCommand(redisClient *c) {
6256 int i;
6257
6258 robj *o = lookupKeyRead(c->db, c->argv[1]);
6259 if (o == NULL) {
6260 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6261 for (i = 2; i < c->argc; i++) {
6262 addReply(c,shared.nullbulk);
6263 }
6264 return;
6265 } else {
6266 if (o->type != REDIS_HASH) {
6267 addReply(c,shared.wrongtypeerr);
6268 return;
6269 }
6270 }
6271
6272 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6273 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6274 unsigned char *zm = o->ptr;
6275 unsigned char *v;
6276 unsigned int vlen;
6277 robj *field;
6278
6279 for (i = 2; i < c->argc; i++) {
6280 field = getDecodedObject(c->argv[i]);
6281 if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&v,&vlen)) {
6282 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6283 addReplySds(c,sdsnewlen(v,vlen));
6284 addReply(c,shared.crlf);
6285 } else {
6286 addReply(c,shared.nullbulk);
6287 }
6288 decrRefCount(field);
6289 }
6290 } else {
6291 dictEntry *de;
6292
6293 for (i = 2; i < c->argc; i++) {
6294 de = dictFind(o->ptr,c->argv[i]);
6295 if (de != NULL) {
6296 addReplyBulk(c,(robj*)dictGetEntryVal(de));
6297 } else {
6298 addReply(c,shared.nullbulk);
6299 }
6300 }
6301 }
6302 }
6303
6304 static void hdelCommand(redisClient *c) {
6305 robj *o;
6306 int deleted = 0;
6307
6308 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6309 checkType(c,o,REDIS_HASH)) return;
6310
6311 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6312 robj *field = getDecodedObject(c->argv[2]);
6313
6314 o->ptr = zipmapDel((unsigned char*) o->ptr,
6315 (unsigned char*) field->ptr,
6316 sdslen(field->ptr), &deleted);
6317 decrRefCount(field);
6318 if (zipmapLen((unsigned char*) o->ptr) == 0)
6319 deleteKey(c->db,c->argv[1]);
6320 } else {
6321 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6322 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6323 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6324 }
6325 if (deleted) server.dirty++;
6326 addReply(c,deleted ? shared.cone : shared.czero);
6327 }
6328
6329 static void hlenCommand(redisClient *c) {
6330 robj *o;
6331 unsigned long len;
6332
6333 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6334 checkType(c,o,REDIS_HASH)) return;
6335
6336 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6337 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6338 addReplyUlong(c,len);
6339 }
6340
6341 #define REDIS_GETALL_KEYS 1
6342 #define REDIS_GETALL_VALS 2
6343 static void genericHgetallCommand(redisClient *c, int flags) {
6344 robj *o, *lenobj;
6345 unsigned long count = 0;
6346
6347 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6348 || checkType(c,o,REDIS_HASH)) return;
6349
6350 lenobj = createObject(REDIS_STRING,NULL);
6351 addReply(c,lenobj);
6352 decrRefCount(lenobj);
6353
6354 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6355 unsigned char *p = zipmapRewind(o->ptr);
6356 unsigned char *field, *val;
6357 unsigned int flen, vlen;
6358
6359 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6360 robj *aux;
6361
6362 if (flags & REDIS_GETALL_KEYS) {
6363 aux = createStringObject((char*)field,flen);
6364 addReplyBulk(c,aux);
6365 decrRefCount(aux);
6366 count++;
6367 }
6368 if (flags & REDIS_GETALL_VALS) {
6369 aux = createStringObject((char*)val,vlen);
6370 addReplyBulk(c,aux);
6371 decrRefCount(aux);
6372 count++;
6373 }
6374 }
6375 } else {
6376 dictIterator *di = dictGetIterator(o->ptr);
6377 dictEntry *de;
6378
6379 while((de = dictNext(di)) != NULL) {
6380 robj *fieldobj = dictGetEntryKey(de);
6381 robj *valobj = dictGetEntryVal(de);
6382
6383 if (flags & REDIS_GETALL_KEYS) {
6384 addReplyBulk(c,fieldobj);
6385 count++;
6386 }
6387 if (flags & REDIS_GETALL_VALS) {
6388 addReplyBulk(c,valobj);
6389 count++;
6390 }
6391 }
6392 dictReleaseIterator(di);
6393 }
6394 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6395 }
6396
6397 static void hkeysCommand(redisClient *c) {
6398 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6399 }
6400
6401 static void hvalsCommand(redisClient *c) {
6402 genericHgetallCommand(c,REDIS_GETALL_VALS);
6403 }
6404
6405 static void hgetallCommand(redisClient *c) {
6406 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6407 }
6408
6409 static void hexistsCommand(redisClient *c) {
6410 robj *o;
6411 int exists = 0;
6412
6413 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6414 checkType(c,o,REDIS_HASH)) return;
6415
6416 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6417 robj *field;
6418 unsigned char *zm = o->ptr;
6419
6420 field = getDecodedObject(c->argv[2]);
6421 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6422 decrRefCount(field);
6423 } else {
6424 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6425 }
6426 addReply(c,exists ? shared.cone : shared.czero);
6427 }
6428
6429 static void convertToRealHash(robj *o) {
6430 unsigned char *key, *val, *p, *zm = o->ptr;
6431 unsigned int klen, vlen;
6432 dict *dict = dictCreate(&hashDictType,NULL);
6433
6434 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6435 p = zipmapRewind(zm);
6436 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6437 robj *keyobj, *valobj;
6438
6439 keyobj = createStringObject((char*)key,klen);
6440 valobj = createStringObject((char*)val,vlen);
6441 keyobj = tryObjectEncoding(keyobj);
6442 valobj = tryObjectEncoding(valobj);
6443 dictAdd(dict,keyobj,valobj);
6444 }
6445 o->encoding = REDIS_ENCODING_HT;
6446 o->ptr = dict;
6447 zfree(zm);
6448 }
6449
6450 /* ========================= Non type-specific commands ==================== */
6451
6452 static void flushdbCommand(redisClient *c) {
6453 server.dirty += dictSize(c->db->dict);
6454 dictEmpty(c->db->dict);
6455 dictEmpty(c->db->expires);
6456 addReply(c,shared.ok);
6457 }
6458
6459 static void flushallCommand(redisClient *c) {
6460 server.dirty += emptyDb();
6461 addReply(c,shared.ok);
6462 if (server.bgsavechildpid != -1) {
6463 kill(server.bgsavechildpid,SIGKILL);
6464 rdbRemoveTempFile(server.bgsavechildpid);
6465 }
6466 rdbSave(server.dbfilename);
6467 server.dirty++;
6468 }
6469
6470 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6471 redisSortOperation *so = zmalloc(sizeof(*so));
6472 so->type = type;
6473 so->pattern = pattern;
6474 return so;
6475 }
6476
6477 /* Return the value associated to the key with a name obtained
6478 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6479 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6480 char *p;
6481 sds spat, ssub;
6482 robj keyobj;
6483 int prefixlen, sublen, postfixlen;
6484 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6485 struct {
6486 long len;
6487 long free;
6488 char buf[REDIS_SORTKEY_MAX+1];
6489 } keyname;
6490
6491 /* If the pattern is "#" return the substitution object itself in order
6492 * to implement the "SORT ... GET #" feature. */
6493 spat = pattern->ptr;
6494 if (spat[0] == '#' && spat[1] == '\0') {
6495 return subst;
6496 }
6497
6498 /* The substitution object may be specially encoded. If so we create
6499 * a decoded object on the fly. Otherwise getDecodedObject will just
6500 * increment the ref count, that we'll decrement later. */
6501 subst = getDecodedObject(subst);
6502
6503 ssub = subst->ptr;
6504 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6505 p = strchr(spat,'*');
6506 if (!p) {
6507 decrRefCount(subst);
6508 return NULL;
6509 }
6510
6511 prefixlen = p-spat;
6512 sublen = sdslen(ssub);
6513 postfixlen = sdslen(spat)-(prefixlen+1);
6514 memcpy(keyname.buf,spat,prefixlen);
6515 memcpy(keyname.buf+prefixlen,ssub,sublen);
6516 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6517 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6518 keyname.len = prefixlen+sublen+postfixlen;
6519
6520 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6521 decrRefCount(subst);
6522
6523 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6524 return lookupKeyRead(db,&keyobj);
6525 }
6526
6527 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6528 * the additional parameter is not standard but a BSD-specific we have to
6529 * pass sorting parameters via the global 'server' structure */
6530 static int sortCompare(const void *s1, const void *s2) {
6531 const redisSortObject *so1 = s1, *so2 = s2;
6532 int cmp;
6533
6534 if (!server.sort_alpha) {
6535 /* Numeric sorting. Here it's trivial as we precomputed scores */
6536 if (so1->u.score > so2->u.score) {
6537 cmp = 1;
6538 } else if (so1->u.score < so2->u.score) {
6539 cmp = -1;
6540 } else {
6541 cmp = 0;
6542 }
6543 } else {
6544 /* Alphanumeric sorting */
6545 if (server.sort_bypattern) {
6546 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6547 /* At least one compare object is NULL */
6548 if (so1->u.cmpobj == so2->u.cmpobj)
6549 cmp = 0;
6550 else if (so1->u.cmpobj == NULL)
6551 cmp = -1;
6552 else
6553 cmp = 1;
6554 } else {
6555 /* We have both the objects, use strcoll */
6556 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6557 }
6558 } else {
6559 /* Compare elements directly */
6560 robj *dec1, *dec2;
6561
6562 dec1 = getDecodedObject(so1->obj);
6563 dec2 = getDecodedObject(so2->obj);
6564 cmp = strcoll(dec1->ptr,dec2->ptr);
6565 decrRefCount(dec1);
6566 decrRefCount(dec2);
6567 }
6568 }
6569 return server.sort_desc ? -cmp : cmp;
6570 }
6571
6572 /* The SORT command is the most complex command in Redis. Warning: this code
6573 * is optimized for speed and a bit less for readability */
6574 static void sortCommand(redisClient *c) {
6575 list *operations;
6576 int outputlen = 0;
6577 int desc = 0, alpha = 0;
6578 int limit_start = 0, limit_count = -1, start, end;
6579 int j, dontsort = 0, vectorlen;
6580 int getop = 0; /* GET operation counter */
6581 robj *sortval, *sortby = NULL, *storekey = NULL;
6582 redisSortObject *vector; /* Resulting vector to sort */
6583
6584 /* Lookup the key to sort. It must be of the right types */
6585 sortval = lookupKeyRead(c->db,c->argv[1]);
6586 if (sortval == NULL) {
6587 addReply(c,shared.emptymultibulk);
6588 return;
6589 }
6590 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6591 sortval->type != REDIS_ZSET)
6592 {
6593 addReply(c,shared.wrongtypeerr);
6594 return;
6595 }
6596
6597 /* Create a list of operations to perform for every sorted element.
6598 * Operations can be GET/DEL/INCR/DECR */
6599 operations = listCreate();
6600 listSetFreeMethod(operations,zfree);
6601 j = 2;
6602
6603 /* Now we need to protect sortval incrementing its count, in the future
6604 * SORT may have options able to overwrite/delete keys during the sorting
6605 * and the sorted key itself may get destroied */
6606 incrRefCount(sortval);
6607
6608 /* The SORT command has an SQL-alike syntax, parse it */
6609 while(j < c->argc) {
6610 int leftargs = c->argc-j-1;
6611 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6612 desc = 0;
6613 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6614 desc = 1;
6615 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6616 alpha = 1;
6617 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6618 limit_start = atoi(c->argv[j+1]->ptr);
6619 limit_count = atoi(c->argv[j+2]->ptr);
6620 j+=2;
6621 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6622 storekey = c->argv[j+1];
6623 j++;
6624 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6625 sortby = c->argv[j+1];
6626 /* If the BY pattern does not contain '*', i.e. it is constant,
6627 * we don't need to sort nor to lookup the weight keys. */
6628 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6629 j++;
6630 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6631 listAddNodeTail(operations,createSortOperation(
6632 REDIS_SORT_GET,c->argv[j+1]));
6633 getop++;
6634 j++;
6635 } else {
6636 decrRefCount(sortval);
6637 listRelease(operations);
6638 addReply(c,shared.syntaxerr);
6639 return;
6640 }
6641 j++;
6642 }
6643
6644 /* Load the sorting vector with all the objects to sort */
6645 switch(sortval->type) {
6646 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6647 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6648 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6649 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6650 }
6651 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6652 j = 0;
6653
6654 if (sortval->type == REDIS_LIST) {
6655 list *list = sortval->ptr;
6656 listNode *ln;
6657 listIter li;
6658
6659 listRewind(list,&li);
6660 while((ln = listNext(&li))) {
6661 robj *ele = ln->value;
6662 vector[j].obj = ele;
6663 vector[j].u.score = 0;
6664 vector[j].u.cmpobj = NULL;
6665 j++;
6666 }
6667 } else {
6668 dict *set;
6669 dictIterator *di;
6670 dictEntry *setele;
6671
6672 if (sortval->type == REDIS_SET) {
6673 set = sortval->ptr;
6674 } else {
6675 zset *zs = sortval->ptr;
6676 set = zs->dict;
6677 }
6678
6679 di = dictGetIterator(set);
6680 while((setele = dictNext(di)) != NULL) {
6681 vector[j].obj = dictGetEntryKey(setele);
6682 vector[j].u.score = 0;
6683 vector[j].u.cmpobj = NULL;
6684 j++;
6685 }
6686 dictReleaseIterator(di);
6687 }
6688 redisAssert(j == vectorlen);
6689
6690 /* Now it's time to load the right scores in the sorting vector */
6691 if (dontsort == 0) {
6692 for (j = 0; j < vectorlen; j++) {
6693 if (sortby) {
6694 robj *byval;
6695
6696 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6697 if (!byval || byval->type != REDIS_STRING) continue;
6698 if (alpha) {
6699 vector[j].u.cmpobj = getDecodedObject(byval);
6700 } else {
6701 if (byval->encoding == REDIS_ENCODING_RAW) {
6702 vector[j].u.score = strtod(byval->ptr,NULL);
6703 } else {
6704 /* Don't need to decode the object if it's
6705 * integer-encoded (the only encoding supported) so
6706 * far. We can just cast it */
6707 if (byval->encoding == REDIS_ENCODING_INT) {
6708 vector[j].u.score = (long)byval->ptr;
6709 } else
6710 redisAssert(1 != 1);
6711 }
6712 }
6713 } else {
6714 if (!alpha) {
6715 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6716 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6717 else {
6718 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6719 vector[j].u.score = (long) vector[j].obj->ptr;
6720 else
6721 redisAssert(1 != 1);
6722 }
6723 }
6724 }
6725 }
6726 }
6727
6728 /* We are ready to sort the vector... perform a bit of sanity check
6729 * on the LIMIT option too. We'll use a partial version of quicksort. */
6730 start = (limit_start < 0) ? 0 : limit_start;
6731 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6732 if (start >= vectorlen) {
6733 start = vectorlen-1;
6734 end = vectorlen-2;
6735 }
6736 if (end >= vectorlen) end = vectorlen-1;
6737
6738 if (dontsort == 0) {
6739 server.sort_desc = desc;
6740 server.sort_alpha = alpha;
6741 server.sort_bypattern = sortby ? 1 : 0;
6742 if (sortby && (start != 0 || end != vectorlen-1))
6743 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6744 else
6745 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6746 }
6747
6748 /* Send command output to the output buffer, performing the specified
6749 * GET/DEL/INCR/DECR operations if any. */
6750 outputlen = getop ? getop*(end-start+1) : end-start+1;
6751 if (storekey == NULL) {
6752 /* STORE option not specified, sent the sorting result to client */
6753 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6754 for (j = start; j <= end; j++) {
6755 listNode *ln;
6756 listIter li;
6757
6758 if (!getop) addReplyBulk(c,vector[j].obj);
6759 listRewind(operations,&li);
6760 while((ln = listNext(&li))) {
6761 redisSortOperation *sop = ln->value;
6762 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6763 vector[j].obj);
6764
6765 if (sop->type == REDIS_SORT_GET) {
6766 if (!val || val->type != REDIS_STRING) {
6767 addReply(c,shared.nullbulk);
6768 } else {
6769 addReplyBulk(c,val);
6770 }
6771 } else {
6772 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6773 }
6774 }
6775 }
6776 } else {
6777 robj *listObject = createListObject();
6778 list *listPtr = (list*) listObject->ptr;
6779
6780 /* STORE option specified, set the sorting result as a List object */
6781 for (j = start; j <= end; j++) {
6782 listNode *ln;
6783 listIter li;
6784
6785 if (!getop) {
6786 listAddNodeTail(listPtr,vector[j].obj);
6787 incrRefCount(vector[j].obj);
6788 }
6789 listRewind(operations,&li);
6790 while((ln = listNext(&li))) {
6791 redisSortOperation *sop = ln->value;
6792 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6793 vector[j].obj);
6794
6795 if (sop->type == REDIS_SORT_GET) {
6796 if (!val || val->type != REDIS_STRING) {
6797 listAddNodeTail(listPtr,createStringObject("",0));
6798 } else {
6799 listAddNodeTail(listPtr,val);
6800 incrRefCount(val);
6801 }
6802 } else {
6803 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6804 }
6805 }
6806 }
6807 if (dictReplace(c->db->dict,storekey,listObject)) {
6808 incrRefCount(storekey);
6809 }
6810 /* Note: we add 1 because the DB is dirty anyway since even if the
6811 * SORT result is empty a new key is set and maybe the old content
6812 * replaced. */
6813 server.dirty += 1+outputlen;
6814 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6815 }
6816
6817 /* Cleanup */
6818 decrRefCount(sortval);
6819 listRelease(operations);
6820 for (j = 0; j < vectorlen; j++) {
6821 if (sortby && alpha && vector[j].u.cmpobj)
6822 decrRefCount(vector[j].u.cmpobj);
6823 }
6824 zfree(vector);
6825 }
6826
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and must be large enough for the result
 * (the caller in this file passes a 64 byte buffer). Amounts below 1024
 * are printed as an integer number of bytes; larger amounts are printed
 * with two decimals using the K, M, G or T suffix (powers of 1024).
 *
 * Everything of one terabyte or more is reported in T, so 's' is always
 * written whatever the value of 'n' is. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Catch-all branch: never leave the output buffer untouched. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6847
6848 /* Create the string returned by the INFO command. This is decoupled
6849 * by the INFO command itself as we need to report the same information
6850 * on memory corruption problems. */
6851 static sds genRedisInfoString(void) {
6852 sds info;
6853 time_t uptime = time(NULL)-server.stat_starttime;
6854 int j;
6855 char hmem[64];
6856
6857 bytesToHuman(hmem,zmalloc_used_memory());
6858 info = sdscatprintf(sdsempty(),
6859 "redis_version:%s\r\n"
6860 "arch_bits:%s\r\n"
6861 "multiplexing_api:%s\r\n"
6862 "process_id:%ld\r\n"
6863 "uptime_in_seconds:%ld\r\n"
6864 "uptime_in_days:%ld\r\n"
6865 "connected_clients:%d\r\n"
6866 "connected_slaves:%d\r\n"
6867 "blocked_clients:%d\r\n"
6868 "used_memory:%zu\r\n"
6869 "used_memory_human:%s\r\n"
6870 "changes_since_last_save:%lld\r\n"
6871 "bgsave_in_progress:%d\r\n"
6872 "last_save_time:%ld\r\n"
6873 "bgrewriteaof_in_progress:%d\r\n"
6874 "total_connections_received:%lld\r\n"
6875 "total_commands_processed:%lld\r\n"
6876 "expired_keys:%lld\r\n"
6877 "hash_max_zipmap_entries:%ld\r\n"
6878 "hash_max_zipmap_value:%ld\r\n"
6879 "pubsub_channels:%ld\r\n"
6880 "pubsub_patterns:%u\r\n"
6881 "vm_enabled:%d\r\n"
6882 "role:%s\r\n"
6883 ,REDIS_VERSION,
6884 (sizeof(long) == 8) ? "64" : "32",
6885 aeGetApiName(),
6886 (long) getpid(),
6887 uptime,
6888 uptime/(3600*24),
6889 listLength(server.clients)-listLength(server.slaves),
6890 listLength(server.slaves),
6891 server.blpop_blocked_clients,
6892 zmalloc_used_memory(),
6893 hmem,
6894 server.dirty,
6895 server.bgsavechildpid != -1,
6896 server.lastsave,
6897 server.bgrewritechildpid != -1,
6898 server.stat_numconnections,
6899 server.stat_numcommands,
6900 server.stat_expiredkeys,
6901 server.hash_max_zipmap_entries,
6902 server.hash_max_zipmap_value,
6903 dictSize(server.pubsub_channels),
6904 listLength(server.pubsub_patterns),
6905 server.vm_enabled != 0,
6906 server.masterhost == NULL ? "master" : "slave"
6907 );
6908 if (server.masterhost) {
6909 info = sdscatprintf(info,
6910 "master_host:%s\r\n"
6911 "master_port:%d\r\n"
6912 "master_link_status:%s\r\n"
6913 "master_last_io_seconds_ago:%d\r\n"
6914 ,server.masterhost,
6915 server.masterport,
6916 (server.replstate == REDIS_REPL_CONNECTED) ?
6917 "up" : "down",
6918 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6919 );
6920 }
6921 if (server.vm_enabled) {
6922 lockThreadedIO();
6923 info = sdscatprintf(info,
6924 "vm_conf_max_memory:%llu\r\n"
6925 "vm_conf_page_size:%llu\r\n"
6926 "vm_conf_pages:%llu\r\n"
6927 "vm_stats_used_pages:%llu\r\n"
6928 "vm_stats_swapped_objects:%llu\r\n"
6929 "vm_stats_swappin_count:%llu\r\n"
6930 "vm_stats_swappout_count:%llu\r\n"
6931 "vm_stats_io_newjobs_len:%lu\r\n"
6932 "vm_stats_io_processing_len:%lu\r\n"
6933 "vm_stats_io_processed_len:%lu\r\n"
6934 "vm_stats_io_active_threads:%lu\r\n"
6935 "vm_stats_blocked_clients:%lu\r\n"
6936 ,(unsigned long long) server.vm_max_memory,
6937 (unsigned long long) server.vm_page_size,
6938 (unsigned long long) server.vm_pages,
6939 (unsigned long long) server.vm_stats_used_pages,
6940 (unsigned long long) server.vm_stats_swapped_objects,
6941 (unsigned long long) server.vm_stats_swapins,
6942 (unsigned long long) server.vm_stats_swapouts,
6943 (unsigned long) listLength(server.io_newjobs),
6944 (unsigned long) listLength(server.io_processing),
6945 (unsigned long) listLength(server.io_processed),
6946 (unsigned long) server.io_active_threads,
6947 (unsigned long) server.vm_blocked_clients
6948 );
6949 unlockThreadedIO();
6950 }
6951 for (j = 0; j < server.dbnum; j++) {
6952 long long keys, vkeys;
6953
6954 keys = dictSize(server.db[j].dict);
6955 vkeys = dictSize(server.db[j].expires);
6956 if (keys || vkeys) {
6957 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6958 j, keys, vkeys);
6959 }
6960 }
6961 return info;
6962 }
6963
6964 static void infoCommand(redisClient *c) {
6965 sds info = genRedisInfoString();
6966 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6967 (unsigned long)sdslen(info)));
6968 addReplySds(c,info);
6969 addReply(c,shared.crlf);
6970 }
6971
6972 static void monitorCommand(redisClient *c) {
6973 /* ignore MONITOR if aleady slave or in monitor mode */
6974 if (c->flags & REDIS_SLAVE) return;
6975
6976 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6977 c->slaveseldb = 0;
6978 listAddNodeTail(server.monitors,c);
6979 addReply(c,shared.ok);
6980 }
6981
6982 /* ================================= Expire ================================= */
6983 static int removeExpire(redisDb *db, robj *key) {
6984 if (dictDelete(db->expires,key) == DICT_OK) {
6985 return 1;
6986 } else {
6987 return 0;
6988 }
6989 }
6990
6991 static int setExpire(redisDb *db, robj *key, time_t when) {
6992 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6993 return 0;
6994 } else {
6995 incrRefCount(key);
6996 return 1;
6997 }
6998 }
6999
7000 /* Return the expire time of the specified key, or -1 if no expire
7001 * is associated with this key (i.e. the key is non volatile) */
7002 static time_t getExpire(redisDb *db, robj *key) {
7003 dictEntry *de;
7004
7005 /* No expire? return ASAP */
7006 if (dictSize(db->expires) == 0 ||
7007 (de = dictFind(db->expires,key)) == NULL) return -1;
7008
7009 return (time_t) dictGetEntryVal(de);
7010 }
7011
7012 static int expireIfNeeded(redisDb *db, robj *key) {
7013 time_t when;
7014 dictEntry *de;
7015
7016 /* No expire? return ASAP */
7017 if (dictSize(db->expires) == 0 ||
7018 (de = dictFind(db->expires,key)) == NULL) return 0;
7019
7020 /* Lookup the expire */
7021 when = (time_t) dictGetEntryVal(de);
7022 if (time(NULL) <= when) return 0;
7023
7024 /* Delete the key */
7025 dictDelete(db->expires,key);
7026 server.stat_expiredkeys++;
7027 return dictDelete(db->dict,key) == DICT_OK;
7028 }
7029
7030 static int deleteIfVolatile(redisDb *db, robj *key) {
7031 dictEntry *de;
7032
7033 /* No expire? return ASAP */
7034 if (dictSize(db->expires) == 0 ||
7035 (de = dictFind(db->expires,key)) == NULL) return 0;
7036
7037 /* Delete the key */
7038 server.dirty++;
7039 server.stat_expiredkeys++;
7040 dictDelete(db->expires,key);
7041 return dictDelete(db->dict,key) == DICT_OK;
7042 }
7043
7044 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7045 dictEntry *de;
7046 time_t seconds;
7047
7048 if (getLongFromObject(c, param, &seconds) != REDIS_OK) return;
7049
7050 seconds -= offset;
7051
7052 de = dictFind(c->db->dict,key);
7053 if (de == NULL) {
7054 addReply(c,shared.czero);
7055 return;
7056 }
7057 if (seconds < 0) {
7058 if (deleteKey(c->db,key)) server.dirty++;
7059 addReply(c, shared.cone);
7060 return;
7061 } else {
7062 time_t when = time(NULL)+seconds;
7063 if (setExpire(c->db,key,when)) {
7064 addReply(c,shared.cone);
7065 server.dirty++;
7066 } else {
7067 addReply(c,shared.czero);
7068 }
7069 return;
7070 }
7071 }
7072
7073 static void expireCommand(redisClient *c) {
7074 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7075 }
7076
7077 static void expireatCommand(redisClient *c) {
7078 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7079 }
7080
7081 static void ttlCommand(redisClient *c) {
7082 time_t expire;
7083 int ttl = -1;
7084
7085 expire = getExpire(c->db,c->argv[1]);
7086 if (expire != -1) {
7087 ttl = (int) (expire-time(NULL));
7088 if (ttl < 0) ttl = -1;
7089 }
7090 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7091 }
7092
7093 /* ================================ MULTI/EXEC ============================== */
7094
7095 /* Client state initialization for MULTI/EXEC */
7096 static void initClientMultiState(redisClient *c) {
7097 c->mstate.commands = NULL;
7098 c->mstate.count = 0;
7099 }
7100
7101 /* Release all the resources associated with MULTI/EXEC state */
7102 static void freeClientMultiState(redisClient *c) {
7103 int j;
7104
7105 for (j = 0; j < c->mstate.count; j++) {
7106 int i;
7107 multiCmd *mc = c->mstate.commands+j;
7108
7109 for (i = 0; i < mc->argc; i++)
7110 decrRefCount(mc->argv[i]);
7111 zfree(mc->argv);
7112 }
7113 zfree(c->mstate.commands);
7114 }
7115
7116 /* Add a new command into the MULTI commands queue */
7117 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7118 multiCmd *mc;
7119 int j;
7120
7121 c->mstate.commands = zrealloc(c->mstate.commands,
7122 sizeof(multiCmd)*(c->mstate.count+1));
7123 mc = c->mstate.commands+c->mstate.count;
7124 mc->cmd = cmd;
7125 mc->argc = c->argc;
7126 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7127 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7128 for (j = 0; j < c->argc; j++)
7129 incrRefCount(mc->argv[j]);
7130 c->mstate.count++;
7131 }
7132
7133 static void multiCommand(redisClient *c) {
7134 c->flags |= REDIS_MULTI;
7135 addReply(c,shared.ok);
7136 }
7137
7138 static void discardCommand(redisClient *c) {
7139 if (!(c->flags & REDIS_MULTI)) {
7140 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7141 return;
7142 }
7143
7144 freeClientMultiState(c);
7145 initClientMultiState(c);
7146 c->flags &= (~REDIS_MULTI);
7147 addReply(c,shared.ok);
7148 }
7149
7150 static void execCommand(redisClient *c) {
7151 int j;
7152 robj **orig_argv;
7153 int orig_argc;
7154
7155 if (!(c->flags & REDIS_MULTI)) {
7156 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7157 return;
7158 }
7159
7160 orig_argv = c->argv;
7161 orig_argc = c->argc;
7162 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7163 for (j = 0; j < c->mstate.count; j++) {
7164 c->argc = c->mstate.commands[j].argc;
7165 c->argv = c->mstate.commands[j].argv;
7166 call(c,c->mstate.commands[j].cmd);
7167 }
7168 c->argv = orig_argv;
7169 c->argc = orig_argc;
7170 freeClientMultiState(c);
7171 initClientMultiState(c);
7172 c->flags &= (~REDIS_MULTI);
7173 }
7174
7175 /* =========================== Blocking Operations ========================= */
7176
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7205
7206 /* Set a client in blocking mode for the specified key, with the specified
7207 * timeout */
7208 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7209 dictEntry *de;
7210 list *l;
7211 int j;
7212
7213 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7214 c->blockingkeysnum = numkeys;
7215 c->blockingto = timeout;
7216 for (j = 0; j < numkeys; j++) {
7217 /* Add the key in the client structure, to map clients -> keys */
7218 c->blockingkeys[j] = keys[j];
7219 incrRefCount(keys[j]);
7220
7221 /* And in the other "side", to map keys -> clients */
7222 de = dictFind(c->db->blockingkeys,keys[j]);
7223 if (de == NULL) {
7224 int retval;
7225
7226 /* For every key we take a list of clients blocked for it */
7227 l = listCreate();
7228 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7229 incrRefCount(keys[j]);
7230 assert(retval == DICT_OK);
7231 } else {
7232 l = dictGetEntryVal(de);
7233 }
7234 listAddNodeTail(l,c);
7235 }
7236 /* Mark the client as a blocked client */
7237 c->flags |= REDIS_BLOCKED;
7238 server.blpop_blocked_clients++;
7239 }
7240
7241 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7242 static void unblockClientWaitingData(redisClient *c) {
7243 dictEntry *de;
7244 list *l;
7245 int j;
7246
7247 assert(c->blockingkeys != NULL);
7248 /* The client may wait for multiple keys, so unblock it for every key. */
7249 for (j = 0; j < c->blockingkeysnum; j++) {
7250 /* Remove this client from the list of clients waiting for this key. */
7251 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7252 assert(de != NULL);
7253 l = dictGetEntryVal(de);
7254 listDelNode(l,listSearchKey(l,c));
7255 /* If the list is empty we need to remove it to avoid wasting memory */
7256 if (listLength(l) == 0)
7257 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7258 decrRefCount(c->blockingkeys[j]);
7259 }
7260 /* Cleanup the client structure */
7261 zfree(c->blockingkeys);
7262 c->blockingkeys = NULL;
7263 c->flags &= (~REDIS_BLOCKED);
7264 server.blpop_blocked_clients--;
7265 /* We want to process data if there is some command waiting
7266 * in the input buffer. Note that this is safe even if
7267 * unblockClientWaitingData() gets called from freeClient() because
7268 * freeClient() will be smart enough to call this function
7269 * *after* c->querybuf was set to NULL. */
7270 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7271 }
7272
7273 /* This should be called from any function PUSHing into lists.
7274 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7275 * 'ele' is the element pushed.
7276 *
7277 * If the function returns 0 there was no client waiting for a list push
7278 * against this key.
7279 *
7280 * If the function returns 1 there was a client waiting for a list push
7281 * against this key, the element was passed to this client thus it's not
7282 * needed to actually add it to the list and the caller should return asap. */
7283 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7284 struct dictEntry *de;
7285 redisClient *receiver;
7286 list *l;
7287 listNode *ln;
7288
7289 de = dictFind(c->db->blockingkeys,key);
7290 if (de == NULL) return 0;
7291 l = dictGetEntryVal(de);
7292 ln = listFirst(l);
7293 assert(ln != NULL);
7294 receiver = ln->value;
7295
7296 addReplySds(receiver,sdsnew("*2\r\n"));
7297 addReplyBulk(receiver,key);
7298 addReplyBulk(receiver,ele);
7299 unblockClientWaitingData(receiver);
7300 return 1;
7301 }
7302
7303 /* Blocking RPOP/LPOP */
7304 static void blockingPopGenericCommand(redisClient *c, int where) {
7305 robj *o;
7306 time_t timeout;
7307 int j;
7308
7309 for (j = 1; j < c->argc-1; j++) {
7310 o = lookupKeyWrite(c->db,c->argv[j]);
7311 if (o != NULL) {
7312 if (o->type != REDIS_LIST) {
7313 addReply(c,shared.wrongtypeerr);
7314 return;
7315 } else {
7316 list *list = o->ptr;
7317 if (listLength(list) != 0) {
7318 /* If the list contains elements fall back to the usual
7319 * non-blocking POP operation */
7320 robj *argv[2], **orig_argv;
7321 int orig_argc;
7322
7323 /* We need to alter the command arguments before to call
7324 * popGenericCommand() as the command takes a single key. */
7325 orig_argv = c->argv;
7326 orig_argc = c->argc;
7327 argv[1] = c->argv[j];
7328 c->argv = argv;
7329 c->argc = 2;
7330
7331 /* Also the return value is different, we need to output
7332 * the multi bulk reply header and the key name. The
7333 * "real" command will add the last element (the value)
7334 * for us. If this souds like an hack to you it's just
7335 * because it is... */
7336 addReplySds(c,sdsnew("*2\r\n"));
7337 addReplyBulk(c,argv[1]);
7338 popGenericCommand(c,where);
7339
7340 /* Fix the client structure with the original stuff */
7341 c->argv = orig_argv;
7342 c->argc = orig_argc;
7343 return;
7344 }
7345 }
7346 }
7347 }
7348 /* If the list is empty or the key does not exists we must block */
7349 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7350 if (timeout > 0) timeout += time(NULL);
7351 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7352 }
7353
7354 static void blpopCommand(redisClient *c) {
7355 blockingPopGenericCommand(c,REDIS_HEAD);
7356 }
7357
7358 static void brpopCommand(redisClient *c) {
7359 blockingPopGenericCommand(c,REDIS_TAIL);
7360 }
7361
7362 /* =============================== Replication ============================= */
7363
7364 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7365 ssize_t nwritten, ret = size;
7366 time_t start = time(NULL);
7367
7368 timeout++;
7369 while(size) {
7370 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7371 nwritten = write(fd,ptr,size);
7372 if (nwritten == -1) return -1;
7373 ptr += nwritten;
7374 size -= nwritten;
7375 }
7376 if ((time(NULL)-start) > timeout) {
7377 errno = ETIMEDOUT;
7378 return -1;
7379 }
7380 }
7381 return ret;
7382 }
7383
7384 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7385 ssize_t nread, totread = 0;
7386 time_t start = time(NULL);
7387
7388 timeout++;
7389 while(size) {
7390 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7391 nread = read(fd,ptr,size);
7392 if (nread == -1) return -1;
7393 ptr += nread;
7394 size -= nread;
7395 totread += nread;
7396 }
7397 if ((time(NULL)-start) > timeout) {
7398 errno = ETIMEDOUT;
7399 return -1;
7400 }
7401 }
7402 return totread;
7403 }
7404
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte
 * at a time using syncRead() with the given 'timeout' for every byte.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is stripped as well. Reading also stops when 'size'-1 bytes
 * have been stored, so the buffer is never written past its end and the
 * stored text is always null terminated.
 *
 * Returns the number of bytes stored before the terminator was hit (this
 * count includes the stripped '\r', if any), or -1 if syncRead() fails.
 * With 'size' <= 1 nothing is read nor written and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the null terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this the loop
         * would keep writing past the end of the buffer on long lines. */
        size--;
    }
    return nread;
}
7425
7426 static void syncCommand(redisClient *c) {
7427 /* ignore SYNC if aleady slave or in monitor mode */
7428 if (c->flags & REDIS_SLAVE) return;
7429
7430 /* SYNC can't be issued when the server has pending data to send to
7431 * the client about already issued commands. We need a fresh reply
7432 * buffer registering the differences between the BGSAVE and the current
7433 * dataset, so that we can copy to other slaves if needed. */
7434 if (listLength(c->reply) != 0) {
7435 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7436 return;
7437 }
7438
7439 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7440 /* Here we need to check if there is a background saving operation
7441 * in progress, or if it is required to start one */
7442 if (server.bgsavechildpid != -1) {
7443 /* Ok a background save is in progress. Let's check if it is a good
7444 * one for replication, i.e. if there is another slave that is
7445 * registering differences since the server forked to save */
7446 redisClient *slave;
7447 listNode *ln;
7448 listIter li;
7449
7450 listRewind(server.slaves,&li);
7451 while((ln = listNext(&li))) {
7452 slave = ln->value;
7453 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7454 }
7455 if (ln) {
7456 /* Perfect, the server is already registering differences for
7457 * another slave. Set the right state, and copy the buffer. */
7458 listRelease(c->reply);
7459 c->reply = listDup(slave->reply);
7460 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7461 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7462 } else {
7463 /* No way, we need to wait for the next BGSAVE in order to
7464 * register differences */
7465 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7466 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7467 }
7468 } else {
7469 /* Ok we don't have a BGSAVE in progress, let's start one */
7470 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7471 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7472 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7473 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7474 return;
7475 }
7476 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7477 }
7478 c->repldbfd = -1;
7479 c->flags |= REDIS_SLAVE;
7480 c->slaveseldb = 0;
7481 listAddNodeTail(server.slaves,c);
7482 return;
7483 }
7484
7485 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7486 redisClient *slave = privdata;
7487 REDIS_NOTUSED(el);
7488 REDIS_NOTUSED(mask);
7489 char buf[REDIS_IOBUF_LEN];
7490 ssize_t nwritten, buflen;
7491
7492 if (slave->repldboff == 0) {
7493 /* Write the bulk write count before to transfer the DB. In theory here
7494 * we don't know how much room there is in the output buffer of the
7495 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7496 * operations) will never be smaller than the few bytes we need. */
7497 sds bulkcount;
7498
7499 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7500 slave->repldbsize);
7501 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7502 {
7503 sdsfree(bulkcount);
7504 freeClient(slave);
7505 return;
7506 }
7507 sdsfree(bulkcount);
7508 }
7509 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7510 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7511 if (buflen <= 0) {
7512 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7513 (buflen == 0) ? "premature EOF" : strerror(errno));
7514 freeClient(slave);
7515 return;
7516 }
7517 if ((nwritten = write(fd,buf,buflen)) == -1) {
7518 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7519 strerror(errno));
7520 freeClient(slave);
7521 return;
7522 }
7523 slave->repldboff += nwritten;
7524 if (slave->repldboff == slave->repldbsize) {
7525 close(slave->repldbfd);
7526 slave->repldbfd = -1;
7527 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7528 slave->replstate = REDIS_REPL_ONLINE;
7529 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7530 sendReplyToClient, slave) == AE_ERR) {
7531 freeClient(slave);
7532 return;
7533 }
7534 addReplySds(slave,sdsempty());
7535 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7536 }
7537 }
7538
7539 /* This function is called at the end of every backgrond saving.
7540 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7541 * otherwise REDIS_ERR is passed to the function.
7542 *
7543 * The goal of this function is to handle slaves waiting for a successful
7544 * background saving in order to perform non-blocking synchronization. */
7545 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7546 listNode *ln;
7547 int startbgsave = 0;
7548 listIter li;
7549
7550 listRewind(server.slaves,&li);
7551 while((ln = listNext(&li))) {
7552 redisClient *slave = ln->value;
7553
7554 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7555 startbgsave = 1;
7556 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7557 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7558 struct redis_stat buf;
7559
7560 if (bgsaveerr != REDIS_OK) {
7561 freeClient(slave);
7562 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7563 continue;
7564 }
7565 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7566 redis_fstat(slave->repldbfd,&buf) == -1) {
7567 freeClient(slave);
7568 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7569 continue;
7570 }
7571 slave->repldboff = 0;
7572 slave->repldbsize = buf.st_size;
7573 slave->replstate = REDIS_REPL_SEND_BULK;
7574 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7575 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7576 freeClient(slave);
7577 continue;
7578 }
7579 }
7580 }
7581 if (startbgsave) {
7582 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7583 listIter li;
7584
7585 listRewind(server.slaves,&li);
7586 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7587 while((ln = listNext(&li))) {
7588 redisClient *slave = ln->value;
7589
7590 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7591 freeClient(slave);
7592 }
7593 }
7594 }
7595 }
7596
7597 static int syncWithMaster(void) {
7598 char buf[1024], tmpfile[256], authcmd[1024];
7599 long dumpsize;
7600 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7601 int dfd, maxtries = 5;
7602
7603 if (fd == -1) {
7604 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7605 strerror(errno));
7606 return REDIS_ERR;
7607 }
7608
7609 /* AUTH with the master if required. */
7610 if(server.masterauth) {
7611 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7612 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7613 close(fd);
7614 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7615 strerror(errno));
7616 return REDIS_ERR;
7617 }
7618 /* Read the AUTH result. */
7619 if (syncReadLine(fd,buf,1024,3600) == -1) {
7620 close(fd);
7621 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7622 strerror(errno));
7623 return REDIS_ERR;
7624 }
7625 if (buf[0] != '+') {
7626 close(fd);
7627 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7628 return REDIS_ERR;
7629 }
7630 }
7631
7632 /* Issue the SYNC command */
7633 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7634 close(fd);
7635 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7636 strerror(errno));
7637 return REDIS_ERR;
7638 }
7639 /* Read the bulk write count */
7640 if (syncReadLine(fd,buf,1024,3600) == -1) {
7641 close(fd);
7642 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7643 strerror(errno));
7644 return REDIS_ERR;
7645 }
7646 if (buf[0] != '$') {
7647 close(fd);
7648 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7649 return REDIS_ERR;
7650 }
7651 dumpsize = strtol(buf+1,NULL,10);
7652 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7653 /* Read the bulk write data on a temp file */
7654 while(maxtries--) {
7655 snprintf(tmpfile,256,
7656 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7657 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7658 if (dfd != -1) break;
7659 sleep(1);
7660 }
7661 if (dfd == -1) {
7662 close(fd);
7663 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7664 return REDIS_ERR;
7665 }
7666 while(dumpsize) {
7667 int nread, nwritten;
7668
7669 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7670 if (nread == -1) {
7671 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7672 strerror(errno));
7673 close(fd);
7674 close(dfd);
7675 return REDIS_ERR;
7676 }
7677 nwritten = write(dfd,buf,nread);
7678 if (nwritten == -1) {
7679 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7680 close(fd);
7681 close(dfd);
7682 return REDIS_ERR;
7683 }
7684 dumpsize -= nread;
7685 }
7686 close(dfd);
7687 if (rename(tmpfile,server.dbfilename) == -1) {
7688 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7689 unlink(tmpfile);
7690 close(fd);
7691 return REDIS_ERR;
7692 }
7693 emptyDb();
7694 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7695 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7696 close(fd);
7697 return REDIS_ERR;
7698 }
7699 server.master = createClient(fd);
7700 server.master->flags |= REDIS_MASTER;
7701 server.master->authenticated = 1;
7702 server.replstate = REDIS_REPL_CONNECTED;
7703 return REDIS_OK;
7704 }
7705
7706 static void slaveofCommand(redisClient *c) {
7707 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7708 !strcasecmp(c->argv[2]->ptr,"one")) {
7709 if (server.masterhost) {
7710 sdsfree(server.masterhost);
7711 server.masterhost = NULL;
7712 if (server.master) freeClient(server.master);
7713 server.replstate = REDIS_REPL_NONE;
7714 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7715 }
7716 } else {
7717 sdsfree(server.masterhost);
7718 server.masterhost = sdsdup(c->argv[1]->ptr);
7719 server.masterport = atoi(c->argv[2]->ptr);
7720 if (server.master) freeClient(server.master);
7721 server.replstate = REDIS_REPL_CONNECT;
7722 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7723 server.masterhost, server.masterport);
7724 }
7725 addReply(c,shared.ok);
7726 }
7727
7728 /* ============================ Maxmemory directive ======================== */
7729
7730 /* Try to free one object form the pre-allocated objects free list.
7731 * This is useful under low mem conditions as by default we take 1 million
7732 * free objects allocated. On success REDIS_OK is returned, otherwise
7733 * REDIS_ERR. */
7734 static int tryFreeOneObjectFromFreelist(void) {
7735 robj *o;
7736
7737 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7738 if (listLength(server.objfreelist)) {
7739 listNode *head = listFirst(server.objfreelist);
7740 o = listNodeValue(head);
7741 listDelNode(server.objfreelist,head);
7742 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7743 zfree(o);
7744 return REDIS_OK;
7745 } else {
7746 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7747 return REDIS_ERR;
7748 }
7749 }
7750
7751 /* This function gets called when 'maxmemory' is set on the config file to limit
7752 * the max memory used by the server, and we are out of memory.
7753 * This function will try to, in order:
7754 *
7755 * - Free objects from the free list
7756 * - Try to remove keys with an EXPIRE set
7757 *
7758 * It is not possible to free enough memory to reach used-memory < maxmemory
7759 * the server will start refusing commands that will enlarge even more the
7760 * memory usage.
7761 */
7762 static void freeMemoryIfNeeded(void) {
7763 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7764 int j, k, freed = 0;
7765
7766 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7767 for (j = 0; j < server.dbnum; j++) {
7768 int minttl = -1;
7769 robj *minkey = NULL;
7770 struct dictEntry *de;
7771
7772 if (dictSize(server.db[j].expires)) {
7773 freed = 1;
7774 /* From a sample of three keys drop the one nearest to
7775 * the natural expire */
7776 for (k = 0; k < 3; k++) {
7777 time_t t;
7778
7779 de = dictGetRandomKey(server.db[j].expires);
7780 t = (time_t) dictGetEntryVal(de);
7781 if (minttl == -1 || t < minttl) {
7782 minkey = dictGetEntryKey(de);
7783 minttl = t;
7784 }
7785 }
7786 deleteKey(server.db+j,minkey);
7787 }
7788 }
7789 if (!freed) return; /* nothing to free... */
7790 }
7791 }
7792
7793 /* ============================== Append Only file ========================== */
7794
7795 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7796 sds buf = sdsempty();
7797 int j;
7798 ssize_t nwritten;
7799 time_t now;
7800 robj *tmpargv[3];
7801
7802 /* The DB this command was targetting is not the same as the last command
7803 * we appendend. To issue a SELECT command is needed. */
7804 if (dictid != server.appendseldb) {
7805 char seldb[64];
7806
7807 snprintf(seldb,sizeof(seldb),"%d",dictid);
7808 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7809 (unsigned long)strlen(seldb),seldb);
7810 server.appendseldb = dictid;
7811 }
7812
7813 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7814 * EXPIREs into EXPIREATs calls */
7815 if (cmd->proc == expireCommand) {
7816 long when;
7817
7818 tmpargv[0] = createStringObject("EXPIREAT",8);
7819 tmpargv[1] = argv[1];
7820 incrRefCount(argv[1]);
7821 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7822 tmpargv[2] = createObject(REDIS_STRING,
7823 sdscatprintf(sdsempty(),"%ld",when));
7824 argv = tmpargv;
7825 }
7826
7827 /* Append the actual command */
7828 buf = sdscatprintf(buf,"*%d\r\n",argc);
7829 for (j = 0; j < argc; j++) {
7830 robj *o = argv[j];
7831
7832 o = getDecodedObject(o);
7833 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7834 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7835 buf = sdscatlen(buf,"\r\n",2);
7836 decrRefCount(o);
7837 }
7838
7839 /* Free the objects from the modified argv for EXPIREAT */
7840 if (cmd->proc == expireCommand) {
7841 for (j = 0; j < 3; j++)
7842 decrRefCount(argv[j]);
7843 }
7844
7845 /* We want to perform a single write. This should be guaranteed atomic
7846 * at least if the filesystem we are writing is a real physical one.
7847 * While this will save us against the server being killed I don't think
7848 * there is much to do about the whole server stopping for power problems
7849 * or alike */
7850 nwritten = write(server.appendfd,buf,sdslen(buf));
7851 if (nwritten != (signed)sdslen(buf)) {
7852 /* Ooops, we are in troubles. The best thing to do for now is
7853 * to simply exit instead to give the illusion that everything is
7854 * working as expected. */
7855 if (nwritten == -1) {
7856 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7857 } else {
7858 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7859 }
7860 exit(1);
7861 }
7862 /* If a background append only file rewriting is in progress we want to
7863 * accumulate the differences between the child DB and the current one
7864 * in a buffer, so that when the child process will do its work we
7865 * can append the differences to the new append only file. */
7866 if (server.bgrewritechildpid != -1)
7867 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7868
7869 sdsfree(buf);
7870 now = time(NULL);
7871 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7872 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7873 now-server.lastfsync > 1))
7874 {
7875 fsync(server.appendfd); /* Let's try to get this data on the disk */
7876 server.lastfsync = now;
7877 }
7878 }
7879
7880 /* In Redis commands are always executed in the context of a client, so in
7881 * order to load the append only file we need to create a fake client. */
7882 static struct redisClient *createFakeClient(void) {
7883 struct redisClient *c = zmalloc(sizeof(*c));
7884
7885 selectDb(c,0);
7886 c->fd = -1;
7887 c->querybuf = sdsempty();
7888 c->argc = 0;
7889 c->argv = NULL;
7890 c->flags = 0;
7891 /* We set the fake client as a slave waiting for the synchronization
7892 * so that Redis will not try to send replies to this client. */
7893 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7894 c->reply = listCreate();
7895 listSetFreeMethod(c->reply,decrRefCount);
7896 listSetDupMethod(c->reply,dupClientReplyValue);
7897 return c;
7898 }
7899
7900 static void freeFakeClient(struct redisClient *c) {
7901 sdsfree(c->querybuf);
7902 listRelease(c->reply);
7903 zfree(c);
7904 }
7905
7906 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7907 * error (the append only file is zero-length) REDIS_ERR is returned. On
7908 * fatal error an error message is logged and the program exists. */
7909 int loadAppendOnlyFile(char *filename) {
7910 struct redisClient *fakeClient;
7911 FILE *fp = fopen(filename,"r");
7912 struct redis_stat sb;
7913 unsigned long long loadedkeys = 0;
7914
7915 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7916 return REDIS_ERR;
7917
7918 if (fp == NULL) {
7919 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7920 exit(1);
7921 }
7922
7923 fakeClient = createFakeClient();
7924 while(1) {
7925 int argc, j;
7926 unsigned long len;
7927 robj **argv;
7928 char buf[128];
7929 sds argsds;
7930 struct redisCommand *cmd;
7931
7932 if (fgets(buf,sizeof(buf),fp) == NULL) {
7933 if (feof(fp))
7934 break;
7935 else
7936 goto readerr;
7937 }
7938 if (buf[0] != '*') goto fmterr;
7939 argc = atoi(buf+1);
7940 argv = zmalloc(sizeof(robj*)*argc);
7941 for (j = 0; j < argc; j++) {
7942 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7943 if (buf[0] != '$') goto fmterr;
7944 len = strtol(buf+1,NULL,10);
7945 argsds = sdsnewlen(NULL,len);
7946 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7947 argv[j] = createObject(REDIS_STRING,argsds);
7948 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7949 }
7950
7951 /* Command lookup */
7952 cmd = lookupCommand(argv[0]->ptr);
7953 if (!cmd) {
7954 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7955 exit(1);
7956 }
7957 /* Try object encoding */
7958 if (cmd->flags & REDIS_CMD_BULK)
7959 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
7960 /* Run the command in the context of a fake client */
7961 fakeClient->argc = argc;
7962 fakeClient->argv = argv;
7963 cmd->proc(fakeClient);
7964 /* Discard the reply objects list from the fake client */
7965 while(listLength(fakeClient->reply))
7966 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7967 /* Clean up, ready for the next command */
7968 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7969 zfree(argv);
7970 /* Handle swapping while loading big datasets when VM is on */
7971 loadedkeys++;
7972 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7973 while (zmalloc_used_memory() > server.vm_max_memory) {
7974 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7975 }
7976 }
7977 }
7978 fclose(fp);
7979 freeFakeClient(fakeClient);
7980 return REDIS_OK;
7981
7982 readerr:
7983 if (feof(fp)) {
7984 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7985 } else {
7986 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7987 }
7988 exit(1);
7989 fmterr:
7990 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7991 exit(1);
7992 }
7993
7994 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7995 static int fwriteBulkObject(FILE *fp, robj *obj) {
7996 char buf[128];
7997 int decrrc = 0;
7998
7999 /* Avoid the incr/decr ref count business if possible to help
8000 * copy-on-write (we are often in a child process when this function
8001 * is called).
8002 * Also makes sure that key objects don't get incrRefCount-ed when VM
8003 * is enabled */
8004 if (obj->encoding != REDIS_ENCODING_RAW) {
8005 obj = getDecodedObject(obj);
8006 decrrc = 1;
8007 }
8008 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8009 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8010 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8011 goto err;
8012 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8013 if (decrrc) decrRefCount(obj);
8014 return 1;
8015 err:
8016 if (decrrc) decrRefCount(obj);
8017 return 0;
8018 }
8019
/* Write binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to the payload and 'len' is its length in bytes; a zero length
 * payload produces "$0\r\n\r\n". Returns 1 on success, 0 if any of the
 * writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so it must be formatted with %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8031
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The value is formatted with "%.17g". Returns 1 on success, 0 if any of
 * the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t digits;

    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    /* The trailing CRLF is written with the payload but is not counted. */
    digits = strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)digits);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
        return 0;
    return 1;
}
8042
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char number[128], prefix[128];
    unsigned long ndigits;

    snprintf(number,sizeof(number),"%ld\r\n",l);
    /* The bulk count covers the digits (and sign) only, not the CRLF. */
    ndigits = (unsigned long)strlen(number)-2;
    snprintf(prefix,sizeof(prefix),"$%lu\r\n",ndigits);
    return fwrite(prefix,strlen(prefix),1,fp) != 0 &&
           fwrite(number,strlen(number),1,fp) != 0;
}
8053
8054 /* Write a sequence of commands able to fully rebuild the dataset into
8055 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8056 static int rewriteAppendOnlyFile(char *filename) {
8057 dictIterator *di = NULL;
8058 dictEntry *de;
8059 FILE *fp;
8060 char tmpfile[256];
8061 int j;
8062 time_t now = time(NULL);
8063
8064 /* Note that we have to use a different temp name here compared to the
8065 * one used by rewriteAppendOnlyFileBackground() function. */
8066 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8067 fp = fopen(tmpfile,"w");
8068 if (!fp) {
8069 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8070 return REDIS_ERR;
8071 }
8072 for (j = 0; j < server.dbnum; j++) {
8073 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8074 redisDb *db = server.db+j;
8075 dict *d = db->dict;
8076 if (dictSize(d) == 0) continue;
8077 di = dictGetIterator(d);
8078 if (!di) {
8079 fclose(fp);
8080 return REDIS_ERR;
8081 }
8082
8083 /* SELECT the new DB */
8084 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8085 if (fwriteBulkLong(fp,j) == 0) goto werr;
8086
8087 /* Iterate this DB writing every entry */
8088 while((de = dictNext(di)) != NULL) {
8089 robj *key, *o;
8090 time_t expiretime;
8091 int swapped;
8092
8093 key = dictGetEntryKey(de);
8094 /* If the value for this key is swapped, load a preview in memory.
8095 * We use a "swapped" flag to remember if we need to free the
8096 * value object instead to just increment the ref count anyway
8097 * in order to avoid copy-on-write of pages if we are forked() */
8098 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8099 key->storage == REDIS_VM_SWAPPING) {
8100 o = dictGetEntryVal(de);
8101 swapped = 0;
8102 } else {
8103 o = vmPreviewObject(key);
8104 swapped = 1;
8105 }
8106 expiretime = getExpire(db,key);
8107
8108 /* Save the key and associated value */
8109 if (o->type == REDIS_STRING) {
8110 /* Emit a SET command */
8111 char cmd[]="*3\r\n$3\r\nSET\r\n";
8112 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8113 /* Key and value */
8114 if (fwriteBulkObject(fp,key) == 0) goto werr;
8115 if (fwriteBulkObject(fp,o) == 0) goto werr;
8116 } else if (o->type == REDIS_LIST) {
8117 /* Emit the RPUSHes needed to rebuild the list */
8118 list *list = o->ptr;
8119 listNode *ln;
8120 listIter li;
8121
8122 listRewind(list,&li);
8123 while((ln = listNext(&li))) {
8124 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8125 robj *eleobj = listNodeValue(ln);
8126
8127 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8128 if (fwriteBulkObject(fp,key) == 0) goto werr;
8129 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8130 }
8131 } else if (o->type == REDIS_SET) {
8132 /* Emit the SADDs needed to rebuild the set */
8133 dict *set = o->ptr;
8134 dictIterator *di = dictGetIterator(set);
8135 dictEntry *de;
8136
8137 while((de = dictNext(di)) != NULL) {
8138 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8139 robj *eleobj = dictGetEntryKey(de);
8140
8141 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8142 if (fwriteBulkObject(fp,key) == 0) goto werr;
8143 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8144 }
8145 dictReleaseIterator(di);
8146 } else if (o->type == REDIS_ZSET) {
8147 /* Emit the ZADDs needed to rebuild the sorted set */
8148 zset *zs = o->ptr;
8149 dictIterator *di = dictGetIterator(zs->dict);
8150 dictEntry *de;
8151
8152 while((de = dictNext(di)) != NULL) {
8153 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8154 robj *eleobj = dictGetEntryKey(de);
8155 double *score = dictGetEntryVal(de);
8156
8157 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8158 if (fwriteBulkObject(fp,key) == 0) goto werr;
8159 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8160 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8161 }
8162 dictReleaseIterator(di);
8163 } else if (o->type == REDIS_HASH) {
8164 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8165
8166 /* Emit the HSETs needed to rebuild the hash */
8167 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8168 unsigned char *p = zipmapRewind(o->ptr);
8169 unsigned char *field, *val;
8170 unsigned int flen, vlen;
8171
8172 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8173 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8174 if (fwriteBulkObject(fp,key) == 0) goto werr;
8175 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8176 return -1;
8177 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8178 return -1;
8179 }
8180 } else {
8181 dictIterator *di = dictGetIterator(o->ptr);
8182 dictEntry *de;
8183
8184 while((de = dictNext(di)) != NULL) {
8185 robj *field = dictGetEntryKey(de);
8186 robj *val = dictGetEntryVal(de);
8187
8188 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8189 if (fwriteBulkObject(fp,key) == 0) goto werr;
8190 if (fwriteBulkObject(fp,field) == -1) return -1;
8191 if (fwriteBulkObject(fp,val) == -1) return -1;
8192 }
8193 dictReleaseIterator(di);
8194 }
8195 } else {
8196 redisAssert(0);
8197 }
8198 /* Save the expire time */
8199 if (expiretime != -1) {
8200 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8201 /* If this key is already expired skip it */
8202 if (expiretime < now) continue;
8203 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8204 if (fwriteBulkObject(fp,key) == 0) goto werr;
8205 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8206 }
8207 if (swapped) decrRefCount(o);
8208 }
8209 dictReleaseIterator(di);
8210 }
8211
8212 /* Make sure data will not remain on the OS's output buffers */
8213 fflush(fp);
8214 fsync(fileno(fp));
8215 fclose(fp);
8216
8217 /* Use RENAME to make sure the DB file is changed atomically only
8218 * if the generate DB file is ok. */
8219 if (rename(tmpfile,filename) == -1) {
8220 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8221 unlink(tmpfile);
8222 return REDIS_ERR;
8223 }
8224 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8225 return REDIS_OK;
8226
8227 werr:
8228 fclose(fp);
8229 unlink(tmpfile);
8230 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8231 if (di) dictReleaseIterator(di);
8232 return REDIS_ERR;
8233 }
8234
8235 /* This is how rewriting of the append only file in background works:
8236 *
8237 * 1) The user calls BGREWRITEAOF
8238 * 2) Redis calls this function, that forks():
8239 * 2a) the child rewrite the append only file in a temp file.
8240 * 2b) the parent accumulates differences in server.bgrewritebuf.
8241 * 3) When the child finished '2a' exists.
8242 * 4) The parent will trap the exit code, if it's OK, will append the
8243 * data accumulated into server.bgrewritebuf into the temp file, and
8244 * finally will rename(2) the temp file in the actual file name.
8245 * The the new file is reopened as the new append only file. Profit!
8246 */
8247 static int rewriteAppendOnlyFileBackground(void) {
8248 pid_t childpid;
8249
8250 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8251 if (server.vm_enabled) waitEmptyIOJobsQueue();
8252 if ((childpid = fork()) == 0) {
8253 /* Child */
8254 char tmpfile[256];
8255
8256 if (server.vm_enabled) vmReopenSwapFile();
8257 close(server.fd);
8258 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8259 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8260 _exit(0);
8261 } else {
8262 _exit(1);
8263 }
8264 } else {
8265 /* Parent */
8266 if (childpid == -1) {
8267 redisLog(REDIS_WARNING,
8268 "Can't rewrite append only file in background: fork: %s",
8269 strerror(errno));
8270 return REDIS_ERR;
8271 }
8272 redisLog(REDIS_NOTICE,
8273 "Background append only file rewriting started by pid %d",childpid);
8274 server.bgrewritechildpid = childpid;
8275 updateDictResizePolicy();
8276 /* We set appendseldb to -1 in order to force the next call to the
8277 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8278 * accumulated by the parent into server.bgrewritebuf will start
8279 * with a SELECT statement and it will be safe to merge. */
8280 server.appendseldb = -1;
8281 return REDIS_OK;
8282 }
8283 return REDIS_OK; /* unreached */
8284 }
8285
8286 static void bgrewriteaofCommand(redisClient *c) {
8287 if (server.bgrewritechildpid != -1) {
8288 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8289 return;
8290 }
8291 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8292 char *status = "+Background append only file rewriting started\r\n";
8293 addReplySds(c,sdsnew(status));
8294 } else {
8295 addReply(c,shared.err);
8296 }
8297 }
8298
/* Remove the temp file used by the background append only file rewrite
 * performed by the child with pid 'childpid'. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8305
8306 /* Virtual Memory is composed mainly of two subsystems:
8307 * - Blocking Virtual Memory
8308 * - Threaded Virtual Memory I/O
8309 * The two parts are not fully decoupled, but functions are split among two
8310 * different sections of the source code (delimited by comments) in order to
8311 * make more clear what functionality is about the blocking VM and what about
8312 * the threaded (not blocking) VM.
8313 *
8314 * Redis VM design:
8315 *
8316 * Redis VM is a blocking VM (one that blocks reading swapped values from
8317 * disk into memory when a value swapped out is needed in memory) that is made
8318 * unblocking by trying to examine the command argument vector in order to
8319 * load in background values that will likely be needed in order to exec
8320 * the command. The command is executed only once all the relevant keys
8321 * are loaded into memory.
8322 *
8323 * This basically is almost as simple as a blocking VM, but almost as parallel
8324 * as a fully non-blocking VM.
8325 */
8326
8327 /* =================== Virtual Memory - Blocking Side ====================== */
8328
8329 /* substitute the first occurrence of '%p' with the process pid in the
8330 * swap file name. */
8331 static void expandVmSwapFilename(void) {
8332 char *p = strstr(server.vm_swap_file,"%p");
8333 sds new;
8334
8335 if (!p) return;
8336 new = sdsempty();
8337 *p = '\0';
8338 new = sdscat(new,server.vm_swap_file);
8339 new = sdscatprintf(new,"%ld",(long) getpid());
8340 new = sdscat(new,p+2);
8341 zfree(server.vm_swap_file);
8342 server.vm_swap_file = new;
8343 }
8344
8345 static void vmInit(void) {
8346 off_t totsize;
8347 int pipefds[2];
8348 size_t stacksize;
8349
8350 if (server.vm_max_threads != 0)
8351 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8352
8353 expandVmSwapFilename();
8354 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8355 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8356 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8357 }
8358 if (server.vm_fp == NULL) {
8359 redisLog(REDIS_WARNING,
8360 "Impossible to open the swap file: %s. Exiting.",
8361 strerror(errno));
8362 exit(1);
8363 }
8364 server.vm_fd = fileno(server.vm_fp);
8365 server.vm_next_page = 0;
8366 server.vm_near_pages = 0;
8367 server.vm_stats_used_pages = 0;
8368 server.vm_stats_swapped_objects = 0;
8369 server.vm_stats_swapouts = 0;
8370 server.vm_stats_swapins = 0;
8371 totsize = server.vm_pages*server.vm_page_size;
8372 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8373 if (ftruncate(server.vm_fd,totsize) == -1) {
8374 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8375 strerror(errno));
8376 exit(1);
8377 } else {
8378 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8379 }
8380 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8381 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8382 (long long) (server.vm_pages+7)/8, server.vm_pages);
8383 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8384
8385 /* Initialize threaded I/O (used by Virtual Memory) */
8386 server.io_newjobs = listCreate();
8387 server.io_processing = listCreate();
8388 server.io_processed = listCreate();
8389 server.io_ready_clients = listCreate();
8390 pthread_mutex_init(&server.io_mutex,NULL);
8391 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8392 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8393 server.io_active_threads = 0;
8394 if (pipe(pipefds) == -1) {
8395 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8396 ,strerror(errno));
8397 exit(1);
8398 }
8399 server.io_ready_pipe_read = pipefds[0];
8400 server.io_ready_pipe_write = pipefds[1];
8401 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8402 /* LZF requires a lot of stack */
8403 pthread_attr_init(&server.io_threads_attr);
8404 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8405 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8406 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8407 /* Listen for events in the threaded I/O pipe */
8408 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8409 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8410 oom("creating file event");
8411 }
8412
8413 /* Mark the page as used */
8414 static void vmMarkPageUsed(off_t page) {
8415 off_t byte = page/8;
8416 int bit = page&7;
8417 redisAssert(vmFreePage(page) == 1);
8418 server.vm_bitmap[byte] |= 1<<bit;
8419 }
8420
8421 /* Mark N contiguous pages as used, with 'page' being the first. */
8422 static void vmMarkPagesUsed(off_t page, off_t count) {
8423 off_t j;
8424
8425 for (j = 0; j < count; j++)
8426 vmMarkPageUsed(page+j);
8427 server.vm_stats_used_pages += count;
8428 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8429 (long long)count, (long long)page);
8430 }
8431
8432 /* Mark the page as free */
8433 static void vmMarkPageFree(off_t page) {
8434 off_t byte = page/8;
8435 int bit = page&7;
8436 redisAssert(vmFreePage(page) == 0);
8437 server.vm_bitmap[byte] &= ~(1<<bit);
8438 }
8439
8440 /* Mark N contiguous pages as free, with 'page' being the first. */
8441 static void vmMarkPagesFree(off_t page, off_t count) {
8442 off_t j;
8443
8444 for (j = 0; j < count; j++)
8445 vmMarkPageFree(page+j);
8446 server.vm_stats_used_pages -= count;
8447 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8448 (long long)count, (long long)page);
8449 }
8450
8451 /* Test if the page is free */
8452 static int vmFreePage(off_t page) {
8453 off_t byte = page/8;
8454 int bit = page&7;
8455 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8456 }
8457
8458 /* Find N contiguous free pages storing the first page of the cluster in *first.
8459 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8460 * REDIS_ERR is returned.
8461 *
8462 * This function uses a simple algorithm: we try to allocate
8463 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8464 * again from the start of the swap file searching for free spaces.
8465 *
8466 * If it looks pretty clear that there are no free pages near our offset
8467 * we try to find less populated places doing a forward jump of
8468 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8469 * without hurry, and then we jump again and so forth...
8470 *
8471 * This function can be improved using a free list to avoid to guess
8472 * too much, since we could collect data about freed pages.
8473 *
8474 * note: I implemented this function just after watching an episode of
8475 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8476 */
8477 static int vmFindContiguousPages(off_t *first, off_t n) {
8478 off_t base, offset = 0, since_jump = 0, numfree = 0;
8479
8480 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8481 server.vm_near_pages = 0;
8482 server.vm_next_page = 0;
8483 }
8484 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8485 base = server.vm_next_page;
8486
8487 while(offset < server.vm_pages) {
8488 off_t this = base+offset;
8489
8490 /* If we overflow, restart from page zero */
8491 if (this >= server.vm_pages) {
8492 this -= server.vm_pages;
8493 if (this == 0) {
8494 /* Just overflowed, what we found on tail is no longer
8495 * interesting, as it's no longer contiguous. */
8496 numfree = 0;
8497 }
8498 }
8499 if (vmFreePage(this)) {
8500 /* This is a free page */
8501 numfree++;
8502 /* Already got N free pages? Return to the caller, with success */
8503 if (numfree == n) {
8504 *first = this-(n-1);
8505 server.vm_next_page = this+1;
8506 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8507 return REDIS_OK;
8508 }
8509 } else {
8510 /* The current one is not a free page */
8511 numfree = 0;
8512 }
8513
8514 /* Fast-forward if the current page is not free and we already
8515 * searched enough near this place. */
8516 since_jump++;
8517 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8518 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8519 since_jump = 0;
8520 /* Note that even if we rewind after the jump, we are don't need
8521 * to make sure numfree is set to zero as we only jump *if* it
8522 * is set to zero. */
8523 } else {
8524 /* Otherwise just check the next page */
8525 offset++;
8526 }
8527 }
8528 return REDIS_ERR;
8529 }
8530
8531 /* Write the specified object at the specified page of the swap file */
8532 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8533 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8534 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8535 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8536 redisLog(REDIS_WARNING,
8537 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8538 strerror(errno));
8539 return REDIS_ERR;
8540 }
8541 rdbSaveObject(server.vm_fp,o);
8542 fflush(server.vm_fp);
8543 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8544 return REDIS_OK;
8545 }
8546
8547 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8548 * needed to later retrieve the object into the key object.
8549 * If we can't find enough contiguous empty pages to swap the object on disk
8550 * REDIS_ERR is returned. */
8551 static int vmSwapObjectBlocking(robj *key, robj *val) {
8552 off_t pages = rdbSavedObjectPages(val,NULL);
8553 off_t page;
8554
8555 assert(key->storage == REDIS_VM_MEMORY);
8556 assert(key->refcount == 1);
8557 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8558 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8559 key->vm.page = page;
8560 key->vm.usedpages = pages;
8561 key->storage = REDIS_VM_SWAPPED;
8562 key->vtype = val->type;
8563 decrRefCount(val); /* Deallocate the object from memory. */
8564 vmMarkPagesUsed(page,pages);
8565 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8566 (unsigned char*) key->ptr,
8567 (unsigned long long) page, (unsigned long long) pages);
8568 server.vm_stats_swapped_objects++;
8569 server.vm_stats_swapouts++;
8570 return REDIS_OK;
8571 }
8572
8573 static robj *vmReadObjectFromSwap(off_t page, int type) {
8574 robj *o;
8575
8576 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8577 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8578 redisLog(REDIS_WARNING,
8579 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8580 strerror(errno));
8581 _exit(1);
8582 }
8583 o = rdbLoadObject(type,server.vm_fp);
8584 if (o == NULL) {
8585 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8586 _exit(1);
8587 }
8588 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8589 return o;
8590 }
8591
8592 /* Load the value object relative to the 'key' object from swap to memory.
8593 * The newly allocated object is returned.
8594 *
8595 * If preview is true the unserialized object is returned to the caller but
8596 * no changes are made to the key object, nor the pages are marked as freed */
8597 static robj *vmGenericLoadObject(robj *key, int preview) {
8598 robj *val;
8599
8600 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8601 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8602 if (!preview) {
8603 key->storage = REDIS_VM_MEMORY;
8604 key->vm.atime = server.unixtime;
8605 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8606 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8607 (unsigned char*) key->ptr);
8608 server.vm_stats_swapped_objects--;
8609 } else {
8610 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8611 (unsigned char*) key->ptr);
8612 }
8613 server.vm_stats_swapins++;
8614 return val;
8615 }
8616
8617 /* Plain object loading, from swap to memory */
8618 static robj *vmLoadObject(robj *key) {
8619 /* If we are loading the object in background, stop it, we
8620 * need to load this object synchronously ASAP. */
8621 if (key->storage == REDIS_VM_LOADING)
8622 vmCancelThreadedIOJob(key);
8623 return vmGenericLoadObject(key,0);
8624 }
8625
8626 /* Just load the value on disk, without to modify the key.
8627 * This is useful when we want to perform some operation on the value
8628 * without to really bring it from swap to memory, like while saving the
8629 * dataset or rewriting the append only log. */
8630 static robj *vmPreviewObject(robj *key) {
8631 return vmGenericLoadObject(key,1);
8632 }
8633
8634 /* How a good candidate is this object for swapping?
8635 * The better candidate it is, the greater the returned value.
8636 *
8637 * Currently we try to perform a fast estimation of the object size in
8638 * memory, and combine it with aging informations.
8639 *
8640 * Basically swappability = idle-time * log(estimated size)
8641 *
8642 * Bigger objects are preferred over smaller objects, but not
8643 * proportionally, this is why we use the logarithm. This algorithm is
8644 * just a first try and will probably be tuned later. */
8645 static double computeObjectSwappability(robj *o) {
8646 time_t age = server.unixtime - o->vm.atime;
8647 long asize = 0;
8648 list *l;
8649 dict *d;
8650 struct dictEntry *de;
8651 int z;
8652
8653 if (age <= 0) return 0;
8654 switch(o->type) {
8655 case REDIS_STRING:
8656 if (o->encoding != REDIS_ENCODING_RAW) {
8657 asize = sizeof(*o);
8658 } else {
8659 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8660 }
8661 break;
8662 case REDIS_LIST:
8663 l = o->ptr;
8664 listNode *ln = listFirst(l);
8665
8666 asize = sizeof(list);
8667 if (ln) {
8668 robj *ele = ln->value;
8669 long elesize;
8670
8671 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8672 (sizeof(*o)+sdslen(ele->ptr)) :
8673 sizeof(*o);
8674 asize += (sizeof(listNode)+elesize)*listLength(l);
8675 }
8676 break;
8677 case REDIS_SET:
8678 case REDIS_ZSET:
8679 z = (o->type == REDIS_ZSET);
8680 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8681
8682 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8683 if (z) asize += sizeof(zset)-sizeof(dict);
8684 if (dictSize(d)) {
8685 long elesize;
8686 robj *ele;
8687
8688 de = dictGetRandomKey(d);
8689 ele = dictGetEntryKey(de);
8690 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8691 (sizeof(*o)+sdslen(ele->ptr)) :
8692 sizeof(*o);
8693 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8694 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8695 }
8696 break;
8697 case REDIS_HASH:
8698 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8699 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8700 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8701 unsigned int klen, vlen;
8702 unsigned char *key, *val;
8703
8704 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8705 klen = 0;
8706 vlen = 0;
8707 }
8708 asize = len*(klen+vlen+3);
8709 } else if (o->encoding == REDIS_ENCODING_HT) {
8710 d = o->ptr;
8711 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8712 if (dictSize(d)) {
8713 long elesize;
8714 robj *ele;
8715
8716 de = dictGetRandomKey(d);
8717 ele = dictGetEntryKey(de);
8718 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8719 (sizeof(*o)+sdslen(ele->ptr)) :
8720 sizeof(*o);
8721 ele = dictGetEntryVal(de);
8722 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8723 (sizeof(*o)+sdslen(ele->ptr)) :
8724 sizeof(*o);
8725 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8726 }
8727 }
8728 break;
8729 }
8730 return (double)age*log(1+asize);
8731 }
8732
8733 /* Try to swap an object that's a good candidate for swapping.
8734 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8735 * to swap any object at all.
8736 *
8737 * If 'usethreaded' is true, Redis will try to swap the object in background
8738 * using I/O threads. */
8739 static int vmSwapOneObject(int usethreads) {
8740 int j, i;
8741 struct dictEntry *best = NULL;
8742 double best_swappability = 0;
8743 redisDb *best_db = NULL;
8744 robj *key, *val;
8745
8746 for (j = 0; j < server.dbnum; j++) {
8747 redisDb *db = server.db+j;
8748 /* Why maxtries is set to 100?
8749 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8750 * are swappable objects */
8751 int maxtries = 100;
8752
8753 if (dictSize(db->dict) == 0) continue;
8754 for (i = 0; i < 5; i++) {
8755 dictEntry *de;
8756 double swappability;
8757
8758 if (maxtries) maxtries--;
8759 de = dictGetRandomKey(db->dict);
8760 key = dictGetEntryKey(de);
8761 val = dictGetEntryVal(de);
8762 /* Only swap objects that are currently in memory.
8763 *
8764 * Also don't swap shared objects if threaded VM is on, as we
8765 * try to ensure that the main thread does not touch the
8766 * object while the I/O thread is using it, but we can't
8767 * control other keys without adding additional mutex. */
8768 if (key->storage != REDIS_VM_MEMORY ||
8769 (server.vm_max_threads != 0 && val->refcount != 1)) {
8770 if (maxtries) i--; /* don't count this try */
8771 continue;
8772 }
8773 swappability = computeObjectSwappability(val);
8774 if (!best || swappability > best_swappability) {
8775 best = de;
8776 best_swappability = swappability;
8777 best_db = db;
8778 }
8779 }
8780 }
8781 if (best == NULL) return REDIS_ERR;
8782 key = dictGetEntryKey(best);
8783 val = dictGetEntryVal(best);
8784
8785 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8786 key->ptr, best_swappability);
8787
8788 /* Unshare the key if needed */
8789 if (key->refcount > 1) {
8790 robj *newkey = dupStringObject(key);
8791 decrRefCount(key);
8792 key = dictGetEntryKey(best) = newkey;
8793 }
8794 /* Swap it */
8795 if (usethreads) {
8796 vmSwapObjectThreaded(key,val,best_db);
8797 return REDIS_OK;
8798 } else {
8799 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8800 dictGetEntryVal(best) = NULL;
8801 return REDIS_OK;
8802 } else {
8803 return REDIS_ERR;
8804 }
8805 }
8806 }
8807
/* Swap one object out synchronously. Thin wrapper returning whatever
 * vmSwapOneObject() returns when called with threads disabled. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8811
/* Queue one object for swapping using the I/O threads. Thin wrapper
 * returning whatever vmSwapOneObject() returns when called with threads
 * enabled. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8815
8816 /* Return true if it's safe to swap out objects in a given moment.
8817 * Basically we don't want to swap objects out while there is a BGSAVE
8818 * or a BGAEOREWRITE running in backgroud. */
8819 static int vmCanSwapOut(void) {
8820 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8821 }
8822
8823 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8824 * and was deleted. Otherwise 0 is returned. */
8825 static int deleteIfSwapped(redisDb *db, robj *key) {
8826 dictEntry *de;
8827 robj *foundkey;
8828
8829 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8830 foundkey = dictGetEntryKey(de);
8831 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8832 deleteKey(db,key);
8833 return 1;
8834 }
8835
8836 /* =================== Virtual Memory - Threaded I/O ======================= */
8837
8838 static void freeIOJob(iojob *j) {
8839 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8840 j->type == REDIS_IOJOB_DO_SWAP ||
8841 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8842 decrRefCount(j->val);
8843 /* We don't decrRefCount the j->key field as we did't incremented
8844 * the count creating IO Jobs. This is because the key field here is
8845 * just used as an indentifier and if a key is removed the Job should
8846 * never be touched again. */
8847 zfree(j);
8848 }
8849
8850 /* Every time a thread finished a Job, it writes a byte into the write side
8851 * of an unix pipe in order to "awake" the main thread, and this function
8852 * is called. */
8853 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8854 int mask)
8855 {
8856 char buf[1];
8857 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8858 REDIS_NOTUSED(el);
8859 REDIS_NOTUSED(mask);
8860 REDIS_NOTUSED(privdata);
8861
8862 /* For every byte we read in the read side of the pipe, there is one
8863 * I/O job completed to process. */
8864 while((retval = read(fd,buf,1)) == 1) {
8865 iojob *j;
8866 listNode *ln;
8867 robj *key;
8868 struct dictEntry *de;
8869
8870 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8871
8872 /* Get the processed element (the oldest one) */
8873 lockThreadedIO();
8874 assert(listLength(server.io_processed) != 0);
8875 if (toprocess == -1) {
8876 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8877 if (toprocess <= 0) toprocess = 1;
8878 }
8879 ln = listFirst(server.io_processed);
8880 j = ln->value;
8881 listDelNode(server.io_processed,ln);
8882 unlockThreadedIO();
8883 /* If this job is marked as canceled, just ignore it */
8884 if (j->canceled) {
8885 freeIOJob(j);
8886 continue;
8887 }
8888 /* Post process it in the main thread, as there are things we
8889 * can do just here to avoid race conditions and/or invasive locks */
8890 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8891 de = dictFind(j->db->dict,j->key);
8892 assert(de != NULL);
8893 key = dictGetEntryKey(de);
8894 if (j->type == REDIS_IOJOB_LOAD) {
8895 redisDb *db;
8896
8897 /* Key loaded, bring it at home */
8898 key->storage = REDIS_VM_MEMORY;
8899 key->vm.atime = server.unixtime;
8900 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8901 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8902 (unsigned char*) key->ptr);
8903 server.vm_stats_swapped_objects--;
8904 server.vm_stats_swapins++;
8905 dictGetEntryVal(de) = j->val;
8906 incrRefCount(j->val);
8907 db = j->db;
8908 freeIOJob(j);
8909 /* Handle clients waiting for this key to be loaded. */
8910 handleClientsBlockedOnSwappedKey(db,key);
8911 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8912 /* Now we know the amount of pages required to swap this object.
8913 * Let's find some space for it, and queue this task again
8914 * rebranded as REDIS_IOJOB_DO_SWAP. */
8915 if (!vmCanSwapOut() ||
8916 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8917 {
8918 /* Ooops... no space or we can't swap as there is
8919 * a fork()ed Redis trying to save stuff on disk. */
8920 freeIOJob(j);
8921 key->storage = REDIS_VM_MEMORY; /* undo operation */
8922 } else {
8923 /* Note that we need to mark this pages as used now,
8924 * if the job will be canceled, we'll mark them as freed
8925 * again. */
8926 vmMarkPagesUsed(j->page,j->pages);
8927 j->type = REDIS_IOJOB_DO_SWAP;
8928 lockThreadedIO();
8929 queueIOJob(j);
8930 unlockThreadedIO();
8931 }
8932 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8933 robj *val;
8934
8935 /* Key swapped. We can finally free some memory. */
8936 if (key->storage != REDIS_VM_SWAPPING) {
8937 printf("key->storage: %d\n",key->storage);
8938 printf("key->name: %s\n",(char*)key->ptr);
8939 printf("key->refcount: %d\n",key->refcount);
8940 printf("val: %p\n",(void*)j->val);
8941 printf("val->type: %d\n",j->val->type);
8942 printf("val->ptr: %s\n",(char*)j->val->ptr);
8943 }
8944 redisAssert(key->storage == REDIS_VM_SWAPPING);
8945 val = dictGetEntryVal(de);
8946 key->vm.page = j->page;
8947 key->vm.usedpages = j->pages;
8948 key->storage = REDIS_VM_SWAPPED;
8949 key->vtype = j->val->type;
8950 decrRefCount(val); /* Deallocate the object from memory. */
8951 dictGetEntryVal(de) = NULL;
8952 redisLog(REDIS_DEBUG,
8953 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8954 (unsigned char*) key->ptr,
8955 (unsigned long long) j->page, (unsigned long long) j->pages);
8956 server.vm_stats_swapped_objects++;
8957 server.vm_stats_swapouts++;
8958 freeIOJob(j);
8959 /* Put a few more swap requests in queue if we are still
8960 * out of memory */
8961 if (trytoswap && vmCanSwapOut() &&
8962 zmalloc_used_memory() > server.vm_max_memory)
8963 {
8964 int more = 1;
8965 while(more) {
8966 lockThreadedIO();
8967 more = listLength(server.io_newjobs) <
8968 (unsigned) server.vm_max_threads;
8969 unlockThreadedIO();
8970 /* Don't waste CPU time if swappable objects are rare. */
8971 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8972 trytoswap = 0;
8973 break;
8974 }
8975 }
8976 }
8977 }
8978 processed++;
8979 if (processed == toprocess) return;
8980 }
8981 if (retval < 0 && errno != EAGAIN) {
8982 redisLog(REDIS_WARNING,
8983 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8984 strerror(errno));
8985 }
8986 }
8987
8988 static void lockThreadedIO(void) {
8989 pthread_mutex_lock(&server.io_mutex);
8990 }
8991
8992 static void unlockThreadedIO(void) {
8993 pthread_mutex_unlock(&server.io_mutex);
8994 }
8995
8996 /* Remove the specified object from the threaded I/O queue if still not
8997 * processed, otherwise make sure to flag it as canceled. */
8998 static void vmCancelThreadedIOJob(robj *o) {
8999 list *lists[3] = {
9000 server.io_newjobs, /* 0 */
9001 server.io_processing, /* 1 */
9002 server.io_processed /* 2 */
9003 };
9004 int i;
9005
9006 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9007 again:
9008 lockThreadedIO();
9009 /* Search for a matching key in one of the queues */
9010 for (i = 0; i < 3; i++) {
9011 listNode *ln;
9012 listIter li;
9013
9014 listRewind(lists[i],&li);
9015 while ((ln = listNext(&li)) != NULL) {
9016 iojob *job = ln->value;
9017
9018 if (job->canceled) continue; /* Skip this, already canceled. */
9019 if (job->key == o) {
9020 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9021 (void*)job, (char*)o->ptr, job->type, i);
9022 /* Mark the pages as free since the swap didn't happened
9023 * or happened but is now discarded. */
9024 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9025 vmMarkPagesFree(job->page,job->pages);
9026 /* Cancel the job. It depends on the list the job is
9027 * living in. */
9028 switch(i) {
9029 case 0: /* io_newjobs */
9030 /* If the job was yet not processed the best thing to do
9031 * is to remove it from the queue at all */
9032 freeIOJob(job);
9033 listDelNode(lists[i],ln);
9034 break;
9035 case 1: /* io_processing */
9036 /* Oh Shi- the thread is messing with the Job:
9037 *
9038 * Probably it's accessing the object if this is a
9039 * PREPARE_SWAP or DO_SWAP job.
9040 * If it's a LOAD job it may be reading from disk and
9041 * if we don't wait for the job to terminate before to
9042 * cancel it, maybe in a few microseconds data can be
9043 * corrupted in this pages. So the short story is:
9044 *
9045 * Better to wait for the job to move into the
9046 * next queue (processed)... */
9047
9048 /* We try again and again until the job is completed. */
9049 unlockThreadedIO();
9050 /* But let's wait some time for the I/O thread
9051 * to finish with this job. After all this condition
9052 * should be very rare. */
9053 usleep(1);
9054 goto again;
9055 case 2: /* io_processed */
9056 /* The job was already processed, that's easy...
9057 * just mark it as canceled so that we'll ignore it
9058 * when processing completed jobs. */
9059 job->canceled = 1;
9060 break;
9061 }
9062 /* Finally we have to adjust the storage type of the object
9063 * in order to "UNDO" the operaiton. */
9064 if (o->storage == REDIS_VM_LOADING)
9065 o->storage = REDIS_VM_SWAPPED;
9066 else if (o->storage == REDIS_VM_SWAPPING)
9067 o->storage = REDIS_VM_MEMORY;
9068 unlockThreadedIO();
9069 return;
9070 }
9071 }
9072 }
9073 unlockThreadedIO();
9074 assert(1 != 1); /* We should never reach this */
9075 }
9076
9077 static void *IOThreadEntryPoint(void *arg) {
9078 iojob *j;
9079 listNode *ln;
9080 REDIS_NOTUSED(arg);
9081
9082 pthread_detach(pthread_self());
9083 while(1) {
9084 /* Get a new job to process */
9085 lockThreadedIO();
9086 if (listLength(server.io_newjobs) == 0) {
9087 /* No new jobs in queue, exit. */
9088 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9089 (long) pthread_self());
9090 server.io_active_threads--;
9091 unlockThreadedIO();
9092 return NULL;
9093 }
9094 ln = listFirst(server.io_newjobs);
9095 j = ln->value;
9096 listDelNode(server.io_newjobs,ln);
9097 /* Add the job in the processing queue */
9098 j->thread = pthread_self();
9099 listAddNodeTail(server.io_processing,j);
9100 ln = listLast(server.io_processing); /* We use ln later to remove it */
9101 unlockThreadedIO();
9102 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9103 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9104
9105 /* Process the Job */
9106 if (j->type == REDIS_IOJOB_LOAD) {
9107 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9108 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9109 FILE *fp = fopen("/dev/null","w+");
9110 j->pages = rdbSavedObjectPages(j->val,fp);
9111 fclose(fp);
9112 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9113 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9114 j->canceled = 1;
9115 }
9116
9117 /* Done: insert the job into the processed queue */
9118 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9119 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9120 lockThreadedIO();
9121 listDelNode(server.io_processing,ln);
9122 listAddNodeTail(server.io_processed,j);
9123 unlockThreadedIO();
9124
9125 /* Signal the main thread there is new stuff to process */
9126 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9127 }
9128 return NULL; /* never reached */
9129 }
9130
9131 static void spawnIOThread(void) {
9132 pthread_t thread;
9133 sigset_t mask, omask;
9134 int err;
9135
9136 sigemptyset(&mask);
9137 sigaddset(&mask,SIGCHLD);
9138 sigaddset(&mask,SIGHUP);
9139 sigaddset(&mask,SIGPIPE);
9140 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9141 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9142 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9143 strerror(err));
9144 usleep(1000000);
9145 }
9146 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9147 server.io_active_threads++;
9148 }
9149
9150 /* We need to wait for the last thread to exit before we are able to
9151 * fork() in order to BGSAVE or BGREWRITEAOF. */
9152 static void waitEmptyIOJobsQueue(void) {
9153 while(1) {
9154 int io_processed_len;
9155
9156 lockThreadedIO();
9157 if (listLength(server.io_newjobs) == 0 &&
9158 listLength(server.io_processing) == 0 &&
9159 server.io_active_threads == 0)
9160 {
9161 unlockThreadedIO();
9162 return;
9163 }
9164 /* While waiting for empty jobs queue condition we post-process some
9165 * finshed job, as I/O threads may be hanging trying to write against
9166 * the io_ready_pipe_write FD but there are so much pending jobs that
9167 * it's blocking. */
9168 io_processed_len = listLength(server.io_processed);
9169 unlockThreadedIO();
9170 if (io_processed_len) {
9171 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9172 usleep(1000); /* 1 millisecond */
9173 } else {
9174 usleep(10000); /* 10 milliseconds */
9175 }
9176 }
9177 }
9178
9179 static void vmReopenSwapFile(void) {
9180 /* Note: we don't close the old one as we are in the child process
9181 * and don't want to mess at all with the original file object. */
9182 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9183 if (server.vm_fp == NULL) {
9184 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9185 server.vm_swap_file);
9186 _exit(1);
9187 }
9188 server.vm_fd = fileno(server.vm_fp);
9189 }
9190
9191 /* This function must be called while with threaded IO locked */
9192 static void queueIOJob(iojob *j) {
9193 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9194 (void*)j, j->type, (char*)j->key->ptr);
9195 listAddNodeTail(server.io_newjobs,j);
9196 if (server.io_active_threads < server.vm_max_threads)
9197 spawnIOThread();
9198 }
9199
9200 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9201 iojob *j;
9202
9203 assert(key->storage == REDIS_VM_MEMORY);
9204 assert(key->refcount == 1);
9205
9206 j = zmalloc(sizeof(*j));
9207 j->type = REDIS_IOJOB_PREPARE_SWAP;
9208 j->db = db;
9209 j->key = key;
9210 j->val = val;
9211 incrRefCount(val);
9212 j->canceled = 0;
9213 j->thread = (pthread_t) -1;
9214 key->storage = REDIS_VM_SWAPPING;
9215
9216 lockThreadedIO();
9217 queueIOJob(j);
9218 unlockThreadedIO();
9219 return REDIS_OK;
9220 }
9221
9222 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9223
9224 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9225 * If there is not already a job loading the key, it is craeted.
9226 * The key is added to the io_keys list in the client structure, and also
9227 * in the hash table mapping swapped keys to waiting clients, that is,
9228 * server.io_waited_keys. */
9229 static int waitForSwappedKey(redisClient *c, robj *key) {
9230 struct dictEntry *de;
9231 robj *o;
9232 list *l;
9233
9234 /* If the key does not exist or is already in RAM we don't need to
9235 * block the client at all. */
9236 de = dictFind(c->db->dict,key);
9237 if (de == NULL) return 0;
9238 o = dictGetEntryKey(de);
9239 if (o->storage == REDIS_VM_MEMORY) {
9240 return 0;
9241 } else if (o->storage == REDIS_VM_SWAPPING) {
9242 /* We were swapping the key, undo it! */
9243 vmCancelThreadedIOJob(o);
9244 return 0;
9245 }
9246
9247 /* OK: the key is either swapped, or being loaded just now. */
9248
9249 /* Add the key to the list of keys this client is waiting for.
9250 * This maps clients to keys they are waiting for. */
9251 listAddNodeTail(c->io_keys,key);
9252 incrRefCount(key);
9253
9254 /* Add the client to the swapped keys => clients waiting map. */
9255 de = dictFind(c->db->io_keys,key);
9256 if (de == NULL) {
9257 int retval;
9258
9259 /* For every key we take a list of clients blocked for it */
9260 l = listCreate();
9261 retval = dictAdd(c->db->io_keys,key,l);
9262 incrRefCount(key);
9263 assert(retval == DICT_OK);
9264 } else {
9265 l = dictGetEntryVal(de);
9266 }
9267 listAddNodeTail(l,c);
9268
9269 /* Are we already loading the key from disk? If not create a job */
9270 if (o->storage == REDIS_VM_SWAPPED) {
9271 iojob *j;
9272
9273 o->storage = REDIS_VM_LOADING;
9274 j = zmalloc(sizeof(*j));
9275 j->type = REDIS_IOJOB_LOAD;
9276 j->db = c->db;
9277 j->key = o;
9278 j->key->vtype = o->vtype;
9279 j->page = o->vm.page;
9280 j->val = NULL;
9281 j->canceled = 0;
9282 j->thread = (pthread_t) -1;
9283 lockThreadedIO();
9284 queueIOJob(j);
9285 unlockThreadedIO();
9286 }
9287 return 1;
9288 }
9289
9290 /* Preload keys needed for the ZUNION and ZINTER commands. */
9291 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9292 int i, num;
9293 num = atoi(c->argv[2]->ptr);
9294 for (i = 0; i < num; i++) {
9295 waitForSwappedKey(c,c->argv[3+i]);
9296 }
9297 }
9298
9299 /* Is this client attempting to run a command against swapped keys?
9300 * If so, block it ASAP, load the keys in background, then resume it.
9301 *
9302 * The important idea about this function is that it can fail! If keys will
9303 * still be swapped when the client is resumed, this key lookups will
9304 * just block loading keys from disk. In practical terms this should only
9305 * happen with SORT BY command or if there is a bug in this function.
9306 *
9307 * Return 1 if the client is marked as blocked, 0 if the client can
9308 * continue as the keys it is going to access appear to be in memory. */
9309 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9310 int j, last;
9311
9312 if (cmd->vm_preload_proc != NULL) {
9313 cmd->vm_preload_proc(c);
9314 } else {
9315 if (cmd->vm_firstkey == 0) return 0;
9316 last = cmd->vm_lastkey;
9317 if (last < 0) last = c->argc+last;
9318 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9319 waitForSwappedKey(c,c->argv[j]);
9320 }
9321
9322 /* If the client was blocked for at least one key, mark it as blocked. */
9323 if (listLength(c->io_keys)) {
9324 c->flags |= REDIS_IO_WAIT;
9325 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9326 server.vm_blocked_clients++;
9327 return 1;
9328 } else {
9329 return 0;
9330 }
9331 }
9332
9333 /* Remove the 'key' from the list of blocked keys for a given client.
9334 *
9335 * The function returns 1 when there are no longer blocking keys after
9336 * the current one was removed (and the client can be unblocked). */
9337 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9338 list *l;
9339 listNode *ln;
9340 listIter li;
9341 struct dictEntry *de;
9342
9343 /* Remove the key from the list of keys this client is waiting for. */
9344 listRewind(c->io_keys,&li);
9345 while ((ln = listNext(&li)) != NULL) {
9346 if (compareStringObjects(ln->value,key) == 0) {
9347 listDelNode(c->io_keys,ln);
9348 break;
9349 }
9350 }
9351 assert(ln != NULL);
9352
9353 /* Remove the client form the key => waiting clients map. */
9354 de = dictFind(c->db->io_keys,key);
9355 assert(de != NULL);
9356 l = dictGetEntryVal(de);
9357 ln = listSearchKey(l,c);
9358 assert(ln != NULL);
9359 listDelNode(l,ln);
9360 if (listLength(l) == 0)
9361 dictDelete(c->db->io_keys,key);
9362
9363 return listLength(c->io_keys) == 0;
9364 }
9365
9366 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9367 struct dictEntry *de;
9368 list *l;
9369 listNode *ln;
9370 int len;
9371
9372 de = dictFind(db->io_keys,key);
9373 if (!de) return;
9374
9375 l = dictGetEntryVal(de);
9376 len = listLength(l);
9377 /* Note: we can't use something like while(listLength(l)) as the list
9378 * can be freed by the calling function when we remove the last element. */
9379 while (len--) {
9380 ln = listFirst(l);
9381 redisClient *c = ln->value;
9382
9383 if (dontWaitForSwappedKey(c,key)) {
9384 /* Put the client in the list of clients ready to go as we
9385 * loaded all the keys about it. */
9386 listAddNodeTail(server.io_ready_clients,c);
9387 }
9388 }
9389 }
9390
9391 /* =========================== Remote Configuration ========================= */
9392
9393 static void configSetCommand(redisClient *c) {
9394 robj *o = getDecodedObject(c->argv[3]);
9395 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9396 zfree(server.dbfilename);
9397 server.dbfilename = zstrdup(o->ptr);
9398 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9399 zfree(server.requirepass);
9400 server.requirepass = zstrdup(o->ptr);
9401 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9402 zfree(server.masterauth);
9403 server.masterauth = zstrdup(o->ptr);
9404 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9405 server.maxmemory = strtoll(o->ptr, NULL, 10);
9406 } else {
9407 addReplySds(c,sdscatprintf(sdsempty(),
9408 "-ERR not supported CONFIG parameter %s\r\n",
9409 (char*)c->argv[2]->ptr));
9410 decrRefCount(o);
9411 return;
9412 }
9413 decrRefCount(o);
9414 addReply(c,shared.ok);
9415 }
9416
9417 static void configGetCommand(redisClient *c) {
9418 robj *o = getDecodedObject(c->argv[2]);
9419 robj *lenobj = createObject(REDIS_STRING,NULL);
9420 char *pattern = o->ptr;
9421 int matches = 0;
9422
9423 addReply(c,lenobj);
9424 decrRefCount(lenobj);
9425
9426 if (stringmatch(pattern,"dbfilename",0)) {
9427 addReplyBulkCString(c,"dbfilename");
9428 addReplyBulkCString(c,server.dbfilename);
9429 matches++;
9430 }
9431 if (stringmatch(pattern,"requirepass",0)) {
9432 addReplyBulkCString(c,"requirepass");
9433 addReplyBulkCString(c,server.requirepass);
9434 matches++;
9435 }
9436 if (stringmatch(pattern,"masterauth",0)) {
9437 addReplyBulkCString(c,"masterauth");
9438 addReplyBulkCString(c,server.masterauth);
9439 matches++;
9440 }
9441 if (stringmatch(pattern,"maxmemory",0)) {
9442 char buf[128];
9443
9444 snprintf(buf,128,"%llu\n",server.maxmemory);
9445 addReplyBulkCString(c,"maxmemory");
9446 addReplyBulkCString(c,buf);
9447 matches++;
9448 }
9449 decrRefCount(o);
9450 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9451 }
9452
9453 static void configCommand(redisClient *c) {
9454 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9455 if (c->argc != 4) goto badarity;
9456 configSetCommand(c);
9457 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9458 if (c->argc != 3) goto badarity;
9459 configGetCommand(c);
9460 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9461 if (c->argc != 2) goto badarity;
9462 server.stat_numcommands = 0;
9463 server.stat_numconnections = 0;
9464 server.stat_expiredkeys = 0;
9465 server.stat_starttime = time(NULL);
9466 addReply(c,shared.ok);
9467 } else {
9468 addReplySds(c,sdscatprintf(sdsempty(),
9469 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9470 }
9471 return;
9472
9473 badarity:
9474 addReplySds(c,sdscatprintf(sdsempty(),
9475 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9476 (char*) c->argv[1]->ptr));
9477 }
9478
9479 /* =========================== Pubsub implementation ======================== */
9480
9481 static void freePubsubPattern(void *p) {
9482 pubsubPattern *pat = p;
9483
9484 decrRefCount(pat->pattern);
9485 zfree(pat);
9486 }
9487
9488 static int listMatchPubsubPattern(void *a, void *b) {
9489 pubsubPattern *pa = a, *pb = b;
9490
9491 return (pa->client == pb->client) &&
9492 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9493 }
9494
9495 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9496 * 0 if the client was already subscribed to that channel. */
9497 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9498 struct dictEntry *de;
9499 list *clients = NULL;
9500 int retval = 0;
9501
9502 /* Add the channel to the client -> channels hash table */
9503 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9504 retval = 1;
9505 incrRefCount(channel);
9506 /* Add the client to the channel -> list of clients hash table */
9507 de = dictFind(server.pubsub_channels,channel);
9508 if (de == NULL) {
9509 clients = listCreate();
9510 dictAdd(server.pubsub_channels,channel,clients);
9511 incrRefCount(channel);
9512 } else {
9513 clients = dictGetEntryVal(de);
9514 }
9515 listAddNodeTail(clients,c);
9516 }
9517 /* Notify the client */
9518 addReply(c,shared.mbulk3);
9519 addReply(c,shared.subscribebulk);
9520 addReplyBulk(c,channel);
9521 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9522 return retval;
9523 }
9524
9525 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9526 * 0 if the client was not subscribed to the specified channel. */
9527 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9528 struct dictEntry *de;
9529 list *clients;
9530 listNode *ln;
9531 int retval = 0;
9532
9533 /* Remove the channel from the client -> channels hash table */
9534 incrRefCount(channel); /* channel may be just a pointer to the same object
9535 we have in the hash tables. Protect it... */
9536 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9537 retval = 1;
9538 /* Remove the client from the channel -> clients list hash table */
9539 de = dictFind(server.pubsub_channels,channel);
9540 assert(de != NULL);
9541 clients = dictGetEntryVal(de);
9542 ln = listSearchKey(clients,c);
9543 assert(ln != NULL);
9544 listDelNode(clients,ln);
9545 if (listLength(clients) == 0) {
9546 /* Free the list and associated hash entry at all if this was
9547 * the latest client, so that it will be possible to abuse
9548 * Redis PUBSUB creating millions of channels. */
9549 dictDelete(server.pubsub_channels,channel);
9550 }
9551 }
9552 /* Notify the client */
9553 if (notify) {
9554 addReply(c,shared.mbulk3);
9555 addReply(c,shared.unsubscribebulk);
9556 addReplyBulk(c,channel);
9557 addReplyLong(c,dictSize(c->pubsub_channels)+
9558 listLength(c->pubsub_patterns));
9559
9560 }
9561 decrRefCount(channel); /* it is finally safe to release it */
9562 return retval;
9563 }
9564
9565 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9566 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9567 int retval = 0;
9568
9569 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9570 retval = 1;
9571 pubsubPattern *pat;
9572 listAddNodeTail(c->pubsub_patterns,pattern);
9573 incrRefCount(pattern);
9574 pat = zmalloc(sizeof(*pat));
9575 pat->pattern = getDecodedObject(pattern);
9576 pat->client = c;
9577 listAddNodeTail(server.pubsub_patterns,pat);
9578 }
9579 /* Notify the client */
9580 addReply(c,shared.mbulk3);
9581 addReply(c,shared.psubscribebulk);
9582 addReplyBulk(c,pattern);
9583 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9584 return retval;
9585 }
9586
9587 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9588 * 0 if the client was not subscribed to the specified channel. */
9589 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9590 listNode *ln;
9591 pubsubPattern pat;
9592 int retval = 0;
9593
9594 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9595 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9596 retval = 1;
9597 listDelNode(c->pubsub_patterns,ln);
9598 pat.client = c;
9599 pat.pattern = pattern;
9600 ln = listSearchKey(server.pubsub_patterns,&pat);
9601 listDelNode(server.pubsub_patterns,ln);
9602 }
9603 /* Notify the client */
9604 if (notify) {
9605 addReply(c,shared.mbulk3);
9606 addReply(c,shared.punsubscribebulk);
9607 addReplyBulk(c,pattern);
9608 addReplyLong(c,dictSize(c->pubsub_channels)+
9609 listLength(c->pubsub_patterns));
9610 }
9611 decrRefCount(pattern);
9612 return retval;
9613 }
9614
9615 /* Unsubscribe from all the channels. Return the number of channels the
9616 * client was subscribed from. */
9617 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9618 dictIterator *di = dictGetIterator(c->pubsub_channels);
9619 dictEntry *de;
9620 int count = 0;
9621
9622 while((de = dictNext(di)) != NULL) {
9623 robj *channel = dictGetEntryKey(de);
9624
9625 count += pubsubUnsubscribeChannel(c,channel,notify);
9626 }
9627 dictReleaseIterator(di);
9628 return count;
9629 }
9630
9631 /* Unsubscribe from all the patterns. Return the number of patterns the
9632 * client was subscribed from. */
9633 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9634 listNode *ln;
9635 listIter li;
9636 int count = 0;
9637
9638 listRewind(c->pubsub_patterns,&li);
9639 while ((ln = listNext(&li)) != NULL) {
9640 robj *pattern = ln->value;
9641
9642 count += pubsubUnsubscribePattern(c,pattern,notify);
9643 }
9644 return count;
9645 }
9646
9647 /* Publish a message */
9648 static int pubsubPublishMessage(robj *channel, robj *message) {
9649 int receivers = 0;
9650 struct dictEntry *de;
9651 listNode *ln;
9652 listIter li;
9653
9654 /* Send to clients listening for that channel */
9655 de = dictFind(server.pubsub_channels,channel);
9656 if (de) {
9657 list *list = dictGetEntryVal(de);
9658 listNode *ln;
9659 listIter li;
9660
9661 listRewind(list,&li);
9662 while ((ln = listNext(&li)) != NULL) {
9663 redisClient *c = ln->value;
9664
9665 addReply(c,shared.mbulk3);
9666 addReply(c,shared.messagebulk);
9667 addReplyBulk(c,channel);
9668 addReplyBulk(c,message);
9669 receivers++;
9670 }
9671 }
9672 /* Send to clients listening to matching channels */
9673 if (listLength(server.pubsub_patterns)) {
9674 listRewind(server.pubsub_patterns,&li);
9675 channel = getDecodedObject(channel);
9676 while ((ln = listNext(&li)) != NULL) {
9677 pubsubPattern *pat = ln->value;
9678
9679 if (stringmatchlen((char*)pat->pattern->ptr,
9680 sdslen(pat->pattern->ptr),
9681 (char*)channel->ptr,
9682 sdslen(channel->ptr),0)) {
9683 addReply(pat->client,shared.mbulk3);
9684 addReply(pat->client,shared.messagebulk);
9685 addReplyBulk(pat->client,channel);
9686 addReplyBulk(pat->client,message);
9687 receivers++;
9688 }
9689 }
9690 decrRefCount(channel);
9691 }
9692 return receivers;
9693 }
9694
9695 static void subscribeCommand(redisClient *c) {
9696 int j;
9697
9698 for (j = 1; j < c->argc; j++)
9699 pubsubSubscribeChannel(c,c->argv[j]);
9700 }
9701
9702 static void unsubscribeCommand(redisClient *c) {
9703 if (c->argc == 1) {
9704 pubsubUnsubscribeAllChannels(c,1);
9705 return;
9706 } else {
9707 int j;
9708
9709 for (j = 1; j < c->argc; j++)
9710 pubsubUnsubscribeChannel(c,c->argv[j],1);
9711 }
9712 }
9713
9714 static void psubscribeCommand(redisClient *c) {
9715 int j;
9716
9717 for (j = 1; j < c->argc; j++)
9718 pubsubSubscribePattern(c,c->argv[j]);
9719 }
9720
9721 static void punsubscribeCommand(redisClient *c) {
9722 if (c->argc == 1) {
9723 pubsubUnsubscribeAllPatterns(c,1);
9724 return;
9725 } else {
9726 int j;
9727
9728 for (j = 1; j < c->argc; j++)
9729 pubsubUnsubscribePattern(c,c->argv[j],1);
9730 }
9731 }
9732
9733 static void publishCommand(redisClient *c) {
9734 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9735 addReplyLong(c,receivers);
9736 }
9737
9738 /* ================================= Debugging ============================== */
9739
9740 static void debugCommand(redisClient *c) {
9741 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9742 *((char*)-1) = 'x';
9743 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9744 if (rdbSave(server.dbfilename) != REDIS_OK) {
9745 addReply(c,shared.err);
9746 return;
9747 }
9748 emptyDb();
9749 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9750 addReply(c,shared.err);
9751 return;
9752 }
9753 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9754 addReply(c,shared.ok);
9755 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9756 emptyDb();
9757 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9758 addReply(c,shared.err);
9759 return;
9760 }
9761 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9762 addReply(c,shared.ok);
9763 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9764 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9765 robj *key, *val;
9766
9767 if (!de) {
9768 addReply(c,shared.nokeyerr);
9769 return;
9770 }
9771 key = dictGetEntryKey(de);
9772 val = dictGetEntryVal(de);
9773 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9774 key->storage == REDIS_VM_SWAPPING)) {
9775 char *strenc;
9776 char buf[128];
9777
9778 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9779 strenc = strencoding[val->encoding];
9780 } else {
9781 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9782 strenc = buf;
9783 }
9784 addReplySds(c,sdscatprintf(sdsempty(),
9785 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9786 "encoding:%s serializedlength:%lld\r\n",
9787 (void*)key, key->refcount, (void*)val, val->refcount,
9788 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9789 } else {
9790 addReplySds(c,sdscatprintf(sdsempty(),
9791 "+Key at:%p refcount:%d, value swapped at: page %llu "
9792 "using %llu pages\r\n",
9793 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9794 (unsigned long long) key->vm.usedpages));
9795 }
9796 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9797 lookupKeyRead(c->db,c->argv[2]);
9798 addReply(c,shared.ok);
9799 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9800 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9801 robj *key, *val;
9802
9803 if (!server.vm_enabled) {
9804 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9805 return;
9806 }
9807 if (!de) {
9808 addReply(c,shared.nokeyerr);
9809 return;
9810 }
9811 key = dictGetEntryKey(de);
9812 val = dictGetEntryVal(de);
9813 /* If the key is shared we want to create a copy */
9814 if (key->refcount > 1) {
9815 robj *newkey = dupStringObject(key);
9816 decrRefCount(key);
9817 key = dictGetEntryKey(de) = newkey;
9818 }
9819 /* Swap it */
9820 if (key->storage != REDIS_VM_MEMORY) {
9821 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9822 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9823 dictGetEntryVal(de) = NULL;
9824 addReply(c,shared.ok);
9825 } else {
9826 addReply(c,shared.err);
9827 }
9828 } else {
9829 addReplySds(c,sdsnew(
9830 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9831 }
9832 }
9833
9834 static void _redisAssert(char *estr, char *file, int line) {
9835 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9836 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9837 #ifdef HAVE_BACKTRACE
9838 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9839 *((char*)-1) = 'x';
9840 #endif
9841 }
9842
9843 /* =================================== Main! ================================ */
9844
9845 #ifdef __linux__
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the parsed value, or -1 if the file cannot be opened, cannot be
 * read, or does not start with a number. Unparsable content is reported as
 * -1 rather than 0 (what atoi() would yield) so that it cannot be mistaken
 * for a real "0" setting. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
9859
9860 void linuxOvercommitMemoryWarning(void) {
9861 if (linuxOvercommitMemoryValue() == 0) {
9862 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9863 }
9864 }
9865 #endif /* __linux__ */
9866
9867 static void daemonize(void) {
9868 int fd;
9869 FILE *fp;
9870
9871 if (fork() != 0) exit(0); /* parent exits */
9872 setsid(); /* create a new session */
9873
9874 /* Every output goes to /dev/null. If Redis is daemonized but
9875 * the 'logfile' is set to 'stdout' in the configuration file
9876 * it will not log at all. */
9877 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9878 dup2(fd, STDIN_FILENO);
9879 dup2(fd, STDOUT_FILENO);
9880 dup2(fd, STDERR_FILENO);
9881 if (fd > STDERR_FILENO) close(fd);
9882 }
9883 /* Try to write the pid file */
9884 fp = fopen(server.pidfile,"w");
9885 if (fp) {
9886 fprintf(fp,"%d\n",getpid());
9887 fclose(fp);
9888 }
9889 }
9890
9891 static void version() {
9892 printf("Redis server version %s\n", REDIS_VERSION);
9893 exit(0);
9894 }
9895
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs("       ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
9901
9902 int main(int argc, char **argv) {
9903 time_t start;
9904
9905 initServerConfig();
9906 if (argc == 2) {
9907 if (strcmp(argv[1], "-v") == 0 ||
9908 strcmp(argv[1], "--version") == 0) version();
9909 if (strcmp(argv[1], "--help") == 0) usage();
9910 resetServerSaveParams();
9911 loadServerConfig(argv[1]);
9912 } else if ((argc > 2)) {
9913 usage();
9914 } else {
9915 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9916 }
9917 if (server.daemonize) daemonize();
9918 initServer();
9919 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9920 #ifdef __linux__
9921 linuxOvercommitMemoryWarning();
9922 #endif
9923 start = time(NULL);
9924 if (server.appendonly) {
9925 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9926 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9927 } else {
9928 if (rdbLoad(server.dbfilename) == REDIS_OK)
9929 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9930 }
9931 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9932 aeSetBeforeSleepProc(server.el,beforeSleep);
9933 aeMain(server.el);
9934 aeDeleteEventLoop(server.el);
9935 return 0;
9936 }
9937
9938 /* ============================= Backtrace support ========================= */
9939
9940 #ifdef HAVE_BACKTRACE
9941 static char *findFuncName(void *pointer, unsigned long *offset);
9942
9943 static void *getMcontextEip(ucontext_t *uc) {
9944 #if defined(__FreeBSD__)
9945 return (void*) uc->uc_mcontext.mc_eip;
9946 #elif defined(__dietlibc__)
9947 return (void*) uc->uc_mcontext.eip;
9948 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9949 #if __x86_64__
9950 return (void*) uc->uc_mcontext->__ss.__rip;
9951 #else
9952 return (void*) uc->uc_mcontext->__ss.__eip;
9953 #endif
9954 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9955 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9956 return (void*) uc->uc_mcontext->__ss.__rip;
9957 #else
9958 return (void*) uc->uc_mcontext->__ss.__eip;
9959 #endif
9960 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9961 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9962 #elif defined(__ia64__) /* Linux IA64 */
9963 return (void*) uc->uc_mcontext.sc_ip;
9964 #else
9965 return NULL;
9966 #endif
9967 }
9968
9969 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9970 void *trace[100];
9971 char **messages = NULL;
9972 int i, trace_size = 0;
9973 unsigned long offset=0;
9974 ucontext_t *uc = (ucontext_t*) secret;
9975 sds infostring;
9976 REDIS_NOTUSED(info);
9977
9978 redisLog(REDIS_WARNING,
9979 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9980 infostring = genRedisInfoString();
9981 redisLog(REDIS_WARNING, "%s",infostring);
9982 /* It's not safe to sdsfree() the returned string under memory
9983 * corruption conditions. Let it leak as we are going to abort */
9984
9985 trace_size = backtrace(trace, 100);
9986 /* overwrite sigaction with caller's address */
9987 if (getMcontextEip(uc) != NULL) {
9988 trace[1] = getMcontextEip(uc);
9989 }
9990 messages = backtrace_symbols(trace, trace_size);
9991
9992 for (i=1; i<trace_size; ++i) {
9993 char *fn = findFuncName(trace[i], &offset), *p;
9994
9995 p = strchr(messages[i],'+');
9996 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9997 redisLog(REDIS_WARNING,"%s", messages[i]);
9998 } else {
9999 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10000 }
10001 }
10002 /* free(messages); Don't call free() with possibly corrupted memory. */
10003 _exit(0);
10004 }
10005
10006 static void setupSigSegvAction(void) {
10007 struct sigaction act;
10008
10009 sigemptyset (&act.sa_mask);
10010 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10011 * is used. Otherwise, sa_handler is used */
10012 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10013 act.sa_sigaction = segvHandler;
10014 sigaction (SIGSEGV, &act, NULL);
10015 sigaction (SIGBUS, &act, NULL);
10016 sigaction (SIGFPE, &act, NULL);
10017 sigaction (SIGILL, &act, NULL);
10018 sigaction (SIGBUS, &act, NULL);
10019 return;
10020 }
10021
10022 #include "staticsymbols.h"
10023 /* This function try to convert a pointer into a function name. It's used in
10024 * oreder to provide a backtrace under segmentation fault that's able to
10025 * display functions declared as static (otherwise the backtrace is useless). */
10026 static char *findFuncName(void *pointer, unsigned long *offset){
10027 int i, ret = -1;
10028 unsigned long off, minoff = 0;
10029
10030 /* Try to match against the Symbol with the smallest offset */
10031 for (i=0; symsTable[i].pointer; i++) {
10032 unsigned long lp = (unsigned long) pointer;
10033
10034 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10035 off=lp-symsTable[i].pointer;
10036 if (ret < 0 || off < minoff) {
10037 minoff=off;
10038 ret=i;
10039 }
10040 }
10041 }
10042 if (ret == -1) return NULL;
10043 *offset = minoff;
10044 return symsTable[ret].name;
10045 }
10046 #else /* HAVE_BACKTRACE */
/* Backtrace support is compiled out: there is no crash handler to install,
 * so this is intentionally a no-op. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
10049 #endif /* HAVE_BACKTRACE */
10050
10051
10052
10053 /* The End */
10054
10055
10056