]> git.saurik.com Git - redis.git/blob - redis.c
Remove trailing whitespace.
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130
/* Printable names of the object encodings, listed in the same order as the
 * REDIS_ENCODING_* values (0..3). */
static char *strencoding[] = {
    "raw",          /* 0 */
    "int",          /* 1 */
    "zipmap",       /* 2 */
    "hashtable"     /* 3 */
};
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* Custom assertion (used because we can print the stack trace). When the
 * checked expression is false, _redisAssert() is called with the stringified
 * expression, the source file name and the line number, and the process is
 * then terminated with _exit(1). When it is true the macro does nothing. */
static void _redisAssert(char *estr, char *file, int line);
#define redisAssert(_e) \
    (!(_e) ? (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)) : (void)0)
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure: where an object is stored in the swap file and
 * when it was last accessed.
 *
 * NOTE(review): the declarator after the closing brace also defines a
 * file-scope variable named 'vm' of this type -- confirm it is intentional. */
struct redisObjectVM {
    off_t page;         /* Page at which the object is stored on disk */
    off_t usedpages;    /* Number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type;
257 unsigned char encoding;
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 * _var is the robj itself (an lvalue, not a pointer to it) and _ptr is the
 * value stored in its 'ptr' field. The object gets refcount 1, type
 * REDIS_STRING and encoding REDIS_ENCODING_RAW; 'storage' is written only
 * when server.vm_enabled is set.
 *
 * Both arguments are parenthesized in the expansion, so expressions such as
 * *objptr can be passed as _var.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * NOTE(review): the expansion still ends with a semicolon after while(0),
 * which makes "if (x) initStaticStringObject(a,b); else ..." a syntax
 * error. It is kept because callers cannot be audited from this chunk;
 * consider dropping it once every call site ends with its own ';'. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0);
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* A (seconds, changes) pair. The server state keeps an array of these in
 * its 'saveparams' field. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c);
452 struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* Associates a name with a value stored as an unsigned long.
 * NOTE(review): presumably a function name and its address -- confirm
 * against the code that fills this structure. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
/* The skiplist itself: header and tail nodes, a length and a level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects */
507
508 #define REDIS_SHARED_INTEGERS 10000
509 struct sharedObjectsStruct {
510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
511 *colon, *nullbulk, *nullmultibulk, *queued,
512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
514 *select0, *select1, *select2, *select3, *select4,
515 *select5, *select6, *select7, *select8, *select9,
516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
518 } shared;
519
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
525
526 /* VM threaded I/O request message */
527 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
530 typedef struct iojob {
531 int type; /* Request type, REDIS_IOJOB_* */
532 redisDb *db;/* Redis database */
533 robj *key; /* This I/O request is about swapping this key */
534 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
535 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540 } iojob;
541
542 /*================================ Prototypes =============================== */
543
544 static void freeStringObject(robj *o);
545 static void freeListObject(robj *o);
546 static void freeSetObject(robj *o);
547 static void decrRefCount(void *o);
548 static robj *createObject(int type, void *ptr);
549 static void freeClient(redisClient *c);
550 static int rdbLoad(char *filename);
551 static void addReply(redisClient *c, robj *obj);
552 static void addReplySds(redisClient *c, sds s);
553 static void incrRefCount(robj *o);
554 static int rdbSaveBackground(char *filename);
555 static robj *createStringObject(char *ptr, size_t len);
556 static robj *dupStringObject(robj *o);
557 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
558 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
559 static int syncWithMaster(void);
560 static robj *tryObjectEncoding(robj *o);
561 static robj *getDecodedObject(robj *o);
562 static int removeExpire(redisDb *db, robj *key);
563 static int expireIfNeeded(redisDb *db, robj *key);
564 static int deleteIfVolatile(redisDb *db, robj *key);
565 static int deleteIfSwapped(redisDb *db, robj *key);
566 static int deleteKey(redisDb *db, robj *key);
567 static time_t getExpire(redisDb *db, robj *key);
568 static int setExpire(redisDb *db, robj *key, time_t when);
569 static void updateSlavesWaitingBgsave(int bgsaveerr);
570 static void freeMemoryIfNeeded(void);
571 static int processCommand(redisClient *c);
572 static void setupSigSegvAction(void);
573 static void rdbRemoveTempFile(pid_t childpid);
574 static void aofRemoveTempFile(pid_t childpid);
575 static size_t stringObjectLen(robj *o);
576 static void processInputBuffer(redisClient *c);
577 static zskiplist *zslCreate(void);
578 static void zslFree(zskiplist *zsl);
579 static void zslInsert(zskiplist *zsl, double score, robj *obj);
580 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
581 static void initClientMultiState(redisClient *c);
582 static void freeClientMultiState(redisClient *c);
583 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
584 static void unblockClientWaitingData(redisClient *c);
585 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
586 static void vmInit(void);
587 static void vmMarkPagesFree(off_t page, off_t count);
588 static robj *vmLoadObject(robj *key);
589 static robj *vmPreviewObject(robj *key);
590 static int vmSwapOneObjectBlocking(void);
591 static int vmSwapOneObjectThreaded(void);
592 static int vmCanSwapOut(void);
593 static int tryFreeOneObjectFromFreelist(void);
594 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596 static void vmCancelThreadedIOJob(robj *o);
597 static void lockThreadedIO(void);
598 static void unlockThreadedIO(void);
599 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600 static void freeIOJob(iojob *j);
601 static void queueIOJob(iojob *j);
602 static int vmWriteObjectOnSwap(robj *o, off_t page);
603 static robj *vmReadObjectFromSwap(off_t page, int type);
604 static void waitEmptyIOJobsQueue(void);
605 static void vmReopenSwapFile(void);
606 static int vmFreePage(off_t page);
607 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
608 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609 static int dontWaitForSwappedKey(redisClient *c, robj *key);
610 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612 static struct redisCommand *lookupCommand(char *name);
613 static void call(redisClient *c, struct redisCommand *cmd);
614 static void resetClient(redisClient *c);
615 static void convertToRealHash(robj *o);
616 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618 static void freePubsubPattern(void *p);
619 static int listMatchPubsubPattern(void *a, void *b);
620 static int compareStringObjects(robj *a, robj *b);
621 static void usage();
622
623 static void authCommand(redisClient *c);
624 static void pingCommand(redisClient *c);
625 static void echoCommand(redisClient *c);
626 static void setCommand(redisClient *c);
627 static void setnxCommand(redisClient *c);
628 static void getCommand(redisClient *c);
629 static void delCommand(redisClient *c);
630 static void existsCommand(redisClient *c);
631 static void incrCommand(redisClient *c);
632 static void decrCommand(redisClient *c);
633 static void incrbyCommand(redisClient *c);
634 static void decrbyCommand(redisClient *c);
635 static void selectCommand(redisClient *c);
636 static void randomkeyCommand(redisClient *c);
637 static void keysCommand(redisClient *c);
638 static void dbsizeCommand(redisClient *c);
639 static void lastsaveCommand(redisClient *c);
640 static void saveCommand(redisClient *c);
641 static void bgsaveCommand(redisClient *c);
642 static void bgrewriteaofCommand(redisClient *c);
643 static void shutdownCommand(redisClient *c);
644 static void moveCommand(redisClient *c);
645 static void renameCommand(redisClient *c);
646 static void renamenxCommand(redisClient *c);
647 static void lpushCommand(redisClient *c);
648 static void rpushCommand(redisClient *c);
649 static void lpopCommand(redisClient *c);
650 static void rpopCommand(redisClient *c);
651 static void llenCommand(redisClient *c);
652 static void lindexCommand(redisClient *c);
653 static void lrangeCommand(redisClient *c);
654 static void ltrimCommand(redisClient *c);
655 static void typeCommand(redisClient *c);
656 static void lsetCommand(redisClient *c);
657 static void saddCommand(redisClient *c);
658 static void sremCommand(redisClient *c);
659 static void smoveCommand(redisClient *c);
660 static void sismemberCommand(redisClient *c);
661 static void scardCommand(redisClient *c);
662 static void spopCommand(redisClient *c);
663 static void srandmemberCommand(redisClient *c);
664 static void sinterCommand(redisClient *c);
665 static void sinterstoreCommand(redisClient *c);
666 static void sunionCommand(redisClient *c);
667 static void sunionstoreCommand(redisClient *c);
668 static void sdiffCommand(redisClient *c);
669 static void sdiffstoreCommand(redisClient *c);
670 static void syncCommand(redisClient *c);
671 static void flushdbCommand(redisClient *c);
672 static void flushallCommand(redisClient *c);
673 static void sortCommand(redisClient *c);
674 static void lremCommand(redisClient *c);
675 static void rpoplpushcommand(redisClient *c);
676 static void infoCommand(redisClient *c);
677 static void mgetCommand(redisClient *c);
678 static void monitorCommand(redisClient *c);
679 static void expireCommand(redisClient *c);
680 static void expireatCommand(redisClient *c);
681 static void getsetCommand(redisClient *c);
682 static void ttlCommand(redisClient *c);
683 static void slaveofCommand(redisClient *c);
684 static void debugCommand(redisClient *c);
685 static void msetCommand(redisClient *c);
686 static void msetnxCommand(redisClient *c);
687 static void zaddCommand(redisClient *c);
688 static void zincrbyCommand(redisClient *c);
689 static void zrangeCommand(redisClient *c);
690 static void zrangebyscoreCommand(redisClient *c);
691 static void zcountCommand(redisClient *c);
692 static void zrevrangeCommand(redisClient *c);
693 static void zcardCommand(redisClient *c);
694 static void zremCommand(redisClient *c);
695 static void zscoreCommand(redisClient *c);
696 static void zremrangebyscoreCommand(redisClient *c);
697 static void multiCommand(redisClient *c);
698 static void execCommand(redisClient *c);
699 static void discardCommand(redisClient *c);
700 static void blpopCommand(redisClient *c);
701 static void brpopCommand(redisClient *c);
702 static void appendCommand(redisClient *c);
703 static void substrCommand(redisClient *c);
704 static void zrankCommand(redisClient *c);
705 static void zrevrankCommand(redisClient *c);
706 static void hsetCommand(redisClient *c);
707 static void hmsetCommand(redisClient *c);
708 static void hgetCommand(redisClient *c);
709 static void hdelCommand(redisClient *c);
710 static void hlenCommand(redisClient *c);
711 static void zremrangebyrankCommand(redisClient *c);
712 static void zunionCommand(redisClient *c);
713 static void zinterCommand(redisClient *c);
714 static void hkeysCommand(redisClient *c);
715 static void hvalsCommand(redisClient *c);
716 static void hgetallCommand(redisClient *c);
717 static void hexistsCommand(redisClient *c);
718 static void configCommand(redisClient *c);
719 static void hincrbyCommand(redisClient *c);
720 static void subscribeCommand(redisClient *c);
721 static void unsubscribeCommand(redisClient *c);
722 static void psubscribeCommand(redisClient *c);
723 static void punsubscribeCommand(redisClient *c);
724 static void publishCommand(redisClient *c);
725
726 /*================================= Globals ================================= */
727
728 /* Global vars */
729 static struct redisServer server; /* server global state */
/* Table of the commands known to the server, one row per command. Each row
 * lists, in order: the command name, the function implementing it, an
 * integer argument-count field, the REDIS_CMD_* flags, an optional hook
 * (NULL for every command except "zunion" and "zinter", which install
 * zunionInterBlockClientOnSwappedKeys) and three trailing integers.
 * NOTE(review): the argument count presumably includes the command name
 * itself, with a negative value meaning "at least that many" -- confirm
 * against struct redisCommand. The three trailing integers look like the
 * first key, last key and key step argument positions -- TODO confirm.
 * The table is terminated by an all NULL/zero sentinel row. */
730 static struct redisCommand cmdTable[] = {
731 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
732 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
733 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
734 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
735 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
737 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
739 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
741 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
753 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
754 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
755 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
756 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
757 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
758 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
762 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
763 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
764 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
765 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
766 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
767 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
771 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
774 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
775 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
784 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
786 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
787 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
793 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
797 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
798 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
799 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
810 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
818 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
826 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
829 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
834 {NULL,NULL,0,0,NULL,0,0,0}
835 };
836
837 /*============================ Utility functions ============================ */
838
/* Glob-style pattern matching of 'string' (stringLen bytes) against
 * 'pattern' (patternLen bytes). Neither buffer needs to be NUL terminated:
 * only bytes inside the given lengths are read.
 *
 * Supported syntax:
 *   *       matches any sequence of bytes, including the empty one
 *   ?       matches exactly one byte
 *   [abc]   matches one byte of the set, [^abc] negates the set, [a-z] is
 *           an inclusive range and \x inside the set is a literal x
 *   \x      matches the byte x literally
 *
 * When 'nocase' is non-zero bytes are compared after tolower() (escaped
 * bytes inside a [...] set are always compared as they are).
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte: an exhausted string can't
             * match, and string[0] must not be read in that case. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the advance
                     * performed after the switch lands exactly on the
                     * end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern byte, in
             * which case it is matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: trailing stars still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
961
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are computed with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
965
966 static void redisLog(int level, const char *fmt, ...) {
967 va_list ap;
968 FILE *fp;
969
970 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
971 if (!fp) return;
972
973 va_start(ap, fmt);
974 if (level >= server.verbosity) {
975 char *c = ".-*#";
976 char buf[64];
977 time_t now;
978
979 now = time(NULL);
980 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
981 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
982 vfprintf(fp, fmt, ap);
983 fprintf(fp,"\n");
984 fflush(fp);
985 }
986 va_end(ap);
987
988 if (server.logfile) fclose(fp);
989 }
990
991 /*====================== Hash table type implementation ==================== */
992
993 /* This is a hash table type that uses the SDS dynamic strings library as
994 * keys and redis objects as values (objects can hold SDS strings,
995 * lists, sets). */
996
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1002
1003 static void dictListDestructor(void *privdata, void *val)
1004 {
1005 DICT_NOTUSED(privdata);
1006 listRelease((list*)val);
1007 }
1008
1009 static int sdsDictKeyCompare(void *privdata, const void *key1,
1010 const void *key2)
1011 {
1012 int l1,l2;
1013 DICT_NOTUSED(privdata);
1014
1015 l1 = sdslen((sds)key1);
1016 l2 = sdslen((sds)key2);
1017 if (l1 != l2) return 0;
1018 return memcmp(key1, key2, l1) == 0;
1019 }
1020
/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1028
1029 static int dictObjKeyCompare(void *privdata, const void *key1,
1030 const void *key2)
1031 {
1032 const robj *o1 = key1, *o2 = key2;
1033 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1034 }
1035
1036 static unsigned int dictObjHash(const void *key) {
1037 const robj *o = key;
1038 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1039 }
1040
1041 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1042 const void *key2)
1043 {
1044 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1045 int cmp;
1046
1047 if (o1->encoding == REDIS_ENCODING_INT &&
1048 o2->encoding == REDIS_ENCODING_INT &&
1049 o1->ptr == o2->ptr) return 1;
1050
1051 o1 = getDecodedObject(o1);
1052 o2 = getDecodedObject(o2);
1053 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1054 decrRefCount(o1);
1055 decrRefCount(o2);
1056 return cmp;
1057 }
1058
1059 static unsigned int dictEncObjHash(const void *key) {
1060 robj *o = (robj*) key;
1061
1062 if (o->encoding == REDIS_ENCODING_RAW) {
1063 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1064 } else {
1065 if (o->encoding == REDIS_ENCODING_INT) {
1066 char buf[32];
1067 int len;
1068
1069 len = snprintf(buf,32,"%ld",(long)o->ptr);
1070 return dictGenHashFunction((unsigned char*)buf, len);
1071 } else {
1072 unsigned int hash;
1073
1074 o = getDecodedObject(o);
1075 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1076 decrRefCount(o);
1077 return hash;
1078 }
1079 }
1080 }
1081
1082 /* Sets type and expires.
 * Keys are hashed and compared with the encoding-aware dictEncObj*
 * callbacks and released with dictRedisObjectDestructor; values are neither
 * duplicated nor destroyed.
 * NOTE(review): "and expires" may be stale, since the type commented as
 * Db->expires in this file is keyptrDictType -- confirm. */
1083 static dictType setDictType = {
1084 dictEncObjHash, /* hash function */
1085 NULL, /* key dup */
1086 NULL, /* val dup */
1087 dictEncObjKeyCompare, /* key compare */
1088 dictRedisObjectDestructor, /* key destructor */
1089 NULL /* val destructor */
1090 };
1091
1092 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Same key callbacks as the Sets type; values are released with
 * dictVanillaFree, that is, with a plain zfree(). */
1093 static dictType zsetDictType = {
1094 dictEncObjHash, /* hash function */
1095 NULL, /* key dup */
1096 NULL, /* val dup */
1097 dictEncObjKeyCompare, /* key compare */
1098 dictRedisObjectDestructor, /* key destructor */
1099 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1100 };
1101
1102 /* Db->dict.
 * Keys are hashed and compared through their 'ptr' field as SDS strings
 * (dictObjHash / dictObjKeyCompare); both keys and values are released with
 * dictRedisObjectDestructor. */
1103 static dictType dbDictType = {
1104 dictObjHash, /* hash function */
1105 NULL, /* key dup */
1106 NULL, /* val dup */
1107 dictObjKeyCompare, /* key compare */
1108 dictRedisObjectDestructor, /* key destructor */
1109 dictRedisObjectDestructor /* val destructor */
1110 };
1111
1112 /* Db->expires.
 * Same key callbacks as Db->dict, but no value destructor is installed:
 * serverCron() reads the value of these entries directly as a time_t. */
1113 static dictType keyptrDictType = {
1114 dictObjHash, /* hash function */
1115 NULL, /* key dup */
1116 NULL, /* val dup */
1117 dictObjKeyCompare, /* key compare */
1118 dictRedisObjectDestructor, /* key destructor */
1119 NULL /* val destructor */
1120 };
1121
1122 /* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys use the encoding-aware dictEncObj* callbacks; both keys and values
 * are released with dictRedisObjectDestructor. */
1123 static dictType hashDictType = {
1124 dictEncObjHash, /* hash function */
1125 NULL, /* key dup */
1126 NULL, /* val dup */
1127 dictEncObjKeyCompare, /* key compare */
1128 dictRedisObjectDestructor, /* key destructor */
1129 dictRedisObjectDestructor /* val destructor */
1130 };
1131
1132 /* Keylist hash table type has unencoded redis objects as keys and
1133 * lists as values. It's used for blocking operations (BLPOP) and to
1134 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Values are released with dictListDestructor, that is, listRelease(). */
1135 static dictType keylistDictType = {
1136 dictObjHash, /* hash function */
1137 NULL, /* key dup */
1138 NULL, /* val dup */
1139 dictObjKeyCompare, /* key compare */
1140 dictRedisObjectDestructor, /* key destructor */
1141 dictListDestructor /* val destructor */
1142 };
1143
1144 static void version();
1145
1146 /* ========================= Random utility functions ======================= */
1147
1148 /* Redis generally does not try to recover from out of memory conditions
1149 * when allocating objects or strings, it is not clear if it will be possible
1150 * to report this condition to the client since the networking layer itself
1151 * is based on heap allocation for send buffers, so we simply abort.
1152 * At least the code will be simpler to read... */
1153 static void oom(const char *msg) {
1154 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1155 sleep(1);
1156 abort();
1157 }
1158
1159 /* ====================== Redis server networking stuff ===================== */
1160 static void closeTimedoutClients(void) {
1161 redisClient *c;
1162 listNode *ln;
1163 time_t now = time(NULL);
1164 listIter li;
1165
1166 listRewind(server.clients,&li);
1167 while ((ln = listNext(&li)) != NULL) {
1168 c = listNodeValue(ln);
1169 if (server.maxidletime &&
1170 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1171 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1172 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1173 listLength(c->pubsub_patterns) == 0 &&
1174 (now - c->lastinteraction > server.maxidletime))
1175 {
1176 redisLog(REDIS_VERBOSE,"Closing idle client");
1177 freeClient(c);
1178 } else if (c->flags & REDIS_BLOCKED) {
1179 if (c->blockingto != 0 && c->blockingto < now) {
1180 addReply(c,shared.nullmultibulk);
1181 unblockClientWaitingData(c);
1182 }
1183 }
1184 }
1185 }
1186
1187 static int htNeedsResize(dict *dict) {
1188 long long size, used;
1189
1190 size = dictSlots(dict);
1191 used = dictSize(dict);
1192 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1193 (used*100/size < REDIS_HT_MINFILL));
1194 }
1195
1196 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1197 * we resize the hash table to save memory */
1198 static void tryResizeHashTables(void) {
1199 int j;
1200
1201 for (j = 0; j < server.dbnum; j++) {
1202 if (htNeedsResize(server.db[j].dict)) {
1203 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1204 dictResize(server.db[j].dict);
1205 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1206 }
1207 if (htNeedsResize(server.db[j].expires))
1208 dictResize(server.db[j].expires);
1209 }
1210 }
1211
1212 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1213 void backgroundSaveDoneHandler(int statloc) {
1214 int exitcode = WEXITSTATUS(statloc);
1215 int bysignal = WIFSIGNALED(statloc);
1216
1217 if (!bysignal && exitcode == 0) {
1218 redisLog(REDIS_NOTICE,
1219 "Background saving terminated with success");
1220 server.dirty = 0;
1221 server.lastsave = time(NULL);
1222 } else if (!bysignal && exitcode != 0) {
1223 redisLog(REDIS_WARNING, "Background saving error");
1224 } else {
1225 redisLog(REDIS_WARNING,
1226 "Background saving terminated by signal %d", WTERMSIG(statloc));
1227 rdbRemoveTempFile(server.bgsavechildpid);
1228 }
1229 server.bgsavechildpid = -1;
1230 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1231 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1232 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1233 }
1234
1235 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1236 * Handle this. */
1237 void backgroundRewriteDoneHandler(int statloc) {
1238 int exitcode = WEXITSTATUS(statloc);
1239 int bysignal = WIFSIGNALED(statloc);
1240
1241 if (!bysignal && exitcode == 0) {
1242 int fd;
1243 char tmpfile[256];
1244
1245 redisLog(REDIS_NOTICE,
1246 "Background append only file rewriting terminated with success");
1247 /* Now it's time to flush the differences accumulated by the parent */
1248 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1249 fd = open(tmpfile,O_WRONLY|O_APPEND);
1250 if (fd == -1) {
1251 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1252 goto cleanup;
1253 }
1254 /* Flush our data... */
1255 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1256 (signed) sdslen(server.bgrewritebuf)) {
1257 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1258 close(fd);
1259 goto cleanup;
1260 }
1261 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1262 /* Now our work is to rename the temp file into the stable file. And
1263 * switch the file descriptor used by the server for append only. */
1264 if (rename(tmpfile,server.appendfilename) == -1) {
1265 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1266 close(fd);
1267 goto cleanup;
1268 }
1269 /* Mission completed... almost */
1270 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1271 if (server.appendfd != -1) {
1272 /* If append only is actually enabled... */
1273 close(server.appendfd);
1274 server.appendfd = fd;
1275 fsync(fd);
1276 server.appendseldb = -1; /* Make sure it will issue SELECT */
1277 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1278 } else {
1279 /* If append only is disabled we just generate a dump in this
1280 * format. Why not? */
1281 close(fd);
1282 }
1283 } else if (!bysignal && exitcode != 0) {
1284 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1285 } else {
1286 redisLog(REDIS_WARNING,
1287 "Background append only file rewriting terminated by signal %d",
1288 WTERMSIG(statloc));
1289 }
1290 cleanup:
1291 sdsfree(server.bgrewritebuf);
1292 server.bgrewritebuf = sdsempty();
1293 aofRemoveTempFile(server.bgrewritechildpid);
1294 server.bgrewritechildpid = -1;
1295 }
1296
1297 /* This function is called once a background process of some kind terminates,
1298 * as we want to avoid resizing the hash tables when there is a child in order
1299 * to play well with copy-on-write (otherwise when a resize happens lots of
1300 * memory pages are copied). The goal of this function is to update the ability
1301 * for dict.c to resize the hash tables accordingly to the fact we have o not
1302 * running childs. */
1303 static void updateDictResizePolicy(void) {
1304 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1305 dictEnableResize();
1306 else
1307 dictDisableResize();
1308 }
1309
1310 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1311 int j, loops = server.cronloops++;
1312 REDIS_NOTUSED(eventLoop);
1313 REDIS_NOTUSED(id);
1314 REDIS_NOTUSED(clientData);
1315
1316 /* We take a cached value of the unix time in the global state because
1317 * with virtual memory and aging there is to store the current time
1318 * in objects at every object access, and accuracy is not needed.
1319 * To access a global var is faster than calling time(NULL) */
1320 server.unixtime = time(NULL);
1321
1322 /* Show some info about non-empty databases */
1323 for (j = 0; j < server.dbnum; j++) {
1324 long long size, used, vkeys;
1325
1326 size = dictSlots(server.db[j].dict);
1327 used = dictSize(server.db[j].dict);
1328 vkeys = dictSize(server.db[j].expires);
1329 if (!(loops % 50) && (used || vkeys)) {
1330 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1331 /* dictPrintStats(server.dict); */
1332 }
1333 }
1334
1335 /* We don't want to resize the hash tables while a bacground saving
1336 * is in progress: the saving child is created using fork() that is
1337 * implemented with a copy-on-write semantic in most modern systems, so
1338 * if we resize the HT while there is the saving child at work actually
1339 * a lot of memory movements in the parent will cause a lot of pages
1340 * copied. */
1341 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1342 !(loops % 10))
1343 {
1344 tryResizeHashTables();
1345 }
1346
1347 /* Show information about connected clients */
1348 if (!(loops % 50)) {
1349 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1350 listLength(server.clients)-listLength(server.slaves),
1351 listLength(server.slaves),
1352 zmalloc_used_memory());
1353 }
1354
1355 /* Close connections of timedout clients */
1356 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1357 closeTimedoutClients();
1358
1359 /* Check if a background saving or AOF rewrite in progress terminated */
1360 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1361 int statloc;
1362 pid_t pid;
1363
1364 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1365 if (pid == server.bgsavechildpid) {
1366 backgroundSaveDoneHandler(statloc);
1367 } else {
1368 backgroundRewriteDoneHandler(statloc);
1369 }
1370 updateDictResizePolicy();
1371 }
1372 } else {
1373 /* If there is not a background saving in progress check if
1374 * we have to save now */
1375 time_t now = time(NULL);
1376 for (j = 0; j < server.saveparamslen; j++) {
1377 struct saveparam *sp = server.saveparams+j;
1378
1379 if (server.dirty >= sp->changes &&
1380 now-server.lastsave > sp->seconds) {
1381 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1382 sp->changes, sp->seconds);
1383 rdbSaveBackground(server.dbfilename);
1384 break;
1385 }
1386 }
1387 }
1388
1389 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1390 * will use few CPU cycles if there are few expiring keys, otherwise
1391 * it will get more aggressive to avoid that too much memory is used by
1392 * keys that can be removed from the keyspace. */
1393 for (j = 0; j < server.dbnum; j++) {
1394 int expired;
1395 redisDb *db = server.db+j;
1396
1397 /* Continue to expire if at the end of the cycle more than 25%
1398 * of the keys were expired. */
1399 do {
1400 long num = dictSize(db->expires);
1401 time_t now = time(NULL);
1402
1403 expired = 0;
1404 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1405 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1406 while (num--) {
1407 dictEntry *de;
1408 time_t t;
1409
1410 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1411 t = (time_t) dictGetEntryVal(de);
1412 if (now > t) {
1413 deleteKey(db,dictGetEntryKey(de));
1414 expired++;
1415 server.stat_expiredkeys++;
1416 }
1417 }
1418 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1419 }
1420
1421 /* Swap a few keys on disk if we are over the memory limit and VM
1422 * is enbled. Try to free objects from the free list first. */
1423 if (vmCanSwapOut()) {
1424 while (server.vm_enabled && zmalloc_used_memory() >
1425 server.vm_max_memory)
1426 {
1427 int retval;
1428
1429 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1430 retval = (server.vm_max_threads == 0) ?
1431 vmSwapOneObjectBlocking() :
1432 vmSwapOneObjectThreaded();
1433 if (retval == REDIS_ERR && !(loops % 300) &&
1434 zmalloc_used_memory() >
1435 (server.vm_max_memory+server.vm_max_memory/10))
1436 {
1437 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1438 }
1439 /* Note that when using threade I/O we free just one object,
1440 * because anyway when the I/O thread in charge to swap this
1441 * object out will finish, the handler of completed jobs
1442 * will try to swap more objects if we are still out of memory. */
1443 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1444 }
1445 }
1446
1447 /* Check if we should connect to a MASTER */
1448 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1449 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1450 if (syncWithMaster() == REDIS_OK) {
1451 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1452 }
1453 }
1454 return 100;
1455 }
1456
1457 /* This function gets called every time Redis is entering the
1458 * main loop of the event driven library, that is, before to sleep
1459 * for ready file descriptors. */
1460 static void beforeSleep(struct aeEventLoop *eventLoop) {
1461 REDIS_NOTUSED(eventLoop);
1462
1463 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1464 listIter li;
1465 listNode *ln;
1466
1467 listRewind(server.io_ready_clients,&li);
1468 while((ln = listNext(&li))) {
1469 redisClient *c = ln->value;
1470 struct redisCommand *cmd;
1471
1472 /* Resume the client. */
1473 listDelNode(server.io_ready_clients,ln);
1474 c->flags &= (~REDIS_IO_WAIT);
1475 server.vm_blocked_clients--;
1476 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1477 readQueryFromClient, c);
1478 cmd = lookupCommand(c->argv[0]->ptr);
1479 assert(cmd != NULL);
1480 call(c,cmd);
1481 resetClient(c);
1482 /* There may be more data to process in the input buffer. */
1483 if (c->querybuf && sdslen(c->querybuf) > 0)
1484 processInputBuffer(c);
1485 }
1486 }
1487 }
1488
1489 static void createSharedObjects(void) {
1490 int j;
1491
1492 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1493 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1494 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1495 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1496 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1497 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1498 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1499 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1500 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1501 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1502 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1503 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1504 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1505 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1506 "-ERR no such key\r\n"));
1507 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1508 "-ERR syntax error\r\n"));
1509 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1510 "-ERR source and destination objects are the same\r\n"));
1511 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1512 "-ERR index out of range\r\n"));
1513 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1514 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1515 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1516 shared.select0 = createStringObject("select 0\r\n",10);
1517 shared.select1 = createStringObject("select 1\r\n",10);
1518 shared.select2 = createStringObject("select 2\r\n",10);
1519 shared.select3 = createStringObject("select 3\r\n",10);
1520 shared.select4 = createStringObject("select 4\r\n",10);
1521 shared.select5 = createStringObject("select 5\r\n",10);
1522 shared.select6 = createStringObject("select 6\r\n",10);
1523 shared.select7 = createStringObject("select 7\r\n",10);
1524 shared.select8 = createStringObject("select 8\r\n",10);
1525 shared.select9 = createStringObject("select 9\r\n",10);
1526 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1527 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1528 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1529 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1530 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1531 shared.mbulk3 = createStringObject("*3\r\n",4);
1532 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1533 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1534 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1535 }
1536 }
1537
1538 static void appendServerSaveParams(time_t seconds, int changes) {
1539 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1540 server.saveparams[server.saveparamslen].seconds = seconds;
1541 server.saveparams[server.saveparamslen].changes = changes;
1542 server.saveparamslen++;
1543 }
1544
1545 static void resetServerSaveParams() {
1546 zfree(server.saveparams);
1547 server.saveparams = NULL;
1548 server.saveparamslen = 0;
1549 }
1550
1551 static void initServerConfig() {
1552 server.dbnum = REDIS_DEFAULT_DBNUM;
1553 server.port = REDIS_SERVERPORT;
1554 server.verbosity = REDIS_VERBOSE;
1555 server.maxidletime = REDIS_MAXIDLETIME;
1556 server.saveparams = NULL;
1557 server.logfile = NULL; /* NULL = log on standard output */
1558 server.bindaddr = NULL;
1559 server.glueoutputbuf = 1;
1560 server.daemonize = 0;
1561 server.appendonly = 0;
1562 server.appendfsync = APPENDFSYNC_ALWAYS;
1563 server.lastfsync = time(NULL);
1564 server.appendfd = -1;
1565 server.appendseldb = -1; /* Make sure the first time will not match */
1566 server.pidfile = zstrdup("/var/run/redis.pid");
1567 server.dbfilename = zstrdup("dump.rdb");
1568 server.appendfilename = zstrdup("appendonly.aof");
1569 server.requirepass = NULL;
1570 server.shareobjects = 0;
1571 server.rdbcompression = 1;
1572 server.maxclients = 0;
1573 server.blpop_blocked_clients = 0;
1574 server.maxmemory = 0;
1575 server.vm_enabled = 0;
1576 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1577 server.vm_page_size = 256; /* 256 bytes per page */
1578 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1579 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1580 server.vm_max_threads = 4;
1581 server.vm_blocked_clients = 0;
1582 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1583 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1584
1585 resetServerSaveParams();
1586
1587 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1588 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1589 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1590 /* Replication related */
1591 server.isslave = 0;
1592 server.masterauth = NULL;
1593 server.masterhost = NULL;
1594 server.masterport = 6379;
1595 server.master = NULL;
1596 server.replstate = REDIS_REPL_NONE;
1597
1598 /* Double constants initialization */
1599 R_Zero = 0.0;
1600 R_PosInf = 1.0/R_Zero;
1601 R_NegInf = -1.0/R_Zero;
1602 R_Nan = R_Zero/R_Zero;
1603 }
1604
1605 static void initServer() {
1606 int j;
1607
1608 signal(SIGHUP, SIG_IGN);
1609 signal(SIGPIPE, SIG_IGN);
1610 setupSigSegvAction();
1611
1612 server.devnull = fopen("/dev/null","w");
1613 if (server.devnull == NULL) {
1614 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1615 exit(1);
1616 }
1617 server.clients = listCreate();
1618 server.slaves = listCreate();
1619 server.monitors = listCreate();
1620 server.objfreelist = listCreate();
1621 createSharedObjects();
1622 server.el = aeCreateEventLoop();
1623 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1624 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1625 if (server.fd == -1) {
1626 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1627 exit(1);
1628 }
1629 for (j = 0; j < server.dbnum; j++) {
1630 server.db[j].dict = dictCreate(&dbDictType,NULL);
1631 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1632 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1633 if (server.vm_enabled)
1634 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1635 server.db[j].id = j;
1636 }
1637 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1638 server.pubsub_patterns = listCreate();
1639 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1640 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1641 server.cronloops = 0;
1642 server.bgsavechildpid = -1;
1643 server.bgrewritechildpid = -1;
1644 server.bgrewritebuf = sdsempty();
1645 server.lastsave = time(NULL);
1646 server.dirty = 0;
1647 server.stat_numcommands = 0;
1648 server.stat_numconnections = 0;
1649 server.stat_expiredkeys = 0;
1650 server.stat_starttime = time(NULL);
1651 server.unixtime = time(NULL);
1652 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1653 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1654 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1655
1656 if (server.appendonly) {
1657 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1658 if (server.appendfd == -1) {
1659 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1660 strerror(errno));
1661 exit(1);
1662 }
1663 }
1664
1665 if (server.vm_enabled) vmInit();
1666 }
1667
1668 /* Empty the whole database */
1669 static long long emptyDb() {
1670 int j;
1671 long long removed = 0;
1672
1673 for (j = 0; j < server.dbnum; j++) {
1674 removed += dictSize(server.db[j].dict);
1675 dictEmpty(server.db[j].dict);
1676 dictEmpty(server.db[j].expires);
1677 }
1678 return removed;
1679 }
1680
/* Translate a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Every other string is reported as -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1686
1687 /* I agree, this is a very rudimental way to load a configuration...
1688 will improve later if the config gets more complex */
1689 static void loadServerConfig(char *filename) {
1690 FILE *fp;
1691 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1692 int linenum = 0;
1693 sds line = NULL;
1694 char *errormsg = "Fatal error, can't open config file '%s'";
1695 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1696 sprintf(errorbuf, errormsg, filename);
1697
1698 if (filename[0] == '-' && filename[1] == '\0')
1699 fp = stdin;
1700 else {
1701 if ((fp = fopen(filename,"r")) == NULL) {
1702 redisLog(REDIS_WARNING, errorbuf);
1703 exit(1);
1704 }
1705 }
1706
1707 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1708 sds *argv;
1709 int argc, j;
1710
1711 linenum++;
1712 line = sdsnew(buf);
1713 line = sdstrim(line," \t\r\n");
1714
1715 /* Skip comments and blank lines*/
1716 if (line[0] == '#' || line[0] == '\0') {
1717 sdsfree(line);
1718 continue;
1719 }
1720
1721 /* Split into arguments */
1722 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1723 sdstolower(argv[0]);
1724
1725 /* Execute config directives */
1726 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1727 server.maxidletime = atoi(argv[1]);
1728 if (server.maxidletime < 0) {
1729 err = "Invalid timeout value"; goto loaderr;
1730 }
1731 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1732 server.port = atoi(argv[1]);
1733 if (server.port < 1 || server.port > 65535) {
1734 err = "Invalid port"; goto loaderr;
1735 }
1736 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1737 server.bindaddr = zstrdup(argv[1]);
1738 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1739 int seconds = atoi(argv[1]);
1740 int changes = atoi(argv[2]);
1741 if (seconds < 1 || changes < 0) {
1742 err = "Invalid save parameters"; goto loaderr;
1743 }
1744 appendServerSaveParams(seconds,changes);
1745 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1746 if (chdir(argv[1]) == -1) {
1747 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1748 argv[1], strerror(errno));
1749 exit(1);
1750 }
1751 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1752 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1753 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1754 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1755 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1756 else {
1757 err = "Invalid log level. Must be one of debug, notice, warning";
1758 goto loaderr;
1759 }
1760 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1761 FILE *logfp;
1762
1763 server.logfile = zstrdup(argv[1]);
1764 if (!strcasecmp(server.logfile,"stdout")) {
1765 zfree(server.logfile);
1766 server.logfile = NULL;
1767 }
1768 if (server.logfile) {
1769 /* Test if we are able to open the file. The server will not
1770 * be able to abort just for this problem later... */
1771 logfp = fopen(server.logfile,"a");
1772 if (logfp == NULL) {
1773 err = sdscatprintf(sdsempty(),
1774 "Can't open the log file: %s", strerror(errno));
1775 goto loaderr;
1776 }
1777 fclose(logfp);
1778 }
1779 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1780 server.dbnum = atoi(argv[1]);
1781 if (server.dbnum < 1) {
1782 err = "Invalid number of databases"; goto loaderr;
1783 }
1784 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1785 loadServerConfig(argv[1]);
1786 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1787 server.maxclients = atoi(argv[1]);
1788 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1789 server.maxmemory = strtoll(argv[1], NULL, 10);
1790 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1791 server.masterhost = sdsnew(argv[1]);
1792 server.masterport = atoi(argv[2]);
1793 server.replstate = REDIS_REPL_CONNECT;
1794 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1795 server.masterauth = zstrdup(argv[1]);
1796 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1797 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1798 err = "argument must be 'yes' or 'no'"; goto loaderr;
1799 }
1800 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1801 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1802 err = "argument must be 'yes' or 'no'"; goto loaderr;
1803 }
1804 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1805 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1806 err = "argument must be 'yes' or 'no'"; goto loaderr;
1807 }
1808 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1809 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1810 err = "argument must be 'yes' or 'no'"; goto loaderr;
1811 }
1812 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1813 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1814 err = "argument must be 'yes' or 'no'"; goto loaderr;
1815 }
1816 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1817 if (!strcasecmp(argv[1],"no")) {
1818 server.appendfsync = APPENDFSYNC_NO;
1819 } else if (!strcasecmp(argv[1],"always")) {
1820 server.appendfsync = APPENDFSYNC_ALWAYS;
1821 } else if (!strcasecmp(argv[1],"everysec")) {
1822 server.appendfsync = APPENDFSYNC_EVERYSEC;
1823 } else {
1824 err = "argument must be 'no', 'always' or 'everysec'";
1825 goto loaderr;
1826 }
1827 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1828 server.requirepass = zstrdup(argv[1]);
1829 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1830 zfree(server.pidfile);
1831 server.pidfile = zstrdup(argv[1]);
1832 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1833 zfree(server.dbfilename);
1834 server.dbfilename = zstrdup(argv[1]);
1835 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1836 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1837 err = "argument must be 'yes' or 'no'"; goto loaderr;
1838 }
1839 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1840 zfree(server.vm_swap_file);
1841 server.vm_swap_file = zstrdup(argv[1]);
1842 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1843 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1844 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1845 server.vm_page_size = strtoll(argv[1], NULL, 10);
1846 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1847 server.vm_pages = strtoll(argv[1], NULL, 10);
1848 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1849 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1850 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1851 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1852 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1853 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1854 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1855 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1856 } else {
1857 err = "Bad directive or wrong number of arguments"; goto loaderr;
1858 }
1859 for (j = 0; j < argc; j++)
1860 sdsfree(argv[j]);
1861 zfree(argv);
1862 sdsfree(line);
1863 }
1864 if (fp != stdin) fclose(fp);
1865 return;
1866
1867 loaderr:
1868 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1869 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1870 fprintf(stderr, ">>> '%s'\n", line);
1871 fprintf(stderr, "%s\n", err);
1872 exit(1);
1873 }
1874
1875 static void freeClientArgv(redisClient *c) {
1876 int j;
1877
1878 for (j = 0; j < c->argc; j++)
1879 decrRefCount(c->argv[j]);
1880 for (j = 0; j < c->mbargc; j++)
1881 decrRefCount(c->mbargv[j]);
1882 c->argc = 0;
1883 c->mbargc = 0;
1884 }
1885
1886 static void freeClient(redisClient *c) {
1887 listNode *ln;
1888
1889 /* Note that if the client we are freeing is blocked into a blocking
1890 * call, we have to set querybuf to NULL *before* to call
1891 * unblockClientWaitingData() to avoid processInputBuffer() will get
1892 * called. Also it is important to remove the file events after
1893 * this, because this call adds the READABLE event. */
1894 sdsfree(c->querybuf);
1895 c->querybuf = NULL;
1896 if (c->flags & REDIS_BLOCKED)
1897 unblockClientWaitingData(c);
1898
1899 /* Unsubscribe from all the pubsub channels */
1900 pubsubUnsubscribeAllChannels(c,0);
1901 pubsubUnsubscribeAllPatterns(c,0);
1902 dictRelease(c->pubsub_channels);
1903 listRelease(c->pubsub_patterns);
1904 /* Obvious cleanup */
1905 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1906 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1907 listRelease(c->reply);
1908 freeClientArgv(c);
1909 close(c->fd);
1910 /* Remove from the list of clients */
1911 ln = listSearchKey(server.clients,c);
1912 redisAssert(ln != NULL);
1913 listDelNode(server.clients,ln);
1914 /* Remove from the list of clients waiting for swapped keys */
1915 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1916 ln = listSearchKey(server.io_ready_clients,c);
1917 if (ln) {
1918 listDelNode(server.io_ready_clients,ln);
1919 server.vm_blocked_clients--;
1920 }
1921 }
1922 while (server.vm_enabled && listLength(c->io_keys)) {
1923 ln = listFirst(c->io_keys);
1924 dontWaitForSwappedKey(c,ln->value);
1925 }
1926 listRelease(c->io_keys);
1927 /* Master/slave cleanup */
1928 if (c->flags & REDIS_SLAVE) {
1929 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1930 close(c->repldbfd);
1931 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1932 ln = listSearchKey(l,c);
1933 redisAssert(ln != NULL);
1934 listDelNode(l,ln);
1935 }
1936 if (c->flags & REDIS_MASTER) {
1937 server.master = NULL;
1938 server.replstate = REDIS_REPL_CONNECT;
1939 }
1940 /* Release memory */
1941 zfree(c->argv);
1942 zfree(c->mbargv);
1943 freeClientMultiState(c);
1944 zfree(c);
1945 }
1946
1947 #define GLUEREPLY_UP_TO (1024)
1948 static void glueReplyBuffersIfNeeded(redisClient *c) {
1949 int copylen = 0;
1950 char buf[GLUEREPLY_UP_TO];
1951 listNode *ln;
1952 listIter li;
1953 robj *o;
1954
1955 listRewind(c->reply,&li);
1956 while((ln = listNext(&li))) {
1957 int objlen;
1958
1959 o = ln->value;
1960 objlen = sdslen(o->ptr);
1961 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1962 memcpy(buf+copylen,o->ptr,objlen);
1963 copylen += objlen;
1964 listDelNode(c->reply,ln);
1965 } else {
1966 if (copylen == 0) return;
1967 break;
1968 }
1969 }
1970 /* Now the output buffer is empty, add the new single element */
1971 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1972 listAddNodeHead(c->reply,o);
1973 }
1974
1975 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1976 redisClient *c = privdata;
1977 int nwritten = 0, totwritten = 0, objlen;
1978 robj *o;
1979 REDIS_NOTUSED(el);
1980 REDIS_NOTUSED(mask);
1981
1982 /* Use writev() if we have enough buffers to send */
1983 if (!server.glueoutputbuf &&
1984 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1985 !(c->flags & REDIS_MASTER))
1986 {
1987 sendReplyToClientWritev(el, fd, privdata, mask);
1988 return;
1989 }
1990
1991 while(listLength(c->reply)) {
1992 if (server.glueoutputbuf && listLength(c->reply) > 1)
1993 glueReplyBuffersIfNeeded(c);
1994
1995 o = listNodeValue(listFirst(c->reply));
1996 objlen = sdslen(o->ptr);
1997
1998 if (objlen == 0) {
1999 listDelNode(c->reply,listFirst(c->reply));
2000 continue;
2001 }
2002
2003 if (c->flags & REDIS_MASTER) {
2004 /* Don't reply to a master */
2005 nwritten = objlen - c->sentlen;
2006 } else {
2007 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2008 if (nwritten <= 0) break;
2009 }
2010 c->sentlen += nwritten;
2011 totwritten += nwritten;
2012 /* If we fully sent the object on head go to the next one */
2013 if (c->sentlen == objlen) {
2014 listDelNode(c->reply,listFirst(c->reply));
2015 c->sentlen = 0;
2016 }
2017 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2018 * bytes, in a single threaded server it's a good idea to serve
2019 * other clients as well, even if a very large request comes from
2020 * super fast link that is always able to accept data (in real world
2021 * scenario think about 'KEYS *' against the loopback interfae) */
2022 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2023 }
2024 if (nwritten == -1) {
2025 if (errno == EAGAIN) {
2026 nwritten = 0;
2027 } else {
2028 redisLog(REDIS_VERBOSE,
2029 "Error writing to client: %s", strerror(errno));
2030 freeClient(c);
2031 return;
2032 }
2033 }
2034 if (totwritten > 0) c->lastinteraction = time(NULL);
2035 if (listLength(c->reply) == 0) {
2036 c->sentlen = 0;
2037 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2038 }
2039 }
2040
2041 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2042 {
2043 redisClient *c = privdata;
2044 int nwritten = 0, totwritten = 0, objlen, willwrite;
2045 robj *o;
2046 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2047 int offset, ion = 0;
2048 REDIS_NOTUSED(el);
2049 REDIS_NOTUSED(mask);
2050
2051 listNode *node;
2052 while (listLength(c->reply)) {
2053 offset = c->sentlen;
2054 ion = 0;
2055 willwrite = 0;
2056
2057 /* fill-in the iov[] array */
2058 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2059 o = listNodeValue(node);
2060 objlen = sdslen(o->ptr);
2061
2062 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2063 break;
2064
2065 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2066 break; /* no more iovecs */
2067
2068 iov[ion].iov_base = ((char*)o->ptr) + offset;
2069 iov[ion].iov_len = objlen - offset;
2070 willwrite += objlen - offset;
2071 offset = 0; /* just for the first item */
2072 ion++;
2073 }
2074
2075 if(willwrite == 0)
2076 break;
2077
2078 /* write all collected blocks at once */
2079 if((nwritten = writev(fd, iov, ion)) < 0) {
2080 if (errno != EAGAIN) {
2081 redisLog(REDIS_VERBOSE,
2082 "Error writing to client: %s", strerror(errno));
2083 freeClient(c);
2084 return;
2085 }
2086 break;
2087 }
2088
2089 totwritten += nwritten;
2090 offset = c->sentlen;
2091
2092 /* remove written robjs from c->reply */
2093 while (nwritten && listLength(c->reply)) {
2094 o = listNodeValue(listFirst(c->reply));
2095 objlen = sdslen(o->ptr);
2096
2097 if(nwritten >= objlen - offset) {
2098 listDelNode(c->reply, listFirst(c->reply));
2099 nwritten -= objlen - offset;
2100 c->sentlen = 0;
2101 } else {
2102 /* partial write */
2103 c->sentlen += nwritten;
2104 break;
2105 }
2106 offset = 0;
2107 }
2108 }
2109
2110 if (totwritten > 0)
2111 c->lastinteraction = time(NULL);
2112
2113 if (listLength(c->reply) == 0) {
2114 c->sentlen = 0;
2115 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2116 }
2117 }
2118
2119 static struct redisCommand *lookupCommand(char *name) {
2120 int j = 0;
2121 while(cmdTable[j].name != NULL) {
2122 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2123 j++;
2124 }
2125 return NULL;
2126 }
2127
2128 /* resetClient prepare the client to process the next command */
2129 static void resetClient(redisClient *c) {
2130 freeClientArgv(c);
2131 c->bulklen = -1;
2132 c->multibulk = 0;
2133 }
2134
2135 /* Call() is the core of Redis execution of a command */
2136 static void call(redisClient *c, struct redisCommand *cmd) {
2137 long long dirty;
2138
2139 dirty = server.dirty;
2140 cmd->proc(c);
2141 dirty = server.dirty-dirty;
2142
2143 if (server.appendonly && dirty)
2144 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2145 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2146 listLength(server.slaves))
2147 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2148 if (listLength(server.monitors))
2149 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2150 server.stat_numcommands++;
2151 }
2152
2153 /* If this function gets called we already read a whole
2154 * command, argments are in the client argv/argc fields.
2155 * processCommand() execute the command or prepare the
2156 * server for a bulk read from the client.
2157 *
2158 * If 1 is returned the client is still alive and valid and
2159 * and other operations can be performed by the caller. Otherwise
2160 * if 0 is returned the client was destroied (i.e. after QUIT). */
2161 static int processCommand(redisClient *c) {
2162 struct redisCommand *cmd;
2163
2164 /* Free some memory if needed (maxmemory setting) */
2165 if (server.maxmemory) freeMemoryIfNeeded();
2166
2167 /* Handle the multi bulk command type. This is an alternative protocol
2168 * supported by Redis in order to receive commands that are composed of
2169 * multiple binary-safe "bulk" arguments. The latency of processing is
2170 * a bit higher but this allows things like multi-sets, so if this
2171 * protocol is used only for MSET and similar commands this is a big win. */
2172 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2173 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2174 if (c->multibulk <= 0) {
2175 resetClient(c);
2176 return 1;
2177 } else {
2178 decrRefCount(c->argv[c->argc-1]);
2179 c->argc--;
2180 return 1;
2181 }
2182 } else if (c->multibulk) {
2183 if (c->bulklen == -1) {
2184 if (((char*)c->argv[0]->ptr)[0] != '$') {
2185 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2186 resetClient(c);
2187 return 1;
2188 } else {
2189 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2190 decrRefCount(c->argv[0]);
2191 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2192 c->argc--;
2193 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2194 resetClient(c);
2195 return 1;
2196 }
2197 c->argc--;
2198 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2199 return 1;
2200 }
2201 } else {
2202 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2203 c->mbargv[c->mbargc] = c->argv[0];
2204 c->mbargc++;
2205 c->argc--;
2206 c->multibulk--;
2207 if (c->multibulk == 0) {
2208 robj **auxargv;
2209 int auxargc;
2210
2211 /* Here we need to swap the multi-bulk argc/argv with the
2212 * normal argc/argv of the client structure. */
2213 auxargv = c->argv;
2214 c->argv = c->mbargv;
2215 c->mbargv = auxargv;
2216
2217 auxargc = c->argc;
2218 c->argc = c->mbargc;
2219 c->mbargc = auxargc;
2220
2221 /* We need to set bulklen to something different than -1
2222 * in order for the code below to process the command without
2223 * to try to read the last argument of a bulk command as
2224 * a special argument. */
2225 c->bulklen = 0;
2226 /* continue below and process the command */
2227 } else {
2228 c->bulklen = -1;
2229 return 1;
2230 }
2231 }
2232 }
2233 /* -- end of multi bulk commands processing -- */
2234
2235 /* The QUIT command is handled as a special case. Normal command
2236 * procs are unable to close the client connection safely */
2237 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2238 freeClient(c);
2239 return 0;
2240 }
2241
2242 /* Now lookup the command and check ASAP about trivial error conditions
2243 * such wrong arity, bad command name and so forth. */
2244 cmd = lookupCommand(c->argv[0]->ptr);
2245 if (!cmd) {
2246 addReplySds(c,
2247 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2248 (char*)c->argv[0]->ptr));
2249 resetClient(c);
2250 return 1;
2251 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2252 (c->argc < -cmd->arity)) {
2253 addReplySds(c,
2254 sdscatprintf(sdsempty(),
2255 "-ERR wrong number of arguments for '%s' command\r\n",
2256 cmd->name));
2257 resetClient(c);
2258 return 1;
2259 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2260 /* This is a bulk command, we have to read the last argument yet. */
2261 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2262
2263 decrRefCount(c->argv[c->argc-1]);
2264 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2265 c->argc--;
2266 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2267 resetClient(c);
2268 return 1;
2269 }
2270 c->argc--;
2271 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2272 /* It is possible that the bulk read is already in the
2273 * buffer. Check this condition and handle it accordingly.
2274 * This is just a fast path, alternative to call processInputBuffer().
2275 * It's a good idea since the code is small and this condition
2276 * happens most of the times. */
2277 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2278 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2279 c->argc++;
2280 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2281 } else {
2282 /* Otherwise return... there is to read the last argument
2283 * from the socket. */
2284 return 1;
2285 }
2286 }
2287 /* Let's try to encode the bulk object to save space. */
2288 if (cmd->flags & REDIS_CMD_BULK)
2289 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2290
2291 /* Check if the user is authenticated */
2292 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2293 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2294 resetClient(c);
2295 return 1;
2296 }
2297
2298 /* Handle the maxmemory directive */
2299 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2300 zmalloc_used_memory() > server.maxmemory)
2301 {
2302 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2303 resetClient(c);
2304 return 1;
2305 }
2306
2307 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2308 if (dictSize(c->pubsub_channels) > 0 &&
2309 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2310 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2311 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2312 resetClient(c);
2313 return 1;
2314 }
2315
2316 /* Exec the command */
2317 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2318 queueMultiCommand(c,cmd);
2319 addReply(c,shared.queued);
2320 } else {
2321 if (server.vm_enabled && server.vm_max_threads > 0 &&
2322 blockClientOnSwappedKeys(cmd,c)) return 1;
2323 call(c,cmd);
2324 }
2325
2326 /* Prepare the client for the next command */
2327 resetClient(c);
2328 return 1;
2329 }
2330
2331 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2332 listNode *ln;
2333 listIter li;
2334 int outc = 0, j;
2335 robj **outv;
2336 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2337 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2338 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2339 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2340 robj *lenobj;
2341
2342 if (argc <= REDIS_STATIC_ARGS) {
2343 outv = static_outv;
2344 } else {
2345 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2346 }
2347
2348 lenobj = createObject(REDIS_STRING,
2349 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2350 lenobj->refcount = 0;
2351 outv[outc++] = lenobj;
2352 for (j = 0; j < argc; j++) {
2353 lenobj = createObject(REDIS_STRING,
2354 sdscatprintf(sdsempty(),"$%lu\r\n",
2355 (unsigned long) stringObjectLen(argv[j])));
2356 lenobj->refcount = 0;
2357 outv[outc++] = lenobj;
2358 outv[outc++] = argv[j];
2359 outv[outc++] = shared.crlf;
2360 }
2361
2362 /* Increment all the refcounts at start and decrement at end in order to
2363 * be sure to free objects if there is no slave in a replication state
2364 * able to be feed with commands */
2365 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2366 listRewind(slaves,&li);
2367 while((ln = listNext(&li))) {
2368 redisClient *slave = ln->value;
2369
2370 /* Don't feed slaves that are still waiting for BGSAVE to start */
2371 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2372
2373 /* Feed all the other slaves, MONITORs and so on */
2374 if (slave->slaveseldb != dictid) {
2375 robj *selectcmd;
2376
2377 switch(dictid) {
2378 case 0: selectcmd = shared.select0; break;
2379 case 1: selectcmd = shared.select1; break;
2380 case 2: selectcmd = shared.select2; break;
2381 case 3: selectcmd = shared.select3; break;
2382 case 4: selectcmd = shared.select4; break;
2383 case 5: selectcmd = shared.select5; break;
2384 case 6: selectcmd = shared.select6; break;
2385 case 7: selectcmd = shared.select7; break;
2386 case 8: selectcmd = shared.select8; break;
2387 case 9: selectcmd = shared.select9; break;
2388 default:
2389 selectcmd = createObject(REDIS_STRING,
2390 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2391 selectcmd->refcount = 0;
2392 break;
2393 }
2394 addReply(slave,selectcmd);
2395 slave->slaveseldb = dictid;
2396 }
2397 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2398 }
2399 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2400 if (outv != static_outv) zfree(outv);
2401 }
2402
2403 static void processInputBuffer(redisClient *c) {
2404 again:
2405 /* Before to process the input buffer, make sure the client is not
2406 * waitig for a blocking operation such as BLPOP. Note that the first
2407 * iteration the client is never blocked, otherwise the processInputBuffer
2408 * would not be called at all, but after the execution of the first commands
2409 * in the input buffer the client may be blocked, and the "goto again"
2410 * will try to reiterate. The following line will make it return asap. */
2411 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2412 if (c->bulklen == -1) {
2413 /* Read the first line of the query */
2414 char *p = strchr(c->querybuf,'\n');
2415 size_t querylen;
2416
2417 if (p) {
2418 sds query, *argv;
2419 int argc, j;
2420
2421 query = c->querybuf;
2422 c->querybuf = sdsempty();
2423 querylen = 1+(p-(query));
2424 if (sdslen(query) > querylen) {
2425 /* leave data after the first line of the query in the buffer */
2426 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2427 }
2428 *p = '\0'; /* remove "\n" */
2429 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2430 sdsupdatelen(query);
2431
2432 /* Now we can split the query in arguments */
2433 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2434 sdsfree(query);
2435
2436 if (c->argv) zfree(c->argv);
2437 c->argv = zmalloc(sizeof(robj*)*argc);
2438
2439 for (j = 0; j < argc; j++) {
2440 if (sdslen(argv[j])) {
2441 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2442 c->argc++;
2443 } else {
2444 sdsfree(argv[j]);
2445 }
2446 }
2447 zfree(argv);
2448 if (c->argc) {
2449 /* Execute the command. If the client is still valid
2450 * after processCommand() return and there is something
2451 * on the query buffer try to process the next command. */
2452 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2453 } else {
2454 /* Nothing to process, argc == 0. Just process the query
2455 * buffer if it's not empty or return to the caller */
2456 if (sdslen(c->querybuf)) goto again;
2457 }
2458 return;
2459 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2460 redisLog(REDIS_VERBOSE, "Client protocol error");
2461 freeClient(c);
2462 return;
2463 }
2464 } else {
2465 /* Bulk read handling. Note that if we are at this point
2466 the client already sent a command terminated with a newline,
2467 we are reading the bulk data that is actually the last
2468 argument of the command. */
2469 int qbl = sdslen(c->querybuf);
2470
2471 if (c->bulklen <= qbl) {
2472 /* Copy everything but the final CRLF as final argument */
2473 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2474 c->argc++;
2475 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2476 /* Process the command. If the client is still valid after
2477 * the processing and there is more data in the buffer
2478 * try to parse it. */
2479 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2480 return;
2481 }
2482 }
2483 }
2484
2485 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2486 redisClient *c = (redisClient*) privdata;
2487 char buf[REDIS_IOBUF_LEN];
2488 int nread;
2489 REDIS_NOTUSED(el);
2490 REDIS_NOTUSED(mask);
2491
2492 nread = read(fd, buf, REDIS_IOBUF_LEN);
2493 if (nread == -1) {
2494 if (errno == EAGAIN) {
2495 nread = 0;
2496 } else {
2497 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2498 freeClient(c);
2499 return;
2500 }
2501 } else if (nread == 0) {
2502 redisLog(REDIS_VERBOSE, "Client closed connection");
2503 freeClient(c);
2504 return;
2505 }
2506 if (nread) {
2507 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2508 c->lastinteraction = time(NULL);
2509 } else {
2510 return;
2511 }
2512 processInputBuffer(c);
2513 }
2514
2515 static int selectDb(redisClient *c, int id) {
2516 if (id < 0 || id >= server.dbnum)
2517 return REDIS_ERR;
2518 c->db = &server.db[id];
2519 return REDIS_OK;
2520 }
2521
2522 static void *dupClientReplyValue(void *o) {
2523 incrRefCount((robj*)o);
2524 return o;
2525 }
2526
/* Match method for lists of string objects: non zero when
 * compareStringObjects() reports 0 for the two values. */
static int listMatchObjects(void *a, void *b) {
    return !compareStringObjects(a,b);
}
2530
2531 static redisClient *createClient(int fd) {
2532 redisClient *c = zmalloc(sizeof(*c));
2533
2534 anetNonBlock(NULL,fd);
2535 anetTcpNoDelay(NULL,fd);
2536 if (!c) return NULL;
2537 selectDb(c,0);
2538 c->fd = fd;
2539 c->querybuf = sdsempty();
2540 c->argc = 0;
2541 c->argv = NULL;
2542 c->bulklen = -1;
2543 c->multibulk = 0;
2544 c->mbargc = 0;
2545 c->mbargv = NULL;
2546 c->sentlen = 0;
2547 c->flags = 0;
2548 c->lastinteraction = time(NULL);
2549 c->authenticated = 0;
2550 c->replstate = REDIS_REPL_NONE;
2551 c->reply = listCreate();
2552 listSetFreeMethod(c->reply,decrRefCount);
2553 listSetDupMethod(c->reply,dupClientReplyValue);
2554 c->blockingkeys = NULL;
2555 c->blockingkeysnum = 0;
2556 c->io_keys = listCreate();
2557 listSetFreeMethod(c->io_keys,decrRefCount);
2558 c->pubsub_channels = dictCreate(&setDictType,NULL);
2559 c->pubsub_patterns = listCreate();
2560 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2561 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2562 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2563 readQueryFromClient, c) == AE_ERR) {
2564 freeClient(c);
2565 return NULL;
2566 }
2567 listAddNodeTail(server.clients,c);
2568 initClientMultiState(c);
2569 return c;
2570 }
2571
2572 static void addReply(redisClient *c, robj *obj) {
2573 if (listLength(c->reply) == 0 &&
2574 (c->replstate == REDIS_REPL_NONE ||
2575 c->replstate == REDIS_REPL_ONLINE) &&
2576 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2577 sendReplyToClient, c) == AE_ERR) return;
2578
2579 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2580 obj = dupStringObject(obj);
2581 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2582 }
2583 listAddNodeTail(c->reply,getDecodedObject(obj));
2584 }
2585
2586 static void addReplySds(redisClient *c, sds s) {
2587 robj *o = createObject(REDIS_STRING,s);
2588 addReply(c,o);
2589 decrRefCount(o);
2590 }
2591
2592 static void addReplyDouble(redisClient *c, double d) {
2593 char buf[128];
2594
2595 snprintf(buf,sizeof(buf),"%.17g",d);
2596 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2597 (unsigned long) strlen(buf),buf));
2598 }
2599
2600 static void addReplyLong(redisClient *c, long l) {
2601 char buf[128];
2602 size_t len;
2603
2604 if (l == 0) {
2605 addReply(c,shared.czero);
2606 return;
2607 } else if (l == 1) {
2608 addReply(c,shared.cone);
2609 return;
2610 }
2611 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2612 addReplySds(c,sdsnewlen(buf,len));
2613 }
2614
2615 static void addReplyLongLong(redisClient *c, long long ll) {
2616 char buf[128];
2617 size_t len;
2618
2619 if (ll == 0) {
2620 addReply(c,shared.czero);
2621 return;
2622 } else if (ll == 1) {
2623 addReply(c,shared.cone);
2624 return;
2625 }
2626 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2627 addReplySds(c,sdsnewlen(buf,len));
2628 }
2629
2630 static void addReplyUlong(redisClient *c, unsigned long ul) {
2631 char buf[128];
2632 size_t len;
2633
2634 if (ul == 0) {
2635 addReply(c,shared.czero);
2636 return;
2637 } else if (ul == 1) {
2638 addReply(c,shared.cone);
2639 return;
2640 }
2641 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2642 addReplySds(c,sdsnewlen(buf,len));
2643 }
2644
2645 static void addReplyBulkLen(redisClient *c, robj *obj) {
2646 size_t len;
2647
2648 if (obj->encoding == REDIS_ENCODING_RAW) {
2649 len = sdslen(obj->ptr);
2650 } else {
2651 long n = (long)obj->ptr;
2652
2653 /* Compute how many bytes will take this integer as a radix 10 string */
2654 len = 1;
2655 if (n < 0) {
2656 len++;
2657 n = -n;
2658 }
2659 while((n = n/10) != 0) {
2660 len++;
2661 }
2662 }
2663 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2664 }
2665
2666 static void addReplyBulk(redisClient *c, robj *obj) {
2667 addReplyBulkLen(c,obj);
2668 addReply(c,obj);
2669 addReply(c,shared.crlf);
2670 }
2671
2672 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2673 static void addReplyBulkCString(redisClient *c, char *s) {
2674 if (s == NULL) {
2675 addReply(c,shared.nullbulk);
2676 } else {
2677 robj *o = createStringObject(s,strlen(s));
2678 addReplyBulk(c,o);
2679 decrRefCount(o);
2680 }
2681 }
2682
2683 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2684 int cport, cfd;
2685 char cip[128];
2686 redisClient *c;
2687 REDIS_NOTUSED(el);
2688 REDIS_NOTUSED(mask);
2689 REDIS_NOTUSED(privdata);
2690
2691 cfd = anetAccept(server.neterr, fd, cip, &cport);
2692 if (cfd == AE_ERR) {
2693 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2694 return;
2695 }
2696 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2697 if ((c = createClient(cfd)) == NULL) {
2698 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2699 close(cfd); /* May be already closed, just ingore errors */
2700 return;
2701 }
2702 /* If maxclient directive is set and this is one client more... close the
2703 * connection. Note that we create the client instead to check before
2704 * for this condition, since now the socket is already set in nonblocking
2705 * mode and we can send an error for free using the Kernel I/O */
2706 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2707 char *err = "-ERR max number of clients reached\r\n";
2708
2709 /* That's a best effort error message, don't check write errors */
2710 if (write(c->fd,err,strlen(err)) == -1) {
2711 /* Nothing to do, Just to avoid the warning... */
2712 }
2713 freeClient(c);
2714 return;
2715 }
2716 server.stat_numconnections++;
2717 }
2718
2719 /* ======================= Redis objects implementation ===================== */
2720
2721 static robj *createObject(int type, void *ptr) {
2722 robj *o;
2723
2724 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2725 if (listLength(server.objfreelist)) {
2726 listNode *head = listFirst(server.objfreelist);
2727 o = listNodeValue(head);
2728 listDelNode(server.objfreelist,head);
2729 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2730 } else {
2731 if (server.vm_enabled) {
2732 pthread_mutex_unlock(&server.obj_freelist_mutex);
2733 o = zmalloc(sizeof(*o));
2734 } else {
2735 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2736 }
2737 }
2738 o->type = type;
2739 o->encoding = REDIS_ENCODING_RAW;
2740 o->ptr = ptr;
2741 o->refcount = 1;
2742 if (server.vm_enabled) {
2743 /* Note that this code may run in the context of an I/O thread
2744 * and accessing to server.unixtime in theory is an error
2745 * (no locks). But in practice this is safe, and even if we read
2746 * garbage Redis will not fail, as it's just a statistical info */
2747 o->vm.atime = server.unixtime;
2748 o->storage = REDIS_VM_MEMORY;
2749 }
2750 return o;
2751 }
2752
2753 static robj *createStringObject(char *ptr, size_t len) {
2754 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2755 }
2756
2757 static robj *dupStringObject(robj *o) {
2758 assert(o->encoding == REDIS_ENCODING_RAW);
2759 return createStringObject(o->ptr,sdslen(o->ptr));
2760 }
2761
2762 static robj *createListObject(void) {
2763 list *l = listCreate();
2764
2765 listSetFreeMethod(l,decrRefCount);
2766 return createObject(REDIS_LIST,l);
2767 }
2768
2769 static robj *createSetObject(void) {
2770 dict *d = dictCreate(&setDictType,NULL);
2771 return createObject(REDIS_SET,d);
2772 }
2773
2774 static robj *createHashObject(void) {
2775 /* All the Hashes start as zipmaps. Will be automatically converted
2776 * into hash tables if there are enough elements or big elements
2777 * inside. */
2778 unsigned char *zm = zipmapNew();
2779 robj *o = createObject(REDIS_HASH,zm);
2780 o->encoding = REDIS_ENCODING_ZIPMAP;
2781 return o;
2782 }
2783
2784 static robj *createZsetObject(void) {
2785 zset *zs = zmalloc(sizeof(*zs));
2786
2787 zs->dict = dictCreate(&zsetDictType,NULL);
2788 zs->zsl = zslCreate();
2789 return createObject(REDIS_ZSET,zs);
2790 }
2791
2792 static void freeStringObject(robj *o) {
2793 if (o->encoding == REDIS_ENCODING_RAW) {
2794 sdsfree(o->ptr);
2795 }
2796 }
2797
2798 static void freeListObject(robj *o) {
2799 listRelease((list*) o->ptr);
2800 }
2801
2802 static void freeSetObject(robj *o) {
2803 dictRelease((dict*) o->ptr);
2804 }
2805
2806 static void freeZsetObject(robj *o) {
2807 zset *zs = o->ptr;
2808
2809 dictRelease(zs->dict);
2810 zslFree(zs->zsl);
2811 zfree(zs);
2812 }
2813
2814 static void freeHashObject(robj *o) {
2815 switch (o->encoding) {
2816 case REDIS_ENCODING_HT:
2817 dictRelease((dict*) o->ptr);
2818 break;
2819 case REDIS_ENCODING_ZIPMAP:
2820 zfree(o->ptr);
2821 break;
2822 default:
2823 redisAssert(0);
2824 break;
2825 }
2826 }
2827
2828 static void incrRefCount(robj *o) {
2829 o->refcount++;
2830 }
2831
2832 static void decrRefCount(void *obj) {
2833 robj *o = obj;
2834
2835 /* Object is a key of a swapped out value, or in the process of being
2836 * loaded. */
2837 if (server.vm_enabled &&
2838 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2839 {
2840 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2841 redisAssert(o->type == REDIS_STRING);
2842 freeStringObject(o);
2843 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2844 pthread_mutex_lock(&server.obj_freelist_mutex);
2845 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2846 !listAddNodeHead(server.objfreelist,o))
2847 zfree(o);
2848 pthread_mutex_unlock(&server.obj_freelist_mutex);
2849 server.vm_stats_swapped_objects--;
2850 return;
2851 }
2852 /* Object is in memory, or in the process of being swapped out. */
2853 if (--(o->refcount) == 0) {
2854 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2855 vmCancelThreadedIOJob(obj);
2856 switch(o->type) {
2857 case REDIS_STRING: freeStringObject(o); break;
2858 case REDIS_LIST: freeListObject(o); break;
2859 case REDIS_SET: freeSetObject(o); break;
2860 case REDIS_ZSET: freeZsetObject(o); break;
2861 case REDIS_HASH: freeHashObject(o); break;
2862 default: redisAssert(0); break;
2863 }
2864 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2865 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2866 !listAddNodeHead(server.objfreelist,o))
2867 zfree(o);
2868 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2869 }
2870 }
2871
2872 static robj *lookupKey(redisDb *db, robj *key) {
2873 dictEntry *de = dictFind(db->dict,key);
2874 if (de) {
2875 robj *key = dictGetEntryKey(de);
2876 robj *val = dictGetEntryVal(de);
2877
2878 if (server.vm_enabled) {
2879 if (key->storage == REDIS_VM_MEMORY ||
2880 key->storage == REDIS_VM_SWAPPING)
2881 {
2882 /* If we were swapping the object out, stop it, this key
2883 * was requested. */
2884 if (key->storage == REDIS_VM_SWAPPING)
2885 vmCancelThreadedIOJob(key);
2886 /* Update the access time of the key for the aging algorithm. */
2887 key->vm.atime = server.unixtime;
2888 } else {
2889 int notify = (key->storage == REDIS_VM_LOADING);
2890
2891 /* Our value was swapped on disk. Bring it at home. */
2892 redisAssert(val == NULL);
2893 val = vmLoadObject(key);
2894 dictGetEntryVal(de) = val;
2895
2896 /* Clients blocked by the VM subsystem may be waiting for
2897 * this key... */
2898 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2899 }
2900 }
2901 return val;
2902 } else {
2903 return NULL;
2904 }
2905 }
2906
2907 static robj *lookupKeyRead(redisDb *db, robj *key) {
2908 expireIfNeeded(db,key);
2909 return lookupKey(db,key);
2910 }
2911
2912 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2913 deleteIfVolatile(db,key);
2914 return lookupKey(db,key);
2915 }
2916
2917 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2918 robj *o = lookupKeyRead(c->db, key);
2919 if (!o) addReply(c,reply);
2920 return o;
2921 }
2922
2923 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2924 robj *o = lookupKeyWrite(c->db, key);
2925 if (!o) addReply(c,reply);
2926 return o;
2927 }
2928
2929 static int checkType(redisClient *c, robj *o, int type) {
2930 if (o->type != type) {
2931 addReply(c,shared.wrongtypeerr);
2932 return 1;
2933 }
2934 return 0;
2935 }
2936
2937 static int deleteKey(redisDb *db, robj *key) {
2938 int retval;
2939
2940 /* We need to protect key from destruction: after the first dictDelete()
2941 * it may happen that 'key' is no longer valid if we don't increment
2942 * it's count. This may happen when we get the object reference directly
2943 * from the hash table with dictRandomKey() or dict iterators */
2944 incrRefCount(key);
2945 if (dictSize(db->expires)) dictDelete(db->expires,key);
2946 retval = dictDelete(db->dict,key);
2947 decrRefCount(key);
2948
2949 return retval == DICT_OK;
2950 }
2951
2952 /* Check if the nul-terminated string 's' can be represented by a long
2953 * (that is, is a number that fits into long without any other space or
2954 * character before or after the digits).
2955 *
2956 * If so, the function returns REDIS_OK and *longval is set to the value
2957 * of the number. Otherwise REDIS_ERR is returned */
2958 static int isStringRepresentableAsLong(sds s, long *longval) {
2959 char buf[32], *endptr;
2960 long value;
2961 int slen;
2962
2963 value = strtol(s, &endptr, 10);
2964 if (endptr[0] != '\0') return REDIS_ERR;
2965 slen = snprintf(buf,32,"%ld",value);
2966
2967 /* If the number converted back into a string is not identical
2968 * then it's not possible to encode the string as integer */
2969 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2970 if (longval) *longval = value;
2971 return REDIS_OK;
2972 }
2973
2974 /* Try to encode a string object in order to save space */
2975 static robj *tryObjectEncoding(robj *o) {
2976 long value;
2977 sds s = o->ptr;
2978
2979 if (o->encoding != REDIS_ENCODING_RAW)
2980 return o; /* Already encoded */
2981
2982 /* It's not safe to encode shared objects: shared objects can be shared
2983 * everywhere in the "object space" of Redis. Encoded objects can only
2984 * appear as "values" (and not, for instance, as keys) */
2985 if (o->refcount > 1) return o;
2986
2987 /* Currently we try to encode only strings */
2988 redisAssert(o->type == REDIS_STRING);
2989
2990 /* Check if we can represent this string as a long integer */
2991 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
2992
2993 /* Ok, this object can be encoded */
2994 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2995 decrRefCount(o);
2996 incrRefCount(shared.integers[value]);
2997 return shared.integers[value];
2998 } else {
2999 o->encoding = REDIS_ENCODING_INT;
3000 sdsfree(o->ptr);
3001 o->ptr = (void*) value;
3002 return o;
3003 }
3004 }
3005
3006 /* Get a decoded version of an encoded object (returned as a new object).
3007 * If the object is already raw-encoded just increment the ref count. */
3008 static robj *getDecodedObject(robj *o) {
3009 robj *dec;
3010
3011 if (o->encoding == REDIS_ENCODING_RAW) {
3012 incrRefCount(o);
3013 return o;
3014 }
3015 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3016 char buf[32];
3017
3018 snprintf(buf,32,"%ld",(long)o->ptr);
3019 dec = createStringObject(buf,strlen(buf));
3020 return dec;
3021 } else {
3022 redisAssert(1 != 1);
3023 }
3024 }
3025
3026 /* Compare two string objects via strcmp() or alike.
3027 * Note that the objects may be integer-encoded. In such a case we
3028 * use snprintf() to get a string representation of the numbers on the stack
3029 * and compare the strings, it's much faster than calling getDecodedObject().
3030 *
3031 * Important note: if objects are not integer encoded, but binary-safe strings,
3032 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3033 * binary safe. */
3034 static int compareStringObjects(robj *a, robj *b) {
3035 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3036 char bufa[128], bufb[128], *astr, *bstr;
3037 int bothsds = 1;
3038
3039 if (a == b) return 0;
3040 if (a->encoding != REDIS_ENCODING_RAW) {
3041 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3042 astr = bufa;
3043 bothsds = 0;
3044 } else {
3045 astr = a->ptr;
3046 }
3047 if (b->encoding != REDIS_ENCODING_RAW) {
3048 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3049 bstr = bufb;
3050 bothsds = 0;
3051 } else {
3052 bstr = b->ptr;
3053 }
3054 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3055 }
3056
3057 static size_t stringObjectLen(robj *o) {
3058 redisAssert(o->type == REDIS_STRING);
3059 if (o->encoding == REDIS_ENCODING_RAW) {
3060 return sdslen(o->ptr);
3061 } else {
3062 char buf[32];
3063
3064 return snprintf(buf,32,"%ld",(long)o->ptr);
3065 }
3066 }
3067
3068 /*============================ RDB saving/loading =========================== */
3069
/* Write the one byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3074
/* Write the time 't' to 'fp' as a 4 byte signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3080
3081 /* check rdbLoadLen() comments for more info */
3082 static int rdbSaveLen(FILE *fp, uint32_t len) {
3083 unsigned char buf[2];
3084
3085 if (len < (1<<6)) {
3086 /* Save a 6 bit len */
3087 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3088 if (fwrite(buf,1,1,fp) == 0) return -1;
3089 } else if (len < (1<<14)) {
3090 /* Save a 14 bit len */
3091 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3092 buf[1] = len&0xFF;
3093 if (fwrite(buf,2,1,fp) == 0) return -1;
3094 } else {
3095 /* Save a 32 bit len */
3096 buf[0] = (REDIS_RDB_32BITLEN<<6);
3097 if (fwrite(buf,1,1,fp) == 0) return -1;
3098 len = htonl(len);
3099 if (fwrite(&len,4,1,fp) == 0) return -1;
3100 }
3101 return 0;
3102 }
3103
3104 /* String objects in the form "2391" "-100" without any space and with a
3105 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3106 * encoded as integers to save space */
3107 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3108 long long value;
3109 char *endptr, buf[32];
3110
3111 /* Check if it's possible to encode this value as a number */
3112 value = strtoll(s, &endptr, 10);
3113 if (endptr[0] != '\0') return 0;
3114 snprintf(buf,32,"%lld",value);
3115
3116 /* If the number converted back into a string is not identical
3117 * then it's not possible to encode the string as integer */
3118 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3119
3120 /* Finally check if it fits in our ranges */
3121 if (value >= -(1<<7) && value <= (1<<7)-1) {
3122 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3123 enc[1] = value&0xFF;
3124 return 2;
3125 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3126 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3127 enc[1] = value&0xFF;
3128 enc[2] = (value>>8)&0xFF;
3129 return 3;
3130 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3131 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3132 enc[1] = value&0xFF;
3133 enc[2] = (value>>8)&0xFF;
3134 enc[3] = (value>>16)&0xFF;
3135 enc[4] = (value>>24)&0xFF;
3136 return 5;
3137 } else {
3138 return 0;
3139 }
3140 }
3141
3142 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3143 size_t comprlen, outlen;
3144 unsigned char byte;
3145 void *out;
3146
3147 /* We require at least four bytes compression for this to be worth it */
3148 if (len <= 4) return 0;
3149 outlen = len-4;
3150 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3151 comprlen = lzf_compress(s, len, out, outlen);
3152 if (comprlen == 0) {
3153 zfree(out);
3154 return 0;
3155 }
3156 /* Data compressed! Let's save it on disk */
3157 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3158 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3159 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3160 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3161 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3162 zfree(out);
3163 return comprlen;
3164
3165 writeerr:
3166 zfree(out);
3167 return -1;
3168 }
3169
3170 /* Save a string objet as [len][data] on disk. If the object is a string
3171 * representation of an integer value we try to safe it in a special form */
3172 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3173 int enclen;
3174
3175 /* Try integer encoding */
3176 if (len <= 11) {
3177 unsigned char buf[5];
3178 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3179 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3180 return 0;
3181 }
3182 }
3183
3184 /* Try LZF compression - under 20 bytes it's unable to compress even
3185 * aaaaaaaaaaaaaaaaaa so skip it */
3186 if (server.rdbcompression && len > 20) {
3187 int retval;
3188
3189 retval = rdbSaveLzfStringObject(fp,s,len);
3190 if (retval == -1) return -1;
3191 if (retval > 0) return 0;
3192 /* retval == 0 means data can't be compressed, save the old way */
3193 }
3194
3195 /* Store verbatim */
3196 if (rdbSaveLen(fp,len) == -1) return -1;
3197 if (len && fwrite(s,len,1,fp) == 0) return -1;
3198 return 0;
3199 }
3200
3201 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3202 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3203 int retval;
3204
3205 /* Avoid incr/decr ref count business when possible.
3206 * This plays well with copy-on-write given that we are probably
3207 * in a child process (BGSAVE). Also this makes sure key objects
3208 * of swapped objects are not incRefCount-ed (an assert does not allow
3209 * this in order to avoid bugs) */
3210 if (obj->encoding != REDIS_ENCODING_RAW) {
3211 obj = getDecodedObject(obj);
3212 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3213 decrRefCount(obj);
3214 } else {
3215 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3216 }
3217 return retval;
3218 }
3219
/* Save a double value. Doubles are saved as strings (printed with "%.17g")
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values in order to
 * specify the following conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int outlen = 1; /* special values take just the prefix byte */

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        outlen = out[0]+1;
    }
    return (fwrite(out,outlen,1,fp) == 0) ? -1 : 0;
}
3246
3247 /* Save a Redis object. */
3248 static int rdbSaveObject(FILE *fp, robj *o) {
3249 if (o->type == REDIS_STRING) {
3250 /* Save a string value */
3251 if (rdbSaveStringObject(fp,o) == -1) return -1;
3252 } else if (o->type == REDIS_LIST) {
3253 /* Save a list value */
3254 list *list = o->ptr;
3255 listIter li;
3256 listNode *ln;
3257
3258 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3259 listRewind(list,&li);
3260 while((ln = listNext(&li))) {
3261 robj *eleobj = listNodeValue(ln);
3262
3263 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3264 }
3265 } else if (o->type == REDIS_SET) {
3266 /* Save a set value */
3267 dict *set = o->ptr;
3268 dictIterator *di = dictGetIterator(set);
3269 dictEntry *de;
3270
3271 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3272 while((de = dictNext(di)) != NULL) {
3273 robj *eleobj = dictGetEntryKey(de);
3274
3275 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3276 }
3277 dictReleaseIterator(di);
3278 } else if (o->type == REDIS_ZSET) {
3279 /* Save a set value */
3280 zset *zs = o->ptr;
3281 dictIterator *di = dictGetIterator(zs->dict);
3282 dictEntry *de;
3283
3284 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3285 while((de = dictNext(di)) != NULL) {
3286 robj *eleobj = dictGetEntryKey(de);
3287 double *score = dictGetEntryVal(de);
3288
3289 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3290 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3291 }
3292 dictReleaseIterator(di);
3293 } else if (o->type == REDIS_HASH) {
3294 /* Save a hash value */
3295 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3296 unsigned char *p = zipmapRewind(o->ptr);
3297 unsigned int count = zipmapLen(o->ptr);
3298 unsigned char *key, *val;
3299 unsigned int klen, vlen;
3300
3301 if (rdbSaveLen(fp,count) == -1) return -1;
3302 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3303 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3304 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3305 }
3306 } else {
3307 dictIterator *di = dictGetIterator(o->ptr);
3308 dictEntry *de;
3309
3310 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3311 while((de = dictNext(di)) != NULL) {
3312 robj *key = dictGetEntryKey(de);
3313 robj *val = dictGetEntryVal(de);
3314
3315 if (rdbSaveStringObject(fp,key) == -1) return -1;
3316 if (rdbSaveStringObject(fp,val) == -1) return -1;
3317 }
3318 dictReleaseIterator(di);
3319 }
3320 } else {
3321 redisAssert(0);
3322 }
3323 return 0;
3324 }
3325
3326 /* Return the length the object will have on disk if saved with
3327 * the rdbSaveObject() function. Currently we use a trick to get
3328 * this length with very little changes to the code. In the future
3329 * we could switch to a faster solution. */
3330 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3331 if (fp == NULL) fp = server.devnull;
3332 rewind(fp);
3333 assert(rdbSaveObject(fp,o) != 1);
3334 return ftello(fp);
3335 }
3336
3337 /* Return the number of pages required to save this object in the swap file */
3338 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3339 off_t bytes = rdbSavedObjectLen(o,fp);
3340
3341 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3342 }
3343
3344 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3345 static int rdbSave(char *filename) {
3346 dictIterator *di = NULL;
3347 dictEntry *de;
3348 FILE *fp;
3349 char tmpfile[256];
3350 int j;
3351 time_t now = time(NULL);
3352
3353 /* Wait for I/O therads to terminate, just in case this is a
3354 * foreground-saving, to avoid seeking the swap file descriptor at the
3355 * same time. */
3356 if (server.vm_enabled)
3357 waitEmptyIOJobsQueue();
3358
3359 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3360 fp = fopen(tmpfile,"w");
3361 if (!fp) {
3362 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3363 return REDIS_ERR;
3364 }
3365 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3366 for (j = 0; j < server.dbnum; j++) {
3367 redisDb *db = server.db+j;
3368 dict *d = db->dict;
3369 if (dictSize(d) == 0) continue;
3370 di = dictGetIterator(d);
3371 if (!di) {
3372 fclose(fp);
3373 return REDIS_ERR;
3374 }
3375
3376 /* Write the SELECT DB opcode */
3377 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3378 if (rdbSaveLen(fp,j) == -1) goto werr;
3379
3380 /* Iterate this DB writing every entry */
3381 while((de = dictNext(di)) != NULL) {
3382 robj *key = dictGetEntryKey(de);
3383 robj *o = dictGetEntryVal(de);
3384 time_t expiretime = getExpire(db,key);
3385
3386 /* Save the expire time */
3387 if (expiretime != -1) {
3388 /* If this key is already expired skip it */
3389 if (expiretime < now) continue;
3390 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3391 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3392 }
3393 /* Save the key and associated value. This requires special
3394 * handling if the value is swapped out. */
3395 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3396 key->storage == REDIS_VM_SWAPPING) {
3397 /* Save type, key, value */
3398 if (rdbSaveType(fp,o->type) == -1) goto werr;
3399 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3400 if (rdbSaveObject(fp,o) == -1) goto werr;
3401 } else {
3402 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3403 robj *po;
3404 /* Get a preview of the object in memory */
3405 po = vmPreviewObject(key);
3406 /* Save type, key, value */
3407 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3408 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3409 if (rdbSaveObject(fp,po) == -1) goto werr;
3410 /* Remove the loaded object from memory */
3411 decrRefCount(po);
3412 }
3413 }
3414 dictReleaseIterator(di);
3415 }
3416 /* EOF opcode */
3417 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3418
3419 /* Make sure data will not remain on the OS's output buffers */
3420 fflush(fp);
3421 fsync(fileno(fp));
3422 fclose(fp);
3423
3424 /* Use RENAME to make sure the DB file is changed atomically only
3425 * if the generate DB file is ok. */
3426 if (rename(tmpfile,filename) == -1) {
3427 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3428 unlink(tmpfile);
3429 return REDIS_ERR;
3430 }
3431 redisLog(REDIS_NOTICE,"DB saved on disk");
3432 server.dirty = 0;
3433 server.lastsave = time(NULL);
3434 return REDIS_OK;
3435
3436 werr:
3437 fclose(fp);
3438 unlink(tmpfile);
3439 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3440 if (di) dictReleaseIterator(di);
3441 return REDIS_ERR;
3442 }
3443
3444 static int rdbSaveBackground(char *filename) {
3445 pid_t childpid;
3446
3447 if (server.bgsavechildpid != -1) return REDIS_ERR;
3448 if (server.vm_enabled) waitEmptyIOJobsQueue();
3449 if ((childpid = fork()) == 0) {
3450 /* Child */
3451 if (server.vm_enabled) vmReopenSwapFile();
3452 close(server.fd);
3453 if (rdbSave(filename) == REDIS_OK) {
3454 _exit(0);
3455 } else {
3456 _exit(1);
3457 }
3458 } else {
3459 /* Parent */
3460 if (childpid == -1) {
3461 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3462 strerror(errno));
3463 return REDIS_ERR;
3464 }
3465 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3466 server.bgsavechildpid = childpid;
3467 updateDictResizePolicy();
3468 return REDIS_OK;
3469 }
3470 return REDIS_OK; /* unreached */
3471 }
3472
/* Remove the temporary dump file ("temp-<pid>.rdb") used by the saving
 * process with pid 'childpid'. Errors are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,256,"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3479
/* Read a one byte type from 'fp'.
 * Returns the byte (0-255), or -1 if it can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3485
/* Read a time saved by rdbSaveTime(): a 4 byte signed integer in host
 * byte order. Returns the time, or -1 if it can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    if (fread(&seconds,4,1,fp) != 0) return (time_t) seconds;
    return -1;
}
3491
3492 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3493 * of this file for a description of how this are stored on disk.
3494 *
3495 * isencoded is set to 1 if the readed length is not actually a length but
3496 * an "encoding type", check the above comments for more info */
3497 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3498 unsigned char buf[2];
3499 uint32_t len;
3500 int type;
3501
3502 if (isencoded) *isencoded = 0;
3503 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3504 type = (buf[0]&0xC0)>>6;
3505 if (type == REDIS_RDB_6BITLEN) {
3506 /* Read a 6 bit len */
3507 return buf[0]&0x3F;
3508 } else if (type == REDIS_RDB_ENCVAL) {
3509 /* Read a 6 bit len encoding type */
3510 if (isencoded) *isencoded = 1;
3511 return buf[0]&0x3F;
3512 } else if (type == REDIS_RDB_14BITLEN) {
3513 /* Read a 14 bit len */
3514 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3515 return ((buf[0]&0x3F)<<8)|buf[1];
3516 } else {
3517 /* Read a 32 bit len */
3518 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3519 return ntohl(len);
3520 }
3521 }
3522
3523 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3524 unsigned char enc[4];
3525 long long val;
3526
3527 if (enctype == REDIS_RDB_ENC_INT8) {
3528 if (fread(enc,1,1,fp) == 0) return NULL;
3529 val = (signed char)enc[0];
3530 } else if (enctype == REDIS_RDB_ENC_INT16) {
3531 uint16_t v;
3532 if (fread(enc,2,1,fp) == 0) return NULL;
3533 v = enc[0]|(enc[1]<<8);
3534 val = (int16_t)v;
3535 } else if (enctype == REDIS_RDB_ENC_INT32) {
3536 uint32_t v;
3537 if (fread(enc,4,1,fp) == 0) return NULL;
3538 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3539 val = (int32_t)v;
3540 } else {
3541 val = 0; /* anti-warning */
3542 redisAssert(0);
3543 }
3544 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3545 }
3546
3547 static robj *rdbLoadLzfStringObject(FILE*fp) {
3548 unsigned int len, clen;
3549 unsigned char *c = NULL;
3550 sds val = NULL;
3551
3552 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3553 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3554 if ((c = zmalloc(clen)) == NULL) goto err;
3555 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3556 if (fread(c,clen,1,fp) == 0) goto err;
3557 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3558 zfree(c);
3559 return createObject(REDIS_STRING,val);
3560 err:
3561 zfree(c);
3562 sdsfree(val);
3563 return NULL;
3564 }
3565
3566 static robj *rdbLoadStringObject(FILE*fp) {
3567 int isencoded;
3568 uint32_t len;
3569 sds val;
3570
3571 len = rdbLoadLen(fp,&isencoded);
3572 if (isencoded) {
3573 switch(len) {
3574 case REDIS_RDB_ENC_INT8:
3575 case REDIS_RDB_ENC_INT16:
3576 case REDIS_RDB_ENC_INT32:
3577 return rdbLoadIntegerObject(fp,len);
3578 case REDIS_RDB_ENC_LZF:
3579 return rdbLoadLzfStringObject(fp);
3580 default:
3581 redisAssert(0);
3582 }
3583 }
3584
3585 if (len == REDIS_RDB_LENERR) return NULL;
3586 val = sdsnewlen(NULL,len);
3587 if (len && fread(val,len,1,fp) == 0) {
3588 sdsfree(val);
3589 return NULL;
3590 }
3591 return createObject(REDIS_STRING,val);
3592 }
3593
3594 /* For information about double serialization check rdbSaveDoubleValue() */
3595 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3596 char buf[128];
3597 unsigned char len;
3598
3599 if (fread(&len,1,1,fp) == 0) return -1;
3600 switch(len) {
3601 case 255: *val = R_NegInf; return 0;
3602 case 254: *val = R_PosInf; return 0;
3603 case 253: *val = R_Nan; return 0;
3604 default:
3605 if (fread(buf,len,1,fp) == 0) return -1;
3606 buf[len] = '\0';
3607 sscanf(buf, "%lg", val);
3608 return 0;
3609 }
3610 }
3611
3612 /* Load a Redis object of the specified type from the specified file.
3613 * On success a newly allocated object is returned, otherwise NULL. */
3614 static robj *rdbLoadObject(int type, FILE *fp) {
3615 robj *o;
3616
3617 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3618 if (type == REDIS_STRING) {
3619 /* Read string value */
3620 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3621 o = tryObjectEncoding(o);
3622 } else if (type == REDIS_LIST || type == REDIS_SET) {
3623 /* Read list/set value */
3624 uint32_t listlen;
3625
3626 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3627 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3628 /* It's faster to expand the dict to the right size asap in order
3629 * to avoid rehashing */
3630 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3631 dictExpand(o->ptr,listlen);
3632 /* Load every single element of the list/set */
3633 while(listlen--) {
3634 robj *ele;
3635
3636 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3637 ele = tryObjectEncoding(ele);
3638 if (type == REDIS_LIST) {
3639 listAddNodeTail((list*)o->ptr,ele);
3640 } else {
3641 dictAdd((dict*)o->ptr,ele,NULL);
3642 }
3643 }
3644 } else if (type == REDIS_ZSET) {
3645 /* Read list/set value */
3646 size_t zsetlen;
3647 zset *zs;
3648
3649 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3650 o = createZsetObject();
3651 zs = o->ptr;
3652 /* Load every single element of the list/set */
3653 while(zsetlen--) {
3654 robj *ele;
3655 double *score = zmalloc(sizeof(double));
3656
3657 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3658 ele = tryObjectEncoding(ele);
3659 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3660 dictAdd(zs->dict,ele,score);
3661 zslInsert(zs->zsl,*score,ele);
3662 incrRefCount(ele); /* added to skiplist */
3663 }
3664 } else if (type == REDIS_HASH) {
3665 size_t hashlen;
3666
3667 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3668 o = createHashObject();
3669 /* Too many entries? Use an hash table. */
3670 if (hashlen > server.hash_max_zipmap_entries)
3671 convertToRealHash(o);
3672 /* Load every key/value, then set it into the zipmap or hash
3673 * table, as needed. */
3674 while(hashlen--) {
3675 robj *key, *val;
3676
3677 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3678 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3679 /* If we are using a zipmap and there are too big values
3680 * the object is converted to real hash table encoding. */
3681 if (o->encoding != REDIS_ENCODING_HT &&
3682 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3683 sdslen(val->ptr) > server.hash_max_zipmap_value))
3684 {
3685 convertToRealHash(o);
3686 }
3687
3688 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3689 unsigned char *zm = o->ptr;
3690
3691 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3692 val->ptr,sdslen(val->ptr),NULL);
3693 o->ptr = zm;
3694 decrRefCount(key);
3695 decrRefCount(val);
3696 } else {
3697 key = tryObjectEncoding(key);
3698 val = tryObjectEncoding(val);
3699 dictAdd((dict*)o->ptr,key,val);
3700 }
3701 }
3702 } else {
3703 redisAssert(0);
3704 }
3705 return o;
3706 }
3707
3708 static int rdbLoad(char *filename) {
3709 FILE *fp;
3710 robj *keyobj = NULL;
3711 uint32_t dbid;
3712 int type, retval, rdbver;
3713 dict *d = server.db[0].dict;
3714 redisDb *db = server.db+0;
3715 char buf[1024];
3716 time_t expiretime = -1, now = time(NULL);
3717 long long loadedkeys = 0;
3718
3719 fp = fopen(filename,"r");
3720 if (!fp) return REDIS_ERR;
3721 if (fread(buf,9,1,fp) == 0) goto eoferr;
3722 buf[9] = '\0';
3723 if (memcmp(buf,"REDIS",5) != 0) {
3724 fclose(fp);
3725 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3726 return REDIS_ERR;
3727 }
3728 rdbver = atoi(buf+5);
3729 if (rdbver != 1) {
3730 fclose(fp);
3731 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3732 return REDIS_ERR;
3733 }
3734 while(1) {
3735 robj *o;
3736
3737 /* Read type. */
3738 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3739 if (type == REDIS_EXPIRETIME) {
3740 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3741 /* We read the time so we need to read the object type again */
3742 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3743 }
3744 if (type == REDIS_EOF) break;
3745 /* Handle SELECT DB opcode as a special case */
3746 if (type == REDIS_SELECTDB) {
3747 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3748 goto eoferr;
3749 if (dbid >= (unsigned)server.dbnum) {
3750 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3751 exit(1);
3752 }
3753 db = server.db+dbid;
3754 d = db->dict;
3755 continue;
3756 }
3757 /* Read key */
3758 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3759 /* Read value */
3760 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3761 /* Add the new object in the hash table */
3762 retval = dictAdd(d,keyobj,o);
3763 if (retval == DICT_ERR) {
3764 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3765 exit(1);
3766 }
3767 /* Set the expire time if needed */
3768 if (expiretime != -1) {
3769 setExpire(db,keyobj,expiretime);
3770 /* Delete this key if already expired */
3771 if (expiretime < now) deleteKey(db,keyobj);
3772 expiretime = -1;
3773 }
3774 keyobj = o = NULL;
3775 /* Handle swapping while loading big datasets when VM is on */
3776 loadedkeys++;
3777 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3778 while (zmalloc_used_memory() > server.vm_max_memory) {
3779 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3780 }
3781 }
3782 }
3783 fclose(fp);
3784 return REDIS_OK;
3785
3786 eoferr: /* unexpected end of file is handled here with a fatal exit */
3787 if (keyobj) decrRefCount(keyobj);
3788 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3789 exit(1);
3790 return REDIS_ERR; /* Just to avoid warning */
3791 }
3792
3793 /*================================== Commands =============================== */
3794
3795 static void authCommand(redisClient *c) {
3796 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3797 c->authenticated = 1;
3798 addReply(c,shared.ok);
3799 } else {
3800 c->authenticated = 0;
3801 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3802 }
3803 }
3804
3805 static void pingCommand(redisClient *c) {
3806 addReply(c,shared.pong);
3807 }
3808
3809 static void echoCommand(redisClient *c) {
3810 addReplyBulk(c,c->argv[1]);
3811 }
3812
3813 /*=================================== Strings =============================== */
3814
3815 static void setGenericCommand(redisClient *c, int nx) {
3816 int retval;
3817
3818 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3819 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3820 if (retval == DICT_ERR) {
3821 if (!nx) {
3822 /* If the key is about a swapped value, we want a new key object
3823 * to overwrite the old. So we delete the old key in the database.
3824 * This will also make sure that swap pages about the old object
3825 * will be marked as free. */
3826 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3827 incrRefCount(c->argv[1]);
3828 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3829 incrRefCount(c->argv[2]);
3830 } else {
3831 addReply(c,shared.czero);
3832 return;
3833 }
3834 } else {
3835 incrRefCount(c->argv[1]);
3836 incrRefCount(c->argv[2]);
3837 }
3838 server.dirty++;
3839 removeExpire(c->db,c->argv[1]);
3840 addReply(c, nx ? shared.cone : shared.ok);
3841 }
3842
3843 static void setCommand(redisClient *c) {
3844 setGenericCommand(c,0);
3845 }
3846
3847 static void setnxCommand(redisClient *c) {
3848 setGenericCommand(c,1);
3849 }
3850
3851 static int getGenericCommand(redisClient *c) {
3852 robj *o;
3853
3854 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3855 return REDIS_OK;
3856
3857 if (o->type != REDIS_STRING) {
3858 addReply(c,shared.wrongtypeerr);
3859 return REDIS_ERR;
3860 } else {
3861 addReplyBulk(c,o);
3862 return REDIS_OK;
3863 }
3864 }
3865
3866 static void getCommand(redisClient *c) {
3867 getGenericCommand(c);
3868 }
3869
3870 static void getsetCommand(redisClient *c) {
3871 if (getGenericCommand(c) == REDIS_ERR) return;
3872 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3873 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3874 } else {
3875 incrRefCount(c->argv[1]);
3876 }
3877 incrRefCount(c->argv[2]);
3878 server.dirty++;
3879 removeExpire(c->db,c->argv[1]);
3880 }
3881
3882 static void mgetCommand(redisClient *c) {
3883 int j;
3884
3885 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3886 for (j = 1; j < c->argc; j++) {
3887 robj *o = lookupKeyRead(c->db,c->argv[j]);
3888 if (o == NULL) {
3889 addReply(c,shared.nullbulk);
3890 } else {
3891 if (o->type != REDIS_STRING) {
3892 addReply(c,shared.nullbulk);
3893 } else {
3894 addReplyBulk(c,o);
3895 }
3896 }
3897 }
3898 }
3899
3900 static void msetGenericCommand(redisClient *c, int nx) {
3901 int j, busykeys = 0;
3902
3903 if ((c->argc % 2) == 0) {
3904 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3905 return;
3906 }
3907 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3908 * set nothing at all if at least one already key exists. */
3909 if (nx) {
3910 for (j = 1; j < c->argc; j += 2) {
3911 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3912 busykeys++;
3913 }
3914 }
3915 }
3916 if (busykeys) {
3917 addReply(c, shared.czero);
3918 return;
3919 }
3920
3921 for (j = 1; j < c->argc; j += 2) {
3922 int retval;
3923
3924 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
3925 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3926 if (retval == DICT_ERR) {
3927 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3928 incrRefCount(c->argv[j+1]);
3929 } else {
3930 incrRefCount(c->argv[j]);
3931 incrRefCount(c->argv[j+1]);
3932 }
3933 removeExpire(c->db,c->argv[j]);
3934 }
3935 server.dirty += (c->argc-1)/2;
3936 addReply(c, nx ? shared.cone : shared.ok);
3937 }
3938
3939 static void msetCommand(redisClient *c) {
3940 msetGenericCommand(c,0);
3941 }
3942
3943 static void msetnxCommand(redisClient *c) {
3944 msetGenericCommand(c,1);
3945 }
3946
3947 static void incrDecrCommand(redisClient *c, long long incr) {
3948 long long value;
3949 int retval;
3950 robj *o;
3951
3952 o = lookupKeyWrite(c->db,c->argv[1]);
3953 if (o == NULL) {
3954 value = 0;
3955 } else {
3956 if (o->type != REDIS_STRING) {
3957 value = 0;
3958 } else {
3959 char *eptr;
3960
3961 if (o->encoding == REDIS_ENCODING_RAW)
3962 value = strtoll(o->ptr, &eptr, 10);
3963 else if (o->encoding == REDIS_ENCODING_INT)
3964 value = (long)o->ptr;
3965 else
3966 redisAssert(1 != 1);
3967 }
3968 }
3969
3970 value += incr;
3971 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3972 o = tryObjectEncoding(o);
3973 retval = dictAdd(c->db->dict,c->argv[1],o);
3974 if (retval == DICT_ERR) {
3975 dictReplace(c->db->dict,c->argv[1],o);
3976 removeExpire(c->db,c->argv[1]);
3977 } else {
3978 incrRefCount(c->argv[1]);
3979 }
3980 server.dirty++;
3981 addReply(c,shared.colon);
3982 addReply(c,o);
3983 addReply(c,shared.crlf);
3984 }
3985
3986 static void incrCommand(redisClient *c) {
3987 incrDecrCommand(c,1);
3988 }
3989
3990 static void decrCommand(redisClient *c) {
3991 incrDecrCommand(c,-1);
3992 }
3993
3994 static void incrbyCommand(redisClient *c) {
3995 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3996 incrDecrCommand(c,incr);
3997 }
3998
3999 static void decrbyCommand(redisClient *c) {
4000 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
4001 incrDecrCommand(c,-incr);
4002 }
4003
4004 static void appendCommand(redisClient *c) {
4005 int retval;
4006 size_t totlen;
4007 robj *o;
4008
4009 o = lookupKeyWrite(c->db,c->argv[1]);
4010 if (o == NULL) {
4011 /* Create the key */
4012 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4013 incrRefCount(c->argv[1]);
4014 incrRefCount(c->argv[2]);
4015 totlen = stringObjectLen(c->argv[2]);
4016 } else {
4017 dictEntry *de;
4018
4019 de = dictFind(c->db->dict,c->argv[1]);
4020 assert(de != NULL);
4021
4022 o = dictGetEntryVal(de);
4023 if (o->type != REDIS_STRING) {
4024 addReply(c,shared.wrongtypeerr);
4025 return;
4026 }
4027 /* If the object is specially encoded or shared we have to make
4028 * a copy */
4029 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4030 robj *decoded = getDecodedObject(o);
4031
4032 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4033 decrRefCount(decoded);
4034 dictReplace(c->db->dict,c->argv[1],o);
4035 }
4036 /* APPEND! */
4037 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4038 o->ptr = sdscatlen(o->ptr,
4039 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4040 } else {
4041 o->ptr = sdscatprintf(o->ptr, "%ld",
4042 (unsigned long) c->argv[2]->ptr);
4043 }
4044 totlen = sdslen(o->ptr);
4045 }
4046 server.dirty++;
4047 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4048 }
4049
4050 static void substrCommand(redisClient *c) {
4051 robj *o;
4052 long start = atoi(c->argv[2]->ptr);
4053 long end = atoi(c->argv[3]->ptr);
4054 size_t rangelen, strlen;
4055 sds range;
4056
4057 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4058 checkType(c,o,REDIS_STRING)) return;
4059
4060 o = getDecodedObject(o);
4061 strlen = sdslen(o->ptr);
4062
4063 /* convert negative indexes */
4064 if (start < 0) start = strlen+start;
4065 if (end < 0) end = strlen+end;
4066 if (start < 0) start = 0;
4067 if (end < 0) end = 0;
4068
4069 /* indexes sanity checks */
4070 if (start > end || (size_t)start >= strlen) {
4071 /* Out of range start or start > end result in null reply */
4072 addReply(c,shared.nullbulk);
4073 decrRefCount(o);
4074 return;
4075 }
4076 if ((size_t)end >= strlen) end = strlen-1;
4077 rangelen = (end-start)+1;
4078
4079 /* Return the result */
4080 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4081 range = sdsnewlen((char*)o->ptr+start,rangelen);
4082 addReplySds(c,range);
4083 addReply(c,shared.crlf);
4084 decrRefCount(o);
4085 }
4086
4087 /* ========================= Type agnostic commands ========================= */
4088
4089 static void delCommand(redisClient *c) {
4090 int deleted = 0, j;
4091
4092 for (j = 1; j < c->argc; j++) {
4093 if (deleteKey(c->db,c->argv[j])) {
4094 server.dirty++;
4095 deleted++;
4096 }
4097 }
4098 addReplyLong(c,deleted);
4099 }
4100
4101 static void existsCommand(redisClient *c) {
4102 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4103 }
4104
4105 static void selectCommand(redisClient *c) {
4106 int id = atoi(c->argv[1]->ptr);
4107
4108 if (selectDb(c,id) == REDIS_ERR) {
4109 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4110 } else {
4111 addReply(c,shared.ok);
4112 }
4113 }
4114
4115 static void randomkeyCommand(redisClient *c) {
4116 dictEntry *de;
4117
4118 while(1) {
4119 de = dictGetRandomKey(c->db->dict);
4120 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4121 }
4122 if (de == NULL) {
4123 addReply(c,shared.plus);
4124 addReply(c,shared.crlf);
4125 } else {
4126 addReply(c,shared.plus);
4127 addReply(c,dictGetEntryKey(de));
4128 addReply(c,shared.crlf);
4129 }
4130 }
4131
4132 static void keysCommand(redisClient *c) {
4133 dictIterator *di;
4134 dictEntry *de;
4135 sds pattern = c->argv[1]->ptr;
4136 int plen = sdslen(pattern);
4137 unsigned long numkeys = 0;
4138 robj *lenobj = createObject(REDIS_STRING,NULL);
4139
4140 di = dictGetIterator(c->db->dict);
4141 addReply(c,lenobj);
4142 decrRefCount(lenobj);
4143 while((de = dictNext(di)) != NULL) {
4144 robj *keyobj = dictGetEntryKey(de);
4145
4146 sds key = keyobj->ptr;
4147 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4148 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4149 if (expireIfNeeded(c->db,keyobj) == 0) {
4150 addReplyBulk(c,keyobj);
4151 numkeys++;
4152 }
4153 }
4154 }
4155 dictReleaseIterator(di);
4156 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4157 }
4158
4159 static void dbsizeCommand(redisClient *c) {
4160 addReplySds(c,
4161 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4162 }
4163
4164 static void lastsaveCommand(redisClient *c) {
4165 addReplySds(c,
4166 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4167 }
4168
4169 static void typeCommand(redisClient *c) {
4170 robj *o;
4171 char *type;
4172
4173 o = lookupKeyRead(c->db,c->argv[1]);
4174 if (o == NULL) {
4175 type = "+none";
4176 } else {
4177 switch(o->type) {
4178 case REDIS_STRING: type = "+string"; break;
4179 case REDIS_LIST: type = "+list"; break;
4180 case REDIS_SET: type = "+set"; break;
4181 case REDIS_ZSET: type = "+zset"; break;
4182 case REDIS_HASH: type = "+hash"; break;
4183 default: type = "+unknown"; break;
4184 }
4185 }
4186 addReplySds(c,sdsnew(type));
4187 addReply(c,shared.crlf);
4188 }
4189
4190 static void saveCommand(redisClient *c) {
4191 if (server.bgsavechildpid != -1) {
4192 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4193 return;
4194 }
4195 if (rdbSave(server.dbfilename) == REDIS_OK) {
4196 addReply(c,shared.ok);
4197 } else {
4198 addReply(c,shared.err);
4199 }
4200 }
4201
4202 static void bgsaveCommand(redisClient *c) {
4203 if (server.bgsavechildpid != -1) {
4204 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4205 return;
4206 }
4207 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4208 char *status = "+Background saving started\r\n";
4209 addReplySds(c,sdsnew(status));
4210 } else {
4211 addReply(c,shared.err);
4212 }
4213 }
4214
4215 static void shutdownCommand(redisClient *c) {
4216 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4217 /* Kill the saving child if there is a background saving in progress.
4218 We want to avoid race conditions, for instance our saving child may
4219 overwrite the synchronous saving did by SHUTDOWN. */
4220 if (server.bgsavechildpid != -1) {
4221 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4222 kill(server.bgsavechildpid,SIGKILL);
4223 rdbRemoveTempFile(server.bgsavechildpid);
4224 }
4225 if (server.appendonly) {
4226 /* Append only file: fsync() the AOF and exit */
4227 fsync(server.appendfd);
4228 if (server.vm_enabled) unlink(server.vm_swap_file);
4229 exit(0);
4230 } else {
4231 /* Snapshotting. Perform a SYNC SAVE and exit */
4232 if (rdbSave(server.dbfilename) == REDIS_OK) {
4233 if (server.daemonize)
4234 unlink(server.pidfile);
4235 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4236 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4237 if (server.vm_enabled) unlink(server.vm_swap_file);
4238 exit(0);
4239 } else {
4240 /* Ooops.. error saving! The best we can do is to continue
4241 * operating. Note that if there was a background saving process,
4242 * in the next cron() Redis will be notified that the background
4243 * saving aborted, handling special stuff like slaves pending for
4244 * synchronization... */
4245 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4246 addReplySds(c,
4247 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4248 }
4249 }
4250 }
4251
4252 static void renameGenericCommand(redisClient *c, int nx) {
4253 robj *o;
4254
4255 /* To use the same key as src and dst is probably an error */
4256 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4257 addReply(c,shared.sameobjecterr);
4258 return;
4259 }
4260
4261 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4262 return;
4263
4264 incrRefCount(o);
4265 deleteIfVolatile(c->db,c->argv[2]);
4266 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4267 if (nx) {
4268 decrRefCount(o);
4269 addReply(c,shared.czero);
4270 return;
4271 }
4272 dictReplace(c->db->dict,c->argv[2],o);
4273 } else {
4274 incrRefCount(c->argv[2]);
4275 }
4276 deleteKey(c->db,c->argv[1]);
4277 server.dirty++;
4278 addReply(c,nx ? shared.cone : shared.ok);
4279 }
4280
4281 static void renameCommand(redisClient *c) {
4282 renameGenericCommand(c,0);
4283 }
4284
4285 static void renamenxCommand(redisClient *c) {
4286 renameGenericCommand(c,1);
4287 }
4288
4289 static void moveCommand(redisClient *c) {
4290 robj *o;
4291 redisDb *src, *dst;
4292 int srcid;
4293
4294 /* Obtain source and target DB pointers */
4295 src = c->db;
4296 srcid = c->db->id;
4297 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4298 addReply(c,shared.outofrangeerr);
4299 return;
4300 }
4301 dst = c->db;
4302 selectDb(c,srcid); /* Back to the source DB */
4303
4304 /* If the user is moving using as target the same
4305 * DB as the source DB it is probably an error. */
4306 if (src == dst) {
4307 addReply(c,shared.sameobjecterr);
4308 return;
4309 }
4310
4311 /* Check if the element exists and get a reference */
4312 o = lookupKeyWrite(c->db,c->argv[1]);
4313 if (!o) {
4314 addReply(c,shared.czero);
4315 return;
4316 }
4317
4318 /* Try to add the element to the target DB */
4319 deleteIfVolatile(dst,c->argv[1]);
4320 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4321 addReply(c,shared.czero);
4322 return;
4323 }
4324 incrRefCount(c->argv[1]);
4325 incrRefCount(o);
4326
4327 /* OK! key moved, free the entry in the source DB */
4328 deleteKey(src,c->argv[1]);
4329 server.dirty++;
4330 addReply(c,shared.cone);
4331 }
4332
4333 /* =================================== Lists ================================ */
4334 static void pushGenericCommand(redisClient *c, int where) {
4335 robj *lobj;
4336 list *list;
4337
4338 lobj = lookupKeyWrite(c->db,c->argv[1]);
4339 if (lobj == NULL) {
4340 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4341 addReply(c,shared.cone);
4342 return;
4343 }
4344 lobj = createListObject();
4345 list = lobj->ptr;
4346 if (where == REDIS_HEAD) {
4347 listAddNodeHead(list,c->argv[2]);
4348 } else {
4349 listAddNodeTail(list,c->argv[2]);
4350 }
4351 dictAdd(c->db->dict,c->argv[1],lobj);
4352 incrRefCount(c->argv[1]);
4353 incrRefCount(c->argv[2]);
4354 } else {
4355 if (lobj->type != REDIS_LIST) {
4356 addReply(c,shared.wrongtypeerr);
4357 return;
4358 }
4359 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4360 addReply(c,shared.cone);
4361 return;
4362 }
4363 list = lobj->ptr;
4364 if (where == REDIS_HEAD) {
4365 listAddNodeHead(list,c->argv[2]);
4366 } else {
4367 listAddNodeTail(list,c->argv[2]);
4368 }
4369 incrRefCount(c->argv[2]);
4370 }
4371 server.dirty++;
4372 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4373 }
4374
4375 static void lpushCommand(redisClient *c) {
4376 pushGenericCommand(c,REDIS_HEAD);
4377 }
4378
4379 static void rpushCommand(redisClient *c) {
4380 pushGenericCommand(c,REDIS_TAIL);
4381 }
4382
4383 static void llenCommand(redisClient *c) {
4384 robj *o;
4385 list *l;
4386
4387 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4388 checkType(c,o,REDIS_LIST)) return;
4389
4390 l = o->ptr;
4391 addReplyUlong(c,listLength(l));
4392 }
4393
4394 static void lindexCommand(redisClient *c) {
4395 robj *o;
4396 int index = atoi(c->argv[2]->ptr);
4397 list *list;
4398 listNode *ln;
4399
4400 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4401 checkType(c,o,REDIS_LIST)) return;
4402 list = o->ptr;
4403
4404 ln = listIndex(list, index);
4405 if (ln == NULL) {
4406 addReply(c,shared.nullbulk);
4407 } else {
4408 robj *ele = listNodeValue(ln);
4409 addReplyBulk(c,ele);
4410 }
4411 }
4412
4413 static void lsetCommand(redisClient *c) {
4414 robj *o;
4415 int index = atoi(c->argv[2]->ptr);
4416 list *list;
4417 listNode *ln;
4418
4419 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4420 checkType(c,o,REDIS_LIST)) return;
4421 list = o->ptr;
4422
4423 ln = listIndex(list, index);
4424 if (ln == NULL) {
4425 addReply(c,shared.outofrangeerr);
4426 } else {
4427 robj *ele = listNodeValue(ln);
4428
4429 decrRefCount(ele);
4430 listNodeValue(ln) = c->argv[3];
4431 incrRefCount(c->argv[3]);
4432 addReply(c,shared.ok);
4433 server.dirty++;
4434 }
4435 }
4436
4437 static void popGenericCommand(redisClient *c, int where) {
4438 robj *o;
4439 list *list;
4440 listNode *ln;
4441
4442 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4443 checkType(c,o,REDIS_LIST)) return;
4444 list = o->ptr;
4445
4446 if (where == REDIS_HEAD)
4447 ln = listFirst(list);
4448 else
4449 ln = listLast(list);
4450
4451 if (ln == NULL) {
4452 addReply(c,shared.nullbulk);
4453 } else {
4454 robj *ele = listNodeValue(ln);
4455 addReplyBulk(c,ele);
4456 listDelNode(list,ln);
4457 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4458 server.dirty++;
4459 }
4460 }
4461
4462 static void lpopCommand(redisClient *c) {
4463 popGenericCommand(c,REDIS_HEAD);
4464 }
4465
4466 static void rpopCommand(redisClient *c) {
4467 popGenericCommand(c,REDIS_TAIL);
4468 }
4469
4470 static void lrangeCommand(redisClient *c) {
4471 robj *o;
4472 int start = atoi(c->argv[2]->ptr);
4473 int end = atoi(c->argv[3]->ptr);
4474 int llen;
4475 int rangelen, j;
4476 list *list;
4477 listNode *ln;
4478 robj *ele;
4479
4480 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4481 || checkType(c,o,REDIS_LIST)) return;
4482 list = o->ptr;
4483 llen = listLength(list);
4484
4485 /* convert negative indexes */
4486 if (start < 0) start = llen+start;
4487 if (end < 0) end = llen+end;
4488 if (start < 0) start = 0;
4489 if (end < 0) end = 0;
4490
4491 /* indexes sanity checks */
4492 if (start > end || start >= llen) {
4493 /* Out of range start or start > end result in empty list */
4494 addReply(c,shared.emptymultibulk);
4495 return;
4496 }
4497 if (end >= llen) end = llen-1;
4498 rangelen = (end-start)+1;
4499
4500 /* Return the result in form of a multi-bulk reply */
4501 ln = listIndex(list, start);
4502 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4503 for (j = 0; j < rangelen; j++) {
4504 ele = listNodeValue(ln);
4505 addReplyBulk(c,ele);
4506 ln = ln->next;
4507 }
4508 }
4509
4510 static void ltrimCommand(redisClient *c) {
4511 robj *o;
4512 int start = atoi(c->argv[2]->ptr);
4513 int end = atoi(c->argv[3]->ptr);
4514 int llen;
4515 int j, ltrim, rtrim;
4516 list *list;
4517 listNode *ln;
4518
4519 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4520 checkType(c,o,REDIS_LIST)) return;
4521 list = o->ptr;
4522 llen = listLength(list);
4523
4524 /* convert negative indexes */
4525 if (start < 0) start = llen+start;
4526 if (end < 0) end = llen+end;
4527 if (start < 0) start = 0;
4528 if (end < 0) end = 0;
4529
4530 /* indexes sanity checks */
4531 if (start > end || start >= llen) {
4532 /* Out of range start or start > end result in empty list */
4533 ltrim = llen;
4534 rtrim = 0;
4535 } else {
4536 if (end >= llen) end = llen-1;
4537 ltrim = start;
4538 rtrim = llen-end-1;
4539 }
4540
4541 /* Remove list elements to perform the trim */
4542 for (j = 0; j < ltrim; j++) {
4543 ln = listFirst(list);
4544 listDelNode(list,ln);
4545 }
4546 for (j = 0; j < rtrim; j++) {
4547 ln = listLast(list);
4548 listDelNode(list,ln);
4549 }
4550 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4551 server.dirty++;
4552 addReply(c,shared.ok);
4553 }
4554
4555 static void lremCommand(redisClient *c) {
4556 robj *o;
4557 list *list;
4558 listNode *ln, *next;
4559 int toremove = atoi(c->argv[2]->ptr);
4560 int removed = 0;
4561 int fromtail = 0;
4562
4563 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4564 checkType(c,o,REDIS_LIST)) return;
4565 list = o->ptr;
4566
4567 if (toremove < 0) {
4568 toremove = -toremove;
4569 fromtail = 1;
4570 }
4571 ln = fromtail ? list->tail : list->head;
4572 while (ln) {
4573 robj *ele = listNodeValue(ln);
4574
4575 next = fromtail ? ln->prev : ln->next;
4576 if (compareStringObjects(ele,c->argv[3]) == 0) {
4577 listDelNode(list,ln);
4578 server.dirty++;
4579 removed++;
4580 if (toremove && removed == toremove) break;
4581 }
4582 ln = next;
4583 }
4584 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4585 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4586 }
4587
4588 /* This is the semantic of this command:
4589 * RPOPLPUSH srclist dstlist:
4590 * IF LLEN(srclist) > 0
4591 * element = RPOP srclist
4592 * LPUSH dstlist element
4593 * RETURN element
4594 * ELSE
4595 * RETURN nil
4596 * END
4597 * END
4598 *
4599 * The idea is to be able to get an element from a list in a reliable way
4600 * since the element is not just returned but pushed against another list
4601 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4602 */
4603 static void rpoplpushcommand(redisClient *c) {
4604 robj *sobj;
4605 list *srclist;
4606 listNode *ln;
4607
4608 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4609 checkType(c,sobj,REDIS_LIST)) return;
4610 srclist = sobj->ptr;
4611 ln = listLast(srclist);
4612
4613 if (ln == NULL) {
4614 addReply(c,shared.nullbulk);
4615 } else {
4616 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4617 robj *ele = listNodeValue(ln);
4618 list *dstlist;
4619
4620 if (dobj && dobj->type != REDIS_LIST) {
4621 addReply(c,shared.wrongtypeerr);
4622 return;
4623 }
4624
4625 /* Add the element to the target list (unless it's directly
4626 * passed to some BLPOP-ing client */
4627 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4628 if (dobj == NULL) {
4629 /* Create the list if the key does not exist */
4630 dobj = createListObject();
4631 dictAdd(c->db->dict,c->argv[2],dobj);
4632 incrRefCount(c->argv[2]);
4633 }
4634 dstlist = dobj->ptr;
4635 listAddNodeHead(dstlist,ele);
4636 incrRefCount(ele);
4637 }
4638
4639 /* Send the element to the client as reply as well */
4640 addReplyBulk(c,ele);
4641
4642 /* Finally remove the element from the source list */
4643 listDelNode(srclist,ln);
4644 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4645 server.dirty++;
4646 }
4647 }
4648
4649 /* ==================================== Sets ================================ */
4650
4651 static void saddCommand(redisClient *c) {
4652 robj *set;
4653
4654 set = lookupKeyWrite(c->db,c->argv[1]);
4655 if (set == NULL) {
4656 set = createSetObject();
4657 dictAdd(c->db->dict,c->argv[1],set);
4658 incrRefCount(c->argv[1]);
4659 } else {
4660 if (set->type != REDIS_SET) {
4661 addReply(c,shared.wrongtypeerr);
4662 return;
4663 }
4664 }
4665 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4666 incrRefCount(c->argv[2]);
4667 server.dirty++;
4668 addReply(c,shared.cone);
4669 } else {
4670 addReply(c,shared.czero);
4671 }
4672 }
4673
4674 static void sremCommand(redisClient *c) {
4675 robj *set;
4676
4677 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4678 checkType(c,set,REDIS_SET)) return;
4679
4680 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4681 server.dirty++;
4682 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4683 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4684 addReply(c,shared.cone);
4685 } else {
4686 addReply(c,shared.czero);
4687 }
4688 }
4689
4690 static void smoveCommand(redisClient *c) {
4691 robj *srcset, *dstset;
4692
4693 srcset = lookupKeyWrite(c->db,c->argv[1]);
4694 dstset = lookupKeyWrite(c->db,c->argv[2]);
4695
4696 /* If the source key does not exist return 0, if it's of the wrong type
4697 * raise an error */
4698 if (srcset == NULL || srcset->type != REDIS_SET) {
4699 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4700 return;
4701 }
4702 /* Error if the destination key is not a set as well */
4703 if (dstset && dstset->type != REDIS_SET) {
4704 addReply(c,shared.wrongtypeerr);
4705 return;
4706 }
4707 /* Remove the element from the source set */
4708 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4709 /* Key not found in the src set! return zero */
4710 addReply(c,shared.czero);
4711 return;
4712 }
4713 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4714 deleteKey(c->db,c->argv[1]);
4715 server.dirty++;
4716 /* Add the element to the destination set */
4717 if (!dstset) {
4718 dstset = createSetObject();
4719 dictAdd(c->db->dict,c->argv[2],dstset);
4720 incrRefCount(c->argv[2]);
4721 }
4722 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4723 incrRefCount(c->argv[3]);
4724 addReply(c,shared.cone);
4725 }
4726
4727 static void sismemberCommand(redisClient *c) {
4728 robj *set;
4729
4730 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4731 checkType(c,set,REDIS_SET)) return;
4732
4733 if (dictFind(set->ptr,c->argv[2]))
4734 addReply(c,shared.cone);
4735 else
4736 addReply(c,shared.czero);
4737 }
4738
4739 static void scardCommand(redisClient *c) {
4740 robj *o;
4741 dict *s;
4742
4743 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4744 checkType(c,o,REDIS_SET)) return;
4745
4746 s = o->ptr;
4747 addReplyUlong(c,dictSize(s));
4748 }
4749
4750 static void spopCommand(redisClient *c) {
4751 robj *set;
4752 dictEntry *de;
4753
4754 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4755 checkType(c,set,REDIS_SET)) return;
4756
4757 de = dictGetRandomKey(set->ptr);
4758 if (de == NULL) {
4759 addReply(c,shared.nullbulk);
4760 } else {
4761 robj *ele = dictGetEntryKey(de);
4762
4763 addReplyBulk(c,ele);
4764 dictDelete(set->ptr,ele);
4765 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4766 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4767 server.dirty++;
4768 }
4769 }
4770
4771 static void srandmemberCommand(redisClient *c) {
4772 robj *set;
4773 dictEntry *de;
4774
4775 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4776 checkType(c,set,REDIS_SET)) return;
4777
4778 de = dictGetRandomKey(set->ptr);
4779 if (de == NULL) {
4780 addReply(c,shared.nullbulk);
4781 } else {
4782 robj *ele = dictGetEntryKey(de);
4783
4784 addReplyBulk(c,ele);
4785 }
4786 }
4787
4788 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4789 dict **d1 = (void*) s1, **d2 = (void*) s2;
4790
4791 return dictSize(*d1)-dictSize(*d2);
4792 }
4793
/* Shared implementation of SINTER and SINTERSTORE.
 *
 * 'setskeys' points to the 'setsnum' keys of the sets to intersect. When
 * 'dstkey' is NULL the intersection is sent to the client as a multi-bulk
 * reply; otherwise the result is stored at 'dstkey' (replacing any previous
 * value) and its cardinality is sent as reply.
 *
 * If any source key is missing the result is empty: an empty multi-bulk is
 * sent, or 'dstkey' is deleted and 0 is sent. If a source key holds a value
 * that is not a set, a wrong-type error is sent. */
static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
    dict **dv = zmalloc(sizeof(dict*)*setsnum);
    dictIterator *di;
    dictEntry *de;
    robj *lenobj = NULL, *dstset = NULL;
    unsigned long j, cardinality = 0;

    /* Collect the hash tables of all the source sets into dv[] */
    for (j = 0; j < setsnum; j++) {
        robj *setobj;

        setobj = dstkey ?
            lookupKeyWrite(c->db,setskeys[j]) :
            lookupKeyRead(c->db,setskeys[j]);
        if (!setobj) {
            /* A missing key is an empty set: the intersection is empty */
            zfree(dv);
            if (dstkey) {
                if (deleteKey(c->db,dstkey))
                    server.dirty++;
                addReply(c,shared.czero);
            } else {
                addReply(c,shared.emptymultibulk);
            }
            return;
        }
        if (setobj->type != REDIS_SET) {
            zfree(dv);
            addReply(c,shared.wrongtypeerr);
            return;
        }
        dv[j] = setobj->ptr;
    }
    /* Sort sets from the smallest to largest, this will improve our
     * algorithm's performance */
    qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);

    /* The first thing we should output is the total number of elements...
     * since this is a multi-bulk write, but at this stage we don't know
     * the intersection set size, so we use a trick, append an empty object
     * to the output list and save the pointer to later modify it with the
     * right length */
    if (!dstkey) {
        lenobj = createObject(REDIS_STRING,NULL);
        addReply(c,lenobj);
        decrRefCount(lenobj);
    } else {
        /* If we have a target key where to store the resulting set
         * create this key with an empty set inside */
        dstset = createSetObject();
    }

    /* Iterate all the elements of the first (smallest) set, and test
     * the element against all the other sets, if at least one set does
     * not include the element it is discarded */
    di = dictGetIterator(dv[0]);

    while((de = dictNext(di)) != NULL) {
        robj *ele;

        for (j = 1; j < setsnum; j++)
            if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
        if (j != setsnum)
            continue; /* at least one set does not contain the member */
        ele = dictGetEntryKey(de);
        if (!dstkey) {
            addReplyBulk(c,ele);
            cardinality++;
        } else {
            dictAdd(dstset->ptr,ele,NULL);
            incrRefCount(ele);
        }
    }
    dictReleaseIterator(di);

    if (dstkey) {
        /* Store the resulting set into the target, if the intersection
         * is not an empty set. */
        deleteKey(c->db,dstkey);
        if (dictSize((dict*)dstset->ptr) > 0) {
            dictAdd(c->db->dict,dstkey,dstset);
            incrRefCount(dstkey);
            addReplyLong(c,dictSize((dict*)dstset->ptr));
        } else {
            decrRefCount(dstset);
            addReply(c,shared.czero);
        }
        server.dirty++;
    } else {
        /* Fill in the multi-bulk header queued before the elements */
        lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
    }
    zfree(dv);
}
4885
4886 static void sinterCommand(redisClient *c) {
4887 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4888 }
4889
4890 static void sinterstoreCommand(redisClient *c) {
4891 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4892 }
4893
/* Operation selectors passed as the 'op' argument of
 * sunionDiffGenericCommand() and zunionInterGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4897
/* Shared implementation of SUNION, SUNIONSTORE, SDIFF and SDIFFSTORE.
 *
 * 'setskeys' points to the 'setsnum' source keys and 'op' is either
 * REDIS_OP_UNION or REDIS_OP_DIFF. For a difference, the members of the
 * first set that appear in any of the following sets are removed.
 *
 * When 'dstkey' is NULL the result is sent as a multi-bulk reply; otherwise
 * it replaces the value at 'dstkey' (the key is simply deleted when the
 * result is empty) and the resulting cardinality is sent as reply.
 *
 * Missing source keys are handled as empty sets. A source key holding a
 * value that is not a set produces a wrong-type error. */
static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
    dict **dv = zmalloc(sizeof(dict*)*setsnum);
    dictIterator *di;
    dictEntry *de;
    robj *dstset = NULL;
    int j, cardinality = 0;

    /* Collect the hash tables of the source sets, NULL for missing keys */
    for (j = 0; j < setsnum; j++) {
        robj *setobj;

        setobj = dstkey ?
            lookupKeyWrite(c->db,setskeys[j]) :
            lookupKeyRead(c->db,setskeys[j]);
        if (!setobj) {
            dv[j] = NULL;
            continue;
        }
        if (setobj->type != REDIS_SET) {
            zfree(dv);
            addReply(c,shared.wrongtypeerr);
            return;
        }
        dv[j] = setobj->ptr;
    }

    /* We need a temp set object to store our union. If the dstkey
     * is not NULL (that is, we are inside an SUNIONSTORE operation) then
     * this set object will be the resulting object to set into the target key*/
    dstset = createSetObject();

    /* Iterate all the elements of all the sets, add every element a single
     * time to the result set */
    for (j = 0; j < setsnum; j++) {
        if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
        if (!dv[j]) continue; /* non existing keys are like empty sets */

        di = dictGetIterator(dv[j]);

        while((de = dictNext(di)) != NULL) {
            robj *ele;

            /* dictAdd will not add the same element multiple times */
            ele = dictGetEntryKey(de);
            if (op == REDIS_OP_UNION || j == 0) {
                /* Union, or first set of a difference: accumulate */
                if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
                    incrRefCount(ele);
                    cardinality++;
                }
            } else if (op == REDIS_OP_DIFF) {
                /* Following sets of a difference: subtract */
                if (dictDelete(dstset->ptr,ele) == DICT_OK) {
                    cardinality--;
                }
            }
        }
        dictReleaseIterator(di);

        /* result set is empty? Exit asap. */
        if (op == REDIS_OP_DIFF && cardinality == 0) break;
    }

    /* Output the content of the resulting set, if not in STORE mode */
    if (!dstkey) {
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
        di = dictGetIterator(dstset->ptr);
        while((de = dictNext(di)) != NULL) {
            robj *ele;

            ele = dictGetEntryKey(de);
            addReplyBulk(c,ele);
        }
        dictReleaseIterator(di);
        /* The temporary set is no longer needed */
        decrRefCount(dstset);
    } else {
        /* If we have a target key where to store the resulting set
         * create this key with the result set inside */
        deleteKey(c->db,dstkey);
        if (dictSize((dict*)dstset->ptr) > 0) {
            dictAdd(c->db->dict,dstkey,dstset);
            incrRefCount(dstkey);
            addReplyLong(c,dictSize((dict*)dstset->ptr));
        } else {
            decrRefCount(dstset);
            addReply(c,shared.czero);
        }
        server.dirty++;
    }
    zfree(dv);
}
4986
4987 static void sunionCommand(redisClient *c) {
4988 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4989 }
4990
4991 static void sunionstoreCommand(redisClient *c) {
4992 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4993 }
4994
4995 static void sdiffCommand(redisClient *c) {
4996 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4997 }
4998
4999 static void sdiffstoreCommand(redisClient *c) {
5000 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5001 }
5002
5003 /* ==================================== ZSets =============================== */
5004
5005 /* ZSETs are ordered sets using two data structures to hold the same elements
5006 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5007 * data structure.
5008 *
5009 * The elements are added to an hash table mapping Redis objects to scores.
5010 * At the same time the elements are added to a skip list mapping scores
5011 * to Redis objects (so objects are sorted by scores in this "view"). */
5012
5013 /* This skiplist implementation is almost a C translation of the original
5014 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5015 * Alternative to Balanced Trees", modified in three ways:
5016 * a) this implementation allows for repeated values.
5017 * b) the comparison is not just by key (our 'score') but by satellite data.
5018 * c) there is a back pointer, so it's a doubly linked list with the back
5019 * pointers being only at "level 1". This allows to traverse the list
5020 * from tail to head, useful for ZREVRANGE. */
5021
5022 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5023 zskiplistNode *zn = zmalloc(sizeof(*zn));
5024
5025 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5026 if (level > 0)
5027 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5028 zn->score = score;
5029 zn->obj = obj;
5030 return zn;
5031 }
5032
5033 static zskiplist *zslCreate(void) {
5034 int j;
5035 zskiplist *zsl;
5036
5037 zsl = zmalloc(sizeof(*zsl));
5038 zsl->level = 1;
5039 zsl->length = 0;
5040 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5041 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5042 zsl->header->forward[j] = NULL;
5043
5044 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5045 if (j < ZSKIPLIST_MAXLEVEL-1)
5046 zsl->header->span[j] = 0;
5047 }
5048 zsl->header->backward = NULL;
5049 zsl->tail = NULL;
5050 return zsl;
5051 }
5052
5053 static void zslFreeNode(zskiplistNode *node) {
5054 decrRefCount(node->obj);
5055 zfree(node->forward);
5056 zfree(node->span);
5057 zfree(node);
5058 }
5059
5060 static void zslFree(zskiplist *zsl) {
5061 zskiplistNode *node = zsl->header->forward[0], *next;
5062
5063 zfree(zsl->header->forward);
5064 zfree(zsl->header->span);
5065 zfree(zsl->header);
5066 while(node) {
5067 next = node->forward[0];
5068 zslFreeNode(node);
5069 node = next;
5070 }
5071 zfree(zsl);
5072 }
5073
5074 static int zslRandomLevel(void) {
5075 int level = 1;
5076 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5077 level += 1;
5078 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5079 }
5080
/* Insert a new node holding 'obj' with the given 'score' into the skiplist.
 *
 * Nodes are kept ordered by score and, for equal scores, by
 * compareStringObjects() on the stored objects. Besides linking the node at
 * every level it participates in, the function keeps the span counters, the
 * level-0 backward pointers, the tail pointer and the length up to date.
 *
 * span[i-1] of a node is the number of level-0 steps covered by its
 * forward[i] link (level 0 has no counter: every step counts 1).
 *
 * The reference count of 'obj' is not touched here. */
static void zslInsert(zskiplist *zsl, double score, robj *obj) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned int rank[ZSKIPLIST_MAXLEVEL];
    int i, level;

    /* Find, for every level, the last node preceding the insert position
     * (update[i]) and how many elements precede that node (rank[i]) */
    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        /* store rank that is crossed to reach the insert position */
        rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];

        while (x->forward[i] &&
            (x->forward[i]->score < score ||
                (x->forward[i]->score == score &&
                compareStringObjects(x->forward[i]->obj,obj) < 0))) {
            rank[i] += i > 0 ? x->span[i-1] : 1;
            x = x->forward[i];
        }
        update[i] = x;
    }
    /* we assume the key is not already inside, since we allow duplicated
     * scores, and the re-insertion of score and redis object should never
     * happen since the caller of zslInsert() should test in the hash table
     * if the element is already inside or not. */
    level = zslRandomLevel();
    if (level > zsl->level) {
        /* The new node is taller than the list: the header becomes the
         * predecessor at the new levels, spanning the whole list */
        for (i = zsl->level; i < level; i++) {
            rank[i] = 0;
            update[i] = zsl->header;
            update[i]->span[i-1] = zsl->length;
        }
        zsl->level = level;
    }
    x = zslCreateNode(level,score,obj);
    for (i = 0; i < level; i++) {
        x->forward[i] = update[i]->forward[i];
        update[i]->forward[i] = x;

        /* update span covered by update[i] as x is inserted here */
        if (i > 0) {
            x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
            update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
        }
    }

    /* increment span for untouched levels */
    for (i = level; i < zsl->level; i++) {
        update[i]->span[i-1]++;
    }

    /* Fix the level-0 backward links; the header is never a 'backward' */
    x->backward = (update[0] == zsl->header) ? NULL : update[0];
    if (x->forward[0])
        x->forward[0]->backward = x;
    else
        zsl->tail = x;
    zsl->length++;
}
5137
/* Internal function used by zslDelete, zslDeleteRangeByScore and
 * zslDeleteRangeByRank.
 *
 * Unlink node 'x' from the skiplist. 'update' must hold, for every level
 * of the list, the last node preceding 'x' at that level. Spans, backward
 * pointers, tail, level and length are adjusted; the node itself is NOT
 * freed, that is up to the caller. */
void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
    int i;
    for (i = 0; i < zsl->level; i++) {
        if (update[i]->forward[i] == x) {
            /* x is linked at this level: the predecessor inherits its span */
            if (i > 0) {
                update[i]->span[i-1] += x->span[i-1] - 1;
            }
            update[i]->forward[i] = x->forward[i];
        } else {
            /* invariant: i > 0, because update[0]->forward[0]
             * is always equal to x */
            update[i]->span[i-1] -= 1;
        }
    }
    if (x->forward[0]) {
        x->forward[0]->backward = x->backward;
    } else {
        zsl->tail = x->backward;
    }
    /* Drop the top levels that are now empty */
    while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
        zsl->level--;
    zsl->length--;
}
5162
5163 /* Delete an element with matching score/object from the skiplist. */
5164 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5165 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5166 int i;
5167
5168 x = zsl->header;
5169 for (i = zsl->level-1; i >= 0; i--) {
5170 while (x->forward[i] &&
5171 (x->forward[i]->score < score ||
5172 (x->forward[i]->score == score &&
5173 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5174 x = x->forward[i];
5175 update[i] = x;
5176 }
5177 /* We may have multiple elements with the same score, what we need
5178 * is to find the element with both the right score and object. */
5179 x = x->forward[0];
5180 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5181 zslDeleteNode(zsl, x, update);
5182 zslFreeNode(x);
5183 return 1;
5184 } else {
5185 return 0; /* not found */
5186 }
5187 return 0; /* not found */
5188 }
5189
/* Delete all the elements with score between min and max from the skiplist.
 * Min and max are inclusive, so every element with score >= min && score <=
 * max is deleted.
 * Note that this function takes the reference to the hash table view of the
 * sorted set, in order to remove the elements from the hash table too.
 *
 * Returns the number of elements removed. */
static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned long removed = 0;
    int i;

    /* Record, per level, the last node with a score smaller than min */
    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        while (x->forward[i] && x->forward[i]->score < min)
            x = x->forward[i];
        update[i] = x;
    }
    /* x->forward[0] is the first node with score >= min, if any: delete
     * from there while the score does not exceed max. */
    x = x->forward[0];
    while (x && x->score <= max) {
        /* Save the successor: x is freed below */
        zskiplistNode *next = x->forward[0];
        zslDeleteNode(zsl, x, update);
        dictDelete(dict,x->obj);
        zslFreeNode(x);
        removed++;
        x = next;
    }
    return removed; /* may be zero if nothing was in range */
}
5218
5219 /* Delete all the elements with rank between start and end from the skiplist.
5220 * Start and end are inclusive. Note that start and end need to be 1-based */
5221 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5222 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5223 unsigned long traversed = 0, removed = 0;
5224 int i;
5225
5226 x = zsl->header;
5227 for (i = zsl->level-1; i >= 0; i--) {
5228 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5229 traversed += i > 0 ? x->span[i-1] : 1;
5230 x = x->forward[i];
5231 }
5232 update[i] = x;
5233 }
5234
5235 traversed++;
5236 x = x->forward[0];
5237 while (x && traversed <= end) {
5238 zskiplistNode *next = x->forward[0];
5239 zslDeleteNode(zsl, x, update);
5240 dictDelete(dict,x->obj);
5241 zslFreeNode(x);
5242 removed++;
5243 traversed++;
5244 x = next;
5245 }
5246 return removed;
5247 }
5248
5249 /* Find the first node having a score equal or greater than the specified one.
5250 * Returns NULL if there is no match. */
5251 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5252 zskiplistNode *x;
5253 int i;
5254
5255 x = zsl->header;
5256 for (i = zsl->level-1; i >= 0; i--) {
5257 while (x->forward[i] && x->forward[i]->score < score)
5258 x = x->forward[i];
5259 }
5260 /* We may have multiple elements with the same score, what we need
5261 * is to find the element with both the right score and object. */
5262 return x->forward[0];
5263 }
5264
5265 /* Find the rank for an element by both score and key.
5266 * Returns 0 when the element cannot be found, rank otherwise.
5267 * Note that the rank is 1-based due to the span of zsl->header to the
5268 * first element. */
5269 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5270 zskiplistNode *x;
5271 unsigned long rank = 0;
5272 int i;
5273
5274 x = zsl->header;
5275 for (i = zsl->level-1; i >= 0; i--) {
5276 while (x->forward[i] &&
5277 (x->forward[i]->score < score ||
5278 (x->forward[i]->score == score &&
5279 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5280 rank += i > 0 ? x->span[i-1] : 1;
5281 x = x->forward[i];
5282 }
5283
5284 /* x might be equal to zsl->header, so test if obj is non-NULL */
5285 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5286 return rank;
5287 }
5288 }
5289 return 0;
5290 }
5291
5292 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5293 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5294 zskiplistNode *x;
5295 unsigned long traversed = 0;
5296 int i;
5297
5298 x = zsl->header;
5299 for (i = zsl->level-1; i >= 0; i--) {
5300 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5301 {
5302 traversed += i > 0 ? x->span[i-1] : 1;
5303 x = x->forward[i];
5304 }
5305 if (traversed == rank) {
5306 return x;
5307 }
5308 }
5309 return NULL;
5310 }
5311
5312 /* The actual Z-commands implementations */
5313
5314 /* This generic command implements both ZADD and ZINCRBY.
5315 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5316 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5317 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5318 robj *zsetobj;
5319 zset *zs;
5320 double *score;
5321
5322 zsetobj = lookupKeyWrite(c->db,key);
5323 if (zsetobj == NULL) {
5324 zsetobj = createZsetObject();
5325 dictAdd(c->db->dict,key,zsetobj);
5326 incrRefCount(key);
5327 } else {
5328 if (zsetobj->type != REDIS_ZSET) {
5329 addReply(c,shared.wrongtypeerr);
5330 return;
5331 }
5332 }
5333 zs = zsetobj->ptr;
5334
5335 /* Ok now since we implement both ZADD and ZINCRBY here the code
5336 * needs to handle the two different conditions. It's all about setting
5337 * '*score', that is, the new score to set, to the right value. */
5338 score = zmalloc(sizeof(double));
5339 if (doincrement) {
5340 dictEntry *de;
5341
5342 /* Read the old score. If the element was not present starts from 0 */
5343 de = dictFind(zs->dict,ele);
5344 if (de) {
5345 double *oldscore = dictGetEntryVal(de);
5346 *score = *oldscore + scoreval;
5347 } else {
5348 *score = scoreval;
5349 }
5350 } else {
5351 *score = scoreval;
5352 }
5353
5354 /* What follows is a simple remove and re-insert operation that is common
5355 * to both ZADD and ZINCRBY... */
5356 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5357 /* case 1: New element */
5358 incrRefCount(ele); /* added to hash */
5359 zslInsert(zs->zsl,*score,ele);
5360 incrRefCount(ele); /* added to skiplist */
5361 server.dirty++;
5362 if (doincrement)
5363 addReplyDouble(c,*score);
5364 else
5365 addReply(c,shared.cone);
5366 } else {
5367 dictEntry *de;
5368 double *oldscore;
5369
5370 /* case 2: Score update operation */
5371 de = dictFind(zs->dict,ele);
5372 redisAssert(de != NULL);
5373 oldscore = dictGetEntryVal(de);
5374 if (*score != *oldscore) {
5375 int deleted;
5376
5377 /* Remove and insert the element in the skip list with new score */
5378 deleted = zslDelete(zs->zsl,*oldscore,ele);
5379 redisAssert(deleted != 0);
5380 zslInsert(zs->zsl,*score,ele);
5381 incrRefCount(ele);
5382 /* Update the score in the hash table */
5383 dictReplace(zs->dict,ele,score);
5384 server.dirty++;
5385 } else {
5386 zfree(score);
5387 }
5388 if (doincrement)
5389 addReplyDouble(c,*score);
5390 else
5391 addReply(c,shared.czero);
5392 }
5393 }
5394
5395 static void zaddCommand(redisClient *c) {
5396 double scoreval;
5397
5398 scoreval = strtod(c->argv[2]->ptr,NULL);
5399 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5400 }
5401
5402 static void zincrbyCommand(redisClient *c) {
5403 double scoreval;
5404
5405 scoreval = strtod(c->argv[2]->ptr,NULL);
5406 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5407 }
5408
5409 static void zremCommand(redisClient *c) {
5410 robj *zsetobj;
5411 zset *zs;
5412 dictEntry *de;
5413 double *oldscore;
5414 int deleted;
5415
5416 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5417 checkType(c,zsetobj,REDIS_ZSET)) return;
5418
5419 zs = zsetobj->ptr;
5420 de = dictFind(zs->dict,c->argv[2]);
5421 if (de == NULL) {
5422 addReply(c,shared.czero);
5423 return;
5424 }
5425 /* Delete from the skiplist */
5426 oldscore = dictGetEntryVal(de);
5427 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5428 redisAssert(deleted != 0);
5429
5430 /* Delete from the hash table */
5431 dictDelete(zs->dict,c->argv[2]);
5432 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5433 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5434 server.dirty++;
5435 addReply(c,shared.cone);
5436 }
5437
5438 static void zremrangebyscoreCommand(redisClient *c) {
5439 double min = strtod(c->argv[2]->ptr,NULL);
5440 double max = strtod(c->argv[3]->ptr,NULL);
5441 long deleted;
5442 robj *zsetobj;
5443 zset *zs;
5444
5445 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5446 checkType(c,zsetobj,REDIS_ZSET)) return;
5447
5448 zs = zsetobj->ptr;
5449 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5450 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5451 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5452 server.dirty += deleted;
5453 addReplyLong(c,deleted);
5454 }
5455
5456 static void zremrangebyrankCommand(redisClient *c) {
5457 int start = atoi(c->argv[2]->ptr);
5458 int end = atoi(c->argv[3]->ptr);
5459 int llen;
5460 long deleted;
5461 robj *zsetobj;
5462 zset *zs;
5463
5464 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5465 checkType(c,zsetobj,REDIS_ZSET)) return;
5466 zs = zsetobj->ptr;
5467 llen = zs->zsl->length;
5468
5469 /* convert negative indexes */
5470 if (start < 0) start = llen+start;
5471 if (end < 0) end = llen+end;
5472 if (start < 0) start = 0;
5473 if (end < 0) end = 0;
5474
5475 /* indexes sanity checks */
5476 if (start > end || start >= llen) {
5477 addReply(c,shared.czero);
5478 return;
5479 }
5480 if (end >= llen) end = llen-1;
5481
5482 /* increment start and end because zsl*Rank functions
5483 * use 1-based rank */
5484 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5485 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5486 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5487 server.dirty += deleted;
5488 addReplyLong(c, deleted);
5489 }
5490
/* One input of zunionInterGenericCommand(): the hash table of a source
 * sorted set (NULL when the key does not exist) and the factor its scores
 * are multiplied by. */
typedef struct {
    dict *dict;
    double weight;
} zsetopsrc;
5495
5496 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5497 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5498 unsigned long size1, size2;
5499 size1 = d1->dict ? dictSize(d1->dict) : 0;
5500 size2 = d2->dict ? dictSize(d2->dict) : 0;
5501 return size1 - size2;
5502 }
5503
/* Score aggregation modes accepted by zunionInterAggregate(), selected with
 * the "aggregate" option ("sum", "min", "max") of the union/intersection
 * commands. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5507
5508 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5509 if (aggregate == REDIS_AGGR_SUM) {
5510 *target = *target + val;
5511 } else if (aggregate == REDIS_AGGR_MIN) {
5512 *target = val < *target ? val : *target;
5513 } else if (aggregate == REDIS_AGGR_MAX) {
5514 *target = val > *target ? val : *target;
5515 } else {
5516 /* safety net */
5517 redisAssert(0 != 0);
5518 }
5519 }
5520
5521 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5522 int i, j, zsetnum;
5523 int aggregate = REDIS_AGGR_SUM;
5524 zsetopsrc *src;
5525 robj *dstobj;
5526 zset *dstzset;
5527 dictIterator *di;
5528 dictEntry *de;
5529
5530 /* expect zsetnum input keys to be given */
5531 zsetnum = atoi(c->argv[2]->ptr);
5532 if (zsetnum < 1) {
5533 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5534 return;
5535 }
5536
5537 /* test if the expected number of keys would overflow */
5538 if (3+zsetnum > c->argc) {
5539 addReply(c,shared.syntaxerr);
5540 return;
5541 }
5542
5543 /* read keys to be used for input */
5544 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5545 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5546 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5547 if (!zsetobj) {
5548 src[i].dict = NULL;
5549 } else {
5550 if (zsetobj->type != REDIS_ZSET) {
5551 zfree(src);
5552 addReply(c,shared.wrongtypeerr);
5553 return;
5554 }
5555 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5556 }
5557
5558 /* default all weights to 1 */
5559 src[i].weight = 1.0;
5560 }
5561
5562 /* parse optional extra arguments */
5563 if (j < c->argc) {
5564 int remaining = c->argc - j;
5565
5566 while (remaining) {
5567 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5568 j++; remaining--;
5569 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5570 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5571 }
5572 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5573 j++; remaining--;
5574 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5575 aggregate = REDIS_AGGR_SUM;
5576 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5577 aggregate = REDIS_AGGR_MIN;
5578 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5579 aggregate = REDIS_AGGR_MAX;
5580 } else {
5581 zfree(src);
5582 addReply(c,shared.syntaxerr);
5583 return;
5584 }
5585 j++; remaining--;
5586 } else {
5587 zfree(src);
5588 addReply(c,shared.syntaxerr);
5589 return;
5590 }
5591 }
5592 }
5593
5594 /* sort sets from the smallest to largest, this will improve our
5595 * algorithm's performance */
5596 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5597
5598 dstobj = createZsetObject();
5599 dstzset = dstobj->ptr;
5600
5601 if (op == REDIS_OP_INTER) {
5602 /* skip going over all entries if the smallest zset is NULL or empty */
5603 if (src[0].dict && dictSize(src[0].dict) > 0) {
5604 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5605 * from small to large, all src[i > 0].dict are non-empty too */
5606 di = dictGetIterator(src[0].dict);
5607 while((de = dictNext(di)) != NULL) {
5608 double *score = zmalloc(sizeof(double)), value;
5609 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5610
5611 for (j = 1; j < zsetnum; j++) {
5612 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5613 if (other) {
5614 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5615 zunionInterAggregate(score, value, aggregate);
5616 } else {
5617 break;
5618 }
5619 }
5620
5621 /* skip entry when not present in every source dict */
5622 if (j != zsetnum) {
5623 zfree(score);
5624 } else {
5625 robj *o = dictGetEntryKey(de);
5626 dictAdd(dstzset->dict,o,score);
5627 incrRefCount(o); /* added to dictionary */
5628 zslInsert(dstzset->zsl,*score,o);
5629 incrRefCount(o); /* added to skiplist */
5630 }
5631 }
5632 dictReleaseIterator(di);
5633 }
5634 } else if (op == REDIS_OP_UNION) {
5635 for (i = 0; i < zsetnum; i++) {
5636 if (!src[i].dict) continue;
5637
5638 di = dictGetIterator(src[i].dict);
5639 while((de = dictNext(di)) != NULL) {
5640 /* skip key when already processed */
5641 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5642
5643 double *score = zmalloc(sizeof(double)), value;
5644 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5645
5646 /* because the zsets are sorted by size, its only possible
5647 * for sets at larger indices to hold this entry */
5648 for (j = (i+1); j < zsetnum; j++) {
5649 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5650 if (other) {
5651 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5652 zunionInterAggregate(score, value, aggregate);
5653 }
5654 }
5655
5656 robj *o = dictGetEntryKey(de);
5657 dictAdd(dstzset->dict,o,score);
5658 incrRefCount(o); /* added to dictionary */
5659 zslInsert(dstzset->zsl,*score,o);
5660 incrRefCount(o); /* added to skiplist */
5661 }
5662 dictReleaseIterator(di);
5663 }
5664 } else {
5665 /* unknown operator */
5666 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5667 }
5668
5669 deleteKey(c->db,dstkey);
5670 if (dstzset->zsl->length) {
5671 dictAdd(c->db->dict,dstkey,dstobj);
5672 incrRefCount(dstkey);
5673 addReplyLong(c, dstzset->zsl->length);
5674 server.dirty++;
5675 } else {
5676 decrRefCount(dstobj);
5677 addReply(c, shared.czero);
5678 }
5679 zfree(src);
5680 }
5681
5682 static void zunionCommand(redisClient *c) {
5683 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5684 }
5685
5686 static void zinterCommand(redisClient *c) {
5687 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5688 }
5689
5690 static void zrangeGenericCommand(redisClient *c, int reverse) {
5691 robj *o;
5692 int start = atoi(c->argv[2]->ptr);
5693 int end = atoi(c->argv[3]->ptr);
5694 int withscores = 0;
5695 int llen;
5696 int rangelen, j;
5697 zset *zsetobj;
5698 zskiplist *zsl;
5699 zskiplistNode *ln;
5700 robj *ele;
5701
5702 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5703 withscores = 1;
5704 } else if (c->argc >= 5) {
5705 addReply(c,shared.syntaxerr);
5706 return;
5707 }
5708
5709 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5710 || checkType(c,o,REDIS_ZSET)) return;
5711 zsetobj = o->ptr;
5712 zsl = zsetobj->zsl;
5713 llen = zsl->length;
5714
5715 /* convert negative indexes */
5716 if (start < 0) start = llen+start;
5717 if (end < 0) end = llen+end;
5718 if (start < 0) start = 0;
5719 if (end < 0) end = 0;
5720
5721 /* indexes sanity checks */
5722 if (start > end || start >= llen) {
5723 /* Out of range start or start > end result in empty list */
5724 addReply(c,shared.emptymultibulk);
5725 return;
5726 }
5727 if (end >= llen) end = llen-1;
5728 rangelen = (end-start)+1;
5729
5730 /* check if starting point is trivial, before searching
5731 * the element in log(N) time */
5732 if (reverse) {
5733 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5734 } else {
5735 ln = start == 0 ?
5736 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5737 }
5738
5739 /* Return the result in form of a multi-bulk reply */
5740 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5741 withscores ? (rangelen*2) : rangelen));
5742 for (j = 0; j < rangelen; j++) {
5743 ele = ln->obj;
5744 addReplyBulk(c,ele);
5745 if (withscores)
5746 addReplyDouble(c,ln->score);
5747 ln = reverse ? ln->backward : ln->forward[0];
5748 }
5749 }
5750
5751 static void zrangeCommand(redisClient *c) {
5752 zrangeGenericCommand(c,0);
5753 }
5754
5755 static void zrevrangeCommand(redisClient *c) {
5756 zrangeGenericCommand(c,1);
5757 }
5758
5759 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5760 * If justcount is non-zero, just the count is returned. */
5761 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5762 robj *o;
5763 double min, max;
5764 int minex = 0, maxex = 0; /* are min or max exclusive? */
5765 int offset = 0, limit = -1;
5766 int withscores = 0;
5767 int badsyntax = 0;
5768
5769 /* Parse the min-max interval. If one of the values is prefixed
5770 * by the "(" character, it's considered "open". For instance
5771 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5772 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5773 if (((char*)c->argv[2]->ptr)[0] == '(') {
5774 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5775 minex = 1;
5776 } else {
5777 min = strtod(c->argv[2]->ptr,NULL);
5778 }
5779 if (((char*)c->argv[3]->ptr)[0] == '(') {
5780 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5781 maxex = 1;
5782 } else {
5783 max = strtod(c->argv[3]->ptr,NULL);
5784 }
5785
5786 /* Parse "WITHSCORES": note that if the command was called with
5787 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5788 * enter the following paths to parse WITHSCORES and LIMIT. */
5789 if (c->argc == 5 || c->argc == 8) {
5790 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5791 withscores = 1;
5792 else
5793 badsyntax = 1;
5794 }
5795 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5796 badsyntax = 1;
5797 if (badsyntax) {
5798 addReplySds(c,
5799 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5800 return;
5801 }
5802
5803 /* Parse "LIMIT" */
5804 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5805 addReply(c,shared.syntaxerr);
5806 return;
5807 } else if (c->argc == (7 + withscores)) {
5808 offset = atoi(c->argv[5]->ptr);
5809 limit = atoi(c->argv[6]->ptr);
5810 if (offset < 0) offset = 0;
5811 }
5812
5813 /* Ok, lookup the key and get the range */
5814 o = lookupKeyRead(c->db,c->argv[1]);
5815 if (o == NULL) {
5816 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5817 } else {
5818 if (o->type != REDIS_ZSET) {
5819 addReply(c,shared.wrongtypeerr);
5820 } else {
5821 zset *zsetobj = o->ptr;
5822 zskiplist *zsl = zsetobj->zsl;
5823 zskiplistNode *ln;
5824 robj *ele, *lenobj = NULL;
5825 unsigned long rangelen = 0;
5826
5827 /* Get the first node with the score >= min, or with
5828 * score > min if 'minex' is true. */
5829 ln = zslFirstWithScore(zsl,min);
5830 while (minex && ln && ln->score == min) ln = ln->forward[0];
5831
5832 if (ln == NULL) {
5833 /* No element matching the speciifed interval */
5834 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5835 return;
5836 }
5837
5838 /* We don't know in advance how many matching elements there
5839 * are in the list, so we push this object that will represent
5840 * the multi-bulk length in the output buffer, and will "fix"
5841 * it later */
5842 if (!justcount) {
5843 lenobj = createObject(REDIS_STRING,NULL);
5844 addReply(c,lenobj);
5845 decrRefCount(lenobj);
5846 }
5847
5848 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5849 if (offset) {
5850 offset--;
5851 ln = ln->forward[0];
5852 continue;
5853 }
5854 if (limit == 0) break;
5855 if (!justcount) {
5856 ele = ln->obj;
5857 addReplyBulk(c,ele);
5858 if (withscores)
5859 addReplyDouble(c,ln->score);
5860 }
5861 ln = ln->forward[0];
5862 rangelen++;
5863 if (limit > 0) limit--;
5864 }
5865 if (justcount) {
5866 addReplyLong(c,(long)rangelen);
5867 } else {
5868 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5869 withscores ? (rangelen*2) : rangelen);
5870 }
5871 }
5872 }
5873 }
5874
5875 static void zrangebyscoreCommand(redisClient *c) {
5876 genericZrangebyscoreCommand(c,0);
5877 }
5878
5879 static void zcountCommand(redisClient *c) {
5880 genericZrangebyscoreCommand(c,1);
5881 }
5882
5883 static void zcardCommand(redisClient *c) {
5884 robj *o;
5885 zset *zs;
5886
5887 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5888 checkType(c,o,REDIS_ZSET)) return;
5889
5890 zs = o->ptr;
5891 addReplyUlong(c,zs->zsl->length);
5892 }
5893
5894 static void zscoreCommand(redisClient *c) {
5895 robj *o;
5896 zset *zs;
5897 dictEntry *de;
5898
5899 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5900 checkType(c,o,REDIS_ZSET)) return;
5901
5902 zs = o->ptr;
5903 de = dictFind(zs->dict,c->argv[2]);
5904 if (!de) {
5905 addReply(c,shared.nullbulk);
5906 } else {
5907 double *score = dictGetEntryVal(de);
5908
5909 addReplyDouble(c,*score);
5910 }
5911 }
5912
5913 static void zrankGenericCommand(redisClient *c, int reverse) {
5914 robj *o;
5915 zset *zs;
5916 zskiplist *zsl;
5917 dictEntry *de;
5918 unsigned long rank;
5919 double *score;
5920
5921 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5922 checkType(c,o,REDIS_ZSET)) return;
5923
5924 zs = o->ptr;
5925 zsl = zs->zsl;
5926 de = dictFind(zs->dict,c->argv[2]);
5927 if (!de) {
5928 addReply(c,shared.nullbulk);
5929 return;
5930 }
5931
5932 score = dictGetEntryVal(de);
5933 rank = zslGetRank(zsl, *score, c->argv[2]);
5934 if (rank) {
5935 if (reverse) {
5936 addReplyLong(c, zsl->length - rank);
5937 } else {
5938 addReplyLong(c, rank-1);
5939 }
5940 } else {
5941 addReply(c,shared.nullbulk);
5942 }
5943 }
5944
5945 static void zrankCommand(redisClient *c) {
5946 zrankGenericCommand(c, 0);
5947 }
5948
5949 static void zrevrankCommand(redisClient *c) {
5950 zrankGenericCommand(c, 1);
5951 }
5952
5953 /* =================================== Hashes =============================== */
5954 static void hsetCommand(redisClient *c) {
5955 int update = 0;
5956 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5957
5958 if (o == NULL) {
5959 o = createHashObject();
5960 dictAdd(c->db->dict,c->argv[1],o);
5961 incrRefCount(c->argv[1]);
5962 } else {
5963 if (o->type != REDIS_HASH) {
5964 addReply(c,shared.wrongtypeerr);
5965 return;
5966 }
5967 }
5968 /* We want to convert the zipmap into an hash table right now if the
5969 * entry to be added is too big. Note that we check if the object
5970 * is integer encoded before to try fetching the length in the test below.
5971 * This is because integers are small, but currently stringObjectLen()
5972 * performs a slow conversion: not worth it. */
5973 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5974 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5975 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5976 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5977 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5978 {
5979 convertToRealHash(o);
5980 }
5981
5982 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5983 unsigned char *zm = o->ptr;
5984 robj *valobj = getDecodedObject(c->argv[3]);
5985
5986 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5987 valobj->ptr,sdslen(valobj->ptr),&update);
5988 decrRefCount(valobj);
5989 o->ptr = zm;
5990
5991 /* And here there is the second check for hash conversion. */
5992 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
5993 convertToRealHash(o);
5994 } else {
5995 c->argv[2] = tryObjectEncoding(c->argv[2]);
5996 /* note that c->argv[3] is already encoded, as the latest arg
5997 * of a bulk command is always integer encoded if possible. */
5998 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5999 incrRefCount(c->argv[2]);
6000 } else {
6001 update = 1;
6002 }
6003 incrRefCount(c->argv[3]);
6004 }
6005 server.dirty++;
6006 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6007 }
6008
6009 static void hmsetCommand(redisClient *c) {
6010 int i;
6011 robj *o, *key, *val;
6012
6013 if ((c->argc % 2) == 1) {
6014 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6015 return;
6016 }
6017
6018 if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
6019 o = createHashObject();
6020 dictAdd(c->db->dict,c->argv[1],o);
6021 incrRefCount(c->argv[1]);
6022 } else {
6023 if (o->type != REDIS_HASH) {
6024 addReply(c,shared.wrongtypeerr);
6025 return;
6026 }
6027 }
6028
6029 /* We want to convert the zipmap into an hash table right now if the
6030 * entry to be added is too big. */
6031 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6032 for (i = 2; i < c->argc; i+=2) {
6033 if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
6034 sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
6035 (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
6036 sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
6037 convertToRealHash(o);
6038 break;
6039 }
6040 }
6041 }
6042
6043 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6044 unsigned char *zm = o->ptr;
6045
6046 for (i = 2; i < c->argc; i+=2) {
6047 key = getDecodedObject(c->argv[i]);
6048 val = getDecodedObject(c->argv[i+1]);
6049 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
6050 val->ptr,sdslen(val->ptr),NULL);
6051 decrRefCount(key);
6052 decrRefCount(val);
6053 o->ptr = zm;
6054 }
6055
6056 /* And here there is the second check for hash conversion. */
6057 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6058 convertToRealHash(o);
6059 } else {
6060 for (i = 2; i < c->argc; i+=2) {
6061 key = tryObjectEncoding(c->argv[i]);
6062 val = tryObjectEncoding(c->argv[i+1]);
6063 if (dictReplace(o->ptr,key,val)) {
6064 incrRefCount(key);
6065 }
6066 incrRefCount(val);
6067 }
6068 }
6069
6070 addReply(c, shared.ok);
6071 }
6072
6073 static void hincrbyCommand(redisClient *c) {
6074 long long value = 0, incr = 0;
6075 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6076
6077 if (o == NULL) {
6078 o = createHashObject();
6079 dictAdd(c->db->dict,c->argv[1],o);
6080 incrRefCount(c->argv[1]);
6081 } else {
6082 if (o->type != REDIS_HASH) {
6083 addReply(c,shared.wrongtypeerr);
6084 return;
6085 }
6086 }
6087
6088 incr = strtoll(c->argv[3]->ptr, NULL, 10);
6089 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6090 unsigned char *zm = o->ptr;
6091 unsigned char *zval;
6092 unsigned int zvlen;
6093
6094 /* Find value if already present in hash */
6095 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6096 &zval,&zvlen)) {
6097 /* strtoll needs the char* to have a trailing \0, but
6098 * the zipmap doesn't include them. */
6099 sds szval = sdsnewlen(zval, zvlen);
6100 value = strtoll(szval,NULL,10);
6101 sdsfree(szval);
6102 }
6103
6104 value += incr;
6105 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6106 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6107 (unsigned char*)svalue,sdslen(svalue),NULL);
6108 sdsfree(svalue);
6109 o->ptr = zm;
6110
6111 /* Check if the zipmap needs to be converted. */
6112 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6113 convertToRealHash(o);
6114 } else {
6115 robj *hval;
6116 dictEntry *de;
6117
6118 /* Find value if already present in hash */
6119 de = dictFind(o->ptr,c->argv[2]);
6120 if (de != NULL) {
6121 hval = dictGetEntryVal(de);
6122 if (hval->encoding == REDIS_ENCODING_RAW)
6123 value = strtoll(hval->ptr,NULL,10);
6124 else if (hval->encoding == REDIS_ENCODING_INT)
6125 value = (long)hval->ptr;
6126 else
6127 redisAssert(1 != 1);
6128 }
6129
6130 value += incr;
6131 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6132 hval = tryObjectEncoding(hval);
6133 if (dictReplace(o->ptr,c->argv[2],hval)) {
6134 incrRefCount(c->argv[2]);
6135 }
6136 }
6137
6138 server.dirty++;
6139 addReplyLongLong(c, value);
6140 }
6141
6142 static void hgetCommand(redisClient *c) {
6143 robj *o;
6144
6145 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6146 checkType(c,o,REDIS_HASH)) return;
6147
6148 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6149 unsigned char *zm = o->ptr;
6150 unsigned char *val;
6151 unsigned int vlen;
6152 robj *field;
6153
6154 field = getDecodedObject(c->argv[2]);
6155 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6156 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6157 addReplySds(c,sdsnewlen(val,vlen));
6158 addReply(c,shared.crlf);
6159 decrRefCount(field);
6160 return;
6161 } else {
6162 addReply(c,shared.nullbulk);
6163 decrRefCount(field);
6164 return;
6165 }
6166 } else {
6167 struct dictEntry *de;
6168
6169 de = dictFind(o->ptr,c->argv[2]);
6170 if (de == NULL) {
6171 addReply(c,shared.nullbulk);
6172 } else {
6173 robj *e = dictGetEntryVal(de);
6174
6175 addReplyBulk(c,e);
6176 }
6177 }
6178 }
6179
6180 static void hdelCommand(redisClient *c) {
6181 robj *o;
6182 int deleted = 0;
6183
6184 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6185 checkType(c,o,REDIS_HASH)) return;
6186
6187 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6188 robj *field = getDecodedObject(c->argv[2]);
6189
6190 o->ptr = zipmapDel((unsigned char*) o->ptr,
6191 (unsigned char*) field->ptr,
6192 sdslen(field->ptr), &deleted);
6193 decrRefCount(field);
6194 if (zipmapLen((unsigned char*) o->ptr) == 0)
6195 deleteKey(c->db,c->argv[1]);
6196 } else {
6197 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6198 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6199 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6200 }
6201 if (deleted) server.dirty++;
6202 addReply(c,deleted ? shared.cone : shared.czero);
6203 }
6204
6205 static void hlenCommand(redisClient *c) {
6206 robj *o;
6207 unsigned long len;
6208
6209 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6210 checkType(c,o,REDIS_HASH)) return;
6211
6212 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6213 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6214 addReplyUlong(c,len);
6215 }
6216
6217 #define REDIS_GETALL_KEYS 1
6218 #define REDIS_GETALL_VALS 2
6219 static void genericHgetallCommand(redisClient *c, int flags) {
6220 robj *o, *lenobj;
6221 unsigned long count = 0;
6222
6223 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6224 || checkType(c,o,REDIS_HASH)) return;
6225
6226 lenobj = createObject(REDIS_STRING,NULL);
6227 addReply(c,lenobj);
6228 decrRefCount(lenobj);
6229
6230 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6231 unsigned char *p = zipmapRewind(o->ptr);
6232 unsigned char *field, *val;
6233 unsigned int flen, vlen;
6234
6235 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6236 robj *aux;
6237
6238 if (flags & REDIS_GETALL_KEYS) {
6239 aux = createStringObject((char*)field,flen);
6240 addReplyBulk(c,aux);
6241 decrRefCount(aux);
6242 count++;
6243 }
6244 if (flags & REDIS_GETALL_VALS) {
6245 aux = createStringObject((char*)val,vlen);
6246 addReplyBulk(c,aux);
6247 decrRefCount(aux);
6248 count++;
6249 }
6250 }
6251 } else {
6252 dictIterator *di = dictGetIterator(o->ptr);
6253 dictEntry *de;
6254
6255 while((de = dictNext(di)) != NULL) {
6256 robj *fieldobj = dictGetEntryKey(de);
6257 robj *valobj = dictGetEntryVal(de);
6258
6259 if (flags & REDIS_GETALL_KEYS) {
6260 addReplyBulk(c,fieldobj);
6261 count++;
6262 }
6263 if (flags & REDIS_GETALL_VALS) {
6264 addReplyBulk(c,valobj);
6265 count++;
6266 }
6267 }
6268 dictReleaseIterator(di);
6269 }
6270 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6271 }
6272
6273 static void hkeysCommand(redisClient *c) {
6274 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6275 }
6276
6277 static void hvalsCommand(redisClient *c) {
6278 genericHgetallCommand(c,REDIS_GETALL_VALS);
6279 }
6280
6281 static void hgetallCommand(redisClient *c) {
6282 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6283 }
6284
6285 static void hexistsCommand(redisClient *c) {
6286 robj *o;
6287 int exists = 0;
6288
6289 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6290 checkType(c,o,REDIS_HASH)) return;
6291
6292 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6293 robj *field;
6294 unsigned char *zm = o->ptr;
6295
6296 field = getDecodedObject(c->argv[2]);
6297 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6298 decrRefCount(field);
6299 } else {
6300 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6301 }
6302 addReply(c,exists ? shared.cone : shared.czero);
6303 }
6304
6305 static void convertToRealHash(robj *o) {
6306 unsigned char *key, *val, *p, *zm = o->ptr;
6307 unsigned int klen, vlen;
6308 dict *dict = dictCreate(&hashDictType,NULL);
6309
6310 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6311 p = zipmapRewind(zm);
6312 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6313 robj *keyobj, *valobj;
6314
6315 keyobj = createStringObject((char*)key,klen);
6316 valobj = createStringObject((char*)val,vlen);
6317 keyobj = tryObjectEncoding(keyobj);
6318 valobj = tryObjectEncoding(valobj);
6319 dictAdd(dict,keyobj,valobj);
6320 }
6321 o->encoding = REDIS_ENCODING_HT;
6322 o->ptr = dict;
6323 zfree(zm);
6324 }
6325
6326 /* ========================= Non type-specific commands ==================== */
6327
6328 static void flushdbCommand(redisClient *c) {
6329 server.dirty += dictSize(c->db->dict);
6330 dictEmpty(c->db->dict);
6331 dictEmpty(c->db->expires);
6332 addReply(c,shared.ok);
6333 }
6334
6335 static void flushallCommand(redisClient *c) {
6336 server.dirty += emptyDb();
6337 addReply(c,shared.ok);
6338 if (server.bgsavechildpid != -1) {
6339 kill(server.bgsavechildpid,SIGKILL);
6340 rdbRemoveTempFile(server.bgsavechildpid);
6341 }
6342 rdbSave(server.dbfilename);
6343 server.dirty++;
6344 }
6345
6346 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6347 redisSortOperation *so = zmalloc(sizeof(*so));
6348 so->type = type;
6349 so->pattern = pattern;
6350 return so;
6351 }
6352
6353 /* Return the value associated to the key with a name obtained
6354 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6355 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6356 char *p;
6357 sds spat, ssub;
6358 robj keyobj;
6359 int prefixlen, sublen, postfixlen;
6360 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6361 struct {
6362 long len;
6363 long free;
6364 char buf[REDIS_SORTKEY_MAX+1];
6365 } keyname;
6366
6367 /* If the pattern is "#" return the substitution object itself in order
6368 * to implement the "SORT ... GET #" feature. */
6369 spat = pattern->ptr;
6370 if (spat[0] == '#' && spat[1] == '\0') {
6371 return subst;
6372 }
6373
6374 /* The substitution object may be specially encoded. If so we create
6375 * a decoded object on the fly. Otherwise getDecodedObject will just
6376 * increment the ref count, that we'll decrement later. */
6377 subst = getDecodedObject(subst);
6378
6379 ssub = subst->ptr;
6380 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6381 p = strchr(spat,'*');
6382 if (!p) {
6383 decrRefCount(subst);
6384 return NULL;
6385 }
6386
6387 prefixlen = p-spat;
6388 sublen = sdslen(ssub);
6389 postfixlen = sdslen(spat)-(prefixlen+1);
6390 memcpy(keyname.buf,spat,prefixlen);
6391 memcpy(keyname.buf+prefixlen,ssub,sublen);
6392 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6393 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6394 keyname.len = prefixlen+sublen+postfixlen;
6395
6396 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6397 decrRefCount(subst);
6398
6399 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6400 return lookupKeyRead(db,&keyobj);
6401 }
6402
6403 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6404 * the additional parameter is not standard but a BSD-specific we have to
6405 * pass sorting parameters via the global 'server' structure */
6406 static int sortCompare(const void *s1, const void *s2) {
6407 const redisSortObject *so1 = s1, *so2 = s2;
6408 int cmp;
6409
6410 if (!server.sort_alpha) {
6411 /* Numeric sorting. Here it's trivial as we precomputed scores */
6412 if (so1->u.score > so2->u.score) {
6413 cmp = 1;
6414 } else if (so1->u.score < so2->u.score) {
6415 cmp = -1;
6416 } else {
6417 cmp = 0;
6418 }
6419 } else {
6420 /* Alphanumeric sorting */
6421 if (server.sort_bypattern) {
6422 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6423 /* At least one compare object is NULL */
6424 if (so1->u.cmpobj == so2->u.cmpobj)
6425 cmp = 0;
6426 else if (so1->u.cmpobj == NULL)
6427 cmp = -1;
6428 else
6429 cmp = 1;
6430 } else {
6431 /* We have both the objects, use strcoll */
6432 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6433 }
6434 } else {
6435 /* Compare elements directly */
6436 robj *dec1, *dec2;
6437
6438 dec1 = getDecodedObject(so1->obj);
6439 dec2 = getDecodedObject(so2->obj);
6440 cmp = strcoll(dec1->ptr,dec2->ptr);
6441 decrRefCount(dec1);
6442 decrRefCount(dec2);
6443 }
6444 }
6445 return server.sort_desc ? -cmp : cmp;
6446 }
6447
6448 /* The SORT command is the most complex command in Redis. Warning: this code
6449 * is optimized for speed and a bit less for readability */
6450 static void sortCommand(redisClient *c) {
6451 list *operations;
6452 int outputlen = 0;
6453 int desc = 0, alpha = 0;
6454 int limit_start = 0, limit_count = -1, start, end;
6455 int j, dontsort = 0, vectorlen;
6456 int getop = 0; /* GET operation counter */
6457 robj *sortval, *sortby = NULL, *storekey = NULL;
6458 redisSortObject *vector; /* Resulting vector to sort */
6459
6460 /* Lookup the key to sort. It must be of the right types */
6461 sortval = lookupKeyRead(c->db,c->argv[1]);
6462 if (sortval == NULL) {
6463 addReply(c,shared.emptymultibulk);
6464 return;
6465 }
6466 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6467 sortval->type != REDIS_ZSET)
6468 {
6469 addReply(c,shared.wrongtypeerr);
6470 return;
6471 }
6472
6473 /* Create a list of operations to perform for every sorted element.
6474 * Operations can be GET/DEL/INCR/DECR */
6475 operations = listCreate();
6476 listSetFreeMethod(operations,zfree);
6477 j = 2;
6478
6479 /* Now we need to protect sortval incrementing its count, in the future
6480 * SORT may have options able to overwrite/delete keys during the sorting
6481 * and the sorted key itself may get destroied */
6482 incrRefCount(sortval);
6483
6484 /* The SORT command has an SQL-alike syntax, parse it */
6485 while(j < c->argc) {
6486 int leftargs = c->argc-j-1;
6487 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6488 desc = 0;
6489 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6490 desc = 1;
6491 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6492 alpha = 1;
6493 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6494 limit_start = atoi(c->argv[j+1]->ptr);
6495 limit_count = atoi(c->argv[j+2]->ptr);
6496 j+=2;
6497 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6498 storekey = c->argv[j+1];
6499 j++;
6500 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6501 sortby = c->argv[j+1];
6502 /* If the BY pattern does not contain '*', i.e. it is constant,
6503 * we don't need to sort nor to lookup the weight keys. */
6504 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6505 j++;
6506 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6507 listAddNodeTail(operations,createSortOperation(
6508 REDIS_SORT_GET,c->argv[j+1]));
6509 getop++;
6510 j++;
6511 } else {
6512 decrRefCount(sortval);
6513 listRelease(operations);
6514 addReply(c,shared.syntaxerr);
6515 return;
6516 }
6517 j++;
6518 }
6519
6520 /* Load the sorting vector with all the objects to sort */
6521 switch(sortval->type) {
6522 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6523 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6524 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6525 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6526 }
6527 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6528 j = 0;
6529
6530 if (sortval->type == REDIS_LIST) {
6531 list *list = sortval->ptr;
6532 listNode *ln;
6533 listIter li;
6534
6535 listRewind(list,&li);
6536 while((ln = listNext(&li))) {
6537 robj *ele = ln->value;
6538 vector[j].obj = ele;
6539 vector[j].u.score = 0;
6540 vector[j].u.cmpobj = NULL;
6541 j++;
6542 }
6543 } else {
6544 dict *set;
6545 dictIterator *di;
6546 dictEntry *setele;
6547
6548 if (sortval->type == REDIS_SET) {
6549 set = sortval->ptr;
6550 } else {
6551 zset *zs = sortval->ptr;
6552 set = zs->dict;
6553 }
6554
6555 di = dictGetIterator(set);
6556 while((setele = dictNext(di)) != NULL) {
6557 vector[j].obj = dictGetEntryKey(setele);
6558 vector[j].u.score = 0;
6559 vector[j].u.cmpobj = NULL;
6560 j++;
6561 }
6562 dictReleaseIterator(di);
6563 }
6564 redisAssert(j == vectorlen);
6565
6566 /* Now it's time to load the right scores in the sorting vector */
6567 if (dontsort == 0) {
6568 for (j = 0; j < vectorlen; j++) {
6569 if (sortby) {
6570 robj *byval;
6571
6572 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6573 if (!byval || byval->type != REDIS_STRING) continue;
6574 if (alpha) {
6575 vector[j].u.cmpobj = getDecodedObject(byval);
6576 } else {
6577 if (byval->encoding == REDIS_ENCODING_RAW) {
6578 vector[j].u.score = strtod(byval->ptr,NULL);
6579 } else {
6580 /* Don't need to decode the object if it's
6581 * integer-encoded (the only encoding supported) so
6582 * far. We can just cast it */
6583 if (byval->encoding == REDIS_ENCODING_INT) {
6584 vector[j].u.score = (long)byval->ptr;
6585 } else
6586 redisAssert(1 != 1);
6587 }
6588 }
6589 } else {
6590 if (!alpha) {
6591 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6592 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6593 else {
6594 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6595 vector[j].u.score = (long) vector[j].obj->ptr;
6596 else
6597 redisAssert(1 != 1);
6598 }
6599 }
6600 }
6601 }
6602 }
6603
6604 /* We are ready to sort the vector... perform a bit of sanity check
6605 * on the LIMIT option too. We'll use a partial version of quicksort. */
6606 start = (limit_start < 0) ? 0 : limit_start;
6607 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6608 if (start >= vectorlen) {
6609 start = vectorlen-1;
6610 end = vectorlen-2;
6611 }
6612 if (end >= vectorlen) end = vectorlen-1;
6613
6614 if (dontsort == 0) {
6615 server.sort_desc = desc;
6616 server.sort_alpha = alpha;
6617 server.sort_bypattern = sortby ? 1 : 0;
6618 if (sortby && (start != 0 || end != vectorlen-1))
6619 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6620 else
6621 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6622 }
6623
6624 /* Send command output to the output buffer, performing the specified
6625 * GET/DEL/INCR/DECR operations if any. */
6626 outputlen = getop ? getop*(end-start+1) : end-start+1;
6627 if (storekey == NULL) {
6628 /* STORE option not specified, sent the sorting result to client */
6629 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6630 for (j = start; j <= end; j++) {
6631 listNode *ln;
6632 listIter li;
6633
6634 if (!getop) addReplyBulk(c,vector[j].obj);
6635 listRewind(operations,&li);
6636 while((ln = listNext(&li))) {
6637 redisSortOperation *sop = ln->value;
6638 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6639 vector[j].obj);
6640
6641 if (sop->type == REDIS_SORT_GET) {
6642 if (!val || val->type != REDIS_STRING) {
6643 addReply(c,shared.nullbulk);
6644 } else {
6645 addReplyBulk(c,val);
6646 }
6647 } else {
6648 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6649 }
6650 }
6651 }
6652 } else {
6653 robj *listObject = createListObject();
6654 list *listPtr = (list*) listObject->ptr;
6655
6656 /* STORE option specified, set the sorting result as a List object */
6657 for (j = start; j <= end; j++) {
6658 listNode *ln;
6659 listIter li;
6660
6661 if (!getop) {
6662 listAddNodeTail(listPtr,vector[j].obj);
6663 incrRefCount(vector[j].obj);
6664 }
6665 listRewind(operations,&li);
6666 while((ln = listNext(&li))) {
6667 redisSortOperation *sop = ln->value;
6668 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6669 vector[j].obj);
6670
6671 if (sop->type == REDIS_SORT_GET) {
6672 if (!val || val->type != REDIS_STRING) {
6673 listAddNodeTail(listPtr,createStringObject("",0));
6674 } else {
6675 listAddNodeTail(listPtr,val);
6676 incrRefCount(val);
6677 }
6678 } else {
6679 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6680 }
6681 }
6682 }
6683 if (dictReplace(c->db->dict,storekey,listObject)) {
6684 incrRefCount(storekey);
6685 }
6686 /* Note: we add 1 because the DB is dirty anyway since even if the
6687 * SORT result is empty a new key is set and maybe the old content
6688 * replaced. */
6689 server.dirty += 1+outputlen;
6690 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6691 }
6692
6693 /* Cleanup */
6694 decrRefCount(sortval);
6695 listRelease(operations);
6696 for (j = 0; j < vectorlen; j++) {
6697 if (sortby && alpha && vector[j].u.cmpobj)
6698 decrRefCount(vector[j].u.cmpobj);
6699 }
6700 zfree(vector);
6701 }
6702
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * s: destination buffer, always written and NUL terminated on return.
 *    No bound is checked here; the longest output for a 64 bit 'n' is
 *    "16384.00P" (10 bytes with the terminator), so the caller must
 *    provide at least that much room.
 * n: number of bytes to represent.
 *
 * Amounts below 1024 are printed exactly with a "B" suffix, larger ones
 * with two decimals in the largest unit among K, M, G, T and P that
 * keeps the value >= 1. Every possible input is covered, so the buffer
 * is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    }
}
6723
6724 /* Create the string returned by the INFO command. This is decoupled
6725 * by the INFO command itself as we need to report the same information
6726 * on memory corruption problems. */
6727 static sds genRedisInfoString(void) {
6728 sds info;
6729 time_t uptime = time(NULL)-server.stat_starttime;
6730 int j;
6731 char hmem[64];
6732
6733 bytesToHuman(hmem,zmalloc_used_memory());
6734 info = sdscatprintf(sdsempty(),
6735 "redis_version:%s\r\n"
6736 "arch_bits:%s\r\n"
6737 "multiplexing_api:%s\r\n"
6738 "process_id:%ld\r\n"
6739 "uptime_in_seconds:%ld\r\n"
6740 "uptime_in_days:%ld\r\n"
6741 "connected_clients:%d\r\n"
6742 "connected_slaves:%d\r\n"
6743 "blocked_clients:%d\r\n"
6744 "used_memory:%zu\r\n"
6745 "used_memory_human:%s\r\n"
6746 "changes_since_last_save:%lld\r\n"
6747 "bgsave_in_progress:%d\r\n"
6748 "last_save_time:%ld\r\n"
6749 "bgrewriteaof_in_progress:%d\r\n"
6750 "total_connections_received:%lld\r\n"
6751 "total_commands_processed:%lld\r\n"
6752 "expired_keys:%lld\r\n"
6753 "hash_max_zipmap_entries:%ld\r\n"
6754 "hash_max_zipmap_value:%ld\r\n"
6755 "pubsub_channels:%ld\r\n"
6756 "pubsub_patterns:%u\r\n"
6757 "vm_enabled:%d\r\n"
6758 "role:%s\r\n"
6759 ,REDIS_VERSION,
6760 (sizeof(long) == 8) ? "64" : "32",
6761 aeGetApiName(),
6762 (long) getpid(),
6763 uptime,
6764 uptime/(3600*24),
6765 listLength(server.clients)-listLength(server.slaves),
6766 listLength(server.slaves),
6767 server.blpop_blocked_clients,
6768 zmalloc_used_memory(),
6769 hmem,
6770 server.dirty,
6771 server.bgsavechildpid != -1,
6772 server.lastsave,
6773 server.bgrewritechildpid != -1,
6774 server.stat_numconnections,
6775 server.stat_numcommands,
6776 server.stat_expiredkeys,
6777 server.hash_max_zipmap_entries,
6778 server.hash_max_zipmap_value,
6779 dictSize(server.pubsub_channels),
6780 listLength(server.pubsub_patterns),
6781 server.vm_enabled != 0,
6782 server.masterhost == NULL ? "master" : "slave"
6783 );
6784 if (server.masterhost) {
6785 info = sdscatprintf(info,
6786 "master_host:%s\r\n"
6787 "master_port:%d\r\n"
6788 "master_link_status:%s\r\n"
6789 "master_last_io_seconds_ago:%d\r\n"
6790 ,server.masterhost,
6791 server.masterport,
6792 (server.replstate == REDIS_REPL_CONNECTED) ?
6793 "up" : "down",
6794 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6795 );
6796 }
6797 if (server.vm_enabled) {
6798 lockThreadedIO();
6799 info = sdscatprintf(info,
6800 "vm_conf_max_memory:%llu\r\n"
6801 "vm_conf_page_size:%llu\r\n"
6802 "vm_conf_pages:%llu\r\n"
6803 "vm_stats_used_pages:%llu\r\n"
6804 "vm_stats_swapped_objects:%llu\r\n"
6805 "vm_stats_swappin_count:%llu\r\n"
6806 "vm_stats_swappout_count:%llu\r\n"
6807 "vm_stats_io_newjobs_len:%lu\r\n"
6808 "vm_stats_io_processing_len:%lu\r\n"
6809 "vm_stats_io_processed_len:%lu\r\n"
6810 "vm_stats_io_active_threads:%lu\r\n"
6811 "vm_stats_blocked_clients:%lu\r\n"
6812 ,(unsigned long long) server.vm_max_memory,
6813 (unsigned long long) server.vm_page_size,
6814 (unsigned long long) server.vm_pages,
6815 (unsigned long long) server.vm_stats_used_pages,
6816 (unsigned long long) server.vm_stats_swapped_objects,
6817 (unsigned long long) server.vm_stats_swapins,
6818 (unsigned long long) server.vm_stats_swapouts,
6819 (unsigned long) listLength(server.io_newjobs),
6820 (unsigned long) listLength(server.io_processing),
6821 (unsigned long) listLength(server.io_processed),
6822 (unsigned long) server.io_active_threads,
6823 (unsigned long) server.vm_blocked_clients
6824 );
6825 unlockThreadedIO();
6826 }
6827 for (j = 0; j < server.dbnum; j++) {
6828 long long keys, vkeys;
6829
6830 keys = dictSize(server.db[j].dict);
6831 vkeys = dictSize(server.db[j].expires);
6832 if (keys || vkeys) {
6833 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6834 j, keys, vkeys);
6835 }
6836 }
6837 return info;
6838 }
6839
6840 static void infoCommand(redisClient *c) {
6841 sds info = genRedisInfoString();
6842 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6843 (unsigned long)sdslen(info)));
6844 addReplySds(c,info);
6845 addReply(c,shared.crlf);
6846 }
6847
6848 static void monitorCommand(redisClient *c) {
6849 /* ignore MONITOR if aleady slave or in monitor mode */
6850 if (c->flags & REDIS_SLAVE) return;
6851
6852 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6853 c->slaveseldb = 0;
6854 listAddNodeTail(server.monitors,c);
6855 addReply(c,shared.ok);
6856 }
6857
6858 /* ================================= Expire ================================= */
6859 static int removeExpire(redisDb *db, robj *key) {
6860 if (dictDelete(db->expires,key) == DICT_OK) {
6861 return 1;
6862 } else {
6863 return 0;
6864 }
6865 }
6866
6867 static int setExpire(redisDb *db, robj *key, time_t when) {
6868 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6869 return 0;
6870 } else {
6871 incrRefCount(key);
6872 return 1;
6873 }
6874 }
6875
6876 /* Return the expire time of the specified key, or -1 if no expire
6877 * is associated with this key (i.e. the key is non volatile) */
6878 static time_t getExpire(redisDb *db, robj *key) {
6879 dictEntry *de;
6880
6881 /* No expire? return ASAP */
6882 if (dictSize(db->expires) == 0 ||
6883 (de = dictFind(db->expires,key)) == NULL) return -1;
6884
6885 return (time_t) dictGetEntryVal(de);
6886 }
6887
6888 static int expireIfNeeded(redisDb *db, robj *key) {
6889 time_t when;
6890 dictEntry *de;
6891
6892 /* No expire? return ASAP */
6893 if (dictSize(db->expires) == 0 ||
6894 (de = dictFind(db->expires,key)) == NULL) return 0;
6895
6896 /* Lookup the expire */
6897 when = (time_t) dictGetEntryVal(de);
6898 if (time(NULL) <= when) return 0;
6899
6900 /* Delete the key */
6901 dictDelete(db->expires,key);
6902 server.stat_expiredkeys++;
6903 return dictDelete(db->dict,key) == DICT_OK;
6904 }
6905
6906 static int deleteIfVolatile(redisDb *db, robj *key) {
6907 dictEntry *de;
6908
6909 /* No expire? return ASAP */
6910 if (dictSize(db->expires) == 0 ||
6911 (de = dictFind(db->expires,key)) == NULL) return 0;
6912
6913 /* Delete the key */
6914 server.dirty++;
6915 server.stat_expiredkeys++;
6916 dictDelete(db->expires,key);
6917 return dictDelete(db->dict,key) == DICT_OK;
6918 }
6919
6920 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6921 dictEntry *de;
6922
6923 de = dictFind(c->db->dict,key);
6924 if (de == NULL) {
6925 addReply(c,shared.czero);
6926 return;
6927 }
6928 if (seconds < 0) {
6929 if (deleteKey(c->db,key)) server.dirty++;
6930 addReply(c, shared.cone);
6931 return;
6932 } else {
6933 time_t when = time(NULL)+seconds;
6934 if (setExpire(c->db,key,when)) {
6935 addReply(c,shared.cone);
6936 server.dirty++;
6937 } else {
6938 addReply(c,shared.czero);
6939 }
6940 return;
6941 }
6942 }
6943
6944 static void expireCommand(redisClient *c) {
6945 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6946 }
6947
6948 static void expireatCommand(redisClient *c) {
6949 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6950 }
6951
6952 static void ttlCommand(redisClient *c) {
6953 time_t expire;
6954 int ttl = -1;
6955
6956 expire = getExpire(c->db,c->argv[1]);
6957 if (expire != -1) {
6958 ttl = (int) (expire-time(NULL));
6959 if (ttl < 0) ttl = -1;
6960 }
6961 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6962 }
6963
6964 /* ================================ MULTI/EXEC ============================== */
6965
6966 /* Client state initialization for MULTI/EXEC */
6967 static void initClientMultiState(redisClient *c) {
6968 c->mstate.commands = NULL;
6969 c->mstate.count = 0;
6970 }
6971
6972 /* Release all the resources associated with MULTI/EXEC state */
6973 static void freeClientMultiState(redisClient *c) {
6974 int j;
6975
6976 for (j = 0; j < c->mstate.count; j++) {
6977 int i;
6978 multiCmd *mc = c->mstate.commands+j;
6979
6980 for (i = 0; i < mc->argc; i++)
6981 decrRefCount(mc->argv[i]);
6982 zfree(mc->argv);
6983 }
6984 zfree(c->mstate.commands);
6985 }
6986
6987 /* Add a new command into the MULTI commands queue */
6988 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6989 multiCmd *mc;
6990 int j;
6991
6992 c->mstate.commands = zrealloc(c->mstate.commands,
6993 sizeof(multiCmd)*(c->mstate.count+1));
6994 mc = c->mstate.commands+c->mstate.count;
6995 mc->cmd = cmd;
6996 mc->argc = c->argc;
6997 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6998 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6999 for (j = 0; j < c->argc; j++)
7000 incrRefCount(mc->argv[j]);
7001 c->mstate.count++;
7002 }
7003
7004 static void multiCommand(redisClient *c) {
7005 c->flags |= REDIS_MULTI;
7006 addReply(c,shared.ok);
7007 }
7008
7009 static void discardCommand(redisClient *c) {
7010 if (!(c->flags & REDIS_MULTI)) {
7011 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7012 return;
7013 }
7014
7015 freeClientMultiState(c);
7016 initClientMultiState(c);
7017 c->flags &= (~REDIS_MULTI);
7018 addReply(c,shared.ok);
7019 }
7020
7021 static void execCommand(redisClient *c) {
7022 int j;
7023 robj **orig_argv;
7024 int orig_argc;
7025
7026 if (!(c->flags & REDIS_MULTI)) {
7027 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7028 return;
7029 }
7030
7031 orig_argv = c->argv;
7032 orig_argc = c->argc;
7033 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7034 for (j = 0; j < c->mstate.count; j++) {
7035 c->argc = c->mstate.commands[j].argc;
7036 c->argv = c->mstate.commands[j].argv;
7037 call(c,c->mstate.commands[j].cmd);
7038 }
7039 c->argv = orig_argv;
7040 c->argc = orig_argc;
7041 freeClientMultiState(c);
7042 initClientMultiState(c);
7043 c->flags &= (~REDIS_MULTI);
7044 }
7045
7046 /* =========================== Blocking Operations ========================= */
7047
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already
 * be used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are
 * implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty
 *   list, then LPOP is called instead. So BLPOP is semantically the same
 *   as LPOP when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty, we need to block. In order to do so we remove the notification
 *   for new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). We also put the
 *   client in a dictionary (db->blockingkeys) mapping keys to the list of
 *   clients blocked on those keys.
 * - If a PUSH operation is performed against a key with blocked clients
 *   waiting, we serve the first in the list: basically instead of pushing
 *   the new element into the list we return it to the (first / oldest)
 *   blocked client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to
 * understand the implementation and modify / fix it later.
 */
7076
7077 /* Set a client in blocking mode for the specified key, with the specified
7078 * timeout */
7079 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7080 dictEntry *de;
7081 list *l;
7082 int j;
7083
7084 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7085 c->blockingkeysnum = numkeys;
7086 c->blockingto = timeout;
7087 for (j = 0; j < numkeys; j++) {
7088 /* Add the key in the client structure, to map clients -> keys */
7089 c->blockingkeys[j] = keys[j];
7090 incrRefCount(keys[j]);
7091
7092 /* And in the other "side", to map keys -> clients */
7093 de = dictFind(c->db->blockingkeys,keys[j]);
7094 if (de == NULL) {
7095 int retval;
7096
7097 /* For every key we take a list of clients blocked for it */
7098 l = listCreate();
7099 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7100 incrRefCount(keys[j]);
7101 assert(retval == DICT_OK);
7102 } else {
7103 l = dictGetEntryVal(de);
7104 }
7105 listAddNodeTail(l,c);
7106 }
7107 /* Mark the client as a blocked client */
7108 c->flags |= REDIS_BLOCKED;
7109 server.blpop_blocked_clients++;
7110 }
7111
7112 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7113 static void unblockClientWaitingData(redisClient *c) {
7114 dictEntry *de;
7115 list *l;
7116 int j;
7117
7118 assert(c->blockingkeys != NULL);
7119 /* The client may wait for multiple keys, so unblock it for every key. */
7120 for (j = 0; j < c->blockingkeysnum; j++) {
7121 /* Remove this client from the list of clients waiting for this key. */
7122 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7123 assert(de != NULL);
7124 l = dictGetEntryVal(de);
7125 listDelNode(l,listSearchKey(l,c));
7126 /* If the list is empty we need to remove it to avoid wasting memory */
7127 if (listLength(l) == 0)
7128 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7129 decrRefCount(c->blockingkeys[j]);
7130 }
7131 /* Cleanup the client structure */
7132 zfree(c->blockingkeys);
7133 c->blockingkeys = NULL;
7134 c->flags &= (~REDIS_BLOCKED);
7135 server.blpop_blocked_clients--;
7136 /* We want to process data if there is some command waiting
7137 * in the input buffer. Note that this is safe even if
7138 * unblockClientWaitingData() gets called from freeClient() because
7139 * freeClient() will be smart enough to call this function
7140 * *after* c->querybuf was set to NULL. */
7141 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7142 }
7143
7144 /* This should be called from any function PUSHing into lists.
7145 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7146 * 'ele' is the element pushed.
7147 *
7148 * If the function returns 0 there was no client waiting for a list push
7149 * against this key.
7150 *
7151 * If the function returns 1 there was a client waiting for a list push
7152 * against this key, the element was passed to this client thus it's not
7153 * needed to actually add it to the list and the caller should return asap. */
7154 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7155 struct dictEntry *de;
7156 redisClient *receiver;
7157 list *l;
7158 listNode *ln;
7159
7160 de = dictFind(c->db->blockingkeys,key);
7161 if (de == NULL) return 0;
7162 l = dictGetEntryVal(de);
7163 ln = listFirst(l);
7164 assert(ln != NULL);
7165 receiver = ln->value;
7166
7167 addReplySds(receiver,sdsnew("*2\r\n"));
7168 addReplyBulk(receiver,key);
7169 addReplyBulk(receiver,ele);
7170 unblockClientWaitingData(receiver);
7171 return 1;
7172 }
7173
7174 /* Blocking RPOP/LPOP */
7175 static void blockingPopGenericCommand(redisClient *c, int where) {
7176 robj *o;
7177 time_t timeout;
7178 int j;
7179
7180 for (j = 1; j < c->argc-1; j++) {
7181 o = lookupKeyWrite(c->db,c->argv[j]);
7182 if (o != NULL) {
7183 if (o->type != REDIS_LIST) {
7184 addReply(c,shared.wrongtypeerr);
7185 return;
7186 } else {
7187 list *list = o->ptr;
7188 if (listLength(list) != 0) {
7189 /* If the list contains elements fall back to the usual
7190 * non-blocking POP operation */
7191 robj *argv[2], **orig_argv;
7192 int orig_argc;
7193
7194 /* We need to alter the command arguments before to call
7195 * popGenericCommand() as the command takes a single key. */
7196 orig_argv = c->argv;
7197 orig_argc = c->argc;
7198 argv[1] = c->argv[j];
7199 c->argv = argv;
7200 c->argc = 2;
7201
7202 /* Also the return value is different, we need to output
7203 * the multi bulk reply header and the key name. The
7204 * "real" command will add the last element (the value)
7205 * for us. If this souds like an hack to you it's just
7206 * because it is... */
7207 addReplySds(c,sdsnew("*2\r\n"));
7208 addReplyBulk(c,argv[1]);
7209 popGenericCommand(c,where);
7210
7211 /* Fix the client structure with the original stuff */
7212 c->argv = orig_argv;
7213 c->argc = orig_argc;
7214 return;
7215 }
7216 }
7217 }
7218 }
7219 /* If the list is empty or the key does not exists we must block */
7220 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7221 if (timeout > 0) timeout += time(NULL);
7222 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7223 }
7224
7225 static void blpopCommand(redisClient *c) {
7226 blockingPopGenericCommand(c,REDIS_HEAD);
7227 }
7228
7229 static void brpopCommand(redisClient *c) {
7230 blockingPopGenericCommand(c,REDIS_TAIL);
7231 }
7232
7233 /* =============================== Replication ============================= */
7234
7235 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7236 ssize_t nwritten, ret = size;
7237 time_t start = time(NULL);
7238
7239 timeout++;
7240 while(size) {
7241 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7242 nwritten = write(fd,ptr,size);
7243 if (nwritten == -1) return -1;
7244 ptr += nwritten;
7245 size -= nwritten;
7246 }
7247 if ((time(NULL)-start) > timeout) {
7248 errno = ETIMEDOUT;
7249 return -1;
7250 }
7251 }
7252 return ret;
7253 }
7254
7255 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7256 ssize_t nread, totread = 0;
7257 time_t start = time(NULL);
7258
7259 timeout++;
7260 while(size) {
7261 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7262 nread = read(fd,ptr,size);
7263 if (nread == -1) return -1;
7264 ptr += nread;
7265 size -= nread;
7266 totread += nread;
7267 }
7268 if ((time(NULL)-start) > timeout) {
7269 errno = ETIMEDOUT;
7270 return -1;
7271 }
7272 }
7273 return totread;
7274 }
7275
/* Read a line from 'fd', one byte at a time, storing it into 'ptr' as a
 * NUL terminated string. 'size' is the size of the buffer: at most
 * size-1 characters are stored, so a longer line is returned truncated
 * without consuming the rest of it.
 *
 * The terminating '\n' is not stored, and a '\r' just before it is
 * replaced by the NUL terminator. 'timeout' is passed to syncRead() for
 * every single byte read.
 *
 * Returns the number of characters stored before the newline (a stripped
 * '\r' is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve the room for the NUL terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One less byte available: without this a line longer than the
         * buffer would be written past its end. */
        size--;
    }
    return nread;
}
7296
7297 static void syncCommand(redisClient *c) {
7298 /* ignore SYNC if aleady slave or in monitor mode */
7299 if (c->flags & REDIS_SLAVE) return;
7300
7301 /* SYNC can't be issued when the server has pending data to send to
7302 * the client about already issued commands. We need a fresh reply
7303 * buffer registering the differences between the BGSAVE and the current
7304 * dataset, so that we can copy to other slaves if needed. */
7305 if (listLength(c->reply) != 0) {
7306 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7307 return;
7308 }
7309
7310 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7311 /* Here we need to check if there is a background saving operation
7312 * in progress, or if it is required to start one */
7313 if (server.bgsavechildpid != -1) {
7314 /* Ok a background save is in progress. Let's check if it is a good
7315 * one for replication, i.e. if there is another slave that is
7316 * registering differences since the server forked to save */
7317 redisClient *slave;
7318 listNode *ln;
7319 listIter li;
7320
7321 listRewind(server.slaves,&li);
7322 while((ln = listNext(&li))) {
7323 slave = ln->value;
7324 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7325 }
7326 if (ln) {
7327 /* Perfect, the server is already registering differences for
7328 * another slave. Set the right state, and copy the buffer. */
7329 listRelease(c->reply);
7330 c->reply = listDup(slave->reply);
7331 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7332 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7333 } else {
7334 /* No way, we need to wait for the next BGSAVE in order to
7335 * register differences */
7336 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7337 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7338 }
7339 } else {
7340 /* Ok we don't have a BGSAVE in progress, let's start one */
7341 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7342 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7343 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7344 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7345 return;
7346 }
7347 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7348 }
7349 c->repldbfd = -1;
7350 c->flags |= REDIS_SLAVE;
7351 c->slaveseldb = 0;
7352 listAddNodeTail(server.slaves,c);
7353 return;
7354 }
7355
7356 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7357 redisClient *slave = privdata;
7358 REDIS_NOTUSED(el);
7359 REDIS_NOTUSED(mask);
7360 char buf[REDIS_IOBUF_LEN];
7361 ssize_t nwritten, buflen;
7362
7363 if (slave->repldboff == 0) {
7364 /* Write the bulk write count before to transfer the DB. In theory here
7365 * we don't know how much room there is in the output buffer of the
7366 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7367 * operations) will never be smaller than the few bytes we need. */
7368 sds bulkcount;
7369
7370 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7371 slave->repldbsize);
7372 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7373 {
7374 sdsfree(bulkcount);
7375 freeClient(slave);
7376 return;
7377 }
7378 sdsfree(bulkcount);
7379 }
7380 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7381 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7382 if (buflen <= 0) {
7383 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7384 (buflen == 0) ? "premature EOF" : strerror(errno));
7385 freeClient(slave);
7386 return;
7387 }
7388 if ((nwritten = write(fd,buf,buflen)) == -1) {
7389 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7390 strerror(errno));
7391 freeClient(slave);
7392 return;
7393 }
7394 slave->repldboff += nwritten;
7395 if (slave->repldboff == slave->repldbsize) {
7396 close(slave->repldbfd);
7397 slave->repldbfd = -1;
7398 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7399 slave->replstate = REDIS_REPL_ONLINE;
7400 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7401 sendReplyToClient, slave) == AE_ERR) {
7402 freeClient(slave);
7403 return;
7404 }
7405 addReplySds(slave,sdsempty());
7406 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7407 }
7408 }
7409
7410 /* This function is called at the end of every backgrond saving.
7411 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7412 * otherwise REDIS_ERR is passed to the function.
7413 *
7414 * The goal of this function is to handle slaves waiting for a successful
7415 * background saving in order to perform non-blocking synchronization. */
7416 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7417 listNode *ln;
7418 int startbgsave = 0;
7419 listIter li;
7420
7421 listRewind(server.slaves,&li);
7422 while((ln = listNext(&li))) {
7423 redisClient *slave = ln->value;
7424
7425 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7426 startbgsave = 1;
7427 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7428 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7429 struct redis_stat buf;
7430
7431 if (bgsaveerr != REDIS_OK) {
7432 freeClient(slave);
7433 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7434 continue;
7435 }
7436 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7437 redis_fstat(slave->repldbfd,&buf) == -1) {
7438 freeClient(slave);
7439 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7440 continue;
7441 }
7442 slave->repldboff = 0;
7443 slave->repldbsize = buf.st_size;
7444 slave->replstate = REDIS_REPL_SEND_BULK;
7445 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7446 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7447 freeClient(slave);
7448 continue;
7449 }
7450 }
7451 }
7452 if (startbgsave) {
7453 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7454 listIter li;
7455
7456 listRewind(server.slaves,&li);
7457 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7458 while((ln = listNext(&li))) {
7459 redisClient *slave = ln->value;
7460
7461 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7462 freeClient(slave);
7463 }
7464 }
7465 }
7466 }
7467
7468 static int syncWithMaster(void) {
7469 char buf[1024], tmpfile[256], authcmd[1024];
7470 long dumpsize;
7471 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7472 int dfd, maxtries = 5;
7473
7474 if (fd == -1) {
7475 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7476 strerror(errno));
7477 return REDIS_ERR;
7478 }
7479
7480 /* AUTH with the master if required. */
7481 if(server.masterauth) {
7482 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7483 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7484 close(fd);
7485 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7486 strerror(errno));
7487 return REDIS_ERR;
7488 }
7489 /* Read the AUTH result. */
7490 if (syncReadLine(fd,buf,1024,3600) == -1) {
7491 close(fd);
7492 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7493 strerror(errno));
7494 return REDIS_ERR;
7495 }
7496 if (buf[0] != '+') {
7497 close(fd);
7498 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7499 return REDIS_ERR;
7500 }
7501 }
7502
7503 /* Issue the SYNC command */
7504 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7505 close(fd);
7506 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7507 strerror(errno));
7508 return REDIS_ERR;
7509 }
7510 /* Read the bulk write count */
7511 if (syncReadLine(fd,buf,1024,3600) == -1) {
7512 close(fd);
7513 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7514 strerror(errno));
7515 return REDIS_ERR;
7516 }
7517 if (buf[0] != '$') {
7518 close(fd);
7519 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7520 return REDIS_ERR;
7521 }
7522 dumpsize = strtol(buf+1,NULL,10);
7523 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7524 /* Read the bulk write data on a temp file */
7525 while(maxtries--) {
7526 snprintf(tmpfile,256,
7527 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7528 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7529 if (dfd != -1) break;
7530 sleep(1);
7531 }
7532 if (dfd == -1) {
7533 close(fd);
7534 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7535 return REDIS_ERR;
7536 }
7537 while(dumpsize) {
7538 int nread, nwritten;
7539
7540 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7541 if (nread == -1) {
7542 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7543 strerror(errno));
7544 close(fd);
7545 close(dfd);
7546 return REDIS_ERR;
7547 }
7548 nwritten = write(dfd,buf,nread);
7549 if (nwritten == -1) {
7550 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7551 close(fd);
7552 close(dfd);
7553 return REDIS_ERR;
7554 }
7555 dumpsize -= nread;
7556 }
7557 close(dfd);
7558 if (rename(tmpfile,server.dbfilename) == -1) {
7559 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7560 unlink(tmpfile);
7561 close(fd);
7562 return REDIS_ERR;
7563 }
7564 emptyDb();
7565 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7566 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7567 close(fd);
7568 return REDIS_ERR;
7569 }
7570 server.master = createClient(fd);
7571 server.master->flags |= REDIS_MASTER;
7572 server.master->authenticated = 1;
7573 server.replstate = REDIS_REPL_CONNECTED;
7574 return REDIS_OK;
7575 }
7576
7577 static void slaveofCommand(redisClient *c) {
7578 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7579 !strcasecmp(c->argv[2]->ptr,"one")) {
7580 if (server.masterhost) {
7581 sdsfree(server.masterhost);
7582 server.masterhost = NULL;
7583 if (server.master) freeClient(server.master);
7584 server.replstate = REDIS_REPL_NONE;
7585 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7586 }
7587 } else {
7588 sdsfree(server.masterhost);
7589 server.masterhost = sdsdup(c->argv[1]->ptr);
7590 server.masterport = atoi(c->argv[2]->ptr);
7591 if (server.master) freeClient(server.master);
7592 server.replstate = REDIS_REPL_CONNECT;
7593 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7594 server.masterhost, server.masterport);
7595 }
7596 addReply(c,shared.ok);
7597 }
7598
7599 /* ============================ Maxmemory directive ======================== */
7600
7601 /* Try to free one object form the pre-allocated objects free list.
7602 * This is useful under low mem conditions as by default we take 1 million
7603 * free objects allocated. On success REDIS_OK is returned, otherwise
7604 * REDIS_ERR. */
7605 static int tryFreeOneObjectFromFreelist(void) {
7606 robj *o;
7607
7608 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7609 if (listLength(server.objfreelist)) {
7610 listNode *head = listFirst(server.objfreelist);
7611 o = listNodeValue(head);
7612 listDelNode(server.objfreelist,head);
7613 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7614 zfree(o);
7615 return REDIS_OK;
7616 } else {
7617 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7618 return REDIS_ERR;
7619 }
7620 }
7621
7622 /* This function gets called when 'maxmemory' is set on the config file to limit
7623 * the max memory used by the server, and we are out of memory.
7624 * This function will try to, in order:
7625 *
7626 * - Free objects from the free list
7627 * - Try to remove keys with an EXPIRE set
7628 *
7629 * It is not possible to free enough memory to reach used-memory < maxmemory
7630 * the server will start refusing commands that will enlarge even more the
7631 * memory usage.
7632 */
7633 static void freeMemoryIfNeeded(void) {
7634 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7635 int j, k, freed = 0;
7636
7637 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7638 for (j = 0; j < server.dbnum; j++) {
7639 int minttl = -1;
7640 robj *minkey = NULL;
7641 struct dictEntry *de;
7642
7643 if (dictSize(server.db[j].expires)) {
7644 freed = 1;
7645 /* From a sample of three keys drop the one nearest to
7646 * the natural expire */
7647 for (k = 0; k < 3; k++) {
7648 time_t t;
7649
7650 de = dictGetRandomKey(server.db[j].expires);
7651 t = (time_t) dictGetEntryVal(de);
7652 if (minttl == -1 || t < minttl) {
7653 minkey = dictGetEntryKey(de);
7654 minttl = t;
7655 }
7656 }
7657 deleteKey(server.db+j,minkey);
7658 }
7659 }
7660 if (!freed) return; /* nothing to free... */
7661 }
7662 }
7663
7664 /* ============================== Append Only file ========================== */
7665
7666 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7667 sds buf = sdsempty();
7668 int j;
7669 ssize_t nwritten;
7670 time_t now;
7671 robj *tmpargv[3];
7672
7673 /* The DB this command was targetting is not the same as the last command
7674 * we appendend. To issue a SELECT command is needed. */
7675 if (dictid != server.appendseldb) {
7676 char seldb[64];
7677
7678 snprintf(seldb,sizeof(seldb),"%d",dictid);
7679 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7680 (unsigned long)strlen(seldb),seldb);
7681 server.appendseldb = dictid;
7682 }
7683
7684 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7685 * EXPIREs into EXPIREATs calls */
7686 if (cmd->proc == expireCommand) {
7687 long when;
7688
7689 tmpargv[0] = createStringObject("EXPIREAT",8);
7690 tmpargv[1] = argv[1];
7691 incrRefCount(argv[1]);
7692 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7693 tmpargv[2] = createObject(REDIS_STRING,
7694 sdscatprintf(sdsempty(),"%ld",when));
7695 argv = tmpargv;
7696 }
7697
7698 /* Append the actual command */
7699 buf = sdscatprintf(buf,"*%d\r\n",argc);
7700 for (j = 0; j < argc; j++) {
7701 robj *o = argv[j];
7702
7703 o = getDecodedObject(o);
7704 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7705 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7706 buf = sdscatlen(buf,"\r\n",2);
7707 decrRefCount(o);
7708 }
7709
7710 /* Free the objects from the modified argv for EXPIREAT */
7711 if (cmd->proc == expireCommand) {
7712 for (j = 0; j < 3; j++)
7713 decrRefCount(argv[j]);
7714 }
7715
7716 /* We want to perform a single write. This should be guaranteed atomic
7717 * at least if the filesystem we are writing is a real physical one.
7718 * While this will save us against the server being killed I don't think
7719 * there is much to do about the whole server stopping for power problems
7720 * or alike */
7721 nwritten = write(server.appendfd,buf,sdslen(buf));
7722 if (nwritten != (signed)sdslen(buf)) {
7723 /* Ooops, we are in troubles. The best thing to do for now is
7724 * to simply exit instead to give the illusion that everything is
7725 * working as expected. */
7726 if (nwritten == -1) {
7727 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7728 } else {
7729 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7730 }
7731 exit(1);
7732 }
7733 /* If a background append only file rewriting is in progress we want to
7734 * accumulate the differences between the child DB and the current one
7735 * in a buffer, so that when the child process will do its work we
7736 * can append the differences to the new append only file. */
7737 if (server.bgrewritechildpid != -1)
7738 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7739
7740 sdsfree(buf);
7741 now = time(NULL);
7742 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7743 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7744 now-server.lastfsync > 1))
7745 {
7746 fsync(server.appendfd); /* Let's try to get this data on the disk */
7747 server.lastfsync = now;
7748 }
7749 }
7750
7751 /* In Redis commands are always executed in the context of a client, so in
7752 * order to load the append only file we need to create a fake client. */
7753 static struct redisClient *createFakeClient(void) {
7754 struct redisClient *c = zmalloc(sizeof(*c));
7755
7756 selectDb(c,0);
7757 c->fd = -1;
7758 c->querybuf = sdsempty();
7759 c->argc = 0;
7760 c->argv = NULL;
7761 c->flags = 0;
7762 /* We set the fake client as a slave waiting for the synchronization
7763 * so that Redis will not try to send replies to this client. */
7764 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7765 c->reply = listCreate();
7766 listSetFreeMethod(c->reply,decrRefCount);
7767 listSetDupMethod(c->reply,dupClientReplyValue);
7768 return c;
7769 }
7770
7771 static void freeFakeClient(struct redisClient *c) {
7772 sdsfree(c->querybuf);
7773 listRelease(c->reply);
7774 zfree(c);
7775 }
7776
7777 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7778 * error (the append only file is zero-length) REDIS_ERR is returned. On
7779 * fatal error an error message is logged and the program exists. */
7780 int loadAppendOnlyFile(char *filename) {
7781 struct redisClient *fakeClient;
7782 FILE *fp = fopen(filename,"r");
7783 struct redis_stat sb;
7784 unsigned long long loadedkeys = 0;
7785
7786 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7787 return REDIS_ERR;
7788
7789 if (fp == NULL) {
7790 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7791 exit(1);
7792 }
7793
7794 fakeClient = createFakeClient();
7795 while(1) {
7796 int argc, j;
7797 unsigned long len;
7798 robj **argv;
7799 char buf[128];
7800 sds argsds;
7801 struct redisCommand *cmd;
7802
7803 if (fgets(buf,sizeof(buf),fp) == NULL) {
7804 if (feof(fp))
7805 break;
7806 else
7807 goto readerr;
7808 }
7809 if (buf[0] != '*') goto fmterr;
7810 argc = atoi(buf+1);
7811 argv = zmalloc(sizeof(robj*)*argc);
7812 for (j = 0; j < argc; j++) {
7813 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7814 if (buf[0] != '$') goto fmterr;
7815 len = strtol(buf+1,NULL,10);
7816 argsds = sdsnewlen(NULL,len);
7817 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7818 argv[j] = createObject(REDIS_STRING,argsds);
7819 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7820 }
7821
7822 /* Command lookup */
7823 cmd = lookupCommand(argv[0]->ptr);
7824 if (!cmd) {
7825 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7826 exit(1);
7827 }
7828 /* Try object encoding */
7829 if (cmd->flags & REDIS_CMD_BULK)
7830 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
7831 /* Run the command in the context of a fake client */
7832 fakeClient->argc = argc;
7833 fakeClient->argv = argv;
7834 cmd->proc(fakeClient);
7835 /* Discard the reply objects list from the fake client */
7836 while(listLength(fakeClient->reply))
7837 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7838 /* Clean up, ready for the next command */
7839 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7840 zfree(argv);
7841 /* Handle swapping while loading big datasets when VM is on */
7842 loadedkeys++;
7843 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7844 while (zmalloc_used_memory() > server.vm_max_memory) {
7845 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7846 }
7847 }
7848 }
7849 fclose(fp);
7850 freeFakeClient(fakeClient);
7851 return REDIS_OK;
7852
7853 readerr:
7854 if (feof(fp)) {
7855 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7856 } else {
7857 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7858 }
7859 exit(1);
7860 fmterr:
7861 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7862 exit(1);
7863 }
7864
7865 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7866 static int fwriteBulkObject(FILE *fp, robj *obj) {
7867 char buf[128];
7868 int decrrc = 0;
7869
7870 /* Avoid the incr/decr ref count business if possible to help
7871 * copy-on-write (we are often in a child process when this function
7872 * is called).
7873 * Also makes sure that key objects don't get incrRefCount-ed when VM
7874 * is enabled */
7875 if (obj->encoding != REDIS_ENCODING_RAW) {
7876 obj = getDecodedObject(obj);
7877 decrrc = 1;
7878 }
7879 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7880 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7881 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7882 goto err;
7883 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7884 if (decrrc) decrRefCount(obj);
7885 return 1;
7886 err:
7887 if (decrrc) decrRefCount(obj);
7888 return 0;
7889 }
7890
/* Write the binary-safe string 's' of 'len' bytes into a file in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if any of the writes failed. An empty payload
 * (len == 0) is valid and produces "$0\r\n\r\n". */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7902
/* Emit the double 'd', formatted with "%.17g", on 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is built with its trailing CRLF, which does not count
     * in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (!fwrite(header,strlen(header),1,fp)) return 0;
    if (!fwrite(payload,plen,1,fp)) return 0;
    return 1;
}
7913
/* Emit the long 'l', in decimal, on 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is built with its trailing CRLF, which does not count
     * in the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (!fwrite(header,strlen(header),1,fp)) return 0;
    if (!fwrite(payload,plen,1,fp)) return 0;
    return 1;
}
7924
7925 /* Write a sequence of commands able to fully rebuild the dataset into
7926 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7927 static int rewriteAppendOnlyFile(char *filename) {
7928 dictIterator *di = NULL;
7929 dictEntry *de;
7930 FILE *fp;
7931 char tmpfile[256];
7932 int j;
7933 time_t now = time(NULL);
7934
7935 /* Note that we have to use a different temp name here compared to the
7936 * one used by rewriteAppendOnlyFileBackground() function. */
7937 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7938 fp = fopen(tmpfile,"w");
7939 if (!fp) {
7940 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7941 return REDIS_ERR;
7942 }
7943 for (j = 0; j < server.dbnum; j++) {
7944 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7945 redisDb *db = server.db+j;
7946 dict *d = db->dict;
7947 if (dictSize(d) == 0) continue;
7948 di = dictGetIterator(d);
7949 if (!di) {
7950 fclose(fp);
7951 return REDIS_ERR;
7952 }
7953
7954 /* SELECT the new DB */
7955 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7956 if (fwriteBulkLong(fp,j) == 0) goto werr;
7957
7958 /* Iterate this DB writing every entry */
7959 while((de = dictNext(di)) != NULL) {
7960 robj *key, *o;
7961 time_t expiretime;
7962 int swapped;
7963
7964 key = dictGetEntryKey(de);
7965 /* If the value for this key is swapped, load a preview in memory.
7966 * We use a "swapped" flag to remember if we need to free the
7967 * value object instead to just increment the ref count anyway
7968 * in order to avoid copy-on-write of pages if we are forked() */
7969 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7970 key->storage == REDIS_VM_SWAPPING) {
7971 o = dictGetEntryVal(de);
7972 swapped = 0;
7973 } else {
7974 o = vmPreviewObject(key);
7975 swapped = 1;
7976 }
7977 expiretime = getExpire(db,key);
7978
7979 /* Save the key and associated value */
7980 if (o->type == REDIS_STRING) {
7981 /* Emit a SET command */
7982 char cmd[]="*3\r\n$3\r\nSET\r\n";
7983 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7984 /* Key and value */
7985 if (fwriteBulkObject(fp,key) == 0) goto werr;
7986 if (fwriteBulkObject(fp,o) == 0) goto werr;
7987 } else if (o->type == REDIS_LIST) {
7988 /* Emit the RPUSHes needed to rebuild the list */
7989 list *list = o->ptr;
7990 listNode *ln;
7991 listIter li;
7992
7993 listRewind(list,&li);
7994 while((ln = listNext(&li))) {
7995 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7996 robj *eleobj = listNodeValue(ln);
7997
7998 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7999 if (fwriteBulkObject(fp,key) == 0) goto werr;
8000 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8001 }
8002 } else if (o->type == REDIS_SET) {
8003 /* Emit the SADDs needed to rebuild the set */
8004 dict *set = o->ptr;
8005 dictIterator *di = dictGetIterator(set);
8006 dictEntry *de;
8007
8008 while((de = dictNext(di)) != NULL) {
8009 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8010 robj *eleobj = dictGetEntryKey(de);
8011
8012 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8013 if (fwriteBulkObject(fp,key) == 0) goto werr;
8014 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8015 }
8016 dictReleaseIterator(di);
8017 } else if (o->type == REDIS_ZSET) {
8018 /* Emit the ZADDs needed to rebuild the sorted set */
8019 zset *zs = o->ptr;
8020 dictIterator *di = dictGetIterator(zs->dict);
8021 dictEntry *de;
8022
8023 while((de = dictNext(di)) != NULL) {
8024 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8025 robj *eleobj = dictGetEntryKey(de);
8026 double *score = dictGetEntryVal(de);
8027
8028 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8029 if (fwriteBulkObject(fp,key) == 0) goto werr;
8030 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8031 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8032 }
8033 dictReleaseIterator(di);
8034 } else if (o->type == REDIS_HASH) {
8035 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8036
8037 /* Emit the HSETs needed to rebuild the hash */
8038 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8039 unsigned char *p = zipmapRewind(o->ptr);
8040 unsigned char *field, *val;
8041 unsigned int flen, vlen;
8042
8043 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8044 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8045 if (fwriteBulkObject(fp,key) == 0) goto werr;
8046 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8047 return -1;
8048 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8049 return -1;
8050 }
8051 } else {
8052 dictIterator *di = dictGetIterator(o->ptr);
8053 dictEntry *de;
8054
8055 while((de = dictNext(di)) != NULL) {
8056 robj *field = dictGetEntryKey(de);
8057 robj *val = dictGetEntryVal(de);
8058
8059 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8060 if (fwriteBulkObject(fp,key) == 0) goto werr;
8061 if (fwriteBulkObject(fp,field) == -1) return -1;
8062 if (fwriteBulkObject(fp,val) == -1) return -1;
8063 }
8064 dictReleaseIterator(di);
8065 }
8066 } else {
8067 redisAssert(0);
8068 }
8069 /* Save the expire time */
8070 if (expiretime != -1) {
8071 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8072 /* If this key is already expired skip it */
8073 if (expiretime < now) continue;
8074 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8075 if (fwriteBulkObject(fp,key) == 0) goto werr;
8076 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8077 }
8078 if (swapped) decrRefCount(o);
8079 }
8080 dictReleaseIterator(di);
8081 }
8082
8083 /* Make sure data will not remain on the OS's output buffers */
8084 fflush(fp);
8085 fsync(fileno(fp));
8086 fclose(fp);
8087
8088 /* Use RENAME to make sure the DB file is changed atomically only
8089 * if the generate DB file is ok. */
8090 if (rename(tmpfile,filename) == -1) {
8091 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8092 unlink(tmpfile);
8093 return REDIS_ERR;
8094 }
8095 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8096 return REDIS_OK;
8097
8098 werr:
8099 fclose(fp);
8100 unlink(tmpfile);
8101 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8102 if (di) dictReleaseIterator(di);
8103 return REDIS_ERR;
8104 }
8105
8106 /* This is how rewriting of the append only file in background works:
8107 *
8108 * 1) The user calls BGREWRITEAOF
8109 * 2) Redis calls this function, that forks():
8110 * 2a) the child rewrite the append only file in a temp file.
8111 * 2b) the parent accumulates differences in server.bgrewritebuf.
8112 * 3) When the child finished '2a' exists.
8113 * 4) The parent will trap the exit code, if it's OK, will append the
8114 * data accumulated into server.bgrewritebuf into the temp file, and
8115 * finally will rename(2) the temp file in the actual file name.
8116 * The the new file is reopened as the new append only file. Profit!
8117 */
8118 static int rewriteAppendOnlyFileBackground(void) {
8119 pid_t childpid;
8120
8121 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8122 if (server.vm_enabled) waitEmptyIOJobsQueue();
8123 if ((childpid = fork()) == 0) {
8124 /* Child */
8125 char tmpfile[256];
8126
8127 if (server.vm_enabled) vmReopenSwapFile();
8128 close(server.fd);
8129 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8130 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8131 _exit(0);
8132 } else {
8133 _exit(1);
8134 }
8135 } else {
8136 /* Parent */
8137 if (childpid == -1) {
8138 redisLog(REDIS_WARNING,
8139 "Can't rewrite append only file in background: fork: %s",
8140 strerror(errno));
8141 return REDIS_ERR;
8142 }
8143 redisLog(REDIS_NOTICE,
8144 "Background append only file rewriting started by pid %d",childpid);
8145 server.bgrewritechildpid = childpid;
8146 updateDictResizePolicy();
8147 /* We set appendseldb to -1 in order to force the next call to the
8148 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8149 * accumulated by the parent into server.bgrewritebuf will start
8150 * with a SELECT statement and it will be safe to merge. */
8151 server.appendseldb = -1;
8152 return REDIS_OK;
8153 }
8154 return REDIS_OK; /* unreached */
8155 }
8156
8157 static void bgrewriteaofCommand(redisClient *c) {
8158 if (server.bgrewritechildpid != -1) {
8159 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8160 return;
8161 }
8162 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8163 char *status = "+Background append only file rewriting started\r\n";
8164 addReplySds(c,sdsnew(status));
8165 } else {
8166 addReply(c,shared.err);
8167 }
8168 }
8169
/* Remove the temp file used by the background append only file rewriting
 * child having pid 'childpid'. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8176
8177 /* Virtual Memory is composed mainly of two subsystems:
8178 * - Blocking Virtual Memory
8179 * - Threaded Virtual Memory I/O
8180 * The two parts are not fully decoupled, but functions are split among two
8181 * different sections of the source code (delimited by comments) in order to
8182 * make more clear what functionality is about the blocking VM and what about
8183 * the threaded (not blocking) VM.
8184 *
8185 * Redis VM design:
8186 *
8187 * Redis VM is a blocking VM (one that blocks reading swapped values from
8188 * disk into memory when a value swapped out is needed in memory) that is made
8189 * unblocking by trying to examine the command argument vector in order to
8190 * load in background values that will likely be needed in order to exec
8191 * the command. The command is executed only once all the relevant keys
8192 * are loaded into memory.
8193 *
8194 * This is basically almost as simple as a blocking VM, but almost as parallel
8195 * as a fully non-blocking VM.
8196 */
8197
8198 /* =================== Virtual Memory - Blocking Side ====================== */
8199
8200 /* substitute the first occurrence of '%p' with the process pid in the
8201 * swap file name. */
8202 static void expandVmSwapFilename(void) {
8203 char *p = strstr(server.vm_swap_file,"%p");
8204 sds new;
8205
8206 if (!p) return;
8207 new = sdsempty();
8208 *p = '\0';
8209 new = sdscat(new,server.vm_swap_file);
8210 new = sdscatprintf(new,"%ld",(long) getpid());
8211 new = sdscat(new,p+2);
8212 zfree(server.vm_swap_file);
8213 server.vm_swap_file = new;
8214 }
8215
8216 static void vmInit(void) {
8217 off_t totsize;
8218 int pipefds[2];
8219 size_t stacksize;
8220
8221 if (server.vm_max_threads != 0)
8222 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8223
8224 expandVmSwapFilename();
8225 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8226 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8227 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8228 }
8229 if (server.vm_fp == NULL) {
8230 redisLog(REDIS_WARNING,
8231 "Impossible to open the swap file: %s. Exiting.",
8232 strerror(errno));
8233 exit(1);
8234 }
8235 server.vm_fd = fileno(server.vm_fp);
8236 server.vm_next_page = 0;
8237 server.vm_near_pages = 0;
8238 server.vm_stats_used_pages = 0;
8239 server.vm_stats_swapped_objects = 0;
8240 server.vm_stats_swapouts = 0;
8241 server.vm_stats_swapins = 0;
8242 totsize = server.vm_pages*server.vm_page_size;
8243 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8244 if (ftruncate(server.vm_fd,totsize) == -1) {
8245 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8246 strerror(errno));
8247 exit(1);
8248 } else {
8249 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8250 }
8251 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8252 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8253 (long long) (server.vm_pages+7)/8, server.vm_pages);
8254 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8255
8256 /* Initialize threaded I/O (used by Virtual Memory) */
8257 server.io_newjobs = listCreate();
8258 server.io_processing = listCreate();
8259 server.io_processed = listCreate();
8260 server.io_ready_clients = listCreate();
8261 pthread_mutex_init(&server.io_mutex,NULL);
8262 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8263 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8264 server.io_active_threads = 0;
8265 if (pipe(pipefds) == -1) {
8266 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8267 ,strerror(errno));
8268 exit(1);
8269 }
8270 server.io_ready_pipe_read = pipefds[0];
8271 server.io_ready_pipe_write = pipefds[1];
8272 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8273 /* LZF requires a lot of stack */
8274 pthread_attr_init(&server.io_threads_attr);
8275 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8276 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8277 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8278 /* Listen for events in the threaded I/O pipe */
8279 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8280 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8281 oom("creating file event");
8282 }
8283
8284 /* Mark the page as used */
8285 static void vmMarkPageUsed(off_t page) {
8286 off_t byte = page/8;
8287 int bit = page&7;
8288 redisAssert(vmFreePage(page) == 1);
8289 server.vm_bitmap[byte] |= 1<<bit;
8290 }
8291
8292 /* Mark N contiguous pages as used, with 'page' being the first. */
8293 static void vmMarkPagesUsed(off_t page, off_t count) {
8294 off_t j;
8295
8296 for (j = 0; j < count; j++)
8297 vmMarkPageUsed(page+j);
8298 server.vm_stats_used_pages += count;
8299 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8300 (long long)count, (long long)page);
8301 }
8302
8303 /* Mark the page as free */
8304 static void vmMarkPageFree(off_t page) {
8305 off_t byte = page/8;
8306 int bit = page&7;
8307 redisAssert(vmFreePage(page) == 0);
8308 server.vm_bitmap[byte] &= ~(1<<bit);
8309 }
8310
8311 /* Mark N contiguous pages as free, with 'page' being the first. */
8312 static void vmMarkPagesFree(off_t page, off_t count) {
8313 off_t j;
8314
8315 for (j = 0; j < count; j++)
8316 vmMarkPageFree(page+j);
8317 server.vm_stats_used_pages -= count;
8318 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8319 (long long)count, (long long)page);
8320 }
8321
8322 /* Test if the page is free */
8323 static int vmFreePage(off_t page) {
8324 off_t byte = page/8;
8325 int bit = page&7;
8326 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8327 }
8328
8329 /* Find N contiguous free pages storing the first page of the cluster in *first.
8330 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8331 * REDIS_ERR is returned.
8332 *
8333 * This function uses a simple algorithm: we try to allocate
8334 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8335 * again from the start of the swap file searching for free spaces.
8336 *
8337 * If it looks pretty clear that there are no free pages near our offset
8338 * we try to find less populated places doing a forward jump of
8339 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8340 * without hurry, and then we jump again and so forth...
8341 *
8342 * This function can be improved using a free list to avoid to guess
8343 * too much, since we could collect data about freed pages.
8344 *
8345 * note: I implemented this function just after watching an episode of
8346 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8347 */
8348 static int vmFindContiguousPages(off_t *first, off_t n) {
8349 off_t base, offset = 0, since_jump = 0, numfree = 0;
8350
8351 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8352 server.vm_near_pages = 0;
8353 server.vm_next_page = 0;
8354 }
8355 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8356 base = server.vm_next_page;
8357
8358 while(offset < server.vm_pages) {
8359 off_t this = base+offset;
8360
8361 /* If we overflow, restart from page zero */
8362 if (this >= server.vm_pages) {
8363 this -= server.vm_pages;
8364 if (this == 0) {
8365 /* Just overflowed, what we found on tail is no longer
8366 * interesting, as it's no longer contiguous. */
8367 numfree = 0;
8368 }
8369 }
8370 if (vmFreePage(this)) {
8371 /* This is a free page */
8372 numfree++;
8373 /* Already got N free pages? Return to the caller, with success */
8374 if (numfree == n) {
8375 *first = this-(n-1);
8376 server.vm_next_page = this+1;
8377 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8378 return REDIS_OK;
8379 }
8380 } else {
8381 /* The current one is not a free page */
8382 numfree = 0;
8383 }
8384
8385 /* Fast-forward if the current page is not free and we already
8386 * searched enough near this place. */
8387 since_jump++;
8388 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8389 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8390 since_jump = 0;
8391 /* Note that even if we rewind after the jump, we are don't need
8392 * to make sure numfree is set to zero as we only jump *if* it
8393 * is set to zero. */
8394 } else {
8395 /* Otherwise just check the next page */
8396 offset++;
8397 }
8398 }
8399 return REDIS_ERR;
8400 }
8401
8402 /* Write the specified object at the specified page of the swap file */
8403 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8404 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8405 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8406 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8407 redisLog(REDIS_WARNING,
8408 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8409 strerror(errno));
8410 return REDIS_ERR;
8411 }
8412 rdbSaveObject(server.vm_fp,o);
8413 fflush(server.vm_fp);
8414 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8415 return REDIS_OK;
8416 }
8417
8418 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8419 * needed to later retrieve the object into the key object.
8420 * If we can't find enough contiguous empty pages to swap the object on disk
8421 * REDIS_ERR is returned. */
8422 static int vmSwapObjectBlocking(robj *key, robj *val) {
8423 off_t pages = rdbSavedObjectPages(val,NULL);
8424 off_t page;
8425
8426 assert(key->storage == REDIS_VM_MEMORY);
8427 assert(key->refcount == 1);
8428 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8429 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8430 key->vm.page = page;
8431 key->vm.usedpages = pages;
8432 key->storage = REDIS_VM_SWAPPED;
8433 key->vtype = val->type;
8434 decrRefCount(val); /* Deallocate the object from memory. */
8435 vmMarkPagesUsed(page,pages);
8436 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8437 (unsigned char*) key->ptr,
8438 (unsigned long long) page, (unsigned long long) pages);
8439 server.vm_stats_swapped_objects++;
8440 server.vm_stats_swapouts++;
8441 return REDIS_OK;
8442 }
8443
8444 static robj *vmReadObjectFromSwap(off_t page, int type) {
8445 robj *o;
8446
8447 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8448 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8449 redisLog(REDIS_WARNING,
8450 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8451 strerror(errno));
8452 _exit(1);
8453 }
8454 o = rdbLoadObject(type,server.vm_fp);
8455 if (o == NULL) {
8456 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8457 _exit(1);
8458 }
8459 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8460 return o;
8461 }
8462
8463 /* Load the value object relative to the 'key' object from swap to memory.
8464 * The newly allocated object is returned.
8465 *
8466 * If preview is true the unserialized object is returned to the caller but
8467 * no changes are made to the key object, nor the pages are marked as freed */
8468 static robj *vmGenericLoadObject(robj *key, int preview) {
8469 robj *val;
8470
8471 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8472 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8473 if (!preview) {
8474 key->storage = REDIS_VM_MEMORY;
8475 key->vm.atime = server.unixtime;
8476 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8477 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8478 (unsigned char*) key->ptr);
8479 server.vm_stats_swapped_objects--;
8480 } else {
8481 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8482 (unsigned char*) key->ptr);
8483 }
8484 server.vm_stats_swapins++;
8485 return val;
8486 }
8487
8488 /* Plain object loading, from swap to memory */
8489 static robj *vmLoadObject(robj *key) {
8490 /* If we are loading the object in background, stop it, we
8491 * need to load this object synchronously ASAP. */
8492 if (key->storage == REDIS_VM_LOADING)
8493 vmCancelThreadedIOJob(key);
8494 return vmGenericLoadObject(key,0);
8495 }
8496
8497 /* Just load the value on disk, without to modify the key.
8498 * This is useful when we want to perform some operation on the value
8499 * without to really bring it from swap to memory, like while saving the
8500 * dataset or rewriting the append only log. */
8501 static robj *vmPreviewObject(robj *key) {
8502 return vmGenericLoadObject(key,1);
8503 }
8504
8505 /* How a good candidate is this object for swapping?
8506 * The better candidate it is, the greater the returned value.
8507 *
8508 * Currently we try to perform a fast estimation of the object size in
8509 * memory, and combine it with aging informations.
8510 *
8511 * Basically swappability = idle-time * log(estimated size)
8512 *
8513 * Bigger objects are preferred over smaller objects, but not
8514 * proportionally, this is why we use the logarithm. This algorithm is
8515 * just a first try and will probably be tuned later. */
8516 static double computeObjectSwappability(robj *o) {
8517 time_t age = server.unixtime - o->vm.atime;
8518 long asize = 0;
8519 list *l;
8520 dict *d;
8521 struct dictEntry *de;
8522 int z;
8523
8524 if (age <= 0) return 0;
8525 switch(o->type) {
8526 case REDIS_STRING:
8527 if (o->encoding != REDIS_ENCODING_RAW) {
8528 asize = sizeof(*o);
8529 } else {
8530 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8531 }
8532 break;
8533 case REDIS_LIST:
8534 l = o->ptr;
8535 listNode *ln = listFirst(l);
8536
8537 asize = sizeof(list);
8538 if (ln) {
8539 robj *ele = ln->value;
8540 long elesize;
8541
8542 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8543 (sizeof(*o)+sdslen(ele->ptr)) :
8544 sizeof(*o);
8545 asize += (sizeof(listNode)+elesize)*listLength(l);
8546 }
8547 break;
8548 case REDIS_SET:
8549 case REDIS_ZSET:
8550 z = (o->type == REDIS_ZSET);
8551 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8552
8553 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8554 if (z) asize += sizeof(zset)-sizeof(dict);
8555 if (dictSize(d)) {
8556 long elesize;
8557 robj *ele;
8558
8559 de = dictGetRandomKey(d);
8560 ele = dictGetEntryKey(de);
8561 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8562 (sizeof(*o)+sdslen(ele->ptr)) :
8563 sizeof(*o);
8564 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8565 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8566 }
8567 break;
8568 case REDIS_HASH:
8569 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8570 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8571 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8572 unsigned int klen, vlen;
8573 unsigned char *key, *val;
8574
8575 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8576 klen = 0;
8577 vlen = 0;
8578 }
8579 asize = len*(klen+vlen+3);
8580 } else if (o->encoding == REDIS_ENCODING_HT) {
8581 d = o->ptr;
8582 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8583 if (dictSize(d)) {
8584 long elesize;
8585 robj *ele;
8586
8587 de = dictGetRandomKey(d);
8588 ele = dictGetEntryKey(de);
8589 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8590 (sizeof(*o)+sdslen(ele->ptr)) :
8591 sizeof(*o);
8592 ele = dictGetEntryVal(de);
8593 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8594 (sizeof(*o)+sdslen(ele->ptr)) :
8595 sizeof(*o);
8596 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8597 }
8598 }
8599 break;
8600 }
8601 return (double)age*log(1+asize);
8602 }
8603
8604 /* Try to swap an object that's a good candidate for swapping.
8605 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8606 * to swap any object at all.
8607 *
8608 * If 'usethreaded' is true, Redis will try to swap the object in background
8609 * using I/O threads. */
8610 static int vmSwapOneObject(int usethreads) {
8611 int j, i;
8612 struct dictEntry *best = NULL;
8613 double best_swappability = 0;
8614 redisDb *best_db = NULL;
8615 robj *key, *val;
8616
8617 for (j = 0; j < server.dbnum; j++) {
8618 redisDb *db = server.db+j;
8619 /* Why maxtries is set to 100?
8620 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8621 * are swappable objects */
8622 int maxtries = 100;
8623
8624 if (dictSize(db->dict) == 0) continue;
8625 for (i = 0; i < 5; i++) {
8626 dictEntry *de;
8627 double swappability;
8628
8629 if (maxtries) maxtries--;
8630 de = dictGetRandomKey(db->dict);
8631 key = dictGetEntryKey(de);
8632 val = dictGetEntryVal(de);
8633 /* Only swap objects that are currently in memory.
8634 *
8635 * Also don't swap shared objects if threaded VM is on, as we
8636 * try to ensure that the main thread does not touch the
8637 * object while the I/O thread is using it, but we can't
8638 * control other keys without adding additional mutex. */
8639 if (key->storage != REDIS_VM_MEMORY ||
8640 (server.vm_max_threads != 0 && val->refcount != 1)) {
8641 if (maxtries) i--; /* don't count this try */
8642 continue;
8643 }
8644 swappability = computeObjectSwappability(val);
8645 if (!best || swappability > best_swappability) {
8646 best = de;
8647 best_swappability = swappability;
8648 best_db = db;
8649 }
8650 }
8651 }
8652 if (best == NULL) return REDIS_ERR;
8653 key = dictGetEntryKey(best);
8654 val = dictGetEntryVal(best);
8655
8656 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8657 key->ptr, best_swappability);
8658
8659 /* Unshare the key if needed */
8660 if (key->refcount > 1) {
8661 robj *newkey = dupStringObject(key);
8662 decrRefCount(key);
8663 key = dictGetEntryKey(best) = newkey;
8664 }
8665 /* Swap it */
8666 if (usethreads) {
8667 vmSwapObjectThreaded(key,val,best_db);
8668 return REDIS_OK;
8669 } else {
8670 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8671 dictGetEntryVal(best) = NULL;
8672 return REDIS_OK;
8673 } else {
8674 return REDIS_ERR;
8675 }
8676 }
8677 }
8678
/* Swap one object out without using the I/O threads.
 * Thin wrapper around vmSwapOneObject(0); same return value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8682
/* Queue the swap out of one object using the I/O threads.
 * Thin wrapper around vmSwapOneObject(1); same return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8686
8687 /* Return true if it's safe to swap out objects in a given moment.
8688 * Basically we don't want to swap objects out while there is a BGSAVE
8689 * or a BGAEOREWRITE running in backgroud. */
8690 static int vmCanSwapOut(void) {
8691 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8692 }
8693
8694 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8695 * and was deleted. Otherwise 0 is returned. */
8696 static int deleteIfSwapped(redisDb *db, robj *key) {
8697 dictEntry *de;
8698 robj *foundkey;
8699
8700 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8701 foundkey = dictGetEntryKey(de);
8702 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8703 deleteKey(db,key);
8704 return 1;
8705 }
8706
8707 /* =================== Virtual Memory - Threaded I/O ======================= */
8708
8709 static void freeIOJob(iojob *j) {
8710 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8711 j->type == REDIS_IOJOB_DO_SWAP ||
8712 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8713 decrRefCount(j->val);
8714 /* We don't decrRefCount the j->key field as we did't incremented
8715 * the count creating IO Jobs. This is because the key field here is
8716 * just used as an indentifier and if a key is removed the Job should
8717 * never be touched again. */
8718 zfree(j);
8719 }
8720
8721 /* Every time a thread finished a Job, it writes a byte into the write side
8722 * of an unix pipe in order to "awake" the main thread, and this function
8723 * is called. */
8724 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8725 int mask)
8726 {
8727 char buf[1];
8728 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8729 REDIS_NOTUSED(el);
8730 REDIS_NOTUSED(mask);
8731 REDIS_NOTUSED(privdata);
8732
8733 /* For every byte we read in the read side of the pipe, there is one
8734 * I/O job completed to process. */
8735 while((retval = read(fd,buf,1)) == 1) {
8736 iojob *j;
8737 listNode *ln;
8738 robj *key;
8739 struct dictEntry *de;
8740
8741 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8742
8743 /* Get the processed element (the oldest one) */
8744 lockThreadedIO();
8745 assert(listLength(server.io_processed) != 0);
8746 if (toprocess == -1) {
8747 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8748 if (toprocess <= 0) toprocess = 1;
8749 }
8750 ln = listFirst(server.io_processed);
8751 j = ln->value;
8752 listDelNode(server.io_processed,ln);
8753 unlockThreadedIO();
8754 /* If this job is marked as canceled, just ignore it */
8755 if (j->canceled) {
8756 freeIOJob(j);
8757 continue;
8758 }
8759 /* Post process it in the main thread, as there are things we
8760 * can do just here to avoid race conditions and/or invasive locks */
8761 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8762 de = dictFind(j->db->dict,j->key);
8763 assert(de != NULL);
8764 key = dictGetEntryKey(de);
8765 if (j->type == REDIS_IOJOB_LOAD) {
8766 redisDb *db;
8767
8768 /* Key loaded, bring it at home */
8769 key->storage = REDIS_VM_MEMORY;
8770 key->vm.atime = server.unixtime;
8771 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8772 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8773 (unsigned char*) key->ptr);
8774 server.vm_stats_swapped_objects--;
8775 server.vm_stats_swapins++;
8776 dictGetEntryVal(de) = j->val;
8777 incrRefCount(j->val);
8778 db = j->db;
8779 freeIOJob(j);
8780 /* Handle clients waiting for this key to be loaded. */
8781 handleClientsBlockedOnSwappedKey(db,key);
8782 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8783 /* Now we know the amount of pages required to swap this object.
8784 * Let's find some space for it, and queue this task again
8785 * rebranded as REDIS_IOJOB_DO_SWAP. */
8786 if (!vmCanSwapOut() ||
8787 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8788 {
8789 /* Ooops... no space or we can't swap as there is
8790 * a fork()ed Redis trying to save stuff on disk. */
8791 freeIOJob(j);
8792 key->storage = REDIS_VM_MEMORY; /* undo operation */
8793 } else {
8794 /* Note that we need to mark this pages as used now,
8795 * if the job will be canceled, we'll mark them as freed
8796 * again. */
8797 vmMarkPagesUsed(j->page,j->pages);
8798 j->type = REDIS_IOJOB_DO_SWAP;
8799 lockThreadedIO();
8800 queueIOJob(j);
8801 unlockThreadedIO();
8802 }
8803 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8804 robj *val;
8805
8806 /* Key swapped. We can finally free some memory. */
8807 if (key->storage != REDIS_VM_SWAPPING) {
8808 printf("key->storage: %d\n",key->storage);
8809 printf("key->name: %s\n",(char*)key->ptr);
8810 printf("key->refcount: %d\n",key->refcount);
8811 printf("val: %p\n",(void*)j->val);
8812 printf("val->type: %d\n",j->val->type);
8813 printf("val->ptr: %s\n",(char*)j->val->ptr);
8814 }
8815 redisAssert(key->storage == REDIS_VM_SWAPPING);
8816 val = dictGetEntryVal(de);
8817 key->vm.page = j->page;
8818 key->vm.usedpages = j->pages;
8819 key->storage = REDIS_VM_SWAPPED;
8820 key->vtype = j->val->type;
8821 decrRefCount(val); /* Deallocate the object from memory. */
8822 dictGetEntryVal(de) = NULL;
8823 redisLog(REDIS_DEBUG,
8824 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8825 (unsigned char*) key->ptr,
8826 (unsigned long long) j->page, (unsigned long long) j->pages);
8827 server.vm_stats_swapped_objects++;
8828 server.vm_stats_swapouts++;
8829 freeIOJob(j);
8830 /* Put a few more swap requests in queue if we are still
8831 * out of memory */
8832 if (trytoswap && vmCanSwapOut() &&
8833 zmalloc_used_memory() > server.vm_max_memory)
8834 {
8835 int more = 1;
8836 while(more) {
8837 lockThreadedIO();
8838 more = listLength(server.io_newjobs) <
8839 (unsigned) server.vm_max_threads;
8840 unlockThreadedIO();
8841 /* Don't waste CPU time if swappable objects are rare. */
8842 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8843 trytoswap = 0;
8844 break;
8845 }
8846 }
8847 }
8848 }
8849 processed++;
8850 if (processed == toprocess) return;
8851 }
8852 if (retval < 0 && errno != EAGAIN) {
8853 redisLog(REDIS_WARNING,
8854 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8855 strerror(errno));
8856 }
8857 }
8858
8859 static void lockThreadedIO(void) {
8860 pthread_mutex_lock(&server.io_mutex);
8861 }
8862
8863 static void unlockThreadedIO(void) {
8864 pthread_mutex_unlock(&server.io_mutex);
8865 }
8866
8867 /* Remove the specified object from the threaded I/O queue if still not
8868 * processed, otherwise make sure to flag it as canceled. */
8869 static void vmCancelThreadedIOJob(robj *o) {
8870 list *lists[3] = {
8871 server.io_newjobs, /* 0 */
8872 server.io_processing, /* 1 */
8873 server.io_processed /* 2 */
8874 };
8875 int i;
8876
8877 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8878 again:
8879 lockThreadedIO();
8880 /* Search for a matching key in one of the queues */
8881 for (i = 0; i < 3; i++) {
8882 listNode *ln;
8883 listIter li;
8884
8885 listRewind(lists[i],&li);
8886 while ((ln = listNext(&li)) != NULL) {
8887 iojob *job = ln->value;
8888
8889 if (job->canceled) continue; /* Skip this, already canceled. */
8890 if (job->key == o) {
8891 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8892 (void*)job, (char*)o->ptr, job->type, i);
8893 /* Mark the pages as free since the swap didn't happened
8894 * or happened but is now discarded. */
8895 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8896 vmMarkPagesFree(job->page,job->pages);
8897 /* Cancel the job. It depends on the list the job is
8898 * living in. */
8899 switch(i) {
8900 case 0: /* io_newjobs */
8901 /* If the job was yet not processed the best thing to do
8902 * is to remove it from the queue at all */
8903 freeIOJob(job);
8904 listDelNode(lists[i],ln);
8905 break;
8906 case 1: /* io_processing */
8907 /* Oh Shi- the thread is messing with the Job:
8908 *
8909 * Probably it's accessing the object if this is a
8910 * PREPARE_SWAP or DO_SWAP job.
8911 * If it's a LOAD job it may be reading from disk and
8912 * if we don't wait for the job to terminate before to
8913 * cancel it, maybe in a few microseconds data can be
8914 * corrupted in this pages. So the short story is:
8915 *
8916 * Better to wait for the job to move into the
8917 * next queue (processed)... */
8918
8919 /* We try again and again until the job is completed. */
8920 unlockThreadedIO();
8921 /* But let's wait some time for the I/O thread
8922 * to finish with this job. After all this condition
8923 * should be very rare. */
8924 usleep(1);
8925 goto again;
8926 case 2: /* io_processed */
8927 /* The job was already processed, that's easy...
8928 * just mark it as canceled so that we'll ignore it
8929 * when processing completed jobs. */
8930 job->canceled = 1;
8931 break;
8932 }
8933 /* Finally we have to adjust the storage type of the object
8934 * in order to "UNDO" the operaiton. */
8935 if (o->storage == REDIS_VM_LOADING)
8936 o->storage = REDIS_VM_SWAPPED;
8937 else if (o->storage == REDIS_VM_SWAPPING)
8938 o->storage = REDIS_VM_MEMORY;
8939 unlockThreadedIO();
8940 return;
8941 }
8942 }
8943 }
8944 unlockThreadedIO();
8945 assert(1 != 1); /* We should never reach this */
8946 }
8947
8948 static void *IOThreadEntryPoint(void *arg) {
8949 iojob *j;
8950 listNode *ln;
8951 REDIS_NOTUSED(arg);
8952
8953 pthread_detach(pthread_self());
8954 while(1) {
8955 /* Get a new job to process */
8956 lockThreadedIO();
8957 if (listLength(server.io_newjobs) == 0) {
8958 /* No new jobs in queue, exit. */
8959 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8960 (long) pthread_self());
8961 server.io_active_threads--;
8962 unlockThreadedIO();
8963 return NULL;
8964 }
8965 ln = listFirst(server.io_newjobs);
8966 j = ln->value;
8967 listDelNode(server.io_newjobs,ln);
8968 /* Add the job in the processing queue */
8969 j->thread = pthread_self();
8970 listAddNodeTail(server.io_processing,j);
8971 ln = listLast(server.io_processing); /* We use ln later to remove it */
8972 unlockThreadedIO();
8973 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8974 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8975
8976 /* Process the Job */
8977 if (j->type == REDIS_IOJOB_LOAD) {
8978 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8979 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8980 FILE *fp = fopen("/dev/null","w+");
8981 j->pages = rdbSavedObjectPages(j->val,fp);
8982 fclose(fp);
8983 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8984 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8985 j->canceled = 1;
8986 }
8987
8988 /* Done: insert the job into the processed queue */
8989 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8990 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8991 lockThreadedIO();
8992 listDelNode(server.io_processing,ln);
8993 listAddNodeTail(server.io_processed,j);
8994 unlockThreadedIO();
8995
8996 /* Signal the main thread there is new stuff to process */
8997 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8998 }
8999 return NULL; /* never reached */
9000 }
9001
9002 static void spawnIOThread(void) {
9003 pthread_t thread;
9004 sigset_t mask, omask;
9005 int err;
9006
9007 sigemptyset(&mask);
9008 sigaddset(&mask,SIGCHLD);
9009 sigaddset(&mask,SIGHUP);
9010 sigaddset(&mask,SIGPIPE);
9011 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9012 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9013 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9014 strerror(err));
9015 usleep(1000000);
9016 }
9017 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9018 server.io_active_threads++;
9019 }
9020
9021 /* We need to wait for the last thread to exit before we are able to
9022 * fork() in order to BGSAVE or BGREWRITEAOF. */
9023 static void waitEmptyIOJobsQueue(void) {
9024 while(1) {
9025 int io_processed_len;
9026
9027 lockThreadedIO();
9028 if (listLength(server.io_newjobs) == 0 &&
9029 listLength(server.io_processing) == 0 &&
9030 server.io_active_threads == 0)
9031 {
9032 unlockThreadedIO();
9033 return;
9034 }
9035 /* While waiting for empty jobs queue condition we post-process some
9036 * finshed job, as I/O threads may be hanging trying to write against
9037 * the io_ready_pipe_write FD but there are so much pending jobs that
9038 * it's blocking. */
9039 io_processed_len = listLength(server.io_processed);
9040 unlockThreadedIO();
9041 if (io_processed_len) {
9042 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9043 usleep(1000); /* 1 millisecond */
9044 } else {
9045 usleep(10000); /* 10 milliseconds */
9046 }
9047 }
9048 }
9049
9050 static void vmReopenSwapFile(void) {
9051 /* Note: we don't close the old one as we are in the child process
9052 * and don't want to mess at all with the original file object. */
9053 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9054 if (server.vm_fp == NULL) {
9055 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9056 server.vm_swap_file);
9057 _exit(1);
9058 }
9059 server.vm_fd = fileno(server.vm_fp);
9060 }
9061
9062 /* This function must be called while with threaded IO locked */
9063 static void queueIOJob(iojob *j) {
9064 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9065 (void*)j, j->type, (char*)j->key->ptr);
9066 listAddNodeTail(server.io_newjobs,j);
9067 if (server.io_active_threads < server.vm_max_threads)
9068 spawnIOThread();
9069 }
9070
9071 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9072 iojob *j;
9073
9074 assert(key->storage == REDIS_VM_MEMORY);
9075 assert(key->refcount == 1);
9076
9077 j = zmalloc(sizeof(*j));
9078 j->type = REDIS_IOJOB_PREPARE_SWAP;
9079 j->db = db;
9080 j->key = key;
9081 j->val = val;
9082 incrRefCount(val);
9083 j->canceled = 0;
9084 j->thread = (pthread_t) -1;
9085 key->storage = REDIS_VM_SWAPPING;
9086
9087 lockThreadedIO();
9088 queueIOJob(j);
9089 unlockThreadedIO();
9090 return REDIS_OK;
9091 }
9092
9093 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9094
9095 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9096 * If there is not already a job loading the key, it is craeted.
9097 * The key is added to the io_keys list in the client structure, and also
9098 * in the hash table mapping swapped keys to waiting clients, that is,
9099 * server.io_waited_keys. */
9100 static int waitForSwappedKey(redisClient *c, robj *key) {
9101 struct dictEntry *de;
9102 robj *o;
9103 list *l;
9104
9105 /* If the key does not exist or is already in RAM we don't need to
9106 * block the client at all. */
9107 de = dictFind(c->db->dict,key);
9108 if (de == NULL) return 0;
9109 o = dictGetEntryKey(de);
9110 if (o->storage == REDIS_VM_MEMORY) {
9111 return 0;
9112 } else if (o->storage == REDIS_VM_SWAPPING) {
9113 /* We were swapping the key, undo it! */
9114 vmCancelThreadedIOJob(o);
9115 return 0;
9116 }
9117
9118 /* OK: the key is either swapped, or being loaded just now. */
9119
9120 /* Add the key to the list of keys this client is waiting for.
9121 * This maps clients to keys they are waiting for. */
9122 listAddNodeTail(c->io_keys,key);
9123 incrRefCount(key);
9124
9125 /* Add the client to the swapped keys => clients waiting map. */
9126 de = dictFind(c->db->io_keys,key);
9127 if (de == NULL) {
9128 int retval;
9129
9130 /* For every key we take a list of clients blocked for it */
9131 l = listCreate();
9132 retval = dictAdd(c->db->io_keys,key,l);
9133 incrRefCount(key);
9134 assert(retval == DICT_OK);
9135 } else {
9136 l = dictGetEntryVal(de);
9137 }
9138 listAddNodeTail(l,c);
9139
9140 /* Are we already loading the key from disk? If not create a job */
9141 if (o->storage == REDIS_VM_SWAPPED) {
9142 iojob *j;
9143
9144 o->storage = REDIS_VM_LOADING;
9145 j = zmalloc(sizeof(*j));
9146 j->type = REDIS_IOJOB_LOAD;
9147 j->db = c->db;
9148 j->key = o;
9149 j->key->vtype = o->vtype;
9150 j->page = o->vm.page;
9151 j->val = NULL;
9152 j->canceled = 0;
9153 j->thread = (pthread_t) -1;
9154 lockThreadedIO();
9155 queueIOJob(j);
9156 unlockThreadedIO();
9157 }
9158 return 1;
9159 }
9160
9161 /* Preload keys needed for the ZUNION and ZINTER commands. */
9162 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9163 int i, num;
9164 num = atoi(c->argv[2]->ptr);
9165 for (i = 0; i < num; i++) {
9166 waitForSwappedKey(c,c->argv[3+i]);
9167 }
9168 }
9169
9170 /* Is this client attempting to run a command against swapped keys?
9171 * If so, block it ASAP, load the keys in background, then resume it.
9172 *
9173 * The important idea about this function is that it can fail! If keys will
9174 * still be swapped when the client is resumed, this key lookups will
9175 * just block loading keys from disk. In practical terms this should only
9176 * happen with SORT BY command or if there is a bug in this function.
9177 *
9178 * Return 1 if the client is marked as blocked, 0 if the client can
9179 * continue as the keys it is going to access appear to be in memory. */
9180 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9181 int j, last;
9182
9183 if (cmd->vm_preload_proc != NULL) {
9184 cmd->vm_preload_proc(c);
9185 } else {
9186 if (cmd->vm_firstkey == 0) return 0;
9187 last = cmd->vm_lastkey;
9188 if (last < 0) last = c->argc+last;
9189 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9190 waitForSwappedKey(c,c->argv[j]);
9191 }
9192
9193 /* If the client was blocked for at least one key, mark it as blocked. */
9194 if (listLength(c->io_keys)) {
9195 c->flags |= REDIS_IO_WAIT;
9196 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9197 server.vm_blocked_clients++;
9198 return 1;
9199 } else {
9200 return 0;
9201 }
9202 }
9203
9204 /* Remove the 'key' from the list of blocked keys for a given client.
9205 *
9206 * The function returns 1 when there are no longer blocking keys after
9207 * the current one was removed (and the client can be unblocked). */
9208 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9209 list *l;
9210 listNode *ln;
9211 listIter li;
9212 struct dictEntry *de;
9213
9214 /* Remove the key from the list of keys this client is waiting for. */
9215 listRewind(c->io_keys,&li);
9216 while ((ln = listNext(&li)) != NULL) {
9217 if (compareStringObjects(ln->value,key) == 0) {
9218 listDelNode(c->io_keys,ln);
9219 break;
9220 }
9221 }
9222 assert(ln != NULL);
9223
9224 /* Remove the client form the key => waiting clients map. */
9225 de = dictFind(c->db->io_keys,key);
9226 assert(de != NULL);
9227 l = dictGetEntryVal(de);
9228 ln = listSearchKey(l,c);
9229 assert(ln != NULL);
9230 listDelNode(l,ln);
9231 if (listLength(l) == 0)
9232 dictDelete(c->db->io_keys,key);
9233
9234 return listLength(c->io_keys) == 0;
9235 }
9236
9237 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9238 struct dictEntry *de;
9239 list *l;
9240 listNode *ln;
9241 int len;
9242
9243 de = dictFind(db->io_keys,key);
9244 if (!de) return;
9245
9246 l = dictGetEntryVal(de);
9247 len = listLength(l);
9248 /* Note: we can't use something like while(listLength(l)) as the list
9249 * can be freed by the calling function when we remove the last element. */
9250 while (len--) {
9251 ln = listFirst(l);
9252 redisClient *c = ln->value;
9253
9254 if (dontWaitForSwappedKey(c,key)) {
9255 /* Put the client in the list of clients ready to go as we
9256 * loaded all the keys about it. */
9257 listAddNodeTail(server.io_ready_clients,c);
9258 }
9259 }
9260 }
9261
9262 /* =========================== Remote Configuration ========================= */
9263
9264 static void configSetCommand(redisClient *c) {
9265 robj *o = getDecodedObject(c->argv[3]);
9266 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9267 zfree(server.dbfilename);
9268 server.dbfilename = zstrdup(o->ptr);
9269 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9270 zfree(server.requirepass);
9271 server.requirepass = zstrdup(o->ptr);
9272 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9273 zfree(server.masterauth);
9274 server.masterauth = zstrdup(o->ptr);
9275 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9276 server.maxmemory = strtoll(o->ptr, NULL, 10);
9277 } else {
9278 addReplySds(c,sdscatprintf(sdsempty(),
9279 "-ERR not supported CONFIG parameter %s\r\n",
9280 (char*)c->argv[2]->ptr));
9281 decrRefCount(o);
9282 return;
9283 }
9284 decrRefCount(o);
9285 addReply(c,shared.ok);
9286 }
9287
9288 static void configGetCommand(redisClient *c) {
9289 robj *o = getDecodedObject(c->argv[2]);
9290 robj *lenobj = createObject(REDIS_STRING,NULL);
9291 char *pattern = o->ptr;
9292 int matches = 0;
9293
9294 addReply(c,lenobj);
9295 decrRefCount(lenobj);
9296
9297 if (stringmatch(pattern,"dbfilename",0)) {
9298 addReplyBulkCString(c,"dbfilename");
9299 addReplyBulkCString(c,server.dbfilename);
9300 matches++;
9301 }
9302 if (stringmatch(pattern,"requirepass",0)) {
9303 addReplyBulkCString(c,"requirepass");
9304 addReplyBulkCString(c,server.requirepass);
9305 matches++;
9306 }
9307 if (stringmatch(pattern,"masterauth",0)) {
9308 addReplyBulkCString(c,"masterauth");
9309 addReplyBulkCString(c,server.masterauth);
9310 matches++;
9311 }
9312 if (stringmatch(pattern,"maxmemory",0)) {
9313 char buf[128];
9314
9315 snprintf(buf,128,"%llu\n",server.maxmemory);
9316 addReplyBulkCString(c,"maxmemory");
9317 addReplyBulkCString(c,buf);
9318 matches++;
9319 }
9320 decrRefCount(o);
9321 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9322 }
9323
9324 static void configCommand(redisClient *c) {
9325 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9326 if (c->argc != 4) goto badarity;
9327 configSetCommand(c);
9328 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9329 if (c->argc != 3) goto badarity;
9330 configGetCommand(c);
9331 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9332 if (c->argc != 2) goto badarity;
9333 server.stat_numcommands = 0;
9334 server.stat_numconnections = 0;
9335 server.stat_expiredkeys = 0;
9336 server.stat_starttime = time(NULL);
9337 addReply(c,shared.ok);
9338 } else {
9339 addReplySds(c,sdscatprintf(sdsempty(),
9340 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9341 }
9342 return;
9343
9344 badarity:
9345 addReplySds(c,sdscatprintf(sdsempty(),
9346 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9347 (char*) c->argv[1]->ptr));
9348 }
9349
9350 /* =========================== Pubsub implementation ======================== */
9351
9352 static void freePubsubPattern(void *p) {
9353 pubsubPattern *pat = p;
9354
9355 decrRefCount(pat->pattern);
9356 zfree(pat);
9357 }
9358
9359 static int listMatchPubsubPattern(void *a, void *b) {
9360 pubsubPattern *pa = a, *pb = b;
9361
9362 return (pa->client == pb->client) &&
9363 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9364 }
9365
9366 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9367 * 0 if the client was already subscribed to that channel. */
9368 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9369 struct dictEntry *de;
9370 list *clients = NULL;
9371 int retval = 0;
9372
9373 /* Add the channel to the client -> channels hash table */
9374 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9375 retval = 1;
9376 incrRefCount(channel);
9377 /* Add the client to the channel -> list of clients hash table */
9378 de = dictFind(server.pubsub_channels,channel);
9379 if (de == NULL) {
9380 clients = listCreate();
9381 dictAdd(server.pubsub_channels,channel,clients);
9382 incrRefCount(channel);
9383 } else {
9384 clients = dictGetEntryVal(de);
9385 }
9386 listAddNodeTail(clients,c);
9387 }
9388 /* Notify the client */
9389 addReply(c,shared.mbulk3);
9390 addReply(c,shared.subscribebulk);
9391 addReplyBulk(c,channel);
9392 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9393 return retval;
9394 }
9395
9396 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9397 * 0 if the client was not subscribed to the specified channel. */
9398 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9399 struct dictEntry *de;
9400 list *clients;
9401 listNode *ln;
9402 int retval = 0;
9403
9404 /* Remove the channel from the client -> channels hash table */
9405 incrRefCount(channel); /* channel may be just a pointer to the same object
9406 we have in the hash tables. Protect it... */
9407 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9408 retval = 1;
9409 /* Remove the client from the channel -> clients list hash table */
9410 de = dictFind(server.pubsub_channels,channel);
9411 assert(de != NULL);
9412 clients = dictGetEntryVal(de);
9413 ln = listSearchKey(clients,c);
9414 assert(ln != NULL);
9415 listDelNode(clients,ln);
9416 if (listLength(clients) == 0) {
9417 /* Free the list and associated hash entry at all if this was
9418 * the latest client, so that it will be possible to abuse
9419 * Redis PUBSUB creating millions of channels. */
9420 dictDelete(server.pubsub_channels,channel);
9421 }
9422 }
9423 /* Notify the client */
9424 if (notify) {
9425 addReply(c,shared.mbulk3);
9426 addReply(c,shared.unsubscribebulk);
9427 addReplyBulk(c,channel);
9428 addReplyLong(c,dictSize(c->pubsub_channels)+
9429 listLength(c->pubsub_patterns));
9430
9431 }
9432 decrRefCount(channel); /* it is finally safe to release it */
9433 return retval;
9434 }
9435
9436 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9437 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9438 int retval = 0;
9439
9440 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9441 retval = 1;
9442 pubsubPattern *pat;
9443 listAddNodeTail(c->pubsub_patterns,pattern);
9444 incrRefCount(pattern);
9445 pat = zmalloc(sizeof(*pat));
9446 pat->pattern = getDecodedObject(pattern);
9447 pat->client = c;
9448 listAddNodeTail(server.pubsub_patterns,pat);
9449 }
9450 /* Notify the client */
9451 addReply(c,shared.mbulk3);
9452 addReply(c,shared.psubscribebulk);
9453 addReplyBulk(c,pattern);
9454 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9455 return retval;
9456 }
9457
9458 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9459 * 0 if the client was not subscribed to the specified channel. */
9460 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9461 listNode *ln;
9462 pubsubPattern pat;
9463 int retval = 0;
9464
9465 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9466 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9467 retval = 1;
9468 listDelNode(c->pubsub_patterns,ln);
9469 pat.client = c;
9470 pat.pattern = pattern;
9471 ln = listSearchKey(server.pubsub_patterns,&pat);
9472 listDelNode(server.pubsub_patterns,ln);
9473 }
9474 /* Notify the client */
9475 if (notify) {
9476 addReply(c,shared.mbulk3);
9477 addReply(c,shared.punsubscribebulk);
9478 addReplyBulk(c,pattern);
9479 addReplyLong(c,dictSize(c->pubsub_channels)+
9480 listLength(c->pubsub_patterns));
9481 }
9482 decrRefCount(pattern);
9483 return retval;
9484 }
9485
9486 /* Unsubscribe from all the channels. Return the number of channels the
9487 * client was subscribed from. */
9488 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9489 dictIterator *di = dictGetIterator(c->pubsub_channels);
9490 dictEntry *de;
9491 int count = 0;
9492
9493 while((de = dictNext(di)) != NULL) {
9494 robj *channel = dictGetEntryKey(de);
9495
9496 count += pubsubUnsubscribeChannel(c,channel,notify);
9497 }
9498 dictReleaseIterator(di);
9499 return count;
9500 }
9501
9502 /* Unsubscribe from all the patterns. Return the number of patterns the
9503 * client was subscribed from. */
9504 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9505 listNode *ln;
9506 listIter li;
9507 int count = 0;
9508
9509 listRewind(c->pubsub_patterns,&li);
9510 while ((ln = listNext(&li)) != NULL) {
9511 robj *pattern = ln->value;
9512
9513 count += pubsubUnsubscribePattern(c,pattern,notify);
9514 }
9515 return count;
9516 }
9517
9518 /* Publish a message */
9519 static int pubsubPublishMessage(robj *channel, robj *message) {
9520 int receivers = 0;
9521 struct dictEntry *de;
9522 listNode *ln;
9523 listIter li;
9524
9525 /* Send to clients listening for that channel */
9526 de = dictFind(server.pubsub_channels,channel);
9527 if (de) {
9528 list *list = dictGetEntryVal(de);
9529 listNode *ln;
9530 listIter li;
9531
9532 listRewind(list,&li);
9533 while ((ln = listNext(&li)) != NULL) {
9534 redisClient *c = ln->value;
9535
9536 addReply(c,shared.mbulk3);
9537 addReply(c,shared.messagebulk);
9538 addReplyBulk(c,channel);
9539 addReplyBulk(c,message);
9540 receivers++;
9541 }
9542 }
9543 /* Send to clients listening to matching channels */
9544 if (listLength(server.pubsub_patterns)) {
9545 listRewind(server.pubsub_patterns,&li);
9546 channel = getDecodedObject(channel);
9547 while ((ln = listNext(&li)) != NULL) {
9548 pubsubPattern *pat = ln->value;
9549
9550 if (stringmatchlen((char*)pat->pattern->ptr,
9551 sdslen(pat->pattern->ptr),
9552 (char*)channel->ptr,
9553 sdslen(channel->ptr),0)) {
9554 addReply(pat->client,shared.mbulk3);
9555 addReply(pat->client,shared.messagebulk);
9556 addReplyBulk(pat->client,channel);
9557 addReplyBulk(pat->client,message);
9558 receivers++;
9559 }
9560 }
9561 decrRefCount(channel);
9562 }
9563 return receivers;
9564 }
9565
9566 static void subscribeCommand(redisClient *c) {
9567 int j;
9568
9569 for (j = 1; j < c->argc; j++)
9570 pubsubSubscribeChannel(c,c->argv[j]);
9571 }
9572
9573 static void unsubscribeCommand(redisClient *c) {
9574 if (c->argc == 1) {
9575 pubsubUnsubscribeAllChannels(c,1);
9576 return;
9577 } else {
9578 int j;
9579
9580 for (j = 1; j < c->argc; j++)
9581 pubsubUnsubscribeChannel(c,c->argv[j],1);
9582 }
9583 }
9584
9585 static void psubscribeCommand(redisClient *c) {
9586 int j;
9587
9588 for (j = 1; j < c->argc; j++)
9589 pubsubSubscribePattern(c,c->argv[j]);
9590 }
9591
9592 static void punsubscribeCommand(redisClient *c) {
9593 if (c->argc == 1) {
9594 pubsubUnsubscribeAllPatterns(c,1);
9595 return;
9596 } else {
9597 int j;
9598
9599 for (j = 1; j < c->argc; j++)
9600 pubsubUnsubscribePattern(c,c->argv[j],1);
9601 }
9602 }
9603
9604 static void publishCommand(redisClient *c) {
9605 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9606 addReplyLong(c,receivers);
9607 }
9608
9609 /* ================================= Debugging ============================== */
9610
9611 static void debugCommand(redisClient *c) {
9612 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9613 *((char*)-1) = 'x';
9614 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9615 if (rdbSave(server.dbfilename) != REDIS_OK) {
9616 addReply(c,shared.err);
9617 return;
9618 }
9619 emptyDb();
9620 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9621 addReply(c,shared.err);
9622 return;
9623 }
9624 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9625 addReply(c,shared.ok);
9626 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9627 emptyDb();
9628 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9629 addReply(c,shared.err);
9630 return;
9631 }
9632 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9633 addReply(c,shared.ok);
9634 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9635 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9636 robj *key, *val;
9637
9638 if (!de) {
9639 addReply(c,shared.nokeyerr);
9640 return;
9641 }
9642 key = dictGetEntryKey(de);
9643 val = dictGetEntryVal(de);
9644 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9645 key->storage == REDIS_VM_SWAPPING)) {
9646 char *strenc;
9647 char buf[128];
9648
9649 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9650 strenc = strencoding[val->encoding];
9651 } else {
9652 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9653 strenc = buf;
9654 }
9655 addReplySds(c,sdscatprintf(sdsempty(),
9656 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9657 "encoding:%s serializedlength:%lld\r\n",
9658 (void*)key, key->refcount, (void*)val, val->refcount,
9659 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9660 } else {
9661 addReplySds(c,sdscatprintf(sdsempty(),
9662 "+Key at:%p refcount:%d, value swapped at: page %llu "
9663 "using %llu pages\r\n",
9664 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9665 (unsigned long long) key->vm.usedpages));
9666 }
9667 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9668 lookupKeyRead(c->db,c->argv[2]);
9669 addReply(c,shared.ok);
9670 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9671 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9672 robj *key, *val;
9673
9674 if (!server.vm_enabled) {
9675 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9676 return;
9677 }
9678 if (!de) {
9679 addReply(c,shared.nokeyerr);
9680 return;
9681 }
9682 key = dictGetEntryKey(de);
9683 val = dictGetEntryVal(de);
9684 /* If the key is shared we want to create a copy */
9685 if (key->refcount > 1) {
9686 robj *newkey = dupStringObject(key);
9687 decrRefCount(key);
9688 key = dictGetEntryKey(de) = newkey;
9689 }
9690 /* Swap it */
9691 if (key->storage != REDIS_VM_MEMORY) {
9692 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9693 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9694 dictGetEntryVal(de) = NULL;
9695 addReply(c,shared.ok);
9696 } else {
9697 addReply(c,shared.err);
9698 }
9699 } else {
9700 addReplySds(c,sdsnew(
9701 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9702 }
9703 }
9704
9705 static void _redisAssert(char *estr, char *file, int line) {
9706 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9707 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9708 #ifdef HAVE_BACKTRACE
9709 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9710 *((char*)-1) = 'x';
9711 #endif
9712 }
9713
9714 /* =================================== Main! ================================ */
9715
9716 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file can't be opened or if no
 * line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;
    if (fgets(line,sizeof(line),procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
9730
9731 void linuxOvercommitMemoryWarning(void) {
9732 if (linuxOvercommitMemoryValue() == 0) {
9733 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9734 }
9735 }
9736 #endif /* __linux__ */
9737
9738 static void daemonize(void) {
9739 int fd;
9740 FILE *fp;
9741
9742 if (fork() != 0) exit(0); /* parent exits */
9743 setsid(); /* create a new session */
9744
9745 /* Every output goes to /dev/null. If Redis is daemonized but
9746 * the 'logfile' is set to 'stdout' in the configuration file
9747 * it will not log at all. */
9748 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9749 dup2(fd, STDIN_FILENO);
9750 dup2(fd, STDOUT_FILENO);
9751 dup2(fd, STDERR_FILENO);
9752 if (fd > STDERR_FILENO) close(fd);
9753 }
9754 /* Try to write the pid file */
9755 fp = fopen(server.pidfile,"w");
9756 if (fp) {
9757 fprintf(fp,"%d\n",getpid());
9758 fclose(fp);
9759 }
9760 }
9761
9762 static void version() {
9763 printf("Redis server version %s\n", REDIS_VERSION);
9764 exit(0);
9765 }
9766
/* Print the command line help on standard error and terminate the
 * process with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9772
9773 int main(int argc, char **argv) {
9774 time_t start;
9775
9776 initServerConfig();
9777 if (argc == 2) {
9778 if (strcmp(argv[1], "-v") == 0 ||
9779 strcmp(argv[1], "--version") == 0) version();
9780 if (strcmp(argv[1], "--help") == 0) usage();
9781 resetServerSaveParams();
9782 loadServerConfig(argv[1]);
9783 } else if ((argc > 2)) {
9784 usage();
9785 } else {
9786 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9787 }
9788 if (server.daemonize) daemonize();
9789 initServer();
9790 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9791 #ifdef __linux__
9792 linuxOvercommitMemoryWarning();
9793 #endif
9794 start = time(NULL);
9795 if (server.appendonly) {
9796 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9797 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9798 } else {
9799 if (rdbLoad(server.dbfilename) == REDIS_OK)
9800 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9801 }
9802 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9803 aeSetBeforeSleepProc(server.el,beforeSleep);
9804 aeMain(server.el);
9805 aeDeleteEventLoop(server.el);
9806 return 0;
9807 }
9808
9809 /* ============================= Backtrace support ========================= */
9810
9811 #ifdef HAVE_BACKTRACE
9812 static char *findFuncName(void *pointer, unsigned long *offset);
9813
9814 static void *getMcontextEip(ucontext_t *uc) {
9815 #if defined(__FreeBSD__)
9816 return (void*) uc->uc_mcontext.mc_eip;
9817 #elif defined(__dietlibc__)
9818 return (void*) uc->uc_mcontext.eip;
9819 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9820 #if __x86_64__
9821 return (void*) uc->uc_mcontext->__ss.__rip;
9822 #else
9823 return (void*) uc->uc_mcontext->__ss.__eip;
9824 #endif
9825 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9826 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9827 return (void*) uc->uc_mcontext->__ss.__rip;
9828 #else
9829 return (void*) uc->uc_mcontext->__ss.__eip;
9830 #endif
9831 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9832 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9833 #elif defined(__ia64__) /* Linux IA64 */
9834 return (void*) uc->uc_mcontext.sc_ip;
9835 #else
9836 return NULL;
9837 #endif
9838 }
9839
9840 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9841 void *trace[100];
9842 char **messages = NULL;
9843 int i, trace_size = 0;
9844 unsigned long offset=0;
9845 ucontext_t *uc = (ucontext_t*) secret;
9846 sds infostring;
9847 REDIS_NOTUSED(info);
9848
9849 redisLog(REDIS_WARNING,
9850 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9851 infostring = genRedisInfoString();
9852 redisLog(REDIS_WARNING, "%s",infostring);
9853 /* It's not safe to sdsfree() the returned string under memory
9854 * corruption conditions. Let it leak as we are going to abort */
9855
9856 trace_size = backtrace(trace, 100);
9857 /* overwrite sigaction with caller's address */
9858 if (getMcontextEip(uc) != NULL) {
9859 trace[1] = getMcontextEip(uc);
9860 }
9861 messages = backtrace_symbols(trace, trace_size);
9862
9863 for (i=1; i<trace_size; ++i) {
9864 char *fn = findFuncName(trace[i], &offset), *p;
9865
9866 p = strchr(messages[i],'+');
9867 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9868 redisLog(REDIS_WARNING,"%s", messages[i]);
9869 } else {
9870 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9871 }
9872 }
9873 /* free(messages); Don't call free() with possibly corrupted memory. */
9874 _exit(0);
9875 }
9876
9877 static void setupSigSegvAction(void) {
9878 struct sigaction act;
9879
9880 sigemptyset (&act.sa_mask);
9881 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9882 * is used. Otherwise, sa_handler is used */
9883 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9884 act.sa_sigaction = segvHandler;
9885 sigaction (SIGSEGV, &act, NULL);
9886 sigaction (SIGBUS, &act, NULL);
9887 sigaction (SIGFPE, &act, NULL);
9888 sigaction (SIGILL, &act, NULL);
9889 sigaction (SIGBUS, &act, NULL);
9890 return;
9891 }
9892
9893 #include "staticsymbols.h"
9894 /* This function try to convert a pointer into a function name. It's used in
9895 * oreder to provide a backtrace under segmentation fault that's able to
9896 * display functions declared as static (otherwise the backtrace is useless). */
9897 static char *findFuncName(void *pointer, unsigned long *offset){
9898 int i, ret = -1;
9899 unsigned long off, minoff = 0;
9900
9901 /* Try to match against the Symbol with the smallest offset */
9902 for (i=0; symsTable[i].pointer; i++) {
9903 unsigned long lp = (unsigned long) pointer;
9904
9905 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9906 off=lp-symsTable[i].pointer;
9907 if (ret < 0 || off < minoff) {
9908 minoff=off;
9909 ret=i;
9910 }
9911 }
9912 }
9913 if (ret == -1) return NULL;
9914 *offset = minoff;
9915 return symsTable[ret].name;
9916 }
9917 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* Nothing to do */
}
9920 #endif /* HAVE_BACKTRACE */
9921
9922
9923
9924 /* The End */
9925
9926
9927