]> git.saurik.com Git - redis.git/blob - redis.c
dict.c fixed to play well with enabling/disabling of the hash table
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130
/* Printable names of the object encodings. The entries are listed in the
 * same order as the REDIS_ENCODING_* values (0..3) defined above. */
static char *strencoding[] = {
    "raw",      /* REDIS_ENCODING_RAW    (0) */
    "int",      /* REDIS_ENCODING_INT    (1) */
    "zipmap",   /* REDIS_ENCODING_ZIPMAP (2) */
    "hashtable" /* REDIS_ENCODING_HT     (3) */
};
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* Our own assert. When the expression _e is true the macro does nothing;
 * when it is false, _redisAssert() is called with the expression text, the
 * source file name and the line number, and then the process is terminated
 * with _exit(1). It is defined this way because we can print the stack
 * trace. */
static void _redisAssert(char *estr, char *file, int line);
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* Virtual Memory bookkeeping for an object: where it is stored on disk and
 * when it was last accessed.
 *
 * NOTE(review): the trailing declarator below also defines a file-scope
 * variable called 'vm' of this type. It looks unintentional; confirm that
 * nothing in the file uses it before dropping it. */
struct redisObjectVM {
    off_t page;      /* the page at which the object is stored on disk */
    off_t usedpages; /* number of pages used on disk */
    time_t atime;    /* last access time */
} vm;
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type;
257 unsigned char encoding;
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 *   _var  an lvalue of type robj (the object itself, not a pointer to it)
 *   _ptr  the value stored into the object's 'ptr' field
 *
 * The object gets refcount 1, type REDIS_STRING and encoding
 * REDIS_ENCODING_RAW. The 'storage' field is set to REDIS_VM_MEMORY only
 * when server.vm_enabled is true; no other field is written.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion intentionally does NOT end with a semicolon: the caller
 * provides it, so the macro behaves as one statement even when used as the
 * body of an if/else. Both arguments are parenthesized so that arbitrary
 * expressions (for instance an array element) can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* A (seconds, changes) pair. The server keeps an array of these in its
 * 'saveparams' field.
 * NOTE(review): presumably one "save <seconds> <changes>" rule of the
 * configuration; confirm against the code that fills the array. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c);
452 struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* Associates a name with an unsigned long value.
 * NOTE(review): presumably a function name and its address, used for
 * symbol lookup when reporting crashes; confirm against the users. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
/* The skiplist itself: pointers to the header and tail nodes, plus a length
 * and a level counter. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects */
507
508 struct sharedObjectsStruct {
509 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
510 *colon, *nullbulk, *nullmultibulk, *queued,
511 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
512 *outofrangeerr, *plus,
513 *select0, *select1, *select2, *select3, *select4,
514 *select5, *select6, *select7, *select8, *select9,
515 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
516 *psubscribebulk, *punsubscribebulk;
517 } shared;
518
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
524
525 /* VM threaded I/O request message */
526 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
527 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
528 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
529 typedef struct iojob {
530 int type; /* Request type, REDIS_IOJOB_* */
531 redisDb *db;/* Redis database */
532 robj *key; /* This I/O request is about swapping this key */
533 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
534 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
535 off_t page; /* Swap page where to read/write the object */
536 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
537 int canceled; /* True if this command was canceled by blocking side of VM */
538 pthread_t thread; /* ID of the thread processing this entry */
539 } iojob;
540
541 /*================================ Prototypes =============================== */
542
543 static void freeStringObject(robj *o);
544 static void freeListObject(robj *o);
545 static void freeSetObject(robj *o);
546 static void decrRefCount(void *o);
547 static robj *createObject(int type, void *ptr);
548 static void freeClient(redisClient *c);
549 static int rdbLoad(char *filename);
550 static void addReply(redisClient *c, robj *obj);
551 static void addReplySds(redisClient *c, sds s);
552 static void incrRefCount(robj *o);
553 static int rdbSaveBackground(char *filename);
554 static robj *createStringObject(char *ptr, size_t len);
555 static robj *dupStringObject(robj *o);
556 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
557 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
558 static int syncWithMaster(void);
559 static int tryObjectEncoding(robj *o);
560 static robj *getDecodedObject(robj *o);
561 static int removeExpire(redisDb *db, robj *key);
562 static int expireIfNeeded(redisDb *db, robj *key);
563 static int deleteIfVolatile(redisDb *db, robj *key);
564 static int deleteIfSwapped(redisDb *db, robj *key);
565 static int deleteKey(redisDb *db, robj *key);
566 static time_t getExpire(redisDb *db, robj *key);
567 static int setExpire(redisDb *db, robj *key, time_t when);
568 static void updateSlavesWaitingBgsave(int bgsaveerr);
569 static void freeMemoryIfNeeded(void);
570 static int processCommand(redisClient *c);
571 static void setupSigSegvAction(void);
572 static void rdbRemoveTempFile(pid_t childpid);
573 static void aofRemoveTempFile(pid_t childpid);
574 static size_t stringObjectLen(robj *o);
575 static void processInputBuffer(redisClient *c);
576 static zskiplist *zslCreate(void);
577 static void zslFree(zskiplist *zsl);
578 static void zslInsert(zskiplist *zsl, double score, robj *obj);
579 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
580 static void initClientMultiState(redisClient *c);
581 static void freeClientMultiState(redisClient *c);
582 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
583 static void unblockClientWaitingData(redisClient *c);
584 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
585 static void vmInit(void);
586 static void vmMarkPagesFree(off_t page, off_t count);
587 static robj *vmLoadObject(robj *key);
588 static robj *vmPreviewObject(robj *key);
589 static int vmSwapOneObjectBlocking(void);
590 static int vmSwapOneObjectThreaded(void);
591 static int vmCanSwapOut(void);
592 static int tryFreeOneObjectFromFreelist(void);
593 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
594 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmCancelThreadedIOJob(robj *o);
596 static void lockThreadedIO(void);
597 static void unlockThreadedIO(void);
598 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
599 static void freeIOJob(iojob *j);
600 static void queueIOJob(iojob *j);
601 static int vmWriteObjectOnSwap(robj *o, off_t page);
602 static robj *vmReadObjectFromSwap(off_t page, int type);
603 static void waitEmptyIOJobsQueue(void);
604 static void vmReopenSwapFile(void);
605 static int vmFreePage(off_t page);
606 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
607 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
608 static int dontWaitForSwappedKey(redisClient *c, robj *key);
609 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
610 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
611 static struct redisCommand *lookupCommand(char *name);
612 static void call(redisClient *c, struct redisCommand *cmd);
613 static void resetClient(redisClient *c);
614 static void convertToRealHash(robj *o);
615 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
616 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
617 static void freePubsubPattern(void *p);
618 static int listMatchPubsubPattern(void *a, void *b);
619 static int compareStringObjects(robj *a, robj *b);
620 static void usage();
621
622 static void authCommand(redisClient *c);
623 static void pingCommand(redisClient *c);
624 static void echoCommand(redisClient *c);
625 static void setCommand(redisClient *c);
626 static void setnxCommand(redisClient *c);
627 static void getCommand(redisClient *c);
628 static void delCommand(redisClient *c);
629 static void existsCommand(redisClient *c);
630 static void incrCommand(redisClient *c);
631 static void decrCommand(redisClient *c);
632 static void incrbyCommand(redisClient *c);
633 static void decrbyCommand(redisClient *c);
634 static void selectCommand(redisClient *c);
635 static void randomkeyCommand(redisClient *c);
636 static void keysCommand(redisClient *c);
637 static void dbsizeCommand(redisClient *c);
638 static void lastsaveCommand(redisClient *c);
639 static void saveCommand(redisClient *c);
640 static void bgsaveCommand(redisClient *c);
641 static void bgrewriteaofCommand(redisClient *c);
642 static void shutdownCommand(redisClient *c);
643 static void moveCommand(redisClient *c);
644 static void renameCommand(redisClient *c);
645 static void renamenxCommand(redisClient *c);
646 static void lpushCommand(redisClient *c);
647 static void rpushCommand(redisClient *c);
648 static void lpopCommand(redisClient *c);
649 static void rpopCommand(redisClient *c);
650 static void llenCommand(redisClient *c);
651 static void lindexCommand(redisClient *c);
652 static void lrangeCommand(redisClient *c);
653 static void ltrimCommand(redisClient *c);
654 static void typeCommand(redisClient *c);
655 static void lsetCommand(redisClient *c);
656 static void saddCommand(redisClient *c);
657 static void sremCommand(redisClient *c);
658 static void smoveCommand(redisClient *c);
659 static void sismemberCommand(redisClient *c);
660 static void scardCommand(redisClient *c);
661 static void spopCommand(redisClient *c);
662 static void srandmemberCommand(redisClient *c);
663 static void sinterCommand(redisClient *c);
664 static void sinterstoreCommand(redisClient *c);
665 static void sunionCommand(redisClient *c);
666 static void sunionstoreCommand(redisClient *c);
667 static void sdiffCommand(redisClient *c);
668 static void sdiffstoreCommand(redisClient *c);
669 static void syncCommand(redisClient *c);
670 static void flushdbCommand(redisClient *c);
671 static void flushallCommand(redisClient *c);
672 static void sortCommand(redisClient *c);
673 static void lremCommand(redisClient *c);
674 static void rpoplpushcommand(redisClient *c);
675 static void infoCommand(redisClient *c);
676 static void mgetCommand(redisClient *c);
677 static void monitorCommand(redisClient *c);
678 static void expireCommand(redisClient *c);
679 static void expireatCommand(redisClient *c);
680 static void getsetCommand(redisClient *c);
681 static void ttlCommand(redisClient *c);
682 static void slaveofCommand(redisClient *c);
683 static void debugCommand(redisClient *c);
684 static void msetCommand(redisClient *c);
685 static void msetnxCommand(redisClient *c);
686 static void zaddCommand(redisClient *c);
687 static void zincrbyCommand(redisClient *c);
688 static void zrangeCommand(redisClient *c);
689 static void zrangebyscoreCommand(redisClient *c);
690 static void zcountCommand(redisClient *c);
691 static void zrevrangeCommand(redisClient *c);
692 static void zcardCommand(redisClient *c);
693 static void zremCommand(redisClient *c);
694 static void zscoreCommand(redisClient *c);
695 static void zremrangebyscoreCommand(redisClient *c);
696 static void multiCommand(redisClient *c);
697 static void execCommand(redisClient *c);
698 static void discardCommand(redisClient *c);
699 static void blpopCommand(redisClient *c);
700 static void brpopCommand(redisClient *c);
701 static void appendCommand(redisClient *c);
702 static void substrCommand(redisClient *c);
703 static void zrankCommand(redisClient *c);
704 static void zrevrankCommand(redisClient *c);
705 static void hsetCommand(redisClient *c);
706 static void hgetCommand(redisClient *c);
707 static void hdelCommand(redisClient *c);
708 static void hlenCommand(redisClient *c);
709 static void zremrangebyrankCommand(redisClient *c);
710 static void zunionCommand(redisClient *c);
711 static void zinterCommand(redisClient *c);
712 static void hkeysCommand(redisClient *c);
713 static void hvalsCommand(redisClient *c);
714 static void hgetallCommand(redisClient *c);
715 static void hexistsCommand(redisClient *c);
716 static void configCommand(redisClient *c);
717 static void hincrbyCommand(redisClient *c);
718 static void subscribeCommand(redisClient *c);
719 static void unsubscribeCommand(redisClient *c);
720 static void psubscribeCommand(redisClient *c);
721 static void punsubscribeCommand(redisClient *c);
722 static void publishCommand(redisClient *c);
723
724 /*================================= Globals ================================= */
725
726 /* Global vars */
727 static struct redisServer server; /* server global state */
728 static struct redisCommand cmdTable[] = {
729 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
730 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
731 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
732 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
733 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
735 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
737 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
738 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
739 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
742 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
748 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
751 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
752 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
753 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
754 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
755 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
756 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
760 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
761 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
762 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
763 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
764 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
765 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
769 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
772 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
773 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
779 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
780 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
781 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
785 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
794 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
795 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
796 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
807 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
815 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
820 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
826 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
831 {NULL,NULL,0,0,NULL,0,0,0}
832 };
833
834 /*============================ Utility functions ============================ */
835
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax: '*' (any run of characters, possibly empty), '?' (exactly
 * one character), '[...]' character classes with '^' negation, 'a-z' ranges
 * and backslash escapes, and '\x' to match 'x' literally.
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             non-zero to compare case-insensitively.
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer is read past its stated length, so they do not need to be
 * NUL-terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse consecutive stars; only peek while inside the buffer. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star matches everything left */
            /* Try to match the rest of the pattern at every suffix. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match = 0;

            /* A class always consumes one character: nothing to consume
             * means no match (and string[0] must not be read). */
            if (stringLen == 0)
                return 0;

            pattern++;
            patternLen--;
            negate = (patternLen > 0 && pattern[0] == '^');
            if (negate) {
                pattern++;
                patternLen--;
            }
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped member: compare the next byte literally. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* <ctype.h> needs values in unsigned char range. */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern byte, in
             * which case it is matched as a literal backslash. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* a literal needs a character to match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing stars may remain. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    return (patternLen == 0 && stringLen == 0) ? 1 : 0;
}
958
/* Convenience wrapper around stringmatchlen() for NUL-terminated C strings:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
962
963 static void redisLog(int level, const char *fmt, ...) {
964 va_list ap;
965 FILE *fp;
966
967 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
968 if (!fp) return;
969
970 va_start(ap, fmt);
971 if (level >= server.verbosity) {
972 char *c = ".-*#";
973 char buf[64];
974 time_t now;
975
976 now = time(NULL);
977 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
978 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
979 vfprintf(fp, fmt, ap);
980 fprintf(fp,"\n");
981 fflush(fp);
982 }
983 va_end(ap);
984
985 if (server.logfile) fclose(fp);
986 }
987
988 /*====================== Hash table type implementation ==================== */
989
990 /* This is a hash table type that uses the SDS dynamic strings library as
991 * keys and redis objects as values (objects can hold SDS strings,
992 * lists, sets). */
993
/* Value destructor that simply releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
999
1000 static void dictListDestructor(void *privdata, void *val)
1001 {
1002 DICT_NOTUSED(privdata);
1003 listRelease((list*)val);
1004 }
1005
1006 static int sdsDictKeyCompare(void *privdata, const void *key1,
1007 const void *key2)
1008 {
1009 int l1,l2;
1010 DICT_NOTUSED(privdata);
1011
1012 l1 = sdslen((sds)key1);
1013 l2 = sdslen((sds)key2);
1014 if (l1 != l2) return 0;
1015 return memcmp(key1, key2, l1) == 0;
1016 }
1017
/* Destructor for keys/values that are redis objects: drops one reference
 * with decrRefCount(). A NULL pointer is accepted and ignored, because the
 * values of swapped out keys are set to NULL. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
1025
1026 static int dictObjKeyCompare(void *privdata, const void *key1,
1027 const void *key2)
1028 {
1029 const robj *o1 = key1, *o2 = key2;
1030 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1031 }
1032
1033 static unsigned int dictObjHash(const void *key) {
1034 const robj *o = key;
1035 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1036 }
1037
1038 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1039 const void *key2)
1040 {
1041 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1042 int cmp;
1043
1044 if (o1->encoding == REDIS_ENCODING_INT &&
1045 o2->encoding == REDIS_ENCODING_INT &&
1046 o1->ptr == o2->ptr) return 1;
1047
1048 o1 = getDecodedObject(o1);
1049 o2 = getDecodedObject(o2);
1050 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1051 decrRefCount(o1);
1052 decrRefCount(o2);
1053 return cmp;
1054 }
1055
1056 static unsigned int dictEncObjHash(const void *key) {
1057 robj *o = (robj*) key;
1058
1059 if (o->encoding == REDIS_ENCODING_RAW) {
1060 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1061 } else {
1062 if (o->encoding == REDIS_ENCODING_INT) {
1063 char buf[32];
1064 int len;
1065
1066 len = snprintf(buf,32,"%ld",(long)o->ptr);
1067 return dictGenHashFunction((unsigned char*)buf, len);
1068 } else {
1069 unsigned int hash;
1070
1071 o = getDecodedObject(o);
1072 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1073 decrRefCount(o);
1074 return hash;
1075 }
1076 }
1077 }
1078
1079 /* Sets type and expires */
1080 static dictType setDictType = {
1081 dictEncObjHash, /* hash function */
1082 NULL, /* key dup */
1083 NULL, /* val dup */
1084 dictEncObjKeyCompare, /* key compare */
1085 dictRedisObjectDestructor, /* key destructor */
1086 NULL /* val destructor */
1087 };
1088
1089 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1090 static dictType zsetDictType = {
1091 dictEncObjHash, /* hash function */
1092 NULL, /* key dup */
1093 NULL, /* val dup */
1094 dictEncObjKeyCompare, /* key compare */
1095 dictRedisObjectDestructor, /* key destructor */
1096 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1097 };
1098
1099 /* Db->dict */
1100 static dictType dbDictType = {
1101 dictObjHash, /* hash function */
1102 NULL, /* key dup */
1103 NULL, /* val dup */
1104 dictObjKeyCompare, /* key compare */
1105 dictRedisObjectDestructor, /* key destructor */
1106 dictRedisObjectDestructor /* val destructor */
1107 };
1108
1109 /* Db->expires */
1110 static dictType keyptrDictType = {
1111 dictObjHash, /* hash function */
1112 NULL, /* key dup */
1113 NULL, /* val dup */
1114 dictObjKeyCompare, /* key compare */
1115 dictRedisObjectDestructor, /* key destructor */
1116 NULL /* val destructor */
1117 };
1118
1119 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1120 static dictType hashDictType = {
1121 dictEncObjHash, /* hash function */
1122 NULL, /* key dup */
1123 NULL, /* val dup */
1124 dictEncObjKeyCompare, /* key compare */
1125 dictRedisObjectDestructor, /* key destructor */
1126 dictRedisObjectDestructor /* val destructor */
1127 };
1128
1129 /* Keylist hash table type has unencoded redis objects as keys and
1130 * lists as values. It's used for blocking operations (BLPOP) and to
1131 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1132 static dictType keylistDictType = {
1133 dictObjHash, /* hash function */
1134 NULL, /* key dup */
1135 NULL, /* val dup */
1136 dictObjKeyCompare, /* key compare */
1137 dictRedisObjectDestructor, /* key destructor */
1138 dictListDestructor /* val destructor */
1139 };
1140
1141 static void version();
1142
1143 /* ========================= Random utility functions ======================= */
1144
1145 /* Redis generally does not try to recover from out of memory conditions
1146 * when allocating objects or strings, it is not clear if it will be possible
1147 * to report this condition to the client since the networking layer itself
1148 * is based on heap allocation for send buffers, so we simply abort.
1149 * At least the code will be simpler to read... */
1150 static void oom(const char *msg) {
1151 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1152 sleep(1);
1153 abort();
1154 }
1155
1156 /* ====================== Redis server networking stuff ===================== */
1157 static void closeTimedoutClients(void) {
1158 redisClient *c;
1159 listNode *ln;
1160 time_t now = time(NULL);
1161 listIter li;
1162
1163 listRewind(server.clients,&li);
1164 while ((ln = listNext(&li)) != NULL) {
1165 c = listNodeValue(ln);
1166 if (server.maxidletime &&
1167 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1168 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1169 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1170 listLength(c->pubsub_patterns) == 0 &&
1171 (now - c->lastinteraction > server.maxidletime))
1172 {
1173 redisLog(REDIS_VERBOSE,"Closing idle client");
1174 freeClient(c);
1175 } else if (c->flags & REDIS_BLOCKED) {
1176 if (c->blockingto != 0 && c->blockingto < now) {
1177 addReply(c,shared.nullmultibulk);
1178 unblockClientWaitingData(c);
1179 }
1180 }
1181 }
1182 }
1183
1184 static int htNeedsResize(dict *dict) {
1185 long long size, used;
1186
1187 size = dictSlots(dict);
1188 used = dictSize(dict);
1189 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1190 (used*100/size < REDIS_HT_MINFILL));
1191 }
1192
1193 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1194 * we resize the hash table to save memory */
1195 static void tryResizeHashTables(void) {
1196 int j;
1197
1198 for (j = 0; j < server.dbnum; j++) {
1199 if (htNeedsResize(server.db[j].dict)) {
1200 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1201 dictResize(server.db[j].dict);
1202 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1203 }
1204 if (htNeedsResize(server.db[j].expires))
1205 dictResize(server.db[j].expires);
1206 }
1207 }
1208
1209 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1210 void backgroundSaveDoneHandler(int statloc) {
1211 int exitcode = WEXITSTATUS(statloc);
1212 int bysignal = WIFSIGNALED(statloc);
1213
1214 if (!bysignal && exitcode == 0) {
1215 redisLog(REDIS_NOTICE,
1216 "Background saving terminated with success");
1217 server.dirty = 0;
1218 server.lastsave = time(NULL);
1219 } else if (!bysignal && exitcode != 0) {
1220 redisLog(REDIS_WARNING, "Background saving error");
1221 } else {
1222 redisLog(REDIS_WARNING,
1223 "Background saving terminated by signal %d", WTERMSIG(statloc));
1224 rdbRemoveTempFile(server.bgsavechildpid);
1225 }
1226 server.bgsavechildpid = -1;
1227 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1228 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1229 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1230 }
1231
1232 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1233 * Handle this. */
1234 void backgroundRewriteDoneHandler(int statloc) {
1235 int exitcode = WEXITSTATUS(statloc);
1236 int bysignal = WIFSIGNALED(statloc);
1237
1238 if (!bysignal && exitcode == 0) {
1239 int fd;
1240 char tmpfile[256];
1241
1242 redisLog(REDIS_NOTICE,
1243 "Background append only file rewriting terminated with success");
1244 /* Now it's time to flush the differences accumulated by the parent */
1245 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1246 fd = open(tmpfile,O_WRONLY|O_APPEND);
1247 if (fd == -1) {
1248 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1249 goto cleanup;
1250 }
1251 /* Flush our data... */
1252 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1253 (signed) sdslen(server.bgrewritebuf)) {
1254 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1255 close(fd);
1256 goto cleanup;
1257 }
1258 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1259 /* Now our work is to rename the temp file into the stable file. And
1260 * switch the file descriptor used by the server for append only. */
1261 if (rename(tmpfile,server.appendfilename) == -1) {
1262 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1263 close(fd);
1264 goto cleanup;
1265 }
1266 /* Mission completed... almost */
1267 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1268 if (server.appendfd != -1) {
1269 /* If append only is actually enabled... */
1270 close(server.appendfd);
1271 server.appendfd = fd;
1272 fsync(fd);
1273 server.appendseldb = -1; /* Make sure it will issue SELECT */
1274 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1275 } else {
1276 /* If append only is disabled we just generate a dump in this
1277 * format. Why not? */
1278 close(fd);
1279 }
1280 } else if (!bysignal && exitcode != 0) {
1281 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1282 } else {
1283 redisLog(REDIS_WARNING,
1284 "Background append only file rewriting terminated by signal %d",
1285 WTERMSIG(statloc));
1286 }
1287 cleanup:
1288 sdsfree(server.bgrewritebuf);
1289 server.bgrewritebuf = sdsempty();
1290 aofRemoveTempFile(server.bgrewritechildpid);
1291 server.bgrewritechildpid = -1;
1292 }
1293
1294 /* This function is called once a background process of some kind terminates,
1295 * as we want to avoid resizing the hash tables when there is a child in order
1296 * to play well with copy-on-write (otherwise when a resize happens lots of
1297 * memory pages are copied). The goal of this function is to update the ability
1298 * for dict.c to resize the hash tables according to whether or not we have
1299 * running children. */
1300 static void updateDictResizePolicy(void) {
1301 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1302 dictEnableResize();
1303 else
1304 dictDisableResize();
1305 }
1306
1307 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1308 int j, loops = server.cronloops++;
1309 REDIS_NOTUSED(eventLoop);
1310 REDIS_NOTUSED(id);
1311 REDIS_NOTUSED(clientData);
1312
1313 /* We take a cached value of the unix time in the global state because
1314 * with virtual memory and aging there is to store the current time
1315 * in objects at every object access, and accuracy is not needed.
1316 * To access a global var is faster than calling time(NULL) */
1317 server.unixtime = time(NULL);
1318
1319 /* Show some info about non-empty databases */
1320 for (j = 0; j < server.dbnum; j++) {
1321 long long size, used, vkeys;
1322
1323 size = dictSlots(server.db[j].dict);
1324 used = dictSize(server.db[j].dict);
1325 vkeys = dictSize(server.db[j].expires);
1326 if (!(loops % 50) && (used || vkeys)) {
1327 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1328 /* dictPrintStats(server.dict); */
1329 }
1330 }
1331
1332 /* We don't want to resize the hash tables while a bacground saving
1333 * is in progress: the saving child is created using fork() that is
1334 * implemented with a copy-on-write semantic in most modern systems, so
1335 * if we resize the HT while there is the saving child at work actually
1336 * a lot of memory movements in the parent will cause a lot of pages
1337 * copied. */
1338 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1339 !(loops % 10))
1340 {
1341 tryResizeHashTables();
1342 }
1343
1344 /* Show information about connected clients */
1345 if (!(loops % 50)) {
1346 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1347 listLength(server.clients)-listLength(server.slaves),
1348 listLength(server.slaves),
1349 zmalloc_used_memory());
1350 }
1351
1352 /* Close connections of timedout clients */
1353 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1354 closeTimedoutClients();
1355
1356 /* Check if a background saving or AOF rewrite in progress terminated */
1357 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1358 int statloc;
1359 pid_t pid;
1360
1361 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1362 if (pid == server.bgsavechildpid) {
1363 backgroundSaveDoneHandler(statloc);
1364 } else {
1365 backgroundRewriteDoneHandler(statloc);
1366 }
1367 updateDictResizePolicy();
1368 }
1369 } else {
1370 /* If there is not a background saving in progress check if
1371 * we have to save now */
1372 time_t now = time(NULL);
1373 for (j = 0; j < server.saveparamslen; j++) {
1374 struct saveparam *sp = server.saveparams+j;
1375
1376 if (server.dirty >= sp->changes &&
1377 now-server.lastsave > sp->seconds) {
1378 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1379 sp->changes, sp->seconds);
1380 rdbSaveBackground(server.dbfilename);
1381 break;
1382 }
1383 }
1384 }
1385
1386 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1387 * will use few CPU cycles if there are few expiring keys, otherwise
1388 * it will get more aggressive to avoid that too much memory is used by
1389 * keys that can be removed from the keyspace. */
1390 for (j = 0; j < server.dbnum; j++) {
1391 int expired;
1392 redisDb *db = server.db+j;
1393
1394 /* Continue to expire if at the end of the cycle more than 25%
1395 * of the keys were expired. */
1396 do {
1397 long num = dictSize(db->expires);
1398 time_t now = time(NULL);
1399
1400 expired = 0;
1401 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1402 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1403 while (num--) {
1404 dictEntry *de;
1405 time_t t;
1406
1407 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1408 t = (time_t) dictGetEntryVal(de);
1409 if (now > t) {
1410 deleteKey(db,dictGetEntryKey(de));
1411 expired++;
1412 server.stat_expiredkeys++;
1413 }
1414 }
1415 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1416 }
1417
1418 /* Swap a few keys on disk if we are over the memory limit and VM
1419 * is enbled. Try to free objects from the free list first. */
1420 if (vmCanSwapOut()) {
1421 while (server.vm_enabled && zmalloc_used_memory() >
1422 server.vm_max_memory)
1423 {
1424 int retval;
1425
1426 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1427 retval = (server.vm_max_threads == 0) ?
1428 vmSwapOneObjectBlocking() :
1429 vmSwapOneObjectThreaded();
1430 if (retval == REDIS_ERR && !(loops % 300) &&
1431 zmalloc_used_memory() >
1432 (server.vm_max_memory+server.vm_max_memory/10))
1433 {
1434 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1435 }
1436 /* Note that when using threade I/O we free just one object,
1437 * because anyway when the I/O thread in charge to swap this
1438 * object out will finish, the handler of completed jobs
1439 * will try to swap more objects if we are still out of memory. */
1440 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1441 }
1442 }
1443
1444 /* Check if we should connect to a MASTER */
1445 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1446 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1447 if (syncWithMaster() == REDIS_OK) {
1448 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1449 }
1450 }
1451 return 100;
1452 }
1453
1454 /* This function gets called every time Redis is entering the
1455 * main loop of the event driven library, that is, before to sleep
1456 * for ready file descriptors. */
1457 static void beforeSleep(struct aeEventLoop *eventLoop) {
1458 REDIS_NOTUSED(eventLoop);
1459
1460 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1461 listIter li;
1462 listNode *ln;
1463
1464 listRewind(server.io_ready_clients,&li);
1465 while((ln = listNext(&li))) {
1466 redisClient *c = ln->value;
1467 struct redisCommand *cmd;
1468
1469 /* Resume the client. */
1470 listDelNode(server.io_ready_clients,ln);
1471 c->flags &= (~REDIS_IO_WAIT);
1472 server.vm_blocked_clients--;
1473 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1474 readQueryFromClient, c);
1475 cmd = lookupCommand(c->argv[0]->ptr);
1476 assert(cmd != NULL);
1477 call(c,cmd);
1478 resetClient(c);
1479 /* There may be more data to process in the input buffer. */
1480 if (c->querybuf && sdslen(c->querybuf) > 0)
1481 processInputBuffer(c);
1482 }
1483 }
1484 }
1485
1486 static void createSharedObjects(void) {
1487 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1488 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1489 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1490 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1491 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1492 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1493 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1494 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1495 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1496 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1497 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1498 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1499 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1500 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1501 "-ERR no such key\r\n"));
1502 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1503 "-ERR syntax error\r\n"));
1504 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1505 "-ERR source and destination objects are the same\r\n"));
1506 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1507 "-ERR index out of range\r\n"));
1508 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1509 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1510 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1511 shared.select0 = createStringObject("select 0\r\n",10);
1512 shared.select1 = createStringObject("select 1\r\n",10);
1513 shared.select2 = createStringObject("select 2\r\n",10);
1514 shared.select3 = createStringObject("select 3\r\n",10);
1515 shared.select4 = createStringObject("select 4\r\n",10);
1516 shared.select5 = createStringObject("select 5\r\n",10);
1517 shared.select6 = createStringObject("select 6\r\n",10);
1518 shared.select7 = createStringObject("select 7\r\n",10);
1519 shared.select8 = createStringObject("select 8\r\n",10);
1520 shared.select9 = createStringObject("select 9\r\n",10);
1521 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1522 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1523 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1524 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1525 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1526 shared.mbulk3 = createStringObject("*3\r\n",4);
1527 }
1528
1529 static void appendServerSaveParams(time_t seconds, int changes) {
1530 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1531 server.saveparams[server.saveparamslen].seconds = seconds;
1532 server.saveparams[server.saveparamslen].changes = changes;
1533 server.saveparamslen++;
1534 }
1535
1536 static void resetServerSaveParams() {
1537 zfree(server.saveparams);
1538 server.saveparams = NULL;
1539 server.saveparamslen = 0;
1540 }
1541
1542 static void initServerConfig() {
1543 server.dbnum = REDIS_DEFAULT_DBNUM;
1544 server.port = REDIS_SERVERPORT;
1545 server.verbosity = REDIS_VERBOSE;
1546 server.maxidletime = REDIS_MAXIDLETIME;
1547 server.saveparams = NULL;
1548 server.logfile = NULL; /* NULL = log on standard output */
1549 server.bindaddr = NULL;
1550 server.glueoutputbuf = 1;
1551 server.daemonize = 0;
1552 server.appendonly = 0;
1553 server.appendfsync = APPENDFSYNC_ALWAYS;
1554 server.lastfsync = time(NULL);
1555 server.appendfd = -1;
1556 server.appendseldb = -1; /* Make sure the first time will not match */
1557 server.pidfile = zstrdup("/var/run/redis.pid");
1558 server.dbfilename = zstrdup("dump.rdb");
1559 server.appendfilename = zstrdup("appendonly.aof");
1560 server.requirepass = NULL;
1561 server.shareobjects = 0;
1562 server.rdbcompression = 1;
1563 server.maxclients = 0;
1564 server.blpop_blocked_clients = 0;
1565 server.maxmemory = 0;
1566 server.vm_enabled = 0;
1567 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1568 server.vm_page_size = 256; /* 256 bytes per page */
1569 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1570 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1571 server.vm_max_threads = 4;
1572 server.vm_blocked_clients = 0;
1573 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1574 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1575
1576 resetServerSaveParams();
1577
1578 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1579 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1580 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1581 /* Replication related */
1582 server.isslave = 0;
1583 server.masterauth = NULL;
1584 server.masterhost = NULL;
1585 server.masterport = 6379;
1586 server.master = NULL;
1587 server.replstate = REDIS_REPL_NONE;
1588
1589 /* Double constants initialization */
1590 R_Zero = 0.0;
1591 R_PosInf = 1.0/R_Zero;
1592 R_NegInf = -1.0/R_Zero;
1593 R_Nan = R_Zero/R_Zero;
1594 }
1595
1596 static void initServer() {
1597 int j;
1598
1599 signal(SIGHUP, SIG_IGN);
1600 signal(SIGPIPE, SIG_IGN);
1601 setupSigSegvAction();
1602
1603 server.devnull = fopen("/dev/null","w");
1604 if (server.devnull == NULL) {
1605 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1606 exit(1);
1607 }
1608 server.clients = listCreate();
1609 server.slaves = listCreate();
1610 server.monitors = listCreate();
1611 server.objfreelist = listCreate();
1612 createSharedObjects();
1613 server.el = aeCreateEventLoop();
1614 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1615 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1616 if (server.fd == -1) {
1617 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1618 exit(1);
1619 }
1620 for (j = 0; j < server.dbnum; j++) {
1621 server.db[j].dict = dictCreate(&dbDictType,NULL);
1622 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1623 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1624 if (server.vm_enabled)
1625 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1626 server.db[j].id = j;
1627 }
1628 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1629 server.pubsub_patterns = listCreate();
1630 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1631 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1632 server.cronloops = 0;
1633 server.bgsavechildpid = -1;
1634 server.bgrewritechildpid = -1;
1635 server.bgrewritebuf = sdsempty();
1636 server.lastsave = time(NULL);
1637 server.dirty = 0;
1638 server.stat_numcommands = 0;
1639 server.stat_numconnections = 0;
1640 server.stat_expiredkeys = 0;
1641 server.stat_starttime = time(NULL);
1642 server.unixtime = time(NULL);
1643 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1644 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1645 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1646
1647 if (server.appendonly) {
1648 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1649 if (server.appendfd == -1) {
1650 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1651 strerror(errno));
1652 exit(1);
1653 }
1654 }
1655
1656 if (server.vm_enabled) vmInit();
1657 }
1658
1659 /* Empty the whole database */
1660 static long long emptyDb() {
1661 int j;
1662 long long removed = 0;
1663
1664 for (j = 0; j < server.dbnum; j++) {
1665 removed += dictSize(server.db[j].dict);
1666 dictEmpty(server.db[j].dict);
1667 dictEmpty(server.db[j].expires);
1668 }
1669 return removed;
1670 }
1671
/* Translate a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Any other string returns -1 so the caller can report a bad argument. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1677
1678 /* I agree, this is a very rudimental way to load a configuration...
1679 will improve later if the config gets more complex */
1680 static void loadServerConfig(char *filename) {
1681 FILE *fp;
1682 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1683 int linenum = 0;
1684 sds line = NULL;
1685 char *errormsg = "Fatal error, can't open config file '%s'";
1686 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1687 sprintf(errorbuf, errormsg, filename);
1688
1689 if (filename[0] == '-' && filename[1] == '\0')
1690 fp = stdin;
1691 else {
1692 if ((fp = fopen(filename,"r")) == NULL) {
1693 redisLog(REDIS_WARNING, errorbuf);
1694 exit(1);
1695 }
1696 }
1697
1698 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1699 sds *argv;
1700 int argc, j;
1701
1702 linenum++;
1703 line = sdsnew(buf);
1704 line = sdstrim(line," \t\r\n");
1705
1706 /* Skip comments and blank lines*/
1707 if (line[0] == '#' || line[0] == '\0') {
1708 sdsfree(line);
1709 continue;
1710 }
1711
1712 /* Split into arguments */
1713 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1714 sdstolower(argv[0]);
1715
1716 /* Execute config directives */
1717 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1718 server.maxidletime = atoi(argv[1]);
1719 if (server.maxidletime < 0) {
1720 err = "Invalid timeout value"; goto loaderr;
1721 }
1722 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1723 server.port = atoi(argv[1]);
1724 if (server.port < 1 || server.port > 65535) {
1725 err = "Invalid port"; goto loaderr;
1726 }
1727 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1728 server.bindaddr = zstrdup(argv[1]);
1729 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1730 int seconds = atoi(argv[1]);
1731 int changes = atoi(argv[2]);
1732 if (seconds < 1 || changes < 0) {
1733 err = "Invalid save parameters"; goto loaderr;
1734 }
1735 appendServerSaveParams(seconds,changes);
1736 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1737 if (chdir(argv[1]) == -1) {
1738 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1739 argv[1], strerror(errno));
1740 exit(1);
1741 }
1742 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1743 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1744 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1745 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1746 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1747 else {
1748 err = "Invalid log level. Must be one of debug, notice, warning";
1749 goto loaderr;
1750 }
1751 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1752 FILE *logfp;
1753
1754 server.logfile = zstrdup(argv[1]);
1755 if (!strcasecmp(server.logfile,"stdout")) {
1756 zfree(server.logfile);
1757 server.logfile = NULL;
1758 }
1759 if (server.logfile) {
1760 /* Test if we are able to open the file. The server will not
1761 * be able to abort just for this problem later... */
1762 logfp = fopen(server.logfile,"a");
1763 if (logfp == NULL) {
1764 err = sdscatprintf(sdsempty(),
1765 "Can't open the log file: %s", strerror(errno));
1766 goto loaderr;
1767 }
1768 fclose(logfp);
1769 }
1770 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1771 server.dbnum = atoi(argv[1]);
1772 if (server.dbnum < 1) {
1773 err = "Invalid number of databases"; goto loaderr;
1774 }
1775 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1776 loadServerConfig(argv[1]);
1777 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1778 server.maxclients = atoi(argv[1]);
1779 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1780 server.maxmemory = strtoll(argv[1], NULL, 10);
1781 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1782 server.masterhost = sdsnew(argv[1]);
1783 server.masterport = atoi(argv[2]);
1784 server.replstate = REDIS_REPL_CONNECT;
1785 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1786 server.masterauth = zstrdup(argv[1]);
1787 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1788 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1789 err = "argument must be 'yes' or 'no'"; goto loaderr;
1790 }
1791 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1792 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1793 err = "argument must be 'yes' or 'no'"; goto loaderr;
1794 }
1795 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1796 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1797 err = "argument must be 'yes' or 'no'"; goto loaderr;
1798 }
1799 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1800 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1801 err = "argument must be 'yes' or 'no'"; goto loaderr;
1802 }
1803 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1804 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1805 err = "argument must be 'yes' or 'no'"; goto loaderr;
1806 }
1807 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1808 if (!strcasecmp(argv[1],"no")) {
1809 server.appendfsync = APPENDFSYNC_NO;
1810 } else if (!strcasecmp(argv[1],"always")) {
1811 server.appendfsync = APPENDFSYNC_ALWAYS;
1812 } else if (!strcasecmp(argv[1],"everysec")) {
1813 server.appendfsync = APPENDFSYNC_EVERYSEC;
1814 } else {
1815 err = "argument must be 'no', 'always' or 'everysec'";
1816 goto loaderr;
1817 }
1818 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1819 server.requirepass = zstrdup(argv[1]);
1820 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1821 zfree(server.pidfile);
1822 server.pidfile = zstrdup(argv[1]);
1823 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1824 zfree(server.dbfilename);
1825 server.dbfilename = zstrdup(argv[1]);
1826 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1827 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1828 err = "argument must be 'yes' or 'no'"; goto loaderr;
1829 }
1830 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1831 zfree(server.vm_swap_file);
1832 server.vm_swap_file = zstrdup(argv[1]);
1833 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1834 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1835 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1836 server.vm_page_size = strtoll(argv[1], NULL, 10);
1837 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1838 server.vm_pages = strtoll(argv[1], NULL, 10);
1839 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1840 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1841 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1842 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1843 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1844 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1845 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1846 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1847 } else {
1848 err = "Bad directive or wrong number of arguments"; goto loaderr;
1849 }
1850 for (j = 0; j < argc; j++)
1851 sdsfree(argv[j]);
1852 zfree(argv);
1853 sdsfree(line);
1854 }
1855 if (fp != stdin) fclose(fp);
1856 return;
1857
1858 loaderr:
1859 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1860 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1861 fprintf(stderr, ">>> '%s'\n", line);
1862 fprintf(stderr, "%s\n", err);
1863 exit(1);
1864 }
1865
1866 static void freeClientArgv(redisClient *c) {
1867 int j;
1868
1869 for (j = 0; j < c->argc; j++)
1870 decrRefCount(c->argv[j]);
1871 for (j = 0; j < c->mbargc; j++)
1872 decrRefCount(c->mbargv[j]);
1873 c->argc = 0;
1874 c->mbargc = 0;
1875 }
1876
1877 static void freeClient(redisClient *c) {
1878 listNode *ln;
1879
1880 /* Note that if the client we are freeing is blocked into a blocking
1881 * call, we have to set querybuf to NULL *before* to call
1882 * unblockClientWaitingData() to avoid processInputBuffer() will get
1883 * called. Also it is important to remove the file events after
1884 * this, because this call adds the READABLE event. */
1885 sdsfree(c->querybuf);
1886 c->querybuf = NULL;
1887 if (c->flags & REDIS_BLOCKED)
1888 unblockClientWaitingData(c);
1889
1890 /* Unsubscribe from all the pubsub channels */
1891 pubsubUnsubscribeAllChannels(c,0);
1892 pubsubUnsubscribeAllPatterns(c,0);
1893 dictRelease(c->pubsub_channels);
1894 listRelease(c->pubsub_patterns);
1895 /* Obvious cleanup */
1896 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1897 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1898 listRelease(c->reply);
1899 freeClientArgv(c);
1900 close(c->fd);
1901 /* Remove from the list of clients */
1902 ln = listSearchKey(server.clients,c);
1903 redisAssert(ln != NULL);
1904 listDelNode(server.clients,ln);
1905 /* Remove from the list of clients waiting for swapped keys */
1906 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1907 ln = listSearchKey(server.io_ready_clients,c);
1908 if (ln) {
1909 listDelNode(server.io_ready_clients,ln);
1910 server.vm_blocked_clients--;
1911 }
1912 }
1913 while (server.vm_enabled && listLength(c->io_keys)) {
1914 ln = listFirst(c->io_keys);
1915 dontWaitForSwappedKey(c,ln->value);
1916 }
1917 listRelease(c->io_keys);
1918 /* Master/slave cleanup */
1919 if (c->flags & REDIS_SLAVE) {
1920 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1921 close(c->repldbfd);
1922 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1923 ln = listSearchKey(l,c);
1924 redisAssert(ln != NULL);
1925 listDelNode(l,ln);
1926 }
1927 if (c->flags & REDIS_MASTER) {
1928 server.master = NULL;
1929 server.replstate = REDIS_REPL_CONNECT;
1930 }
1931 /* Release memory */
1932 zfree(c->argv);
1933 zfree(c->mbargv);
1934 freeClientMultiState(c);
1935 zfree(c);
1936 }
1937
1938 #define GLUEREPLY_UP_TO (1024)
1939 static void glueReplyBuffersIfNeeded(redisClient *c) {
1940 int copylen = 0;
1941 char buf[GLUEREPLY_UP_TO];
1942 listNode *ln;
1943 listIter li;
1944 robj *o;
1945
1946 listRewind(c->reply,&li);
1947 while((ln = listNext(&li))) {
1948 int objlen;
1949
1950 o = ln->value;
1951 objlen = sdslen(o->ptr);
1952 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1953 memcpy(buf+copylen,o->ptr,objlen);
1954 copylen += objlen;
1955 listDelNode(c->reply,ln);
1956 } else {
1957 if (copylen == 0) return;
1958 break;
1959 }
1960 }
1961 /* Now the output buffer is empty, add the new single element */
1962 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1963 listAddNodeHead(c->reply,o);
1964 }
1965
1966 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1967 redisClient *c = privdata;
1968 int nwritten = 0, totwritten = 0, objlen;
1969 robj *o;
1970 REDIS_NOTUSED(el);
1971 REDIS_NOTUSED(mask);
1972
1973 /* Use writev() if we have enough buffers to send */
1974 if (!server.glueoutputbuf &&
1975 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1976 !(c->flags & REDIS_MASTER))
1977 {
1978 sendReplyToClientWritev(el, fd, privdata, mask);
1979 return;
1980 }
1981
1982 while(listLength(c->reply)) {
1983 if (server.glueoutputbuf && listLength(c->reply) > 1)
1984 glueReplyBuffersIfNeeded(c);
1985
1986 o = listNodeValue(listFirst(c->reply));
1987 objlen = sdslen(o->ptr);
1988
1989 if (objlen == 0) {
1990 listDelNode(c->reply,listFirst(c->reply));
1991 continue;
1992 }
1993
1994 if (c->flags & REDIS_MASTER) {
1995 /* Don't reply to a master */
1996 nwritten = objlen - c->sentlen;
1997 } else {
1998 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1999 if (nwritten <= 0) break;
2000 }
2001 c->sentlen += nwritten;
2002 totwritten += nwritten;
2003 /* If we fully sent the object on head go to the next one */
2004 if (c->sentlen == objlen) {
2005 listDelNode(c->reply,listFirst(c->reply));
2006 c->sentlen = 0;
2007 }
2008 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2009 * bytes, in a single threaded server it's a good idea to serve
2010 * other clients as well, even if a very large request comes from
2011 * super fast link that is always able to accept data (in real world
2012 * scenario think about 'KEYS *' against the loopback interfae) */
2013 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2014 }
2015 if (nwritten == -1) {
2016 if (errno == EAGAIN) {
2017 nwritten = 0;
2018 } else {
2019 redisLog(REDIS_VERBOSE,
2020 "Error writing to client: %s", strerror(errno));
2021 freeClient(c);
2022 return;
2023 }
2024 }
2025 if (totwritten > 0) c->lastinteraction = time(NULL);
2026 if (listLength(c->reply) == 0) {
2027 c->sentlen = 0;
2028 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2029 }
2030 }
2031
2032 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2033 {
2034 redisClient *c = privdata;
2035 int nwritten = 0, totwritten = 0, objlen, willwrite;
2036 robj *o;
2037 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2038 int offset, ion = 0;
2039 REDIS_NOTUSED(el);
2040 REDIS_NOTUSED(mask);
2041
2042 listNode *node;
2043 while (listLength(c->reply)) {
2044 offset = c->sentlen;
2045 ion = 0;
2046 willwrite = 0;
2047
2048 /* fill-in the iov[] array */
2049 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2050 o = listNodeValue(node);
2051 objlen = sdslen(o->ptr);
2052
2053 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2054 break;
2055
2056 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2057 break; /* no more iovecs */
2058
2059 iov[ion].iov_base = ((char*)o->ptr) + offset;
2060 iov[ion].iov_len = objlen - offset;
2061 willwrite += objlen - offset;
2062 offset = 0; /* just for the first item */
2063 ion++;
2064 }
2065
2066 if(willwrite == 0)
2067 break;
2068
2069 /* write all collected blocks at once */
2070 if((nwritten = writev(fd, iov, ion)) < 0) {
2071 if (errno != EAGAIN) {
2072 redisLog(REDIS_VERBOSE,
2073 "Error writing to client: %s", strerror(errno));
2074 freeClient(c);
2075 return;
2076 }
2077 break;
2078 }
2079
2080 totwritten += nwritten;
2081 offset = c->sentlen;
2082
2083 /* remove written robjs from c->reply */
2084 while (nwritten && listLength(c->reply)) {
2085 o = listNodeValue(listFirst(c->reply));
2086 objlen = sdslen(o->ptr);
2087
2088 if(nwritten >= objlen - offset) {
2089 listDelNode(c->reply, listFirst(c->reply));
2090 nwritten -= objlen - offset;
2091 c->sentlen = 0;
2092 } else {
2093 /* partial write */
2094 c->sentlen += nwritten;
2095 break;
2096 }
2097 offset = 0;
2098 }
2099 }
2100
2101 if (totwritten > 0)
2102 c->lastinteraction = time(NULL);
2103
2104 if (listLength(c->reply) == 0) {
2105 c->sentlen = 0;
2106 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2107 }
2108 }
2109
2110 static struct redisCommand *lookupCommand(char *name) {
2111 int j = 0;
2112 while(cmdTable[j].name != NULL) {
2113 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2114 j++;
2115 }
2116 return NULL;
2117 }
2118
2119 /* resetClient prepare the client to process the next command */
2120 static void resetClient(redisClient *c) {
2121 freeClientArgv(c);
2122 c->bulklen = -1;
2123 c->multibulk = 0;
2124 }
2125
2126 /* Call() is the core of Redis execution of a command */
2127 static void call(redisClient *c, struct redisCommand *cmd) {
2128 long long dirty;
2129
2130 dirty = server.dirty;
2131 cmd->proc(c);
2132 dirty = server.dirty-dirty;
2133
2134 if (server.appendonly && dirty)
2135 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2136 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2137 listLength(server.slaves))
2138 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2139 if (listLength(server.monitors))
2140 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2141 server.stat_numcommands++;
2142 }
2143
2144 /* If this function gets called we already read a whole
2145 * command, argments are in the client argv/argc fields.
2146 * processCommand() execute the command or prepare the
2147 * server for a bulk read from the client.
2148 *
2149 * If 1 is returned the client is still alive and valid and
2150 * and other operations can be performed by the caller. Otherwise
2151 * if 0 is returned the client was destroied (i.e. after QUIT). */
2152 static int processCommand(redisClient *c) {
2153 struct redisCommand *cmd;
2154
2155 /* Free some memory if needed (maxmemory setting) */
2156 if (server.maxmemory) freeMemoryIfNeeded();
2157
2158 /* Handle the multi bulk command type. This is an alternative protocol
2159 * supported by Redis in order to receive commands that are composed of
2160 * multiple binary-safe "bulk" arguments. The latency of processing is
2161 * a bit higher but this allows things like multi-sets, so if this
2162 * protocol is used only for MSET and similar commands this is a big win. */
2163 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2164 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2165 if (c->multibulk <= 0) {
2166 resetClient(c);
2167 return 1;
2168 } else {
2169 decrRefCount(c->argv[c->argc-1]);
2170 c->argc--;
2171 return 1;
2172 }
2173 } else if (c->multibulk) {
2174 if (c->bulklen == -1) {
2175 if (((char*)c->argv[0]->ptr)[0] != '$') {
2176 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2177 resetClient(c);
2178 return 1;
2179 } else {
2180 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2181 decrRefCount(c->argv[0]);
2182 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2183 c->argc--;
2184 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2185 resetClient(c);
2186 return 1;
2187 }
2188 c->argc--;
2189 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2190 return 1;
2191 }
2192 } else {
2193 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2194 c->mbargv[c->mbargc] = c->argv[0];
2195 c->mbargc++;
2196 c->argc--;
2197 c->multibulk--;
2198 if (c->multibulk == 0) {
2199 robj **auxargv;
2200 int auxargc;
2201
2202 /* Here we need to swap the multi-bulk argc/argv with the
2203 * normal argc/argv of the client structure. */
2204 auxargv = c->argv;
2205 c->argv = c->mbargv;
2206 c->mbargv = auxargv;
2207
2208 auxargc = c->argc;
2209 c->argc = c->mbargc;
2210 c->mbargc = auxargc;
2211
2212 /* We need to set bulklen to something different than -1
2213 * in order for the code below to process the command without
2214 * to try to read the last argument of a bulk command as
2215 * a special argument. */
2216 c->bulklen = 0;
2217 /* continue below and process the command */
2218 } else {
2219 c->bulklen = -1;
2220 return 1;
2221 }
2222 }
2223 }
2224 /* -- end of multi bulk commands processing -- */
2225
2226 /* The QUIT command is handled as a special case. Normal command
2227 * procs are unable to close the client connection safely */
2228 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2229 freeClient(c);
2230 return 0;
2231 }
2232
2233 /* Now lookup the command and check ASAP about trivial error conditions
2234 * such wrong arity, bad command name and so forth. */
2235 cmd = lookupCommand(c->argv[0]->ptr);
2236 if (!cmd) {
2237 addReplySds(c,
2238 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2239 (char*)c->argv[0]->ptr));
2240 resetClient(c);
2241 return 1;
2242 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2243 (c->argc < -cmd->arity)) {
2244 addReplySds(c,
2245 sdscatprintf(sdsempty(),
2246 "-ERR wrong number of arguments for '%s' command\r\n",
2247 cmd->name));
2248 resetClient(c);
2249 return 1;
2250 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2251 /* This is a bulk command, we have to read the last argument yet. */
2252 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2253
2254 decrRefCount(c->argv[c->argc-1]);
2255 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2256 c->argc--;
2257 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2258 resetClient(c);
2259 return 1;
2260 }
2261 c->argc--;
2262 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2263 /* It is possible that the bulk read is already in the
2264 * buffer. Check this condition and handle it accordingly.
2265 * This is just a fast path, alternative to call processInputBuffer().
2266 * It's a good idea since the code is small and this condition
2267 * happens most of the times. */
2268 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2269 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2270 c->argc++;
2271 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2272 } else {
2273 /* Otherwise return... there is to read the last argument
2274 * from the socket. */
2275 return 1;
2276 }
2277 }
2278 /* Let's try to encode the bulk object to save space. */
2279 if (cmd->flags & REDIS_CMD_BULK)
2280 tryObjectEncoding(c->argv[c->argc-1]);
2281
2282 /* Check if the user is authenticated */
2283 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2284 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2285 resetClient(c);
2286 return 1;
2287 }
2288
2289 /* Handle the maxmemory directive */
2290 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2291 zmalloc_used_memory() > server.maxmemory)
2292 {
2293 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2294 resetClient(c);
2295 return 1;
2296 }
2297
2298 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2299 if (dictSize(c->pubsub_channels) > 0 &&
2300 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2301 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2302 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2303 resetClient(c);
2304 return 1;
2305 }
2306
2307 /* Exec the command */
2308 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2309 queueMultiCommand(c,cmd);
2310 addReply(c,shared.queued);
2311 } else {
2312 if (server.vm_enabled && server.vm_max_threads > 0 &&
2313 blockClientOnSwappedKeys(cmd,c)) return 1;
2314 call(c,cmd);
2315 }
2316
2317 /* Prepare the client for the next command */
2318 resetClient(c);
2319 return 1;
2320 }
2321
2322 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2323 listNode *ln;
2324 listIter li;
2325 int outc = 0, j;
2326 robj **outv;
2327 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2328 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2329 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2330 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2331 robj *lenobj;
2332
2333 if (argc <= REDIS_STATIC_ARGS) {
2334 outv = static_outv;
2335 } else {
2336 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2337 }
2338
2339 lenobj = createObject(REDIS_STRING,
2340 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2341 lenobj->refcount = 0;
2342 outv[outc++] = lenobj;
2343 for (j = 0; j < argc; j++) {
2344 lenobj = createObject(REDIS_STRING,
2345 sdscatprintf(sdsempty(),"$%lu\r\n",
2346 (unsigned long) stringObjectLen(argv[j])));
2347 lenobj->refcount = 0;
2348 outv[outc++] = lenobj;
2349 outv[outc++] = argv[j];
2350 outv[outc++] = shared.crlf;
2351 }
2352
2353 /* Increment all the refcounts at start and decrement at end in order to
2354 * be sure to free objects if there is no slave in a replication state
2355 * able to be feed with commands */
2356 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2357 listRewind(slaves,&li);
2358 while((ln = listNext(&li))) {
2359 redisClient *slave = ln->value;
2360
2361 /* Don't feed slaves that are still waiting for BGSAVE to start */
2362 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2363
2364 /* Feed all the other slaves, MONITORs and so on */
2365 if (slave->slaveseldb != dictid) {
2366 robj *selectcmd;
2367
2368 switch(dictid) {
2369 case 0: selectcmd = shared.select0; break;
2370 case 1: selectcmd = shared.select1; break;
2371 case 2: selectcmd = shared.select2; break;
2372 case 3: selectcmd = shared.select3; break;
2373 case 4: selectcmd = shared.select4; break;
2374 case 5: selectcmd = shared.select5; break;
2375 case 6: selectcmd = shared.select6; break;
2376 case 7: selectcmd = shared.select7; break;
2377 case 8: selectcmd = shared.select8; break;
2378 case 9: selectcmd = shared.select9; break;
2379 default:
2380 selectcmd = createObject(REDIS_STRING,
2381 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2382 selectcmd->refcount = 0;
2383 break;
2384 }
2385 addReply(slave,selectcmd);
2386 slave->slaveseldb = dictid;
2387 }
2388 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2389 }
2390 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2391 if (outv != static_outv) zfree(outv);
2392 }
2393
2394 static void processInputBuffer(redisClient *c) {
2395 again:
2396 /* Before to process the input buffer, make sure the client is not
2397 * waitig for a blocking operation such as BLPOP. Note that the first
2398 * iteration the client is never blocked, otherwise the processInputBuffer
2399 * would not be called at all, but after the execution of the first commands
2400 * in the input buffer the client may be blocked, and the "goto again"
2401 * will try to reiterate. The following line will make it return asap. */
2402 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2403 if (c->bulklen == -1) {
2404 /* Read the first line of the query */
2405 char *p = strchr(c->querybuf,'\n');
2406 size_t querylen;
2407
2408 if (p) {
2409 sds query, *argv;
2410 int argc, j;
2411
2412 query = c->querybuf;
2413 c->querybuf = sdsempty();
2414 querylen = 1+(p-(query));
2415 if (sdslen(query) > querylen) {
2416 /* leave data after the first line of the query in the buffer */
2417 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2418 }
2419 *p = '\0'; /* remove "\n" */
2420 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2421 sdsupdatelen(query);
2422
2423 /* Now we can split the query in arguments */
2424 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2425 sdsfree(query);
2426
2427 if (c->argv) zfree(c->argv);
2428 c->argv = zmalloc(sizeof(robj*)*argc);
2429
2430 for (j = 0; j < argc; j++) {
2431 if (sdslen(argv[j])) {
2432 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2433 c->argc++;
2434 } else {
2435 sdsfree(argv[j]);
2436 }
2437 }
2438 zfree(argv);
2439 if (c->argc) {
2440 /* Execute the command. If the client is still valid
2441 * after processCommand() return and there is something
2442 * on the query buffer try to process the next command. */
2443 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2444 } else {
2445 /* Nothing to process, argc == 0. Just process the query
2446 * buffer if it's not empty or return to the caller */
2447 if (sdslen(c->querybuf)) goto again;
2448 }
2449 return;
2450 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2451 redisLog(REDIS_VERBOSE, "Client protocol error");
2452 freeClient(c);
2453 return;
2454 }
2455 } else {
2456 /* Bulk read handling. Note that if we are at this point
2457 the client already sent a command terminated with a newline,
2458 we are reading the bulk data that is actually the last
2459 argument of the command. */
2460 int qbl = sdslen(c->querybuf);
2461
2462 if (c->bulklen <= qbl) {
2463 /* Copy everything but the final CRLF as final argument */
2464 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2465 c->argc++;
2466 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2467 /* Process the command. If the client is still valid after
2468 * the processing and there is more data in the buffer
2469 * try to parse it. */
2470 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2471 return;
2472 }
2473 }
2474 }
2475
2476 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2477 redisClient *c = (redisClient*) privdata;
2478 char buf[REDIS_IOBUF_LEN];
2479 int nread;
2480 REDIS_NOTUSED(el);
2481 REDIS_NOTUSED(mask);
2482
2483 nread = read(fd, buf, REDIS_IOBUF_LEN);
2484 if (nread == -1) {
2485 if (errno == EAGAIN) {
2486 nread = 0;
2487 } else {
2488 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2489 freeClient(c);
2490 return;
2491 }
2492 } else if (nread == 0) {
2493 redisLog(REDIS_VERBOSE, "Client closed connection");
2494 freeClient(c);
2495 return;
2496 }
2497 if (nread) {
2498 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2499 c->lastinteraction = time(NULL);
2500 } else {
2501 return;
2502 }
2503 processInputBuffer(c);
2504 }
2505
2506 static int selectDb(redisClient *c, int id) {
2507 if (id < 0 || id >= server.dbnum)
2508 return REDIS_ERR;
2509 c->db = &server.db[id];
2510 return REDIS_OK;
2511 }
2512
2513 static void *dupClientReplyValue(void *o) {
2514 incrRefCount((robj*)o);
2515 return o;
2516 }
2517
/* List match method: returns 1 when compareStringObjects() reports the two
 * objects as equal (result 0), and 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2521
2522 static redisClient *createClient(int fd) {
2523 redisClient *c = zmalloc(sizeof(*c));
2524
2525 anetNonBlock(NULL,fd);
2526 anetTcpNoDelay(NULL,fd);
2527 if (!c) return NULL;
2528 selectDb(c,0);
2529 c->fd = fd;
2530 c->querybuf = sdsempty();
2531 c->argc = 0;
2532 c->argv = NULL;
2533 c->bulklen = -1;
2534 c->multibulk = 0;
2535 c->mbargc = 0;
2536 c->mbargv = NULL;
2537 c->sentlen = 0;
2538 c->flags = 0;
2539 c->lastinteraction = time(NULL);
2540 c->authenticated = 0;
2541 c->replstate = REDIS_REPL_NONE;
2542 c->reply = listCreate();
2543 listSetFreeMethod(c->reply,decrRefCount);
2544 listSetDupMethod(c->reply,dupClientReplyValue);
2545 c->blockingkeys = NULL;
2546 c->blockingkeysnum = 0;
2547 c->io_keys = listCreate();
2548 listSetFreeMethod(c->io_keys,decrRefCount);
2549 c->pubsub_channels = dictCreate(&setDictType,NULL);
2550 c->pubsub_patterns = listCreate();
2551 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2552 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2553 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2554 readQueryFromClient, c) == AE_ERR) {
2555 freeClient(c);
2556 return NULL;
2557 }
2558 listAddNodeTail(server.clients,c);
2559 initClientMultiState(c);
2560 return c;
2561 }
2562
2563 static void addReply(redisClient *c, robj *obj) {
2564 if (listLength(c->reply) == 0 &&
2565 (c->replstate == REDIS_REPL_NONE ||
2566 c->replstate == REDIS_REPL_ONLINE) &&
2567 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2568 sendReplyToClient, c) == AE_ERR) return;
2569
2570 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2571 obj = dupStringObject(obj);
2572 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2573 }
2574 listAddNodeTail(c->reply,getDecodedObject(obj));
2575 }
2576
2577 static void addReplySds(redisClient *c, sds s) {
2578 robj *o = createObject(REDIS_STRING,s);
2579 addReply(c,o);
2580 decrRefCount(o);
2581 }
2582
2583 static void addReplyDouble(redisClient *c, double d) {
2584 char buf[128];
2585
2586 snprintf(buf,sizeof(buf),"%.17g",d);
2587 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2588 (unsigned long) strlen(buf),buf));
2589 }
2590
2591 static void addReplyLong(redisClient *c, long l) {
2592 char buf[128];
2593 size_t len;
2594
2595 if (l == 0) {
2596 addReply(c,shared.czero);
2597 return;
2598 } else if (l == 1) {
2599 addReply(c,shared.cone);
2600 return;
2601 }
2602 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2603 addReplySds(c,sdsnewlen(buf,len));
2604 }
2605
2606 static void addReplyLongLong(redisClient *c, long long ll) {
2607 char buf[128];
2608 size_t len;
2609
2610 if (ll == 0) {
2611 addReply(c,shared.czero);
2612 return;
2613 } else if (ll == 1) {
2614 addReply(c,shared.cone);
2615 return;
2616 }
2617 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2618 addReplySds(c,sdsnewlen(buf,len));
2619 }
2620
2621 static void addReplyUlong(redisClient *c, unsigned long ul) {
2622 char buf[128];
2623 size_t len;
2624
2625 if (ul == 0) {
2626 addReply(c,shared.czero);
2627 return;
2628 } else if (ul == 1) {
2629 addReply(c,shared.cone);
2630 return;
2631 }
2632 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2633 addReplySds(c,sdsnewlen(buf,len));
2634 }
2635
2636 static void addReplyBulkLen(redisClient *c, robj *obj) {
2637 size_t len;
2638
2639 if (obj->encoding == REDIS_ENCODING_RAW) {
2640 len = sdslen(obj->ptr);
2641 } else {
2642 long n = (long)obj->ptr;
2643
2644 /* Compute how many bytes will take this integer as a radix 10 string */
2645 len = 1;
2646 if (n < 0) {
2647 len++;
2648 n = -n;
2649 }
2650 while((n = n/10) != 0) {
2651 len++;
2652 }
2653 }
2654 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2655 }
2656
2657 static void addReplyBulk(redisClient *c, robj *obj) {
2658 addReplyBulkLen(c,obj);
2659 addReply(c,obj);
2660 addReply(c,shared.crlf);
2661 }
2662
2663 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2664 static void addReplyBulkCString(redisClient *c, char *s) {
2665 if (s == NULL) {
2666 addReply(c,shared.nullbulk);
2667 } else {
2668 robj *o = createStringObject(s,strlen(s));
2669 addReplyBulk(c,o);
2670 decrRefCount(o);
2671 }
2672 }
2673
2674 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2675 int cport, cfd;
2676 char cip[128];
2677 redisClient *c;
2678 REDIS_NOTUSED(el);
2679 REDIS_NOTUSED(mask);
2680 REDIS_NOTUSED(privdata);
2681
2682 cfd = anetAccept(server.neterr, fd, cip, &cport);
2683 if (cfd == AE_ERR) {
2684 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2685 return;
2686 }
2687 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2688 if ((c = createClient(cfd)) == NULL) {
2689 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2690 close(cfd); /* May be already closed, just ingore errors */
2691 return;
2692 }
2693 /* If maxclient directive is set and this is one client more... close the
2694 * connection. Note that we create the client instead to check before
2695 * for this condition, since now the socket is already set in nonblocking
2696 * mode and we can send an error for free using the Kernel I/O */
2697 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2698 char *err = "-ERR max number of clients reached\r\n";
2699
2700 /* That's a best effort error message, don't check write errors */
2701 if (write(c->fd,err,strlen(err)) == -1) {
2702 /* Nothing to do, Just to avoid the warning... */
2703 }
2704 freeClient(c);
2705 return;
2706 }
2707 server.stat_numconnections++;
2708 }
2709
2710 /* ======================= Redis objects implementation ===================== */
2711
2712 static robj *createObject(int type, void *ptr) {
2713 robj *o;
2714
2715 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2716 if (listLength(server.objfreelist)) {
2717 listNode *head = listFirst(server.objfreelist);
2718 o = listNodeValue(head);
2719 listDelNode(server.objfreelist,head);
2720 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2721 } else {
2722 if (server.vm_enabled) {
2723 pthread_mutex_unlock(&server.obj_freelist_mutex);
2724 o = zmalloc(sizeof(*o));
2725 } else {
2726 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2727 }
2728 }
2729 o->type = type;
2730 o->encoding = REDIS_ENCODING_RAW;
2731 o->ptr = ptr;
2732 o->refcount = 1;
2733 if (server.vm_enabled) {
2734 /* Note that this code may run in the context of an I/O thread
2735 * and accessing to server.unixtime in theory is an error
2736 * (no locks). But in practice this is safe, and even if we read
2737 * garbage Redis will not fail, as it's just a statistical info */
2738 o->vm.atime = server.unixtime;
2739 o->storage = REDIS_VM_MEMORY;
2740 }
2741 return o;
2742 }
2743
2744 static robj *createStringObject(char *ptr, size_t len) {
2745 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2746 }
2747
2748 static robj *dupStringObject(robj *o) {
2749 assert(o->encoding == REDIS_ENCODING_RAW);
2750 return createStringObject(o->ptr,sdslen(o->ptr));
2751 }
2752
2753 static robj *createListObject(void) {
2754 list *l = listCreate();
2755
2756 listSetFreeMethod(l,decrRefCount);
2757 return createObject(REDIS_LIST,l);
2758 }
2759
2760 static robj *createSetObject(void) {
2761 dict *d = dictCreate(&setDictType,NULL);
2762 return createObject(REDIS_SET,d);
2763 }
2764
2765 static robj *createHashObject(void) {
2766 /* All the Hashes start as zipmaps. Will be automatically converted
2767 * into hash tables if there are enough elements or big elements
2768 * inside. */
2769 unsigned char *zm = zipmapNew();
2770 robj *o = createObject(REDIS_HASH,zm);
2771 o->encoding = REDIS_ENCODING_ZIPMAP;
2772 return o;
2773 }
2774
2775 static robj *createZsetObject(void) {
2776 zset *zs = zmalloc(sizeof(*zs));
2777
2778 zs->dict = dictCreate(&zsetDictType,NULL);
2779 zs->zsl = zslCreate();
2780 return createObject(REDIS_ZSET,zs);
2781 }
2782
2783 static void freeStringObject(robj *o) {
2784 if (o->encoding == REDIS_ENCODING_RAW) {
2785 sdsfree(o->ptr);
2786 }
2787 }
2788
2789 static void freeListObject(robj *o) {
2790 listRelease((list*) o->ptr);
2791 }
2792
2793 static void freeSetObject(robj *o) {
2794 dictRelease((dict*) o->ptr);
2795 }
2796
2797 static void freeZsetObject(robj *o) {
2798 zset *zs = o->ptr;
2799
2800 dictRelease(zs->dict);
2801 zslFree(zs->zsl);
2802 zfree(zs);
2803 }
2804
2805 static void freeHashObject(robj *o) {
2806 switch (o->encoding) {
2807 case REDIS_ENCODING_HT:
2808 dictRelease((dict*) o->ptr);
2809 break;
2810 case REDIS_ENCODING_ZIPMAP:
2811 zfree(o->ptr);
2812 break;
2813 default:
2814 redisAssert(0);
2815 break;
2816 }
2817 }
2818
2819 static void incrRefCount(robj *o) {
2820 o->refcount++;
2821 }
2822
2823 static void decrRefCount(void *obj) {
2824 robj *o = obj;
2825
2826 /* Object is a key of a swapped out value, or in the process of being
2827 * loaded. */
2828 if (server.vm_enabled &&
2829 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2830 {
2831 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2832 redisAssert(o->type == REDIS_STRING);
2833 freeStringObject(o);
2834 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2835 pthread_mutex_lock(&server.obj_freelist_mutex);
2836 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2837 !listAddNodeHead(server.objfreelist,o))
2838 zfree(o);
2839 pthread_mutex_unlock(&server.obj_freelist_mutex);
2840 server.vm_stats_swapped_objects--;
2841 return;
2842 }
2843 /* Object is in memory, or in the process of being swapped out. */
2844 if (--(o->refcount) == 0) {
2845 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2846 vmCancelThreadedIOJob(obj);
2847 switch(o->type) {
2848 case REDIS_STRING: freeStringObject(o); break;
2849 case REDIS_LIST: freeListObject(o); break;
2850 case REDIS_SET: freeSetObject(o); break;
2851 case REDIS_ZSET: freeZsetObject(o); break;
2852 case REDIS_HASH: freeHashObject(o); break;
2853 default: redisAssert(0); break;
2854 }
2855 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2856 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2857 !listAddNodeHead(server.objfreelist,o))
2858 zfree(o);
2859 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2860 }
2861 }
2862
2863 static robj *lookupKey(redisDb *db, robj *key) {
2864 dictEntry *de = dictFind(db->dict,key);
2865 if (de) {
2866 robj *key = dictGetEntryKey(de);
2867 robj *val = dictGetEntryVal(de);
2868
2869 if (server.vm_enabled) {
2870 if (key->storage == REDIS_VM_MEMORY ||
2871 key->storage == REDIS_VM_SWAPPING)
2872 {
2873 /* If we were swapping the object out, stop it, this key
2874 * was requested. */
2875 if (key->storage == REDIS_VM_SWAPPING)
2876 vmCancelThreadedIOJob(key);
2877 /* Update the access time of the key for the aging algorithm. */
2878 key->vm.atime = server.unixtime;
2879 } else {
2880 int notify = (key->storage == REDIS_VM_LOADING);
2881
2882 /* Our value was swapped on disk. Bring it at home. */
2883 redisAssert(val == NULL);
2884 val = vmLoadObject(key);
2885 dictGetEntryVal(de) = val;
2886
2887 /* Clients blocked by the VM subsystem may be waiting for
2888 * this key... */
2889 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2890 }
2891 }
2892 return val;
2893 } else {
2894 return NULL;
2895 }
2896 }
2897
2898 static robj *lookupKeyRead(redisDb *db, robj *key) {
2899 expireIfNeeded(db,key);
2900 return lookupKey(db,key);
2901 }
2902
2903 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2904 deleteIfVolatile(db,key);
2905 return lookupKey(db,key);
2906 }
2907
2908 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2909 robj *o = lookupKeyRead(c->db, key);
2910 if (!o) addReply(c,reply);
2911 return o;
2912 }
2913
2914 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2915 robj *o = lookupKeyWrite(c->db, key);
2916 if (!o) addReply(c,reply);
2917 return o;
2918 }
2919
2920 static int checkType(redisClient *c, robj *o, int type) {
2921 if (o->type != type) {
2922 addReply(c,shared.wrongtypeerr);
2923 return 1;
2924 }
2925 return 0;
2926 }
2927
2928 static int deleteKey(redisDb *db, robj *key) {
2929 int retval;
2930
2931 /* We need to protect key from destruction: after the first dictDelete()
2932 * it may happen that 'key' is no longer valid if we don't increment
2933 * it's count. This may happen when we get the object reference directly
2934 * from the hash table with dictRandomKey() or dict iterators */
2935 incrRefCount(key);
2936 if (dictSize(db->expires)) dictDelete(db->expires,key);
2937 retval = dictDelete(db->dict,key);
2938 decrRefCount(key);
2939
2940 return retval == DICT_OK;
2941 }
2942
2943 /* Check if the nul-terminated string 's' can be represented by a long
2944 * (that is, is a number that fits into long without any other space or
2945 * character before or after the digits).
2946 *
2947 * If so, the function returns REDIS_OK and *longval is set to the value
2948 * of the number. Otherwise REDIS_ERR is returned */
2949 static int isStringRepresentableAsLong(sds s, long *longval) {
2950 char buf[32], *endptr;
2951 long value;
2952 int slen;
2953
2954 value = strtol(s, &endptr, 10);
2955 if (endptr[0] != '\0') return REDIS_ERR;
2956 slen = snprintf(buf,32,"%ld",value);
2957
2958 /* If the number converted back into a string is not identical
2959 * then it's not possible to encode the string as integer */
2960 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2961 if (longval) *longval = value;
2962 return REDIS_OK;
2963 }
2964
2965 /* Try to encode a string object in order to save space */
2966 static int tryObjectEncoding(robj *o) {
2967 long value;
2968 sds s = o->ptr;
2969
2970 if (o->encoding != REDIS_ENCODING_RAW)
2971 return REDIS_ERR; /* Already encoded */
2972
2973 /* It's not save to encode shared objects: shared objects can be shared
2974 * everywhere in the "object space" of Redis. Encoded objects can only
2975 * appear as "values" (and not, for instance, as keys) */
2976 if (o->refcount > 1) return REDIS_ERR;
2977
2978 /* Currently we try to encode only strings */
2979 redisAssert(o->type == REDIS_STRING);
2980
2981 /* Check if we can represent this string as a long integer */
2982 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2983
2984 /* Ok, this object can be encoded */
2985 o->encoding = REDIS_ENCODING_INT;
2986 sdsfree(o->ptr);
2987 o->ptr = (void*) value;
2988 return REDIS_OK;
2989 }
2990
2991 /* Get a decoded version of an encoded object (returned as a new object).
2992 * If the object is already raw-encoded just increment the ref count. */
2993 static robj *getDecodedObject(robj *o) {
2994 robj *dec;
2995
2996 if (o->encoding == REDIS_ENCODING_RAW) {
2997 incrRefCount(o);
2998 return o;
2999 }
3000 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3001 char buf[32];
3002
3003 snprintf(buf,32,"%ld",(long)o->ptr);
3004 dec = createStringObject(buf,strlen(buf));
3005 return dec;
3006 } else {
3007 redisAssert(1 != 1);
3008 }
3009 }
3010
3011 /* Compare two string objects via strcmp() or alike.
3012 * Note that the objects may be integer-encoded. In such a case we
3013 * use snprintf() to get a string representation of the numbers on the stack
3014 * and compare the strings, it's much faster than calling getDecodedObject().
3015 *
3016 * Important note: if objects are not integer encoded, but binary-safe strings,
3017 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3018 * binary safe. */
3019 static int compareStringObjects(robj *a, robj *b) {
3020 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3021 char bufa[128], bufb[128], *astr, *bstr;
3022 int bothsds = 1;
3023
3024 if (a == b) return 0;
3025 if (a->encoding != REDIS_ENCODING_RAW) {
3026 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3027 astr = bufa;
3028 bothsds = 0;
3029 } else {
3030 astr = a->ptr;
3031 }
3032 if (b->encoding != REDIS_ENCODING_RAW) {
3033 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3034 bstr = bufb;
3035 bothsds = 0;
3036 } else {
3037 bstr = b->ptr;
3038 }
3039 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3040 }
3041
3042 static size_t stringObjectLen(robj *o) {
3043 redisAssert(o->type == REDIS_STRING);
3044 if (o->encoding == REDIS_ENCODING_RAW) {
3045 return sdslen(o->ptr);
3046 } else {
3047 char buf[32];
3048
3049 return snprintf(buf,32,"%ld",(long)o->ptr);
3050 }
3051 }
3052
3053 /*============================ RDB saving/loading =========================== */
3054
/* Write the one byte 'type' code to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return fwrite(&type,1,1,fp) == 0 ? -1 : 0;
}
3059
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return fwrite(&t32,sizeof(t32),1,fp) == 0 ? -1 : 0;
}
3065
3066 /* check rdbLoadLen() comments for more info */
3067 static int rdbSaveLen(FILE *fp, uint32_t len) {
3068 unsigned char buf[2];
3069
3070 if (len < (1<<6)) {
3071 /* Save a 6 bit len */
3072 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3073 if (fwrite(buf,1,1,fp) == 0) return -1;
3074 } else if (len < (1<<14)) {
3075 /* Save a 14 bit len */
3076 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3077 buf[1] = len&0xFF;
3078 if (fwrite(buf,2,1,fp) == 0) return -1;
3079 } else {
3080 /* Save a 32 bit len */
3081 buf[0] = (REDIS_RDB_32BITLEN<<6);
3082 if (fwrite(buf,1,1,fp) == 0) return -1;
3083 len = htonl(len);
3084 if (fwrite(&len,4,1,fp) == 0) return -1;
3085 }
3086 return 0;
3087 }
3088
3089 /* String objects in the form "2391" "-100" without any space and with a
3090 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3091 * encoded as integers to save space */
3092 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3093 long long value;
3094 char *endptr, buf[32];
3095
3096 /* Check if it's possible to encode this value as a number */
3097 value = strtoll(s, &endptr, 10);
3098 if (endptr[0] != '\0') return 0;
3099 snprintf(buf,32,"%lld",value);
3100
3101 /* If the number converted back into a string is not identical
3102 * then it's not possible to encode the string as integer */
3103 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3104
3105 /* Finally check if it fits in our ranges */
3106 if (value >= -(1<<7) && value <= (1<<7)-1) {
3107 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3108 enc[1] = value&0xFF;
3109 return 2;
3110 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3111 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3112 enc[1] = value&0xFF;
3113 enc[2] = (value>>8)&0xFF;
3114 return 3;
3115 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3116 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3117 enc[1] = value&0xFF;
3118 enc[2] = (value>>8)&0xFF;
3119 enc[3] = (value>>16)&0xFF;
3120 enc[4] = (value>>24)&0xFF;
3121 return 5;
3122 } else {
3123 return 0;
3124 }
3125 }
3126
3127 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3128 size_t comprlen, outlen;
3129 unsigned char byte;
3130 void *out;
3131
3132 /* We require at least four bytes compression for this to be worth it */
3133 if (len <= 4) return 0;
3134 outlen = len-4;
3135 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3136 comprlen = lzf_compress(s, len, out, outlen);
3137 if (comprlen == 0) {
3138 zfree(out);
3139 return 0;
3140 }
3141 /* Data compressed! Let's save it on disk */
3142 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3143 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3144 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3145 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3146 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3147 zfree(out);
3148 return comprlen;
3149
3150 writeerr:
3151 zfree(out);
3152 return -1;
3153 }
3154
3155 /* Save a string objet as [len][data] on disk. If the object is a string
3156 * representation of an integer value we try to safe it in a special form */
3157 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3158 int enclen;
3159
3160 /* Try integer encoding */
3161 if (len <= 11) {
3162 unsigned char buf[5];
3163 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3164 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3165 return 0;
3166 }
3167 }
3168
3169 /* Try LZF compression - under 20 bytes it's unable to compress even
3170 * aaaaaaaaaaaaaaaaaa so skip it */
3171 if (server.rdbcompression && len > 20) {
3172 int retval;
3173
3174 retval = rdbSaveLzfStringObject(fp,s,len);
3175 if (retval == -1) return -1;
3176 if (retval > 0) return 0;
3177 /* retval == 0 means data can't be compressed, save the old way */
3178 }
3179
3180 /* Store verbatim */
3181 if (rdbSaveLen(fp,len) == -1) return -1;
3182 if (len && fwrite(s,len,1,fp) == 0) return -1;
3183 return 0;
3184 }
3185
3186 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3187 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3188 int retval;
3189
3190 /* Avoid incr/decr ref count business when possible.
3191 * This plays well with copy-on-write given that we are probably
3192 * in a child process (BGSAVE). Also this makes sure key objects
3193 * of swapped objects are not incRefCount-ed (an assert does not allow
3194 * this in order to avoid bugs) */
3195 if (obj->encoding != REDIS_ENCODING_RAW) {
3196 obj = getDecodedObject(obj);
3197 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3198 decrRefCount(obj);
3199 } else {
3200 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3201 }
3202 return retval;
3203 }
3204
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values in order to
 * specify the following conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    char *repr = (char*)buf+1; /* the string lives right after the length */
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len += buf[0];
    }
    return fwrite(buf,len,1,fp) == 0 ? -1 : 0;
}
3231
3232 /* Save a Redis object. */
3233 static int rdbSaveObject(FILE *fp, robj *o) {
3234 if (o->type == REDIS_STRING) {
3235 /* Save a string value */
3236 if (rdbSaveStringObject(fp,o) == -1) return -1;
3237 } else if (o->type == REDIS_LIST) {
3238 /* Save a list value */
3239 list *list = o->ptr;
3240 listIter li;
3241 listNode *ln;
3242
3243 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3244 listRewind(list,&li);
3245 while((ln = listNext(&li))) {
3246 robj *eleobj = listNodeValue(ln);
3247
3248 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3249 }
3250 } else if (o->type == REDIS_SET) {
3251 /* Save a set value */
3252 dict *set = o->ptr;
3253 dictIterator *di = dictGetIterator(set);
3254 dictEntry *de;
3255
3256 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3257 while((de = dictNext(di)) != NULL) {
3258 robj *eleobj = dictGetEntryKey(de);
3259
3260 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3261 }
3262 dictReleaseIterator(di);
3263 } else if (o->type == REDIS_ZSET) {
3264 /* Save a set value */
3265 zset *zs = o->ptr;
3266 dictIterator *di = dictGetIterator(zs->dict);
3267 dictEntry *de;
3268
3269 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3270 while((de = dictNext(di)) != NULL) {
3271 robj *eleobj = dictGetEntryKey(de);
3272 double *score = dictGetEntryVal(de);
3273
3274 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3275 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3276 }
3277 dictReleaseIterator(di);
3278 } else if (o->type == REDIS_HASH) {
3279 /* Save a hash value */
3280 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3281 unsigned char *p = zipmapRewind(o->ptr);
3282 unsigned int count = zipmapLen(o->ptr);
3283 unsigned char *key, *val;
3284 unsigned int klen, vlen;
3285
3286 if (rdbSaveLen(fp,count) == -1) return -1;
3287 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3288 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3289 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3290 }
3291 } else {
3292 dictIterator *di = dictGetIterator(o->ptr);
3293 dictEntry *de;
3294
3295 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3296 while((de = dictNext(di)) != NULL) {
3297 robj *key = dictGetEntryKey(de);
3298 robj *val = dictGetEntryVal(de);
3299
3300 if (rdbSaveStringObject(fp,key) == -1) return -1;
3301 if (rdbSaveStringObject(fp,val) == -1) return -1;
3302 }
3303 dictReleaseIterator(di);
3304 }
3305 } else {
3306 redisAssert(0);
3307 }
3308 return 0;
3309 }
3310
3311 /* Return the length the object will have on disk if saved with
3312 * the rdbSaveObject() function. Currently we use a trick to get
3313 * this length with very little changes to the code. In the future
3314 * we could switch to a faster solution. */
3315 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3316 if (fp == NULL) fp = server.devnull;
3317 rewind(fp);
3318 assert(rdbSaveObject(fp,o) != 1);
3319 return ftello(fp);
3320 }
3321
3322 /* Return the number of pages required to save this object in the swap file */
3323 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3324 off_t bytes = rdbSavedObjectLen(o,fp);
3325
3326 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3327 }
3328
3329 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3330 static int rdbSave(char *filename) {
3331 dictIterator *di = NULL;
3332 dictEntry *de;
3333 FILE *fp;
3334 char tmpfile[256];
3335 int j;
3336 time_t now = time(NULL);
3337
3338 /* Wait for I/O therads to terminate, just in case this is a
3339 * foreground-saving, to avoid seeking the swap file descriptor at the
3340 * same time. */
3341 if (server.vm_enabled)
3342 waitEmptyIOJobsQueue();
3343
3344 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3345 fp = fopen(tmpfile,"w");
3346 if (!fp) {
3347 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3348 return REDIS_ERR;
3349 }
3350 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3351 for (j = 0; j < server.dbnum; j++) {
3352 redisDb *db = server.db+j;
3353 dict *d = db->dict;
3354 if (dictSize(d) == 0) continue;
3355 di = dictGetIterator(d);
3356 if (!di) {
3357 fclose(fp);
3358 return REDIS_ERR;
3359 }
3360
3361 /* Write the SELECT DB opcode */
3362 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3363 if (rdbSaveLen(fp,j) == -1) goto werr;
3364
3365 /* Iterate this DB writing every entry */
3366 while((de = dictNext(di)) != NULL) {
3367 robj *key = dictGetEntryKey(de);
3368 robj *o = dictGetEntryVal(de);
3369 time_t expiretime = getExpire(db,key);
3370
3371 /* Save the expire time */
3372 if (expiretime != -1) {
3373 /* If this key is already expired skip it */
3374 if (expiretime < now) continue;
3375 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3376 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3377 }
3378 /* Save the key and associated value. This requires special
3379 * handling if the value is swapped out. */
3380 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3381 key->storage == REDIS_VM_SWAPPING) {
3382 /* Save type, key, value */
3383 if (rdbSaveType(fp,o->type) == -1) goto werr;
3384 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3385 if (rdbSaveObject(fp,o) == -1) goto werr;
3386 } else {
3387 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3388 robj *po;
3389 /* Get a preview of the object in memory */
3390 po = vmPreviewObject(key);
3391 /* Save type, key, value */
3392 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3393 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3394 if (rdbSaveObject(fp,po) == -1) goto werr;
3395 /* Remove the loaded object from memory */
3396 decrRefCount(po);
3397 }
3398 }
3399 dictReleaseIterator(di);
3400 }
3401 /* EOF opcode */
3402 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3403
3404 /* Make sure data will not remain on the OS's output buffers */
3405 fflush(fp);
3406 fsync(fileno(fp));
3407 fclose(fp);
3408
3409 /* Use RENAME to make sure the DB file is changed atomically only
3410 * if the generate DB file is ok. */
3411 if (rename(tmpfile,filename) == -1) {
3412 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3413 unlink(tmpfile);
3414 return REDIS_ERR;
3415 }
3416 redisLog(REDIS_NOTICE,"DB saved on disk");
3417 server.dirty = 0;
3418 server.lastsave = time(NULL);
3419 return REDIS_OK;
3420
3421 werr:
3422 fclose(fp);
3423 unlink(tmpfile);
3424 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3425 if (di) dictReleaseIterator(di);
3426 return REDIS_ERR;
3427 }
3428
3429 static int rdbSaveBackground(char *filename) {
3430 pid_t childpid;
3431
3432 if (server.bgsavechildpid != -1) return REDIS_ERR;
3433 if (server.vm_enabled) waitEmptyIOJobsQueue();
3434 if ((childpid = fork()) == 0) {
3435 /* Child */
3436 if (server.vm_enabled) vmReopenSwapFile();
3437 close(server.fd);
3438 if (rdbSave(filename) == REDIS_OK) {
3439 _exit(0);
3440 } else {
3441 _exit(1);
3442 }
3443 } else {
3444 /* Parent */
3445 if (childpid == -1) {
3446 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3447 strerror(errno));
3448 return REDIS_ERR;
3449 }
3450 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3451 server.bgsavechildpid = childpid;
3452 updateDictResizePolicy();
3453 return REDIS_OK;
3454 }
3455 return REDIS_OK; /* unreached */
3456 }
3457
/* Remove the temporary dump file ("temp-<pid>.rdb") of the saving child
 * with PID 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3464
/* Read a one byte type code from 'fp'.
 * Returns the code (0-255), or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return fread(&type,1,1,fp) == 0 ? -1 : type;
}
3470
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the byte
 * order of the host. Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,sizeof(t32),1,fp) != 0) return (time_t) t32;
    return -1;
}
3476
3477 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3478 * of this file for a description of how this are stored on disk.
3479 *
3480 * isencoded is set to 1 if the readed length is not actually a length but
3481 * an "encoding type", check the above comments for more info */
3482 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3483 unsigned char buf[2];
3484 uint32_t len;
3485 int type;
3486
3487 if (isencoded) *isencoded = 0;
3488 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3489 type = (buf[0]&0xC0)>>6;
3490 if (type == REDIS_RDB_6BITLEN) {
3491 /* Read a 6 bit len */
3492 return buf[0]&0x3F;
3493 } else if (type == REDIS_RDB_ENCVAL) {
3494 /* Read a 6 bit len encoding type */
3495 if (isencoded) *isencoded = 1;
3496 return buf[0]&0x3F;
3497 } else if (type == REDIS_RDB_14BITLEN) {
3498 /* Read a 14 bit len */
3499 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3500 return ((buf[0]&0x3F)<<8)|buf[1];
3501 } else {
3502 /* Read a 32 bit len */
3503 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3504 return ntohl(len);
3505 }
3506 }
3507
3508 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3509 unsigned char enc[4];
3510 long long val;
3511
3512 if (enctype == REDIS_RDB_ENC_INT8) {
3513 if (fread(enc,1,1,fp) == 0) return NULL;
3514 val = (signed char)enc[0];
3515 } else if (enctype == REDIS_RDB_ENC_INT16) {
3516 uint16_t v;
3517 if (fread(enc,2,1,fp) == 0) return NULL;
3518 v = enc[0]|(enc[1]<<8);
3519 val = (int16_t)v;
3520 } else if (enctype == REDIS_RDB_ENC_INT32) {
3521 uint32_t v;
3522 if (fread(enc,4,1,fp) == 0) return NULL;
3523 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3524 val = (int32_t)v;
3525 } else {
3526 val = 0; /* anti-warning */
3527 redisAssert(0);
3528 }
3529 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3530 }
3531
3532 static robj *rdbLoadLzfStringObject(FILE*fp) {
3533 unsigned int len, clen;
3534 unsigned char *c = NULL;
3535 sds val = NULL;
3536
3537 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3538 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3539 if ((c = zmalloc(clen)) == NULL) goto err;
3540 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3541 if (fread(c,clen,1,fp) == 0) goto err;
3542 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3543 zfree(c);
3544 return createObject(REDIS_STRING,val);
3545 err:
3546 zfree(c);
3547 sdsfree(val);
3548 return NULL;
3549 }
3550
3551 static robj *rdbLoadStringObject(FILE*fp) {
3552 int isencoded;
3553 uint32_t len;
3554 sds val;
3555
3556 len = rdbLoadLen(fp,&isencoded);
3557 if (isencoded) {
3558 switch(len) {
3559 case REDIS_RDB_ENC_INT8:
3560 case REDIS_RDB_ENC_INT16:
3561 case REDIS_RDB_ENC_INT32:
3562 return rdbLoadIntegerObject(fp,len);
3563 case REDIS_RDB_ENC_LZF:
3564 return rdbLoadLzfStringObject(fp);
3565 default:
3566 redisAssert(0);
3567 }
3568 }
3569
3570 if (len == REDIS_RDB_LENERR) return NULL;
3571 val = sdsnewlen(NULL,len);
3572 if (len && fread(val,len,1,fp) == 0) {
3573 sdsfree(val);
3574 return NULL;
3575 }
3576 return createObject(REDIS_STRING,val);
3577 }
3578
3579 /* For information about double serialization check rdbSaveDoubleValue() */
3580 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3581 char buf[128];
3582 unsigned char len;
3583
3584 if (fread(&len,1,1,fp) == 0) return -1;
3585 switch(len) {
3586 case 255: *val = R_NegInf; return 0;
3587 case 254: *val = R_PosInf; return 0;
3588 case 253: *val = R_Nan; return 0;
3589 default:
3590 if (fread(buf,len,1,fp) == 0) return -1;
3591 buf[len] = '\0';
3592 sscanf(buf, "%lg", val);
3593 return 0;
3594 }
3595 }
3596
3597 /* Load a Redis object of the specified type from the specified file.
3598 * On success a newly allocated object is returned, otherwise NULL. */
3599 static robj *rdbLoadObject(int type, FILE *fp) {
3600 robj *o;
3601
3602 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3603 if (type == REDIS_STRING) {
3604 /* Read string value */
3605 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3606 tryObjectEncoding(o);
3607 } else if (type == REDIS_LIST || type == REDIS_SET) {
3608 /* Read list/set value */
3609 uint32_t listlen;
3610
3611 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3612 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3613 /* It's faster to expand the dict to the right size asap in order
3614 * to avoid rehashing */
3615 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3616 dictExpand(o->ptr,listlen);
3617 /* Load every single element of the list/set */
3618 while(listlen--) {
3619 robj *ele;
3620
3621 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3622 tryObjectEncoding(ele);
3623 if (type == REDIS_LIST) {
3624 listAddNodeTail((list*)o->ptr,ele);
3625 } else {
3626 dictAdd((dict*)o->ptr,ele,NULL);
3627 }
3628 }
3629 } else if (type == REDIS_ZSET) {
3630 /* Read list/set value */
3631 size_t zsetlen;
3632 zset *zs;
3633
3634 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3635 o = createZsetObject();
3636 zs = o->ptr;
3637 /* Load every single element of the list/set */
3638 while(zsetlen--) {
3639 robj *ele;
3640 double *score = zmalloc(sizeof(double));
3641
3642 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3643 tryObjectEncoding(ele);
3644 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3645 dictAdd(zs->dict,ele,score);
3646 zslInsert(zs->zsl,*score,ele);
3647 incrRefCount(ele); /* added to skiplist */
3648 }
3649 } else if (type == REDIS_HASH) {
3650 size_t hashlen;
3651
3652 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3653 o = createHashObject();
3654 /* Too many entries? Use an hash table. */
3655 if (hashlen > server.hash_max_zipmap_entries)
3656 convertToRealHash(o);
3657 /* Load every key/value, then set it into the zipmap or hash
3658 * table, as needed. */
3659 while(hashlen--) {
3660 robj *key, *val;
3661
3662 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3663 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3664 /* If we are using a zipmap and there are too big values
3665 * the object is converted to real hash table encoding. */
3666 if (o->encoding != REDIS_ENCODING_HT &&
3667 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3668 sdslen(val->ptr) > server.hash_max_zipmap_value))
3669 {
3670 convertToRealHash(o);
3671 }
3672
3673 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3674 unsigned char *zm = o->ptr;
3675
3676 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3677 val->ptr,sdslen(val->ptr),NULL);
3678 o->ptr = zm;
3679 decrRefCount(key);
3680 decrRefCount(val);
3681 } else {
3682 tryObjectEncoding(key);
3683 tryObjectEncoding(val);
3684 dictAdd((dict*)o->ptr,key,val);
3685 }
3686 }
3687 } else {
3688 redisAssert(0);
3689 }
3690 return o;
3691 }
3692
3693 static int rdbLoad(char *filename) {
3694 FILE *fp;
3695 robj *keyobj = NULL;
3696 uint32_t dbid;
3697 int type, retval, rdbver;
3698 dict *d = server.db[0].dict;
3699 redisDb *db = server.db+0;
3700 char buf[1024];
3701 time_t expiretime = -1, now = time(NULL);
3702 long long loadedkeys = 0;
3703
3704 fp = fopen(filename,"r");
3705 if (!fp) return REDIS_ERR;
3706 if (fread(buf,9,1,fp) == 0) goto eoferr;
3707 buf[9] = '\0';
3708 if (memcmp(buf,"REDIS",5) != 0) {
3709 fclose(fp);
3710 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3711 return REDIS_ERR;
3712 }
3713 rdbver = atoi(buf+5);
3714 if (rdbver != 1) {
3715 fclose(fp);
3716 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3717 return REDIS_ERR;
3718 }
3719 while(1) {
3720 robj *o;
3721
3722 /* Read type. */
3723 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3724 if (type == REDIS_EXPIRETIME) {
3725 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3726 /* We read the time so we need to read the object type again */
3727 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3728 }
3729 if (type == REDIS_EOF) break;
3730 /* Handle SELECT DB opcode as a special case */
3731 if (type == REDIS_SELECTDB) {
3732 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3733 goto eoferr;
3734 if (dbid >= (unsigned)server.dbnum) {
3735 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3736 exit(1);
3737 }
3738 db = server.db+dbid;
3739 d = db->dict;
3740 continue;
3741 }
3742 /* Read key */
3743 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3744 /* Read value */
3745 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3746 /* Add the new object in the hash table */
3747 retval = dictAdd(d,keyobj,o);
3748 if (retval == DICT_ERR) {
3749 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3750 exit(1);
3751 }
3752 /* Set the expire time if needed */
3753 if (expiretime != -1) {
3754 setExpire(db,keyobj,expiretime);
3755 /* Delete this key if already expired */
3756 if (expiretime < now) deleteKey(db,keyobj);
3757 expiretime = -1;
3758 }
3759 keyobj = o = NULL;
3760 /* Handle swapping while loading big datasets when VM is on */
3761 loadedkeys++;
3762 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3763 while (zmalloc_used_memory() > server.vm_max_memory) {
3764 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3765 }
3766 }
3767 }
3768 fclose(fp);
3769 return REDIS_OK;
3770
3771 eoferr: /* unexpected end of file is handled here with a fatal exit */
3772 if (keyobj) decrRefCount(keyobj);
3773 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3774 exit(1);
3775 return REDIS_ERR; /* Just to avoid warning */
3776 }
3777
3778 /*================================== Commands =============================== */
3779
3780 static void authCommand(redisClient *c) {
3781 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3782 c->authenticated = 1;
3783 addReply(c,shared.ok);
3784 } else {
3785 c->authenticated = 0;
3786 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3787 }
3788 }
3789
3790 static void pingCommand(redisClient *c) {
3791 addReply(c,shared.pong);
3792 }
3793
3794 static void echoCommand(redisClient *c) {
3795 addReplyBulk(c,c->argv[1]);
3796 }
3797
3798 /*=================================== Strings =============================== */
3799
3800 static void setGenericCommand(redisClient *c, int nx) {
3801 int retval;
3802
3803 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3804 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3805 if (retval == DICT_ERR) {
3806 if (!nx) {
3807 /* If the key is about a swapped value, we want a new key object
3808 * to overwrite the old. So we delete the old key in the database.
3809 * This will also make sure that swap pages about the old object
3810 * will be marked as free. */
3811 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3812 incrRefCount(c->argv[1]);
3813 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3814 incrRefCount(c->argv[2]);
3815 } else {
3816 addReply(c,shared.czero);
3817 return;
3818 }
3819 } else {
3820 incrRefCount(c->argv[1]);
3821 incrRefCount(c->argv[2]);
3822 }
3823 server.dirty++;
3824 removeExpire(c->db,c->argv[1]);
3825 addReply(c, nx ? shared.cone : shared.ok);
3826 }
3827
3828 static void setCommand(redisClient *c) {
3829 setGenericCommand(c,0);
3830 }
3831
3832 static void setnxCommand(redisClient *c) {
3833 setGenericCommand(c,1);
3834 }
3835
3836 static int getGenericCommand(redisClient *c) {
3837 robj *o;
3838
3839 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3840 return REDIS_OK;
3841
3842 if (o->type != REDIS_STRING) {
3843 addReply(c,shared.wrongtypeerr);
3844 return REDIS_ERR;
3845 } else {
3846 addReplyBulk(c,o);
3847 return REDIS_OK;
3848 }
3849 }
3850
3851 static void getCommand(redisClient *c) {
3852 getGenericCommand(c);
3853 }
3854
3855 static void getsetCommand(redisClient *c) {
3856 if (getGenericCommand(c) == REDIS_ERR) return;
3857 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3858 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3859 } else {
3860 incrRefCount(c->argv[1]);
3861 }
3862 incrRefCount(c->argv[2]);
3863 server.dirty++;
3864 removeExpire(c->db,c->argv[1]);
3865 }
3866
3867 static void mgetCommand(redisClient *c) {
3868 int j;
3869
3870 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3871 for (j = 1; j < c->argc; j++) {
3872 robj *o = lookupKeyRead(c->db,c->argv[j]);
3873 if (o == NULL) {
3874 addReply(c,shared.nullbulk);
3875 } else {
3876 if (o->type != REDIS_STRING) {
3877 addReply(c,shared.nullbulk);
3878 } else {
3879 addReplyBulk(c,o);
3880 }
3881 }
3882 }
3883 }
3884
3885 static void msetGenericCommand(redisClient *c, int nx) {
3886 int j, busykeys = 0;
3887
3888 if ((c->argc % 2) == 0) {
3889 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3890 return;
3891 }
3892 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3893 * set nothing at all if at least one already key exists. */
3894 if (nx) {
3895 for (j = 1; j < c->argc; j += 2) {
3896 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3897 busykeys++;
3898 }
3899 }
3900 }
3901 if (busykeys) {
3902 addReply(c, shared.czero);
3903 return;
3904 }
3905
3906 for (j = 1; j < c->argc; j += 2) {
3907 int retval;
3908
3909 tryObjectEncoding(c->argv[j+1]);
3910 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3911 if (retval == DICT_ERR) {
3912 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3913 incrRefCount(c->argv[j+1]);
3914 } else {
3915 incrRefCount(c->argv[j]);
3916 incrRefCount(c->argv[j+1]);
3917 }
3918 removeExpire(c->db,c->argv[j]);
3919 }
3920 server.dirty += (c->argc-1)/2;
3921 addReply(c, nx ? shared.cone : shared.ok);
3922 }
3923
3924 static void msetCommand(redisClient *c) {
3925 msetGenericCommand(c,0);
3926 }
3927
3928 static void msetnxCommand(redisClient *c) {
3929 msetGenericCommand(c,1);
3930 }
3931
3932 static void incrDecrCommand(redisClient *c, long long incr) {
3933 long long value;
3934 int retval;
3935 robj *o;
3936
3937 o = lookupKeyWrite(c->db,c->argv[1]);
3938 if (o == NULL) {
3939 value = 0;
3940 } else {
3941 if (o->type != REDIS_STRING) {
3942 value = 0;
3943 } else {
3944 char *eptr;
3945
3946 if (o->encoding == REDIS_ENCODING_RAW)
3947 value = strtoll(o->ptr, &eptr, 10);
3948 else if (o->encoding == REDIS_ENCODING_INT)
3949 value = (long)o->ptr;
3950 else
3951 redisAssert(1 != 1);
3952 }
3953 }
3954
3955 value += incr;
3956 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3957 tryObjectEncoding(o);
3958 retval = dictAdd(c->db->dict,c->argv[1],o);
3959 if (retval == DICT_ERR) {
3960 dictReplace(c->db->dict,c->argv[1],o);
3961 removeExpire(c->db,c->argv[1]);
3962 } else {
3963 incrRefCount(c->argv[1]);
3964 }
3965 server.dirty++;
3966 addReply(c,shared.colon);
3967 addReply(c,o);
3968 addReply(c,shared.crlf);
3969 }
3970
3971 static void incrCommand(redisClient *c) {
3972 incrDecrCommand(c,1);
3973 }
3974
3975 static void decrCommand(redisClient *c) {
3976 incrDecrCommand(c,-1);
3977 }
3978
3979 static void incrbyCommand(redisClient *c) {
3980 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3981 incrDecrCommand(c,incr);
3982 }
3983
3984 static void decrbyCommand(redisClient *c) {
3985 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3986 incrDecrCommand(c,-incr);
3987 }
3988
3989 static void appendCommand(redisClient *c) {
3990 int retval;
3991 size_t totlen;
3992 robj *o;
3993
3994 o = lookupKeyWrite(c->db,c->argv[1]);
3995 if (o == NULL) {
3996 /* Create the key */
3997 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3998 incrRefCount(c->argv[1]);
3999 incrRefCount(c->argv[2]);
4000 totlen = stringObjectLen(c->argv[2]);
4001 } else {
4002 dictEntry *de;
4003
4004 de = dictFind(c->db->dict,c->argv[1]);
4005 assert(de != NULL);
4006
4007 o = dictGetEntryVal(de);
4008 if (o->type != REDIS_STRING) {
4009 addReply(c,shared.wrongtypeerr);
4010 return;
4011 }
4012 /* If the object is specially encoded or shared we have to make
4013 * a copy */
4014 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4015 robj *decoded = getDecodedObject(o);
4016
4017 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4018 decrRefCount(decoded);
4019 dictReplace(c->db->dict,c->argv[1],o);
4020 }
4021 /* APPEND! */
4022 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4023 o->ptr = sdscatlen(o->ptr,
4024 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4025 } else {
4026 o->ptr = sdscatprintf(o->ptr, "%ld",
4027 (unsigned long) c->argv[2]->ptr);
4028 }
4029 totlen = sdslen(o->ptr);
4030 }
4031 server.dirty++;
4032 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4033 }
4034
4035 static void substrCommand(redisClient *c) {
4036 robj *o;
4037 long start = atoi(c->argv[2]->ptr);
4038 long end = atoi(c->argv[3]->ptr);
4039 size_t rangelen, strlen;
4040 sds range;
4041
4042 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4043 checkType(c,o,REDIS_STRING)) return;
4044
4045 o = getDecodedObject(o);
4046 strlen = sdslen(o->ptr);
4047
4048 /* convert negative indexes */
4049 if (start < 0) start = strlen+start;
4050 if (end < 0) end = strlen+end;
4051 if (start < 0) start = 0;
4052 if (end < 0) end = 0;
4053
4054 /* indexes sanity checks */
4055 if (start > end || (size_t)start >= strlen) {
4056 /* Out of range start or start > end result in null reply */
4057 addReply(c,shared.nullbulk);
4058 decrRefCount(o);
4059 return;
4060 }
4061 if ((size_t)end >= strlen) end = strlen-1;
4062 rangelen = (end-start)+1;
4063
4064 /* Return the result */
4065 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4066 range = sdsnewlen((char*)o->ptr+start,rangelen);
4067 addReplySds(c,range);
4068 addReply(c,shared.crlf);
4069 decrRefCount(o);
4070 }
4071
4072 /* ========================= Type agnostic commands ========================= */
4073
4074 static void delCommand(redisClient *c) {
4075 int deleted = 0, j;
4076
4077 for (j = 1; j < c->argc; j++) {
4078 if (deleteKey(c->db,c->argv[j])) {
4079 server.dirty++;
4080 deleted++;
4081 }
4082 }
4083 addReplyLong(c,deleted);
4084 }
4085
4086 static void existsCommand(redisClient *c) {
4087 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4088 }
4089
4090 static void selectCommand(redisClient *c) {
4091 int id = atoi(c->argv[1]->ptr);
4092
4093 if (selectDb(c,id) == REDIS_ERR) {
4094 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4095 } else {
4096 addReply(c,shared.ok);
4097 }
4098 }
4099
4100 static void randomkeyCommand(redisClient *c) {
4101 dictEntry *de;
4102
4103 while(1) {
4104 de = dictGetRandomKey(c->db->dict);
4105 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4106 }
4107 if (de == NULL) {
4108 addReply(c,shared.plus);
4109 addReply(c,shared.crlf);
4110 } else {
4111 addReply(c,shared.plus);
4112 addReply(c,dictGetEntryKey(de));
4113 addReply(c,shared.crlf);
4114 }
4115 }
4116
4117 static void keysCommand(redisClient *c) {
4118 dictIterator *di;
4119 dictEntry *de;
4120 sds pattern = c->argv[1]->ptr;
4121 int plen = sdslen(pattern);
4122 unsigned long numkeys = 0;
4123 robj *lenobj = createObject(REDIS_STRING,NULL);
4124
4125 di = dictGetIterator(c->db->dict);
4126 addReply(c,lenobj);
4127 decrRefCount(lenobj);
4128 while((de = dictNext(di)) != NULL) {
4129 robj *keyobj = dictGetEntryKey(de);
4130
4131 sds key = keyobj->ptr;
4132 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4133 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4134 if (expireIfNeeded(c->db,keyobj) == 0) {
4135 addReplyBulk(c,keyobj);
4136 numkeys++;
4137 }
4138 }
4139 }
4140 dictReleaseIterator(di);
4141 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4142 }
4143
4144 static void dbsizeCommand(redisClient *c) {
4145 addReplySds(c,
4146 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4147 }
4148
4149 static void lastsaveCommand(redisClient *c) {
4150 addReplySds(c,
4151 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4152 }
4153
4154 static void typeCommand(redisClient *c) {
4155 robj *o;
4156 char *type;
4157
4158 o = lookupKeyRead(c->db,c->argv[1]);
4159 if (o == NULL) {
4160 type = "+none";
4161 } else {
4162 switch(o->type) {
4163 case REDIS_STRING: type = "+string"; break;
4164 case REDIS_LIST: type = "+list"; break;
4165 case REDIS_SET: type = "+set"; break;
4166 case REDIS_ZSET: type = "+zset"; break;
4167 case REDIS_HASH: type = "+hash"; break;
4168 default: type = "+unknown"; break;
4169 }
4170 }
4171 addReplySds(c,sdsnew(type));
4172 addReply(c,shared.crlf);
4173 }
4174
4175 static void saveCommand(redisClient *c) {
4176 if (server.bgsavechildpid != -1) {
4177 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4178 return;
4179 }
4180 if (rdbSave(server.dbfilename) == REDIS_OK) {
4181 addReply(c,shared.ok);
4182 } else {
4183 addReply(c,shared.err);
4184 }
4185 }
4186
4187 static void bgsaveCommand(redisClient *c) {
4188 if (server.bgsavechildpid != -1) {
4189 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4190 return;
4191 }
4192 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4193 char *status = "+Background saving started\r\n";
4194 addReplySds(c,sdsnew(status));
4195 } else {
4196 addReply(c,shared.err);
4197 }
4198 }
4199
4200 static void shutdownCommand(redisClient *c) {
4201 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4202 /* Kill the saving child if there is a background saving in progress.
4203 We want to avoid race conditions, for instance our saving child may
4204 overwrite the synchronous saving did by SHUTDOWN. */
4205 if (server.bgsavechildpid != -1) {
4206 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4207 kill(server.bgsavechildpid,SIGKILL);
4208 rdbRemoveTempFile(server.bgsavechildpid);
4209 }
4210 if (server.appendonly) {
4211 /* Append only file: fsync() the AOF and exit */
4212 fsync(server.appendfd);
4213 if (server.vm_enabled) unlink(server.vm_swap_file);
4214 exit(0);
4215 } else {
4216 /* Snapshotting. Perform a SYNC SAVE and exit */
4217 if (rdbSave(server.dbfilename) == REDIS_OK) {
4218 if (server.daemonize)
4219 unlink(server.pidfile);
4220 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4221 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4222 if (server.vm_enabled) unlink(server.vm_swap_file);
4223 exit(0);
4224 } else {
4225 /* Ooops.. error saving! The best we can do is to continue
4226 * operating. Note that if there was a background saving process,
4227 * in the next cron() Redis will be notified that the background
4228 * saving aborted, handling special stuff like slaves pending for
4229 * synchronization... */
4230 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4231 addReplySds(c,
4232 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4233 }
4234 }
4235 }
4236
4237 static void renameGenericCommand(redisClient *c, int nx) {
4238 robj *o;
4239
4240 /* To use the same key as src and dst is probably an error */
4241 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4242 addReply(c,shared.sameobjecterr);
4243 return;
4244 }
4245
4246 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4247 return;
4248
4249 incrRefCount(o);
4250 deleteIfVolatile(c->db,c->argv[2]);
4251 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4252 if (nx) {
4253 decrRefCount(o);
4254 addReply(c,shared.czero);
4255 return;
4256 }
4257 dictReplace(c->db->dict,c->argv[2],o);
4258 } else {
4259 incrRefCount(c->argv[2]);
4260 }
4261 deleteKey(c->db,c->argv[1]);
4262 server.dirty++;
4263 addReply(c,nx ? shared.cone : shared.ok);
4264 }
4265
4266 static void renameCommand(redisClient *c) {
4267 renameGenericCommand(c,0);
4268 }
4269
4270 static void renamenxCommand(redisClient *c) {
4271 renameGenericCommand(c,1);
4272 }
4273
4274 static void moveCommand(redisClient *c) {
4275 robj *o;
4276 redisDb *src, *dst;
4277 int srcid;
4278
4279 /* Obtain source and target DB pointers */
4280 src = c->db;
4281 srcid = c->db->id;
4282 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4283 addReply(c,shared.outofrangeerr);
4284 return;
4285 }
4286 dst = c->db;
4287 selectDb(c,srcid); /* Back to the source DB */
4288
4289 /* If the user is moving using as target the same
4290 * DB as the source DB it is probably an error. */
4291 if (src == dst) {
4292 addReply(c,shared.sameobjecterr);
4293 return;
4294 }
4295
4296 /* Check if the element exists and get a reference */
4297 o = lookupKeyWrite(c->db,c->argv[1]);
4298 if (!o) {
4299 addReply(c,shared.czero);
4300 return;
4301 }
4302
4303 /* Try to add the element to the target DB */
4304 deleteIfVolatile(dst,c->argv[1]);
4305 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4306 addReply(c,shared.czero);
4307 return;
4308 }
4309 incrRefCount(c->argv[1]);
4310 incrRefCount(o);
4311
4312 /* OK! key moved, free the entry in the source DB */
4313 deleteKey(src,c->argv[1]);
4314 server.dirty++;
4315 addReply(c,shared.cone);
4316 }
4317
4318 /* =================================== Lists ================================ */
4319 static void pushGenericCommand(redisClient *c, int where) {
4320 robj *lobj;
4321 list *list;
4322
4323 lobj = lookupKeyWrite(c->db,c->argv[1]);
4324 if (lobj == NULL) {
4325 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4326 addReply(c,shared.cone);
4327 return;
4328 }
4329 lobj = createListObject();
4330 list = lobj->ptr;
4331 if (where == REDIS_HEAD) {
4332 listAddNodeHead(list,c->argv[2]);
4333 } else {
4334 listAddNodeTail(list,c->argv[2]);
4335 }
4336 dictAdd(c->db->dict,c->argv[1],lobj);
4337 incrRefCount(c->argv[1]);
4338 incrRefCount(c->argv[2]);
4339 } else {
4340 if (lobj->type != REDIS_LIST) {
4341 addReply(c,shared.wrongtypeerr);
4342 return;
4343 }
4344 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4345 addReply(c,shared.cone);
4346 return;
4347 }
4348 list = lobj->ptr;
4349 if (where == REDIS_HEAD) {
4350 listAddNodeHead(list,c->argv[2]);
4351 } else {
4352 listAddNodeTail(list,c->argv[2]);
4353 }
4354 incrRefCount(c->argv[2]);
4355 }
4356 server.dirty++;
4357 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4358 }
4359
4360 static void lpushCommand(redisClient *c) {
4361 pushGenericCommand(c,REDIS_HEAD);
4362 }
4363
4364 static void rpushCommand(redisClient *c) {
4365 pushGenericCommand(c,REDIS_TAIL);
4366 }
4367
4368 static void llenCommand(redisClient *c) {
4369 robj *o;
4370 list *l;
4371
4372 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4373 checkType(c,o,REDIS_LIST)) return;
4374
4375 l = o->ptr;
4376 addReplyUlong(c,listLength(l));
4377 }
4378
4379 static void lindexCommand(redisClient *c) {
4380 robj *o;
4381 int index = atoi(c->argv[2]->ptr);
4382 list *list;
4383 listNode *ln;
4384
4385 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4386 checkType(c,o,REDIS_LIST)) return;
4387 list = o->ptr;
4388
4389 ln = listIndex(list, index);
4390 if (ln == NULL) {
4391 addReply(c,shared.nullbulk);
4392 } else {
4393 robj *ele = listNodeValue(ln);
4394 addReplyBulk(c,ele);
4395 }
4396 }
4397
4398 static void lsetCommand(redisClient *c) {
4399 robj *o;
4400 int index = atoi(c->argv[2]->ptr);
4401 list *list;
4402 listNode *ln;
4403
4404 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4405 checkType(c,o,REDIS_LIST)) return;
4406 list = o->ptr;
4407
4408 ln = listIndex(list, index);
4409 if (ln == NULL) {
4410 addReply(c,shared.outofrangeerr);
4411 } else {
4412 robj *ele = listNodeValue(ln);
4413
4414 decrRefCount(ele);
4415 listNodeValue(ln) = c->argv[3];
4416 incrRefCount(c->argv[3]);
4417 addReply(c,shared.ok);
4418 server.dirty++;
4419 }
4420 }
4421
4422 static void popGenericCommand(redisClient *c, int where) {
4423 robj *o;
4424 list *list;
4425 listNode *ln;
4426
4427 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4428 checkType(c,o,REDIS_LIST)) return;
4429 list = o->ptr;
4430
4431 if (where == REDIS_HEAD)
4432 ln = listFirst(list);
4433 else
4434 ln = listLast(list);
4435
4436 if (ln == NULL) {
4437 addReply(c,shared.nullbulk);
4438 } else {
4439 robj *ele = listNodeValue(ln);
4440 addReplyBulk(c,ele);
4441 listDelNode(list,ln);
4442 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4443 server.dirty++;
4444 }
4445 }
4446
4447 static void lpopCommand(redisClient *c) {
4448 popGenericCommand(c,REDIS_HEAD);
4449 }
4450
4451 static void rpopCommand(redisClient *c) {
4452 popGenericCommand(c,REDIS_TAIL);
4453 }
4454
4455 static void lrangeCommand(redisClient *c) {
4456 robj *o;
4457 int start = atoi(c->argv[2]->ptr);
4458 int end = atoi(c->argv[3]->ptr);
4459 int llen;
4460 int rangelen, j;
4461 list *list;
4462 listNode *ln;
4463 robj *ele;
4464
4465 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4466 checkType(c,o,REDIS_LIST)) return;
4467 list = o->ptr;
4468 llen = listLength(list);
4469
4470 /* convert negative indexes */
4471 if (start < 0) start = llen+start;
4472 if (end < 0) end = llen+end;
4473 if (start < 0) start = 0;
4474 if (end < 0) end = 0;
4475
4476 /* indexes sanity checks */
4477 if (start > end || start >= llen) {
4478 /* Out of range start or start > end result in empty list */
4479 addReply(c,shared.emptymultibulk);
4480 return;
4481 }
4482 if (end >= llen) end = llen-1;
4483 rangelen = (end-start)+1;
4484
4485 /* Return the result in form of a multi-bulk reply */
4486 ln = listIndex(list, start);
4487 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4488 for (j = 0; j < rangelen; j++) {
4489 ele = listNodeValue(ln);
4490 addReplyBulk(c,ele);
4491 ln = ln->next;
4492 }
4493 }
4494
4495 static void ltrimCommand(redisClient *c) {
4496 robj *o;
4497 int start = atoi(c->argv[2]->ptr);
4498 int end = atoi(c->argv[3]->ptr);
4499 int llen;
4500 int j, ltrim, rtrim;
4501 list *list;
4502 listNode *ln;
4503
4504 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4505 checkType(c,o,REDIS_LIST)) return;
4506 list = o->ptr;
4507 llen = listLength(list);
4508
4509 /* convert negative indexes */
4510 if (start < 0) start = llen+start;
4511 if (end < 0) end = llen+end;
4512 if (start < 0) start = 0;
4513 if (end < 0) end = 0;
4514
4515 /* indexes sanity checks */
4516 if (start > end || start >= llen) {
4517 /* Out of range start or start > end result in empty list */
4518 ltrim = llen;
4519 rtrim = 0;
4520 } else {
4521 if (end >= llen) end = llen-1;
4522 ltrim = start;
4523 rtrim = llen-end-1;
4524 }
4525
4526 /* Remove list elements to perform the trim */
4527 for (j = 0; j < ltrim; j++) {
4528 ln = listFirst(list);
4529 listDelNode(list,ln);
4530 }
4531 for (j = 0; j < rtrim; j++) {
4532 ln = listLast(list);
4533 listDelNode(list,ln);
4534 }
4535 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4536 server.dirty++;
4537 addReply(c,shared.ok);
4538 }
4539
4540 static void lremCommand(redisClient *c) {
4541 robj *o;
4542 list *list;
4543 listNode *ln, *next;
4544 int toremove = atoi(c->argv[2]->ptr);
4545 int removed = 0;
4546 int fromtail = 0;
4547
4548 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4549 checkType(c,o,REDIS_LIST)) return;
4550 list = o->ptr;
4551
4552 if (toremove < 0) {
4553 toremove = -toremove;
4554 fromtail = 1;
4555 }
4556 ln = fromtail ? list->tail : list->head;
4557 while (ln) {
4558 robj *ele = listNodeValue(ln);
4559
4560 next = fromtail ? ln->prev : ln->next;
4561 if (compareStringObjects(ele,c->argv[3]) == 0) {
4562 listDelNode(list,ln);
4563 server.dirty++;
4564 removed++;
4565 if (toremove && removed == toremove) break;
4566 }
4567 ln = next;
4568 }
4569 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4570 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4571 }
4572
4573 /* This is the semantic of this command:
4574 * RPOPLPUSH srclist dstlist:
4575 * IF LLEN(srclist) > 0
4576 * element = RPOP srclist
4577 * LPUSH dstlist element
4578 * RETURN element
4579 * ELSE
4580 * RETURN nil
4581 * END
4582 * END
4583 *
4584 * The idea is to be able to get an element from a list in a reliable way
4585 * since the element is not just returned but pushed against another list
4586 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4587 */
4588 static void rpoplpushcommand(redisClient *c) {
4589 robj *sobj;
4590 list *srclist;
4591 listNode *ln;
4592
4593 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4594 checkType(c,sobj,REDIS_LIST)) return;
4595 srclist = sobj->ptr;
4596 ln = listLast(srclist);
4597
4598 if (ln == NULL) {
4599 addReply(c,shared.nullbulk);
4600 } else {
4601 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4602 robj *ele = listNodeValue(ln);
4603 list *dstlist;
4604
4605 if (dobj && dobj->type != REDIS_LIST) {
4606 addReply(c,shared.wrongtypeerr);
4607 return;
4608 }
4609
4610 /* Add the element to the target list (unless it's directly
4611 * passed to some BLPOP-ing client */
4612 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4613 if (dobj == NULL) {
4614 /* Create the list if the key does not exist */
4615 dobj = createListObject();
4616 dictAdd(c->db->dict,c->argv[2],dobj);
4617 incrRefCount(c->argv[2]);
4618 }
4619 dstlist = dobj->ptr;
4620 listAddNodeHead(dstlist,ele);
4621 incrRefCount(ele);
4622 }
4623
4624 /* Send the element to the client as reply as well */
4625 addReplyBulk(c,ele);
4626
4627 /* Finally remove the element from the source list */
4628 listDelNode(srclist,ln);
4629 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4630 server.dirty++;
4631 }
4632 }
4633
4634 /* ==================================== Sets ================================ */
4635
4636 static void saddCommand(redisClient *c) {
4637 robj *set;
4638
4639 set = lookupKeyWrite(c->db,c->argv[1]);
4640 if (set == NULL) {
4641 set = createSetObject();
4642 dictAdd(c->db->dict,c->argv[1],set);
4643 incrRefCount(c->argv[1]);
4644 } else {
4645 if (set->type != REDIS_SET) {
4646 addReply(c,shared.wrongtypeerr);
4647 return;
4648 }
4649 }
4650 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4651 incrRefCount(c->argv[2]);
4652 server.dirty++;
4653 addReply(c,shared.cone);
4654 } else {
4655 addReply(c,shared.czero);
4656 }
4657 }
4658
4659 static void sremCommand(redisClient *c) {
4660 robj *set;
4661
4662 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4663 checkType(c,set,REDIS_SET)) return;
4664
4665 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4666 server.dirty++;
4667 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4668 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4669 addReply(c,shared.cone);
4670 } else {
4671 addReply(c,shared.czero);
4672 }
4673 }
4674
4675 static void smoveCommand(redisClient *c) {
4676 robj *srcset, *dstset;
4677
4678 srcset = lookupKeyWrite(c->db,c->argv[1]);
4679 dstset = lookupKeyWrite(c->db,c->argv[2]);
4680
4681 /* If the source key does not exist return 0, if it's of the wrong type
4682 * raise an error */
4683 if (srcset == NULL || srcset->type != REDIS_SET) {
4684 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4685 return;
4686 }
4687 /* Error if the destination key is not a set as well */
4688 if (dstset && dstset->type != REDIS_SET) {
4689 addReply(c,shared.wrongtypeerr);
4690 return;
4691 }
4692 /* Remove the element from the source set */
4693 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4694 /* Key not found in the src set! return zero */
4695 addReply(c,shared.czero);
4696 return;
4697 }
4698 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4699 deleteKey(c->db,c->argv[1]);
4700 server.dirty++;
4701 /* Add the element to the destination set */
4702 if (!dstset) {
4703 dstset = createSetObject();
4704 dictAdd(c->db->dict,c->argv[2],dstset);
4705 incrRefCount(c->argv[2]);
4706 }
4707 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4708 incrRefCount(c->argv[3]);
4709 addReply(c,shared.cone);
4710 }
4711
4712 static void sismemberCommand(redisClient *c) {
4713 robj *set;
4714
4715 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4716 checkType(c,set,REDIS_SET)) return;
4717
4718 if (dictFind(set->ptr,c->argv[2]))
4719 addReply(c,shared.cone);
4720 else
4721 addReply(c,shared.czero);
4722 }
4723
4724 static void scardCommand(redisClient *c) {
4725 robj *o;
4726 dict *s;
4727
4728 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4729 checkType(c,o,REDIS_SET)) return;
4730
4731 s = o->ptr;
4732 addReplyUlong(c,dictSize(s));
4733 }
4734
4735 static void spopCommand(redisClient *c) {
4736 robj *set;
4737 dictEntry *de;
4738
4739 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4740 checkType(c,set,REDIS_SET)) return;
4741
4742 de = dictGetRandomKey(set->ptr);
4743 if (de == NULL) {
4744 addReply(c,shared.nullbulk);
4745 } else {
4746 robj *ele = dictGetEntryKey(de);
4747
4748 addReplyBulk(c,ele);
4749 dictDelete(set->ptr,ele);
4750 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4751 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4752 server.dirty++;
4753 }
4754 }
4755
4756 static void srandmemberCommand(redisClient *c) {
4757 robj *set;
4758 dictEntry *de;
4759
4760 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4761 checkType(c,set,REDIS_SET)) return;
4762
4763 de = dictGetRandomKey(set->ptr);
4764 if (de == NULL) {
4765 addReply(c,shared.nullbulk);
4766 } else {
4767 robj *ele = dictGetEntryKey(de);
4768
4769 addReplyBulk(c,ele);
4770 }
4771 }
4772
4773 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4774 dict **d1 = (void*) s1, **d2 = (void*) s2;
4775
4776 return dictSize(*d1)-dictSize(*d2);
4777 }
4778
4779 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4780 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4781 dictIterator *di;
4782 dictEntry *de;
4783 robj *lenobj = NULL, *dstset = NULL;
4784 unsigned long j, cardinality = 0;
4785
4786 for (j = 0; j < setsnum; j++) {
4787 robj *setobj;
4788
4789 setobj = dstkey ?
4790 lookupKeyWrite(c->db,setskeys[j]) :
4791 lookupKeyRead(c->db,setskeys[j]);
4792 if (!setobj) {
4793 zfree(dv);
4794 if (dstkey) {
4795 if (deleteKey(c->db,dstkey))
4796 server.dirty++;
4797 addReply(c,shared.czero);
4798 } else {
4799 addReply(c,shared.nullmultibulk);
4800 }
4801 return;
4802 }
4803 if (setobj->type != REDIS_SET) {
4804 zfree(dv);
4805 addReply(c,shared.wrongtypeerr);
4806 return;
4807 }
4808 dv[j] = setobj->ptr;
4809 }
4810 /* Sort sets from the smallest to largest, this will improve our
4811 * algorithm's performace */
4812 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4813
4814 /* The first thing we should output is the total number of elements...
4815 * since this is a multi-bulk write, but at this stage we don't know
4816 * the intersection set size, so we use a trick, append an empty object
4817 * to the output list and save the pointer to later modify it with the
4818 * right length */
4819 if (!dstkey) {
4820 lenobj = createObject(REDIS_STRING,NULL);
4821 addReply(c,lenobj);
4822 decrRefCount(lenobj);
4823 } else {
4824 /* If we have a target key where to store the resulting set
4825 * create this key with an empty set inside */
4826 dstset = createSetObject();
4827 }
4828
4829 /* Iterate all the elements of the first (smallest) set, and test
4830 * the element against all the other sets, if at least one set does
4831 * not include the element it is discarded */
4832 di = dictGetIterator(dv[0]);
4833
4834 while((de = dictNext(di)) != NULL) {
4835 robj *ele;
4836
4837 for (j = 1; j < setsnum; j++)
4838 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4839 if (j != setsnum)
4840 continue; /* at least one set does not contain the member */
4841 ele = dictGetEntryKey(de);
4842 if (!dstkey) {
4843 addReplyBulk(c,ele);
4844 cardinality++;
4845 } else {
4846 dictAdd(dstset->ptr,ele,NULL);
4847 incrRefCount(ele);
4848 }
4849 }
4850 dictReleaseIterator(di);
4851
4852 if (dstkey) {
4853 /* Store the resulting set into the target, if the intersection
4854 * is not an empty set. */
4855 deleteKey(c->db,dstkey);
4856 if (dictSize((dict*)dstset->ptr) > 0) {
4857 dictAdd(c->db->dict,dstkey,dstset);
4858 incrRefCount(dstkey);
4859 addReplyLong(c,dictSize((dict*)dstset->ptr));
4860 } else {
4861 decrRefCount(dstset);
4862 addReply(c,shared.czero);
4863 }
4864 server.dirty++;
4865 } else {
4866 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4867 }
4868 zfree(dv);
4869 }
4870
4871 static void sinterCommand(redisClient *c) {
4872 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4873 }
4874
4875 static void sinterstoreCommand(redisClient *c) {
4876 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4877 }
4878
/* Operation selectors passed as 'op' to the generic set and sorted set
 * union / difference / intersection implementations. */
#define REDIS_OP_UNION  0
#define REDIS_OP_DIFF   1
#define REDIS_OP_INTER  2
4882
4883 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4884 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4885 dictIterator *di;
4886 dictEntry *de;
4887 robj *dstset = NULL;
4888 int j, cardinality = 0;
4889
4890 for (j = 0; j < setsnum; j++) {
4891 robj *setobj;
4892
4893 setobj = dstkey ?
4894 lookupKeyWrite(c->db,setskeys[j]) :
4895 lookupKeyRead(c->db,setskeys[j]);
4896 if (!setobj) {
4897 dv[j] = NULL;
4898 continue;
4899 }
4900 if (setobj->type != REDIS_SET) {
4901 zfree(dv);
4902 addReply(c,shared.wrongtypeerr);
4903 return;
4904 }
4905 dv[j] = setobj->ptr;
4906 }
4907
4908 /* We need a temp set object to store our union. If the dstkey
4909 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4910 * this set object will be the resulting object to set into the target key*/
4911 dstset = createSetObject();
4912
4913 /* Iterate all the elements of all the sets, add every element a single
4914 * time to the result set */
4915 for (j = 0; j < setsnum; j++) {
4916 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4917 if (!dv[j]) continue; /* non existing keys are like empty sets */
4918
4919 di = dictGetIterator(dv[j]);
4920
4921 while((de = dictNext(di)) != NULL) {
4922 robj *ele;
4923
4924 /* dictAdd will not add the same element multiple times */
4925 ele = dictGetEntryKey(de);
4926 if (op == REDIS_OP_UNION || j == 0) {
4927 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4928 incrRefCount(ele);
4929 cardinality++;
4930 }
4931 } else if (op == REDIS_OP_DIFF) {
4932 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4933 cardinality--;
4934 }
4935 }
4936 }
4937 dictReleaseIterator(di);
4938
4939 /* result set is empty? Exit asap. */
4940 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4941 }
4942
4943 /* Output the content of the resulting set, if not in STORE mode */
4944 if (!dstkey) {
4945 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4946 di = dictGetIterator(dstset->ptr);
4947 while((de = dictNext(di)) != NULL) {
4948 robj *ele;
4949
4950 ele = dictGetEntryKey(de);
4951 addReplyBulk(c,ele);
4952 }
4953 dictReleaseIterator(di);
4954 decrRefCount(dstset);
4955 } else {
4956 /* If we have a target key where to store the resulting set
4957 * create this key with the result set inside */
4958 deleteKey(c->db,dstkey);
4959 if (dictSize((dict*)dstset->ptr) > 0) {
4960 dictAdd(c->db->dict,dstkey,dstset);
4961 incrRefCount(dstkey);
4962 addReplyLong(c,dictSize((dict*)dstset->ptr));
4963 } else {
4964 decrRefCount(dstset);
4965 addReply(c,shared.czero);
4966 }
4967 server.dirty++;
4968 }
4969 zfree(dv);
4970 }
4971
4972 static void sunionCommand(redisClient *c) {
4973 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4974 }
4975
4976 static void sunionstoreCommand(redisClient *c) {
4977 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4978 }
4979
4980 static void sdiffCommand(redisClient *c) {
4981 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4982 }
4983
4984 static void sdiffstoreCommand(redisClient *c) {
4985 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4986 }
4987
4988 /* ==================================== ZSets =============================== */
4989
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4997
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows the list to be traversed
 * from tail to head, which is useful for ZREVRANGE. */
5006
5007 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5008 zskiplistNode *zn = zmalloc(sizeof(*zn));
5009
5010 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5011 if (level > 0)
5012 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5013 zn->score = score;
5014 zn->obj = obj;
5015 return zn;
5016 }
5017
5018 static zskiplist *zslCreate(void) {
5019 int j;
5020 zskiplist *zsl;
5021
5022 zsl = zmalloc(sizeof(*zsl));
5023 zsl->level = 1;
5024 zsl->length = 0;
5025 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5026 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5027 zsl->header->forward[j] = NULL;
5028
5029 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5030 if (j < ZSKIPLIST_MAXLEVEL-1)
5031 zsl->header->span[j] = 0;
5032 }
5033 zsl->header->backward = NULL;
5034 zsl->tail = NULL;
5035 return zsl;
5036 }
5037
5038 static void zslFreeNode(zskiplistNode *node) {
5039 decrRefCount(node->obj);
5040 zfree(node->forward);
5041 zfree(node->span);
5042 zfree(node);
5043 }
5044
5045 static void zslFree(zskiplist *zsl) {
5046 zskiplistNode *node = zsl->header->forward[0], *next;
5047
5048 zfree(zsl->header->forward);
5049 zfree(zsl->header->span);
5050 zfree(zsl->header);
5051 while(node) {
5052 next = node->forward[0];
5053 zslFreeNode(node);
5054 node = next;
5055 }
5056 zfree(zsl);
5057 }
5058
5059 static int zslRandomLevel(void) {
5060 int level = 1;
5061 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5062 level += 1;
5063 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5064 }
5065
5066 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5067 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5068 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5069 int i, level;
5070
5071 x = zsl->header;
5072 for (i = zsl->level-1; i >= 0; i--) {
5073 /* store rank that is crossed to reach the insert position */
5074 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5075
5076 while (x->forward[i] &&
5077 (x->forward[i]->score < score ||
5078 (x->forward[i]->score == score &&
5079 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5080 rank[i] += i > 0 ? x->span[i-1] : 1;
5081 x = x->forward[i];
5082 }
5083 update[i] = x;
5084 }
5085 /* we assume the key is not already inside, since we allow duplicated
5086 * scores, and the re-insertion of score and redis object should never
5087 * happpen since the caller of zslInsert() should test in the hash table
5088 * if the element is already inside or not. */
5089 level = zslRandomLevel();
5090 if (level > zsl->level) {
5091 for (i = zsl->level; i < level; i++) {
5092 rank[i] = 0;
5093 update[i] = zsl->header;
5094 update[i]->span[i-1] = zsl->length;
5095 }
5096 zsl->level = level;
5097 }
5098 x = zslCreateNode(level,score,obj);
5099 for (i = 0; i < level; i++) {
5100 x->forward[i] = update[i]->forward[i];
5101 update[i]->forward[i] = x;
5102
5103 /* update span covered by update[i] as x is inserted here */
5104 if (i > 0) {
5105 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5106 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5107 }
5108 }
5109
5110 /* increment span for untouched levels */
5111 for (i = level; i < zsl->level; i++) {
5112 update[i]->span[i-1]++;
5113 }
5114
5115 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5116 if (x->forward[0])
5117 x->forward[0]->backward = x;
5118 else
5119 zsl->tail = x;
5120 zsl->length++;
5121 }
5122
5123 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5124 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5125 int i;
5126 for (i = 0; i < zsl->level; i++) {
5127 if (update[i]->forward[i] == x) {
5128 if (i > 0) {
5129 update[i]->span[i-1] += x->span[i-1] - 1;
5130 }
5131 update[i]->forward[i] = x->forward[i];
5132 } else {
5133 /* invariant: i > 0, because update[0]->forward[0]
5134 * is always equal to x */
5135 update[i]->span[i-1] -= 1;
5136 }
5137 }
5138 if (x->forward[0]) {
5139 x->forward[0]->backward = x->backward;
5140 } else {
5141 zsl->tail = x->backward;
5142 }
5143 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5144 zsl->level--;
5145 zsl->length--;
5146 }
5147
5148 /* Delete an element with matching score/object from the skiplist. */
5149 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5150 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5151 int i;
5152
5153 x = zsl->header;
5154 for (i = zsl->level-1; i >= 0; i--) {
5155 while (x->forward[i] &&
5156 (x->forward[i]->score < score ||
5157 (x->forward[i]->score == score &&
5158 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5159 x = x->forward[i];
5160 update[i] = x;
5161 }
5162 /* We may have multiple elements with the same score, what we need
5163 * is to find the element with both the right score and object. */
5164 x = x->forward[0];
5165 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5166 zslDeleteNode(zsl, x, update);
5167 zslFreeNode(x);
5168 return 1;
5169 } else {
5170 return 0; /* not found */
5171 }
5172 return 0; /* not found */
5173 }
5174
5175 /* Delete all the elements with score between min and max from the skiplist.
5176 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5177 * Note that this function takes the reference to the hash table view of the
5178 * sorted set, in order to remove the elements from the hash table too. */
5179 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5180 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5181 unsigned long removed = 0;
5182 int i;
5183
5184 x = zsl->header;
5185 for (i = zsl->level-1; i >= 0; i--) {
5186 while (x->forward[i] && x->forward[i]->score < min)
5187 x = x->forward[i];
5188 update[i] = x;
5189 }
5190 /* We may have multiple elements with the same score, what we need
5191 * is to find the element with both the right score and object. */
5192 x = x->forward[0];
5193 while (x && x->score <= max) {
5194 zskiplistNode *next = x->forward[0];
5195 zslDeleteNode(zsl, x, update);
5196 dictDelete(dict,x->obj);
5197 zslFreeNode(x);
5198 removed++;
5199 x = next;
5200 }
5201 return removed; /* not found */
5202 }
5203
5204 /* Delete all the elements with rank between start and end from the skiplist.
5205 * Start and end are inclusive. Note that start and end need to be 1-based */
5206 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5207 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5208 unsigned long traversed = 0, removed = 0;
5209 int i;
5210
5211 x = zsl->header;
5212 for (i = zsl->level-1; i >= 0; i--) {
5213 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5214 traversed += i > 0 ? x->span[i-1] : 1;
5215 x = x->forward[i];
5216 }
5217 update[i] = x;
5218 }
5219
5220 traversed++;
5221 x = x->forward[0];
5222 while (x && traversed <= end) {
5223 zskiplistNode *next = x->forward[0];
5224 zslDeleteNode(zsl, x, update);
5225 dictDelete(dict,x->obj);
5226 zslFreeNode(x);
5227 removed++;
5228 traversed++;
5229 x = next;
5230 }
5231 return removed;
5232 }
5233
5234 /* Find the first node having a score equal or greater than the specified one.
5235 * Returns NULL if there is no match. */
5236 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5237 zskiplistNode *x;
5238 int i;
5239
5240 x = zsl->header;
5241 for (i = zsl->level-1; i >= 0; i--) {
5242 while (x->forward[i] && x->forward[i]->score < score)
5243 x = x->forward[i];
5244 }
5245 /* We may have multiple elements with the same score, what we need
5246 * is to find the element with both the right score and object. */
5247 return x->forward[0];
5248 }
5249
5250 /* Find the rank for an element by both score and key.
5251 * Returns 0 when the element cannot be found, rank otherwise.
5252 * Note that the rank is 1-based due to the span of zsl->header to the
5253 * first element. */
5254 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5255 zskiplistNode *x;
5256 unsigned long rank = 0;
5257 int i;
5258
5259 x = zsl->header;
5260 for (i = zsl->level-1; i >= 0; i--) {
5261 while (x->forward[i] &&
5262 (x->forward[i]->score < score ||
5263 (x->forward[i]->score == score &&
5264 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5265 rank += i > 0 ? x->span[i-1] : 1;
5266 x = x->forward[i];
5267 }
5268
5269 /* x might be equal to zsl->header, so test if obj is non-NULL */
5270 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5271 return rank;
5272 }
5273 }
5274 return 0;
5275 }
5276
5277 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5278 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5279 zskiplistNode *x;
5280 unsigned long traversed = 0;
5281 int i;
5282
5283 x = zsl->header;
5284 for (i = zsl->level-1; i >= 0; i--) {
5285 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5286 {
5287 traversed += i > 0 ? x->span[i-1] : 1;
5288 x = x->forward[i];
5289 }
5290 if (traversed == rank) {
5291 return x;
5292 }
5293 }
5294 return NULL;
5295 }
5296
5297 /* The actual Z-commands implementations */
5298
5299 /* This generic command implements both ZADD and ZINCRBY.
5300 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5301 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5302 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5303 robj *zsetobj;
5304 zset *zs;
5305 double *score;
5306
5307 zsetobj = lookupKeyWrite(c->db,key);
5308 if (zsetobj == NULL) {
5309 zsetobj = createZsetObject();
5310 dictAdd(c->db->dict,key,zsetobj);
5311 incrRefCount(key);
5312 } else {
5313 if (zsetobj->type != REDIS_ZSET) {
5314 addReply(c,shared.wrongtypeerr);
5315 return;
5316 }
5317 }
5318 zs = zsetobj->ptr;
5319
5320 /* Ok now since we implement both ZADD and ZINCRBY here the code
5321 * needs to handle the two different conditions. It's all about setting
5322 * '*score', that is, the new score to set, to the right value. */
5323 score = zmalloc(sizeof(double));
5324 if (doincrement) {
5325 dictEntry *de;
5326
5327 /* Read the old score. If the element was not present starts from 0 */
5328 de = dictFind(zs->dict,ele);
5329 if (de) {
5330 double *oldscore = dictGetEntryVal(de);
5331 *score = *oldscore + scoreval;
5332 } else {
5333 *score = scoreval;
5334 }
5335 } else {
5336 *score = scoreval;
5337 }
5338
5339 /* What follows is a simple remove and re-insert operation that is common
5340 * to both ZADD and ZINCRBY... */
5341 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5342 /* case 1: New element */
5343 incrRefCount(ele); /* added to hash */
5344 zslInsert(zs->zsl,*score,ele);
5345 incrRefCount(ele); /* added to skiplist */
5346 server.dirty++;
5347 if (doincrement)
5348 addReplyDouble(c,*score);
5349 else
5350 addReply(c,shared.cone);
5351 } else {
5352 dictEntry *de;
5353 double *oldscore;
5354
5355 /* case 2: Score update operation */
5356 de = dictFind(zs->dict,ele);
5357 redisAssert(de != NULL);
5358 oldscore = dictGetEntryVal(de);
5359 if (*score != *oldscore) {
5360 int deleted;
5361
5362 /* Remove and insert the element in the skip list with new score */
5363 deleted = zslDelete(zs->zsl,*oldscore,ele);
5364 redisAssert(deleted != 0);
5365 zslInsert(zs->zsl,*score,ele);
5366 incrRefCount(ele);
5367 /* Update the score in the hash table */
5368 dictReplace(zs->dict,ele,score);
5369 server.dirty++;
5370 } else {
5371 zfree(score);
5372 }
5373 if (doincrement)
5374 addReplyDouble(c,*score);
5375 else
5376 addReply(c,shared.czero);
5377 }
5378 }
5379
5380 static void zaddCommand(redisClient *c) {
5381 double scoreval;
5382
5383 scoreval = strtod(c->argv[2]->ptr,NULL);
5384 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5385 }
5386
5387 static void zincrbyCommand(redisClient *c) {
5388 double scoreval;
5389
5390 scoreval = strtod(c->argv[2]->ptr,NULL);
5391 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5392 }
5393
5394 static void zremCommand(redisClient *c) {
5395 robj *zsetobj;
5396 zset *zs;
5397 dictEntry *de;
5398 double *oldscore;
5399 int deleted;
5400
5401 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5402 checkType(c,zsetobj,REDIS_ZSET)) return;
5403
5404 zs = zsetobj->ptr;
5405 de = dictFind(zs->dict,c->argv[2]);
5406 if (de == NULL) {
5407 addReply(c,shared.czero);
5408 return;
5409 }
5410 /* Delete from the skiplist */
5411 oldscore = dictGetEntryVal(de);
5412 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5413 redisAssert(deleted != 0);
5414
5415 /* Delete from the hash table */
5416 dictDelete(zs->dict,c->argv[2]);
5417 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5418 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5419 server.dirty++;
5420 addReply(c,shared.cone);
5421 }
5422
5423 static void zremrangebyscoreCommand(redisClient *c) {
5424 double min = strtod(c->argv[2]->ptr,NULL);
5425 double max = strtod(c->argv[3]->ptr,NULL);
5426 long deleted;
5427 robj *zsetobj;
5428 zset *zs;
5429
5430 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5431 checkType(c,zsetobj,REDIS_ZSET)) return;
5432
5433 zs = zsetobj->ptr;
5434 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5435 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5436 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5437 server.dirty += deleted;
5438 addReplyLong(c,deleted);
5439 }
5440
5441 static void zremrangebyrankCommand(redisClient *c) {
5442 int start = atoi(c->argv[2]->ptr);
5443 int end = atoi(c->argv[3]->ptr);
5444 int llen;
5445 long deleted;
5446 robj *zsetobj;
5447 zset *zs;
5448
5449 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5450 checkType(c,zsetobj,REDIS_ZSET)) return;
5451 zs = zsetobj->ptr;
5452 llen = zs->zsl->length;
5453
5454 /* convert negative indexes */
5455 if (start < 0) start = llen+start;
5456 if (end < 0) end = llen+end;
5457 if (start < 0) start = 0;
5458 if (end < 0) end = 0;
5459
5460 /* indexes sanity checks */
5461 if (start > end || start >= llen) {
5462 addReply(c,shared.czero);
5463 return;
5464 }
5465 if (end >= llen) end = llen-1;
5466
5467 /* increment start and end because zsl*Rank functions
5468 * use 1-based rank */
5469 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5470 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5471 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5472 server.dirty += deleted;
5473 addReplyLong(c, deleted);
5474 }
5475
5476 typedef struct {
5477 dict *dict;
5478 double weight;
5479 } zsetopsrc;
5480
5481 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5482 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5483 unsigned long size1, size2;
5484 size1 = d1->dict ? dictSize(d1->dict) : 0;
5485 size2 = d2->dict ? dictSize(d2->dict) : 0;
5486 return size1 - size2;
5487 }
5488
/* How ZUNION / ZINTER combine the scores of a member found in more than
 * one input sorted set. */
#define REDIS_AGGR_SUM  1
#define REDIS_AGGR_MIN  2
#define REDIS_AGGR_MAX  3
5492
5493 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5494 if (aggregate == REDIS_AGGR_SUM) {
5495 *target = *target + val;
5496 } else if (aggregate == REDIS_AGGR_MIN) {
5497 *target = val < *target ? val : *target;
5498 } else if (aggregate == REDIS_AGGR_MAX) {
5499 *target = val > *target ? val : *target;
5500 } else {
5501 /* safety net */
5502 redisAssert(0 != 0);
5503 }
5504 }
5505
5506 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5507 int i, j, zsetnum;
5508 int aggregate = REDIS_AGGR_SUM;
5509 zsetopsrc *src;
5510 robj *dstobj;
5511 zset *dstzset;
5512 dictIterator *di;
5513 dictEntry *de;
5514
5515 /* expect zsetnum input keys to be given */
5516 zsetnum = atoi(c->argv[2]->ptr);
5517 if (zsetnum < 1) {
5518 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5519 return;
5520 }
5521
5522 /* test if the expected number of keys would overflow */
5523 if (3+zsetnum > c->argc) {
5524 addReply(c,shared.syntaxerr);
5525 return;
5526 }
5527
5528 /* read keys to be used for input */
5529 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5530 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5531 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5532 if (!zsetobj) {
5533 src[i].dict = NULL;
5534 } else {
5535 if (zsetobj->type != REDIS_ZSET) {
5536 zfree(src);
5537 addReply(c,shared.wrongtypeerr);
5538 return;
5539 }
5540 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5541 }
5542
5543 /* default all weights to 1 */
5544 src[i].weight = 1.0;
5545 }
5546
5547 /* parse optional extra arguments */
5548 if (j < c->argc) {
5549 int remaining = c->argc - j;
5550
5551 while (remaining) {
5552 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5553 j++; remaining--;
5554 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5555 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5556 }
5557 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5558 j++; remaining--;
5559 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5560 aggregate = REDIS_AGGR_SUM;
5561 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5562 aggregate = REDIS_AGGR_MIN;
5563 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5564 aggregate = REDIS_AGGR_MAX;
5565 } else {
5566 zfree(src);
5567 addReply(c,shared.syntaxerr);
5568 return;
5569 }
5570 j++; remaining--;
5571 } else {
5572 zfree(src);
5573 addReply(c,shared.syntaxerr);
5574 return;
5575 }
5576 }
5577 }
5578
5579 /* sort sets from the smallest to largest, this will improve our
5580 * algorithm's performance */
5581 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5582
5583 dstobj = createZsetObject();
5584 dstzset = dstobj->ptr;
5585
5586 if (op == REDIS_OP_INTER) {
5587 /* skip going over all entries if the smallest zset is NULL or empty */
5588 if (src[0].dict && dictSize(src[0].dict) > 0) {
5589 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5590 * from small to large, all src[i > 0].dict are non-empty too */
5591 di = dictGetIterator(src[0].dict);
5592 while((de = dictNext(di)) != NULL) {
5593 double *score = zmalloc(sizeof(double)), value;
5594 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5595
5596 for (j = 1; j < zsetnum; j++) {
5597 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5598 if (other) {
5599 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5600 zunionInterAggregate(score, value, aggregate);
5601 } else {
5602 break;
5603 }
5604 }
5605
5606 /* skip entry when not present in every source dict */
5607 if (j != zsetnum) {
5608 zfree(score);
5609 } else {
5610 robj *o = dictGetEntryKey(de);
5611 dictAdd(dstzset->dict,o,score);
5612 incrRefCount(o); /* added to dictionary */
5613 zslInsert(dstzset->zsl,*score,o);
5614 incrRefCount(o); /* added to skiplist */
5615 }
5616 }
5617 dictReleaseIterator(di);
5618 }
5619 } else if (op == REDIS_OP_UNION) {
5620 for (i = 0; i < zsetnum; i++) {
5621 if (!src[i].dict) continue;
5622
5623 di = dictGetIterator(src[i].dict);
5624 while((de = dictNext(di)) != NULL) {
5625 /* skip key when already processed */
5626 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5627
5628 double *score = zmalloc(sizeof(double)), value;
5629 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5630
5631 /* because the zsets are sorted by size, its only possible
5632 * for sets at larger indices to hold this entry */
5633 for (j = (i+1); j < zsetnum; j++) {
5634 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5635 if (other) {
5636 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5637 zunionInterAggregate(score, value, aggregate);
5638 }
5639 }
5640
5641 robj *o = dictGetEntryKey(de);
5642 dictAdd(dstzset->dict,o,score);
5643 incrRefCount(o); /* added to dictionary */
5644 zslInsert(dstzset->zsl,*score,o);
5645 incrRefCount(o); /* added to skiplist */
5646 }
5647 dictReleaseIterator(di);
5648 }
5649 } else {
5650 /* unknown operator */
5651 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5652 }
5653
5654 deleteKey(c->db,dstkey);
5655 if (dstzset->zsl->length) {
5656 dictAdd(c->db->dict,dstkey,dstobj);
5657 incrRefCount(dstkey);
5658 addReplyLong(c, dstzset->zsl->length);
5659 server.dirty++;
5660 } else {
5661 decrRefCount(dstobj);
5662 addReply(c, shared.czero);
5663 }
5664 zfree(src);
5665 }
5666
5667 static void zunionCommand(redisClient *c) {
5668 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5669 }
5670
5671 static void zinterCommand(redisClient *c) {
5672 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5673 }
5674
5675 static void zrangeGenericCommand(redisClient *c, int reverse) {
5676 robj *o;
5677 int start = atoi(c->argv[2]->ptr);
5678 int end = atoi(c->argv[3]->ptr);
5679 int withscores = 0;
5680 int llen;
5681 int rangelen, j;
5682 zset *zsetobj;
5683 zskiplist *zsl;
5684 zskiplistNode *ln;
5685 robj *ele;
5686
5687 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5688 withscores = 1;
5689 } else if (c->argc >= 5) {
5690 addReply(c,shared.syntaxerr);
5691 return;
5692 }
5693
5694 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5695 checkType(c,o,REDIS_ZSET)) return;
5696 zsetobj = o->ptr;
5697 zsl = zsetobj->zsl;
5698 llen = zsl->length;
5699
5700 /* convert negative indexes */
5701 if (start < 0) start = llen+start;
5702 if (end < 0) end = llen+end;
5703 if (start < 0) start = 0;
5704 if (end < 0) end = 0;
5705
5706 /* indexes sanity checks */
5707 if (start > end || start >= llen) {
5708 /* Out of range start or start > end result in empty list */
5709 addReply(c,shared.emptymultibulk);
5710 return;
5711 }
5712 if (end >= llen) end = llen-1;
5713 rangelen = (end-start)+1;
5714
5715 /* check if starting point is trivial, before searching
5716 * the element in log(N) time */
5717 if (reverse) {
5718 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5719 } else {
5720 ln = start == 0 ?
5721 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5722 }
5723
5724 /* Return the result in form of a multi-bulk reply */
5725 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5726 withscores ? (rangelen*2) : rangelen));
5727 for (j = 0; j < rangelen; j++) {
5728 ele = ln->obj;
5729 addReplyBulk(c,ele);
5730 if (withscores)
5731 addReplyDouble(c,ln->score);
5732 ln = reverse ? ln->backward : ln->forward[0];
5733 }
5734 }
5735
5736 static void zrangeCommand(redisClient *c) {
5737 zrangeGenericCommand(c,0);
5738 }
5739
5740 static void zrevrangeCommand(redisClient *c) {
5741 zrangeGenericCommand(c,1);
5742 }
5743
5744 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5745 * If justcount is non-zero, just the count is returned. */
5746 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5747 robj *o;
5748 double min, max;
5749 int minex = 0, maxex = 0; /* are min or max exclusive? */
5750 int offset = 0, limit = -1;
5751 int withscores = 0;
5752 int badsyntax = 0;
5753
5754 /* Parse the min-max interval. If one of the values is prefixed
5755 * by the "(" character, it's considered "open". For instance
5756 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5757 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5758 if (((char*)c->argv[2]->ptr)[0] == '(') {
5759 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5760 minex = 1;
5761 } else {
5762 min = strtod(c->argv[2]->ptr,NULL);
5763 }
5764 if (((char*)c->argv[3]->ptr)[0] == '(') {
5765 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5766 maxex = 1;
5767 } else {
5768 max = strtod(c->argv[3]->ptr,NULL);
5769 }
5770
5771 /* Parse "WITHSCORES": note that if the command was called with
5772 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5773 * enter the following paths to parse WITHSCORES and LIMIT. */
5774 if (c->argc == 5 || c->argc == 8) {
5775 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5776 withscores = 1;
5777 else
5778 badsyntax = 1;
5779 }
5780 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5781 badsyntax = 1;
5782 if (badsyntax) {
5783 addReplySds(c,
5784 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5785 return;
5786 }
5787
5788 /* Parse "LIMIT" */
5789 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5790 addReply(c,shared.syntaxerr);
5791 return;
5792 } else if (c->argc == (7 + withscores)) {
5793 offset = atoi(c->argv[5]->ptr);
5794 limit = atoi(c->argv[6]->ptr);
5795 if (offset < 0) offset = 0;
5796 }
5797
5798 /* Ok, lookup the key and get the range */
5799 o = lookupKeyRead(c->db,c->argv[1]);
5800 if (o == NULL) {
5801 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5802 } else {
5803 if (o->type != REDIS_ZSET) {
5804 addReply(c,shared.wrongtypeerr);
5805 } else {
5806 zset *zsetobj = o->ptr;
5807 zskiplist *zsl = zsetobj->zsl;
5808 zskiplistNode *ln;
5809 robj *ele, *lenobj = NULL;
5810 unsigned long rangelen = 0;
5811
5812 /* Get the first node with the score >= min, or with
5813 * score > min if 'minex' is true. */
5814 ln = zslFirstWithScore(zsl,min);
5815 while (minex && ln && ln->score == min) ln = ln->forward[0];
5816
5817 if (ln == NULL) {
5818 /* No element matching the speciifed interval */
5819 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5820 return;
5821 }
5822
5823 /* We don't know in advance how many matching elements there
5824 * are in the list, so we push this object that will represent
5825 * the multi-bulk length in the output buffer, and will "fix"
5826 * it later */
5827 if (!justcount) {
5828 lenobj = createObject(REDIS_STRING,NULL);
5829 addReply(c,lenobj);
5830 decrRefCount(lenobj);
5831 }
5832
5833 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5834 if (offset) {
5835 offset--;
5836 ln = ln->forward[0];
5837 continue;
5838 }
5839 if (limit == 0) break;
5840 if (!justcount) {
5841 ele = ln->obj;
5842 addReplyBulk(c,ele);
5843 if (withscores)
5844 addReplyDouble(c,ln->score);
5845 }
5846 ln = ln->forward[0];
5847 rangelen++;
5848 if (limit > 0) limit--;
5849 }
5850 if (justcount) {
5851 addReplyLong(c,(long)rangelen);
5852 } else {
5853 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5854 withscores ? (rangelen*2) : rangelen);
5855 }
5856 }
5857 }
5858 }
5859
5860 static void zrangebyscoreCommand(redisClient *c) {
5861 genericZrangebyscoreCommand(c,0);
5862 }
5863
5864 static void zcountCommand(redisClient *c) {
5865 genericZrangebyscoreCommand(c,1);
5866 }
5867
5868 static void zcardCommand(redisClient *c) {
5869 robj *o;
5870 zset *zs;
5871
5872 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5873 checkType(c,o,REDIS_ZSET)) return;
5874
5875 zs = o->ptr;
5876 addReplyUlong(c,zs->zsl->length);
5877 }
5878
5879 static void zscoreCommand(redisClient *c) {
5880 robj *o;
5881 zset *zs;
5882 dictEntry *de;
5883
5884 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5885 checkType(c,o,REDIS_ZSET)) return;
5886
5887 zs = o->ptr;
5888 de = dictFind(zs->dict,c->argv[2]);
5889 if (!de) {
5890 addReply(c,shared.nullbulk);
5891 } else {
5892 double *score = dictGetEntryVal(de);
5893
5894 addReplyDouble(c,*score);
5895 }
5896 }
5897
5898 static void zrankGenericCommand(redisClient *c, int reverse) {
5899 robj *o;
5900 zset *zs;
5901 zskiplist *zsl;
5902 dictEntry *de;
5903 unsigned long rank;
5904 double *score;
5905
5906 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5907 checkType(c,o,REDIS_ZSET)) return;
5908
5909 zs = o->ptr;
5910 zsl = zs->zsl;
5911 de = dictFind(zs->dict,c->argv[2]);
5912 if (!de) {
5913 addReply(c,shared.nullbulk);
5914 return;
5915 }
5916
5917 score = dictGetEntryVal(de);
5918 rank = zslGetRank(zsl, *score, c->argv[2]);
5919 if (rank) {
5920 if (reverse) {
5921 addReplyLong(c, zsl->length - rank);
5922 } else {
5923 addReplyLong(c, rank-1);
5924 }
5925 } else {
5926 addReply(c,shared.nullbulk);
5927 }
5928 }
5929
5930 static void zrankCommand(redisClient *c) {
5931 zrankGenericCommand(c, 0);
5932 }
5933
5934 static void zrevrankCommand(redisClient *c) {
5935 zrankGenericCommand(c, 1);
5936 }
5937
5938 /* =================================== Hashes =============================== */
5939 static void hsetCommand(redisClient *c) {
5940 int update = 0;
5941 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5942
5943 if (o == NULL) {
5944 o = createHashObject();
5945 dictAdd(c->db->dict,c->argv[1],o);
5946 incrRefCount(c->argv[1]);
5947 } else {
5948 if (o->type != REDIS_HASH) {
5949 addReply(c,shared.wrongtypeerr);
5950 return;
5951 }
5952 }
5953 /* We want to convert the zipmap into an hash table right now if the
5954 * entry to be added is too big. Note that we check if the object
5955 * is integer encoded before to try fetching the length in the test below.
5956 * This is because integers are small, but currently stringObjectLen()
5957 * performs a slow conversion: not worth it. */
5958 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5959 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5960 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5961 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5962 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5963 {
5964 convertToRealHash(o);
5965 }
5966
5967 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5968 unsigned char *zm = o->ptr;
5969 robj *valobj = getDecodedObject(c->argv[3]);
5970
5971 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5972 valobj->ptr,sdslen(valobj->ptr),&update);
5973 decrRefCount(valobj);
5974 o->ptr = zm;
5975
5976 /* And here there is the second check for hash conversion. */
5977 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
5978 convertToRealHash(o);
5979 } else {
5980 tryObjectEncoding(c->argv[2]);
5981 /* note that c->argv[3] is already encoded, as the latest arg
5982 * of a bulk command is always integer encoded if possible. */
5983 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5984 incrRefCount(c->argv[2]);
5985 } else {
5986 update = 1;
5987 }
5988 incrRefCount(c->argv[3]);
5989 }
5990 server.dirty++;
5991 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5992 }
5993
5994 static void hincrbyCommand(redisClient *c) {
5995 long long value = 0, incr = 0;
5996 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5997
5998 if (o == NULL) {
5999 o = createHashObject();
6000 dictAdd(c->db->dict,c->argv[1],o);
6001 incrRefCount(c->argv[1]);
6002 } else {
6003 if (o->type != REDIS_HASH) {
6004 addReply(c,shared.wrongtypeerr);
6005 return;
6006 }
6007 }
6008
6009 incr = strtoll(c->argv[3]->ptr, NULL, 10);
6010 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6011 unsigned char *zm = o->ptr;
6012 unsigned char *zval;
6013 unsigned int zvlen;
6014
6015 /* Find value if already present in hash */
6016 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6017 &zval,&zvlen)) {
6018 /* strtoll needs the char* to have a trailing \0, but
6019 * the zipmap doesn't include them. */
6020 sds szval = sdsnewlen(zval, zvlen);
6021 value = strtoll(szval,NULL,10);
6022 sdsfree(szval);
6023 }
6024
6025 value += incr;
6026 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6027 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6028 (unsigned char*)svalue,sdslen(svalue),NULL);
6029 sdsfree(svalue);
6030 o->ptr = zm;
6031
6032 /* Check if the zipmap needs to be converted. */
6033 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6034 convertToRealHash(o);
6035 } else {
6036 robj *hval;
6037 dictEntry *de;
6038
6039 /* Find value if already present in hash */
6040 de = dictFind(o->ptr,c->argv[2]);
6041 if (de != NULL) {
6042 hval = dictGetEntryVal(de);
6043 if (hval->encoding == REDIS_ENCODING_RAW)
6044 value = strtoll(hval->ptr,NULL,10);
6045 else if (hval->encoding == REDIS_ENCODING_INT)
6046 value = (long)hval->ptr;
6047 else
6048 redisAssert(1 != 1);
6049 }
6050
6051 value += incr;
6052 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6053 tryObjectEncoding(hval);
6054 if (dictReplace(o->ptr,c->argv[2],hval)) {
6055 incrRefCount(c->argv[2]);
6056 }
6057 }
6058
6059 server.dirty++;
6060 addReplyLongLong(c, value);
6061 }
6062
6063 static void hgetCommand(redisClient *c) {
6064 robj *o;
6065
6066 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6067 checkType(c,o,REDIS_HASH)) return;
6068
6069 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6070 unsigned char *zm = o->ptr;
6071 unsigned char *val;
6072 unsigned int vlen;
6073 robj *field;
6074
6075 field = getDecodedObject(c->argv[2]);
6076 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6077 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6078 addReplySds(c,sdsnewlen(val,vlen));
6079 addReply(c,shared.crlf);
6080 decrRefCount(field);
6081 return;
6082 } else {
6083 addReply(c,shared.nullbulk);
6084 decrRefCount(field);
6085 return;
6086 }
6087 } else {
6088 struct dictEntry *de;
6089
6090 de = dictFind(o->ptr,c->argv[2]);
6091 if (de == NULL) {
6092 addReply(c,shared.nullbulk);
6093 } else {
6094 robj *e = dictGetEntryVal(de);
6095
6096 addReplyBulk(c,e);
6097 }
6098 }
6099 }
6100
6101 static void hdelCommand(redisClient *c) {
6102 robj *o;
6103 int deleted = 0;
6104
6105 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6106 checkType(c,o,REDIS_HASH)) return;
6107
6108 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6109 robj *field = getDecodedObject(c->argv[2]);
6110
6111 o->ptr = zipmapDel((unsigned char*) o->ptr,
6112 (unsigned char*) field->ptr,
6113 sdslen(field->ptr), &deleted);
6114 decrRefCount(field);
6115 if (zipmapLen((unsigned char*) o->ptr) == 0)
6116 deleteKey(c->db,c->argv[1]);
6117 } else {
6118 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6119 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6120 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6121 }
6122 if (deleted) server.dirty++;
6123 addReply(c,deleted ? shared.cone : shared.czero);
6124 }
6125
6126 static void hlenCommand(redisClient *c) {
6127 robj *o;
6128 unsigned long len;
6129
6130 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6131 checkType(c,o,REDIS_HASH)) return;
6132
6133 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6134 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6135 addReplyUlong(c,len);
6136 }
6137
6138 #define REDIS_GETALL_KEYS 1
6139 #define REDIS_GETALL_VALS 2
6140 static void genericHgetallCommand(redisClient *c, int flags) {
6141 robj *o, *lenobj;
6142 unsigned long count = 0;
6143
6144 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6145 || checkType(c,o,REDIS_HASH)) return;
6146
6147 lenobj = createObject(REDIS_STRING,NULL);
6148 addReply(c,lenobj);
6149 decrRefCount(lenobj);
6150
6151 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6152 unsigned char *p = zipmapRewind(o->ptr);
6153 unsigned char *field, *val;
6154 unsigned int flen, vlen;
6155
6156 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6157 robj *aux;
6158
6159 if (flags & REDIS_GETALL_KEYS) {
6160 aux = createStringObject((char*)field,flen);
6161 addReplyBulk(c,aux);
6162 decrRefCount(aux);
6163 count++;
6164 }
6165 if (flags & REDIS_GETALL_VALS) {
6166 aux = createStringObject((char*)val,vlen);
6167 addReplyBulk(c,aux);
6168 decrRefCount(aux);
6169 count++;
6170 }
6171 }
6172 } else {
6173 dictIterator *di = dictGetIterator(o->ptr);
6174 dictEntry *de;
6175
6176 while((de = dictNext(di)) != NULL) {
6177 robj *fieldobj = dictGetEntryKey(de);
6178 robj *valobj = dictGetEntryVal(de);
6179
6180 if (flags & REDIS_GETALL_KEYS) {
6181 addReplyBulk(c,fieldobj);
6182 count++;
6183 }
6184 if (flags & REDIS_GETALL_VALS) {
6185 addReplyBulk(c,valobj);
6186 count++;
6187 }
6188 }
6189 dictReleaseIterator(di);
6190 }
6191 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6192 }
6193
6194 static void hkeysCommand(redisClient *c) {
6195 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6196 }
6197
6198 static void hvalsCommand(redisClient *c) {
6199 genericHgetallCommand(c,REDIS_GETALL_VALS);
6200 }
6201
6202 static void hgetallCommand(redisClient *c) {
6203 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6204 }
6205
6206 static void hexistsCommand(redisClient *c) {
6207 robj *o;
6208 int exists = 0;
6209
6210 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6211 checkType(c,o,REDIS_HASH)) return;
6212
6213 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6214 robj *field;
6215 unsigned char *zm = o->ptr;
6216
6217 field = getDecodedObject(c->argv[2]);
6218 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6219 decrRefCount(field);
6220 } else {
6221 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6222 }
6223 addReply(c,exists ? shared.cone : shared.czero);
6224 }
6225
6226 static void convertToRealHash(robj *o) {
6227 unsigned char *key, *val, *p, *zm = o->ptr;
6228 unsigned int klen, vlen;
6229 dict *dict = dictCreate(&hashDictType,NULL);
6230
6231 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6232 p = zipmapRewind(zm);
6233 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6234 robj *keyobj, *valobj;
6235
6236 keyobj = createStringObject((char*)key,klen);
6237 valobj = createStringObject((char*)val,vlen);
6238 tryObjectEncoding(keyobj);
6239 tryObjectEncoding(valobj);
6240 dictAdd(dict,keyobj,valobj);
6241 }
6242 o->encoding = REDIS_ENCODING_HT;
6243 o->ptr = dict;
6244 zfree(zm);
6245 }
6246
6247 /* ========================= Non type-specific commands ==================== */
6248
6249 static void flushdbCommand(redisClient *c) {
6250 server.dirty += dictSize(c->db->dict);
6251 dictEmpty(c->db->dict);
6252 dictEmpty(c->db->expires);
6253 addReply(c,shared.ok);
6254 }
6255
6256 static void flushallCommand(redisClient *c) {
6257 server.dirty += emptyDb();
6258 addReply(c,shared.ok);
6259 if (server.bgsavechildpid != -1) {
6260 kill(server.bgsavechildpid,SIGKILL);
6261 rdbRemoveTempFile(server.bgsavechildpid);
6262 }
6263 rdbSave(server.dbfilename);
6264 server.dirty++;
6265 }
6266
6267 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6268 redisSortOperation *so = zmalloc(sizeof(*so));
6269 so->type = type;
6270 so->pattern = pattern;
6271 return so;
6272 }
6273
6274 /* Return the value associated to the key with a name obtained
6275 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6276 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6277 char *p;
6278 sds spat, ssub;
6279 robj keyobj;
6280 int prefixlen, sublen, postfixlen;
6281 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6282 struct {
6283 long len;
6284 long free;
6285 char buf[REDIS_SORTKEY_MAX+1];
6286 } keyname;
6287
6288 /* If the pattern is "#" return the substitution object itself in order
6289 * to implement the "SORT ... GET #" feature. */
6290 spat = pattern->ptr;
6291 if (spat[0] == '#' && spat[1] == '\0') {
6292 return subst;
6293 }
6294
6295 /* The substitution object may be specially encoded. If so we create
6296 * a decoded object on the fly. Otherwise getDecodedObject will just
6297 * increment the ref count, that we'll decrement later. */
6298 subst = getDecodedObject(subst);
6299
6300 ssub = subst->ptr;
6301 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6302 p = strchr(spat,'*');
6303 if (!p) {
6304 decrRefCount(subst);
6305 return NULL;
6306 }
6307
6308 prefixlen = p-spat;
6309 sublen = sdslen(ssub);
6310 postfixlen = sdslen(spat)-(prefixlen+1);
6311 memcpy(keyname.buf,spat,prefixlen);
6312 memcpy(keyname.buf+prefixlen,ssub,sublen);
6313 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6314 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6315 keyname.len = prefixlen+sublen+postfixlen;
6316
6317 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6318 decrRefCount(subst);
6319
6320 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6321 return lookupKeyRead(db,&keyobj);
6322 }
6323
6324 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6325 * the additional parameter is not standard but a BSD-specific we have to
6326 * pass sorting parameters via the global 'server' structure */
6327 static int sortCompare(const void *s1, const void *s2) {
6328 const redisSortObject *so1 = s1, *so2 = s2;
6329 int cmp;
6330
6331 if (!server.sort_alpha) {
6332 /* Numeric sorting. Here it's trivial as we precomputed scores */
6333 if (so1->u.score > so2->u.score) {
6334 cmp = 1;
6335 } else if (so1->u.score < so2->u.score) {
6336 cmp = -1;
6337 } else {
6338 cmp = 0;
6339 }
6340 } else {
6341 /* Alphanumeric sorting */
6342 if (server.sort_bypattern) {
6343 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6344 /* At least one compare object is NULL */
6345 if (so1->u.cmpobj == so2->u.cmpobj)
6346 cmp = 0;
6347 else if (so1->u.cmpobj == NULL)
6348 cmp = -1;
6349 else
6350 cmp = 1;
6351 } else {
6352 /* We have both the objects, use strcoll */
6353 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6354 }
6355 } else {
6356 /* Compare elements directly */
6357 robj *dec1, *dec2;
6358
6359 dec1 = getDecodedObject(so1->obj);
6360 dec2 = getDecodedObject(so2->obj);
6361 cmp = strcoll(dec1->ptr,dec2->ptr);
6362 decrRefCount(dec1);
6363 decrRefCount(dec2);
6364 }
6365 }
6366 return server.sort_desc ? -cmp : cmp;
6367 }
6368
6369 /* The SORT command is the most complex command in Redis. Warning: this code
6370 * is optimized for speed and a bit less for readability */
6371 static void sortCommand(redisClient *c) {
6372 list *operations;
6373 int outputlen = 0;
6374 int desc = 0, alpha = 0;
6375 int limit_start = 0, limit_count = -1, start, end;
6376 int j, dontsort = 0, vectorlen;
6377 int getop = 0; /* GET operation counter */
6378 robj *sortval, *sortby = NULL, *storekey = NULL;
6379 redisSortObject *vector; /* Resulting vector to sort */
6380
6381 /* Lookup the key to sort. It must be of the right types */
6382 sortval = lookupKeyRead(c->db,c->argv[1]);
6383 if (sortval == NULL) {
6384 addReply(c,shared.nullmultibulk);
6385 return;
6386 }
6387 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6388 sortval->type != REDIS_ZSET)
6389 {
6390 addReply(c,shared.wrongtypeerr);
6391 return;
6392 }
6393
6394 /* Create a list of operations to perform for every sorted element.
6395 * Operations can be GET/DEL/INCR/DECR */
6396 operations = listCreate();
6397 listSetFreeMethod(operations,zfree);
6398 j = 2;
6399
6400 /* Now we need to protect sortval incrementing its count, in the future
6401 * SORT may have options able to overwrite/delete keys during the sorting
6402 * and the sorted key itself may get destroied */
6403 incrRefCount(sortval);
6404
6405 /* The SORT command has an SQL-alike syntax, parse it */
6406 while(j < c->argc) {
6407 int leftargs = c->argc-j-1;
6408 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6409 desc = 0;
6410 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6411 desc = 1;
6412 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6413 alpha = 1;
6414 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6415 limit_start = atoi(c->argv[j+1]->ptr);
6416 limit_count = atoi(c->argv[j+2]->ptr);
6417 j+=2;
6418 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6419 storekey = c->argv[j+1];
6420 j++;
6421 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6422 sortby = c->argv[j+1];
6423 /* If the BY pattern does not contain '*', i.e. it is constant,
6424 * we don't need to sort nor to lookup the weight keys. */
6425 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6426 j++;
6427 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6428 listAddNodeTail(operations,createSortOperation(
6429 REDIS_SORT_GET,c->argv[j+1]));
6430 getop++;
6431 j++;
6432 } else {
6433 decrRefCount(sortval);
6434 listRelease(operations);
6435 addReply(c,shared.syntaxerr);
6436 return;
6437 }
6438 j++;
6439 }
6440
6441 /* Load the sorting vector with all the objects to sort */
6442 switch(sortval->type) {
6443 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6444 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6445 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6446 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6447 }
6448 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6449 j = 0;
6450
6451 if (sortval->type == REDIS_LIST) {
6452 list *list = sortval->ptr;
6453 listNode *ln;
6454 listIter li;
6455
6456 listRewind(list,&li);
6457 while((ln = listNext(&li))) {
6458 robj *ele = ln->value;
6459 vector[j].obj = ele;
6460 vector[j].u.score = 0;
6461 vector[j].u.cmpobj = NULL;
6462 j++;
6463 }
6464 } else {
6465 dict *set;
6466 dictIterator *di;
6467 dictEntry *setele;
6468
6469 if (sortval->type == REDIS_SET) {
6470 set = sortval->ptr;
6471 } else {
6472 zset *zs = sortval->ptr;
6473 set = zs->dict;
6474 }
6475
6476 di = dictGetIterator(set);
6477 while((setele = dictNext(di)) != NULL) {
6478 vector[j].obj = dictGetEntryKey(setele);
6479 vector[j].u.score = 0;
6480 vector[j].u.cmpobj = NULL;
6481 j++;
6482 }
6483 dictReleaseIterator(di);
6484 }
6485 redisAssert(j == vectorlen);
6486
6487 /* Now it's time to load the right scores in the sorting vector */
6488 if (dontsort == 0) {
6489 for (j = 0; j < vectorlen; j++) {
6490 if (sortby) {
6491 robj *byval;
6492
6493 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6494 if (!byval || byval->type != REDIS_STRING) continue;
6495 if (alpha) {
6496 vector[j].u.cmpobj = getDecodedObject(byval);
6497 } else {
6498 if (byval->encoding == REDIS_ENCODING_RAW) {
6499 vector[j].u.score = strtod(byval->ptr,NULL);
6500 } else {
6501 /* Don't need to decode the object if it's
6502 * integer-encoded (the only encoding supported) so
6503 * far. We can just cast it */
6504 if (byval->encoding == REDIS_ENCODING_INT) {
6505 vector[j].u.score = (long)byval->ptr;
6506 } else
6507 redisAssert(1 != 1);
6508 }
6509 }
6510 } else {
6511 if (!alpha) {
6512 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6513 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6514 else {
6515 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6516 vector[j].u.score = (long) vector[j].obj->ptr;
6517 else
6518 redisAssert(1 != 1);
6519 }
6520 }
6521 }
6522 }
6523 }
6524
6525 /* We are ready to sort the vector... perform a bit of sanity check
6526 * on the LIMIT option too. We'll use a partial version of quicksort. */
6527 start = (limit_start < 0) ? 0 : limit_start;
6528 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6529 if (start >= vectorlen) {
6530 start = vectorlen-1;
6531 end = vectorlen-2;
6532 }
6533 if (end >= vectorlen) end = vectorlen-1;
6534
6535 if (dontsort == 0) {
6536 server.sort_desc = desc;
6537 server.sort_alpha = alpha;
6538 server.sort_bypattern = sortby ? 1 : 0;
6539 if (sortby && (start != 0 || end != vectorlen-1))
6540 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6541 else
6542 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6543 }
6544
6545 /* Send command output to the output buffer, performing the specified
6546 * GET/DEL/INCR/DECR operations if any. */
6547 outputlen = getop ? getop*(end-start+1) : end-start+1;
6548 if (storekey == NULL) {
6549 /* STORE option not specified, sent the sorting result to client */
6550 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6551 for (j = start; j <= end; j++) {
6552 listNode *ln;
6553 listIter li;
6554
6555 if (!getop) addReplyBulk(c,vector[j].obj);
6556 listRewind(operations,&li);
6557 while((ln = listNext(&li))) {
6558 redisSortOperation *sop = ln->value;
6559 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6560 vector[j].obj);
6561
6562 if (sop->type == REDIS_SORT_GET) {
6563 if (!val || val->type != REDIS_STRING) {
6564 addReply(c,shared.nullbulk);
6565 } else {
6566 addReplyBulk(c,val);
6567 }
6568 } else {
6569 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6570 }
6571 }
6572 }
6573 } else {
6574 robj *listObject = createListObject();
6575 list *listPtr = (list*) listObject->ptr;
6576
6577 /* STORE option specified, set the sorting result as a List object */
6578 for (j = start; j <= end; j++) {
6579 listNode *ln;
6580 listIter li;
6581
6582 if (!getop) {
6583 listAddNodeTail(listPtr,vector[j].obj);
6584 incrRefCount(vector[j].obj);
6585 }
6586 listRewind(operations,&li);
6587 while((ln = listNext(&li))) {
6588 redisSortOperation *sop = ln->value;
6589 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6590 vector[j].obj);
6591
6592 if (sop->type == REDIS_SORT_GET) {
6593 if (!val || val->type != REDIS_STRING) {
6594 listAddNodeTail(listPtr,createStringObject("",0));
6595 } else {
6596 listAddNodeTail(listPtr,val);
6597 incrRefCount(val);
6598 }
6599 } else {
6600 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6601 }
6602 }
6603 }
6604 if (dictReplace(c->db->dict,storekey,listObject)) {
6605 incrRefCount(storekey);
6606 }
6607 /* Note: we add 1 because the DB is dirty anyway since even if the
6608 * SORT result is empty a new key is set and maybe the old content
6609 * replaced. */
6610 server.dirty += 1+outputlen;
6611 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6612 }
6613
6614 /* Cleanup */
6615 decrRefCount(sortval);
6616 listRelease(operations);
6617 for (j = 0; j < vectorlen; j++) {
6618 if (sortby && alpha && vector[j].u.cmpobj)
6619 decrRefCount(vector[j].u.cmpobj);
6620 }
6621 zfree(vector);
6622 }
6623
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer and must be large enough for the result (the
 * longest output is a 20 digit number followed by 'B' and the nul term).
 * 'n' is the number of bytes to represent.
 *
 * Values below 1024 are printed as an integer number of bytes, larger
 * ones with two decimal digits and a K, M, G, T or P suffix (powers of
 * 1024). Values of 1024 petabytes or more fall back to a plain byte
 * count, so that 's' is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the suffixes above: print the raw byte count
         * rather than leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
6644
6645 /* Create the string returned by the INFO command. This is decoupled
6646 * by the INFO command itself as we need to report the same information
6647 * on memory corruption problems. */
6648 static sds genRedisInfoString(void) {
6649 sds info;
6650 time_t uptime = time(NULL)-server.stat_starttime;
6651 int j;
6652 char hmem[64];
6653
6654 bytesToHuman(hmem,zmalloc_used_memory());
6655 info = sdscatprintf(sdsempty(),
6656 "redis_version:%s\r\n"
6657 "arch_bits:%s\r\n"
6658 "multiplexing_api:%s\r\n"
6659 "process_id:%ld\r\n"
6660 "uptime_in_seconds:%ld\r\n"
6661 "uptime_in_days:%ld\r\n"
6662 "connected_clients:%d\r\n"
6663 "connected_slaves:%d\r\n"
6664 "blocked_clients:%d\r\n"
6665 "used_memory:%zu\r\n"
6666 "used_memory_human:%s\r\n"
6667 "changes_since_last_save:%lld\r\n"
6668 "bgsave_in_progress:%d\r\n"
6669 "last_save_time:%ld\r\n"
6670 "bgrewriteaof_in_progress:%d\r\n"
6671 "total_connections_received:%lld\r\n"
6672 "total_commands_processed:%lld\r\n"
6673 "expired_keys:%lld\r\n"
6674 "hash_max_zipmap_entries:%ld\r\n"
6675 "hash_max_zipmap_value:%ld\r\n"
6676 "pubsub_channels:%ld\r\n"
6677 "pubsub_patterns:%u\r\n"
6678 "vm_enabled:%d\r\n"
6679 "role:%s\r\n"
6680 ,REDIS_VERSION,
6681 (sizeof(long) == 8) ? "64" : "32",
6682 aeGetApiName(),
6683 (long) getpid(),
6684 uptime,
6685 uptime/(3600*24),
6686 listLength(server.clients)-listLength(server.slaves),
6687 listLength(server.slaves),
6688 server.blpop_blocked_clients,
6689 zmalloc_used_memory(),
6690 hmem,
6691 server.dirty,
6692 server.bgsavechildpid != -1,
6693 server.lastsave,
6694 server.bgrewritechildpid != -1,
6695 server.stat_numconnections,
6696 server.stat_numcommands,
6697 server.stat_expiredkeys,
6698 server.hash_max_zipmap_entries,
6699 server.hash_max_zipmap_value,
6700 dictSize(server.pubsub_channels),
6701 listLength(server.pubsub_patterns),
6702 server.vm_enabled != 0,
6703 server.masterhost == NULL ? "master" : "slave"
6704 );
6705 if (server.masterhost) {
6706 info = sdscatprintf(info,
6707 "master_host:%s\r\n"
6708 "master_port:%d\r\n"
6709 "master_link_status:%s\r\n"
6710 "master_last_io_seconds_ago:%d\r\n"
6711 ,server.masterhost,
6712 server.masterport,
6713 (server.replstate == REDIS_REPL_CONNECTED) ?
6714 "up" : "down",
6715 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6716 );
6717 }
6718 if (server.vm_enabled) {
6719 lockThreadedIO();
6720 info = sdscatprintf(info,
6721 "vm_conf_max_memory:%llu\r\n"
6722 "vm_conf_page_size:%llu\r\n"
6723 "vm_conf_pages:%llu\r\n"
6724 "vm_stats_used_pages:%llu\r\n"
6725 "vm_stats_swapped_objects:%llu\r\n"
6726 "vm_stats_swappin_count:%llu\r\n"
6727 "vm_stats_swappout_count:%llu\r\n"
6728 "vm_stats_io_newjobs_len:%lu\r\n"
6729 "vm_stats_io_processing_len:%lu\r\n"
6730 "vm_stats_io_processed_len:%lu\r\n"
6731 "vm_stats_io_active_threads:%lu\r\n"
6732 "vm_stats_blocked_clients:%lu\r\n"
6733 ,(unsigned long long) server.vm_max_memory,
6734 (unsigned long long) server.vm_page_size,
6735 (unsigned long long) server.vm_pages,
6736 (unsigned long long) server.vm_stats_used_pages,
6737 (unsigned long long) server.vm_stats_swapped_objects,
6738 (unsigned long long) server.vm_stats_swapins,
6739 (unsigned long long) server.vm_stats_swapouts,
6740 (unsigned long) listLength(server.io_newjobs),
6741 (unsigned long) listLength(server.io_processing),
6742 (unsigned long) listLength(server.io_processed),
6743 (unsigned long) server.io_active_threads,
6744 (unsigned long) server.vm_blocked_clients
6745 );
6746 unlockThreadedIO();
6747 }
6748 for (j = 0; j < server.dbnum; j++) {
6749 long long keys, vkeys;
6750
6751 keys = dictSize(server.db[j].dict);
6752 vkeys = dictSize(server.db[j].expires);
6753 if (keys || vkeys) {
6754 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6755 j, keys, vkeys);
6756 }
6757 }
6758 return info;
6759 }
6760
6761 static void infoCommand(redisClient *c) {
6762 sds info = genRedisInfoString();
6763 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6764 (unsigned long)sdslen(info)));
6765 addReplySds(c,info);
6766 addReply(c,shared.crlf);
6767 }
6768
6769 static void monitorCommand(redisClient *c) {
6770 /* ignore MONITOR if aleady slave or in monitor mode */
6771 if (c->flags & REDIS_SLAVE) return;
6772
6773 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6774 c->slaveseldb = 0;
6775 listAddNodeTail(server.monitors,c);
6776 addReply(c,shared.ok);
6777 }
6778
6779 /* ================================= Expire ================================= */
6780 static int removeExpire(redisDb *db, robj *key) {
6781 if (dictDelete(db->expires,key) == DICT_OK) {
6782 return 1;
6783 } else {
6784 return 0;
6785 }
6786 }
6787
6788 static int setExpire(redisDb *db, robj *key, time_t when) {
6789 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6790 return 0;
6791 } else {
6792 incrRefCount(key);
6793 return 1;
6794 }
6795 }
6796
6797 /* Return the expire time of the specified key, or -1 if no expire
6798 * is associated with this key (i.e. the key is non volatile) */
6799 static time_t getExpire(redisDb *db, robj *key) {
6800 dictEntry *de;
6801
6802 /* No expire? return ASAP */
6803 if (dictSize(db->expires) == 0 ||
6804 (de = dictFind(db->expires,key)) == NULL) return -1;
6805
6806 return (time_t) dictGetEntryVal(de);
6807 }
6808
6809 static int expireIfNeeded(redisDb *db, robj *key) {
6810 time_t when;
6811 dictEntry *de;
6812
6813 /* No expire? return ASAP */
6814 if (dictSize(db->expires) == 0 ||
6815 (de = dictFind(db->expires,key)) == NULL) return 0;
6816
6817 /* Lookup the expire */
6818 when = (time_t) dictGetEntryVal(de);
6819 if (time(NULL) <= when) return 0;
6820
6821 /* Delete the key */
6822 dictDelete(db->expires,key);
6823 server.stat_expiredkeys++;
6824 return dictDelete(db->dict,key) == DICT_OK;
6825 }
6826
6827 static int deleteIfVolatile(redisDb *db, robj *key) {
6828 dictEntry *de;
6829
6830 /* No expire? return ASAP */
6831 if (dictSize(db->expires) == 0 ||
6832 (de = dictFind(db->expires,key)) == NULL) return 0;
6833
6834 /* Delete the key */
6835 server.dirty++;
6836 server.stat_expiredkeys++;
6837 dictDelete(db->expires,key);
6838 return dictDelete(db->dict,key) == DICT_OK;
6839 }
6840
6841 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6842 dictEntry *de;
6843
6844 de = dictFind(c->db->dict,key);
6845 if (de == NULL) {
6846 addReply(c,shared.czero);
6847 return;
6848 }
6849 if (seconds < 0) {
6850 if (deleteKey(c->db,key)) server.dirty++;
6851 addReply(c, shared.cone);
6852 return;
6853 } else {
6854 time_t when = time(NULL)+seconds;
6855 if (setExpire(c->db,key,when)) {
6856 addReply(c,shared.cone);
6857 server.dirty++;
6858 } else {
6859 addReply(c,shared.czero);
6860 }
6861 return;
6862 }
6863 }
6864
6865 static void expireCommand(redisClient *c) {
6866 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6867 }
6868
6869 static void expireatCommand(redisClient *c) {
6870 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6871 }
6872
6873 static void ttlCommand(redisClient *c) {
6874 time_t expire;
6875 int ttl = -1;
6876
6877 expire = getExpire(c->db,c->argv[1]);
6878 if (expire != -1) {
6879 ttl = (int) (expire-time(NULL));
6880 if (ttl < 0) ttl = -1;
6881 }
6882 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6883 }
6884
6885 /* ================================ MULTI/EXEC ============================== */
6886
6887 /* Client state initialization for MULTI/EXEC */
6888 static void initClientMultiState(redisClient *c) {
6889 c->mstate.commands = NULL;
6890 c->mstate.count = 0;
6891 }
6892
6893 /* Release all the resources associated with MULTI/EXEC state */
6894 static void freeClientMultiState(redisClient *c) {
6895 int j;
6896
6897 for (j = 0; j < c->mstate.count; j++) {
6898 int i;
6899 multiCmd *mc = c->mstate.commands+j;
6900
6901 for (i = 0; i < mc->argc; i++)
6902 decrRefCount(mc->argv[i]);
6903 zfree(mc->argv);
6904 }
6905 zfree(c->mstate.commands);
6906 }
6907
6908 /* Add a new command into the MULTI commands queue */
6909 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6910 multiCmd *mc;
6911 int j;
6912
6913 c->mstate.commands = zrealloc(c->mstate.commands,
6914 sizeof(multiCmd)*(c->mstate.count+1));
6915 mc = c->mstate.commands+c->mstate.count;
6916 mc->cmd = cmd;
6917 mc->argc = c->argc;
6918 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6919 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6920 for (j = 0; j < c->argc; j++)
6921 incrRefCount(mc->argv[j]);
6922 c->mstate.count++;
6923 }
6924
6925 static void multiCommand(redisClient *c) {
6926 c->flags |= REDIS_MULTI;
6927 addReply(c,shared.ok);
6928 }
6929
6930 static void discardCommand(redisClient *c) {
6931 if (!(c->flags & REDIS_MULTI)) {
6932 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6933 return;
6934 }
6935
6936 freeClientMultiState(c);
6937 initClientMultiState(c);
6938 c->flags &= (~REDIS_MULTI);
6939 addReply(c,shared.ok);
6940 }
6941
6942 static void execCommand(redisClient *c) {
6943 int j;
6944 robj **orig_argv;
6945 int orig_argc;
6946
6947 if (!(c->flags & REDIS_MULTI)) {
6948 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6949 return;
6950 }
6951
6952 orig_argv = c->argv;
6953 orig_argc = c->argc;
6954 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6955 for (j = 0; j < c->mstate.count; j++) {
6956 c->argc = c->mstate.commands[j].argc;
6957 c->argv = c->mstate.commands[j].argv;
6958 call(c,c->mstate.commands[j].cmd);
6959 }
6960 c->argv = orig_argv;
6961 c->argc = orig_argc;
6962 freeClientMultiState(c);
6963 initClientMultiState(c);
6964 c->flags &= (~REDIS_MULTI);
6965 }
6966
6967 /* =========================== Blocking Operations ========================= */
6968
6969 /* Currently Redis blocking operations support is limited to list POP ops,
6970 * so the current implementation is not fully generic, but it is also not
6971 * completely specific so it will not require a rewrite to support new
6972 * kind of blocking operations in the future.
6973 *
6974 * Still it's important to note that list blocking operations can be already
6975 * used as a notification mechanism in order to implement other blocking
6976 * operations at application level, so there must be a very strong evidence
6977 * of usefulness and generality before new blocking operations are implemented.
6978 *
6979 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
6993 *
6994 * The above comment and the source code should be enough in order to understand
6995 * the implementation and modify / fix it later.
6996 */
6997
6998 /* Set a client in blocking mode for the specified key, with the specified
6999 * timeout */
7000 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7001 dictEntry *de;
7002 list *l;
7003 int j;
7004
7005 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7006 c->blockingkeysnum = numkeys;
7007 c->blockingto = timeout;
7008 for (j = 0; j < numkeys; j++) {
7009 /* Add the key in the client structure, to map clients -> keys */
7010 c->blockingkeys[j] = keys[j];
7011 incrRefCount(keys[j]);
7012
7013 /* And in the other "side", to map keys -> clients */
7014 de = dictFind(c->db->blockingkeys,keys[j]);
7015 if (de == NULL) {
7016 int retval;
7017
7018 /* For every key we take a list of clients blocked for it */
7019 l = listCreate();
7020 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7021 incrRefCount(keys[j]);
7022 assert(retval == DICT_OK);
7023 } else {
7024 l = dictGetEntryVal(de);
7025 }
7026 listAddNodeTail(l,c);
7027 }
7028 /* Mark the client as a blocked client */
7029 c->flags |= REDIS_BLOCKED;
7030 server.blpop_blocked_clients++;
7031 }
7032
7033 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7034 static void unblockClientWaitingData(redisClient *c) {
7035 dictEntry *de;
7036 list *l;
7037 int j;
7038
7039 assert(c->blockingkeys != NULL);
7040 /* The client may wait for multiple keys, so unblock it for every key. */
7041 for (j = 0; j < c->blockingkeysnum; j++) {
7042 /* Remove this client from the list of clients waiting for this key. */
7043 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7044 assert(de != NULL);
7045 l = dictGetEntryVal(de);
7046 listDelNode(l,listSearchKey(l,c));
7047 /* If the list is empty we need to remove it to avoid wasting memory */
7048 if (listLength(l) == 0)
7049 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7050 decrRefCount(c->blockingkeys[j]);
7051 }
7052 /* Cleanup the client structure */
7053 zfree(c->blockingkeys);
7054 c->blockingkeys = NULL;
7055 c->flags &= (~REDIS_BLOCKED);
7056 server.blpop_blocked_clients--;
7057 /* We want to process data if there is some command waiting
7058 * in the input buffer. Note that this is safe even if
7059 * unblockClientWaitingData() gets called from freeClient() because
7060 * freeClient() will be smart enough to call this function
7061 * *after* c->querybuf was set to NULL. */
7062 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7063 }
7064
7065 /* This should be called from any function PUSHing into lists.
7066 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7067 * 'ele' is the element pushed.
7068 *
7069 * If the function returns 0 there was no client waiting for a list push
7070 * against this key.
7071 *
7072 * If the function returns 1 there was a client waiting for a list push
7073 * against this key, the element was passed to this client thus it's not
7074 * needed to actually add it to the list and the caller should return asap. */
7075 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7076 struct dictEntry *de;
7077 redisClient *receiver;
7078 list *l;
7079 listNode *ln;
7080
7081 de = dictFind(c->db->blockingkeys,key);
7082 if (de == NULL) return 0;
7083 l = dictGetEntryVal(de);
7084 ln = listFirst(l);
7085 assert(ln != NULL);
7086 receiver = ln->value;
7087
7088 addReplySds(receiver,sdsnew("*2\r\n"));
7089 addReplyBulk(receiver,key);
7090 addReplyBulk(receiver,ele);
7091 unblockClientWaitingData(receiver);
7092 return 1;
7093 }
7094
7095 /* Blocking RPOP/LPOP */
7096 static void blockingPopGenericCommand(redisClient *c, int where) {
7097 robj *o;
7098 time_t timeout;
7099 int j;
7100
7101 for (j = 1; j < c->argc-1; j++) {
7102 o = lookupKeyWrite(c->db,c->argv[j]);
7103 if (o != NULL) {
7104 if (o->type != REDIS_LIST) {
7105 addReply(c,shared.wrongtypeerr);
7106 return;
7107 } else {
7108 list *list = o->ptr;
7109 if (listLength(list) != 0) {
7110 /* If the list contains elements fall back to the usual
7111 * non-blocking POP operation */
7112 robj *argv[2], **orig_argv;
7113 int orig_argc;
7114
7115 /* We need to alter the command arguments before to call
7116 * popGenericCommand() as the command takes a single key. */
7117 orig_argv = c->argv;
7118 orig_argc = c->argc;
7119 argv[1] = c->argv[j];
7120 c->argv = argv;
7121 c->argc = 2;
7122
7123 /* Also the return value is different, we need to output
7124 * the multi bulk reply header and the key name. The
7125 * "real" command will add the last element (the value)
7126 * for us. If this souds like an hack to you it's just
7127 * because it is... */
7128 addReplySds(c,sdsnew("*2\r\n"));
7129 addReplyBulk(c,argv[1]);
7130 popGenericCommand(c,where);
7131
7132 /* Fix the client structure with the original stuff */
7133 c->argv = orig_argv;
7134 c->argc = orig_argc;
7135 return;
7136 }
7137 }
7138 }
7139 }
7140 /* If the list is empty or the key does not exists we must block */
7141 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7142 if (timeout > 0) timeout += time(NULL);
7143 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7144 }
7145
7146 static void blpopCommand(redisClient *c) {
7147 blockingPopGenericCommand(c,REDIS_HEAD);
7148 }
7149
7150 static void brpopCommand(redisClient *c) {
7151 blockingPopGenericCommand(c,REDIS_TAIL);
7152 }
7153
7154 /* =============================== Replication ============================= */
7155
7156 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7157 ssize_t nwritten, ret = size;
7158 time_t start = time(NULL);
7159
7160 timeout++;
7161 while(size) {
7162 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7163 nwritten = write(fd,ptr,size);
7164 if (nwritten == -1) return -1;
7165 ptr += nwritten;
7166 size -= nwritten;
7167 }
7168 if ((time(NULL)-start) > timeout) {
7169 errno = ETIMEDOUT;
7170 return -1;
7171 }
7172 }
7173 return ret;
7174 }
7175
7176 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7177 ssize_t nread, totread = 0;
7178 time_t start = time(NULL);
7179
7180 timeout++;
7181 while(size) {
7182 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7183 nread = read(fd,ptr,size);
7184 if (nread == -1) return -1;
7185 ptr += nread;
7186 size -= nread;
7187 totread += nread;
7188 }
7189 if ((time(NULL)-start) > timeout) {
7190 errno = ETIMEDOUT;
7191 return -1;
7192 }
7193 }
7194 return totread;
7195 }
7196
/* Read a line from 'fd' one byte at a time into the buffer 'ptr' of 'size'
 * bytes. Reading stops at the first '\n', which is not stored; a '\r'
 * right before it is stripped as well. The buffer is kept null terminated
 * after every stored byte, so at most 'size'-1 bytes of the line are
 * stored: a longer line is truncated at that point.
 *
 * 'timeout' is applied to every single byte read, see syncRead().
 *
 * Returns the number of bytes stored, or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Reserve one byte for the null term */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the byte just stored: without this the loop never
         * stops on a line longer than the buffer and writes past its end. */
        size--;
    }
    return nread;
}
7217
7218 static void syncCommand(redisClient *c) {
7219 /* ignore SYNC if aleady slave or in monitor mode */
7220 if (c->flags & REDIS_SLAVE) return;
7221
7222 /* SYNC can't be issued when the server has pending data to send to
7223 * the client about already issued commands. We need a fresh reply
7224 * buffer registering the differences between the BGSAVE and the current
7225 * dataset, so that we can copy to other slaves if needed. */
7226 if (listLength(c->reply) != 0) {
7227 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7228 return;
7229 }
7230
7231 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7232 /* Here we need to check if there is a background saving operation
7233 * in progress, or if it is required to start one */
7234 if (server.bgsavechildpid != -1) {
7235 /* Ok a background save is in progress. Let's check if it is a good
7236 * one for replication, i.e. if there is another slave that is
7237 * registering differences since the server forked to save */
7238 redisClient *slave;
7239 listNode *ln;
7240 listIter li;
7241
7242 listRewind(server.slaves,&li);
7243 while((ln = listNext(&li))) {
7244 slave = ln->value;
7245 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7246 }
7247 if (ln) {
7248 /* Perfect, the server is already registering differences for
7249 * another slave. Set the right state, and copy the buffer. */
7250 listRelease(c->reply);
7251 c->reply = listDup(slave->reply);
7252 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7253 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7254 } else {
7255 /* No way, we need to wait for the next BGSAVE in order to
7256 * register differences */
7257 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7258 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7259 }
7260 } else {
7261 /* Ok we don't have a BGSAVE in progress, let's start one */
7262 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7263 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7264 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7265 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7266 return;
7267 }
7268 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7269 }
7270 c->repldbfd = -1;
7271 c->flags |= REDIS_SLAVE;
7272 c->slaveseldb = 0;
7273 listAddNodeTail(server.slaves,c);
7274 return;
7275 }
7276
7277 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7278 redisClient *slave = privdata;
7279 REDIS_NOTUSED(el);
7280 REDIS_NOTUSED(mask);
7281 char buf[REDIS_IOBUF_LEN];
7282 ssize_t nwritten, buflen;
7283
7284 if (slave->repldboff == 0) {
7285 /* Write the bulk write count before to transfer the DB. In theory here
7286 * we don't know how much room there is in the output buffer of the
7287 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7288 * operations) will never be smaller than the few bytes we need. */
7289 sds bulkcount;
7290
7291 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7292 slave->repldbsize);
7293 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7294 {
7295 sdsfree(bulkcount);
7296 freeClient(slave);
7297 return;
7298 }
7299 sdsfree(bulkcount);
7300 }
7301 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7302 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7303 if (buflen <= 0) {
7304 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7305 (buflen == 0) ? "premature EOF" : strerror(errno));
7306 freeClient(slave);
7307 return;
7308 }
7309 if ((nwritten = write(fd,buf,buflen)) == -1) {
7310 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7311 strerror(errno));
7312 freeClient(slave);
7313 return;
7314 }
7315 slave->repldboff += nwritten;
7316 if (slave->repldboff == slave->repldbsize) {
7317 close(slave->repldbfd);
7318 slave->repldbfd = -1;
7319 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7320 slave->replstate = REDIS_REPL_ONLINE;
7321 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7322 sendReplyToClient, slave) == AE_ERR) {
7323 freeClient(slave);
7324 return;
7325 }
7326 addReplySds(slave,sdsempty());
7327 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7328 }
7329 }
7330
7331 /* This function is called at the end of every backgrond saving.
7332 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7333 * otherwise REDIS_ERR is passed to the function.
7334 *
7335 * The goal of this function is to handle slaves waiting for a successful
7336 * background saving in order to perform non-blocking synchronization. */
7337 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7338 listNode *ln;
7339 int startbgsave = 0;
7340 listIter li;
7341
7342 listRewind(server.slaves,&li);
7343 while((ln = listNext(&li))) {
7344 redisClient *slave = ln->value;
7345
7346 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7347 startbgsave = 1;
7348 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7349 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7350 struct redis_stat buf;
7351
7352 if (bgsaveerr != REDIS_OK) {
7353 freeClient(slave);
7354 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7355 continue;
7356 }
7357 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7358 redis_fstat(slave->repldbfd,&buf) == -1) {
7359 freeClient(slave);
7360 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7361 continue;
7362 }
7363 slave->repldboff = 0;
7364 slave->repldbsize = buf.st_size;
7365 slave->replstate = REDIS_REPL_SEND_BULK;
7366 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7367 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7368 freeClient(slave);
7369 continue;
7370 }
7371 }
7372 }
7373 if (startbgsave) {
7374 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7375 listIter li;
7376
7377 listRewind(server.slaves,&li);
7378 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7379 while((ln = listNext(&li))) {
7380 redisClient *slave = ln->value;
7381
7382 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7383 freeClient(slave);
7384 }
7385 }
7386 }
7387 }
7388
7389 static int syncWithMaster(void) {
7390 char buf[1024], tmpfile[256], authcmd[1024];
7391 long dumpsize;
7392 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7393 int dfd, maxtries = 5;
7394
7395 if (fd == -1) {
7396 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7397 strerror(errno));
7398 return REDIS_ERR;
7399 }
7400
7401 /* AUTH with the master if required. */
7402 if(server.masterauth) {
7403 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7404 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7405 close(fd);
7406 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7407 strerror(errno));
7408 return REDIS_ERR;
7409 }
7410 /* Read the AUTH result. */
7411 if (syncReadLine(fd,buf,1024,3600) == -1) {
7412 close(fd);
7413 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7414 strerror(errno));
7415 return REDIS_ERR;
7416 }
7417 if (buf[0] != '+') {
7418 close(fd);
7419 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7420 return REDIS_ERR;
7421 }
7422 }
7423
7424 /* Issue the SYNC command */
7425 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7426 close(fd);
7427 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7428 strerror(errno));
7429 return REDIS_ERR;
7430 }
7431 /* Read the bulk write count */
7432 if (syncReadLine(fd,buf,1024,3600) == -1) {
7433 close(fd);
7434 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7435 strerror(errno));
7436 return REDIS_ERR;
7437 }
7438 if (buf[0] != '$') {
7439 close(fd);
7440 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7441 return REDIS_ERR;
7442 }
7443 dumpsize = strtol(buf+1,NULL,10);
7444 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7445 /* Read the bulk write data on a temp file */
7446 while(maxtries--) {
7447 snprintf(tmpfile,256,
7448 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7449 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7450 if (dfd != -1) break;
7451 sleep(1);
7452 }
7453 if (dfd == -1) {
7454 close(fd);
7455 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7456 return REDIS_ERR;
7457 }
7458 while(dumpsize) {
7459 int nread, nwritten;
7460
7461 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7462 if (nread == -1) {
7463 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7464 strerror(errno));
7465 close(fd);
7466 close(dfd);
7467 return REDIS_ERR;
7468 }
7469 nwritten = write(dfd,buf,nread);
7470 if (nwritten == -1) {
7471 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7472 close(fd);
7473 close(dfd);
7474 return REDIS_ERR;
7475 }
7476 dumpsize -= nread;
7477 }
7478 close(dfd);
7479 if (rename(tmpfile,server.dbfilename) == -1) {
7480 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7481 unlink(tmpfile);
7482 close(fd);
7483 return REDIS_ERR;
7484 }
7485 emptyDb();
7486 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7487 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7488 close(fd);
7489 return REDIS_ERR;
7490 }
7491 server.master = createClient(fd);
7492 server.master->flags |= REDIS_MASTER;
7493 server.master->authenticated = 1;
7494 server.replstate = REDIS_REPL_CONNECTED;
7495 return REDIS_OK;
7496 }
7497
7498 static void slaveofCommand(redisClient *c) {
7499 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7500 !strcasecmp(c->argv[2]->ptr,"one")) {
7501 if (server.masterhost) {
7502 sdsfree(server.masterhost);
7503 server.masterhost = NULL;
7504 if (server.master) freeClient(server.master);
7505 server.replstate = REDIS_REPL_NONE;
7506 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7507 }
7508 } else {
7509 sdsfree(server.masterhost);
7510 server.masterhost = sdsdup(c->argv[1]->ptr);
7511 server.masterport = atoi(c->argv[2]->ptr);
7512 if (server.master) freeClient(server.master);
7513 server.replstate = REDIS_REPL_CONNECT;
7514 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7515 server.masterhost, server.masterport);
7516 }
7517 addReply(c,shared.ok);
7518 }
7519
7520 /* ============================ Maxmemory directive ======================== */
7521
7522 /* Try to free one object form the pre-allocated objects free list.
7523 * This is useful under low mem conditions as by default we take 1 million
7524 * free objects allocated. On success REDIS_OK is returned, otherwise
7525 * REDIS_ERR. */
7526 static int tryFreeOneObjectFromFreelist(void) {
7527 robj *o;
7528
7529 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7530 if (listLength(server.objfreelist)) {
7531 listNode *head = listFirst(server.objfreelist);
7532 o = listNodeValue(head);
7533 listDelNode(server.objfreelist,head);
7534 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7535 zfree(o);
7536 return REDIS_OK;
7537 } else {
7538 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7539 return REDIS_ERR;
7540 }
7541 }
7542
7543 /* This function gets called when 'maxmemory' is set on the config file to limit
7544 * the max memory used by the server, and we are out of memory.
7545 * This function will try to, in order:
7546 *
7547 * - Free objects from the free list
7548 * - Try to remove keys with an EXPIRE set
7549 *
7550 * It is not possible to free enough memory to reach used-memory < maxmemory
7551 * the server will start refusing commands that will enlarge even more the
7552 * memory usage.
7553 */
7554 static void freeMemoryIfNeeded(void) {
7555 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7556 int j, k, freed = 0;
7557
7558 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7559 for (j = 0; j < server.dbnum; j++) {
7560 int minttl = -1;
7561 robj *minkey = NULL;
7562 struct dictEntry *de;
7563
7564 if (dictSize(server.db[j].expires)) {
7565 freed = 1;
7566 /* From a sample of three keys drop the one nearest to
7567 * the natural expire */
7568 for (k = 0; k < 3; k++) {
7569 time_t t;
7570
7571 de = dictGetRandomKey(server.db[j].expires);
7572 t = (time_t) dictGetEntryVal(de);
7573 if (minttl == -1 || t < minttl) {
7574 minkey = dictGetEntryKey(de);
7575 minttl = t;
7576 }
7577 }
7578 deleteKey(server.db+j,minkey);
7579 }
7580 }
7581 if (!freed) return; /* nothing to free... */
7582 }
7583 }
7584
7585 /* ============================== Append Only file ========================== */
7586
7587 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7588 sds buf = sdsempty();
7589 int j;
7590 ssize_t nwritten;
7591 time_t now;
7592 robj *tmpargv[3];
7593
7594 /* The DB this command was targetting is not the same as the last command
7595 * we appendend. To issue a SELECT command is needed. */
7596 if (dictid != server.appendseldb) {
7597 char seldb[64];
7598
7599 snprintf(seldb,sizeof(seldb),"%d",dictid);
7600 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7601 (unsigned long)strlen(seldb),seldb);
7602 server.appendseldb = dictid;
7603 }
7604
7605 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7606 * EXPIREs into EXPIREATs calls */
7607 if (cmd->proc == expireCommand) {
7608 long when;
7609
7610 tmpargv[0] = createStringObject("EXPIREAT",8);
7611 tmpargv[1] = argv[1];
7612 incrRefCount(argv[1]);
7613 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7614 tmpargv[2] = createObject(REDIS_STRING,
7615 sdscatprintf(sdsempty(),"%ld",when));
7616 argv = tmpargv;
7617 }
7618
7619 /* Append the actual command */
7620 buf = sdscatprintf(buf,"*%d\r\n",argc);
7621 for (j = 0; j < argc; j++) {
7622 robj *o = argv[j];
7623
7624 o = getDecodedObject(o);
7625 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7626 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7627 buf = sdscatlen(buf,"\r\n",2);
7628 decrRefCount(o);
7629 }
7630
7631 /* Free the objects from the modified argv for EXPIREAT */
7632 if (cmd->proc == expireCommand) {
7633 for (j = 0; j < 3; j++)
7634 decrRefCount(argv[j]);
7635 }
7636
7637 /* We want to perform a single write. This should be guaranteed atomic
7638 * at least if the filesystem we are writing is a real physical one.
7639 * While this will save us against the server being killed I don't think
7640 * there is much to do about the whole server stopping for power problems
7641 * or alike */
7642 nwritten = write(server.appendfd,buf,sdslen(buf));
7643 if (nwritten != (signed)sdslen(buf)) {
7644 /* Ooops, we are in troubles. The best thing to do for now is
7645 * to simply exit instead to give the illusion that everything is
7646 * working as expected. */
7647 if (nwritten == -1) {
7648 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7649 } else {
7650 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7651 }
7652 exit(1);
7653 }
7654 /* If a background append only file rewriting is in progress we want to
7655 * accumulate the differences between the child DB and the current one
7656 * in a buffer, so that when the child process will do its work we
7657 * can append the differences to the new append only file. */
7658 if (server.bgrewritechildpid != -1)
7659 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7660
7661 sdsfree(buf);
7662 now = time(NULL);
7663 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7664 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7665 now-server.lastfsync > 1))
7666 {
7667 fsync(server.appendfd); /* Let's try to get this data on the disk */
7668 server.lastfsync = now;
7669 }
7670 }
7671
7672 /* In Redis commands are always executed in the context of a client, so in
7673 * order to load the append only file we need to create a fake client. */
7674 static struct redisClient *createFakeClient(void) {
7675 struct redisClient *c = zmalloc(sizeof(*c));
7676
7677 selectDb(c,0);
7678 c->fd = -1;
7679 c->querybuf = sdsempty();
7680 c->argc = 0;
7681 c->argv = NULL;
7682 c->flags = 0;
7683 /* We set the fake client as a slave waiting for the synchronization
7684 * so that Redis will not try to send replies to this client. */
7685 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7686 c->reply = listCreate();
7687 listSetFreeMethod(c->reply,decrRefCount);
7688 listSetDupMethod(c->reply,dupClientReplyValue);
7689 return c;
7690 }
7691
7692 static void freeFakeClient(struct redisClient *c) {
7693 sdsfree(c->querybuf);
7694 listRelease(c->reply);
7695 zfree(c);
7696 }
7697
7698 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7699 * error (the append only file is zero-length) REDIS_ERR is returned. On
7700 * fatal error an error message is logged and the program exists. */
7701 int loadAppendOnlyFile(char *filename) {
7702 struct redisClient *fakeClient;
7703 FILE *fp = fopen(filename,"r");
7704 struct redis_stat sb;
7705 unsigned long long loadedkeys = 0;
7706
7707 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7708 return REDIS_ERR;
7709
7710 if (fp == NULL) {
7711 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7712 exit(1);
7713 }
7714
7715 fakeClient = createFakeClient();
7716 while(1) {
7717 int argc, j;
7718 unsigned long len;
7719 robj **argv;
7720 char buf[128];
7721 sds argsds;
7722 struct redisCommand *cmd;
7723
7724 if (fgets(buf,sizeof(buf),fp) == NULL) {
7725 if (feof(fp))
7726 break;
7727 else
7728 goto readerr;
7729 }
7730 if (buf[0] != '*') goto fmterr;
7731 argc = atoi(buf+1);
7732 argv = zmalloc(sizeof(robj*)*argc);
7733 for (j = 0; j < argc; j++) {
7734 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7735 if (buf[0] != '$') goto fmterr;
7736 len = strtol(buf+1,NULL,10);
7737 argsds = sdsnewlen(NULL,len);
7738 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7739 argv[j] = createObject(REDIS_STRING,argsds);
7740 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7741 }
7742
7743 /* Command lookup */
7744 cmd = lookupCommand(argv[0]->ptr);
7745 if (!cmd) {
7746 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7747 exit(1);
7748 }
7749 /* Try object encoding */
7750 if (cmd->flags & REDIS_CMD_BULK)
7751 tryObjectEncoding(argv[argc-1]);
7752 /* Run the command in the context of a fake client */
7753 fakeClient->argc = argc;
7754 fakeClient->argv = argv;
7755 cmd->proc(fakeClient);
7756 /* Discard the reply objects list from the fake client */
7757 while(listLength(fakeClient->reply))
7758 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7759 /* Clean up, ready for the next command */
7760 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7761 zfree(argv);
7762 /* Handle swapping while loading big datasets when VM is on */
7763 loadedkeys++;
7764 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7765 while (zmalloc_used_memory() > server.vm_max_memory) {
7766 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7767 }
7768 }
7769 }
7770 fclose(fp);
7771 freeFakeClient(fakeClient);
7772 return REDIS_OK;
7773
7774 readerr:
7775 if (feof(fp)) {
7776 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7777 } else {
7778 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7779 }
7780 exit(1);
7781 fmterr:
7782 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7783 exit(1);
7784 }
7785
7786 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7787 static int fwriteBulkObject(FILE *fp, robj *obj) {
7788 char buf[128];
7789 int decrrc = 0;
7790
7791 /* Avoid the incr/decr ref count business if possible to help
7792 * copy-on-write (we are often in a child process when this function
7793 * is called).
7794 * Also makes sure that key objects don't get incrRefCount-ed when VM
7795 * is enabled */
7796 if (obj->encoding != REDIS_ENCODING_RAW) {
7797 obj = getDecodedObject(obj);
7798 decrrc = 1;
7799 }
7800 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7801 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7802 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7803 goto err;
7804 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7805 if (decrrc) decrRefCount(obj);
7806 return 1;
7807 err:
7808 if (decrrc) decrRefCount(obj);
7809 return 0;
7810 }
7811
/* Write the binary-safe string 's', 'len' bytes long, into 'fp' using the
 * bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it must be formatted with %lu */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns 0 for a zero-length item: skip empty payloads */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7823
/* Emit the double 'd', formatted with "%.17g", on 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload is rendered together with its trailing CRLF, that must
     * not be counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,strlen(payload),1,fp) != 0;
}
7834
/* Emit the long 'l', in its decimal representation, on 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload is rendered together with its trailing CRLF, that must
     * not be counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,strlen(payload),1,fp) != 0;
}
7845
7846 /* Write a sequence of commands able to fully rebuild the dataset into
7847 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7848 static int rewriteAppendOnlyFile(char *filename) {
7849 dictIterator *di = NULL;
7850 dictEntry *de;
7851 FILE *fp;
7852 char tmpfile[256];
7853 int j;
7854 time_t now = time(NULL);
7855
7856 /* Note that we have to use a different temp name here compared to the
7857 * one used by rewriteAppendOnlyFileBackground() function. */
7858 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7859 fp = fopen(tmpfile,"w");
7860 if (!fp) {
7861 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7862 return REDIS_ERR;
7863 }
7864 for (j = 0; j < server.dbnum; j++) {
7865 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7866 redisDb *db = server.db+j;
7867 dict *d = db->dict;
7868 if (dictSize(d) == 0) continue;
7869 di = dictGetIterator(d);
7870 if (!di) {
7871 fclose(fp);
7872 return REDIS_ERR;
7873 }
7874
7875 /* SELECT the new DB */
7876 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7877 if (fwriteBulkLong(fp,j) == 0) goto werr;
7878
7879 /* Iterate this DB writing every entry */
7880 while((de = dictNext(di)) != NULL) {
7881 robj *key, *o;
7882 time_t expiretime;
7883 int swapped;
7884
7885 key = dictGetEntryKey(de);
7886 /* If the value for this key is swapped, load a preview in memory.
7887 * We use a "swapped" flag to remember if we need to free the
7888 * value object instead to just increment the ref count anyway
7889 * in order to avoid copy-on-write of pages if we are forked() */
7890 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7891 key->storage == REDIS_VM_SWAPPING) {
7892 o = dictGetEntryVal(de);
7893 swapped = 0;
7894 } else {
7895 o = vmPreviewObject(key);
7896 swapped = 1;
7897 }
7898 expiretime = getExpire(db,key);
7899
7900 /* Save the key and associated value */
7901 if (o->type == REDIS_STRING) {
7902 /* Emit a SET command */
7903 char cmd[]="*3\r\n$3\r\nSET\r\n";
7904 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7905 /* Key and value */
7906 if (fwriteBulkObject(fp,key) == 0) goto werr;
7907 if (fwriteBulkObject(fp,o) == 0) goto werr;
7908 } else if (o->type == REDIS_LIST) {
7909 /* Emit the RPUSHes needed to rebuild the list */
7910 list *list = o->ptr;
7911 listNode *ln;
7912 listIter li;
7913
7914 listRewind(list,&li);
7915 while((ln = listNext(&li))) {
7916 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7917 robj *eleobj = listNodeValue(ln);
7918
7919 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7920 if (fwriteBulkObject(fp,key) == 0) goto werr;
7921 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7922 }
7923 } else if (o->type == REDIS_SET) {
7924 /* Emit the SADDs needed to rebuild the set */
7925 dict *set = o->ptr;
7926 dictIterator *di = dictGetIterator(set);
7927 dictEntry *de;
7928
7929 while((de = dictNext(di)) != NULL) {
7930 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7931 robj *eleobj = dictGetEntryKey(de);
7932
7933 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7934 if (fwriteBulkObject(fp,key) == 0) goto werr;
7935 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7936 }
7937 dictReleaseIterator(di);
7938 } else if (o->type == REDIS_ZSET) {
7939 /* Emit the ZADDs needed to rebuild the sorted set */
7940 zset *zs = o->ptr;
7941 dictIterator *di = dictGetIterator(zs->dict);
7942 dictEntry *de;
7943
7944 while((de = dictNext(di)) != NULL) {
7945 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7946 robj *eleobj = dictGetEntryKey(de);
7947 double *score = dictGetEntryVal(de);
7948
7949 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7950 if (fwriteBulkObject(fp,key) == 0) goto werr;
7951 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7952 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7953 }
7954 dictReleaseIterator(di);
7955 } else if (o->type == REDIS_HASH) {
7956 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7957
7958 /* Emit the HSETs needed to rebuild the hash */
7959 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7960 unsigned char *p = zipmapRewind(o->ptr);
7961 unsigned char *field, *val;
7962 unsigned int flen, vlen;
7963
7964 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7965 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7966 if (fwriteBulkObject(fp,key) == 0) goto werr;
7967 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7968 return -1;
7969 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7970 return -1;
7971 }
7972 } else {
7973 dictIterator *di = dictGetIterator(o->ptr);
7974 dictEntry *de;
7975
7976 while((de = dictNext(di)) != NULL) {
7977 robj *field = dictGetEntryKey(de);
7978 robj *val = dictGetEntryVal(de);
7979
7980 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7981 if (fwriteBulkObject(fp,key) == 0) goto werr;
7982 if (fwriteBulkObject(fp,field) == -1) return -1;
7983 if (fwriteBulkObject(fp,val) == -1) return -1;
7984 }
7985 dictReleaseIterator(di);
7986 }
7987 } else {
7988 redisAssert(0);
7989 }
7990 /* Save the expire time */
7991 if (expiretime != -1) {
7992 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7993 /* If this key is already expired skip it */
7994 if (expiretime < now) continue;
7995 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7996 if (fwriteBulkObject(fp,key) == 0) goto werr;
7997 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7998 }
7999 if (swapped) decrRefCount(o);
8000 }
8001 dictReleaseIterator(di);
8002 }
8003
8004 /* Make sure data will not remain on the OS's output buffers */
8005 fflush(fp);
8006 fsync(fileno(fp));
8007 fclose(fp);
8008
8009 /* Use RENAME to make sure the DB file is changed atomically only
8010 * if the generate DB file is ok. */
8011 if (rename(tmpfile,filename) == -1) {
8012 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8013 unlink(tmpfile);
8014 return REDIS_ERR;
8015 }
8016 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8017 return REDIS_OK;
8018
8019 werr:
8020 fclose(fp);
8021 unlink(tmpfile);
8022 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8023 if (di) dictReleaseIterator(di);
8024 return REDIS_ERR;
8025 }
8026
8027 /* This is how rewriting of the append only file in background works:
8028 *
8029 * 1) The user calls BGREWRITEAOF
8030 * 2) Redis calls this function, that forks():
8031 * 2a) the child rewrite the append only file in a temp file.
8032 * 2b) the parent accumulates differences in server.bgrewritebuf.
8033 * 3) When the child finished '2a' exists.
8034 * 4) The parent will trap the exit code, if it's OK, will append the
8035 * data accumulated into server.bgrewritebuf into the temp file, and
8036 * finally will rename(2) the temp file in the actual file name.
8037 * The the new file is reopened as the new append only file. Profit!
8038 */
8039 static int rewriteAppendOnlyFileBackground(void) {
8040 pid_t childpid;
8041
8042 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8043 if (server.vm_enabled) waitEmptyIOJobsQueue();
8044 if ((childpid = fork()) == 0) {
8045 /* Child */
8046 char tmpfile[256];
8047
8048 if (server.vm_enabled) vmReopenSwapFile();
8049 close(server.fd);
8050 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8051 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8052 _exit(0);
8053 } else {
8054 _exit(1);
8055 }
8056 } else {
8057 /* Parent */
8058 if (childpid == -1) {
8059 redisLog(REDIS_WARNING,
8060 "Can't rewrite append only file in background: fork: %s",
8061 strerror(errno));
8062 return REDIS_ERR;
8063 }
8064 redisLog(REDIS_NOTICE,
8065 "Background append only file rewriting started by pid %d",childpid);
8066 server.bgrewritechildpid = childpid;
8067 updateDictResizePolicy();
8068 /* We set appendseldb to -1 in order to force the next call to the
8069 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8070 * accumulated by the parent into server.bgrewritebuf will start
8071 * with a SELECT statement and it will be safe to merge. */
8072 server.appendseldb = -1;
8073 return REDIS_OK;
8074 }
8075 return REDIS_OK; /* unreached */
8076 }
8077
8078 static void bgrewriteaofCommand(redisClient *c) {
8079 if (server.bgrewritechildpid != -1) {
8080 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8081 return;
8082 }
8083 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8084 char *status = "+Background append only file rewriting started\r\n";
8085 addReplySds(c,sdsnew(status));
8086 } else {
8087 addReply(c,shared.err);
8088 }
8089 }
8090
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" associated with the
 * process id 'childpid'. The result of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
8097
8098 /* Virtual Memory is composed mainly of two subsystems:
8099 * - Blocking Virtual Memory
8100 * - Threaded Virtual Memory I/O
8101 * The two parts are not fully decoupled, but functions are split among two
8102 * different sections of the source code (delimited by comments) in order to
8103 * make more clear what functionality is about the blocking VM and what about
8104 * the threaded (not blocking) VM.
8105 *
8106 * Redis VM design:
8107 *
8108 * Redis VM is a blocking VM (one that blocks reading swapped values from
8109 * disk into memory when a value swapped out is needed in memory) that is made
8110 * unblocking by trying to examine the command argument vector in order to
8111 * load in background values that will likely be needed in order to exec
8112 * the command. The command is executed only once all the relevant keys
8113 * are loaded into memory.
8114 *
8115 * This basically is almost as simple as a blocking VM, but almost as parallel
8116 * as a fully non-blocking VM.
8117 */
8118
8119 /* =================== Virtual Memory - Blocking Side ====================== */
8120
8121 /* substitute the first occurrence of '%p' with the process pid in the
8122 * swap file name. */
8123 static void expandVmSwapFilename(void) {
8124 char *p = strstr(server.vm_swap_file,"%p");
8125 sds new;
8126
8127 if (!p) return;
8128 new = sdsempty();
8129 *p = '\0';
8130 new = sdscat(new,server.vm_swap_file);
8131 new = sdscatprintf(new,"%ld",(long) getpid());
8132 new = sdscat(new,p+2);
8133 zfree(server.vm_swap_file);
8134 server.vm_swap_file = new;
8135 }
8136
8137 static void vmInit(void) {
8138 off_t totsize;
8139 int pipefds[2];
8140 size_t stacksize;
8141
8142 if (server.vm_max_threads != 0)
8143 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8144
8145 expandVmSwapFilename();
8146 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8147 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8148 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8149 }
8150 if (server.vm_fp == NULL) {
8151 redisLog(REDIS_WARNING,
8152 "Impossible to open the swap file: %s. Exiting.",
8153 strerror(errno));
8154 exit(1);
8155 }
8156 server.vm_fd = fileno(server.vm_fp);
8157 server.vm_next_page = 0;
8158 server.vm_near_pages = 0;
8159 server.vm_stats_used_pages = 0;
8160 server.vm_stats_swapped_objects = 0;
8161 server.vm_stats_swapouts = 0;
8162 server.vm_stats_swapins = 0;
8163 totsize = server.vm_pages*server.vm_page_size;
8164 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8165 if (ftruncate(server.vm_fd,totsize) == -1) {
8166 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8167 strerror(errno));
8168 exit(1);
8169 } else {
8170 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8171 }
8172 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8173 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8174 (long long) (server.vm_pages+7)/8, server.vm_pages);
8175 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8176
8177 /* Initialize threaded I/O (used by Virtual Memory) */
8178 server.io_newjobs = listCreate();
8179 server.io_processing = listCreate();
8180 server.io_processed = listCreate();
8181 server.io_ready_clients = listCreate();
8182 pthread_mutex_init(&server.io_mutex,NULL);
8183 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8184 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8185 server.io_active_threads = 0;
8186 if (pipe(pipefds) == -1) {
8187 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8188 ,strerror(errno));
8189 exit(1);
8190 }
8191 server.io_ready_pipe_read = pipefds[0];
8192 server.io_ready_pipe_write = pipefds[1];
8193 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8194 /* LZF requires a lot of stack */
8195 pthread_attr_init(&server.io_threads_attr);
8196 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8197 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8198 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8199 /* Listen for events in the threaded I/O pipe */
8200 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8201 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8202 oom("creating file event");
8203 }
8204
8205 /* Mark the page as used */
8206 static void vmMarkPageUsed(off_t page) {
8207 off_t byte = page/8;
8208 int bit = page&7;
8209 redisAssert(vmFreePage(page) == 1);
8210 server.vm_bitmap[byte] |= 1<<bit;
8211 }
8212
8213 /* Mark N contiguous pages as used, with 'page' being the first. */
8214 static void vmMarkPagesUsed(off_t page, off_t count) {
8215 off_t j;
8216
8217 for (j = 0; j < count; j++)
8218 vmMarkPageUsed(page+j);
8219 server.vm_stats_used_pages += count;
8220 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8221 (long long)count, (long long)page);
8222 }
8223
8224 /* Mark the page as free */
8225 static void vmMarkPageFree(off_t page) {
8226 off_t byte = page/8;
8227 int bit = page&7;
8228 redisAssert(vmFreePage(page) == 0);
8229 server.vm_bitmap[byte] &= ~(1<<bit);
8230 }
8231
8232 /* Mark N contiguous pages as free, with 'page' being the first. */
8233 static void vmMarkPagesFree(off_t page, off_t count) {
8234 off_t j;
8235
8236 for (j = 0; j < count; j++)
8237 vmMarkPageFree(page+j);
8238 server.vm_stats_used_pages -= count;
8239 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8240 (long long)count, (long long)page);
8241 }
8242
8243 /* Test if the page is free */
8244 static int vmFreePage(off_t page) {
8245 off_t byte = page/8;
8246 int bit = page&7;
8247 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8248 }
8249
8250 /* Find N contiguous free pages storing the first page of the cluster in *first.
8251 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8252 * REDIS_ERR is returned.
8253 *
8254 * This function uses a simple algorithm: we try to allocate
8255 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8256 * again from the start of the swap file searching for free spaces.
8257 *
8258 * If it looks pretty clear that there are no free pages near our offset
8259 * we try to find less populated places doing a forward jump of
8260 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8261 * without hurry, and then we jump again and so forth...
8262 *
8263 * This function can be improved using a free list to avoid to guess
8264 * too much, since we could collect data about freed pages.
8265 *
8266 * note: I implemented this function just after watching an episode of
8267 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8268 */
8269 static int vmFindContiguousPages(off_t *first, off_t n) {
8270 off_t base, offset = 0, since_jump = 0, numfree = 0;
8271
8272 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8273 server.vm_near_pages = 0;
8274 server.vm_next_page = 0;
8275 }
8276 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8277 base = server.vm_next_page;
8278
8279 while(offset < server.vm_pages) {
8280 off_t this = base+offset;
8281
8282 /* If we overflow, restart from page zero */
8283 if (this >= server.vm_pages) {
8284 this -= server.vm_pages;
8285 if (this == 0) {
8286 /* Just overflowed, what we found on tail is no longer
8287 * interesting, as it's no longer contiguous. */
8288 numfree = 0;
8289 }
8290 }
8291 if (vmFreePage(this)) {
8292 /* This is a free page */
8293 numfree++;
8294 /* Already got N free pages? Return to the caller, with success */
8295 if (numfree == n) {
8296 *first = this-(n-1);
8297 server.vm_next_page = this+1;
8298 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8299 return REDIS_OK;
8300 }
8301 } else {
8302 /* The current one is not a free page */
8303 numfree = 0;
8304 }
8305
8306 /* Fast-forward if the current page is not free and we already
8307 * searched enough near this place. */
8308 since_jump++;
8309 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8310 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8311 since_jump = 0;
8312 /* Note that even if we rewind after the jump, we are don't need
8313 * to make sure numfree is set to zero as we only jump *if* it
8314 * is set to zero. */
8315 } else {
8316 /* Otherwise just check the next page */
8317 offset++;
8318 }
8319 }
8320 return REDIS_ERR;
8321 }
8322
8323 /* Write the specified object at the specified page of the swap file */
8324 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8325 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8326 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8327 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8328 redisLog(REDIS_WARNING,
8329 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8330 strerror(errno));
8331 return REDIS_ERR;
8332 }
8333 rdbSaveObject(server.vm_fp,o);
8334 fflush(server.vm_fp);
8335 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8336 return REDIS_OK;
8337 }
8338
8339 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8340 * needed to later retrieve the object into the key object.
8341 * If we can't find enough contiguous empty pages to swap the object on disk
8342 * REDIS_ERR is returned. */
8343 static int vmSwapObjectBlocking(robj *key, robj *val) {
8344 off_t pages = rdbSavedObjectPages(val,NULL);
8345 off_t page;
8346
8347 assert(key->storage == REDIS_VM_MEMORY);
8348 assert(key->refcount == 1);
8349 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8350 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8351 key->vm.page = page;
8352 key->vm.usedpages = pages;
8353 key->storage = REDIS_VM_SWAPPED;
8354 key->vtype = val->type;
8355 decrRefCount(val); /* Deallocate the object from memory. */
8356 vmMarkPagesUsed(page,pages);
8357 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8358 (unsigned char*) key->ptr,
8359 (unsigned long long) page, (unsigned long long) pages);
8360 server.vm_stats_swapped_objects++;
8361 server.vm_stats_swapouts++;
8362 return REDIS_OK;
8363 }
8364
8365 static robj *vmReadObjectFromSwap(off_t page, int type) {
8366 robj *o;
8367
8368 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8369 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8370 redisLog(REDIS_WARNING,
8371 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8372 strerror(errno));
8373 _exit(1);
8374 }
8375 o = rdbLoadObject(type,server.vm_fp);
8376 if (o == NULL) {
8377 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8378 _exit(1);
8379 }
8380 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8381 return o;
8382 }
8383
8384 /* Load the value object relative to the 'key' object from swap to memory.
8385 * The newly allocated object is returned.
8386 *
8387 * If preview is true the unserialized object is returned to the caller but
8388 * no changes are made to the key object, nor the pages are marked as freed */
8389 static robj *vmGenericLoadObject(robj *key, int preview) {
8390 robj *val;
8391
8392 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8393 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8394 if (!preview) {
8395 key->storage = REDIS_VM_MEMORY;
8396 key->vm.atime = server.unixtime;
8397 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8398 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8399 (unsigned char*) key->ptr);
8400 server.vm_stats_swapped_objects--;
8401 } else {
8402 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8403 (unsigned char*) key->ptr);
8404 }
8405 server.vm_stats_swapins++;
8406 return val;
8407 }
8408
8409 /* Plain object loading, from swap to memory */
8410 static robj *vmLoadObject(robj *key) {
8411 /* If we are loading the object in background, stop it, we
8412 * need to load this object synchronously ASAP. */
8413 if (key->storage == REDIS_VM_LOADING)
8414 vmCancelThreadedIOJob(key);
8415 return vmGenericLoadObject(key,0);
8416 }
8417
8418 /* Just load the value on disk, without to modify the key.
8419 * This is useful when we want to perform some operation on the value
8420 * without to really bring it from swap to memory, like while saving the
8421 * dataset or rewriting the append only log. */
8422 static robj *vmPreviewObject(robj *key) {
8423 return vmGenericLoadObject(key,1);
8424 }
8425
8426 /* How a good candidate is this object for swapping?
8427 * The better candidate it is, the greater the returned value.
8428 *
8429 * Currently we try to perform a fast estimation of the object size in
8430 * memory, and combine it with aging informations.
8431 *
8432 * Basically swappability = idle-time * log(estimated size)
8433 *
8434 * Bigger objects are preferred over smaller objects, but not
8435 * proportionally, this is why we use the logarithm. This algorithm is
8436 * just a first try and will probably be tuned later. */
8437 static double computeObjectSwappability(robj *o) {
8438 time_t age = server.unixtime - o->vm.atime;
8439 long asize = 0;
8440 list *l;
8441 dict *d;
8442 struct dictEntry *de;
8443 int z;
8444
8445 if (age <= 0) return 0;
8446 switch(o->type) {
8447 case REDIS_STRING:
8448 if (o->encoding != REDIS_ENCODING_RAW) {
8449 asize = sizeof(*o);
8450 } else {
8451 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8452 }
8453 break;
8454 case REDIS_LIST:
8455 l = o->ptr;
8456 listNode *ln = listFirst(l);
8457
8458 asize = sizeof(list);
8459 if (ln) {
8460 robj *ele = ln->value;
8461 long elesize;
8462
8463 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8464 (sizeof(*o)+sdslen(ele->ptr)) :
8465 sizeof(*o);
8466 asize += (sizeof(listNode)+elesize)*listLength(l);
8467 }
8468 break;
8469 case REDIS_SET:
8470 case REDIS_ZSET:
8471 z = (o->type == REDIS_ZSET);
8472 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8473
8474 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8475 if (z) asize += sizeof(zset)-sizeof(dict);
8476 if (dictSize(d)) {
8477 long elesize;
8478 robj *ele;
8479
8480 de = dictGetRandomKey(d);
8481 ele = dictGetEntryKey(de);
8482 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8483 (sizeof(*o)+sdslen(ele->ptr)) :
8484 sizeof(*o);
8485 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8486 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8487 }
8488 break;
8489 case REDIS_HASH:
8490 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8491 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8492 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8493 unsigned int klen, vlen;
8494 unsigned char *key, *val;
8495
8496 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8497 klen = 0;
8498 vlen = 0;
8499 }
8500 asize = len*(klen+vlen+3);
8501 } else if (o->encoding == REDIS_ENCODING_HT) {
8502 d = o->ptr;
8503 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8504 if (dictSize(d)) {
8505 long elesize;
8506 robj *ele;
8507
8508 de = dictGetRandomKey(d);
8509 ele = dictGetEntryKey(de);
8510 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8511 (sizeof(*o)+sdslen(ele->ptr)) :
8512 sizeof(*o);
8513 ele = dictGetEntryVal(de);
8514 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8515 (sizeof(*o)+sdslen(ele->ptr)) :
8516 sizeof(*o);
8517 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8518 }
8519 }
8520 break;
8521 }
8522 return (double)age*log(1+asize);
8523 }
8524
8525 /* Try to swap an object that's a good candidate for swapping.
8526 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8527 * to swap any object at all.
8528 *
8529 * If 'usethreaded' is true, Redis will try to swap the object in background
8530 * using I/O threads. */
8531 static int vmSwapOneObject(int usethreads) {
8532 int j, i;
8533 struct dictEntry *best = NULL;
8534 double best_swappability = 0;
8535 redisDb *best_db = NULL;
8536 robj *key, *val;
8537
8538 for (j = 0; j < server.dbnum; j++) {
8539 redisDb *db = server.db+j;
8540 /* Why maxtries is set to 100?
8541 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8542 * are swappable objects */
8543 int maxtries = 100;
8544
8545 if (dictSize(db->dict) == 0) continue;
8546 for (i = 0; i < 5; i++) {
8547 dictEntry *de;
8548 double swappability;
8549
8550 if (maxtries) maxtries--;
8551 de = dictGetRandomKey(db->dict);
8552 key = dictGetEntryKey(de);
8553 val = dictGetEntryVal(de);
8554 /* Only swap objects that are currently in memory.
8555 *
8556 * Also don't swap shared objects if threaded VM is on, as we
8557 * try to ensure that the main thread does not touch the
8558 * object while the I/O thread is using it, but we can't
8559 * control other keys without adding additional mutex. */
8560 if (key->storage != REDIS_VM_MEMORY ||
8561 (server.vm_max_threads != 0 && val->refcount != 1)) {
8562 if (maxtries) i--; /* don't count this try */
8563 continue;
8564 }
8565 swappability = computeObjectSwappability(val);
8566 if (!best || swappability > best_swappability) {
8567 best = de;
8568 best_swappability = swappability;
8569 best_db = db;
8570 }
8571 }
8572 }
8573 if (best == NULL) return REDIS_ERR;
8574 key = dictGetEntryKey(best);
8575 val = dictGetEntryVal(best);
8576
8577 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8578 key->ptr, best_swappability);
8579
8580 /* Unshare the key if needed */
8581 if (key->refcount > 1) {
8582 robj *newkey = dupStringObject(key);
8583 decrRefCount(key);
8584 key = dictGetEntryKey(best) = newkey;
8585 }
8586 /* Swap it */
8587 if (usethreads) {
8588 vmSwapObjectThreaded(key,val,best_db);
8589 return REDIS_OK;
8590 } else {
8591 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8592 dictGetEntryVal(best) = NULL;
8593 return REDIS_OK;
8594 } else {
8595 return REDIS_ERR;
8596 }
8597 }
8598 }
8599
/* Swap out one object using blocking I/O: thin wrapper that returns the
 * result of vmSwapOneObject(0). Declared with (void) so that the definition
 * is a real prototype and calls with arguments are rejected. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8603
/* Swap out one object using the I/O threads: thin wrapper that returns the
 * result of vmSwapOneObject(1). Declared with (void) so that the definition
 * is a real prototype and calls with arguments are rejected. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8607
8608 /* Return true if it's safe to swap out objects in a given moment.
8609 * Basically we don't want to swap objects out while there is a BGSAVE
8610 * or a BGAEOREWRITE running in backgroud. */
8611 static int vmCanSwapOut(void) {
8612 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8613 }
8614
8615 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8616 * and was deleted. Otherwise 0 is returned. */
8617 static int deleteIfSwapped(redisDb *db, robj *key) {
8618 dictEntry *de;
8619 robj *foundkey;
8620
8621 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8622 foundkey = dictGetEntryKey(de);
8623 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8624 deleteKey(db,key);
8625 return 1;
8626 }
8627
8628 /* =================== Virtual Memory - Threaded I/O ======================= */
8629
8630 static void freeIOJob(iojob *j) {
8631 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8632 j->type == REDIS_IOJOB_DO_SWAP ||
8633 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8634 decrRefCount(j->val);
8635 /* We don't decrRefCount the j->key field as we did't incremented
8636 * the count creating IO Jobs. This is because the key field here is
8637 * just used as an indentifier and if a key is removed the Job should
8638 * never be touched again. */
8639 zfree(j);
8640 }
8641
8642 /* Every time a thread finished a Job, it writes a byte into the write side
8643 * of an unix pipe in order to "awake" the main thread, and this function
8644 * is called. */
8645 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8646 int mask)
8647 {
8648 char buf[1];
8649 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8650 REDIS_NOTUSED(el);
8651 REDIS_NOTUSED(mask);
8652 REDIS_NOTUSED(privdata);
8653
8654 /* For every byte we read in the read side of the pipe, there is one
8655 * I/O job completed to process. */
8656 while((retval = read(fd,buf,1)) == 1) {
8657 iojob *j;
8658 listNode *ln;
8659 robj *key;
8660 struct dictEntry *de;
8661
8662 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8663
8664 /* Get the processed element (the oldest one) */
8665 lockThreadedIO();
8666 assert(listLength(server.io_processed) != 0);
8667 if (toprocess == -1) {
8668 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8669 if (toprocess <= 0) toprocess = 1;
8670 }
8671 ln = listFirst(server.io_processed);
8672 j = ln->value;
8673 listDelNode(server.io_processed,ln);
8674 unlockThreadedIO();
8675 /* If this job is marked as canceled, just ignore it */
8676 if (j->canceled) {
8677 freeIOJob(j);
8678 continue;
8679 }
8680 /* Post process it in the main thread, as there are things we
8681 * can do just here to avoid race conditions and/or invasive locks */
8682 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8683 de = dictFind(j->db->dict,j->key);
8684 assert(de != NULL);
8685 key = dictGetEntryKey(de);
8686 if (j->type == REDIS_IOJOB_LOAD) {
8687 redisDb *db;
8688
8689 /* Key loaded, bring it at home */
8690 key->storage = REDIS_VM_MEMORY;
8691 key->vm.atime = server.unixtime;
8692 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8693 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8694 (unsigned char*) key->ptr);
8695 server.vm_stats_swapped_objects--;
8696 server.vm_stats_swapins++;
8697 dictGetEntryVal(de) = j->val;
8698 incrRefCount(j->val);
8699 db = j->db;
8700 freeIOJob(j);
8701 /* Handle clients waiting for this key to be loaded. */
8702 handleClientsBlockedOnSwappedKey(db,key);
8703 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8704 /* Now we know the amount of pages required to swap this object.
8705 * Let's find some space for it, and queue this task again
8706 * rebranded as REDIS_IOJOB_DO_SWAP. */
8707 if (!vmCanSwapOut() ||
8708 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8709 {
8710 /* Ooops... no space or we can't swap as there is
8711 * a fork()ed Redis trying to save stuff on disk. */
8712 freeIOJob(j);
8713 key->storage = REDIS_VM_MEMORY; /* undo operation */
8714 } else {
8715 /* Note that we need to mark this pages as used now,
8716 * if the job will be canceled, we'll mark them as freed
8717 * again. */
8718 vmMarkPagesUsed(j->page,j->pages);
8719 j->type = REDIS_IOJOB_DO_SWAP;
8720 lockThreadedIO();
8721 queueIOJob(j);
8722 unlockThreadedIO();
8723 }
8724 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8725 robj *val;
8726
8727 /* Key swapped. We can finally free some memory. */
8728 if (key->storage != REDIS_VM_SWAPPING) {
8729 printf("key->storage: %d\n",key->storage);
8730 printf("key->name: %s\n",(char*)key->ptr);
8731 printf("key->refcount: %d\n",key->refcount);
8732 printf("val: %p\n",(void*)j->val);
8733 printf("val->type: %d\n",j->val->type);
8734 printf("val->ptr: %s\n",(char*)j->val->ptr);
8735 }
8736 redisAssert(key->storage == REDIS_VM_SWAPPING);
8737 val = dictGetEntryVal(de);
8738 key->vm.page = j->page;
8739 key->vm.usedpages = j->pages;
8740 key->storage = REDIS_VM_SWAPPED;
8741 key->vtype = j->val->type;
8742 decrRefCount(val); /* Deallocate the object from memory. */
8743 dictGetEntryVal(de) = NULL;
8744 redisLog(REDIS_DEBUG,
8745 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8746 (unsigned char*) key->ptr,
8747 (unsigned long long) j->page, (unsigned long long) j->pages);
8748 server.vm_stats_swapped_objects++;
8749 server.vm_stats_swapouts++;
8750 freeIOJob(j);
8751 /* Put a few more swap requests in queue if we are still
8752 * out of memory */
8753 if (trytoswap && vmCanSwapOut() &&
8754 zmalloc_used_memory() > server.vm_max_memory)
8755 {
8756 int more = 1;
8757 while(more) {
8758 lockThreadedIO();
8759 more = listLength(server.io_newjobs) <
8760 (unsigned) server.vm_max_threads;
8761 unlockThreadedIO();
8762 /* Don't waste CPU time if swappable objects are rare. */
8763 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8764 trytoswap = 0;
8765 break;
8766 }
8767 }
8768 }
8769 }
8770 processed++;
8771 if (processed == toprocess) return;
8772 }
8773 if (retval < 0 && errno != EAGAIN) {
8774 redisLog(REDIS_WARNING,
8775 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8776 strerror(errno));
8777 }
8778 }
8779
8780 static void lockThreadedIO(void) {
8781 pthread_mutex_lock(&server.io_mutex);
8782 }
8783
8784 static void unlockThreadedIO(void) {
8785 pthread_mutex_unlock(&server.io_mutex);
8786 }
8787
8788 /* Remove the specified object from the threaded I/O queue if still not
8789 * processed, otherwise make sure to flag it as canceled. */
8790 static void vmCancelThreadedIOJob(robj *o) {
8791 list *lists[3] = {
8792 server.io_newjobs, /* 0 */
8793 server.io_processing, /* 1 */
8794 server.io_processed /* 2 */
8795 };
8796 int i;
8797
8798 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8799 again:
8800 lockThreadedIO();
8801 /* Search for a matching key in one of the queues */
8802 for (i = 0; i < 3; i++) {
8803 listNode *ln;
8804 listIter li;
8805
8806 listRewind(lists[i],&li);
8807 while ((ln = listNext(&li)) != NULL) {
8808 iojob *job = ln->value;
8809
8810 if (job->canceled) continue; /* Skip this, already canceled. */
8811 if (job->key == o) {
8812 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8813 (void*)job, (char*)o->ptr, job->type, i);
8814 /* Mark the pages as free since the swap didn't happened
8815 * or happened but is now discarded. */
8816 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8817 vmMarkPagesFree(job->page,job->pages);
8818 /* Cancel the job. It depends on the list the job is
8819 * living in. */
8820 switch(i) {
8821 case 0: /* io_newjobs */
8822 /* If the job was yet not processed the best thing to do
8823 * is to remove it from the queue at all */
8824 freeIOJob(job);
8825 listDelNode(lists[i],ln);
8826 break;
8827 case 1: /* io_processing */
8828 /* Oh Shi- the thread is messing with the Job:
8829 *
8830 * Probably it's accessing the object if this is a
8831 * PREPARE_SWAP or DO_SWAP job.
8832 * If it's a LOAD job it may be reading from disk and
8833 * if we don't wait for the job to terminate before to
8834 * cancel it, maybe in a few microseconds data can be
8835 * corrupted in this pages. So the short story is:
8836 *
8837 * Better to wait for the job to move into the
8838 * next queue (processed)... */
8839
8840 /* We try again and again until the job is completed. */
8841 unlockThreadedIO();
8842 /* But let's wait some time for the I/O thread
8843 * to finish with this job. After all this condition
8844 * should be very rare. */
8845 usleep(1);
8846 goto again;
8847 case 2: /* io_processed */
8848 /* The job was already processed, that's easy...
8849 * just mark it as canceled so that we'll ignore it
8850 * when processing completed jobs. */
8851 job->canceled = 1;
8852 break;
8853 }
8854 /* Finally we have to adjust the storage type of the object
8855 * in order to "UNDO" the operaiton. */
8856 if (o->storage == REDIS_VM_LOADING)
8857 o->storage = REDIS_VM_SWAPPED;
8858 else if (o->storage == REDIS_VM_SWAPPING)
8859 o->storage = REDIS_VM_MEMORY;
8860 unlockThreadedIO();
8861 return;
8862 }
8863 }
8864 }
8865 unlockThreadedIO();
8866 assert(1 != 1); /* We should never reach this */
8867 }
8868
8869 static void *IOThreadEntryPoint(void *arg) {
8870 iojob *j;
8871 listNode *ln;
8872 REDIS_NOTUSED(arg);
8873
8874 pthread_detach(pthread_self());
8875 while(1) {
8876 /* Get a new job to process */
8877 lockThreadedIO();
8878 if (listLength(server.io_newjobs) == 0) {
8879 /* No new jobs in queue, exit. */
8880 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8881 (long) pthread_self());
8882 server.io_active_threads--;
8883 unlockThreadedIO();
8884 return NULL;
8885 }
8886 ln = listFirst(server.io_newjobs);
8887 j = ln->value;
8888 listDelNode(server.io_newjobs,ln);
8889 /* Add the job in the processing queue */
8890 j->thread = pthread_self();
8891 listAddNodeTail(server.io_processing,j);
8892 ln = listLast(server.io_processing); /* We use ln later to remove it */
8893 unlockThreadedIO();
8894 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8895 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8896
8897 /* Process the Job */
8898 if (j->type == REDIS_IOJOB_LOAD) {
8899 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8900 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8901 FILE *fp = fopen("/dev/null","w+");
8902 j->pages = rdbSavedObjectPages(j->val,fp);
8903 fclose(fp);
8904 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8905 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8906 j->canceled = 1;
8907 }
8908
8909 /* Done: insert the job into the processed queue */
8910 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8911 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8912 lockThreadedIO();
8913 listDelNode(server.io_processing,ln);
8914 listAddNodeTail(server.io_processed,j);
8915 unlockThreadedIO();
8916
8917 /* Signal the main thread there is new stuff to process */
8918 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8919 }
8920 return NULL; /* never reached */
8921 }
8922
8923 static void spawnIOThread(void) {
8924 pthread_t thread;
8925 sigset_t mask, omask;
8926 int err;
8927
8928 sigemptyset(&mask);
8929 sigaddset(&mask,SIGCHLD);
8930 sigaddset(&mask,SIGHUP);
8931 sigaddset(&mask,SIGPIPE);
8932 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8933 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8934 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8935 strerror(err));
8936 usleep(1000000);
8937 }
8938 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8939 server.io_active_threads++;
8940 }
8941
8942 /* We need to wait for the last thread to exit before we are able to
8943 * fork() in order to BGSAVE or BGREWRITEAOF. */
8944 static void waitEmptyIOJobsQueue(void) {
8945 while(1) {
8946 int io_processed_len;
8947
8948 lockThreadedIO();
8949 if (listLength(server.io_newjobs) == 0 &&
8950 listLength(server.io_processing) == 0 &&
8951 server.io_active_threads == 0)
8952 {
8953 unlockThreadedIO();
8954 return;
8955 }
8956 /* While waiting for empty jobs queue condition we post-process some
8957 * finshed job, as I/O threads may be hanging trying to write against
8958 * the io_ready_pipe_write FD but there are so much pending jobs that
8959 * it's blocking. */
8960 io_processed_len = listLength(server.io_processed);
8961 unlockThreadedIO();
8962 if (io_processed_len) {
8963 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8964 usleep(1000); /* 1 millisecond */
8965 } else {
8966 usleep(10000); /* 10 milliseconds */
8967 }
8968 }
8969 }
8970
8971 static void vmReopenSwapFile(void) {
8972 /* Note: we don't close the old one as we are in the child process
8973 * and don't want to mess at all with the original file object. */
8974 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8975 if (server.vm_fp == NULL) {
8976 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8977 server.vm_swap_file);
8978 _exit(1);
8979 }
8980 server.vm_fd = fileno(server.vm_fp);
8981 }
8982
8983 /* This function must be called while with threaded IO locked */
8984 static void queueIOJob(iojob *j) {
8985 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8986 (void*)j, j->type, (char*)j->key->ptr);
8987 listAddNodeTail(server.io_newjobs,j);
8988 if (server.io_active_threads < server.vm_max_threads)
8989 spawnIOThread();
8990 }
8991
8992 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8993 iojob *j;
8994
8995 assert(key->storage == REDIS_VM_MEMORY);
8996 assert(key->refcount == 1);
8997
8998 j = zmalloc(sizeof(*j));
8999 j->type = REDIS_IOJOB_PREPARE_SWAP;
9000 j->db = db;
9001 j->key = key;
9002 j->val = val;
9003 incrRefCount(val);
9004 j->canceled = 0;
9005 j->thread = (pthread_t) -1;
9006 key->storage = REDIS_VM_SWAPPING;
9007
9008 lockThreadedIO();
9009 queueIOJob(j);
9010 unlockThreadedIO();
9011 return REDIS_OK;
9012 }
9013
9014 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9015
9016 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9017 * If there is not already a job loading the key, it is craeted.
9018 * The key is added to the io_keys list in the client structure, and also
9019 * in the hash table mapping swapped keys to waiting clients, that is,
9020 * server.io_waited_keys. */
9021 static int waitForSwappedKey(redisClient *c, robj *key) {
9022 struct dictEntry *de;
9023 robj *o;
9024 list *l;
9025
9026 /* If the key does not exist or is already in RAM we don't need to
9027 * block the client at all. */
9028 de = dictFind(c->db->dict,key);
9029 if (de == NULL) return 0;
9030 o = dictGetEntryKey(de);
9031 if (o->storage == REDIS_VM_MEMORY) {
9032 return 0;
9033 } else if (o->storage == REDIS_VM_SWAPPING) {
9034 /* We were swapping the key, undo it! */
9035 vmCancelThreadedIOJob(o);
9036 return 0;
9037 }
9038
9039 /* OK: the key is either swapped, or being loaded just now. */
9040
9041 /* Add the key to the list of keys this client is waiting for.
9042 * This maps clients to keys they are waiting for. */
9043 listAddNodeTail(c->io_keys,key);
9044 incrRefCount(key);
9045
9046 /* Add the client to the swapped keys => clients waiting map. */
9047 de = dictFind(c->db->io_keys,key);
9048 if (de == NULL) {
9049 int retval;
9050
9051 /* For every key we take a list of clients blocked for it */
9052 l = listCreate();
9053 retval = dictAdd(c->db->io_keys,key,l);
9054 incrRefCount(key);
9055 assert(retval == DICT_OK);
9056 } else {
9057 l = dictGetEntryVal(de);
9058 }
9059 listAddNodeTail(l,c);
9060
9061 /* Are we already loading the key from disk? If not create a job */
9062 if (o->storage == REDIS_VM_SWAPPED) {
9063 iojob *j;
9064
9065 o->storage = REDIS_VM_LOADING;
9066 j = zmalloc(sizeof(*j));
9067 j->type = REDIS_IOJOB_LOAD;
9068 j->db = c->db;
9069 j->key = o;
9070 j->key->vtype = o->vtype;
9071 j->page = o->vm.page;
9072 j->val = NULL;
9073 j->canceled = 0;
9074 j->thread = (pthread_t) -1;
9075 lockThreadedIO();
9076 queueIOJob(j);
9077 unlockThreadedIO();
9078 }
9079 return 1;
9080 }
9081
9082 /* Preload keys needed for the ZUNION and ZINTER commands. */
9083 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9084 int i, num;
9085 num = atoi(c->argv[2]->ptr);
9086 for (i = 0; i < num; i++) {
9087 waitForSwappedKey(c,c->argv[3+i]);
9088 }
9089 }
9090
9091 /* Is this client attempting to run a command against swapped keys?
9092 * If so, block it ASAP, load the keys in background, then resume it.
9093 *
9094 * The important idea about this function is that it can fail! If keys will
9095 * still be swapped when the client is resumed, this key lookups will
9096 * just block loading keys from disk. In practical terms this should only
9097 * happen with SORT BY command or if there is a bug in this function.
9098 *
9099 * Return 1 if the client is marked as blocked, 0 if the client can
9100 * continue as the keys it is going to access appear to be in memory. */
9101 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9102 int j, last;
9103
9104 if (cmd->vm_preload_proc != NULL) {
9105 cmd->vm_preload_proc(c);
9106 } else {
9107 if (cmd->vm_firstkey == 0) return 0;
9108 last = cmd->vm_lastkey;
9109 if (last < 0) last = c->argc+last;
9110 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9111 waitForSwappedKey(c,c->argv[j]);
9112 }
9113
9114 /* If the client was blocked for at least one key, mark it as blocked. */
9115 if (listLength(c->io_keys)) {
9116 c->flags |= REDIS_IO_WAIT;
9117 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9118 server.vm_blocked_clients++;
9119 return 1;
9120 } else {
9121 return 0;
9122 }
9123 }
9124
9125 /* Remove the 'key' from the list of blocked keys for a given client.
9126 *
9127 * The function returns 1 when there are no longer blocking keys after
9128 * the current one was removed (and the client can be unblocked). */
9129 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9130 list *l;
9131 listNode *ln;
9132 listIter li;
9133 struct dictEntry *de;
9134
9135 /* Remove the key from the list of keys this client is waiting for. */
9136 listRewind(c->io_keys,&li);
9137 while ((ln = listNext(&li)) != NULL) {
9138 if (compareStringObjects(ln->value,key) == 0) {
9139 listDelNode(c->io_keys,ln);
9140 break;
9141 }
9142 }
9143 assert(ln != NULL);
9144
9145 /* Remove the client form the key => waiting clients map. */
9146 de = dictFind(c->db->io_keys,key);
9147 assert(de != NULL);
9148 l = dictGetEntryVal(de);
9149 ln = listSearchKey(l,c);
9150 assert(ln != NULL);
9151 listDelNode(l,ln);
9152 if (listLength(l) == 0)
9153 dictDelete(c->db->io_keys,key);
9154
9155 return listLength(c->io_keys) == 0;
9156 }
9157
9158 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9159 struct dictEntry *de;
9160 list *l;
9161 listNode *ln;
9162 int len;
9163
9164 de = dictFind(db->io_keys,key);
9165 if (!de) return;
9166
9167 l = dictGetEntryVal(de);
9168 len = listLength(l);
9169 /* Note: we can't use something like while(listLength(l)) as the list
9170 * can be freed by the calling function when we remove the last element. */
9171 while (len--) {
9172 ln = listFirst(l);
9173 redisClient *c = ln->value;
9174
9175 if (dontWaitForSwappedKey(c,key)) {
9176 /* Put the client in the list of clients ready to go as we
9177 * loaded all the keys about it. */
9178 listAddNodeTail(server.io_ready_clients,c);
9179 }
9180 }
9181 }
9182
9183 /* =========================== Remote Configuration ========================= */
9184
9185 static void configSetCommand(redisClient *c) {
9186 robj *o = getDecodedObject(c->argv[3]);
9187 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9188 zfree(server.dbfilename);
9189 server.dbfilename = zstrdup(o->ptr);
9190 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9191 zfree(server.requirepass);
9192 server.requirepass = zstrdup(o->ptr);
9193 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9194 zfree(server.masterauth);
9195 server.masterauth = zstrdup(o->ptr);
9196 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9197 server.maxmemory = strtoll(o->ptr, NULL, 10);
9198 } else {
9199 addReplySds(c,sdscatprintf(sdsempty(),
9200 "-ERR not supported CONFIG parameter %s\r\n",
9201 (char*)c->argv[2]->ptr));
9202 decrRefCount(o);
9203 return;
9204 }
9205 decrRefCount(o);
9206 addReply(c,shared.ok);
9207 }
9208
9209 static void configGetCommand(redisClient *c) {
9210 robj *o = getDecodedObject(c->argv[2]);
9211 robj *lenobj = createObject(REDIS_STRING,NULL);
9212 char *pattern = o->ptr;
9213 int matches = 0;
9214
9215 addReply(c,lenobj);
9216 decrRefCount(lenobj);
9217
9218 if (stringmatch(pattern,"dbfilename",0)) {
9219 addReplyBulkCString(c,"dbfilename");
9220 addReplyBulkCString(c,server.dbfilename);
9221 matches++;
9222 }
9223 if (stringmatch(pattern,"requirepass",0)) {
9224 addReplyBulkCString(c,"requirepass");
9225 addReplyBulkCString(c,server.requirepass);
9226 matches++;
9227 }
9228 if (stringmatch(pattern,"masterauth",0)) {
9229 addReplyBulkCString(c,"masterauth");
9230 addReplyBulkCString(c,server.masterauth);
9231 matches++;
9232 }
9233 if (stringmatch(pattern,"maxmemory",0)) {
9234 char buf[128];
9235
9236 snprintf(buf,128,"%llu\n",server.maxmemory);
9237 addReplyBulkCString(c,"maxmemory");
9238 addReplyBulkCString(c,buf);
9239 matches++;
9240 }
9241 decrRefCount(o);
9242 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9243 }
9244
9245 static void configCommand(redisClient *c) {
9246 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9247 if (c->argc != 4) goto badarity;
9248 configSetCommand(c);
9249 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9250 if (c->argc != 3) goto badarity;
9251 configGetCommand(c);
9252 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9253 if (c->argc != 2) goto badarity;
9254 server.stat_numcommands = 0;
9255 server.stat_numconnections = 0;
9256 server.stat_expiredkeys = 0;
9257 server.stat_starttime = time(NULL);
9258 addReply(c,shared.ok);
9259 } else {
9260 addReplySds(c,sdscatprintf(sdsempty(),
9261 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9262 }
9263 return;
9264
9265 badarity:
9266 addReplySds(c,sdscatprintf(sdsempty(),
9267 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9268 (char*) c->argv[1]->ptr));
9269 }
9270
9271 /* =========================== Pubsub implementation ======================== */
9272
9273 static void freePubsubPattern(void *p) {
9274 pubsubPattern *pat = p;
9275
9276 decrRefCount(pat->pattern);
9277 zfree(pat);
9278 }
9279
9280 static int listMatchPubsubPattern(void *a, void *b) {
9281 pubsubPattern *pa = a, *pb = b;
9282
9283 return (pa->client == pb->client) &&
9284 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9285 }
9286
9287 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9288 * 0 if the client was already subscribed to that channel. */
9289 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9290 struct dictEntry *de;
9291 list *clients = NULL;
9292 int retval = 0;
9293
9294 /* Add the channel to the client -> channels hash table */
9295 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9296 retval = 1;
9297 incrRefCount(channel);
9298 /* Add the client to the channel -> list of clients hash table */
9299 de = dictFind(server.pubsub_channels,channel);
9300 if (de == NULL) {
9301 clients = listCreate();
9302 dictAdd(server.pubsub_channels,channel,clients);
9303 incrRefCount(channel);
9304 } else {
9305 clients = dictGetEntryVal(de);
9306 }
9307 listAddNodeTail(clients,c);
9308 }
9309 /* Notify the client */
9310 addReply(c,shared.mbulk3);
9311 addReply(c,shared.subscribebulk);
9312 addReplyBulk(c,channel);
9313 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9314 return retval;
9315 }
9316
9317 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9318 * 0 if the client was not subscribed to the specified channel. */
9319 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9320 struct dictEntry *de;
9321 list *clients;
9322 listNode *ln;
9323 int retval = 0;
9324
9325 /* Remove the channel from the client -> channels hash table */
9326 incrRefCount(channel); /* channel may be just a pointer to the same object
9327 we have in the hash tables. Protect it... */
9328 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9329 retval = 1;
9330 /* Remove the client from the channel -> clients list hash table */
9331 de = dictFind(server.pubsub_channels,channel);
9332 assert(de != NULL);
9333 clients = dictGetEntryVal(de);
9334 ln = listSearchKey(clients,c);
9335 assert(ln != NULL);
9336 listDelNode(clients,ln);
9337 if (listLength(clients) == 0) {
9338 /* Free the list and associated hash entry at all if this was
9339 * the latest client, so that it will be possible to abuse
9340 * Redis PUBSUB creating millions of channels. */
9341 dictDelete(server.pubsub_channels,channel);
9342 }
9343 }
9344 /* Notify the client */
9345 if (notify) {
9346 addReply(c,shared.mbulk3);
9347 addReply(c,shared.unsubscribebulk);
9348 addReplyBulk(c,channel);
9349 addReplyLong(c,dictSize(c->pubsub_channels)+
9350 listLength(c->pubsub_patterns));
9351
9352 }
9353 decrRefCount(channel); /* it is finally safe to release it */
9354 return retval;
9355 }
9356
9357 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9358 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9359 int retval = 0;
9360
9361 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9362 retval = 1;
9363 pubsubPattern *pat;
9364 listAddNodeTail(c->pubsub_patterns,pattern);
9365 incrRefCount(pattern);
9366 pat = zmalloc(sizeof(*pat));
9367 pat->pattern = getDecodedObject(pattern);
9368 pat->client = c;
9369 listAddNodeTail(server.pubsub_patterns,pat);
9370 }
9371 /* Notify the client */
9372 addReply(c,shared.mbulk3);
9373 addReply(c,shared.psubscribebulk);
9374 addReplyBulk(c,pattern);
9375 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9376 return retval;
9377 }
9378
9379 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9380 * 0 if the client was not subscribed to the specified channel. */
9381 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9382 listNode *ln;
9383 pubsubPattern pat;
9384 int retval = 0;
9385
9386 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9387 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9388 retval = 1;
9389 listDelNode(c->pubsub_patterns,ln);
9390 pat.client = c;
9391 pat.pattern = pattern;
9392 ln = listSearchKey(server.pubsub_patterns,&pat);
9393 listDelNode(server.pubsub_patterns,ln);
9394 }
9395 /* Notify the client */
9396 if (notify) {
9397 addReply(c,shared.mbulk3);
9398 addReply(c,shared.punsubscribebulk);
9399 addReplyBulk(c,pattern);
9400 addReplyLong(c,dictSize(c->pubsub_channels)+
9401 listLength(c->pubsub_patterns));
9402 }
9403 decrRefCount(pattern);
9404 return retval;
9405 }
9406
9407 /* Unsubscribe from all the channels. Return the number of channels the
9408 * client was subscribed from. */
9409 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9410 dictIterator *di = dictGetIterator(c->pubsub_channels);
9411 dictEntry *de;
9412 int count = 0;
9413
9414 while((de = dictNext(di)) != NULL) {
9415 robj *channel = dictGetEntryKey(de);
9416
9417 count += pubsubUnsubscribeChannel(c,channel,notify);
9418 }
9419 dictReleaseIterator(di);
9420 return count;
9421 }
9422
9423 /* Unsubscribe from all the patterns. Return the number of patterns the
9424 * client was subscribed from. */
9425 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9426 listNode *ln;
9427 listIter li;
9428 int count = 0;
9429
9430 listRewind(c->pubsub_patterns,&li);
9431 while ((ln = listNext(&li)) != NULL) {
9432 robj *pattern = ln->value;
9433
9434 count += pubsubUnsubscribePattern(c,pattern,notify);
9435 }
9436 return count;
9437 }
9438
9439 /* Publish a message */
9440 static int pubsubPublishMessage(robj *channel, robj *message) {
9441 int receivers = 0;
9442 struct dictEntry *de;
9443 listNode *ln;
9444 listIter li;
9445
9446 /* Send to clients listening for that channel */
9447 de = dictFind(server.pubsub_channels,channel);
9448 if (de) {
9449 list *list = dictGetEntryVal(de);
9450 listNode *ln;
9451 listIter li;
9452
9453 listRewind(list,&li);
9454 while ((ln = listNext(&li)) != NULL) {
9455 redisClient *c = ln->value;
9456
9457 addReply(c,shared.mbulk3);
9458 addReply(c,shared.messagebulk);
9459 addReplyBulk(c,channel);
9460 addReplyBulk(c,message);
9461 receivers++;
9462 }
9463 }
9464 /* Send to clients listening to matching channels */
9465 if (listLength(server.pubsub_patterns)) {
9466 listRewind(server.pubsub_patterns,&li);
9467 channel = getDecodedObject(channel);
9468 while ((ln = listNext(&li)) != NULL) {
9469 pubsubPattern *pat = ln->value;
9470
9471 if (stringmatchlen((char*)pat->pattern->ptr,
9472 sdslen(pat->pattern->ptr),
9473 (char*)channel->ptr,
9474 sdslen(channel->ptr),0)) {
9475 addReply(pat->client,shared.mbulk3);
9476 addReply(pat->client,shared.messagebulk);
9477 addReplyBulk(pat->client,channel);
9478 addReplyBulk(pat->client,message);
9479 receivers++;
9480 }
9481 }
9482 decrRefCount(channel);
9483 }
9484 return receivers;
9485 }
9486
9487 static void subscribeCommand(redisClient *c) {
9488 int j;
9489
9490 for (j = 1; j < c->argc; j++)
9491 pubsubSubscribeChannel(c,c->argv[j]);
9492 }
9493
9494 static void unsubscribeCommand(redisClient *c) {
9495 if (c->argc == 1) {
9496 pubsubUnsubscribeAllChannels(c,1);
9497 return;
9498 } else {
9499 int j;
9500
9501 for (j = 1; j < c->argc; j++)
9502 pubsubUnsubscribeChannel(c,c->argv[j],1);
9503 }
9504 }
9505
9506 static void psubscribeCommand(redisClient *c) {
9507 int j;
9508
9509 for (j = 1; j < c->argc; j++)
9510 pubsubSubscribePattern(c,c->argv[j]);
9511 }
9512
9513 static void punsubscribeCommand(redisClient *c) {
9514 if (c->argc == 1) {
9515 pubsubUnsubscribeAllPatterns(c,1);
9516 return;
9517 } else {
9518 int j;
9519
9520 for (j = 1; j < c->argc; j++)
9521 pubsubUnsubscribePattern(c,c->argv[j],1);
9522 }
9523 }
9524
9525 static void publishCommand(redisClient *c) {
9526 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9527 addReplyLong(c,receivers);
9528 }
9529
9530 /* ================================= Debugging ============================== */
9531
9532 static void debugCommand(redisClient *c) {
9533 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9534 *((char*)-1) = 'x';
9535 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9536 if (rdbSave(server.dbfilename) != REDIS_OK) {
9537 addReply(c,shared.err);
9538 return;
9539 }
9540 emptyDb();
9541 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9542 addReply(c,shared.err);
9543 return;
9544 }
9545 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9546 addReply(c,shared.ok);
9547 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9548 emptyDb();
9549 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9550 addReply(c,shared.err);
9551 return;
9552 }
9553 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9554 addReply(c,shared.ok);
9555 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9556 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9557 robj *key, *val;
9558
9559 if (!de) {
9560 addReply(c,shared.nokeyerr);
9561 return;
9562 }
9563 key = dictGetEntryKey(de);
9564 val = dictGetEntryVal(de);
9565 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9566 key->storage == REDIS_VM_SWAPPING)) {
9567 char *strenc;
9568 char buf[128];
9569
9570 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9571 strenc = strencoding[val->encoding];
9572 } else {
9573 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9574 strenc = buf;
9575 }
9576 addReplySds(c,sdscatprintf(sdsempty(),
9577 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9578 "encoding:%s serializedlength:%lld\r\n",
9579 (void*)key, key->refcount, (void*)val, val->refcount,
9580 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9581 } else {
9582 addReplySds(c,sdscatprintf(sdsempty(),
9583 "+Key at:%p refcount:%d, value swapped at: page %llu "
9584 "using %llu pages\r\n",
9585 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9586 (unsigned long long) key->vm.usedpages));
9587 }
9588 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9589 lookupKeyRead(c->db,c->argv[2]);
9590 addReply(c,shared.ok);
9591 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9592 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9593 robj *key, *val;
9594
9595 if (!server.vm_enabled) {
9596 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9597 return;
9598 }
9599 if (!de) {
9600 addReply(c,shared.nokeyerr);
9601 return;
9602 }
9603 key = dictGetEntryKey(de);
9604 val = dictGetEntryVal(de);
9605 /* If the key is shared we want to create a copy */
9606 if (key->refcount > 1) {
9607 robj *newkey = dupStringObject(key);
9608 decrRefCount(key);
9609 key = dictGetEntryKey(de) = newkey;
9610 }
9611 /* Swap it */
9612 if (key->storage != REDIS_VM_MEMORY) {
9613 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9614 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9615 dictGetEntryVal(de) = NULL;
9616 addReply(c,shared.ok);
9617 } else {
9618 addReply(c,shared.err);
9619 }
9620 } else {
9621 addReplySds(c,sdsnew(
9622 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9623 }
9624 }
9625
9626 static void _redisAssert(char *estr, char *file, int line) {
9627 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9628 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9629 #ifdef HAVE_BACKTRACE
9630 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9631 *((char*)-1) = 'x';
9632 #endif
9633 }
9634
9635 /* =================================== Main! ================================ */
9636
9637 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content converted
 * with atoi(). Returns -1 when the file cannot be opened or no line can
 * be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile;

    procfile = fopen("/proc/sys/vm/overcommit_memory","r");
    if (procfile == NULL) return -1;

    /* Read first, close unconditionally, then decide on the result. */
    got = fgets(line,sizeof(line),procfile);
    fclose(procfile);
    if (got == NULL) return -1;

    return atoi(line);
}
9651
9652 void linuxOvercommitMemoryWarning(void) {
9653 if (linuxOvercommitMemoryValue() == 0) {
9654 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9655 }
9656 }
9657 #endif /* __linux__ */
9658
9659 static void daemonize(void) {
9660 int fd;
9661 FILE *fp;
9662
9663 if (fork() != 0) exit(0); /* parent exits */
9664 setsid(); /* create a new session */
9665
9666 /* Every output goes to /dev/null. If Redis is daemonized but
9667 * the 'logfile' is set to 'stdout' in the configuration file
9668 * it will not log at all. */
9669 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9670 dup2(fd, STDIN_FILENO);
9671 dup2(fd, STDOUT_FILENO);
9672 dup2(fd, STDERR_FILENO);
9673 if (fd > STDERR_FILENO) close(fd);
9674 }
9675 /* Try to write the pid file */
9676 fp = fopen(server.pidfile,"w");
9677 if (fp) {
9678 fprintf(fp,"%d\n",getpid());
9679 fclose(fp);
9680 }
9681 }
9682
9683 static void version() {
9684 printf("Redis server version %s\n", REDIS_VERSION);
9685 exit(0);
9686 }
9687
/* Print the command line synopsis on stderr and terminate the process
 * with exit status 1. Never returns. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
9693
9694 int main(int argc, char **argv) {
9695 time_t start;
9696
9697 initServerConfig();
9698 if (argc == 2) {
9699 if (strcmp(argv[1], "-v") == 0 ||
9700 strcmp(argv[1], "--version") == 0) version();
9701 if (strcmp(argv[1], "--help") == 0) usage();
9702 resetServerSaveParams();
9703 loadServerConfig(argv[1]);
9704 } else if ((argc > 2)) {
9705 usage();
9706 } else {
9707 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9708 }
9709 if (server.daemonize) daemonize();
9710 initServer();
9711 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9712 #ifdef __linux__
9713 linuxOvercommitMemoryWarning();
9714 #endif
9715 start = time(NULL);
9716 if (server.appendonly) {
9717 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9718 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9719 } else {
9720 if (rdbLoad(server.dbfilename) == REDIS_OK)
9721 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9722 }
9723 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9724 aeSetBeforeSleepProc(server.el,beforeSleep);
9725 aeMain(server.el);
9726 aeDeleteEventLoop(server.el);
9727 return 0;
9728 }
9729
9730 /* ============================= Backtrace support ========================= */
9731
9732 #ifdef HAVE_BACKTRACE
9733 static char *findFuncName(void *pointer, unsigned long *offset);
9734
/* Return, as a void pointer, the program counter field stored in the
 * machine context of 'uc' (the ucontext handed to a signal handler).
 * The field to read is selected at compile time depending on the platform
 * macros; NULL is returned when none of the known platforms matches. */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  /* Apple, MAC_OS_X_VERSION_10_6 not defined */
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  /* Apple, MAC_OS_X_VERSION_10_6 defined */
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
    /* NOTE(review): this branch is also selected for x86_64 yet it indexes
     * gregs[] with REG_EIP -- confirm that constant exists (and designates
     * the program counter) on 64 bit targets. */
    return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
#endif
}
9760
9761 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9762 void *trace[100];
9763 char **messages = NULL;
9764 int i, trace_size = 0;
9765 unsigned long offset=0;
9766 ucontext_t *uc = (ucontext_t*) secret;
9767 sds infostring;
9768 REDIS_NOTUSED(info);
9769
9770 redisLog(REDIS_WARNING,
9771 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9772 infostring = genRedisInfoString();
9773 redisLog(REDIS_WARNING, "%s",infostring);
9774 /* It's not safe to sdsfree() the returned string under memory
9775 * corruption conditions. Let it leak as we are going to abort */
9776
9777 trace_size = backtrace(trace, 100);
9778 /* overwrite sigaction with caller's address */
9779 if (getMcontextEip(uc) != NULL) {
9780 trace[1] = getMcontextEip(uc);
9781 }
9782 messages = backtrace_symbols(trace, trace_size);
9783
9784 for (i=1; i<trace_size; ++i) {
9785 char *fn = findFuncName(trace[i], &offset), *p;
9786
9787 p = strchr(messages[i],'+');
9788 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9789 redisLog(REDIS_WARNING,"%s", messages[i]);
9790 } else {
9791 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9792 }
9793 }
9794 /* free(messages); Don't call free() with possibly corrupted memory. */
9795 _exit(0);
9796 }
9797
9798 static void setupSigSegvAction(void) {
9799 struct sigaction act;
9800
9801 sigemptyset (&act.sa_mask);
9802 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9803 * is used. Otherwise, sa_handler is used */
9804 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9805 act.sa_sigaction = segvHandler;
9806 sigaction (SIGSEGV, &act, NULL);
9807 sigaction (SIGBUS, &act, NULL);
9808 sigaction (SIGFPE, &act, NULL);
9809 sigaction (SIGILL, &act, NULL);
9810 sigaction (SIGBUS, &act, NULL);
9811 return;
9812 }
9813
9814 #include "staticsymbols.h"
9815 /* This function try to convert a pointer into a function name. It's used in
9816 * oreder to provide a backtrace under segmentation fault that's able to
9817 * display functions declared as static (otherwise the backtrace is useless). */
9818 static char *findFuncName(void *pointer, unsigned long *offset){
9819 int i, ret = -1;
9820 unsigned long off, minoff = 0;
9821
9822 /* Try to match against the Symbol with the smallest offset */
9823 for (i=0; symsTable[i].pointer; i++) {
9824 unsigned long lp = (unsigned long) pointer;
9825
9826 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9827 off=lp-symsTable[i].pointer;
9828 if (ret < 0 || off < minoff) {
9829 minoff=off;
9830 ret=i;
9831 }
9832 }
9833 }
9834 if (ret == -1) return NULL;
9835 *offset = minoff;
9836 return symsTable[ret].name;
9837 }
9838 #else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: it does nothing,
 * so no signal handler is installed by this function. */
static void setupSigSegvAction(void) {
}
9841 #endif /* HAVE_BACKTRACE */
9842
9843
9844
9845 /* The End */
9846
9847
9848