]> git.saurik.com Git - redis.git/blob - redis.c
d80e56b07e8a05d9046ad0631c7b77e57c58debe
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130
/* Printable names for the REDIS_ENCODING_* values. The entries are listed in
 * the same order as the numeric values of the defines above
 * (RAW=0, INT=1, ZIPMAP=2, HT=3). */
131 static char* strencoding[] = {
132 "raw", "int", "zipmap", "hashtable"
133 };
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * accordingly to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
238 /* We can print the stacktrace, so our assert is defined this way:
 * when the asserted expression evaluates to false, _redisAssert() is called
 * with the stringified expression, the current file name and line number,
 * and then the process is terminated with _exit(1). When the expression is
 * true the macro evaluates to (void)0. */
239 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
240 static void _redisAssert(char *estr, char *file, int line);
241
242 /*================================= Data types ============================== */
243
244 /* A redis object, that is a type able to hold a string / list / set */
245
/* The VM object structure: the per-object virtual memory fields that are
 * embedded as the last member of struct redisObject.
 *
 * This is a pure type definition. It intentionally declares no object:
 * a declarator after the closing brace would create an unused global
 * variable with external linkage as a side effect of defining the type. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
252
253 /* The actual Redis Object.
 * The same structure is used for values and, as the 'storage' and 'vtype'
 * comments below show, for keys as well. The VM fields are deliberately the
 * last member so that they can be left out of the allocation (see below). */
254 typedef struct redisObject {
255 void *ptr; /* Payload of the object */
256 unsigned char type; /* Object type, e.g. REDIS_STRING */
257 unsigned char encoding; /* Internal representation, e.g. REDIS_ENCODING_RAW */
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount; /* Reference count (see incrRefCount() / decrRefCount()) */
263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 *   _var  an lvalue of type robj (the object itself, not a pointer to it)
 *   _ptr  the value stored in the object's 'ptr' field
 *
 * The object gets refcount 1, type REDIS_STRING and encoding
 * REDIS_ENCODING_RAW. The 'storage' field is set (to REDIS_VM_MEMORY) only
 * when server.vm_enabled is true; otherwise it is left untouched.
 *
 * The expansion is a single do { ... } while(0) statement WITHOUT a trailing
 * semicolon: the caller supplies the semicolon, so the macro can be used as
 * the body of an unbraced if/else. Arguments are parenthesized so that
 * expressions such as *objptr can be passed safely.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
/* A single Redis database: the main keyspace plus the auxiliary
 * dictionaries that track expires and blocked clients for that keyspace. */
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id; /* Numeric id of this DB. NOTE(review): presumably its index in server.db -- confirm */
288 } redisDb;
289
290 /* Client MULTI/EXEC state */

/* One command queued inside a MULTI context (see queueMultiCommand()):
 * the argument vector, its length and the command table entry. */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
/* The queue of commands accumulated by a client in a MULTI context. */
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-client state.
303 * Clients are taken in a linked list. */
304 typedef struct redisClient {
305 int fd; /* Client socket file descriptor */
306 redisDb *db; /* Currently selected database */
307 int dictid;
308 sds querybuf; /* Query buffer. NOTE(review): presumably filled by readQueryFromClient() -- confirm */
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply; /* Pending reply objects (see addReply()) */
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* One save point; server.saveparams is an array of these.
 * NOTE(review): presumably "save after 'seconds' seconds if at least
 * 'changes' changes were made" -- confirm against the code using it. */
335 struct saveparam {
336 time_t seconds;
337 int changes;
338 };
339
340 /* Global server state structure.
 * A single instance of it, 'server', is defined in the Globals section. */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last successful save */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync; /* NOTE(review): presumably one of APPENDFSYNC_* -- confirm */
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread processes an element taken from the io_newjobs queue and
416 * puts the result of the operation in the io_processed list. While the
417 * job is being processed, it's put on the io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access the I/O job lists above */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, looking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The following are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
/* Associates a client with one pattern object. Both redisClient and
 * redisServer have a 'pubsub_patterns' list.
 * NOTE(review): presumably those lists hold elements of this type -- confirm. */
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
/* Signature shared by all the command implementations and by the optional
 * VM preload hooks: they receive only the client issuing the command. */
451 typedef void redisCommandProc(redisClient *c);

/* An entry of the command table (cmdTable). The table is initialized
 * positionally, so the order of the fields below must not be changed. */
452 struct redisCommand {
453 char *name; /* Command name as found in the command table, e.g. "get" */
454 redisCommandProc *proc; /* Function implementing the command */
455 int arity; /* Number of arguments. NOTE(review): negative values appear in
 * cmdTable; presumably -N means "at least N" -- confirm */
456 int flags; /* Bitwise OR of REDIS_CMD_* flags */
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* The last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* Pairs a function name with an address stored as an unsigned long.
 * NOTE(review): presumably used to resolve symbol names when printing a
 * stack trace -- confirm against the code that uses it. */
467 struct redisFunctionSym {
468 char *name;
469 unsigned long pointer;
470 };
471
/* An element being sorted: the object itself plus the value it is compared
 * by, which is either a numeric score or another object.
 * NOTE(review): which union member is active presumably depends on the
 * server.sort_alpha / sort_bypattern settings -- confirm in sortCompare(). */
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
/* An operation attached to a sort: a type and a pattern object.
 * NOTE(review): 'type' is presumably one of the REDIS_SORT_* defines
 * (e.g. REDIS_SORT_GET) -- confirm. */
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
/* A skiplist node: an object with its score, an array of forward pointers
 * with a parallel array of spans, and a single backward pointer.
 * NOTE(review): 'forward' and 'span' are presumably per-level arrays sized
 * at node creation -- confirm in the zsl* functions. */
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
/* The skiplist itself: header and tail nodes, number of elements and
 * current level (bounded by ZSKIPLIST_MAXLEVEL -- TODO confirm). */
495 typedef struct zskiplist {
496 struct zskiplistNode *header, *tail;
497 unsigned long length;
498 int level;
499 } zskiplist;
500
/* A sorted set is represented by a dict together with a skiplist. */
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects.
 * This declaration both defines the structure type and the single global
 * instance of it, 'shared'. Every member is a pointer to a Redis object. */
507
508 struct sharedObjectsStruct {
509 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
510 *colon, *nullbulk, *nullmultibulk, *queued,
511 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
512 *outofrangeerr, *plus,
513 *select0, *select1, *select2, *select3, *select4,
514 *select5, *select6, *select7, *select8, *select9,
515 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
516 *psubscribebulk, *punsubscribebulk;
517 } shared;
518
519 /* Global vars that are actually used as constants. The following double
520 * values are used for double on-disk serialization, and are initialized
521 * at runtime to avoid strange compiler optimizations. */
522
523 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
524
525 /* VM threaded I/O request message */
526 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
527 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
528 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* A single job handled by the VM I/O threads (see queueIOJob() and
 * freeIOJob()). */
529 typedef struct iojob {
530 int type; /* Request type, REDIS_IOJOB_* */
531 redisDb *db;/* Redis database */
532 robj *key; /* This I/O request is about swapping this key */
533 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
534 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
535 off_t page; /* Swap page where to read/write the object */
536 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
537 int canceled; /* True if this command was canceled by blocking side of VM */
538 pthread_t thread; /* ID of the thread processing this entry */
539 } iojob;
540
541 /*================================ Prototypes =============================== */
542
543 static void freeStringObject(robj *o);
544 static void freeListObject(robj *o);
545 static void freeSetObject(robj *o);
546 static void decrRefCount(void *o);
547 static robj *createObject(int type, void *ptr);
548 static void freeClient(redisClient *c);
549 static int rdbLoad(char *filename);
550 static void addReply(redisClient *c, robj *obj);
551 static void addReplySds(redisClient *c, sds s);
552 static void incrRefCount(robj *o);
553 static int rdbSaveBackground(char *filename);
554 static robj *createStringObject(char *ptr, size_t len);
555 static robj *dupStringObject(robj *o);
556 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
557 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
558 static int syncWithMaster(void);
559 static int tryObjectEncoding(robj *o);
560 static robj *getDecodedObject(robj *o);
561 static int removeExpire(redisDb *db, robj *key);
562 static int expireIfNeeded(redisDb *db, robj *key);
563 static int deleteIfVolatile(redisDb *db, robj *key);
564 static int deleteIfSwapped(redisDb *db, robj *key);
565 static int deleteKey(redisDb *db, robj *key);
566 static time_t getExpire(redisDb *db, robj *key);
567 static int setExpire(redisDb *db, robj *key, time_t when);
568 static void updateSlavesWaitingBgsave(int bgsaveerr);
569 static void freeMemoryIfNeeded(void);
570 static int processCommand(redisClient *c);
571 static void setupSigSegvAction(void);
572 static void rdbRemoveTempFile(pid_t childpid);
573 static void aofRemoveTempFile(pid_t childpid);
574 static size_t stringObjectLen(robj *o);
575 static void processInputBuffer(redisClient *c);
576 static zskiplist *zslCreate(void);
577 static void zslFree(zskiplist *zsl);
578 static void zslInsert(zskiplist *zsl, double score, robj *obj);
579 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
580 static void initClientMultiState(redisClient *c);
581 static void freeClientMultiState(redisClient *c);
582 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
583 static void unblockClientWaitingData(redisClient *c);
584 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
585 static void vmInit(void);
586 static void vmMarkPagesFree(off_t page, off_t count);
587 static robj *vmLoadObject(robj *key);
588 static robj *vmPreviewObject(robj *key);
589 static int vmSwapOneObjectBlocking(void);
590 static int vmSwapOneObjectThreaded(void);
591 static int vmCanSwapOut(void);
592 static int tryFreeOneObjectFromFreelist(void);
593 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
594 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
595 static void vmCancelThreadedIOJob(robj *o);
596 static void lockThreadedIO(void);
597 static void unlockThreadedIO(void);
598 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
599 static void freeIOJob(iojob *j);
600 static void queueIOJob(iojob *j);
601 static int vmWriteObjectOnSwap(robj *o, off_t page);
602 static robj *vmReadObjectFromSwap(off_t page, int type);
603 static void waitEmptyIOJobsQueue(void);
604 static void vmReopenSwapFile(void);
605 static int vmFreePage(off_t page);
606 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
607 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
608 static int dontWaitForSwappedKey(redisClient *c, robj *key);
609 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
610 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
611 static struct redisCommand *lookupCommand(char *name);
612 static void call(redisClient *c, struct redisCommand *cmd);
613 static void resetClient(redisClient *c);
614 static void convertToRealHash(robj *o);
615 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
616 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
617 static void freePubsubPattern(void *p);
618 static int listMatchPubsubPattern(void *a, void *b);
619 static int compareStringObjects(robj *a, robj *b);
620 static void usage();
621
622 static void authCommand(redisClient *c);
623 static void pingCommand(redisClient *c);
624 static void echoCommand(redisClient *c);
625 static void setCommand(redisClient *c);
626 static void setnxCommand(redisClient *c);
627 static void getCommand(redisClient *c);
628 static void delCommand(redisClient *c);
629 static void existsCommand(redisClient *c);
630 static void incrCommand(redisClient *c);
631 static void decrCommand(redisClient *c);
632 static void incrbyCommand(redisClient *c);
633 static void decrbyCommand(redisClient *c);
634 static void selectCommand(redisClient *c);
635 static void randomkeyCommand(redisClient *c);
636 static void keysCommand(redisClient *c);
637 static void dbsizeCommand(redisClient *c);
638 static void lastsaveCommand(redisClient *c);
639 static void saveCommand(redisClient *c);
640 static void bgsaveCommand(redisClient *c);
641 static void bgrewriteaofCommand(redisClient *c);
642 static void shutdownCommand(redisClient *c);
643 static void moveCommand(redisClient *c);
644 static void renameCommand(redisClient *c);
645 static void renamenxCommand(redisClient *c);
646 static void lpushCommand(redisClient *c);
647 static void rpushCommand(redisClient *c);
648 static void lpopCommand(redisClient *c);
649 static void rpopCommand(redisClient *c);
650 static void llenCommand(redisClient *c);
651 static void lindexCommand(redisClient *c);
652 static void lrangeCommand(redisClient *c);
653 static void ltrimCommand(redisClient *c);
654 static void typeCommand(redisClient *c);
655 static void lsetCommand(redisClient *c);
656 static void saddCommand(redisClient *c);
657 static void sremCommand(redisClient *c);
658 static void smoveCommand(redisClient *c);
659 static void sismemberCommand(redisClient *c);
660 static void scardCommand(redisClient *c);
661 static void spopCommand(redisClient *c);
662 static void srandmemberCommand(redisClient *c);
663 static void sinterCommand(redisClient *c);
664 static void sinterstoreCommand(redisClient *c);
665 static void sunionCommand(redisClient *c);
666 static void sunionstoreCommand(redisClient *c);
667 static void sdiffCommand(redisClient *c);
668 static void sdiffstoreCommand(redisClient *c);
669 static void syncCommand(redisClient *c);
670 static void flushdbCommand(redisClient *c);
671 static void flushallCommand(redisClient *c);
672 static void sortCommand(redisClient *c);
673 static void lremCommand(redisClient *c);
674 static void rpoplpushcommand(redisClient *c);
675 static void infoCommand(redisClient *c);
676 static void mgetCommand(redisClient *c);
677 static void monitorCommand(redisClient *c);
678 static void expireCommand(redisClient *c);
679 static void expireatCommand(redisClient *c);
680 static void getsetCommand(redisClient *c);
681 static void ttlCommand(redisClient *c);
682 static void slaveofCommand(redisClient *c);
683 static void debugCommand(redisClient *c);
684 static void msetCommand(redisClient *c);
685 static void msetnxCommand(redisClient *c);
686 static void zaddCommand(redisClient *c);
687 static void zincrbyCommand(redisClient *c);
688 static void zrangeCommand(redisClient *c);
689 static void zrangebyscoreCommand(redisClient *c);
690 static void zcountCommand(redisClient *c);
691 static void zrevrangeCommand(redisClient *c);
692 static void zcardCommand(redisClient *c);
693 static void zremCommand(redisClient *c);
694 static void zscoreCommand(redisClient *c);
695 static void zremrangebyscoreCommand(redisClient *c);
696 static void multiCommand(redisClient *c);
697 static void execCommand(redisClient *c);
698 static void discardCommand(redisClient *c);
699 static void blpopCommand(redisClient *c);
700 static void brpopCommand(redisClient *c);
701 static void appendCommand(redisClient *c);
702 static void substrCommand(redisClient *c);
703 static void zrankCommand(redisClient *c);
704 static void zrevrankCommand(redisClient *c);
705 static void hsetCommand(redisClient *c);
706 static void hgetCommand(redisClient *c);
707 static void hdelCommand(redisClient *c);
708 static void hlenCommand(redisClient *c);
709 static void zremrangebyrankCommand(redisClient *c);
710 static void zunionCommand(redisClient *c);
711 static void zinterCommand(redisClient *c);
712 static void hkeysCommand(redisClient *c);
713 static void hvalsCommand(redisClient *c);
714 static void hgetallCommand(redisClient *c);
715 static void hexistsCommand(redisClient *c);
716 static void configCommand(redisClient *c);
717 static void hincrbyCommand(redisClient *c);
718 static void subscribeCommand(redisClient *c);
719 static void unsubscribeCommand(redisClient *c);
720 static void psubscribeCommand(redisClient *c);
721 static void punsubscribeCommand(redisClient *c);
722 static void publishCommand(redisClient *c);
723
724 /*================================= Globals ================================= */
725
726 /* Global vars */
727 static struct redisServer server; /* server global state */
/* Table of all the commands known to the server, one row per command.
 *
 * Each row lists, in order: the command name, the function implementing it,
 * an integer that is presumably the arity (negative values presumably mean
 * "at least N arguments" -- TODO confirm against struct redisCommand), the
 * REDIS_CMD_* flags, an optional function pointer (only ZUNION/ZINTER set
 * it, to zunionInterBlockClientOnSwappedKeys), and three integers that
 * presumably describe the first key argument, the last key argument and the
 * step between keys (TODO confirm against struct redisCommand).
 *
 * Note that "smembers" is served by sinterCommand.
 * The table is terminated by an all-NULL/zero row. */
static struct redisCommand cmdTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {NULL,NULL,0,0,NULL,0,0,0}
};
833
834 /*============================ Utility functions ============================ */
835
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading '^' negates the set, "a-z"
 *           is a range, a backslash escapes the next byte of the set
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero literal bytes and range bounds are compared
 * after tolower(). A backslash-escaped byte inside a [...] set is always
 * compared exactly.
 *
 * Returns 1 if the string matches the pattern, 0 otherwise.
 *
 * Every access to 'pattern' and 'string' is guarded by the matching length,
 * so neither buffer has to be NUL terminated, and a pattern element that
 * needs to consume one byte fails as soon as the string is exhausted
 * (previously the terminator was consumed and stringLen went negative, so
 * that e.g. "[^a]*" matched the empty string). */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars into the last one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing star swallows everything */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            if (stringLen == 0)
                return 0; /* a set always consumes one byte */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match: nothing left to compare against */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing stars may still match. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
958
/* Convenience wrapper around stringmatchlen() for NUL terminated pattern
 * and string: the lengths are taken with strlen() and the result of
 * stringmatchlen() is returned unchanged. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
962
963 static void redisLog(int level, const char *fmt, ...) {
964 va_list ap;
965 FILE *fp;
966
967 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
968 if (!fp) return;
969
970 va_start(ap, fmt);
971 if (level >= server.verbosity) {
972 char *c = ".-*#";
973 char buf[64];
974 time_t now;
975
976 now = time(NULL);
977 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
978 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
979 vfprintf(fp, fmt, ap);
980 fprintf(fp,"\n");
981 fflush(fp);
982 }
983 va_end(ap);
984
985 if (server.logfile) fclose(fp);
986 }
987
988 /*====================== Hash table type implementation ==================== */
989
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
993
/* Dict destructor callback releasing 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
999
1000 static void dictListDestructor(void *privdata, void *val)
1001 {
1002 DICT_NOTUSED(privdata);
1003 listRelease((list*)val);
1004 }
1005
1006 static int sdsDictKeyCompare(void *privdata, const void *key1,
1007 const void *key2)
1008 {
1009 int l1,l2;
1010 DICT_NOTUSED(privdata);
1011
1012 l1 = sdslen((sds)key1);
1013 l2 = sdslen((sds)key2);
1014 if (l1 != l2) return 0;
1015 return memcmp(key1, key2, l1) == 0;
1016 }
1017
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored because the values of
 * swapped out keys are set to NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1025
1026 static int dictObjKeyCompare(void *privdata, const void *key1,
1027 const void *key2)
1028 {
1029 const robj *o1 = key1, *o2 = key2;
1030 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1031 }
1032
1033 static unsigned int dictObjHash(const void *key) {
1034 const robj *o = key;
1035 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1036 }
1037
1038 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1039 const void *key2)
1040 {
1041 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1042 int cmp;
1043
1044 if (o1->encoding == REDIS_ENCODING_INT &&
1045 o2->encoding == REDIS_ENCODING_INT &&
1046 o1->ptr == o2->ptr) return 1;
1047
1048 o1 = getDecodedObject(o1);
1049 o2 = getDecodedObject(o2);
1050 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1051 decrRefCount(o1);
1052 decrRefCount(o2);
1053 return cmp;
1054 }
1055
1056 static unsigned int dictEncObjHash(const void *key) {
1057 robj *o = (robj*) key;
1058
1059 if (o->encoding == REDIS_ENCODING_RAW) {
1060 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1061 } else {
1062 if (o->encoding == REDIS_ENCODING_INT) {
1063 char buf[32];
1064 int len;
1065
1066 len = snprintf(buf,32,"%ld",(long)o->ptr);
1067 return dictGenHashFunction((unsigned char*)buf, len);
1068 } else {
1069 unsigned int hash;
1070
1071 o = getDecodedObject(o);
1072 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1073 decrRefCount(o);
1074 return hash;
1075 }
1076 }
1077 }
1078
/* Sets type and expires.
 * Keys are redis objects that may be encoded: they are hashed and compared
 * with the encoding-aware callbacks and released through
 * dictRedisObjectDestructor (decrRefCount). Values are not freed by the
 * dict. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1088
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are handled as in setDictType; values are released with zfree()
 * through dictVanillaFree. */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1098
/* Db->dict
 * Keys are redis objects hashed and compared on their SDS string (no
 * decoding is performed); both keys and values are redis objects released
 * through dictRedisObjectDestructor, which tolerates NULL values. */
static dictType dbDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1108
/* Db->expires
 * Same key handling as dbDictType, but values have no destructor:
 * serverCron() reads them back with a plain cast to time_t. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1118
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both fields and values are redis objects that may be encoded; both are
 * released through dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1128
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * The lists stored as values are released with listRelease() through
 * dictListDestructor. */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1140
1141 static void version();
1142
1143 /* ========================= Random utility functions ======================= */
1144
1145 /* Redis generally does not try to recover from out of memory conditions
1146 * when allocating objects or strings, it is not clear if it will be possible
1147 * to report this condition to the client since the networking layer itself
1148 * is based on heap allocation for send buffers, so we simply abort.
1149 * At least the code will be simpler to read... */
1150 static void oom(const char *msg) {
1151 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1152 sleep(1);
1153 abort();
1154 }
1155
1156 /* ====================== Redis server networking stuff ===================== */
1157 static void closeTimedoutClients(void) {
1158 redisClient *c;
1159 listNode *ln;
1160 time_t now = time(NULL);
1161 listIter li;
1162
1163 listRewind(server.clients,&li);
1164 while ((ln = listNext(&li)) != NULL) {
1165 c = listNodeValue(ln);
1166 if (server.maxidletime &&
1167 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1168 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1169 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1170 listLength(c->pubsub_patterns) == 0 &&
1171 (now - c->lastinteraction > server.maxidletime))
1172 {
1173 redisLog(REDIS_VERBOSE,"Closing idle client");
1174 freeClient(c);
1175 } else if (c->flags & REDIS_BLOCKED) {
1176 if (c->blockingto != 0 && c->blockingto < now) {
1177 addReply(c,shared.nullmultibulk);
1178 unblockClientWaitingData(c);
1179 }
1180 }
1181 }
1182 }
1183
1184 static int htNeedsResize(dict *dict) {
1185 long long size, used;
1186
1187 size = dictSlots(dict);
1188 used = dictSize(dict);
1189 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1190 (used*100/size < REDIS_HT_MINFILL));
1191 }
1192
1193 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1194 * we resize the hash table to save memory */
1195 static void tryResizeHashTables(void) {
1196 int j;
1197
1198 for (j = 0; j < server.dbnum; j++) {
1199 if (htNeedsResize(server.db[j].dict)) {
1200 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1201 dictResize(server.db[j].dict);
1202 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1203 }
1204 if (htNeedsResize(server.db[j].expires))
1205 dictResize(server.db[j].expires);
1206 }
1207 }
1208
1209 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1210 void backgroundSaveDoneHandler(int statloc) {
1211 int exitcode = WEXITSTATUS(statloc);
1212 int bysignal = WIFSIGNALED(statloc);
1213
1214 if (!bysignal && exitcode == 0) {
1215 redisLog(REDIS_NOTICE,
1216 "Background saving terminated with success");
1217 server.dirty = 0;
1218 server.lastsave = time(NULL);
1219 } else if (!bysignal && exitcode != 0) {
1220 redisLog(REDIS_WARNING, "Background saving error");
1221 } else {
1222 redisLog(REDIS_WARNING,
1223 "Background saving terminated by signal %d", WTERMSIG(statloc));
1224 rdbRemoveTempFile(server.bgsavechildpid);
1225 }
1226 server.bgsavechildpid = -1;
1227 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1228 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1229 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1230 }
1231
1232 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1233 * Handle this. */
1234 void backgroundRewriteDoneHandler(int statloc) {
1235 int exitcode = WEXITSTATUS(statloc);
1236 int bysignal = WIFSIGNALED(statloc);
1237
1238 if (!bysignal && exitcode == 0) {
1239 int fd;
1240 char tmpfile[256];
1241
1242 redisLog(REDIS_NOTICE,
1243 "Background append only file rewriting terminated with success");
1244 /* Now it's time to flush the differences accumulated by the parent */
1245 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1246 fd = open(tmpfile,O_WRONLY|O_APPEND);
1247 if (fd == -1) {
1248 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1249 goto cleanup;
1250 }
1251 /* Flush our data... */
1252 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1253 (signed) sdslen(server.bgrewritebuf)) {
1254 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1255 close(fd);
1256 goto cleanup;
1257 }
1258 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1259 /* Now our work is to rename the temp file into the stable file. And
1260 * switch the file descriptor used by the server for append only. */
1261 if (rename(tmpfile,server.appendfilename) == -1) {
1262 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1263 close(fd);
1264 goto cleanup;
1265 }
1266 /* Mission completed... almost */
1267 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1268 if (server.appendfd != -1) {
1269 /* If append only is actually enabled... */
1270 close(server.appendfd);
1271 server.appendfd = fd;
1272 fsync(fd);
1273 server.appendseldb = -1; /* Make sure it will issue SELECT */
1274 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1275 } else {
1276 /* If append only is disabled we just generate a dump in this
1277 * format. Why not? */
1278 close(fd);
1279 }
1280 } else if (!bysignal && exitcode != 0) {
1281 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1282 } else {
1283 redisLog(REDIS_WARNING,
1284 "Background append only file rewriting terminated by signal %d",
1285 WTERMSIG(statloc));
1286 }
1287 cleanup:
1288 sdsfree(server.bgrewritebuf);
1289 server.bgrewritebuf = sdsempty();
1290 aofRemoveTempFile(server.bgrewritechildpid);
1291 server.bgrewritechildpid = -1;
1292 }
1293
1294 /* This function is called once a background process of some kind terminates,
1295 * as we want to avoid resizing the hash tables when there is a child in order
1296 * to play well with copy-on-write (otherwise when a resize happens lots of
1297 * memory pages are copied). The goal of this function is to update the ability
1298 * for dict.c to resize the hash tables accordingly to the fact we have o not
1299 * running childs. */
1300 static void updateDictResizePolicy(void) {
1301 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1302 dictEnableResize();
1303 else
1304 dictDisableResize();
1305 }
1306
/* Periodic housekeeping, registered as a time event in initServer().
 *
 * On every call it refreshes the cached unix time, logs statistics now and
 * then, shrinks sparse hash tables, closes timed out clients, reaps a
 * terminated BGSAVE / BGREWRITEAOF child or starts a BGSAVE if one of the
 * save points was reached, expires a sample of volatile keys, swaps objects
 * out when VM is over its memory limit, and tries to connect to the master
 * when replication requires it.
 *
 * The three arguments are unused. The return value is 100; presumably the
 * number of milliseconds before the event loop calls the function again
 * (TODO confirm against the ae library). */
static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    int j, loops = server.cronloops++;
    REDIS_NOTUSED(eventLoop);
    REDIS_NOTUSED(id);
    REDIS_NOTUSED(clientData);

    /* We take a cached value of the unix time in the global state because
     * with virtual memory and aging there is to store the current time
     * in objects at every object access, and accuracy is not needed.
     * To access a global var is faster than calling time(NULL) */
    server.unixtime = time(NULL);

    /* Show some info about non-empty databases (once every 50 calls) */
    for (j = 0; j < server.dbnum; j++) {
        long long size, used, vkeys;

        size = dictSlots(server.db[j].dict);
        used = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (!(loops % 50) && (used || vkeys)) {
            redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
            /* dictPrintStats(server.dict); */
        }
    }

    /* We don't want to resize the hash tables while a background saving
     * is in progress: the saving child is created using fork() that is
     * implemented with a copy-on-write semantic in most modern systems, so
     * if we resize the HT while there is the saving child at work actually
     * a lot of memory movements in the parent will cause a lot of pages
     * copied. */
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
        !(loops % 10))
    {
        tryResizeHashTables();
    }

    /* Show information about connected clients */
    if (!(loops % 50)) {
        redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
            listLength(server.clients)-listLength(server.slaves),
            listLength(server.slaves),
            zmalloc_used_memory());
    }

    /* Close connections of timedout clients. The scan also runs at every
     * call while there are clients blocked in BLPOP, since the same function
     * handles their timeouts. */
    if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
        closeTimedoutClients();

    /* Check if a background saving or AOF rewrite in progress terminated */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
        int statloc;
        pid_t pid;

        /* NOTE(review): wait3() returning -1 (error) also enters this
         * branch, with statloc not written by the call; any pid different
         * from bgsavechildpid is treated as the rewrite child -- confirm
         * this is acceptable. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
            } else {
                backgroundRewriteDoneHandler(statloc);
            }
            updateDictResizePolicy();
        }
    } else {
        /* If there is not a background saving in progress check if
         * we have to save now */
        time_t now = time(NULL);
        for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            if (server.dirty >= sp->changes &&
                now-server.lastsave > sp->seconds) {
                redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, sp->seconds);
                rdbSaveBackground(server.dbfilename);
                break;
            }
        }
    }

    /* Try to expire a few timed out keys. The algorithm used is adaptive and
     * will use few CPU cycles if there are few expiring keys, otherwise
     * it will get more aggressive to avoid that too much memory is used by
     * keys that can be removed from the keyspace. */
    for (j = 0; j < server.dbnum; j++) {
        int expired;
        redisDb *db = server.db+j;

        /* Continue to expire if at the end of the cycle more than 25%
         * of the keys were expired. */
        do {
            long num = dictSize(db->expires);
            time_t now = time(NULL);

            expired = 0;
            if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
                num = REDIS_EXPIRELOOKUPS_PER_CRON;
            while (num--) {
                dictEntry *de;
                time_t t;

                if ((de = dictGetRandomKey(db->expires)) == NULL) break;
                /* The expire time is stored directly in the entry value. */
                t = (time_t) dictGetEntryVal(de);
                if (now > t) {
                    deleteKey(db,dictGetEntryKey(de));
                    expired++;
                    server.stat_expiredkeys++;
                }
            }
        } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
    }

    /* Swap a few keys on disk if we are over the memory limit and VM
     * is enabled. Try to free objects from the free list first. */
    if (vmCanSwapOut()) {
        while (server.vm_enabled && zmalloc_used_memory() >
                server.vm_max_memory)
        {
            int retval;

            if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
            retval = (server.vm_max_threads == 0) ?
                        vmSwapOneObjectBlocking() :
                        vmSwapOneObjectThreaded();
            if (retval == REDIS_ERR && !(loops % 300) &&
                zmalloc_used_memory() >
                (server.vm_max_memory+server.vm_max_memory/10))
            {
                redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
            }
            /* Note that when using threaded I/O we free just one object,
             * because anyway when the I/O thread in charge to swap this
             * object out will finish, the handler of completed jobs
             * will try to swap more objects if we are still out of memory. */
            if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
        }
    }

    /* Check if we should connect to a MASTER */
    if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
        redisLog(REDIS_NOTICE,"Connecting to MASTER...");
        if (syncWithMaster() == REDIS_OK) {
            redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
        }
    }
    return 100;
}
1453
1454 /* This function gets called every time Redis is entering the
1455 * main loop of the event driven library, that is, before to sleep
1456 * for ready file descriptors. */
1457 static void beforeSleep(struct aeEventLoop *eventLoop) {
1458 REDIS_NOTUSED(eventLoop);
1459
1460 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1461 listIter li;
1462 listNode *ln;
1463
1464 listRewind(server.io_ready_clients,&li);
1465 while((ln = listNext(&li))) {
1466 redisClient *c = ln->value;
1467 struct redisCommand *cmd;
1468
1469 /* Resume the client. */
1470 listDelNode(server.io_ready_clients,ln);
1471 c->flags &= (~REDIS_IO_WAIT);
1472 server.vm_blocked_clients--;
1473 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1474 readQueryFromClient, c);
1475 cmd = lookupCommand(c->argv[0]->ptr);
1476 assert(cmd != NULL);
1477 call(c,cmd);
1478 resetClient(c);
1479 /* There may be more data to process in the input buffer. */
1480 if (c->querybuf && sdslen(c->querybuf) > 0)
1481 processInputBuffer(c);
1482 }
1483 }
1484 }
1485
/* Create the string objects stored in the global 'shared' structure:
 * protocol fragments and common replies (status, error, integer, null and
 * empty replies), the "select N" commands for the first ten databases, and
 * the bulk headers used by the Pub/Sub messages.
 *
 * The length passed to each createStringObject() call is the byte length
 * of the literal next to it, so the two must be kept in sync by hand. */
static void createSharedObjects(void) {
    shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
    shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
    shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
    shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
    shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
    shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
    shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
    shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
    shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
    shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
    shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
    /* Error replies */
    shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
        "-ERR Operation against a key holding the wrong kind of value\r\n"));
    shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
        "-ERR no such key\r\n"));
    shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
        "-ERR syntax error\r\n"));
    shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
        "-ERR source and destination objects are the same\r\n"));
    shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
        "-ERR index out of range\r\n"));
    /* Single character fragments */
    shared.space = createObject(REDIS_STRING,sdsnew(" "));
    shared.colon = createObject(REDIS_STRING,sdsnew(":"));
    shared.plus = createObject(REDIS_STRING,sdsnew("+"));
    /* SELECT commands for databases 0-9 */
    shared.select0 = createStringObject("select 0\r\n",10);
    shared.select1 = createStringObject("select 1\r\n",10);
    shared.select2 = createStringObject("select 2\r\n",10);
    shared.select3 = createStringObject("select 3\r\n",10);
    shared.select4 = createStringObject("select 4\r\n",10);
    shared.select5 = createStringObject("select 5\r\n",10);
    shared.select6 = createStringObject("select 6\r\n",10);
    shared.select7 = createStringObject("select 7\r\n",10);
    shared.select8 = createStringObject("select 8\r\n",10);
    shared.select9 = createStringObject("select 9\r\n",10);
    /* Pub/Sub bulk headers and the three elements multi bulk prefix */
    shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
    shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
    shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
    shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
    shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
    shared.mbulk3 = createStringObject("*3\r\n",4);
}
1528
1529 static void appendServerSaveParams(time_t seconds, int changes) {
1530 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1531 server.saveparams[server.saveparamslen].seconds = seconds;
1532 server.saveparams[server.saveparamslen].changes = changes;
1533 server.saveparamslen++;
1534 }
1535
1536 static void resetServerSaveParams() {
1537 zfree(server.saveparams);
1538 server.saveparams = NULL;
1539 server.saveparamslen = 0;
1540 }
1541
/* Fill the global server structure with the built-in default
 * configuration: networking, logging, persistence (RDB save points and
 * append only file), virtual memory, hash encoding limits and replication.
 * It also initializes the R_Zero / R_PosInf / R_NegInf / R_Nan double
 * constants.
 *
 * Presumably called once at startup, before the configuration file is
 * parsed and before initServer() -- TODO confirm against main(). */
static void initServerConfig() {
    server.dbnum = REDIS_DEFAULT_DBNUM;
    server.port = REDIS_SERVERPORT;
    server.verbosity = REDIS_VERBOSE;
    server.maxidletime = REDIS_MAXIDLETIME;
    server.saveparams = NULL;
    server.logfile = NULL; /* NULL = log on standard output */
    server.bindaddr = NULL;
    server.glueoutputbuf = 1;
    server.daemonize = 0;
    server.appendonly = 0;
    server.appendfsync = APPENDFSYNC_ALWAYS;
    server.lastfsync = time(NULL);
    server.appendfd = -1;
    server.appendseldb = -1; /* Make sure the first time will not match */
    server.pidfile = zstrdup("/var/run/redis.pid");
    server.dbfilename = zstrdup("dump.rdb");
    server.appendfilename = zstrdup("appendonly.aof");
    server.requirepass = NULL;
    server.shareobjects = 0;
    server.rdbcompression = 1;
    server.maxclients = 0;
    server.blpop_blocked_clients = 0;
    server.maxmemory = 0;
    server.vm_enabled = 0;
    server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
    server.vm_page_size = 256;          /* 256 bytes per page */
    server.vm_pages = 1024*1024*100;    /* 104 millions of pages */
    server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
    server.vm_max_threads = 4;
    server.vm_blocked_clients = 0;
    server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
    server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;

    /* saveparams was set to NULL above, so this only resets the length
     * before the default save points are appended. */
    resetServerSaveParams();

    appendServerSaveParams(60*60,1);  /* save after 1 hour and 1 change */
    appendServerSaveParams(300,100);  /* save after 5 minutes and 100 changes */
    appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
    /* Replication related */
    server.isslave = 0;
    server.masterauth = NULL;
    server.masterhost = NULL;
    server.masterport = 6379;
    server.master = NULL;
    server.replstate = REDIS_REPL_NONE;

    /* Double constants initialization. The divisions go through the R_Zero
     * variable instead of a literal zero. */
    R_Zero = 0.0;
    R_PosInf = 1.0/R_Zero;
    R_NegInf = -1.0/R_Zero;
    R_Nan = R_Zero/R_Zero;
}
1595
1596 static void initServer() {
1597 int j;
1598
1599 signal(SIGHUP, SIG_IGN);
1600 signal(SIGPIPE, SIG_IGN);
1601 setupSigSegvAction();
1602
1603 server.devnull = fopen("/dev/null","w");
1604 if (server.devnull == NULL) {
1605 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1606 exit(1);
1607 }
1608 server.clients = listCreate();
1609 server.slaves = listCreate();
1610 server.monitors = listCreate();
1611 server.objfreelist = listCreate();
1612 createSharedObjects();
1613 server.el = aeCreateEventLoop();
1614 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1615 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1616 if (server.fd == -1) {
1617 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1618 exit(1);
1619 }
1620 for (j = 0; j < server.dbnum; j++) {
1621 server.db[j].dict = dictCreate(&dbDictType,NULL);
1622 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1623 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1624 if (server.vm_enabled)
1625 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1626 server.db[j].id = j;
1627 }
1628 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1629 server.pubsub_patterns = listCreate();
1630 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1631 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1632 server.cronloops = 0;
1633 server.bgsavechildpid = -1;
1634 server.bgrewritechildpid = -1;
1635 server.bgrewritebuf = sdsempty();
1636 server.lastsave = time(NULL);
1637 server.dirty = 0;
1638 server.stat_numcommands = 0;
1639 server.stat_numconnections = 0;
1640 server.stat_expiredkeys = 0;
1641 server.stat_starttime = time(NULL);
1642 server.unixtime = time(NULL);
1643 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1644 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1645 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1646
1647 if (server.appendonly) {
1648 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1649 if (server.appendfd == -1) {
1650 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1651 strerror(errno));
1652 exit(1);
1653 }
1654 }
1655
1656 if (server.vm_enabled) vmInit();
1657 }
1658
1659 /* Empty the whole database */
1660 static long long emptyDb() {
1661 int j;
1662 long long removed = 0;
1663
1664 for (j = 0; j < server.dbnum; j++) {
1665 removed += dictSize(server.db[j].dict);
1666 dictEmpty(server.db[j].dict);
1667 dictEmpty(server.db[j].expires);
1668 }
1669 return removed;
1670 }
1671
/* Translate a yes/no string into an integer: "yes" gives 1 and "no" gives 0
 * (both compared ignoring case), any other string gives -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1677
1678 /* I agree, this is a very rudimental way to load a configuration...
1679 will improve later if the config gets more complex */
1680 static void loadServerConfig(char *filename) {
1681 FILE *fp;
1682 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1683 int linenum = 0;
1684 sds line = NULL;
1685 char *errormsg = "Fatal error, can't open config file '%s'";
1686 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1687 sprintf(errorbuf, errormsg, filename);
1688
1689 if (filename[0] == '-' && filename[1] == '\0')
1690 fp = stdin;
1691 else {
1692 if ((fp = fopen(filename,"r")) == NULL) {
1693 redisLog(REDIS_WARNING, errorbuf);
1694 exit(1);
1695 }
1696 }
1697
1698 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1699 sds *argv;
1700 int argc, j;
1701
1702 linenum++;
1703 line = sdsnew(buf);
1704 line = sdstrim(line," \t\r\n");
1705
1706 /* Skip comments and blank lines*/
1707 if (line[0] == '#' || line[0] == '\0') {
1708 sdsfree(line);
1709 continue;
1710 }
1711
1712 /* Split into arguments */
1713 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1714 sdstolower(argv[0]);
1715
1716 /* Execute config directives */
1717 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1718 server.maxidletime = atoi(argv[1]);
1719 if (server.maxidletime < 0) {
1720 err = "Invalid timeout value"; goto loaderr;
1721 }
1722 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1723 server.port = atoi(argv[1]);
1724 if (server.port < 1 || server.port > 65535) {
1725 err = "Invalid port"; goto loaderr;
1726 }
1727 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1728 server.bindaddr = zstrdup(argv[1]);
1729 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1730 int seconds = atoi(argv[1]);
1731 int changes = atoi(argv[2]);
1732 if (seconds < 1 || changes < 0) {
1733 err = "Invalid save parameters"; goto loaderr;
1734 }
1735 appendServerSaveParams(seconds,changes);
1736 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1737 if (chdir(argv[1]) == -1) {
1738 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1739 argv[1], strerror(errno));
1740 exit(1);
1741 }
1742 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1743 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1744 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1745 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1746 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1747 else {
1748 err = "Invalid log level. Must be one of debug, notice, warning";
1749 goto loaderr;
1750 }
1751 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1752 FILE *logfp;
1753
1754 server.logfile = zstrdup(argv[1]);
1755 if (!strcasecmp(server.logfile,"stdout")) {
1756 zfree(server.logfile);
1757 server.logfile = NULL;
1758 }
1759 if (server.logfile) {
1760 /* Test if we are able to open the file. The server will not
1761 * be able to abort just for this problem later... */
1762 logfp = fopen(server.logfile,"a");
1763 if (logfp == NULL) {
1764 err = sdscatprintf(sdsempty(),
1765 "Can't open the log file: %s", strerror(errno));
1766 goto loaderr;
1767 }
1768 fclose(logfp);
1769 }
1770 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1771 server.dbnum = atoi(argv[1]);
1772 if (server.dbnum < 1) {
1773 err = "Invalid number of databases"; goto loaderr;
1774 }
1775 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1776 loadServerConfig(argv[1]);
1777 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1778 server.maxclients = atoi(argv[1]);
1779 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1780 server.maxmemory = strtoll(argv[1], NULL, 10);
1781 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1782 server.masterhost = sdsnew(argv[1]);
1783 server.masterport = atoi(argv[2]);
1784 server.replstate = REDIS_REPL_CONNECT;
1785 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1786 server.masterauth = zstrdup(argv[1]);
1787 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1788 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1789 err = "argument must be 'yes' or 'no'"; goto loaderr;
1790 }
1791 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1792 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1793 err = "argument must be 'yes' or 'no'"; goto loaderr;
1794 }
1795 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1796 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1797 err = "argument must be 'yes' or 'no'"; goto loaderr;
1798 }
1799 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1800 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1801 err = "argument must be 'yes' or 'no'"; goto loaderr;
1802 }
1803 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1804 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1805 err = "argument must be 'yes' or 'no'"; goto loaderr;
1806 }
1807 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1808 if (!strcasecmp(argv[1],"no")) {
1809 server.appendfsync = APPENDFSYNC_NO;
1810 } else if (!strcasecmp(argv[1],"always")) {
1811 server.appendfsync = APPENDFSYNC_ALWAYS;
1812 } else if (!strcasecmp(argv[1],"everysec")) {
1813 server.appendfsync = APPENDFSYNC_EVERYSEC;
1814 } else {
1815 err = "argument must be 'no', 'always' or 'everysec'";
1816 goto loaderr;
1817 }
1818 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1819 server.requirepass = zstrdup(argv[1]);
1820 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1821 zfree(server.pidfile);
1822 server.pidfile = zstrdup(argv[1]);
1823 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1824 zfree(server.dbfilename);
1825 server.dbfilename = zstrdup(argv[1]);
1826 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1827 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1828 err = "argument must be 'yes' or 'no'"; goto loaderr;
1829 }
1830 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1831 zfree(server.vm_swap_file);
1832 server.vm_swap_file = zstrdup(argv[1]);
1833 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1834 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1835 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1836 server.vm_page_size = strtoll(argv[1], NULL, 10);
1837 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1838 server.vm_pages = strtoll(argv[1], NULL, 10);
1839 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1840 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1841 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1842 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1843 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1844 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1845 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1846 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1847 } else {
1848 err = "Bad directive or wrong number of arguments"; goto loaderr;
1849 }
1850 for (j = 0; j < argc; j++)
1851 sdsfree(argv[j]);
1852 zfree(argv);
1853 sdsfree(line);
1854 }
1855 if (fp != stdin) fclose(fp);
1856 return;
1857
1858 loaderr:
1859 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1860 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1861 fprintf(stderr, ">>> '%s'\n", line);
1862 fprintf(stderr, "%s\n", err);
1863 exit(1);
1864 }
1865
1866 static void freeClientArgv(redisClient *c) {
1867 int j;
1868
1869 for (j = 0; j < c->argc; j++)
1870 decrRefCount(c->argv[j]);
1871 for (j = 0; j < c->mbargc; j++)
1872 decrRefCount(c->mbargv[j]);
1873 c->argc = 0;
1874 c->mbargc = 0;
1875 }
1876
1877 static void freeClient(redisClient *c) {
1878 listNode *ln;
1879
1880 /* Note that if the client we are freeing is blocked into a blocking
1881 * call, we have to set querybuf to NULL *before* to call
1882 * unblockClientWaitingData() to avoid processInputBuffer() will get
1883 * called. Also it is important to remove the file events after
1884 * this, because this call adds the READABLE event. */
1885 sdsfree(c->querybuf);
1886 c->querybuf = NULL;
1887 if (c->flags & REDIS_BLOCKED)
1888 unblockClientWaitingData(c);
1889
1890 /* Unsubscribe from all the pubsub channels */
1891 pubsubUnsubscribeAllChannels(c,0);
1892 pubsubUnsubscribeAllPatterns(c,0);
1893 dictRelease(c->pubsub_channels);
1894 listRelease(c->pubsub_patterns);
1895 /* Obvious cleanup */
1896 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1897 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1898 listRelease(c->reply);
1899 freeClientArgv(c);
1900 close(c->fd);
1901 /* Remove from the list of clients */
1902 ln = listSearchKey(server.clients,c);
1903 redisAssert(ln != NULL);
1904 listDelNode(server.clients,ln);
1905 /* Remove from the list of clients waiting for swapped keys */
1906 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1907 ln = listSearchKey(server.io_ready_clients,c);
1908 if (ln) {
1909 listDelNode(server.io_ready_clients,ln);
1910 server.vm_blocked_clients--;
1911 }
1912 }
1913 while (server.vm_enabled && listLength(c->io_keys)) {
1914 ln = listFirst(c->io_keys);
1915 dontWaitForSwappedKey(c,ln->value);
1916 }
1917 listRelease(c->io_keys);
1918 /* Master/slave cleanup */
1919 if (c->flags & REDIS_SLAVE) {
1920 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1921 close(c->repldbfd);
1922 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1923 ln = listSearchKey(l,c);
1924 redisAssert(ln != NULL);
1925 listDelNode(l,ln);
1926 }
1927 if (c->flags & REDIS_MASTER) {
1928 server.master = NULL;
1929 server.replstate = REDIS_REPL_CONNECT;
1930 }
1931 /* Release memory */
1932 zfree(c->argv);
1933 zfree(c->mbargv);
1934 freeClientMultiState(c);
1935 zfree(c);
1936 }
1937
1938 #define GLUEREPLY_UP_TO (1024)
1939 static void glueReplyBuffersIfNeeded(redisClient *c) {
1940 int copylen = 0;
1941 char buf[GLUEREPLY_UP_TO];
1942 listNode *ln;
1943 listIter li;
1944 robj *o;
1945
1946 listRewind(c->reply,&li);
1947 while((ln = listNext(&li))) {
1948 int objlen;
1949
1950 o = ln->value;
1951 objlen = sdslen(o->ptr);
1952 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1953 memcpy(buf+copylen,o->ptr,objlen);
1954 copylen += objlen;
1955 listDelNode(c->reply,ln);
1956 } else {
1957 if (copylen == 0) return;
1958 break;
1959 }
1960 }
1961 /* Now the output buffer is empty, add the new single element */
1962 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1963 listAddNodeHead(c->reply,o);
1964 }
1965
1966 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1967 redisClient *c = privdata;
1968 int nwritten = 0, totwritten = 0, objlen;
1969 robj *o;
1970 REDIS_NOTUSED(el);
1971 REDIS_NOTUSED(mask);
1972
1973 /* Use writev() if we have enough buffers to send */
1974 if (!server.glueoutputbuf &&
1975 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1976 !(c->flags & REDIS_MASTER))
1977 {
1978 sendReplyToClientWritev(el, fd, privdata, mask);
1979 return;
1980 }
1981
1982 while(listLength(c->reply)) {
1983 if (server.glueoutputbuf && listLength(c->reply) > 1)
1984 glueReplyBuffersIfNeeded(c);
1985
1986 o = listNodeValue(listFirst(c->reply));
1987 objlen = sdslen(o->ptr);
1988
1989 if (objlen == 0) {
1990 listDelNode(c->reply,listFirst(c->reply));
1991 continue;
1992 }
1993
1994 if (c->flags & REDIS_MASTER) {
1995 /* Don't reply to a master */
1996 nwritten = objlen - c->sentlen;
1997 } else {
1998 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1999 if (nwritten <= 0) break;
2000 }
2001 c->sentlen += nwritten;
2002 totwritten += nwritten;
2003 /* If we fully sent the object on head go to the next one */
2004 if (c->sentlen == objlen) {
2005 listDelNode(c->reply,listFirst(c->reply));
2006 c->sentlen = 0;
2007 }
2008 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2009 * bytes, in a single threaded server it's a good idea to serve
2010 * other clients as well, even if a very large request comes from
2011 * super fast link that is always able to accept data (in real world
2012 * scenario think about 'KEYS *' against the loopback interfae) */
2013 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2014 }
2015 if (nwritten == -1) {
2016 if (errno == EAGAIN) {
2017 nwritten = 0;
2018 } else {
2019 redisLog(REDIS_VERBOSE,
2020 "Error writing to client: %s", strerror(errno));
2021 freeClient(c);
2022 return;
2023 }
2024 }
2025 if (totwritten > 0) c->lastinteraction = time(NULL);
2026 if (listLength(c->reply) == 0) {
2027 c->sentlen = 0;
2028 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2029 }
2030 }
2031
2032 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2033 {
2034 redisClient *c = privdata;
2035 int nwritten = 0, totwritten = 0, objlen, willwrite;
2036 robj *o;
2037 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2038 int offset, ion = 0;
2039 REDIS_NOTUSED(el);
2040 REDIS_NOTUSED(mask);
2041
2042 listNode *node;
2043 while (listLength(c->reply)) {
2044 offset = c->sentlen;
2045 ion = 0;
2046 willwrite = 0;
2047
2048 /* fill-in the iov[] array */
2049 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2050 o = listNodeValue(node);
2051 objlen = sdslen(o->ptr);
2052
2053 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2054 break;
2055
2056 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2057 break; /* no more iovecs */
2058
2059 iov[ion].iov_base = ((char*)o->ptr) + offset;
2060 iov[ion].iov_len = objlen - offset;
2061 willwrite += objlen - offset;
2062 offset = 0; /* just for the first item */
2063 ion++;
2064 }
2065
2066 if(willwrite == 0)
2067 break;
2068
2069 /* write all collected blocks at once */
2070 if((nwritten = writev(fd, iov, ion)) < 0) {
2071 if (errno != EAGAIN) {
2072 redisLog(REDIS_VERBOSE,
2073 "Error writing to client: %s", strerror(errno));
2074 freeClient(c);
2075 return;
2076 }
2077 break;
2078 }
2079
2080 totwritten += nwritten;
2081 offset = c->sentlen;
2082
2083 /* remove written robjs from c->reply */
2084 while (nwritten && listLength(c->reply)) {
2085 o = listNodeValue(listFirst(c->reply));
2086 objlen = sdslen(o->ptr);
2087
2088 if(nwritten >= objlen - offset) {
2089 listDelNode(c->reply, listFirst(c->reply));
2090 nwritten -= objlen - offset;
2091 c->sentlen = 0;
2092 } else {
2093 /* partial write */
2094 c->sentlen += nwritten;
2095 break;
2096 }
2097 offset = 0;
2098 }
2099 }
2100
2101 if (totwritten > 0)
2102 c->lastinteraction = time(NULL);
2103
2104 if (listLength(c->reply) == 0) {
2105 c->sentlen = 0;
2106 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2107 }
2108 }
2109
2110 static struct redisCommand *lookupCommand(char *name) {
2111 int j = 0;
2112 while(cmdTable[j].name != NULL) {
2113 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2114 j++;
2115 }
2116 return NULL;
2117 }
2118
2119 /* resetClient prepare the client to process the next command */
2120 static void resetClient(redisClient *c) {
2121 freeClientArgv(c);
2122 c->bulklen = -1;
2123 c->multibulk = 0;
2124 }
2125
2126 /* Call() is the core of Redis execution of a command */
2127 static void call(redisClient *c, struct redisCommand *cmd) {
2128 long long dirty;
2129
2130 dirty = server.dirty;
2131 cmd->proc(c);
2132 dirty = server.dirty-dirty;
2133
2134 if (server.appendonly && dirty)
2135 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2136 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2137 listLength(server.slaves))
2138 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2139 if (listLength(server.monitors))
2140 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2141 server.stat_numcommands++;
2142 }
2143
2144 /* If this function gets called we already read a whole
2145 * command, argments are in the client argv/argc fields.
2146 * processCommand() execute the command or prepare the
2147 * server for a bulk read from the client.
2148 *
2149 * If 1 is returned the client is still alive and valid and
2150 * and other operations can be performed by the caller. Otherwise
2151 * if 0 is returned the client was destroied (i.e. after QUIT). */
2152 static int processCommand(redisClient *c) {
2153 struct redisCommand *cmd;
2154
2155 /* Free some memory if needed (maxmemory setting) */
2156 if (server.maxmemory) freeMemoryIfNeeded();
2157
2158 /* Handle the multi bulk command type. This is an alternative protocol
2159 * supported by Redis in order to receive commands that are composed of
2160 * multiple binary-safe "bulk" arguments. The latency of processing is
2161 * a bit higher but this allows things like multi-sets, so if this
2162 * protocol is used only for MSET and similar commands this is a big win. */
2163 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2164 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2165 if (c->multibulk <= 0) {
2166 resetClient(c);
2167 return 1;
2168 } else {
2169 decrRefCount(c->argv[c->argc-1]);
2170 c->argc--;
2171 return 1;
2172 }
2173 } else if (c->multibulk) {
2174 if (c->bulklen == -1) {
2175 if (((char*)c->argv[0]->ptr)[0] != '$') {
2176 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2177 resetClient(c);
2178 return 1;
2179 } else {
2180 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2181 decrRefCount(c->argv[0]);
2182 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2183 c->argc--;
2184 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2185 resetClient(c);
2186 return 1;
2187 }
2188 c->argc--;
2189 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2190 return 1;
2191 }
2192 } else {
2193 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2194 c->mbargv[c->mbargc] = c->argv[0];
2195 c->mbargc++;
2196 c->argc--;
2197 c->multibulk--;
2198 if (c->multibulk == 0) {
2199 robj **auxargv;
2200 int auxargc;
2201
2202 /* Here we need to swap the multi-bulk argc/argv with the
2203 * normal argc/argv of the client structure. */
2204 auxargv = c->argv;
2205 c->argv = c->mbargv;
2206 c->mbargv = auxargv;
2207
2208 auxargc = c->argc;
2209 c->argc = c->mbargc;
2210 c->mbargc = auxargc;
2211
2212 /* We need to set bulklen to something different than -1
2213 * in order for the code below to process the command without
2214 * to try to read the last argument of a bulk command as
2215 * a special argument. */
2216 c->bulklen = 0;
2217 /* continue below and process the command */
2218 } else {
2219 c->bulklen = -1;
2220 return 1;
2221 }
2222 }
2223 }
2224 /* -- end of multi bulk commands processing -- */
2225
2226 /* The QUIT command is handled as a special case. Normal command
2227 * procs are unable to close the client connection safely */
2228 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2229 freeClient(c);
2230 return 0;
2231 }
2232
2233 /* Now lookup the command and check ASAP about trivial error conditions
2234 * such wrong arity, bad command name and so forth. */
2235 cmd = lookupCommand(c->argv[0]->ptr);
2236 if (!cmd) {
2237 addReplySds(c,
2238 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2239 (char*)c->argv[0]->ptr));
2240 resetClient(c);
2241 return 1;
2242 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2243 (c->argc < -cmd->arity)) {
2244 addReplySds(c,
2245 sdscatprintf(sdsempty(),
2246 "-ERR wrong number of arguments for '%s' command\r\n",
2247 cmd->name));
2248 resetClient(c);
2249 return 1;
2250 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2251 /* This is a bulk command, we have to read the last argument yet. */
2252 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2253
2254 decrRefCount(c->argv[c->argc-1]);
2255 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2256 c->argc--;
2257 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2258 resetClient(c);
2259 return 1;
2260 }
2261 c->argc--;
2262 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2263 /* It is possible that the bulk read is already in the
2264 * buffer. Check this condition and handle it accordingly.
2265 * This is just a fast path, alternative to call processInputBuffer().
2266 * It's a good idea since the code is small and this condition
2267 * happens most of the times. */
2268 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2269 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2270 c->argc++;
2271 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2272 } else {
2273 /* Otherwise return... there is to read the last argument
2274 * from the socket. */
2275 return 1;
2276 }
2277 }
2278 /* Let's try to encode the bulk object to save space. */
2279 if (cmd->flags & REDIS_CMD_BULK)
2280 tryObjectEncoding(c->argv[c->argc-1]);
2281
2282 /* Check if the user is authenticated */
2283 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2284 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2285 resetClient(c);
2286 return 1;
2287 }
2288
2289 /* Handle the maxmemory directive */
2290 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2291 zmalloc_used_memory() > server.maxmemory)
2292 {
2293 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2294 resetClient(c);
2295 return 1;
2296 }
2297
2298 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2299 if (dictSize(c->pubsub_channels) > 0 &&
2300 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2301 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2302 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2303 resetClient(c);
2304 return 1;
2305 }
2306
2307 /* Exec the command */
2308 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2309 queueMultiCommand(c,cmd);
2310 addReply(c,shared.queued);
2311 } else {
2312 if (server.vm_enabled && server.vm_max_threads > 0 &&
2313 blockClientOnSwappedKeys(cmd,c)) return 1;
2314 call(c,cmd);
2315 }
2316
2317 /* Prepare the client for the next command */
2318 resetClient(c);
2319 return 1;
2320 }
2321
2322 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2323 listNode *ln;
2324 listIter li;
2325 int outc = 0, j;
2326 robj **outv;
2327 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2328 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2329 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2330 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2331 robj *lenobj;
2332
2333 if (argc <= REDIS_STATIC_ARGS) {
2334 outv = static_outv;
2335 } else {
2336 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2337 }
2338
2339 lenobj = createObject(REDIS_STRING,
2340 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2341 lenobj->refcount = 0;
2342 outv[outc++] = lenobj;
2343 for (j = 0; j < argc; j++) {
2344 lenobj = createObject(REDIS_STRING,
2345 sdscatprintf(sdsempty(),"$%lu\r\n",
2346 (unsigned long) stringObjectLen(argv[j])));
2347 lenobj->refcount = 0;
2348 outv[outc++] = lenobj;
2349 outv[outc++] = argv[j];
2350 outv[outc++] = shared.crlf;
2351 }
2352
2353 /* Increment all the refcounts at start and decrement at end in order to
2354 * be sure to free objects if there is no slave in a replication state
2355 * able to be feed with commands */
2356 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2357 listRewind(slaves,&li);
2358 while((ln = listNext(&li))) {
2359 redisClient *slave = ln->value;
2360
2361 /* Don't feed slaves that are still waiting for BGSAVE to start */
2362 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2363
2364 /* Feed all the other slaves, MONITORs and so on */
2365 if (slave->slaveseldb != dictid) {
2366 robj *selectcmd;
2367
2368 switch(dictid) {
2369 case 0: selectcmd = shared.select0; break;
2370 case 1: selectcmd = shared.select1; break;
2371 case 2: selectcmd = shared.select2; break;
2372 case 3: selectcmd = shared.select3; break;
2373 case 4: selectcmd = shared.select4; break;
2374 case 5: selectcmd = shared.select5; break;
2375 case 6: selectcmd = shared.select6; break;
2376 case 7: selectcmd = shared.select7; break;
2377 case 8: selectcmd = shared.select8; break;
2378 case 9: selectcmd = shared.select9; break;
2379 default:
2380 selectcmd = createObject(REDIS_STRING,
2381 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2382 selectcmd->refcount = 0;
2383 break;
2384 }
2385 addReply(slave,selectcmd);
2386 slave->slaveseldb = dictid;
2387 }
2388 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2389 }
2390 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2391 if (outv != static_outv) zfree(outv);
2392 }
2393
2394 static void processInputBuffer(redisClient *c) {
2395 again:
2396 /* Before to process the input buffer, make sure the client is not
2397 * waitig for a blocking operation such as BLPOP. Note that the first
2398 * iteration the client is never blocked, otherwise the processInputBuffer
2399 * would not be called at all, but after the execution of the first commands
2400 * in the input buffer the client may be blocked, and the "goto again"
2401 * will try to reiterate. The following line will make it return asap. */
2402 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2403 if (c->bulklen == -1) {
2404 /* Read the first line of the query */
2405 char *p = strchr(c->querybuf,'\n');
2406 size_t querylen;
2407
2408 if (p) {
2409 sds query, *argv;
2410 int argc, j;
2411
2412 query = c->querybuf;
2413 c->querybuf = sdsempty();
2414 querylen = 1+(p-(query));
2415 if (sdslen(query) > querylen) {
2416 /* leave data after the first line of the query in the buffer */
2417 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2418 }
2419 *p = '\0'; /* remove "\n" */
2420 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2421 sdsupdatelen(query);
2422
2423 /* Now we can split the query in arguments */
2424 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2425 sdsfree(query);
2426
2427 if (c->argv) zfree(c->argv);
2428 c->argv = zmalloc(sizeof(robj*)*argc);
2429
2430 for (j = 0; j < argc; j++) {
2431 if (sdslen(argv[j])) {
2432 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2433 c->argc++;
2434 } else {
2435 sdsfree(argv[j]);
2436 }
2437 }
2438 zfree(argv);
2439 if (c->argc) {
2440 /* Execute the command. If the client is still valid
2441 * after processCommand() return and there is something
2442 * on the query buffer try to process the next command. */
2443 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2444 } else {
2445 /* Nothing to process, argc == 0. Just process the query
2446 * buffer if it's not empty or return to the caller */
2447 if (sdslen(c->querybuf)) goto again;
2448 }
2449 return;
2450 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2451 redisLog(REDIS_VERBOSE, "Client protocol error");
2452 freeClient(c);
2453 return;
2454 }
2455 } else {
2456 /* Bulk read handling. Note that if we are at this point
2457 the client already sent a command terminated with a newline,
2458 we are reading the bulk data that is actually the last
2459 argument of the command. */
2460 int qbl = sdslen(c->querybuf);
2461
2462 if (c->bulklen <= qbl) {
2463 /* Copy everything but the final CRLF as final argument */
2464 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2465 c->argc++;
2466 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2467 /* Process the command. If the client is still valid after
2468 * the processing and there is more data in the buffer
2469 * try to parse it. */
2470 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2471 return;
2472 }
2473 }
2474 }
2475
2476 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2477 redisClient *c = (redisClient*) privdata;
2478 char buf[REDIS_IOBUF_LEN];
2479 int nread;
2480 REDIS_NOTUSED(el);
2481 REDIS_NOTUSED(mask);
2482
2483 nread = read(fd, buf, REDIS_IOBUF_LEN);
2484 if (nread == -1) {
2485 if (errno == EAGAIN) {
2486 nread = 0;
2487 } else {
2488 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2489 freeClient(c);
2490 return;
2491 }
2492 } else if (nread == 0) {
2493 redisLog(REDIS_VERBOSE, "Client closed connection");
2494 freeClient(c);
2495 return;
2496 }
2497 if (nread) {
2498 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2499 c->lastinteraction = time(NULL);
2500 } else {
2501 return;
2502 }
2503 processInputBuffer(c);
2504 }
2505
2506 static int selectDb(redisClient *c, int id) {
2507 if (id < 0 || id >= server.dbnum)
2508 return REDIS_ERR;
2509 c->db = &server.db[id];
2510 return REDIS_OK;
2511 }
2512
2513 static void *dupClientReplyValue(void *o) {
2514 incrRefCount((robj*)o);
2515 return o;
2516 }
2517
/* List match method: returns 1 when compareStringObjects() reports the two
 * objects as equal (result 0), otherwise 0. */
static int listMatchObjects(void *a, void *b) {
    return !compareStringObjects(a,b);
}
2521
2522 static redisClient *createClient(int fd) {
2523 redisClient *c = zmalloc(sizeof(*c));
2524
2525 anetNonBlock(NULL,fd);
2526 anetTcpNoDelay(NULL,fd);
2527 if (!c) return NULL;
2528 selectDb(c,0);
2529 c->fd = fd;
2530 c->querybuf = sdsempty();
2531 c->argc = 0;
2532 c->argv = NULL;
2533 c->bulklen = -1;
2534 c->multibulk = 0;
2535 c->mbargc = 0;
2536 c->mbargv = NULL;
2537 c->sentlen = 0;
2538 c->flags = 0;
2539 c->lastinteraction = time(NULL);
2540 c->authenticated = 0;
2541 c->replstate = REDIS_REPL_NONE;
2542 c->reply = listCreate();
2543 listSetFreeMethod(c->reply,decrRefCount);
2544 listSetDupMethod(c->reply,dupClientReplyValue);
2545 c->blockingkeys = NULL;
2546 c->blockingkeysnum = 0;
2547 c->io_keys = listCreate();
2548 listSetFreeMethod(c->io_keys,decrRefCount);
2549 c->pubsub_channels = dictCreate(&setDictType,NULL);
2550 c->pubsub_patterns = listCreate();
2551 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2552 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2553 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2554 readQueryFromClient, c) == AE_ERR) {
2555 freeClient(c);
2556 return NULL;
2557 }
2558 listAddNodeTail(server.clients,c);
2559 initClientMultiState(c);
2560 return c;
2561 }
2562
2563 static void addReply(redisClient *c, robj *obj) {
2564 if (listLength(c->reply) == 0 &&
2565 (c->replstate == REDIS_REPL_NONE ||
2566 c->replstate == REDIS_REPL_ONLINE) &&
2567 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2568 sendReplyToClient, c) == AE_ERR) return;
2569
2570 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2571 obj = dupStringObject(obj);
2572 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2573 }
2574 listAddNodeTail(c->reply,getDecodedObject(obj));
2575 }
2576
2577 static void addReplySds(redisClient *c, sds s) {
2578 robj *o = createObject(REDIS_STRING,s);
2579 addReply(c,o);
2580 decrRefCount(o);
2581 }
2582
2583 static void addReplyDouble(redisClient *c, double d) {
2584 char buf[128];
2585
2586 snprintf(buf,sizeof(buf),"%.17g",d);
2587 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2588 (unsigned long) strlen(buf),buf));
2589 }
2590
2591 static void addReplyLong(redisClient *c, long l) {
2592 char buf[128];
2593 size_t len;
2594
2595 if (l == 0) {
2596 addReply(c,shared.czero);
2597 return;
2598 } else if (l == 1) {
2599 addReply(c,shared.cone);
2600 return;
2601 }
2602 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2603 addReplySds(c,sdsnewlen(buf,len));
2604 }
2605
2606 static void addReplyLongLong(redisClient *c, long long ll) {
2607 char buf[128];
2608 size_t len;
2609
2610 if (ll == 0) {
2611 addReply(c,shared.czero);
2612 return;
2613 } else if (ll == 1) {
2614 addReply(c,shared.cone);
2615 return;
2616 }
2617 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2618 addReplySds(c,sdsnewlen(buf,len));
2619 }
2620
2621 static void addReplyUlong(redisClient *c, unsigned long ul) {
2622 char buf[128];
2623 size_t len;
2624
2625 if (ul == 0) {
2626 addReply(c,shared.czero);
2627 return;
2628 } else if (ul == 1) {
2629 addReply(c,shared.cone);
2630 return;
2631 }
2632 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2633 addReplySds(c,sdsnewlen(buf,len));
2634 }
2635
2636 static void addReplyBulkLen(redisClient *c, robj *obj) {
2637 size_t len;
2638
2639 if (obj->encoding == REDIS_ENCODING_RAW) {
2640 len = sdslen(obj->ptr);
2641 } else {
2642 long n = (long)obj->ptr;
2643
2644 /* Compute how many bytes will take this integer as a radix 10 string */
2645 len = 1;
2646 if (n < 0) {
2647 len++;
2648 n = -n;
2649 }
2650 while((n = n/10) != 0) {
2651 len++;
2652 }
2653 }
2654 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2655 }
2656
2657 static void addReplyBulk(redisClient *c, robj *obj) {
2658 addReplyBulkLen(c,obj);
2659 addReply(c,obj);
2660 addReply(c,shared.crlf);
2661 }
2662
2663 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2664 static void addReplyBulkCString(redisClient *c, char *s) {
2665 if (s == NULL) {
2666 addReply(c,shared.nullbulk);
2667 } else {
2668 robj *o = createStringObject(s,strlen(s));
2669 addReplyBulk(c,o);
2670 decrRefCount(o);
2671 }
2672 }
2673
2674 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2675 int cport, cfd;
2676 char cip[128];
2677 redisClient *c;
2678 REDIS_NOTUSED(el);
2679 REDIS_NOTUSED(mask);
2680 REDIS_NOTUSED(privdata);
2681
2682 cfd = anetAccept(server.neterr, fd, cip, &cport);
2683 if (cfd == AE_ERR) {
2684 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2685 return;
2686 }
2687 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2688 if ((c = createClient(cfd)) == NULL) {
2689 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2690 close(cfd); /* May be already closed, just ingore errors */
2691 return;
2692 }
2693 /* If maxclient directive is set and this is one client more... close the
2694 * connection. Note that we create the client instead to check before
2695 * for this condition, since now the socket is already set in nonblocking
2696 * mode and we can send an error for free using the Kernel I/O */
2697 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2698 char *err = "-ERR max number of clients reached\r\n";
2699
2700 /* That's a best effort error message, don't check write errors */
2701 if (write(c->fd,err,strlen(err)) == -1) {
2702 /* Nothing to do, Just to avoid the warning... */
2703 }
2704 freeClient(c);
2705 return;
2706 }
2707 server.stat_numconnections++;
2708 }
2709
2710 /* ======================= Redis objects implementation ===================== */
2711
2712 static robj *createObject(int type, void *ptr) {
2713 robj *o;
2714
2715 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2716 if (listLength(server.objfreelist)) {
2717 listNode *head = listFirst(server.objfreelist);
2718 o = listNodeValue(head);
2719 listDelNode(server.objfreelist,head);
2720 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2721 } else {
2722 if (server.vm_enabled) {
2723 pthread_mutex_unlock(&server.obj_freelist_mutex);
2724 o = zmalloc(sizeof(*o));
2725 } else {
2726 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2727 }
2728 }
2729 o->type = type;
2730 o->encoding = REDIS_ENCODING_RAW;
2731 o->ptr = ptr;
2732 o->refcount = 1;
2733 if (server.vm_enabled) {
2734 /* Note that this code may run in the context of an I/O thread
2735 * and accessing to server.unixtime in theory is an error
2736 * (no locks). But in practice this is safe, and even if we read
2737 * garbage Redis will not fail, as it's just a statistical info */
2738 o->vm.atime = server.unixtime;
2739 o->storage = REDIS_VM_MEMORY;
2740 }
2741 return o;
2742 }
2743
2744 static robj *createStringObject(char *ptr, size_t len) {
2745 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2746 }
2747
2748 static robj *dupStringObject(robj *o) {
2749 assert(o->encoding == REDIS_ENCODING_RAW);
2750 return createStringObject(o->ptr,sdslen(o->ptr));
2751 }
2752
2753 static robj *createListObject(void) {
2754 list *l = listCreate();
2755
2756 listSetFreeMethod(l,decrRefCount);
2757 return createObject(REDIS_LIST,l);
2758 }
2759
2760 static robj *createSetObject(void) {
2761 dict *d = dictCreate(&setDictType,NULL);
2762 return createObject(REDIS_SET,d);
2763 }
2764
2765 static robj *createHashObject(void) {
2766 /* All the Hashes start as zipmaps. Will be automatically converted
2767 * into hash tables if there are enough elements or big elements
2768 * inside. */
2769 unsigned char *zm = zipmapNew();
2770 robj *o = createObject(REDIS_HASH,zm);
2771 o->encoding = REDIS_ENCODING_ZIPMAP;
2772 return o;
2773 }
2774
2775 static robj *createZsetObject(void) {
2776 zset *zs = zmalloc(sizeof(*zs));
2777
2778 zs->dict = dictCreate(&zsetDictType,NULL);
2779 zs->zsl = zslCreate();
2780 return createObject(REDIS_ZSET,zs);
2781 }
2782
2783 static void freeStringObject(robj *o) {
2784 if (o->encoding == REDIS_ENCODING_RAW) {
2785 sdsfree(o->ptr);
2786 }
2787 }
2788
2789 static void freeListObject(robj *o) {
2790 listRelease((list*) o->ptr);
2791 }
2792
2793 static void freeSetObject(robj *o) {
2794 dictRelease((dict*) o->ptr);
2795 }
2796
2797 static void freeZsetObject(robj *o) {
2798 zset *zs = o->ptr;
2799
2800 dictRelease(zs->dict);
2801 zslFree(zs->zsl);
2802 zfree(zs);
2803 }
2804
2805 static void freeHashObject(robj *o) {
2806 switch (o->encoding) {
2807 case REDIS_ENCODING_HT:
2808 dictRelease((dict*) o->ptr);
2809 break;
2810 case REDIS_ENCODING_ZIPMAP:
2811 zfree(o->ptr);
2812 break;
2813 default:
2814 redisAssert(0);
2815 break;
2816 }
2817 }
2818
2819 static void incrRefCount(robj *o) {
2820 o->refcount++;
2821 }
2822
2823 static void decrRefCount(void *obj) {
2824 robj *o = obj;
2825
2826 /* Object is a key of a swapped out value, or in the process of being
2827 * loaded. */
2828 if (server.vm_enabled &&
2829 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2830 {
2831 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2832 redisAssert(o->refcount == 1);
2833 }
2834 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2835 redisAssert(o->type == REDIS_STRING);
2836 freeStringObject(o);
2837 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2838 pthread_mutex_lock(&server.obj_freelist_mutex);
2839 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2840 !listAddNodeHead(server.objfreelist,o))
2841 zfree(o);
2842 pthread_mutex_unlock(&server.obj_freelist_mutex);
2843 server.vm_stats_swapped_objects--;
2844 return;
2845 }
2846 /* Object is in memory, or in the process of being swapped out. */
2847 if (--(o->refcount) == 0) {
2848 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2849 vmCancelThreadedIOJob(obj);
2850 switch(o->type) {
2851 case REDIS_STRING: freeStringObject(o); break;
2852 case REDIS_LIST: freeListObject(o); break;
2853 case REDIS_SET: freeSetObject(o); break;
2854 case REDIS_ZSET: freeZsetObject(o); break;
2855 case REDIS_HASH: freeHashObject(o); break;
2856 default: redisAssert(0); break;
2857 }
2858 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2859 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2860 !listAddNodeHead(server.objfreelist,o))
2861 zfree(o);
2862 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2863 }
2864 }
2865
2866 static robj *lookupKey(redisDb *db, robj *key) {
2867 dictEntry *de = dictFind(db->dict,key);
2868 if (de) {
2869 robj *key = dictGetEntryKey(de);
2870 robj *val = dictGetEntryVal(de);
2871
2872 if (server.vm_enabled) {
2873 if (key->storage == REDIS_VM_MEMORY ||
2874 key->storage == REDIS_VM_SWAPPING)
2875 {
2876 /* If we were swapping the object out, stop it, this key
2877 * was requested. */
2878 if (key->storage == REDIS_VM_SWAPPING)
2879 vmCancelThreadedIOJob(key);
2880 /* Update the access time of the key for the aging algorithm. */
2881 key->vm.atime = server.unixtime;
2882 } else {
2883 int notify = (key->storage == REDIS_VM_LOADING);
2884
2885 /* Our value was swapped on disk. Bring it at home. */
2886 redisAssert(val == NULL);
2887 val = vmLoadObject(key);
2888 dictGetEntryVal(de) = val;
2889
2890 /* Clients blocked by the VM subsystem may be waiting for
2891 * this key... */
2892 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2893 }
2894 }
2895 return val;
2896 } else {
2897 return NULL;
2898 }
2899 }
2900
2901 static robj *lookupKeyRead(redisDb *db, robj *key) {
2902 expireIfNeeded(db,key);
2903 return lookupKey(db,key);
2904 }
2905
2906 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2907 deleteIfVolatile(db,key);
2908 return lookupKey(db,key);
2909 }
2910
2911 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2912 robj *o = lookupKeyRead(c->db, key);
2913 if (!o) addReply(c,reply);
2914 return o;
2915 }
2916
2917 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2918 robj *o = lookupKeyWrite(c->db, key);
2919 if (!o) addReply(c,reply);
2920 return o;
2921 }
2922
2923 static int checkType(redisClient *c, robj *o, int type) {
2924 if (o->type != type) {
2925 addReply(c,shared.wrongtypeerr);
2926 return 1;
2927 }
2928 return 0;
2929 }
2930
2931 static int deleteKey(redisDb *db, robj *key) {
2932 int retval;
2933
2934 /* We need to protect key from destruction: after the first dictDelete()
2935 * it may happen that 'key' is no longer valid if we don't increment
2936 * it's count. This may happen when we get the object reference directly
2937 * from the hash table with dictRandomKey() or dict iterators */
2938 incrRefCount(key);
2939 if (dictSize(db->expires)) dictDelete(db->expires,key);
2940 retval = dictDelete(db->dict,key);
2941 decrRefCount(key);
2942
2943 return retval == DICT_OK;
2944 }
2945
2946 /* Check if the nul-terminated string 's' can be represented by a long
2947 * (that is, is a number that fits into long without any other space or
2948 * character before or after the digits).
2949 *
2950 * If so, the function returns REDIS_OK and *longval is set to the value
2951 * of the number. Otherwise REDIS_ERR is returned */
2952 static int isStringRepresentableAsLong(sds s, long *longval) {
2953 char buf[32], *endptr;
2954 long value;
2955 int slen;
2956
2957 value = strtol(s, &endptr, 10);
2958 if (endptr[0] != '\0') return REDIS_ERR;
2959 slen = snprintf(buf,32,"%ld",value);
2960
2961 /* If the number converted back into a string is not identical
2962 * then it's not possible to encode the string as integer */
2963 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2964 if (longval) *longval = value;
2965 return REDIS_OK;
2966 }
2967
2968 /* Try to encode a string object in order to save space */
2969 static int tryObjectEncoding(robj *o) {
2970 long value;
2971 sds s = o->ptr;
2972
2973 if (o->encoding != REDIS_ENCODING_RAW)
2974 return REDIS_ERR; /* Already encoded */
2975
2976 /* It's not save to encode shared objects: shared objects can be shared
2977 * everywhere in the "object space" of Redis. Encoded objects can only
2978 * appear as "values" (and not, for instance, as keys) */
2979 if (o->refcount > 1) return REDIS_ERR;
2980
2981 /* Currently we try to encode only strings */
2982 redisAssert(o->type == REDIS_STRING);
2983
2984 /* Check if we can represent this string as a long integer */
2985 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2986
2987 /* Ok, this object can be encoded */
2988 o->encoding = REDIS_ENCODING_INT;
2989 sdsfree(o->ptr);
2990 o->ptr = (void*) value;
2991 return REDIS_OK;
2992 }
2993
2994 /* Get a decoded version of an encoded object (returned as a new object).
2995 * If the object is already raw-encoded just increment the ref count. */
2996 static robj *getDecodedObject(robj *o) {
2997 robj *dec;
2998
2999 if (o->encoding == REDIS_ENCODING_RAW) {
3000 incrRefCount(o);
3001 return o;
3002 }
3003 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3004 char buf[32];
3005
3006 snprintf(buf,32,"%ld",(long)o->ptr);
3007 dec = createStringObject(buf,strlen(buf));
3008 return dec;
3009 } else {
3010 redisAssert(1 != 1);
3011 }
3012 }
3013
3014 /* Compare two string objects via strcmp() or alike.
3015 * Note that the objects may be integer-encoded. In such a case we
3016 * use snprintf() to get a string representation of the numbers on the stack
3017 * and compare the strings, it's much faster than calling getDecodedObject().
3018 *
3019 * Important note: if objects are not integer encoded, but binary-safe strings,
3020 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3021 * binary safe. */
3022 static int compareStringObjects(robj *a, robj *b) {
3023 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3024 char bufa[128], bufb[128], *astr, *bstr;
3025 int bothsds = 1;
3026
3027 if (a == b) return 0;
3028 if (a->encoding != REDIS_ENCODING_RAW) {
3029 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3030 astr = bufa;
3031 bothsds = 0;
3032 } else {
3033 astr = a->ptr;
3034 }
3035 if (b->encoding != REDIS_ENCODING_RAW) {
3036 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3037 bstr = bufb;
3038 bothsds = 0;
3039 } else {
3040 bstr = b->ptr;
3041 }
3042 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3043 }
3044
3045 static size_t stringObjectLen(robj *o) {
3046 redisAssert(o->type == REDIS_STRING);
3047 if (o->encoding == REDIS_ENCODING_RAW) {
3048 return sdslen(o->ptr);
3049 } else {
3050 char buf[32];
3051
3052 return snprintf(buf,32,"%ld",(long)o->ptr);
3053 }
3054 }
3055
3056 /*============================ RDB saving/loading =========================== */
3057
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,sizeof(type),1,fp) == 1) ? 0 : -1;
}
3062
/* Write the time 't' to 'fp' as a 32 bit signed integer: the value is
 * converted to int32_t and its four bytes are written as they are stored
 * in memory. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,sizeof(secs),1,fp) == 1) ? 0 : -1;
}
3068
3069 /* check rdbLoadLen() comments for more info */
3070 static int rdbSaveLen(FILE *fp, uint32_t len) {
3071 unsigned char buf[2];
3072
3073 if (len < (1<<6)) {
3074 /* Save a 6 bit len */
3075 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3076 if (fwrite(buf,1,1,fp) == 0) return -1;
3077 } else if (len < (1<<14)) {
3078 /* Save a 14 bit len */
3079 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3080 buf[1] = len&0xFF;
3081 if (fwrite(buf,2,1,fp) == 0) return -1;
3082 } else {
3083 /* Save a 32 bit len */
3084 buf[0] = (REDIS_RDB_32BITLEN<<6);
3085 if (fwrite(buf,1,1,fp) == 0) return -1;
3086 len = htonl(len);
3087 if (fwrite(&len,4,1,fp) == 0) return -1;
3088 }
3089 return 0;
3090 }
3091
3092 /* String objects in the form "2391" "-100" without any space and with a
3093 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3094 * encoded as integers to save space */
3095 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3096 long long value;
3097 char *endptr, buf[32];
3098
3099 /* Check if it's possible to encode this value as a number */
3100 value = strtoll(s, &endptr, 10);
3101 if (endptr[0] != '\0') return 0;
3102 snprintf(buf,32,"%lld",value);
3103
3104 /* If the number converted back into a string is not identical
3105 * then it's not possible to encode the string as integer */
3106 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3107
3108 /* Finally check if it fits in our ranges */
3109 if (value >= -(1<<7) && value <= (1<<7)-1) {
3110 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3111 enc[1] = value&0xFF;
3112 return 2;
3113 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3114 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3115 enc[1] = value&0xFF;
3116 enc[2] = (value>>8)&0xFF;
3117 return 3;
3118 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3119 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3120 enc[1] = value&0xFF;
3121 enc[2] = (value>>8)&0xFF;
3122 enc[3] = (value>>16)&0xFF;
3123 enc[4] = (value>>24)&0xFF;
3124 return 5;
3125 } else {
3126 return 0;
3127 }
3128 }
3129
3130 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3131 size_t comprlen, outlen;
3132 unsigned char byte;
3133 void *out;
3134
3135 /* We require at least four bytes compression for this to be worth it */
3136 if (len <= 4) return 0;
3137 outlen = len-4;
3138 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3139 comprlen = lzf_compress(s, len, out, outlen);
3140 if (comprlen == 0) {
3141 zfree(out);
3142 return 0;
3143 }
3144 /* Data compressed! Let's save it on disk */
3145 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3146 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3147 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3148 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3149 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3150 zfree(out);
3151 return comprlen;
3152
3153 writeerr:
3154 zfree(out);
3155 return -1;
3156 }
3157
3158 /* Save a string objet as [len][data] on disk. If the object is a string
3159 * representation of an integer value we try to safe it in a special form */
3160 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3161 int enclen;
3162
3163 /* Try integer encoding */
3164 if (len <= 11) {
3165 unsigned char buf[5];
3166 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3167 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3168 return 0;
3169 }
3170 }
3171
3172 /* Try LZF compression - under 20 bytes it's unable to compress even
3173 * aaaaaaaaaaaaaaaaaa so skip it */
3174 if (server.rdbcompression && len > 20) {
3175 int retval;
3176
3177 retval = rdbSaveLzfStringObject(fp,s,len);
3178 if (retval == -1) return -1;
3179 if (retval > 0) return 0;
3180 /* retval == 0 means data can't be compressed, save the old way */
3181 }
3182
3183 /* Store verbatim */
3184 if (rdbSaveLen(fp,len) == -1) return -1;
3185 if (len && fwrite(s,len,1,fp) == 0) return -1;
3186 return 0;
3187 }
3188
3189 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3190 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3191 int retval;
3192
3193 /* Avoid incr/decr ref count business when possible.
3194 * This plays well with copy-on-write given that we are probably
3195 * in a child process (BGSAVE). Also this makes sure key objects
3196 * of swapped objects are not incRefCount-ed (an assert does not allow
3197 * this in order to avoid bugs) */
3198 if (obj->encoding != REDIS_ENCODING_RAW) {
3199 obj = getDecodedObject(obj);
3200 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3201 decrRefCount(obj);
3202 } else {
3203 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3204 }
3205 return retval;
3206 }
3207
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values in order to
 * specify the following conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total = 1; /* the length/marker byte is always written */

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf((char*)out+1,sizeof(out)-1,"%.17g",val);
        out[0] = strlen((char*)out+1);
        total += out[0];
    }
    if (fwrite(out,total,1,fp) == 0) return -1;
    return 0;
}
3234
3235 /* Save a Redis object. */
3236 static int rdbSaveObject(FILE *fp, robj *o) {
3237 if (o->type == REDIS_STRING) {
3238 /* Save a string value */
3239 if (rdbSaveStringObject(fp,o) == -1) return -1;
3240 } else if (o->type == REDIS_LIST) {
3241 /* Save a list value */
3242 list *list = o->ptr;
3243 listIter li;
3244 listNode *ln;
3245
3246 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3247 listRewind(list,&li);
3248 while((ln = listNext(&li))) {
3249 robj *eleobj = listNodeValue(ln);
3250
3251 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3252 }
3253 } else if (o->type == REDIS_SET) {
3254 /* Save a set value */
3255 dict *set = o->ptr;
3256 dictIterator *di = dictGetIterator(set);
3257 dictEntry *de;
3258
3259 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3260 while((de = dictNext(di)) != NULL) {
3261 robj *eleobj = dictGetEntryKey(de);
3262
3263 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3264 }
3265 dictReleaseIterator(di);
3266 } else if (o->type == REDIS_ZSET) {
3267 /* Save a set value */
3268 zset *zs = o->ptr;
3269 dictIterator *di = dictGetIterator(zs->dict);
3270 dictEntry *de;
3271
3272 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3273 while((de = dictNext(di)) != NULL) {
3274 robj *eleobj = dictGetEntryKey(de);
3275 double *score = dictGetEntryVal(de);
3276
3277 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3278 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3279 }
3280 dictReleaseIterator(di);
3281 } else if (o->type == REDIS_HASH) {
3282 /* Save a hash value */
3283 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3284 unsigned char *p = zipmapRewind(o->ptr);
3285 unsigned int count = zipmapLen(o->ptr);
3286 unsigned char *key, *val;
3287 unsigned int klen, vlen;
3288
3289 if (rdbSaveLen(fp,count) == -1) return -1;
3290 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3291 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3292 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3293 }
3294 } else {
3295 dictIterator *di = dictGetIterator(o->ptr);
3296 dictEntry *de;
3297
3298 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3299 while((de = dictNext(di)) != NULL) {
3300 robj *key = dictGetEntryKey(de);
3301 robj *val = dictGetEntryVal(de);
3302
3303 if (rdbSaveStringObject(fp,key) == -1) return -1;
3304 if (rdbSaveStringObject(fp,val) == -1) return -1;
3305 }
3306 dictReleaseIterator(di);
3307 }
3308 } else {
3309 redisAssert(0);
3310 }
3311 return 0;
3312 }
3313
3314 /* Return the length the object will have on disk if saved with
3315 * the rdbSaveObject() function. Currently we use a trick to get
3316 * this length with very little changes to the code. In the future
3317 * we could switch to a faster solution. */
3318 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3319 if (fp == NULL) fp = server.devnull;
3320 rewind(fp);
3321 assert(rdbSaveObject(fp,o) != 1);
3322 return ftello(fp);
3323 }
3324
3325 /* Return the number of pages required to save this object in the swap file */
3326 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3327 off_t bytes = rdbSavedObjectLen(o,fp);
3328
3329 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3330 }
3331
3332 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3333 static int rdbSave(char *filename) {
3334 dictIterator *di = NULL;
3335 dictEntry *de;
3336 FILE *fp;
3337 char tmpfile[256];
3338 int j;
3339 time_t now = time(NULL);
3340
3341 /* Wait for I/O therads to terminate, just in case this is a
3342 * foreground-saving, to avoid seeking the swap file descriptor at the
3343 * same time. */
3344 if (server.vm_enabled)
3345 waitEmptyIOJobsQueue();
3346
3347 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3348 fp = fopen(tmpfile,"w");
3349 if (!fp) {
3350 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3351 return REDIS_ERR;
3352 }
3353 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3354 for (j = 0; j < server.dbnum; j++) {
3355 redisDb *db = server.db+j;
3356 dict *d = db->dict;
3357 if (dictSize(d) == 0) continue;
3358 di = dictGetIterator(d);
3359 if (!di) {
3360 fclose(fp);
3361 return REDIS_ERR;
3362 }
3363
3364 /* Write the SELECT DB opcode */
3365 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3366 if (rdbSaveLen(fp,j) == -1) goto werr;
3367
3368 /* Iterate this DB writing every entry */
3369 while((de = dictNext(di)) != NULL) {
3370 robj *key = dictGetEntryKey(de);
3371 robj *o = dictGetEntryVal(de);
3372 time_t expiretime = getExpire(db,key);
3373
3374 /* Save the expire time */
3375 if (expiretime != -1) {
3376 /* If this key is already expired skip it */
3377 if (expiretime < now) continue;
3378 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3379 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3380 }
3381 /* Save the key and associated value. This requires special
3382 * handling if the value is swapped out. */
3383 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3384 key->storage == REDIS_VM_SWAPPING) {
3385 /* Save type, key, value */
3386 if (rdbSaveType(fp,o->type) == -1) goto werr;
3387 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3388 if (rdbSaveObject(fp,o) == -1) goto werr;
3389 } else {
3390 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3391 robj *po;
3392 /* Get a preview of the object in memory */
3393 po = vmPreviewObject(key);
3394 /* Save type, key, value */
3395 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3396 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3397 if (rdbSaveObject(fp,po) == -1) goto werr;
3398 /* Remove the loaded object from memory */
3399 decrRefCount(po);
3400 }
3401 }
3402 dictReleaseIterator(di);
3403 }
3404 /* EOF opcode */
3405 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3406
3407 /* Make sure data will not remain on the OS's output buffers */
3408 fflush(fp);
3409 fsync(fileno(fp));
3410 fclose(fp);
3411
3412 /* Use RENAME to make sure the DB file is changed atomically only
3413 * if the generate DB file is ok. */
3414 if (rename(tmpfile,filename) == -1) {
3415 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3416 unlink(tmpfile);
3417 return REDIS_ERR;
3418 }
3419 redisLog(REDIS_NOTICE,"DB saved on disk");
3420 server.dirty = 0;
3421 server.lastsave = time(NULL);
3422 return REDIS_OK;
3423
3424 werr:
3425 fclose(fp);
3426 unlink(tmpfile);
3427 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3428 if (di) dictReleaseIterator(di);
3429 return REDIS_ERR;
3430 }
3431
3432 static int rdbSaveBackground(char *filename) {
3433 pid_t childpid;
3434
3435 if (server.bgsavechildpid != -1) return REDIS_ERR;
3436 if (server.vm_enabled) waitEmptyIOJobsQueue();
3437 if ((childpid = fork()) == 0) {
3438 /* Child */
3439 if (server.vm_enabled) vmReopenSwapFile();
3440 close(server.fd);
3441 if (rdbSave(filename) == REDIS_OK) {
3442 _exit(0);
3443 } else {
3444 _exit(1);
3445 }
3446 } else {
3447 /* Parent */
3448 if (childpid == -1) {
3449 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3450 strerror(errno));
3451 return REDIS_ERR;
3452 }
3453 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3454 server.bgsavechildpid = childpid;
3455 updateDictResizePolicy();
3456 return REDIS_OK;
3457 }
3458 return REDIS_OK; /* unreached */
3459 }
3460
/* Remove the temp file "temp-<pid>.rdb" used by the saving process with
 * pid 'childpid'. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3467
/* Read one byte from 'fp' and return it as a non negative int.
 * Returns -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 1) ? (int) byte : -1;
}
3473
/* Read a time stored as a 32 bit signed integer (see rdbSaveTime()) and
 * return it as time_t. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    if (fread(&secs,sizeof(secs),1,fp) != 1) return -1;
    return (time_t) secs;
}
3479
3480 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3481 * of this file for a description of how this are stored on disk.
3482 *
3483 * isencoded is set to 1 if the readed length is not actually a length but
3484 * an "encoding type", check the above comments for more info */
3485 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3486 unsigned char buf[2];
3487 uint32_t len;
3488 int type;
3489
3490 if (isencoded) *isencoded = 0;
3491 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3492 type = (buf[0]&0xC0)>>6;
3493 if (type == REDIS_RDB_6BITLEN) {
3494 /* Read a 6 bit len */
3495 return buf[0]&0x3F;
3496 } else if (type == REDIS_RDB_ENCVAL) {
3497 /* Read a 6 bit len encoding type */
3498 if (isencoded) *isencoded = 1;
3499 return buf[0]&0x3F;
3500 } else if (type == REDIS_RDB_14BITLEN) {
3501 /* Read a 14 bit len */
3502 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3503 return ((buf[0]&0x3F)<<8)|buf[1];
3504 } else {
3505 /* Read a 32 bit len */
3506 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3507 return ntohl(len);
3508 }
3509 }
3510
3511 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3512 unsigned char enc[4];
3513 long long val;
3514
3515 if (enctype == REDIS_RDB_ENC_INT8) {
3516 if (fread(enc,1,1,fp) == 0) return NULL;
3517 val = (signed char)enc[0];
3518 } else if (enctype == REDIS_RDB_ENC_INT16) {
3519 uint16_t v;
3520 if (fread(enc,2,1,fp) == 0) return NULL;
3521 v = enc[0]|(enc[1]<<8);
3522 val = (int16_t)v;
3523 } else if (enctype == REDIS_RDB_ENC_INT32) {
3524 uint32_t v;
3525 if (fread(enc,4,1,fp) == 0) return NULL;
3526 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3527 val = (int32_t)v;
3528 } else {
3529 val = 0; /* anti-warning */
3530 redisAssert(0);
3531 }
3532 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3533 }
3534
3535 static robj *rdbLoadLzfStringObject(FILE*fp) {
3536 unsigned int len, clen;
3537 unsigned char *c = NULL;
3538 sds val = NULL;
3539
3540 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3541 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3542 if ((c = zmalloc(clen)) == NULL) goto err;
3543 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3544 if (fread(c,clen,1,fp) == 0) goto err;
3545 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3546 zfree(c);
3547 return createObject(REDIS_STRING,val);
3548 err:
3549 zfree(c);
3550 sdsfree(val);
3551 return NULL;
3552 }
3553
3554 static robj *rdbLoadStringObject(FILE*fp) {
3555 int isencoded;
3556 uint32_t len;
3557 sds val;
3558
3559 len = rdbLoadLen(fp,&isencoded);
3560 if (isencoded) {
3561 switch(len) {
3562 case REDIS_RDB_ENC_INT8:
3563 case REDIS_RDB_ENC_INT16:
3564 case REDIS_RDB_ENC_INT32:
3565 return rdbLoadIntegerObject(fp,len);
3566 case REDIS_RDB_ENC_LZF:
3567 return rdbLoadLzfStringObject(fp);
3568 default:
3569 redisAssert(0);
3570 }
3571 }
3572
3573 if (len == REDIS_RDB_LENERR) return NULL;
3574 val = sdsnewlen(NULL,len);
3575 if (len && fread(val,len,1,fp) == 0) {
3576 sdsfree(val);
3577 return NULL;
3578 }
3579 return createObject(REDIS_STRING,val);
3580 }
3581
3582 /* For information about double serialization check rdbSaveDoubleValue() */
3583 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3584 char buf[128];
3585 unsigned char len;
3586
3587 if (fread(&len,1,1,fp) == 0) return -1;
3588 switch(len) {
3589 case 255: *val = R_NegInf; return 0;
3590 case 254: *val = R_PosInf; return 0;
3591 case 253: *val = R_Nan; return 0;
3592 default:
3593 if (fread(buf,len,1,fp) == 0) return -1;
3594 buf[len] = '\0';
3595 sscanf(buf, "%lg", val);
3596 return 0;
3597 }
3598 }
3599
3600 /* Load a Redis object of the specified type from the specified file.
3601 * On success a newly allocated object is returned, otherwise NULL. */
3602 static robj *rdbLoadObject(int type, FILE *fp) {
3603 robj *o;
3604
3605 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3606 if (type == REDIS_STRING) {
3607 /* Read string value */
3608 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3609 tryObjectEncoding(o);
3610 } else if (type == REDIS_LIST || type == REDIS_SET) {
3611 /* Read list/set value */
3612 uint32_t listlen;
3613
3614 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3615 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3616 /* It's faster to expand the dict to the right size asap in order
3617 * to avoid rehashing */
3618 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3619 dictExpand(o->ptr,listlen);
3620 /* Load every single element of the list/set */
3621 while(listlen--) {
3622 robj *ele;
3623
3624 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3625 tryObjectEncoding(ele);
3626 if (type == REDIS_LIST) {
3627 listAddNodeTail((list*)o->ptr,ele);
3628 } else {
3629 dictAdd((dict*)o->ptr,ele,NULL);
3630 }
3631 }
3632 } else if (type == REDIS_ZSET) {
3633 /* Read list/set value */
3634 size_t zsetlen;
3635 zset *zs;
3636
3637 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3638 o = createZsetObject();
3639 zs = o->ptr;
3640 /* Load every single element of the list/set */
3641 while(zsetlen--) {
3642 robj *ele;
3643 double *score = zmalloc(sizeof(double));
3644
3645 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3646 tryObjectEncoding(ele);
3647 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3648 dictAdd(zs->dict,ele,score);
3649 zslInsert(zs->zsl,*score,ele);
3650 incrRefCount(ele); /* added to skiplist */
3651 }
3652 } else if (type == REDIS_HASH) {
3653 size_t hashlen;
3654
3655 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3656 o = createHashObject();
3657 /* Too many entries? Use an hash table. */
3658 if (hashlen > server.hash_max_zipmap_entries)
3659 convertToRealHash(o);
3660 /* Load every key/value, then set it into the zipmap or hash
3661 * table, as needed. */
3662 while(hashlen--) {
3663 robj *key, *val;
3664
3665 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3666 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3667 /* If we are using a zipmap and there are too big values
3668 * the object is converted to real hash table encoding. */
3669 if (o->encoding != REDIS_ENCODING_HT &&
3670 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3671 sdslen(val->ptr) > server.hash_max_zipmap_value))
3672 {
3673 convertToRealHash(o);
3674 }
3675
3676 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3677 unsigned char *zm = o->ptr;
3678
3679 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3680 val->ptr,sdslen(val->ptr),NULL);
3681 o->ptr = zm;
3682 decrRefCount(key);
3683 decrRefCount(val);
3684 } else {
3685 tryObjectEncoding(key);
3686 tryObjectEncoding(val);
3687 dictAdd((dict*)o->ptr,key,val);
3688 }
3689 }
3690 } else {
3691 redisAssert(0);
3692 }
3693 return o;
3694 }
3695
3696 static int rdbLoad(char *filename) {
3697 FILE *fp;
3698 robj *keyobj = NULL;
3699 uint32_t dbid;
3700 int type, retval, rdbver;
3701 dict *d = server.db[0].dict;
3702 redisDb *db = server.db+0;
3703 char buf[1024];
3704 time_t expiretime = -1, now = time(NULL);
3705 long long loadedkeys = 0;
3706
3707 fp = fopen(filename,"r");
3708 if (!fp) return REDIS_ERR;
3709 if (fread(buf,9,1,fp) == 0) goto eoferr;
3710 buf[9] = '\0';
3711 if (memcmp(buf,"REDIS",5) != 0) {
3712 fclose(fp);
3713 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3714 return REDIS_ERR;
3715 }
3716 rdbver = atoi(buf+5);
3717 if (rdbver != 1) {
3718 fclose(fp);
3719 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3720 return REDIS_ERR;
3721 }
3722 while(1) {
3723 robj *o;
3724
3725 /* Read type. */
3726 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3727 if (type == REDIS_EXPIRETIME) {
3728 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3729 /* We read the time so we need to read the object type again */
3730 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3731 }
3732 if (type == REDIS_EOF) break;
3733 /* Handle SELECT DB opcode as a special case */
3734 if (type == REDIS_SELECTDB) {
3735 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3736 goto eoferr;
3737 if (dbid >= (unsigned)server.dbnum) {
3738 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3739 exit(1);
3740 }
3741 db = server.db+dbid;
3742 d = db->dict;
3743 continue;
3744 }
3745 /* Read key */
3746 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3747 /* Read value */
3748 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3749 /* Add the new object in the hash table */
3750 retval = dictAdd(d,keyobj,o);
3751 if (retval == DICT_ERR) {
3752 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3753 exit(1);
3754 }
3755 /* Set the expire time if needed */
3756 if (expiretime != -1) {
3757 setExpire(db,keyobj,expiretime);
3758 /* Delete this key if already expired */
3759 if (expiretime < now) deleteKey(db,keyobj);
3760 expiretime = -1;
3761 }
3762 keyobj = o = NULL;
3763 /* Handle swapping while loading big datasets when VM is on */
3764 loadedkeys++;
3765 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3766 while (zmalloc_used_memory() > server.vm_max_memory) {
3767 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3768 }
3769 }
3770 }
3771 fclose(fp);
3772 return REDIS_OK;
3773
3774 eoferr: /* unexpected end of file is handled here with a fatal exit */
3775 if (keyobj) decrRefCount(keyobj);
3776 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3777 exit(1);
3778 return REDIS_ERR; /* Just to avoid warning */
3779 }
3780
3781 /*================================== Commands =============================== */
3782
3783 static void authCommand(redisClient *c) {
3784 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3785 c->authenticated = 1;
3786 addReply(c,shared.ok);
3787 } else {
3788 c->authenticated = 0;
3789 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3790 }
3791 }
3792
3793 static void pingCommand(redisClient *c) {
3794 addReply(c,shared.pong);
3795 }
3796
3797 static void echoCommand(redisClient *c) {
3798 addReplyBulk(c,c->argv[1]);
3799 }
3800
3801 /*=================================== Strings =============================== */
3802
3803 static void setGenericCommand(redisClient *c, int nx) {
3804 int retval;
3805
3806 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3807 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3808 if (retval == DICT_ERR) {
3809 if (!nx) {
3810 /* If the key is about a swapped value, we want a new key object
3811 * to overwrite the old. So we delete the old key in the database.
3812 * This will also make sure that swap pages about the old object
3813 * will be marked as free. */
3814 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3815 incrRefCount(c->argv[1]);
3816 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3817 incrRefCount(c->argv[2]);
3818 } else {
3819 addReply(c,shared.czero);
3820 return;
3821 }
3822 } else {
3823 incrRefCount(c->argv[1]);
3824 incrRefCount(c->argv[2]);
3825 }
3826 server.dirty++;
3827 removeExpire(c->db,c->argv[1]);
3828 addReply(c, nx ? shared.cone : shared.ok);
3829 }
3830
3831 static void setCommand(redisClient *c) {
3832 setGenericCommand(c,0);
3833 }
3834
3835 static void setnxCommand(redisClient *c) {
3836 setGenericCommand(c,1);
3837 }
3838
3839 static int getGenericCommand(redisClient *c) {
3840 robj *o;
3841
3842 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3843 return REDIS_OK;
3844
3845 if (o->type != REDIS_STRING) {
3846 addReply(c,shared.wrongtypeerr);
3847 return REDIS_ERR;
3848 } else {
3849 addReplyBulk(c,o);
3850 return REDIS_OK;
3851 }
3852 }
3853
3854 static void getCommand(redisClient *c) {
3855 getGenericCommand(c);
3856 }
3857
3858 static void getsetCommand(redisClient *c) {
3859 if (getGenericCommand(c) == REDIS_ERR) return;
3860 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3861 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3862 } else {
3863 incrRefCount(c->argv[1]);
3864 }
3865 incrRefCount(c->argv[2]);
3866 server.dirty++;
3867 removeExpire(c->db,c->argv[1]);
3868 }
3869
3870 static void mgetCommand(redisClient *c) {
3871 int j;
3872
3873 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3874 for (j = 1; j < c->argc; j++) {
3875 robj *o = lookupKeyRead(c->db,c->argv[j]);
3876 if (o == NULL) {
3877 addReply(c,shared.nullbulk);
3878 } else {
3879 if (o->type != REDIS_STRING) {
3880 addReply(c,shared.nullbulk);
3881 } else {
3882 addReplyBulk(c,o);
3883 }
3884 }
3885 }
3886 }
3887
3888 static void msetGenericCommand(redisClient *c, int nx) {
3889 int j, busykeys = 0;
3890
3891 if ((c->argc % 2) == 0) {
3892 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3893 return;
3894 }
3895 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3896 * set nothing at all if at least one already key exists. */
3897 if (nx) {
3898 for (j = 1; j < c->argc; j += 2) {
3899 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3900 busykeys++;
3901 }
3902 }
3903 }
3904 if (busykeys) {
3905 addReply(c, shared.czero);
3906 return;
3907 }
3908
3909 for (j = 1; j < c->argc; j += 2) {
3910 int retval;
3911
3912 tryObjectEncoding(c->argv[j+1]);
3913 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3914 if (retval == DICT_ERR) {
3915 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3916 incrRefCount(c->argv[j+1]);
3917 } else {
3918 incrRefCount(c->argv[j]);
3919 incrRefCount(c->argv[j+1]);
3920 }
3921 removeExpire(c->db,c->argv[j]);
3922 }
3923 server.dirty += (c->argc-1)/2;
3924 addReply(c, nx ? shared.cone : shared.ok);
3925 }
3926
3927 static void msetCommand(redisClient *c) {
3928 msetGenericCommand(c,0);
3929 }
3930
3931 static void msetnxCommand(redisClient *c) {
3932 msetGenericCommand(c,1);
3933 }
3934
3935 static void incrDecrCommand(redisClient *c, long long incr) {
3936 long long value;
3937 int retval;
3938 robj *o;
3939
3940 o = lookupKeyWrite(c->db,c->argv[1]);
3941 if (o == NULL) {
3942 value = 0;
3943 } else {
3944 if (o->type != REDIS_STRING) {
3945 value = 0;
3946 } else {
3947 char *eptr;
3948
3949 if (o->encoding == REDIS_ENCODING_RAW)
3950 value = strtoll(o->ptr, &eptr, 10);
3951 else if (o->encoding == REDIS_ENCODING_INT)
3952 value = (long)o->ptr;
3953 else
3954 redisAssert(1 != 1);
3955 }
3956 }
3957
3958 value += incr;
3959 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3960 tryObjectEncoding(o);
3961 retval = dictAdd(c->db->dict,c->argv[1],o);
3962 if (retval == DICT_ERR) {
3963 dictReplace(c->db->dict,c->argv[1],o);
3964 removeExpire(c->db,c->argv[1]);
3965 } else {
3966 incrRefCount(c->argv[1]);
3967 }
3968 server.dirty++;
3969 addReply(c,shared.colon);
3970 addReply(c,o);
3971 addReply(c,shared.crlf);
3972 }
3973
3974 static void incrCommand(redisClient *c) {
3975 incrDecrCommand(c,1);
3976 }
3977
3978 static void decrCommand(redisClient *c) {
3979 incrDecrCommand(c,-1);
3980 }
3981
3982 static void incrbyCommand(redisClient *c) {
3983 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3984 incrDecrCommand(c,incr);
3985 }
3986
3987 static void decrbyCommand(redisClient *c) {
3988 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3989 incrDecrCommand(c,-incr);
3990 }
3991
3992 static void appendCommand(redisClient *c) {
3993 int retval;
3994 size_t totlen;
3995 robj *o;
3996
3997 o = lookupKeyWrite(c->db,c->argv[1]);
3998 if (o == NULL) {
3999 /* Create the key */
4000 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4001 incrRefCount(c->argv[1]);
4002 incrRefCount(c->argv[2]);
4003 totlen = stringObjectLen(c->argv[2]);
4004 } else {
4005 dictEntry *de;
4006
4007 de = dictFind(c->db->dict,c->argv[1]);
4008 assert(de != NULL);
4009
4010 o = dictGetEntryVal(de);
4011 if (o->type != REDIS_STRING) {
4012 addReply(c,shared.wrongtypeerr);
4013 return;
4014 }
4015 /* If the object is specially encoded or shared we have to make
4016 * a copy */
4017 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4018 robj *decoded = getDecodedObject(o);
4019
4020 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4021 decrRefCount(decoded);
4022 dictReplace(c->db->dict,c->argv[1],o);
4023 }
4024 /* APPEND! */
4025 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4026 o->ptr = sdscatlen(o->ptr,
4027 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4028 } else {
4029 o->ptr = sdscatprintf(o->ptr, "%ld",
4030 (unsigned long) c->argv[2]->ptr);
4031 }
4032 totlen = sdslen(o->ptr);
4033 }
4034 server.dirty++;
4035 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4036 }
4037
4038 static void substrCommand(redisClient *c) {
4039 robj *o;
4040 long start = atoi(c->argv[2]->ptr);
4041 long end = atoi(c->argv[3]->ptr);
4042 size_t rangelen, strlen;
4043 sds range;
4044
4045 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4046 checkType(c,o,REDIS_STRING)) return;
4047
4048 o = getDecodedObject(o);
4049 strlen = sdslen(o->ptr);
4050
4051 /* convert negative indexes */
4052 if (start < 0) start = strlen+start;
4053 if (end < 0) end = strlen+end;
4054 if (start < 0) start = 0;
4055 if (end < 0) end = 0;
4056
4057 /* indexes sanity checks */
4058 if (start > end || (size_t)start >= strlen) {
4059 /* Out of range start or start > end result in null reply */
4060 addReply(c,shared.nullbulk);
4061 decrRefCount(o);
4062 return;
4063 }
4064 if ((size_t)end >= strlen) end = strlen-1;
4065 rangelen = (end-start)+1;
4066
4067 /* Return the result */
4068 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4069 range = sdsnewlen((char*)o->ptr+start,rangelen);
4070 addReplySds(c,range);
4071 addReply(c,shared.crlf);
4072 decrRefCount(o);
4073 }
4074
4075 /* ========================= Type agnostic commands ========================= */
4076
4077 static void delCommand(redisClient *c) {
4078 int deleted = 0, j;
4079
4080 for (j = 1; j < c->argc; j++) {
4081 if (deleteKey(c->db,c->argv[j])) {
4082 server.dirty++;
4083 deleted++;
4084 }
4085 }
4086 addReplyLong(c,deleted);
4087 }
4088
4089 static void existsCommand(redisClient *c) {
4090 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4091 }
4092
4093 static void selectCommand(redisClient *c) {
4094 int id = atoi(c->argv[1]->ptr);
4095
4096 if (selectDb(c,id) == REDIS_ERR) {
4097 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4098 } else {
4099 addReply(c,shared.ok);
4100 }
4101 }
4102
4103 static void randomkeyCommand(redisClient *c) {
4104 dictEntry *de;
4105
4106 while(1) {
4107 de = dictGetRandomKey(c->db->dict);
4108 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4109 }
4110 if (de == NULL) {
4111 addReply(c,shared.plus);
4112 addReply(c,shared.crlf);
4113 } else {
4114 addReply(c,shared.plus);
4115 addReply(c,dictGetEntryKey(de));
4116 addReply(c,shared.crlf);
4117 }
4118 }
4119
4120 static void keysCommand(redisClient *c) {
4121 dictIterator *di;
4122 dictEntry *de;
4123 sds pattern = c->argv[1]->ptr;
4124 int plen = sdslen(pattern);
4125 unsigned long numkeys = 0;
4126 robj *lenobj = createObject(REDIS_STRING,NULL);
4127
4128 di = dictGetIterator(c->db->dict);
4129 addReply(c,lenobj);
4130 decrRefCount(lenobj);
4131 while((de = dictNext(di)) != NULL) {
4132 robj *keyobj = dictGetEntryKey(de);
4133
4134 sds key = keyobj->ptr;
4135 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4136 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4137 if (expireIfNeeded(c->db,keyobj) == 0) {
4138 addReplyBulk(c,keyobj);
4139 numkeys++;
4140 }
4141 }
4142 }
4143 dictReleaseIterator(di);
4144 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4145 }
4146
4147 static void dbsizeCommand(redisClient *c) {
4148 addReplySds(c,
4149 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4150 }
4151
4152 static void lastsaveCommand(redisClient *c) {
4153 addReplySds(c,
4154 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4155 }
4156
4157 static void typeCommand(redisClient *c) {
4158 robj *o;
4159 char *type;
4160
4161 o = lookupKeyRead(c->db,c->argv[1]);
4162 if (o == NULL) {
4163 type = "+none";
4164 } else {
4165 switch(o->type) {
4166 case REDIS_STRING: type = "+string"; break;
4167 case REDIS_LIST: type = "+list"; break;
4168 case REDIS_SET: type = "+set"; break;
4169 case REDIS_ZSET: type = "+zset"; break;
4170 case REDIS_HASH: type = "+hash"; break;
4171 default: type = "+unknown"; break;
4172 }
4173 }
4174 addReplySds(c,sdsnew(type));
4175 addReply(c,shared.crlf);
4176 }
4177
4178 static void saveCommand(redisClient *c) {
4179 if (server.bgsavechildpid != -1) {
4180 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4181 return;
4182 }
4183 if (rdbSave(server.dbfilename) == REDIS_OK) {
4184 addReply(c,shared.ok);
4185 } else {
4186 addReply(c,shared.err);
4187 }
4188 }
4189
4190 static void bgsaveCommand(redisClient *c) {
4191 if (server.bgsavechildpid != -1) {
4192 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4193 return;
4194 }
4195 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4196 char *status = "+Background saving started\r\n";
4197 addReplySds(c,sdsnew(status));
4198 } else {
4199 addReply(c,shared.err);
4200 }
4201 }
4202
4203 static void shutdownCommand(redisClient *c) {
4204 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4205 /* Kill the saving child if there is a background saving in progress.
4206 We want to avoid race conditions, for instance our saving child may
4207 overwrite the synchronous saving did by SHUTDOWN. */
4208 if (server.bgsavechildpid != -1) {
4209 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4210 kill(server.bgsavechildpid,SIGKILL);
4211 rdbRemoveTempFile(server.bgsavechildpid);
4212 }
4213 if (server.appendonly) {
4214 /* Append only file: fsync() the AOF and exit */
4215 fsync(server.appendfd);
4216 if (server.vm_enabled) unlink(server.vm_swap_file);
4217 exit(0);
4218 } else {
4219 /* Snapshotting. Perform a SYNC SAVE and exit */
4220 if (rdbSave(server.dbfilename) == REDIS_OK) {
4221 if (server.daemonize)
4222 unlink(server.pidfile);
4223 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4224 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4225 if (server.vm_enabled) unlink(server.vm_swap_file);
4226 exit(0);
4227 } else {
4228 /* Ooops.. error saving! The best we can do is to continue
4229 * operating. Note that if there was a background saving process,
4230 * in the next cron() Redis will be notified that the background
4231 * saving aborted, handling special stuff like slaves pending for
4232 * synchronization... */
4233 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4234 addReplySds(c,
4235 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4236 }
4237 }
4238 }
4239
4240 static void renameGenericCommand(redisClient *c, int nx) {
4241 robj *o;
4242
4243 /* To use the same key as src and dst is probably an error */
4244 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4245 addReply(c,shared.sameobjecterr);
4246 return;
4247 }
4248
4249 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4250 return;
4251
4252 incrRefCount(o);
4253 deleteIfVolatile(c->db,c->argv[2]);
4254 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4255 if (nx) {
4256 decrRefCount(o);
4257 addReply(c,shared.czero);
4258 return;
4259 }
4260 dictReplace(c->db->dict,c->argv[2],o);
4261 } else {
4262 incrRefCount(c->argv[2]);
4263 }
4264 deleteKey(c->db,c->argv[1]);
4265 server.dirty++;
4266 addReply(c,nx ? shared.cone : shared.ok);
4267 }
4268
4269 static void renameCommand(redisClient *c) {
4270 renameGenericCommand(c,0);
4271 }
4272
4273 static void renamenxCommand(redisClient *c) {
4274 renameGenericCommand(c,1);
4275 }
4276
4277 static void moveCommand(redisClient *c) {
4278 robj *o;
4279 redisDb *src, *dst;
4280 int srcid;
4281
4282 /* Obtain source and target DB pointers */
4283 src = c->db;
4284 srcid = c->db->id;
4285 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4286 addReply(c,shared.outofrangeerr);
4287 return;
4288 }
4289 dst = c->db;
4290 selectDb(c,srcid); /* Back to the source DB */
4291
4292 /* If the user is moving using as target the same
4293 * DB as the source DB it is probably an error. */
4294 if (src == dst) {
4295 addReply(c,shared.sameobjecterr);
4296 return;
4297 }
4298
4299 /* Check if the element exists and get a reference */
4300 o = lookupKeyWrite(c->db,c->argv[1]);
4301 if (!o) {
4302 addReply(c,shared.czero);
4303 return;
4304 }
4305
4306 /* Try to add the element to the target DB */
4307 deleteIfVolatile(dst,c->argv[1]);
4308 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4309 addReply(c,shared.czero);
4310 return;
4311 }
4312 incrRefCount(c->argv[1]);
4313 incrRefCount(o);
4314
4315 /* OK! key moved, free the entry in the source DB */
4316 deleteKey(src,c->argv[1]);
4317 server.dirty++;
4318 addReply(c,shared.cone);
4319 }
4320
4321 /* =================================== Lists ================================ */
4322 static void pushGenericCommand(redisClient *c, int where) {
4323 robj *lobj;
4324 list *list;
4325
4326 lobj = lookupKeyWrite(c->db,c->argv[1]);
4327 if (lobj == NULL) {
4328 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4329 addReply(c,shared.cone);
4330 return;
4331 }
4332 lobj = createListObject();
4333 list = lobj->ptr;
4334 if (where == REDIS_HEAD) {
4335 listAddNodeHead(list,c->argv[2]);
4336 } else {
4337 listAddNodeTail(list,c->argv[2]);
4338 }
4339 dictAdd(c->db->dict,c->argv[1],lobj);
4340 incrRefCount(c->argv[1]);
4341 incrRefCount(c->argv[2]);
4342 } else {
4343 if (lobj->type != REDIS_LIST) {
4344 addReply(c,shared.wrongtypeerr);
4345 return;
4346 }
4347 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4348 addReply(c,shared.cone);
4349 return;
4350 }
4351 list = lobj->ptr;
4352 if (where == REDIS_HEAD) {
4353 listAddNodeHead(list,c->argv[2]);
4354 } else {
4355 listAddNodeTail(list,c->argv[2]);
4356 }
4357 incrRefCount(c->argv[2]);
4358 }
4359 server.dirty++;
4360 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4361 }
4362
4363 static void lpushCommand(redisClient *c) {
4364 pushGenericCommand(c,REDIS_HEAD);
4365 }
4366
4367 static void rpushCommand(redisClient *c) {
4368 pushGenericCommand(c,REDIS_TAIL);
4369 }
4370
4371 static void llenCommand(redisClient *c) {
4372 robj *o;
4373 list *l;
4374
4375 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4376 checkType(c,o,REDIS_LIST)) return;
4377
4378 l = o->ptr;
4379 addReplyUlong(c,listLength(l));
4380 }
4381
4382 static void lindexCommand(redisClient *c) {
4383 robj *o;
4384 int index = atoi(c->argv[2]->ptr);
4385 list *list;
4386 listNode *ln;
4387
4388 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4389 checkType(c,o,REDIS_LIST)) return;
4390 list = o->ptr;
4391
4392 ln = listIndex(list, index);
4393 if (ln == NULL) {
4394 addReply(c,shared.nullbulk);
4395 } else {
4396 robj *ele = listNodeValue(ln);
4397 addReplyBulk(c,ele);
4398 }
4399 }
4400
4401 static void lsetCommand(redisClient *c) {
4402 robj *o;
4403 int index = atoi(c->argv[2]->ptr);
4404 list *list;
4405 listNode *ln;
4406
4407 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4408 checkType(c,o,REDIS_LIST)) return;
4409 list = o->ptr;
4410
4411 ln = listIndex(list, index);
4412 if (ln == NULL) {
4413 addReply(c,shared.outofrangeerr);
4414 } else {
4415 robj *ele = listNodeValue(ln);
4416
4417 decrRefCount(ele);
4418 listNodeValue(ln) = c->argv[3];
4419 incrRefCount(c->argv[3]);
4420 addReply(c,shared.ok);
4421 server.dirty++;
4422 }
4423 }
4424
4425 static void popGenericCommand(redisClient *c, int where) {
4426 robj *o;
4427 list *list;
4428 listNode *ln;
4429
4430 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4431 checkType(c,o,REDIS_LIST)) return;
4432 list = o->ptr;
4433
4434 if (where == REDIS_HEAD)
4435 ln = listFirst(list);
4436 else
4437 ln = listLast(list);
4438
4439 if (ln == NULL) {
4440 addReply(c,shared.nullbulk);
4441 } else {
4442 robj *ele = listNodeValue(ln);
4443 addReplyBulk(c,ele);
4444 listDelNode(list,ln);
4445 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4446 server.dirty++;
4447 }
4448 }
4449
4450 static void lpopCommand(redisClient *c) {
4451 popGenericCommand(c,REDIS_HEAD);
4452 }
4453
4454 static void rpopCommand(redisClient *c) {
4455 popGenericCommand(c,REDIS_TAIL);
4456 }
4457
4458 static void lrangeCommand(redisClient *c) {
4459 robj *o;
4460 int start = atoi(c->argv[2]->ptr);
4461 int end = atoi(c->argv[3]->ptr);
4462 int llen;
4463 int rangelen, j;
4464 list *list;
4465 listNode *ln;
4466 robj *ele;
4467
4468 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4469 checkType(c,o,REDIS_LIST)) return;
4470 list = o->ptr;
4471 llen = listLength(list);
4472
4473 /* convert negative indexes */
4474 if (start < 0) start = llen+start;
4475 if (end < 0) end = llen+end;
4476 if (start < 0) start = 0;
4477 if (end < 0) end = 0;
4478
4479 /* indexes sanity checks */
4480 if (start > end || start >= llen) {
4481 /* Out of range start or start > end result in empty list */
4482 addReply(c,shared.emptymultibulk);
4483 return;
4484 }
4485 if (end >= llen) end = llen-1;
4486 rangelen = (end-start)+1;
4487
4488 /* Return the result in form of a multi-bulk reply */
4489 ln = listIndex(list, start);
4490 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4491 for (j = 0; j < rangelen; j++) {
4492 ele = listNodeValue(ln);
4493 addReplyBulk(c,ele);
4494 ln = ln->next;
4495 }
4496 }
4497
4498 static void ltrimCommand(redisClient *c) {
4499 robj *o;
4500 int start = atoi(c->argv[2]->ptr);
4501 int end = atoi(c->argv[3]->ptr);
4502 int llen;
4503 int j, ltrim, rtrim;
4504 list *list;
4505 listNode *ln;
4506
4507 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4508 checkType(c,o,REDIS_LIST)) return;
4509 list = o->ptr;
4510 llen = listLength(list);
4511
4512 /* convert negative indexes */
4513 if (start < 0) start = llen+start;
4514 if (end < 0) end = llen+end;
4515 if (start < 0) start = 0;
4516 if (end < 0) end = 0;
4517
4518 /* indexes sanity checks */
4519 if (start > end || start >= llen) {
4520 /* Out of range start or start > end result in empty list */
4521 ltrim = llen;
4522 rtrim = 0;
4523 } else {
4524 if (end >= llen) end = llen-1;
4525 ltrim = start;
4526 rtrim = llen-end-1;
4527 }
4528
4529 /* Remove list elements to perform the trim */
4530 for (j = 0; j < ltrim; j++) {
4531 ln = listFirst(list);
4532 listDelNode(list,ln);
4533 }
4534 for (j = 0; j < rtrim; j++) {
4535 ln = listLast(list);
4536 listDelNode(list,ln);
4537 }
4538 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4539 server.dirty++;
4540 addReply(c,shared.ok);
4541 }
4542
4543 static void lremCommand(redisClient *c) {
4544 robj *o;
4545 list *list;
4546 listNode *ln, *next;
4547 int toremove = atoi(c->argv[2]->ptr);
4548 int removed = 0;
4549 int fromtail = 0;
4550
4551 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4552 checkType(c,o,REDIS_LIST)) return;
4553 list = o->ptr;
4554
4555 if (toremove < 0) {
4556 toremove = -toremove;
4557 fromtail = 1;
4558 }
4559 ln = fromtail ? list->tail : list->head;
4560 while (ln) {
4561 robj *ele = listNodeValue(ln);
4562
4563 next = fromtail ? ln->prev : ln->next;
4564 if (compareStringObjects(ele,c->argv[3]) == 0) {
4565 listDelNode(list,ln);
4566 server.dirty++;
4567 removed++;
4568 if (toremove && removed == toremove) break;
4569 }
4570 ln = next;
4571 }
4572 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4573 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4574 }
4575
4576 /* This is the semantic of this command:
4577 * RPOPLPUSH srclist dstlist:
4578 * IF LLEN(srclist) > 0
4579 * element = RPOP srclist
4580 * LPUSH dstlist element
4581 * RETURN element
4582 * ELSE
4583 * RETURN nil
4584 * END
4585 * END
4586 *
4587 * The idea is to be able to get an element from a list in a reliable way
4588 * since the element is not just returned but pushed against another list
4589 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4590 */
4591 static void rpoplpushcommand(redisClient *c) {
4592 robj *sobj;
4593 list *srclist;
4594 listNode *ln;
4595
4596 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4597 checkType(c,sobj,REDIS_LIST)) return;
4598 srclist = sobj->ptr;
4599 ln = listLast(srclist);
4600
4601 if (ln == NULL) {
4602 addReply(c,shared.nullbulk);
4603 } else {
4604 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4605 robj *ele = listNodeValue(ln);
4606 list *dstlist;
4607
4608 if (dobj && dobj->type != REDIS_LIST) {
4609 addReply(c,shared.wrongtypeerr);
4610 return;
4611 }
4612
4613 /* Add the element to the target list (unless it's directly
4614 * passed to some BLPOP-ing client */
4615 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4616 if (dobj == NULL) {
4617 /* Create the list if the key does not exist */
4618 dobj = createListObject();
4619 dictAdd(c->db->dict,c->argv[2],dobj);
4620 incrRefCount(c->argv[2]);
4621 }
4622 dstlist = dobj->ptr;
4623 listAddNodeHead(dstlist,ele);
4624 incrRefCount(ele);
4625 }
4626
4627 /* Send the element to the client as reply as well */
4628 addReplyBulk(c,ele);
4629
4630 /* Finally remove the element from the source list */
4631 listDelNode(srclist,ln);
4632 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4633 server.dirty++;
4634 }
4635 }
4636
4637 /* ==================================== Sets ================================ */
4638
4639 static void saddCommand(redisClient *c) {
4640 robj *set;
4641
4642 set = lookupKeyWrite(c->db,c->argv[1]);
4643 if (set == NULL) {
4644 set = createSetObject();
4645 dictAdd(c->db->dict,c->argv[1],set);
4646 incrRefCount(c->argv[1]);
4647 } else {
4648 if (set->type != REDIS_SET) {
4649 addReply(c,shared.wrongtypeerr);
4650 return;
4651 }
4652 }
4653 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4654 incrRefCount(c->argv[2]);
4655 server.dirty++;
4656 addReply(c,shared.cone);
4657 } else {
4658 addReply(c,shared.czero);
4659 }
4660 }
4661
4662 static void sremCommand(redisClient *c) {
4663 robj *set;
4664
4665 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4666 checkType(c,set,REDIS_SET)) return;
4667
4668 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4669 server.dirty++;
4670 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4671 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4672 addReply(c,shared.cone);
4673 } else {
4674 addReply(c,shared.czero);
4675 }
4676 }
4677
4678 static void smoveCommand(redisClient *c) {
4679 robj *srcset, *dstset;
4680
4681 srcset = lookupKeyWrite(c->db,c->argv[1]);
4682 dstset = lookupKeyWrite(c->db,c->argv[2]);
4683
4684 /* If the source key does not exist return 0, if it's of the wrong type
4685 * raise an error */
4686 if (srcset == NULL || srcset->type != REDIS_SET) {
4687 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4688 return;
4689 }
4690 /* Error if the destination key is not a set as well */
4691 if (dstset && dstset->type != REDIS_SET) {
4692 addReply(c,shared.wrongtypeerr);
4693 return;
4694 }
4695 /* Remove the element from the source set */
4696 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4697 /* Key not found in the src set! return zero */
4698 addReply(c,shared.czero);
4699 return;
4700 }
4701 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4702 deleteKey(c->db,c->argv[1]);
4703 server.dirty++;
4704 /* Add the element to the destination set */
4705 if (!dstset) {
4706 dstset = createSetObject();
4707 dictAdd(c->db->dict,c->argv[2],dstset);
4708 incrRefCount(c->argv[2]);
4709 }
4710 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4711 incrRefCount(c->argv[3]);
4712 addReply(c,shared.cone);
4713 }
4714
4715 static void sismemberCommand(redisClient *c) {
4716 robj *set;
4717
4718 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4719 checkType(c,set,REDIS_SET)) return;
4720
4721 if (dictFind(set->ptr,c->argv[2]))
4722 addReply(c,shared.cone);
4723 else
4724 addReply(c,shared.czero);
4725 }
4726
4727 static void scardCommand(redisClient *c) {
4728 robj *o;
4729 dict *s;
4730
4731 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4732 checkType(c,o,REDIS_SET)) return;
4733
4734 s = o->ptr;
4735 addReplyUlong(c,dictSize(s));
4736 }
4737
4738 static void spopCommand(redisClient *c) {
4739 robj *set;
4740 dictEntry *de;
4741
4742 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4743 checkType(c,set,REDIS_SET)) return;
4744
4745 de = dictGetRandomKey(set->ptr);
4746 if (de == NULL) {
4747 addReply(c,shared.nullbulk);
4748 } else {
4749 robj *ele = dictGetEntryKey(de);
4750
4751 addReplyBulk(c,ele);
4752 dictDelete(set->ptr,ele);
4753 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4754 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4755 server.dirty++;
4756 }
4757 }
4758
4759 static void srandmemberCommand(redisClient *c) {
4760 robj *set;
4761 dictEntry *de;
4762
4763 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4764 checkType(c,set,REDIS_SET)) return;
4765
4766 de = dictGetRandomKey(set->ptr);
4767 if (de == NULL) {
4768 addReply(c,shared.nullbulk);
4769 } else {
4770 robj *ele = dictGetEntryKey(de);
4771
4772 addReplyBulk(c,ele);
4773 }
4774 }
4775
4776 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4777 dict **d1 = (void*) s1, **d2 = (void*) s2;
4778
4779 return dictSize(*d1)-dictSize(*d2);
4780 }
4781
4782 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4783 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4784 dictIterator *di;
4785 dictEntry *de;
4786 robj *lenobj = NULL, *dstset = NULL;
4787 unsigned long j, cardinality = 0;
4788
4789 for (j = 0; j < setsnum; j++) {
4790 robj *setobj;
4791
4792 setobj = dstkey ?
4793 lookupKeyWrite(c->db,setskeys[j]) :
4794 lookupKeyRead(c->db,setskeys[j]);
4795 if (!setobj) {
4796 zfree(dv);
4797 if (dstkey) {
4798 if (deleteKey(c->db,dstkey))
4799 server.dirty++;
4800 addReply(c,shared.czero);
4801 } else {
4802 addReply(c,shared.nullmultibulk);
4803 }
4804 return;
4805 }
4806 if (setobj->type != REDIS_SET) {
4807 zfree(dv);
4808 addReply(c,shared.wrongtypeerr);
4809 return;
4810 }
4811 dv[j] = setobj->ptr;
4812 }
4813 /* Sort sets from the smallest to largest, this will improve our
4814 * algorithm's performace */
4815 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4816
4817 /* The first thing we should output is the total number of elements...
4818 * since this is a multi-bulk write, but at this stage we don't know
4819 * the intersection set size, so we use a trick, append an empty object
4820 * to the output list and save the pointer to later modify it with the
4821 * right length */
4822 if (!dstkey) {
4823 lenobj = createObject(REDIS_STRING,NULL);
4824 addReply(c,lenobj);
4825 decrRefCount(lenobj);
4826 } else {
4827 /* If we have a target key where to store the resulting set
4828 * create this key with an empty set inside */
4829 dstset = createSetObject();
4830 }
4831
4832 /* Iterate all the elements of the first (smallest) set, and test
4833 * the element against all the other sets, if at least one set does
4834 * not include the element it is discarded */
4835 di = dictGetIterator(dv[0]);
4836
4837 while((de = dictNext(di)) != NULL) {
4838 robj *ele;
4839
4840 for (j = 1; j < setsnum; j++)
4841 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4842 if (j != setsnum)
4843 continue; /* at least one set does not contain the member */
4844 ele = dictGetEntryKey(de);
4845 if (!dstkey) {
4846 addReplyBulk(c,ele);
4847 cardinality++;
4848 } else {
4849 dictAdd(dstset->ptr,ele,NULL);
4850 incrRefCount(ele);
4851 }
4852 }
4853 dictReleaseIterator(di);
4854
4855 if (dstkey) {
4856 /* Store the resulting set into the target, if the intersection
4857 * is not an empty set. */
4858 deleteKey(c->db,dstkey);
4859 if (dictSize((dict*)dstset->ptr) > 0) {
4860 dictAdd(c->db->dict,dstkey,dstset);
4861 incrRefCount(dstkey);
4862 addReplyLong(c,dictSize((dict*)dstset->ptr));
4863 } else {
4864 decrRefCount(dstset);
4865 addReply(c,shared.czero);
4866 }
4867 server.dirty++;
4868 } else {
4869 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4870 }
4871 zfree(dv);
4872 }
4873
4874 static void sinterCommand(redisClient *c) {
4875 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4876 }
4877
4878 static void sinterstoreCommand(redisClient *c) {
4879 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4880 }
4881
/* Operation selectors taken as 'op' argument by the generic set and
 * sorted set commands. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4885
4886 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4887 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4888 dictIterator *di;
4889 dictEntry *de;
4890 robj *dstset = NULL;
4891 int j, cardinality = 0;
4892
4893 for (j = 0; j < setsnum; j++) {
4894 robj *setobj;
4895
4896 setobj = dstkey ?
4897 lookupKeyWrite(c->db,setskeys[j]) :
4898 lookupKeyRead(c->db,setskeys[j]);
4899 if (!setobj) {
4900 dv[j] = NULL;
4901 continue;
4902 }
4903 if (setobj->type != REDIS_SET) {
4904 zfree(dv);
4905 addReply(c,shared.wrongtypeerr);
4906 return;
4907 }
4908 dv[j] = setobj->ptr;
4909 }
4910
4911 /* We need a temp set object to store our union. If the dstkey
4912 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4913 * this set object will be the resulting object to set into the target key*/
4914 dstset = createSetObject();
4915
4916 /* Iterate all the elements of all the sets, add every element a single
4917 * time to the result set */
4918 for (j = 0; j < setsnum; j++) {
4919 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4920 if (!dv[j]) continue; /* non existing keys are like empty sets */
4921
4922 di = dictGetIterator(dv[j]);
4923
4924 while((de = dictNext(di)) != NULL) {
4925 robj *ele;
4926
4927 /* dictAdd will not add the same element multiple times */
4928 ele = dictGetEntryKey(de);
4929 if (op == REDIS_OP_UNION || j == 0) {
4930 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4931 incrRefCount(ele);
4932 cardinality++;
4933 }
4934 } else if (op == REDIS_OP_DIFF) {
4935 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4936 cardinality--;
4937 }
4938 }
4939 }
4940 dictReleaseIterator(di);
4941
4942 /* result set is empty? Exit asap. */
4943 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4944 }
4945
4946 /* Output the content of the resulting set, if not in STORE mode */
4947 if (!dstkey) {
4948 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4949 di = dictGetIterator(dstset->ptr);
4950 while((de = dictNext(di)) != NULL) {
4951 robj *ele;
4952
4953 ele = dictGetEntryKey(de);
4954 addReplyBulk(c,ele);
4955 }
4956 dictReleaseIterator(di);
4957 decrRefCount(dstset);
4958 } else {
4959 /* If we have a target key where to store the resulting set
4960 * create this key with the result set inside */
4961 deleteKey(c->db,dstkey);
4962 if (dictSize((dict*)dstset->ptr) > 0) {
4963 dictAdd(c->db->dict,dstkey,dstset);
4964 incrRefCount(dstkey);
4965 addReplyLong(c,dictSize((dict*)dstset->ptr));
4966 } else {
4967 decrRefCount(dstset);
4968 addReply(c,shared.czero);
4969 }
4970 server.dirty++;
4971 }
4972 zfree(dv);
4973 }
4974
4975 static void sunionCommand(redisClient *c) {
4976 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4977 }
4978
4979 static void sunionstoreCommand(redisClient *c) {
4980 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4981 }
4982
4983 static void sdiffCommand(redisClient *c) {
4984 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4985 }
4986
4987 static void sdiffstoreCommand(redisClient *c) {
4988 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4989 }
4990
4991 /* ==================================== ZSets =============================== */
4992
4993 /* ZSETs are ordered sets using two data structures to hold the same elements
4994 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4995 * data structure.
4996 *
4997 * The elements are added to a hash table mapping Redis objects to scores.
4998 * At the same time the elements are added to a skip list mapping scores
4999 * to Redis objects (so objects are sorted by scores in this "view"). */
5000
5001 /* This skiplist implementation is almost a C translation of the original
5002 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5003 * Alternative to Balanced Trees", modified in three ways:
5004 * a) this implementation allows for repeated values.
5005 * b) the comparison is not just by key (our 'score') but by satellite data.
5006 * c) there is a back pointer, so it's a doubly linked list with the back
5007 * pointers being only at "level 1". This allows to traverse the list
5008 * from tail to head, useful for ZREVRANGE. */
5009
5010 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5011 zskiplistNode *zn = zmalloc(sizeof(*zn));
5012
5013 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5014 if (level > 0)
5015 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5016 zn->score = score;
5017 zn->obj = obj;
5018 return zn;
5019 }
5020
5021 static zskiplist *zslCreate(void) {
5022 int j;
5023 zskiplist *zsl;
5024
5025 zsl = zmalloc(sizeof(*zsl));
5026 zsl->level = 1;
5027 zsl->length = 0;
5028 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5029 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5030 zsl->header->forward[j] = NULL;
5031
5032 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5033 if (j < ZSKIPLIST_MAXLEVEL-1)
5034 zsl->header->span[j] = 0;
5035 }
5036 zsl->header->backward = NULL;
5037 zsl->tail = NULL;
5038 return zsl;
5039 }
5040
5041 static void zslFreeNode(zskiplistNode *node) {
5042 decrRefCount(node->obj);
5043 zfree(node->forward);
5044 zfree(node->span);
5045 zfree(node);
5046 }
5047
5048 static void zslFree(zskiplist *zsl) {
5049 zskiplistNode *node = zsl->header->forward[0], *next;
5050
5051 zfree(zsl->header->forward);
5052 zfree(zsl->header->span);
5053 zfree(zsl->header);
5054 while(node) {
5055 next = node->forward[0];
5056 zslFreeNode(node);
5057 node = next;
5058 }
5059 zfree(zsl);
5060 }
5061
5062 static int zslRandomLevel(void) {
5063 int level = 1;
5064 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5065 level += 1;
5066 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5067 }
5068
5069 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5070 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5071 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5072 int i, level;
5073
5074 x = zsl->header;
5075 for (i = zsl->level-1; i >= 0; i--) {
5076 /* store rank that is crossed to reach the insert position */
5077 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5078
5079 while (x->forward[i] &&
5080 (x->forward[i]->score < score ||
5081 (x->forward[i]->score == score &&
5082 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5083 rank[i] += i > 0 ? x->span[i-1] : 1;
5084 x = x->forward[i];
5085 }
5086 update[i] = x;
5087 }
5088 /* we assume the key is not already inside, since we allow duplicated
5089 * scores, and the re-insertion of score and redis object should never
5090 * happpen since the caller of zslInsert() should test in the hash table
5091 * if the element is already inside or not. */
5092 level = zslRandomLevel();
5093 if (level > zsl->level) {
5094 for (i = zsl->level; i < level; i++) {
5095 rank[i] = 0;
5096 update[i] = zsl->header;
5097 update[i]->span[i-1] = zsl->length;
5098 }
5099 zsl->level = level;
5100 }
5101 x = zslCreateNode(level,score,obj);
5102 for (i = 0; i < level; i++) {
5103 x->forward[i] = update[i]->forward[i];
5104 update[i]->forward[i] = x;
5105
5106 /* update span covered by update[i] as x is inserted here */
5107 if (i > 0) {
5108 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5109 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5110 }
5111 }
5112
5113 /* increment span for untouched levels */
5114 for (i = level; i < zsl->level; i++) {
5115 update[i]->span[i-1]++;
5116 }
5117
5118 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5119 if (x->forward[0])
5120 x->forward[0]->backward = x;
5121 else
5122 zsl->tail = x;
5123 zsl->length++;
5124 }
5125
5126 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5127 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5128 int i;
5129 for (i = 0; i < zsl->level; i++) {
5130 if (update[i]->forward[i] == x) {
5131 if (i > 0) {
5132 update[i]->span[i-1] += x->span[i-1] - 1;
5133 }
5134 update[i]->forward[i] = x->forward[i];
5135 } else {
5136 /* invariant: i > 0, because update[0]->forward[0]
5137 * is always equal to x */
5138 update[i]->span[i-1] -= 1;
5139 }
5140 }
5141 if (x->forward[0]) {
5142 x->forward[0]->backward = x->backward;
5143 } else {
5144 zsl->tail = x->backward;
5145 }
5146 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5147 zsl->level--;
5148 zsl->length--;
5149 }
5150
5151 /* Delete an element with matching score/object from the skiplist. */
5152 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5153 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5154 int i;
5155
5156 x = zsl->header;
5157 for (i = zsl->level-1; i >= 0; i--) {
5158 while (x->forward[i] &&
5159 (x->forward[i]->score < score ||
5160 (x->forward[i]->score == score &&
5161 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5162 x = x->forward[i];
5163 update[i] = x;
5164 }
5165 /* We may have multiple elements with the same score, what we need
5166 * is to find the element with both the right score and object. */
5167 x = x->forward[0];
5168 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5169 zslDeleteNode(zsl, x, update);
5170 zslFreeNode(x);
5171 return 1;
5172 } else {
5173 return 0; /* not found */
5174 }
5175 return 0; /* not found */
5176 }
5177
5178 /* Delete all the elements with score between min and max from the skiplist.
5179 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5180 * Note that this function takes the reference to the hash table view of the
5181 * sorted set, in order to remove the elements from the hash table too. */
5182 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5183 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5184 unsigned long removed = 0;
5185 int i;
5186
5187 x = zsl->header;
5188 for (i = zsl->level-1; i >= 0; i--) {
5189 while (x->forward[i] && x->forward[i]->score < min)
5190 x = x->forward[i];
5191 update[i] = x;
5192 }
5193 /* We may have multiple elements with the same score, what we need
5194 * is to find the element with both the right score and object. */
5195 x = x->forward[0];
5196 while (x && x->score <= max) {
5197 zskiplistNode *next = x->forward[0];
5198 zslDeleteNode(zsl, x, update);
5199 dictDelete(dict,x->obj);
5200 zslFreeNode(x);
5201 removed++;
5202 x = next;
5203 }
5204 return removed; /* not found */
5205 }
5206
5207 /* Delete all the elements with rank between start and end from the skiplist.
5208 * Start and end are inclusive. Note that start and end need to be 1-based */
5209 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5210 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5211 unsigned long traversed = 0, removed = 0;
5212 int i;
5213
5214 x = zsl->header;
5215 for (i = zsl->level-1; i >= 0; i--) {
5216 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5217 traversed += i > 0 ? x->span[i-1] : 1;
5218 x = x->forward[i];
5219 }
5220 update[i] = x;
5221 }
5222
5223 traversed++;
5224 x = x->forward[0];
5225 while (x && traversed <= end) {
5226 zskiplistNode *next = x->forward[0];
5227 zslDeleteNode(zsl, x, update);
5228 dictDelete(dict,x->obj);
5229 zslFreeNode(x);
5230 removed++;
5231 traversed++;
5232 x = next;
5233 }
5234 return removed;
5235 }
5236
5237 /* Find the first node having a score equal or greater than the specified one.
5238 * Returns NULL if there is no match. */
5239 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5240 zskiplistNode *x;
5241 int i;
5242
5243 x = zsl->header;
5244 for (i = zsl->level-1; i >= 0; i--) {
5245 while (x->forward[i] && x->forward[i]->score < score)
5246 x = x->forward[i];
5247 }
5248 /* We may have multiple elements with the same score, what we need
5249 * is to find the element with both the right score and object. */
5250 return x->forward[0];
5251 }
5252
5253 /* Find the rank for an element by both score and key.
5254 * Returns 0 when the element cannot be found, rank otherwise.
5255 * Note that the rank is 1-based due to the span of zsl->header to the
5256 * first element. */
5257 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5258 zskiplistNode *x;
5259 unsigned long rank = 0;
5260 int i;
5261
5262 x = zsl->header;
5263 for (i = zsl->level-1; i >= 0; i--) {
5264 while (x->forward[i] &&
5265 (x->forward[i]->score < score ||
5266 (x->forward[i]->score == score &&
5267 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5268 rank += i > 0 ? x->span[i-1] : 1;
5269 x = x->forward[i];
5270 }
5271
5272 /* x might be equal to zsl->header, so test if obj is non-NULL */
5273 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5274 return rank;
5275 }
5276 }
5277 return 0;
5278 }
5279
5280 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5281 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5282 zskiplistNode *x;
5283 unsigned long traversed = 0;
5284 int i;
5285
5286 x = zsl->header;
5287 for (i = zsl->level-1; i >= 0; i--) {
5288 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5289 {
5290 traversed += i > 0 ? x->span[i-1] : 1;
5291 x = x->forward[i];
5292 }
5293 if (traversed == rank) {
5294 return x;
5295 }
5296 }
5297 return NULL;
5298 }
5299
5300 /* The actual Z-commands implementations */
5301
5302 /* This generic command implements both ZADD and ZINCRBY.
5303 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5304 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5305 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5306 robj *zsetobj;
5307 zset *zs;
5308 double *score;
5309
5310 zsetobj = lookupKeyWrite(c->db,key);
5311 if (zsetobj == NULL) {
5312 zsetobj = createZsetObject();
5313 dictAdd(c->db->dict,key,zsetobj);
5314 incrRefCount(key);
5315 } else {
5316 if (zsetobj->type != REDIS_ZSET) {
5317 addReply(c,shared.wrongtypeerr);
5318 return;
5319 }
5320 }
5321 zs = zsetobj->ptr;
5322
5323 /* Ok now since we implement both ZADD and ZINCRBY here the code
5324 * needs to handle the two different conditions. It's all about setting
5325 * '*score', that is, the new score to set, to the right value. */
5326 score = zmalloc(sizeof(double));
5327 if (doincrement) {
5328 dictEntry *de;
5329
5330 /* Read the old score. If the element was not present starts from 0 */
5331 de = dictFind(zs->dict,ele);
5332 if (de) {
5333 double *oldscore = dictGetEntryVal(de);
5334 *score = *oldscore + scoreval;
5335 } else {
5336 *score = scoreval;
5337 }
5338 } else {
5339 *score = scoreval;
5340 }
5341
5342 /* What follows is a simple remove and re-insert operation that is common
5343 * to both ZADD and ZINCRBY... */
5344 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5345 /* case 1: New element */
5346 incrRefCount(ele); /* added to hash */
5347 zslInsert(zs->zsl,*score,ele);
5348 incrRefCount(ele); /* added to skiplist */
5349 server.dirty++;
5350 if (doincrement)
5351 addReplyDouble(c,*score);
5352 else
5353 addReply(c,shared.cone);
5354 } else {
5355 dictEntry *de;
5356 double *oldscore;
5357
5358 /* case 2: Score update operation */
5359 de = dictFind(zs->dict,ele);
5360 redisAssert(de != NULL);
5361 oldscore = dictGetEntryVal(de);
5362 if (*score != *oldscore) {
5363 int deleted;
5364
5365 /* Remove and insert the element in the skip list with new score */
5366 deleted = zslDelete(zs->zsl,*oldscore,ele);
5367 redisAssert(deleted != 0);
5368 zslInsert(zs->zsl,*score,ele);
5369 incrRefCount(ele);
5370 /* Update the score in the hash table */
5371 dictReplace(zs->dict,ele,score);
5372 server.dirty++;
5373 } else {
5374 zfree(score);
5375 }
5376 if (doincrement)
5377 addReplyDouble(c,*score);
5378 else
5379 addReply(c,shared.czero);
5380 }
5381 }
5382
5383 static void zaddCommand(redisClient *c) {
5384 double scoreval;
5385
5386 scoreval = strtod(c->argv[2]->ptr,NULL);
5387 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5388 }
5389
5390 static void zincrbyCommand(redisClient *c) {
5391 double scoreval;
5392
5393 scoreval = strtod(c->argv[2]->ptr,NULL);
5394 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5395 }
5396
5397 static void zremCommand(redisClient *c) {
5398 robj *zsetobj;
5399 zset *zs;
5400 dictEntry *de;
5401 double *oldscore;
5402 int deleted;
5403
5404 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5405 checkType(c,zsetobj,REDIS_ZSET)) return;
5406
5407 zs = zsetobj->ptr;
5408 de = dictFind(zs->dict,c->argv[2]);
5409 if (de == NULL) {
5410 addReply(c,shared.czero);
5411 return;
5412 }
5413 /* Delete from the skiplist */
5414 oldscore = dictGetEntryVal(de);
5415 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5416 redisAssert(deleted != 0);
5417
5418 /* Delete from the hash table */
5419 dictDelete(zs->dict,c->argv[2]);
5420 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5421 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5422 server.dirty++;
5423 addReply(c,shared.cone);
5424 }
5425
5426 static void zremrangebyscoreCommand(redisClient *c) {
5427 double min = strtod(c->argv[2]->ptr,NULL);
5428 double max = strtod(c->argv[3]->ptr,NULL);
5429 long deleted;
5430 robj *zsetobj;
5431 zset *zs;
5432
5433 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5434 checkType(c,zsetobj,REDIS_ZSET)) return;
5435
5436 zs = zsetobj->ptr;
5437 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5438 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5439 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5440 server.dirty += deleted;
5441 addReplyLong(c,deleted);
5442 }
5443
5444 static void zremrangebyrankCommand(redisClient *c) {
5445 int start = atoi(c->argv[2]->ptr);
5446 int end = atoi(c->argv[3]->ptr);
5447 int llen;
5448 long deleted;
5449 robj *zsetobj;
5450 zset *zs;
5451
5452 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5453 checkType(c,zsetobj,REDIS_ZSET)) return;
5454 zs = zsetobj->ptr;
5455 llen = zs->zsl->length;
5456
5457 /* convert negative indexes */
5458 if (start < 0) start = llen+start;
5459 if (end < 0) end = llen+end;
5460 if (start < 0) start = 0;
5461 if (end < 0) end = 0;
5462
5463 /* indexes sanity checks */
5464 if (start > end || start >= llen) {
5465 addReply(c,shared.czero);
5466 return;
5467 }
5468 if (end >= llen) end = llen-1;
5469
5470 /* increment start and end because zsl*Rank functions
5471 * use 1-based rank */
5472 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5473 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5474 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5475 server.dirty += deleted;
5476 addReplyLong(c, deleted);
5477 }
5478
5479 typedef struct {
5480 dict *dict;
5481 double weight;
5482 } zsetopsrc;
5483
5484 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5485 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5486 unsigned long size1, size2;
5487 size1 = d1->dict ? dictSize(d1->dict) : 0;
5488 size2 = d2->dict ? dictSize(d2->dict) : 0;
5489 return size1 - size2;
5490 }
5491
/* Ways of combining the scores of an element found in more than one input
 * of ZUNION / ZINTER (the AGGREGATE option). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5495
5496 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5497 if (aggregate == REDIS_AGGR_SUM) {
5498 *target = *target + val;
5499 } else if (aggregate == REDIS_AGGR_MIN) {
5500 *target = val < *target ? val : *target;
5501 } else if (aggregate == REDIS_AGGR_MAX) {
5502 *target = val > *target ? val : *target;
5503 } else {
5504 /* safety net */
5505 redisAssert(0 != 0);
5506 }
5507 }
5508
5509 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5510 int i, j, zsetnum;
5511 int aggregate = REDIS_AGGR_SUM;
5512 zsetopsrc *src;
5513 robj *dstobj;
5514 zset *dstzset;
5515 dictIterator *di;
5516 dictEntry *de;
5517
5518 /* expect zsetnum input keys to be given */
5519 zsetnum = atoi(c->argv[2]->ptr);
5520 if (zsetnum < 1) {
5521 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5522 return;
5523 }
5524
5525 /* test if the expected number of keys would overflow */
5526 if (3+zsetnum > c->argc) {
5527 addReply(c,shared.syntaxerr);
5528 return;
5529 }
5530
5531 /* read keys to be used for input */
5532 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5533 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5534 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5535 if (!zsetobj) {
5536 src[i].dict = NULL;
5537 } else {
5538 if (zsetobj->type != REDIS_ZSET) {
5539 zfree(src);
5540 addReply(c,shared.wrongtypeerr);
5541 return;
5542 }
5543 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5544 }
5545
5546 /* default all weights to 1 */
5547 src[i].weight = 1.0;
5548 }
5549
5550 /* parse optional extra arguments */
5551 if (j < c->argc) {
5552 int remaining = c->argc - j;
5553
5554 while (remaining) {
5555 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5556 j++; remaining--;
5557 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5558 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5559 }
5560 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5561 j++; remaining--;
5562 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5563 aggregate = REDIS_AGGR_SUM;
5564 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5565 aggregate = REDIS_AGGR_MIN;
5566 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5567 aggregate = REDIS_AGGR_MAX;
5568 } else {
5569 zfree(src);
5570 addReply(c,shared.syntaxerr);
5571 return;
5572 }
5573 j++; remaining--;
5574 } else {
5575 zfree(src);
5576 addReply(c,shared.syntaxerr);
5577 return;
5578 }
5579 }
5580 }
5581
5582 /* sort sets from the smallest to largest, this will improve our
5583 * algorithm's performance */
5584 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5585
5586 dstobj = createZsetObject();
5587 dstzset = dstobj->ptr;
5588
5589 if (op == REDIS_OP_INTER) {
5590 /* skip going over all entries if the smallest zset is NULL or empty */
5591 if (src[0].dict && dictSize(src[0].dict) > 0) {
5592 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5593 * from small to large, all src[i > 0].dict are non-empty too */
5594 di = dictGetIterator(src[0].dict);
5595 while((de = dictNext(di)) != NULL) {
5596 double *score = zmalloc(sizeof(double)), value;
5597 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5598
5599 for (j = 1; j < zsetnum; j++) {
5600 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5601 if (other) {
5602 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5603 zunionInterAggregate(score, value, aggregate);
5604 } else {
5605 break;
5606 }
5607 }
5608
5609 /* skip entry when not present in every source dict */
5610 if (j != zsetnum) {
5611 zfree(score);
5612 } else {
5613 robj *o = dictGetEntryKey(de);
5614 dictAdd(dstzset->dict,o,score);
5615 incrRefCount(o); /* added to dictionary */
5616 zslInsert(dstzset->zsl,*score,o);
5617 incrRefCount(o); /* added to skiplist */
5618 }
5619 }
5620 dictReleaseIterator(di);
5621 }
5622 } else if (op == REDIS_OP_UNION) {
5623 for (i = 0; i < zsetnum; i++) {
5624 if (!src[i].dict) continue;
5625
5626 di = dictGetIterator(src[i].dict);
5627 while((de = dictNext(di)) != NULL) {
5628 /* skip key when already processed */
5629 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5630
5631 double *score = zmalloc(sizeof(double)), value;
5632 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5633
5634 /* because the zsets are sorted by size, its only possible
5635 * for sets at larger indices to hold this entry */
5636 for (j = (i+1); j < zsetnum; j++) {
5637 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5638 if (other) {
5639 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5640 zunionInterAggregate(score, value, aggregate);
5641 }
5642 }
5643
5644 robj *o = dictGetEntryKey(de);
5645 dictAdd(dstzset->dict,o,score);
5646 incrRefCount(o); /* added to dictionary */
5647 zslInsert(dstzset->zsl,*score,o);
5648 incrRefCount(o); /* added to skiplist */
5649 }
5650 dictReleaseIterator(di);
5651 }
5652 } else {
5653 /* unknown operator */
5654 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5655 }
5656
5657 deleteKey(c->db,dstkey);
5658 if (dstzset->zsl->length) {
5659 dictAdd(c->db->dict,dstkey,dstobj);
5660 incrRefCount(dstkey);
5661 addReplyLong(c, dstzset->zsl->length);
5662 server.dirty++;
5663 } else {
5664 decrRefCount(dstobj);
5665 addReply(c, shared.czero);
5666 }
5667 zfree(src);
5668 }
5669
5670 static void zunionCommand(redisClient *c) {
5671 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5672 }
5673
5674 static void zinterCommand(redisClient *c) {
5675 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5676 }
5677
5678 static void zrangeGenericCommand(redisClient *c, int reverse) {
5679 robj *o;
5680 int start = atoi(c->argv[2]->ptr);
5681 int end = atoi(c->argv[3]->ptr);
5682 int withscores = 0;
5683 int llen;
5684 int rangelen, j;
5685 zset *zsetobj;
5686 zskiplist *zsl;
5687 zskiplistNode *ln;
5688 robj *ele;
5689
5690 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5691 withscores = 1;
5692 } else if (c->argc >= 5) {
5693 addReply(c,shared.syntaxerr);
5694 return;
5695 }
5696
5697 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5698 checkType(c,o,REDIS_ZSET)) return;
5699 zsetobj = o->ptr;
5700 zsl = zsetobj->zsl;
5701 llen = zsl->length;
5702
5703 /* convert negative indexes */
5704 if (start < 0) start = llen+start;
5705 if (end < 0) end = llen+end;
5706 if (start < 0) start = 0;
5707 if (end < 0) end = 0;
5708
5709 /* indexes sanity checks */
5710 if (start > end || start >= llen) {
5711 /* Out of range start or start > end result in empty list */
5712 addReply(c,shared.emptymultibulk);
5713 return;
5714 }
5715 if (end >= llen) end = llen-1;
5716 rangelen = (end-start)+1;
5717
5718 /* check if starting point is trivial, before searching
5719 * the element in log(N) time */
5720 if (reverse) {
5721 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5722 } else {
5723 ln = start == 0 ?
5724 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5725 }
5726
5727 /* Return the result in form of a multi-bulk reply */
5728 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5729 withscores ? (rangelen*2) : rangelen));
5730 for (j = 0; j < rangelen; j++) {
5731 ele = ln->obj;
5732 addReplyBulk(c,ele);
5733 if (withscores)
5734 addReplyDouble(c,ln->score);
5735 ln = reverse ? ln->backward : ln->forward[0];
5736 }
5737 }
5738
/* ZRANGE command entry point: runs zrangeGenericCommand() with
 * reverse == 0, i.e. elements are walked from the head of the skiplist. */
static void zrangeCommand(redisClient *c) {
    zrangeGenericCommand(c,0);
}
5742
/* ZREVRANGE command entry point: runs zrangeGenericCommand() with
 * reverse == 1, i.e. elements are walked from the tail of the skiplist. */
static void zrevrangeCommand(redisClient *c) {
    zrangeGenericCommand(c,1);
}
5746
5747 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5748 * If justcount is non-zero, just the count is returned. */
5749 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5750 robj *o;
5751 double min, max;
5752 int minex = 0, maxex = 0; /* are min or max exclusive? */
5753 int offset = 0, limit = -1;
5754 int withscores = 0;
5755 int badsyntax = 0;
5756
5757 /* Parse the min-max interval. If one of the values is prefixed
5758 * by the "(" character, it's considered "open". For instance
5759 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5760 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5761 if (((char*)c->argv[2]->ptr)[0] == '(') {
5762 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5763 minex = 1;
5764 } else {
5765 min = strtod(c->argv[2]->ptr,NULL);
5766 }
5767 if (((char*)c->argv[3]->ptr)[0] == '(') {
5768 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5769 maxex = 1;
5770 } else {
5771 max = strtod(c->argv[3]->ptr,NULL);
5772 }
5773
5774 /* Parse "WITHSCORES": note that if the command was called with
5775 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5776 * enter the following paths to parse WITHSCORES and LIMIT. */
5777 if (c->argc == 5 || c->argc == 8) {
5778 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5779 withscores = 1;
5780 else
5781 badsyntax = 1;
5782 }
5783 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5784 badsyntax = 1;
5785 if (badsyntax) {
5786 addReplySds(c,
5787 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5788 return;
5789 }
5790
5791 /* Parse "LIMIT" */
5792 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5793 addReply(c,shared.syntaxerr);
5794 return;
5795 } else if (c->argc == (7 + withscores)) {
5796 offset = atoi(c->argv[5]->ptr);
5797 limit = atoi(c->argv[6]->ptr);
5798 if (offset < 0) offset = 0;
5799 }
5800
5801 /* Ok, lookup the key and get the range */
5802 o = lookupKeyRead(c->db,c->argv[1]);
5803 if (o == NULL) {
5804 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5805 } else {
5806 if (o->type != REDIS_ZSET) {
5807 addReply(c,shared.wrongtypeerr);
5808 } else {
5809 zset *zsetobj = o->ptr;
5810 zskiplist *zsl = zsetobj->zsl;
5811 zskiplistNode *ln;
5812 robj *ele, *lenobj = NULL;
5813 unsigned long rangelen = 0;
5814
5815 /* Get the first node with the score >= min, or with
5816 * score > min if 'minex' is true. */
5817 ln = zslFirstWithScore(zsl,min);
5818 while (minex && ln && ln->score == min) ln = ln->forward[0];
5819
5820 if (ln == NULL) {
5821 /* No element matching the speciifed interval */
5822 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5823 return;
5824 }
5825
5826 /* We don't know in advance how many matching elements there
5827 * are in the list, so we push this object that will represent
5828 * the multi-bulk length in the output buffer, and will "fix"
5829 * it later */
5830 if (!justcount) {
5831 lenobj = createObject(REDIS_STRING,NULL);
5832 addReply(c,lenobj);
5833 decrRefCount(lenobj);
5834 }
5835
5836 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5837 if (offset) {
5838 offset--;
5839 ln = ln->forward[0];
5840 continue;
5841 }
5842 if (limit == 0) break;
5843 if (!justcount) {
5844 ele = ln->obj;
5845 addReplyBulk(c,ele);
5846 if (withscores)
5847 addReplyDouble(c,ln->score);
5848 }
5849 ln = ln->forward[0];
5850 rangelen++;
5851 if (limit > 0) limit--;
5852 }
5853 if (justcount) {
5854 addReplyLong(c,(long)rangelen);
5855 } else {
5856 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5857 withscores ? (rangelen*2) : rangelen);
5858 }
5859 }
5860 }
5861 }
5862
/* ZRANGEBYSCORE command entry point: runs genericZrangebyscoreCommand()
 * with justcount == 0, so the matching elements are returned. */
static void zrangebyscoreCommand(redisClient *c) {
    genericZrangebyscoreCommand(c,0);
}
5866
/* ZCOUNT command entry point: runs genericZrangebyscoreCommand() with
 * justcount == 1, so only the number of matching elements is returned. */
static void zcountCommand(redisClient *c) {
    genericZrangebyscoreCommand(c,1);
}
5870
5871 static void zcardCommand(redisClient *c) {
5872 robj *o;
5873 zset *zs;
5874
5875 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5876 checkType(c,o,REDIS_ZSET)) return;
5877
5878 zs = o->ptr;
5879 addReplyUlong(c,zs->zsl->length);
5880 }
5881
5882 static void zscoreCommand(redisClient *c) {
5883 robj *o;
5884 zset *zs;
5885 dictEntry *de;
5886
5887 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5888 checkType(c,o,REDIS_ZSET)) return;
5889
5890 zs = o->ptr;
5891 de = dictFind(zs->dict,c->argv[2]);
5892 if (!de) {
5893 addReply(c,shared.nullbulk);
5894 } else {
5895 double *score = dictGetEntryVal(de);
5896
5897 addReplyDouble(c,*score);
5898 }
5899 }
5900
5901 static void zrankGenericCommand(redisClient *c, int reverse) {
5902 robj *o;
5903 zset *zs;
5904 zskiplist *zsl;
5905 dictEntry *de;
5906 unsigned long rank;
5907 double *score;
5908
5909 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5910 checkType(c,o,REDIS_ZSET)) return;
5911
5912 zs = o->ptr;
5913 zsl = zs->zsl;
5914 de = dictFind(zs->dict,c->argv[2]);
5915 if (!de) {
5916 addReply(c,shared.nullbulk);
5917 return;
5918 }
5919
5920 score = dictGetEntryVal(de);
5921 rank = zslGetRank(zsl, *score, c->argv[2]);
5922 if (rank) {
5923 if (reverse) {
5924 addReplyLong(c, zsl->length - rank);
5925 } else {
5926 addReplyLong(c, rank-1);
5927 }
5928 } else {
5929 addReply(c,shared.nullbulk);
5930 }
5931 }
5932
/* ZRANK command entry point: runs zrankGenericCommand() with reverse == 0,
 * so the reply is the member's 1-based skiplist rank minus one. */
static void zrankCommand(redisClient *c) {
    zrankGenericCommand(c, 0);
}
5936
/* ZREVRANK command entry point: runs zrankGenericCommand() with
 * reverse == 1, so the reply is the skiplist length minus the member's
 * 1-based rank. */
static void zrevrankCommand(redisClient *c) {
    zrankGenericCommand(c, 1);
}
5940
5941 /* =================================== Hashes =============================== */
5942 static void hsetCommand(redisClient *c) {
5943 int update = 0;
5944 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5945
5946 if (o == NULL) {
5947 o = createHashObject();
5948 dictAdd(c->db->dict,c->argv[1],o);
5949 incrRefCount(c->argv[1]);
5950 } else {
5951 if (o->type != REDIS_HASH) {
5952 addReply(c,shared.wrongtypeerr);
5953 return;
5954 }
5955 }
5956 /* We want to convert the zipmap into an hash table right now if the
5957 * entry to be added is too big. Note that we check if the object
5958 * is integer encoded before to try fetching the length in the test below.
5959 * This is because integers are small, but currently stringObjectLen()
5960 * performs a slow conversion: not worth it. */
5961 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5962 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5963 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5964 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5965 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5966 {
5967 convertToRealHash(o);
5968 }
5969
5970 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5971 unsigned char *zm = o->ptr;
5972 robj *valobj = getDecodedObject(c->argv[3]);
5973
5974 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5975 valobj->ptr,sdslen(valobj->ptr),&update);
5976 decrRefCount(valobj);
5977 o->ptr = zm;
5978
5979 /* And here there is the second check for hash conversion. */
5980 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
5981 convertToRealHash(o);
5982 } else {
5983 tryObjectEncoding(c->argv[2]);
5984 /* note that c->argv[3] is already encoded, as the latest arg
5985 * of a bulk command is always integer encoded if possible. */
5986 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5987 incrRefCount(c->argv[2]);
5988 } else {
5989 update = 1;
5990 }
5991 incrRefCount(c->argv[3]);
5992 }
5993 server.dirty++;
5994 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5995 }
5996
5997 static void hincrbyCommand(redisClient *c) {
5998 long long value = 0, incr = 0;
5999 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6000
6001 if (o == NULL) {
6002 o = createHashObject();
6003 dictAdd(c->db->dict,c->argv[1],o);
6004 incrRefCount(c->argv[1]);
6005 } else {
6006 if (o->type != REDIS_HASH) {
6007 addReply(c,shared.wrongtypeerr);
6008 return;
6009 }
6010 }
6011
6012 incr = strtoll(c->argv[3]->ptr, NULL, 10);
6013 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6014 unsigned char *zm = o->ptr;
6015 unsigned char *zval;
6016 unsigned int zvlen;
6017
6018 /* Find value if already present in hash */
6019 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6020 &zval,&zvlen)) {
6021 /* strtoll needs the char* to have a trailing \0, but
6022 * the zipmap doesn't include them. */
6023 sds szval = sdsnewlen(zval, zvlen);
6024 value = strtoll(szval,NULL,10);
6025 sdsfree(szval);
6026 }
6027
6028 value += incr;
6029 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6030 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6031 (unsigned char*)svalue,sdslen(svalue),NULL);
6032 sdsfree(svalue);
6033 o->ptr = zm;
6034
6035 /* Check if the zipmap needs to be converted. */
6036 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6037 convertToRealHash(o);
6038 } else {
6039 robj *hval;
6040 dictEntry *de;
6041
6042 /* Find value if already present in hash */
6043 de = dictFind(o->ptr,c->argv[2]);
6044 if (de != NULL) {
6045 hval = dictGetEntryVal(de);
6046 if (hval->encoding == REDIS_ENCODING_RAW)
6047 value = strtoll(hval->ptr,NULL,10);
6048 else if (hval->encoding == REDIS_ENCODING_INT)
6049 value = (long)hval->ptr;
6050 else
6051 redisAssert(1 != 1);
6052 }
6053
6054 value += incr;
6055 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6056 tryObjectEncoding(hval);
6057 if (dictReplace(o->ptr,c->argv[2],hval)) {
6058 incrRefCount(c->argv[2]);
6059 }
6060 }
6061
6062 server.dirty++;
6063 addReplyLongLong(c, value);
6064 }
6065
6066 static void hgetCommand(redisClient *c) {
6067 robj *o;
6068
6069 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6070 checkType(c,o,REDIS_HASH)) return;
6071
6072 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6073 unsigned char *zm = o->ptr;
6074 unsigned char *val;
6075 unsigned int vlen;
6076 robj *field;
6077
6078 field = getDecodedObject(c->argv[2]);
6079 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6080 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6081 addReplySds(c,sdsnewlen(val,vlen));
6082 addReply(c,shared.crlf);
6083 decrRefCount(field);
6084 return;
6085 } else {
6086 addReply(c,shared.nullbulk);
6087 decrRefCount(field);
6088 return;
6089 }
6090 } else {
6091 struct dictEntry *de;
6092
6093 de = dictFind(o->ptr,c->argv[2]);
6094 if (de == NULL) {
6095 addReply(c,shared.nullbulk);
6096 } else {
6097 robj *e = dictGetEntryVal(de);
6098
6099 addReplyBulk(c,e);
6100 }
6101 }
6102 }
6103
6104 static void hdelCommand(redisClient *c) {
6105 robj *o;
6106 int deleted = 0;
6107
6108 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6109 checkType(c,o,REDIS_HASH)) return;
6110
6111 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6112 robj *field = getDecodedObject(c->argv[2]);
6113
6114 o->ptr = zipmapDel((unsigned char*) o->ptr,
6115 (unsigned char*) field->ptr,
6116 sdslen(field->ptr), &deleted);
6117 decrRefCount(field);
6118 if (zipmapLen((unsigned char*) o->ptr) == 0)
6119 deleteKey(c->db,c->argv[1]);
6120 } else {
6121 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6122 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6123 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6124 }
6125 if (deleted) server.dirty++;
6126 addReply(c,deleted ? shared.cone : shared.czero);
6127 }
6128
6129 static void hlenCommand(redisClient *c) {
6130 robj *o;
6131 unsigned long len;
6132
6133 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6134 checkType(c,o,REDIS_HASH)) return;
6135
6136 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6137 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6138 addReplyUlong(c,len);
6139 }
6140
6141 #define REDIS_GETALL_KEYS 1
6142 #define REDIS_GETALL_VALS 2
6143 static void genericHgetallCommand(redisClient *c, int flags) {
6144 robj *o, *lenobj;
6145 unsigned long count = 0;
6146
6147 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6148 || checkType(c,o,REDIS_HASH)) return;
6149
6150 lenobj = createObject(REDIS_STRING,NULL);
6151 addReply(c,lenobj);
6152 decrRefCount(lenobj);
6153
6154 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6155 unsigned char *p = zipmapRewind(o->ptr);
6156 unsigned char *field, *val;
6157 unsigned int flen, vlen;
6158
6159 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6160 robj *aux;
6161
6162 if (flags & REDIS_GETALL_KEYS) {
6163 aux = createStringObject((char*)field,flen);
6164 addReplyBulk(c,aux);
6165 decrRefCount(aux);
6166 count++;
6167 }
6168 if (flags & REDIS_GETALL_VALS) {
6169 aux = createStringObject((char*)val,vlen);
6170 addReplyBulk(c,aux);
6171 decrRefCount(aux);
6172 count++;
6173 }
6174 }
6175 } else {
6176 dictIterator *di = dictGetIterator(o->ptr);
6177 dictEntry *de;
6178
6179 while((de = dictNext(di)) != NULL) {
6180 robj *fieldobj = dictGetEntryKey(de);
6181 robj *valobj = dictGetEntryVal(de);
6182
6183 if (flags & REDIS_GETALL_KEYS) {
6184 addReplyBulk(c,fieldobj);
6185 count++;
6186 }
6187 if (flags & REDIS_GETALL_VALS) {
6188 addReplyBulk(c,valobj);
6189 count++;
6190 }
6191 }
6192 dictReleaseIterator(di);
6193 }
6194 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6195 }
6196
/* HKEYS command entry point: runs genericHgetallCommand() with
 * REDIS_GETALL_KEYS only, so just the field names are emitted. */
static void hkeysCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_KEYS);
}
6200
/* HVALS command entry point: runs genericHgetallCommand() with
 * REDIS_GETALL_VALS only, so just the values are emitted. */
static void hvalsCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_VALS);
}
6204
/* HGETALL command entry point: runs genericHgetallCommand() with both
 * flags set, so every field name is emitted followed by its value. */
static void hgetallCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
}
6208
6209 static void hexistsCommand(redisClient *c) {
6210 robj *o;
6211 int exists = 0;
6212
6213 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6214 checkType(c,o,REDIS_HASH)) return;
6215
6216 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6217 robj *field;
6218 unsigned char *zm = o->ptr;
6219
6220 field = getDecodedObject(c->argv[2]);
6221 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6222 decrRefCount(field);
6223 } else {
6224 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6225 }
6226 addReply(c,exists ? shared.cone : shared.czero);
6227 }
6228
6229 static void convertToRealHash(robj *o) {
6230 unsigned char *key, *val, *p, *zm = o->ptr;
6231 unsigned int klen, vlen;
6232 dict *dict = dictCreate(&hashDictType,NULL);
6233
6234 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6235 p = zipmapRewind(zm);
6236 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6237 robj *keyobj, *valobj;
6238
6239 keyobj = createStringObject((char*)key,klen);
6240 valobj = createStringObject((char*)val,vlen);
6241 tryObjectEncoding(keyobj);
6242 tryObjectEncoding(valobj);
6243 dictAdd(dict,keyobj,valobj);
6244 }
6245 o->encoding = REDIS_ENCODING_HT;
6246 o->ptr = dict;
6247 zfree(zm);
6248 }
6249
6250 /* ========================= Non type-specific commands ==================== */
6251
6252 static void flushdbCommand(redisClient *c) {
6253 server.dirty += dictSize(c->db->dict);
6254 dictEmpty(c->db->dict);
6255 dictEmpty(c->db->expires);
6256 addReply(c,shared.ok);
6257 }
6258
/* FLUSHALL: empty the databases via emptyDb(), adding its return value to
 * the dirty counter, then write the dataset to disk straight away.
 *
 * If a background save child is running it is killed and its temp file
 * removed before the foreground rdbSave(). */
static void flushallCommand(redisClient *c) {
    server.dirty += emptyDb();
    addReply(c,shared.ok);
    if (server.bgsavechildpid != -1) {
        /* Stop the in-progress background save and drop its partial file. */
        kill(server.bgsavechildpid,SIGKILL);
        rdbRemoveTempFile(server.bgsavechildpid);
    }
    rdbSave(server.dbfilename);
    /* NOTE(review): presumably rdbSave() resets server.dirty, and this
     * increment keeps the flush visible as a change - confirm. */
    server.dirty++;
}
6269
6270 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6271 redisSortOperation *so = zmalloc(sizeof(*so));
6272 so->type = type;
6273 so->pattern = pattern;
6274 return so;
6275 }
6276
6277 /* Return the value associated to the key with a name obtained
6278 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6279 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6280 char *p;
6281 sds spat, ssub;
6282 robj keyobj;
6283 int prefixlen, sublen, postfixlen;
6284 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6285 struct {
6286 long len;
6287 long free;
6288 char buf[REDIS_SORTKEY_MAX+1];
6289 } keyname;
6290
6291 /* If the pattern is "#" return the substitution object itself in order
6292 * to implement the "SORT ... GET #" feature. */
6293 spat = pattern->ptr;
6294 if (spat[0] == '#' && spat[1] == '\0') {
6295 return subst;
6296 }
6297
6298 /* The substitution object may be specially encoded. If so we create
6299 * a decoded object on the fly. Otherwise getDecodedObject will just
6300 * increment the ref count, that we'll decrement later. */
6301 subst = getDecodedObject(subst);
6302
6303 ssub = subst->ptr;
6304 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6305 p = strchr(spat,'*');
6306 if (!p) {
6307 decrRefCount(subst);
6308 return NULL;
6309 }
6310
6311 prefixlen = p-spat;
6312 sublen = sdslen(ssub);
6313 postfixlen = sdslen(spat)-(prefixlen+1);
6314 memcpy(keyname.buf,spat,prefixlen);
6315 memcpy(keyname.buf+prefixlen,ssub,sublen);
6316 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6317 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6318 keyname.len = prefixlen+sublen+postfixlen;
6319
6320 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6321 decrRefCount(subst);
6322
6323 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6324 return lookupKeyRead(db,&keyobj);
6325 }
6326
6327 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6328 * the additional parameter is not standard but a BSD-specific we have to
6329 * pass sorting parameters via the global 'server' structure */
6330 static int sortCompare(const void *s1, const void *s2) {
6331 const redisSortObject *so1 = s1, *so2 = s2;
6332 int cmp;
6333
6334 if (!server.sort_alpha) {
6335 /* Numeric sorting. Here it's trivial as we precomputed scores */
6336 if (so1->u.score > so2->u.score) {
6337 cmp = 1;
6338 } else if (so1->u.score < so2->u.score) {
6339 cmp = -1;
6340 } else {
6341 cmp = 0;
6342 }
6343 } else {
6344 /* Alphanumeric sorting */
6345 if (server.sort_bypattern) {
6346 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6347 /* At least one compare object is NULL */
6348 if (so1->u.cmpobj == so2->u.cmpobj)
6349 cmp = 0;
6350 else if (so1->u.cmpobj == NULL)
6351 cmp = -1;
6352 else
6353 cmp = 1;
6354 } else {
6355 /* We have both the objects, use strcoll */
6356 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6357 }
6358 } else {
6359 /* Compare elements directly */
6360 robj *dec1, *dec2;
6361
6362 dec1 = getDecodedObject(so1->obj);
6363 dec2 = getDecodedObject(so2->obj);
6364 cmp = strcoll(dec1->ptr,dec2->ptr);
6365 decrRefCount(dec1);
6366 decrRefCount(dec2);
6367 }
6368 }
6369 return server.sort_desc ? -cmp : cmp;
6370 }
6371
/* The SORT command is the most complex command in Redis. Warning: this code
 * is optimized for speed and a bit less for readability.
 *
 * Outline of the function:
 *   1. look up the key to sort (list, set or sorted set);
 *   2. parse the options ASC, DESC, ALPHA, LIMIT, STORE, BY and GET;
 *   3. load every element into a vector of redisSortObject;
 *   4. unless the BY pattern is constant, load the sort scores / compare
 *      objects and sort the vector;
 *   5. either send the selected range to the client or, with STORE, save it
 *      as a list under the given key;
 *   6. release everything that was allocated or retained. */
static void sortCommand(redisClient *c) {
    list *operations;
    int outputlen = 0;
    int desc = 0, alpha = 0;
    int limit_start = 0, limit_count = -1, start, end;
    int j, dontsort = 0, vectorlen;
    int getop = 0; /* GET operation counter */
    robj *sortval, *sortby = NULL, *storekey = NULL;
    redisSortObject *vector; /* Resulting vector to sort */

    /* Lookup the key to sort. It must be of the right types */
    sortval = lookupKeyRead(c->db,c->argv[1]);
    if (sortval == NULL) {
        addReply(c,shared.nullmultibulk);
        return;
    }
    if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
        sortval->type != REDIS_ZSET)
    {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Create a list of operations to perform for every sorted element.
     * Only GET operations are parsed by this function. The list owns its
     * nodes' values: zfree is installed as the free method. */
    operations = listCreate();
    listSetFreeMethod(operations,zfree);
    j = 2;

    /* Now we need to protect sortval by incrementing its count: in the
     * future SORT may have options able to overwrite/delete keys during the
     * sorting, and the sorted key itself may get destroyed */
    incrRefCount(sortval);

    /* The SORT command has an SQL-alike syntax, parse it */
    while(j < c->argc) {
        /* Number of arguments that follow the current one. */
        int leftargs = c->argc-j-1;
        if (!strcasecmp(c->argv[j]->ptr,"asc")) {
            desc = 0;
        } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
            desc = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
            alpha = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
            limit_start = atoi(c->argv[j+1]->ptr);
            limit_count = atoi(c->argv[j+2]->ptr);
            j+=2;
        } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
            storekey = c->argv[j+1];
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
            sortby = c->argv[j+1];
            /* If the BY pattern does not contain '*', i.e. it is constant,
             * we don't need to sort nor to lookup the weight keys. */
            if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
            listAddNodeTail(operations,createSortOperation(
                REDIS_SORT_GET,c->argv[j+1]));
            getop++;
            j++;
        } else {
            /* Unknown option, or an option missing its arguments: undo
             * what was set up so far and report a syntax error. */
            decrRefCount(sortval);
            listRelease(operations);
            addReply(c,shared.syntaxerr);
            return;
        }
        j++;
    }

    /* Load the sorting vector with all the objects to sort */
    switch(sortval->type) {
    case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
    case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
    case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
    default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
    }
    vector = zmalloc(sizeof(redisSortObject)*vectorlen);
    j = 0;

    if (sortval->type == REDIS_LIST) {
        list *list = sortval->ptr;
        listNode *ln;
        listIter li;

        listRewind(list,&li);
        while((ln = listNext(&li))) {
            robj *ele = ln->value;
            vector[j].obj = ele;
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
    } else {
        /* Sets and sorted sets are both walked through a dictionary whose
         * keys are the elements. */
        dict *set;
        dictIterator *di;
        dictEntry *setele;

        if (sortval->type == REDIS_SET) {
            set = sortval->ptr;
        } else {
            zset *zs = sortval->ptr;
            set = zs->dict;
        }

        di = dictGetIterator(set);
        while((setele = dictNext(di)) != NULL) {
            vector[j].obj = dictGetEntryKey(setele);
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
        dictReleaseIterator(di);
    }
    redisAssert(j == vectorlen);

    /* Now it's time to load the right scores in the sorting vector */
    if (dontsort == 0) {
        for (j = 0; j < vectorlen; j++) {
            if (sortby) {
                robj *byval;

                /* Elements whose BY key is missing or is not a string keep
                 * the zero score / NULL compare object set above. */
                byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
                if (!byval || byval->type != REDIS_STRING) continue;
                if (alpha) {
                    /* This reference is released in the cleanup loop at
                     * the end of the function. */
                    vector[j].u.cmpobj = getDecodedObject(byval);
                } else {
                    if (byval->encoding == REDIS_ENCODING_RAW) {
                        vector[j].u.score = strtod(byval->ptr,NULL);
                    } else {
                        /* Don't need to decode the object if it's
                         * integer-encoded (the only encoding supported) so
                         * far. We can just cast it */
                        if (byval->encoding == REDIS_ENCODING_INT) {
                            vector[j].u.score = (long)byval->ptr;
                        } else
                            redisAssert(1 != 1);
                    }
                }
            } else {
                /* No BY pattern: numeric sorting uses the element itself
                 * as the score, alpha sorting compares the elements
                 * directly in sortCompare(). */
                if (!alpha) {
                    if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
                        vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
                    else {
                        if (vector[j].obj->encoding == REDIS_ENCODING_INT)
                            vector[j].u.score = (long) vector[j].obj->ptr;
                        else
                            redisAssert(1 != 1);
                    }
                }
            }
        }
    }

    /* We are ready to sort the vector... perform a bit of sanity check
     * on the LIMIT option too. We'll use a partial version of quicksort. */
    start = (limit_start < 0) ? 0 : limit_start;
    end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
    if (start >= vectorlen) {
        /* Start past the end: make the range empty (end == start-1). */
        start = vectorlen-1;
        end = vectorlen-2;
    }
    if (end >= vectorlen) end = vectorlen-1;

    if (dontsort == 0) {
        /* sortCompare() reads its parameters from these globals. */
        server.sort_desc = desc;
        server.sort_alpha = alpha;
        server.sort_bypattern = sortby ? 1 : 0;
        /* pqsort() is used only when a BY pattern is given and LIMIT
         * selects a proper sub-range; otherwise a full qsort() is done. */
        if (sortby && (start != 0 || end != vectorlen-1))
            pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
        else
            qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
    }

    /* Send command output to the output buffer, performing the specified
     * GET operations if any. With GET, each selected element produces one
     * output item per GET option instead of the element itself. */
    outputlen = getop ? getop*(end-start+1) : end-start+1;
    if (storekey == NULL) {
        /* STORE option not specified, send the sorting result to client */
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) addReplyBulk(c,vector[j].obj);
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    if (!val || val->type != REDIS_STRING) {
                        addReply(c,shared.nullbulk);
                    } else {
                        addReplyBulk(c,val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
    } else {
        robj *listObject = createListObject();
        list *listPtr = (list*) listObject->ptr;

        /* STORE option specified, set the sorting result as a List object */
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) {
                listAddNodeTail(listPtr,vector[j].obj);
                incrRefCount(vector[j].obj);
            }
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    if (!val || val->type != REDIS_STRING) {
                        /* Missing values are stored as empty strings. */
                        listAddNodeTail(listPtr,createStringObject("",0));
                    } else {
                        listAddNodeTail(listPtr,val);
                        incrRefCount(val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
        /* A non-zero return is treated as "new key added", in which case
         * the dictionary references storekey as well. */
        if (dictReplace(c->db->dict,storekey,listObject)) {
            incrRefCount(storekey);
        }
        /* Note: we add 1 because the DB is dirty anyway since even if the
         * SORT result is empty a new key is set and maybe the old content
         * replaced. */
        server.dirty += 1+outputlen;
        addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
    }

    /* Cleanup */
    decrRefCount(sortval);
    listRelease(operations);
    for (j = 0; j < vectorlen; j++) {
        /* Compare objects only exist for alpha sorting with a BY pattern. */
        if (sortby && alpha && vector[j].u.cmpobj)
            decrRefCount(vector[j].u.cmpobj);
    }
    zfree(vector);
}
6626
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' receives the NUL-terminated result; since no buffer size is passed,
 * the caller must provide room for the longest output (a few bytes for the
 * scaled forms, "1023B" for the plain one). 'n' is the amount of bytes.
 *
 * Every value of 'n' is formatted: amounts of 1 TiB and above use the
 * T, P and E suffixes, so the buffer is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;
    const unsigned long long eib = pib*1024;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        sprintf(s,"%.2fK",(double)n/kib);
    } else if (n < gib) {
        sprintf(s,"%.2fM",(double)n/mib);
    } else if (n < tib) {
        sprintf(s,"%.2fG",(double)n/gib);
    } else if (n < pib) {
        sprintf(s,"%.2fT",(double)n/tib);
    } else if (n < eib) {
        sprintf(s,"%.2fP",(double)n/pib);
    } else {
        /* 2^60 and above: the largest unit an unsigned 64 bit value needs. */
        sprintf(s,"%.2fE",(double)n/eib);
    }
}
6647
/* Create the string returned by the INFO command. This is decoupled
 * from the INFO command itself as we need to report the same information
 * on memory corruption problems.
 *
 * Returns a newly built sds string made of "name:value\r\n" lines:
 *   - a fixed set of general fields;
 *   - master_* fields, only when server.masterhost is set;
 *   - vm_* fields, only when server.vm_enabled is set;
 *   - one "dbN:keys=...,expires=..." line for every database that has at
 *     least one key or one expire.
 * The caller owns the returned string. */
static sds genRedisInfoString(void) {
    sds info;
    time_t uptime = time(NULL)-server.stat_starttime;
    int j;
    char hmem[64];

    /* Human readable form of the memory in use, for used_memory_human. */
    bytesToHuman(hmem,zmalloc_used_memory());
    /* The arguments below must stay in the same order as the fields of
     * the format string. */
    info = sdscatprintf(sdsempty(),
        "redis_version:%s\r\n"
        "arch_bits:%s\r\n"
        "multiplexing_api:%s\r\n"
        "process_id:%ld\r\n"
        "uptime_in_seconds:%ld\r\n"
        "uptime_in_days:%ld\r\n"
        "connected_clients:%d\r\n"
        "connected_slaves:%d\r\n"
        "blocked_clients:%d\r\n"
        "used_memory:%zu\r\n"
        "used_memory_human:%s\r\n"
        "changes_since_last_save:%lld\r\n"
        "bgsave_in_progress:%d\r\n"
        "last_save_time:%ld\r\n"
        "bgrewriteaof_in_progress:%d\r\n"
        "total_connections_received:%lld\r\n"
        "total_commands_processed:%lld\r\n"
        "expired_keys:%lld\r\n"
        "hash_max_zipmap_entries:%ld\r\n"
        "hash_max_zipmap_value:%ld\r\n"
        "pubsub_channels:%ld\r\n"
        "pubsub_patterns:%u\r\n"
        "vm_enabled:%d\r\n"
        "role:%s\r\n"
        ,REDIS_VERSION,
        (sizeof(long) == 8) ? "64" : "32",
        aeGetApiName(),
        (long) getpid(),
        uptime,
        uptime/(3600*24),
        /* Slaves are not counted among the connected clients. */
        listLength(server.clients)-listLength(server.slaves),
        listLength(server.slaves),
        server.blpop_blocked_clients,
        zmalloc_used_memory(),
        hmem,
        server.dirty,
        server.bgsavechildpid != -1,
        server.lastsave,
        server.bgrewritechildpid != -1,
        server.stat_numconnections,
        server.stat_numcommands,
        server.stat_expiredkeys,
        server.hash_max_zipmap_entries,
        server.hash_max_zipmap_value,
        dictSize(server.pubsub_channels),
        listLength(server.pubsub_patterns),
        server.vm_enabled != 0,
        server.masterhost == NULL ? "master" : "slave"
    );
    /* Replication details, reported only when this instance has a master
     * configured. */
    if (server.masterhost) {
        info = sdscatprintf(info,
            "master_host:%s\r\n"
            "master_port:%d\r\n"
            "master_link_status:%s\r\n"
            "master_last_io_seconds_ago:%d\r\n"
            ,server.masterhost,
            server.masterport,
            (server.replstate == REDIS_REPL_CONNECTED) ?
                "up" : "down",
            /* -1 when there is no master client object. */
            server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
        );
    }
    /* Virtual memory statistics, read while holding the threaded I/O lock. */
    if (server.vm_enabled) {
        lockThreadedIO();
        info = sdscatprintf(info,
            "vm_conf_max_memory:%llu\r\n"
            "vm_conf_page_size:%llu\r\n"
            "vm_conf_pages:%llu\r\n"
            "vm_stats_used_pages:%llu\r\n"
            "vm_stats_swapped_objects:%llu\r\n"
            "vm_stats_swappin_count:%llu\r\n"
            "vm_stats_swappout_count:%llu\r\n"
            "vm_stats_io_newjobs_len:%lu\r\n"
            "vm_stats_io_processing_len:%lu\r\n"
            "vm_stats_io_processed_len:%lu\r\n"
            "vm_stats_io_active_threads:%lu\r\n"
            "vm_stats_blocked_clients:%lu\r\n"
            ,(unsigned long long) server.vm_max_memory,
            (unsigned long long) server.vm_page_size,
            (unsigned long long) server.vm_pages,
            (unsigned long long) server.vm_stats_used_pages,
            (unsigned long long) server.vm_stats_swapped_objects,
            (unsigned long long) server.vm_stats_swapins,
            (unsigned long long) server.vm_stats_swapouts,
            (unsigned long) listLength(server.io_newjobs),
            (unsigned long) listLength(server.io_processing),
            (unsigned long) listLength(server.io_processed),
            (unsigned long) server.io_active_threads,
            (unsigned long) server.vm_blocked_clients
        );
        unlockThreadedIO();
    }
    /* Per-database key counters; databases with no keys and no expires
     * are left out. */
    for (j = 0; j < server.dbnum; j++) {
        long long keys, vkeys;

        keys = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (keys || vkeys) {
            info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
                j, keys, vkeys);
        }
    }
    return info;
}
6763
6764 static void infoCommand(redisClient *c) {
6765 sds info = genRedisInfoString();
6766 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6767 (unsigned long)sdslen(info)));
6768 addReplySds(c,info);
6769 addReply(c,shared.crlf);
6770 }
6771
6772 static void monitorCommand(redisClient *c) {
6773 /* ignore MONITOR if aleady slave or in monitor mode */
6774 if (c->flags & REDIS_SLAVE) return;
6775
6776 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6777 c->slaveseldb = 0;
6778 listAddNodeTail(server.monitors,c);
6779 addReply(c,shared.ok);
6780 }
6781
6782 /* ================================= Expire ================================= */
6783 static int removeExpire(redisDb *db, robj *key) {
6784 if (dictDelete(db->expires,key) == DICT_OK) {
6785 return 1;
6786 } else {
6787 return 0;
6788 }
6789 }
6790
6791 static int setExpire(redisDb *db, robj *key, time_t when) {
6792 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6793 return 0;
6794 } else {
6795 incrRefCount(key);
6796 return 1;
6797 }
6798 }
6799
6800 /* Return the expire time of the specified key, or -1 if no expire
6801 * is associated with this key (i.e. the key is non volatile) */
6802 static time_t getExpire(redisDb *db, robj *key) {
6803 dictEntry *de;
6804
6805 /* No expire? return ASAP */
6806 if (dictSize(db->expires) == 0 ||
6807 (de = dictFind(db->expires,key)) == NULL) return -1;
6808
6809 return (time_t) dictGetEntryVal(de);
6810 }
6811
6812 static int expireIfNeeded(redisDb *db, robj *key) {
6813 time_t when;
6814 dictEntry *de;
6815
6816 /* No expire? return ASAP */
6817 if (dictSize(db->expires) == 0 ||
6818 (de = dictFind(db->expires,key)) == NULL) return 0;
6819
6820 /* Lookup the expire */
6821 when = (time_t) dictGetEntryVal(de);
6822 if (time(NULL) <= when) return 0;
6823
6824 /* Delete the key */
6825 dictDelete(db->expires,key);
6826 server.stat_expiredkeys++;
6827 return dictDelete(db->dict,key) == DICT_OK;
6828 }
6829
6830 static int deleteIfVolatile(redisDb *db, robj *key) {
6831 dictEntry *de;
6832
6833 /* No expire? return ASAP */
6834 if (dictSize(db->expires) == 0 ||
6835 (de = dictFind(db->expires,key)) == NULL) return 0;
6836
6837 /* Delete the key */
6838 server.dirty++;
6839 server.stat_expiredkeys++;
6840 dictDelete(db->expires,key);
6841 return dictDelete(db->dict,key) == DICT_OK;
6842 }
6843
6844 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6845 dictEntry *de;
6846
6847 de = dictFind(c->db->dict,key);
6848 if (de == NULL) {
6849 addReply(c,shared.czero);
6850 return;
6851 }
6852 if (seconds < 0) {
6853 if (deleteKey(c->db,key)) server.dirty++;
6854 addReply(c, shared.cone);
6855 return;
6856 } else {
6857 time_t when = time(NULL)+seconds;
6858 if (setExpire(c->db,key,when)) {
6859 addReply(c,shared.cone);
6860 server.dirty++;
6861 } else {
6862 addReply(c,shared.czero);
6863 }
6864 return;
6865 }
6866 }
6867
6868 static void expireCommand(redisClient *c) {
6869 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6870 }
6871
6872 static void expireatCommand(redisClient *c) {
6873 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6874 }
6875
6876 static void ttlCommand(redisClient *c) {
6877 time_t expire;
6878 int ttl = -1;
6879
6880 expire = getExpire(c->db,c->argv[1]);
6881 if (expire != -1) {
6882 ttl = (int) (expire-time(NULL));
6883 if (ttl < 0) ttl = -1;
6884 }
6885 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6886 }
6887
6888 /* ================================ MULTI/EXEC ============================== */
6889
6890 /* Client state initialization for MULTI/EXEC */
6891 static void initClientMultiState(redisClient *c) {
6892 c->mstate.commands = NULL;
6893 c->mstate.count = 0;
6894 }
6895
6896 /* Release all the resources associated with MULTI/EXEC state */
6897 static void freeClientMultiState(redisClient *c) {
6898 int j;
6899
6900 for (j = 0; j < c->mstate.count; j++) {
6901 int i;
6902 multiCmd *mc = c->mstate.commands+j;
6903
6904 for (i = 0; i < mc->argc; i++)
6905 decrRefCount(mc->argv[i]);
6906 zfree(mc->argv);
6907 }
6908 zfree(c->mstate.commands);
6909 }
6910
6911 /* Add a new command into the MULTI commands queue */
6912 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6913 multiCmd *mc;
6914 int j;
6915
6916 c->mstate.commands = zrealloc(c->mstate.commands,
6917 sizeof(multiCmd)*(c->mstate.count+1));
6918 mc = c->mstate.commands+c->mstate.count;
6919 mc->cmd = cmd;
6920 mc->argc = c->argc;
6921 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6922 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6923 for (j = 0; j < c->argc; j++)
6924 incrRefCount(mc->argv[j]);
6925 c->mstate.count++;
6926 }
6927
6928 static void multiCommand(redisClient *c) {
6929 c->flags |= REDIS_MULTI;
6930 addReply(c,shared.ok);
6931 }
6932
6933 static void discardCommand(redisClient *c) {
6934 if (!(c->flags & REDIS_MULTI)) {
6935 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6936 return;
6937 }
6938
6939 freeClientMultiState(c);
6940 initClientMultiState(c);
6941 c->flags &= (~REDIS_MULTI);
6942 addReply(c,shared.ok);
6943 }
6944
6945 static void execCommand(redisClient *c) {
6946 int j;
6947 robj **orig_argv;
6948 int orig_argc;
6949
6950 if (!(c->flags & REDIS_MULTI)) {
6951 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6952 return;
6953 }
6954
6955 orig_argv = c->argv;
6956 orig_argc = c->argc;
6957 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6958 for (j = 0; j < c->mstate.count; j++) {
6959 c->argc = c->mstate.commands[j].argc;
6960 c->argv = c->mstate.commands[j].argv;
6961 call(c,c->mstate.commands[j].cmd);
6962 }
6963 c->argv = orig_argv;
6964 c->argc = orig_argc;
6965 freeClientMultiState(c);
6966 initClientMultiState(c);
6967 c->flags &= (~REDIS_MULTI);
6968 }
6969
6970 /* =========================== Blocking Operations ========================= */
6971
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7000
7001 /* Set a client in blocking mode for the specified key, with the specified
7002 * timeout */
7003 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7004 dictEntry *de;
7005 list *l;
7006 int j;
7007
7008 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7009 c->blockingkeysnum = numkeys;
7010 c->blockingto = timeout;
7011 for (j = 0; j < numkeys; j++) {
7012 /* Add the key in the client structure, to map clients -> keys */
7013 c->blockingkeys[j] = keys[j];
7014 incrRefCount(keys[j]);
7015
7016 /* And in the other "side", to map keys -> clients */
7017 de = dictFind(c->db->blockingkeys,keys[j]);
7018 if (de == NULL) {
7019 int retval;
7020
7021 /* For every key we take a list of clients blocked for it */
7022 l = listCreate();
7023 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7024 incrRefCount(keys[j]);
7025 assert(retval == DICT_OK);
7026 } else {
7027 l = dictGetEntryVal(de);
7028 }
7029 listAddNodeTail(l,c);
7030 }
7031 /* Mark the client as a blocked client */
7032 c->flags |= REDIS_BLOCKED;
7033 server.blpop_blocked_clients++;
7034 }
7035
7036 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7037 static void unblockClientWaitingData(redisClient *c) {
7038 dictEntry *de;
7039 list *l;
7040 int j;
7041
7042 assert(c->blockingkeys != NULL);
7043 /* The client may wait for multiple keys, so unblock it for every key. */
7044 for (j = 0; j < c->blockingkeysnum; j++) {
7045 /* Remove this client from the list of clients waiting for this key. */
7046 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7047 assert(de != NULL);
7048 l = dictGetEntryVal(de);
7049 listDelNode(l,listSearchKey(l,c));
7050 /* If the list is empty we need to remove it to avoid wasting memory */
7051 if (listLength(l) == 0)
7052 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7053 decrRefCount(c->blockingkeys[j]);
7054 }
7055 /* Cleanup the client structure */
7056 zfree(c->blockingkeys);
7057 c->blockingkeys = NULL;
7058 c->flags &= (~REDIS_BLOCKED);
7059 server.blpop_blocked_clients--;
7060 /* We want to process data if there is some command waiting
7061 * in the input buffer. Note that this is safe even if
7062 * unblockClientWaitingData() gets called from freeClient() because
7063 * freeClient() will be smart enough to call this function
7064 * *after* c->querybuf was set to NULL. */
7065 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7066 }
7067
7068 /* This should be called from any function PUSHing into lists.
7069 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7070 * 'ele' is the element pushed.
7071 *
7072 * If the function returns 0 there was no client waiting for a list push
7073 * against this key.
7074 *
7075 * If the function returns 1 there was a client waiting for a list push
7076 * against this key, the element was passed to this client thus it's not
7077 * needed to actually add it to the list and the caller should return asap. */
7078 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7079 struct dictEntry *de;
7080 redisClient *receiver;
7081 list *l;
7082 listNode *ln;
7083
7084 de = dictFind(c->db->blockingkeys,key);
7085 if (de == NULL) return 0;
7086 l = dictGetEntryVal(de);
7087 ln = listFirst(l);
7088 assert(ln != NULL);
7089 receiver = ln->value;
7090
7091 addReplySds(receiver,sdsnew("*2\r\n"));
7092 addReplyBulk(receiver,key);
7093 addReplyBulk(receiver,ele);
7094 unblockClientWaitingData(receiver);
7095 return 1;
7096 }
7097
7098 /* Blocking RPOP/LPOP */
7099 static void blockingPopGenericCommand(redisClient *c, int where) {
7100 robj *o;
7101 time_t timeout;
7102 int j;
7103
7104 for (j = 1; j < c->argc-1; j++) {
7105 o = lookupKeyWrite(c->db,c->argv[j]);
7106 if (o != NULL) {
7107 if (o->type != REDIS_LIST) {
7108 addReply(c,shared.wrongtypeerr);
7109 return;
7110 } else {
7111 list *list = o->ptr;
7112 if (listLength(list) != 0) {
7113 /* If the list contains elements fall back to the usual
7114 * non-blocking POP operation */
7115 robj *argv[2], **orig_argv;
7116 int orig_argc;
7117
7118 /* We need to alter the command arguments before to call
7119 * popGenericCommand() as the command takes a single key. */
7120 orig_argv = c->argv;
7121 orig_argc = c->argc;
7122 argv[1] = c->argv[j];
7123 c->argv = argv;
7124 c->argc = 2;
7125
7126 /* Also the return value is different, we need to output
7127 * the multi bulk reply header and the key name. The
7128 * "real" command will add the last element (the value)
7129 * for us. If this souds like an hack to you it's just
7130 * because it is... */
7131 addReplySds(c,sdsnew("*2\r\n"));
7132 addReplyBulk(c,argv[1]);
7133 popGenericCommand(c,where);
7134
7135 /* Fix the client structure with the original stuff */
7136 c->argv = orig_argv;
7137 c->argc = orig_argc;
7138 return;
7139 }
7140 }
7141 }
7142 }
7143 /* If the list is empty or the key does not exists we must block */
7144 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7145 if (timeout > 0) timeout += time(NULL);
7146 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7147 }
7148
7149 static void blpopCommand(redisClient *c) {
7150 blockingPopGenericCommand(c,REDIS_HEAD);
7151 }
7152
7153 static void brpopCommand(redisClient *c) {
7154 blockingPopGenericCommand(c,REDIS_TAIL);
7155 }
7156
7157 /* =============================== Replication ============================= */
7158
7159 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7160 ssize_t nwritten, ret = size;
7161 time_t start = time(NULL);
7162
7163 timeout++;
7164 while(size) {
7165 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7166 nwritten = write(fd,ptr,size);
7167 if (nwritten == -1) return -1;
7168 ptr += nwritten;
7169 size -= nwritten;
7170 }
7171 if ((time(NULL)-start) > timeout) {
7172 errno = ETIMEDOUT;
7173 return -1;
7174 }
7175 }
7176 return ret;
7177 }
7178
7179 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7180 ssize_t nread, totread = 0;
7181 time_t start = time(NULL);
7182
7183 timeout++;
7184 while(size) {
7185 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7186 nread = read(fd,ptr,size);
7187 if (nread == -1) return -1;
7188 ptr += nread;
7189 size -= nread;
7190 totread += nread;
7191 }
7192 if ((time(NULL)-start) > timeout) {
7193 errno = ETIMEDOUT;
7194 return -1;
7195 }
7196 }
7197 return totread;
7198 }
7199
/* Read a line from 'fd', one byte at a time via syncRead(), storing it as a
 * null terminated string in 'ptr', a buffer of 'size' bytes.
 *
 * The trailing "\n" is never stored, and a "\r" immediately before it is
 * replaced by the terminator. Reading stops early, without an error, once
 * size-1 characters have been stored, so the buffer is never overrun.
 *
 * Returns the number of characters read before the newline (a stripped
 * "\r" is still counted), or -1 if syncRead() failed. 'timeout' is applied
 * by syncRead() to every single byte. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the stored character: this is what makes the
             * loop stop when the buffer is full. */
            size--;
        }
    }
    return nread;
}
7220
7221 static void syncCommand(redisClient *c) {
7222 /* ignore SYNC if aleady slave or in monitor mode */
7223 if (c->flags & REDIS_SLAVE) return;
7224
7225 /* SYNC can't be issued when the server has pending data to send to
7226 * the client about already issued commands. We need a fresh reply
7227 * buffer registering the differences between the BGSAVE and the current
7228 * dataset, so that we can copy to other slaves if needed. */
7229 if (listLength(c->reply) != 0) {
7230 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7231 return;
7232 }
7233
7234 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7235 /* Here we need to check if there is a background saving operation
7236 * in progress, or if it is required to start one */
7237 if (server.bgsavechildpid != -1) {
7238 /* Ok a background save is in progress. Let's check if it is a good
7239 * one for replication, i.e. if there is another slave that is
7240 * registering differences since the server forked to save */
7241 redisClient *slave;
7242 listNode *ln;
7243 listIter li;
7244
7245 listRewind(server.slaves,&li);
7246 while((ln = listNext(&li))) {
7247 slave = ln->value;
7248 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7249 }
7250 if (ln) {
7251 /* Perfect, the server is already registering differences for
7252 * another slave. Set the right state, and copy the buffer. */
7253 listRelease(c->reply);
7254 c->reply = listDup(slave->reply);
7255 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7256 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7257 } else {
7258 /* No way, we need to wait for the next BGSAVE in order to
7259 * register differences */
7260 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7261 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7262 }
7263 } else {
7264 /* Ok we don't have a BGSAVE in progress, let's start one */
7265 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7266 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7267 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7268 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7269 return;
7270 }
7271 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7272 }
7273 c->repldbfd = -1;
7274 c->flags |= REDIS_SLAVE;
7275 c->slaveseldb = 0;
7276 listAddNodeTail(server.slaves,c);
7277 return;
7278 }
7279
7280 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7281 redisClient *slave = privdata;
7282 REDIS_NOTUSED(el);
7283 REDIS_NOTUSED(mask);
7284 char buf[REDIS_IOBUF_LEN];
7285 ssize_t nwritten, buflen;
7286
7287 if (slave->repldboff == 0) {
7288 /* Write the bulk write count before to transfer the DB. In theory here
7289 * we don't know how much room there is in the output buffer of the
7290 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7291 * operations) will never be smaller than the few bytes we need. */
7292 sds bulkcount;
7293
7294 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7295 slave->repldbsize);
7296 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7297 {
7298 sdsfree(bulkcount);
7299 freeClient(slave);
7300 return;
7301 }
7302 sdsfree(bulkcount);
7303 }
7304 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7305 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7306 if (buflen <= 0) {
7307 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7308 (buflen == 0) ? "premature EOF" : strerror(errno));
7309 freeClient(slave);
7310 return;
7311 }
7312 if ((nwritten = write(fd,buf,buflen)) == -1) {
7313 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7314 strerror(errno));
7315 freeClient(slave);
7316 return;
7317 }
7318 slave->repldboff += nwritten;
7319 if (slave->repldboff == slave->repldbsize) {
7320 close(slave->repldbfd);
7321 slave->repldbfd = -1;
7322 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7323 slave->replstate = REDIS_REPL_ONLINE;
7324 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7325 sendReplyToClient, slave) == AE_ERR) {
7326 freeClient(slave);
7327 return;
7328 }
7329 addReplySds(slave,sdsempty());
7330 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7331 }
7332 }
7333
7334 /* This function is called at the end of every backgrond saving.
7335 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7336 * otherwise REDIS_ERR is passed to the function.
7337 *
7338 * The goal of this function is to handle slaves waiting for a successful
7339 * background saving in order to perform non-blocking synchronization. */
7340 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7341 listNode *ln;
7342 int startbgsave = 0;
7343 listIter li;
7344
7345 listRewind(server.slaves,&li);
7346 while((ln = listNext(&li))) {
7347 redisClient *slave = ln->value;
7348
7349 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7350 startbgsave = 1;
7351 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7352 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7353 struct redis_stat buf;
7354
7355 if (bgsaveerr != REDIS_OK) {
7356 freeClient(slave);
7357 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7358 continue;
7359 }
7360 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7361 redis_fstat(slave->repldbfd,&buf) == -1) {
7362 freeClient(slave);
7363 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7364 continue;
7365 }
7366 slave->repldboff = 0;
7367 slave->repldbsize = buf.st_size;
7368 slave->replstate = REDIS_REPL_SEND_BULK;
7369 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7370 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7371 freeClient(slave);
7372 continue;
7373 }
7374 }
7375 }
7376 if (startbgsave) {
7377 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7378 listIter li;
7379
7380 listRewind(server.slaves,&li);
7381 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7382 while((ln = listNext(&li))) {
7383 redisClient *slave = ln->value;
7384
7385 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7386 freeClient(slave);
7387 }
7388 }
7389 }
7390 }
7391
7392 static int syncWithMaster(void) {
7393 char buf[1024], tmpfile[256], authcmd[1024];
7394 long dumpsize;
7395 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7396 int dfd, maxtries = 5;
7397
7398 if (fd == -1) {
7399 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7400 strerror(errno));
7401 return REDIS_ERR;
7402 }
7403
7404 /* AUTH with the master if required. */
7405 if(server.masterauth) {
7406 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7407 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7408 close(fd);
7409 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7410 strerror(errno));
7411 return REDIS_ERR;
7412 }
7413 /* Read the AUTH result. */
7414 if (syncReadLine(fd,buf,1024,3600) == -1) {
7415 close(fd);
7416 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7417 strerror(errno));
7418 return REDIS_ERR;
7419 }
7420 if (buf[0] != '+') {
7421 close(fd);
7422 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7423 return REDIS_ERR;
7424 }
7425 }
7426
7427 /* Issue the SYNC command */
7428 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7429 close(fd);
7430 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7431 strerror(errno));
7432 return REDIS_ERR;
7433 }
7434 /* Read the bulk write count */
7435 if (syncReadLine(fd,buf,1024,3600) == -1) {
7436 close(fd);
7437 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7438 strerror(errno));
7439 return REDIS_ERR;
7440 }
7441 if (buf[0] != '$') {
7442 close(fd);
7443 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7444 return REDIS_ERR;
7445 }
7446 dumpsize = strtol(buf+1,NULL,10);
7447 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7448 /* Read the bulk write data on a temp file */
7449 while(maxtries--) {
7450 snprintf(tmpfile,256,
7451 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7452 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7453 if (dfd != -1) break;
7454 sleep(1);
7455 }
7456 if (dfd == -1) {
7457 close(fd);
7458 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7459 return REDIS_ERR;
7460 }
7461 while(dumpsize) {
7462 int nread, nwritten;
7463
7464 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7465 if (nread == -1) {
7466 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7467 strerror(errno));
7468 close(fd);
7469 close(dfd);
7470 return REDIS_ERR;
7471 }
7472 nwritten = write(dfd,buf,nread);
7473 if (nwritten == -1) {
7474 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7475 close(fd);
7476 close(dfd);
7477 return REDIS_ERR;
7478 }
7479 dumpsize -= nread;
7480 }
7481 close(dfd);
7482 if (rename(tmpfile,server.dbfilename) == -1) {
7483 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7484 unlink(tmpfile);
7485 close(fd);
7486 return REDIS_ERR;
7487 }
7488 emptyDb();
7489 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7490 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7491 close(fd);
7492 return REDIS_ERR;
7493 }
7494 server.master = createClient(fd);
7495 server.master->flags |= REDIS_MASTER;
7496 server.master->authenticated = 1;
7497 server.replstate = REDIS_REPL_CONNECTED;
7498 return REDIS_OK;
7499 }
7500
7501 static void slaveofCommand(redisClient *c) {
7502 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7503 !strcasecmp(c->argv[2]->ptr,"one")) {
7504 if (server.masterhost) {
7505 sdsfree(server.masterhost);
7506 server.masterhost = NULL;
7507 if (server.master) freeClient(server.master);
7508 server.replstate = REDIS_REPL_NONE;
7509 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7510 }
7511 } else {
7512 sdsfree(server.masterhost);
7513 server.masterhost = sdsdup(c->argv[1]->ptr);
7514 server.masterport = atoi(c->argv[2]->ptr);
7515 if (server.master) freeClient(server.master);
7516 server.replstate = REDIS_REPL_CONNECT;
7517 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7518 server.masterhost, server.masterport);
7519 }
7520 addReply(c,shared.ok);
7521 }
7522
7523 /* ============================ Maxmemory directive ======================== */
7524
7525 /* Try to free one object form the pre-allocated objects free list.
7526 * This is useful under low mem conditions as by default we take 1 million
7527 * free objects allocated. On success REDIS_OK is returned, otherwise
7528 * REDIS_ERR. */
7529 static int tryFreeOneObjectFromFreelist(void) {
7530 robj *o;
7531
7532 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7533 if (listLength(server.objfreelist)) {
7534 listNode *head = listFirst(server.objfreelist);
7535 o = listNodeValue(head);
7536 listDelNode(server.objfreelist,head);
7537 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7538 zfree(o);
7539 return REDIS_OK;
7540 } else {
7541 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7542 return REDIS_ERR;
7543 }
7544 }
7545
7546 /* This function gets called when 'maxmemory' is set on the config file to limit
7547 * the max memory used by the server, and we are out of memory.
7548 * This function will try to, in order:
7549 *
7550 * - Free objects from the free list
7551 * - Try to remove keys with an EXPIRE set
7552 *
7553 * It is not possible to free enough memory to reach used-memory < maxmemory
7554 * the server will start refusing commands that will enlarge even more the
7555 * memory usage.
7556 */
7557 static void freeMemoryIfNeeded(void) {
7558 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7559 int j, k, freed = 0;
7560
7561 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7562 for (j = 0; j < server.dbnum; j++) {
7563 int minttl = -1;
7564 robj *minkey = NULL;
7565 struct dictEntry *de;
7566
7567 if (dictSize(server.db[j].expires)) {
7568 freed = 1;
7569 /* From a sample of three keys drop the one nearest to
7570 * the natural expire */
7571 for (k = 0; k < 3; k++) {
7572 time_t t;
7573
7574 de = dictGetRandomKey(server.db[j].expires);
7575 t = (time_t) dictGetEntryVal(de);
7576 if (minttl == -1 || t < minttl) {
7577 minkey = dictGetEntryKey(de);
7578 minttl = t;
7579 }
7580 }
7581 deleteKey(server.db+j,minkey);
7582 }
7583 }
7584 if (!freed) return; /* nothing to free... */
7585 }
7586 }
7587
7588 /* ============================== Append Only file ========================== */
7589
7590 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7591 sds buf = sdsempty();
7592 int j;
7593 ssize_t nwritten;
7594 time_t now;
7595 robj *tmpargv[3];
7596
7597 /* The DB this command was targetting is not the same as the last command
7598 * we appendend. To issue a SELECT command is needed. */
7599 if (dictid != server.appendseldb) {
7600 char seldb[64];
7601
7602 snprintf(seldb,sizeof(seldb),"%d",dictid);
7603 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7604 (unsigned long)strlen(seldb),seldb);
7605 server.appendseldb = dictid;
7606 }
7607
7608 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7609 * EXPIREs into EXPIREATs calls */
7610 if (cmd->proc == expireCommand) {
7611 long when;
7612
7613 tmpargv[0] = createStringObject("EXPIREAT",8);
7614 tmpargv[1] = argv[1];
7615 incrRefCount(argv[1]);
7616 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7617 tmpargv[2] = createObject(REDIS_STRING,
7618 sdscatprintf(sdsempty(),"%ld",when));
7619 argv = tmpargv;
7620 }
7621
7622 /* Append the actual command */
7623 buf = sdscatprintf(buf,"*%d\r\n",argc);
7624 for (j = 0; j < argc; j++) {
7625 robj *o = argv[j];
7626
7627 o = getDecodedObject(o);
7628 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7629 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7630 buf = sdscatlen(buf,"\r\n",2);
7631 decrRefCount(o);
7632 }
7633
7634 /* Free the objects from the modified argv for EXPIREAT */
7635 if (cmd->proc == expireCommand) {
7636 for (j = 0; j < 3; j++)
7637 decrRefCount(argv[j]);
7638 }
7639
7640 /* We want to perform a single write. This should be guaranteed atomic
7641 * at least if the filesystem we are writing is a real physical one.
7642 * While this will save us against the server being killed I don't think
7643 * there is much to do about the whole server stopping for power problems
7644 * or alike */
7645 nwritten = write(server.appendfd,buf,sdslen(buf));
7646 if (nwritten != (signed)sdslen(buf)) {
7647 /* Ooops, we are in troubles. The best thing to do for now is
7648 * to simply exit instead to give the illusion that everything is
7649 * working as expected. */
7650 if (nwritten == -1) {
7651 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7652 } else {
7653 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7654 }
7655 exit(1);
7656 }
7657 /* If a background append only file rewriting is in progress we want to
7658 * accumulate the differences between the child DB and the current one
7659 * in a buffer, so that when the child process will do its work we
7660 * can append the differences to the new append only file. */
7661 if (server.bgrewritechildpid != -1)
7662 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7663
7664 sdsfree(buf);
7665 now = time(NULL);
7666 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7667 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7668 now-server.lastfsync > 1))
7669 {
7670 fsync(server.appendfd); /* Let's try to get this data on the disk */
7671 server.lastfsync = now;
7672 }
7673 }
7674
7675 /* In Redis commands are always executed in the context of a client, so in
7676 * order to load the append only file we need to create a fake client. */
7677 static struct redisClient *createFakeClient(void) {
7678 struct redisClient *c = zmalloc(sizeof(*c));
7679
7680 selectDb(c,0);
7681 c->fd = -1;
7682 c->querybuf = sdsempty();
7683 c->argc = 0;
7684 c->argv = NULL;
7685 c->flags = 0;
7686 /* We set the fake client as a slave waiting for the synchronization
7687 * so that Redis will not try to send replies to this client. */
7688 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7689 c->reply = listCreate();
7690 listSetFreeMethod(c->reply,decrRefCount);
7691 listSetDupMethod(c->reply,dupClientReplyValue);
7692 return c;
7693 }
7694
7695 static void freeFakeClient(struct redisClient *c) {
7696 sdsfree(c->querybuf);
7697 listRelease(c->reply);
7698 zfree(c);
7699 }
7700
7701 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7702 * error (the append only file is zero-length) REDIS_ERR is returned. On
7703 * fatal error an error message is logged and the program exists. */
7704 int loadAppendOnlyFile(char *filename) {
7705 struct redisClient *fakeClient;
7706 FILE *fp = fopen(filename,"r");
7707 struct redis_stat sb;
7708 unsigned long long loadedkeys = 0;
7709
7710 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7711 return REDIS_ERR;
7712
7713 if (fp == NULL) {
7714 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7715 exit(1);
7716 }
7717
7718 fakeClient = createFakeClient();
7719 while(1) {
7720 int argc, j;
7721 unsigned long len;
7722 robj **argv;
7723 char buf[128];
7724 sds argsds;
7725 struct redisCommand *cmd;
7726
7727 if (fgets(buf,sizeof(buf),fp) == NULL) {
7728 if (feof(fp))
7729 break;
7730 else
7731 goto readerr;
7732 }
7733 if (buf[0] != '*') goto fmterr;
7734 argc = atoi(buf+1);
7735 argv = zmalloc(sizeof(robj*)*argc);
7736 for (j = 0; j < argc; j++) {
7737 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7738 if (buf[0] != '$') goto fmterr;
7739 len = strtol(buf+1,NULL,10);
7740 argsds = sdsnewlen(NULL,len);
7741 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7742 argv[j] = createObject(REDIS_STRING,argsds);
7743 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7744 }
7745
7746 /* Command lookup */
7747 cmd = lookupCommand(argv[0]->ptr);
7748 if (!cmd) {
7749 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7750 exit(1);
7751 }
7752 /* Try object encoding */
7753 if (cmd->flags & REDIS_CMD_BULK)
7754 tryObjectEncoding(argv[argc-1]);
7755 /* Run the command in the context of a fake client */
7756 fakeClient->argc = argc;
7757 fakeClient->argv = argv;
7758 cmd->proc(fakeClient);
7759 /* Discard the reply objects list from the fake client */
7760 while(listLength(fakeClient->reply))
7761 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7762 /* Clean up, ready for the next command */
7763 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7764 zfree(argv);
7765 /* Handle swapping while loading big datasets when VM is on */
7766 loadedkeys++;
7767 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7768 while (zmalloc_used_memory() > server.vm_max_memory) {
7769 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7770 }
7771 }
7772 }
7773 fclose(fp);
7774 freeFakeClient(fakeClient);
7775 return REDIS_OK;
7776
7777 readerr:
7778 if (feof(fp)) {
7779 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7780 } else {
7781 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7782 }
7783 exit(1);
7784 fmterr:
7785 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7786 exit(1);
7787 }
7788
7789 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7790 static int fwriteBulkObject(FILE *fp, robj *obj) {
7791 char buf[128];
7792 int decrrc = 0;
7793
7794 /* Avoid the incr/decr ref count business if possible to help
7795 * copy-on-write (we are often in a child process when this function
7796 * is called).
7797 * Also makes sure that key objects don't get incrRefCount-ed when VM
7798 * is enabled */
7799 if (obj->encoding != REDIS_ENCODING_RAW) {
7800 obj = getDecodedObject(obj);
7801 decrrc = 1;
7802 }
7803 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7804 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7805 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7806 goto err;
7807 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7808 if (decrrc) decrRefCount(obj);
7809 return 1;
7810 err:
7811 if (decrrc) decrRefCount(obj);
7812 return 0;
7813 }
7814
/* Write binary-safe string into a file in the bulkformat
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it may contain any byte value and
 * does not need to be null terminated). Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: print it with the matching conversion so that
     * very large lengths are never emitted as negative numbers. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* An empty payload is legal: skip the write, fwrite() would return 0. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7826
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the number formatted with "%.17g". Returns 1 on success,
 * 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, that
     * must not be counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
7837
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the decimal representation of the number. Returns 1 on
 * success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, that
     * must not be counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
7848
7849 /* Write a sequence of commands able to fully rebuild the dataset into
7850 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7851 static int rewriteAppendOnlyFile(char *filename) {
7852 dictIterator *di = NULL;
7853 dictEntry *de;
7854 FILE *fp;
7855 char tmpfile[256];
7856 int j;
7857 time_t now = time(NULL);
7858
7859 /* Note that we have to use a different temp name here compared to the
7860 * one used by rewriteAppendOnlyFileBackground() function. */
7861 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7862 fp = fopen(tmpfile,"w");
7863 if (!fp) {
7864 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7865 return REDIS_ERR;
7866 }
7867 for (j = 0; j < server.dbnum; j++) {
7868 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7869 redisDb *db = server.db+j;
7870 dict *d = db->dict;
7871 if (dictSize(d) == 0) continue;
7872 di = dictGetIterator(d);
7873 if (!di) {
7874 fclose(fp);
7875 return REDIS_ERR;
7876 }
7877
7878 /* SELECT the new DB */
7879 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7880 if (fwriteBulkLong(fp,j) == 0) goto werr;
7881
7882 /* Iterate this DB writing every entry */
7883 while((de = dictNext(di)) != NULL) {
7884 robj *key, *o;
7885 time_t expiretime;
7886 int swapped;
7887
7888 key = dictGetEntryKey(de);
7889 /* If the value for this key is swapped, load a preview in memory.
7890 * We use a "swapped" flag to remember if we need to free the
7891 * value object instead to just increment the ref count anyway
7892 * in order to avoid copy-on-write of pages if we are forked() */
7893 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7894 key->storage == REDIS_VM_SWAPPING) {
7895 o = dictGetEntryVal(de);
7896 swapped = 0;
7897 } else {
7898 o = vmPreviewObject(key);
7899 swapped = 1;
7900 }
7901 expiretime = getExpire(db,key);
7902
7903 /* Save the key and associated value */
7904 if (o->type == REDIS_STRING) {
7905 /* Emit a SET command */
7906 char cmd[]="*3\r\n$3\r\nSET\r\n";
7907 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7908 /* Key and value */
7909 if (fwriteBulkObject(fp,key) == 0) goto werr;
7910 if (fwriteBulkObject(fp,o) == 0) goto werr;
7911 } else if (o->type == REDIS_LIST) {
7912 /* Emit the RPUSHes needed to rebuild the list */
7913 list *list = o->ptr;
7914 listNode *ln;
7915 listIter li;
7916
7917 listRewind(list,&li);
7918 while((ln = listNext(&li))) {
7919 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7920 robj *eleobj = listNodeValue(ln);
7921
7922 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7923 if (fwriteBulkObject(fp,key) == 0) goto werr;
7924 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7925 }
7926 } else if (o->type == REDIS_SET) {
7927 /* Emit the SADDs needed to rebuild the set */
7928 dict *set = o->ptr;
7929 dictIterator *di = dictGetIterator(set);
7930 dictEntry *de;
7931
7932 while((de = dictNext(di)) != NULL) {
7933 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7934 robj *eleobj = dictGetEntryKey(de);
7935
7936 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7937 if (fwriteBulkObject(fp,key) == 0) goto werr;
7938 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7939 }
7940 dictReleaseIterator(di);
7941 } else if (o->type == REDIS_ZSET) {
7942 /* Emit the ZADDs needed to rebuild the sorted set */
7943 zset *zs = o->ptr;
7944 dictIterator *di = dictGetIterator(zs->dict);
7945 dictEntry *de;
7946
7947 while((de = dictNext(di)) != NULL) {
7948 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7949 robj *eleobj = dictGetEntryKey(de);
7950 double *score = dictGetEntryVal(de);
7951
7952 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7953 if (fwriteBulkObject(fp,key) == 0) goto werr;
7954 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7955 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7956 }
7957 dictReleaseIterator(di);
7958 } else if (o->type == REDIS_HASH) {
7959 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7960
7961 /* Emit the HSETs needed to rebuild the hash */
7962 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7963 unsigned char *p = zipmapRewind(o->ptr);
7964 unsigned char *field, *val;
7965 unsigned int flen, vlen;
7966
7967 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7968 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7969 if (fwriteBulkObject(fp,key) == 0) goto werr;
7970 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7971 return -1;
7972 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7973 return -1;
7974 }
7975 } else {
7976 dictIterator *di = dictGetIterator(o->ptr);
7977 dictEntry *de;
7978
7979 while((de = dictNext(di)) != NULL) {
7980 robj *field = dictGetEntryKey(de);
7981 robj *val = dictGetEntryVal(de);
7982
7983 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7984 if (fwriteBulkObject(fp,key) == 0) goto werr;
7985 if (fwriteBulkObject(fp,field) == -1) return -1;
7986 if (fwriteBulkObject(fp,val) == -1) return -1;
7987 }
7988 dictReleaseIterator(di);
7989 }
7990 } else {
7991 redisAssert(0);
7992 }
7993 /* Save the expire time */
7994 if (expiretime != -1) {
7995 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7996 /* If this key is already expired skip it */
7997 if (expiretime < now) continue;
7998 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7999 if (fwriteBulkObject(fp,key) == 0) goto werr;
8000 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8001 }
8002 if (swapped) decrRefCount(o);
8003 }
8004 dictReleaseIterator(di);
8005 }
8006
8007 /* Make sure data will not remain on the OS's output buffers */
8008 fflush(fp);
8009 fsync(fileno(fp));
8010 fclose(fp);
8011
8012 /* Use RENAME to make sure the DB file is changed atomically only
8013 * if the generate DB file is ok. */
8014 if (rename(tmpfile,filename) == -1) {
8015 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8016 unlink(tmpfile);
8017 return REDIS_ERR;
8018 }
8019 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8020 return REDIS_OK;
8021
8022 werr:
8023 fclose(fp);
8024 unlink(tmpfile);
8025 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8026 if (di) dictReleaseIterator(di);
8027 return REDIS_ERR;
8028 }
8029
8030 /* This is how rewriting of the append only file in background works:
8031 *
8032 * 1) The user calls BGREWRITEAOF
8033 * 2) Redis calls this function, that forks():
8034 * 2a) the child rewrite the append only file in a temp file.
8035 * 2b) the parent accumulates differences in server.bgrewritebuf.
8036 * 3) When the child finished '2a' exists.
8037 * 4) The parent will trap the exit code, if it's OK, will append the
8038 * data accumulated into server.bgrewritebuf into the temp file, and
8039 * finally will rename(2) the temp file in the actual file name.
8040 * The the new file is reopened as the new append only file. Profit!
8041 */
8042 static int rewriteAppendOnlyFileBackground(void) {
8043 pid_t childpid;
8044
8045 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8046 if (server.vm_enabled) waitEmptyIOJobsQueue();
8047 if ((childpid = fork()) == 0) {
8048 /* Child */
8049 char tmpfile[256];
8050
8051 if (server.vm_enabled) vmReopenSwapFile();
8052 close(server.fd);
8053 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8054 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8055 _exit(0);
8056 } else {
8057 _exit(1);
8058 }
8059 } else {
8060 /* Parent */
8061 if (childpid == -1) {
8062 redisLog(REDIS_WARNING,
8063 "Can't rewrite append only file in background: fork: %s",
8064 strerror(errno));
8065 return REDIS_ERR;
8066 }
8067 redisLog(REDIS_NOTICE,
8068 "Background append only file rewriting started by pid %d",childpid);
8069 server.bgrewritechildpid = childpid;
8070 updateDictResizePolicy();
8071 /* We set appendseldb to -1 in order to force the next call to the
8072 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8073 * accumulated by the parent into server.bgrewritebuf will start
8074 * with a SELECT statement and it will be safe to merge. */
8075 server.appendseldb = -1;
8076 return REDIS_OK;
8077 }
8078 return REDIS_OK; /* unreached */
8079 }
8080
8081 static void bgrewriteaofCommand(redisClient *c) {
8082 if (server.bgrewritechildpid != -1) {
8083 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8084 return;
8085 }
8086 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8087 char *status = "+Background append only file rewriting started\r\n";
8088 addReplySds(c,sdsnew(status));
8089 } else {
8090 addReply(c,shared.err);
8091 }
8092 }
8093
/* Remove the temp file named after 'childpid', that is the name used by
 * the child process started by rewriteAppendOnlyFileBackground(). The
 * result of unlink() is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8100
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8121
8122 /* =================== Virtual Memory - Blocking Side ====================== */
8123
8124 /* substitute the first occurrence of '%p' with the process pid in the
8125 * swap file name. */
8126 static void expandVmSwapFilename(void) {
8127 char *p = strstr(server.vm_swap_file,"%p");
8128 sds new;
8129
8130 if (!p) return;
8131 new = sdsempty();
8132 *p = '\0';
8133 new = sdscat(new,server.vm_swap_file);
8134 new = sdscatprintf(new,"%ld",(long) getpid());
8135 new = sdscat(new,p+2);
8136 zfree(server.vm_swap_file);
8137 server.vm_swap_file = new;
8138 }
8139
8140 static void vmInit(void) {
8141 off_t totsize;
8142 int pipefds[2];
8143 size_t stacksize;
8144
8145 if (server.vm_max_threads != 0)
8146 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8147
8148 expandVmSwapFilename();
8149 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8150 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8151 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8152 }
8153 if (server.vm_fp == NULL) {
8154 redisLog(REDIS_WARNING,
8155 "Impossible to open the swap file: %s. Exiting.",
8156 strerror(errno));
8157 exit(1);
8158 }
8159 server.vm_fd = fileno(server.vm_fp);
8160 server.vm_next_page = 0;
8161 server.vm_near_pages = 0;
8162 server.vm_stats_used_pages = 0;
8163 server.vm_stats_swapped_objects = 0;
8164 server.vm_stats_swapouts = 0;
8165 server.vm_stats_swapins = 0;
8166 totsize = server.vm_pages*server.vm_page_size;
8167 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8168 if (ftruncate(server.vm_fd,totsize) == -1) {
8169 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8170 strerror(errno));
8171 exit(1);
8172 } else {
8173 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8174 }
8175 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8176 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8177 (long long) (server.vm_pages+7)/8, server.vm_pages);
8178 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8179
8180 /* Initialize threaded I/O (used by Virtual Memory) */
8181 server.io_newjobs = listCreate();
8182 server.io_processing = listCreate();
8183 server.io_processed = listCreate();
8184 server.io_ready_clients = listCreate();
8185 pthread_mutex_init(&server.io_mutex,NULL);
8186 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8187 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8188 server.io_active_threads = 0;
8189 if (pipe(pipefds) == -1) {
8190 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8191 ,strerror(errno));
8192 exit(1);
8193 }
8194 server.io_ready_pipe_read = pipefds[0];
8195 server.io_ready_pipe_write = pipefds[1];
8196 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8197 /* LZF requires a lot of stack */
8198 pthread_attr_init(&server.io_threads_attr);
8199 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8200 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8201 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8202 /* Listen for events in the threaded I/O pipe */
8203 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8204 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8205 oom("creating file event");
8206 }
8207
8208 /* Mark the page as used */
8209 static void vmMarkPageUsed(off_t page) {
8210 off_t byte = page/8;
8211 int bit = page&7;
8212 redisAssert(vmFreePage(page) == 1);
8213 server.vm_bitmap[byte] |= 1<<bit;
8214 }
8215
8216 /* Mark N contiguous pages as used, with 'page' being the first. */
8217 static void vmMarkPagesUsed(off_t page, off_t count) {
8218 off_t j;
8219
8220 for (j = 0; j < count; j++)
8221 vmMarkPageUsed(page+j);
8222 server.vm_stats_used_pages += count;
8223 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8224 (long long)count, (long long)page);
8225 }
8226
8227 /* Mark the page as free */
8228 static void vmMarkPageFree(off_t page) {
8229 off_t byte = page/8;
8230 int bit = page&7;
8231 redisAssert(vmFreePage(page) == 0);
8232 server.vm_bitmap[byte] &= ~(1<<bit);
8233 }
8234
8235 /* Mark N contiguous pages as free, with 'page' being the first. */
8236 static void vmMarkPagesFree(off_t page, off_t count) {
8237 off_t j;
8238
8239 for (j = 0; j < count; j++)
8240 vmMarkPageFree(page+j);
8241 server.vm_stats_used_pages -= count;
8242 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8243 (long long)count, (long long)page);
8244 }
8245
8246 /* Test if the page is free */
8247 static int vmFreePage(off_t page) {
8248 off_t byte = page/8;
8249 int bit = page&7;
8250 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8251 }
8252
8253 /* Find N contiguous free pages storing the first page of the cluster in *first.
8254 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8255 * REDIS_ERR is returned.
8256 *
8257 * This function uses a simple algorithm: we try to allocate
8258 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8259 * again from the start of the swap file searching for free spaces.
8260 *
8261 * If it looks pretty clear that there are no free pages near our offset
8262 * we try to find less populated places doing a forward jump of
8263 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8264 * without hurry, and then we jump again and so forth...
8265 *
8266 * This function can be improved using a free list to avoid to guess
8267 * too much, since we could collect data about freed pages.
8268 *
8269 * note: I implemented this function just after watching an episode of
8270 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8271 */
8272 static int vmFindContiguousPages(off_t *first, off_t n) {
8273 off_t base, offset = 0, since_jump = 0, numfree = 0;
8274
8275 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8276 server.vm_near_pages = 0;
8277 server.vm_next_page = 0;
8278 }
8279 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8280 base = server.vm_next_page;
8281
8282 while(offset < server.vm_pages) {
8283 off_t this = base+offset;
8284
8285 /* If we overflow, restart from page zero */
8286 if (this >= server.vm_pages) {
8287 this -= server.vm_pages;
8288 if (this == 0) {
8289 /* Just overflowed, what we found on tail is no longer
8290 * interesting, as it's no longer contiguous. */
8291 numfree = 0;
8292 }
8293 }
8294 if (vmFreePage(this)) {
8295 /* This is a free page */
8296 numfree++;
8297 /* Already got N free pages? Return to the caller, with success */
8298 if (numfree == n) {
8299 *first = this-(n-1);
8300 server.vm_next_page = this+1;
8301 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8302 return REDIS_OK;
8303 }
8304 } else {
8305 /* The current one is not a free page */
8306 numfree = 0;
8307 }
8308
8309 /* Fast-forward if the current page is not free and we already
8310 * searched enough near this place. */
8311 since_jump++;
8312 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8313 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8314 since_jump = 0;
8315 /* Note that even if we rewind after the jump, we are don't need
8316 * to make sure numfree is set to zero as we only jump *if* it
8317 * is set to zero. */
8318 } else {
8319 /* Otherwise just check the next page */
8320 offset++;
8321 }
8322 }
8323 return REDIS_ERR;
8324 }
8325
8326 /* Write the specified object at the specified page of the swap file */
8327 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8328 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8329 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8330 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8331 redisLog(REDIS_WARNING,
8332 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8333 strerror(errno));
8334 return REDIS_ERR;
8335 }
8336 rdbSaveObject(server.vm_fp,o);
8337 fflush(server.vm_fp);
8338 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8339 return REDIS_OK;
8340 }
8341
8342 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8343 * needed to later retrieve the object into the key object.
8344 * If we can't find enough contiguous empty pages to swap the object on disk
8345 * REDIS_ERR is returned. */
8346 static int vmSwapObjectBlocking(robj *key, robj *val) {
8347 off_t pages = rdbSavedObjectPages(val,NULL);
8348 off_t page;
8349
8350 assert(key->storage == REDIS_VM_MEMORY);
8351 assert(key->refcount == 1);
8352 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8353 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8354 key->vm.page = page;
8355 key->vm.usedpages = pages;
8356 key->storage = REDIS_VM_SWAPPED;
8357 key->vtype = val->type;
8358 decrRefCount(val); /* Deallocate the object from memory. */
8359 vmMarkPagesUsed(page,pages);
8360 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8361 (unsigned char*) key->ptr,
8362 (unsigned long long) page, (unsigned long long) pages);
8363 server.vm_stats_swapped_objects++;
8364 server.vm_stats_swapouts++;
8365 return REDIS_OK;
8366 }
8367
8368 static robj *vmReadObjectFromSwap(off_t page, int type) {
8369 robj *o;
8370
8371 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8372 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8373 redisLog(REDIS_WARNING,
8374 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8375 strerror(errno));
8376 _exit(1);
8377 }
8378 o = rdbLoadObject(type,server.vm_fp);
8379 if (o == NULL) {
8380 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8381 _exit(1);
8382 }
8383 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8384 return o;
8385 }
8386
8387 /* Load the value object relative to the 'key' object from swap to memory.
8388 * The newly allocated object is returned.
8389 *
8390 * If preview is true the unserialized object is returned to the caller but
8391 * no changes are made to the key object, nor the pages are marked as freed */
8392 static robj *vmGenericLoadObject(robj *key, int preview) {
8393 robj *val;
8394
8395 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8396 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8397 if (!preview) {
8398 key->storage = REDIS_VM_MEMORY;
8399 key->vm.atime = server.unixtime;
8400 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8401 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8402 (unsigned char*) key->ptr);
8403 server.vm_stats_swapped_objects--;
8404 } else {
8405 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8406 (unsigned char*) key->ptr);
8407 }
8408 server.vm_stats_swapins++;
8409 return val;
8410 }
8411
8412 /* Plain object loading, from swap to memory */
8413 static robj *vmLoadObject(robj *key) {
8414 /* If we are loading the object in background, stop it, we
8415 * need to load this object synchronously ASAP. */
8416 if (key->storage == REDIS_VM_LOADING)
8417 vmCancelThreadedIOJob(key);
8418 return vmGenericLoadObject(key,0);
8419 }
8420
8421 /* Just load the value on disk, without to modify the key.
8422 * This is useful when we want to perform some operation on the value
8423 * without to really bring it from swap to memory, like while saving the
8424 * dataset or rewriting the append only log. */
8425 static robj *vmPreviewObject(robj *key) {
8426 return vmGenericLoadObject(key,1);
8427 }
8428
8429 /* How a good candidate is this object for swapping?
8430 * The better candidate it is, the greater the returned value.
8431 *
8432 * Currently we try to perform a fast estimation of the object size in
8433 * memory, and combine it with aging informations.
8434 *
8435 * Basically swappability = idle-time * log(estimated size)
8436 *
8437 * Bigger objects are preferred over smaller objects, but not
8438 * proportionally, this is why we use the logarithm. This algorithm is
8439 * just a first try and will probably be tuned later. */
8440 static double computeObjectSwappability(robj *o) {
8441 time_t age = server.unixtime - o->vm.atime;
8442 long asize = 0;
8443 list *l;
8444 dict *d;
8445 struct dictEntry *de;
8446 int z;
8447
8448 if (age <= 0) return 0;
8449 switch(o->type) {
8450 case REDIS_STRING:
8451 if (o->encoding != REDIS_ENCODING_RAW) {
8452 asize = sizeof(*o);
8453 } else {
8454 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8455 }
8456 break;
8457 case REDIS_LIST:
8458 l = o->ptr;
8459 listNode *ln = listFirst(l);
8460
8461 asize = sizeof(list);
8462 if (ln) {
8463 robj *ele = ln->value;
8464 long elesize;
8465
8466 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8467 (sizeof(*o)+sdslen(ele->ptr)) :
8468 sizeof(*o);
8469 asize += (sizeof(listNode)+elesize)*listLength(l);
8470 }
8471 break;
8472 case REDIS_SET:
8473 case REDIS_ZSET:
8474 z = (o->type == REDIS_ZSET);
8475 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8476
8477 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8478 if (z) asize += sizeof(zset)-sizeof(dict);
8479 if (dictSize(d)) {
8480 long elesize;
8481 robj *ele;
8482
8483 de = dictGetRandomKey(d);
8484 ele = dictGetEntryKey(de);
8485 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8486 (sizeof(*o)+sdslen(ele->ptr)) :
8487 sizeof(*o);
8488 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8489 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8490 }
8491 break;
8492 case REDIS_HASH:
8493 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8494 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8495 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8496 unsigned int klen, vlen;
8497 unsigned char *key, *val;
8498
8499 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8500 klen = 0;
8501 vlen = 0;
8502 }
8503 asize = len*(klen+vlen+3);
8504 } else if (o->encoding == REDIS_ENCODING_HT) {
8505 d = o->ptr;
8506 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8507 if (dictSize(d)) {
8508 long elesize;
8509 robj *ele;
8510
8511 de = dictGetRandomKey(d);
8512 ele = dictGetEntryKey(de);
8513 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8514 (sizeof(*o)+sdslen(ele->ptr)) :
8515 sizeof(*o);
8516 ele = dictGetEntryVal(de);
8517 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8518 (sizeof(*o)+sdslen(ele->ptr)) :
8519 sizeof(*o);
8520 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8521 }
8522 }
8523 break;
8524 }
8525 return (double)age*log(1+asize);
8526 }
8527
8528 /* Try to swap an object that's a good candidate for swapping.
8529 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8530 * to swap any object at all.
8531 *
8532 * If 'usethreaded' is true, Redis will try to swap the object in background
8533 * using I/O threads. */
8534 static int vmSwapOneObject(int usethreads) {
8535 int j, i;
8536 struct dictEntry *best = NULL;
8537 double best_swappability = 0;
8538 redisDb *best_db = NULL;
8539 robj *key, *val;
8540
8541 for (j = 0; j < server.dbnum; j++) {
8542 redisDb *db = server.db+j;
8543 /* Why maxtries is set to 100?
8544 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8545 * are swappable objects */
8546 int maxtries = 100;
8547
8548 if (dictSize(db->dict) == 0) continue;
8549 for (i = 0; i < 5; i++) {
8550 dictEntry *de;
8551 double swappability;
8552
8553 if (maxtries) maxtries--;
8554 de = dictGetRandomKey(db->dict);
8555 key = dictGetEntryKey(de);
8556 val = dictGetEntryVal(de);
8557 /* Only swap objects that are currently in memory.
8558 *
8559 * Also don't swap shared objects if threaded VM is on, as we
8560 * try to ensure that the main thread does not touch the
8561 * object while the I/O thread is using it, but we can't
8562 * control other keys without adding additional mutex. */
8563 if (key->storage != REDIS_VM_MEMORY ||
8564 (server.vm_max_threads != 0 && val->refcount != 1)) {
8565 if (maxtries) i--; /* don't count this try */
8566 continue;
8567 }
8568 swappability = computeObjectSwappability(val);
8569 if (!best || swappability > best_swappability) {
8570 best = de;
8571 best_swappability = swappability;
8572 best_db = db;
8573 }
8574 }
8575 }
8576 if (best == NULL) return REDIS_ERR;
8577 key = dictGetEntryKey(best);
8578 val = dictGetEntryVal(best);
8579
8580 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8581 key->ptr, best_swappability);
8582
8583 /* Unshare the key if needed */
8584 if (key->refcount > 1) {
8585 robj *newkey = dupStringObject(key);
8586 decrRefCount(key);
8587 key = dictGetEntryKey(best) = newkey;
8588 }
8589 /* Swap it */
8590 if (usethreads) {
8591 vmSwapObjectThreaded(key,val,best_db);
8592 return REDIS_OK;
8593 } else {
8594 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8595 dictGetEntryVal(best) = NULL;
8596 return REDIS_OK;
8597 } else {
8598 return REDIS_ERR;
8599 }
8600 }
8601 }
8602
/* Swap one object out using the blocking (non threaded) code path.
 * Returns whatever vmSwapOneObject() returns: REDIS_OK or REDIS_ERR. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8606
/* Swap one object out using the threaded I/O code path.
 * Returns whatever vmSwapOneObject() returns: REDIS_OK or REDIS_ERR. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8610
8611 /* Return true if it's safe to swap out objects in a given moment.
8612 * Basically we don't want to swap objects out while there is a BGSAVE
8613 * or a BGAEOREWRITE running in backgroud. */
8614 static int vmCanSwapOut(void) {
8615 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8616 }
8617
8618 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8619 * and was deleted. Otherwise 0 is returned. */
8620 static int deleteIfSwapped(redisDb *db, robj *key) {
8621 dictEntry *de;
8622 robj *foundkey;
8623
8624 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8625 foundkey = dictGetEntryKey(de);
8626 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8627 deleteKey(db,key);
8628 return 1;
8629 }
8630
8631 /* =================== Virtual Memory - Threaded I/O ======================= */
8632
8633 static void freeIOJob(iojob *j) {
8634 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8635 j->type == REDIS_IOJOB_DO_SWAP ||
8636 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8637 decrRefCount(j->val);
8638 /* We don't decrRefCount the j->key field as we did't incremented
8639 * the count creating IO Jobs. This is because the key field here is
8640 * just used as an indentifier and if a key is removed the Job should
8641 * never be touched again. */
8642 zfree(j);
8643 }
8644
8645 /* Every time a thread finished a Job, it writes a byte into the write side
8646 * of an unix pipe in order to "awake" the main thread, and this function
8647 * is called. */
8648 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8649 int mask)
8650 {
8651 char buf[1];
8652 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8653 REDIS_NOTUSED(el);
8654 REDIS_NOTUSED(mask);
8655 REDIS_NOTUSED(privdata);
8656
8657 /* For every byte we read in the read side of the pipe, there is one
8658 * I/O job completed to process. */
8659 while((retval = read(fd,buf,1)) == 1) {
8660 iojob *j;
8661 listNode *ln;
8662 robj *key;
8663 struct dictEntry *de;
8664
8665 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8666
8667 /* Get the processed element (the oldest one) */
8668 lockThreadedIO();
8669 assert(listLength(server.io_processed) != 0);
8670 if (toprocess == -1) {
8671 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8672 if (toprocess <= 0) toprocess = 1;
8673 }
8674 ln = listFirst(server.io_processed);
8675 j = ln->value;
8676 listDelNode(server.io_processed,ln);
8677 unlockThreadedIO();
8678 /* If this job is marked as canceled, just ignore it */
8679 if (j->canceled) {
8680 freeIOJob(j);
8681 continue;
8682 }
8683 /* Post process it in the main thread, as there are things we
8684 * can do just here to avoid race conditions and/or invasive locks */
8685 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8686 de = dictFind(j->db->dict,j->key);
8687 assert(de != NULL);
8688 key = dictGetEntryKey(de);
8689 if (j->type == REDIS_IOJOB_LOAD) {
8690 redisDb *db;
8691
8692 /* Key loaded, bring it at home */
8693 key->storage = REDIS_VM_MEMORY;
8694 key->vm.atime = server.unixtime;
8695 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8696 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8697 (unsigned char*) key->ptr);
8698 server.vm_stats_swapped_objects--;
8699 server.vm_stats_swapins++;
8700 dictGetEntryVal(de) = j->val;
8701 incrRefCount(j->val);
8702 db = j->db;
8703 freeIOJob(j);
8704 /* Handle clients waiting for this key to be loaded. */
8705 handleClientsBlockedOnSwappedKey(db,key);
8706 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8707 /* Now we know the amount of pages required to swap this object.
8708 * Let's find some space for it, and queue this task again
8709 * rebranded as REDIS_IOJOB_DO_SWAP. */
8710 if (!vmCanSwapOut() ||
8711 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8712 {
8713 /* Ooops... no space or we can't swap as there is
8714 * a fork()ed Redis trying to save stuff on disk. */
8715 freeIOJob(j);
8716 key->storage = REDIS_VM_MEMORY; /* undo operation */
8717 } else {
8718 /* Note that we need to mark this pages as used now,
8719 * if the job will be canceled, we'll mark them as freed
8720 * again. */
8721 vmMarkPagesUsed(j->page,j->pages);
8722 j->type = REDIS_IOJOB_DO_SWAP;
8723 lockThreadedIO();
8724 queueIOJob(j);
8725 unlockThreadedIO();
8726 }
8727 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8728 robj *val;
8729
8730 /* Key swapped. We can finally free some memory. */
8731 if (key->storage != REDIS_VM_SWAPPING) {
8732 printf("key->storage: %d\n",key->storage);
8733 printf("key->name: %s\n",(char*)key->ptr);
8734 printf("key->refcount: %d\n",key->refcount);
8735 printf("val: %p\n",(void*)j->val);
8736 printf("val->type: %d\n",j->val->type);
8737 printf("val->ptr: %s\n",(char*)j->val->ptr);
8738 }
8739 redisAssert(key->storage == REDIS_VM_SWAPPING);
8740 val = dictGetEntryVal(de);
8741 key->vm.page = j->page;
8742 key->vm.usedpages = j->pages;
8743 key->storage = REDIS_VM_SWAPPED;
8744 key->vtype = j->val->type;
8745 decrRefCount(val); /* Deallocate the object from memory. */
8746 dictGetEntryVal(de) = NULL;
8747 redisLog(REDIS_DEBUG,
8748 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8749 (unsigned char*) key->ptr,
8750 (unsigned long long) j->page, (unsigned long long) j->pages);
8751 server.vm_stats_swapped_objects++;
8752 server.vm_stats_swapouts++;
8753 freeIOJob(j);
8754 /* Put a few more swap requests in queue if we are still
8755 * out of memory */
8756 if (trytoswap && vmCanSwapOut() &&
8757 zmalloc_used_memory() > server.vm_max_memory)
8758 {
8759 int more = 1;
8760 while(more) {
8761 lockThreadedIO();
8762 more = listLength(server.io_newjobs) <
8763 (unsigned) server.vm_max_threads;
8764 unlockThreadedIO();
8765 /* Don't waste CPU time if swappable objects are rare. */
8766 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8767 trytoswap = 0;
8768 break;
8769 }
8770 }
8771 }
8772 }
8773 processed++;
8774 if (processed == toprocess) return;
8775 }
8776 if (retval < 0 && errno != EAGAIN) {
8777 redisLog(REDIS_WARNING,
8778 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8779 strerror(errno));
8780 }
8781 }
8782
8783 static void lockThreadedIO(void) {
8784 pthread_mutex_lock(&server.io_mutex);
8785 }
8786
8787 static void unlockThreadedIO(void) {
8788 pthread_mutex_unlock(&server.io_mutex);
8789 }
8790
8791 /* Remove the specified object from the threaded I/O queue if still not
8792 * processed, otherwise make sure to flag it as canceled. */
8793 static void vmCancelThreadedIOJob(robj *o) {
8794 list *lists[3] = {
8795 server.io_newjobs, /* 0 */
8796 server.io_processing, /* 1 */
8797 server.io_processed /* 2 */
8798 };
8799 int i;
8800
8801 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8802 again:
8803 lockThreadedIO();
8804 /* Search for a matching key in one of the queues */
8805 for (i = 0; i < 3; i++) {
8806 listNode *ln;
8807 listIter li;
8808
8809 listRewind(lists[i],&li);
8810 while ((ln = listNext(&li)) != NULL) {
8811 iojob *job = ln->value;
8812
8813 if (job->canceled) continue; /* Skip this, already canceled. */
8814 if (job->key == o) {
8815 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8816 (void*)job, (char*)o->ptr, job->type, i);
8817 /* Mark the pages as free since the swap didn't happened
8818 * or happened but is now discarded. */
8819 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8820 vmMarkPagesFree(job->page,job->pages);
8821 /* Cancel the job. It depends on the list the job is
8822 * living in. */
8823 switch(i) {
8824 case 0: /* io_newjobs */
8825 /* If the job was yet not processed the best thing to do
8826 * is to remove it from the queue at all */
8827 freeIOJob(job);
8828 listDelNode(lists[i],ln);
8829 break;
8830 case 1: /* io_processing */
8831 /* Oh Shi- the thread is messing with the Job:
8832 *
8833 * Probably it's accessing the object if this is a
8834 * PREPARE_SWAP or DO_SWAP job.
8835 * If it's a LOAD job it may be reading from disk and
8836 * if we don't wait for the job to terminate before to
8837 * cancel it, maybe in a few microseconds data can be
8838 * corrupted in this pages. So the short story is:
8839 *
8840 * Better to wait for the job to move into the
8841 * next queue (processed)... */
8842
8843 /* We try again and again until the job is completed. */
8844 unlockThreadedIO();
8845 /* But let's wait some time for the I/O thread
8846 * to finish with this job. After all this condition
8847 * should be very rare. */
8848 usleep(1);
8849 goto again;
8850 case 2: /* io_processed */
8851 /* The job was already processed, that's easy...
8852 * just mark it as canceled so that we'll ignore it
8853 * when processing completed jobs. */
8854 job->canceled = 1;
8855 break;
8856 }
8857 /* Finally we have to adjust the storage type of the object
8858 * in order to "UNDO" the operaiton. */
8859 if (o->storage == REDIS_VM_LOADING)
8860 o->storage = REDIS_VM_SWAPPED;
8861 else if (o->storage == REDIS_VM_SWAPPING)
8862 o->storage = REDIS_VM_MEMORY;
8863 unlockThreadedIO();
8864 return;
8865 }
8866 }
8867 }
8868 unlockThreadedIO();
8869 assert(1 != 1); /* We should never reach this */
8870 }
8871
8872 static void *IOThreadEntryPoint(void *arg) {
8873 iojob *j;
8874 listNode *ln;
8875 REDIS_NOTUSED(arg);
8876
8877 pthread_detach(pthread_self());
8878 while(1) {
8879 /* Get a new job to process */
8880 lockThreadedIO();
8881 if (listLength(server.io_newjobs) == 0) {
8882 /* No new jobs in queue, exit. */
8883 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8884 (long) pthread_self());
8885 server.io_active_threads--;
8886 unlockThreadedIO();
8887 return NULL;
8888 }
8889 ln = listFirst(server.io_newjobs);
8890 j = ln->value;
8891 listDelNode(server.io_newjobs,ln);
8892 /* Add the job in the processing queue */
8893 j->thread = pthread_self();
8894 listAddNodeTail(server.io_processing,j);
8895 ln = listLast(server.io_processing); /* We use ln later to remove it */
8896 unlockThreadedIO();
8897 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8898 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8899
8900 /* Process the Job */
8901 if (j->type == REDIS_IOJOB_LOAD) {
8902 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8903 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8904 FILE *fp = fopen("/dev/null","w+");
8905 j->pages = rdbSavedObjectPages(j->val,fp);
8906 fclose(fp);
8907 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8908 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8909 j->canceled = 1;
8910 }
8911
8912 /* Done: insert the job into the processed queue */
8913 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8914 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8915 lockThreadedIO();
8916 listDelNode(server.io_processing,ln);
8917 listAddNodeTail(server.io_processed,j);
8918 unlockThreadedIO();
8919
8920 /* Signal the main thread there is new stuff to process */
8921 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8922 }
8923 return NULL; /* never reached */
8924 }
8925
8926 static void spawnIOThread(void) {
8927 pthread_t thread;
8928 sigset_t mask, omask;
8929 int err;
8930
8931 sigemptyset(&mask);
8932 sigaddset(&mask,SIGCHLD);
8933 sigaddset(&mask,SIGHUP);
8934 sigaddset(&mask,SIGPIPE);
8935 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8936 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8937 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8938 strerror(err));
8939 usleep(1000000);
8940 }
8941 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8942 server.io_active_threads++;
8943 }
8944
8945 /* We need to wait for the last thread to exit before we are able to
8946 * fork() in order to BGSAVE or BGREWRITEAOF. */
8947 static void waitEmptyIOJobsQueue(void) {
8948 while(1) {
8949 int io_processed_len;
8950
8951 lockThreadedIO();
8952 if (listLength(server.io_newjobs) == 0 &&
8953 listLength(server.io_processing) == 0 &&
8954 server.io_active_threads == 0)
8955 {
8956 unlockThreadedIO();
8957 return;
8958 }
8959 /* While waiting for empty jobs queue condition we post-process some
8960 * finshed job, as I/O threads may be hanging trying to write against
8961 * the io_ready_pipe_write FD but there are so much pending jobs that
8962 * it's blocking. */
8963 io_processed_len = listLength(server.io_processed);
8964 unlockThreadedIO();
8965 if (io_processed_len) {
8966 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8967 usleep(1000); /* 1 millisecond */
8968 } else {
8969 usleep(10000); /* 10 milliseconds */
8970 }
8971 }
8972 }
8973
8974 static void vmReopenSwapFile(void) {
8975 /* Note: we don't close the old one as we are in the child process
8976 * and don't want to mess at all with the original file object. */
8977 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8978 if (server.vm_fp == NULL) {
8979 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8980 server.vm_swap_file);
8981 _exit(1);
8982 }
8983 server.vm_fd = fileno(server.vm_fp);
8984 }
8985
8986 /* This function must be called while with threaded IO locked */
8987 static void queueIOJob(iojob *j) {
8988 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8989 (void*)j, j->type, (char*)j->key->ptr);
8990 listAddNodeTail(server.io_newjobs,j);
8991 if (server.io_active_threads < server.vm_max_threads)
8992 spawnIOThread();
8993 }
8994
8995 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8996 iojob *j;
8997
8998 assert(key->storage == REDIS_VM_MEMORY);
8999 assert(key->refcount == 1);
9000
9001 j = zmalloc(sizeof(*j));
9002 j->type = REDIS_IOJOB_PREPARE_SWAP;
9003 j->db = db;
9004 j->key = key;
9005 j->val = val;
9006 incrRefCount(val);
9007 j->canceled = 0;
9008 j->thread = (pthread_t) -1;
9009 key->storage = REDIS_VM_SWAPPING;
9010
9011 lockThreadedIO();
9012 queueIOJob(j);
9013 unlockThreadedIO();
9014 return REDIS_OK;
9015 }
9016
9017 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9018
9019 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9020 * If there is not already a job loading the key, it is craeted.
9021 * The key is added to the io_keys list in the client structure, and also
9022 * in the hash table mapping swapped keys to waiting clients, that is,
9023 * server.io_waited_keys. */
9024 static int waitForSwappedKey(redisClient *c, robj *key) {
9025 struct dictEntry *de;
9026 robj *o;
9027 list *l;
9028
9029 /* If the key does not exist or is already in RAM we don't need to
9030 * block the client at all. */
9031 de = dictFind(c->db->dict,key);
9032 if (de == NULL) return 0;
9033 o = dictGetEntryKey(de);
9034 if (o->storage == REDIS_VM_MEMORY) {
9035 return 0;
9036 } else if (o->storage == REDIS_VM_SWAPPING) {
9037 /* We were swapping the key, undo it! */
9038 vmCancelThreadedIOJob(o);
9039 return 0;
9040 }
9041
9042 /* OK: the key is either swapped, or being loaded just now. */
9043
9044 /* Add the key to the list of keys this client is waiting for.
9045 * This maps clients to keys they are waiting for. */
9046 listAddNodeTail(c->io_keys,key);
9047 incrRefCount(key);
9048
9049 /* Add the client to the swapped keys => clients waiting map. */
9050 de = dictFind(c->db->io_keys,key);
9051 if (de == NULL) {
9052 int retval;
9053
9054 /* For every key we take a list of clients blocked for it */
9055 l = listCreate();
9056 retval = dictAdd(c->db->io_keys,key,l);
9057 incrRefCount(key);
9058 assert(retval == DICT_OK);
9059 } else {
9060 l = dictGetEntryVal(de);
9061 }
9062 listAddNodeTail(l,c);
9063
9064 /* Are we already loading the key from disk? If not create a job */
9065 if (o->storage == REDIS_VM_SWAPPED) {
9066 iojob *j;
9067
9068 o->storage = REDIS_VM_LOADING;
9069 j = zmalloc(sizeof(*j));
9070 j->type = REDIS_IOJOB_LOAD;
9071 j->db = c->db;
9072 j->key = o;
9073 j->key->vtype = o->vtype;
9074 j->page = o->vm.page;
9075 j->val = NULL;
9076 j->canceled = 0;
9077 j->thread = (pthread_t) -1;
9078 lockThreadedIO();
9079 queueIOJob(j);
9080 unlockThreadedIO();
9081 }
9082 return 1;
9083 }
9084
9085 /* Preload keys needed for the ZUNION and ZINTER commands. */
9086 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9087 int i, num;
9088 num = atoi(c->argv[2]->ptr);
9089 for (i = 0; i < num; i++) {
9090 waitForSwappedKey(c,c->argv[3+i]);
9091 }
9092 }
9093
9094 /* Is this client attempting to run a command against swapped keys?
9095 * If so, block it ASAP, load the keys in background, then resume it.
9096 *
9097 * The important idea about this function is that it can fail! If keys will
9098 * still be swapped when the client is resumed, this key lookups will
9099 * just block loading keys from disk. In practical terms this should only
9100 * happen with SORT BY command or if there is a bug in this function.
9101 *
9102 * Return 1 if the client is marked as blocked, 0 if the client can
9103 * continue as the keys it is going to access appear to be in memory. */
9104 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9105 int j, last;
9106
9107 if (cmd->vm_preload_proc != NULL) {
9108 cmd->vm_preload_proc(c);
9109 } else {
9110 if (cmd->vm_firstkey == 0) return 0;
9111 last = cmd->vm_lastkey;
9112 if (last < 0) last = c->argc+last;
9113 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9114 waitForSwappedKey(c,c->argv[j]);
9115 }
9116
9117 /* If the client was blocked for at least one key, mark it as blocked. */
9118 if (listLength(c->io_keys)) {
9119 c->flags |= REDIS_IO_WAIT;
9120 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9121 server.vm_blocked_clients++;
9122 return 1;
9123 } else {
9124 return 0;
9125 }
9126 }
9127
9128 /* Remove the 'key' from the list of blocked keys for a given client.
9129 *
9130 * The function returns 1 when there are no longer blocking keys after
9131 * the current one was removed (and the client can be unblocked). */
9132 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9133 list *l;
9134 listNode *ln;
9135 listIter li;
9136 struct dictEntry *de;
9137
9138 /* Remove the key from the list of keys this client is waiting for. */
9139 listRewind(c->io_keys,&li);
9140 while ((ln = listNext(&li)) != NULL) {
9141 if (compareStringObjects(ln->value,key) == 0) {
9142 listDelNode(c->io_keys,ln);
9143 break;
9144 }
9145 }
9146 assert(ln != NULL);
9147
9148 /* Remove the client form the key => waiting clients map. */
9149 de = dictFind(c->db->io_keys,key);
9150 assert(de != NULL);
9151 l = dictGetEntryVal(de);
9152 ln = listSearchKey(l,c);
9153 assert(ln != NULL);
9154 listDelNode(l,ln);
9155 if (listLength(l) == 0)
9156 dictDelete(c->db->io_keys,key);
9157
9158 return listLength(c->io_keys) == 0;
9159 }
9160
9161 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9162 struct dictEntry *de;
9163 list *l;
9164 listNode *ln;
9165 int len;
9166
9167 de = dictFind(db->io_keys,key);
9168 if (!de) return;
9169
9170 l = dictGetEntryVal(de);
9171 len = listLength(l);
9172 /* Note: we can't use something like while(listLength(l)) as the list
9173 * can be freed by the calling function when we remove the last element. */
9174 while (len--) {
9175 ln = listFirst(l);
9176 redisClient *c = ln->value;
9177
9178 if (dontWaitForSwappedKey(c,key)) {
9179 /* Put the client in the list of clients ready to go as we
9180 * loaded all the keys about it. */
9181 listAddNodeTail(server.io_ready_clients,c);
9182 }
9183 }
9184 }
9185
9186 /* =========================== Remote Configuration ========================= */
9187
9188 static void configSetCommand(redisClient *c) {
9189 robj *o = getDecodedObject(c->argv[3]);
9190 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9191 zfree(server.dbfilename);
9192 server.dbfilename = zstrdup(o->ptr);
9193 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9194 zfree(server.requirepass);
9195 server.requirepass = zstrdup(o->ptr);
9196 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9197 zfree(server.masterauth);
9198 server.masterauth = zstrdup(o->ptr);
9199 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9200 server.maxmemory = strtoll(o->ptr, NULL, 10);
9201 } else {
9202 addReplySds(c,sdscatprintf(sdsempty(),
9203 "-ERR not supported CONFIG parameter %s\r\n",
9204 (char*)c->argv[2]->ptr));
9205 decrRefCount(o);
9206 return;
9207 }
9208 decrRefCount(o);
9209 addReply(c,shared.ok);
9210 }
9211
9212 static void configGetCommand(redisClient *c) {
9213 robj *o = getDecodedObject(c->argv[2]);
9214 robj *lenobj = createObject(REDIS_STRING,NULL);
9215 char *pattern = o->ptr;
9216 int matches = 0;
9217
9218 addReply(c,lenobj);
9219 decrRefCount(lenobj);
9220
9221 if (stringmatch(pattern,"dbfilename",0)) {
9222 addReplyBulkCString(c,"dbfilename");
9223 addReplyBulkCString(c,server.dbfilename);
9224 matches++;
9225 }
9226 if (stringmatch(pattern,"requirepass",0)) {
9227 addReplyBulkCString(c,"requirepass");
9228 addReplyBulkCString(c,server.requirepass);
9229 matches++;
9230 }
9231 if (stringmatch(pattern,"masterauth",0)) {
9232 addReplyBulkCString(c,"masterauth");
9233 addReplyBulkCString(c,server.masterauth);
9234 matches++;
9235 }
9236 if (stringmatch(pattern,"maxmemory",0)) {
9237 char buf[128];
9238
9239 snprintf(buf,128,"%llu\n",server.maxmemory);
9240 addReplyBulkCString(c,"maxmemory");
9241 addReplyBulkCString(c,buf);
9242 matches++;
9243 }
9244 decrRefCount(o);
9245 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9246 }
9247
9248 static void configCommand(redisClient *c) {
9249 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9250 if (c->argc != 4) goto badarity;
9251 configSetCommand(c);
9252 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9253 if (c->argc != 3) goto badarity;
9254 configGetCommand(c);
9255 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9256 if (c->argc != 2) goto badarity;
9257 server.stat_numcommands = 0;
9258 server.stat_numconnections = 0;
9259 server.stat_expiredkeys = 0;
9260 server.stat_starttime = time(NULL);
9261 addReply(c,shared.ok);
9262 } else {
9263 addReplySds(c,sdscatprintf(sdsempty(),
9264 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9265 }
9266 return;
9267
9268 badarity:
9269 addReplySds(c,sdscatprintf(sdsempty(),
9270 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9271 (char*) c->argv[1]->ptr));
9272 }
9273
9274 /* =========================== Pubsub implementation ======================== */
9275
9276 static void freePubsubPattern(void *p) {
9277 pubsubPattern *pat = p;
9278
9279 decrRefCount(pat->pattern);
9280 zfree(pat);
9281 }
9282
9283 static int listMatchPubsubPattern(void *a, void *b) {
9284 pubsubPattern *pa = a, *pb = b;
9285
9286 return (pa->client == pb->client) &&
9287 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9288 }
9289
9290 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9291 * 0 if the client was already subscribed to that channel. */
9292 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9293 struct dictEntry *de;
9294 list *clients = NULL;
9295 int retval = 0;
9296
9297 /* Add the channel to the client -> channels hash table */
9298 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9299 retval = 1;
9300 incrRefCount(channel);
9301 /* Add the client to the channel -> list of clients hash table */
9302 de = dictFind(server.pubsub_channels,channel);
9303 if (de == NULL) {
9304 clients = listCreate();
9305 dictAdd(server.pubsub_channels,channel,clients);
9306 incrRefCount(channel);
9307 } else {
9308 clients = dictGetEntryVal(de);
9309 }
9310 listAddNodeTail(clients,c);
9311 }
9312 /* Notify the client */
9313 addReply(c,shared.mbulk3);
9314 addReply(c,shared.subscribebulk);
9315 addReplyBulk(c,channel);
9316 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9317 return retval;
9318 }
9319
9320 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9321 * 0 if the client was not subscribed to the specified channel. */
9322 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9323 struct dictEntry *de;
9324 list *clients;
9325 listNode *ln;
9326 int retval = 0;
9327
9328 /* Remove the channel from the client -> channels hash table */
9329 incrRefCount(channel); /* channel may be just a pointer to the same object
9330 we have in the hash tables. Protect it... */
9331 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9332 retval = 1;
9333 /* Remove the client from the channel -> clients list hash table */
9334 de = dictFind(server.pubsub_channels,channel);
9335 assert(de != NULL);
9336 clients = dictGetEntryVal(de);
9337 ln = listSearchKey(clients,c);
9338 assert(ln != NULL);
9339 listDelNode(clients,ln);
9340 if (listLength(clients) == 0) {
9341 /* Free the list and associated hash entry at all if this was
9342 * the latest client, so that it will be possible to abuse
9343 * Redis PUBSUB creating millions of channels. */
9344 dictDelete(server.pubsub_channels,channel);
9345 }
9346 }
9347 /* Notify the client */
9348 if (notify) {
9349 addReply(c,shared.mbulk3);
9350 addReply(c,shared.unsubscribebulk);
9351 addReplyBulk(c,channel);
9352 addReplyLong(c,dictSize(c->pubsub_channels)+
9353 listLength(c->pubsub_patterns));
9354
9355 }
9356 decrRefCount(channel); /* it is finally safe to release it */
9357 return retval;
9358 }
9359
9360 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9361 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9362 int retval = 0;
9363
9364 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9365 retval = 1;
9366 pubsubPattern *pat;
9367 listAddNodeTail(c->pubsub_patterns,pattern);
9368 incrRefCount(pattern);
9369 pat = zmalloc(sizeof(*pat));
9370 pat->pattern = getDecodedObject(pattern);
9371 pat->client = c;
9372 listAddNodeTail(server.pubsub_patterns,pat);
9373 }
9374 /* Notify the client */
9375 addReply(c,shared.mbulk3);
9376 addReply(c,shared.psubscribebulk);
9377 addReplyBulk(c,pattern);
9378 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9379 return retval;
9380 }
9381
9382 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9383 * 0 if the client was not subscribed to the specified channel. */
9384 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9385 listNode *ln;
9386 pubsubPattern pat;
9387 int retval = 0;
9388
9389 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9390 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9391 retval = 1;
9392 listDelNode(c->pubsub_patterns,ln);
9393 pat.client = c;
9394 pat.pattern = pattern;
9395 ln = listSearchKey(server.pubsub_patterns,&pat);
9396 listDelNode(server.pubsub_patterns,ln);
9397 }
9398 /* Notify the client */
9399 if (notify) {
9400 addReply(c,shared.mbulk3);
9401 addReply(c,shared.punsubscribebulk);
9402 addReplyBulk(c,pattern);
9403 addReplyLong(c,dictSize(c->pubsub_channels)+
9404 listLength(c->pubsub_patterns));
9405 }
9406 decrRefCount(pattern);
9407 return retval;
9408 }
9409
9410 /* Unsubscribe from all the channels. Return the number of channels the
9411 * client was subscribed from. */
9412 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9413 dictIterator *di = dictGetIterator(c->pubsub_channels);
9414 dictEntry *de;
9415 int count = 0;
9416
9417 while((de = dictNext(di)) != NULL) {
9418 robj *channel = dictGetEntryKey(de);
9419
9420 count += pubsubUnsubscribeChannel(c,channel,notify);
9421 }
9422 dictReleaseIterator(di);
9423 return count;
9424 }
9425
9426 /* Unsubscribe from all the patterns. Return the number of patterns the
9427 * client was subscribed from. */
9428 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9429 listNode *ln;
9430 listIter li;
9431 int count = 0;
9432
9433 listRewind(c->pubsub_patterns,&li);
9434 while ((ln = listNext(&li)) != NULL) {
9435 robj *pattern = ln->value;
9436
9437 count += pubsubUnsubscribePattern(c,pattern,notify);
9438 }
9439 return count;
9440 }
9441
9442 /* Publish a message */
9443 static int pubsubPublishMessage(robj *channel, robj *message) {
9444 int receivers = 0;
9445 struct dictEntry *de;
9446 listNode *ln;
9447 listIter li;
9448
9449 /* Send to clients listening for that channel */
9450 de = dictFind(server.pubsub_channels,channel);
9451 if (de) {
9452 list *list = dictGetEntryVal(de);
9453 listNode *ln;
9454 listIter li;
9455
9456 listRewind(list,&li);
9457 while ((ln = listNext(&li)) != NULL) {
9458 redisClient *c = ln->value;
9459
9460 addReply(c,shared.mbulk3);
9461 addReply(c,shared.messagebulk);
9462 addReplyBulk(c,channel);
9463 addReplyBulk(c,message);
9464 receivers++;
9465 }
9466 }
9467 /* Send to clients listening to matching channels */
9468 if (listLength(server.pubsub_patterns)) {
9469 listRewind(server.pubsub_patterns,&li);
9470 channel = getDecodedObject(channel);
9471 while ((ln = listNext(&li)) != NULL) {
9472 pubsubPattern *pat = ln->value;
9473
9474 if (stringmatchlen((char*)pat->pattern->ptr,
9475 sdslen(pat->pattern->ptr),
9476 (char*)channel->ptr,
9477 sdslen(channel->ptr),0)) {
9478 addReply(pat->client,shared.mbulk3);
9479 addReply(pat->client,shared.messagebulk);
9480 addReplyBulk(pat->client,channel);
9481 addReplyBulk(pat->client,message);
9482 receivers++;
9483 }
9484 }
9485 decrRefCount(channel);
9486 }
9487 return receivers;
9488 }
9489
9490 static void subscribeCommand(redisClient *c) {
9491 int j;
9492
9493 for (j = 1; j < c->argc; j++)
9494 pubsubSubscribeChannel(c,c->argv[j]);
9495 }
9496
9497 static void unsubscribeCommand(redisClient *c) {
9498 if (c->argc == 1) {
9499 pubsubUnsubscribeAllChannels(c,1);
9500 return;
9501 } else {
9502 int j;
9503
9504 for (j = 1; j < c->argc; j++)
9505 pubsubUnsubscribeChannel(c,c->argv[j],1);
9506 }
9507 }
9508
9509 static void psubscribeCommand(redisClient *c) {
9510 int j;
9511
9512 for (j = 1; j < c->argc; j++)
9513 pubsubSubscribePattern(c,c->argv[j]);
9514 }
9515
9516 static void punsubscribeCommand(redisClient *c) {
9517 if (c->argc == 1) {
9518 pubsubUnsubscribeAllPatterns(c,1);
9519 return;
9520 } else {
9521 int j;
9522
9523 for (j = 1; j < c->argc; j++)
9524 pubsubUnsubscribePattern(c,c->argv[j],1);
9525 }
9526 }
9527
9528 static void publishCommand(redisClient *c) {
9529 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9530 addReplyLong(c,receivers);
9531 }
9532
9533 /* ================================= Debugging ============================== */
9534
9535 static void debugCommand(redisClient *c) {
9536 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9537 *((char*)-1) = 'x';
9538 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9539 if (rdbSave(server.dbfilename) != REDIS_OK) {
9540 addReply(c,shared.err);
9541 return;
9542 }
9543 emptyDb();
9544 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9545 addReply(c,shared.err);
9546 return;
9547 }
9548 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9549 addReply(c,shared.ok);
9550 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9551 emptyDb();
9552 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9553 addReply(c,shared.err);
9554 return;
9555 }
9556 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9557 addReply(c,shared.ok);
9558 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9559 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9560 robj *key, *val;
9561
9562 if (!de) {
9563 addReply(c,shared.nokeyerr);
9564 return;
9565 }
9566 key = dictGetEntryKey(de);
9567 val = dictGetEntryVal(de);
9568 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9569 key->storage == REDIS_VM_SWAPPING)) {
9570 char *strenc;
9571 char buf[128];
9572
9573 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9574 strenc = strencoding[val->encoding];
9575 } else {
9576 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9577 strenc = buf;
9578 }
9579 addReplySds(c,sdscatprintf(sdsempty(),
9580 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9581 "encoding:%s serializedlength:%lld\r\n",
9582 (void*)key, key->refcount, (void*)val, val->refcount,
9583 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9584 } else {
9585 addReplySds(c,sdscatprintf(sdsempty(),
9586 "+Key at:%p refcount:%d, value swapped at: page %llu "
9587 "using %llu pages\r\n",
9588 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9589 (unsigned long long) key->vm.usedpages));
9590 }
9591 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9592 lookupKeyRead(c->db,c->argv[2]);
9593 addReply(c,shared.ok);
9594 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9595 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9596 robj *key, *val;
9597
9598 if (!server.vm_enabled) {
9599 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9600 return;
9601 }
9602 if (!de) {
9603 addReply(c,shared.nokeyerr);
9604 return;
9605 }
9606 key = dictGetEntryKey(de);
9607 val = dictGetEntryVal(de);
9608 /* If the key is shared we want to create a copy */
9609 if (key->refcount > 1) {
9610 robj *newkey = dupStringObject(key);
9611 decrRefCount(key);
9612 key = dictGetEntryKey(de) = newkey;
9613 }
9614 /* Swap it */
9615 if (key->storage != REDIS_VM_MEMORY) {
9616 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9617 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9618 dictGetEntryVal(de) = NULL;
9619 addReply(c,shared.ok);
9620 } else {
9621 addReply(c,shared.err);
9622 }
9623 } else {
9624 addReplySds(c,sdsnew(
9625 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9626 }
9627 }
9628
9629 static void _redisAssert(char *estr, char *file, int line) {
9630 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9631 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9632 #ifdef HAVE_BACKTRACE
9633 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9634 *((char*)-1) = 'x';
9635 #endif
9636 }
9637
9638 /* =================================== Main! ================================ */
9639
9640 #ifdef __linux__
/* Reads the first line of /proc/sys/vm/overcommit_memory and returns it
 * converted with atoi(). Returns -1 if the file can't be opened or no
 * line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return value;
    if (fgets(line,64,procfile) != NULL) value = atoi(line);
    fclose(procfile);
    return value;
}
9654
9655 void linuxOvercommitMemoryWarning(void) {
9656 if (linuxOvercommitMemoryValue() == 0) {
9657 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9658 }
9659 }
9660 #endif /* __linux__ */
9661
9662 static void daemonize(void) {
9663 int fd;
9664 FILE *fp;
9665
9666 if (fork() != 0) exit(0); /* parent exits */
9667 setsid(); /* create a new session */
9668
9669 /* Every output goes to /dev/null. If Redis is daemonized but
9670 * the 'logfile' is set to 'stdout' in the configuration file
9671 * it will not log at all. */
9672 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9673 dup2(fd, STDIN_FILENO);
9674 dup2(fd, STDOUT_FILENO);
9675 dup2(fd, STDERR_FILENO);
9676 if (fd > STDERR_FILENO) close(fd);
9677 }
9678 /* Try to write the pid file */
9679 fp = fopen(server.pidfile,"w");
9680 if (fp) {
9681 fprintf(fp,"%d\n",getpid());
9682 fclose(fp);
9683 }
9684 }
9685
9686 static void version() {
9687 printf("Redis server version %s\n", REDIS_VERSION);
9688 exit(0);
9689 }
9690
/* Prints the command line synopsis on stderr and terminates the process
 * with exit status 1. Never returns. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
9696
9697 int main(int argc, char **argv) {
9698 time_t start;
9699
9700 initServerConfig();
9701 if (argc == 2) {
9702 if (strcmp(argv[1], "-v") == 0 ||
9703 strcmp(argv[1], "--version") == 0) version();
9704 if (strcmp(argv[1], "--help") == 0) usage();
9705 resetServerSaveParams();
9706 loadServerConfig(argv[1]);
9707 } else if ((argc > 2)) {
9708 usage();
9709 } else {
9710 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9711 }
9712 if (server.daemonize) daemonize();
9713 initServer();
9714 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9715 #ifdef __linux__
9716 linuxOvercommitMemoryWarning();
9717 #endif
9718 start = time(NULL);
9719 if (server.appendonly) {
9720 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9721 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9722 } else {
9723 if (rdbLoad(server.dbfilename) == REDIS_OK)
9724 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9725 }
9726 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9727 aeSetBeforeSleepProc(server.el,beforeSleep);
9728 aeMain(server.el);
9729 aeDeleteEventLoop(server.el);
9730 return 0;
9731 }
9732
9733 /* ============================= Backtrace support ========================= */
9734
9735 #ifdef HAVE_BACKTRACE
9736 static char *findFuncName(void *pointer, unsigned long *offset);
9737
9738 static void *getMcontextEip(ucontext_t *uc) {
9739 #if defined(__FreeBSD__)
9740 return (void*) uc->uc_mcontext.mc_eip;
9741 #elif defined(__dietlibc__)
9742 return (void*) uc->uc_mcontext.eip;
9743 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9744 #if __x86_64__
9745 return (void*) uc->uc_mcontext->__ss.__rip;
9746 #else
9747 return (void*) uc->uc_mcontext->__ss.__eip;
9748 #endif
9749 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9750 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9751 return (void*) uc->uc_mcontext->__ss.__rip;
9752 #else
9753 return (void*) uc->uc_mcontext->__ss.__eip;
9754 #endif
9755 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9756 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9757 #elif defined(__ia64__) /* Linux IA64 */
9758 return (void*) uc->uc_mcontext.sc_ip;
9759 #else
9760 return NULL;
9761 #endif
9762 }
9763
9764 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9765 void *trace[100];
9766 char **messages = NULL;
9767 int i, trace_size = 0;
9768 unsigned long offset=0;
9769 ucontext_t *uc = (ucontext_t*) secret;
9770 sds infostring;
9771 REDIS_NOTUSED(info);
9772
9773 redisLog(REDIS_WARNING,
9774 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9775 infostring = genRedisInfoString();
9776 redisLog(REDIS_WARNING, "%s",infostring);
9777 /* It's not safe to sdsfree() the returned string under memory
9778 * corruption conditions. Let it leak as we are going to abort */
9779
9780 trace_size = backtrace(trace, 100);
9781 /* overwrite sigaction with caller's address */
9782 if (getMcontextEip(uc) != NULL) {
9783 trace[1] = getMcontextEip(uc);
9784 }
9785 messages = backtrace_symbols(trace, trace_size);
9786
9787 for (i=1; i<trace_size; ++i) {
9788 char *fn = findFuncName(trace[i], &offset), *p;
9789
9790 p = strchr(messages[i],'+');
9791 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9792 redisLog(REDIS_WARNING,"%s", messages[i]);
9793 } else {
9794 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9795 }
9796 }
9797 /* free(messages); Don't call free() with possibly corrupted memory. */
9798 _exit(0);
9799 }
9800
9801 static void setupSigSegvAction(void) {
9802 struct sigaction act;
9803
9804 sigemptyset (&act.sa_mask);
9805 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9806 * is used. Otherwise, sa_handler is used */
9807 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9808 act.sa_sigaction = segvHandler;
9809 sigaction (SIGSEGV, &act, NULL);
9810 sigaction (SIGBUS, &act, NULL);
9811 sigaction (SIGFPE, &act, NULL);
9812 sigaction (SIGILL, &act, NULL);
9813 sigaction (SIGBUS, &act, NULL);
9814 return;
9815 }
9816
9817 #include "staticsymbols.h"
9818 /* This function try to convert a pointer into a function name. It's used in
9819 * oreder to provide a backtrace under segmentation fault that's able to
9820 * display functions declared as static (otherwise the backtrace is useless). */
9821 static char *findFuncName(void *pointer, unsigned long *offset){
9822 int i, ret = -1;
9823 unsigned long off, minoff = 0;
9824
9825 /* Try to match against the Symbol with the smallest offset */
9826 for (i=0; symsTable[i].pointer; i++) {
9827 unsigned long lp = (unsigned long) pointer;
9828
9829 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9830 off=lp-symsTable[i].pointer;
9831 if (ret < 0 || off < minoff) {
9832 minoff=off;
9833 ret=i;
9834 }
9835 }
9836 }
9837 if (ret == -1) return NULL;
9838 *offset = minoff;
9839 return symsTable[ret].name;
9840 }
9841 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no crash handler to install:
 * this variant intentionally does nothing. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
9844 #endif /* HAVE_BACKTRACE */
9845
9846
9847
9848 /* The End */
9849
9850
9851