]> git.saurik.com Git - redis.git/blob - redis.c
11056d54cbd6c1979083236614c36c6b7f6f8f19
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
129
130 /* Object types only used for dumping to disk */
131 #define REDIS_EXPIRETIME 253
132 #define REDIS_SELECTDB 254
133 #define REDIS_EOF 255
134
135 /* Defines related to the dump file format. To store 32 bits lengths for short
136 * keys requires a lot of space, so we check the most significant 2 bits of
137 * the first byte to interpret the length:
138 *
139 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
140 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
141 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
142 * 11|000000 this means: specially encoded object will follow. The six bits
143 * number specify the kind of object that follows.
144 * See the REDIS_RDB_ENC_* defines.
145 *
146 * Lengths up to 63 are stored using a single byte, most DB keys, and many
147 * values, will fit inside. */
148 #define REDIS_RDB_6BITLEN 0
149 #define REDIS_RDB_14BITLEN 1
150 #define REDIS_RDB_32BITLEN 2
151 #define REDIS_RDB_ENCVAL 3
152 #define REDIS_RDB_LENERR UINT_MAX
153
154 /* When a length of a string object stored on disk has the first two bits
155 * set, the remaining six bits specify a special encoding for the object
156 * according to the following defines: */
157 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
158 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
159 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
160 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
161
162 /* Virtual memory object->where field. */
163 #define REDIS_VM_MEMORY 0 /* The object is on memory */
164 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
165 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
166 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
167
168 /* Virtual memory static configuration stuff.
169 * Check vmFindContiguousPages() to know more about these magic numbers. */
170 #define REDIS_VM_MAX_NEAR_PAGES 65536
171 #define REDIS_VM_MAX_RANDOM_JUMP 4096
172 #define REDIS_VM_MAX_THREADS 32
173 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
174 /* The following is the *percentage* of completed I/O jobs to process when the
175 * handler is called. While Virtual Memory I/O operations are performed by
176 * threads, these operations must be processed by the main thread when completed
177 * in order to take effect. */
178 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
179
180 /* Client flags */
181 #define REDIS_SLAVE 1 /* This client is a slave server */
182 #define REDIS_MASTER 2 /* This client is a master server */
183 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
184 #define REDIS_MULTI 8 /* This client is in a MULTI context */
185 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
186 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
187
188 /* Slave replication state - slave side */
189 #define REDIS_REPL_NONE 0 /* No active replication */
190 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
191 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
192
193 /* Slave replication state - from the point of view of master
194 * Note that in SEND_BULK and ONLINE state the slave receives new updates
195 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
196 * to start the next background saving in order to send updates to it. */
197 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
198 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
199 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
200 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
201
202 /* List related stuff */
203 #define REDIS_HEAD 0
204 #define REDIS_TAIL 1
205
206 /* Sort operations */
207 #define REDIS_SORT_GET 0
208 #define REDIS_SORT_ASC 1
209 #define REDIS_SORT_DESC 2
210 #define REDIS_SORTKEY_MAX 1024
211
212 /* Log levels */
213 #define REDIS_DEBUG 0
214 #define REDIS_VERBOSE 1
215 #define REDIS_NOTICE 2
216 #define REDIS_WARNING 3
217
218 /* Anti-warning macro... */
219 #define REDIS_NOTUSED(V) ((void) V)
220
221 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
222 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
223
224 /* Append only defines */
225 #define APPENDFSYNC_NO 0
226 #define APPENDFSYNC_ALWAYS 1
227 #define APPENDFSYNC_EVERYSEC 2
228
229 /* We can print the stacktrace, so our assert is defined this way: */
230 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
231 static void _redisAssert(char *estr, char *file, int line);
232
233 /*================================= Data types ============================== */
234
235 /* A redis object, that is a type able to hold a string / list / set */
236
/* The VM object structure: where the value of an object lives in the swap
 * file and when the object was last accessed.
 *
 * Only the type is defined here. The definition used to end in "} vm;",
 * which additionally created an unused file-scope variable named 'vm' with
 * external linkage. In the visible code the structure is consumed as the
 * 'vm' member of struct redisObject. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* last access time */
};
243
244 /* The actual Redis Object */
245 typedef struct redisObject {
246 void *ptr;
247 unsigned char type;
248 unsigned char encoding;
249 unsigned char storage; /* If this object is a key, where is the value?
250 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
251 unsigned char vtype; /* If this object is a key, and value is swapped out,
252 * this is the type of the swapped out object. */
253 int refcount;
254 /* VM fields, this are only allocated if VM is active, otherwise the
255 * object allocation function will just allocate
256 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
257 * Redis without VM active will not have any overhead. */
258 struct redisObjectVM vm;
259 } robj;
260
/* Macro used to initialize a Redis string object allocated on the stack.
 * _var is the robj variable itself (not a pointer to it), _ptr is stored in
 * its 'ptr' field. The 'storage' field is only written when VM is enabled;
 * the 'vtype' and 'vm' fields are never touched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the ';' written
 * by the caller completes the do { } while(0) statement, so the macro is
 * usable as the body of an unbraced if/else. Arguments are parenthesized so
 * that expressions such as *p or buf+1 expand as intended. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
272
273 typedef struct redisDb {
274 dict *dict; /* The keyspace for this DB */
275 dict *expires; /* Timeout of keys with a timeout set */
276 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
277 dict *io_keys; /* Keys with clients waiting for VM I/O */
278 int id;
279 } redisDb;
280
281 /* Client MULTI/EXEC state */
282 typedef struct multiCmd {
283 robj **argv;
284 int argc;
285 struct redisCommand *cmd;
286 } multiCmd;
287
288 typedef struct multiState {
289 multiCmd *commands; /* Array of MULTI commands */
290 int count; /* Total number of MULTI commands */
291 } multiState;
292
293 /* With multiplexing we need to take per-clinet state.
294 * Clients are taken in a liked list. */
295 typedef struct redisClient {
296 int fd;
297 redisDb *db;
298 int dictid;
299 sds querybuf;
300 robj **argv, **mbargv;
301 int argc, mbargc;
302 int bulklen; /* bulk read len. -1 if not in bulk read mode */
303 int multibulk; /* multi bulk command format active */
304 list *reply;
305 int sentlen;
306 time_t lastinteraction; /* time of the last interaction, used for timeout */
307 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
308 int slaveseldb; /* slave selected db, if this client is a slave */
309 int authenticated; /* when requirepass is non-NULL */
310 int replstate; /* replication state if this is a slave */
311 int repldbfd; /* replication DB file descriptor */
312 long repldboff; /* replication DB file offset */
313 off_t repldbsize; /* replication DB file size */
314 multiState mstate; /* MULTI/EXEC state */
315 robj **blockingkeys; /* The key we are waiting to terminate a blocking
316 * operation such as BLPOP. Otherwise NULL. */
317 int blockingkeysnum; /* Number of blocking keys */
318 time_t blockingto; /* Blocking operation timeout. If UNIX current time
319 * is >= blockingto then the operation timed out. */
320 list *io_keys; /* Keys this client is waiting to be loaded from the
321 * swap file in order to continue. */
322 } redisClient;
323
/* One save parameter: a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;     /* time component of the parameter */
    int changes;        /* changes component of the parameter */
};
328
329 /* Global server state structure */
330 struct redisServer {
331 int port;
332 int fd;
333 redisDb *db;
334 dict *sharingpool; /* Poll used for object sharing */
335 unsigned int sharingpoolsize;
336 long long dirty; /* changes to DB from the last save */
337 list *clients;
338 list *slaves, *monitors;
339 char neterr[ANET_ERR_LEN];
340 aeEventLoop *el;
341 int cronloops; /* number of times the cron function run */
342 list *objfreelist; /* A list of freed objects to avoid malloc() */
343 time_t lastsave; /* Unix time of last save succeeede */
344 /* Fields used only for stats */
345 time_t stat_starttime; /* server start time */
346 long long stat_numcommands; /* number of processed commands */
347 long long stat_numconnections; /* number of connections received */
348 /* Configuration */
349 int verbosity;
350 int glueoutputbuf;
351 int maxidletime;
352 int dbnum;
353 int daemonize;
354 int appendonly;
355 int appendfsync;
356 time_t lastfsync;
357 int appendfd;
358 int appendseldb;
359 char *pidfile;
360 pid_t bgsavechildpid;
361 pid_t bgrewritechildpid;
362 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
363 struct saveparam *saveparams;
364 int saveparamslen;
365 char *logfile;
366 char *bindaddr;
367 char *dbfilename;
368 char *appendfilename;
369 char *requirepass;
370 int shareobjects;
371 int rdbcompression;
372 /* Replication related */
373 int isslave;
374 char *masterauth;
375 char *masterhost;
376 int masterport;
377 redisClient *master; /* client that is master for this slave */
378 int replstate;
379 unsigned int maxclients;
380 unsigned long long maxmemory;
381 unsigned int blpop_blocked_clients;
382 unsigned int vm_blocked_clients;
383 /* Sort parameters - qsort_r() is only available under BSD so we
384 * have to take this state global, in order to pass it to sortCompare() */
385 int sort_desc;
386 int sort_alpha;
387 int sort_bypattern;
388 /* Virtual memory configuration */
389 int vm_enabled;
390 char *vm_swap_file;
391 off_t vm_page_size;
392 off_t vm_pages;
393 unsigned long long vm_max_memory;
394 /* Virtual memory state */
395 FILE *vm_fp;
396 int vm_fd;
397 off_t vm_next_page; /* Next probably empty page */
398 off_t vm_near_pages; /* Number of pages allocated sequentially */
399 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
400 time_t unixtime; /* Unix time sampled every second. */
401 /* Virtual memory I/O threads stuff */
402 /* An I/O thread process an element taken from the io_jobs queue and
403 * put the result of the operation in the io_done list. While the
404 * job is being processed, it's put on io_processing queue. */
405 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
406 list *io_processing; /* List of VM I/O jobs being processed */
407 list *io_processed; /* List of VM I/O jobs already processed */
408 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
409 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
410 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
411 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
412 pthread_attr_t io_threads_attr; /* attributes for threads creation */
413 int io_active_threads; /* Number of running I/O threads */
414 int vm_max_threads; /* Max number of I/O threads running at the same time */
415 /* Our main thread is blocked on the event loop, locking for sockets ready
416 * to be read or written, so when a threaded I/O operation is ready to be
417 * processed by the main thread, the I/O thread will use a unix pipe to
418 * awake the main thread. The followings are the two pipe FDs. */
419 int io_ready_pipe_read;
420 int io_ready_pipe_write;
421 /* Virtual memory stats */
422 unsigned long long vm_stats_used_pages;
423 unsigned long long vm_stats_swapped_objects;
424 unsigned long long vm_stats_swapouts;
425 unsigned long long vm_stats_swapins;
426 FILE *devnull;
427 };
428
429 typedef void redisCommandProc(redisClient *c);
430 struct redisCommand {
431 char *name;
432 redisCommandProc *proc;
433 int arity;
434 int flags;
435 /* What keys should be loaded in background when calling this command? */
436 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
437 int vm_lastkey; /* THe last argument that's a key */
438 int vm_keystep; /* The step between first and last key */
439 };
440
/* A function name paired with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;             /* symbol name */
    unsigned long pointer;  /* address, kept as an integer */
};
445
446 typedef struct _redisSortObject {
447 robj *obj;
448 union {
449 double score;
450 robj *cmpobj;
451 } u;
452 } redisSortObject;
453
454 typedef struct _redisSortOperation {
455 int type;
456 robj *pattern;
457 } redisSortOperation;
458
459 /* ZSETs use a specialized version of Skiplists */
460
461 typedef struct zskiplistNode {
462 struct zskiplistNode **forward;
463 struct zskiplistNode *backward;
464 unsigned int *span;
465 double score;
466 robj *obj;
467 } zskiplistNode;
468
/* The skiplist itself: end nodes, element count and level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
474
475 typedef struct zset {
476 dict *dict;
477 zskiplist *zsl;
478 } zset;
479
480 /* Our shared "common" objects */
481
482 struct sharedObjectsStruct {
483 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
484 *colon, *nullbulk, *nullmultibulk, *queued,
485 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
486 *outofrangeerr, *plus,
487 *select0, *select1, *select2, *select3, *select4,
488 *select5, *select6, *select7, *select8, *select9;
489 } shared;
490
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
496
497 /* VM threaded I/O request message */
498 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
499 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
500 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
501 typedef struct iojob {
502 int type; /* Request type, REDIS_IOJOB_* */
503 redisDb *db;/* Redis database */
504 robj *key; /* This I/O request is about swapping this key */
505 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
506 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
507 off_t page; /* Swap page where to read/write the object */
508 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
509 int canceled; /* True if this command was canceled by blocking side of VM */
510 pthread_t thread; /* ID of the thread processing this entry */
511 } iojob;
512
513 /*================================ Prototypes =============================== */
514
515 static void freeStringObject(robj *o);
516 static void freeListObject(robj *o);
517 static void freeSetObject(robj *o);
518 static void decrRefCount(void *o);
519 static robj *createObject(int type, void *ptr);
520 static void freeClient(redisClient *c);
521 static int rdbLoad(char *filename);
522 static void addReply(redisClient *c, robj *obj);
523 static void addReplySds(redisClient *c, sds s);
524 static void incrRefCount(robj *o);
525 static int rdbSaveBackground(char *filename);
526 static robj *createStringObject(char *ptr, size_t len);
527 static robj *dupStringObject(robj *o);
528 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
529 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
530 static int syncWithMaster(void);
531 static robj *tryObjectSharing(robj *o);
532 static int tryObjectEncoding(robj *o);
533 static robj *getDecodedObject(robj *o);
534 static int removeExpire(redisDb *db, robj *key);
535 static int expireIfNeeded(redisDb *db, robj *key);
536 static int deleteIfVolatile(redisDb *db, robj *key);
537 static int deleteIfSwapped(redisDb *db, robj *key);
538 static int deleteKey(redisDb *db, robj *key);
539 static time_t getExpire(redisDb *db, robj *key);
540 static int setExpire(redisDb *db, robj *key, time_t when);
541 static void updateSlavesWaitingBgsave(int bgsaveerr);
542 static void freeMemoryIfNeeded(void);
543 static int processCommand(redisClient *c);
544 static void setupSigSegvAction(void);
545 static void rdbRemoveTempFile(pid_t childpid);
546 static void aofRemoveTempFile(pid_t childpid);
547 static size_t stringObjectLen(robj *o);
548 static void processInputBuffer(redisClient *c);
549 static zskiplist *zslCreate(void);
550 static void zslFree(zskiplist *zsl);
551 static void zslInsert(zskiplist *zsl, double score, robj *obj);
552 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
553 static void initClientMultiState(redisClient *c);
554 static void freeClientMultiState(redisClient *c);
555 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
556 static void unblockClientWaitingData(redisClient *c);
557 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
558 static void vmInit(void);
559 static void vmMarkPagesFree(off_t page, off_t count);
560 static robj *vmLoadObject(robj *key);
561 static robj *vmPreviewObject(robj *key);
562 static int vmSwapOneObjectBlocking(void);
563 static int vmSwapOneObjectThreaded(void);
564 static int vmCanSwapOut(void);
565 static int tryFreeOneObjectFromFreelist(void);
566 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
567 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
568 static void vmCancelThreadedIOJob(robj *o);
569 static void lockThreadedIO(void);
570 static void unlockThreadedIO(void);
571 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
572 static void freeIOJob(iojob *j);
573 static void queueIOJob(iojob *j);
574 static int vmWriteObjectOnSwap(robj *o, off_t page);
575 static robj *vmReadObjectFromSwap(off_t page, int type);
576 static void waitEmptyIOJobsQueue(void);
577 static void vmReopenSwapFile(void);
578 static int vmFreePage(off_t page);
579 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
580 static int dontWaitForSwappedKey(redisClient *c, robj *key);
581 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
582 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
583 static struct redisCommand *lookupCommand(char *name);
584 static void call(redisClient *c, struct redisCommand *cmd);
585 static void resetClient(redisClient *c);
586
587 static void authCommand(redisClient *c);
588 static void pingCommand(redisClient *c);
589 static void echoCommand(redisClient *c);
590 static void setCommand(redisClient *c);
591 static void setnxCommand(redisClient *c);
592 static void getCommand(redisClient *c);
593 static void delCommand(redisClient *c);
594 static void existsCommand(redisClient *c);
595 static void incrCommand(redisClient *c);
596 static void decrCommand(redisClient *c);
597 static void incrbyCommand(redisClient *c);
598 static void decrbyCommand(redisClient *c);
599 static void selectCommand(redisClient *c);
600 static void randomkeyCommand(redisClient *c);
601 static void keysCommand(redisClient *c);
602 static void dbsizeCommand(redisClient *c);
603 static void lastsaveCommand(redisClient *c);
604 static void saveCommand(redisClient *c);
605 static void bgsaveCommand(redisClient *c);
606 static void bgrewriteaofCommand(redisClient *c);
607 static void shutdownCommand(redisClient *c);
608 static void moveCommand(redisClient *c);
609 static void renameCommand(redisClient *c);
610 static void renamenxCommand(redisClient *c);
611 static void lpushCommand(redisClient *c);
612 static void rpushCommand(redisClient *c);
613 static void lpopCommand(redisClient *c);
614 static void rpopCommand(redisClient *c);
615 static void llenCommand(redisClient *c);
616 static void lindexCommand(redisClient *c);
617 static void lrangeCommand(redisClient *c);
618 static void ltrimCommand(redisClient *c);
619 static void typeCommand(redisClient *c);
620 static void lsetCommand(redisClient *c);
621 static void saddCommand(redisClient *c);
622 static void sremCommand(redisClient *c);
623 static void smoveCommand(redisClient *c);
624 static void sismemberCommand(redisClient *c);
625 static void scardCommand(redisClient *c);
626 static void spopCommand(redisClient *c);
627 static void srandmemberCommand(redisClient *c);
628 static void sinterCommand(redisClient *c);
629 static void sinterstoreCommand(redisClient *c);
630 static void sunionCommand(redisClient *c);
631 static void sunionstoreCommand(redisClient *c);
632 static void sdiffCommand(redisClient *c);
633 static void sdiffstoreCommand(redisClient *c);
634 static void syncCommand(redisClient *c);
635 static void flushdbCommand(redisClient *c);
636 static void flushallCommand(redisClient *c);
637 static void sortCommand(redisClient *c);
638 static void lremCommand(redisClient *c);
639 static void rpoplpushcommand(redisClient *c);
640 static void infoCommand(redisClient *c);
641 static void mgetCommand(redisClient *c);
642 static void monitorCommand(redisClient *c);
643 static void expireCommand(redisClient *c);
644 static void expireatCommand(redisClient *c);
645 static void getsetCommand(redisClient *c);
646 static void ttlCommand(redisClient *c);
647 static void slaveofCommand(redisClient *c);
648 static void debugCommand(redisClient *c);
649 static void msetCommand(redisClient *c);
650 static void msetnxCommand(redisClient *c);
651 static void zaddCommand(redisClient *c);
652 static void zincrbyCommand(redisClient *c);
653 static void zrangeCommand(redisClient *c);
654 static void zrangebyscoreCommand(redisClient *c);
655 static void zcountCommand(redisClient *c);
656 static void zrevrangeCommand(redisClient *c);
657 static void zcardCommand(redisClient *c);
658 static void zremCommand(redisClient *c);
659 static void zscoreCommand(redisClient *c);
660 static void zremrangebyscoreCommand(redisClient *c);
661 static void multiCommand(redisClient *c);
662 static void execCommand(redisClient *c);
663 static void discardCommand(redisClient *c);
664 static void blpopCommand(redisClient *c);
665 static void brpopCommand(redisClient *c);
666 static void appendCommand(redisClient *c);
667 static void substrCommand(redisClient *c);
668 static void zrankCommand(redisClient *c);
669
670 /*================================= Globals ================================= */
671
672 /* Global vars */
673 static struct redisServer server; /* server global state */
674 static struct redisCommand cmdTable[] = {
675 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
676 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
677 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
678 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
679 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
680 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
681 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
682 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
683 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
684 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
685 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
686 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
687 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
688 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
689 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
690 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
691 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
692 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
693 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
694 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
695 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
696 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
697 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
698 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
699 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
700 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
701 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
702 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
703 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
704 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
705 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
706 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
707 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
708 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
709 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
710 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
711 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
712 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
713 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
714 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
715 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
716 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
717 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
718 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
719 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
720 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
721 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
722 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
723 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
724 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
725 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
726 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
727 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
728 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
729 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
730 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
731 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
732 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
733 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
734 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
735 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
736 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
737 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
738 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
739 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
740 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
741 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
742 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
743 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
744 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
745 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
746 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
747 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
748 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
749 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
750 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
751 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
752 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
753 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
754 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
755 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
756 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
757 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
758 {NULL,NULL,0,0,0,0,0}
759 };
760
761 /*============================ Utility functions ============================ */
762
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading '^' negates the set, "a-z" is
 *           a range, and a backslash quotes the next byte of the set
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero bytes are compared after tolower().
 * Returns 1 on match, 0 otherwise.
 *
 * Both buffers are addressed strictly through their lengths (assumed to be
 * non negative): nothing at or after pattern[patternLen] or
 * string[stringLen] is read, so the inputs need not be null terminated. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* A run of stars is equivalent to a single star. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte of the string: without this
             * check string[0] would be read past the end and stringLen
             * would drop to -1, letting "[^a]*" match the empty string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Quoted byte. A backslash that is the very last byte
                     * of the pattern is handled as a plain set member
                     * below, so patternLen can never become negative. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the advance
                     * after the switch lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* <ctype.h> functions require a value that is
                         * representable as unsigned char. */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a byte to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars may still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
885
886 static void redisLog(int level, const char *fmt, ...) {
887 va_list ap;
888 FILE *fp;
889
890 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
891 if (!fp) return;
892
893 va_start(ap, fmt);
894 if (level >= server.verbosity) {
895 char *c = ".-*#";
896 char buf[64];
897 time_t now;
898
899 now = time(NULL);
900 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
901 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
902 vfprintf(fp, fmt, ap);
903 fprintf(fp,"\n");
904 fflush(fp);
905 }
906 va_end(ap);
907
908 if (server.logfile) fclose(fp);
909 }
910
911 /*====================== Hash table type implementation ==================== */
912
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
916
/* Dict destructor callback that simply hands 'val' to zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
922
923 static void dictListDestructor(void *privdata, void *val)
924 {
925 DICT_NOTUSED(privdata);
926 listRelease((list*)val);
927 }
928
929 static int sdsDictKeyCompare(void *privdata, const void *key1,
930 const void *key2)
931 {
932 int l1,l2;
933 DICT_NOTUSED(privdata);
934
935 l1 = sdslen((sds)key1);
936 l2 = sdslen((sds)key2);
937 if (l1 != l2) return 0;
938 return memcmp(key1, key2, l1) == 0;
939 }
940
/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
948
949 static int dictObjKeyCompare(void *privdata, const void *key1,
950 const void *key2)
951 {
952 const robj *o1 = key1, *o2 = key2;
953 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
954 }
955
956 static unsigned int dictObjHash(const void *key) {
957 const robj *o = key;
958 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
959 }
960
961 static int dictEncObjKeyCompare(void *privdata, const void *key1,
962 const void *key2)
963 {
964 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
965 int cmp;
966
967 o1 = getDecodedObject(o1);
968 o2 = getDecodedObject(o2);
969 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
970 decrRefCount(o1);
971 decrRefCount(o2);
972 return cmp;
973 }
974
975 static unsigned int dictEncObjHash(const void *key) {
976 robj *o = (robj*) key;
977
978 if (o->encoding == REDIS_ENCODING_RAW) {
979 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
980 } else {
981 if (o->encoding == REDIS_ENCODING_INT) {
982 char buf[32];
983 int len;
984
985 len = snprintf(buf,32,"%ld",(long)o->ptr);
986 return dictGenHashFunction((unsigned char*)buf, len);
987 } else {
988 unsigned int hash;
989
990 o = getDecodedObject(o);
991 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
992 decrRefCount(o);
993 return hash;
994 }
995 }
996 }
997
998 /* Sets type and expires */
999 static dictType setDictType = {
1000 dictEncObjHash, /* hash function */
1001 NULL, /* key dup */
1002 NULL, /* val dup */
1003 dictEncObjKeyCompare, /* key compare */
1004 dictRedisObjectDestructor, /* key destructor */
1005 NULL /* val destructor */
1006 };
1007
1008 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1009 static dictType zsetDictType = {
1010 dictEncObjHash, /* hash function */
1011 NULL, /* key dup */
1012 NULL, /* val dup */
1013 dictEncObjKeyCompare, /* key compare */
1014 dictRedisObjectDestructor, /* key destructor */
1015 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1016 };
1017
1018 /* Db->dict */
1019 static dictType dbDictType = {
1020 dictObjHash, /* hash function */
1021 NULL, /* key dup */
1022 NULL, /* val dup */
1023 dictObjKeyCompare, /* key compare */
1024 dictRedisObjectDestructor, /* key destructor */
1025 dictRedisObjectDestructor /* val destructor */
1026 };
1027
1028 /* Db->expires */
1029 static dictType keyptrDictType = {
1030 dictObjHash, /* hash function */
1031 NULL, /* key dup */
1032 NULL, /* val dup */
1033 dictObjKeyCompare, /* key compare */
1034 dictRedisObjectDestructor, /* key destructor */
1035 NULL /* val destructor */
1036 };
1037
1038 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1039 static dictType hashDictType = {
1040 dictEncObjHash, /* hash function */
1041 NULL, /* key dup */
1042 NULL, /* val dup */
1043 dictEncObjKeyCompare, /* key compare */
1044 dictRedisObjectDestructor, /* key destructor */
1045 dictRedisObjectDestructor /* val destructor */
1046 };
1047
1048 /* Keylist hash table type has unencoded redis objects as keys and
1049 * lists as values. It's used for blocking operations (BLPOP) and to
1050 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1051 static dictType keylistDictType = {
1052 dictObjHash, /* hash function */
1053 NULL, /* key dup */
1054 NULL, /* val dup */
1055 dictObjKeyCompare, /* key compare */
1056 dictRedisObjectDestructor, /* key destructor */
1057 dictListDestructor /* val destructor */
1058 };
1059
1060 /* ========================= Random utility functions ======================= */
1061
1062 /* Redis generally does not try to recover from out of memory conditions
1063 * when allocating objects or strings, it is not clear if it will be possible
1064 * to report this condition to the client since the networking layer itself
1065 * is based on heap allocation for send buffers, so we simply abort.
1066 * At least the code will be simpler to read... */
1067 static void oom(const char *msg) {
1068 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1069 sleep(1);
1070 abort();
1071 }
1072
1073 /* ====================== Redis server networking stuff ===================== */
1074 static void closeTimedoutClients(void) {
1075 redisClient *c;
1076 listNode *ln;
1077 time_t now = time(NULL);
1078 listIter li;
1079
1080 listRewind(server.clients,&li);
1081 while ((ln = listNext(&li)) != NULL) {
1082 c = listNodeValue(ln);
1083 if (server.maxidletime &&
1084 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1085 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1086 (now - c->lastinteraction > server.maxidletime))
1087 {
1088 redisLog(REDIS_VERBOSE,"Closing idle client");
1089 freeClient(c);
1090 } else if (c->flags & REDIS_BLOCKED) {
1091 if (c->blockingto != 0 && c->blockingto < now) {
1092 addReply(c,shared.nullmultibulk);
1093 unblockClientWaitingData(c);
1094 }
1095 }
1096 }
1097 }
1098
1099 static int htNeedsResize(dict *dict) {
1100 long long size, used;
1101
1102 size = dictSlots(dict);
1103 used = dictSize(dict);
1104 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1105 (used*100/size < REDIS_HT_MINFILL));
1106 }
1107
1108 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1109 * we resize the hash table to save memory */
1110 static void tryResizeHashTables(void) {
1111 int j;
1112
1113 for (j = 0; j < server.dbnum; j++) {
1114 if (htNeedsResize(server.db[j].dict)) {
1115 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1116 dictResize(server.db[j].dict);
1117 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1118 }
1119 if (htNeedsResize(server.db[j].expires))
1120 dictResize(server.db[j].expires);
1121 }
1122 }
1123
1124 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1125 void backgroundSaveDoneHandler(int statloc) {
1126 int exitcode = WEXITSTATUS(statloc);
1127 int bysignal = WIFSIGNALED(statloc);
1128
1129 if (!bysignal && exitcode == 0) {
1130 redisLog(REDIS_NOTICE,
1131 "Background saving terminated with success");
1132 server.dirty = 0;
1133 server.lastsave = time(NULL);
1134 } else if (!bysignal && exitcode != 0) {
1135 redisLog(REDIS_WARNING, "Background saving error");
1136 } else {
1137 redisLog(REDIS_WARNING,
1138 "Background saving terminated by signal");
1139 rdbRemoveTempFile(server.bgsavechildpid);
1140 }
1141 server.bgsavechildpid = -1;
1142 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1143 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1144 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1145 }
1146
1147 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1148 * Handle this. */
1149 void backgroundRewriteDoneHandler(int statloc) {
1150 int exitcode = WEXITSTATUS(statloc);
1151 int bysignal = WIFSIGNALED(statloc);
1152
1153 if (!bysignal && exitcode == 0) {
1154 int fd;
1155 char tmpfile[256];
1156
1157 redisLog(REDIS_NOTICE,
1158 "Background append only file rewriting terminated with success");
1159 /* Now it's time to flush the differences accumulated by the parent */
1160 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1161 fd = open(tmpfile,O_WRONLY|O_APPEND);
1162 if (fd == -1) {
1163 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1164 goto cleanup;
1165 }
1166 /* Flush our data... */
1167 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1168 (signed) sdslen(server.bgrewritebuf)) {
1169 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1170 close(fd);
1171 goto cleanup;
1172 }
1173 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1174 /* Now our work is to rename the temp file into the stable file. And
1175 * switch the file descriptor used by the server for append only. */
1176 if (rename(tmpfile,server.appendfilename) == -1) {
1177 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1178 close(fd);
1179 goto cleanup;
1180 }
1181 /* Mission completed... almost */
1182 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1183 if (server.appendfd != -1) {
1184 /* If append only is actually enabled... */
1185 close(server.appendfd);
1186 server.appendfd = fd;
1187 fsync(fd);
1188 server.appendseldb = -1; /* Make sure it will issue SELECT */
1189 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1190 } else {
1191 /* If append only is disabled we just generate a dump in this
1192 * format. Why not? */
1193 close(fd);
1194 }
1195 } else if (!bysignal && exitcode != 0) {
1196 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1197 } else {
1198 redisLog(REDIS_WARNING,
1199 "Background append only file rewriting terminated by signal");
1200 }
1201 cleanup:
1202 sdsfree(server.bgrewritebuf);
1203 server.bgrewritebuf = sdsempty();
1204 aofRemoveTempFile(server.bgrewritechildpid);
1205 server.bgrewritechildpid = -1;
1206 }
1207
1208 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1209 int j, loops = server.cronloops++;
1210 REDIS_NOTUSED(eventLoop);
1211 REDIS_NOTUSED(id);
1212 REDIS_NOTUSED(clientData);
1213
1214 /* We take a cached value of the unix time in the global state because
1215 * with virtual memory and aging there is to store the current time
1216 * in objects at every object access, and accuracy is not needed.
1217 * To access a global var is faster than calling time(NULL) */
1218 server.unixtime = time(NULL);
1219
1220 /* Show some info about non-empty databases */
1221 for (j = 0; j < server.dbnum; j++) {
1222 long long size, used, vkeys;
1223
1224 size = dictSlots(server.db[j].dict);
1225 used = dictSize(server.db[j].dict);
1226 vkeys = dictSize(server.db[j].expires);
1227 if (!(loops % 5) && (used || vkeys)) {
1228 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1229 /* dictPrintStats(server.dict); */
1230 }
1231 }
1232
1233 /* We don't want to resize the hash tables while a bacground saving
1234 * is in progress: the saving child is created using fork() that is
1235 * implemented with a copy-on-write semantic in most modern systems, so
1236 * if we resize the HT while there is the saving child at work actually
1237 * a lot of memory movements in the parent will cause a lot of pages
1238 * copied. */
1239 if (server.bgsavechildpid == -1) tryResizeHashTables();
1240
1241 /* Show information about connected clients */
1242 if (!(loops % 5)) {
1243 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1244 listLength(server.clients)-listLength(server.slaves),
1245 listLength(server.slaves),
1246 zmalloc_used_memory(),
1247 dictSize(server.sharingpool));
1248 }
1249
1250 /* Close connections of timedout clients */
1251 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1252 closeTimedoutClients();
1253
1254 /* Check if a background saving or AOF rewrite in progress terminated */
1255 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1256 int statloc;
1257 pid_t pid;
1258
1259 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1260 if (pid == server.bgsavechildpid) {
1261 backgroundSaveDoneHandler(statloc);
1262 } else {
1263 backgroundRewriteDoneHandler(statloc);
1264 }
1265 }
1266 } else {
1267 /* If there is not a background saving in progress check if
1268 * we have to save now */
1269 time_t now = time(NULL);
1270 for (j = 0; j < server.saveparamslen; j++) {
1271 struct saveparam *sp = server.saveparams+j;
1272
1273 if (server.dirty >= sp->changes &&
1274 now-server.lastsave > sp->seconds) {
1275 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1276 sp->changes, sp->seconds);
1277 rdbSaveBackground(server.dbfilename);
1278 break;
1279 }
1280 }
1281 }
1282
1283 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1284 * will use few CPU cycles if there are few expiring keys, otherwise
1285 * it will get more aggressive to avoid that too much memory is used by
1286 * keys that can be removed from the keyspace. */
1287 for (j = 0; j < server.dbnum; j++) {
1288 int expired;
1289 redisDb *db = server.db+j;
1290
1291 /* Continue to expire if at the end of the cycle more than 25%
1292 * of the keys were expired. */
1293 do {
1294 long num = dictSize(db->expires);
1295 time_t now = time(NULL);
1296
1297 expired = 0;
1298 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1299 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1300 while (num--) {
1301 dictEntry *de;
1302 time_t t;
1303
1304 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1305 t = (time_t) dictGetEntryVal(de);
1306 if (now > t) {
1307 deleteKey(db,dictGetEntryKey(de));
1308 expired++;
1309 }
1310 }
1311 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1312 }
1313
1314 /* Swap a few keys on disk if we are over the memory limit and VM
1315 * is enbled. Try to free objects from the free list first. */
1316 if (vmCanSwapOut()) {
1317 while (server.vm_enabled && zmalloc_used_memory() >
1318 server.vm_max_memory)
1319 {
1320 int retval;
1321
1322 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1323 retval = (server.vm_max_threads == 0) ?
1324 vmSwapOneObjectBlocking() :
1325 vmSwapOneObjectThreaded();
1326 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1327 zmalloc_used_memory() >
1328 (server.vm_max_memory+server.vm_max_memory/10))
1329 {
1330 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1331 }
1332 /* Note that when using threade I/O we free just one object,
1333 * because anyway when the I/O thread in charge to swap this
1334 * object out will finish, the handler of completed jobs
1335 * will try to swap more objects if we are still out of memory. */
1336 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1337 }
1338 }
1339
1340 /* Check if we should connect to a MASTER */
1341 if (server.replstate == REDIS_REPL_CONNECT) {
1342 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1343 if (syncWithMaster() == REDIS_OK) {
1344 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1345 }
1346 }
1347 return 1000;
1348 }
1349
1350 /* This function gets called every time Redis is entering the
1351 * main loop of the event driven library, that is, before to sleep
1352 * for ready file descriptors. */
1353 static void beforeSleep(struct aeEventLoop *eventLoop) {
1354 REDIS_NOTUSED(eventLoop);
1355
1356 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1357 listIter li;
1358 listNode *ln;
1359
1360 listRewind(server.io_ready_clients,&li);
1361 while((ln = listNext(&li))) {
1362 redisClient *c = ln->value;
1363 struct redisCommand *cmd;
1364
1365 /* Resume the client. */
1366 listDelNode(server.io_ready_clients,ln);
1367 c->flags &= (~REDIS_IO_WAIT);
1368 server.vm_blocked_clients--;
1369 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1370 readQueryFromClient, c);
1371 cmd = lookupCommand(c->argv[0]->ptr);
1372 assert(cmd != NULL);
1373 call(c,cmd);
1374 resetClient(c);
1375 /* There may be more data to process in the input buffer. */
1376 if (c->querybuf && sdslen(c->querybuf) > 0)
1377 processInputBuffer(c);
1378 }
1379 }
1380 }
1381
1382 static void createSharedObjects(void) {
1383 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1384 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1385 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1386 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1387 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1388 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1389 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1390 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1391 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1392 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1393 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1394 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1395 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1396 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1397 "-ERR no such key\r\n"));
1398 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1399 "-ERR syntax error\r\n"));
1400 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1401 "-ERR source and destination objects are the same\r\n"));
1402 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1403 "-ERR index out of range\r\n"));
1404 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1405 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1406 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1407 shared.select0 = createStringObject("select 0\r\n",10);
1408 shared.select1 = createStringObject("select 1\r\n",10);
1409 shared.select2 = createStringObject("select 2\r\n",10);
1410 shared.select3 = createStringObject("select 3\r\n",10);
1411 shared.select4 = createStringObject("select 4\r\n",10);
1412 shared.select5 = createStringObject("select 5\r\n",10);
1413 shared.select6 = createStringObject("select 6\r\n",10);
1414 shared.select7 = createStringObject("select 7\r\n",10);
1415 shared.select8 = createStringObject("select 8\r\n",10);
1416 shared.select9 = createStringObject("select 9\r\n",10);
1417 }
1418
1419 static void appendServerSaveParams(time_t seconds, int changes) {
1420 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1421 server.saveparams[server.saveparamslen].seconds = seconds;
1422 server.saveparams[server.saveparamslen].changes = changes;
1423 server.saveparamslen++;
1424 }
1425
1426 static void resetServerSaveParams() {
1427 zfree(server.saveparams);
1428 server.saveparams = NULL;
1429 server.saveparamslen = 0;
1430 }
1431
1432 static void initServerConfig() {
1433 server.dbnum = REDIS_DEFAULT_DBNUM;
1434 server.port = REDIS_SERVERPORT;
1435 server.verbosity = REDIS_VERBOSE;
1436 server.maxidletime = REDIS_MAXIDLETIME;
1437 server.saveparams = NULL;
1438 server.logfile = NULL; /* NULL = log on standard output */
1439 server.bindaddr = NULL;
1440 server.glueoutputbuf = 1;
1441 server.daemonize = 0;
1442 server.appendonly = 0;
1443 server.appendfsync = APPENDFSYNC_ALWAYS;
1444 server.lastfsync = time(NULL);
1445 server.appendfd = -1;
1446 server.appendseldb = -1; /* Make sure the first time will not match */
1447 server.pidfile = "/var/run/redis.pid";
1448 server.dbfilename = "dump.rdb";
1449 server.appendfilename = "appendonly.aof";
1450 server.requirepass = NULL;
1451 server.shareobjects = 0;
1452 server.rdbcompression = 1;
1453 server.sharingpoolsize = 1024;
1454 server.maxclients = 0;
1455 server.blpop_blocked_clients = 0;
1456 server.maxmemory = 0;
1457 server.vm_enabled = 0;
1458 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1459 server.vm_page_size = 256; /* 256 bytes per page */
1460 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1461 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1462 server.vm_max_threads = 4;
1463 server.vm_blocked_clients = 0;
1464
1465 resetServerSaveParams();
1466
1467 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1468 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1469 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1470 /* Replication related */
1471 server.isslave = 0;
1472 server.masterauth = NULL;
1473 server.masterhost = NULL;
1474 server.masterport = 6379;
1475 server.master = NULL;
1476 server.replstate = REDIS_REPL_NONE;
1477
1478 /* Double constants initialization */
1479 R_Zero = 0.0;
1480 R_PosInf = 1.0/R_Zero;
1481 R_NegInf = -1.0/R_Zero;
1482 R_Nan = R_Zero/R_Zero;
1483 }
1484
1485 static void initServer() {
1486 int j;
1487
1488 signal(SIGHUP, SIG_IGN);
1489 signal(SIGPIPE, SIG_IGN);
1490 setupSigSegvAction();
1491
1492 server.devnull = fopen("/dev/null","w");
1493 if (server.devnull == NULL) {
1494 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1495 exit(1);
1496 }
1497 server.clients = listCreate();
1498 server.slaves = listCreate();
1499 server.monitors = listCreate();
1500 server.objfreelist = listCreate();
1501 createSharedObjects();
1502 server.el = aeCreateEventLoop();
1503 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1504 server.sharingpool = dictCreate(&setDictType,NULL);
1505 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1506 if (server.fd == -1) {
1507 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1508 exit(1);
1509 }
1510 for (j = 0; j < server.dbnum; j++) {
1511 server.db[j].dict = dictCreate(&dbDictType,NULL);
1512 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1513 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1514 if (server.vm_enabled)
1515 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1516 server.db[j].id = j;
1517 }
1518 server.cronloops = 0;
1519 server.bgsavechildpid = -1;
1520 server.bgrewritechildpid = -1;
1521 server.bgrewritebuf = sdsempty();
1522 server.lastsave = time(NULL);
1523 server.dirty = 0;
1524 server.stat_numcommands = 0;
1525 server.stat_numconnections = 0;
1526 server.stat_starttime = time(NULL);
1527 server.unixtime = time(NULL);
1528 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1529 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1530 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1531
1532 if (server.appendonly) {
1533 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1534 if (server.appendfd == -1) {
1535 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1536 strerror(errno));
1537 exit(1);
1538 }
1539 }
1540
1541 if (server.vm_enabled) vmInit();
1542 }
1543
1544 /* Empty the whole database */
1545 static long long emptyDb() {
1546 int j;
1547 long long removed = 0;
1548
1549 for (j = 0; j < server.dbnum; j++) {
1550 removed += dictSize(server.db[j].dict);
1551 dictEmpty(server.db[j].dict);
1552 dictEmpty(server.db[j].expires);
1553 }
1554 return removed;
1555 }
1556
/* Convert a "yes" / "no" string (compared case insensitively) into a
 * boolean: returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1562
1563 /* I agree, this is a very rudimental way to load a configuration...
1564 will improve later if the config gets more complex */
1565 static void loadServerConfig(char *filename) {
1566 FILE *fp;
1567 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1568 int linenum = 0;
1569 sds line = NULL;
1570
1571 if (filename[0] == '-' && filename[1] == '\0')
1572 fp = stdin;
1573 else {
1574 if ((fp = fopen(filename,"r")) == NULL) {
1575 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1576 exit(1);
1577 }
1578 }
1579
1580 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1581 sds *argv;
1582 int argc, j;
1583
1584 linenum++;
1585 line = sdsnew(buf);
1586 line = sdstrim(line," \t\r\n");
1587
1588 /* Skip comments and blank lines*/
1589 if (line[0] == '#' || line[0] == '\0') {
1590 sdsfree(line);
1591 continue;
1592 }
1593
1594 /* Split into arguments */
1595 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1596 sdstolower(argv[0]);
1597
1598 /* Execute config directives */
1599 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1600 server.maxidletime = atoi(argv[1]);
1601 if (server.maxidletime < 0) {
1602 err = "Invalid timeout value"; goto loaderr;
1603 }
1604 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1605 server.port = atoi(argv[1]);
1606 if (server.port < 1 || server.port > 65535) {
1607 err = "Invalid port"; goto loaderr;
1608 }
1609 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1610 server.bindaddr = zstrdup(argv[1]);
1611 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1612 int seconds = atoi(argv[1]);
1613 int changes = atoi(argv[2]);
1614 if (seconds < 1 || changes < 0) {
1615 err = "Invalid save parameters"; goto loaderr;
1616 }
1617 appendServerSaveParams(seconds,changes);
1618 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1619 if (chdir(argv[1]) == -1) {
1620 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1621 argv[1], strerror(errno));
1622 exit(1);
1623 }
1624 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1625 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1626 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1627 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1628 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1629 else {
1630 err = "Invalid log level. Must be one of debug, notice, warning";
1631 goto loaderr;
1632 }
1633 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1634 FILE *logfp;
1635
1636 server.logfile = zstrdup(argv[1]);
1637 if (!strcasecmp(server.logfile,"stdout")) {
1638 zfree(server.logfile);
1639 server.logfile = NULL;
1640 }
1641 if (server.logfile) {
1642 /* Test if we are able to open the file. The server will not
1643 * be able to abort just for this problem later... */
1644 logfp = fopen(server.logfile,"a");
1645 if (logfp == NULL) {
1646 err = sdscatprintf(sdsempty(),
1647 "Can't open the log file: %s", strerror(errno));
1648 goto loaderr;
1649 }
1650 fclose(logfp);
1651 }
1652 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1653 server.dbnum = atoi(argv[1]);
1654 if (server.dbnum < 1) {
1655 err = "Invalid number of databases"; goto loaderr;
1656 }
1657 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1658 server.maxclients = atoi(argv[1]);
1659 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1660 server.maxmemory = strtoll(argv[1], NULL, 10);
1661 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1662 server.masterhost = sdsnew(argv[1]);
1663 server.masterport = atoi(argv[2]);
1664 server.replstate = REDIS_REPL_CONNECT;
1665 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1666 server.masterauth = zstrdup(argv[1]);
1667 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1668 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1669 err = "argument must be 'yes' or 'no'"; goto loaderr;
1670 }
1671 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1672 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1673 err = "argument must be 'yes' or 'no'"; goto loaderr;
1674 }
1675 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1676 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1677 err = "argument must be 'yes' or 'no'"; goto loaderr;
1678 }
1679 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1680 server.sharingpoolsize = atoi(argv[1]);
1681 if (server.sharingpoolsize < 1) {
1682 err = "invalid object sharing pool size"; goto loaderr;
1683 }
1684 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1685 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1686 err = "argument must be 'yes' or 'no'"; goto loaderr;
1687 }
1688 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1689 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1690 err = "argument must be 'yes' or 'no'"; goto loaderr;
1691 }
1692 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1693 if (!strcasecmp(argv[1],"no")) {
1694 server.appendfsync = APPENDFSYNC_NO;
1695 } else if (!strcasecmp(argv[1],"always")) {
1696 server.appendfsync = APPENDFSYNC_ALWAYS;
1697 } else if (!strcasecmp(argv[1],"everysec")) {
1698 server.appendfsync = APPENDFSYNC_EVERYSEC;
1699 } else {
1700 err = "argument must be 'no', 'always' or 'everysec'";
1701 goto loaderr;
1702 }
1703 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1704 server.requirepass = zstrdup(argv[1]);
1705 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1706 server.pidfile = zstrdup(argv[1]);
1707 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1708 server.dbfilename = zstrdup(argv[1]);
1709 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1710 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1711 err = "argument must be 'yes' or 'no'"; goto loaderr;
1712 }
1713 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1714 zfree(server.vm_swap_file);
1715 server.vm_swap_file = zstrdup(argv[1]);
1716 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1717 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1718 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1719 server.vm_page_size = strtoll(argv[1], NULL, 10);
1720 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1721 server.vm_pages = strtoll(argv[1], NULL, 10);
1722 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1723 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1724 } else {
1725 err = "Bad directive or wrong number of arguments"; goto loaderr;
1726 }
1727 for (j = 0; j < argc; j++)
1728 sdsfree(argv[j]);
1729 zfree(argv);
1730 sdsfree(line);
1731 }
1732 if (fp != stdin) fclose(fp);
1733 return;
1734
1735 loaderr:
1736 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1737 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1738 fprintf(stderr, ">>> '%s'\n", line);
1739 fprintf(stderr, "%s\n", err);
1740 exit(1);
1741 }
1742
1743 static void freeClientArgv(redisClient *c) {
1744 int j;
1745
1746 for (j = 0; j < c->argc; j++)
1747 decrRefCount(c->argv[j]);
1748 for (j = 0; j < c->mbargc; j++)
1749 decrRefCount(c->mbargv[j]);
1750 c->argc = 0;
1751 c->mbargc = 0;
1752 }
1753
1754 static void freeClient(redisClient *c) {
1755 listNode *ln;
1756
1757 /* Note that if the client we are freeing is blocked into a blocking
1758 * call, we have to set querybuf to NULL *before* to call
1759 * unblockClientWaitingData() to avoid processInputBuffer() will get
1760 * called. Also it is important to remove the file events after
1761 * this, because this call adds the READABLE event. */
1762 sdsfree(c->querybuf);
1763 c->querybuf = NULL;
1764 if (c->flags & REDIS_BLOCKED)
1765 unblockClientWaitingData(c);
1766
1767 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1768 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1769 listRelease(c->reply);
1770 freeClientArgv(c);
1771 close(c->fd);
1772 /* Remove from the list of clients */
1773 ln = listSearchKey(server.clients,c);
1774 redisAssert(ln != NULL);
1775 listDelNode(server.clients,ln);
1776 /* Remove from the list of clients waiting for swapped keys */
1777 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1778 ln = listSearchKey(server.io_ready_clients,c);
1779 if (ln) {
1780 listDelNode(server.io_ready_clients,ln);
1781 server.vm_blocked_clients--;
1782 }
1783 }
1784 while (server.vm_enabled && listLength(c->io_keys)) {
1785 ln = listFirst(c->io_keys);
1786 dontWaitForSwappedKey(c,ln->value);
1787 }
1788 listRelease(c->io_keys);
1789 /* Other cleanup */
1790 if (c->flags & REDIS_SLAVE) {
1791 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1792 close(c->repldbfd);
1793 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1794 ln = listSearchKey(l,c);
1795 redisAssert(ln != NULL);
1796 listDelNode(l,ln);
1797 }
1798 if (c->flags & REDIS_MASTER) {
1799 server.master = NULL;
1800 server.replstate = REDIS_REPL_CONNECT;
1801 }
1802 zfree(c->argv);
1803 zfree(c->mbargv);
1804 freeClientMultiState(c);
1805 zfree(c);
1806 }
1807
1808 #define GLUEREPLY_UP_TO (1024)
1809 static void glueReplyBuffersIfNeeded(redisClient *c) {
1810 int copylen = 0;
1811 char buf[GLUEREPLY_UP_TO];
1812 listNode *ln;
1813 listIter li;
1814 robj *o;
1815
1816 listRewind(c->reply,&li);
1817 while((ln = listNext(&li))) {
1818 int objlen;
1819
1820 o = ln->value;
1821 objlen = sdslen(o->ptr);
1822 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1823 memcpy(buf+copylen,o->ptr,objlen);
1824 copylen += objlen;
1825 listDelNode(c->reply,ln);
1826 } else {
1827 if (copylen == 0) return;
1828 break;
1829 }
1830 }
1831 /* Now the output buffer is empty, add the new single element */
1832 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1833 listAddNodeHead(c->reply,o);
1834 }
1835
1836 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1837 redisClient *c = privdata;
1838 int nwritten = 0, totwritten = 0, objlen;
1839 robj *o;
1840 REDIS_NOTUSED(el);
1841 REDIS_NOTUSED(mask);
1842
1843 /* Use writev() if we have enough buffers to send */
1844 if (!server.glueoutputbuf &&
1845 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1846 !(c->flags & REDIS_MASTER))
1847 {
1848 sendReplyToClientWritev(el, fd, privdata, mask);
1849 return;
1850 }
1851
1852 while(listLength(c->reply)) {
1853 if (server.glueoutputbuf && listLength(c->reply) > 1)
1854 glueReplyBuffersIfNeeded(c);
1855
1856 o = listNodeValue(listFirst(c->reply));
1857 objlen = sdslen(o->ptr);
1858
1859 if (objlen == 0) {
1860 listDelNode(c->reply,listFirst(c->reply));
1861 continue;
1862 }
1863
1864 if (c->flags & REDIS_MASTER) {
1865 /* Don't reply to a master */
1866 nwritten = objlen - c->sentlen;
1867 } else {
1868 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1869 if (nwritten <= 0) break;
1870 }
1871 c->sentlen += nwritten;
1872 totwritten += nwritten;
1873 /* If we fully sent the object on head go to the next one */
1874 if (c->sentlen == objlen) {
1875 listDelNode(c->reply,listFirst(c->reply));
1876 c->sentlen = 0;
1877 }
1878 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1879 * bytes, in a single threaded server it's a good idea to serve
1880 * other clients as well, even if a very large request comes from
1881 * super fast link that is always able to accept data (in real world
1882 * scenario think about 'KEYS *' against the loopback interfae) */
1883 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1884 }
1885 if (nwritten == -1) {
1886 if (errno == EAGAIN) {
1887 nwritten = 0;
1888 } else {
1889 redisLog(REDIS_VERBOSE,
1890 "Error writing to client: %s", strerror(errno));
1891 freeClient(c);
1892 return;
1893 }
1894 }
1895 if (totwritten > 0) c->lastinteraction = time(NULL);
1896 if (listLength(c->reply) == 0) {
1897 c->sentlen = 0;
1898 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1899 }
1900 }
1901
1902 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1903 {
1904 redisClient *c = privdata;
1905 int nwritten = 0, totwritten = 0, objlen, willwrite;
1906 robj *o;
1907 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1908 int offset, ion = 0;
1909 REDIS_NOTUSED(el);
1910 REDIS_NOTUSED(mask);
1911
1912 listNode *node;
1913 while (listLength(c->reply)) {
1914 offset = c->sentlen;
1915 ion = 0;
1916 willwrite = 0;
1917
1918 /* fill-in the iov[] array */
1919 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1920 o = listNodeValue(node);
1921 objlen = sdslen(o->ptr);
1922
1923 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1924 break;
1925
1926 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1927 break; /* no more iovecs */
1928
1929 iov[ion].iov_base = ((char*)o->ptr) + offset;
1930 iov[ion].iov_len = objlen - offset;
1931 willwrite += objlen - offset;
1932 offset = 0; /* just for the first item */
1933 ion++;
1934 }
1935
1936 if(willwrite == 0)
1937 break;
1938
1939 /* write all collected blocks at once */
1940 if((nwritten = writev(fd, iov, ion)) < 0) {
1941 if (errno != EAGAIN) {
1942 redisLog(REDIS_VERBOSE,
1943 "Error writing to client: %s", strerror(errno));
1944 freeClient(c);
1945 return;
1946 }
1947 break;
1948 }
1949
1950 totwritten += nwritten;
1951 offset = c->sentlen;
1952
1953 /* remove written robjs from c->reply */
1954 while (nwritten && listLength(c->reply)) {
1955 o = listNodeValue(listFirst(c->reply));
1956 objlen = sdslen(o->ptr);
1957
1958 if(nwritten >= objlen - offset) {
1959 listDelNode(c->reply, listFirst(c->reply));
1960 nwritten -= objlen - offset;
1961 c->sentlen = 0;
1962 } else {
1963 /* partial write */
1964 c->sentlen += nwritten;
1965 break;
1966 }
1967 offset = 0;
1968 }
1969 }
1970
1971 if (totwritten > 0)
1972 c->lastinteraction = time(NULL);
1973
1974 if (listLength(c->reply) == 0) {
1975 c->sentlen = 0;
1976 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1977 }
1978 }
1979
1980 static struct redisCommand *lookupCommand(char *name) {
1981 int j = 0;
1982 while(cmdTable[j].name != NULL) {
1983 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
1984 j++;
1985 }
1986 return NULL;
1987 }
1988
1989 /* resetClient prepare the client to process the next command */
1990 static void resetClient(redisClient *c) {
1991 freeClientArgv(c);
1992 c->bulklen = -1;
1993 c->multibulk = 0;
1994 }
1995
1996 /* Call() is the core of Redis execution of a command */
1997 static void call(redisClient *c, struct redisCommand *cmd) {
1998 long long dirty;
1999
2000 dirty = server.dirty;
2001 cmd->proc(c);
2002 if (server.appendonly && server.dirty-dirty)
2003 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2004 if (server.dirty-dirty && listLength(server.slaves))
2005 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2006 if (listLength(server.monitors))
2007 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2008 server.stat_numcommands++;
2009 }
2010
2011 /* If this function gets called we already read a whole
2012 * command, argments are in the client argv/argc fields.
2013 * processCommand() execute the command or prepare the
2014 * server for a bulk read from the client.
2015 *
2016 * If 1 is returned the client is still alive and valid and
2017 * and other operations can be performed by the caller. Otherwise
2018 * if 0 is returned the client was destroied (i.e. after QUIT). */
2019 static int processCommand(redisClient *c) {
2020 struct redisCommand *cmd;
2021
2022 /* Free some memory if needed (maxmemory setting) */
2023 if (server.maxmemory) freeMemoryIfNeeded();
2024
2025 /* Handle the multi bulk command type. This is an alternative protocol
2026 * supported by Redis in order to receive commands that are composed of
2027 * multiple binary-safe "bulk" arguments. The latency of processing is
2028 * a bit higher but this allows things like multi-sets, so if this
2029 * protocol is used only for MSET and similar commands this is a big win. */
2030 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2031 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2032 if (c->multibulk <= 0) {
2033 resetClient(c);
2034 return 1;
2035 } else {
2036 decrRefCount(c->argv[c->argc-1]);
2037 c->argc--;
2038 return 1;
2039 }
2040 } else if (c->multibulk) {
2041 if (c->bulklen == -1) {
2042 if (((char*)c->argv[0]->ptr)[0] != '$') {
2043 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2044 resetClient(c);
2045 return 1;
2046 } else {
2047 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2048 decrRefCount(c->argv[0]);
2049 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2050 c->argc--;
2051 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2052 resetClient(c);
2053 return 1;
2054 }
2055 c->argc--;
2056 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2057 return 1;
2058 }
2059 } else {
2060 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2061 c->mbargv[c->mbargc] = c->argv[0];
2062 c->mbargc++;
2063 c->argc--;
2064 c->multibulk--;
2065 if (c->multibulk == 0) {
2066 robj **auxargv;
2067 int auxargc;
2068
2069 /* Here we need to swap the multi-bulk argc/argv with the
2070 * normal argc/argv of the client structure. */
2071 auxargv = c->argv;
2072 c->argv = c->mbargv;
2073 c->mbargv = auxargv;
2074
2075 auxargc = c->argc;
2076 c->argc = c->mbargc;
2077 c->mbargc = auxargc;
2078
2079 /* We need to set bulklen to something different than -1
2080 * in order for the code below to process the command without
2081 * to try to read the last argument of a bulk command as
2082 * a special argument. */
2083 c->bulklen = 0;
2084 /* continue below and process the command */
2085 } else {
2086 c->bulklen = -1;
2087 return 1;
2088 }
2089 }
2090 }
2091 /* -- end of multi bulk commands processing -- */
2092
2093 /* The QUIT command is handled as a special case. Normal command
2094 * procs are unable to close the client connection safely */
2095 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2096 freeClient(c);
2097 return 0;
2098 }
2099
2100 /* Now lookup the command and check ASAP about trivial error conditions
2101 * such wrong arity, bad command name and so forth. */
2102 cmd = lookupCommand(c->argv[0]->ptr);
2103 if (!cmd) {
2104 addReplySds(c,
2105 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2106 (char*)c->argv[0]->ptr));
2107 resetClient(c);
2108 return 1;
2109 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2110 (c->argc < -cmd->arity)) {
2111 addReplySds(c,
2112 sdscatprintf(sdsempty(),
2113 "-ERR wrong number of arguments for '%s' command\r\n",
2114 cmd->name));
2115 resetClient(c);
2116 return 1;
2117 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2118 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2119 resetClient(c);
2120 return 1;
2121 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2122 /* This is a bulk command, we have to read the last argument yet. */
2123 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2124
2125 decrRefCount(c->argv[c->argc-1]);
2126 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2127 c->argc--;
2128 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2129 resetClient(c);
2130 return 1;
2131 }
2132 c->argc--;
2133 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2134 /* It is possible that the bulk read is already in the
2135 * buffer. Check this condition and handle it accordingly.
2136 * This is just a fast path, alternative to call processInputBuffer().
2137 * It's a good idea since the code is small and this condition
2138 * happens most of the times. */
2139 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2140 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2141 c->argc++;
2142 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2143 } else {
2144 /* Otherwise return... there is to read the last argument
2145 * from the socket. */
2146 return 1;
2147 }
2148 }
2149 /* Let's try to share objects on the command arguments vector */
2150 if (server.shareobjects) {
2151 int j;
2152 for(j = 1; j < c->argc; j++)
2153 c->argv[j] = tryObjectSharing(c->argv[j]);
2154 }
2155 /* Let's try to encode the bulk object to save space. */
2156 if (cmd->flags & REDIS_CMD_BULK)
2157 tryObjectEncoding(c->argv[c->argc-1]);
2158
2159 /* Check if the user is authenticated */
2160 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2161 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2162 resetClient(c);
2163 return 1;
2164 }
2165
2166 /* Exec the command */
2167 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2168 queueMultiCommand(c,cmd);
2169 addReply(c,shared.queued);
2170 } else {
2171 if (server.vm_enabled && server.vm_max_threads > 0 &&
2172 blockClientOnSwappedKeys(cmd,c)) return 1;
2173 call(c,cmd);
2174 }
2175
2176 /* Prepare the client for the next command */
2177 resetClient(c);
2178 return 1;
2179 }
2180
2181 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2182 listNode *ln;
2183 listIter li;
2184 int outc = 0, j;
2185 robj **outv;
2186 /* (args*2)+1 is enough room for args, spaces, newlines */
2187 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2188
2189 if (argc <= REDIS_STATIC_ARGS) {
2190 outv = static_outv;
2191 } else {
2192 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2193 }
2194
2195 for (j = 0; j < argc; j++) {
2196 if (j != 0) outv[outc++] = shared.space;
2197 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2198 robj *lenobj;
2199
2200 lenobj = createObject(REDIS_STRING,
2201 sdscatprintf(sdsempty(),"%lu\r\n",
2202 (unsigned long) stringObjectLen(argv[j])));
2203 lenobj->refcount = 0;
2204 outv[outc++] = lenobj;
2205 }
2206 outv[outc++] = argv[j];
2207 }
2208 outv[outc++] = shared.crlf;
2209
2210 /* Increment all the refcounts at start and decrement at end in order to
2211 * be sure to free objects if there is no slave in a replication state
2212 * able to be feed with commands */
2213 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2214 listRewind(slaves,&li);
2215 while((ln = listNext(&li))) {
2216 redisClient *slave = ln->value;
2217
2218 /* Don't feed slaves that are still waiting for BGSAVE to start */
2219 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2220
2221 /* Feed all the other slaves, MONITORs and so on */
2222 if (slave->slaveseldb != dictid) {
2223 robj *selectcmd;
2224
2225 switch(dictid) {
2226 case 0: selectcmd = shared.select0; break;
2227 case 1: selectcmd = shared.select1; break;
2228 case 2: selectcmd = shared.select2; break;
2229 case 3: selectcmd = shared.select3; break;
2230 case 4: selectcmd = shared.select4; break;
2231 case 5: selectcmd = shared.select5; break;
2232 case 6: selectcmd = shared.select6; break;
2233 case 7: selectcmd = shared.select7; break;
2234 case 8: selectcmd = shared.select8; break;
2235 case 9: selectcmd = shared.select9; break;
2236 default:
2237 selectcmd = createObject(REDIS_STRING,
2238 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2239 selectcmd->refcount = 0;
2240 break;
2241 }
2242 addReply(slave,selectcmd);
2243 slave->slaveseldb = dictid;
2244 }
2245 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2246 }
2247 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2248 if (outv != static_outv) zfree(outv);
2249 }
2250
2251 static void processInputBuffer(redisClient *c) {
2252 again:
2253 /* Before to process the input buffer, make sure the client is not
2254 * waitig for a blocking operation such as BLPOP. Note that the first
2255 * iteration the client is never blocked, otherwise the processInputBuffer
2256 * would not be called at all, but after the execution of the first commands
2257 * in the input buffer the client may be blocked, and the "goto again"
2258 * will try to reiterate. The following line will make it return asap. */
2259 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2260 if (c->bulklen == -1) {
2261 /* Read the first line of the query */
2262 char *p = strchr(c->querybuf,'\n');
2263 size_t querylen;
2264
2265 if (p) {
2266 sds query, *argv;
2267 int argc, j;
2268
2269 query = c->querybuf;
2270 c->querybuf = sdsempty();
2271 querylen = 1+(p-(query));
2272 if (sdslen(query) > querylen) {
2273 /* leave data after the first line of the query in the buffer */
2274 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2275 }
2276 *p = '\0'; /* remove "\n" */
2277 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2278 sdsupdatelen(query);
2279
2280 /* Now we can split the query in arguments */
2281 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2282 sdsfree(query);
2283
2284 if (c->argv) zfree(c->argv);
2285 c->argv = zmalloc(sizeof(robj*)*argc);
2286
2287 for (j = 0; j < argc; j++) {
2288 if (sdslen(argv[j])) {
2289 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2290 c->argc++;
2291 } else {
2292 sdsfree(argv[j]);
2293 }
2294 }
2295 zfree(argv);
2296 if (c->argc) {
2297 /* Execute the command. If the client is still valid
2298 * after processCommand() return and there is something
2299 * on the query buffer try to process the next command. */
2300 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2301 } else {
2302 /* Nothing to process, argc == 0. Just process the query
2303 * buffer if it's not empty or return to the caller */
2304 if (sdslen(c->querybuf)) goto again;
2305 }
2306 return;
2307 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2308 redisLog(REDIS_VERBOSE, "Client protocol error");
2309 freeClient(c);
2310 return;
2311 }
2312 } else {
2313 /* Bulk read handling. Note that if we are at this point
2314 the client already sent a command terminated with a newline,
2315 we are reading the bulk data that is actually the last
2316 argument of the command. */
2317 int qbl = sdslen(c->querybuf);
2318
2319 if (c->bulklen <= qbl) {
2320 /* Copy everything but the final CRLF as final argument */
2321 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2322 c->argc++;
2323 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2324 /* Process the command. If the client is still valid after
2325 * the processing and there is more data in the buffer
2326 * try to parse it. */
2327 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2328 return;
2329 }
2330 }
2331 }
2332
2333 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2334 redisClient *c = (redisClient*) privdata;
2335 char buf[REDIS_IOBUF_LEN];
2336 int nread;
2337 REDIS_NOTUSED(el);
2338 REDIS_NOTUSED(mask);
2339
2340 nread = read(fd, buf, REDIS_IOBUF_LEN);
2341 if (nread == -1) {
2342 if (errno == EAGAIN) {
2343 nread = 0;
2344 } else {
2345 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2346 freeClient(c);
2347 return;
2348 }
2349 } else if (nread == 0) {
2350 redisLog(REDIS_VERBOSE, "Client closed connection");
2351 freeClient(c);
2352 return;
2353 }
2354 if (nread) {
2355 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2356 c->lastinteraction = time(NULL);
2357 } else {
2358 return;
2359 }
2360 if (!(c->flags & REDIS_BLOCKED))
2361 processInputBuffer(c);
2362 }
2363
2364 static int selectDb(redisClient *c, int id) {
2365 if (id < 0 || id >= server.dbnum)
2366 return REDIS_ERR;
2367 c->db = &server.db[id];
2368 return REDIS_OK;
2369 }
2370
2371 static void *dupClientReplyValue(void *o) {
2372 incrRefCount((robj*)o);
2373 return o;
2374 }
2375
2376 static redisClient *createClient(int fd) {
2377 redisClient *c = zmalloc(sizeof(*c));
2378
2379 anetNonBlock(NULL,fd);
2380 anetTcpNoDelay(NULL,fd);
2381 if (!c) return NULL;
2382 selectDb(c,0);
2383 c->fd = fd;
2384 c->querybuf = sdsempty();
2385 c->argc = 0;
2386 c->argv = NULL;
2387 c->bulklen = -1;
2388 c->multibulk = 0;
2389 c->mbargc = 0;
2390 c->mbargv = NULL;
2391 c->sentlen = 0;
2392 c->flags = 0;
2393 c->lastinteraction = time(NULL);
2394 c->authenticated = 0;
2395 c->replstate = REDIS_REPL_NONE;
2396 c->reply = listCreate();
2397 listSetFreeMethod(c->reply,decrRefCount);
2398 listSetDupMethod(c->reply,dupClientReplyValue);
2399 c->blockingkeys = NULL;
2400 c->blockingkeysnum = 0;
2401 c->io_keys = listCreate();
2402 listSetFreeMethod(c->io_keys,decrRefCount);
2403 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2404 readQueryFromClient, c) == AE_ERR) {
2405 freeClient(c);
2406 return NULL;
2407 }
2408 listAddNodeTail(server.clients,c);
2409 initClientMultiState(c);
2410 return c;
2411 }
2412
2413 static void addReply(redisClient *c, robj *obj) {
2414 if (listLength(c->reply) == 0 &&
2415 (c->replstate == REDIS_REPL_NONE ||
2416 c->replstate == REDIS_REPL_ONLINE) &&
2417 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2418 sendReplyToClient, c) == AE_ERR) return;
2419
2420 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2421 obj = dupStringObject(obj);
2422 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2423 }
2424 listAddNodeTail(c->reply,getDecodedObject(obj));
2425 }
2426
2427 static void addReplySds(redisClient *c, sds s) {
2428 robj *o = createObject(REDIS_STRING,s);
2429 addReply(c,o);
2430 decrRefCount(o);
2431 }
2432
2433 static void addReplyDouble(redisClient *c, double d) {
2434 char buf[128];
2435
2436 snprintf(buf,sizeof(buf),"%.17g",d);
2437 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2438 (unsigned long) strlen(buf),buf));
2439 }
2440
2441 static void addReplyLong(redisClient *c, long l) {
2442 char buf[128];
2443 size_t len;
2444
2445 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2446 addReplySds(c,sdsnewlen(buf,len));
2447 }
2448
2449 static void addReplyBulkLen(redisClient *c, robj *obj) {
2450 size_t len;
2451
2452 if (obj->encoding == REDIS_ENCODING_RAW) {
2453 len = sdslen(obj->ptr);
2454 } else {
2455 long n = (long)obj->ptr;
2456
2457 /* Compute how many bytes will take this integer as a radix 10 string */
2458 len = 1;
2459 if (n < 0) {
2460 len++;
2461 n = -n;
2462 }
2463 while((n = n/10) != 0) {
2464 len++;
2465 }
2466 }
2467 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2468 }
2469
2470 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2471 int cport, cfd;
2472 char cip[128];
2473 redisClient *c;
2474 REDIS_NOTUSED(el);
2475 REDIS_NOTUSED(mask);
2476 REDIS_NOTUSED(privdata);
2477
2478 cfd = anetAccept(server.neterr, fd, cip, &cport);
2479 if (cfd == AE_ERR) {
2480 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2481 return;
2482 }
2483 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2484 if ((c = createClient(cfd)) == NULL) {
2485 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2486 close(cfd); /* May be already closed, just ingore errors */
2487 return;
2488 }
2489 /* If maxclient directive is set and this is one client more... close the
2490 * connection. Note that we create the client instead to check before
2491 * for this condition, since now the socket is already set in nonblocking
2492 * mode and we can send an error for free using the Kernel I/O */
2493 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2494 char *err = "-ERR max number of clients reached\r\n";
2495
2496 /* That's a best effort error message, don't check write errors */
2497 if (write(c->fd,err,strlen(err)) == -1) {
2498 /* Nothing to do, Just to avoid the warning... */
2499 }
2500 freeClient(c);
2501 return;
2502 }
2503 server.stat_numconnections++;
2504 }
2505
2506 /* ======================= Redis objects implementation ===================== */
2507
2508 static robj *createObject(int type, void *ptr) {
2509 robj *o;
2510
2511 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2512 if (listLength(server.objfreelist)) {
2513 listNode *head = listFirst(server.objfreelist);
2514 o = listNodeValue(head);
2515 listDelNode(server.objfreelist,head);
2516 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2517 } else {
2518 if (server.vm_enabled) {
2519 pthread_mutex_unlock(&server.obj_freelist_mutex);
2520 o = zmalloc(sizeof(*o));
2521 } else {
2522 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2523 }
2524 }
2525 o->type = type;
2526 o->encoding = REDIS_ENCODING_RAW;
2527 o->ptr = ptr;
2528 o->refcount = 1;
2529 if (server.vm_enabled) {
2530 /* Note that this code may run in the context of an I/O thread
2531 * and accessing to server.unixtime in theory is an error
2532 * (no locks). But in practice this is safe, and even if we read
2533 * garbage Redis will not fail, as it's just a statistical info */
2534 o->vm.atime = server.unixtime;
2535 o->storage = REDIS_VM_MEMORY;
2536 }
2537 return o;
2538 }
2539
2540 static robj *createStringObject(char *ptr, size_t len) {
2541 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2542 }
2543
2544 static robj *dupStringObject(robj *o) {
2545 assert(o->encoding == REDIS_ENCODING_RAW);
2546 return createStringObject(o->ptr,sdslen(o->ptr));
2547 }
2548
2549 static robj *createListObject(void) {
2550 list *l = listCreate();
2551
2552 listSetFreeMethod(l,decrRefCount);
2553 return createObject(REDIS_LIST,l);
2554 }
2555
2556 static robj *createSetObject(void) {
2557 dict *d = dictCreate(&setDictType,NULL);
2558 return createObject(REDIS_SET,d);
2559 }
2560
2561 static robj *createHashObject(void) {
2562 /* All the Hashes start as zipmaps. Will be automatically converted
2563 * into hash tables if there are enough elements or big elements
2564 * inside. */
2565 unsigned char *zm = zipmapNew();
2566 robj *o = createObject(REDIS_HASH,zm);
2567 o->encoding = REDIS_ENCODING_ZIPMAP;
2568 return o;
2569 }
2570
2571 static robj *createZsetObject(void) {
2572 zset *zs = zmalloc(sizeof(*zs));
2573
2574 zs->dict = dictCreate(&zsetDictType,NULL);
2575 zs->zsl = zslCreate();
2576 return createObject(REDIS_ZSET,zs);
2577 }
2578
2579 static void freeStringObject(robj *o) {
2580 if (o->encoding == REDIS_ENCODING_RAW) {
2581 sdsfree(o->ptr);
2582 }
2583 }
2584
2585 static void freeListObject(robj *o) {
2586 listRelease((list*) o->ptr);
2587 }
2588
2589 static void freeSetObject(robj *o) {
2590 dictRelease((dict*) o->ptr);
2591 }
2592
2593 static void freeZsetObject(robj *o) {
2594 zset *zs = o->ptr;
2595
2596 dictRelease(zs->dict);
2597 zslFree(zs->zsl);
2598 zfree(zs);
2599 }
2600
2601 static void freeHashObject(robj *o) {
2602 dictRelease((dict*) o->ptr);
2603 }
2604
2605 static void incrRefCount(robj *o) {
2606 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2607 o->refcount++;
2608 }
2609
2610 static void decrRefCount(void *obj) {
2611 robj *o = obj;
2612
2613 /* Object is a key of a swapped out value, or in the process of being
2614 * loaded. */
2615 if (server.vm_enabled &&
2616 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2617 {
2618 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2619 redisAssert(o->refcount == 1);
2620 }
2621 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2622 redisAssert(o->type == REDIS_STRING);
2623 freeStringObject(o);
2624 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2625 pthread_mutex_lock(&server.obj_freelist_mutex);
2626 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2627 !listAddNodeHead(server.objfreelist,o))
2628 zfree(o);
2629 pthread_mutex_unlock(&server.obj_freelist_mutex);
2630 server.vm_stats_swapped_objects--;
2631 return;
2632 }
2633 /* Object is in memory, or in the process of being swapped out. */
2634 if (--(o->refcount) == 0) {
2635 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2636 vmCancelThreadedIOJob(obj);
2637 switch(o->type) {
2638 case REDIS_STRING: freeStringObject(o); break;
2639 case REDIS_LIST: freeListObject(o); break;
2640 case REDIS_SET: freeSetObject(o); break;
2641 case REDIS_ZSET: freeZsetObject(o); break;
2642 case REDIS_HASH: freeHashObject(o); break;
2643 default: redisAssert(0 != 0); break;
2644 }
2645 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2646 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2647 !listAddNodeHead(server.objfreelist,o))
2648 zfree(o);
2649 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2650 }
2651 }
2652
2653 static robj *lookupKey(redisDb *db, robj *key) {
2654 dictEntry *de = dictFind(db->dict,key);
2655 if (de) {
2656 robj *key = dictGetEntryKey(de);
2657 robj *val = dictGetEntryVal(de);
2658
2659 if (server.vm_enabled) {
2660 if (key->storage == REDIS_VM_MEMORY ||
2661 key->storage == REDIS_VM_SWAPPING)
2662 {
2663 /* If we were swapping the object out, stop it, this key
2664 * was requested. */
2665 if (key->storage == REDIS_VM_SWAPPING)
2666 vmCancelThreadedIOJob(key);
2667 /* Update the access time of the key for the aging algorithm. */
2668 key->vm.atime = server.unixtime;
2669 } else {
2670 int notify = (key->storage == REDIS_VM_LOADING);
2671
2672 /* Our value was swapped on disk. Bring it at home. */
2673 redisAssert(val == NULL);
2674 val = vmLoadObject(key);
2675 dictGetEntryVal(de) = val;
2676
2677 /* Clients blocked by the VM subsystem may be waiting for
2678 * this key... */
2679 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2680 }
2681 }
2682 return val;
2683 } else {
2684 return NULL;
2685 }
2686 }
2687
2688 static robj *lookupKeyRead(redisDb *db, robj *key) {
2689 expireIfNeeded(db,key);
2690 return lookupKey(db,key);
2691 }
2692
2693 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2694 deleteIfVolatile(db,key);
2695 return lookupKey(db,key);
2696 }
2697
2698 static int deleteKey(redisDb *db, robj *key) {
2699 int retval;
2700
2701 /* We need to protect key from destruction: after the first dictDelete()
2702 * it may happen that 'key' is no longer valid if we don't increment
2703 * it's count. This may happen when we get the object reference directly
2704 * from the hash table with dictRandomKey() or dict iterators */
2705 incrRefCount(key);
2706 if (dictSize(db->expires)) dictDelete(db->expires,key);
2707 retval = dictDelete(db->dict,key);
2708 decrRefCount(key);
2709
2710 return retval == DICT_OK;
2711 }
2712
2713 /* Try to share an object against the shared objects pool */
2714 static robj *tryObjectSharing(robj *o) {
2715 struct dictEntry *de;
2716 unsigned long c;
2717
2718 if (o == NULL || server.shareobjects == 0) return o;
2719
2720 redisAssert(o->type == REDIS_STRING);
2721 de = dictFind(server.sharingpool,o);
2722 if (de) {
2723 robj *shared = dictGetEntryKey(de);
2724
2725 c = ((unsigned long) dictGetEntryVal(de))+1;
2726 dictGetEntryVal(de) = (void*) c;
2727 incrRefCount(shared);
2728 decrRefCount(o);
2729 return shared;
2730 } else {
2731 /* Here we are using a stream algorihtm: Every time an object is
2732 * shared we increment its count, everytime there is a miss we
2733 * recrement the counter of a random object. If this object reaches
2734 * zero we remove the object and put the current object instead. */
2735 if (dictSize(server.sharingpool) >=
2736 server.sharingpoolsize) {
2737 de = dictGetRandomKey(server.sharingpool);
2738 redisAssert(de != NULL);
2739 c = ((unsigned long) dictGetEntryVal(de))-1;
2740 dictGetEntryVal(de) = (void*) c;
2741 if (c == 0) {
2742 dictDelete(server.sharingpool,de->key);
2743 }
2744 } else {
2745 c = 0; /* If the pool is empty we want to add this object */
2746 }
2747 if (c == 0) {
2748 int retval;
2749
2750 retval = dictAdd(server.sharingpool,o,(void*)1);
2751 redisAssert(retval == DICT_OK);
2752 incrRefCount(o);
2753 }
2754 return o;
2755 }
2756 }
2757
2758 /* Check if the nul-terminated string 's' can be represented by a long
2759 * (that is, is a number that fits into long without any other space or
2760 * character before or after the digits).
2761 *
2762 * If so, the function returns REDIS_OK and *longval is set to the value
2763 * of the number. Otherwise REDIS_ERR is returned */
2764 static int isStringRepresentableAsLong(sds s, long *longval) {
2765 char buf[32], *endptr;
2766 long value;
2767 int slen;
2768
2769 value = strtol(s, &endptr, 10);
2770 if (endptr[0] != '\0') return REDIS_ERR;
2771 slen = snprintf(buf,32,"%ld",value);
2772
2773 /* If the number converted back into a string is not identical
2774 * then it's not possible to encode the string as integer */
2775 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2776 if (longval) *longval = value;
2777 return REDIS_OK;
2778 }
2779
2780 /* Try to encode a string object in order to save space */
2781 static int tryObjectEncoding(robj *o) {
2782 long value;
2783 sds s = o->ptr;
2784
2785 if (o->encoding != REDIS_ENCODING_RAW)
2786 return REDIS_ERR; /* Already encoded */
2787
2788 /* It's not save to encode shared objects: shared objects can be shared
2789 * everywhere in the "object space" of Redis. Encoded objects can only
2790 * appear as "values" (and not, for instance, as keys) */
2791 if (o->refcount > 1) return REDIS_ERR;
2792
2793 /* Currently we try to encode only strings */
2794 redisAssert(o->type == REDIS_STRING);
2795
2796 /* Check if we can represent this string as a long integer */
2797 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2798
2799 /* Ok, this object can be encoded */
2800 o->encoding = REDIS_ENCODING_INT;
2801 sdsfree(o->ptr);
2802 o->ptr = (void*) value;
2803 return REDIS_OK;
2804 }
2805
2806 /* Get a decoded version of an encoded object (returned as a new object).
2807 * If the object is already raw-encoded just increment the ref count. */
2808 static robj *getDecodedObject(robj *o) {
2809 robj *dec;
2810
2811 if (o->encoding == REDIS_ENCODING_RAW) {
2812 incrRefCount(o);
2813 return o;
2814 }
2815 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2816 char buf[32];
2817
2818 snprintf(buf,32,"%ld",(long)o->ptr);
2819 dec = createStringObject(buf,strlen(buf));
2820 return dec;
2821 } else {
2822 redisAssert(1 != 1);
2823 }
2824 }
2825
2826 /* Compare two string objects via strcmp() or alike.
2827 * Note that the objects may be integer-encoded. In such a case we
2828 * use snprintf() to get a string representation of the numbers on the stack
2829 * and compare the strings, it's much faster than calling getDecodedObject().
2830 *
2831 * Important note: if objects are not integer encoded, but binary-safe strings,
2832 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2833 * binary safe. */
2834 static int compareStringObjects(robj *a, robj *b) {
2835 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2836 char bufa[128], bufb[128], *astr, *bstr;
2837 int bothsds = 1;
2838
2839 if (a == b) return 0;
2840 if (a->encoding != REDIS_ENCODING_RAW) {
2841 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2842 astr = bufa;
2843 bothsds = 0;
2844 } else {
2845 astr = a->ptr;
2846 }
2847 if (b->encoding != REDIS_ENCODING_RAW) {
2848 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2849 bstr = bufb;
2850 bothsds = 0;
2851 } else {
2852 bstr = b->ptr;
2853 }
2854 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2855 }
2856
2857 static size_t stringObjectLen(robj *o) {
2858 redisAssert(o->type == REDIS_STRING);
2859 if (o->encoding == REDIS_ENCODING_RAW) {
2860 return sdslen(o->ptr);
2861 } else {
2862 char buf[32];
2863
2864 return snprintf(buf,32,"%ld",(long)o->ptr);
2865 }
2866 }
2867
2868 /*============================ RDB saving/loading =========================== */
2869
/* Write the one-byte 'type' to 'fp'. Returns 0 on success, -1 if the byte
 * could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,sizeof(type),1,fp);

    return (written == 0) ? -1 : 0;
}
2874
/* Write 't' to 'fp' as a 4 byte signed integer in host byte order (the
 * value is narrowed to int32_t first). Returns 0 on success, -1 on write
 * failure. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    if (fwrite(&seconds,sizeof(seconds),1,fp) != 1) return -1;
    return 0;
}
2880
2881 /* check rdbLoadLen() comments for more info */
2882 static int rdbSaveLen(FILE *fp, uint32_t len) {
2883 unsigned char buf[2];
2884
2885 if (len < (1<<6)) {
2886 /* Save a 6 bit len */
2887 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2888 if (fwrite(buf,1,1,fp) == 0) return -1;
2889 } else if (len < (1<<14)) {
2890 /* Save a 14 bit len */
2891 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2892 buf[1] = len&0xFF;
2893 if (fwrite(buf,2,1,fp) == 0) return -1;
2894 } else {
2895 /* Save a 32 bit len */
2896 buf[0] = (REDIS_RDB_32BITLEN<<6);
2897 if (fwrite(buf,1,1,fp) == 0) return -1;
2898 len = htonl(len);
2899 if (fwrite(&len,4,1,fp) == 0) return -1;
2900 }
2901 return 0;
2902 }
2903
2904 /* String objects in the form "2391" "-100" without any space and with a
2905 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2906 * encoded as integers to save space */
2907 static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
2908 long long value;
2909 char *endptr, buf[32];
2910
2911 /* Check if it's possible to encode this value as a number */
2912 value = strtoll(s, &endptr, 10);
2913 if (endptr[0] != '\0') return 0;
2914 snprintf(buf,32,"%lld",value);
2915
2916 /* If the number converted back into a string is not identical
2917 * then it's not possible to encode the string as integer */
2918 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2919
2920 /* Finally check if it fits in our ranges */
2921 if (value >= -(1<<7) && value <= (1<<7)-1) {
2922 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2923 enc[1] = value&0xFF;
2924 return 2;
2925 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2926 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2927 enc[1] = value&0xFF;
2928 enc[2] = (value>>8)&0xFF;
2929 return 3;
2930 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2931 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2932 enc[1] = value&0xFF;
2933 enc[2] = (value>>8)&0xFF;
2934 enc[3] = (value>>16)&0xFF;
2935 enc[4] = (value>>24)&0xFF;
2936 return 5;
2937 } else {
2938 return 0;
2939 }
2940 }
2941
2942 static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2943 unsigned int comprlen, outlen;
2944 unsigned char byte;
2945 void *out;
2946
2947 /* We require at least four bytes compression for this to be worth it */
2948 outlen = sdslen(obj->ptr)-4;
2949 if (outlen <= 0) return 0;
2950 if ((out = zmalloc(outlen+1)) == NULL) return 0;
2951 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2952 if (comprlen == 0) {
2953 zfree(out);
2954 return 0;
2955 }
2956 /* Data compressed! Let's save it on disk */
2957 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2958 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2959 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2960 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2961 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
2962 zfree(out);
2963 return comprlen;
2964
2965 writeerr:
2966 zfree(out);
2967 return -1;
2968 }
2969
2970 /* Save a string objet as [len][data] on disk. If the object is a string
2971 * representation of an integer value we try to safe it in a special form */
2972 static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2973 size_t len;
2974 int enclen;
2975
2976 len = sdslen(obj->ptr);
2977
2978 /* Try integer encoding */
2979 if (len <= 11) {
2980 unsigned char buf[5];
2981 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2982 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2983 return 0;
2984 }
2985 }
2986
2987 /* Try LZF compression - under 20 bytes it's unable to compress even
2988 * aaaaaaaaaaaaaaaaaa so skip it */
2989 if (server.rdbcompression && len > 20) {
2990 int retval;
2991
2992 retval = rdbSaveLzfStringObject(fp,obj);
2993 if (retval == -1) return -1;
2994 if (retval > 0) return 0;
2995 /* retval == 0 means data can't be compressed, save the old way */
2996 }
2997
2998 /* Store verbatim */
2999 if (rdbSaveLen(fp,len) == -1) return -1;
3000 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
3001 return 0;
3002 }
3003
3004 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3005 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3006 int retval;
3007
3008 /* Avoid incr/decr ref count business when possible.
3009 * This plays well with copy-on-write given that we are probably
3010 * in a child process (BGSAVE). Also this makes sure key objects
3011 * of swapped objects are not incRefCount-ed (an assert does not allow
3012 * this in order to avoid bugs) */
3013 if (obj->encoding != REDIS_ENCODING_RAW) {
3014 obj = getDecodedObject(obj);
3015 retval = rdbSaveStringObjectRaw(fp,obj);
3016 decrRefCount(obj);
3017 } else {
3018 retval = rdbSaveStringObjectRaw(fp,obj);
3019 }
3020 return retval;
3021 }
3022
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation ("%.17g").
 * This 8 bit integer has special values, written alone, for the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3049
3050 /* Save a Redis object. */
3051 static int rdbSaveObject(FILE *fp, robj *o) {
3052 if (o->type == REDIS_STRING) {
3053 /* Save a string value */
3054 if (rdbSaveStringObject(fp,o) == -1) return -1;
3055 } else if (o->type == REDIS_LIST) {
3056 /* Save a list value */
3057 list *list = o->ptr;
3058 listIter li;
3059 listNode *ln;
3060
3061 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3062 listRewind(list,&li);
3063 while((ln = listNext(&li))) {
3064 robj *eleobj = listNodeValue(ln);
3065
3066 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3067 }
3068 } else if (o->type == REDIS_SET) {
3069 /* Save a set value */
3070 dict *set = o->ptr;
3071 dictIterator *di = dictGetIterator(set);
3072 dictEntry *de;
3073
3074 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3075 while((de = dictNext(di)) != NULL) {
3076 robj *eleobj = dictGetEntryKey(de);
3077
3078 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3079 }
3080 dictReleaseIterator(di);
3081 } else if (o->type == REDIS_ZSET) {
3082 /* Save a set value */
3083 zset *zs = o->ptr;
3084 dictIterator *di = dictGetIterator(zs->dict);
3085 dictEntry *de;
3086
3087 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3088 while((de = dictNext(di)) != NULL) {
3089 robj *eleobj = dictGetEntryKey(de);
3090 double *score = dictGetEntryVal(de);
3091
3092 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3093 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3094 }
3095 dictReleaseIterator(di);
3096 } else {
3097 redisAssert(0 != 0);
3098 }
3099 return 0;
3100 }
3101
3102 /* Return the length the object will have on disk if saved with
3103 * the rdbSaveObject() function. Currently we use a trick to get
3104 * this length with very little changes to the code. In the future
3105 * we could switch to a faster solution. */
3106 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3107 if (fp == NULL) fp = server.devnull;
3108 rewind(fp);
3109 assert(rdbSaveObject(fp,o) != 1);
3110 return ftello(fp);
3111 }
3112
3113 /* Return the number of pages required to save this object in the swap file */
3114 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3115 off_t bytes = rdbSavedObjectLen(o,fp);
3116
3117 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3118 }
3119
3120 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3121 static int rdbSave(char *filename) {
3122 dictIterator *di = NULL;
3123 dictEntry *de;
3124 FILE *fp;
3125 char tmpfile[256];
3126 int j;
3127 time_t now = time(NULL);
3128
3129 /* Wait for I/O therads to terminate, just in case this is a
3130 * foreground-saving, to avoid seeking the swap file descriptor at the
3131 * same time. */
3132 if (server.vm_enabled)
3133 waitEmptyIOJobsQueue();
3134
3135 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3136 fp = fopen(tmpfile,"w");
3137 if (!fp) {
3138 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3139 return REDIS_ERR;
3140 }
3141 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3142 for (j = 0; j < server.dbnum; j++) {
3143 redisDb *db = server.db+j;
3144 dict *d = db->dict;
3145 if (dictSize(d) == 0) continue;
3146 di = dictGetIterator(d);
3147 if (!di) {
3148 fclose(fp);
3149 return REDIS_ERR;
3150 }
3151
3152 /* Write the SELECT DB opcode */
3153 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3154 if (rdbSaveLen(fp,j) == -1) goto werr;
3155
3156 /* Iterate this DB writing every entry */
3157 while((de = dictNext(di)) != NULL) {
3158 robj *key = dictGetEntryKey(de);
3159 robj *o = dictGetEntryVal(de);
3160 time_t expiretime = getExpire(db,key);
3161
3162 /* Save the expire time */
3163 if (expiretime != -1) {
3164 /* If this key is already expired skip it */
3165 if (expiretime < now) continue;
3166 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3167 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3168 }
3169 /* Save the key and associated value. This requires special
3170 * handling if the value is swapped out. */
3171 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3172 key->storage == REDIS_VM_SWAPPING) {
3173 /* Save type, key, value */
3174 if (rdbSaveType(fp,o->type) == -1) goto werr;
3175 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3176 if (rdbSaveObject(fp,o) == -1) goto werr;
3177 } else {
3178 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3179 robj *po;
3180 /* Get a preview of the object in memory */
3181 po = vmPreviewObject(key);
3182 /* Save type, key, value */
3183 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3184 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3185 if (rdbSaveObject(fp,po) == -1) goto werr;
3186 /* Remove the loaded object from memory */
3187 decrRefCount(po);
3188 }
3189 }
3190 dictReleaseIterator(di);
3191 }
3192 /* EOF opcode */
3193 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3194
3195 /* Make sure data will not remain on the OS's output buffers */
3196 fflush(fp);
3197 fsync(fileno(fp));
3198 fclose(fp);
3199
3200 /* Use RENAME to make sure the DB file is changed atomically only
3201 * if the generate DB file is ok. */
3202 if (rename(tmpfile,filename) == -1) {
3203 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3204 unlink(tmpfile);
3205 return REDIS_ERR;
3206 }
3207 redisLog(REDIS_NOTICE,"DB saved on disk");
3208 server.dirty = 0;
3209 server.lastsave = time(NULL);
3210 return REDIS_OK;
3211
3212 werr:
3213 fclose(fp);
3214 unlink(tmpfile);
3215 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3216 if (di) dictReleaseIterator(di);
3217 return REDIS_ERR;
3218 }
3219
3220 static int rdbSaveBackground(char *filename) {
3221 pid_t childpid;
3222
3223 if (server.bgsavechildpid != -1) return REDIS_ERR;
3224 if (server.vm_enabled) waitEmptyIOJobsQueue();
3225 if ((childpid = fork()) == 0) {
3226 /* Child */
3227 if (server.vm_enabled) vmReopenSwapFile();
3228 close(server.fd);
3229 if (rdbSave(filename) == REDIS_OK) {
3230 _exit(0);
3231 } else {
3232 _exit(1);
3233 }
3234 } else {
3235 /* Parent */
3236 if (childpid == -1) {
3237 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3238 strerror(errno));
3239 return REDIS_ERR;
3240 }
3241 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3242 server.bgsavechildpid = childpid;
3243 return REDIS_OK;
3244 }
3245 return REDIS_OK; /* unreached */
3246 }
3247
/* Remove the temp file "temp-<childpid>.rdb" from the current directory.
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3254
/* Read a one-byte type from 'fp'. Returns the byte value (0-255), or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3260
/* Read a 4 byte signed integer (host byte order) from 'fp' and return it as
 * a time_t. Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    if (fread(&stored,sizeof(stored),1,fp) != 1) return -1;
    return (time_t) stored;
}
3266
3267 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3268 * of this file for a description of how this are stored on disk.
3269 *
3270 * isencoded is set to 1 if the readed length is not actually a length but
3271 * an "encoding type", check the above comments for more info */
3272 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3273 unsigned char buf[2];
3274 uint32_t len;
3275 int type;
3276
3277 if (isencoded) *isencoded = 0;
3278 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3279 type = (buf[0]&0xC0)>>6;
3280 if (type == REDIS_RDB_6BITLEN) {
3281 /* Read a 6 bit len */
3282 return buf[0]&0x3F;
3283 } else if (type == REDIS_RDB_ENCVAL) {
3284 /* Read a 6 bit len encoding type */
3285 if (isencoded) *isencoded = 1;
3286 return buf[0]&0x3F;
3287 } else if (type == REDIS_RDB_14BITLEN) {
3288 /* Read a 14 bit len */
3289 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3290 return ((buf[0]&0x3F)<<8)|buf[1];
3291 } else {
3292 /* Read a 32 bit len */
3293 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3294 return ntohl(len);
3295 }
3296 }
3297
3298 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3299 unsigned char enc[4];
3300 long long val;
3301
3302 if (enctype == REDIS_RDB_ENC_INT8) {
3303 if (fread(enc,1,1,fp) == 0) return NULL;
3304 val = (signed char)enc[0];
3305 } else if (enctype == REDIS_RDB_ENC_INT16) {
3306 uint16_t v;
3307 if (fread(enc,2,1,fp) == 0) return NULL;
3308 v = enc[0]|(enc[1]<<8);
3309 val = (int16_t)v;
3310 } else if (enctype == REDIS_RDB_ENC_INT32) {
3311 uint32_t v;
3312 if (fread(enc,4,1,fp) == 0) return NULL;
3313 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3314 val = (int32_t)v;
3315 } else {
3316 val = 0; /* anti-warning */
3317 redisAssert(0!=0);
3318 }
3319 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3320 }
3321
3322 static robj *rdbLoadLzfStringObject(FILE*fp) {
3323 unsigned int len, clen;
3324 unsigned char *c = NULL;
3325 sds val = NULL;
3326
3327 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3328 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3329 if ((c = zmalloc(clen)) == NULL) goto err;
3330 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3331 if (fread(c,clen,1,fp) == 0) goto err;
3332 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3333 zfree(c);
3334 return createObject(REDIS_STRING,val);
3335 err:
3336 zfree(c);
3337 sdsfree(val);
3338 return NULL;
3339 }
3340
3341 static robj *rdbLoadStringObject(FILE*fp) {
3342 int isencoded;
3343 uint32_t len;
3344 sds val;
3345
3346 len = rdbLoadLen(fp,&isencoded);
3347 if (isencoded) {
3348 switch(len) {
3349 case REDIS_RDB_ENC_INT8:
3350 case REDIS_RDB_ENC_INT16:
3351 case REDIS_RDB_ENC_INT32:
3352 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3353 case REDIS_RDB_ENC_LZF:
3354 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3355 default:
3356 redisAssert(0!=0);
3357 }
3358 }
3359
3360 if (len == REDIS_RDB_LENERR) return NULL;
3361 val = sdsnewlen(NULL,len);
3362 if (len && fread(val,len,1,fp) == 0) {
3363 sdsfree(val);
3364 return NULL;
3365 }
3366 return tryObjectSharing(createObject(REDIS_STRING,val));
3367 }
3368
3369 /* For information about double serialization check rdbSaveDoubleValue() */
3370 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3371 char buf[128];
3372 unsigned char len;
3373
3374 if (fread(&len,1,1,fp) == 0) return -1;
3375 switch(len) {
3376 case 255: *val = R_NegInf; return 0;
3377 case 254: *val = R_PosInf; return 0;
3378 case 253: *val = R_Nan; return 0;
3379 default:
3380 if (fread(buf,len,1,fp) == 0) return -1;
3381 buf[len] = '\0';
3382 sscanf(buf, "%lg", val);
3383 return 0;
3384 }
3385 }
3386
3387 /* Load a Redis object of the specified type from the specified file.
3388 * On success a newly allocated object is returned, otherwise NULL. */
3389 static robj *rdbLoadObject(int type, FILE *fp) {
3390 robj *o;
3391
3392 if (type == REDIS_STRING) {
3393 /* Read string value */
3394 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3395 tryObjectEncoding(o);
3396 } else if (type == REDIS_LIST || type == REDIS_SET) {
3397 /* Read list/set value */
3398 uint32_t listlen;
3399
3400 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3401 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3402 /* It's faster to expand the dict to the right size asap in order
3403 * to avoid rehashing */
3404 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3405 dictExpand(o->ptr,listlen);
3406 /* Load every single element of the list/set */
3407 while(listlen--) {
3408 robj *ele;
3409
3410 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3411 tryObjectEncoding(ele);
3412 if (type == REDIS_LIST) {
3413 listAddNodeTail((list*)o->ptr,ele);
3414 } else {
3415 dictAdd((dict*)o->ptr,ele,NULL);
3416 }
3417 }
3418 } else if (type == REDIS_ZSET) {
3419 /* Read list/set value */
3420 uint32_t zsetlen;
3421 zset *zs;
3422
3423 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3424 o = createZsetObject();
3425 zs = o->ptr;
3426 /* Load every single element of the list/set */
3427 while(zsetlen--) {
3428 robj *ele;
3429 double *score = zmalloc(sizeof(double));
3430
3431 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3432 tryObjectEncoding(ele);
3433 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3434 dictAdd(zs->dict,ele,score);
3435 zslInsert(zs->zsl,*score,ele);
3436 incrRefCount(ele); /* added to skiplist */
3437 }
3438 } else {
3439 redisAssert(0 != 0);
3440 }
3441 return o;
3442 }
3443
3444 static int rdbLoad(char *filename) {
3445 FILE *fp;
3446 robj *keyobj = NULL;
3447 uint32_t dbid;
3448 int type, retval, rdbver;
3449 dict *d = server.db[0].dict;
3450 redisDb *db = server.db+0;
3451 char buf[1024];
3452 time_t expiretime = -1, now = time(NULL);
3453 long long loadedkeys = 0;
3454
3455 fp = fopen(filename,"r");
3456 if (!fp) return REDIS_ERR;
3457 if (fread(buf,9,1,fp) == 0) goto eoferr;
3458 buf[9] = '\0';
3459 if (memcmp(buf,"REDIS",5) != 0) {
3460 fclose(fp);
3461 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3462 return REDIS_ERR;
3463 }
3464 rdbver = atoi(buf+5);
3465 if (rdbver != 1) {
3466 fclose(fp);
3467 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3468 return REDIS_ERR;
3469 }
3470 while(1) {
3471 robj *o;
3472
3473 /* Read type. */
3474 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3475 if (type == REDIS_EXPIRETIME) {
3476 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3477 /* We read the time so we need to read the object type again */
3478 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3479 }
3480 if (type == REDIS_EOF) break;
3481 /* Handle SELECT DB opcode as a special case */
3482 if (type == REDIS_SELECTDB) {
3483 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3484 goto eoferr;
3485 if (dbid >= (unsigned)server.dbnum) {
3486 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3487 exit(1);
3488 }
3489 db = server.db+dbid;
3490 d = db->dict;
3491 continue;
3492 }
3493 /* Read key */
3494 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3495 /* Read value */
3496 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3497 /* Add the new object in the hash table */
3498 retval = dictAdd(d,keyobj,o);
3499 if (retval == DICT_ERR) {
3500 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3501 exit(1);
3502 }
3503 /* Set the expire time if needed */
3504 if (expiretime != -1) {
3505 setExpire(db,keyobj,expiretime);
3506 /* Delete this key if already expired */
3507 if (expiretime < now) deleteKey(db,keyobj);
3508 expiretime = -1;
3509 }
3510 keyobj = o = NULL;
3511 /* Handle swapping while loading big datasets when VM is on */
3512 loadedkeys++;
3513 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3514 while (zmalloc_used_memory() > server.vm_max_memory) {
3515 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3516 }
3517 }
3518 }
3519 fclose(fp);
3520 return REDIS_OK;
3521
3522 eoferr: /* unexpected end of file is handled here with a fatal exit */
3523 if (keyobj) decrRefCount(keyobj);
3524 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3525 exit(1);
3526 return REDIS_ERR; /* Just to avoid warning */
3527 }
3528
3529 /*================================== Commands =============================== */
3530
3531 static void authCommand(redisClient *c) {
3532 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3533 c->authenticated = 1;
3534 addReply(c,shared.ok);
3535 } else {
3536 c->authenticated = 0;
3537 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3538 }
3539 }
3540
3541 static void pingCommand(redisClient *c) {
3542 addReply(c,shared.pong);
3543 }
3544
3545 static void echoCommand(redisClient *c) {
3546 addReplyBulkLen(c,c->argv[1]);
3547 addReply(c,c->argv[1]);
3548 addReply(c,shared.crlf);
3549 }
3550
3551 /*=================================== Strings =============================== */
3552
3553 static void setGenericCommand(redisClient *c, int nx) {
3554 int retval;
3555
3556 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3557 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3558 if (retval == DICT_ERR) {
3559 if (!nx) {
3560 /* If the key is about a swapped value, we want a new key object
3561 * to overwrite the old. So we delete the old key in the database.
3562 * This will also make sure that swap pages about the old object
3563 * will be marked as free. */
3564 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3565 incrRefCount(c->argv[1]);
3566 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3567 incrRefCount(c->argv[2]);
3568 } else {
3569 addReply(c,shared.czero);
3570 return;
3571 }
3572 } else {
3573 incrRefCount(c->argv[1]);
3574 incrRefCount(c->argv[2]);
3575 }
3576 server.dirty++;
3577 removeExpire(c->db,c->argv[1]);
3578 addReply(c, nx ? shared.cone : shared.ok);
3579 }
3580
3581 static void setCommand(redisClient *c) {
3582 setGenericCommand(c,0);
3583 }
3584
3585 static void setnxCommand(redisClient *c) {
3586 setGenericCommand(c,1);
3587 }
3588
3589 static int getGenericCommand(redisClient *c) {
3590 robj *o = lookupKeyRead(c->db,c->argv[1]);
3591
3592 if (o == NULL) {
3593 addReply(c,shared.nullbulk);
3594 return REDIS_OK;
3595 } else {
3596 if (o->type != REDIS_STRING) {
3597 addReply(c,shared.wrongtypeerr);
3598 return REDIS_ERR;
3599 } else {
3600 addReplyBulkLen(c,o);
3601 addReply(c,o);
3602 addReply(c,shared.crlf);
3603 return REDIS_OK;
3604 }
3605 }
3606 }
3607
3608 static void getCommand(redisClient *c) {
3609 getGenericCommand(c);
3610 }
3611
3612 static void getsetCommand(redisClient *c) {
3613 if (getGenericCommand(c) == REDIS_ERR) return;
3614 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3615 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3616 } else {
3617 incrRefCount(c->argv[1]);
3618 }
3619 incrRefCount(c->argv[2]);
3620 server.dirty++;
3621 removeExpire(c->db,c->argv[1]);
3622 }
3623
3624 static void mgetCommand(redisClient *c) {
3625 int j;
3626
3627 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3628 for (j = 1; j < c->argc; j++) {
3629 robj *o = lookupKeyRead(c->db,c->argv[j]);
3630 if (o == NULL) {
3631 addReply(c,shared.nullbulk);
3632 } else {
3633 if (o->type != REDIS_STRING) {
3634 addReply(c,shared.nullbulk);
3635 } else {
3636 addReplyBulkLen(c,o);
3637 addReply(c,o);
3638 addReply(c,shared.crlf);
3639 }
3640 }
3641 }
3642 }
3643
3644 static void msetGenericCommand(redisClient *c, int nx) {
3645 int j, busykeys = 0;
3646
3647 if ((c->argc % 2) == 0) {
3648 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3649 return;
3650 }
3651 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3652 * set nothing at all if at least one already key exists. */
3653 if (nx) {
3654 for (j = 1; j < c->argc; j += 2) {
3655 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3656 busykeys++;
3657 }
3658 }
3659 }
3660 if (busykeys) {
3661 addReply(c, shared.czero);
3662 return;
3663 }
3664
3665 for (j = 1; j < c->argc; j += 2) {
3666 int retval;
3667
3668 tryObjectEncoding(c->argv[j+1]);
3669 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3670 if (retval == DICT_ERR) {
3671 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3672 incrRefCount(c->argv[j+1]);
3673 } else {
3674 incrRefCount(c->argv[j]);
3675 incrRefCount(c->argv[j+1]);
3676 }
3677 removeExpire(c->db,c->argv[j]);
3678 }
3679 server.dirty += (c->argc-1)/2;
3680 addReply(c, nx ? shared.cone : shared.ok);
3681 }
3682
3683 static void msetCommand(redisClient *c) {
3684 msetGenericCommand(c,0);
3685 }
3686
3687 static void msetnxCommand(redisClient *c) {
3688 msetGenericCommand(c,1);
3689 }
3690
3691 static void incrDecrCommand(redisClient *c, long long incr) {
3692 long long value;
3693 int retval;
3694 robj *o;
3695
3696 o = lookupKeyWrite(c->db,c->argv[1]);
3697 if (o == NULL) {
3698 value = 0;
3699 } else {
3700 if (o->type != REDIS_STRING) {
3701 value = 0;
3702 } else {
3703 char *eptr;
3704
3705 if (o->encoding == REDIS_ENCODING_RAW)
3706 value = strtoll(o->ptr, &eptr, 10);
3707 else if (o->encoding == REDIS_ENCODING_INT)
3708 value = (long)o->ptr;
3709 else
3710 redisAssert(1 != 1);
3711 }
3712 }
3713
3714 value += incr;
3715 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3716 tryObjectEncoding(o);
3717 retval = dictAdd(c->db->dict,c->argv[1],o);
3718 if (retval == DICT_ERR) {
3719 dictReplace(c->db->dict,c->argv[1],o);
3720 removeExpire(c->db,c->argv[1]);
3721 } else {
3722 incrRefCount(c->argv[1]);
3723 }
3724 server.dirty++;
3725 addReply(c,shared.colon);
3726 addReply(c,o);
3727 addReply(c,shared.crlf);
3728 }
3729
3730 static void incrCommand(redisClient *c) {
3731 incrDecrCommand(c,1);
3732 }
3733
3734 static void decrCommand(redisClient *c) {
3735 incrDecrCommand(c,-1);
3736 }
3737
3738 static void incrbyCommand(redisClient *c) {
3739 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3740 incrDecrCommand(c,incr);
3741 }
3742
3743 static void decrbyCommand(redisClient *c) {
3744 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3745 incrDecrCommand(c,-incr);
3746 }
3747
3748 static void appendCommand(redisClient *c) {
3749 int retval;
3750 size_t totlen;
3751 robj *o;
3752
3753 o = lookupKeyWrite(c->db,c->argv[1]);
3754 if (o == NULL) {
3755 /* Create the key */
3756 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3757 incrRefCount(c->argv[1]);
3758 incrRefCount(c->argv[2]);
3759 totlen = stringObjectLen(c->argv[2]);
3760 } else {
3761 dictEntry *de;
3762
3763 de = dictFind(c->db->dict,c->argv[1]);
3764 assert(de != NULL);
3765
3766 o = dictGetEntryVal(de);
3767 if (o->type != REDIS_STRING) {
3768 addReply(c,shared.wrongtypeerr);
3769 return;
3770 }
3771 /* If the object is specially encoded or shared we have to make
3772 * a copy */
3773 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3774 robj *decoded = getDecodedObject(o);
3775
3776 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3777 decrRefCount(decoded);
3778 dictReplace(c->db->dict,c->argv[1],o);
3779 }
3780 /* APPEND! */
3781 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3782 o->ptr = sdscatlen(o->ptr,
3783 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3784 } else {
3785 o->ptr = sdscatprintf(o->ptr, "%ld",
3786 (unsigned long) c->argv[2]->ptr);
3787 }
3788 totlen = sdslen(o->ptr);
3789 }
3790 server.dirty++;
3791 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3792 }
3793
3794 static void substrCommand(redisClient *c) {
3795 robj *o;
3796 long start = atoi(c->argv[2]->ptr);
3797 long end = atoi(c->argv[3]->ptr);
3798
3799 o = lookupKeyRead(c->db,c->argv[1]);
3800 if (o == NULL) {
3801 addReply(c,shared.nullbulk);
3802 } else {
3803 if (o->type != REDIS_STRING) {
3804 addReply(c,shared.wrongtypeerr);
3805 } else {
3806 size_t rangelen, strlen;
3807 sds range;
3808
3809 o = getDecodedObject(o);
3810 strlen = sdslen(o->ptr);
3811
3812 /* convert negative indexes */
3813 if (start < 0) start = strlen+start;
3814 if (end < 0) end = strlen+end;
3815 if (start < 0) start = 0;
3816 if (end < 0) end = 0;
3817
3818 /* indexes sanity checks */
3819 if (start > end || (size_t)start >= strlen) {
3820 /* Out of range start or start > end result in null reply */
3821 addReply(c,shared.nullbulk);
3822 decrRefCount(o);
3823 return;
3824 }
3825 if ((size_t)end >= strlen) end = strlen-1;
3826 rangelen = (end-start)+1;
3827
3828 /* Return the result */
3829 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",rangelen));
3830 range = sdsnewlen((char*)o->ptr+start,rangelen);
3831 addReplySds(c,range);
3832 addReply(c,shared.crlf);
3833 decrRefCount(o);
3834 }
3835 }
3836 }
3837
3838 /* ========================= Type agnostic commands ========================= */
3839
3840 static void delCommand(redisClient *c) {
3841 int deleted = 0, j;
3842
3843 for (j = 1; j < c->argc; j++) {
3844 if (deleteKey(c->db,c->argv[j])) {
3845 server.dirty++;
3846 deleted++;
3847 }
3848 }
3849 switch(deleted) {
3850 case 0:
3851 addReply(c,shared.czero);
3852 break;
3853 case 1:
3854 addReply(c,shared.cone);
3855 break;
3856 default:
3857 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3858 break;
3859 }
3860 }
3861
3862 static void existsCommand(redisClient *c) {
3863 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
3864 }
3865
3866 static void selectCommand(redisClient *c) {
3867 int id = atoi(c->argv[1]->ptr);
3868
3869 if (selectDb(c,id) == REDIS_ERR) {
3870 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
3871 } else {
3872 addReply(c,shared.ok);
3873 }
3874 }
3875
3876 static void randomkeyCommand(redisClient *c) {
3877 dictEntry *de;
3878
3879 while(1) {
3880 de = dictGetRandomKey(c->db->dict);
3881 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3882 }
3883 if (de == NULL) {
3884 addReply(c,shared.plus);
3885 addReply(c,shared.crlf);
3886 } else {
3887 addReply(c,shared.plus);
3888 addReply(c,dictGetEntryKey(de));
3889 addReply(c,shared.crlf);
3890 }
3891 }
3892
3893 static void keysCommand(redisClient *c) {
3894 dictIterator *di;
3895 dictEntry *de;
3896 sds pattern = c->argv[1]->ptr;
3897 int plen = sdslen(pattern);
3898 unsigned long numkeys = 0;
3899 robj *lenobj = createObject(REDIS_STRING,NULL);
3900
3901 di = dictGetIterator(c->db->dict);
3902 addReply(c,lenobj);
3903 decrRefCount(lenobj);
3904 while((de = dictNext(di)) != NULL) {
3905 robj *keyobj = dictGetEntryKey(de);
3906
3907 sds key = keyobj->ptr;
3908 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3909 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3910 if (expireIfNeeded(c->db,keyobj) == 0) {
3911 addReplyBulkLen(c,keyobj);
3912 addReply(c,keyobj);
3913 addReply(c,shared.crlf);
3914 numkeys++;
3915 }
3916 }
3917 }
3918 dictReleaseIterator(di);
3919 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
3920 }
3921
3922 static void dbsizeCommand(redisClient *c) {
3923 addReplySds(c,
3924 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
3925 }
3926
3927 static void lastsaveCommand(redisClient *c) {
3928 addReplySds(c,
3929 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
3930 }
3931
3932 static void typeCommand(redisClient *c) {
3933 robj *o;
3934 char *type;
3935
3936 o = lookupKeyRead(c->db,c->argv[1]);
3937 if (o == NULL) {
3938 type = "+none";
3939 } else {
3940 switch(o->type) {
3941 case REDIS_STRING: type = "+string"; break;
3942 case REDIS_LIST: type = "+list"; break;
3943 case REDIS_SET: type = "+set"; break;
3944 case REDIS_ZSET: type = "+zset"; break;
3945 default: type = "unknown"; break;
3946 }
3947 }
3948 addReplySds(c,sdsnew(type));
3949 addReply(c,shared.crlf);
3950 }
3951
3952 static void saveCommand(redisClient *c) {
3953 if (server.bgsavechildpid != -1) {
3954 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3955 return;
3956 }
3957 if (rdbSave(server.dbfilename) == REDIS_OK) {
3958 addReply(c,shared.ok);
3959 } else {
3960 addReply(c,shared.err);
3961 }
3962 }
3963
3964 static void bgsaveCommand(redisClient *c) {
3965 if (server.bgsavechildpid != -1) {
3966 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3967 return;
3968 }
3969 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
3970 char *status = "+Background saving started\r\n";
3971 addReplySds(c,sdsnew(status));
3972 } else {
3973 addReply(c,shared.err);
3974 }
3975 }
3976
3977 static void shutdownCommand(redisClient *c) {
3978 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
3979 /* Kill the saving child if there is a background saving in progress.
3980 We want to avoid race conditions, for instance our saving child may
3981 overwrite the synchronous saving did by SHUTDOWN. */
3982 if (server.bgsavechildpid != -1) {
3983 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3984 kill(server.bgsavechildpid,SIGKILL);
3985 rdbRemoveTempFile(server.bgsavechildpid);
3986 }
3987 if (server.appendonly) {
3988 /* Append only file: fsync() the AOF and exit */
3989 fsync(server.appendfd);
3990 if (server.vm_enabled) unlink(server.vm_swap_file);
3991 exit(0);
3992 } else {
3993 /* Snapshotting. Perform a SYNC SAVE and exit */
3994 if (rdbSave(server.dbfilename) == REDIS_OK) {
3995 if (server.daemonize)
3996 unlink(server.pidfile);
3997 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3998 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
3999 if (server.vm_enabled) unlink(server.vm_swap_file);
4000 exit(0);
4001 } else {
4002 /* Ooops.. error saving! The best we can do is to continue operating.
4003 * Note that if there was a background saving process, in the next
4004 * cron() Redis will be notified that the background saving aborted,
4005 * handling special stuff like slaves pending for synchronization... */
4006 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4007 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4008 }
4009 }
4010 }
4011
4012 static void renameGenericCommand(redisClient *c, int nx) {
4013 robj *o;
4014
4015 /* To use the same key as src and dst is probably an error */
4016 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4017 addReply(c,shared.sameobjecterr);
4018 return;
4019 }
4020
4021 o = lookupKeyWrite(c->db,c->argv[1]);
4022 if (o == NULL) {
4023 addReply(c,shared.nokeyerr);
4024 return;
4025 }
4026 incrRefCount(o);
4027 deleteIfVolatile(c->db,c->argv[2]);
4028 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4029 if (nx) {
4030 decrRefCount(o);
4031 addReply(c,shared.czero);
4032 return;
4033 }
4034 dictReplace(c->db->dict,c->argv[2],o);
4035 } else {
4036 incrRefCount(c->argv[2]);
4037 }
4038 deleteKey(c->db,c->argv[1]);
4039 server.dirty++;
4040 addReply(c,nx ? shared.cone : shared.ok);
4041 }
4042
4043 static void renameCommand(redisClient *c) {
4044 renameGenericCommand(c,0);
4045 }
4046
4047 static void renamenxCommand(redisClient *c) {
4048 renameGenericCommand(c,1);
4049 }
4050
4051 static void moveCommand(redisClient *c) {
4052 robj *o;
4053 redisDb *src, *dst;
4054 int srcid;
4055
4056 /* Obtain source and target DB pointers */
4057 src = c->db;
4058 srcid = c->db->id;
4059 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4060 addReply(c,shared.outofrangeerr);
4061 return;
4062 }
4063 dst = c->db;
4064 selectDb(c,srcid); /* Back to the source DB */
4065
4066 /* If the user is moving using as target the same
4067 * DB as the source DB it is probably an error. */
4068 if (src == dst) {
4069 addReply(c,shared.sameobjecterr);
4070 return;
4071 }
4072
4073 /* Check if the element exists and get a reference */
4074 o = lookupKeyWrite(c->db,c->argv[1]);
4075 if (!o) {
4076 addReply(c,shared.czero);
4077 return;
4078 }
4079
4080 /* Try to add the element to the target DB */
4081 deleteIfVolatile(dst,c->argv[1]);
4082 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4083 addReply(c,shared.czero);
4084 return;
4085 }
4086 incrRefCount(c->argv[1]);
4087 incrRefCount(o);
4088
4089 /* OK! key moved, free the entry in the source DB */
4090 deleteKey(src,c->argv[1]);
4091 server.dirty++;
4092 addReply(c,shared.cone);
4093 }
4094
4095 /* =================================== Lists ================================ */
4096 static void pushGenericCommand(redisClient *c, int where) {
4097 robj *lobj;
4098 list *list;
4099
4100 lobj = lookupKeyWrite(c->db,c->argv[1]);
4101 if (lobj == NULL) {
4102 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4103 addReply(c,shared.cone);
4104 return;
4105 }
4106 lobj = createListObject();
4107 list = lobj->ptr;
4108 if (where == REDIS_HEAD) {
4109 listAddNodeHead(list,c->argv[2]);
4110 } else {
4111 listAddNodeTail(list,c->argv[2]);
4112 }
4113 dictAdd(c->db->dict,c->argv[1],lobj);
4114 incrRefCount(c->argv[1]);
4115 incrRefCount(c->argv[2]);
4116 } else {
4117 if (lobj->type != REDIS_LIST) {
4118 addReply(c,shared.wrongtypeerr);
4119 return;
4120 }
4121 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4122 addReply(c,shared.cone);
4123 return;
4124 }
4125 list = lobj->ptr;
4126 if (where == REDIS_HEAD) {
4127 listAddNodeHead(list,c->argv[2]);
4128 } else {
4129 listAddNodeTail(list,c->argv[2]);
4130 }
4131 incrRefCount(c->argv[2]);
4132 }
4133 server.dirty++;
4134 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4135 }
4136
4137 static void lpushCommand(redisClient *c) {
4138 pushGenericCommand(c,REDIS_HEAD);
4139 }
4140
4141 static void rpushCommand(redisClient *c) {
4142 pushGenericCommand(c,REDIS_TAIL);
4143 }
4144
4145 static void llenCommand(redisClient *c) {
4146 robj *o;
4147 list *l;
4148
4149 o = lookupKeyRead(c->db,c->argv[1]);
4150 if (o == NULL) {
4151 addReply(c,shared.czero);
4152 return;
4153 } else {
4154 if (o->type != REDIS_LIST) {
4155 addReply(c,shared.wrongtypeerr);
4156 } else {
4157 l = o->ptr;
4158 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4159 }
4160 }
4161 }
4162
4163 static void lindexCommand(redisClient *c) {
4164 robj *o;
4165 int index = atoi(c->argv[2]->ptr);
4166
4167 o = lookupKeyRead(c->db,c->argv[1]);
4168 if (o == NULL) {
4169 addReply(c,shared.nullbulk);
4170 } else {
4171 if (o->type != REDIS_LIST) {
4172 addReply(c,shared.wrongtypeerr);
4173 } else {
4174 list *list = o->ptr;
4175 listNode *ln;
4176
4177 ln = listIndex(list, index);
4178 if (ln == NULL) {
4179 addReply(c,shared.nullbulk);
4180 } else {
4181 robj *ele = listNodeValue(ln);
4182 addReplyBulkLen(c,ele);
4183 addReply(c,ele);
4184 addReply(c,shared.crlf);
4185 }
4186 }
4187 }
4188 }
4189
4190 static void lsetCommand(redisClient *c) {
4191 robj *o;
4192 int index = atoi(c->argv[2]->ptr);
4193
4194 o = lookupKeyWrite(c->db,c->argv[1]);
4195 if (o == NULL) {
4196 addReply(c,shared.nokeyerr);
4197 } else {
4198 if (o->type != REDIS_LIST) {
4199 addReply(c,shared.wrongtypeerr);
4200 } else {
4201 list *list = o->ptr;
4202 listNode *ln;
4203
4204 ln = listIndex(list, index);
4205 if (ln == NULL) {
4206 addReply(c,shared.outofrangeerr);
4207 } else {
4208 robj *ele = listNodeValue(ln);
4209
4210 decrRefCount(ele);
4211 listNodeValue(ln) = c->argv[3];
4212 incrRefCount(c->argv[3]);
4213 addReply(c,shared.ok);
4214 server.dirty++;
4215 }
4216 }
4217 }
4218 }
4219
4220 static void popGenericCommand(redisClient *c, int where) {
4221 robj *o;
4222
4223 o = lookupKeyWrite(c->db,c->argv[1]);
4224 if (o == NULL) {
4225 addReply(c,shared.nullbulk);
4226 } else {
4227 if (o->type != REDIS_LIST) {
4228 addReply(c,shared.wrongtypeerr);
4229 } else {
4230 list *list = o->ptr;
4231 listNode *ln;
4232
4233 if (where == REDIS_HEAD)
4234 ln = listFirst(list);
4235 else
4236 ln = listLast(list);
4237
4238 if (ln == NULL) {
4239 addReply(c,shared.nullbulk);
4240 } else {
4241 robj *ele = listNodeValue(ln);
4242 addReplyBulkLen(c,ele);
4243 addReply(c,ele);
4244 addReply(c,shared.crlf);
4245 listDelNode(list,ln);
4246 server.dirty++;
4247 }
4248 }
4249 }
4250 }
4251
4252 static void lpopCommand(redisClient *c) {
4253 popGenericCommand(c,REDIS_HEAD);
4254 }
4255
4256 static void rpopCommand(redisClient *c) {
4257 popGenericCommand(c,REDIS_TAIL);
4258 }
4259
4260 static void lrangeCommand(redisClient *c) {
4261 robj *o;
4262 int start = atoi(c->argv[2]->ptr);
4263 int end = atoi(c->argv[3]->ptr);
4264
4265 o = lookupKeyRead(c->db,c->argv[1]);
4266 if (o == NULL) {
4267 addReply(c,shared.nullmultibulk);
4268 } else {
4269 if (o->type != REDIS_LIST) {
4270 addReply(c,shared.wrongtypeerr);
4271 } else {
4272 list *list = o->ptr;
4273 listNode *ln;
4274 int llen = listLength(list);
4275 int rangelen, j;
4276 robj *ele;
4277
4278 /* convert negative indexes */
4279 if (start < 0) start = llen+start;
4280 if (end < 0) end = llen+end;
4281 if (start < 0) start = 0;
4282 if (end < 0) end = 0;
4283
4284 /* indexes sanity checks */
4285 if (start > end || start >= llen) {
4286 /* Out of range start or start > end result in empty list */
4287 addReply(c,shared.emptymultibulk);
4288 return;
4289 }
4290 if (end >= llen) end = llen-1;
4291 rangelen = (end-start)+1;
4292
4293 /* Return the result in form of a multi-bulk reply */
4294 ln = listIndex(list, start);
4295 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4296 for (j = 0; j < rangelen; j++) {
4297 ele = listNodeValue(ln);
4298 addReplyBulkLen(c,ele);
4299 addReply(c,ele);
4300 addReply(c,shared.crlf);
4301 ln = ln->next;
4302 }
4303 }
4304 }
4305 }
4306
4307 static void ltrimCommand(redisClient *c) {
4308 robj *o;
4309 int start = atoi(c->argv[2]->ptr);
4310 int end = atoi(c->argv[3]->ptr);
4311
4312 o = lookupKeyWrite(c->db,c->argv[1]);
4313 if (o == NULL) {
4314 addReply(c,shared.ok);
4315 } else {
4316 if (o->type != REDIS_LIST) {
4317 addReply(c,shared.wrongtypeerr);
4318 } else {
4319 list *list = o->ptr;
4320 listNode *ln;
4321 int llen = listLength(list);
4322 int j, ltrim, rtrim;
4323
4324 /* convert negative indexes */
4325 if (start < 0) start = llen+start;
4326 if (end < 0) end = llen+end;
4327 if (start < 0) start = 0;
4328 if (end < 0) end = 0;
4329
4330 /* indexes sanity checks */
4331 if (start > end || start >= llen) {
4332 /* Out of range start or start > end result in empty list */
4333 ltrim = llen;
4334 rtrim = 0;
4335 } else {
4336 if (end >= llen) end = llen-1;
4337 ltrim = start;
4338 rtrim = llen-end-1;
4339 }
4340
4341 /* Remove list elements to perform the trim */
4342 for (j = 0; j < ltrim; j++) {
4343 ln = listFirst(list);
4344 listDelNode(list,ln);
4345 }
4346 for (j = 0; j < rtrim; j++) {
4347 ln = listLast(list);
4348 listDelNode(list,ln);
4349 }
4350 server.dirty++;
4351 addReply(c,shared.ok);
4352 }
4353 }
4354 }
4355
4356 static void lremCommand(redisClient *c) {
4357 robj *o;
4358
4359 o = lookupKeyWrite(c->db,c->argv[1]);
4360 if (o == NULL) {
4361 addReply(c,shared.czero);
4362 } else {
4363 if (o->type != REDIS_LIST) {
4364 addReply(c,shared.wrongtypeerr);
4365 } else {
4366 list *list = o->ptr;
4367 listNode *ln, *next;
4368 int toremove = atoi(c->argv[2]->ptr);
4369 int removed = 0;
4370 int fromtail = 0;
4371
4372 if (toremove < 0) {
4373 toremove = -toremove;
4374 fromtail = 1;
4375 }
4376 ln = fromtail ? list->tail : list->head;
4377 while (ln) {
4378 robj *ele = listNodeValue(ln);
4379
4380 next = fromtail ? ln->prev : ln->next;
4381 if (compareStringObjects(ele,c->argv[3]) == 0) {
4382 listDelNode(list,ln);
4383 server.dirty++;
4384 removed++;
4385 if (toremove && removed == toremove) break;
4386 }
4387 ln = next;
4388 }
4389 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4390 }
4391 }
4392 }
4393
4394 /* This is the semantic of this command:
4395 * RPOPLPUSH srclist dstlist:
4396 * IF LLEN(srclist) > 0
4397 * element = RPOP srclist
4398 * LPUSH dstlist element
4399 * RETURN element
4400 * ELSE
4401 * RETURN nil
4402 * END
4403 * END
4404 *
4405 * The idea is to be able to get an element from a list in a reliable way
4406 * since the element is not just returned but pushed against another list
4407 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4408 */
4409 static void rpoplpushcommand(redisClient *c) {
4410 robj *sobj;
4411
4412 sobj = lookupKeyWrite(c->db,c->argv[1]);
4413 if (sobj == NULL) {
4414 addReply(c,shared.nullbulk);
4415 } else {
4416 if (sobj->type != REDIS_LIST) {
4417 addReply(c,shared.wrongtypeerr);
4418 } else {
4419 list *srclist = sobj->ptr;
4420 listNode *ln = listLast(srclist);
4421
4422 if (ln == NULL) {
4423 addReply(c,shared.nullbulk);
4424 } else {
4425 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4426 robj *ele = listNodeValue(ln);
4427 list *dstlist;
4428
4429 if (dobj && dobj->type != REDIS_LIST) {
4430 addReply(c,shared.wrongtypeerr);
4431 return;
4432 }
4433
4434 /* Add the element to the target list (unless it's directly
4435 * passed to some BLPOP-ing client */
4436 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4437 if (dobj == NULL) {
4438 /* Create the list if the key does not exist */
4439 dobj = createListObject();
4440 dictAdd(c->db->dict,c->argv[2],dobj);
4441 incrRefCount(c->argv[2]);
4442 }
4443 dstlist = dobj->ptr;
4444 listAddNodeHead(dstlist,ele);
4445 incrRefCount(ele);
4446 }
4447
4448 /* Send the element to the client as reply as well */
4449 addReplyBulkLen(c,ele);
4450 addReply(c,ele);
4451 addReply(c,shared.crlf);
4452
4453 /* Finally remove the element from the source list */
4454 listDelNode(srclist,ln);
4455 server.dirty++;
4456 }
4457 }
4458 }
4459 }
4460
4461
4462 /* ==================================== Sets ================================ */
4463
4464 static void saddCommand(redisClient *c) {
4465 robj *set;
4466
4467 set = lookupKeyWrite(c->db,c->argv[1]);
4468 if (set == NULL) {
4469 set = createSetObject();
4470 dictAdd(c->db->dict,c->argv[1],set);
4471 incrRefCount(c->argv[1]);
4472 } else {
4473 if (set->type != REDIS_SET) {
4474 addReply(c,shared.wrongtypeerr);
4475 return;
4476 }
4477 }
4478 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4479 incrRefCount(c->argv[2]);
4480 server.dirty++;
4481 addReply(c,shared.cone);
4482 } else {
4483 addReply(c,shared.czero);
4484 }
4485 }
4486
4487 static void sremCommand(redisClient *c) {
4488 robj *set;
4489
4490 set = lookupKeyWrite(c->db,c->argv[1]);
4491 if (set == NULL) {
4492 addReply(c,shared.czero);
4493 } else {
4494 if (set->type != REDIS_SET) {
4495 addReply(c,shared.wrongtypeerr);
4496 return;
4497 }
4498 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4499 server.dirty++;
4500 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4501 addReply(c,shared.cone);
4502 } else {
4503 addReply(c,shared.czero);
4504 }
4505 }
4506 }
4507
4508 static void smoveCommand(redisClient *c) {
4509 robj *srcset, *dstset;
4510
4511 srcset = lookupKeyWrite(c->db,c->argv[1]);
4512 dstset = lookupKeyWrite(c->db,c->argv[2]);
4513
4514 /* If the source key does not exist return 0, if it's of the wrong type
4515 * raise an error */
4516 if (srcset == NULL || srcset->type != REDIS_SET) {
4517 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4518 return;
4519 }
4520 /* Error if the destination key is not a set as well */
4521 if (dstset && dstset->type != REDIS_SET) {
4522 addReply(c,shared.wrongtypeerr);
4523 return;
4524 }
4525 /* Remove the element from the source set */
4526 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4527 /* Key not found in the src set! return zero */
4528 addReply(c,shared.czero);
4529 return;
4530 }
4531 server.dirty++;
4532 /* Add the element to the destination set */
4533 if (!dstset) {
4534 dstset = createSetObject();
4535 dictAdd(c->db->dict,c->argv[2],dstset);
4536 incrRefCount(c->argv[2]);
4537 }
4538 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4539 incrRefCount(c->argv[3]);
4540 addReply(c,shared.cone);
4541 }
4542
4543 static void sismemberCommand(redisClient *c) {
4544 robj *set;
4545
4546 set = lookupKeyRead(c->db,c->argv[1]);
4547 if (set == NULL) {
4548 addReply(c,shared.czero);
4549 } else {
4550 if (set->type != REDIS_SET) {
4551 addReply(c,shared.wrongtypeerr);
4552 return;
4553 }
4554 if (dictFind(set->ptr,c->argv[2]))
4555 addReply(c,shared.cone);
4556 else
4557 addReply(c,shared.czero);
4558 }
4559 }
4560
4561 static void scardCommand(redisClient *c) {
4562 robj *o;
4563 dict *s;
4564
4565 o = lookupKeyRead(c->db,c->argv[1]);
4566 if (o == NULL) {
4567 addReply(c,shared.czero);
4568 return;
4569 } else {
4570 if (o->type != REDIS_SET) {
4571 addReply(c,shared.wrongtypeerr);
4572 } else {
4573 s = o->ptr;
4574 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4575 dictSize(s)));
4576 }
4577 }
4578 }
4579
4580 static void spopCommand(redisClient *c) {
4581 robj *set;
4582 dictEntry *de;
4583
4584 set = lookupKeyWrite(c->db,c->argv[1]);
4585 if (set == NULL) {
4586 addReply(c,shared.nullbulk);
4587 } else {
4588 if (set->type != REDIS_SET) {
4589 addReply(c,shared.wrongtypeerr);
4590 return;
4591 }
4592 de = dictGetRandomKey(set->ptr);
4593 if (de == NULL) {
4594 addReply(c,shared.nullbulk);
4595 } else {
4596 robj *ele = dictGetEntryKey(de);
4597
4598 addReplyBulkLen(c,ele);
4599 addReply(c,ele);
4600 addReply(c,shared.crlf);
4601 dictDelete(set->ptr,ele);
4602 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4603 server.dirty++;
4604 }
4605 }
4606 }
4607
4608 static void srandmemberCommand(redisClient *c) {
4609 robj *set;
4610 dictEntry *de;
4611
4612 set = lookupKeyRead(c->db,c->argv[1]);
4613 if (set == NULL) {
4614 addReply(c,shared.nullbulk);
4615 } else {
4616 if (set->type != REDIS_SET) {
4617 addReply(c,shared.wrongtypeerr);
4618 return;
4619 }
4620 de = dictGetRandomKey(set->ptr);
4621 if (de == NULL) {
4622 addReply(c,shared.nullbulk);
4623 } else {
4624 robj *ele = dictGetEntryKey(de);
4625
4626 addReplyBulkLen(c,ele);
4627 addReply(c,ele);
4628 addReply(c,shared.crlf);
4629 }
4630 }
4631 }
4632
4633 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4634 dict **d1 = (void*) s1, **d2 = (void*) s2;
4635
4636 return dictSize(*d1)-dictSize(*d2);
4637 }
4638
4639 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4640 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4641 dictIterator *di;
4642 dictEntry *de;
4643 robj *lenobj = NULL, *dstset = NULL;
4644 unsigned long j, cardinality = 0;
4645
4646 for (j = 0; j < setsnum; j++) {
4647 robj *setobj;
4648
4649 setobj = dstkey ?
4650 lookupKeyWrite(c->db,setskeys[j]) :
4651 lookupKeyRead(c->db,setskeys[j]);
4652 if (!setobj) {
4653 zfree(dv);
4654 if (dstkey) {
4655 if (deleteKey(c->db,dstkey))
4656 server.dirty++;
4657 addReply(c,shared.czero);
4658 } else {
4659 addReply(c,shared.nullmultibulk);
4660 }
4661 return;
4662 }
4663 if (setobj->type != REDIS_SET) {
4664 zfree(dv);
4665 addReply(c,shared.wrongtypeerr);
4666 return;
4667 }
4668 dv[j] = setobj->ptr;
4669 }
4670 /* Sort sets from the smallest to largest, this will improve our
4671 * algorithm's performace */
4672 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4673
4674 /* The first thing we should output is the total number of elements...
4675 * since this is a multi-bulk write, but at this stage we don't know
4676 * the intersection set size, so we use a trick, append an empty object
4677 * to the output list and save the pointer to later modify it with the
4678 * right length */
4679 if (!dstkey) {
4680 lenobj = createObject(REDIS_STRING,NULL);
4681 addReply(c,lenobj);
4682 decrRefCount(lenobj);
4683 } else {
4684 /* If we have a target key where to store the resulting set
4685 * create this key with an empty set inside */
4686 dstset = createSetObject();
4687 }
4688
4689 /* Iterate all the elements of the first (smallest) set, and test
4690 * the element against all the other sets, if at least one set does
4691 * not include the element it is discarded */
4692 di = dictGetIterator(dv[0]);
4693
4694 while((de = dictNext(di)) != NULL) {
4695 robj *ele;
4696
4697 for (j = 1; j < setsnum; j++)
4698 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4699 if (j != setsnum)
4700 continue; /* at least one set does not contain the member */
4701 ele = dictGetEntryKey(de);
4702 if (!dstkey) {
4703 addReplyBulkLen(c,ele);
4704 addReply(c,ele);
4705 addReply(c,shared.crlf);
4706 cardinality++;
4707 } else {
4708 dictAdd(dstset->ptr,ele,NULL);
4709 incrRefCount(ele);
4710 }
4711 }
4712 dictReleaseIterator(di);
4713
4714 if (dstkey) {
4715 /* Store the resulting set into the target */
4716 deleteKey(c->db,dstkey);
4717 dictAdd(c->db->dict,dstkey,dstset);
4718 incrRefCount(dstkey);
4719 }
4720
4721 if (!dstkey) {
4722 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4723 } else {
4724 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4725 dictSize((dict*)dstset->ptr)));
4726 server.dirty++;
4727 }
4728 zfree(dv);
4729 }
4730
4731 static void sinterCommand(redisClient *c) {
4732 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4733 }
4734
4735 static void sinterstoreCommand(redisClient *c) {
4736 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4737 }
4738
4739 #define REDIS_OP_UNION 0
4740 #define REDIS_OP_DIFF 1
4741
4742 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4743 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4744 dictIterator *di;
4745 dictEntry *de;
4746 robj *dstset = NULL;
4747 int j, cardinality = 0;
4748
4749 for (j = 0; j < setsnum; j++) {
4750 robj *setobj;
4751
4752 setobj = dstkey ?
4753 lookupKeyWrite(c->db,setskeys[j]) :
4754 lookupKeyRead(c->db,setskeys[j]);
4755 if (!setobj) {
4756 dv[j] = NULL;
4757 continue;
4758 }
4759 if (setobj->type != REDIS_SET) {
4760 zfree(dv);
4761 addReply(c,shared.wrongtypeerr);
4762 return;
4763 }
4764 dv[j] = setobj->ptr;
4765 }
4766
4767 /* We need a temp set object to store our union. If the dstkey
4768 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4769 * this set object will be the resulting object to set into the target key*/
4770 dstset = createSetObject();
4771
4772 /* Iterate all the elements of all the sets, add every element a single
4773 * time to the result set */
4774 for (j = 0; j < setsnum; j++) {
4775 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4776 if (!dv[j]) continue; /* non existing keys are like empty sets */
4777
4778 di = dictGetIterator(dv[j]);
4779
4780 while((de = dictNext(di)) != NULL) {
4781 robj *ele;
4782
4783 /* dictAdd will not add the same element multiple times */
4784 ele = dictGetEntryKey(de);
4785 if (op == REDIS_OP_UNION || j == 0) {
4786 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4787 incrRefCount(ele);
4788 cardinality++;
4789 }
4790 } else if (op == REDIS_OP_DIFF) {
4791 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4792 cardinality--;
4793 }
4794 }
4795 }
4796 dictReleaseIterator(di);
4797
4798 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4799 }
4800
4801 /* Output the content of the resulting set, if not in STORE mode */
4802 if (!dstkey) {
4803 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4804 di = dictGetIterator(dstset->ptr);
4805 while((de = dictNext(di)) != NULL) {
4806 robj *ele;
4807
4808 ele = dictGetEntryKey(de);
4809 addReplyBulkLen(c,ele);
4810 addReply(c,ele);
4811 addReply(c,shared.crlf);
4812 }
4813 dictReleaseIterator(di);
4814 } else {
4815 /* If we have a target key where to store the resulting set
4816 * create this key with the result set inside */
4817 deleteKey(c->db,dstkey);
4818 dictAdd(c->db->dict,dstkey,dstset);
4819 incrRefCount(dstkey);
4820 }
4821
4822 /* Cleanup */
4823 if (!dstkey) {
4824 decrRefCount(dstset);
4825 } else {
4826 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4827 dictSize((dict*)dstset->ptr)));
4828 server.dirty++;
4829 }
4830 zfree(dv);
4831 }
4832
4833 static void sunionCommand(redisClient *c) {
4834 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4835 }
4836
4837 static void sunionstoreCommand(redisClient *c) {
4838 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4839 }
4840
4841 static void sdiffCommand(redisClient *c) {
4842 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4843 }
4844
4845 static void sdiffstoreCommand(redisClient *c) {
4846 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4847 }
4848
4849 /* ==================================== ZSets =============================== */
4850
4851 /* ZSETs are ordered sets using two data structures to hold the same elements
4852 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4853 * data structure.
4854 *
4855 * The elements are added to a hash table mapping Redis objects to scores.
4856 * At the same time the elements are added to a skip list mapping scores
4857 * to Redis objects (so objects are sorted by scores in this "view"). */
4858
4859 /* This skiplist implementation is almost a C translation of the original
4860 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4861 * Alternative to Balanced Trees", modified in three ways:
4862 * a) this implementation allows for repeated values.
4863 * b) the comparison is not just by key (our 'score') but by satellite data.
4864 * c) there is a back pointer, so it's a doubly linked list with the back
4865 * pointers being only at "level 1". This allows to traverse the list
4866 * from tail to head, useful for ZREVRANGE. */
4867
4868 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4869 zskiplistNode *zn = zmalloc(sizeof(*zn));
4870
4871 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4872 if (level > 0)
4873 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4874 zn->score = score;
4875 zn->obj = obj;
4876 return zn;
4877 }
4878
4879 static zskiplist *zslCreate(void) {
4880 int j;
4881 zskiplist *zsl;
4882
4883 zsl = zmalloc(sizeof(*zsl));
4884 zsl->level = 1;
4885 zsl->length = 0;
4886 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4887 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4888 zsl->header->forward[j] = NULL;
4889
4890 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4891 if (j < ZSKIPLIST_MAXLEVEL-1)
4892 zsl->header->span[j] = 0;
4893 }
4894 zsl->header->backward = NULL;
4895 zsl->tail = NULL;
4896 return zsl;
4897 }
4898
4899 static void zslFreeNode(zskiplistNode *node) {
4900 decrRefCount(node->obj);
4901 zfree(node->forward);
4902 zfree(node->span);
4903 zfree(node);
4904 }
4905
4906 static void zslFree(zskiplist *zsl) {
4907 zskiplistNode *node = zsl->header->forward[0], *next;
4908
4909 zfree(zsl->header->forward);
4910 zfree(zsl->header->span);
4911 zfree(zsl->header);
4912 while(node) {
4913 next = node->forward[0];
4914 zslFreeNode(node);
4915 node = next;
4916 }
4917 zfree(zsl);
4918 }
4919
4920 static int zslRandomLevel(void) {
4921 int level = 1;
4922 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4923 level += 1;
4924 return level;
4925 }
4926
4927 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4928 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4929 unsigned int rank[ZSKIPLIST_MAXLEVEL];
4930 int i, level;
4931
4932 x = zsl->header;
4933 for (i = zsl->level-1; i >= 0; i--) {
4934 /* store rank that is crossed to reach the insert position */
4935 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
4936
4937 while (x->forward[i] &&
4938 (x->forward[i]->score < score ||
4939 (x->forward[i]->score == score &&
4940 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
4941 rank[i] += i > 0 ? x->span[i-1] : 1;
4942 x = x->forward[i];
4943 }
4944 update[i] = x;
4945 }
4946 /* we assume the key is not already inside, since we allow duplicated
4947 * scores, and the re-insertion of score and redis object should never
4948 * happpen since the caller of zslInsert() should test in the hash table
4949 * if the element is already inside or not. */
4950 level = zslRandomLevel();
4951 if (level > zsl->level) {
4952 for (i = zsl->level; i < level; i++) {
4953 rank[i] = 0;
4954 update[i] = zsl->header;
4955 update[i]->span[i-1] = zsl->length;
4956 }
4957 zsl->level = level;
4958 }
4959 x = zslCreateNode(level,score,obj);
4960 for (i = 0; i < level; i++) {
4961 x->forward[i] = update[i]->forward[i];
4962 update[i]->forward[i] = x;
4963
4964 /* update span covered by update[i] as x is inserted here */
4965 if (i > 0) {
4966 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
4967 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
4968 }
4969 }
4970
4971 /* increment span for untouched levels */
4972 for (i = level; i < zsl->level; i++) {
4973 update[i]->span[i-1]++;
4974 }
4975
4976 x->backward = (update[0] == zsl->header) ? NULL : update[0];
4977 if (x->forward[0])
4978 x->forward[0]->backward = x;
4979 else
4980 zsl->tail = x;
4981 zsl->length++;
4982 }
4983
4984 /* Delete an element with matching score/object from the skiplist. */
4985 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
4986 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4987 int i;
4988
4989 x = zsl->header;
4990 for (i = zsl->level-1; i >= 0; i--) {
4991 while (x->forward[i] &&
4992 (x->forward[i]->score < score ||
4993 (x->forward[i]->score == score &&
4994 compareStringObjects(x->forward[i]->obj,obj) < 0)))
4995 x = x->forward[i];
4996 update[i] = x;
4997 }
4998 /* We may have multiple elements with the same score, what we need
4999 * is to find the element with both the right score and object. */
5000 x = x->forward[0];
5001 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5002 for (i = 0; i < zsl->level; i++) {
5003 if (update[i]->forward[i] == x) {
5004 if (i > 0) {
5005 update[i]->span[i-1] += x->span[i-1] - 1;
5006 }
5007 update[i]->forward[i] = x->forward[i];
5008 } else {
5009 /* invariant: i > 0, because update[0]->forward[0]
5010 * is always equal to x */
5011 update[i]->span[i-1] -= 1;
5012 }
5013 }
5014 if (x->forward[0]) {
5015 x->forward[0]->backward = x->backward;
5016 } else {
5017 zsl->tail = x->backward;
5018 }
5019 zslFreeNode(x);
5020 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5021 zsl->level--;
5022 zsl->length--;
5023 return 1;
5024 } else {
5025 return 0; /* not found */
5026 }
5027 return 0; /* not found */
5028 }
5029
5030 /* Delete all the elements with score between min and max from the skiplist.
5031 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5032 * Note that this function takes the reference to the hash table view of the
5033 * sorted set, in order to remove the elements from the hash table too. */
5034 static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
5035 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5036 unsigned long removed = 0;
5037 int i;
5038
5039 x = zsl->header;
5040 for (i = zsl->level-1; i >= 0; i--) {
5041 while (x->forward[i] && x->forward[i]->score < min)
5042 x = x->forward[i];
5043 update[i] = x;
5044 }
5045 /* We may have multiple elements with the same score, what we need
5046 * is to find the element with both the right score and object. */
5047 x = x->forward[0];
5048 while (x && x->score <= max) {
5049 zskiplistNode *next;
5050
5051 for (i = 0; i < zsl->level; i++) {
5052 if (update[i]->forward[i] == x) {
5053 if (i > 0) {
5054 update[i]->span[i-1] += x->span[i-1] - 1;
5055 }
5056 update[i]->forward[i] = x->forward[i];
5057 } else {
5058 /* invariant: i > 0, because update[0]->forward[0]
5059 * is always equal to x */
5060 update[i]->span[i-1] -= 1;
5061 }
5062 }
5063 if (x->forward[0]) {
5064 x->forward[0]->backward = x->backward;
5065 } else {
5066 zsl->tail = x->backward;
5067 }
5068 next = x->forward[0];
5069 dictDelete(dict,x->obj);
5070 zslFreeNode(x);
5071 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5072 zsl->level--;
5073 zsl->length--;
5074 removed++;
5075 x = next;
5076 }
5077 return removed; /* not found */
5078 }
5079
5080 /* Find the first node having a score equal or greater than the specified one.
5081 * Returns NULL if there is no match. */
5082 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5083 zskiplistNode *x;
5084 int i;
5085
5086 x = zsl->header;
5087 for (i = zsl->level-1; i >= 0; i--) {
5088 while (x->forward[i] && x->forward[i]->score < score)
5089 x = x->forward[i];
5090 }
5091 /* We may have multiple elements with the same score, what we need
5092 * is to find the element with both the right score and object. */
5093 return x->forward[0];
5094 }
5095
5096 /* Find the rank for an element by both score and key.
5097 * Returns 0 when the element cannot be found, rank otherwise.
5098 * Note that the rank is 1-based due to the span of zsl->header to the
5099 * first element. */
5100 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5101 zskiplistNode *x;
5102 unsigned long rank = 0;
5103 int i;
5104
5105 x = zsl->header;
5106 for (i = zsl->level-1; i >= 0; i--) {
5107 while (x->forward[i] &&
5108 (x->forward[i]->score < score ||
5109 (x->forward[i]->score == score &&
5110 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5111 rank += i > 0 ? x->span[i-1] : 1;
5112 x = x->forward[i];
5113 }
5114
5115 /* x might be equal to zsl->header, so test if obj is non-NULL */
5116 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5117 return rank;
5118 }
5119 }
5120 return 0;
5121 }
5122
5123 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5124 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5125 zskiplistNode *x;
5126 unsigned long traversed = 0;
5127 int i;
5128
5129 x = zsl->header;
5130 for (i = zsl->level-1; i >= 0; i--) {
5131 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5132 traversed += i > 0 ? x->span[i-1] : 1;
5133 x = x->forward[i];
5134 }
5135
5136 if (traversed == rank) {
5137 return x;
5138 }
5139 }
5140 return NULL;
5141 }
5142
5143 /* The actual Z-commands implementations */
5144
5145 /* This generic command implements both ZADD and ZINCRBY.
5146 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5147 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5148 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5149 robj *zsetobj;
5150 zset *zs;
5151 double *score;
5152
5153 zsetobj = lookupKeyWrite(c->db,key);
5154 if (zsetobj == NULL) {
5155 zsetobj = createZsetObject();
5156 dictAdd(c->db->dict,key,zsetobj);
5157 incrRefCount(key);
5158 } else {
5159 if (zsetobj->type != REDIS_ZSET) {
5160 addReply(c,shared.wrongtypeerr);
5161 return;
5162 }
5163 }
5164 zs = zsetobj->ptr;
5165
5166 /* Ok now since we implement both ZADD and ZINCRBY here the code
5167 * needs to handle the two different conditions. It's all about setting
5168 * '*score', that is, the new score to set, to the right value. */
5169 score = zmalloc(sizeof(double));
5170 if (doincrement) {
5171 dictEntry *de;
5172
5173 /* Read the old score. If the element was not present starts from 0 */
5174 de = dictFind(zs->dict,ele);
5175 if (de) {
5176 double *oldscore = dictGetEntryVal(de);
5177 *score = *oldscore + scoreval;
5178 } else {
5179 *score = scoreval;
5180 }
5181 } else {
5182 *score = scoreval;
5183 }
5184
5185 /* What follows is a simple remove and re-insert operation that is common
5186 * to both ZADD and ZINCRBY... */
5187 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5188 /* case 1: New element */
5189 incrRefCount(ele); /* added to hash */
5190 zslInsert(zs->zsl,*score,ele);
5191 incrRefCount(ele); /* added to skiplist */
5192 server.dirty++;
5193 if (doincrement)
5194 addReplyDouble(c,*score);
5195 else
5196 addReply(c,shared.cone);
5197 } else {
5198 dictEntry *de;
5199 double *oldscore;
5200
5201 /* case 2: Score update operation */
5202 de = dictFind(zs->dict,ele);
5203 redisAssert(de != NULL);
5204 oldscore = dictGetEntryVal(de);
5205 if (*score != *oldscore) {
5206 int deleted;
5207
5208 /* Remove and insert the element in the skip list with new score */
5209 deleted = zslDelete(zs->zsl,*oldscore,ele);
5210 redisAssert(deleted != 0);
5211 zslInsert(zs->zsl,*score,ele);
5212 incrRefCount(ele);
5213 /* Update the score in the hash table */
5214 dictReplace(zs->dict,ele,score);
5215 server.dirty++;
5216 } else {
5217 zfree(score);
5218 }
5219 if (doincrement)
5220 addReplyDouble(c,*score);
5221 else
5222 addReply(c,shared.czero);
5223 }
5224 }
5225
5226 static void zaddCommand(redisClient *c) {
5227 double scoreval;
5228
5229 scoreval = strtod(c->argv[2]->ptr,NULL);
5230 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5231 }
5232
5233 static void zincrbyCommand(redisClient *c) {
5234 double scoreval;
5235
5236 scoreval = strtod(c->argv[2]->ptr,NULL);
5237 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5238 }
5239
5240 static void zremCommand(redisClient *c) {
5241 robj *zsetobj;
5242 zset *zs;
5243
5244 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5245 if (zsetobj == NULL) {
5246 addReply(c,shared.czero);
5247 } else {
5248 dictEntry *de;
5249 double *oldscore;
5250 int deleted;
5251
5252 if (zsetobj->type != REDIS_ZSET) {
5253 addReply(c,shared.wrongtypeerr);
5254 return;
5255 }
5256 zs = zsetobj->ptr;
5257 de = dictFind(zs->dict,c->argv[2]);
5258 if (de == NULL) {
5259 addReply(c,shared.czero);
5260 return;
5261 }
5262 /* Delete from the skiplist */
5263 oldscore = dictGetEntryVal(de);
5264 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5265 redisAssert(deleted != 0);
5266
5267 /* Delete from the hash table */
5268 dictDelete(zs->dict,c->argv[2]);
5269 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5270 server.dirty++;
5271 addReply(c,shared.cone);
5272 }
5273 }
5274
5275 static void zremrangebyscoreCommand(redisClient *c) {
5276 double min = strtod(c->argv[2]->ptr,NULL);
5277 double max = strtod(c->argv[3]->ptr,NULL);
5278 robj *zsetobj;
5279 zset *zs;
5280
5281 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5282 if (zsetobj == NULL) {
5283 addReply(c,shared.czero);
5284 } else {
5285 long deleted;
5286
5287 if (zsetobj->type != REDIS_ZSET) {
5288 addReply(c,shared.wrongtypeerr);
5289 return;
5290 }
5291 zs = zsetobj->ptr;
5292 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5293 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5294 server.dirty += deleted;
5295 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5296 }
5297 }
5298
5299 static void zrangeGenericCommand(redisClient *c, int reverse) {
5300 robj *o;
5301 int start = atoi(c->argv[2]->ptr);
5302 int end = atoi(c->argv[3]->ptr);
5303 int withscores = 0;
5304
5305 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5306 withscores = 1;
5307 } else if (c->argc >= 5) {
5308 addReply(c,shared.syntaxerr);
5309 return;
5310 }
5311
5312 o = lookupKeyRead(c->db,c->argv[1]);
5313 if (o == NULL) {
5314 addReply(c,shared.nullmultibulk);
5315 } else {
5316 if (o->type != REDIS_ZSET) {
5317 addReply(c,shared.wrongtypeerr);
5318 } else {
5319 zset *zsetobj = o->ptr;
5320 zskiplist *zsl = zsetobj->zsl;
5321 zskiplistNode *ln;
5322
5323 int llen = zsl->length;
5324 int rangelen, j;
5325 robj *ele;
5326
5327 /* convert negative indexes */
5328 if (start < 0) start = llen+start;
5329 if (end < 0) end = llen+end;
5330 if (start < 0) start = 0;
5331 if (end < 0) end = 0;
5332
5333 /* indexes sanity checks */
5334 if (start > end || start >= llen) {
5335 /* Out of range start or start > end result in empty list */
5336 addReply(c,shared.emptymultibulk);
5337 return;
5338 }
5339 if (end >= llen) end = llen-1;
5340 rangelen = (end-start)+1;
5341
5342 /* check if starting point is trivial, before searching
5343 * the element in log(N) time */
5344 if (reverse) {
5345 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen - start);
5346 } else {
5347 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start + 1);
5348 }
5349
5350 /* Return the result in form of a multi-bulk reply */
5351 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5352 withscores ? (rangelen*2) : rangelen));
5353 for (j = 0; j < rangelen; j++) {
5354 ele = ln->obj;
5355 addReplyBulkLen(c,ele);
5356 addReply(c,ele);
5357 addReply(c,shared.crlf);
5358 if (withscores)
5359 addReplyDouble(c,ln->score);
5360 ln = reverse ? ln->backward : ln->forward[0];
5361 }
5362 }
5363 }
5364 }
5365
5366 static void zrangeCommand(redisClient *c) {
5367 zrangeGenericCommand(c,0);
5368 }
5369
5370 static void zrevrangeCommand(redisClient *c) {
5371 zrangeGenericCommand(c,1);
5372 }
5373
5374 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5375 * If justcount is non-zero, just the count is returned. */
5376 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5377 robj *o;
5378 double min, max;
5379 int minex = 0, maxex = 0; /* are min or max exclusive? */
5380 int offset = 0, limit = -1;
5381 int withscores = 0;
5382 int badsyntax = 0;
5383
5384 /* Parse the min-max interval. If one of the values is prefixed
5385 * by the "(" character, it's considered "open". For instance
5386 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5387 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5388 if (((char*)c->argv[2]->ptr)[0] == '(') {
5389 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5390 minex = 1;
5391 } else {
5392 min = strtod(c->argv[2]->ptr,NULL);
5393 }
5394 if (((char*)c->argv[3]->ptr)[0] == '(') {
5395 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5396 maxex = 1;
5397 } else {
5398 max = strtod(c->argv[3]->ptr,NULL);
5399 }
5400
5401 /* Parse "WITHSCORES": note that if the command was called with
5402 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5403 * enter the following paths to parse WITHSCORES and LIMIT. */
5404 if (c->argc == 5 || c->argc == 8) {
5405 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5406 withscores = 1;
5407 else
5408 badsyntax = 1;
5409 }
5410 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5411 badsyntax = 1;
5412 if (badsyntax) {
5413 addReplySds(c,
5414 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5415 return;
5416 }
5417
5418 /* Parse "LIMIT" */
5419 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5420 addReply(c,shared.syntaxerr);
5421 return;
5422 } else if (c->argc == (7 + withscores)) {
5423 offset = atoi(c->argv[5]->ptr);
5424 limit = atoi(c->argv[6]->ptr);
5425 if (offset < 0) offset = 0;
5426 }
5427
5428 /* Ok, lookup the key and get the range */
5429 o = lookupKeyRead(c->db,c->argv[1]);
5430 if (o == NULL) {
5431 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5432 } else {
5433 if (o->type != REDIS_ZSET) {
5434 addReply(c,shared.wrongtypeerr);
5435 } else {
5436 zset *zsetobj = o->ptr;
5437 zskiplist *zsl = zsetobj->zsl;
5438 zskiplistNode *ln;
5439 robj *ele, *lenobj = NULL;
5440 unsigned long rangelen = 0;
5441
5442 /* Get the first node with the score >= min, or with
5443 * score > min if 'minex' is true. */
5444 ln = zslFirstWithScore(zsl,min);
5445 while (minex && ln && ln->score == min) ln = ln->forward[0];
5446
5447 if (ln == NULL) {
5448 /* No element matching the speciifed interval */
5449 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5450 return;
5451 }
5452
5453 /* We don't know in advance how many matching elements there
5454 * are in the list, so we push this object that will represent
5455 * the multi-bulk length in the output buffer, and will "fix"
5456 * it later */
5457 if (!justcount) {
5458 lenobj = createObject(REDIS_STRING,NULL);
5459 addReply(c,lenobj);
5460 decrRefCount(lenobj);
5461 }
5462
5463 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5464 if (offset) {
5465 offset--;
5466 ln = ln->forward[0];
5467 continue;
5468 }
5469 if (limit == 0) break;
5470 if (!justcount) {
5471 ele = ln->obj;
5472 addReplyBulkLen(c,ele);
5473 addReply(c,ele);
5474 addReply(c,shared.crlf);
5475 if (withscores)
5476 addReplyDouble(c,ln->score);
5477 }
5478 ln = ln->forward[0];
5479 rangelen++;
5480 if (limit > 0) limit--;
5481 }
5482 if (justcount) {
5483 addReplyLong(c,(long)rangelen);
5484 } else {
5485 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5486 withscores ? (rangelen*2) : rangelen);
5487 }
5488 }
5489 }
5490 }
5491
5492 static void zrangebyscoreCommand(redisClient *c) {
5493 genericZrangebyscoreCommand(c,0);
5494 }
5495
5496 static void zcountCommand(redisClient *c) {
5497 genericZrangebyscoreCommand(c,1);
5498 }
5499
5500 static void zcardCommand(redisClient *c) {
5501 robj *o;
5502 zset *zs;
5503
5504 o = lookupKeyRead(c->db,c->argv[1]);
5505 if (o == NULL) {
5506 addReply(c,shared.czero);
5507 return;
5508 } else {
5509 if (o->type != REDIS_ZSET) {
5510 addReply(c,shared.wrongtypeerr);
5511 } else {
5512 zs = o->ptr;
5513 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5514 }
5515 }
5516 }
5517
5518 static void zscoreCommand(redisClient *c) {
5519 robj *o;
5520 zset *zs;
5521
5522 o = lookupKeyRead(c->db,c->argv[1]);
5523 if (o == NULL) {
5524 addReply(c,shared.nullbulk);
5525 return;
5526 } else {
5527 if (o->type != REDIS_ZSET) {
5528 addReply(c,shared.wrongtypeerr);
5529 } else {
5530 dictEntry *de;
5531
5532 zs = o->ptr;
5533 de = dictFind(zs->dict,c->argv[2]);
5534 if (!de) {
5535 addReply(c,shared.nullbulk);
5536 } else {
5537 double *score = dictGetEntryVal(de);
5538
5539 addReplyDouble(c,*score);
5540 }
5541 }
5542 }
5543 }
5544
5545 static void zrankCommand(redisClient *c) {
5546 robj *o;
5547 o = lookupKeyRead(c->db,c->argv[1]);
5548 if (o == NULL) {
5549 addReply(c,shared.nullbulk);
5550 return;
5551 }
5552 if (o->type != REDIS_ZSET) {
5553 addReply(c,shared.wrongtypeerr);
5554 } else {
5555 zset *zs = o->ptr;
5556 zskiplist *zsl = zs->zsl;
5557 dictEntry *de;
5558 unsigned long rank;
5559
5560 de = dictFind(zs->dict,c->argv[2]);
5561 if (!de) {
5562 addReply(c,shared.nullbulk);
5563 return;
5564 }
5565
5566 double *score = dictGetEntryVal(de);
5567 rank = zslGetRank(zsl, *score, c->argv[2]);
5568 if (rank) {
5569 addReplyLong(c, rank-1);
5570 } else {
5571 addReply(c,shared.nullbulk);
5572 }
5573 }
5574 }
5575
5576 /* ========================= Non type-specific commands ==================== */
5577
5578 static void flushdbCommand(redisClient *c) {
5579 server.dirty += dictSize(c->db->dict);
5580 dictEmpty(c->db->dict);
5581 dictEmpty(c->db->expires);
5582 addReply(c,shared.ok);
5583 }
5584
5585 static void flushallCommand(redisClient *c) {
5586 server.dirty += emptyDb();
5587 addReply(c,shared.ok);
5588 rdbSave(server.dbfilename);
5589 server.dirty++;
5590 }
5591
5592 static redisSortOperation *createSortOperation(int type, robj *pattern) {
5593 redisSortOperation *so = zmalloc(sizeof(*so));
5594 so->type = type;
5595 so->pattern = pattern;
5596 return so;
5597 }
5598
5599 /* Return the value associated to the key with a name obtained
5600 * substituting the first occurence of '*' in 'pattern' with 'subst' */
5601 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
5602 char *p;
5603 sds spat, ssub;
5604 robj keyobj;
5605 int prefixlen, sublen, postfixlen;
5606 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5607 struct {
5608 long len;
5609 long free;
5610 char buf[REDIS_SORTKEY_MAX+1];
5611 } keyname;
5612
5613 /* If the pattern is "#" return the substitution object itself in order
5614 * to implement the "SORT ... GET #" feature. */
5615 spat = pattern->ptr;
5616 if (spat[0] == '#' && spat[1] == '\0') {
5617 return subst;
5618 }
5619
5620 /* The substitution object may be specially encoded. If so we create
5621 * a decoded object on the fly. Otherwise getDecodedObject will just
5622 * increment the ref count, that we'll decrement later. */
5623 subst = getDecodedObject(subst);
5624
5625 ssub = subst->ptr;
5626 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5627 p = strchr(spat,'*');
5628 if (!p) {
5629 decrRefCount(subst);
5630 return NULL;
5631 }
5632
5633 prefixlen = p-spat;
5634 sublen = sdslen(ssub);
5635 postfixlen = sdslen(spat)-(prefixlen+1);
5636 memcpy(keyname.buf,spat,prefixlen);
5637 memcpy(keyname.buf+prefixlen,ssub,sublen);
5638 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5639 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5640 keyname.len = prefixlen+sublen+postfixlen;
5641
5642 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
5643 decrRefCount(subst);
5644
5645 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5646 return lookupKeyRead(db,&keyobj);
5647 }
5648
5649 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5650 * the additional parameter is not standard but a BSD-specific we have to
5651 * pass sorting parameters via the global 'server' structure */
5652 static int sortCompare(const void *s1, const void *s2) {
5653 const redisSortObject *so1 = s1, *so2 = s2;
5654 int cmp;
5655
5656 if (!server.sort_alpha) {
5657 /* Numeric sorting. Here it's trivial as we precomputed scores */
5658 if (so1->u.score > so2->u.score) {
5659 cmp = 1;
5660 } else if (so1->u.score < so2->u.score) {
5661 cmp = -1;
5662 } else {
5663 cmp = 0;
5664 }
5665 } else {
5666 /* Alphanumeric sorting */
5667 if (server.sort_bypattern) {
5668 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5669 /* At least one compare object is NULL */
5670 if (so1->u.cmpobj == so2->u.cmpobj)
5671 cmp = 0;
5672 else if (so1->u.cmpobj == NULL)
5673 cmp = -1;
5674 else
5675 cmp = 1;
5676 } else {
5677 /* We have both the objects, use strcoll */
5678 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5679 }
5680 } else {
5681 /* Compare elements directly */
5682 robj *dec1, *dec2;
5683
5684 dec1 = getDecodedObject(so1->obj);
5685 dec2 = getDecodedObject(so2->obj);
5686 cmp = strcoll(dec1->ptr,dec2->ptr);
5687 decrRefCount(dec1);
5688 decrRefCount(dec2);
5689 }
5690 }
5691 return server.sort_desc ? -cmp : cmp;
5692 }
5693
5694 /* The SORT command is the most complex command in Redis. Warning: this code
5695 * is optimized for speed and a bit less for readability */
5696 static void sortCommand(redisClient *c) {
5697 list *operations;
5698 int outputlen = 0;
5699 int desc = 0, alpha = 0;
5700 int limit_start = 0, limit_count = -1, start, end;
5701 int j, dontsort = 0, vectorlen;
5702 int getop = 0; /* GET operation counter */
5703 robj *sortval, *sortby = NULL, *storekey = NULL;
5704 redisSortObject *vector; /* Resulting vector to sort */
5705
5706 /* Lookup the key to sort. It must be of the right types */
5707 sortval = lookupKeyRead(c->db,c->argv[1]);
5708 if (sortval == NULL) {
5709 addReply(c,shared.nullmultibulk);
5710 return;
5711 }
5712 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5713 sortval->type != REDIS_ZSET)
5714 {
5715 addReply(c,shared.wrongtypeerr);
5716 return;
5717 }
5718
5719 /* Create a list of operations to perform for every sorted element.
5720 * Operations can be GET/DEL/INCR/DECR */
5721 operations = listCreate();
5722 listSetFreeMethod(operations,zfree);
5723 j = 2;
5724
5725 /* Now we need to protect sortval incrementing its count, in the future
5726 * SORT may have options able to overwrite/delete keys during the sorting
5727 * and the sorted key itself may get destroied */
5728 incrRefCount(sortval);
5729
5730 /* The SORT command has an SQL-alike syntax, parse it */
5731 while(j < c->argc) {
5732 int leftargs = c->argc-j-1;
5733 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5734 desc = 0;
5735 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5736 desc = 1;
5737 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5738 alpha = 1;
5739 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5740 limit_start = atoi(c->argv[j+1]->ptr);
5741 limit_count = atoi(c->argv[j+2]->ptr);
5742 j+=2;
5743 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5744 storekey = c->argv[j+1];
5745 j++;
5746 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5747 sortby = c->argv[j+1];
5748 /* If the BY pattern does not contain '*', i.e. it is constant,
5749 * we don't need to sort nor to lookup the weight keys. */
5750 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5751 j++;
5752 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5753 listAddNodeTail(operations,createSortOperation(
5754 REDIS_SORT_GET,c->argv[j+1]));
5755 getop++;
5756 j++;
5757 } else {
5758 decrRefCount(sortval);
5759 listRelease(operations);
5760 addReply(c,shared.syntaxerr);
5761 return;
5762 }
5763 j++;
5764 }
5765
5766 /* Load the sorting vector with all the objects to sort */
5767 switch(sortval->type) {
5768 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5769 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5770 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
5771 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
5772 }
5773 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
5774 j = 0;
5775
5776 if (sortval->type == REDIS_LIST) {
5777 list *list = sortval->ptr;
5778 listNode *ln;
5779 listIter li;
5780
5781 listRewind(list,&li);
5782 while((ln = listNext(&li))) {
5783 robj *ele = ln->value;
5784 vector[j].obj = ele;
5785 vector[j].u.score = 0;
5786 vector[j].u.cmpobj = NULL;
5787 j++;
5788 }
5789 } else {
5790 dict *set;
5791 dictIterator *di;
5792 dictEntry *setele;
5793
5794 if (sortval->type == REDIS_SET) {
5795 set = sortval->ptr;
5796 } else {
5797 zset *zs = sortval->ptr;
5798 set = zs->dict;
5799 }
5800
5801 di = dictGetIterator(set);
5802 while((setele = dictNext(di)) != NULL) {
5803 vector[j].obj = dictGetEntryKey(setele);
5804 vector[j].u.score = 0;
5805 vector[j].u.cmpobj = NULL;
5806 j++;
5807 }
5808 dictReleaseIterator(di);
5809 }
5810 redisAssert(j == vectorlen);
5811
5812 /* Now it's time to load the right scores in the sorting vector */
5813 if (dontsort == 0) {
5814 for (j = 0; j < vectorlen; j++) {
5815 if (sortby) {
5816 robj *byval;
5817
5818 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
5819 if (!byval || byval->type != REDIS_STRING) continue;
5820 if (alpha) {
5821 vector[j].u.cmpobj = getDecodedObject(byval);
5822 } else {
5823 if (byval->encoding == REDIS_ENCODING_RAW) {
5824 vector[j].u.score = strtod(byval->ptr,NULL);
5825 } else {
5826 /* Don't need to decode the object if it's
5827 * integer-encoded (the only encoding supported) so
5828 * far. We can just cast it */
5829 if (byval->encoding == REDIS_ENCODING_INT) {
5830 vector[j].u.score = (long)byval->ptr;
5831 } else
5832 redisAssert(1 != 1);
5833 }
5834 }
5835 } else {
5836 if (!alpha) {
5837 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5838 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5839 else {
5840 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5841 vector[j].u.score = (long) vector[j].obj->ptr;
5842 else
5843 redisAssert(1 != 1);
5844 }
5845 }
5846 }
5847 }
5848 }
5849
5850 /* We are ready to sort the vector... perform a bit of sanity check
5851 * on the LIMIT option too. We'll use a partial version of quicksort. */
5852 start = (limit_start < 0) ? 0 : limit_start;
5853 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
5854 if (start >= vectorlen) {
5855 start = vectorlen-1;
5856 end = vectorlen-2;
5857 }
5858 if (end >= vectorlen) end = vectorlen-1;
5859
5860 if (dontsort == 0) {
5861 server.sort_desc = desc;
5862 server.sort_alpha = alpha;
5863 server.sort_bypattern = sortby ? 1 : 0;
5864 if (sortby && (start != 0 || end != vectorlen-1))
5865 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5866 else
5867 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
5868 }
5869
5870 /* Send command output to the output buffer, performing the specified
5871 * GET/DEL/INCR/DECR operations if any. */
5872 outputlen = getop ? getop*(end-start+1) : end-start+1;
5873 if (storekey == NULL) {
5874 /* STORE option not specified, sent the sorting result to client */
5875 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5876 for (j = start; j <= end; j++) {
5877 listNode *ln;
5878 listIter li;
5879
5880 if (!getop) {
5881 addReplyBulkLen(c,vector[j].obj);
5882 addReply(c,vector[j].obj);
5883 addReply(c,shared.crlf);
5884 }
5885 listRewind(operations,&li);
5886 while((ln = listNext(&li))) {
5887 redisSortOperation *sop = ln->value;
5888 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5889 vector[j].obj);
5890
5891 if (sop->type == REDIS_SORT_GET) {
5892 if (!val || val->type != REDIS_STRING) {
5893 addReply(c,shared.nullbulk);
5894 } else {
5895 addReplyBulkLen(c,val);
5896 addReply(c,val);
5897 addReply(c,shared.crlf);
5898 }
5899 } else {
5900 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
5901 }
5902 }
5903 }
5904 } else {
5905 robj *listObject = createListObject();
5906 list *listPtr = (list*) listObject->ptr;
5907
5908 /* STORE option specified, set the sorting result as a List object */
5909 for (j = start; j <= end; j++) {
5910 listNode *ln;
5911 listIter li;
5912
5913 if (!getop) {
5914 listAddNodeTail(listPtr,vector[j].obj);
5915 incrRefCount(vector[j].obj);
5916 }
5917 listRewind(operations,&li);
5918 while((ln = listNext(&li))) {
5919 redisSortOperation *sop = ln->value;
5920 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5921 vector[j].obj);
5922
5923 if (sop->type == REDIS_SORT_GET) {
5924 if (!val || val->type != REDIS_STRING) {
5925 listAddNodeTail(listPtr,createStringObject("",0));
5926 } else {
5927 listAddNodeTail(listPtr,val);
5928 incrRefCount(val);
5929 }
5930 } else {
5931 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
5932 }
5933 }
5934 }
5935 if (dictReplace(c->db->dict,storekey,listObject)) {
5936 incrRefCount(storekey);
5937 }
5938 /* Note: we add 1 because the DB is dirty anyway since even if the
5939 * SORT result is empty a new key is set and maybe the old content
5940 * replaced. */
5941 server.dirty += 1+outputlen;
5942 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
5943 }
5944
5945 /* Cleanup */
5946 decrRefCount(sortval);
5947 listRelease(operations);
5948 for (j = 0; j < vectorlen; j++) {
5949 if (sortby && alpha && vector[j].u.cmpobj)
5950 decrRefCount(vector[j].u.cmpobj);
5951 }
5952 zfree(vector);
5953 }
5954
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes to describe.
 * Values below 1024 are printed as a plain integer followed by 'B';
 * larger values are printed with two decimals and a K, M, G or T suffix.
 * Everything from one terabyte up is expressed in 'T', so the buffer is
 * always written whatever the value of 'n' is. The longest possible
 * output ("16777216.00T") needs 13 bytes including the terminator. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte or more: without this branch 's' would be left
         * untouched and the caller would print garbage. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5975
5976 /* Create the string returned by the INFO command. This is decoupled
5977 * by the INFO command itself as we need to report the same information
5978 * on memory corruption problems. */
5979 static sds genRedisInfoString(void) {
5980 sds info;
5981 time_t uptime = time(NULL)-server.stat_starttime;
5982 int j;
5983 char hmem[64];
5984
5985 bytesToHuman(hmem,zmalloc_used_memory());
5986 info = sdscatprintf(sdsempty(),
5987 "redis_version:%s\r\n"
5988 "arch_bits:%s\r\n"
5989 "multiplexing_api:%s\r\n"
5990 "process_id:%ld\r\n"
5991 "uptime_in_seconds:%ld\r\n"
5992 "uptime_in_days:%ld\r\n"
5993 "connected_clients:%d\r\n"
5994 "connected_slaves:%d\r\n"
5995 "blocked_clients:%d\r\n"
5996 "used_memory:%zu\r\n"
5997 "used_memory_human:%s\r\n"
5998 "changes_since_last_save:%lld\r\n"
5999 "bgsave_in_progress:%d\r\n"
6000 "last_save_time:%ld\r\n"
6001 "bgrewriteaof_in_progress:%d\r\n"
6002 "total_connections_received:%lld\r\n"
6003 "total_commands_processed:%lld\r\n"
6004 "vm_enabled:%d\r\n"
6005 "role:%s\r\n"
6006 ,REDIS_VERSION,
6007 (sizeof(long) == 8) ? "64" : "32",
6008 aeGetApiName(),
6009 (long) getpid(),
6010 uptime,
6011 uptime/(3600*24),
6012 listLength(server.clients)-listLength(server.slaves),
6013 listLength(server.slaves),
6014 server.blpop_blocked_clients,
6015 zmalloc_used_memory(),
6016 hmem,
6017 server.dirty,
6018 server.bgsavechildpid != -1,
6019 server.lastsave,
6020 server.bgrewritechildpid != -1,
6021 server.stat_numconnections,
6022 server.stat_numcommands,
6023 server.vm_enabled != 0,
6024 server.masterhost == NULL ? "master" : "slave"
6025 );
6026 if (server.masterhost) {
6027 info = sdscatprintf(info,
6028 "master_host:%s\r\n"
6029 "master_port:%d\r\n"
6030 "master_link_status:%s\r\n"
6031 "master_last_io_seconds_ago:%d\r\n"
6032 ,server.masterhost,
6033 server.masterport,
6034 (server.replstate == REDIS_REPL_CONNECTED) ?
6035 "up" : "down",
6036 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6037 );
6038 }
6039 if (server.vm_enabled) {
6040 lockThreadedIO();
6041 info = sdscatprintf(info,
6042 "vm_conf_max_memory:%llu\r\n"
6043 "vm_conf_page_size:%llu\r\n"
6044 "vm_conf_pages:%llu\r\n"
6045 "vm_stats_used_pages:%llu\r\n"
6046 "vm_stats_swapped_objects:%llu\r\n"
6047 "vm_stats_swappin_count:%llu\r\n"
6048 "vm_stats_swappout_count:%llu\r\n"
6049 "vm_stats_io_newjobs_len:%lu\r\n"
6050 "vm_stats_io_processing_len:%lu\r\n"
6051 "vm_stats_io_processed_len:%lu\r\n"
6052 "vm_stats_io_active_threads:%lu\r\n"
6053 "vm_stats_blocked_clients:%lu\r\n"
6054 ,(unsigned long long) server.vm_max_memory,
6055 (unsigned long long) server.vm_page_size,
6056 (unsigned long long) server.vm_pages,
6057 (unsigned long long) server.vm_stats_used_pages,
6058 (unsigned long long) server.vm_stats_swapped_objects,
6059 (unsigned long long) server.vm_stats_swapins,
6060 (unsigned long long) server.vm_stats_swapouts,
6061 (unsigned long) listLength(server.io_newjobs),
6062 (unsigned long) listLength(server.io_processing),
6063 (unsigned long) listLength(server.io_processed),
6064 (unsigned long) server.io_active_threads,
6065 (unsigned long) server.vm_blocked_clients
6066 );
6067 unlockThreadedIO();
6068 }
6069 for (j = 0; j < server.dbnum; j++) {
6070 long long keys, vkeys;
6071
6072 keys = dictSize(server.db[j].dict);
6073 vkeys = dictSize(server.db[j].expires);
6074 if (keys || vkeys) {
6075 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6076 j, keys, vkeys);
6077 }
6078 }
6079 return info;
6080 }
6081
6082 static void infoCommand(redisClient *c) {
6083 sds info = genRedisInfoString();
6084 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6085 (unsigned long)sdslen(info)));
6086 addReplySds(c,info);
6087 addReply(c,shared.crlf);
6088 }
6089
6090 static void monitorCommand(redisClient *c) {
6091 /* ignore MONITOR if aleady slave or in monitor mode */
6092 if (c->flags & REDIS_SLAVE) return;
6093
6094 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6095 c->slaveseldb = 0;
6096 listAddNodeTail(server.monitors,c);
6097 addReply(c,shared.ok);
6098 }
6099
6100 /* ================================= Expire ================================= */
6101 static int removeExpire(redisDb *db, robj *key) {
6102 if (dictDelete(db->expires,key) == DICT_OK) {
6103 return 1;
6104 } else {
6105 return 0;
6106 }
6107 }
6108
6109 static int setExpire(redisDb *db, robj *key, time_t when) {
6110 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6111 return 0;
6112 } else {
6113 incrRefCount(key);
6114 return 1;
6115 }
6116 }
6117
6118 /* Return the expire time of the specified key, or -1 if no expire
6119 * is associated with this key (i.e. the key is non volatile) */
6120 static time_t getExpire(redisDb *db, robj *key) {
6121 dictEntry *de;
6122
6123 /* No expire? return ASAP */
6124 if (dictSize(db->expires) == 0 ||
6125 (de = dictFind(db->expires,key)) == NULL) return -1;
6126
6127 return (time_t) dictGetEntryVal(de);
6128 }
6129
6130 static int expireIfNeeded(redisDb *db, robj *key) {
6131 time_t when;
6132 dictEntry *de;
6133
6134 /* No expire? return ASAP */
6135 if (dictSize(db->expires) == 0 ||
6136 (de = dictFind(db->expires,key)) == NULL) return 0;
6137
6138 /* Lookup the expire */
6139 when = (time_t) dictGetEntryVal(de);
6140 if (time(NULL) <= when) return 0;
6141
6142 /* Delete the key */
6143 dictDelete(db->expires,key);
6144 return dictDelete(db->dict,key) == DICT_OK;
6145 }
6146
6147 static int deleteIfVolatile(redisDb *db, robj *key) {
6148 dictEntry *de;
6149
6150 /* No expire? return ASAP */
6151 if (dictSize(db->expires) == 0 ||
6152 (de = dictFind(db->expires,key)) == NULL) return 0;
6153
6154 /* Delete the key */
6155 server.dirty++;
6156 dictDelete(db->expires,key);
6157 return dictDelete(db->dict,key) == DICT_OK;
6158 }
6159
6160 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6161 dictEntry *de;
6162
6163 de = dictFind(c->db->dict,key);
6164 if (de == NULL) {
6165 addReply(c,shared.czero);
6166 return;
6167 }
6168 if (seconds < 0) {
6169 if (deleteKey(c->db,key)) server.dirty++;
6170 addReply(c, shared.cone);
6171 return;
6172 } else {
6173 time_t when = time(NULL)+seconds;
6174 if (setExpire(c->db,key,when)) {
6175 addReply(c,shared.cone);
6176 server.dirty++;
6177 } else {
6178 addReply(c,shared.czero);
6179 }
6180 return;
6181 }
6182 }
6183
6184 static void expireCommand(redisClient *c) {
6185 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6186 }
6187
6188 static void expireatCommand(redisClient *c) {
6189 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6190 }
6191
6192 static void ttlCommand(redisClient *c) {
6193 time_t expire;
6194 int ttl = -1;
6195
6196 expire = getExpire(c->db,c->argv[1]);
6197 if (expire != -1) {
6198 ttl = (int) (expire-time(NULL));
6199 if (ttl < 0) ttl = -1;
6200 }
6201 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6202 }
6203
6204 /* ================================ MULTI/EXEC ============================== */
6205
6206 /* Client state initialization for MULTI/EXEC */
6207 static void initClientMultiState(redisClient *c) {
6208 c->mstate.commands = NULL;
6209 c->mstate.count = 0;
6210 }
6211
6212 /* Release all the resources associated with MULTI/EXEC state */
6213 static void freeClientMultiState(redisClient *c) {
6214 int j;
6215
6216 for (j = 0; j < c->mstate.count; j++) {
6217 int i;
6218 multiCmd *mc = c->mstate.commands+j;
6219
6220 for (i = 0; i < mc->argc; i++)
6221 decrRefCount(mc->argv[i]);
6222 zfree(mc->argv);
6223 }
6224 zfree(c->mstate.commands);
6225 }
6226
6227 /* Add a new command into the MULTI commands queue */
6228 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6229 multiCmd *mc;
6230 int j;
6231
6232 c->mstate.commands = zrealloc(c->mstate.commands,
6233 sizeof(multiCmd)*(c->mstate.count+1));
6234 mc = c->mstate.commands+c->mstate.count;
6235 mc->cmd = cmd;
6236 mc->argc = c->argc;
6237 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6238 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6239 for (j = 0; j < c->argc; j++)
6240 incrRefCount(mc->argv[j]);
6241 c->mstate.count++;
6242 }
6243
6244 static void multiCommand(redisClient *c) {
6245 c->flags |= REDIS_MULTI;
6246 addReply(c,shared.ok);
6247 }
6248
6249 static void discardCommand(redisClient *c) {
6250 if (!(c->flags & REDIS_MULTI)) {
6251 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6252 return;
6253 }
6254
6255 freeClientMultiState(c);
6256 initClientMultiState(c);
6257 c->flags &= (~REDIS_MULTI);
6258 addReply(c,shared.ok);
6259 }
6260
6261 static void execCommand(redisClient *c) {
6262 int j;
6263 robj **orig_argv;
6264 int orig_argc;
6265
6266 if (!(c->flags & REDIS_MULTI)) {
6267 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6268 return;
6269 }
6270
6271 orig_argv = c->argv;
6272 orig_argc = c->argc;
6273 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6274 for (j = 0; j < c->mstate.count; j++) {
6275 c->argc = c->mstate.commands[j].argc;
6276 c->argv = c->mstate.commands[j].argv;
6277 call(c,c->mstate.commands[j].cmd);
6278 }
6279 c->argv = orig_argv;
6280 c->argc = orig_argc;
6281 freeClientMultiState(c);
6282 initClientMultiState(c);
6283 c->flags &= (~REDIS_MULTI);
6284 }
6285
6286 /* =========================== Blocking Operations ========================= */
6287
6288 /* Currently Redis blocking operations support is limited to list POP ops,
6289 * so the current implementation is not fully generic, but it is also not
6290 * completely specific so it will not require a rewrite to support new
6291 * kind of blocking operations in the future.
6292 *
6293 * Still it's important to note that list blocking operations can be already
6294 * used as a notification mechanism in order to implement other blocking
6295 * operations at application level, so there must be a very strong evidence
6296 * of usefulness and generality before new blocking operations are implemented.
6297 *
6298 * This is how the current blocking POP works, we use BLPOP as example:
6299 * - If the user calls BLPOP and the key exists and contains a non empty list
6300 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6301 * if there is no need to block.
6302 * - If instead BLPOP is called and the key does not exist or the list is
6303 * empty we need to block. In order to do so we remove the notification for
6304 * new data to read in the client socket (so that we'll not serve new
6305 * requests if the blocking request is not served). Also we put the client
6306 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6307 * blocking for these keys.
6308 * - If a PUSH operation against a key with blocked clients waiting is
6309 * performed, we serve the first in the list: basically instead of pushing
6310 * the new element inside the list we return it to the (first / oldest)
6311 * blocking client, unblock the client, and remove it from the list.
6312 *
6313 * The above comment and the source code should be enough in order to understand
6314 * the implementation and modify / fix it later.
6315 */
6316
6317 /* Set a client in blocking mode for the specified key, with the specified
6318 * timeout */
6319 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6320 dictEntry *de;
6321 list *l;
6322 int j;
6323
6324 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6325 c->blockingkeysnum = numkeys;
6326 c->blockingto = timeout;
6327 for (j = 0; j < numkeys; j++) {
6328 /* Add the key in the client structure, to map clients -> keys */
6329 c->blockingkeys[j] = keys[j];
6330 incrRefCount(keys[j]);
6331
6332 /* And in the other "side", to map keys -> clients */
6333 de = dictFind(c->db->blockingkeys,keys[j]);
6334 if (de == NULL) {
6335 int retval;
6336
6337 /* For every key we take a list of clients blocked for it */
6338 l = listCreate();
6339 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6340 incrRefCount(keys[j]);
6341 assert(retval == DICT_OK);
6342 } else {
6343 l = dictGetEntryVal(de);
6344 }
6345 listAddNodeTail(l,c);
6346 }
6347 /* Mark the client as a blocked client */
6348 c->flags |= REDIS_BLOCKED;
6349 server.blpop_blocked_clients++;
6350 }
6351
6352 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6353 static void unblockClientWaitingData(redisClient *c) {
6354 dictEntry *de;
6355 list *l;
6356 int j;
6357
6358 assert(c->blockingkeys != NULL);
6359 /* The client may wait for multiple keys, so unblock it for every key. */
6360 for (j = 0; j < c->blockingkeysnum; j++) {
6361 /* Remove this client from the list of clients waiting for this key. */
6362 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6363 assert(de != NULL);
6364 l = dictGetEntryVal(de);
6365 listDelNode(l,listSearchKey(l,c));
6366 /* If the list is empty we need to remove it to avoid wasting memory */
6367 if (listLength(l) == 0)
6368 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6369 decrRefCount(c->blockingkeys[j]);
6370 }
6371 /* Cleanup the client structure */
6372 zfree(c->blockingkeys);
6373 c->blockingkeys = NULL;
6374 c->flags &= (~REDIS_BLOCKED);
6375 server.blpop_blocked_clients--;
6376 /* We want to process data if there is some command waiting
6377 * in the input buffer. Note that this is safe even if
6378 * unblockClientWaitingData() gets called from freeClient() because
6379 * freeClient() will be smart enough to call this function
6380 * *after* c->querybuf was set to NULL. */
6381 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6382 }
6383
6384 /* This should be called from any function PUSHing into lists.
6385 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6386 * 'ele' is the element pushed.
6387 *
6388 * If the function returns 0 there was no client waiting for a list push
6389 * against this key.
6390 *
6391 * If the function returns 1 there was a client waiting for a list push
6392 * against this key, the element was passed to this client thus it's not
6393 * needed to actually add it to the list and the caller should return asap. */
6394 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6395 struct dictEntry *de;
6396 redisClient *receiver;
6397 list *l;
6398 listNode *ln;
6399
6400 de = dictFind(c->db->blockingkeys,key);
6401 if (de == NULL) return 0;
6402 l = dictGetEntryVal(de);
6403 ln = listFirst(l);
6404 assert(ln != NULL);
6405 receiver = ln->value;
6406
6407 addReplySds(receiver,sdsnew("*2\r\n"));
6408 addReplyBulkLen(receiver,key);
6409 addReply(receiver,key);
6410 addReply(receiver,shared.crlf);
6411 addReplyBulkLen(receiver,ele);
6412 addReply(receiver,ele);
6413 addReply(receiver,shared.crlf);
6414 unblockClientWaitingData(receiver);
6415 return 1;
6416 }
6417
6418 /* Blocking RPOP/LPOP */
6419 static void blockingPopGenericCommand(redisClient *c, int where) {
6420 robj *o;
6421 time_t timeout;
6422 int j;
6423
6424 for (j = 1; j < c->argc-1; j++) {
6425 o = lookupKeyWrite(c->db,c->argv[j]);
6426 if (o != NULL) {
6427 if (o->type != REDIS_LIST) {
6428 addReply(c,shared.wrongtypeerr);
6429 return;
6430 } else {
6431 list *list = o->ptr;
6432 if (listLength(list) != 0) {
6433 /* If the list contains elements fall back to the usual
6434 * non-blocking POP operation */
6435 robj *argv[2], **orig_argv;
6436 int orig_argc;
6437
6438 /* We need to alter the command arguments before to call
6439 * popGenericCommand() as the command takes a single key. */
6440 orig_argv = c->argv;
6441 orig_argc = c->argc;
6442 argv[1] = c->argv[j];
6443 c->argv = argv;
6444 c->argc = 2;
6445
6446 /* Also the return value is different, we need to output
6447 * the multi bulk reply header and the key name. The
6448 * "real" command will add the last element (the value)
6449 * for us. If this souds like an hack to you it's just
6450 * because it is... */
6451 addReplySds(c,sdsnew("*2\r\n"));
6452 addReplyBulkLen(c,argv[1]);
6453 addReply(c,argv[1]);
6454 addReply(c,shared.crlf);
6455 popGenericCommand(c,where);
6456
6457 /* Fix the client structure with the original stuff */
6458 c->argv = orig_argv;
6459 c->argc = orig_argc;
6460 return;
6461 }
6462 }
6463 }
6464 }
6465 /* If the list is empty or the key does not exists we must block */
6466 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
6467 if (timeout > 0) timeout += time(NULL);
6468 blockForKeys(c,c->argv+1,c->argc-2,timeout);
6469 }
6470
/* Command handler running the generic blocking pop with REDIS_HEAD as
 * the 'where' argument. */
static void blpopCommand(redisClient *c) {
    blockingPopGenericCommand(c,REDIS_HEAD);
}
6474
/* Command handler running the generic blocking pop with REDIS_TAIL as
 * the 'where' argument. */
static void brpopCommand(redisClient *c) {
    blockingPopGenericCommand(c,REDIS_TAIL);
}
6478
6479 /* =============================== Replication ============================= */
6480
6481 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
6482 ssize_t nwritten, ret = size;
6483 time_t start = time(NULL);
6484
6485 timeout++;
6486 while(size) {
6487 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6488 nwritten = write(fd,ptr,size);
6489 if (nwritten == -1) return -1;
6490 ptr += nwritten;
6491 size -= nwritten;
6492 }
6493 if ((time(NULL)-start) > timeout) {
6494 errno = ETIMEDOUT;
6495 return -1;
6496 }
6497 }
6498 return ret;
6499 }
6500
6501 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
6502 ssize_t nread, totread = 0;
6503 time_t start = time(NULL);
6504
6505 timeout++;
6506 while(size) {
6507 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6508 nread = read(fd,ptr,size);
6509 if (nread == -1) return -1;
6510 ptr += nread;
6511 size -= nread;
6512 totread += nread;
6513 }
6514 if ((time(NULL)-start) > timeout) {
6515 errno = ETIMEDOUT;
6516 return -1;
6517 }
6518 }
6519 return totread;
6520 }
6521
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte
 * at a time through syncRead() using 'timeout' for every byte.
 *
 * The line terminator ("\n", optionally preceded by "\r") is not stored
 * and the buffer is always null terminated. At most 'size'-1 characters
 * are stored: a longer line is truncated, the remaining part being left
 * unread on the descriptor.
 *
 * Returns the number of characters read before the newline (or before
 * running out of space), 0 without reading anything if 'size' is not
 * positive, or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0; /* No room, not even for the terminator */
    *ptr = '\0';
    size--; /* Reserve one byte for the null term */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer was consumed: without this the loop
         * would never stop on a line longer than the buffer. */
        size--;
    }
    return nread;
}
6542
6543 static void syncCommand(redisClient *c) {
6544 /* ignore SYNC if aleady slave or in monitor mode */
6545 if (c->flags & REDIS_SLAVE) return;
6546
6547 /* SYNC can't be issued when the server has pending data to send to
6548 * the client about already issued commands. We need a fresh reply
6549 * buffer registering the differences between the BGSAVE and the current
6550 * dataset, so that we can copy to other slaves if needed. */
6551 if (listLength(c->reply) != 0) {
6552 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6553 return;
6554 }
6555
6556 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6557 /* Here we need to check if there is a background saving operation
6558 * in progress, or if it is required to start one */
6559 if (server.bgsavechildpid != -1) {
6560 /* Ok a background save is in progress. Let's check if it is a good
6561 * one for replication, i.e. if there is another slave that is
6562 * registering differences since the server forked to save */
6563 redisClient *slave;
6564 listNode *ln;
6565 listIter li;
6566
6567 listRewind(server.slaves,&li);
6568 while((ln = listNext(&li))) {
6569 slave = ln->value;
6570 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
6571 }
6572 if (ln) {
6573 /* Perfect, the server is already registering differences for
6574 * another slave. Set the right state, and copy the buffer. */
6575 listRelease(c->reply);
6576 c->reply = listDup(slave->reply);
6577 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6578 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6579 } else {
6580 /* No way, we need to wait for the next BGSAVE in order to
6581 * register differences */
6582 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6583 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6584 }
6585 } else {
6586 /* Ok we don't have a BGSAVE in progress, let's start one */
6587 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6588 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6589 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6590 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6591 return;
6592 }
6593 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6594 }
6595 c->repldbfd = -1;
6596 c->flags |= REDIS_SLAVE;
6597 c->slaveseldb = 0;
6598 listAddNodeTail(server.slaves,c);
6599 return;
6600 }
6601
6602 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6603 redisClient *slave = privdata;
6604 REDIS_NOTUSED(el);
6605 REDIS_NOTUSED(mask);
6606 char buf[REDIS_IOBUF_LEN];
6607 ssize_t nwritten, buflen;
6608
6609 if (slave->repldboff == 0) {
6610 /* Write the bulk write count before to transfer the DB. In theory here
6611 * we don't know how much room there is in the output buffer of the
6612 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6613 * operations) will never be smaller than the few bytes we need. */
6614 sds bulkcount;
6615
6616 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6617 slave->repldbsize);
6618 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6619 {
6620 sdsfree(bulkcount);
6621 freeClient(slave);
6622 return;
6623 }
6624 sdsfree(bulkcount);
6625 }
6626 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6627 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6628 if (buflen <= 0) {
6629 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6630 (buflen == 0) ? "premature EOF" : strerror(errno));
6631 freeClient(slave);
6632 return;
6633 }
6634 if ((nwritten = write(fd,buf,buflen)) == -1) {
6635 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6636 strerror(errno));
6637 freeClient(slave);
6638 return;
6639 }
6640 slave->repldboff += nwritten;
6641 if (slave->repldboff == slave->repldbsize) {
6642 close(slave->repldbfd);
6643 slave->repldbfd = -1;
6644 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6645 slave->replstate = REDIS_REPL_ONLINE;
6646 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
6647 sendReplyToClient, slave) == AE_ERR) {
6648 freeClient(slave);
6649 return;
6650 }
6651 addReplySds(slave,sdsempty());
6652 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6653 }
6654 }
6655
6656 /* This function is called at the end of every backgrond saving.
6657 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6658 * otherwise REDIS_ERR is passed to the function.
6659 *
6660 * The goal of this function is to handle slaves waiting for a successful
6661 * background saving in order to perform non-blocking synchronization. */
6662 static void updateSlavesWaitingBgsave(int bgsaveerr) {
6663 listNode *ln;
6664 int startbgsave = 0;
6665 listIter li;
6666
6667 listRewind(server.slaves,&li);
6668 while((ln = listNext(&li))) {
6669 redisClient *slave = ln->value;
6670
6671 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6672 startbgsave = 1;
6673 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6674 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
6675 struct redis_stat buf;
6676
6677 if (bgsaveerr != REDIS_OK) {
6678 freeClient(slave);
6679 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6680 continue;
6681 }
6682 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
6683 redis_fstat(slave->repldbfd,&buf) == -1) {
6684 freeClient(slave);
6685 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6686 continue;
6687 }
6688 slave->repldboff = 0;
6689 slave->repldbsize = buf.st_size;
6690 slave->replstate = REDIS_REPL_SEND_BULK;
6691 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6692 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6693 freeClient(slave);
6694 continue;
6695 }
6696 }
6697 }
6698 if (startbgsave) {
6699 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6700 listIter li;
6701
6702 listRewind(server.slaves,&li);
6703 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6704 while((ln = listNext(&li))) {
6705 redisClient *slave = ln->value;
6706
6707 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6708 freeClient(slave);
6709 }
6710 }
6711 }
6712 }
6713
6714 static int syncWithMaster(void) {
6715 char buf[1024], tmpfile[256], authcmd[1024];
6716 int dumpsize;
6717 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6718 int dfd;
6719
6720 if (fd == -1) {
6721 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6722 strerror(errno));
6723 return REDIS_ERR;
6724 }
6725
6726 /* AUTH with the master if required. */
6727 if(server.masterauth) {
6728 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6729 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6730 close(fd);
6731 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6732 strerror(errno));
6733 return REDIS_ERR;
6734 }
6735 /* Read the AUTH result. */
6736 if (syncReadLine(fd,buf,1024,3600) == -1) {
6737 close(fd);
6738 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6739 strerror(errno));
6740 return REDIS_ERR;
6741 }
6742 if (buf[0] != '+') {
6743 close(fd);
6744 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6745 return REDIS_ERR;
6746 }
6747 }
6748
6749 /* Issue the SYNC command */
6750 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6751 close(fd);
6752 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6753 strerror(errno));
6754 return REDIS_ERR;
6755 }
6756 /* Read the bulk write count */
6757 if (syncReadLine(fd,buf,1024,3600) == -1) {
6758 close(fd);
6759 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6760 strerror(errno));
6761 return REDIS_ERR;
6762 }
6763 if (buf[0] != '$') {
6764 close(fd);
6765 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6766 return REDIS_ERR;
6767 }
6768 dumpsize = atoi(buf+1);
6769 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6770 /* Read the bulk write data on a temp file */
6771 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6772 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6773 if (dfd == -1) {
6774 close(fd);
6775 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6776 return REDIS_ERR;
6777 }
6778 while(dumpsize) {
6779 int nread, nwritten;
6780
6781 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6782 if (nread == -1) {
6783 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6784 strerror(errno));
6785 close(fd);
6786 close(dfd);
6787 return REDIS_ERR;
6788 }
6789 nwritten = write(dfd,buf,nread);
6790 if (nwritten == -1) {
6791 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6792 close(fd);
6793 close(dfd);
6794 return REDIS_ERR;
6795 }
6796 dumpsize -= nread;
6797 }
6798 close(dfd);
6799 if (rename(tmpfile,server.dbfilename) == -1) {
6800 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6801 unlink(tmpfile);
6802 close(fd);
6803 return REDIS_ERR;
6804 }
6805 emptyDb();
6806 if (rdbLoad(server.dbfilename) != REDIS_OK) {
6807 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6808 close(fd);
6809 return REDIS_ERR;
6810 }
6811 server.master = createClient(fd);
6812 server.master->flags |= REDIS_MASTER;
6813 server.master->authenticated = 1;
6814 server.replstate = REDIS_REPL_CONNECTED;
6815 return REDIS_OK;
6816 }
6817
6818 static void slaveofCommand(redisClient *c) {
6819 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6820 !strcasecmp(c->argv[2]->ptr,"one")) {
6821 if (server.masterhost) {
6822 sdsfree(server.masterhost);
6823 server.masterhost = NULL;
6824 if (server.master) freeClient(server.master);
6825 server.replstate = REDIS_REPL_NONE;
6826 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6827 }
6828 } else {
6829 sdsfree(server.masterhost);
6830 server.masterhost = sdsdup(c->argv[1]->ptr);
6831 server.masterport = atoi(c->argv[2]->ptr);
6832 if (server.master) freeClient(server.master);
6833 server.replstate = REDIS_REPL_CONNECT;
6834 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6835 server.masterhost, server.masterport);
6836 }
6837 addReply(c,shared.ok);
6838 }
6839
6840 /* ============================ Maxmemory directive ======================== */
6841
6842 /* Try to free one object form the pre-allocated objects free list.
6843 * This is useful under low mem conditions as by default we take 1 million
6844 * free objects allocated. On success REDIS_OK is returned, otherwise
6845 * REDIS_ERR. */
6846 static int tryFreeOneObjectFromFreelist(void) {
6847 robj *o;
6848
6849 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6850 if (listLength(server.objfreelist)) {
6851 listNode *head = listFirst(server.objfreelist);
6852 o = listNodeValue(head);
6853 listDelNode(server.objfreelist,head);
6854 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6855 zfree(o);
6856 return REDIS_OK;
6857 } else {
6858 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6859 return REDIS_ERR;
6860 }
6861 }
6862
6863 /* This function gets called when 'maxmemory' is set on the config file to limit
6864 * the max memory used by the server, and we are out of memory.
6865 * This function will try to, in order:
6866 *
6867 * - Free objects from the free list
6868 * - Try to remove keys with an EXPIRE set
6869 *
6870 * It is not possible to free enough memory to reach used-memory < maxmemory
6871 * the server will start refusing commands that will enlarge even more the
6872 * memory usage.
6873 */
6874 static void freeMemoryIfNeeded(void) {
6875 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
6876 int j, k, freed = 0;
6877
6878 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6879 for (j = 0; j < server.dbnum; j++) {
6880 int minttl = -1;
6881 robj *minkey = NULL;
6882 struct dictEntry *de;
6883
6884 if (dictSize(server.db[j].expires)) {
6885 freed = 1;
6886 /* From a sample of three keys drop the one nearest to
6887 * the natural expire */
6888 for (k = 0; k < 3; k++) {
6889 time_t t;
6890
6891 de = dictGetRandomKey(server.db[j].expires);
6892 t = (time_t) dictGetEntryVal(de);
6893 if (minttl == -1 || t < minttl) {
6894 minkey = dictGetEntryKey(de);
6895 minttl = t;
6896 }
6897 }
6898 deleteKey(server.db+j,minkey);
6899 }
6900 }
6901 if (!freed) return; /* nothing to free... */
6902 }
6903 }
6904
6905 /* ============================== Append Only file ========================== */
6906
6907 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6908 sds buf = sdsempty();
6909 int j;
6910 ssize_t nwritten;
6911 time_t now;
6912 robj *tmpargv[3];
6913
6914 /* The DB this command was targetting is not the same as the last command
6915 * we appendend. To issue a SELECT command is needed. */
6916 if (dictid != server.appendseldb) {
6917 char seldb[64];
6918
6919 snprintf(seldb,sizeof(seldb),"%d",dictid);
6920 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6921 (unsigned long)strlen(seldb),seldb);
6922 server.appendseldb = dictid;
6923 }
6924
6925 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6926 * EXPIREs into EXPIREATs calls */
6927 if (cmd->proc == expireCommand) {
6928 long when;
6929
6930 tmpargv[0] = createStringObject("EXPIREAT",8);
6931 tmpargv[1] = argv[1];
6932 incrRefCount(argv[1]);
6933 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6934 tmpargv[2] = createObject(REDIS_STRING,
6935 sdscatprintf(sdsempty(),"%ld",when));
6936 argv = tmpargv;
6937 }
6938
6939 /* Append the actual command */
6940 buf = sdscatprintf(buf,"*%d\r\n",argc);
6941 for (j = 0; j < argc; j++) {
6942 robj *o = argv[j];
6943
6944 o = getDecodedObject(o);
6945 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
6946 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6947 buf = sdscatlen(buf,"\r\n",2);
6948 decrRefCount(o);
6949 }
6950
6951 /* Free the objects from the modified argv for EXPIREAT */
6952 if (cmd->proc == expireCommand) {
6953 for (j = 0; j < 3; j++)
6954 decrRefCount(argv[j]);
6955 }
6956
6957 /* We want to perform a single write. This should be guaranteed atomic
6958 * at least if the filesystem we are writing is a real physical one.
6959 * While this will save us against the server being killed I don't think
6960 * there is much to do about the whole server stopping for power problems
6961 * or alike */
6962 nwritten = write(server.appendfd,buf,sdslen(buf));
6963 if (nwritten != (signed)sdslen(buf)) {
6964 /* Ooops, we are in troubles. The best thing to do for now is
6965 * to simply exit instead to give the illusion that everything is
6966 * working as expected. */
6967 if (nwritten == -1) {
6968 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6969 } else {
6970 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6971 }
6972 exit(1);
6973 }
6974 /* If a background append only file rewriting is in progress we want to
6975 * accumulate the differences between the child DB and the current one
6976 * in a buffer, so that when the child process will do its work we
6977 * can append the differences to the new append only file. */
6978 if (server.bgrewritechildpid != -1)
6979 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6980
6981 sdsfree(buf);
6982 now = time(NULL);
6983 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6984 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6985 now-server.lastfsync > 1))
6986 {
6987 fsync(server.appendfd); /* Let's try to get this data on the disk */
6988 server.lastfsync = now;
6989 }
6990 }
6991
6992 /* In Redis commands are always executed in the context of a client, so in
6993 * order to load the append only file we need to create a fake client. */
6994 static struct redisClient *createFakeClient(void) {
6995 struct redisClient *c = zmalloc(sizeof(*c));
6996
6997 selectDb(c,0);
6998 c->fd = -1;
6999 c->querybuf = sdsempty();
7000 c->argc = 0;
7001 c->argv = NULL;
7002 c->flags = 0;
7003 /* We set the fake client as a slave waiting for the synchronization
7004 * so that Redis will not try to send replies to this client. */
7005 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7006 c->reply = listCreate();
7007 listSetFreeMethod(c->reply,decrRefCount);
7008 listSetDupMethod(c->reply,dupClientReplyValue);
7009 return c;
7010 }
7011
7012 static void freeFakeClient(struct redisClient *c) {
7013 sdsfree(c->querybuf);
7014 listRelease(c->reply);
7015 zfree(c);
7016 }
7017
7018 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7019 * error (the append only file is zero-length) REDIS_ERR is returned. On
7020 * fatal error an error message is logged and the program exists. */
7021 int loadAppendOnlyFile(char *filename) {
7022 struct redisClient *fakeClient;
7023 FILE *fp = fopen(filename,"r");
7024 struct redis_stat sb;
7025 unsigned long long loadedkeys = 0;
7026
7027 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7028 return REDIS_ERR;
7029
7030 if (fp == NULL) {
7031 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7032 exit(1);
7033 }
7034
7035 fakeClient = createFakeClient();
7036 while(1) {
7037 int argc, j;
7038 unsigned long len;
7039 robj **argv;
7040 char buf[128];
7041 sds argsds;
7042 struct redisCommand *cmd;
7043
7044 if (fgets(buf,sizeof(buf),fp) == NULL) {
7045 if (feof(fp))
7046 break;
7047 else
7048 goto readerr;
7049 }
7050 if (buf[0] != '*') goto fmterr;
7051 argc = atoi(buf+1);
7052 argv = zmalloc(sizeof(robj*)*argc);
7053 for (j = 0; j < argc; j++) {
7054 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7055 if (buf[0] != '$') goto fmterr;
7056 len = strtol(buf+1,NULL,10);
7057 argsds = sdsnewlen(NULL,len);
7058 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7059 argv[j] = createObject(REDIS_STRING,argsds);
7060 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7061 }
7062
7063 /* Command lookup */
7064 cmd = lookupCommand(argv[0]->ptr);
7065 if (!cmd) {
7066 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7067 exit(1);
7068 }
7069 /* Try object sharing and encoding */
7070 if (server.shareobjects) {
7071 int j;
7072 for(j = 1; j < argc; j++)
7073 argv[j] = tryObjectSharing(argv[j]);
7074 }
7075 if (cmd->flags & REDIS_CMD_BULK)
7076 tryObjectEncoding(argv[argc-1]);
7077 /* Run the command in the context of a fake client */
7078 fakeClient->argc = argc;
7079 fakeClient->argv = argv;
7080 cmd->proc(fakeClient);
7081 /* Discard the reply objects list from the fake client */
7082 while(listLength(fakeClient->reply))
7083 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7084 /* Clean up, ready for the next command */
7085 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7086 zfree(argv);
7087 /* Handle swapping while loading big datasets when VM is on */
7088 loadedkeys++;
7089 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7090 while (zmalloc_used_memory() > server.vm_max_memory) {
7091 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7092 }
7093 }
7094 }
7095 fclose(fp);
7096 freeFakeClient(fakeClient);
7097 return REDIS_OK;
7098
7099 readerr:
7100 if (feof(fp)) {
7101 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7102 } else {
7103 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7104 }
7105 exit(1);
7106 fmterr:
7107 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7108 exit(1);
7109 }
7110
7111 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7112 static int fwriteBulk(FILE *fp, robj *obj) {
7113 char buf[128];
7114 int decrrc = 0;
7115
7116 /* Avoid the incr/decr ref count business if possible to help
7117 * copy-on-write (we are often in a child process when this function
7118 * is called).
7119 * Also makes sure that key objects don't get incrRefCount-ed when VM
7120 * is enabled */
7121 if (obj->encoding != REDIS_ENCODING_RAW) {
7122 obj = getDecodedObject(obj);
7123 decrrc = 1;
7124 }
7125 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7126 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7127 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7128 goto err;
7129 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7130 if (decrrc) decrRefCount(obj);
7131 return 1;
7132 err:
7133 if (decrrc) decrRefCount(obj);
7134 return 0;
7135 }
7136
/* Write the double 'd' to 'fp' using the bulk format:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * The payload is 'd' formatted with "%.17g", and <count> is the length of
 * the payload without the trailing CRLF.
 * Returns 1 on success, 0 if one of the two fwrite() calls fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t payload_len;

    /* The payload buffer already includes the final CRLF. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payload_len,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7147
/* Write the long 'l' to 'fp' using the bulk format:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * The payload is 'l' in decimal, and <count> is the length of the payload
 * without the trailing CRLF.
 * Returns 1 on success, 0 if one of the two fwrite() calls fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t payload_len;

    /* The payload buffer already includes the final CRLF. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payload_len,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7158
7159 /* Write a sequence of commands able to fully rebuild the dataset into
7160 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7161 static int rewriteAppendOnlyFile(char *filename) {
7162 dictIterator *di = NULL;
7163 dictEntry *de;
7164 FILE *fp;
7165 char tmpfile[256];
7166 int j;
7167 time_t now = time(NULL);
7168
7169 /* Note that we have to use a different temp name here compared to the
7170 * one used by rewriteAppendOnlyFileBackground() function. */
7171 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7172 fp = fopen(tmpfile,"w");
7173 if (!fp) {
7174 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7175 return REDIS_ERR;
7176 }
7177 for (j = 0; j < server.dbnum; j++) {
7178 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7179 redisDb *db = server.db+j;
7180 dict *d = db->dict;
7181 if (dictSize(d) == 0) continue;
7182 di = dictGetIterator(d);
7183 if (!di) {
7184 fclose(fp);
7185 return REDIS_ERR;
7186 }
7187
7188 /* SELECT the new DB */
7189 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7190 if (fwriteBulkLong(fp,j) == 0) goto werr;
7191
7192 /* Iterate this DB writing every entry */
7193 while((de = dictNext(di)) != NULL) {
7194 robj *key, *o;
7195 time_t expiretime;
7196 int swapped;
7197
7198 key = dictGetEntryKey(de);
7199 /* If the value for this key is swapped, load a preview in memory.
7200 * We use a "swapped" flag to remember if we need to free the
7201 * value object instead to just increment the ref count anyway
7202 * in order to avoid copy-on-write of pages if we are forked() */
7203 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7204 key->storage == REDIS_VM_SWAPPING) {
7205 o = dictGetEntryVal(de);
7206 swapped = 0;
7207 } else {
7208 o = vmPreviewObject(key);
7209 swapped = 1;
7210 }
7211 expiretime = getExpire(db,key);
7212
7213 /* Save the key and associated value */
7214 if (o->type == REDIS_STRING) {
7215 /* Emit a SET command */
7216 char cmd[]="*3\r\n$3\r\nSET\r\n";
7217 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7218 /* Key and value */
7219 if (fwriteBulk(fp,key) == 0) goto werr;
7220 if (fwriteBulk(fp,o) == 0) goto werr;
7221 } else if (o->type == REDIS_LIST) {
7222 /* Emit the RPUSHes needed to rebuild the list */
7223 list *list = o->ptr;
7224 listNode *ln;
7225 listIter li;
7226
7227 listRewind(list,&li);
7228 while((ln = listNext(&li))) {
7229 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7230 robj *eleobj = listNodeValue(ln);
7231
7232 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7233 if (fwriteBulk(fp,key) == 0) goto werr;
7234 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7235 }
7236 } else if (o->type == REDIS_SET) {
7237 /* Emit the SADDs needed to rebuild the set */
7238 dict *set = o->ptr;
7239 dictIterator *di = dictGetIterator(set);
7240 dictEntry *de;
7241
7242 while((de = dictNext(di)) != NULL) {
7243 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7244 robj *eleobj = dictGetEntryKey(de);
7245
7246 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7247 if (fwriteBulk(fp,key) == 0) goto werr;
7248 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7249 }
7250 dictReleaseIterator(di);
7251 } else if (o->type == REDIS_ZSET) {
7252 /* Emit the ZADDs needed to rebuild the sorted set */
7253 zset *zs = o->ptr;
7254 dictIterator *di = dictGetIterator(zs->dict);
7255 dictEntry *de;
7256
7257 while((de = dictNext(di)) != NULL) {
7258 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7259 robj *eleobj = dictGetEntryKey(de);
7260 double *score = dictGetEntryVal(de);
7261
7262 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7263 if (fwriteBulk(fp,key) == 0) goto werr;
7264 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7265 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7266 }
7267 dictReleaseIterator(di);
7268 } else {
7269 redisAssert(0 != 0);
7270 }
7271 /* Save the expire time */
7272 if (expiretime != -1) {
7273 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7274 /* If this key is already expired skip it */
7275 if (expiretime < now) continue;
7276 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7277 if (fwriteBulk(fp,key) == 0) goto werr;
7278 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7279 }
7280 if (swapped) decrRefCount(o);
7281 }
7282 dictReleaseIterator(di);
7283 }
7284
7285 /* Make sure data will not remain on the OS's output buffers */
7286 fflush(fp);
7287 fsync(fileno(fp));
7288 fclose(fp);
7289
7290 /* Use RENAME to make sure the DB file is changed atomically only
7291 * if the generate DB file is ok. */
7292 if (rename(tmpfile,filename) == -1) {
7293 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7294 unlink(tmpfile);
7295 return REDIS_ERR;
7296 }
7297 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7298 return REDIS_OK;
7299
7300 werr:
7301 fclose(fp);
7302 unlink(tmpfile);
7303 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7304 if (di) dictReleaseIterator(di);
7305 return REDIS_ERR;
7306 }
7307
7308 /* This is how rewriting of the append only file in background works:
7309 *
7310 * 1) The user calls BGREWRITEAOF
7311 * 2) Redis calls this function, that forks():
7312 * 2a) the child rewrite the append only file in a temp file.
7313 * 2b) the parent accumulates differences in server.bgrewritebuf.
7314 * 3) When the child finished '2a' exists.
7315 * 4) The parent will trap the exit code, if it's OK, will append the
7316 * data accumulated into server.bgrewritebuf into the temp file, and
7317 * finally will rename(2) the temp file in the actual file name.
7318 * The the new file is reopened as the new append only file. Profit!
7319 */
7320 static int rewriteAppendOnlyFileBackground(void) {
7321 pid_t childpid;
7322
7323 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7324 if (server.vm_enabled) waitEmptyIOJobsQueue();
7325 if ((childpid = fork()) == 0) {
7326 /* Child */
7327 char tmpfile[256];
7328
7329 if (server.vm_enabled) vmReopenSwapFile();
7330 close(server.fd);
7331 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7332 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7333 _exit(0);
7334 } else {
7335 _exit(1);
7336 }
7337 } else {
7338 /* Parent */
7339 if (childpid == -1) {
7340 redisLog(REDIS_WARNING,
7341 "Can't rewrite append only file in background: fork: %s",
7342 strerror(errno));
7343 return REDIS_ERR;
7344 }
7345 redisLog(REDIS_NOTICE,
7346 "Background append only file rewriting started by pid %d",childpid);
7347 server.bgrewritechildpid = childpid;
7348 /* We set appendseldb to -1 in order to force the next call to the
7349 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7350 * accumulated by the parent into server.bgrewritebuf will start
7351 * with a SELECT statement and it will be safe to merge. */
7352 server.appendseldb = -1;
7353 return REDIS_OK;
7354 }
7355 return REDIS_OK; /* unreached */
7356 }
7357
7358 static void bgrewriteaofCommand(redisClient *c) {
7359 if (server.bgrewritechildpid != -1) {
7360 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7361 return;
7362 }
7363 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7364 char *status = "+Background append only file rewriting started\r\n";
7365 addReplySds(c,sdsnew(status));
7366 } else {
7367 addReply(c,shared.err);
7368 }
7369 }
7370
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" used by the
 * background append only file rewrite performed by the child with pid
 * 'childpid'. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
7377
7378 /* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
7380 * - Threaded Virtual Memory I/O
7381 * The two parts are not fully decoupled, but functions are split among two
7382 * different sections of the source code (delimited by comments) in order to
7383 * make more clear what functionality is about the blocking VM and what about
7384 * the threaded (not blocking) VM.
7385 *
7386 * Redis VM design:
7387 *
7388 * Redis VM is a blocking VM (one that blocks reading swapped values from
7389 * disk into memory when a value swapped out is needed in memory) that is made
7390 * unblocking by trying to examine the command argument vector in order to
7391 * load in background values that will likely be needed in order to exec
7392 * the command. The command is executed only once all the relevant keys
7393 * are loaded into memory.
7394 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
7397 */
7398
7399 /* =================== Virtual Memory - Blocking Side ====================== */
7400
7401 /* substitute the first occurrence of '%p' with the process pid in the
7402 * swap file name. */
7403 static void expandVmSwapFilename(void) {
7404 char *p = strstr(server.vm_swap_file,"%p");
7405 sds new;
7406
7407 if (!p) return;
7408 new = sdsempty();
7409 *p = '\0';
7410 new = sdscat(new,server.vm_swap_file);
7411 new = sdscatprintf(new,"%ld",(long) getpid());
7412 new = sdscat(new,p+2);
7413 zfree(server.vm_swap_file);
7414 server.vm_swap_file = new;
7415 }
7416
7417 static void vmInit(void) {
7418 off_t totsize;
7419 int pipefds[2];
7420 size_t stacksize;
7421
7422 if (server.vm_max_threads != 0)
7423 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7424
7425 expandVmSwapFilename();
7426 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
7427 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7428 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7429 }
7430 if (server.vm_fp == NULL) {
7431 redisLog(REDIS_WARNING,
7432 "Impossible to open the swap file: %s. Exiting.",
7433 strerror(errno));
7434 exit(1);
7435 }
7436 server.vm_fd = fileno(server.vm_fp);
7437 server.vm_next_page = 0;
7438 server.vm_near_pages = 0;
7439 server.vm_stats_used_pages = 0;
7440 server.vm_stats_swapped_objects = 0;
7441 server.vm_stats_swapouts = 0;
7442 server.vm_stats_swapins = 0;
7443 totsize = server.vm_pages*server.vm_page_size;
7444 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7445 if (ftruncate(server.vm_fd,totsize) == -1) {
7446 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7447 strerror(errno));
7448 exit(1);
7449 } else {
7450 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7451 }
7452 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
7453 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
7454 (long long) (server.vm_pages+7)/8, server.vm_pages);
7455 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
7456
7457 /* Initialize threaded I/O (used by Virtual Memory) */
7458 server.io_newjobs = listCreate();
7459 server.io_processing = listCreate();
7460 server.io_processed = listCreate();
7461 server.io_ready_clients = listCreate();
7462 pthread_mutex_init(&server.io_mutex,NULL);
7463 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7464 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
7465 server.io_active_threads = 0;
7466 if (pipe(pipefds) == -1) {
7467 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7468 ,strerror(errno));
7469 exit(1);
7470 }
7471 server.io_ready_pipe_read = pipefds[0];
7472 server.io_ready_pipe_write = pipefds[1];
7473 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
7474 /* LZF requires a lot of stack */
7475 pthread_attr_init(&server.io_threads_attr);
7476 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7477 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7478 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
7479 /* Listen for events in the threaded I/O pipe */
7480 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7481 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7482 oom("creating file event");
7483 }
7484
7485 /* Mark the page as used */
7486 static void vmMarkPageUsed(off_t page) {
7487 off_t byte = page/8;
7488 int bit = page&7;
7489 redisAssert(vmFreePage(page) == 1);
7490 server.vm_bitmap[byte] |= 1<<bit;
7491 }
7492
7493 /* Mark N contiguous pages as used, with 'page' being the first. */
7494 static void vmMarkPagesUsed(off_t page, off_t count) {
7495 off_t j;
7496
7497 for (j = 0; j < count; j++)
7498 vmMarkPageUsed(page+j);
7499 server.vm_stats_used_pages += count;
7500 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7501 (long long)count, (long long)page);
7502 }
7503
7504 /* Mark the page as free */
7505 static void vmMarkPageFree(off_t page) {
7506 off_t byte = page/8;
7507 int bit = page&7;
7508 redisAssert(vmFreePage(page) == 0);
7509 server.vm_bitmap[byte] &= ~(1<<bit);
7510 }
7511
7512 /* Mark N contiguous pages as free, with 'page' being the first. */
7513 static void vmMarkPagesFree(off_t page, off_t count) {
7514 off_t j;
7515
7516 for (j = 0; j < count; j++)
7517 vmMarkPageFree(page+j);
7518 server.vm_stats_used_pages -= count;
7519 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7520 (long long)count, (long long)page);
7521 }
7522
7523 /* Test if the page is free */
7524 static int vmFreePage(off_t page) {
7525 off_t byte = page/8;
7526 int bit = page&7;
7527 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
7528 }
7529
7530 /* Find N contiguous free pages storing the first page of the cluster in *first.
7531 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7532 * REDIS_ERR is returned.
7533 *
7534 * This function uses a simple algorithm: we try to allocate
7535 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7536 * again from the start of the swap file searching for free spaces.
7537 *
7538 * If it looks pretty clear that there are no free pages near our offset
7539 * we try to find less populated places doing a forward jump of
7540 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7541 * without hurry, and then we jump again and so forth...
7542 *
7543 * This function can be improved using a free list to avoid to guess
7544 * too much, since we could collect data about freed pages.
7545 *
7546 * note: I implemented this function just after watching an episode of
7547 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7548 */
7549 static int vmFindContiguousPages(off_t *first, off_t n) {
7550 off_t base, offset = 0, since_jump = 0, numfree = 0;
7551
7552 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7553 server.vm_near_pages = 0;
7554 server.vm_next_page = 0;
7555 }
7556 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7557 base = server.vm_next_page;
7558
7559 while(offset < server.vm_pages) {
7560 off_t this = base+offset;
7561
7562 /* If we overflow, restart from page zero */
7563 if (this >= server.vm_pages) {
7564 this -= server.vm_pages;
7565 if (this == 0) {
7566 /* Just overflowed, what we found on tail is no longer
7567 * interesting, as it's no longer contiguous. */
7568 numfree = 0;
7569 }
7570 }
7571 if (vmFreePage(this)) {
7572 /* This is a free page */
7573 numfree++;
7574 /* Already got N free pages? Return to the caller, with success */
7575 if (numfree == n) {
7576 *first = this-(n-1);
7577 server.vm_next_page = this+1;
7578 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
7579 return REDIS_OK;
7580 }
7581 } else {
7582 /* The current one is not a free page */
7583 numfree = 0;
7584 }
7585
7586 /* Fast-forward if the current page is not free and we already
7587 * searched enough near this place. */
7588 since_jump++;
7589 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7590 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7591 since_jump = 0;
7592 /* Note that even if we rewind after the jump, we are don't need
7593 * to make sure numfree is set to zero as we only jump *if* it
7594 * is set to zero. */
7595 } else {
7596 /* Otherwise just check the next page */
7597 offset++;
7598 }
7599 }
7600 return REDIS_ERR;
7601 }
7602
7603 /* Write the specified object at the specified page of the swap file */
7604 static int vmWriteObjectOnSwap(robj *o, off_t page) {
7605 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7606 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7607 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7608 redisLog(REDIS_WARNING,
7609 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7610 strerror(errno));
7611 return REDIS_ERR;
7612 }
7613 rdbSaveObject(server.vm_fp,o);
7614 fflush(server.vm_fp);
7615 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7616 return REDIS_OK;
7617 }
7618
7619 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7620 * needed to later retrieve the object into the key object.
7621 * If we can't find enough contiguous empty pages to swap the object on disk
7622 * REDIS_ERR is returned. */
7623 static int vmSwapObjectBlocking(robj *key, robj *val) {
7624 off_t pages = rdbSavedObjectPages(val,NULL);
7625 off_t page;
7626
7627 assert(key->storage == REDIS_VM_MEMORY);
7628 assert(key->refcount == 1);
7629 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
7630 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
7631 key->vm.page = page;
7632 key->vm.usedpages = pages;
7633 key->storage = REDIS_VM_SWAPPED;
7634 key->vtype = val->type;
7635 decrRefCount(val); /* Deallocate the object from memory. */
7636 vmMarkPagesUsed(page,pages);
7637 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7638 (unsigned char*) key->ptr,
7639 (unsigned long long) page, (unsigned long long) pages);
7640 server.vm_stats_swapped_objects++;
7641 server.vm_stats_swapouts++;
7642 return REDIS_OK;
7643 }
7644
7645 static robj *vmReadObjectFromSwap(off_t page, int type) {
7646 robj *o;
7647
7648 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7649 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7650 redisLog(REDIS_WARNING,
7651 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7652 strerror(errno));
7653 _exit(1);
7654 }
7655 o = rdbLoadObject(type,server.vm_fp);
7656 if (o == NULL) {
7657 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
7658 _exit(1);
7659 }
7660 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7661 return o;
7662 }
7663
7664 /* Load the value object relative to the 'key' object from swap to memory.
7665 * The newly allocated object is returned.
7666 *
7667 * If preview is true the unserialized object is returned to the caller but
7668 * no changes are made to the key object, nor the pages are marked as freed */
7669 static robj *vmGenericLoadObject(robj *key, int preview) {
7670 robj *val;
7671
7672 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
7673 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7674 if (!preview) {
7675 key->storage = REDIS_VM_MEMORY;
7676 key->vm.atime = server.unixtime;
7677 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7678 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7679 (unsigned char*) key->ptr);
7680 server.vm_stats_swapped_objects--;
7681 } else {
7682 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7683 (unsigned char*) key->ptr);
7684 }
7685 server.vm_stats_swapins++;
7686 return val;
7687 }
7688
7689 /* Plain object loading, from swap to memory */
7690 static robj *vmLoadObject(robj *key) {
7691 /* If we are loading the object in background, stop it, we
7692 * need to load this object synchronously ASAP. */
7693 if (key->storage == REDIS_VM_LOADING)
7694 vmCancelThreadedIOJob(key);
7695 return vmGenericLoadObject(key,0);
7696 }
7697
7698 /* Just load the value on disk, without to modify the key.
7699 * This is useful when we want to perform some operation on the value
7700 * without to really bring it from swap to memory, like while saving the
7701 * dataset or rewriting the append only log. */
7702 static robj *vmPreviewObject(robj *key) {
7703 return vmGenericLoadObject(key,1);
7704 }
7705
7706 /* How a good candidate is this object for swapping?
7707 * The better candidate it is, the greater the returned value.
7708 *
7709 * Currently we try to perform a fast estimation of the object size in
7710 * memory, and combine it with aging informations.
7711 *
7712 * Basically swappability = idle-time * log(estimated size)
7713 *
7714 * Bigger objects are preferred over smaller objects, but not
7715 * proportionally, this is why we use the logarithm. This algorithm is
7716 * just a first try and will probably be tuned later. */
7717 static double computeObjectSwappability(robj *o) {
7718 time_t age = server.unixtime - o->vm.atime;
7719 long asize = 0;
7720 list *l;
7721 dict *d;
7722 struct dictEntry *de;
7723 int z;
7724
7725 if (age <= 0) return 0;
7726 switch(o->type) {
7727 case REDIS_STRING:
7728 if (o->encoding != REDIS_ENCODING_RAW) {
7729 asize = sizeof(*o);
7730 } else {
7731 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7732 }
7733 break;
7734 case REDIS_LIST:
7735 l = o->ptr;
7736 listNode *ln = listFirst(l);
7737
7738 asize = sizeof(list);
7739 if (ln) {
7740 robj *ele = ln->value;
7741 long elesize;
7742
7743 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7744 (sizeof(*o)+sdslen(ele->ptr)) :
7745 sizeof(*o);
7746 asize += (sizeof(listNode)+elesize)*listLength(l);
7747 }
7748 break;
7749 case REDIS_SET:
7750 case REDIS_ZSET:
7751 z = (o->type == REDIS_ZSET);
7752 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7753
7754 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7755 if (z) asize += sizeof(zset)-sizeof(dict);
7756 if (dictSize(d)) {
7757 long elesize;
7758 robj *ele;
7759
7760 de = dictGetRandomKey(d);
7761 ele = dictGetEntryKey(de);
7762 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7763 (sizeof(*o)+sdslen(ele->ptr)) :
7764 sizeof(*o);
7765 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7766 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7767 }
7768 break;
7769 }
7770 return (double)age*log(1+asize);
7771 }
7772
7773 /* Try to swap an object that's a good candidate for swapping.
7774 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7775 * to swap any object at all.
7776 *
7777 * If 'usethreaded' is true, Redis will try to swap the object in background
7778 * using I/O threads. */
7779 static int vmSwapOneObject(int usethreads) {
7780 int j, i;
7781 struct dictEntry *best = NULL;
7782 double best_swappability = 0;
7783 redisDb *best_db = NULL;
7784 robj *key, *val;
7785
7786 for (j = 0; j < server.dbnum; j++) {
7787 redisDb *db = server.db+j;
7788 /* Why maxtries is set to 100?
7789 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7790 * are swappable objects */
7791 int maxtries = 100;
7792
7793 if (dictSize(db->dict) == 0) continue;
7794 for (i = 0; i < 5; i++) {
7795 dictEntry *de;
7796 double swappability;
7797
7798 if (maxtries) maxtries--;
7799 de = dictGetRandomKey(db->dict);
7800 key = dictGetEntryKey(de);
7801 val = dictGetEntryVal(de);
7802 /* Only swap objects that are currently in memory.
7803 *
7804 * Also don't swap shared objects if threaded VM is on, as we
7805 * try to ensure that the main thread does not touch the
7806 * object while the I/O thread is using it, but we can't
7807 * control other keys without adding additional mutex. */
7808 if (key->storage != REDIS_VM_MEMORY ||
7809 (server.vm_max_threads != 0 && val->refcount != 1)) {
7810 if (maxtries) i--; /* don't count this try */
7811 continue;
7812 }
7813 swappability = computeObjectSwappability(val);
7814 if (!best || swappability > best_swappability) {
7815 best = de;
7816 best_swappability = swappability;
7817 best_db = db;
7818 }
7819 }
7820 }
7821 if (best == NULL) return REDIS_ERR;
7822 key = dictGetEntryKey(best);
7823 val = dictGetEntryVal(best);
7824
7825 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
7826 key->ptr, best_swappability);
7827
7828 /* Unshare the key if needed */
7829 if (key->refcount > 1) {
7830 robj *newkey = dupStringObject(key);
7831 decrRefCount(key);
7832 key = dictGetEntryKey(best) = newkey;
7833 }
7834 /* Swap it */
7835 if (usethreads) {
7836 vmSwapObjectThreaded(key,val,best_db);
7837 return REDIS_OK;
7838 } else {
7839 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7840 dictGetEntryVal(best) = NULL;
7841 return REDIS_OK;
7842 } else {
7843 return REDIS_ERR;
7844 }
7845 }
7846 }
7847
/* Swap one object out using the blocking (non threaded) code path.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
7851
/* Swap one object out using the threaded I/O code path.
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
7855
7856 /* Return true if it's safe to swap out objects in a given moment.
7857 * Basically we don't want to swap objects out while there is a BGSAVE
7858 * or a BGAEOREWRITE running in backgroud. */
7859 static int vmCanSwapOut(void) {
7860 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7861 }
7862
7863 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7864 * and was deleted. Otherwise 0 is returned. */
7865 static int deleteIfSwapped(redisDb *db, robj *key) {
7866 dictEntry *de;
7867 robj *foundkey;
7868
7869 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7870 foundkey = dictGetEntryKey(de);
7871 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7872 deleteKey(db,key);
7873 return 1;
7874 }
7875
7876 /* =================== Virtual Memory - Threaded I/O ======================= */
7877
7878 static void freeIOJob(iojob *j) {
7879 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
7880 j->type == REDIS_IOJOB_DO_SWAP ||
7881 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
7882 decrRefCount(j->val);
7883 decrRefCount(j->key);
7884 zfree(j);
7885 }
7886
7887 /* Every time a thread finished a Job, it writes a byte into the write side
7888 * of an unix pipe in order to "awake" the main thread, and this function
7889 * is called. */
7890 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7891 int mask)
7892 {
7893 char buf[1];
7894 int retval, processed = 0, toprocess = -1, trytoswap = 1;
7895 REDIS_NOTUSED(el);
7896 REDIS_NOTUSED(mask);
7897 REDIS_NOTUSED(privdata);
7898
7899 /* For every byte we read in the read side of the pipe, there is one
7900 * I/O job completed to process. */
7901 while((retval = read(fd,buf,1)) == 1) {
7902 iojob *j;
7903 listNode *ln;
7904 robj *key;
7905 struct dictEntry *de;
7906
7907 redisLog(REDIS_DEBUG,"Processing I/O completed job");
7908
7909 /* Get the processed element (the oldest one) */
7910 lockThreadedIO();
7911 assert(listLength(server.io_processed) != 0);
7912 if (toprocess == -1) {
7913 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
7914 if (toprocess <= 0) toprocess = 1;
7915 }
7916 ln = listFirst(server.io_processed);
7917 j = ln->value;
7918 listDelNode(server.io_processed,ln);
7919 unlockThreadedIO();
7920 /* If this job is marked as canceled, just ignore it */
7921 if (j->canceled) {
7922 freeIOJob(j);
7923 continue;
7924 }
7925 /* Post process it in the main thread, as there are things we
7926 * can do just here to avoid race conditions and/or invasive locks */
7927 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
7928 de = dictFind(j->db->dict,j->key);
7929 assert(de != NULL);
7930 key = dictGetEntryKey(de);
7931 if (j->type == REDIS_IOJOB_LOAD) {
7932 redisDb *db;
7933
7934 /* Key loaded, bring it at home */
7935 key->storage = REDIS_VM_MEMORY;
7936 key->vm.atime = server.unixtime;
7937 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7938 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
7939 (unsigned char*) key->ptr);
7940 server.vm_stats_swapped_objects--;
7941 server.vm_stats_swapins++;
7942 dictGetEntryVal(de) = j->val;
7943 incrRefCount(j->val);
7944 db = j->db;
7945 freeIOJob(j);
7946 /* Handle clients waiting for this key to be loaded. */
7947 handleClientsBlockedOnSwappedKey(db,key);
7948 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7949 /* Now we know the amount of pages required to swap this object.
7950 * Let's find some space for it, and queue this task again
7951 * rebranded as REDIS_IOJOB_DO_SWAP. */
7952 if (!vmCanSwapOut() ||
7953 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
7954 {
7955 /* Ooops... no space or we can't swap as there is
7956 * a fork()ed Redis trying to save stuff on disk. */
7957 freeIOJob(j);
7958 key->storage = REDIS_VM_MEMORY; /* undo operation */
7959 } else {
7960 /* Note that we need to mark this pages as used now,
7961 * if the job will be canceled, we'll mark them as freed
7962 * again. */
7963 vmMarkPagesUsed(j->page,j->pages);
7964 j->type = REDIS_IOJOB_DO_SWAP;
7965 lockThreadedIO();
7966 queueIOJob(j);
7967 unlockThreadedIO();
7968 }
7969 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
7970 robj *val;
7971
7972 /* Key swapped. We can finally free some memory. */
7973 if (key->storage != REDIS_VM_SWAPPING) {
7974 printf("key->storage: %d\n",key->storage);
7975 printf("key->name: %s\n",(char*)key->ptr);
7976 printf("key->refcount: %d\n",key->refcount);
7977 printf("val: %p\n",(void*)j->val);
7978 printf("val->type: %d\n",j->val->type);
7979 printf("val->ptr: %s\n",(char*)j->val->ptr);
7980 }
7981 redisAssert(key->storage == REDIS_VM_SWAPPING);
7982 val = dictGetEntryVal(de);
7983 key->vm.page = j->page;
7984 key->vm.usedpages = j->pages;
7985 key->storage = REDIS_VM_SWAPPED;
7986 key->vtype = j->val->type;
7987 decrRefCount(val); /* Deallocate the object from memory. */
7988 dictGetEntryVal(de) = NULL;
7989 redisLog(REDIS_DEBUG,
7990 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7991 (unsigned char*) key->ptr,
7992 (unsigned long long) j->page, (unsigned long long) j->pages);
7993 server.vm_stats_swapped_objects++;
7994 server.vm_stats_swapouts++;
7995 freeIOJob(j);
7996 /* Put a few more swap requests in queue if we are still
7997 * out of memory */
7998 if (trytoswap && vmCanSwapOut() &&
7999 zmalloc_used_memory() > server.vm_max_memory)
8000 {
8001 int more = 1;
8002 while(more) {
8003 lockThreadedIO();
8004 more = listLength(server.io_newjobs) <
8005 (unsigned) server.vm_max_threads;
8006 unlockThreadedIO();
8007 /* Don't waste CPU time if swappable objects are rare. */
8008 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8009 trytoswap = 0;
8010 break;
8011 }
8012 }
8013 }
8014 }
8015 processed++;
8016 if (processed == toprocess) return;
8017 }
8018 if (retval < 0 && errno != EAGAIN) {
8019 redisLog(REDIS_WARNING,
8020 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8021 strerror(errno));
8022 }
8023 }
8024
8025 static void lockThreadedIO(void) {
8026 pthread_mutex_lock(&server.io_mutex);
8027 }
8028
8029 static void unlockThreadedIO(void) {
8030 pthread_mutex_unlock(&server.io_mutex);
8031 }
8032
8033 /* Remove the specified object from the threaded I/O queue if still not
8034 * processed, otherwise make sure to flag it as canceled. */
8035 static void vmCancelThreadedIOJob(robj *o) {
8036 list *lists[3] = {
8037 server.io_newjobs, /* 0 */
8038 server.io_processing, /* 1 */
8039 server.io_processed /* 2 */
8040 };
8041 int i;
8042
8043 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8044 again:
8045 lockThreadedIO();
8046 /* Search for a matching key in one of the queues */
8047 for (i = 0; i < 3; i++) {
8048 listNode *ln;
8049 listIter li;
8050
8051 listRewind(lists[i],&li);
8052 while ((ln = listNext(&li)) != NULL) {
8053 iojob *job = ln->value;
8054
8055 if (job->canceled) continue; /* Skip this, already canceled. */
8056 if (compareStringObjects(job->key,o) == 0) {
8057 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8058 (void*)job, (char*)o->ptr, job->type, i);
8059 /* Mark the pages as free since the swap didn't happened
8060 * or happened but is now discarded. */
8061 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8062 vmMarkPagesFree(job->page,job->pages);
8063 /* Cancel the job. It depends on the list the job is
8064 * living in. */
8065 switch(i) {
8066 case 0: /* io_newjobs */
8067 /* If the job was yet not processed the best thing to do
8068 * is to remove it from the queue at all */
8069 freeIOJob(job);
8070 listDelNode(lists[i],ln);
8071 break;
8072 case 1: /* io_processing */
8073 /* Oh Shi- the thread is messing with the Job:
8074 *
8075 * Probably it's accessing the object if this is a
8076 * PREPARE_SWAP or DO_SWAP job.
8077 * If it's a LOAD job it may be reading from disk and
8078 * if we don't wait for the job to terminate before to
8079 * cancel it, maybe in a few microseconds data can be
8080 * corrupted in this pages. So the short story is:
8081 *
8082 * Better to wait for the job to move into the
8083 * next queue (processed)... */
8084
8085 /* We try again and again until the job is completed. */
8086 unlockThreadedIO();
8087 /* But let's wait some time for the I/O thread
8088 * to finish with this job. After all this condition
8089 * should be very rare. */
8090 usleep(1);
8091 goto again;
8092 case 2: /* io_processed */
8093 /* The job was already processed, that's easy...
8094 * just mark it as canceled so that we'll ignore it
8095 * when processing completed jobs. */
8096 job->canceled = 1;
8097 break;
8098 }
8099 /* Finally we have to adjust the storage type of the object
8100 * in order to "UNDO" the operaiton. */
8101 if (o->storage == REDIS_VM_LOADING)
8102 o->storage = REDIS_VM_SWAPPED;
8103 else if (o->storage == REDIS_VM_SWAPPING)
8104 o->storage = REDIS_VM_MEMORY;
8105 unlockThreadedIO();
8106 return;
8107 }
8108 }
8109 }
8110 unlockThreadedIO();
8111 assert(1 != 1); /* We should never reach this */
8112 }
8113
8114 static void *IOThreadEntryPoint(void *arg) {
8115 iojob *j;
8116 listNode *ln;
8117 REDIS_NOTUSED(arg);
8118
8119 pthread_detach(pthread_self());
8120 while(1) {
8121 /* Get a new job to process */
8122 lockThreadedIO();
8123 if (listLength(server.io_newjobs) == 0) {
8124 /* No new jobs in queue, exit. */
8125 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8126 (long) pthread_self());
8127 server.io_active_threads--;
8128 unlockThreadedIO();
8129 return NULL;
8130 }
8131 ln = listFirst(server.io_newjobs);
8132 j = ln->value;
8133 listDelNode(server.io_newjobs,ln);
8134 /* Add the job in the processing queue */
8135 j->thread = pthread_self();
8136 listAddNodeTail(server.io_processing,j);
8137 ln = listLast(server.io_processing); /* We use ln later to remove it */
8138 unlockThreadedIO();
8139 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8140 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8141
8142 /* Process the Job */
8143 if (j->type == REDIS_IOJOB_LOAD) {
8144 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8145 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8146 FILE *fp = fopen("/dev/null","w+");
8147 j->pages = rdbSavedObjectPages(j->val,fp);
8148 fclose(fp);
8149 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8150 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8151 j->canceled = 1;
8152 }
8153
8154 /* Done: insert the job into the processed queue */
8155 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8156 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8157 lockThreadedIO();
8158 listDelNode(server.io_processing,ln);
8159 listAddNodeTail(server.io_processed,j);
8160 unlockThreadedIO();
8161
8162 /* Signal the main thread there is new stuff to process */
8163 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8164 }
8165 return NULL; /* never reached */
8166 }
8167
8168 static void spawnIOThread(void) {
8169 pthread_t thread;
8170 sigset_t mask, omask;
8171
8172 sigemptyset(&mask);
8173 sigaddset(&mask,SIGCHLD);
8174 sigaddset(&mask,SIGHUP);
8175 sigaddset(&mask,SIGPIPE);
8176 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8177 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8178 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8179 server.io_active_threads++;
8180 }
8181
8182 /* We need to wait for the last thread to exit before we are able to
8183 * fork() in order to BGSAVE or BGREWRITEAOF. */
8184 static void waitEmptyIOJobsQueue(void) {
8185 while(1) {
8186 int io_processed_len;
8187
8188 lockThreadedIO();
8189 if (listLength(server.io_newjobs) == 0 &&
8190 listLength(server.io_processing) == 0 &&
8191 server.io_active_threads == 0)
8192 {
8193 unlockThreadedIO();
8194 return;
8195 }
8196 /* While waiting for empty jobs queue condition we post-process some
8197 * finshed job, as I/O threads may be hanging trying to write against
8198 * the io_ready_pipe_write FD but there are so much pending jobs that
8199 * it's blocking. */
8200 io_processed_len = listLength(server.io_processed);
8201 unlockThreadedIO();
8202 if (io_processed_len) {
8203 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8204 usleep(1000); /* 1 millisecond */
8205 } else {
8206 usleep(10000); /* 10 milliseconds */
8207 }
8208 }
8209 }
8210
8211 static void vmReopenSwapFile(void) {
8212 /* Note: we don't close the old one as we are in the child process
8213 * and don't want to mess at all with the original file object. */
8214 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8215 if (server.vm_fp == NULL) {
8216 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8217 server.vm_swap_file);
8218 _exit(1);
8219 }
8220 server.vm_fd = fileno(server.vm_fp);
8221 }
8222
8223 /* This function must be called while with threaded IO locked */
8224 static void queueIOJob(iojob *j) {
8225 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8226 (void*)j, j->type, (char*)j->key->ptr);
8227 listAddNodeTail(server.io_newjobs,j);
8228 if (server.io_active_threads < server.vm_max_threads)
8229 spawnIOThread();
8230 }
8231
8232 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8233 iojob *j;
8234
8235 assert(key->storage == REDIS_VM_MEMORY);
8236 assert(key->refcount == 1);
8237
8238 j = zmalloc(sizeof(*j));
8239 j->type = REDIS_IOJOB_PREPARE_SWAP;
8240 j->db = db;
8241 j->key = dupStringObject(key);
8242 j->val = val;
8243 incrRefCount(val);
8244 j->canceled = 0;
8245 j->thread = (pthread_t) -1;
8246 key->storage = REDIS_VM_SWAPPING;
8247
8248 lockThreadedIO();
8249 queueIOJob(j);
8250 unlockThreadedIO();
8251 return REDIS_OK;
8252 }
8253
8254 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8255
8256 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8257 * If there is not already a job loading the key, it is craeted.
8258 * The key is added to the io_keys list in the client structure, and also
8259 * in the hash table mapping swapped keys to waiting clients, that is,
8260 * server.io_waited_keys. */
8261 static int waitForSwappedKey(redisClient *c, robj *key) {
8262 struct dictEntry *de;
8263 robj *o;
8264 list *l;
8265
8266 /* If the key does not exist or is already in RAM we don't need to
8267 * block the client at all. */
8268 de = dictFind(c->db->dict,key);
8269 if (de == NULL) return 0;
8270 o = dictGetEntryKey(de);
8271 if (o->storage == REDIS_VM_MEMORY) {
8272 return 0;
8273 } else if (o->storage == REDIS_VM_SWAPPING) {
8274 /* We were swapping the key, undo it! */
8275 vmCancelThreadedIOJob(o);
8276 return 0;
8277 }
8278
8279 /* OK: the key is either swapped, or being loaded just now. */
8280
8281 /* Add the key to the list of keys this client is waiting for.
8282 * This maps clients to keys they are waiting for. */
8283 listAddNodeTail(c->io_keys,key);
8284 incrRefCount(key);
8285
8286 /* Add the client to the swapped keys => clients waiting map. */
8287 de = dictFind(c->db->io_keys,key);
8288 if (de == NULL) {
8289 int retval;
8290
8291 /* For every key we take a list of clients blocked for it */
8292 l = listCreate();
8293 retval = dictAdd(c->db->io_keys,key,l);
8294 incrRefCount(key);
8295 assert(retval == DICT_OK);
8296 } else {
8297 l = dictGetEntryVal(de);
8298 }
8299 listAddNodeTail(l,c);
8300
8301 /* Are we already loading the key from disk? If not create a job */
8302 if (o->storage == REDIS_VM_SWAPPED) {
8303 iojob *j;
8304
8305 o->storage = REDIS_VM_LOADING;
8306 j = zmalloc(sizeof(*j));
8307 j->type = REDIS_IOJOB_LOAD;
8308 j->db = c->db;
8309 j->key = dupStringObject(key);
8310 j->key->vtype = o->vtype;
8311 j->page = o->vm.page;
8312 j->val = NULL;
8313 j->canceled = 0;
8314 j->thread = (pthread_t) -1;
8315 lockThreadedIO();
8316 queueIOJob(j);
8317 unlockThreadedIO();
8318 }
8319 return 1;
8320 }
8321
8322 /* Is this client attempting to run a command against swapped keys?
8323 * If so, block it ASAP, load the keys in background, then resume it.
8324 *
8325 * The important idea about this function is that it can fail! If keys will
8326 * still be swapped when the client is resumed, this key lookups will
8327 * just block loading keys from disk. In practical terms this should only
8328 * happen with SORT BY command or if there is a bug in this function.
8329 *
8330 * Return 1 if the client is marked as blocked, 0 if the client can
8331 * continue as the keys it is going to access appear to be in memory. */
8332 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8333 int j, last;
8334
8335 if (cmd->vm_firstkey == 0) return 0;
8336 last = cmd->vm_lastkey;
8337 if (last < 0) last = c->argc+last;
8338 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8339 waitForSwappedKey(c,c->argv[j]);
8340 /* If the client was blocked for at least one key, mark it as blocked. */
8341 if (listLength(c->io_keys)) {
8342 c->flags |= REDIS_IO_WAIT;
8343 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8344 server.vm_blocked_clients++;
8345 return 1;
8346 } else {
8347 return 0;
8348 }
8349 }
8350
8351 /* Remove the 'key' from the list of blocked keys for a given client.
8352 *
8353 * The function returns 1 when there are no longer blocking keys after
8354 * the current one was removed (and the client can be unblocked). */
8355 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8356 list *l;
8357 listNode *ln;
8358 listIter li;
8359 struct dictEntry *de;
8360
8361 /* Remove the key from the list of keys this client is waiting for. */
8362 listRewind(c->io_keys,&li);
8363 while ((ln = listNext(&li)) != NULL) {
8364 if (compareStringObjects(ln->value,key) == 0) {
8365 listDelNode(c->io_keys,ln);
8366 break;
8367 }
8368 }
8369 assert(ln != NULL);
8370
8371 /* Remove the client form the key => waiting clients map. */
8372 de = dictFind(c->db->io_keys,key);
8373 assert(de != NULL);
8374 l = dictGetEntryVal(de);
8375 ln = listSearchKey(l,c);
8376 assert(ln != NULL);
8377 listDelNode(l,ln);
8378 if (listLength(l) == 0)
8379 dictDelete(c->db->io_keys,key);
8380
8381 return listLength(c->io_keys) == 0;
8382 }
8383
8384 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8385 struct dictEntry *de;
8386 list *l;
8387 listNode *ln;
8388 int len;
8389
8390 de = dictFind(db->io_keys,key);
8391 if (!de) return;
8392
8393 l = dictGetEntryVal(de);
8394 len = listLength(l);
8395 /* Note: we can't use something like while(listLength(l)) as the list
8396 * can be freed by the calling function when we remove the last element. */
8397 while (len--) {
8398 ln = listFirst(l);
8399 redisClient *c = ln->value;
8400
8401 if (dontWaitForSwappedKey(c,key)) {
8402 /* Put the client in the list of clients ready to go as we
8403 * loaded all the keys about it. */
8404 listAddNodeTail(server.io_ready_clients,c);
8405 }
8406 }
8407 }
8408
8409 /* ================================= Debugging ============================== */
8410
8411 static void debugCommand(redisClient *c) {
8412 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8413 *((char*)-1) = 'x';
8414 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8415 if (rdbSave(server.dbfilename) != REDIS_OK) {
8416 addReply(c,shared.err);
8417 return;
8418 }
8419 emptyDb();
8420 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8421 addReply(c,shared.err);
8422 return;
8423 }
8424 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8425 addReply(c,shared.ok);
8426 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8427 emptyDb();
8428 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8429 addReply(c,shared.err);
8430 return;
8431 }
8432 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8433 addReply(c,shared.ok);
8434 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8435 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8436 robj *key, *val;
8437
8438 if (!de) {
8439 addReply(c,shared.nokeyerr);
8440 return;
8441 }
8442 key = dictGetEntryKey(de);
8443 val = dictGetEntryVal(de);
8444 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8445 key->storage == REDIS_VM_SWAPPING)) {
8446 addReplySds(c,sdscatprintf(sdsempty(),
8447 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8448 "encoding:%d serializedlength:%lld\r\n",
8449 (void*)key, key->refcount, (void*)val, val->refcount,
8450 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
8451 } else {
8452 addReplySds(c,sdscatprintf(sdsempty(),
8453 "+Key at:%p refcount:%d, value swapped at: page %llu "
8454 "using %llu pages\r\n",
8455 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8456 (unsigned long long) key->vm.usedpages));
8457 }
8458 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8459 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8460 robj *key, *val;
8461
8462 if (!server.vm_enabled) {
8463 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8464 return;
8465 }
8466 if (!de) {
8467 addReply(c,shared.nokeyerr);
8468 return;
8469 }
8470 key = dictGetEntryKey(de);
8471 val = dictGetEntryVal(de);
8472 /* If the key is shared we want to create a copy */
8473 if (key->refcount > 1) {
8474 robj *newkey = dupStringObject(key);
8475 decrRefCount(key);
8476 key = dictGetEntryKey(de) = newkey;
8477 }
8478 /* Swap it */
8479 if (key->storage != REDIS_VM_MEMORY) {
8480 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
8481 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8482 dictGetEntryVal(de) = NULL;
8483 addReply(c,shared.ok);
8484 } else {
8485 addReply(c,shared.err);
8486 }
8487 } else {
8488 addReplySds(c,sdsnew(
8489 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8490 }
8491 }
8492
8493 static void _redisAssert(char *estr, char *file, int line) {
8494 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
8495 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
8496 #ifdef HAVE_BACKTRACE
8497 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8498 *((char*)-1) = 'x';
8499 #endif
8500 }
8501
8502 /* =================================== Main! ================================ */
8503
8504 #ifdef __linux__
/* Return the integer value found in the first line of
 * /proc/sys/vm/overcommit_memory (as converted by atoi()), or -1 if the
 * file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    int value = -1;
    FILE *fp;

    fp = fopen("/proc/sys/vm/overcommit_memory","r");
    if (fp == NULL) return -1;
    if (fgets(buf,sizeof(buf),fp) != NULL) value = atoi(buf);
    fclose(fp);
    return value;
}
8518
8519 void linuxOvercommitMemoryWarning(void) {
8520 if (linuxOvercommitMemoryValue() == 0) {
8521 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8522 }
8523 }
8524 #endif /* __linux__ */
8525
8526 static void daemonize(void) {
8527 int fd;
8528 FILE *fp;
8529
8530 if (fork() != 0) exit(0); /* parent exits */
8531 setsid(); /* create a new session */
8532
8533 /* Every output goes to /dev/null. If Redis is daemonized but
8534 * the 'logfile' is set to 'stdout' in the configuration file
8535 * it will not log at all. */
8536 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8537 dup2(fd, STDIN_FILENO);
8538 dup2(fd, STDOUT_FILENO);
8539 dup2(fd, STDERR_FILENO);
8540 if (fd > STDERR_FILENO) close(fd);
8541 }
8542 /* Try to write the pid file */
8543 fp = fopen(server.pidfile,"w");
8544 if (fp) {
8545 fprintf(fp,"%d\n",getpid());
8546 fclose(fp);
8547 }
8548 }
8549
8550 int main(int argc, char **argv) {
8551 time_t start;
8552
8553 initServerConfig();
8554 if (argc == 2) {
8555 resetServerSaveParams();
8556 loadServerConfig(argv[1]);
8557 } else if (argc > 2) {
8558 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8559 exit(1);
8560 } else {
8561 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8562 }
8563 if (server.daemonize) daemonize();
8564 initServer();
8565 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8566 #ifdef __linux__
8567 linuxOvercommitMemoryWarning();
8568 #endif
8569 start = time(NULL);
8570 if (server.appendonly) {
8571 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
8572 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
8573 } else {
8574 if (rdbLoad(server.dbfilename) == REDIS_OK)
8575 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
8576 }
8577 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
8578 aeSetBeforeSleepProc(server.el,beforeSleep);
8579 aeMain(server.el);
8580 aeDeleteEventLoop(server.el);
8581 return 0;
8582 }
8583
8584 /* ============================= Backtrace support ========================= */
8585
8586 #ifdef HAVE_BACKTRACE
8587 static char *findFuncName(void *pointer, unsigned long *offset);
8588
8589 static void *getMcontextEip(ucontext_t *uc) {
8590 #if defined(__FreeBSD__)
8591 return (void*) uc->uc_mcontext.mc_eip;
8592 #elif defined(__dietlibc__)
8593 return (void*) uc->uc_mcontext.eip;
8594 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8595 #if __x86_64__
8596 return (void*) uc->uc_mcontext->__ss.__rip;
8597 #else
8598 return (void*) uc->uc_mcontext->__ss.__eip;
8599 #endif
8600 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8601 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8602 return (void*) uc->uc_mcontext->__ss.__rip;
8603 #else
8604 return (void*) uc->uc_mcontext->__ss.__eip;
8605 #endif
8606 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8607 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
8608 #elif defined(__ia64__) /* Linux IA64 */
8609 return (void*) uc->uc_mcontext.sc_ip;
8610 #else
8611 return NULL;
8612 #endif
8613 }
8614
8615 static void segvHandler(int sig, siginfo_t *info, void *secret) {
8616 void *trace[100];
8617 char **messages = NULL;
8618 int i, trace_size = 0;
8619 unsigned long offset=0;
8620 ucontext_t *uc = (ucontext_t*) secret;
8621 sds infostring;
8622 REDIS_NOTUSED(info);
8623
8624 redisLog(REDIS_WARNING,
8625 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
8626 infostring = genRedisInfoString();
8627 redisLog(REDIS_WARNING, "%s",infostring);
8628 /* It's not safe to sdsfree() the returned string under memory
8629 * corruption conditions. Let it leak as we are going to abort */
8630
8631 trace_size = backtrace(trace, 100);
8632 /* overwrite sigaction with caller's address */
8633 if (getMcontextEip(uc) != NULL) {
8634 trace[1] = getMcontextEip(uc);
8635 }
8636 messages = backtrace_symbols(trace, trace_size);
8637
8638 for (i=1; i<trace_size; ++i) {
8639 char *fn = findFuncName(trace[i], &offset), *p;
8640
8641 p = strchr(messages[i],'+');
8642 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8643 redisLog(REDIS_WARNING,"%s", messages[i]);
8644 } else {
8645 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8646 }
8647 }
8648 /* free(messages); Don't call free() with possibly corrupted memory. */
8649 _exit(0);
8650 }
8651
8652 static void setupSigSegvAction(void) {
8653 struct sigaction act;
8654
8655 sigemptyset (&act.sa_mask);
8656 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8657 * is used. Otherwise, sa_handler is used */
8658 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8659 act.sa_sigaction = segvHandler;
8660 sigaction (SIGSEGV, &act, NULL);
8661 sigaction (SIGBUS, &act, NULL);
8662 sigaction (SIGFPE, &act, NULL);
8663 sigaction (SIGILL, &act, NULL);
8664 sigaction (SIGBUS, &act, NULL);
8665 return;
8666 }
8667
8668 #include "staticsymbols.h"
8669 /* This function try to convert a pointer into a function name. It's used in
8670 * oreder to provide a backtrace under segmentation fault that's able to
8671 * display functions declared as static (otherwise the backtrace is useless). */
8672 static char *findFuncName(void *pointer, unsigned long *offset){
8673 int i, ret = -1;
8674 unsigned long off, minoff = 0;
8675
8676 /* Try to match against the Symbol with the smallest offset */
8677 for (i=0; symsTable[i].pointer; i++) {
8678 unsigned long lp = (unsigned long) pointer;
8679
8680 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8681 off=lp-symsTable[i].pointer;
8682 if (ret < 0 || off < minoff) {
8683 minoff=off;
8684 ret=i;
8685 }
8686 }
8687 }
8688 if (ret == -1) return NULL;
8689 *offset = minoff;
8690 return symsTable[ret].name;
8691 }
8692 #else /* HAVE_BACKTRACE */
/* Variant used when backtrace support is not compiled in: no crash handler
 * is installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
8695 #endif /* HAVE_BACKTRACE */
8696
8697
8698
8699 /* The End */
8700
8701
8702