]> git.saurik.com Git - redis.git/blob - redis.c
f213b3b1fdc2dbdcdf84308b34ffddbde8c94e94
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more then REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM reserves a longer comment: all the commands marked with
110 this flags will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short this commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kind of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of this fields for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
129
130 /* Object types only used for dumping to disk */
131 #define REDIS_EXPIRETIME 253
132 #define REDIS_SELECTDB 254
133 #define REDIS_EOF 255
134
135 /* Defines related to the dump file format. To store 32 bits lengths for short
136 * keys requires a lot of space, so we check the most significant 2 bits of
137 * the first byte to interpreter the length:
138 *
139 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
140 * 01|000000 00000000 => 01, the len is 14 byes, 6 bits + 8 bits of next byte
141 * 10|000000 [32 bit integer] => if it's 01, a full 32 bit len will follow
142 * 11|000000 this means: specially encoded object will follow. The six bits
143 * number specify the kind of object that follows.
144 * See the REDIS_RDB_ENC_* defines.
145 *
146 * Lenghts up to 63 are stored using a single byte, most DB keys, and may
147 * values, will fit inside. */
148 #define REDIS_RDB_6BITLEN 0
149 #define REDIS_RDB_14BITLEN 1
150 #define REDIS_RDB_32BITLEN 2
151 #define REDIS_RDB_ENCVAL 3
152 #define REDIS_RDB_LENERR UINT_MAX
153
154 /* When a length of a string object stored on disk has the first two bits
155 * set, the remaining two bits specify a special encoding for the object
156 * accordingly to the following defines: */
157 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
158 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
159 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
160 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
161
162 /* Virtual memory object->where field. */
163 #define REDIS_VM_MEMORY 0 /* The object is on memory */
164 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
165 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
166 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
167
168 /* Virtual memory static configuration stuff.
169 * Check vmFindContiguousPages() to know more about this magic numbers. */
170 #define REDIS_VM_MAX_NEAR_PAGES 65536
171 #define REDIS_VM_MAX_RANDOM_JUMP 4096
172 #define REDIS_VM_MAX_THREADS 32
173 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
174 /* The following is the *percentage* of completed I/O jobs to process when the
175 * handelr is called. While Virtual Memory I/O operations are performed by
176 * threads, this operations must be processed by the main thread when completed
177 * in order to take effect. */
178 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
179
180 /* Client flags */
181 #define REDIS_SLAVE 1 /* This client is a slave server */
182 #define REDIS_MASTER 2 /* This client is a master server */
183 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
184 #define REDIS_MULTI 8 /* This client is in a MULTI context */
185 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
186 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
187
188 /* Slave replication state - slave side */
189 #define REDIS_REPL_NONE 0 /* No active replication */
190 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
191 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
192
193 /* Slave replication state - from the point of view of master
194 * Note that in SEND_BULK and ONLINE state the slave receives new updates
195 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
196 * to start the next background saving in order to send updates to it. */
197 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
198 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
199 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
200 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
201
202 /* List related stuff */
203 #define REDIS_HEAD 0
204 #define REDIS_TAIL 1
205
206 /* Sort operations */
207 #define REDIS_SORT_GET 0
208 #define REDIS_SORT_ASC 1
209 #define REDIS_SORT_DESC 2
210 #define REDIS_SORTKEY_MAX 1024
211
212 /* Log levels */
213 #define REDIS_DEBUG 0
214 #define REDIS_VERBOSE 1
215 #define REDIS_NOTICE 2
216 #define REDIS_WARNING 3
217
218 /* Anti-warning macro... */
219 #define REDIS_NOTUSED(V) ((void) V)
220
221 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
222 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
223
224 /* Append only defines */
225 #define APPENDFSYNC_NO 0
226 #define APPENDFSYNC_ALWAYS 1
227 #define APPENDFSYNC_EVERYSEC 2
228
229 /* We can print the stacktrace, so our assert is defined this way: */
230 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
231 static void _redisAssert(char *estr, char *file, int line);
232
233 /*================================= Data types ============================== */
234
235 /* A redis object, that is a type able to hold a string / list / set */
236
237 /* The VM object structure */
238 struct redisObjectVM {
239 off_t page; /* the page at witch the object is stored on disk */
240 off_t usedpages; /* number of pages used on disk */
241 time_t atime; /* Last access time */
242 } vm;
243
244 /* The actual Redis Object */
245 typedef struct redisObject {
246 void *ptr;
247 unsigned char type;
248 unsigned char encoding;
249 unsigned char storage; /* If this object is a key, where is the value?
250 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
251 unsigned char vtype; /* If this object is a key, and value is swapped out,
252 * this is the type of the swapped out object. */
253 int refcount;
254 /* VM fields, this are only allocated if VM is active, otherwise the
255 * object allocation function will just allocate
256 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
257 * Redis without VM active will not have any overhead. */
258 struct redisObjectVM vm;
259 } robj;
260
261 /* Macro used to initalize a Redis object allocated on the stack.
262 * Note that this macro is taken near the structure definition to make sure
263 * we'll update it when the structure is changed, to avoid bugs like
264 * bug #85 introduced exactly in this way. */
265 #define initStaticStringObject(_var,_ptr) do { \
266 _var.refcount = 1; \
267 _var.type = REDIS_STRING; \
268 _var.encoding = REDIS_ENCODING_RAW; \
269 _var.ptr = _ptr; \
270 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
271 } while(0);
272
273 typedef struct redisDb {
274 dict *dict; /* The keyspace for this DB */
275 dict *expires; /* Timeout of keys with a timeout set */
276 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
277 dict *io_keys; /* Keys with clients waiting for VM I/O */
278 int id;
279 } redisDb;
280
281 /* Client MULTI/EXEC state */
282 typedef struct multiCmd {
283 robj **argv;
284 int argc;
285 struct redisCommand *cmd;
286 } multiCmd;
287
288 typedef struct multiState {
289 multiCmd *commands; /* Array of MULTI commands */
290 int count; /* Total number of MULTI commands */
291 } multiState;
292
293 /* With multiplexing we need to take per-clinet state.
294 * Clients are taken in a liked list. */
295 typedef struct redisClient {
296 int fd;
297 redisDb *db;
298 int dictid;
299 sds querybuf;
300 robj **argv, **mbargv;
301 int argc, mbargc;
302 int bulklen; /* bulk read len. -1 if not in bulk read mode */
303 int multibulk; /* multi bulk command format active */
304 list *reply;
305 int sentlen;
306 time_t lastinteraction; /* time of the last interaction, used for timeout */
307 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
308 int slaveseldb; /* slave selected db, if this client is a slave */
309 int authenticated; /* when requirepass is non-NULL */
310 int replstate; /* replication state if this is a slave */
311 int repldbfd; /* replication DB file descriptor */
312 long repldboff; /* replication DB file offset */
313 off_t repldbsize; /* replication DB file size */
314 multiState mstate; /* MULTI/EXEC state */
315 robj **blockingkeys; /* The key we are waiting to terminate a blocking
316 * operation such as BLPOP. Otherwise NULL. */
317 int blockingkeysnum; /* Number of blocking keys */
318 time_t blockingto; /* Blocking operation timeout. If UNIX current time
319 * is >= blockingto then the operation timed out. */
320 list *io_keys; /* Keys this client is waiting to be loaded from the
321 * swap file in order to continue. */
322 } redisClient;
323
324 struct saveparam {
325 time_t seconds;
326 int changes;
327 };
328
329 /* Global server state structure */
330 struct redisServer {
331 int port;
332 int fd;
333 redisDb *db;
334 dict *sharingpool; /* Poll used for object sharing */
335 unsigned int sharingpoolsize;
336 long long dirty; /* changes to DB from the last save */
337 list *clients;
338 list *slaves, *monitors;
339 char neterr[ANET_ERR_LEN];
340 aeEventLoop *el;
341 int cronloops; /* number of times the cron function run */
342 list *objfreelist; /* A list of freed objects to avoid malloc() */
343 time_t lastsave; /* Unix time of last save succeeede */
344 /* Fields used only for stats */
345 time_t stat_starttime; /* server start time */
346 long long stat_numcommands; /* number of processed commands */
347 long long stat_numconnections; /* number of connections received */
348 /* Configuration */
349 int verbosity;
350 int glueoutputbuf;
351 int maxidletime;
352 int dbnum;
353 int daemonize;
354 int appendonly;
355 int appendfsync;
356 time_t lastfsync;
357 int appendfd;
358 int appendseldb;
359 char *pidfile;
360 pid_t bgsavechildpid;
361 pid_t bgrewritechildpid;
362 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
363 struct saveparam *saveparams;
364 int saveparamslen;
365 char *logfile;
366 char *bindaddr;
367 char *dbfilename;
368 char *appendfilename;
369 char *requirepass;
370 int shareobjects;
371 int rdbcompression;
372 /* Replication related */
373 int isslave;
374 char *masterauth;
375 char *masterhost;
376 int masterport;
377 redisClient *master; /* client that is master for this slave */
378 int replstate;
379 unsigned int maxclients;
380 unsigned long long maxmemory;
381 unsigned int blpop_blocked_clients;
382 unsigned int vm_blocked_clients;
383 /* Sort parameters - qsort_r() is only available under BSD so we
384 * have to take this state global, in order to pass it to sortCompare() */
385 int sort_desc;
386 int sort_alpha;
387 int sort_bypattern;
388 /* Virtual memory configuration */
389 int vm_enabled;
390 char *vm_swap_file;
391 off_t vm_page_size;
392 off_t vm_pages;
393 unsigned long long vm_max_memory;
394 /* Virtual memory state */
395 FILE *vm_fp;
396 int vm_fd;
397 off_t vm_next_page; /* Next probably empty page */
398 off_t vm_near_pages; /* Number of pages allocated sequentially */
399 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
400 time_t unixtime; /* Unix time sampled every second. */
401 /* Virtual memory I/O threads stuff */
402 /* An I/O thread process an element taken from the io_jobs queue and
403 * put the result of the operation in the io_done list. While the
404 * job is being processed, it's put on io_processing queue. */
405 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
406 list *io_processing; /* List of VM I/O jobs being processed */
407 list *io_processed; /* List of VM I/O jobs already processed */
408 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
409 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
410 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
411 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
412 pthread_attr_t io_threads_attr; /* attributes for threads creation */
413 int io_active_threads; /* Number of running I/O threads */
414 int vm_max_threads; /* Max number of I/O threads running at the same time */
415 /* Our main thread is blocked on the event loop, locking for sockets ready
416 * to be read or written, so when a threaded I/O operation is ready to be
417 * processed by the main thread, the I/O thread will use a unix pipe to
418 * awake the main thread. The followings are the two pipe FDs. */
419 int io_ready_pipe_read;
420 int io_ready_pipe_write;
421 /* Virtual memory stats */
422 unsigned long long vm_stats_used_pages;
423 unsigned long long vm_stats_swapped_objects;
424 unsigned long long vm_stats_swapouts;
425 unsigned long long vm_stats_swapins;
426 FILE *devnull;
427 };
428
429 typedef void redisCommandProc(redisClient *c);
430 struct redisCommand {
431 char *name;
432 redisCommandProc *proc;
433 int arity;
434 int flags;
435 /* What keys should be loaded in background when calling this command? */
436 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
437 int vm_lastkey; /* THe last argument that's a key */
438 int vm_keystep; /* The step between first and last key */
439 };
440
441 struct redisFunctionSym {
442 char *name;
443 unsigned long pointer;
444 };
445
446 typedef struct _redisSortObject {
447 robj *obj;
448 union {
449 double score;
450 robj *cmpobj;
451 } u;
452 } redisSortObject;
453
454 typedef struct _redisSortOperation {
455 int type;
456 robj *pattern;
457 } redisSortOperation;
458
459 /* ZSETs use a specialized version of Skiplists */
460
461 typedef struct zskiplistNode {
462 struct zskiplistNode **forward;
463 struct zskiplistNode *backward;
464 unsigned int *span;
465 double score;
466 robj *obj;
467 } zskiplistNode;
468
469 typedef struct zskiplist {
470 struct zskiplistNode *header, *tail;
471 unsigned long length;
472 int level;
473 } zskiplist;
474
475 typedef struct zset {
476 dict *dict;
477 zskiplist *zsl;
478 } zset;
479
480 /* Our shared "common" objects */
481
482 struct sharedObjectsStruct {
483 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
484 *colon, *nullbulk, *nullmultibulk, *queued,
485 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
486 *outofrangeerr, *plus,
487 *select0, *select1, *select2, *select3, *select4,
488 *select5, *select6, *select7, *select8, *select9;
489 } shared;
490
491 /* Global vars that are actally used as constants. The following double
492 * values are used for double on-disk serialization, and are initialized
493 * at runtime to avoid strange compiler optimizations. */
494
495 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
496
497 /* VM threaded I/O request message */
498 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
499 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
500 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
501 typedef struct iojob {
502 int type; /* Request type, REDIS_IOJOB_* */
503 redisDb *db;/* Redis database */
504 robj *key; /* This I/O request is about swapping this key */
505 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
506 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
507 off_t page; /* Swap page where to read/write the object */
508 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
509 int canceled; /* True if this command was canceled by blocking side of VM */
510 pthread_t thread; /* ID of the thread processing this entry */
511 } iojob;
512
513 /*================================ Prototypes =============================== */
514
515 static void freeStringObject(robj *o);
516 static void freeListObject(robj *o);
517 static void freeSetObject(robj *o);
518 static void decrRefCount(void *o);
519 static robj *createObject(int type, void *ptr);
520 static void freeClient(redisClient *c);
521 static int rdbLoad(char *filename);
522 static void addReply(redisClient *c, robj *obj);
523 static void addReplySds(redisClient *c, sds s);
524 static void incrRefCount(robj *o);
525 static int rdbSaveBackground(char *filename);
526 static robj *createStringObject(char *ptr, size_t len);
527 static robj *dupStringObject(robj *o);
528 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
529 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
530 static int syncWithMaster(void);
531 static robj *tryObjectSharing(robj *o);
532 static int tryObjectEncoding(robj *o);
533 static robj *getDecodedObject(robj *o);
534 static int removeExpire(redisDb *db, robj *key);
535 static int expireIfNeeded(redisDb *db, robj *key);
536 static int deleteIfVolatile(redisDb *db, robj *key);
537 static int deleteIfSwapped(redisDb *db, robj *key);
538 static int deleteKey(redisDb *db, robj *key);
539 static time_t getExpire(redisDb *db, robj *key);
540 static int setExpire(redisDb *db, robj *key, time_t when);
541 static void updateSlavesWaitingBgsave(int bgsaveerr);
542 static void freeMemoryIfNeeded(void);
543 static int processCommand(redisClient *c);
544 static void setupSigSegvAction(void);
545 static void rdbRemoveTempFile(pid_t childpid);
546 static void aofRemoveTempFile(pid_t childpid);
547 static size_t stringObjectLen(robj *o);
548 static void processInputBuffer(redisClient *c);
549 static zskiplist *zslCreate(void);
550 static void zslFree(zskiplist *zsl);
551 static void zslInsert(zskiplist *zsl, double score, robj *obj);
552 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
553 static void initClientMultiState(redisClient *c);
554 static void freeClientMultiState(redisClient *c);
555 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
556 static void unblockClientWaitingData(redisClient *c);
557 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
558 static void vmInit(void);
559 static void vmMarkPagesFree(off_t page, off_t count);
560 static robj *vmLoadObject(robj *key);
561 static robj *vmPreviewObject(robj *key);
562 static int vmSwapOneObjectBlocking(void);
563 static int vmSwapOneObjectThreaded(void);
564 static int vmCanSwapOut(void);
565 static int tryFreeOneObjectFromFreelist(void);
566 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
567 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
568 static void vmCancelThreadedIOJob(robj *o);
569 static void lockThreadedIO(void);
570 static void unlockThreadedIO(void);
571 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
572 static void freeIOJob(iojob *j);
573 static void queueIOJob(iojob *j);
574 static int vmWriteObjectOnSwap(robj *o, off_t page);
575 static robj *vmReadObjectFromSwap(off_t page, int type);
576 static void waitEmptyIOJobsQueue(void);
577 static void vmReopenSwapFile(void);
578 static int vmFreePage(off_t page);
579 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
580 static int dontWaitForSwappedKey(redisClient *c, robj *key);
581 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
582 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
583 static struct redisCommand *lookupCommand(char *name);
584 static void call(redisClient *c, struct redisCommand *cmd);
585 static void resetClient(redisClient *c);
586
587 static void authCommand(redisClient *c);
588 static void pingCommand(redisClient *c);
589 static void echoCommand(redisClient *c);
590 static void setCommand(redisClient *c);
591 static void setnxCommand(redisClient *c);
592 static void getCommand(redisClient *c);
593 static void delCommand(redisClient *c);
594 static void existsCommand(redisClient *c);
595 static void incrCommand(redisClient *c);
596 static void decrCommand(redisClient *c);
597 static void incrbyCommand(redisClient *c);
598 static void decrbyCommand(redisClient *c);
599 static void selectCommand(redisClient *c);
600 static void randomkeyCommand(redisClient *c);
601 static void keysCommand(redisClient *c);
602 static void dbsizeCommand(redisClient *c);
603 static void lastsaveCommand(redisClient *c);
604 static void saveCommand(redisClient *c);
605 static void bgsaveCommand(redisClient *c);
606 static void bgrewriteaofCommand(redisClient *c);
607 static void shutdownCommand(redisClient *c);
608 static void moveCommand(redisClient *c);
609 static void renameCommand(redisClient *c);
610 static void renamenxCommand(redisClient *c);
611 static void lpushCommand(redisClient *c);
612 static void rpushCommand(redisClient *c);
613 static void lpopCommand(redisClient *c);
614 static void rpopCommand(redisClient *c);
615 static void llenCommand(redisClient *c);
616 static void lindexCommand(redisClient *c);
617 static void lrangeCommand(redisClient *c);
618 static void ltrimCommand(redisClient *c);
619 static void typeCommand(redisClient *c);
620 static void lsetCommand(redisClient *c);
621 static void saddCommand(redisClient *c);
622 static void sremCommand(redisClient *c);
623 static void smoveCommand(redisClient *c);
624 static void sismemberCommand(redisClient *c);
625 static void scardCommand(redisClient *c);
626 static void spopCommand(redisClient *c);
627 static void srandmemberCommand(redisClient *c);
628 static void sinterCommand(redisClient *c);
629 static void sinterstoreCommand(redisClient *c);
630 static void sunionCommand(redisClient *c);
631 static void sunionstoreCommand(redisClient *c);
632 static void sdiffCommand(redisClient *c);
633 static void sdiffstoreCommand(redisClient *c);
634 static void syncCommand(redisClient *c);
635 static void flushdbCommand(redisClient *c);
636 static void flushallCommand(redisClient *c);
637 static void sortCommand(redisClient *c);
638 static void lremCommand(redisClient *c);
639 static void rpoplpushcommand(redisClient *c);
640 static void infoCommand(redisClient *c);
641 static void mgetCommand(redisClient *c);
642 static void monitorCommand(redisClient *c);
643 static void expireCommand(redisClient *c);
644 static void expireatCommand(redisClient *c);
645 static void getsetCommand(redisClient *c);
646 static void ttlCommand(redisClient *c);
647 static void slaveofCommand(redisClient *c);
648 static void debugCommand(redisClient *c);
649 static void msetCommand(redisClient *c);
650 static void msetnxCommand(redisClient *c);
651 static void zaddCommand(redisClient *c);
652 static void zincrbyCommand(redisClient *c);
653 static void zrangeCommand(redisClient *c);
654 static void zrangebyscoreCommand(redisClient *c);
655 static void zcountCommand(redisClient *c);
656 static void zrevrangeCommand(redisClient *c);
657 static void zcardCommand(redisClient *c);
658 static void zremCommand(redisClient *c);
659 static void zscoreCommand(redisClient *c);
660 static void zremrangebyscoreCommand(redisClient *c);
661 static void multiCommand(redisClient *c);
662 static void execCommand(redisClient *c);
663 static void discardCommand(redisClient *c);
664 static void blpopCommand(redisClient *c);
665 static void brpopCommand(redisClient *c);
666 static void appendCommand(redisClient *c);
667 static void substrCommand(redisClient *c);
668 static void zrankCommand(redisClient *c);
669 static void hsetCommand(redisClient *c);
670 static void hgetCommand(redisClient *c);
671
672 /*================================= Globals ================================= */
673
674 /* Global vars */
675 static struct redisServer server; /* server global state */
676 static struct redisCommand cmdTable[] = {
677 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
678 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
679 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
680 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
681 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
682 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
683 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
684 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
685 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
686 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
687 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
688 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
689 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
690 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
691 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
692 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
693 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
694 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
695 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
696 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
697 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
698 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
699 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
700 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
701 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
702 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
703 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
704 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
705 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
706 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
707 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
708 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
709 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
710 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
711 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
712 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
713 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
714 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
715 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
716 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
717 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
718 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
719 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
720 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
721 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
722 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
723 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
724 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
725 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
726 {"hget",hgetCommand,3,REDIS_CMD_BULK,1,1,1},
727 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
728 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
729 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
730 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
731 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
732 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
733 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
734 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
735 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
736 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
737 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
738 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
739 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
740 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
741 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
742 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
743 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
744 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
745 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
746 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
747 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
748 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
749 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
750 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
751 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
752 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
753 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
754 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
755 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
756 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
757 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
758 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
759 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
760 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
761 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
762 {NULL,NULL,0,0,0,0,0}
763 };
764
765 /*============================ Utility functions ============================ */
766
767 /* Glob-style pattern matching. */
768 int stringmatchlen(const char *pattern, int patternLen,
769 const char *string, int stringLen, int nocase)
770 {
771 while(patternLen) {
772 switch(pattern[0]) {
773 case '*':
774 while (pattern[1] == '*') {
775 pattern++;
776 patternLen--;
777 }
778 if (patternLen == 1)
779 return 1; /* match */
780 while(stringLen) {
781 if (stringmatchlen(pattern+1, patternLen-1,
782 string, stringLen, nocase))
783 return 1; /* match */
784 string++;
785 stringLen--;
786 }
787 return 0; /* no match */
788 break;
789 case '?':
790 if (stringLen == 0)
791 return 0; /* no match */
792 string++;
793 stringLen--;
794 break;
795 case '[':
796 {
797 int not, match;
798
799 pattern++;
800 patternLen--;
801 not = pattern[0] == '^';
802 if (not) {
803 pattern++;
804 patternLen--;
805 }
806 match = 0;
807 while(1) {
808 if (pattern[0] == '\\') {
809 pattern++;
810 patternLen--;
811 if (pattern[0] == string[0])
812 match = 1;
813 } else if (pattern[0] == ']') {
814 break;
815 } else if (patternLen == 0) {
816 pattern--;
817 patternLen++;
818 break;
819 } else if (pattern[1] == '-' && patternLen >= 3) {
820 int start = pattern[0];
821 int end = pattern[2];
822 int c = string[0];
823 if (start > end) {
824 int t = start;
825 start = end;
826 end = t;
827 }
828 if (nocase) {
829 start = tolower(start);
830 end = tolower(end);
831 c = tolower(c);
832 }
833 pattern += 2;
834 patternLen -= 2;
835 if (c >= start && c <= end)
836 match = 1;
837 } else {
838 if (!nocase) {
839 if (pattern[0] == string[0])
840 match = 1;
841 } else {
842 if (tolower((int)pattern[0]) == tolower((int)string[0]))
843 match = 1;
844 }
845 }
846 pattern++;
847 patternLen--;
848 }
849 if (not)
850 match = !match;
851 if (!match)
852 return 0; /* no match */
853 string++;
854 stringLen--;
855 break;
856 }
857 case '\\':
858 if (patternLen >= 2) {
859 pattern++;
860 patternLen--;
861 }
862 /* fall through */
863 default:
864 if (!nocase) {
865 if (pattern[0] != string[0])
866 return 0; /* no match */
867 } else {
868 if (tolower((int)pattern[0]) != tolower((int)string[0]))
869 return 0; /* no match */
870 }
871 string++;
872 stringLen--;
873 break;
874 }
875 pattern++;
876 patternLen--;
877 if (stringLen == 0) {
878 while(*pattern == '*') {
879 pattern++;
880 patternLen--;
881 }
882 break;
883 }
884 }
885 if (patternLen == 0 && stringLen == 0)
886 return 1;
887 return 0;
888 }
889
890 static void redisLog(int level, const char *fmt, ...) {
891 va_list ap;
892 FILE *fp;
893
894 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
895 if (!fp) return;
896
897 va_start(ap, fmt);
898 if (level >= server.verbosity) {
899 char *c = ".-*#";
900 char buf[64];
901 time_t now;
902
903 now = time(NULL);
904 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
905 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
906 vfprintf(fp, fmt, ap);
907 fprintf(fp,"\n");
908 fflush(fp);
909 }
910 va_end(ap);
911
912 if (server.logfile) fclose(fp);
913 }
914
915 /*====================== Hash table type implementation ==================== */
916
917 /* This is an hash table type that uses the SDS dynamic strings libary as
918 * keys and radis objects as values (objects can hold SDS strings,
919 * lists, sets). */
920
921 static void dictVanillaFree(void *privdata, void *val)
922 {
923 DICT_NOTUSED(privdata);
924 zfree(val);
925 }
926
927 static void dictListDestructor(void *privdata, void *val)
928 {
929 DICT_NOTUSED(privdata);
930 listRelease((list*)val);
931 }
932
933 static int sdsDictKeyCompare(void *privdata, const void *key1,
934 const void *key2)
935 {
936 int l1,l2;
937 DICT_NOTUSED(privdata);
938
939 l1 = sdslen((sds)key1);
940 l2 = sdslen((sds)key2);
941 if (l1 != l2) return 0;
942 return memcmp(key1, key2, l1) == 0;
943 }
944
945 static void dictRedisObjectDestructor(void *privdata, void *val)
946 {
947 DICT_NOTUSED(privdata);
948
949 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
950 decrRefCount(val);
951 }
952
953 static int dictObjKeyCompare(void *privdata, const void *key1,
954 const void *key2)
955 {
956 const robj *o1 = key1, *o2 = key2;
957 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
958 }
959
960 static unsigned int dictObjHash(const void *key) {
961 const robj *o = key;
962 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
963 }
964
965 static int dictEncObjKeyCompare(void *privdata, const void *key1,
966 const void *key2)
967 {
968 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
969 int cmp;
970
971 o1 = getDecodedObject(o1);
972 o2 = getDecodedObject(o2);
973 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
974 decrRefCount(o1);
975 decrRefCount(o2);
976 return cmp;
977 }
978
979 static unsigned int dictEncObjHash(const void *key) {
980 robj *o = (robj*) key;
981
982 if (o->encoding == REDIS_ENCODING_RAW) {
983 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
984 } else {
985 if (o->encoding == REDIS_ENCODING_INT) {
986 char buf[32];
987 int len;
988
989 len = snprintf(buf,32,"%ld",(long)o->ptr);
990 return dictGenHashFunction((unsigned char*)buf, len);
991 } else {
992 unsigned int hash;
993
994 o = getDecodedObject(o);
995 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
996 decrRefCount(o);
997 return hash;
998 }
999 }
1000 }
1001
1002 /* Sets type and expires */
1003 static dictType setDictType = {
1004 dictEncObjHash, /* hash function */
1005 NULL, /* key dup */
1006 NULL, /* val dup */
1007 dictEncObjKeyCompare, /* key compare */
1008 dictRedisObjectDestructor, /* key destructor */
1009 NULL /* val destructor */
1010 };
1011
1012 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1013 static dictType zsetDictType = {
1014 dictEncObjHash, /* hash function */
1015 NULL, /* key dup */
1016 NULL, /* val dup */
1017 dictEncObjKeyCompare, /* key compare */
1018 dictRedisObjectDestructor, /* key destructor */
1019 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1020 };
1021
1022 /* Db->dict */
1023 static dictType dbDictType = {
1024 dictObjHash, /* hash function */
1025 NULL, /* key dup */
1026 NULL, /* val dup */
1027 dictObjKeyCompare, /* key compare */
1028 dictRedisObjectDestructor, /* key destructor */
1029 dictRedisObjectDestructor /* val destructor */
1030 };
1031
1032 /* Db->expires */
1033 static dictType keyptrDictType = {
1034 dictObjHash, /* hash function */
1035 NULL, /* key dup */
1036 NULL, /* val dup */
1037 dictObjKeyCompare, /* key compare */
1038 dictRedisObjectDestructor, /* key destructor */
1039 NULL /* val destructor */
1040 };
1041
1042 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1043 static dictType hashDictType = {
1044 dictEncObjHash, /* hash function */
1045 NULL, /* key dup */
1046 NULL, /* val dup */
1047 dictEncObjKeyCompare, /* key compare */
1048 dictRedisObjectDestructor, /* key destructor */
1049 dictRedisObjectDestructor /* val destructor */
1050 };
1051
1052 /* Keylist hash table type has unencoded redis objects as keys and
1053 * lists as values. It's used for blocking operations (BLPOP) and to
1054 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1055 static dictType keylistDictType = {
1056 dictObjHash, /* hash function */
1057 NULL, /* key dup */
1058 NULL, /* val dup */
1059 dictObjKeyCompare, /* key compare */
1060 dictRedisObjectDestructor, /* key destructor */
1061 dictListDestructor /* val destructor */
1062 };
1063
1064 /* ========================= Random utility functions ======================= */
1065
1066 /* Redis generally does not try to recover from out of memory conditions
1067 * when allocating objects or strings, it is not clear if it will be possible
1068 * to report this condition to the client since the networking layer itself
1069 * is based on heap allocation for send buffers, so we simply abort.
1070 * At least the code will be simpler to read... */
1071 static void oom(const char *msg) {
1072 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1073 sleep(1);
1074 abort();
1075 }
1076
1077 /* ====================== Redis server networking stuff ===================== */
1078 static void closeTimedoutClients(void) {
1079 redisClient *c;
1080 listNode *ln;
1081 time_t now = time(NULL);
1082 listIter li;
1083
1084 listRewind(server.clients,&li);
1085 while ((ln = listNext(&li)) != NULL) {
1086 c = listNodeValue(ln);
1087 if (server.maxidletime &&
1088 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1089 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1090 (now - c->lastinteraction > server.maxidletime))
1091 {
1092 redisLog(REDIS_VERBOSE,"Closing idle client");
1093 freeClient(c);
1094 } else if (c->flags & REDIS_BLOCKED) {
1095 if (c->blockingto != 0 && c->blockingto < now) {
1096 addReply(c,shared.nullmultibulk);
1097 unblockClientWaitingData(c);
1098 }
1099 }
1100 }
1101 }
1102
1103 static int htNeedsResize(dict *dict) {
1104 long long size, used;
1105
1106 size = dictSlots(dict);
1107 used = dictSize(dict);
1108 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1109 (used*100/size < REDIS_HT_MINFILL));
1110 }
1111
1112 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1113 * we resize the hash table to save memory */
1114 static void tryResizeHashTables(void) {
1115 int j;
1116
1117 for (j = 0; j < server.dbnum; j++) {
1118 if (htNeedsResize(server.db[j].dict)) {
1119 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1120 dictResize(server.db[j].dict);
1121 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1122 }
1123 if (htNeedsResize(server.db[j].expires))
1124 dictResize(server.db[j].expires);
1125 }
1126 }
1127
1128 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1129 void backgroundSaveDoneHandler(int statloc) {
1130 int exitcode = WEXITSTATUS(statloc);
1131 int bysignal = WIFSIGNALED(statloc);
1132
1133 if (!bysignal && exitcode == 0) {
1134 redisLog(REDIS_NOTICE,
1135 "Background saving terminated with success");
1136 server.dirty = 0;
1137 server.lastsave = time(NULL);
1138 } else if (!bysignal && exitcode != 0) {
1139 redisLog(REDIS_WARNING, "Background saving error");
1140 } else {
1141 redisLog(REDIS_WARNING,
1142 "Background saving terminated by signal");
1143 rdbRemoveTempFile(server.bgsavechildpid);
1144 }
1145 server.bgsavechildpid = -1;
1146 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1147 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1148 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1149 }
1150
1151 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1152 * Handle this. */
1153 void backgroundRewriteDoneHandler(int statloc) {
1154 int exitcode = WEXITSTATUS(statloc);
1155 int bysignal = WIFSIGNALED(statloc);
1156
1157 if (!bysignal && exitcode == 0) {
1158 int fd;
1159 char tmpfile[256];
1160
1161 redisLog(REDIS_NOTICE,
1162 "Background append only file rewriting terminated with success");
1163 /* Now it's time to flush the differences accumulated by the parent */
1164 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1165 fd = open(tmpfile,O_WRONLY|O_APPEND);
1166 if (fd == -1) {
1167 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1168 goto cleanup;
1169 }
1170 /* Flush our data... */
1171 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1172 (signed) sdslen(server.bgrewritebuf)) {
1173 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1174 close(fd);
1175 goto cleanup;
1176 }
1177 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1178 /* Now our work is to rename the temp file into the stable file. And
1179 * switch the file descriptor used by the server for append only. */
1180 if (rename(tmpfile,server.appendfilename) == -1) {
1181 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1182 close(fd);
1183 goto cleanup;
1184 }
1185 /* Mission completed... almost */
1186 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1187 if (server.appendfd != -1) {
1188 /* If append only is actually enabled... */
1189 close(server.appendfd);
1190 server.appendfd = fd;
1191 fsync(fd);
1192 server.appendseldb = -1; /* Make sure it will issue SELECT */
1193 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1194 } else {
1195 /* If append only is disabled we just generate a dump in this
1196 * format. Why not? */
1197 close(fd);
1198 }
1199 } else if (!bysignal && exitcode != 0) {
1200 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1201 } else {
1202 redisLog(REDIS_WARNING,
1203 "Background append only file rewriting terminated by signal");
1204 }
1205 cleanup:
1206 sdsfree(server.bgrewritebuf);
1207 server.bgrewritebuf = sdsempty();
1208 aofRemoveTempFile(server.bgrewritechildpid);
1209 server.bgrewritechildpid = -1;
1210 }
1211
1212 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1213 int j, loops = server.cronloops++;
1214 REDIS_NOTUSED(eventLoop);
1215 REDIS_NOTUSED(id);
1216 REDIS_NOTUSED(clientData);
1217
1218 /* We take a cached value of the unix time in the global state because
1219 * with virtual memory and aging there is to store the current time
1220 * in objects at every object access, and accuracy is not needed.
1221 * To access a global var is faster than calling time(NULL) */
1222 server.unixtime = time(NULL);
1223
1224 /* Show some info about non-empty databases */
1225 for (j = 0; j < server.dbnum; j++) {
1226 long long size, used, vkeys;
1227
1228 size = dictSlots(server.db[j].dict);
1229 used = dictSize(server.db[j].dict);
1230 vkeys = dictSize(server.db[j].expires);
1231 if (!(loops % 5) && (used || vkeys)) {
1232 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1233 /* dictPrintStats(server.dict); */
1234 }
1235 }
1236
1237 /* We don't want to resize the hash tables while a bacground saving
1238 * is in progress: the saving child is created using fork() that is
1239 * implemented with a copy-on-write semantic in most modern systems, so
1240 * if we resize the HT while there is the saving child at work actually
1241 * a lot of memory movements in the parent will cause a lot of pages
1242 * copied. */
1243 if (server.bgsavechildpid == -1) tryResizeHashTables();
1244
1245 /* Show information about connected clients */
1246 if (!(loops % 5)) {
1247 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1248 listLength(server.clients)-listLength(server.slaves),
1249 listLength(server.slaves),
1250 zmalloc_used_memory(),
1251 dictSize(server.sharingpool));
1252 }
1253
1254 /* Close connections of timedout clients */
1255 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1256 closeTimedoutClients();
1257
1258 /* Check if a background saving or AOF rewrite in progress terminated */
1259 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1260 int statloc;
1261 pid_t pid;
1262
1263 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1264 if (pid == server.bgsavechildpid) {
1265 backgroundSaveDoneHandler(statloc);
1266 } else {
1267 backgroundRewriteDoneHandler(statloc);
1268 }
1269 }
1270 } else {
1271 /* If there is not a background saving in progress check if
1272 * we have to save now */
1273 time_t now = time(NULL);
1274 for (j = 0; j < server.saveparamslen; j++) {
1275 struct saveparam *sp = server.saveparams+j;
1276
1277 if (server.dirty >= sp->changes &&
1278 now-server.lastsave > sp->seconds) {
1279 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1280 sp->changes, sp->seconds);
1281 rdbSaveBackground(server.dbfilename);
1282 break;
1283 }
1284 }
1285 }
1286
1287 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1288 * will use few CPU cycles if there are few expiring keys, otherwise
1289 * it will get more aggressive to avoid that too much memory is used by
1290 * keys that can be removed from the keyspace. */
1291 for (j = 0; j < server.dbnum; j++) {
1292 int expired;
1293 redisDb *db = server.db+j;
1294
1295 /* Continue to expire if at the end of the cycle more than 25%
1296 * of the keys were expired. */
1297 do {
1298 long num = dictSize(db->expires);
1299 time_t now = time(NULL);
1300
1301 expired = 0;
1302 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1303 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1304 while (num--) {
1305 dictEntry *de;
1306 time_t t;
1307
1308 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1309 t = (time_t) dictGetEntryVal(de);
1310 if (now > t) {
1311 deleteKey(db,dictGetEntryKey(de));
1312 expired++;
1313 }
1314 }
1315 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1316 }
1317
1318 /* Swap a few keys on disk if we are over the memory limit and VM
1319 * is enbled. Try to free objects from the free list first. */
1320 if (vmCanSwapOut()) {
1321 while (server.vm_enabled && zmalloc_used_memory() >
1322 server.vm_max_memory)
1323 {
1324 int retval;
1325
1326 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1327 retval = (server.vm_max_threads == 0) ?
1328 vmSwapOneObjectBlocking() :
1329 vmSwapOneObjectThreaded();
1330 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1331 zmalloc_used_memory() >
1332 (server.vm_max_memory+server.vm_max_memory/10))
1333 {
1334 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1335 }
1336 /* Note that when using threade I/O we free just one object,
1337 * because anyway when the I/O thread in charge to swap this
1338 * object out will finish, the handler of completed jobs
1339 * will try to swap more objects if we are still out of memory. */
1340 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1341 }
1342 }
1343
1344 /* Check if we should connect to a MASTER */
1345 if (server.replstate == REDIS_REPL_CONNECT) {
1346 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1347 if (syncWithMaster() == REDIS_OK) {
1348 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1349 }
1350 }
1351 return 1000;
1352 }
1353
1354 /* This function gets called every time Redis is entering the
1355 * main loop of the event driven library, that is, before to sleep
1356 * for ready file descriptors. */
1357 static void beforeSleep(struct aeEventLoop *eventLoop) {
1358 REDIS_NOTUSED(eventLoop);
1359
1360 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1361 listIter li;
1362 listNode *ln;
1363
1364 listRewind(server.io_ready_clients,&li);
1365 while((ln = listNext(&li))) {
1366 redisClient *c = ln->value;
1367 struct redisCommand *cmd;
1368
1369 /* Resume the client. */
1370 listDelNode(server.io_ready_clients,ln);
1371 c->flags &= (~REDIS_IO_WAIT);
1372 server.vm_blocked_clients--;
1373 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1374 readQueryFromClient, c);
1375 cmd = lookupCommand(c->argv[0]->ptr);
1376 assert(cmd != NULL);
1377 call(c,cmd);
1378 resetClient(c);
1379 /* There may be more data to process in the input buffer. */
1380 if (c->querybuf && sdslen(c->querybuf) > 0)
1381 processInputBuffer(c);
1382 }
1383 }
1384 }
1385
1386 static void createSharedObjects(void) {
1387 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1388 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1389 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1390 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1391 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1392 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1393 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1394 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1395 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1396 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1397 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1398 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1399 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1400 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1401 "-ERR no such key\r\n"));
1402 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1403 "-ERR syntax error\r\n"));
1404 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1405 "-ERR source and destination objects are the same\r\n"));
1406 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1407 "-ERR index out of range\r\n"));
1408 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1409 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1410 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1411 shared.select0 = createStringObject("select 0\r\n",10);
1412 shared.select1 = createStringObject("select 1\r\n",10);
1413 shared.select2 = createStringObject("select 2\r\n",10);
1414 shared.select3 = createStringObject("select 3\r\n",10);
1415 shared.select4 = createStringObject("select 4\r\n",10);
1416 shared.select5 = createStringObject("select 5\r\n",10);
1417 shared.select6 = createStringObject("select 6\r\n",10);
1418 shared.select7 = createStringObject("select 7\r\n",10);
1419 shared.select8 = createStringObject("select 8\r\n",10);
1420 shared.select9 = createStringObject("select 9\r\n",10);
1421 }
1422
1423 static void appendServerSaveParams(time_t seconds, int changes) {
1424 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1425 server.saveparams[server.saveparamslen].seconds = seconds;
1426 server.saveparams[server.saveparamslen].changes = changes;
1427 server.saveparamslen++;
1428 }
1429
1430 static void resetServerSaveParams() {
1431 zfree(server.saveparams);
1432 server.saveparams = NULL;
1433 server.saveparamslen = 0;
1434 }
1435
1436 static void initServerConfig() {
1437 server.dbnum = REDIS_DEFAULT_DBNUM;
1438 server.port = REDIS_SERVERPORT;
1439 server.verbosity = REDIS_VERBOSE;
1440 server.maxidletime = REDIS_MAXIDLETIME;
1441 server.saveparams = NULL;
1442 server.logfile = NULL; /* NULL = log on standard output */
1443 server.bindaddr = NULL;
1444 server.glueoutputbuf = 1;
1445 server.daemonize = 0;
1446 server.appendonly = 0;
1447 server.appendfsync = APPENDFSYNC_ALWAYS;
1448 server.lastfsync = time(NULL);
1449 server.appendfd = -1;
1450 server.appendseldb = -1; /* Make sure the first time will not match */
1451 server.pidfile = "/var/run/redis.pid";
1452 server.dbfilename = "dump.rdb";
1453 server.appendfilename = "appendonly.aof";
1454 server.requirepass = NULL;
1455 server.shareobjects = 0;
1456 server.rdbcompression = 1;
1457 server.sharingpoolsize = 1024;
1458 server.maxclients = 0;
1459 server.blpop_blocked_clients = 0;
1460 server.maxmemory = 0;
1461 server.vm_enabled = 0;
1462 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1463 server.vm_page_size = 256; /* 256 bytes per page */
1464 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1465 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1466 server.vm_max_threads = 4;
1467 server.vm_blocked_clients = 0;
1468
1469 resetServerSaveParams();
1470
1471 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1472 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1473 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1474 /* Replication related */
1475 server.isslave = 0;
1476 server.masterauth = NULL;
1477 server.masterhost = NULL;
1478 server.masterport = 6379;
1479 server.master = NULL;
1480 server.replstate = REDIS_REPL_NONE;
1481
1482 /* Double constants initialization */
1483 R_Zero = 0.0;
1484 R_PosInf = 1.0/R_Zero;
1485 R_NegInf = -1.0/R_Zero;
1486 R_Nan = R_Zero/R_Zero;
1487 }
1488
1489 static void initServer() {
1490 int j;
1491
1492 signal(SIGHUP, SIG_IGN);
1493 signal(SIGPIPE, SIG_IGN);
1494 setupSigSegvAction();
1495
1496 server.devnull = fopen("/dev/null","w");
1497 if (server.devnull == NULL) {
1498 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1499 exit(1);
1500 }
1501 server.clients = listCreate();
1502 server.slaves = listCreate();
1503 server.monitors = listCreate();
1504 server.objfreelist = listCreate();
1505 createSharedObjects();
1506 server.el = aeCreateEventLoop();
1507 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1508 server.sharingpool = dictCreate(&setDictType,NULL);
1509 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1510 if (server.fd == -1) {
1511 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1512 exit(1);
1513 }
1514 for (j = 0; j < server.dbnum; j++) {
1515 server.db[j].dict = dictCreate(&dbDictType,NULL);
1516 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1517 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1518 if (server.vm_enabled)
1519 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1520 server.db[j].id = j;
1521 }
1522 server.cronloops = 0;
1523 server.bgsavechildpid = -1;
1524 server.bgrewritechildpid = -1;
1525 server.bgrewritebuf = sdsempty();
1526 server.lastsave = time(NULL);
1527 server.dirty = 0;
1528 server.stat_numcommands = 0;
1529 server.stat_numconnections = 0;
1530 server.stat_starttime = time(NULL);
1531 server.unixtime = time(NULL);
1532 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1533 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1534 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1535
1536 if (server.appendonly) {
1537 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1538 if (server.appendfd == -1) {
1539 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1540 strerror(errno));
1541 exit(1);
1542 }
1543 }
1544
1545 if (server.vm_enabled) vmInit();
1546 }
1547
1548 /* Empty the whole database */
1549 static long long emptyDb() {
1550 int j;
1551 long long removed = 0;
1552
1553 for (j = 0; j < server.dbnum; j++) {
1554 removed += dictSize(server.db[j].dict);
1555 dictEmpty(server.db[j].dict);
1556 dictEmpty(server.db[j].expires);
1557 }
1558 return removed;
1559 }
1560
1561 static int yesnotoi(char *s) {
1562 if (!strcasecmp(s,"yes")) return 1;
1563 else if (!strcasecmp(s,"no")) return 0;
1564 else return -1;
1565 }
1566
1567 /* I agree, this is a very rudimental way to load a configuration...
1568 will improve later if the config gets more complex */
1569 static void loadServerConfig(char *filename) {
1570 FILE *fp;
1571 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1572 int linenum = 0;
1573 sds line = NULL;
1574
1575 if (filename[0] == '-' && filename[1] == '\0')
1576 fp = stdin;
1577 else {
1578 if ((fp = fopen(filename,"r")) == NULL) {
1579 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1580 exit(1);
1581 }
1582 }
1583
1584 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1585 sds *argv;
1586 int argc, j;
1587
1588 linenum++;
1589 line = sdsnew(buf);
1590 line = sdstrim(line," \t\r\n");
1591
1592 /* Skip comments and blank lines*/
1593 if (line[0] == '#' || line[0] == '\0') {
1594 sdsfree(line);
1595 continue;
1596 }
1597
1598 /* Split into arguments */
1599 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1600 sdstolower(argv[0]);
1601
1602 /* Execute config directives */
1603 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1604 server.maxidletime = atoi(argv[1]);
1605 if (server.maxidletime < 0) {
1606 err = "Invalid timeout value"; goto loaderr;
1607 }
1608 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1609 server.port = atoi(argv[1]);
1610 if (server.port < 1 || server.port > 65535) {
1611 err = "Invalid port"; goto loaderr;
1612 }
1613 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1614 server.bindaddr = zstrdup(argv[1]);
1615 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1616 int seconds = atoi(argv[1]);
1617 int changes = atoi(argv[2]);
1618 if (seconds < 1 || changes < 0) {
1619 err = "Invalid save parameters"; goto loaderr;
1620 }
1621 appendServerSaveParams(seconds,changes);
1622 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1623 if (chdir(argv[1]) == -1) {
1624 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1625 argv[1], strerror(errno));
1626 exit(1);
1627 }
1628 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1629 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1630 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1631 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1632 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1633 else {
1634 err = "Invalid log level. Must be one of debug, notice, warning";
1635 goto loaderr;
1636 }
1637 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1638 FILE *logfp;
1639
1640 server.logfile = zstrdup(argv[1]);
1641 if (!strcasecmp(server.logfile,"stdout")) {
1642 zfree(server.logfile);
1643 server.logfile = NULL;
1644 }
1645 if (server.logfile) {
1646 /* Test if we are able to open the file. The server will not
1647 * be able to abort just for this problem later... */
1648 logfp = fopen(server.logfile,"a");
1649 if (logfp == NULL) {
1650 err = sdscatprintf(sdsempty(),
1651 "Can't open the log file: %s", strerror(errno));
1652 goto loaderr;
1653 }
1654 fclose(logfp);
1655 }
1656 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1657 server.dbnum = atoi(argv[1]);
1658 if (server.dbnum < 1) {
1659 err = "Invalid number of databases"; goto loaderr;
1660 }
1661 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1662 server.maxclients = atoi(argv[1]);
1663 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1664 server.maxmemory = strtoll(argv[1], NULL, 10);
1665 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1666 server.masterhost = sdsnew(argv[1]);
1667 server.masterport = atoi(argv[2]);
1668 server.replstate = REDIS_REPL_CONNECT;
1669 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1670 server.masterauth = zstrdup(argv[1]);
1671 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1672 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1673 err = "argument must be 'yes' or 'no'"; goto loaderr;
1674 }
1675 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1676 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1677 err = "argument must be 'yes' or 'no'"; goto loaderr;
1678 }
1679 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1680 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1681 err = "argument must be 'yes' or 'no'"; goto loaderr;
1682 }
1683 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1684 server.sharingpoolsize = atoi(argv[1]);
1685 if (server.sharingpoolsize < 1) {
1686 err = "invalid object sharing pool size"; goto loaderr;
1687 }
1688 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1689 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1690 err = "argument must be 'yes' or 'no'"; goto loaderr;
1691 }
1692 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1693 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1694 err = "argument must be 'yes' or 'no'"; goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1697 if (!strcasecmp(argv[1],"no")) {
1698 server.appendfsync = APPENDFSYNC_NO;
1699 } else if (!strcasecmp(argv[1],"always")) {
1700 server.appendfsync = APPENDFSYNC_ALWAYS;
1701 } else if (!strcasecmp(argv[1],"everysec")) {
1702 server.appendfsync = APPENDFSYNC_EVERYSEC;
1703 } else {
1704 err = "argument must be 'no', 'always' or 'everysec'";
1705 goto loaderr;
1706 }
1707 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1708 server.requirepass = zstrdup(argv[1]);
1709 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1710 server.pidfile = zstrdup(argv[1]);
1711 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1712 server.dbfilename = zstrdup(argv[1]);
1713 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1714 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1715 err = "argument must be 'yes' or 'no'"; goto loaderr;
1716 }
1717 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1718 zfree(server.vm_swap_file);
1719 server.vm_swap_file = zstrdup(argv[1]);
1720 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1721 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1722 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1723 server.vm_page_size = strtoll(argv[1], NULL, 10);
1724 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1725 server.vm_pages = strtoll(argv[1], NULL, 10);
1726 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1727 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1728 } else {
1729 err = "Bad directive or wrong number of arguments"; goto loaderr;
1730 }
1731 for (j = 0; j < argc; j++)
1732 sdsfree(argv[j]);
1733 zfree(argv);
1734 sdsfree(line);
1735 }
1736 if (fp != stdin) fclose(fp);
1737 return;
1738
1739 loaderr:
1740 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1741 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1742 fprintf(stderr, ">>> '%s'\n", line);
1743 fprintf(stderr, "%s\n", err);
1744 exit(1);
1745 }
1746
1747 static void freeClientArgv(redisClient *c) {
1748 int j;
1749
1750 for (j = 0; j < c->argc; j++)
1751 decrRefCount(c->argv[j]);
1752 for (j = 0; j < c->mbargc; j++)
1753 decrRefCount(c->mbargv[j]);
1754 c->argc = 0;
1755 c->mbargc = 0;
1756 }
1757
1758 static void freeClient(redisClient *c) {
1759 listNode *ln;
1760
1761 /* Note that if the client we are freeing is blocked into a blocking
1762 * call, we have to set querybuf to NULL *before* to call
1763 * unblockClientWaitingData() to avoid processInputBuffer() will get
1764 * called. Also it is important to remove the file events after
1765 * this, because this call adds the READABLE event. */
1766 sdsfree(c->querybuf);
1767 c->querybuf = NULL;
1768 if (c->flags & REDIS_BLOCKED)
1769 unblockClientWaitingData(c);
1770
1771 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1772 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1773 listRelease(c->reply);
1774 freeClientArgv(c);
1775 close(c->fd);
1776 /* Remove from the list of clients */
1777 ln = listSearchKey(server.clients,c);
1778 redisAssert(ln != NULL);
1779 listDelNode(server.clients,ln);
1780 /* Remove from the list of clients waiting for swapped keys */
1781 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1782 ln = listSearchKey(server.io_ready_clients,c);
1783 if (ln) {
1784 listDelNode(server.io_ready_clients,ln);
1785 server.vm_blocked_clients--;
1786 }
1787 }
1788 while (server.vm_enabled && listLength(c->io_keys)) {
1789 ln = listFirst(c->io_keys);
1790 dontWaitForSwappedKey(c,ln->value);
1791 }
1792 listRelease(c->io_keys);
1793 /* Other cleanup */
1794 if (c->flags & REDIS_SLAVE) {
1795 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1796 close(c->repldbfd);
1797 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1798 ln = listSearchKey(l,c);
1799 redisAssert(ln != NULL);
1800 listDelNode(l,ln);
1801 }
1802 if (c->flags & REDIS_MASTER) {
1803 server.master = NULL;
1804 server.replstate = REDIS_REPL_CONNECT;
1805 }
1806 zfree(c->argv);
1807 zfree(c->mbargv);
1808 freeClientMultiState(c);
1809 zfree(c);
1810 }
1811
1812 #define GLUEREPLY_UP_TO (1024)
1813 static void glueReplyBuffersIfNeeded(redisClient *c) {
1814 int copylen = 0;
1815 char buf[GLUEREPLY_UP_TO];
1816 listNode *ln;
1817 listIter li;
1818 robj *o;
1819
1820 listRewind(c->reply,&li);
1821 while((ln = listNext(&li))) {
1822 int objlen;
1823
1824 o = ln->value;
1825 objlen = sdslen(o->ptr);
1826 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1827 memcpy(buf+copylen,o->ptr,objlen);
1828 copylen += objlen;
1829 listDelNode(c->reply,ln);
1830 } else {
1831 if (copylen == 0) return;
1832 break;
1833 }
1834 }
1835 /* Now the output buffer is empty, add the new single element */
1836 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1837 listAddNodeHead(c->reply,o);
1838 }
1839
1840 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1841 redisClient *c = privdata;
1842 int nwritten = 0, totwritten = 0, objlen;
1843 robj *o;
1844 REDIS_NOTUSED(el);
1845 REDIS_NOTUSED(mask);
1846
1847 /* Use writev() if we have enough buffers to send */
1848 if (!server.glueoutputbuf &&
1849 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1850 !(c->flags & REDIS_MASTER))
1851 {
1852 sendReplyToClientWritev(el, fd, privdata, mask);
1853 return;
1854 }
1855
1856 while(listLength(c->reply)) {
1857 if (server.glueoutputbuf && listLength(c->reply) > 1)
1858 glueReplyBuffersIfNeeded(c);
1859
1860 o = listNodeValue(listFirst(c->reply));
1861 objlen = sdslen(o->ptr);
1862
1863 if (objlen == 0) {
1864 listDelNode(c->reply,listFirst(c->reply));
1865 continue;
1866 }
1867
1868 if (c->flags & REDIS_MASTER) {
1869 /* Don't reply to a master */
1870 nwritten = objlen - c->sentlen;
1871 } else {
1872 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1873 if (nwritten <= 0) break;
1874 }
1875 c->sentlen += nwritten;
1876 totwritten += nwritten;
1877 /* If we fully sent the object on head go to the next one */
1878 if (c->sentlen == objlen) {
1879 listDelNode(c->reply,listFirst(c->reply));
1880 c->sentlen = 0;
1881 }
1882 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1883 * bytes, in a single threaded server it's a good idea to serve
1884 * other clients as well, even if a very large request comes from
1885 * super fast link that is always able to accept data (in real world
1886 * scenario think about 'KEYS *' against the loopback interfae) */
1887 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1888 }
1889 if (nwritten == -1) {
1890 if (errno == EAGAIN) {
1891 nwritten = 0;
1892 } else {
1893 redisLog(REDIS_VERBOSE,
1894 "Error writing to client: %s", strerror(errno));
1895 freeClient(c);
1896 return;
1897 }
1898 }
1899 if (totwritten > 0) c->lastinteraction = time(NULL);
1900 if (listLength(c->reply) == 0) {
1901 c->sentlen = 0;
1902 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1903 }
1904 }
1905
1906 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1907 {
1908 redisClient *c = privdata;
1909 int nwritten = 0, totwritten = 0, objlen, willwrite;
1910 robj *o;
1911 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1912 int offset, ion = 0;
1913 REDIS_NOTUSED(el);
1914 REDIS_NOTUSED(mask);
1915
1916 listNode *node;
1917 while (listLength(c->reply)) {
1918 offset = c->sentlen;
1919 ion = 0;
1920 willwrite = 0;
1921
1922 /* fill-in the iov[] array */
1923 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1924 o = listNodeValue(node);
1925 objlen = sdslen(o->ptr);
1926
1927 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1928 break;
1929
1930 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1931 break; /* no more iovecs */
1932
1933 iov[ion].iov_base = ((char*)o->ptr) + offset;
1934 iov[ion].iov_len = objlen - offset;
1935 willwrite += objlen - offset;
1936 offset = 0; /* just for the first item */
1937 ion++;
1938 }
1939
1940 if(willwrite == 0)
1941 break;
1942
1943 /* write all collected blocks at once */
1944 if((nwritten = writev(fd, iov, ion)) < 0) {
1945 if (errno != EAGAIN) {
1946 redisLog(REDIS_VERBOSE,
1947 "Error writing to client: %s", strerror(errno));
1948 freeClient(c);
1949 return;
1950 }
1951 break;
1952 }
1953
1954 totwritten += nwritten;
1955 offset = c->sentlen;
1956
1957 /* remove written robjs from c->reply */
1958 while (nwritten && listLength(c->reply)) {
1959 o = listNodeValue(listFirst(c->reply));
1960 objlen = sdslen(o->ptr);
1961
1962 if(nwritten >= objlen - offset) {
1963 listDelNode(c->reply, listFirst(c->reply));
1964 nwritten -= objlen - offset;
1965 c->sentlen = 0;
1966 } else {
1967 /* partial write */
1968 c->sentlen += nwritten;
1969 break;
1970 }
1971 offset = 0;
1972 }
1973 }
1974
1975 if (totwritten > 0)
1976 c->lastinteraction = time(NULL);
1977
1978 if (listLength(c->reply) == 0) {
1979 c->sentlen = 0;
1980 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1981 }
1982 }
1983
1984 static struct redisCommand *lookupCommand(char *name) {
1985 int j = 0;
1986 while(cmdTable[j].name != NULL) {
1987 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
1988 j++;
1989 }
1990 return NULL;
1991 }
1992
1993 /* resetClient prepare the client to process the next command */
1994 static void resetClient(redisClient *c) {
1995 freeClientArgv(c);
1996 c->bulklen = -1;
1997 c->multibulk = 0;
1998 }
1999
2000 /* Call() is the core of Redis execution of a command */
2001 static void call(redisClient *c, struct redisCommand *cmd) {
2002 long long dirty;
2003
2004 dirty = server.dirty;
2005 cmd->proc(c);
2006 if (server.appendonly && server.dirty-dirty)
2007 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2008 if (server.dirty-dirty && listLength(server.slaves))
2009 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2010 if (listLength(server.monitors))
2011 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2012 server.stat_numcommands++;
2013 }
2014
2015 /* If this function gets called we already read a whole
2016 * command, argments are in the client argv/argc fields.
2017 * processCommand() execute the command or prepare the
2018 * server for a bulk read from the client.
2019 *
2020 * If 1 is returned the client is still alive and valid and
2021 * and other operations can be performed by the caller. Otherwise
2022 * if 0 is returned the client was destroied (i.e. after QUIT). */
2023 static int processCommand(redisClient *c) {
2024 struct redisCommand *cmd;
2025
2026 /* Free some memory if needed (maxmemory setting) */
2027 if (server.maxmemory) freeMemoryIfNeeded();
2028
2029 /* Handle the multi bulk command type. This is an alternative protocol
2030 * supported by Redis in order to receive commands that are composed of
2031 * multiple binary-safe "bulk" arguments. The latency of processing is
2032 * a bit higher but this allows things like multi-sets, so if this
2033 * protocol is used only for MSET and similar commands this is a big win. */
2034 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2035 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2036 if (c->multibulk <= 0) {
2037 resetClient(c);
2038 return 1;
2039 } else {
2040 decrRefCount(c->argv[c->argc-1]);
2041 c->argc--;
2042 return 1;
2043 }
2044 } else if (c->multibulk) {
2045 if (c->bulklen == -1) {
2046 if (((char*)c->argv[0]->ptr)[0] != '$') {
2047 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2048 resetClient(c);
2049 return 1;
2050 } else {
2051 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2052 decrRefCount(c->argv[0]);
2053 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2054 c->argc--;
2055 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2056 resetClient(c);
2057 return 1;
2058 }
2059 c->argc--;
2060 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2061 return 1;
2062 }
2063 } else {
2064 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2065 c->mbargv[c->mbargc] = c->argv[0];
2066 c->mbargc++;
2067 c->argc--;
2068 c->multibulk--;
2069 if (c->multibulk == 0) {
2070 robj **auxargv;
2071 int auxargc;
2072
2073 /* Here we need to swap the multi-bulk argc/argv with the
2074 * normal argc/argv of the client structure. */
2075 auxargv = c->argv;
2076 c->argv = c->mbargv;
2077 c->mbargv = auxargv;
2078
2079 auxargc = c->argc;
2080 c->argc = c->mbargc;
2081 c->mbargc = auxargc;
2082
2083 /* We need to set bulklen to something different than -1
2084 * in order for the code below to process the command without
2085 * to try to read the last argument of a bulk command as
2086 * a special argument. */
2087 c->bulklen = 0;
2088 /* continue below and process the command */
2089 } else {
2090 c->bulklen = -1;
2091 return 1;
2092 }
2093 }
2094 }
2095 /* -- end of multi bulk commands processing -- */
2096
2097 /* The QUIT command is handled as a special case. Normal command
2098 * procs are unable to close the client connection safely */
2099 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2100 freeClient(c);
2101 return 0;
2102 }
2103
2104 /* Now lookup the command and check ASAP about trivial error conditions
2105 * such wrong arity, bad command name and so forth. */
2106 cmd = lookupCommand(c->argv[0]->ptr);
2107 if (!cmd) {
2108 addReplySds(c,
2109 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2110 (char*)c->argv[0]->ptr));
2111 resetClient(c);
2112 return 1;
2113 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2114 (c->argc < -cmd->arity)) {
2115 addReplySds(c,
2116 sdscatprintf(sdsempty(),
2117 "-ERR wrong number of arguments for '%s' command\r\n",
2118 cmd->name));
2119 resetClient(c);
2120 return 1;
2121 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2122 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2123 resetClient(c);
2124 return 1;
2125 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2126 /* This is a bulk command, we have to read the last argument yet. */
2127 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2128
2129 decrRefCount(c->argv[c->argc-1]);
2130 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2131 c->argc--;
2132 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2133 resetClient(c);
2134 return 1;
2135 }
2136 c->argc--;
2137 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2138 /* It is possible that the bulk read is already in the
2139 * buffer. Check this condition and handle it accordingly.
2140 * This is just a fast path, alternative to call processInputBuffer().
2141 * It's a good idea since the code is small and this condition
2142 * happens most of the times. */
2143 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2144 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2145 c->argc++;
2146 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2147 } else {
2148 /* Otherwise return... there is to read the last argument
2149 * from the socket. */
2150 return 1;
2151 }
2152 }
2153 /* Let's try to share objects on the command arguments vector */
2154 if (server.shareobjects) {
2155 int j;
2156 for(j = 1; j < c->argc; j++)
2157 c->argv[j] = tryObjectSharing(c->argv[j]);
2158 }
2159 /* Let's try to encode the bulk object to save space. */
2160 if (cmd->flags & REDIS_CMD_BULK)
2161 tryObjectEncoding(c->argv[c->argc-1]);
2162
2163 /* Check if the user is authenticated */
2164 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2165 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2166 resetClient(c);
2167 return 1;
2168 }
2169
2170 /* Exec the command */
2171 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2172 queueMultiCommand(c,cmd);
2173 addReply(c,shared.queued);
2174 } else {
2175 if (server.vm_enabled && server.vm_max_threads > 0 &&
2176 blockClientOnSwappedKeys(cmd,c)) return 1;
2177 call(c,cmd);
2178 }
2179
2180 /* Prepare the client for the next command */
2181 resetClient(c);
2182 return 1;
2183 }
2184
2185 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2186 listNode *ln;
2187 listIter li;
2188 int outc = 0, j;
2189 robj **outv;
2190 /* (args*2)+1 is enough room for args, spaces, newlines */
2191 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2192
2193 if (argc <= REDIS_STATIC_ARGS) {
2194 outv = static_outv;
2195 } else {
2196 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2197 }
2198
2199 for (j = 0; j < argc; j++) {
2200 if (j != 0) outv[outc++] = shared.space;
2201 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2202 robj *lenobj;
2203
2204 lenobj = createObject(REDIS_STRING,
2205 sdscatprintf(sdsempty(),"%lu\r\n",
2206 (unsigned long) stringObjectLen(argv[j])));
2207 lenobj->refcount = 0;
2208 outv[outc++] = lenobj;
2209 }
2210 outv[outc++] = argv[j];
2211 }
2212 outv[outc++] = shared.crlf;
2213
2214 /* Increment all the refcounts at start and decrement at end in order to
2215 * be sure to free objects if there is no slave in a replication state
2216 * able to be feed with commands */
2217 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2218 listRewind(slaves,&li);
2219 while((ln = listNext(&li))) {
2220 redisClient *slave = ln->value;
2221
2222 /* Don't feed slaves that are still waiting for BGSAVE to start */
2223 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2224
2225 /* Feed all the other slaves, MONITORs and so on */
2226 if (slave->slaveseldb != dictid) {
2227 robj *selectcmd;
2228
2229 switch(dictid) {
2230 case 0: selectcmd = shared.select0; break;
2231 case 1: selectcmd = shared.select1; break;
2232 case 2: selectcmd = shared.select2; break;
2233 case 3: selectcmd = shared.select3; break;
2234 case 4: selectcmd = shared.select4; break;
2235 case 5: selectcmd = shared.select5; break;
2236 case 6: selectcmd = shared.select6; break;
2237 case 7: selectcmd = shared.select7; break;
2238 case 8: selectcmd = shared.select8; break;
2239 case 9: selectcmd = shared.select9; break;
2240 default:
2241 selectcmd = createObject(REDIS_STRING,
2242 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2243 selectcmd->refcount = 0;
2244 break;
2245 }
2246 addReply(slave,selectcmd);
2247 slave->slaveseldb = dictid;
2248 }
2249 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2250 }
2251 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2252 if (outv != static_outv) zfree(outv);
2253 }
2254
2255 static void processInputBuffer(redisClient *c) {
2256 again:
2257 /* Before to process the input buffer, make sure the client is not
2258 * waitig for a blocking operation such as BLPOP. Note that the first
2259 * iteration the client is never blocked, otherwise the processInputBuffer
2260 * would not be called at all, but after the execution of the first commands
2261 * in the input buffer the client may be blocked, and the "goto again"
2262 * will try to reiterate. The following line will make it return asap. */
2263 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2264 if (c->bulklen == -1) {
2265 /* Read the first line of the query */
2266 char *p = strchr(c->querybuf,'\n');
2267 size_t querylen;
2268
2269 if (p) {
2270 sds query, *argv;
2271 int argc, j;
2272
2273 query = c->querybuf;
2274 c->querybuf = sdsempty();
2275 querylen = 1+(p-(query));
2276 if (sdslen(query) > querylen) {
2277 /* leave data after the first line of the query in the buffer */
2278 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2279 }
2280 *p = '\0'; /* remove "\n" */
2281 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2282 sdsupdatelen(query);
2283
2284 /* Now we can split the query in arguments */
2285 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2286 sdsfree(query);
2287
2288 if (c->argv) zfree(c->argv);
2289 c->argv = zmalloc(sizeof(robj*)*argc);
2290
2291 for (j = 0; j < argc; j++) {
2292 if (sdslen(argv[j])) {
2293 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2294 c->argc++;
2295 } else {
2296 sdsfree(argv[j]);
2297 }
2298 }
2299 zfree(argv);
2300 if (c->argc) {
2301 /* Execute the command. If the client is still valid
2302 * after processCommand() return and there is something
2303 * on the query buffer try to process the next command. */
2304 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2305 } else {
2306 /* Nothing to process, argc == 0. Just process the query
2307 * buffer if it's not empty or return to the caller */
2308 if (sdslen(c->querybuf)) goto again;
2309 }
2310 return;
2311 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2312 redisLog(REDIS_VERBOSE, "Client protocol error");
2313 freeClient(c);
2314 return;
2315 }
2316 } else {
2317 /* Bulk read handling. Note that if we are at this point
2318 the client already sent a command terminated with a newline,
2319 we are reading the bulk data that is actually the last
2320 argument of the command. */
2321 int qbl = sdslen(c->querybuf);
2322
2323 if (c->bulklen <= qbl) {
2324 /* Copy everything but the final CRLF as final argument */
2325 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2326 c->argc++;
2327 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2328 /* Process the command. If the client is still valid after
2329 * the processing and there is more data in the buffer
2330 * try to parse it. */
2331 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2332 return;
2333 }
2334 }
2335 }
2336
2337 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2338 redisClient *c = (redisClient*) privdata;
2339 char buf[REDIS_IOBUF_LEN];
2340 int nread;
2341 REDIS_NOTUSED(el);
2342 REDIS_NOTUSED(mask);
2343
2344 nread = read(fd, buf, REDIS_IOBUF_LEN);
2345 if (nread == -1) {
2346 if (errno == EAGAIN) {
2347 nread = 0;
2348 } else {
2349 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2350 freeClient(c);
2351 return;
2352 }
2353 } else if (nread == 0) {
2354 redisLog(REDIS_VERBOSE, "Client closed connection");
2355 freeClient(c);
2356 return;
2357 }
2358 if (nread) {
2359 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2360 c->lastinteraction = time(NULL);
2361 } else {
2362 return;
2363 }
2364 if (!(c->flags & REDIS_BLOCKED))
2365 processInputBuffer(c);
2366 }
2367
2368 static int selectDb(redisClient *c, int id) {
2369 if (id < 0 || id >= server.dbnum)
2370 return REDIS_ERR;
2371 c->db = &server.db[id];
2372 return REDIS_OK;
2373 }
2374
2375 static void *dupClientReplyValue(void *o) {
2376 incrRefCount((robj*)o);
2377 return o;
2378 }
2379
2380 static redisClient *createClient(int fd) {
2381 redisClient *c = zmalloc(sizeof(*c));
2382
2383 anetNonBlock(NULL,fd);
2384 anetTcpNoDelay(NULL,fd);
2385 if (!c) return NULL;
2386 selectDb(c,0);
2387 c->fd = fd;
2388 c->querybuf = sdsempty();
2389 c->argc = 0;
2390 c->argv = NULL;
2391 c->bulklen = -1;
2392 c->multibulk = 0;
2393 c->mbargc = 0;
2394 c->mbargv = NULL;
2395 c->sentlen = 0;
2396 c->flags = 0;
2397 c->lastinteraction = time(NULL);
2398 c->authenticated = 0;
2399 c->replstate = REDIS_REPL_NONE;
2400 c->reply = listCreate();
2401 listSetFreeMethod(c->reply,decrRefCount);
2402 listSetDupMethod(c->reply,dupClientReplyValue);
2403 c->blockingkeys = NULL;
2404 c->blockingkeysnum = 0;
2405 c->io_keys = listCreate();
2406 listSetFreeMethod(c->io_keys,decrRefCount);
2407 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2408 readQueryFromClient, c) == AE_ERR) {
2409 freeClient(c);
2410 return NULL;
2411 }
2412 listAddNodeTail(server.clients,c);
2413 initClientMultiState(c);
2414 return c;
2415 }
2416
2417 static void addReply(redisClient *c, robj *obj) {
2418 if (listLength(c->reply) == 0 &&
2419 (c->replstate == REDIS_REPL_NONE ||
2420 c->replstate == REDIS_REPL_ONLINE) &&
2421 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2422 sendReplyToClient, c) == AE_ERR) return;
2423
2424 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2425 obj = dupStringObject(obj);
2426 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2427 }
2428 listAddNodeTail(c->reply,getDecodedObject(obj));
2429 }
2430
2431 static void addReplySds(redisClient *c, sds s) {
2432 robj *o = createObject(REDIS_STRING,s);
2433 addReply(c,o);
2434 decrRefCount(o);
2435 }
2436
2437 static void addReplyDouble(redisClient *c, double d) {
2438 char buf[128];
2439
2440 snprintf(buf,sizeof(buf),"%.17g",d);
2441 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2442 (unsigned long) strlen(buf),buf));
2443 }
2444
2445 static void addReplyLong(redisClient *c, long l) {
2446 char buf[128];
2447 size_t len;
2448
2449 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2450 addReplySds(c,sdsnewlen(buf,len));
2451 }
2452
2453 static void addReplyBulkLen(redisClient *c, robj *obj) {
2454 size_t len;
2455
2456 if (obj->encoding == REDIS_ENCODING_RAW) {
2457 len = sdslen(obj->ptr);
2458 } else {
2459 long n = (long)obj->ptr;
2460
2461 /* Compute how many bytes will take this integer as a radix 10 string */
2462 len = 1;
2463 if (n < 0) {
2464 len++;
2465 n = -n;
2466 }
2467 while((n = n/10) != 0) {
2468 len++;
2469 }
2470 }
2471 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2472 }
2473
2474 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2475 int cport, cfd;
2476 char cip[128];
2477 redisClient *c;
2478 REDIS_NOTUSED(el);
2479 REDIS_NOTUSED(mask);
2480 REDIS_NOTUSED(privdata);
2481
2482 cfd = anetAccept(server.neterr, fd, cip, &cport);
2483 if (cfd == AE_ERR) {
2484 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2485 return;
2486 }
2487 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2488 if ((c = createClient(cfd)) == NULL) {
2489 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2490 close(cfd); /* May be already closed, just ingore errors */
2491 return;
2492 }
2493 /* If maxclient directive is set and this is one client more... close the
2494 * connection. Note that we create the client instead to check before
2495 * for this condition, since now the socket is already set in nonblocking
2496 * mode and we can send an error for free using the Kernel I/O */
2497 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2498 char *err = "-ERR max number of clients reached\r\n";
2499
2500 /* That's a best effort error message, don't check write errors */
2501 if (write(c->fd,err,strlen(err)) == -1) {
2502 /* Nothing to do, Just to avoid the warning... */
2503 }
2504 freeClient(c);
2505 return;
2506 }
2507 server.stat_numconnections++;
2508 }
2509
2510 /* ======================= Redis objects implementation ===================== */
2511
2512 static robj *createObject(int type, void *ptr) {
2513 robj *o;
2514
2515 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2516 if (listLength(server.objfreelist)) {
2517 listNode *head = listFirst(server.objfreelist);
2518 o = listNodeValue(head);
2519 listDelNode(server.objfreelist,head);
2520 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2521 } else {
2522 if (server.vm_enabled) {
2523 pthread_mutex_unlock(&server.obj_freelist_mutex);
2524 o = zmalloc(sizeof(*o));
2525 } else {
2526 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2527 }
2528 }
2529 o->type = type;
2530 o->encoding = REDIS_ENCODING_RAW;
2531 o->ptr = ptr;
2532 o->refcount = 1;
2533 if (server.vm_enabled) {
2534 /* Note that this code may run in the context of an I/O thread
2535 * and accessing to server.unixtime in theory is an error
2536 * (no locks). But in practice this is safe, and even if we read
2537 * garbage Redis will not fail, as it's just a statistical info */
2538 o->vm.atime = server.unixtime;
2539 o->storage = REDIS_VM_MEMORY;
2540 }
2541 return o;
2542 }
2543
2544 static robj *createStringObject(char *ptr, size_t len) {
2545 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2546 }
2547
2548 static robj *dupStringObject(robj *o) {
2549 assert(o->encoding == REDIS_ENCODING_RAW);
2550 return createStringObject(o->ptr,sdslen(o->ptr));
2551 }
2552
2553 static robj *createListObject(void) {
2554 list *l = listCreate();
2555
2556 listSetFreeMethod(l,decrRefCount);
2557 return createObject(REDIS_LIST,l);
2558 }
2559
2560 static robj *createSetObject(void) {
2561 dict *d = dictCreate(&setDictType,NULL);
2562 return createObject(REDIS_SET,d);
2563 }
2564
2565 static robj *createHashObject(void) {
2566 /* All the Hashes start as zipmaps. Will be automatically converted
2567 * into hash tables if there are enough elements or big elements
2568 * inside. */
2569 unsigned char *zm = zipmapNew();
2570 robj *o = createObject(REDIS_HASH,zm);
2571 o->encoding = REDIS_ENCODING_ZIPMAP;
2572 return o;
2573 }
2574
2575 static robj *createZsetObject(void) {
2576 zset *zs = zmalloc(sizeof(*zs));
2577
2578 zs->dict = dictCreate(&zsetDictType,NULL);
2579 zs->zsl = zslCreate();
2580 return createObject(REDIS_ZSET,zs);
2581 }
2582
2583 static void freeStringObject(robj *o) {
2584 if (o->encoding == REDIS_ENCODING_RAW) {
2585 sdsfree(o->ptr);
2586 }
2587 }
2588
2589 static void freeListObject(robj *o) {
2590 listRelease((list*) o->ptr);
2591 }
2592
2593 static void freeSetObject(robj *o) {
2594 dictRelease((dict*) o->ptr);
2595 }
2596
2597 static void freeZsetObject(robj *o) {
2598 zset *zs = o->ptr;
2599
2600 dictRelease(zs->dict);
2601 zslFree(zs->zsl);
2602 zfree(zs);
2603 }
2604
2605 static void freeHashObject(robj *o) {
2606 dictRelease((dict*) o->ptr);
2607 }
2608
2609 static void incrRefCount(robj *o) {
2610 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2611 o->refcount++;
2612 }
2613
2614 static void decrRefCount(void *obj) {
2615 robj *o = obj;
2616
2617 /* Object is a key of a swapped out value, or in the process of being
2618 * loaded. */
2619 if (server.vm_enabled &&
2620 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2621 {
2622 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2623 redisAssert(o->refcount == 1);
2624 }
2625 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2626 redisAssert(o->type == REDIS_STRING);
2627 freeStringObject(o);
2628 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2629 pthread_mutex_lock(&server.obj_freelist_mutex);
2630 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2631 !listAddNodeHead(server.objfreelist,o))
2632 zfree(o);
2633 pthread_mutex_unlock(&server.obj_freelist_mutex);
2634 server.vm_stats_swapped_objects--;
2635 return;
2636 }
2637 /* Object is in memory, or in the process of being swapped out. */
2638 if (--(o->refcount) == 0) {
2639 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2640 vmCancelThreadedIOJob(obj);
2641 switch(o->type) {
2642 case REDIS_STRING: freeStringObject(o); break;
2643 case REDIS_LIST: freeListObject(o); break;
2644 case REDIS_SET: freeSetObject(o); break;
2645 case REDIS_ZSET: freeZsetObject(o); break;
2646 case REDIS_HASH: freeHashObject(o); break;
2647 default: redisAssert(0 != 0); break;
2648 }
2649 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2650 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2651 !listAddNodeHead(server.objfreelist,o))
2652 zfree(o);
2653 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2654 }
2655 }
2656
2657 static robj *lookupKey(redisDb *db, robj *key) {
2658 dictEntry *de = dictFind(db->dict,key);
2659 if (de) {
2660 robj *key = dictGetEntryKey(de);
2661 robj *val = dictGetEntryVal(de);
2662
2663 if (server.vm_enabled) {
2664 if (key->storage == REDIS_VM_MEMORY ||
2665 key->storage == REDIS_VM_SWAPPING)
2666 {
2667 /* If we were swapping the object out, stop it, this key
2668 * was requested. */
2669 if (key->storage == REDIS_VM_SWAPPING)
2670 vmCancelThreadedIOJob(key);
2671 /* Update the access time of the key for the aging algorithm. */
2672 key->vm.atime = server.unixtime;
2673 } else {
2674 int notify = (key->storage == REDIS_VM_LOADING);
2675
2676 /* Our value was swapped on disk. Bring it at home. */
2677 redisAssert(val == NULL);
2678 val = vmLoadObject(key);
2679 dictGetEntryVal(de) = val;
2680
2681 /* Clients blocked by the VM subsystem may be waiting for
2682 * this key... */
2683 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2684 }
2685 }
2686 return val;
2687 } else {
2688 return NULL;
2689 }
2690 }
2691
2692 static robj *lookupKeyRead(redisDb *db, robj *key) {
2693 expireIfNeeded(db,key);
2694 return lookupKey(db,key);
2695 }
2696
2697 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2698 deleteIfVolatile(db,key);
2699 return lookupKey(db,key);
2700 }
2701
2702 static int deleteKey(redisDb *db, robj *key) {
2703 int retval;
2704
2705 /* We need to protect key from destruction: after the first dictDelete()
2706 * it may happen that 'key' is no longer valid if we don't increment
2707 * it's count. This may happen when we get the object reference directly
2708 * from the hash table with dictRandomKey() or dict iterators */
2709 incrRefCount(key);
2710 if (dictSize(db->expires)) dictDelete(db->expires,key);
2711 retval = dictDelete(db->dict,key);
2712 decrRefCount(key);
2713
2714 return retval == DICT_OK;
2715 }
2716
2717 /* Try to share an object against the shared objects pool */
2718 static robj *tryObjectSharing(robj *o) {
2719 struct dictEntry *de;
2720 unsigned long c;
2721
2722 if (o == NULL || server.shareobjects == 0) return o;
2723
2724 redisAssert(o->type == REDIS_STRING);
2725 de = dictFind(server.sharingpool,o);
2726 if (de) {
2727 robj *shared = dictGetEntryKey(de);
2728
2729 c = ((unsigned long) dictGetEntryVal(de))+1;
2730 dictGetEntryVal(de) = (void*) c;
2731 incrRefCount(shared);
2732 decrRefCount(o);
2733 return shared;
2734 } else {
2735 /* Here we are using a stream algorihtm: Every time an object is
2736 * shared we increment its count, everytime there is a miss we
2737 * recrement the counter of a random object. If this object reaches
2738 * zero we remove the object and put the current object instead. */
2739 if (dictSize(server.sharingpool) >=
2740 server.sharingpoolsize) {
2741 de = dictGetRandomKey(server.sharingpool);
2742 redisAssert(de != NULL);
2743 c = ((unsigned long) dictGetEntryVal(de))-1;
2744 dictGetEntryVal(de) = (void*) c;
2745 if (c == 0) {
2746 dictDelete(server.sharingpool,de->key);
2747 }
2748 } else {
2749 c = 0; /* If the pool is empty we want to add this object */
2750 }
2751 if (c == 0) {
2752 int retval;
2753
2754 retval = dictAdd(server.sharingpool,o,(void*)1);
2755 redisAssert(retval == DICT_OK);
2756 incrRefCount(o);
2757 }
2758 return o;
2759 }
2760 }
2761
2762 /* Check if the nul-terminated string 's' can be represented by a long
2763 * (that is, is a number that fits into long without any other space or
2764 * character before or after the digits).
2765 *
2766 * If so, the function returns REDIS_OK and *longval is set to the value
2767 * of the number. Otherwise REDIS_ERR is returned */
2768 static int isStringRepresentableAsLong(sds s, long *longval) {
2769 char buf[32], *endptr;
2770 long value;
2771 int slen;
2772
2773 value = strtol(s, &endptr, 10);
2774 if (endptr[0] != '\0') return REDIS_ERR;
2775 slen = snprintf(buf,32,"%ld",value);
2776
2777 /* If the number converted back into a string is not identical
2778 * then it's not possible to encode the string as integer */
2779 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2780 if (longval) *longval = value;
2781 return REDIS_OK;
2782 }
2783
2784 /* Try to encode a string object in order to save space */
2785 static int tryObjectEncoding(robj *o) {
2786 long value;
2787 sds s = o->ptr;
2788
2789 if (o->encoding != REDIS_ENCODING_RAW)
2790 return REDIS_ERR; /* Already encoded */
2791
2792 /* It's not save to encode shared objects: shared objects can be shared
2793 * everywhere in the "object space" of Redis. Encoded objects can only
2794 * appear as "values" (and not, for instance, as keys) */
2795 if (o->refcount > 1) return REDIS_ERR;
2796
2797 /* Currently we try to encode only strings */
2798 redisAssert(o->type == REDIS_STRING);
2799
2800 /* Check if we can represent this string as a long integer */
2801 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2802
2803 /* Ok, this object can be encoded */
2804 o->encoding = REDIS_ENCODING_INT;
2805 sdsfree(o->ptr);
2806 o->ptr = (void*) value;
2807 return REDIS_OK;
2808 }
2809
2810 /* Get a decoded version of an encoded object (returned as a new object).
2811 * If the object is already raw-encoded just increment the ref count. */
2812 static robj *getDecodedObject(robj *o) {
2813 robj *dec;
2814
2815 if (o->encoding == REDIS_ENCODING_RAW) {
2816 incrRefCount(o);
2817 return o;
2818 }
2819 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2820 char buf[32];
2821
2822 snprintf(buf,32,"%ld",(long)o->ptr);
2823 dec = createStringObject(buf,strlen(buf));
2824 return dec;
2825 } else {
2826 redisAssert(1 != 1);
2827 }
2828 }
2829
2830 /* Compare two string objects via strcmp() or alike.
2831 * Note that the objects may be integer-encoded. In such a case we
2832 * use snprintf() to get a string representation of the numbers on the stack
2833 * and compare the strings, it's much faster than calling getDecodedObject().
2834 *
2835 * Important note: if objects are not integer encoded, but binary-safe strings,
2836 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2837 * binary safe. */
2838 static int compareStringObjects(robj *a, robj *b) {
2839 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2840 char bufa[128], bufb[128], *astr, *bstr;
2841 int bothsds = 1;
2842
2843 if (a == b) return 0;
2844 if (a->encoding != REDIS_ENCODING_RAW) {
2845 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2846 astr = bufa;
2847 bothsds = 0;
2848 } else {
2849 astr = a->ptr;
2850 }
2851 if (b->encoding != REDIS_ENCODING_RAW) {
2852 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2853 bstr = bufb;
2854 bothsds = 0;
2855 } else {
2856 bstr = b->ptr;
2857 }
2858 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2859 }
2860
2861 static size_t stringObjectLen(robj *o) {
2862 redisAssert(o->type == REDIS_STRING);
2863 if (o->encoding == REDIS_ENCODING_RAW) {
2864 return sdslen(o->ptr);
2865 } else {
2866 char buf[32];
2867
2868 return snprintf(buf,32,"%ld",(long)o->ptr);
2869 }
2870 }
2871
2872 /*============================ RDB saving/loading =========================== */
2873
2874 static int rdbSaveType(FILE *fp, unsigned char type) {
2875 if (fwrite(&type,1,1,fp) == 0) return -1;
2876 return 0;
2877 }
2878
2879 static int rdbSaveTime(FILE *fp, time_t t) {
2880 int32_t t32 = (int32_t) t;
2881 if (fwrite(&t32,4,1,fp) == 0) return -1;
2882 return 0;
2883 }
2884
2885 /* check rdbLoadLen() comments for more info */
2886 static int rdbSaveLen(FILE *fp, uint32_t len) {
2887 unsigned char buf[2];
2888
2889 if (len < (1<<6)) {
2890 /* Save a 6 bit len */
2891 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2892 if (fwrite(buf,1,1,fp) == 0) return -1;
2893 } else if (len < (1<<14)) {
2894 /* Save a 14 bit len */
2895 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2896 buf[1] = len&0xFF;
2897 if (fwrite(buf,2,1,fp) == 0) return -1;
2898 } else {
2899 /* Save a 32 bit len */
2900 buf[0] = (REDIS_RDB_32BITLEN<<6);
2901 if (fwrite(buf,1,1,fp) == 0) return -1;
2902 len = htonl(len);
2903 if (fwrite(&len,4,1,fp) == 0) return -1;
2904 }
2905 return 0;
2906 }
2907
2908 /* String objects in the form "2391" "-100" without any space and with a
2909 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2910 * encoded as integers to save space */
2911 static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
2912 long long value;
2913 char *endptr, buf[32];
2914
2915 /* Check if it's possible to encode this value as a number */
2916 value = strtoll(s, &endptr, 10);
2917 if (endptr[0] != '\0') return 0;
2918 snprintf(buf,32,"%lld",value);
2919
2920 /* If the number converted back into a string is not identical
2921 * then it's not possible to encode the string as integer */
2922 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2923
2924 /* Finally check if it fits in our ranges */
2925 if (value >= -(1<<7) && value <= (1<<7)-1) {
2926 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2927 enc[1] = value&0xFF;
2928 return 2;
2929 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2930 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2931 enc[1] = value&0xFF;
2932 enc[2] = (value>>8)&0xFF;
2933 return 3;
2934 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2935 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2936 enc[1] = value&0xFF;
2937 enc[2] = (value>>8)&0xFF;
2938 enc[3] = (value>>16)&0xFF;
2939 enc[4] = (value>>24)&0xFF;
2940 return 5;
2941 } else {
2942 return 0;
2943 }
2944 }
2945
2946 static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2947 unsigned int comprlen, outlen;
2948 unsigned char byte;
2949 void *out;
2950
2951 /* We require at least four bytes compression for this to be worth it */
2952 outlen = sdslen(obj->ptr)-4;
2953 if (outlen <= 0) return 0;
2954 if ((out = zmalloc(outlen+1)) == NULL) return 0;
2955 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2956 if (comprlen == 0) {
2957 zfree(out);
2958 return 0;
2959 }
2960 /* Data compressed! Let's save it on disk */
2961 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2962 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2963 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2964 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2965 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
2966 zfree(out);
2967 return comprlen;
2968
2969 writeerr:
2970 zfree(out);
2971 return -1;
2972 }
2973
2974 /* Save a string objet as [len][data] on disk. If the object is a string
2975 * representation of an integer value we try to safe it in a special form */
2976 static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2977 size_t len;
2978 int enclen;
2979
2980 len = sdslen(obj->ptr);
2981
2982 /* Try integer encoding */
2983 if (len <= 11) {
2984 unsigned char buf[5];
2985 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2986 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2987 return 0;
2988 }
2989 }
2990
2991 /* Try LZF compression - under 20 bytes it's unable to compress even
2992 * aaaaaaaaaaaaaaaaaa so skip it */
2993 if (server.rdbcompression && len > 20) {
2994 int retval;
2995
2996 retval = rdbSaveLzfStringObject(fp,obj);
2997 if (retval == -1) return -1;
2998 if (retval > 0) return 0;
2999 /* retval == 0 means data can't be compressed, save the old way */
3000 }
3001
3002 /* Store verbatim */
3003 if (rdbSaveLen(fp,len) == -1) return -1;
3004 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
3005 return 0;
3006 }
3007
3008 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3009 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3010 int retval;
3011
3012 /* Avoid incr/decr ref count business when possible.
3013 * This plays well with copy-on-write given that we are probably
3014 * in a child process (BGSAVE). Also this makes sure key objects
3015 * of swapped objects are not incRefCount-ed (an assert does not allow
3016 * this in order to avoid bugs) */
3017 if (obj->encoding != REDIS_ENCODING_RAW) {
3018 obj = getDecodedObject(obj);
3019 retval = rdbSaveStringObjectRaw(fp,obj);
3020 decrRefCount(obj);
3021 } else {
3022 retval = rdbSaveStringObjectRaw(fp,obj);
3023 }
3024 return retval;
3025 }
3026
3027 /* Save a double value. Doubles are saved as strings prefixed by an unsigned
3028 * 8 bit integer specifing the length of the representation.
3029 * This 8 bit integer has special values in order to specify the following
3030 * conditions:
3031 * 253: not a number
3032 * 254: + inf
3033 * 255: - inf
3034 */
3035 static int rdbSaveDoubleValue(FILE *fp, double val) {
3036 unsigned char buf[128];
3037 int len;
3038
3039 if (isnan(val)) {
3040 buf[0] = 253;
3041 len = 1;
3042 } else if (!isfinite(val)) {
3043 len = 1;
3044 buf[0] = (val < 0) ? 255 : 254;
3045 } else {
3046 snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
3047 buf[0] = strlen((char*)buf+1);
3048 len = buf[0]+1;
3049 }
3050 if (fwrite(buf,len,1,fp) == 0) return -1;
3051 return 0;
3052 }
3053
3054 /* Save a Redis object. */
3055 static int rdbSaveObject(FILE *fp, robj *o) {
3056 if (o->type == REDIS_STRING) {
3057 /* Save a string value */
3058 if (rdbSaveStringObject(fp,o) == -1) return -1;
3059 } else if (o->type == REDIS_LIST) {
3060 /* Save a list value */
3061 list *list = o->ptr;
3062 listIter li;
3063 listNode *ln;
3064
3065 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3066 listRewind(list,&li);
3067 while((ln = listNext(&li))) {
3068 robj *eleobj = listNodeValue(ln);
3069
3070 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3071 }
3072 } else if (o->type == REDIS_SET) {
3073 /* Save a set value */
3074 dict *set = o->ptr;
3075 dictIterator *di = dictGetIterator(set);
3076 dictEntry *de;
3077
3078 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3079 while((de = dictNext(di)) != NULL) {
3080 robj *eleobj = dictGetEntryKey(de);
3081
3082 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3083 }
3084 dictReleaseIterator(di);
3085 } else if (o->type == REDIS_ZSET) {
3086 /* Save a set value */
3087 zset *zs = o->ptr;
3088 dictIterator *di = dictGetIterator(zs->dict);
3089 dictEntry *de;
3090
3091 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3092 while((de = dictNext(di)) != NULL) {
3093 robj *eleobj = dictGetEntryKey(de);
3094 double *score = dictGetEntryVal(de);
3095
3096 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3097 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3098 }
3099 dictReleaseIterator(di);
3100 } else {
3101 redisAssert(0 != 0);
3102 }
3103 return 0;
3104 }
3105
3106 /* Return the length the object will have on disk if saved with
3107 * the rdbSaveObject() function. Currently we use a trick to get
3108 * this length with very little changes to the code. In the future
3109 * we could switch to a faster solution. */
3110 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3111 if (fp == NULL) fp = server.devnull;
3112 rewind(fp);
3113 assert(rdbSaveObject(fp,o) != 1);
3114 return ftello(fp);
3115 }
3116
3117 /* Return the number of pages required to save this object in the swap file */
3118 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3119 off_t bytes = rdbSavedObjectLen(o,fp);
3120
3121 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3122 }
3123
3124 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3125 static int rdbSave(char *filename) {
3126 dictIterator *di = NULL;
3127 dictEntry *de;
3128 FILE *fp;
3129 char tmpfile[256];
3130 int j;
3131 time_t now = time(NULL);
3132
3133 /* Wait for I/O therads to terminate, just in case this is a
3134 * foreground-saving, to avoid seeking the swap file descriptor at the
3135 * same time. */
3136 if (server.vm_enabled)
3137 waitEmptyIOJobsQueue();
3138
3139 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3140 fp = fopen(tmpfile,"w");
3141 if (!fp) {
3142 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3143 return REDIS_ERR;
3144 }
3145 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3146 for (j = 0; j < server.dbnum; j++) {
3147 redisDb *db = server.db+j;
3148 dict *d = db->dict;
3149 if (dictSize(d) == 0) continue;
3150 di = dictGetIterator(d);
3151 if (!di) {
3152 fclose(fp);
3153 return REDIS_ERR;
3154 }
3155
3156 /* Write the SELECT DB opcode */
3157 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3158 if (rdbSaveLen(fp,j) == -1) goto werr;
3159
3160 /* Iterate this DB writing every entry */
3161 while((de = dictNext(di)) != NULL) {
3162 robj *key = dictGetEntryKey(de);
3163 robj *o = dictGetEntryVal(de);
3164 time_t expiretime = getExpire(db,key);
3165
3166 /* Save the expire time */
3167 if (expiretime != -1) {
3168 /* If this key is already expired skip it */
3169 if (expiretime < now) continue;
3170 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3171 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3172 }
3173 /* Save the key and associated value. This requires special
3174 * handling if the value is swapped out. */
3175 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3176 key->storage == REDIS_VM_SWAPPING) {
3177 /* Save type, key, value */
3178 if (rdbSaveType(fp,o->type) == -1) goto werr;
3179 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3180 if (rdbSaveObject(fp,o) == -1) goto werr;
3181 } else {
3182 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3183 robj *po;
3184 /* Get a preview of the object in memory */
3185 po = vmPreviewObject(key);
3186 /* Save type, key, value */
3187 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3188 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3189 if (rdbSaveObject(fp,po) == -1) goto werr;
3190 /* Remove the loaded object from memory */
3191 decrRefCount(po);
3192 }
3193 }
3194 dictReleaseIterator(di);
3195 }
3196 /* EOF opcode */
3197 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3198
3199 /* Make sure data will not remain on the OS's output buffers */
3200 fflush(fp);
3201 fsync(fileno(fp));
3202 fclose(fp);
3203
3204 /* Use RENAME to make sure the DB file is changed atomically only
3205 * if the generate DB file is ok. */
3206 if (rename(tmpfile,filename) == -1) {
3207 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3208 unlink(tmpfile);
3209 return REDIS_ERR;
3210 }
3211 redisLog(REDIS_NOTICE,"DB saved on disk");
3212 server.dirty = 0;
3213 server.lastsave = time(NULL);
3214 return REDIS_OK;
3215
3216 werr:
3217 fclose(fp);
3218 unlink(tmpfile);
3219 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3220 if (di) dictReleaseIterator(di);
3221 return REDIS_ERR;
3222 }
3223
3224 static int rdbSaveBackground(char *filename) {
3225 pid_t childpid;
3226
3227 if (server.bgsavechildpid != -1) return REDIS_ERR;
3228 if (server.vm_enabled) waitEmptyIOJobsQueue();
3229 if ((childpid = fork()) == 0) {
3230 /* Child */
3231 if (server.vm_enabled) vmReopenSwapFile();
3232 close(server.fd);
3233 if (rdbSave(filename) == REDIS_OK) {
3234 _exit(0);
3235 } else {
3236 _exit(1);
3237 }
3238 } else {
3239 /* Parent */
3240 if (childpid == -1) {
3241 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3242 strerror(errno));
3243 return REDIS_ERR;
3244 }
3245 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3246 server.bgsavechildpid = childpid;
3247 return REDIS_OK;
3248 }
3249 return REDIS_OK; /* unreached */
3250 }
3251
3252 static void rdbRemoveTempFile(pid_t childpid) {
3253 char tmpfile[256];
3254
3255 snprintf(tmpfile,256,"temp-%d.rdb", (int) childpid);
3256 unlink(tmpfile);
3257 }
3258
3259 static int rdbLoadType(FILE *fp) {
3260 unsigned char type;
3261 if (fread(&type,1,1,fp) == 0) return -1;
3262 return type;
3263 }
3264
3265 static time_t rdbLoadTime(FILE *fp) {
3266 int32_t t32;
3267 if (fread(&t32,4,1,fp) == 0) return -1;
3268 return (time_t) t32;
3269 }
3270
3271 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3272 * of this file for a description of how this are stored on disk.
3273 *
3274 * isencoded is set to 1 if the readed length is not actually a length but
3275 * an "encoding type", check the above comments for more info */
3276 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3277 unsigned char buf[2];
3278 uint32_t len;
3279 int type;
3280
3281 if (isencoded) *isencoded = 0;
3282 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3283 type = (buf[0]&0xC0)>>6;
3284 if (type == REDIS_RDB_6BITLEN) {
3285 /* Read a 6 bit len */
3286 return buf[0]&0x3F;
3287 } else if (type == REDIS_RDB_ENCVAL) {
3288 /* Read a 6 bit len encoding type */
3289 if (isencoded) *isencoded = 1;
3290 return buf[0]&0x3F;
3291 } else if (type == REDIS_RDB_14BITLEN) {
3292 /* Read a 14 bit len */
3293 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3294 return ((buf[0]&0x3F)<<8)|buf[1];
3295 } else {
3296 /* Read a 32 bit len */
3297 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3298 return ntohl(len);
3299 }
3300 }
3301
3302 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3303 unsigned char enc[4];
3304 long long val;
3305
3306 if (enctype == REDIS_RDB_ENC_INT8) {
3307 if (fread(enc,1,1,fp) == 0) return NULL;
3308 val = (signed char)enc[0];
3309 } else if (enctype == REDIS_RDB_ENC_INT16) {
3310 uint16_t v;
3311 if (fread(enc,2,1,fp) == 0) return NULL;
3312 v = enc[0]|(enc[1]<<8);
3313 val = (int16_t)v;
3314 } else if (enctype == REDIS_RDB_ENC_INT32) {
3315 uint32_t v;
3316 if (fread(enc,4,1,fp) == 0) return NULL;
3317 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3318 val = (int32_t)v;
3319 } else {
3320 val = 0; /* anti-warning */
3321 redisAssert(0!=0);
3322 }
3323 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3324 }
3325
3326 static robj *rdbLoadLzfStringObject(FILE*fp) {
3327 unsigned int len, clen;
3328 unsigned char *c = NULL;
3329 sds val = NULL;
3330
3331 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3332 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3333 if ((c = zmalloc(clen)) == NULL) goto err;
3334 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3335 if (fread(c,clen,1,fp) == 0) goto err;
3336 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3337 zfree(c);
3338 return createObject(REDIS_STRING,val);
3339 err:
3340 zfree(c);
3341 sdsfree(val);
3342 return NULL;
3343 }
3344
3345 static robj *rdbLoadStringObject(FILE*fp) {
3346 int isencoded;
3347 uint32_t len;
3348 sds val;
3349
3350 len = rdbLoadLen(fp,&isencoded);
3351 if (isencoded) {
3352 switch(len) {
3353 case REDIS_RDB_ENC_INT8:
3354 case REDIS_RDB_ENC_INT16:
3355 case REDIS_RDB_ENC_INT32:
3356 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3357 case REDIS_RDB_ENC_LZF:
3358 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3359 default:
3360 redisAssert(0!=0);
3361 }
3362 }
3363
3364 if (len == REDIS_RDB_LENERR) return NULL;
3365 val = sdsnewlen(NULL,len);
3366 if (len && fread(val,len,1,fp) == 0) {
3367 sdsfree(val);
3368 return NULL;
3369 }
3370 return tryObjectSharing(createObject(REDIS_STRING,val));
3371 }
3372
3373 /* For information about double serialization check rdbSaveDoubleValue() */
3374 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3375 char buf[128];
3376 unsigned char len;
3377
3378 if (fread(&len,1,1,fp) == 0) return -1;
3379 switch(len) {
3380 case 255: *val = R_NegInf; return 0;
3381 case 254: *val = R_PosInf; return 0;
3382 case 253: *val = R_Nan; return 0;
3383 default:
3384 if (fread(buf,len,1,fp) == 0) return -1;
3385 buf[len] = '\0';
3386 sscanf(buf, "%lg", val);
3387 return 0;
3388 }
3389 }
3390
3391 /* Load a Redis object of the specified type from the specified file.
3392 * On success a newly allocated object is returned, otherwise NULL. */
3393 static robj *rdbLoadObject(int type, FILE *fp) {
3394 robj *o;
3395
3396 if (type == REDIS_STRING) {
3397 /* Read string value */
3398 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3399 tryObjectEncoding(o);
3400 } else if (type == REDIS_LIST || type == REDIS_SET) {
3401 /* Read list/set value */
3402 uint32_t listlen;
3403
3404 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3405 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3406 /* It's faster to expand the dict to the right size asap in order
3407 * to avoid rehashing */
3408 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3409 dictExpand(o->ptr,listlen);
3410 /* Load every single element of the list/set */
3411 while(listlen--) {
3412 robj *ele;
3413
3414 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3415 tryObjectEncoding(ele);
3416 if (type == REDIS_LIST) {
3417 listAddNodeTail((list*)o->ptr,ele);
3418 } else {
3419 dictAdd((dict*)o->ptr,ele,NULL);
3420 }
3421 }
3422 } else if (type == REDIS_ZSET) {
3423 /* Read list/set value */
3424 uint32_t zsetlen;
3425 zset *zs;
3426
3427 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3428 o = createZsetObject();
3429 zs = o->ptr;
3430 /* Load every single element of the list/set */
3431 while(zsetlen--) {
3432 robj *ele;
3433 double *score = zmalloc(sizeof(double));
3434
3435 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3436 tryObjectEncoding(ele);
3437 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3438 dictAdd(zs->dict,ele,score);
3439 zslInsert(zs->zsl,*score,ele);
3440 incrRefCount(ele); /* added to skiplist */
3441 }
3442 } else {
3443 redisAssert(0 != 0);
3444 }
3445 return o;
3446 }
3447
3448 static int rdbLoad(char *filename) {
3449 FILE *fp;
3450 robj *keyobj = NULL;
3451 uint32_t dbid;
3452 int type, retval, rdbver;
3453 dict *d = server.db[0].dict;
3454 redisDb *db = server.db+0;
3455 char buf[1024];
3456 time_t expiretime = -1, now = time(NULL);
3457 long long loadedkeys = 0;
3458
3459 fp = fopen(filename,"r");
3460 if (!fp) return REDIS_ERR;
3461 if (fread(buf,9,1,fp) == 0) goto eoferr;
3462 buf[9] = '\0';
3463 if (memcmp(buf,"REDIS",5) != 0) {
3464 fclose(fp);
3465 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3466 return REDIS_ERR;
3467 }
3468 rdbver = atoi(buf+5);
3469 if (rdbver != 1) {
3470 fclose(fp);
3471 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3472 return REDIS_ERR;
3473 }
3474 while(1) {
3475 robj *o;
3476
3477 /* Read type. */
3478 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3479 if (type == REDIS_EXPIRETIME) {
3480 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3481 /* We read the time so we need to read the object type again */
3482 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3483 }
3484 if (type == REDIS_EOF) break;
3485 /* Handle SELECT DB opcode as a special case */
3486 if (type == REDIS_SELECTDB) {
3487 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3488 goto eoferr;
3489 if (dbid >= (unsigned)server.dbnum) {
3490 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3491 exit(1);
3492 }
3493 db = server.db+dbid;
3494 d = db->dict;
3495 continue;
3496 }
3497 /* Read key */
3498 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3499 /* Read value */
3500 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3501 /* Add the new object in the hash table */
3502 retval = dictAdd(d,keyobj,o);
3503 if (retval == DICT_ERR) {
3504 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3505 exit(1);
3506 }
3507 /* Set the expire time if needed */
3508 if (expiretime != -1) {
3509 setExpire(db,keyobj,expiretime);
3510 /* Delete this key if already expired */
3511 if (expiretime < now) deleteKey(db,keyobj);
3512 expiretime = -1;
3513 }
3514 keyobj = o = NULL;
3515 /* Handle swapping while loading big datasets when VM is on */
3516 loadedkeys++;
3517 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3518 while (zmalloc_used_memory() > server.vm_max_memory) {
3519 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3520 }
3521 }
3522 }
3523 fclose(fp);
3524 return REDIS_OK;
3525
3526 eoferr: /* unexpected end of file is handled here with a fatal exit */
3527 if (keyobj) decrRefCount(keyobj);
3528 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3529 exit(1);
3530 return REDIS_ERR; /* Just to avoid warning */
3531 }
3532
3533 /*================================== Commands =============================== */
3534
3535 static void authCommand(redisClient *c) {
3536 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3537 c->authenticated = 1;
3538 addReply(c,shared.ok);
3539 } else {
3540 c->authenticated = 0;
3541 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3542 }
3543 }
3544
3545 static void pingCommand(redisClient *c) {
3546 addReply(c,shared.pong);
3547 }
3548
3549 static void echoCommand(redisClient *c) {
3550 addReplyBulkLen(c,c->argv[1]);
3551 addReply(c,c->argv[1]);
3552 addReply(c,shared.crlf);
3553 }
3554
3555 /*=================================== Strings =============================== */
3556
3557 static void setGenericCommand(redisClient *c, int nx) {
3558 int retval;
3559
3560 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3561 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3562 if (retval == DICT_ERR) {
3563 if (!nx) {
3564 /* If the key is about a swapped value, we want a new key object
3565 * to overwrite the old. So we delete the old key in the database.
3566 * This will also make sure that swap pages about the old object
3567 * will be marked as free. */
3568 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3569 incrRefCount(c->argv[1]);
3570 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3571 incrRefCount(c->argv[2]);
3572 } else {
3573 addReply(c,shared.czero);
3574 return;
3575 }
3576 } else {
3577 incrRefCount(c->argv[1]);
3578 incrRefCount(c->argv[2]);
3579 }
3580 server.dirty++;
3581 removeExpire(c->db,c->argv[1]);
3582 addReply(c, nx ? shared.cone : shared.ok);
3583 }
3584
3585 static void setCommand(redisClient *c) {
3586 setGenericCommand(c,0);
3587 }
3588
3589 static void setnxCommand(redisClient *c) {
3590 setGenericCommand(c,1);
3591 }
3592
3593 static int getGenericCommand(redisClient *c) {
3594 robj *o = lookupKeyRead(c->db,c->argv[1]);
3595
3596 if (o == NULL) {
3597 addReply(c,shared.nullbulk);
3598 return REDIS_OK;
3599 } else {
3600 if (o->type != REDIS_STRING) {
3601 addReply(c,shared.wrongtypeerr);
3602 return REDIS_ERR;
3603 } else {
3604 addReplyBulkLen(c,o);
3605 addReply(c,o);
3606 addReply(c,shared.crlf);
3607 return REDIS_OK;
3608 }
3609 }
3610 }
3611
3612 static void getCommand(redisClient *c) {
3613 getGenericCommand(c);
3614 }
3615
3616 static void getsetCommand(redisClient *c) {
3617 if (getGenericCommand(c) == REDIS_ERR) return;
3618 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3619 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3620 } else {
3621 incrRefCount(c->argv[1]);
3622 }
3623 incrRefCount(c->argv[2]);
3624 server.dirty++;
3625 removeExpire(c->db,c->argv[1]);
3626 }
3627
3628 static void mgetCommand(redisClient *c) {
3629 int j;
3630
3631 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3632 for (j = 1; j < c->argc; j++) {
3633 robj *o = lookupKeyRead(c->db,c->argv[j]);
3634 if (o == NULL) {
3635 addReply(c,shared.nullbulk);
3636 } else {
3637 if (o->type != REDIS_STRING) {
3638 addReply(c,shared.nullbulk);
3639 } else {
3640 addReplyBulkLen(c,o);
3641 addReply(c,o);
3642 addReply(c,shared.crlf);
3643 }
3644 }
3645 }
3646 }
3647
3648 static void msetGenericCommand(redisClient *c, int nx) {
3649 int j, busykeys = 0;
3650
3651 if ((c->argc % 2) == 0) {
3652 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3653 return;
3654 }
3655 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3656 * set nothing at all if at least one already key exists. */
3657 if (nx) {
3658 for (j = 1; j < c->argc; j += 2) {
3659 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3660 busykeys++;
3661 }
3662 }
3663 }
3664 if (busykeys) {
3665 addReply(c, shared.czero);
3666 return;
3667 }
3668
3669 for (j = 1; j < c->argc; j += 2) {
3670 int retval;
3671
3672 tryObjectEncoding(c->argv[j+1]);
3673 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3674 if (retval == DICT_ERR) {
3675 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3676 incrRefCount(c->argv[j+1]);
3677 } else {
3678 incrRefCount(c->argv[j]);
3679 incrRefCount(c->argv[j+1]);
3680 }
3681 removeExpire(c->db,c->argv[j]);
3682 }
3683 server.dirty += (c->argc-1)/2;
3684 addReply(c, nx ? shared.cone : shared.ok);
3685 }
3686
3687 static void msetCommand(redisClient *c) {
3688 msetGenericCommand(c,0);
3689 }
3690
3691 static void msetnxCommand(redisClient *c) {
3692 msetGenericCommand(c,1);
3693 }
3694
3695 static void incrDecrCommand(redisClient *c, long long incr) {
3696 long long value;
3697 int retval;
3698 robj *o;
3699
3700 o = lookupKeyWrite(c->db,c->argv[1]);
3701 if (o == NULL) {
3702 value = 0;
3703 } else {
3704 if (o->type != REDIS_STRING) {
3705 value = 0;
3706 } else {
3707 char *eptr;
3708
3709 if (o->encoding == REDIS_ENCODING_RAW)
3710 value = strtoll(o->ptr, &eptr, 10);
3711 else if (o->encoding == REDIS_ENCODING_INT)
3712 value = (long)o->ptr;
3713 else
3714 redisAssert(1 != 1);
3715 }
3716 }
3717
3718 value += incr;
3719 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3720 tryObjectEncoding(o);
3721 retval = dictAdd(c->db->dict,c->argv[1],o);
3722 if (retval == DICT_ERR) {
3723 dictReplace(c->db->dict,c->argv[1],o);
3724 removeExpire(c->db,c->argv[1]);
3725 } else {
3726 incrRefCount(c->argv[1]);
3727 }
3728 server.dirty++;
3729 addReply(c,shared.colon);
3730 addReply(c,o);
3731 addReply(c,shared.crlf);
3732 }
3733
3734 static void incrCommand(redisClient *c) {
3735 incrDecrCommand(c,1);
3736 }
3737
3738 static void decrCommand(redisClient *c) {
3739 incrDecrCommand(c,-1);
3740 }
3741
3742 static void incrbyCommand(redisClient *c) {
3743 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3744 incrDecrCommand(c,incr);
3745 }
3746
3747 static void decrbyCommand(redisClient *c) {
3748 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3749 incrDecrCommand(c,-incr);
3750 }
3751
3752 static void appendCommand(redisClient *c) {
3753 int retval;
3754 size_t totlen;
3755 robj *o;
3756
3757 o = lookupKeyWrite(c->db,c->argv[1]);
3758 if (o == NULL) {
3759 /* Create the key */
3760 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3761 incrRefCount(c->argv[1]);
3762 incrRefCount(c->argv[2]);
3763 totlen = stringObjectLen(c->argv[2]);
3764 } else {
3765 dictEntry *de;
3766
3767 de = dictFind(c->db->dict,c->argv[1]);
3768 assert(de != NULL);
3769
3770 o = dictGetEntryVal(de);
3771 if (o->type != REDIS_STRING) {
3772 addReply(c,shared.wrongtypeerr);
3773 return;
3774 }
3775 /* If the object is specially encoded or shared we have to make
3776 * a copy */
3777 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3778 robj *decoded = getDecodedObject(o);
3779
3780 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3781 decrRefCount(decoded);
3782 dictReplace(c->db->dict,c->argv[1],o);
3783 }
3784 /* APPEND! */
3785 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3786 o->ptr = sdscatlen(o->ptr,
3787 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3788 } else {
3789 o->ptr = sdscatprintf(o->ptr, "%ld",
3790 (unsigned long) c->argv[2]->ptr);
3791 }
3792 totlen = sdslen(o->ptr);
3793 }
3794 server.dirty++;
3795 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3796 }
3797
3798 static void substrCommand(redisClient *c) {
3799 robj *o;
3800 long start = atoi(c->argv[2]->ptr);
3801 long end = atoi(c->argv[3]->ptr);
3802
3803 o = lookupKeyRead(c->db,c->argv[1]);
3804 if (o == NULL) {
3805 addReply(c,shared.nullbulk);
3806 } else {
3807 if (o->type != REDIS_STRING) {
3808 addReply(c,shared.wrongtypeerr);
3809 } else {
3810 size_t rangelen, strlen;
3811 sds range;
3812
3813 o = getDecodedObject(o);
3814 strlen = sdslen(o->ptr);
3815
3816 /* convert negative indexes */
3817 if (start < 0) start = strlen+start;
3818 if (end < 0) end = strlen+end;
3819 if (start < 0) start = 0;
3820 if (end < 0) end = 0;
3821
3822 /* indexes sanity checks */
3823 if (start > end || (size_t)start >= strlen) {
3824 /* Out of range start or start > end result in null reply */
3825 addReply(c,shared.nullbulk);
3826 decrRefCount(o);
3827 return;
3828 }
3829 if ((size_t)end >= strlen) end = strlen-1;
3830 rangelen = (end-start)+1;
3831
3832 /* Return the result */
3833 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",rangelen));
3834 range = sdsnewlen((char*)o->ptr+start,rangelen);
3835 addReplySds(c,range);
3836 addReply(c,shared.crlf);
3837 decrRefCount(o);
3838 }
3839 }
3840 }
3841
3842 /* ========================= Type agnostic commands ========================= */
3843
3844 static void delCommand(redisClient *c) {
3845 int deleted = 0, j;
3846
3847 for (j = 1; j < c->argc; j++) {
3848 if (deleteKey(c->db,c->argv[j])) {
3849 server.dirty++;
3850 deleted++;
3851 }
3852 }
3853 switch(deleted) {
3854 case 0:
3855 addReply(c,shared.czero);
3856 break;
3857 case 1:
3858 addReply(c,shared.cone);
3859 break;
3860 default:
3861 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3862 break;
3863 }
3864 }
3865
3866 static void existsCommand(redisClient *c) {
3867 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
3868 }
3869
3870 static void selectCommand(redisClient *c) {
3871 int id = atoi(c->argv[1]->ptr);
3872
3873 if (selectDb(c,id) == REDIS_ERR) {
3874 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
3875 } else {
3876 addReply(c,shared.ok);
3877 }
3878 }
3879
3880 static void randomkeyCommand(redisClient *c) {
3881 dictEntry *de;
3882
3883 while(1) {
3884 de = dictGetRandomKey(c->db->dict);
3885 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3886 }
3887 if (de == NULL) {
3888 addReply(c,shared.plus);
3889 addReply(c,shared.crlf);
3890 } else {
3891 addReply(c,shared.plus);
3892 addReply(c,dictGetEntryKey(de));
3893 addReply(c,shared.crlf);
3894 }
3895 }
3896
3897 static void keysCommand(redisClient *c) {
3898 dictIterator *di;
3899 dictEntry *de;
3900 sds pattern = c->argv[1]->ptr;
3901 int plen = sdslen(pattern);
3902 unsigned long numkeys = 0;
3903 robj *lenobj = createObject(REDIS_STRING,NULL);
3904
3905 di = dictGetIterator(c->db->dict);
3906 addReply(c,lenobj);
3907 decrRefCount(lenobj);
3908 while((de = dictNext(di)) != NULL) {
3909 robj *keyobj = dictGetEntryKey(de);
3910
3911 sds key = keyobj->ptr;
3912 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3913 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3914 if (expireIfNeeded(c->db,keyobj) == 0) {
3915 addReplyBulkLen(c,keyobj);
3916 addReply(c,keyobj);
3917 addReply(c,shared.crlf);
3918 numkeys++;
3919 }
3920 }
3921 }
3922 dictReleaseIterator(di);
3923 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
3924 }
3925
3926 static void dbsizeCommand(redisClient *c) {
3927 addReplySds(c,
3928 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
3929 }
3930
3931 static void lastsaveCommand(redisClient *c) {
3932 addReplySds(c,
3933 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
3934 }
3935
3936 static void typeCommand(redisClient *c) {
3937 robj *o;
3938 char *type;
3939
3940 o = lookupKeyRead(c->db,c->argv[1]);
3941 if (o == NULL) {
3942 type = "+none";
3943 } else {
3944 switch(o->type) {
3945 case REDIS_STRING: type = "+string"; break;
3946 case REDIS_LIST: type = "+list"; break;
3947 case REDIS_SET: type = "+set"; break;
3948 case REDIS_ZSET: type = "+zset"; break;
3949 default: type = "unknown"; break;
3950 }
3951 }
3952 addReplySds(c,sdsnew(type));
3953 addReply(c,shared.crlf);
3954 }
3955
3956 static void saveCommand(redisClient *c) {
3957 if (server.bgsavechildpid != -1) {
3958 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3959 return;
3960 }
3961 if (rdbSave(server.dbfilename) == REDIS_OK) {
3962 addReply(c,shared.ok);
3963 } else {
3964 addReply(c,shared.err);
3965 }
3966 }
3967
3968 static void bgsaveCommand(redisClient *c) {
3969 if (server.bgsavechildpid != -1) {
3970 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3971 return;
3972 }
3973 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
3974 char *status = "+Background saving started\r\n";
3975 addReplySds(c,sdsnew(status));
3976 } else {
3977 addReply(c,shared.err);
3978 }
3979 }
3980
3981 static void shutdownCommand(redisClient *c) {
3982 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
3983 /* Kill the saving child if there is a background saving in progress.
3984 We want to avoid race conditions, for instance our saving child may
3985 overwrite the synchronous saving did by SHUTDOWN. */
3986 if (server.bgsavechildpid != -1) {
3987 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3988 kill(server.bgsavechildpid,SIGKILL);
3989 rdbRemoveTempFile(server.bgsavechildpid);
3990 }
3991 if (server.appendonly) {
3992 /* Append only file: fsync() the AOF and exit */
3993 fsync(server.appendfd);
3994 if (server.vm_enabled) unlink(server.vm_swap_file);
3995 exit(0);
3996 } else {
3997 /* Snapshotting. Perform a SYNC SAVE and exit */
3998 if (rdbSave(server.dbfilename) == REDIS_OK) {
3999 if (server.daemonize)
4000 unlink(server.pidfile);
4001 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4002 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4003 if (server.vm_enabled) unlink(server.vm_swap_file);
4004 exit(0);
4005 } else {
4006 /* Ooops.. error saving! The best we can do is to continue operating.
4007 * Note that if there was a background saving process, in the next
4008 * cron() Redis will be notified that the background saving aborted,
4009 * handling special stuff like slaves pending for synchronization... */
4010 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4011 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4012 }
4013 }
4014 }
4015
4016 static void renameGenericCommand(redisClient *c, int nx) {
4017 robj *o;
4018
4019 /* To use the same key as src and dst is probably an error */
4020 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4021 addReply(c,shared.sameobjecterr);
4022 return;
4023 }
4024
4025 o = lookupKeyWrite(c->db,c->argv[1]);
4026 if (o == NULL) {
4027 addReply(c,shared.nokeyerr);
4028 return;
4029 }
4030 incrRefCount(o);
4031 deleteIfVolatile(c->db,c->argv[2]);
4032 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4033 if (nx) {
4034 decrRefCount(o);
4035 addReply(c,shared.czero);
4036 return;
4037 }
4038 dictReplace(c->db->dict,c->argv[2],o);
4039 } else {
4040 incrRefCount(c->argv[2]);
4041 }
4042 deleteKey(c->db,c->argv[1]);
4043 server.dirty++;
4044 addReply(c,nx ? shared.cone : shared.ok);
4045 }
4046
4047 static void renameCommand(redisClient *c) {
4048 renameGenericCommand(c,0);
4049 }
4050
4051 static void renamenxCommand(redisClient *c) {
4052 renameGenericCommand(c,1);
4053 }
4054
4055 static void moveCommand(redisClient *c) {
4056 robj *o;
4057 redisDb *src, *dst;
4058 int srcid;
4059
4060 /* Obtain source and target DB pointers */
4061 src = c->db;
4062 srcid = c->db->id;
4063 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4064 addReply(c,shared.outofrangeerr);
4065 return;
4066 }
4067 dst = c->db;
4068 selectDb(c,srcid); /* Back to the source DB */
4069
4070 /* If the user is moving using as target the same
4071 * DB as the source DB it is probably an error. */
4072 if (src == dst) {
4073 addReply(c,shared.sameobjecterr);
4074 return;
4075 }
4076
4077 /* Check if the element exists and get a reference */
4078 o = lookupKeyWrite(c->db,c->argv[1]);
4079 if (!o) {
4080 addReply(c,shared.czero);
4081 return;
4082 }
4083
4084 /* Try to add the element to the target DB */
4085 deleteIfVolatile(dst,c->argv[1]);
4086 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4087 addReply(c,shared.czero);
4088 return;
4089 }
4090 incrRefCount(c->argv[1]);
4091 incrRefCount(o);
4092
4093 /* OK! key moved, free the entry in the source DB */
4094 deleteKey(src,c->argv[1]);
4095 server.dirty++;
4096 addReply(c,shared.cone);
4097 }
4098
4099 /* =================================== Lists ================================ */
4100 static void pushGenericCommand(redisClient *c, int where) {
4101 robj *lobj;
4102 list *list;
4103
4104 lobj = lookupKeyWrite(c->db,c->argv[1]);
4105 if (lobj == NULL) {
4106 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4107 addReply(c,shared.cone);
4108 return;
4109 }
4110 lobj = createListObject();
4111 list = lobj->ptr;
4112 if (where == REDIS_HEAD) {
4113 listAddNodeHead(list,c->argv[2]);
4114 } else {
4115 listAddNodeTail(list,c->argv[2]);
4116 }
4117 dictAdd(c->db->dict,c->argv[1],lobj);
4118 incrRefCount(c->argv[1]);
4119 incrRefCount(c->argv[2]);
4120 } else {
4121 if (lobj->type != REDIS_LIST) {
4122 addReply(c,shared.wrongtypeerr);
4123 return;
4124 }
4125 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4126 addReply(c,shared.cone);
4127 return;
4128 }
4129 list = lobj->ptr;
4130 if (where == REDIS_HEAD) {
4131 listAddNodeHead(list,c->argv[2]);
4132 } else {
4133 listAddNodeTail(list,c->argv[2]);
4134 }
4135 incrRefCount(c->argv[2]);
4136 }
4137 server.dirty++;
4138 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4139 }
4140
4141 static void lpushCommand(redisClient *c) {
4142 pushGenericCommand(c,REDIS_HEAD);
4143 }
4144
4145 static void rpushCommand(redisClient *c) {
4146 pushGenericCommand(c,REDIS_TAIL);
4147 }
4148
4149 static void llenCommand(redisClient *c) {
4150 robj *o;
4151 list *l;
4152
4153 o = lookupKeyRead(c->db,c->argv[1]);
4154 if (o == NULL) {
4155 addReply(c,shared.czero);
4156 return;
4157 } else {
4158 if (o->type != REDIS_LIST) {
4159 addReply(c,shared.wrongtypeerr);
4160 } else {
4161 l = o->ptr;
4162 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4163 }
4164 }
4165 }
4166
4167 static void lindexCommand(redisClient *c) {
4168 robj *o;
4169 int index = atoi(c->argv[2]->ptr);
4170
4171 o = lookupKeyRead(c->db,c->argv[1]);
4172 if (o == NULL) {
4173 addReply(c,shared.nullbulk);
4174 } else {
4175 if (o->type != REDIS_LIST) {
4176 addReply(c,shared.wrongtypeerr);
4177 } else {
4178 list *list = o->ptr;
4179 listNode *ln;
4180
4181 ln = listIndex(list, index);
4182 if (ln == NULL) {
4183 addReply(c,shared.nullbulk);
4184 } else {
4185 robj *ele = listNodeValue(ln);
4186 addReplyBulkLen(c,ele);
4187 addReply(c,ele);
4188 addReply(c,shared.crlf);
4189 }
4190 }
4191 }
4192 }
4193
4194 static void lsetCommand(redisClient *c) {
4195 robj *o;
4196 int index = atoi(c->argv[2]->ptr);
4197
4198 o = lookupKeyWrite(c->db,c->argv[1]);
4199 if (o == NULL) {
4200 addReply(c,shared.nokeyerr);
4201 } else {
4202 if (o->type != REDIS_LIST) {
4203 addReply(c,shared.wrongtypeerr);
4204 } else {
4205 list *list = o->ptr;
4206 listNode *ln;
4207
4208 ln = listIndex(list, index);
4209 if (ln == NULL) {
4210 addReply(c,shared.outofrangeerr);
4211 } else {
4212 robj *ele = listNodeValue(ln);
4213
4214 decrRefCount(ele);
4215 listNodeValue(ln) = c->argv[3];
4216 incrRefCount(c->argv[3]);
4217 addReply(c,shared.ok);
4218 server.dirty++;
4219 }
4220 }
4221 }
4222 }
4223
4224 static void popGenericCommand(redisClient *c, int where) {
4225 robj *o;
4226
4227 o = lookupKeyWrite(c->db,c->argv[1]);
4228 if (o == NULL) {
4229 addReply(c,shared.nullbulk);
4230 } else {
4231 if (o->type != REDIS_LIST) {
4232 addReply(c,shared.wrongtypeerr);
4233 } else {
4234 list *list = o->ptr;
4235 listNode *ln;
4236
4237 if (where == REDIS_HEAD)
4238 ln = listFirst(list);
4239 else
4240 ln = listLast(list);
4241
4242 if (ln == NULL) {
4243 addReply(c,shared.nullbulk);
4244 } else {
4245 robj *ele = listNodeValue(ln);
4246 addReplyBulkLen(c,ele);
4247 addReply(c,ele);
4248 addReply(c,shared.crlf);
4249 listDelNode(list,ln);
4250 server.dirty++;
4251 }
4252 }
4253 }
4254 }
4255
4256 static void lpopCommand(redisClient *c) {
4257 popGenericCommand(c,REDIS_HEAD);
4258 }
4259
4260 static void rpopCommand(redisClient *c) {
4261 popGenericCommand(c,REDIS_TAIL);
4262 }
4263
4264 static void lrangeCommand(redisClient *c) {
4265 robj *o;
4266 int start = atoi(c->argv[2]->ptr);
4267 int end = atoi(c->argv[3]->ptr);
4268
4269 o = lookupKeyRead(c->db,c->argv[1]);
4270 if (o == NULL) {
4271 addReply(c,shared.nullmultibulk);
4272 } else {
4273 if (o->type != REDIS_LIST) {
4274 addReply(c,shared.wrongtypeerr);
4275 } else {
4276 list *list = o->ptr;
4277 listNode *ln;
4278 int llen = listLength(list);
4279 int rangelen, j;
4280 robj *ele;
4281
4282 /* convert negative indexes */
4283 if (start < 0) start = llen+start;
4284 if (end < 0) end = llen+end;
4285 if (start < 0) start = 0;
4286 if (end < 0) end = 0;
4287
4288 /* indexes sanity checks */
4289 if (start > end || start >= llen) {
4290 /* Out of range start or start > end result in empty list */
4291 addReply(c,shared.emptymultibulk);
4292 return;
4293 }
4294 if (end >= llen) end = llen-1;
4295 rangelen = (end-start)+1;
4296
4297 /* Return the result in form of a multi-bulk reply */
4298 ln = listIndex(list, start);
4299 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4300 for (j = 0; j < rangelen; j++) {
4301 ele = listNodeValue(ln);
4302 addReplyBulkLen(c,ele);
4303 addReply(c,ele);
4304 addReply(c,shared.crlf);
4305 ln = ln->next;
4306 }
4307 }
4308 }
4309 }
4310
4311 static void ltrimCommand(redisClient *c) {
4312 robj *o;
4313 int start = atoi(c->argv[2]->ptr);
4314 int end = atoi(c->argv[3]->ptr);
4315
4316 o = lookupKeyWrite(c->db,c->argv[1]);
4317 if (o == NULL) {
4318 addReply(c,shared.ok);
4319 } else {
4320 if (o->type != REDIS_LIST) {
4321 addReply(c,shared.wrongtypeerr);
4322 } else {
4323 list *list = o->ptr;
4324 listNode *ln;
4325 int llen = listLength(list);
4326 int j, ltrim, rtrim;
4327
4328 /* convert negative indexes */
4329 if (start < 0) start = llen+start;
4330 if (end < 0) end = llen+end;
4331 if (start < 0) start = 0;
4332 if (end < 0) end = 0;
4333
4334 /* indexes sanity checks */
4335 if (start > end || start >= llen) {
4336 /* Out of range start or start > end result in empty list */
4337 ltrim = llen;
4338 rtrim = 0;
4339 } else {
4340 if (end >= llen) end = llen-1;
4341 ltrim = start;
4342 rtrim = llen-end-1;
4343 }
4344
4345 /* Remove list elements to perform the trim */
4346 for (j = 0; j < ltrim; j++) {
4347 ln = listFirst(list);
4348 listDelNode(list,ln);
4349 }
4350 for (j = 0; j < rtrim; j++) {
4351 ln = listLast(list);
4352 listDelNode(list,ln);
4353 }
4354 server.dirty++;
4355 addReply(c,shared.ok);
4356 }
4357 }
4358 }
4359
4360 static void lremCommand(redisClient *c) {
4361 robj *o;
4362
4363 o = lookupKeyWrite(c->db,c->argv[1]);
4364 if (o == NULL) {
4365 addReply(c,shared.czero);
4366 } else {
4367 if (o->type != REDIS_LIST) {
4368 addReply(c,shared.wrongtypeerr);
4369 } else {
4370 list *list = o->ptr;
4371 listNode *ln, *next;
4372 int toremove = atoi(c->argv[2]->ptr);
4373 int removed = 0;
4374 int fromtail = 0;
4375
4376 if (toremove < 0) {
4377 toremove = -toremove;
4378 fromtail = 1;
4379 }
4380 ln = fromtail ? list->tail : list->head;
4381 while (ln) {
4382 robj *ele = listNodeValue(ln);
4383
4384 next = fromtail ? ln->prev : ln->next;
4385 if (compareStringObjects(ele,c->argv[3]) == 0) {
4386 listDelNode(list,ln);
4387 server.dirty++;
4388 removed++;
4389 if (toremove && removed == toremove) break;
4390 }
4391 ln = next;
4392 }
4393 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4394 }
4395 }
4396 }
4397
4398 /* This is the semantic of this command:
4399 * RPOPLPUSH srclist dstlist:
4400 * IF LLEN(srclist) > 0
4401 * element = RPOP srclist
4402 * LPUSH dstlist element
4403 * RETURN element
4404 * ELSE
4405 * RETURN nil
4406 * END
4407 * END
4408 *
4409 * The idea is to be able to get an element from a list in a reliable way
4410 * since the element is not just returned but pushed against another list
4411 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4412 */
4413 static void rpoplpushcommand(redisClient *c) {
4414 robj *sobj;
4415
4416 sobj = lookupKeyWrite(c->db,c->argv[1]);
4417 if (sobj == NULL) {
4418 addReply(c,shared.nullbulk);
4419 } else {
4420 if (sobj->type != REDIS_LIST) {
4421 addReply(c,shared.wrongtypeerr);
4422 } else {
4423 list *srclist = sobj->ptr;
4424 listNode *ln = listLast(srclist);
4425
4426 if (ln == NULL) {
4427 addReply(c,shared.nullbulk);
4428 } else {
4429 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4430 robj *ele = listNodeValue(ln);
4431 list *dstlist;
4432
4433 if (dobj && dobj->type != REDIS_LIST) {
4434 addReply(c,shared.wrongtypeerr);
4435 return;
4436 }
4437
4438 /* Add the element to the target list (unless it's directly
4439 * passed to some BLPOP-ing client */
4440 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4441 if (dobj == NULL) {
4442 /* Create the list if the key does not exist */
4443 dobj = createListObject();
4444 dictAdd(c->db->dict,c->argv[2],dobj);
4445 incrRefCount(c->argv[2]);
4446 }
4447 dstlist = dobj->ptr;
4448 listAddNodeHead(dstlist,ele);
4449 incrRefCount(ele);
4450 }
4451
4452 /* Send the element to the client as reply as well */
4453 addReplyBulkLen(c,ele);
4454 addReply(c,ele);
4455 addReply(c,shared.crlf);
4456
4457 /* Finally remove the element from the source list */
4458 listDelNode(srclist,ln);
4459 server.dirty++;
4460 }
4461 }
4462 }
4463 }
4464
4465
4466 /* ==================================== Sets ================================ */
4467
4468 static void saddCommand(redisClient *c) {
4469 robj *set;
4470
4471 set = lookupKeyWrite(c->db,c->argv[1]);
4472 if (set == NULL) {
4473 set = createSetObject();
4474 dictAdd(c->db->dict,c->argv[1],set);
4475 incrRefCount(c->argv[1]);
4476 } else {
4477 if (set->type != REDIS_SET) {
4478 addReply(c,shared.wrongtypeerr);
4479 return;
4480 }
4481 }
4482 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4483 incrRefCount(c->argv[2]);
4484 server.dirty++;
4485 addReply(c,shared.cone);
4486 } else {
4487 addReply(c,shared.czero);
4488 }
4489 }
4490
4491 static void sremCommand(redisClient *c) {
4492 robj *set;
4493
4494 set = lookupKeyWrite(c->db,c->argv[1]);
4495 if (set == NULL) {
4496 addReply(c,shared.czero);
4497 } else {
4498 if (set->type != REDIS_SET) {
4499 addReply(c,shared.wrongtypeerr);
4500 return;
4501 }
4502 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4503 server.dirty++;
4504 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4505 addReply(c,shared.cone);
4506 } else {
4507 addReply(c,shared.czero);
4508 }
4509 }
4510 }
4511
4512 static void smoveCommand(redisClient *c) {
4513 robj *srcset, *dstset;
4514
4515 srcset = lookupKeyWrite(c->db,c->argv[1]);
4516 dstset = lookupKeyWrite(c->db,c->argv[2]);
4517
4518 /* If the source key does not exist return 0, if it's of the wrong type
4519 * raise an error */
4520 if (srcset == NULL || srcset->type != REDIS_SET) {
4521 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4522 return;
4523 }
4524 /* Error if the destination key is not a set as well */
4525 if (dstset && dstset->type != REDIS_SET) {
4526 addReply(c,shared.wrongtypeerr);
4527 return;
4528 }
4529 /* Remove the element from the source set */
4530 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4531 /* Key not found in the src set! return zero */
4532 addReply(c,shared.czero);
4533 return;
4534 }
4535 server.dirty++;
4536 /* Add the element to the destination set */
4537 if (!dstset) {
4538 dstset = createSetObject();
4539 dictAdd(c->db->dict,c->argv[2],dstset);
4540 incrRefCount(c->argv[2]);
4541 }
4542 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4543 incrRefCount(c->argv[3]);
4544 addReply(c,shared.cone);
4545 }
4546
4547 static void sismemberCommand(redisClient *c) {
4548 robj *set;
4549
4550 set = lookupKeyRead(c->db,c->argv[1]);
4551 if (set == NULL) {
4552 addReply(c,shared.czero);
4553 } else {
4554 if (set->type != REDIS_SET) {
4555 addReply(c,shared.wrongtypeerr);
4556 return;
4557 }
4558 if (dictFind(set->ptr,c->argv[2]))
4559 addReply(c,shared.cone);
4560 else
4561 addReply(c,shared.czero);
4562 }
4563 }
4564
4565 static void scardCommand(redisClient *c) {
4566 robj *o;
4567 dict *s;
4568
4569 o = lookupKeyRead(c->db,c->argv[1]);
4570 if (o == NULL) {
4571 addReply(c,shared.czero);
4572 return;
4573 } else {
4574 if (o->type != REDIS_SET) {
4575 addReply(c,shared.wrongtypeerr);
4576 } else {
4577 s = o->ptr;
4578 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4579 dictSize(s)));
4580 }
4581 }
4582 }
4583
4584 static void spopCommand(redisClient *c) {
4585 robj *set;
4586 dictEntry *de;
4587
4588 set = lookupKeyWrite(c->db,c->argv[1]);
4589 if (set == NULL) {
4590 addReply(c,shared.nullbulk);
4591 } else {
4592 if (set->type != REDIS_SET) {
4593 addReply(c,shared.wrongtypeerr);
4594 return;
4595 }
4596 de = dictGetRandomKey(set->ptr);
4597 if (de == NULL) {
4598 addReply(c,shared.nullbulk);
4599 } else {
4600 robj *ele = dictGetEntryKey(de);
4601
4602 addReplyBulkLen(c,ele);
4603 addReply(c,ele);
4604 addReply(c,shared.crlf);
4605 dictDelete(set->ptr,ele);
4606 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4607 server.dirty++;
4608 }
4609 }
4610 }
4611
4612 static void srandmemberCommand(redisClient *c) {
4613 robj *set;
4614 dictEntry *de;
4615
4616 set = lookupKeyRead(c->db,c->argv[1]);
4617 if (set == NULL) {
4618 addReply(c,shared.nullbulk);
4619 } else {
4620 if (set->type != REDIS_SET) {
4621 addReply(c,shared.wrongtypeerr);
4622 return;
4623 }
4624 de = dictGetRandomKey(set->ptr);
4625 if (de == NULL) {
4626 addReply(c,shared.nullbulk);
4627 } else {
4628 robj *ele = dictGetEntryKey(de);
4629
4630 addReplyBulkLen(c,ele);
4631 addReply(c,ele);
4632 addReply(c,shared.crlf);
4633 }
4634 }
4635 }
4636
4637 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4638 dict **d1 = (void*) s1, **d2 = (void*) s2;
4639
4640 return dictSize(*d1)-dictSize(*d2);
4641 }
4642
4643 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4644 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4645 dictIterator *di;
4646 dictEntry *de;
4647 robj *lenobj = NULL, *dstset = NULL;
4648 unsigned long j, cardinality = 0;
4649
4650 for (j = 0; j < setsnum; j++) {
4651 robj *setobj;
4652
4653 setobj = dstkey ?
4654 lookupKeyWrite(c->db,setskeys[j]) :
4655 lookupKeyRead(c->db,setskeys[j]);
4656 if (!setobj) {
4657 zfree(dv);
4658 if (dstkey) {
4659 if (deleteKey(c->db,dstkey))
4660 server.dirty++;
4661 addReply(c,shared.czero);
4662 } else {
4663 addReply(c,shared.nullmultibulk);
4664 }
4665 return;
4666 }
4667 if (setobj->type != REDIS_SET) {
4668 zfree(dv);
4669 addReply(c,shared.wrongtypeerr);
4670 return;
4671 }
4672 dv[j] = setobj->ptr;
4673 }
4674 /* Sort sets from the smallest to largest, this will improve our
4675 * algorithm's performace */
4676 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4677
4678 /* The first thing we should output is the total number of elements...
4679 * since this is a multi-bulk write, but at this stage we don't know
4680 * the intersection set size, so we use a trick, append an empty object
4681 * to the output list and save the pointer to later modify it with the
4682 * right length */
4683 if (!dstkey) {
4684 lenobj = createObject(REDIS_STRING,NULL);
4685 addReply(c,lenobj);
4686 decrRefCount(lenobj);
4687 } else {
4688 /* If we have a target key where to store the resulting set
4689 * create this key with an empty set inside */
4690 dstset = createSetObject();
4691 }
4692
4693 /* Iterate all the elements of the first (smallest) set, and test
4694 * the element against all the other sets, if at least one set does
4695 * not include the element it is discarded */
4696 di = dictGetIterator(dv[0]);
4697
4698 while((de = dictNext(di)) != NULL) {
4699 robj *ele;
4700
4701 for (j = 1; j < setsnum; j++)
4702 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4703 if (j != setsnum)
4704 continue; /* at least one set does not contain the member */
4705 ele = dictGetEntryKey(de);
4706 if (!dstkey) {
4707 addReplyBulkLen(c,ele);
4708 addReply(c,ele);
4709 addReply(c,shared.crlf);
4710 cardinality++;
4711 } else {
4712 dictAdd(dstset->ptr,ele,NULL);
4713 incrRefCount(ele);
4714 }
4715 }
4716 dictReleaseIterator(di);
4717
4718 if (dstkey) {
4719 /* Store the resulting set into the target */
4720 deleteKey(c->db,dstkey);
4721 dictAdd(c->db->dict,dstkey,dstset);
4722 incrRefCount(dstkey);
4723 }
4724
4725 if (!dstkey) {
4726 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4727 } else {
4728 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4729 dictSize((dict*)dstset->ptr)));
4730 server.dirty++;
4731 }
4732 zfree(dv);
4733 }
4734
4735 static void sinterCommand(redisClient *c) {
4736 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4737 }
4738
4739 static void sinterstoreCommand(redisClient *c) {
4740 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4741 }
4742
4743 #define REDIS_OP_UNION 0
4744 #define REDIS_OP_DIFF 1
4745
4746 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4747 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4748 dictIterator *di;
4749 dictEntry *de;
4750 robj *dstset = NULL;
4751 int j, cardinality = 0;
4752
4753 for (j = 0; j < setsnum; j++) {
4754 robj *setobj;
4755
4756 setobj = dstkey ?
4757 lookupKeyWrite(c->db,setskeys[j]) :
4758 lookupKeyRead(c->db,setskeys[j]);
4759 if (!setobj) {
4760 dv[j] = NULL;
4761 continue;
4762 }
4763 if (setobj->type != REDIS_SET) {
4764 zfree(dv);
4765 addReply(c,shared.wrongtypeerr);
4766 return;
4767 }
4768 dv[j] = setobj->ptr;
4769 }
4770
4771 /* We need a temp set object to store our union. If the dstkey
4772 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4773 * this set object will be the resulting object to set into the target key*/
4774 dstset = createSetObject();
4775
4776 /* Iterate all the elements of all the sets, add every element a single
4777 * time to the result set */
4778 for (j = 0; j < setsnum; j++) {
4779 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4780 if (!dv[j]) continue; /* non existing keys are like empty sets */
4781
4782 di = dictGetIterator(dv[j]);
4783
4784 while((de = dictNext(di)) != NULL) {
4785 robj *ele;
4786
4787 /* dictAdd will not add the same element multiple times */
4788 ele = dictGetEntryKey(de);
4789 if (op == REDIS_OP_UNION || j == 0) {
4790 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4791 incrRefCount(ele);
4792 cardinality++;
4793 }
4794 } else if (op == REDIS_OP_DIFF) {
4795 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4796 cardinality--;
4797 }
4798 }
4799 }
4800 dictReleaseIterator(di);
4801
4802 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4803 }
4804
4805 /* Output the content of the resulting set, if not in STORE mode */
4806 if (!dstkey) {
4807 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4808 di = dictGetIterator(dstset->ptr);
4809 while((de = dictNext(di)) != NULL) {
4810 robj *ele;
4811
4812 ele = dictGetEntryKey(de);
4813 addReplyBulkLen(c,ele);
4814 addReply(c,ele);
4815 addReply(c,shared.crlf);
4816 }
4817 dictReleaseIterator(di);
4818 } else {
4819 /* If we have a target key where to store the resulting set
4820 * create this key with the result set inside */
4821 deleteKey(c->db,dstkey);
4822 dictAdd(c->db->dict,dstkey,dstset);
4823 incrRefCount(dstkey);
4824 }
4825
4826 /* Cleanup */
4827 if (!dstkey) {
4828 decrRefCount(dstset);
4829 } else {
4830 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4831 dictSize((dict*)dstset->ptr)));
4832 server.dirty++;
4833 }
4834 zfree(dv);
4835 }
4836
4837 static void sunionCommand(redisClient *c) {
4838 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4839 }
4840
4841 static void sunionstoreCommand(redisClient *c) {
4842 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4843 }
4844
4845 static void sdiffCommand(redisClient *c) {
4846 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4847 }
4848
4849 static void sdiffstoreCommand(redisClient *c) {
4850 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4851 }
4852
4853 /* ==================================== ZSets =============================== */
4854
4855 /* ZSETs are ordered sets using two data structures to hold the same elements
4856 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4857 * data structure.
4858 *
4859 * The elements are added to an hash table mapping Redis objects to scores.
4860 * At the same time the elements are added to a skip list mapping scores
4861 * to Redis objects (so objects are sorted by scores in this "view"). */
4862
4863 /* This skiplist implementation is almost a C translation of the original
4864 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4865 * Alternative to Balanced Trees", modified in three ways:
4866 * a) this implementation allows for repeated values.
4867 * b) the comparison is not just by key (our 'score') but by satellite data.
4868 * c) there is a back pointer, so it's a doubly linked list with the back
4869 * pointers being only at "level 1". This allows to traverse the list
4870 * from tail to head, useful for ZREVRANGE. */
4871
4872 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4873 zskiplistNode *zn = zmalloc(sizeof(*zn));
4874
4875 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4876 if (level > 0)
4877 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4878 zn->score = score;
4879 zn->obj = obj;
4880 return zn;
4881 }
4882
4883 static zskiplist *zslCreate(void) {
4884 int j;
4885 zskiplist *zsl;
4886
4887 zsl = zmalloc(sizeof(*zsl));
4888 zsl->level = 1;
4889 zsl->length = 0;
4890 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4891 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4892 zsl->header->forward[j] = NULL;
4893
4894 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4895 if (j < ZSKIPLIST_MAXLEVEL-1)
4896 zsl->header->span[j] = 0;
4897 }
4898 zsl->header->backward = NULL;
4899 zsl->tail = NULL;
4900 return zsl;
4901 }
4902
4903 static void zslFreeNode(zskiplistNode *node) {
4904 decrRefCount(node->obj);
4905 zfree(node->forward);
4906 zfree(node->span);
4907 zfree(node);
4908 }
4909
4910 static void zslFree(zskiplist *zsl) {
4911 zskiplistNode *node = zsl->header->forward[0], *next;
4912
4913 zfree(zsl->header->forward);
4914 zfree(zsl->header->span);
4915 zfree(zsl->header);
4916 while(node) {
4917 next = node->forward[0];
4918 zslFreeNode(node);
4919 node = next;
4920 }
4921 zfree(zsl);
4922 }
4923
4924 static int zslRandomLevel(void) {
4925 int level = 1;
4926 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4927 level += 1;
4928 return level;
4929 }
4930
4931 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4932 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4933 unsigned int rank[ZSKIPLIST_MAXLEVEL];
4934 int i, level;
4935
4936 x = zsl->header;
4937 for (i = zsl->level-1; i >= 0; i--) {
4938 /* store rank that is crossed to reach the insert position */
4939 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
4940
4941 while (x->forward[i] &&
4942 (x->forward[i]->score < score ||
4943 (x->forward[i]->score == score &&
4944 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
4945 rank[i] += i > 0 ? x->span[i-1] : 1;
4946 x = x->forward[i];
4947 }
4948 update[i] = x;
4949 }
4950 /* we assume the key is not already inside, since we allow duplicated
4951 * scores, and the re-insertion of score and redis object should never
4952 * happpen since the caller of zslInsert() should test in the hash table
4953 * if the element is already inside or not. */
4954 level = zslRandomLevel();
4955 if (level > zsl->level) {
4956 for (i = zsl->level; i < level; i++) {
4957 rank[i] = 0;
4958 update[i] = zsl->header;
4959 update[i]->span[i-1] = zsl->length;
4960 }
4961 zsl->level = level;
4962 }
4963 x = zslCreateNode(level,score,obj);
4964 for (i = 0; i < level; i++) {
4965 x->forward[i] = update[i]->forward[i];
4966 update[i]->forward[i] = x;
4967
4968 /* update span covered by update[i] as x is inserted here */
4969 if (i > 0) {
4970 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
4971 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
4972 }
4973 }
4974
4975 /* increment span for untouched levels */
4976 for (i = level; i < zsl->level; i++) {
4977 update[i]->span[i-1]++;
4978 }
4979
4980 x->backward = (update[0] == zsl->header) ? NULL : update[0];
4981 if (x->forward[0])
4982 x->forward[0]->backward = x;
4983 else
4984 zsl->tail = x;
4985 zsl->length++;
4986 }
4987
4988 /* Delete an element with matching score/object from the skiplist. */
4989 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
4990 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4991 int i;
4992
4993 x = zsl->header;
4994 for (i = zsl->level-1; i >= 0; i--) {
4995 while (x->forward[i] &&
4996 (x->forward[i]->score < score ||
4997 (x->forward[i]->score == score &&
4998 compareStringObjects(x->forward[i]->obj,obj) < 0)))
4999 x = x->forward[i];
5000 update[i] = x;
5001 }
5002 /* We may have multiple elements with the same score, what we need
5003 * is to find the element with both the right score and object. */
5004 x = x->forward[0];
5005 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5006 for (i = 0; i < zsl->level; i++) {
5007 if (update[i]->forward[i] == x) {
5008 if (i > 0) {
5009 update[i]->span[i-1] += x->span[i-1] - 1;
5010 }
5011 update[i]->forward[i] = x->forward[i];
5012 } else {
5013 /* invariant: i > 0, because update[0]->forward[0]
5014 * is always equal to x */
5015 update[i]->span[i-1] -= 1;
5016 }
5017 }
5018 if (x->forward[0]) {
5019 x->forward[0]->backward = x->backward;
5020 } else {
5021 zsl->tail = x->backward;
5022 }
5023 zslFreeNode(x);
5024 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5025 zsl->level--;
5026 zsl->length--;
5027 return 1;
5028 } else {
5029 return 0; /* not found */
5030 }
5031 return 0; /* not found */
5032 }
5033
5034 /* Delete all the elements with score between min and max from the skiplist.
5035 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5036 * Note that this function takes the reference to the hash table view of the
5037 * sorted set, in order to remove the elements from the hash table too. */
5038 static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
5039 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5040 unsigned long removed = 0;
5041 int i;
5042
5043 x = zsl->header;
5044 for (i = zsl->level-1; i >= 0; i--) {
5045 while (x->forward[i] && x->forward[i]->score < min)
5046 x = x->forward[i];
5047 update[i] = x;
5048 }
5049 /* We may have multiple elements with the same score, what we need
5050 * is to find the element with both the right score and object. */
5051 x = x->forward[0];
5052 while (x && x->score <= max) {
5053 zskiplistNode *next;
5054
5055 for (i = 0; i < zsl->level; i++) {
5056 if (update[i]->forward[i] == x) {
5057 if (i > 0) {
5058 update[i]->span[i-1] += x->span[i-1] - 1;
5059 }
5060 update[i]->forward[i] = x->forward[i];
5061 } else {
5062 /* invariant: i > 0, because update[0]->forward[0]
5063 * is always equal to x */
5064 update[i]->span[i-1] -= 1;
5065 }
5066 }
5067 if (x->forward[0]) {
5068 x->forward[0]->backward = x->backward;
5069 } else {
5070 zsl->tail = x->backward;
5071 }
5072 next = x->forward[0];
5073 dictDelete(dict,x->obj);
5074 zslFreeNode(x);
5075 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5076 zsl->level--;
5077 zsl->length--;
5078 removed++;
5079 x = next;
5080 }
5081 return removed; /* not found */
5082 }
5083
5084 /* Find the first node having a score equal or greater than the specified one.
5085 * Returns NULL if there is no match. */
5086 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5087 zskiplistNode *x;
5088 int i;
5089
5090 x = zsl->header;
5091 for (i = zsl->level-1; i >= 0; i--) {
5092 while (x->forward[i] && x->forward[i]->score < score)
5093 x = x->forward[i];
5094 }
5095 /* We may have multiple elements with the same score, what we need
5096 * is to find the element with both the right score and object. */
5097 return x->forward[0];
5098 }
5099
5100 /* Find the rank for an element by both score and key.
5101 * Returns 0 when the element cannot be found, rank otherwise.
5102 * Note that the rank is 1-based due to the span of zsl->header to the
5103 * first element. */
5104 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5105 zskiplistNode *x;
5106 unsigned long rank = 0;
5107 int i;
5108
5109 x = zsl->header;
5110 for (i = zsl->level-1; i >= 0; i--) {
5111 while (x->forward[i] &&
5112 (x->forward[i]->score < score ||
5113 (x->forward[i]->score == score &&
5114 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5115 rank += i > 0 ? x->span[i-1] : 1;
5116 x = x->forward[i];
5117 }
5118
5119 /* x might be equal to zsl->header, so test if obj is non-NULL */
5120 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5121 return rank;
5122 }
5123 }
5124 return 0;
5125 }
5126
5127 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5128 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5129 zskiplistNode *x;
5130 unsigned long traversed = 0;
5131 int i;
5132
5133 x = zsl->header;
5134 for (i = zsl->level-1; i >= 0; i--) {
5135 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5136 traversed += i > 0 ? x->span[i-1] : 1;
5137 x = x->forward[i];
5138 }
5139
5140 if (traversed == rank) {
5141 return x;
5142 }
5143 }
5144 return NULL;
5145 }
5146
5147 /* The actual Z-commands implementations */
5148
5149 /* This generic command implements both ZADD and ZINCRBY.
5150 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5151 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5152 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5153 robj *zsetobj;
5154 zset *zs;
5155 double *score;
5156
5157 zsetobj = lookupKeyWrite(c->db,key);
5158 if (zsetobj == NULL) {
5159 zsetobj = createZsetObject();
5160 dictAdd(c->db->dict,key,zsetobj);
5161 incrRefCount(key);
5162 } else {
5163 if (zsetobj->type != REDIS_ZSET) {
5164 addReply(c,shared.wrongtypeerr);
5165 return;
5166 }
5167 }
5168 zs = zsetobj->ptr;
5169
5170 /* Ok now since we implement both ZADD and ZINCRBY here the code
5171 * needs to handle the two different conditions. It's all about setting
5172 * '*score', that is, the new score to set, to the right value. */
5173 score = zmalloc(sizeof(double));
5174 if (doincrement) {
5175 dictEntry *de;
5176
5177 /* Read the old score. If the element was not present starts from 0 */
5178 de = dictFind(zs->dict,ele);
5179 if (de) {
5180 double *oldscore = dictGetEntryVal(de);
5181 *score = *oldscore + scoreval;
5182 } else {
5183 *score = scoreval;
5184 }
5185 } else {
5186 *score = scoreval;
5187 }
5188
5189 /* What follows is a simple remove and re-insert operation that is common
5190 * to both ZADD and ZINCRBY... */
5191 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5192 /* case 1: New element */
5193 incrRefCount(ele); /* added to hash */
5194 zslInsert(zs->zsl,*score,ele);
5195 incrRefCount(ele); /* added to skiplist */
5196 server.dirty++;
5197 if (doincrement)
5198 addReplyDouble(c,*score);
5199 else
5200 addReply(c,shared.cone);
5201 } else {
5202 dictEntry *de;
5203 double *oldscore;
5204
5205 /* case 2: Score update operation */
5206 de = dictFind(zs->dict,ele);
5207 redisAssert(de != NULL);
5208 oldscore = dictGetEntryVal(de);
5209 if (*score != *oldscore) {
5210 int deleted;
5211
5212 /* Remove and insert the element in the skip list with new score */
5213 deleted = zslDelete(zs->zsl,*oldscore,ele);
5214 redisAssert(deleted != 0);
5215 zslInsert(zs->zsl,*score,ele);
5216 incrRefCount(ele);
5217 /* Update the score in the hash table */
5218 dictReplace(zs->dict,ele,score);
5219 server.dirty++;
5220 } else {
5221 zfree(score);
5222 }
5223 if (doincrement)
5224 addReplyDouble(c,*score);
5225 else
5226 addReply(c,shared.czero);
5227 }
5228 }
5229
5230 static void zaddCommand(redisClient *c) {
5231 double scoreval;
5232
5233 scoreval = strtod(c->argv[2]->ptr,NULL);
5234 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5235 }
5236
5237 static void zincrbyCommand(redisClient *c) {
5238 double scoreval;
5239
5240 scoreval = strtod(c->argv[2]->ptr,NULL);
5241 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5242 }
5243
5244 static void zremCommand(redisClient *c) {
5245 robj *zsetobj;
5246 zset *zs;
5247
5248 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5249 if (zsetobj == NULL) {
5250 addReply(c,shared.czero);
5251 } else {
5252 dictEntry *de;
5253 double *oldscore;
5254 int deleted;
5255
5256 if (zsetobj->type != REDIS_ZSET) {
5257 addReply(c,shared.wrongtypeerr);
5258 return;
5259 }
5260 zs = zsetobj->ptr;
5261 de = dictFind(zs->dict,c->argv[2]);
5262 if (de == NULL) {
5263 addReply(c,shared.czero);
5264 return;
5265 }
5266 /* Delete from the skiplist */
5267 oldscore = dictGetEntryVal(de);
5268 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5269 redisAssert(deleted != 0);
5270
5271 /* Delete from the hash table */
5272 dictDelete(zs->dict,c->argv[2]);
5273 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5274 server.dirty++;
5275 addReply(c,shared.cone);
5276 }
5277 }
5278
5279 static void zremrangebyscoreCommand(redisClient *c) {
5280 double min = strtod(c->argv[2]->ptr,NULL);
5281 double max = strtod(c->argv[3]->ptr,NULL);
5282 robj *zsetobj;
5283 zset *zs;
5284
5285 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5286 if (zsetobj == NULL) {
5287 addReply(c,shared.czero);
5288 } else {
5289 long deleted;
5290
5291 if (zsetobj->type != REDIS_ZSET) {
5292 addReply(c,shared.wrongtypeerr);
5293 return;
5294 }
5295 zs = zsetobj->ptr;
5296 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5297 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5298 server.dirty += deleted;
5299 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5300 }
5301 }
5302
5303 static void zrangeGenericCommand(redisClient *c, int reverse) {
5304 robj *o;
5305 int start = atoi(c->argv[2]->ptr);
5306 int end = atoi(c->argv[3]->ptr);
5307 int withscores = 0;
5308
5309 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5310 withscores = 1;
5311 } else if (c->argc >= 5) {
5312 addReply(c,shared.syntaxerr);
5313 return;
5314 }
5315
5316 o = lookupKeyRead(c->db,c->argv[1]);
5317 if (o == NULL) {
5318 addReply(c,shared.nullmultibulk);
5319 } else {
5320 if (o->type != REDIS_ZSET) {
5321 addReply(c,shared.wrongtypeerr);
5322 } else {
5323 zset *zsetobj = o->ptr;
5324 zskiplist *zsl = zsetobj->zsl;
5325 zskiplistNode *ln;
5326
5327 int llen = zsl->length;
5328 int rangelen, j;
5329 robj *ele;
5330
5331 /* convert negative indexes */
5332 if (start < 0) start = llen+start;
5333 if (end < 0) end = llen+end;
5334 if (start < 0) start = 0;
5335 if (end < 0) end = 0;
5336
5337 /* indexes sanity checks */
5338 if (start > end || start >= llen) {
5339 /* Out of range start or start > end result in empty list */
5340 addReply(c,shared.emptymultibulk);
5341 return;
5342 }
5343 if (end >= llen) end = llen-1;
5344 rangelen = (end-start)+1;
5345
5346 /* check if starting point is trivial, before searching
5347 * the element in log(N) time */
5348 if (reverse) {
5349 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen - start);
5350 } else {
5351 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start + 1);
5352 }
5353
5354 /* Return the result in form of a multi-bulk reply */
5355 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5356 withscores ? (rangelen*2) : rangelen));
5357 for (j = 0; j < rangelen; j++) {
5358 ele = ln->obj;
5359 addReplyBulkLen(c,ele);
5360 addReply(c,ele);
5361 addReply(c,shared.crlf);
5362 if (withscores)
5363 addReplyDouble(c,ln->score);
5364 ln = reverse ? ln->backward : ln->forward[0];
5365 }
5366 }
5367 }
5368 }
5369
5370 static void zrangeCommand(redisClient *c) {
5371 zrangeGenericCommand(c,0);
5372 }
5373
5374 static void zrevrangeCommand(redisClient *c) {
5375 zrangeGenericCommand(c,1);
5376 }
5377
5378 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5379 * If justcount is non-zero, just the count is returned. */
5380 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5381 robj *o;
5382 double min, max;
5383 int minex = 0, maxex = 0; /* are min or max exclusive? */
5384 int offset = 0, limit = -1;
5385 int withscores = 0;
5386 int badsyntax = 0;
5387
5388 /* Parse the min-max interval. If one of the values is prefixed
5389 * by the "(" character, it's considered "open". For instance
5390 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5391 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5392 if (((char*)c->argv[2]->ptr)[0] == '(') {
5393 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5394 minex = 1;
5395 } else {
5396 min = strtod(c->argv[2]->ptr,NULL);
5397 }
5398 if (((char*)c->argv[3]->ptr)[0] == '(') {
5399 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5400 maxex = 1;
5401 } else {
5402 max = strtod(c->argv[3]->ptr,NULL);
5403 }
5404
5405 /* Parse "WITHSCORES": note that if the command was called with
5406 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5407 * enter the following paths to parse WITHSCORES and LIMIT. */
5408 if (c->argc == 5 || c->argc == 8) {
5409 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5410 withscores = 1;
5411 else
5412 badsyntax = 1;
5413 }
5414 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5415 badsyntax = 1;
5416 if (badsyntax) {
5417 addReplySds(c,
5418 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5419 return;
5420 }
5421
5422 /* Parse "LIMIT" */
5423 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5424 addReply(c,shared.syntaxerr);
5425 return;
5426 } else if (c->argc == (7 + withscores)) {
5427 offset = atoi(c->argv[5]->ptr);
5428 limit = atoi(c->argv[6]->ptr);
5429 if (offset < 0) offset = 0;
5430 }
5431
5432 /* Ok, lookup the key and get the range */
5433 o = lookupKeyRead(c->db,c->argv[1]);
5434 if (o == NULL) {
5435 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5436 } else {
5437 if (o->type != REDIS_ZSET) {
5438 addReply(c,shared.wrongtypeerr);
5439 } else {
5440 zset *zsetobj = o->ptr;
5441 zskiplist *zsl = zsetobj->zsl;
5442 zskiplistNode *ln;
5443 robj *ele, *lenobj = NULL;
5444 unsigned long rangelen = 0;
5445
5446 /* Get the first node with the score >= min, or with
5447 * score > min if 'minex' is true. */
5448 ln = zslFirstWithScore(zsl,min);
5449 while (minex && ln && ln->score == min) ln = ln->forward[0];
5450
5451 if (ln == NULL) {
5452 /* No element matching the speciifed interval */
5453 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5454 return;
5455 }
5456
5457 /* We don't know in advance how many matching elements there
5458 * are in the list, so we push this object that will represent
5459 * the multi-bulk length in the output buffer, and will "fix"
5460 * it later */
5461 if (!justcount) {
5462 lenobj = createObject(REDIS_STRING,NULL);
5463 addReply(c,lenobj);
5464 decrRefCount(lenobj);
5465 }
5466
5467 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5468 if (offset) {
5469 offset--;
5470 ln = ln->forward[0];
5471 continue;
5472 }
5473 if (limit == 0) break;
5474 if (!justcount) {
5475 ele = ln->obj;
5476 addReplyBulkLen(c,ele);
5477 addReply(c,ele);
5478 addReply(c,shared.crlf);
5479 if (withscores)
5480 addReplyDouble(c,ln->score);
5481 }
5482 ln = ln->forward[0];
5483 rangelen++;
5484 if (limit > 0) limit--;
5485 }
5486 if (justcount) {
5487 addReplyLong(c,(long)rangelen);
5488 } else {
5489 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5490 withscores ? (rangelen*2) : rangelen);
5491 }
5492 }
5493 }
5494 }
5495
5496 static void zrangebyscoreCommand(redisClient *c) {
5497 genericZrangebyscoreCommand(c,0);
5498 }
5499
5500 static void zcountCommand(redisClient *c) {
5501 genericZrangebyscoreCommand(c,1);
5502 }
5503
5504 static void zcardCommand(redisClient *c) {
5505 robj *o;
5506 zset *zs;
5507
5508 o = lookupKeyRead(c->db,c->argv[1]);
5509 if (o == NULL) {
5510 addReply(c,shared.czero);
5511 return;
5512 } else {
5513 if (o->type != REDIS_ZSET) {
5514 addReply(c,shared.wrongtypeerr);
5515 } else {
5516 zs = o->ptr;
5517 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5518 }
5519 }
5520 }
5521
5522 static void zscoreCommand(redisClient *c) {
5523 robj *o;
5524 zset *zs;
5525
5526 o = lookupKeyRead(c->db,c->argv[1]);
5527 if (o == NULL) {
5528 addReply(c,shared.nullbulk);
5529 return;
5530 } else {
5531 if (o->type != REDIS_ZSET) {
5532 addReply(c,shared.wrongtypeerr);
5533 } else {
5534 dictEntry *de;
5535
5536 zs = o->ptr;
5537 de = dictFind(zs->dict,c->argv[2]);
5538 if (!de) {
5539 addReply(c,shared.nullbulk);
5540 } else {
5541 double *score = dictGetEntryVal(de);
5542
5543 addReplyDouble(c,*score);
5544 }
5545 }
5546 }
5547 }
5548
5549 static void zrankCommand(redisClient *c) {
5550 robj *o;
5551 o = lookupKeyRead(c->db,c->argv[1]);
5552 if (o == NULL) {
5553 addReply(c,shared.nullbulk);
5554 return;
5555 }
5556 if (o->type != REDIS_ZSET) {
5557 addReply(c,shared.wrongtypeerr);
5558 } else {
5559 zset *zs = o->ptr;
5560 zskiplist *zsl = zs->zsl;
5561 dictEntry *de;
5562 unsigned long rank;
5563
5564 de = dictFind(zs->dict,c->argv[2]);
5565 if (!de) {
5566 addReply(c,shared.nullbulk);
5567 return;
5568 }
5569
5570 double *score = dictGetEntryVal(de);
5571 rank = zslGetRank(zsl, *score, c->argv[2]);
5572 if (rank) {
5573 addReplyLong(c, rank-1);
5574 } else {
5575 addReply(c,shared.nullbulk);
5576 }
5577 }
5578 }
5579
5580 /* ==================================== Hash ================================ */
5581 static void hsetCommand(redisClient *c) {
5582 int update = 0;
5583 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5584
5585 if (o == NULL) {
5586 o = createHashObject();
5587 dictAdd(c->db->dict,c->argv[1],o);
5588 incrRefCount(c->argv[1]);
5589 } else {
5590 if (o->type != REDIS_HASH) {
5591 addReply(c,shared.wrongtypeerr);
5592 return;
5593 }
5594 }
5595 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5596 unsigned char *zm = o->ptr;
5597
5598 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5599 c->argv[3]->ptr,sdslen(c->argv[3]->ptr),&update);
5600 } else {
5601 if (dictAdd(o->ptr,c->argv[2],c->argv[3]) == DICT_OK) {
5602 incrRefCount(c->argv[2]);
5603 } else {
5604 update = 1;
5605 }
5606 incrRefCount(c->argv[3]);
5607 }
5608 server.dirty++;
5609 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5610 }
5611
5612 static void hgetCommand(redisClient *c) {
5613 robj *o = lookupKeyRead(c->db,c->argv[1]);
5614
5615 if (o == NULL) {
5616 addReply(c,shared.nullbulk);
5617 return;
5618 } else {
5619 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5620 unsigned char *zm = o->ptr;
5621 unsigned char *val;
5622 unsigned int vlen;
5623
5624 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr), &val,&vlen)) {
5625 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5626 addReplySds(c,sdsnewlen(val,vlen));
5627 addReply(c,shared.crlf);
5628 return;
5629 } else {
5630 addReply(c,shared.nullbulk);
5631 return;
5632 }
5633 } else {
5634 struct dictEntry *de;
5635
5636 de = dictFind(o->ptr,c->argv[2]);
5637 if (de == NULL) {
5638 addReply(c,shared.nullbulk);
5639 } else {
5640 robj *e = dictGetEntryVal(de);
5641
5642 addReplyBulkLen(c,e);
5643 addReply(c,e);
5644 addReply(c,shared.crlf);
5645 }
5646 }
5647 }
5648 }
5649
5650 /* ========================= Non type-specific commands ==================== */
5651
5652 static void flushdbCommand(redisClient *c) {
5653 server.dirty += dictSize(c->db->dict);
5654 dictEmpty(c->db->dict);
5655 dictEmpty(c->db->expires);
5656 addReply(c,shared.ok);
5657 }
5658
5659 static void flushallCommand(redisClient *c) {
5660 server.dirty += emptyDb();
5661 addReply(c,shared.ok);
5662 rdbSave(server.dbfilename);
5663 server.dirty++;
5664 }
5665
5666 static redisSortOperation *createSortOperation(int type, robj *pattern) {
5667 redisSortOperation *so = zmalloc(sizeof(*so));
5668 so->type = type;
5669 so->pattern = pattern;
5670 return so;
5671 }
5672
5673 /* Return the value associated to the key with a name obtained
5674 * substituting the first occurence of '*' in 'pattern' with 'subst' */
5675 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
5676 char *p;
5677 sds spat, ssub;
5678 robj keyobj;
5679 int prefixlen, sublen, postfixlen;
5680 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5681 struct {
5682 long len;
5683 long free;
5684 char buf[REDIS_SORTKEY_MAX+1];
5685 } keyname;
5686
5687 /* If the pattern is "#" return the substitution object itself in order
5688 * to implement the "SORT ... GET #" feature. */
5689 spat = pattern->ptr;
5690 if (spat[0] == '#' && spat[1] == '\0') {
5691 return subst;
5692 }
5693
5694 /* The substitution object may be specially encoded. If so we create
5695 * a decoded object on the fly. Otherwise getDecodedObject will just
5696 * increment the ref count, that we'll decrement later. */
5697 subst = getDecodedObject(subst);
5698
5699 ssub = subst->ptr;
5700 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5701 p = strchr(spat,'*');
5702 if (!p) {
5703 decrRefCount(subst);
5704 return NULL;
5705 }
5706
5707 prefixlen = p-spat;
5708 sublen = sdslen(ssub);
5709 postfixlen = sdslen(spat)-(prefixlen+1);
5710 memcpy(keyname.buf,spat,prefixlen);
5711 memcpy(keyname.buf+prefixlen,ssub,sublen);
5712 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5713 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5714 keyname.len = prefixlen+sublen+postfixlen;
5715
5716 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
5717 decrRefCount(subst);
5718
5719 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5720 return lookupKeyRead(db,&keyobj);
5721 }
5722
5723 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5724 * the additional parameter is not standard but a BSD-specific we have to
5725 * pass sorting parameters via the global 'server' structure */
5726 static int sortCompare(const void *s1, const void *s2) {
5727 const redisSortObject *so1 = s1, *so2 = s2;
5728 int cmp;
5729
5730 if (!server.sort_alpha) {
5731 /* Numeric sorting. Here it's trivial as we precomputed scores */
5732 if (so1->u.score > so2->u.score) {
5733 cmp = 1;
5734 } else if (so1->u.score < so2->u.score) {
5735 cmp = -1;
5736 } else {
5737 cmp = 0;
5738 }
5739 } else {
5740 /* Alphanumeric sorting */
5741 if (server.sort_bypattern) {
5742 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5743 /* At least one compare object is NULL */
5744 if (so1->u.cmpobj == so2->u.cmpobj)
5745 cmp = 0;
5746 else if (so1->u.cmpobj == NULL)
5747 cmp = -1;
5748 else
5749 cmp = 1;
5750 } else {
5751 /* We have both the objects, use strcoll */
5752 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5753 }
5754 } else {
5755 /* Compare elements directly */
5756 robj *dec1, *dec2;
5757
5758 dec1 = getDecodedObject(so1->obj);
5759 dec2 = getDecodedObject(so2->obj);
5760 cmp = strcoll(dec1->ptr,dec2->ptr);
5761 decrRefCount(dec1);
5762 decrRefCount(dec2);
5763 }
5764 }
5765 return server.sort_desc ? -cmp : cmp;
5766 }
5767
5768 /* The SORT command is the most complex command in Redis. Warning: this code
5769 * is optimized for speed and a bit less for readability */
5770 static void sortCommand(redisClient *c) {
5771 list *operations;
5772 int outputlen = 0;
5773 int desc = 0, alpha = 0;
5774 int limit_start = 0, limit_count = -1, start, end;
5775 int j, dontsort = 0, vectorlen;
5776 int getop = 0; /* GET operation counter */
5777 robj *sortval, *sortby = NULL, *storekey = NULL;
5778 redisSortObject *vector; /* Resulting vector to sort */
5779
5780 /* Lookup the key to sort. It must be of the right types */
5781 sortval = lookupKeyRead(c->db,c->argv[1]);
5782 if (sortval == NULL) {
5783 addReply(c,shared.nullmultibulk);
5784 return;
5785 }
5786 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5787 sortval->type != REDIS_ZSET)
5788 {
5789 addReply(c,shared.wrongtypeerr);
5790 return;
5791 }
5792
5793 /* Create a list of operations to perform for every sorted element.
5794 * Operations can be GET/DEL/INCR/DECR */
5795 operations = listCreate();
5796 listSetFreeMethod(operations,zfree);
5797 j = 2;
5798
5799 /* Now we need to protect sortval incrementing its count, in the future
5800 * SORT may have options able to overwrite/delete keys during the sorting
5801 * and the sorted key itself may get destroied */
5802 incrRefCount(sortval);
5803
5804 /* The SORT command has an SQL-alike syntax, parse it */
5805 while(j < c->argc) {
5806 int leftargs = c->argc-j-1;
5807 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5808 desc = 0;
5809 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5810 desc = 1;
5811 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5812 alpha = 1;
5813 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5814 limit_start = atoi(c->argv[j+1]->ptr);
5815 limit_count = atoi(c->argv[j+2]->ptr);
5816 j+=2;
5817 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5818 storekey = c->argv[j+1];
5819 j++;
5820 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5821 sortby = c->argv[j+1];
5822 /* If the BY pattern does not contain '*', i.e. it is constant,
5823 * we don't need to sort nor to lookup the weight keys. */
5824 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5825 j++;
5826 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5827 listAddNodeTail(operations,createSortOperation(
5828 REDIS_SORT_GET,c->argv[j+1]));
5829 getop++;
5830 j++;
5831 } else {
5832 decrRefCount(sortval);
5833 listRelease(operations);
5834 addReply(c,shared.syntaxerr);
5835 return;
5836 }
5837 j++;
5838 }
5839
5840 /* Load the sorting vector with all the objects to sort */
5841 switch(sortval->type) {
5842 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5843 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5844 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
5845 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
5846 }
5847 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
5848 j = 0;
5849
5850 if (sortval->type == REDIS_LIST) {
5851 list *list = sortval->ptr;
5852 listNode *ln;
5853 listIter li;
5854
5855 listRewind(list,&li);
5856 while((ln = listNext(&li))) {
5857 robj *ele = ln->value;
5858 vector[j].obj = ele;
5859 vector[j].u.score = 0;
5860 vector[j].u.cmpobj = NULL;
5861 j++;
5862 }
5863 } else {
5864 dict *set;
5865 dictIterator *di;
5866 dictEntry *setele;
5867
5868 if (sortval->type == REDIS_SET) {
5869 set = sortval->ptr;
5870 } else {
5871 zset *zs = sortval->ptr;
5872 set = zs->dict;
5873 }
5874
5875 di = dictGetIterator(set);
5876 while((setele = dictNext(di)) != NULL) {
5877 vector[j].obj = dictGetEntryKey(setele);
5878 vector[j].u.score = 0;
5879 vector[j].u.cmpobj = NULL;
5880 j++;
5881 }
5882 dictReleaseIterator(di);
5883 }
5884 redisAssert(j == vectorlen);
5885
5886 /* Now it's time to load the right scores in the sorting vector */
5887 if (dontsort == 0) {
5888 for (j = 0; j < vectorlen; j++) {
5889 if (sortby) {
5890 robj *byval;
5891
5892 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
5893 if (!byval || byval->type != REDIS_STRING) continue;
5894 if (alpha) {
5895 vector[j].u.cmpobj = getDecodedObject(byval);
5896 } else {
5897 if (byval->encoding == REDIS_ENCODING_RAW) {
5898 vector[j].u.score = strtod(byval->ptr,NULL);
5899 } else {
5900 /* Don't need to decode the object if it's
5901 * integer-encoded (the only encoding supported) so
5902 * far. We can just cast it */
5903 if (byval->encoding == REDIS_ENCODING_INT) {
5904 vector[j].u.score = (long)byval->ptr;
5905 } else
5906 redisAssert(1 != 1);
5907 }
5908 }
5909 } else {
5910 if (!alpha) {
5911 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5912 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5913 else {
5914 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5915 vector[j].u.score = (long) vector[j].obj->ptr;
5916 else
5917 redisAssert(1 != 1);
5918 }
5919 }
5920 }
5921 }
5922 }
5923
5924 /* We are ready to sort the vector... perform a bit of sanity check
5925 * on the LIMIT option too. We'll use a partial version of quicksort. */
5926 start = (limit_start < 0) ? 0 : limit_start;
5927 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
5928 if (start >= vectorlen) {
5929 start = vectorlen-1;
5930 end = vectorlen-2;
5931 }
5932 if (end >= vectorlen) end = vectorlen-1;
5933
5934 if (dontsort == 0) {
5935 server.sort_desc = desc;
5936 server.sort_alpha = alpha;
5937 server.sort_bypattern = sortby ? 1 : 0;
5938 if (sortby && (start != 0 || end != vectorlen-1))
5939 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5940 else
5941 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
5942 }
5943
5944 /* Send command output to the output buffer, performing the specified
5945 * GET/DEL/INCR/DECR operations if any. */
5946 outputlen = getop ? getop*(end-start+1) : end-start+1;
5947 if (storekey == NULL) {
5948 /* STORE option not specified, sent the sorting result to client */
5949 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5950 for (j = start; j <= end; j++) {
5951 listNode *ln;
5952 listIter li;
5953
5954 if (!getop) {
5955 addReplyBulkLen(c,vector[j].obj);
5956 addReply(c,vector[j].obj);
5957 addReply(c,shared.crlf);
5958 }
5959 listRewind(operations,&li);
5960 while((ln = listNext(&li))) {
5961 redisSortOperation *sop = ln->value;
5962 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5963 vector[j].obj);
5964
5965 if (sop->type == REDIS_SORT_GET) {
5966 if (!val || val->type != REDIS_STRING) {
5967 addReply(c,shared.nullbulk);
5968 } else {
5969 addReplyBulkLen(c,val);
5970 addReply(c,val);
5971 addReply(c,shared.crlf);
5972 }
5973 } else {
5974 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
5975 }
5976 }
5977 }
5978 } else {
5979 robj *listObject = createListObject();
5980 list *listPtr = (list*) listObject->ptr;
5981
5982 /* STORE option specified, set the sorting result as a List object */
5983 for (j = start; j <= end; j++) {
5984 listNode *ln;
5985 listIter li;
5986
5987 if (!getop) {
5988 listAddNodeTail(listPtr,vector[j].obj);
5989 incrRefCount(vector[j].obj);
5990 }
5991 listRewind(operations,&li);
5992 while((ln = listNext(&li))) {
5993 redisSortOperation *sop = ln->value;
5994 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5995 vector[j].obj);
5996
5997 if (sop->type == REDIS_SORT_GET) {
5998 if (!val || val->type != REDIS_STRING) {
5999 listAddNodeTail(listPtr,createStringObject("",0));
6000 } else {
6001 listAddNodeTail(listPtr,val);
6002 incrRefCount(val);
6003 }
6004 } else {
6005 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6006 }
6007 }
6008 }
6009 if (dictReplace(c->db->dict,storekey,listObject)) {
6010 incrRefCount(storekey);
6011 }
6012 /* Note: we add 1 because the DB is dirty anyway since even if the
6013 * SORT result is empty a new key is set and maybe the old content
6014 * replaced. */
6015 server.dirty += 1+outputlen;
6016 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6017 }
6018
6019 /* Cleanup */
6020 decrRefCount(sortval);
6021 listRelease(operations);
6022 for (j = 0; j < vectorlen; j++) {
6023 if (sortby && alpha && vector[j].u.cmpobj)
6024 decrRefCount(vector[j].u.cmpobj);
6025 }
6026 zfree(vector);
6027 }
6028
6029 /* Convert an amount of bytes into a human readable string in the form
6030 * of 100B, 2G, 100M, 4K, and so forth. */
6031 static void bytesToHuman(char *s, unsigned long long n) {
6032 double d;
6033
6034 if (n < 1024) {
6035 /* Bytes */
6036 sprintf(s,"%lluB",n);
6037 return;
6038 } else if (n < (1024*1024)) {
6039 d = (double)n/(1024);
6040 sprintf(s,"%.2fK",d);
6041 } else if (n < (1024LL*1024*1024)) {
6042 d = (double)n/(1024*1024);
6043 sprintf(s,"%.2fM",d);
6044 } else if (n < (1024LL*1024*1024*1024)) {
6045 d = (double)n/(1024LL*1024*1024);
6046 sprintf(s,"%.2fG",d);
6047 }
6048 }
6049
6050 /* Create the string returned by the INFO command. This is decoupled
6051 * by the INFO command itself as we need to report the same information
6052 * on memory corruption problems. */
6053 static sds genRedisInfoString(void) {
6054 sds info;
6055 time_t uptime = time(NULL)-server.stat_starttime;
6056 int j;
6057 char hmem[64];
6058
6059 bytesToHuman(hmem,zmalloc_used_memory());
6060 info = sdscatprintf(sdsempty(),
6061 "redis_version:%s\r\n"
6062 "arch_bits:%s\r\n"
6063 "multiplexing_api:%s\r\n"
6064 "process_id:%ld\r\n"
6065 "uptime_in_seconds:%ld\r\n"
6066 "uptime_in_days:%ld\r\n"
6067 "connected_clients:%d\r\n"
6068 "connected_slaves:%d\r\n"
6069 "blocked_clients:%d\r\n"
6070 "used_memory:%zu\r\n"
6071 "used_memory_human:%s\r\n"
6072 "changes_since_last_save:%lld\r\n"
6073 "bgsave_in_progress:%d\r\n"
6074 "last_save_time:%ld\r\n"
6075 "bgrewriteaof_in_progress:%d\r\n"
6076 "total_connections_received:%lld\r\n"
6077 "total_commands_processed:%lld\r\n"
6078 "vm_enabled:%d\r\n"
6079 "role:%s\r\n"
6080 ,REDIS_VERSION,
6081 (sizeof(long) == 8) ? "64" : "32",
6082 aeGetApiName(),
6083 (long) getpid(),
6084 uptime,
6085 uptime/(3600*24),
6086 listLength(server.clients)-listLength(server.slaves),
6087 listLength(server.slaves),
6088 server.blpop_blocked_clients,
6089 zmalloc_used_memory(),
6090 hmem,
6091 server.dirty,
6092 server.bgsavechildpid != -1,
6093 server.lastsave,
6094 server.bgrewritechildpid != -1,
6095 server.stat_numconnections,
6096 server.stat_numcommands,
6097 server.vm_enabled != 0,
6098 server.masterhost == NULL ? "master" : "slave"
6099 );
6100 if (server.masterhost) {
6101 info = sdscatprintf(info,
6102 "master_host:%s\r\n"
6103 "master_port:%d\r\n"
6104 "master_link_status:%s\r\n"
6105 "master_last_io_seconds_ago:%d\r\n"
6106 ,server.masterhost,
6107 server.masterport,
6108 (server.replstate == REDIS_REPL_CONNECTED) ?
6109 "up" : "down",
6110 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6111 );
6112 }
6113 if (server.vm_enabled) {
6114 lockThreadedIO();
6115 info = sdscatprintf(info,
6116 "vm_conf_max_memory:%llu\r\n"
6117 "vm_conf_page_size:%llu\r\n"
6118 "vm_conf_pages:%llu\r\n"
6119 "vm_stats_used_pages:%llu\r\n"
6120 "vm_stats_swapped_objects:%llu\r\n"
6121 "vm_stats_swappin_count:%llu\r\n"
6122 "vm_stats_swappout_count:%llu\r\n"
6123 "vm_stats_io_newjobs_len:%lu\r\n"
6124 "vm_stats_io_processing_len:%lu\r\n"
6125 "vm_stats_io_processed_len:%lu\r\n"
6126 "vm_stats_io_active_threads:%lu\r\n"
6127 "vm_stats_blocked_clients:%lu\r\n"
6128 ,(unsigned long long) server.vm_max_memory,
6129 (unsigned long long) server.vm_page_size,
6130 (unsigned long long) server.vm_pages,
6131 (unsigned long long) server.vm_stats_used_pages,
6132 (unsigned long long) server.vm_stats_swapped_objects,
6133 (unsigned long long) server.vm_stats_swapins,
6134 (unsigned long long) server.vm_stats_swapouts,
6135 (unsigned long) listLength(server.io_newjobs),
6136 (unsigned long) listLength(server.io_processing),
6137 (unsigned long) listLength(server.io_processed),
6138 (unsigned long) server.io_active_threads,
6139 (unsigned long) server.vm_blocked_clients
6140 );
6141 unlockThreadedIO();
6142 }
6143 for (j = 0; j < server.dbnum; j++) {
6144 long long keys, vkeys;
6145
6146 keys = dictSize(server.db[j].dict);
6147 vkeys = dictSize(server.db[j].expires);
6148 if (keys || vkeys) {
6149 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6150 j, keys, vkeys);
6151 }
6152 }
6153 return info;
6154 }
6155
6156 static void infoCommand(redisClient *c) {
6157 sds info = genRedisInfoString();
6158 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6159 (unsigned long)sdslen(info)));
6160 addReplySds(c,info);
6161 addReply(c,shared.crlf);
6162 }
6163
6164 static void monitorCommand(redisClient *c) {
6165 /* ignore MONITOR if aleady slave or in monitor mode */
6166 if (c->flags & REDIS_SLAVE) return;
6167
6168 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6169 c->slaveseldb = 0;
6170 listAddNodeTail(server.monitors,c);
6171 addReply(c,shared.ok);
6172 }
6173
6174 /* ================================= Expire ================================= */
6175 static int removeExpire(redisDb *db, robj *key) {
6176 if (dictDelete(db->expires,key) == DICT_OK) {
6177 return 1;
6178 } else {
6179 return 0;
6180 }
6181 }
6182
6183 static int setExpire(redisDb *db, robj *key, time_t when) {
6184 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6185 return 0;
6186 } else {
6187 incrRefCount(key);
6188 return 1;
6189 }
6190 }
6191
6192 /* Return the expire time of the specified key, or -1 if no expire
6193 * is associated with this key (i.e. the key is non volatile) */
6194 static time_t getExpire(redisDb *db, robj *key) {
6195 dictEntry *de;
6196
6197 /* No expire? return ASAP */
6198 if (dictSize(db->expires) == 0 ||
6199 (de = dictFind(db->expires,key)) == NULL) return -1;
6200
6201 return (time_t) dictGetEntryVal(de);
6202 }
6203
6204 static int expireIfNeeded(redisDb *db, robj *key) {
6205 time_t when;
6206 dictEntry *de;
6207
6208 /* No expire? return ASAP */
6209 if (dictSize(db->expires) == 0 ||
6210 (de = dictFind(db->expires,key)) == NULL) return 0;
6211
6212 /* Lookup the expire */
6213 when = (time_t) dictGetEntryVal(de);
6214 if (time(NULL) <= when) return 0;
6215
6216 /* Delete the key */
6217 dictDelete(db->expires,key);
6218 return dictDelete(db->dict,key) == DICT_OK;
6219 }
6220
6221 static int deleteIfVolatile(redisDb *db, robj *key) {
6222 dictEntry *de;
6223
6224 /* No expire? return ASAP */
6225 if (dictSize(db->expires) == 0 ||
6226 (de = dictFind(db->expires,key)) == NULL) return 0;
6227
6228 /* Delete the key */
6229 server.dirty++;
6230 dictDelete(db->expires,key);
6231 return dictDelete(db->dict,key) == DICT_OK;
6232 }
6233
6234 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6235 dictEntry *de;
6236
6237 de = dictFind(c->db->dict,key);
6238 if (de == NULL) {
6239 addReply(c,shared.czero);
6240 return;
6241 }
6242 if (seconds < 0) {
6243 if (deleteKey(c->db,key)) server.dirty++;
6244 addReply(c, shared.cone);
6245 return;
6246 } else {
6247 time_t when = time(NULL)+seconds;
6248 if (setExpire(c->db,key,when)) {
6249 addReply(c,shared.cone);
6250 server.dirty++;
6251 } else {
6252 addReply(c,shared.czero);
6253 }
6254 return;
6255 }
6256 }
6257
6258 static void expireCommand(redisClient *c) {
6259 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6260 }
6261
6262 static void expireatCommand(redisClient *c) {
6263 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6264 }
6265
6266 static void ttlCommand(redisClient *c) {
6267 time_t expire;
6268 int ttl = -1;
6269
6270 expire = getExpire(c->db,c->argv[1]);
6271 if (expire != -1) {
6272 ttl = (int) (expire-time(NULL));
6273 if (ttl < 0) ttl = -1;
6274 }
6275 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6276 }
6277
6278 /* ================================ MULTI/EXEC ============================== */
6279
6280 /* Client state initialization for MULTI/EXEC */
6281 static void initClientMultiState(redisClient *c) {
6282 c->mstate.commands = NULL;
6283 c->mstate.count = 0;
6284 }
6285
6286 /* Release all the resources associated with MULTI/EXEC state */
6287 static void freeClientMultiState(redisClient *c) {
6288 int j;
6289
6290 for (j = 0; j < c->mstate.count; j++) {
6291 int i;
6292 multiCmd *mc = c->mstate.commands+j;
6293
6294 for (i = 0; i < mc->argc; i++)
6295 decrRefCount(mc->argv[i]);
6296 zfree(mc->argv);
6297 }
6298 zfree(c->mstate.commands);
6299 }
6300
6301 /* Add a new command into the MULTI commands queue */
6302 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6303 multiCmd *mc;
6304 int j;
6305
6306 c->mstate.commands = zrealloc(c->mstate.commands,
6307 sizeof(multiCmd)*(c->mstate.count+1));
6308 mc = c->mstate.commands+c->mstate.count;
6309 mc->cmd = cmd;
6310 mc->argc = c->argc;
6311 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6312 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6313 for (j = 0; j < c->argc; j++)
6314 incrRefCount(mc->argv[j]);
6315 c->mstate.count++;
6316 }
6317
6318 static void multiCommand(redisClient *c) {
6319 c->flags |= REDIS_MULTI;
6320 addReply(c,shared.ok);
6321 }
6322
6323 static void discardCommand(redisClient *c) {
6324 if (!(c->flags & REDIS_MULTI)) {
6325 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6326 return;
6327 }
6328
6329 freeClientMultiState(c);
6330 initClientMultiState(c);
6331 c->flags &= (~REDIS_MULTI);
6332 addReply(c,shared.ok);
6333 }
6334
6335 static void execCommand(redisClient *c) {
6336 int j;
6337 robj **orig_argv;
6338 int orig_argc;
6339
6340 if (!(c->flags & REDIS_MULTI)) {
6341 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6342 return;
6343 }
6344
6345 orig_argv = c->argv;
6346 orig_argc = c->argc;
6347 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6348 for (j = 0; j < c->mstate.count; j++) {
6349 c->argc = c->mstate.commands[j].argc;
6350 c->argv = c->mstate.commands[j].argv;
6351 call(c,c->mstate.commands[j].cmd);
6352 }
6353 c->argv = orig_argv;
6354 c->argc = orig_argc;
6355 freeClientMultiState(c);
6356 initClientMultiState(c);
6357 c->flags &= (~REDIS_MULTI);
6358 }
6359
6360 /* =========================== Blocking Operations ========================= */
6361
6362 /* Currently Redis blocking operations support is limited to list POP ops,
6363 * so the current implementation is not fully generic, but it is also not
6364 * completely specific so it will not require a rewrite to support new
6365 * kind of blocking operations in the future.
6366 *
6367 * Still it's important to note that list blocking operations can be already
6368 * used as a notification mechanism in order to implement other blocking
6369 * operations at application level, so there must be a very strong evidence
6370 * of usefulness and generality before new blocking operations are implemented.
6371 *
6372 * This is how the current blocking POP works, we use BLPOP as example:
6373 * - If the user calls BLPOP and the key exists and contains a non empty list
6374 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6375 * if there is not to block.
6376 * - If instead BLPOP is called and the key does not exists or the list is
6377 * empty we need to block. In order to do so we remove the notification for
6378 * new data to read in the client socket (so that we'll not serve new
6379 * requests if the blocking request is not served). Also we put the client
6380 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6381 * blocking for this keys.
6382 * - If a PUSH operation against a key with blocked clients waiting is
6383 * performed, we serve the first in the list: basically instead to push
6384 * the new element inside the list we return it to the (first / oldest)
6385 * blocking client, unblock the client, and remove it form the list.
6386 *
6387 * The above comment and the source code should be enough in order to understand
6388 * the implementation and modify / fix it later.
6389 */
6390
6391 /* Set a client in blocking mode for the specified key, with the specified
6392 * timeout */
6393 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6394 dictEntry *de;
6395 list *l;
6396 int j;
6397
6398 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6399 c->blockingkeysnum = numkeys;
6400 c->blockingto = timeout;
6401 for (j = 0; j < numkeys; j++) {
6402 /* Add the key in the client structure, to map clients -> keys */
6403 c->blockingkeys[j] = keys[j];
6404 incrRefCount(keys[j]);
6405
6406 /* And in the other "side", to map keys -> clients */
6407 de = dictFind(c->db->blockingkeys,keys[j]);
6408 if (de == NULL) {
6409 int retval;
6410
6411 /* For every key we take a list of clients blocked for it */
6412 l = listCreate();
6413 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6414 incrRefCount(keys[j]);
6415 assert(retval == DICT_OK);
6416 } else {
6417 l = dictGetEntryVal(de);
6418 }
6419 listAddNodeTail(l,c);
6420 }
6421 /* Mark the client as a blocked client */
6422 c->flags |= REDIS_BLOCKED;
6423 server.blpop_blocked_clients++;
6424 }
6425
6426 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6427 static void unblockClientWaitingData(redisClient *c) {
6428 dictEntry *de;
6429 list *l;
6430 int j;
6431
6432 assert(c->blockingkeys != NULL);
6433 /* The client may wait for multiple keys, so unblock it for every key. */
6434 for (j = 0; j < c->blockingkeysnum; j++) {
6435 /* Remove this client from the list of clients waiting for this key. */
6436 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6437 assert(de != NULL);
6438 l = dictGetEntryVal(de);
6439 listDelNode(l,listSearchKey(l,c));
6440 /* If the list is empty we need to remove it to avoid wasting memory */
6441 if (listLength(l) == 0)
6442 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6443 decrRefCount(c->blockingkeys[j]);
6444 }
6445 /* Cleanup the client structure */
6446 zfree(c->blockingkeys);
6447 c->blockingkeys = NULL;
6448 c->flags &= (~REDIS_BLOCKED);
6449 server.blpop_blocked_clients--;
6450 /* We want to process data if there is some command waiting
6451 * in the input buffer. Note that this is safe even if
6452 * unblockClientWaitingData() gets called from freeClient() because
6453 * freeClient() will be smart enough to call this function
6454 * *after* c->querybuf was set to NULL. */
6455 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6456 }
6457
6458 /* This should be called from any function PUSHing into lists.
6459 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6460 * 'ele' is the element pushed.
6461 *
6462 * If the function returns 0 there was no client waiting for a list push
6463 * against this key.
6464 *
6465 * If the function returns 1 there was a client waiting for a list push
6466 * against this key, the element was passed to this client thus it's not
6467 * needed to actually add it to the list and the caller should return asap. */
6468 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6469 struct dictEntry *de;
6470 redisClient *receiver;
6471 list *l;
6472 listNode *ln;
6473
6474 de = dictFind(c->db->blockingkeys,key);
6475 if (de == NULL) return 0;
6476 l = dictGetEntryVal(de);
6477 ln = listFirst(l);
6478 assert(ln != NULL);
6479 receiver = ln->value;
6480
6481 addReplySds(receiver,sdsnew("*2\r\n"));
6482 addReplyBulkLen(receiver,key);
6483 addReply(receiver,key);
6484 addReply(receiver,shared.crlf);
6485 addReplyBulkLen(receiver,ele);
6486 addReply(receiver,ele);
6487 addReply(receiver,shared.crlf);
6488 unblockClientWaitingData(receiver);
6489 return 1;
6490 }
6491
6492 /* Blocking RPOP/LPOP */
6493 static void blockingPopGenericCommand(redisClient *c, int where) {
6494 robj *o;
6495 time_t timeout;
6496 int j;
6497
6498 for (j = 1; j < c->argc-1; j++) {
6499 o = lookupKeyWrite(c->db,c->argv[j]);
6500 if (o != NULL) {
6501 if (o->type != REDIS_LIST) {
6502 addReply(c,shared.wrongtypeerr);
6503 return;
6504 } else {
6505 list *list = o->ptr;
6506 if (listLength(list) != 0) {
6507 /* If the list contains elements fall back to the usual
6508 * non-blocking POP operation */
6509 robj *argv[2], **orig_argv;
6510 int orig_argc;
6511
6512 /* We need to alter the command arguments before to call
6513 * popGenericCommand() as the command takes a single key. */
6514 orig_argv = c->argv;
6515 orig_argc = c->argc;
6516 argv[1] = c->argv[j];
6517 c->argv = argv;
6518 c->argc = 2;
6519
6520 /* Also the return value is different, we need to output
6521 * the multi bulk reply header and the key name. The
6522 * "real" command will add the last element (the value)
6523 * for us. If this souds like an hack to you it's just
6524 * because it is... */
6525 addReplySds(c,sdsnew("*2\r\n"));
6526 addReplyBulkLen(c,argv[1]);
6527 addReply(c,argv[1]);
6528 addReply(c,shared.crlf);
6529 popGenericCommand(c,where);
6530
6531 /* Fix the client structure with the original stuff */
6532 c->argv = orig_argv;
6533 c->argc = orig_argc;
6534 return;
6535 }
6536 }
6537 }
6538 }
6539 /* If the list is empty or the key does not exists we must block */
6540 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
6541 if (timeout > 0) timeout += time(NULL);
6542 blockForKeys(c,c->argv+1,c->argc-2,timeout);
6543 }
6544
6545 static void blpopCommand(redisClient *c) {
6546 blockingPopGenericCommand(c,REDIS_HEAD);
6547 }
6548
6549 static void brpopCommand(redisClient *c) {
6550 blockingPopGenericCommand(c,REDIS_TAIL);
6551 }
6552
6553 /* =============================== Replication ============================= */
6554
6555 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
6556 ssize_t nwritten, ret = size;
6557 time_t start = time(NULL);
6558
6559 timeout++;
6560 while(size) {
6561 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6562 nwritten = write(fd,ptr,size);
6563 if (nwritten == -1) return -1;
6564 ptr += nwritten;
6565 size -= nwritten;
6566 }
6567 if ((time(NULL)-start) > timeout) {
6568 errno = ETIMEDOUT;
6569 return -1;
6570 }
6571 }
6572 return ret;
6573 }
6574
6575 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
6576 ssize_t nread, totread = 0;
6577 time_t start = time(NULL);
6578
6579 timeout++;
6580 while(size) {
6581 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6582 nread = read(fd,ptr,size);
6583 if (nread == -1) return -1;
6584 ptr += nread;
6585 size -= nread;
6586 totread += nread;
6587 }
6588 if ((time(NULL)-start) > timeout) {
6589 errno = ETIMEDOUT;
6590 return -1;
6591 }
6592 }
6593 return totread;
6594 }
6595
6596 static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
6597 ssize_t nread = 0;
6598
6599 size--;
6600 while(size) {
6601 char c;
6602
6603 if (syncRead(fd,&c,1,timeout) == -1) return -1;
6604 if (c == '\n') {
6605 *ptr = '\0';
6606 if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
6607 return nread;
6608 } else {
6609 *ptr++ = c;
6610 *ptr = '\0';
6611 nread++;
6612 }
6613 }
6614 return nread;
6615 }
6616
6617 static void syncCommand(redisClient *c) {
6618 /* ignore SYNC if aleady slave or in monitor mode */
6619 if (c->flags & REDIS_SLAVE) return;
6620
6621 /* SYNC can't be issued when the server has pending data to send to
6622 * the client about already issued commands. We need a fresh reply
6623 * buffer registering the differences between the BGSAVE and the current
6624 * dataset, so that we can copy to other slaves if needed. */
6625 if (listLength(c->reply) != 0) {
6626 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6627 return;
6628 }
6629
6630 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6631 /* Here we need to check if there is a background saving operation
6632 * in progress, or if it is required to start one */
6633 if (server.bgsavechildpid != -1) {
6634 /* Ok a background save is in progress. Let's check if it is a good
6635 * one for replication, i.e. if there is another slave that is
6636 * registering differences since the server forked to save */
6637 redisClient *slave;
6638 listNode *ln;
6639 listIter li;
6640
6641 listRewind(server.slaves,&li);
6642 while((ln = listNext(&li))) {
6643 slave = ln->value;
6644 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
6645 }
6646 if (ln) {
6647 /* Perfect, the server is already registering differences for
6648 * another slave. Set the right state, and copy the buffer. */
6649 listRelease(c->reply);
6650 c->reply = listDup(slave->reply);
6651 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6652 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6653 } else {
6654 /* No way, we need to wait for the next BGSAVE in order to
6655 * register differences */
6656 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6657 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6658 }
6659 } else {
6660 /* Ok we don't have a BGSAVE in progress, let's start one */
6661 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6662 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6663 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6664 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6665 return;
6666 }
6667 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6668 }
6669 c->repldbfd = -1;
6670 c->flags |= REDIS_SLAVE;
6671 c->slaveseldb = 0;
6672 listAddNodeTail(server.slaves,c);
6673 return;
6674 }
6675
6676 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6677 redisClient *slave = privdata;
6678 REDIS_NOTUSED(el);
6679 REDIS_NOTUSED(mask);
6680 char buf[REDIS_IOBUF_LEN];
6681 ssize_t nwritten, buflen;
6682
6683 if (slave->repldboff == 0) {
6684 /* Write the bulk write count before to transfer the DB. In theory here
6685 * we don't know how much room there is in the output buffer of the
6686 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6687 * operations) will never be smaller than the few bytes we need. */
6688 sds bulkcount;
6689
6690 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6691 slave->repldbsize);
6692 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6693 {
6694 sdsfree(bulkcount);
6695 freeClient(slave);
6696 return;
6697 }
6698 sdsfree(bulkcount);
6699 }
6700 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6701 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6702 if (buflen <= 0) {
6703 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6704 (buflen == 0) ? "premature EOF" : strerror(errno));
6705 freeClient(slave);
6706 return;
6707 }
6708 if ((nwritten = write(fd,buf,buflen)) == -1) {
6709 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6710 strerror(errno));
6711 freeClient(slave);
6712 return;
6713 }
6714 slave->repldboff += nwritten;
6715 if (slave->repldboff == slave->repldbsize) {
6716 close(slave->repldbfd);
6717 slave->repldbfd = -1;
6718 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6719 slave->replstate = REDIS_REPL_ONLINE;
6720 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
6721 sendReplyToClient, slave) == AE_ERR) {
6722 freeClient(slave);
6723 return;
6724 }
6725 addReplySds(slave,sdsempty());
6726 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6727 }
6728 }
6729
6730 /* This function is called at the end of every backgrond saving.
6731 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6732 * otherwise REDIS_ERR is passed to the function.
6733 *
6734 * The goal of this function is to handle slaves waiting for a successful
6735 * background saving in order to perform non-blocking synchronization. */
6736 static void updateSlavesWaitingBgsave(int bgsaveerr) {
6737 listNode *ln;
6738 int startbgsave = 0;
6739 listIter li;
6740
6741 listRewind(server.slaves,&li);
6742 while((ln = listNext(&li))) {
6743 redisClient *slave = ln->value;
6744
6745 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6746 startbgsave = 1;
6747 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6748 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
6749 struct redis_stat buf;
6750
6751 if (bgsaveerr != REDIS_OK) {
6752 freeClient(slave);
6753 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6754 continue;
6755 }
6756 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
6757 redis_fstat(slave->repldbfd,&buf) == -1) {
6758 freeClient(slave);
6759 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6760 continue;
6761 }
6762 slave->repldboff = 0;
6763 slave->repldbsize = buf.st_size;
6764 slave->replstate = REDIS_REPL_SEND_BULK;
6765 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6766 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6767 freeClient(slave);
6768 continue;
6769 }
6770 }
6771 }
6772 if (startbgsave) {
6773 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6774 listIter li;
6775
6776 listRewind(server.slaves,&li);
6777 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6778 while((ln = listNext(&li))) {
6779 redisClient *slave = ln->value;
6780
6781 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6782 freeClient(slave);
6783 }
6784 }
6785 }
6786 }
6787
6788 static int syncWithMaster(void) {
6789 char buf[1024], tmpfile[256], authcmd[1024];
6790 long dumpsize;
6791 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6792 int dfd;
6793
6794 if (fd == -1) {
6795 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6796 strerror(errno));
6797 return REDIS_ERR;
6798 }
6799
6800 /* AUTH with the master if required. */
6801 if(server.masterauth) {
6802 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6803 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6804 close(fd);
6805 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6806 strerror(errno));
6807 return REDIS_ERR;
6808 }
6809 /* Read the AUTH result. */
6810 if (syncReadLine(fd,buf,1024,3600) == -1) {
6811 close(fd);
6812 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6813 strerror(errno));
6814 return REDIS_ERR;
6815 }
6816 if (buf[0] != '+') {
6817 close(fd);
6818 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6819 return REDIS_ERR;
6820 }
6821 }
6822
6823 /* Issue the SYNC command */
6824 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6825 close(fd);
6826 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6827 strerror(errno));
6828 return REDIS_ERR;
6829 }
6830 /* Read the bulk write count */
6831 if (syncReadLine(fd,buf,1024,3600) == -1) {
6832 close(fd);
6833 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6834 strerror(errno));
6835 return REDIS_ERR;
6836 }
6837 if (buf[0] != '$') {
6838 close(fd);
6839 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6840 return REDIS_ERR;
6841 }
6842 dumpsize = strtol(buf+1,NULL,10);
6843 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
6844 /* Read the bulk write data on a temp file */
6845 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6846 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6847 if (dfd == -1) {
6848 close(fd);
6849 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6850 return REDIS_ERR;
6851 }
6852 while(dumpsize) {
6853 int nread, nwritten;
6854
6855 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6856 if (nread == -1) {
6857 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6858 strerror(errno));
6859 close(fd);
6860 close(dfd);
6861 return REDIS_ERR;
6862 }
6863 nwritten = write(dfd,buf,nread);
6864 if (nwritten == -1) {
6865 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6866 close(fd);
6867 close(dfd);
6868 return REDIS_ERR;
6869 }
6870 dumpsize -= nread;
6871 }
6872 close(dfd);
6873 if (rename(tmpfile,server.dbfilename) == -1) {
6874 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6875 unlink(tmpfile);
6876 close(fd);
6877 return REDIS_ERR;
6878 }
6879 emptyDb();
6880 if (rdbLoad(server.dbfilename) != REDIS_OK) {
6881 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6882 close(fd);
6883 return REDIS_ERR;
6884 }
6885 server.master = createClient(fd);
6886 server.master->flags |= REDIS_MASTER;
6887 server.master->authenticated = 1;
6888 server.replstate = REDIS_REPL_CONNECTED;
6889 return REDIS_OK;
6890 }
6891
6892 static void slaveofCommand(redisClient *c) {
6893 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6894 !strcasecmp(c->argv[2]->ptr,"one")) {
6895 if (server.masterhost) {
6896 sdsfree(server.masterhost);
6897 server.masterhost = NULL;
6898 if (server.master) freeClient(server.master);
6899 server.replstate = REDIS_REPL_NONE;
6900 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6901 }
6902 } else {
6903 sdsfree(server.masterhost);
6904 server.masterhost = sdsdup(c->argv[1]->ptr);
6905 server.masterport = atoi(c->argv[2]->ptr);
6906 if (server.master) freeClient(server.master);
6907 server.replstate = REDIS_REPL_CONNECT;
6908 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6909 server.masterhost, server.masterport);
6910 }
6911 addReply(c,shared.ok);
6912 }
6913
6914 /* ============================ Maxmemory directive ======================== */
6915
6916 /* Try to free one object form the pre-allocated objects free list.
6917 * This is useful under low mem conditions as by default we take 1 million
6918 * free objects allocated. On success REDIS_OK is returned, otherwise
6919 * REDIS_ERR. */
6920 static int tryFreeOneObjectFromFreelist(void) {
6921 robj *o;
6922
6923 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6924 if (listLength(server.objfreelist)) {
6925 listNode *head = listFirst(server.objfreelist);
6926 o = listNodeValue(head);
6927 listDelNode(server.objfreelist,head);
6928 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6929 zfree(o);
6930 return REDIS_OK;
6931 } else {
6932 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6933 return REDIS_ERR;
6934 }
6935 }
6936
6937 /* This function gets called when 'maxmemory' is set on the config file to limit
6938 * the max memory used by the server, and we are out of memory.
6939 * This function will try to, in order:
6940 *
6941 * - Free objects from the free list
6942 * - Try to remove keys with an EXPIRE set
6943 *
6944 * It is not possible to free enough memory to reach used-memory < maxmemory
6945 * the server will start refusing commands that will enlarge even more the
6946 * memory usage.
6947 */
6948 static void freeMemoryIfNeeded(void) {
6949 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
6950 int j, k, freed = 0;
6951
6952 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6953 for (j = 0; j < server.dbnum; j++) {
6954 int minttl = -1;
6955 robj *minkey = NULL;
6956 struct dictEntry *de;
6957
6958 if (dictSize(server.db[j].expires)) {
6959 freed = 1;
6960 /* From a sample of three keys drop the one nearest to
6961 * the natural expire */
6962 for (k = 0; k < 3; k++) {
6963 time_t t;
6964
6965 de = dictGetRandomKey(server.db[j].expires);
6966 t = (time_t) dictGetEntryVal(de);
6967 if (minttl == -1 || t < minttl) {
6968 minkey = dictGetEntryKey(de);
6969 minttl = t;
6970 }
6971 }
6972 deleteKey(server.db+j,minkey);
6973 }
6974 }
6975 if (!freed) return; /* nothing to free... */
6976 }
6977 }
6978
6979 /* ============================== Append Only file ========================== */
6980
6981 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6982 sds buf = sdsempty();
6983 int j;
6984 ssize_t nwritten;
6985 time_t now;
6986 robj *tmpargv[3];
6987
6988 /* The DB this command was targetting is not the same as the last command
6989 * we appendend. To issue a SELECT command is needed. */
6990 if (dictid != server.appendseldb) {
6991 char seldb[64];
6992
6993 snprintf(seldb,sizeof(seldb),"%d",dictid);
6994 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6995 (unsigned long)strlen(seldb),seldb);
6996 server.appendseldb = dictid;
6997 }
6998
6999 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7000 * EXPIREs into EXPIREATs calls */
7001 if (cmd->proc == expireCommand) {
7002 long when;
7003
7004 tmpargv[0] = createStringObject("EXPIREAT",8);
7005 tmpargv[1] = argv[1];
7006 incrRefCount(argv[1]);
7007 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7008 tmpargv[2] = createObject(REDIS_STRING,
7009 sdscatprintf(sdsempty(),"%ld",when));
7010 argv = tmpargv;
7011 }
7012
7013 /* Append the actual command */
7014 buf = sdscatprintf(buf,"*%d\r\n",argc);
7015 for (j = 0; j < argc; j++) {
7016 robj *o = argv[j];
7017
7018 o = getDecodedObject(o);
7019 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7020 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7021 buf = sdscatlen(buf,"\r\n",2);
7022 decrRefCount(o);
7023 }
7024
7025 /* Free the objects from the modified argv for EXPIREAT */
7026 if (cmd->proc == expireCommand) {
7027 for (j = 0; j < 3; j++)
7028 decrRefCount(argv[j]);
7029 }
7030
7031 /* We want to perform a single write. This should be guaranteed atomic
7032 * at least if the filesystem we are writing is a real physical one.
7033 * While this will save us against the server being killed I don't think
7034 * there is much to do about the whole server stopping for power problems
7035 * or alike */
7036 nwritten = write(server.appendfd,buf,sdslen(buf));
7037 if (nwritten != (signed)sdslen(buf)) {
7038 /* Ooops, we are in troubles. The best thing to do for now is
7039 * to simply exit instead to give the illusion that everything is
7040 * working as expected. */
7041 if (nwritten == -1) {
7042 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7043 } else {
7044 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7045 }
7046 exit(1);
7047 }
7048 /* If a background append only file rewriting is in progress we want to
7049 * accumulate the differences between the child DB and the current one
7050 * in a buffer, so that when the child process will do its work we
7051 * can append the differences to the new append only file. */
7052 if (server.bgrewritechildpid != -1)
7053 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7054
7055 sdsfree(buf);
7056 now = time(NULL);
7057 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7058 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7059 now-server.lastfsync > 1))
7060 {
7061 fsync(server.appendfd); /* Let's try to get this data on the disk */
7062 server.lastfsync = now;
7063 }
7064 }
7065
7066 /* In Redis commands are always executed in the context of a client, so in
7067 * order to load the append only file we need to create a fake client. */
7068 static struct redisClient *createFakeClient(void) {
7069 struct redisClient *c = zmalloc(sizeof(*c));
7070
7071 selectDb(c,0);
7072 c->fd = -1;
7073 c->querybuf = sdsempty();
7074 c->argc = 0;
7075 c->argv = NULL;
7076 c->flags = 0;
7077 /* We set the fake client as a slave waiting for the synchronization
7078 * so that Redis will not try to send replies to this client. */
7079 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7080 c->reply = listCreate();
7081 listSetFreeMethod(c->reply,decrRefCount);
7082 listSetDupMethod(c->reply,dupClientReplyValue);
7083 return c;
7084 }
7085
7086 static void freeFakeClient(struct redisClient *c) {
7087 sdsfree(c->querybuf);
7088 listRelease(c->reply);
7089 zfree(c);
7090 }
7091
7092 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7093 * error (the append only file is zero-length) REDIS_ERR is returned. On
7094 * fatal error an error message is logged and the program exists. */
7095 int loadAppendOnlyFile(char *filename) {
7096 struct redisClient *fakeClient;
7097 FILE *fp = fopen(filename,"r");
7098 struct redis_stat sb;
7099 unsigned long long loadedkeys = 0;
7100
7101 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7102 return REDIS_ERR;
7103
7104 if (fp == NULL) {
7105 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7106 exit(1);
7107 }
7108
7109 fakeClient = createFakeClient();
7110 while(1) {
7111 int argc, j;
7112 unsigned long len;
7113 robj **argv;
7114 char buf[128];
7115 sds argsds;
7116 struct redisCommand *cmd;
7117
7118 if (fgets(buf,sizeof(buf),fp) == NULL) {
7119 if (feof(fp))
7120 break;
7121 else
7122 goto readerr;
7123 }
7124 if (buf[0] != '*') goto fmterr;
7125 argc = atoi(buf+1);
7126 argv = zmalloc(sizeof(robj*)*argc);
7127 for (j = 0; j < argc; j++) {
7128 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7129 if (buf[0] != '$') goto fmterr;
7130 len = strtol(buf+1,NULL,10);
7131 argsds = sdsnewlen(NULL,len);
7132 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7133 argv[j] = createObject(REDIS_STRING,argsds);
7134 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7135 }
7136
7137 /* Command lookup */
7138 cmd = lookupCommand(argv[0]->ptr);
7139 if (!cmd) {
7140 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7141 exit(1);
7142 }
7143 /* Try object sharing and encoding */
7144 if (server.shareobjects) {
7145 int j;
7146 for(j = 1; j < argc; j++)
7147 argv[j] = tryObjectSharing(argv[j]);
7148 }
7149 if (cmd->flags & REDIS_CMD_BULK)
7150 tryObjectEncoding(argv[argc-1]);
7151 /* Run the command in the context of a fake client */
7152 fakeClient->argc = argc;
7153 fakeClient->argv = argv;
7154 cmd->proc(fakeClient);
7155 /* Discard the reply objects list from the fake client */
7156 while(listLength(fakeClient->reply))
7157 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7158 /* Clean up, ready for the next command */
7159 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7160 zfree(argv);
7161 /* Handle swapping while loading big datasets when VM is on */
7162 loadedkeys++;
7163 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7164 while (zmalloc_used_memory() > server.vm_max_memory) {
7165 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7166 }
7167 }
7168 }
7169 fclose(fp);
7170 freeFakeClient(fakeClient);
7171 return REDIS_OK;
7172
7173 readerr:
7174 if (feof(fp)) {
7175 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7176 } else {
7177 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7178 }
7179 exit(1);
7180 fmterr:
7181 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7182 exit(1);
7183 }
7184
7185 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7186 static int fwriteBulk(FILE *fp, robj *obj) {
7187 char buf[128];
7188 int decrrc = 0;
7189
7190 /* Avoid the incr/decr ref count business if possible to help
7191 * copy-on-write (we are often in a child process when this function
7192 * is called).
7193 * Also makes sure that key objects don't get incrRefCount-ed when VM
7194 * is enabled */
7195 if (obj->encoding != REDIS_ENCODING_RAW) {
7196 obj = getDecodedObject(obj);
7197 decrrc = 1;
7198 }
7199 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7200 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7201 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7202 goto err;
7203 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7204 if (decrrc) decrRefCount(obj);
7205 return 1;
7206 err:
7207 if (decrrc) decrRefCount(obj);
7208 return 0;
7209 }
7210
7211 /* Write a double value in bulk format $<count>\r\n<payload>\r\n */
7212 static int fwriteBulkDouble(FILE *fp, double d) {
7213 char buf[128], dbuf[128];
7214
7215 snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
7216 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2);
7217 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
7218 if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0;
7219 return 1;
7220 }
7221
7222 /* Write a long value in bulk format $<count>\r\n<payload>\r\n */
7223 static int fwriteBulkLong(FILE *fp, long l) {
7224 char buf[128], lbuf[128];
7225
7226 snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
7227 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(lbuf)-2);
7228 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
7229 if (fwrite(lbuf,strlen(lbuf),1,fp) == 0) return 0;
7230 return 1;
7231 }
7232
7233 /* Write a sequence of commands able to fully rebuild the dataset into
7234 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7235 static int rewriteAppendOnlyFile(char *filename) {
7236 dictIterator *di = NULL;
7237 dictEntry *de;
7238 FILE *fp;
7239 char tmpfile[256];
7240 int j;
7241 time_t now = time(NULL);
7242
7243 /* Note that we have to use a different temp name here compared to the
7244 * one used by rewriteAppendOnlyFileBackground() function. */
7245 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7246 fp = fopen(tmpfile,"w");
7247 if (!fp) {
7248 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7249 return REDIS_ERR;
7250 }
7251 for (j = 0; j < server.dbnum; j++) {
7252 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7253 redisDb *db = server.db+j;
7254 dict *d = db->dict;
7255 if (dictSize(d) == 0) continue;
7256 di = dictGetIterator(d);
7257 if (!di) {
7258 fclose(fp);
7259 return REDIS_ERR;
7260 }
7261
7262 /* SELECT the new DB */
7263 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7264 if (fwriteBulkLong(fp,j) == 0) goto werr;
7265
7266 /* Iterate this DB writing every entry */
7267 while((de = dictNext(di)) != NULL) {
7268 robj *key, *o;
7269 time_t expiretime;
7270 int swapped;
7271
7272 key = dictGetEntryKey(de);
7273 /* If the value for this key is swapped, load a preview in memory.
7274 * We use a "swapped" flag to remember if we need to free the
7275 * value object instead to just increment the ref count anyway
7276 * in order to avoid copy-on-write of pages if we are forked() */
7277 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7278 key->storage == REDIS_VM_SWAPPING) {
7279 o = dictGetEntryVal(de);
7280 swapped = 0;
7281 } else {
7282 o = vmPreviewObject(key);
7283 swapped = 1;
7284 }
7285 expiretime = getExpire(db,key);
7286
7287 /* Save the key and associated value */
7288 if (o->type == REDIS_STRING) {
7289 /* Emit a SET command */
7290 char cmd[]="*3\r\n$3\r\nSET\r\n";
7291 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7292 /* Key and value */
7293 if (fwriteBulk(fp,key) == 0) goto werr;
7294 if (fwriteBulk(fp,o) == 0) goto werr;
7295 } else if (o->type == REDIS_LIST) {
7296 /* Emit the RPUSHes needed to rebuild the list */
7297 list *list = o->ptr;
7298 listNode *ln;
7299 listIter li;
7300
7301 listRewind(list,&li);
7302 while((ln = listNext(&li))) {
7303 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7304 robj *eleobj = listNodeValue(ln);
7305
7306 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7307 if (fwriteBulk(fp,key) == 0) goto werr;
7308 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7309 }
7310 } else if (o->type == REDIS_SET) {
7311 /* Emit the SADDs needed to rebuild the set */
7312 dict *set = o->ptr;
7313 dictIterator *di = dictGetIterator(set);
7314 dictEntry *de;
7315
7316 while((de = dictNext(di)) != NULL) {
7317 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7318 robj *eleobj = dictGetEntryKey(de);
7319
7320 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7321 if (fwriteBulk(fp,key) == 0) goto werr;
7322 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7323 }
7324 dictReleaseIterator(di);
7325 } else if (o->type == REDIS_ZSET) {
7326 /* Emit the ZADDs needed to rebuild the sorted set */
7327 zset *zs = o->ptr;
7328 dictIterator *di = dictGetIterator(zs->dict);
7329 dictEntry *de;
7330
7331 while((de = dictNext(di)) != NULL) {
7332 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7333 robj *eleobj = dictGetEntryKey(de);
7334 double *score = dictGetEntryVal(de);
7335
7336 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7337 if (fwriteBulk(fp,key) == 0) goto werr;
7338 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7339 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7340 }
7341 dictReleaseIterator(di);
7342 } else {
7343 redisAssert(0 != 0);
7344 }
7345 /* Save the expire time */
7346 if (expiretime != -1) {
7347 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7348 /* If this key is already expired skip it */
7349 if (expiretime < now) continue;
7350 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7351 if (fwriteBulk(fp,key) == 0) goto werr;
7352 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7353 }
7354 if (swapped) decrRefCount(o);
7355 }
7356 dictReleaseIterator(di);
7357 }
7358
7359 /* Make sure data will not remain on the OS's output buffers */
7360 fflush(fp);
7361 fsync(fileno(fp));
7362 fclose(fp);
7363
7364 /* Use RENAME to make sure the DB file is changed atomically only
7365 * if the generate DB file is ok. */
7366 if (rename(tmpfile,filename) == -1) {
7367 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7368 unlink(tmpfile);
7369 return REDIS_ERR;
7370 }
7371 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7372 return REDIS_OK;
7373
7374 werr:
7375 fclose(fp);
7376 unlink(tmpfile);
7377 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7378 if (di) dictReleaseIterator(di);
7379 return REDIS_ERR;
7380 }
7381
7382 /* This is how rewriting of the append only file in background works:
7383 *
7384 * 1) The user calls BGREWRITEAOF
7385 * 2) Redis calls this function, that forks():
7386 * 2a) the child rewrite the append only file in a temp file.
7387 * 2b) the parent accumulates differences in server.bgrewritebuf.
7388 * 3) When the child finished '2a' exists.
7389 * 4) The parent will trap the exit code, if it's OK, will append the
7390 * data accumulated into server.bgrewritebuf into the temp file, and
7391 * finally will rename(2) the temp file in the actual file name.
7392 * The the new file is reopened as the new append only file. Profit!
7393 */
7394 static int rewriteAppendOnlyFileBackground(void) {
7395 pid_t childpid;
7396
7397 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7398 if (server.vm_enabled) waitEmptyIOJobsQueue();
7399 if ((childpid = fork()) == 0) {
7400 /* Child */
7401 char tmpfile[256];
7402
7403 if (server.vm_enabled) vmReopenSwapFile();
7404 close(server.fd);
7405 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7406 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7407 _exit(0);
7408 } else {
7409 _exit(1);
7410 }
7411 } else {
7412 /* Parent */
7413 if (childpid == -1) {
7414 redisLog(REDIS_WARNING,
7415 "Can't rewrite append only file in background: fork: %s",
7416 strerror(errno));
7417 return REDIS_ERR;
7418 }
7419 redisLog(REDIS_NOTICE,
7420 "Background append only file rewriting started by pid %d",childpid);
7421 server.bgrewritechildpid = childpid;
7422 /* We set appendseldb to -1 in order to force the next call to the
7423 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7424 * accumulated by the parent into server.bgrewritebuf will start
7425 * with a SELECT statement and it will be safe to merge. */
7426 server.appendseldb = -1;
7427 return REDIS_OK;
7428 }
7429 return REDIS_OK; /* unreached */
7430 }
7431
7432 static void bgrewriteaofCommand(redisClient *c) {
7433 if (server.bgrewritechildpid != -1) {
7434 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7435 return;
7436 }
7437 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7438 char *status = "+Background append only file rewriting started\r\n";
7439 addReplySds(c,sdsnew(status));
7440 } else {
7441 addReply(c,shared.err);
7442 }
7443 }
7444
7445 static void aofRemoveTempFile(pid_t childpid) {
7446 char tmpfile[256];
7447
7448 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid);
7449 unlink(tmpfile);
7450 }
7451
7452 /* Virtual Memory is composed mainly of two subsystems:
7453 * - Blocking Virutal Memory
7454 * - Threaded Virtual Memory I/O
7455 * The two parts are not fully decoupled, but functions are split among two
7456 * different sections of the source code (delimited by comments) in order to
7457 * make more clear what functionality is about the blocking VM and what about
7458 * the threaded (not blocking) VM.
7459 *
7460 * Redis VM design:
7461 *
7462 * Redis VM is a blocking VM (one that blocks reading swapped values from
7463 * disk into memory when a value swapped out is needed in memory) that is made
7464 * unblocking by trying to examine the command argument vector in order to
7465 * load in background values that will likely be needed in order to exec
7466 * the command. The command is executed only once all the relevant keys
7467 * are loaded into memory.
7468 *
7469 * This basically is almost as simple of a blocking VM, but almost as parallel
7470 * as a fully non-blocking VM.
7471 */
7472
7473 /* =================== Virtual Memory - Blocking Side ====================== */
7474
7475 /* substitute the first occurrence of '%p' with the process pid in the
7476 * swap file name. */
7477 static void expandVmSwapFilename(void) {
7478 char *p = strstr(server.vm_swap_file,"%p");
7479 sds new;
7480
7481 if (!p) return;
7482 new = sdsempty();
7483 *p = '\0';
7484 new = sdscat(new,server.vm_swap_file);
7485 new = sdscatprintf(new,"%ld",(long) getpid());
7486 new = sdscat(new,p+2);
7487 zfree(server.vm_swap_file);
7488 server.vm_swap_file = new;
7489 }
7490
7491 static void vmInit(void) {
7492 off_t totsize;
7493 int pipefds[2];
7494 size_t stacksize;
7495
7496 if (server.vm_max_threads != 0)
7497 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7498
7499 expandVmSwapFilename();
7500 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
7501 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7502 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7503 }
7504 if (server.vm_fp == NULL) {
7505 redisLog(REDIS_WARNING,
7506 "Impossible to open the swap file: %s. Exiting.",
7507 strerror(errno));
7508 exit(1);
7509 }
7510 server.vm_fd = fileno(server.vm_fp);
7511 server.vm_next_page = 0;
7512 server.vm_near_pages = 0;
7513 server.vm_stats_used_pages = 0;
7514 server.vm_stats_swapped_objects = 0;
7515 server.vm_stats_swapouts = 0;
7516 server.vm_stats_swapins = 0;
7517 totsize = server.vm_pages*server.vm_page_size;
7518 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7519 if (ftruncate(server.vm_fd,totsize) == -1) {
7520 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7521 strerror(errno));
7522 exit(1);
7523 } else {
7524 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7525 }
7526 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
7527 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
7528 (long long) (server.vm_pages+7)/8, server.vm_pages);
7529 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
7530
7531 /* Initialize threaded I/O (used by Virtual Memory) */
7532 server.io_newjobs = listCreate();
7533 server.io_processing = listCreate();
7534 server.io_processed = listCreate();
7535 server.io_ready_clients = listCreate();
7536 pthread_mutex_init(&server.io_mutex,NULL);
7537 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7538 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
7539 server.io_active_threads = 0;
7540 if (pipe(pipefds) == -1) {
7541 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7542 ,strerror(errno));
7543 exit(1);
7544 }
7545 server.io_ready_pipe_read = pipefds[0];
7546 server.io_ready_pipe_write = pipefds[1];
7547 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
7548 /* LZF requires a lot of stack */
7549 pthread_attr_init(&server.io_threads_attr);
7550 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7551 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7552 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
7553 /* Listen for events in the threaded I/O pipe */
7554 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7555 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7556 oom("creating file event");
7557 }
7558
7559 /* Mark the page as used */
7560 static void vmMarkPageUsed(off_t page) {
7561 off_t byte = page/8;
7562 int bit = page&7;
7563 redisAssert(vmFreePage(page) == 1);
7564 server.vm_bitmap[byte] |= 1<<bit;
7565 }
7566
7567 /* Mark N contiguous pages as used, with 'page' being the first. */
7568 static void vmMarkPagesUsed(off_t page, off_t count) {
7569 off_t j;
7570
7571 for (j = 0; j < count; j++)
7572 vmMarkPageUsed(page+j);
7573 server.vm_stats_used_pages += count;
7574 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7575 (long long)count, (long long)page);
7576 }
7577
7578 /* Mark the page as free */
7579 static void vmMarkPageFree(off_t page) {
7580 off_t byte = page/8;
7581 int bit = page&7;
7582 redisAssert(vmFreePage(page) == 0);
7583 server.vm_bitmap[byte] &= ~(1<<bit);
7584 }
7585
7586 /* Mark N contiguous pages as free, with 'page' being the first. */
7587 static void vmMarkPagesFree(off_t page, off_t count) {
7588 off_t j;
7589
7590 for (j = 0; j < count; j++)
7591 vmMarkPageFree(page+j);
7592 server.vm_stats_used_pages -= count;
7593 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7594 (long long)count, (long long)page);
7595 }
7596
7597 /* Test if the page is free */
7598 static int vmFreePage(off_t page) {
7599 off_t byte = page/8;
7600 int bit = page&7;
7601 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
7602 }
7603
7604 /* Find N contiguous free pages storing the first page of the cluster in *first.
7605 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7606 * REDIS_ERR is returned.
7607 *
7608 * This function uses a simple algorithm: we try to allocate
7609 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7610 * again from the start of the swap file searching for free spaces.
7611 *
7612 * If it looks pretty clear that there are no free pages near our offset
7613 * we try to find less populated places doing a forward jump of
7614 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7615 * without hurry, and then we jump again and so forth...
7616 *
7617 * This function can be improved using a free list to avoid to guess
7618 * too much, since we could collect data about freed pages.
7619 *
7620 * note: I implemented this function just after watching an episode of
7621 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7622 */
7623 static int vmFindContiguousPages(off_t *first, off_t n) {
7624 off_t base, offset = 0, since_jump = 0, numfree = 0;
7625
7626 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7627 server.vm_near_pages = 0;
7628 server.vm_next_page = 0;
7629 }
7630 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7631 base = server.vm_next_page;
7632
7633 while(offset < server.vm_pages) {
7634 off_t this = base+offset;
7635
7636 /* If we overflow, restart from page zero */
7637 if (this >= server.vm_pages) {
7638 this -= server.vm_pages;
7639 if (this == 0) {
7640 /* Just overflowed, what we found on tail is no longer
7641 * interesting, as it's no longer contiguous. */
7642 numfree = 0;
7643 }
7644 }
7645 if (vmFreePage(this)) {
7646 /* This is a free page */
7647 numfree++;
7648 /* Already got N free pages? Return to the caller, with success */
7649 if (numfree == n) {
7650 *first = this-(n-1);
7651 server.vm_next_page = this+1;
7652 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
7653 return REDIS_OK;
7654 }
7655 } else {
7656 /* The current one is not a free page */
7657 numfree = 0;
7658 }
7659
7660 /* Fast-forward if the current page is not free and we already
7661 * searched enough near this place. */
7662 since_jump++;
7663 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7664 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7665 since_jump = 0;
7666 /* Note that even if we rewind after the jump, we are don't need
7667 * to make sure numfree is set to zero as we only jump *if* it
7668 * is set to zero. */
7669 } else {
7670 /* Otherwise just check the next page */
7671 offset++;
7672 }
7673 }
7674 return REDIS_ERR;
7675 }
7676
7677 /* Write the specified object at the specified page of the swap file */
7678 static int vmWriteObjectOnSwap(robj *o, off_t page) {
7679 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7680 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7681 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7682 redisLog(REDIS_WARNING,
7683 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7684 strerror(errno));
7685 return REDIS_ERR;
7686 }
7687 rdbSaveObject(server.vm_fp,o);
7688 fflush(server.vm_fp);
7689 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7690 return REDIS_OK;
7691 }
7692
7693 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7694 * needed to later retrieve the object into the key object.
7695 * If we can't find enough contiguous empty pages to swap the object on disk
7696 * REDIS_ERR is returned. */
7697 static int vmSwapObjectBlocking(robj *key, robj *val) {
7698 off_t pages = rdbSavedObjectPages(val,NULL);
7699 off_t page;
7700
7701 assert(key->storage == REDIS_VM_MEMORY);
7702 assert(key->refcount == 1);
7703 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
7704 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
7705 key->vm.page = page;
7706 key->vm.usedpages = pages;
7707 key->storage = REDIS_VM_SWAPPED;
7708 key->vtype = val->type;
7709 decrRefCount(val); /* Deallocate the object from memory. */
7710 vmMarkPagesUsed(page,pages);
7711 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7712 (unsigned char*) key->ptr,
7713 (unsigned long long) page, (unsigned long long) pages);
7714 server.vm_stats_swapped_objects++;
7715 server.vm_stats_swapouts++;
7716 return REDIS_OK;
7717 }
7718
7719 static robj *vmReadObjectFromSwap(off_t page, int type) {
7720 robj *o;
7721
7722 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7723 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7724 redisLog(REDIS_WARNING,
7725 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7726 strerror(errno));
7727 _exit(1);
7728 }
7729 o = rdbLoadObject(type,server.vm_fp);
7730 if (o == NULL) {
7731 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
7732 _exit(1);
7733 }
7734 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7735 return o;
7736 }
7737
7738 /* Load the value object relative to the 'key' object from swap to memory.
7739 * The newly allocated object is returned.
7740 *
7741 * If preview is true the unserialized object is returned to the caller but
7742 * no changes are made to the key object, nor the pages are marked as freed */
7743 static robj *vmGenericLoadObject(robj *key, int preview) {
7744 robj *val;
7745
7746 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
7747 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7748 if (!preview) {
7749 key->storage = REDIS_VM_MEMORY;
7750 key->vm.atime = server.unixtime;
7751 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7752 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7753 (unsigned char*) key->ptr);
7754 server.vm_stats_swapped_objects--;
7755 } else {
7756 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7757 (unsigned char*) key->ptr);
7758 }
7759 server.vm_stats_swapins++;
7760 return val;
7761 }
7762
7763 /* Plain object loading, from swap to memory */
7764 static robj *vmLoadObject(robj *key) {
7765 /* If we are loading the object in background, stop it, we
7766 * need to load this object synchronously ASAP. */
7767 if (key->storage == REDIS_VM_LOADING)
7768 vmCancelThreadedIOJob(key);
7769 return vmGenericLoadObject(key,0);
7770 }
7771
7772 /* Just load the value on disk, without to modify the key.
7773 * This is useful when we want to perform some operation on the value
7774 * without to really bring it from swap to memory, like while saving the
7775 * dataset or rewriting the append only log. */
7776 static robj *vmPreviewObject(robj *key) {
7777 return vmGenericLoadObject(key,1);
7778 }
7779
7780 /* How a good candidate is this object for swapping?
7781 * The better candidate it is, the greater the returned value.
7782 *
7783 * Currently we try to perform a fast estimation of the object size in
7784 * memory, and combine it with aging informations.
7785 *
7786 * Basically swappability = idle-time * log(estimated size)
7787 *
7788 * Bigger objects are preferred over smaller objects, but not
7789 * proportionally, this is why we use the logarithm. This algorithm is
7790 * just a first try and will probably be tuned later. */
7791 static double computeObjectSwappability(robj *o) {
7792 time_t age = server.unixtime - o->vm.atime;
7793 long asize = 0;
7794 list *l;
7795 dict *d;
7796 struct dictEntry *de;
7797 int z;
7798
7799 if (age <= 0) return 0;
7800 switch(o->type) {
7801 case REDIS_STRING:
7802 if (o->encoding != REDIS_ENCODING_RAW) {
7803 asize = sizeof(*o);
7804 } else {
7805 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7806 }
7807 break;
7808 case REDIS_LIST:
7809 l = o->ptr;
7810 listNode *ln = listFirst(l);
7811
7812 asize = sizeof(list);
7813 if (ln) {
7814 robj *ele = ln->value;
7815 long elesize;
7816
7817 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7818 (sizeof(*o)+sdslen(ele->ptr)) :
7819 sizeof(*o);
7820 asize += (sizeof(listNode)+elesize)*listLength(l);
7821 }
7822 break;
7823 case REDIS_SET:
7824 case REDIS_ZSET:
7825 z = (o->type == REDIS_ZSET);
7826 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7827
7828 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7829 if (z) asize += sizeof(zset)-sizeof(dict);
7830 if (dictSize(d)) {
7831 long elesize;
7832 robj *ele;
7833
7834 de = dictGetRandomKey(d);
7835 ele = dictGetEntryKey(de);
7836 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7837 (sizeof(*o)+sdslen(ele->ptr)) :
7838 sizeof(*o);
7839 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7840 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7841 }
7842 break;
7843 }
7844 return (double)age*log(1+asize);
7845 }
7846
7847 /* Try to swap an object that's a good candidate for swapping.
7848 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7849 * to swap any object at all.
7850 *
7851 * If 'usethreaded' is true, Redis will try to swap the object in background
7852 * using I/O threads. */
7853 static int vmSwapOneObject(int usethreads) {
7854 int j, i;
7855 struct dictEntry *best = NULL;
7856 double best_swappability = 0;
7857 redisDb *best_db = NULL;
7858 robj *key, *val;
7859
7860 for (j = 0; j < server.dbnum; j++) {
7861 redisDb *db = server.db+j;
7862 /* Why maxtries is set to 100?
7863 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7864 * are swappable objects */
7865 int maxtries = 100;
7866
7867 if (dictSize(db->dict) == 0) continue;
7868 for (i = 0; i < 5; i++) {
7869 dictEntry *de;
7870 double swappability;
7871
7872 if (maxtries) maxtries--;
7873 de = dictGetRandomKey(db->dict);
7874 key = dictGetEntryKey(de);
7875 val = dictGetEntryVal(de);
7876 /* Only swap objects that are currently in memory.
7877 *
7878 * Also don't swap shared objects if threaded VM is on, as we
7879 * try to ensure that the main thread does not touch the
7880 * object while the I/O thread is using it, but we can't
7881 * control other keys without adding additional mutex. */
7882 if (key->storage != REDIS_VM_MEMORY ||
7883 (server.vm_max_threads != 0 && val->refcount != 1)) {
7884 if (maxtries) i--; /* don't count this try */
7885 continue;
7886 }
7887 swappability = computeObjectSwappability(val);
7888 if (!best || swappability > best_swappability) {
7889 best = de;
7890 best_swappability = swappability;
7891 best_db = db;
7892 }
7893 }
7894 }
7895 if (best == NULL) return REDIS_ERR;
7896 key = dictGetEntryKey(best);
7897 val = dictGetEntryVal(best);
7898
7899 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
7900 key->ptr, best_swappability);
7901
7902 /* Unshare the key if needed */
7903 if (key->refcount > 1) {
7904 robj *newkey = dupStringObject(key);
7905 decrRefCount(key);
7906 key = dictGetEntryKey(best) = newkey;
7907 }
7908 /* Swap it */
7909 if (usethreads) {
7910 vmSwapObjectThreaded(key,val,best_db);
7911 return REDIS_OK;
7912 } else {
7913 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7914 dictGetEntryVal(best) = NULL;
7915 return REDIS_OK;
7916 } else {
7917 return REDIS_ERR;
7918 }
7919 }
7920 }
7921
7922 static int vmSwapOneObjectBlocking() {
7923 return vmSwapOneObject(0);
7924 }
7925
7926 static int vmSwapOneObjectThreaded() {
7927 return vmSwapOneObject(1);
7928 }
7929
7930 /* Return true if it's safe to swap out objects in a given moment.
7931 * Basically we don't want to swap objects out while there is a BGSAVE
7932 * or a BGAEOREWRITE running in backgroud. */
7933 static int vmCanSwapOut(void) {
7934 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7935 }
7936
7937 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7938 * and was deleted. Otherwise 0 is returned. */
7939 static int deleteIfSwapped(redisDb *db, robj *key) {
7940 dictEntry *de;
7941 robj *foundkey;
7942
7943 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7944 foundkey = dictGetEntryKey(de);
7945 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7946 deleteKey(db,key);
7947 return 1;
7948 }
7949
7950 /* =================== Virtual Memory - Threaded I/O ======================= */
7951
7952 static void freeIOJob(iojob *j) {
7953 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
7954 j->type == REDIS_IOJOB_DO_SWAP ||
7955 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
7956 decrRefCount(j->val);
7957 decrRefCount(j->key);
7958 zfree(j);
7959 }
7960
7961 /* Every time a thread finished a Job, it writes a byte into the write side
7962 * of an unix pipe in order to "awake" the main thread, and this function
7963 * is called. */
7964 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7965 int mask)
7966 {
7967 char buf[1];
7968 int retval, processed = 0, toprocess = -1, trytoswap = 1;
7969 REDIS_NOTUSED(el);
7970 REDIS_NOTUSED(mask);
7971 REDIS_NOTUSED(privdata);
7972
7973 /* For every byte we read in the read side of the pipe, there is one
7974 * I/O job completed to process. */
7975 while((retval = read(fd,buf,1)) == 1) {
7976 iojob *j;
7977 listNode *ln;
7978 robj *key;
7979 struct dictEntry *de;
7980
7981 redisLog(REDIS_DEBUG,"Processing I/O completed job");
7982
7983 /* Get the processed element (the oldest one) */
7984 lockThreadedIO();
7985 assert(listLength(server.io_processed) != 0);
7986 if (toprocess == -1) {
7987 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
7988 if (toprocess <= 0) toprocess = 1;
7989 }
7990 ln = listFirst(server.io_processed);
7991 j = ln->value;
7992 listDelNode(server.io_processed,ln);
7993 unlockThreadedIO();
7994 /* If this job is marked as canceled, just ignore it */
7995 if (j->canceled) {
7996 freeIOJob(j);
7997 continue;
7998 }
7999 /* Post process it in the main thread, as there are things we
8000 * can do just here to avoid race conditions and/or invasive locks */
8001 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8002 de = dictFind(j->db->dict,j->key);
8003 assert(de != NULL);
8004 key = dictGetEntryKey(de);
8005 if (j->type == REDIS_IOJOB_LOAD) {
8006 redisDb *db;
8007
8008 /* Key loaded, bring it at home */
8009 key->storage = REDIS_VM_MEMORY;
8010 key->vm.atime = server.unixtime;
8011 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8012 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8013 (unsigned char*) key->ptr);
8014 server.vm_stats_swapped_objects--;
8015 server.vm_stats_swapins++;
8016 dictGetEntryVal(de) = j->val;
8017 incrRefCount(j->val);
8018 db = j->db;
8019 freeIOJob(j);
8020 /* Handle clients waiting for this key to be loaded. */
8021 handleClientsBlockedOnSwappedKey(db,key);
8022 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8023 /* Now we know the amount of pages required to swap this object.
8024 * Let's find some space for it, and queue this task again
8025 * rebranded as REDIS_IOJOB_DO_SWAP. */
8026 if (!vmCanSwapOut() ||
8027 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8028 {
8029 /* Ooops... no space or we can't swap as there is
8030 * a fork()ed Redis trying to save stuff on disk. */
8031 freeIOJob(j);
8032 key->storage = REDIS_VM_MEMORY; /* undo operation */
8033 } else {
8034 /* Note that we need to mark this pages as used now,
8035 * if the job will be canceled, we'll mark them as freed
8036 * again. */
8037 vmMarkPagesUsed(j->page,j->pages);
8038 j->type = REDIS_IOJOB_DO_SWAP;
8039 lockThreadedIO();
8040 queueIOJob(j);
8041 unlockThreadedIO();
8042 }
8043 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8044 robj *val;
8045
8046 /* Key swapped. We can finally free some memory. */
8047 if (key->storage != REDIS_VM_SWAPPING) {
8048 printf("key->storage: %d\n",key->storage);
8049 printf("key->name: %s\n",(char*)key->ptr);
8050 printf("key->refcount: %d\n",key->refcount);
8051 printf("val: %p\n",(void*)j->val);
8052 printf("val->type: %d\n",j->val->type);
8053 printf("val->ptr: %s\n",(char*)j->val->ptr);
8054 }
8055 redisAssert(key->storage == REDIS_VM_SWAPPING);
8056 val = dictGetEntryVal(de);
8057 key->vm.page = j->page;
8058 key->vm.usedpages = j->pages;
8059 key->storage = REDIS_VM_SWAPPED;
8060 key->vtype = j->val->type;
8061 decrRefCount(val); /* Deallocate the object from memory. */
8062 dictGetEntryVal(de) = NULL;
8063 redisLog(REDIS_DEBUG,
8064 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8065 (unsigned char*) key->ptr,
8066 (unsigned long long) j->page, (unsigned long long) j->pages);
8067 server.vm_stats_swapped_objects++;
8068 server.vm_stats_swapouts++;
8069 freeIOJob(j);
8070 /* Put a few more swap requests in queue if we are still
8071 * out of memory */
8072 if (trytoswap && vmCanSwapOut() &&
8073 zmalloc_used_memory() > server.vm_max_memory)
8074 {
8075 int more = 1;
8076 while(more) {
8077 lockThreadedIO();
8078 more = listLength(server.io_newjobs) <
8079 (unsigned) server.vm_max_threads;
8080 unlockThreadedIO();
8081 /* Don't waste CPU time if swappable objects are rare. */
8082 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8083 trytoswap = 0;
8084 break;
8085 }
8086 }
8087 }
8088 }
8089 processed++;
8090 if (processed == toprocess) return;
8091 }
8092 if (retval < 0 && errno != EAGAIN) {
8093 redisLog(REDIS_WARNING,
8094 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8095 strerror(errno));
8096 }
8097 }
8098
8099 static void lockThreadedIO(void) {
8100 pthread_mutex_lock(&server.io_mutex);
8101 }
8102
8103 static void unlockThreadedIO(void) {
8104 pthread_mutex_unlock(&server.io_mutex);
8105 }
8106
8107 /* Remove the specified object from the threaded I/O queue if still not
8108 * processed, otherwise make sure to flag it as canceled. */
8109 static void vmCancelThreadedIOJob(robj *o) {
8110 list *lists[3] = {
8111 server.io_newjobs, /* 0 */
8112 server.io_processing, /* 1 */
8113 server.io_processed /* 2 */
8114 };
8115 int i;
8116
8117 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8118 again:
8119 lockThreadedIO();
8120 /* Search for a matching key in one of the queues */
8121 for (i = 0; i < 3; i++) {
8122 listNode *ln;
8123 listIter li;
8124
8125 listRewind(lists[i],&li);
8126 while ((ln = listNext(&li)) != NULL) {
8127 iojob *job = ln->value;
8128
8129 if (job->canceled) continue; /* Skip this, already canceled. */
8130 if (compareStringObjects(job->key,o) == 0) {
8131 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8132 (void*)job, (char*)o->ptr, job->type, i);
8133 /* Mark the pages as free since the swap didn't happened
8134 * or happened but is now discarded. */
8135 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8136 vmMarkPagesFree(job->page,job->pages);
8137 /* Cancel the job. It depends on the list the job is
8138 * living in. */
8139 switch(i) {
8140 case 0: /* io_newjobs */
8141 /* If the job was yet not processed the best thing to do
8142 * is to remove it from the queue at all */
8143 freeIOJob(job);
8144 listDelNode(lists[i],ln);
8145 break;
8146 case 1: /* io_processing */
8147 /* Oh Shi- the thread is messing with the Job:
8148 *
8149 * Probably it's accessing the object if this is a
8150 * PREPARE_SWAP or DO_SWAP job.
8151 * If it's a LOAD job it may be reading from disk and
8152 * if we don't wait for the job to terminate before to
8153 * cancel it, maybe in a few microseconds data can be
8154 * corrupted in this pages. So the short story is:
8155 *
8156 * Better to wait for the job to move into the
8157 * next queue (processed)... */
8158
8159 /* We try again and again until the job is completed. */
8160 unlockThreadedIO();
8161 /* But let's wait some time for the I/O thread
8162 * to finish with this job. After all this condition
8163 * should be very rare. */
8164 usleep(1);
8165 goto again;
8166 case 2: /* io_processed */
8167 /* The job was already processed, that's easy...
8168 * just mark it as canceled so that we'll ignore it
8169 * when processing completed jobs. */
8170 job->canceled = 1;
8171 break;
8172 }
8173 /* Finally we have to adjust the storage type of the object
8174 * in order to "UNDO" the operaiton. */
8175 if (o->storage == REDIS_VM_LOADING)
8176 o->storage = REDIS_VM_SWAPPED;
8177 else if (o->storage == REDIS_VM_SWAPPING)
8178 o->storage = REDIS_VM_MEMORY;
8179 unlockThreadedIO();
8180 return;
8181 }
8182 }
8183 }
8184 unlockThreadedIO();
8185 assert(1 != 1); /* We should never reach this */
8186 }
8187
8188 static void *IOThreadEntryPoint(void *arg) {
8189 iojob *j;
8190 listNode *ln;
8191 REDIS_NOTUSED(arg);
8192
8193 pthread_detach(pthread_self());
8194 while(1) {
8195 /* Get a new job to process */
8196 lockThreadedIO();
8197 if (listLength(server.io_newjobs) == 0) {
8198 /* No new jobs in queue, exit. */
8199 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8200 (long) pthread_self());
8201 server.io_active_threads--;
8202 unlockThreadedIO();
8203 return NULL;
8204 }
8205 ln = listFirst(server.io_newjobs);
8206 j = ln->value;
8207 listDelNode(server.io_newjobs,ln);
8208 /* Add the job in the processing queue */
8209 j->thread = pthread_self();
8210 listAddNodeTail(server.io_processing,j);
8211 ln = listLast(server.io_processing); /* We use ln later to remove it */
8212 unlockThreadedIO();
8213 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8214 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8215
8216 /* Process the Job */
8217 if (j->type == REDIS_IOJOB_LOAD) {
8218 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8219 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8220 FILE *fp = fopen("/dev/null","w+");
8221 j->pages = rdbSavedObjectPages(j->val,fp);
8222 fclose(fp);
8223 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8224 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8225 j->canceled = 1;
8226 }
8227
8228 /* Done: insert the job into the processed queue */
8229 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8230 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8231 lockThreadedIO();
8232 listDelNode(server.io_processing,ln);
8233 listAddNodeTail(server.io_processed,j);
8234 unlockThreadedIO();
8235
8236 /* Signal the main thread there is new stuff to process */
8237 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8238 }
8239 return NULL; /* never reached */
8240 }
8241
8242 static void spawnIOThread(void) {
8243 pthread_t thread;
8244 sigset_t mask, omask;
8245
8246 sigemptyset(&mask);
8247 sigaddset(&mask,SIGCHLD);
8248 sigaddset(&mask,SIGHUP);
8249 sigaddset(&mask,SIGPIPE);
8250 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8251 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8252 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8253 server.io_active_threads++;
8254 }
8255
8256 /* We need to wait for the last thread to exit before we are able to
8257 * fork() in order to BGSAVE or BGREWRITEAOF. */
8258 static void waitEmptyIOJobsQueue(void) {
8259 while(1) {
8260 int io_processed_len;
8261
8262 lockThreadedIO();
8263 if (listLength(server.io_newjobs) == 0 &&
8264 listLength(server.io_processing) == 0 &&
8265 server.io_active_threads == 0)
8266 {
8267 unlockThreadedIO();
8268 return;
8269 }
8270 /* While waiting for empty jobs queue condition we post-process some
8271 * finshed job, as I/O threads may be hanging trying to write against
8272 * the io_ready_pipe_write FD but there are so much pending jobs that
8273 * it's blocking. */
8274 io_processed_len = listLength(server.io_processed);
8275 unlockThreadedIO();
8276 if (io_processed_len) {
8277 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8278 usleep(1000); /* 1 millisecond */
8279 } else {
8280 usleep(10000); /* 10 milliseconds */
8281 }
8282 }
8283 }
8284
8285 static void vmReopenSwapFile(void) {
8286 /* Note: we don't close the old one as we are in the child process
8287 * and don't want to mess at all with the original file object. */
8288 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8289 if (server.vm_fp == NULL) {
8290 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8291 server.vm_swap_file);
8292 _exit(1);
8293 }
8294 server.vm_fd = fileno(server.vm_fp);
8295 }
8296
8297 /* This function must be called while with threaded IO locked */
8298 static void queueIOJob(iojob *j) {
8299 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8300 (void*)j, j->type, (char*)j->key->ptr);
8301 listAddNodeTail(server.io_newjobs,j);
8302 if (server.io_active_threads < server.vm_max_threads)
8303 spawnIOThread();
8304 }
8305
8306 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8307 iojob *j;
8308
8309 assert(key->storage == REDIS_VM_MEMORY);
8310 assert(key->refcount == 1);
8311
8312 j = zmalloc(sizeof(*j));
8313 j->type = REDIS_IOJOB_PREPARE_SWAP;
8314 j->db = db;
8315 j->key = dupStringObject(key);
8316 j->val = val;
8317 incrRefCount(val);
8318 j->canceled = 0;
8319 j->thread = (pthread_t) -1;
8320 key->storage = REDIS_VM_SWAPPING;
8321
8322 lockThreadedIO();
8323 queueIOJob(j);
8324 unlockThreadedIO();
8325 return REDIS_OK;
8326 }
8327
8328 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8329
8330 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8331 * If there is not already a job loading the key, it is craeted.
8332 * The key is added to the io_keys list in the client structure, and also
8333 * in the hash table mapping swapped keys to waiting clients, that is,
8334 * server.io_waited_keys. */
8335 static int waitForSwappedKey(redisClient *c, robj *key) {
8336 struct dictEntry *de;
8337 robj *o;
8338 list *l;
8339
8340 /* If the key does not exist or is already in RAM we don't need to
8341 * block the client at all. */
8342 de = dictFind(c->db->dict,key);
8343 if (de == NULL) return 0;
8344 o = dictGetEntryKey(de);
8345 if (o->storage == REDIS_VM_MEMORY) {
8346 return 0;
8347 } else if (o->storage == REDIS_VM_SWAPPING) {
8348 /* We were swapping the key, undo it! */
8349 vmCancelThreadedIOJob(o);
8350 return 0;
8351 }
8352
8353 /* OK: the key is either swapped, or being loaded just now. */
8354
8355 /* Add the key to the list of keys this client is waiting for.
8356 * This maps clients to keys they are waiting for. */
8357 listAddNodeTail(c->io_keys,key);
8358 incrRefCount(key);
8359
8360 /* Add the client to the swapped keys => clients waiting map. */
8361 de = dictFind(c->db->io_keys,key);
8362 if (de == NULL) {
8363 int retval;
8364
8365 /* For every key we take a list of clients blocked for it */
8366 l = listCreate();
8367 retval = dictAdd(c->db->io_keys,key,l);
8368 incrRefCount(key);
8369 assert(retval == DICT_OK);
8370 } else {
8371 l = dictGetEntryVal(de);
8372 }
8373 listAddNodeTail(l,c);
8374
8375 /* Are we already loading the key from disk? If not create a job */
8376 if (o->storage == REDIS_VM_SWAPPED) {
8377 iojob *j;
8378
8379 o->storage = REDIS_VM_LOADING;
8380 j = zmalloc(sizeof(*j));
8381 j->type = REDIS_IOJOB_LOAD;
8382 j->db = c->db;
8383 j->key = dupStringObject(key);
8384 j->key->vtype = o->vtype;
8385 j->page = o->vm.page;
8386 j->val = NULL;
8387 j->canceled = 0;
8388 j->thread = (pthread_t) -1;
8389 lockThreadedIO();
8390 queueIOJob(j);
8391 unlockThreadedIO();
8392 }
8393 return 1;
8394 }
8395
8396 /* Is this client attempting to run a command against swapped keys?
8397 * If so, block it ASAP, load the keys in background, then resume it.
8398 *
8399 * The important idea about this function is that it can fail! If keys will
8400 * still be swapped when the client is resumed, this key lookups will
8401 * just block loading keys from disk. In practical terms this should only
8402 * happen with SORT BY command or if there is a bug in this function.
8403 *
8404 * Return 1 if the client is marked as blocked, 0 if the client can
8405 * continue as the keys it is going to access appear to be in memory. */
8406 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8407 int j, last;
8408
8409 if (cmd->vm_firstkey == 0) return 0;
8410 last = cmd->vm_lastkey;
8411 if (last < 0) last = c->argc+last;
8412 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8413 waitForSwappedKey(c,c->argv[j]);
8414 /* If the client was blocked for at least one key, mark it as blocked. */
8415 if (listLength(c->io_keys)) {
8416 c->flags |= REDIS_IO_WAIT;
8417 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8418 server.vm_blocked_clients++;
8419 return 1;
8420 } else {
8421 return 0;
8422 }
8423 }
8424
8425 /* Remove the 'key' from the list of blocked keys for a given client.
8426 *
8427 * The function returns 1 when there are no longer blocking keys after
8428 * the current one was removed (and the client can be unblocked). */
8429 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8430 list *l;
8431 listNode *ln;
8432 listIter li;
8433 struct dictEntry *de;
8434
8435 /* Remove the key from the list of keys this client is waiting for. */
8436 listRewind(c->io_keys,&li);
8437 while ((ln = listNext(&li)) != NULL) {
8438 if (compareStringObjects(ln->value,key) == 0) {
8439 listDelNode(c->io_keys,ln);
8440 break;
8441 }
8442 }
8443 assert(ln != NULL);
8444
8445 /* Remove the client form the key => waiting clients map. */
8446 de = dictFind(c->db->io_keys,key);
8447 assert(de != NULL);
8448 l = dictGetEntryVal(de);
8449 ln = listSearchKey(l,c);
8450 assert(ln != NULL);
8451 listDelNode(l,ln);
8452 if (listLength(l) == 0)
8453 dictDelete(c->db->io_keys,key);
8454
8455 return listLength(c->io_keys) == 0;
8456 }
8457
8458 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8459 struct dictEntry *de;
8460 list *l;
8461 listNode *ln;
8462 int len;
8463
8464 de = dictFind(db->io_keys,key);
8465 if (!de) return;
8466
8467 l = dictGetEntryVal(de);
8468 len = listLength(l);
8469 /* Note: we can't use something like while(listLength(l)) as the list
8470 * can be freed by the calling function when we remove the last element. */
8471 while (len--) {
8472 ln = listFirst(l);
8473 redisClient *c = ln->value;
8474
8475 if (dontWaitForSwappedKey(c,key)) {
8476 /* Put the client in the list of clients ready to go as we
8477 * loaded all the keys about it. */
8478 listAddNodeTail(server.io_ready_clients,c);
8479 }
8480 }
8481 }
8482
8483 /* ================================= Debugging ============================== */
8484
8485 static void debugCommand(redisClient *c) {
8486 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8487 *((char*)-1) = 'x';
8488 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8489 if (rdbSave(server.dbfilename) != REDIS_OK) {
8490 addReply(c,shared.err);
8491 return;
8492 }
8493 emptyDb();
8494 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8495 addReply(c,shared.err);
8496 return;
8497 }
8498 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8499 addReply(c,shared.ok);
8500 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8501 emptyDb();
8502 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8503 addReply(c,shared.err);
8504 return;
8505 }
8506 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8507 addReply(c,shared.ok);
8508 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8509 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8510 robj *key, *val;
8511
8512 if (!de) {
8513 addReply(c,shared.nokeyerr);
8514 return;
8515 }
8516 key = dictGetEntryKey(de);
8517 val = dictGetEntryVal(de);
8518 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8519 key->storage == REDIS_VM_SWAPPING)) {
8520 addReplySds(c,sdscatprintf(sdsempty(),
8521 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8522 "encoding:%d serializedlength:%lld\r\n",
8523 (void*)key, key->refcount, (void*)val, val->refcount,
8524 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
8525 } else {
8526 addReplySds(c,sdscatprintf(sdsempty(),
8527 "+Key at:%p refcount:%d, value swapped at: page %llu "
8528 "using %llu pages\r\n",
8529 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8530 (unsigned long long) key->vm.usedpages));
8531 }
8532 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8533 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8534 robj *key, *val;
8535
8536 if (!server.vm_enabled) {
8537 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8538 return;
8539 }
8540 if (!de) {
8541 addReply(c,shared.nokeyerr);
8542 return;
8543 }
8544 key = dictGetEntryKey(de);
8545 val = dictGetEntryVal(de);
8546 /* If the key is shared we want to create a copy */
8547 if (key->refcount > 1) {
8548 robj *newkey = dupStringObject(key);
8549 decrRefCount(key);
8550 key = dictGetEntryKey(de) = newkey;
8551 }
8552 /* Swap it */
8553 if (key->storage != REDIS_VM_MEMORY) {
8554 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
8555 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8556 dictGetEntryVal(de) = NULL;
8557 addReply(c,shared.ok);
8558 } else {
8559 addReply(c,shared.err);
8560 }
8561 } else {
8562 addReplySds(c,sdsnew(
8563 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8564 }
8565 }
8566
8567 static void _redisAssert(char *estr, char *file, int line) {
8568 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
8569 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
8570 #ifdef HAVE_BACKTRACE
8571 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8572 *((char*)-1) = 'x';
8573 #endif
8574 }
8575
8576 /* =================================== Main! ================================ */
8577
8578 #ifdef __linux__
8579 int linuxOvercommitMemoryValue(void) {
8580 FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
8581 char buf[64];
8582
8583 if (!fp) return -1;
8584 if (fgets(buf,64,fp) == NULL) {
8585 fclose(fp);
8586 return -1;
8587 }
8588 fclose(fp);
8589
8590 return atoi(buf);
8591 }
8592
8593 void linuxOvercommitMemoryWarning(void) {
8594 if (linuxOvercommitMemoryValue() == 0) {
8595 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8596 }
8597 }
8598 #endif /* __linux__ */
8599
8600 static void daemonize(void) {
8601 int fd;
8602 FILE *fp;
8603
8604 if (fork() != 0) exit(0); /* parent exits */
8605 setsid(); /* create a new session */
8606
8607 /* Every output goes to /dev/null. If Redis is daemonized but
8608 * the 'logfile' is set to 'stdout' in the configuration file
8609 * it will not log at all. */
8610 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8611 dup2(fd, STDIN_FILENO);
8612 dup2(fd, STDOUT_FILENO);
8613 dup2(fd, STDERR_FILENO);
8614 if (fd > STDERR_FILENO) close(fd);
8615 }
8616 /* Try to write the pid file */
8617 fp = fopen(server.pidfile,"w");
8618 if (fp) {
8619 fprintf(fp,"%d\n",getpid());
8620 fclose(fp);
8621 }
8622 }
8623
8624 int main(int argc, char **argv) {
8625 time_t start;
8626
8627 initServerConfig();
8628 if (argc == 2) {
8629 resetServerSaveParams();
8630 loadServerConfig(argv[1]);
8631 } else if (argc > 2) {
8632 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8633 exit(1);
8634 } else {
8635 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8636 }
8637 if (server.daemonize) daemonize();
8638 initServer();
8639 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8640 #ifdef __linux__
8641 linuxOvercommitMemoryWarning();
8642 #endif
8643 start = time(NULL);
8644 if (server.appendonly) {
8645 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
8646 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
8647 } else {
8648 if (rdbLoad(server.dbfilename) == REDIS_OK)
8649 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
8650 }
8651 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
8652 aeSetBeforeSleepProc(server.el,beforeSleep);
8653 aeMain(server.el);
8654 aeDeleteEventLoop(server.el);
8655 return 0;
8656 }
8657
8658 /* ============================= Backtrace support ========================= */
8659
8660 #ifdef HAVE_BACKTRACE
8661 static char *findFuncName(void *pointer, unsigned long *offset);
8662
8663 static void *getMcontextEip(ucontext_t *uc) {
8664 #if defined(__FreeBSD__)
8665 return (void*) uc->uc_mcontext.mc_eip;
8666 #elif defined(__dietlibc__)
8667 return (void*) uc->uc_mcontext.eip;
8668 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8669 #if __x86_64__
8670 return (void*) uc->uc_mcontext->__ss.__rip;
8671 #else
8672 return (void*) uc->uc_mcontext->__ss.__eip;
8673 #endif
8674 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8675 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8676 return (void*) uc->uc_mcontext->__ss.__rip;
8677 #else
8678 return (void*) uc->uc_mcontext->__ss.__eip;
8679 #endif
8680 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8681 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
8682 #elif defined(__ia64__) /* Linux IA64 */
8683 return (void*) uc->uc_mcontext.sc_ip;
8684 #else
8685 return NULL;
8686 #endif
8687 }
8688
8689 static void segvHandler(int sig, siginfo_t *info, void *secret) {
8690 void *trace[100];
8691 char **messages = NULL;
8692 int i, trace_size = 0;
8693 unsigned long offset=0;
8694 ucontext_t *uc = (ucontext_t*) secret;
8695 sds infostring;
8696 REDIS_NOTUSED(info);
8697
8698 redisLog(REDIS_WARNING,
8699 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
8700 infostring = genRedisInfoString();
8701 redisLog(REDIS_WARNING, "%s",infostring);
8702 /* It's not safe to sdsfree() the returned string under memory
8703 * corruption conditions. Let it leak as we are going to abort */
8704
8705 trace_size = backtrace(trace, 100);
8706 /* overwrite sigaction with caller's address */
8707 if (getMcontextEip(uc) != NULL) {
8708 trace[1] = getMcontextEip(uc);
8709 }
8710 messages = backtrace_symbols(trace, trace_size);
8711
8712 for (i=1; i<trace_size; ++i) {
8713 char *fn = findFuncName(trace[i], &offset), *p;
8714
8715 p = strchr(messages[i],'+');
8716 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8717 redisLog(REDIS_WARNING,"%s", messages[i]);
8718 } else {
8719 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8720 }
8721 }
8722 /* free(messages); Don't call free() with possibly corrupted memory. */
8723 _exit(0);
8724 }
8725
8726 static void setupSigSegvAction(void) {
8727 struct sigaction act;
8728
8729 sigemptyset (&act.sa_mask);
8730 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8731 * is used. Otherwise, sa_handler is used */
8732 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8733 act.sa_sigaction = segvHandler;
8734 sigaction (SIGSEGV, &act, NULL);
8735 sigaction (SIGBUS, &act, NULL);
8736 sigaction (SIGFPE, &act, NULL);
8737 sigaction (SIGILL, &act, NULL);
8738 sigaction (SIGBUS, &act, NULL);
8739 return;
8740 }
8741
8742 #include "staticsymbols.h"
8743 /* This function try to convert a pointer into a function name. It's used in
8744 * oreder to provide a backtrace under segmentation fault that's able to
8745 * display functions declared as static (otherwise the backtrace is useless). */
8746 static char *findFuncName(void *pointer, unsigned long *offset){
8747 int i, ret = -1;
8748 unsigned long off, minoff = 0;
8749
8750 /* Try to match against the Symbol with the smallest offset */
8751 for (i=0; symsTable[i].pointer; i++) {
8752 unsigned long lp = (unsigned long) pointer;
8753
8754 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8755 off=lp-symsTable[i].pointer;
8756 if (ret < 0 || off < minoff) {
8757 minoff=off;
8758 ret=i;
8759 }
8760 }
8761 }
8762 if (ret == -1) return NULL;
8763 *offset = minoff;
8764 return symsTable[ret].name;
8765 }
8766 #else /* HAVE_BACKTRACE */
8767 static void setupSigSegvAction(void) {
8768 }
8769 #endif /* HAVE_BACKTRACE */
8770
8771
8772
8773 /* The End */
8774
8775
8776