]> git.saurik.com Git - redis.git/blob - redis.c
456596daedcd128755002bfb37f2e540cdff220a
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78
79 /* Error codes */
80 #define REDIS_OK 0
81 #define REDIS_ERR -1
82
83 /* Static server configuration */
84 #define REDIS_SERVERPORT 6379 /* TCP port */
85 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
86 #define REDIS_IOBUF_LEN 1024
87 #define REDIS_LOADBUF_LEN 1024
88 #define REDIS_STATIC_ARGS 4
89 #define REDIS_DEFAULT_DBNUM 16
90 #define REDIS_CONFIGLINE_MAX 1024
91 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
92 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
93 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
94 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
95 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
96
97 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
98 #define REDIS_WRITEV_THRESHOLD 3
99 /* Max number of iovecs used for each writev call */
100 #define REDIS_WRITEV_IOVEC_COUNT 256
101
102 /* Hash table parameters */
103 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
104
105 /* Command flags */
106 #define REDIS_CMD_BULK 1 /* Bulk write command */
107 #define REDIS_CMD_INLINE 2 /* Inline command */
108 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
109 this flag will return an error when the 'maxmemory' option is set in the
110 config file and the server is using more than maxmemory bytes of memory.
111 In short these commands are denied on low memory conditions. */
112 #define REDIS_CMD_DENYOOM 4
113
114 /* Object types */
115 #define REDIS_STRING 0
116 #define REDIS_LIST 1
117 #define REDIS_SET 2
118 #define REDIS_ZSET 3
119 #define REDIS_HASH 4
120
121 /* Objects encoding */
122 #define REDIS_ENCODING_RAW 0 /* Raw representation */
123 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
124
125 /* Object types only used for dumping to disk */
126 #define REDIS_EXPIRETIME 253
127 #define REDIS_SELECTDB 254
128 #define REDIS_EOF 255
129
130 /* Defines related to the dump file format. To store 32 bits lengths for short
131 * keys requires a lot of space, so we check the most significant 2 bits of
132 * the first byte to interpret the length:
133 *
134 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
135 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
136 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
137 * 11|000000 this means: specially encoded object will follow. The six bits
138 * number specify the kind of object that follows.
139 * See the REDIS_RDB_ENC_* defines.
140 *
141 * Lengths up to 63 are stored using a single byte, most DB keys, and many
142 * values, will fit inside. */
143 #define REDIS_RDB_6BITLEN 0
144 #define REDIS_RDB_14BITLEN 1
145 #define REDIS_RDB_32BITLEN 2
146 #define REDIS_RDB_ENCVAL 3
147 #define REDIS_RDB_LENERR UINT_MAX
148
149 /* When a length of a string object stored on disk has the first two bits
150 * set, the remaining six bits specify a special encoding for the object
151 * according to the following defines: */
152 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
153 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
154 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
155 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
156
157 /* Virtual memory object->where field. */
158 #define REDIS_VM_MEMORY 0 /* The object is on memory */
159 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
160 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
161 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
162
163 /* Virtual memory static configuration stuff.
164 * Check vmFindContiguousPages() to know more about these magic numbers. */
165 #define REDIS_VM_MAX_NEAR_PAGES 65536
166 #define REDIS_VM_MAX_RANDOM_JUMP 4096
167 #define REDIS_VM_MAX_THREADS 32
168 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
169 /* The following is the *percentage* of completed I/O jobs to process when the
170 * handler is called. While Virtual Memory I/O operations are performed by
171 * threads, these operations must be processed by the main thread when completed
172 * in order to take effect. */
173 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
174
175 /* Client flags */
176 #define REDIS_SLAVE 1 /* This client is a slave server */
177 #define REDIS_MASTER 2 /* This client is a master server */
178 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
179 #define REDIS_MULTI 8 /* This client is in a MULTI context */
180 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
181 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
182
183 /* Slave replication state - slave side */
184 #define REDIS_REPL_NONE 0 /* No active replication */
185 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
186 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
187
188 /* Slave replication state - from the point of view of master
189 * Note that in SEND_BULK and ONLINE state the slave receives new updates
190 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
191 * to start the next background saving in order to send updates to it. */
192 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
193 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
194 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
195 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
196
197 /* List related stuff */
198 #define REDIS_HEAD 0
199 #define REDIS_TAIL 1
200
201 /* Sort operations */
202 #define REDIS_SORT_GET 0
203 #define REDIS_SORT_ASC 1
204 #define REDIS_SORT_DESC 2
205 #define REDIS_SORTKEY_MAX 1024
206
207 /* Log levels */
208 #define REDIS_DEBUG 0
209 #define REDIS_VERBOSE 1
210 #define REDIS_NOTICE 2
211 #define REDIS_WARNING 3
212
213 /* Anti-warning macro... */
214 #define REDIS_NOTUSED(V) ((void) V)
215
216 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
217 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
218
219 /* Append only defines */
220 #define APPENDFSYNC_NO 0
221 #define APPENDFSYNC_ALWAYS 1
222 #define APPENDFSYNC_EVERYSEC 2
223
224 /* We can print the stacktrace, so our assert is defined this way:
 * when the expression _e evaluates to false, _redisAssert() is called with
 * the stringified expression, the current file name and line number, and
 * the process is then terminated with _exit(1). When _e is true the macro
 * expands to a no-op of type void. */
225 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
226 static void _redisAssert(char *estr, char *file, int line);
227
228 /*================================= Data types ============================== */
229
230 /* A redis object, that is a type able to hold a string / list / set */
231
232 /* The VM object structure: swap-file position, size in pages and last
 * access time of an object. It is embedded as the last member of
 * struct redisObject.
 * NOTE(review): the trailing `vm` declarator below also defines a file-scope
 * variable of this type named `vm` -- confirm whether that is intentional. */
233 struct redisObjectVM {
234 off_t page; /* the page at which the object is stored on disk */
235 off_t usedpages; /* number of pages used on disk */
236 time_t atime; /* Last access time */
237 } vm;
238
239 /* The actual Redis Object */
240 typedef struct redisObject {
241 void *ptr; /* payload pointer; presumably interpreted according to type/encoding */
242 unsigned char type; /* object type, e.g. REDIS_STRING (see the "Object types" defines) */
243 unsigned char encoding; /* e.g. REDIS_ENCODING_RAW (see the "Objects encoding" defines) */
244 unsigned char storage; /* If this object is a key, where is the value?
245 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
246 unsigned char vtype; /* If this object is a key, and value is swapped out,
247 * this is the type of the swapped out object. */
248 int refcount; /* reference count (initStaticStringObject sets it to 1) */
249 /* VM fields, these are only allocated if VM is active, otherwise the
250 * object allocation function will just allocate
251 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
252 * Redis without VM active will not have any overhead. */
253 struct redisObjectVM vm;
254 } robj;
255
/* Macro used to initialize a Redis string object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj lvalue to fill in, _ptr is the value stored in its ptr
 * field. refcount is set to 1, type to REDIS_STRING and encoding to
 * REDIS_ENCODING_RAW. The storage field is written only when
 * server.vm_enabled is set; vtype and the vm sub-structure are not touched.
 *
 * The arguments are parenthesized so that expressions such as *p or a+b
 * expand safely, and the expansion deliberately does NOT end with ';' so
 * that "initStaticStringObject(o,p);" is exactly one statement and can be
 * used in an unbraced if/else. _var is evaluated several times: pass a
 * plain lvalue without side effects. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
267
/* State of a single Redis database: its keyspace plus the auxiliary
 * dictionaries for expires, blocking operations and VM I/O waits. */
268 typedef struct redisDb {
269 dict *dict; /* The keyspace for this DB */
270 dict *expires; /* Timeout of keys with a timeout set */
271 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
272 dict *io_keys; /* Keys with clients waiting for VM I/O */
273 int id; /* database identifier -- presumably its index, TODO confirm */
274 } redisDb;
275
276 /* Client MULTI/EXEC state */
/* One command recorded inside a MULTI context: its argument vector,
 * argument count and the command table entry it refers to. */
277 typedef struct multiCmd {
278 robj **argv; /* argument vector of the recorded command */
279 int argc; /* number of elements in argv */
280 struct redisCommand *cmd; /* command descriptor for this entry */
281 } multiCmd;
282
/* The list of commands recorded for a client in a MULTI context. */
283 typedef struct multiState {
284 multiCmd *commands; /* Array of MULTI commands */
285 int count; /* Total number of MULTI commands */
286 } multiState;
287
288 /* With multiplexing we need to take per-client state.
289 * Clients are taken in a linked list. */
290 typedef struct redisClient {
291 int fd; /* file descriptor of this client (presumably its socket) */
292 redisDb *db; /* database this client operates on -- presumably the selected one */
293 int dictid;
294 sds querybuf; /* input buffer (sds string) */
295 robj **argv, **mbargv; /* argument vectors; mbargv presumably for multi bulk requests */
296 int argc, mbargc; /* element counts of argv and mbargv */
297 int bulklen; /* bulk read len. -1 if not in bulk read mode */
298 int multibulk; /* multi bulk command format active */
299 list *reply; /* list of pending reply objects -- presumably robj, TODO confirm */
300 int sentlen;
301 time_t lastinteraction; /* time of the last interaction, used for timeout */
302 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
303 int slaveseldb; /* slave selected db, if this client is a slave */
304 int authenticated; /* when requirepass is non-NULL */
305 int replstate; /* replication state if this is a slave */
306 int repldbfd; /* replication DB file descriptor */
307 long repldboff; /* replication DB file offset */
308 off_t repldbsize; /* replication DB file size */
309 multiState mstate; /* MULTI/EXEC state */
310 robj **blockingkeys; /* The key we are waiting to terminate a blocking
311 * operation such as BLPOP. Otherwise NULL. */
312 int blockingkeysnum; /* Number of blocking keys */
313 time_t blockingto; /* Blocking operation timeout. If UNIX current time
314 * is >= blockingto then the operation timed out. */
315 list *io_keys; /* Keys this client is waiting to be loaded from the
316 * swap file in order to continue. */
317 } redisClient;
318
/* One save point: a (seconds, changes) pair, stored in the
 * server.saveparams array.
 * NOTE(review): presumably a save is triggered once at least `changes`
 * modifications happened and `seconds` elapsed -- confirm against the code
 * that reads saveparams. */
319 struct saveparam {
320 time_t seconds;
321 int changes;
322 };
323
324 /* Global server state structure */
325 struct redisServer {
326 int port; /* TCP port (see REDIS_SERVERPORT) */
327 int fd; /* presumably the listening socket descriptor -- TODO confirm */
328 redisDb *db; /* databases; presumably an array of dbnum elements */
329 dict *sharingpool; /* Pool used for object sharing */
330 unsigned int sharingpoolsize;
331 long long dirty; /* changes to DB from the last save */
332 list *clients;
333 list *slaves, *monitors;
334 char neterr[ANET_ERR_LEN];
335 aeEventLoop *el;
336 int cronloops; /* number of times the cron function run */
337 list *objfreelist; /* A list of freed objects to avoid malloc() */
338 time_t lastsave; /* Unix time of last save succeeded */
339 /* Fields used only for stats */
340 time_t stat_starttime; /* server start time */
341 long long stat_numcommands; /* number of processed commands */
342 long long stat_numconnections; /* number of connections received */
343 /* Configuration */
344 int verbosity;
345 int glueoutputbuf;
346 int maxidletime;
347 int dbnum;
348 int daemonize;
349 int appendonly;
350 int appendfsync; /* one of the APPENDFSYNC_* values, presumably */
351 time_t lastfsync;
352 int appendfd;
353 int appendseldb;
354 char *pidfile;
355 pid_t bgsavechildpid;
356 pid_t bgrewritechildpid;
357 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
358 struct saveparam *saveparams;
359 int saveparamslen;
360 char *logfile;
361 char *bindaddr;
362 char *dbfilename;
363 char *appendfilename;
364 char *requirepass;
365 int shareobjects;
366 int rdbcompression;
367 /* Replication related */
368 int isslave;
369 char *masterauth;
370 char *masterhost;
371 int masterport;
372 redisClient *master; /* client that is master for this slave */
373 int replstate; /* slave-side state, presumably one of REDIS_REPL_NONE/CONNECT/CONNECTED */
374 unsigned int maxclients;
375 unsigned long long maxmemory;
376 unsigned int blpop_blocked_clients;
377 unsigned int vm_blocked_clients;
378 /* Sort parameters - qsort_r() is only available under BSD so we
379 * have to take this state global, in order to pass it to sortCompare() */
380 int sort_desc;
381 int sort_alpha;
382 int sort_bypattern;
383 /* Virtual memory configuration */
384 int vm_enabled;
385 char *vm_swap_file;
386 off_t vm_page_size;
387 off_t vm_pages;
388 unsigned long long vm_max_memory;
389 /* Virtual memory state */
390 FILE *vm_fp;
391 int vm_fd;
392 off_t vm_next_page; /* Next probably empty page */
393 off_t vm_near_pages; /* Number of pages allocated sequentially */
394 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
395 time_t unixtime; /* Unix time sampled every second. */
396 /* Virtual memory I/O threads stuff */
397 /* An I/O thread processes an element taken from the io_newjobs queue and
398 * puts the result of the operation in the io_processed list. While the
399 * job is being processed, it's put on the io_processing queue. */
400 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
401 list *io_processing; /* List of VM I/O jobs being processed */
402 list *io_processed; /* List of VM I/O jobs already processed */
403 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
404 pthread_mutex_t io_mutex; /* lock to access the I/O job lists (the original comment named them io_jobs/io_done/io_thread_job -- NOTE(review): confirm exact scope) */
405 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
406 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
407 pthread_attr_t io_threads_attr; /* attributes for threads creation */
408 int io_active_threads; /* Number of running I/O threads */
409 int vm_max_threads; /* Max number of I/O threads running at the same time */
410 /* Our main thread is blocked on the event loop, looking for sockets ready
411 * to be read or written, so when a threaded I/O operation is ready to be
412 * processed by the main thread, the I/O thread will use a unix pipe to
413 * awake the main thread. The followings are the two pipe FDs. */
414 int io_ready_pipe_read;
415 int io_ready_pipe_write;
416 /* Virtual memory stats */
417 unsigned long long vm_stats_used_pages;
418 unsigned long long vm_stats_swapped_objects;
419 unsigned long long vm_stats_swapouts;
420 unsigned long long vm_stats_swapins;
421 FILE *devnull;
422 };
423
/* Signature shared by every command implementation: it receives a pointer
 * to the client the command is executed for. */
424 typedef void redisCommandProc(redisClient *c);
/* Descriptor of a single command. The cmdTable entries are positional
 * initializers of this struct, so the field order below must not change. */
425 struct redisCommand {
426 char *name; /* command name */
427 redisCommandProc *proc; /* function implementing the command */
428 int arity; /* argument count; cmdTable also uses negative values -- NOTE(review): presumably "at least N", confirm */
429 int flags; /* bitwise OR of the REDIS_CMD_* command flags */
430 /* What keys should be loaded in background when calling this command? */
431 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
432 int vm_lastkey; /* The last argument that's a key */
433 int vm_keystep; /* The step between first and last key */
434 };
435
/* A (name, address) pair describing a function symbol.
 * NOTE(review): presumably used to map addresses to function names when
 * printing a stack trace -- confirm against the users of this struct. */
436 struct redisFunctionSym {
437 char *name; /* symbol name */
438 unsigned long pointer; /* address stored as an integer */
439 };
440
/* One element handled by SORT: the object itself plus the value it is
 * compared by, which is either a numeric score or another object
 * (the two alternatives share storage through the union). */
441 typedef struct _redisSortObject {
442 robj *obj; /* the element being sorted */
443 union {
444 double score; /* numeric sort weight */
445 robj *cmpobj; /* object to compare by -- presumably for alpha/pattern sorts, TODO confirm */
446 } u;
447 } redisSortObject;
448
/* One operation attached to a SORT invocation: an operation type and the
 * pattern it applies to. */
449 typedef struct _redisSortOperation {
450 int type; /* presumably one of the REDIS_SORT_* values (e.g. REDIS_SORT_GET) -- confirm */
451 robj *pattern; /* pattern argument of the operation */
452 } redisSortOperation;
453
454 /* ZSETs use a specialized version of Skiplists */
455
/* A skiplist node: a (score, object) pair with forward links, a backward
 * link and per-link span counters. */
456 typedef struct zskiplistNode {
457 struct zskiplistNode **forward; /* array of forward pointers -- presumably one per level */
458 struct zskiplistNode *backward; /* previous node */
459 unsigned int *span; /* array of span counters -- presumably used for rank computation, TODO confirm */
460 double score; /* sort key of the element */
461 robj *obj; /* the element itself */
462 } zskiplistNode;
463
/* The skiplist itself: header and tail nodes, number of elements and the
 * current level. */
464 typedef struct zskiplist {
465 struct zskiplistNode *header, *tail;
466 unsigned long length; /* number of nodes -- presumably excluding the header, TODO confirm */
467 int level; /* current level; ZSKIPLIST_MAXLEVEL is presumably its upper bound */
468 } zskiplist;
469
/* A sorted set: a dictionary paired with a skiplist.
 * NOTE(review): presumably the dict maps members to scores while the
 * skiplist keeps them ordered by score -- confirm in the ZSET commands. */
470 typedef struct zset {
471 dict *dict;
472 zskiplist *zsl;
473 } zset;
474
475 /* Our shared "common" objects: a set of robj pointers grouped in a single
 * file-scope instance named `shared`. Judging by the member names these are
 * frequently used reply fragments (crlf, ok, err, ...) and the
 * select0..select9 objects -- confirm where they are created. */
476
477 struct sharedObjectsStruct {
478 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
479 *colon, *nullbulk, *nullmultibulk, *queued,
480 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
481 *outofrangeerr, *plus,
482 *select0, *select1, *select2, *select3, *select4,
483 *select5, *select6, *select7, *select8, *select9;
484 } shared;
485
486 /* Global vars that are actually used as constants. The following double
487 * values are used for double on-disk serialization, and are initialized
488 * at runtime to avoid strange compiler optimizations. */
489
490 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
491
492 /* VM threaded I/O request message */
493 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
494 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
495 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* A single job exchanged between the main thread and the VM I/O threads. */
496 typedef struct iojob {
497 int type; /* Request type, REDIS_IOJOB_* */
498 redisDb *db;/* Redis database */
499 robj *key; /* This I/O request is about swapping this key */
500 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
501 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
502 off_t page; /* Swap page where to read/write the object */
503 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
504 int canceled; /* True if this command was canceled by blocking side of VM */
505 pthread_t thread; /* ID of the thread processing this entry */
506 } iojob;
507
508 /*================================ Prototypes =============================== */
509
/* Forward declarations of file-local (static) helper functions: object
 * management, persistence, replication, expires, and the vm* routines. */
510 static void freeStringObject(robj *o);
511 static void freeListObject(robj *o);
512 static void freeSetObject(robj *o);
513 static void decrRefCount(void *o);
514 static robj *createObject(int type, void *ptr);
515 static void freeClient(redisClient *c);
516 static int rdbLoad(char *filename);
517 static void addReply(redisClient *c, robj *obj);
518 static void addReplySds(redisClient *c, sds s);
519 static void incrRefCount(robj *o);
520 static int rdbSaveBackground(char *filename);
521 static robj *createStringObject(char *ptr, size_t len);
522 static robj *dupStringObject(robj *o);
523 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
524 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
525 static int syncWithMaster(void);
526 static robj *tryObjectSharing(robj *o);
527 static int tryObjectEncoding(robj *o);
528 static robj *getDecodedObject(robj *o);
529 static int removeExpire(redisDb *db, robj *key);
530 static int expireIfNeeded(redisDb *db, robj *key);
531 static int deleteIfVolatile(redisDb *db, robj *key);
532 static int deleteIfSwapped(redisDb *db, robj *key);
533 static int deleteKey(redisDb *db, robj *key);
534 static time_t getExpire(redisDb *db, robj *key);
535 static int setExpire(redisDb *db, robj *key, time_t when);
536 static void updateSlavesWaitingBgsave(int bgsaveerr);
537 static void freeMemoryIfNeeded(void);
538 static int processCommand(redisClient *c);
539 static void setupSigSegvAction(void);
540 static void rdbRemoveTempFile(pid_t childpid);
541 static void aofRemoveTempFile(pid_t childpid);
542 static size_t stringObjectLen(robj *o);
543 static void processInputBuffer(redisClient *c);
544 static zskiplist *zslCreate(void);
545 static void zslFree(zskiplist *zsl);
546 static void zslInsert(zskiplist *zsl, double score, robj *obj);
547 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
548 static void initClientMultiState(redisClient *c);
549 static void freeClientMultiState(redisClient *c);
550 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
551 static void unblockClientWaitingData(redisClient *c);
552 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
553 static void vmInit(void);
554 static void vmMarkPagesFree(off_t page, off_t count);
555 static robj *vmLoadObject(robj *key);
556 static robj *vmPreviewObject(robj *key);
557 static int vmSwapOneObjectBlocking(void);
558 static int vmSwapOneObjectThreaded(void);
559 static int vmCanSwapOut(void);
560 static int tryFreeOneObjectFromFreelist(void);
561 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
562 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
563 static void vmCancelThreadedIOJob(robj *o);
564 static void lockThreadedIO(void);
565 static void unlockThreadedIO(void);
566 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
567 static void freeIOJob(iojob *j);
568 static void queueIOJob(iojob *j);
569 static int vmWriteObjectOnSwap(robj *o, off_t page);
570 static robj *vmReadObjectFromSwap(off_t page, int type);
571 static void waitEmptyIOJobsQueue(void);
572 static void vmReopenSwapFile(void);
573 static int vmFreePage(off_t page);
574 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
575 static int dontWaitForSwappedKey(redisClient *c, robj *key);
576 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
577 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
578 static struct redisCommand *lookupCommand(char *name);
579 static void call(redisClient *c, struct redisCommand *cmd);
580 static void resetClient(redisClient *c);
581
/* Command implementations. Every function below has the redisCommandProc
 * signature (a single redisClient pointer) and is referenced from cmdTable.
 * NOTE(review): rpoplpushcommand is spelled with a lower-case 'c', unlike
 * every other *Command name here. */
582 static void authCommand(redisClient *c);
583 static void pingCommand(redisClient *c);
584 static void echoCommand(redisClient *c);
585 static void setCommand(redisClient *c);
586 static void setnxCommand(redisClient *c);
587 static void getCommand(redisClient *c);
588 static void delCommand(redisClient *c);
589 static void existsCommand(redisClient *c);
590 static void incrCommand(redisClient *c);
591 static void decrCommand(redisClient *c);
592 static void incrbyCommand(redisClient *c);
593 static void decrbyCommand(redisClient *c);
594 static void selectCommand(redisClient *c);
595 static void randomkeyCommand(redisClient *c);
596 static void keysCommand(redisClient *c);
597 static void dbsizeCommand(redisClient *c);
598 static void lastsaveCommand(redisClient *c);
599 static void saveCommand(redisClient *c);
600 static void bgsaveCommand(redisClient *c);
601 static void bgrewriteaofCommand(redisClient *c);
602 static void shutdownCommand(redisClient *c);
603 static void moveCommand(redisClient *c);
604 static void renameCommand(redisClient *c);
605 static void renamenxCommand(redisClient *c);
606 static void lpushCommand(redisClient *c);
607 static void rpushCommand(redisClient *c);
608 static void lpopCommand(redisClient *c);
609 static void rpopCommand(redisClient *c);
610 static void llenCommand(redisClient *c);
611 static void lindexCommand(redisClient *c);
612 static void lrangeCommand(redisClient *c);
613 static void ltrimCommand(redisClient *c);
614 static void typeCommand(redisClient *c);
615 static void lsetCommand(redisClient *c);
616 static void saddCommand(redisClient *c);
617 static void sremCommand(redisClient *c);
618 static void smoveCommand(redisClient *c);
619 static void sismemberCommand(redisClient *c);
620 static void scardCommand(redisClient *c);
621 static void spopCommand(redisClient *c);
622 static void srandmemberCommand(redisClient *c);
623 static void sinterCommand(redisClient *c);
624 static void sinterstoreCommand(redisClient *c);
625 static void sunionCommand(redisClient *c);
626 static void sunionstoreCommand(redisClient *c);
627 static void sdiffCommand(redisClient *c);
628 static void sdiffstoreCommand(redisClient *c);
629 static void syncCommand(redisClient *c);
630 static void flushdbCommand(redisClient *c);
631 static void flushallCommand(redisClient *c);
632 static void sortCommand(redisClient *c);
633 static void lremCommand(redisClient *c);
634 static void rpoplpushcommand(redisClient *c);
635 static void infoCommand(redisClient *c);
636 static void mgetCommand(redisClient *c);
637 static void monitorCommand(redisClient *c);
638 static void expireCommand(redisClient *c);
639 static void expireatCommand(redisClient *c);
640 static void getsetCommand(redisClient *c);
641 static void ttlCommand(redisClient *c);
642 static void slaveofCommand(redisClient *c);
643 static void debugCommand(redisClient *c);
644 static void msetCommand(redisClient *c);
645 static void msetnxCommand(redisClient *c);
646 static void zaddCommand(redisClient *c);
647 static void zincrbyCommand(redisClient *c);
648 static void zrangeCommand(redisClient *c);
649 static void zrangebyscoreCommand(redisClient *c);
650 static void zcountCommand(redisClient *c);
651 static void zrevrangeCommand(redisClient *c);
652 static void zcardCommand(redisClient *c);
653 static void zremCommand(redisClient *c);
654 static void zscoreCommand(redisClient *c);
655 static void zremrangebyscoreCommand(redisClient *c);
656 static void multiCommand(redisClient *c);
657 static void execCommand(redisClient *c);
658 static void discardCommand(redisClient *c);
659 static void blpopCommand(redisClient *c);
660 static void brpopCommand(redisClient *c);
661 static void appendCommand(redisClient *c);
662 static void zrankCommand(redisClient *c);
663
664 /*================================= Globals ================================= */
665
666 /* Global vars */
667 static struct redisServer server; /* server global state */
/* The command table. Each entry is a positional initializer of
 * struct redisCommand, i.e. the columns are:
 *
 *   name, proc, arity, flags, vm_firstkey, vm_lastkey, vm_keystep
 *
 * flags is a bitwise OR of the REDIS_CMD_* values. The last entry has a
 * NULL name and proc -- presumably the terminator looked for by
 * lookupCommand(), confirm.
 *
 * NOTE(review): "smembers" is served by sinterCommand. Some commands that
 * take a key argument (e.g. "set", "setnx", "del", "expire") declare no VM
 * keys (0,0,0) -- confirm this is intentional. */
668 static struct redisCommand cmdTable[] = {
669 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
670 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
671 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
672 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
673 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
674 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
675 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
676 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
677 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
678 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
679 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
680 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
681 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
682 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
683 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
684 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
685 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
686 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
687 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
688 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
689 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
690 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
691 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
692 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
693 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
694 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
695 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
696 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
697 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
698 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
699 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
700 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
701 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
702 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
703 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
704 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
705 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
706 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
707 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
708 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
709 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
710 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
711 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
712 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
713 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
714 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
715 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
716 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
717 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
718 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
719 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
720 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
721 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
722 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
723 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
724 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
725 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
726 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
727 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
728 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
729 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
730 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
731 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
732 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
733 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
734 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
735 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
736 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
737 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
738 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
739 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
740 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
741 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
742 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
743 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
744 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
745 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
746 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
747 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
748 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
749 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
750 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
751 {NULL,NULL,0,0,0,0,0}
752 };
753
754 /*============================ Utility functions ============================ */
755
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Both lengths must be non negative; the
 * buffers do not need to be NUL terminated because only bytes inside the
 * given lengths are examined.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading ^ negates the set, x-y is a
 *           range and \x inside the brackets is a literal x (the escaped
 *           byte is always compared case sensitively)
 *   \x      a literal x outside of brackets
 *
 * When 'nocase' is non zero literal bytes and ranges are compared after
 * tolower(). Returns 1 if the whole string matches, 0 otherwise. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, never looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star: match whatever is left */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one byte of input. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated bracket: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Escape: compare the next pattern byte literally. A trailing
             * backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* a literal needs one byte of input */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Input exhausted: only trailing stars may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
878
879 static void redisLog(int level, const char *fmt, ...) {
880 va_list ap;
881 FILE *fp;
882
883 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
884 if (!fp) return;
885
886 va_start(ap, fmt);
887 if (level >= server.verbosity) {
888 char *c = ".-*#";
889 char buf[64];
890 time_t now;
891
892 now = time(NULL);
893 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
894 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
895 vfprintf(fp, fmt, ap);
896 fprintf(fp,"\n");
897 fflush(fp);
898 }
899 va_end(ap);
900
901 if (server.logfile) fclose(fp);
902 }
903
904 /*====================== Hash table type implementation ==================== */
905
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
909
/* Dictionary destructor callback that simply releases 'val' with zfree().
 * The private data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
915
916 static void dictListDestructor(void *privdata, void *val)
917 {
918 DICT_NOTUSED(privdata);
919 listRelease((list*)val);
920 }
921
922 static int sdsDictKeyCompare(void *privdata, const void *key1,
923 const void *key2)
924 {
925 int l1,l2;
926 DICT_NOTUSED(privdata);
927
928 l1 = sdslen((sds)key1);
929 l2 = sdslen((sds)key2);
930 if (l1 != l2) return 0;
931 return memcmp(key1, key2, l1) == 0;
932 }
933
934 static void dictRedisObjectDestructor(void *privdata, void *val)
935 {
936 DICT_NOTUSED(privdata);
937
938 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
939 decrRefCount(val);
940 }
941
942 static int dictObjKeyCompare(void *privdata, const void *key1,
943 const void *key2)
944 {
945 const robj *o1 = key1, *o2 = key2;
946 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
947 }
948
949 static unsigned int dictObjHash(const void *key) {
950 const robj *o = key;
951 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
952 }
953
954 static int dictEncObjKeyCompare(void *privdata, const void *key1,
955 const void *key2)
956 {
957 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
958 int cmp;
959
960 o1 = getDecodedObject(o1);
961 o2 = getDecodedObject(o2);
962 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
963 decrRefCount(o1);
964 decrRefCount(o2);
965 return cmp;
966 }
967
968 static unsigned int dictEncObjHash(const void *key) {
969 robj *o = (robj*) key;
970
971 if (o->encoding == REDIS_ENCODING_RAW) {
972 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
973 } else {
974 if (o->encoding == REDIS_ENCODING_INT) {
975 char buf[32];
976 int len;
977
978 len = snprintf(buf,32,"%ld",(long)o->ptr);
979 return dictGenHashFunction((unsigned char*)buf, len);
980 } else {
981 unsigned int hash;
982
983 o = getDecodedObject(o);
984 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
985 decrRefCount(o);
986 return hash;
987 }
988 }
989 }
990
/* Sets type and expires.
 * Keys are redis objects that are hashed and compared through the
 * dictEncObj* callbacks (which cope with encoded objects) and released with
 * dictRedisObjectDestructor. No value callbacks are installed. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1000
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Same key handling as the sets type; values are released with
 * dictVanillaFree, i.e. a plain zfree() of the stored pointer. */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1010
/* Db->dict.
 * Keys are redis objects whose ->ptr is used directly as an SDS string by
 * dictObjHash / dictObjKeyCompare. Both keys and values are redis objects
 * released with dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
1020
/* Db->expires.
 * Keys are handled exactly like in the Db->dict type; no value destructor
 * is installed, so the stored value is never released by the dictionary. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1030
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * The value lists are released with dictListDestructor (listRelease). */
static dictType keylistDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictListDestructor         /* val destructor */
};
1042
1043 /* ========================= Random utility functions ======================= */
1044
1045 /* Redis generally does not try to recover from out of memory conditions
1046 * when allocating objects or strings, it is not clear if it will be possible
1047 * to report this condition to the client since the networking layer itself
1048 * is based on heap allocation for send buffers, so we simply abort.
1049 * At least the code will be simpler to read... */
1050 static void oom(const char *msg) {
1051 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1052 sleep(1);
1053 abort();
1054 }
1055
1056 /* ====================== Redis server networking stuff ===================== */
1057 static void closeTimedoutClients(void) {
1058 redisClient *c;
1059 listNode *ln;
1060 time_t now = time(NULL);
1061 listIter li;
1062
1063 listRewind(server.clients,&li);
1064 while ((ln = listNext(&li)) != NULL) {
1065 c = listNodeValue(ln);
1066 if (server.maxidletime &&
1067 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1068 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1069 (now - c->lastinteraction > server.maxidletime))
1070 {
1071 redisLog(REDIS_VERBOSE,"Closing idle client");
1072 freeClient(c);
1073 } else if (c->flags & REDIS_BLOCKED) {
1074 if (c->blockingto != 0 && c->blockingto < now) {
1075 addReply(c,shared.nullmultibulk);
1076 unblockClientWaitingData(c);
1077 }
1078 }
1079 }
1080 }
1081
1082 static int htNeedsResize(dict *dict) {
1083 long long size, used;
1084
1085 size = dictSlots(dict);
1086 used = dictSize(dict);
1087 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1088 (used*100/size < REDIS_HT_MINFILL));
1089 }
1090
1091 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1092 * we resize the hash table to save memory */
1093 static void tryResizeHashTables(void) {
1094 int j;
1095
1096 for (j = 0; j < server.dbnum; j++) {
1097 if (htNeedsResize(server.db[j].dict)) {
1098 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1099 dictResize(server.db[j].dict);
1100 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1101 }
1102 if (htNeedsResize(server.db[j].expires))
1103 dictResize(server.db[j].expires);
1104 }
1105 }
1106
1107 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1108 void backgroundSaveDoneHandler(int statloc) {
1109 int exitcode = WEXITSTATUS(statloc);
1110 int bysignal = WIFSIGNALED(statloc);
1111
1112 if (!bysignal && exitcode == 0) {
1113 redisLog(REDIS_NOTICE,
1114 "Background saving terminated with success");
1115 server.dirty = 0;
1116 server.lastsave = time(NULL);
1117 } else if (!bysignal && exitcode != 0) {
1118 redisLog(REDIS_WARNING, "Background saving error");
1119 } else {
1120 redisLog(REDIS_WARNING,
1121 "Background saving terminated by signal");
1122 rdbRemoveTempFile(server.bgsavechildpid);
1123 }
1124 server.bgsavechildpid = -1;
1125 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1126 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1127 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1128 }
1129
1130 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1131 * Handle this. */
1132 void backgroundRewriteDoneHandler(int statloc) {
1133 int exitcode = WEXITSTATUS(statloc);
1134 int bysignal = WIFSIGNALED(statloc);
1135
1136 if (!bysignal && exitcode == 0) {
1137 int fd;
1138 char tmpfile[256];
1139
1140 redisLog(REDIS_NOTICE,
1141 "Background append only file rewriting terminated with success");
1142 /* Now it's time to flush the differences accumulated by the parent */
1143 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1144 fd = open(tmpfile,O_WRONLY|O_APPEND);
1145 if (fd == -1) {
1146 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1147 goto cleanup;
1148 }
1149 /* Flush our data... */
1150 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1151 (signed) sdslen(server.bgrewritebuf)) {
1152 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1153 close(fd);
1154 goto cleanup;
1155 }
1156 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1157 /* Now our work is to rename the temp file into the stable file. And
1158 * switch the file descriptor used by the server for append only. */
1159 if (rename(tmpfile,server.appendfilename) == -1) {
1160 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1161 close(fd);
1162 goto cleanup;
1163 }
1164 /* Mission completed... almost */
1165 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1166 if (server.appendfd != -1) {
1167 /* If append only is actually enabled... */
1168 close(server.appendfd);
1169 server.appendfd = fd;
1170 fsync(fd);
1171 server.appendseldb = -1; /* Make sure it will issue SELECT */
1172 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1173 } else {
1174 /* If append only is disabled we just generate a dump in this
1175 * format. Why not? */
1176 close(fd);
1177 }
1178 } else if (!bysignal && exitcode != 0) {
1179 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1180 } else {
1181 redisLog(REDIS_WARNING,
1182 "Background append only file rewriting terminated by signal");
1183 }
1184 cleanup:
1185 sdsfree(server.bgrewritebuf);
1186 server.bgrewritebuf = sdsempty();
1187 aofRemoveTempFile(server.bgrewritechildpid);
1188 server.bgrewritechildpid = -1;
1189 }
1190
1191 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1192 int j, loops = server.cronloops++;
1193 REDIS_NOTUSED(eventLoop);
1194 REDIS_NOTUSED(id);
1195 REDIS_NOTUSED(clientData);
1196
1197 /* We take a cached value of the unix time in the global state because
1198 * with virtual memory and aging there is to store the current time
1199 * in objects at every object access, and accuracy is not needed.
1200 * To access a global var is faster than calling time(NULL) */
1201 server.unixtime = time(NULL);
1202
1203 /* Show some info about non-empty databases */
1204 for (j = 0; j < server.dbnum; j++) {
1205 long long size, used, vkeys;
1206
1207 size = dictSlots(server.db[j].dict);
1208 used = dictSize(server.db[j].dict);
1209 vkeys = dictSize(server.db[j].expires);
1210 if (!(loops % 5) && (used || vkeys)) {
1211 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1212 /* dictPrintStats(server.dict); */
1213 }
1214 }
1215
1216 /* We don't want to resize the hash tables while a bacground saving
1217 * is in progress: the saving child is created using fork() that is
1218 * implemented with a copy-on-write semantic in most modern systems, so
1219 * if we resize the HT while there is the saving child at work actually
1220 * a lot of memory movements in the parent will cause a lot of pages
1221 * copied. */
1222 if (server.bgsavechildpid == -1) tryResizeHashTables();
1223
1224 /* Show information about connected clients */
1225 if (!(loops % 5)) {
1226 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1227 listLength(server.clients)-listLength(server.slaves),
1228 listLength(server.slaves),
1229 zmalloc_used_memory(),
1230 dictSize(server.sharingpool));
1231 }
1232
1233 /* Close connections of timedout clients */
1234 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1235 closeTimedoutClients();
1236
1237 /* Check if a background saving or AOF rewrite in progress terminated */
1238 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1239 int statloc;
1240 pid_t pid;
1241
1242 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1243 if (pid == server.bgsavechildpid) {
1244 backgroundSaveDoneHandler(statloc);
1245 } else {
1246 backgroundRewriteDoneHandler(statloc);
1247 }
1248 }
1249 } else {
1250 /* If there is not a background saving in progress check if
1251 * we have to save now */
1252 time_t now = time(NULL);
1253 for (j = 0; j < server.saveparamslen; j++) {
1254 struct saveparam *sp = server.saveparams+j;
1255
1256 if (server.dirty >= sp->changes &&
1257 now-server.lastsave > sp->seconds) {
1258 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1259 sp->changes, sp->seconds);
1260 rdbSaveBackground(server.dbfilename);
1261 break;
1262 }
1263 }
1264 }
1265
1266 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1267 * will use few CPU cycles if there are few expiring keys, otherwise
1268 * it will get more aggressive to avoid that too much memory is used by
1269 * keys that can be removed from the keyspace. */
1270 for (j = 0; j < server.dbnum; j++) {
1271 int expired;
1272 redisDb *db = server.db+j;
1273
1274 /* Continue to expire if at the end of the cycle more than 25%
1275 * of the keys were expired. */
1276 do {
1277 long num = dictSize(db->expires);
1278 time_t now = time(NULL);
1279
1280 expired = 0;
1281 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1282 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1283 while (num--) {
1284 dictEntry *de;
1285 time_t t;
1286
1287 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1288 t = (time_t) dictGetEntryVal(de);
1289 if (now > t) {
1290 deleteKey(db,dictGetEntryKey(de));
1291 expired++;
1292 }
1293 }
1294 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1295 }
1296
1297 /* Swap a few keys on disk if we are over the memory limit and VM
1298 * is enbled. Try to free objects from the free list first. */
1299 if (vmCanSwapOut()) {
1300 while (server.vm_enabled && zmalloc_used_memory() >
1301 server.vm_max_memory)
1302 {
1303 int retval;
1304
1305 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1306 retval = (server.vm_max_threads == 0) ?
1307 vmSwapOneObjectBlocking() :
1308 vmSwapOneObjectThreaded();
1309 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1310 zmalloc_used_memory() >
1311 (server.vm_max_memory+server.vm_max_memory/10))
1312 {
1313 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1314 }
1315 /* Note that when using threade I/O we free just one object,
1316 * because anyway when the I/O thread in charge to swap this
1317 * object out will finish, the handler of completed jobs
1318 * will try to swap more objects if we are still out of memory. */
1319 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1320 }
1321 }
1322
1323 /* Check if we should connect to a MASTER */
1324 if (server.replstate == REDIS_REPL_CONNECT) {
1325 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1326 if (syncWithMaster() == REDIS_OK) {
1327 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1328 }
1329 }
1330 return 1000;
1331 }
1332
1333 /* This function gets called every time Redis is entering the
1334 * main loop of the event driven library, that is, before to sleep
1335 * for ready file descriptors. */
1336 static void beforeSleep(struct aeEventLoop *eventLoop) {
1337 REDIS_NOTUSED(eventLoop);
1338
1339 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1340 listIter li;
1341 listNode *ln;
1342
1343 listRewind(server.io_ready_clients,&li);
1344 while((ln = listNext(&li))) {
1345 redisClient *c = ln->value;
1346 struct redisCommand *cmd;
1347
1348 /* Resume the client. */
1349 listDelNode(server.io_ready_clients,ln);
1350 c->flags &= (~REDIS_IO_WAIT);
1351 server.vm_blocked_clients--;
1352 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1353 readQueryFromClient, c);
1354 cmd = lookupCommand(c->argv[0]->ptr);
1355 assert(cmd != NULL);
1356 call(c,cmd);
1357 resetClient(c);
1358 /* There may be more data to process in the input buffer. */
1359 if (c->querybuf && sdslen(c->querybuf) > 0)
1360 processInputBuffer(c);
1361 }
1362 }
1363 }
1364
1365 static void createSharedObjects(void) {
1366 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1367 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1368 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1369 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1370 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1371 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1372 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1373 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1374 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1375 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1376 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1377 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1378 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1379 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1380 "-ERR no such key\r\n"));
1381 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1382 "-ERR syntax error\r\n"));
1383 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1384 "-ERR source and destination objects are the same\r\n"));
1385 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1386 "-ERR index out of range\r\n"));
1387 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1388 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1389 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1390 shared.select0 = createStringObject("select 0\r\n",10);
1391 shared.select1 = createStringObject("select 1\r\n",10);
1392 shared.select2 = createStringObject("select 2\r\n",10);
1393 shared.select3 = createStringObject("select 3\r\n",10);
1394 shared.select4 = createStringObject("select 4\r\n",10);
1395 shared.select5 = createStringObject("select 5\r\n",10);
1396 shared.select6 = createStringObject("select 6\r\n",10);
1397 shared.select7 = createStringObject("select 7\r\n",10);
1398 shared.select8 = createStringObject("select 8\r\n",10);
1399 shared.select9 = createStringObject("select 9\r\n",10);
1400 }
1401
1402 static void appendServerSaveParams(time_t seconds, int changes) {
1403 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1404 server.saveparams[server.saveparamslen].seconds = seconds;
1405 server.saveparams[server.saveparamslen].changes = changes;
1406 server.saveparamslen++;
1407 }
1408
1409 static void resetServerSaveParams() {
1410 zfree(server.saveparams);
1411 server.saveparams = NULL;
1412 server.saveparamslen = 0;
1413 }
1414
1415 static void initServerConfig() {
1416 server.dbnum = REDIS_DEFAULT_DBNUM;
1417 server.port = REDIS_SERVERPORT;
1418 server.verbosity = REDIS_VERBOSE;
1419 server.maxidletime = REDIS_MAXIDLETIME;
1420 server.saveparams = NULL;
1421 server.logfile = NULL; /* NULL = log on standard output */
1422 server.bindaddr = NULL;
1423 server.glueoutputbuf = 1;
1424 server.daemonize = 0;
1425 server.appendonly = 0;
1426 server.appendfsync = APPENDFSYNC_ALWAYS;
1427 server.lastfsync = time(NULL);
1428 server.appendfd = -1;
1429 server.appendseldb = -1; /* Make sure the first time will not match */
1430 server.pidfile = "/var/run/redis.pid";
1431 server.dbfilename = "dump.rdb";
1432 server.appendfilename = "appendonly.aof";
1433 server.requirepass = NULL;
1434 server.shareobjects = 0;
1435 server.rdbcompression = 1;
1436 server.sharingpoolsize = 1024;
1437 server.maxclients = 0;
1438 server.blpop_blocked_clients = 0;
1439 server.maxmemory = 0;
1440 server.vm_enabled = 0;
1441 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1442 server.vm_page_size = 256; /* 256 bytes per page */
1443 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1444 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1445 server.vm_max_threads = 4;
1446 server.vm_blocked_clients = 0;
1447
1448 resetServerSaveParams();
1449
1450 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1451 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1452 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1453 /* Replication related */
1454 server.isslave = 0;
1455 server.masterauth = NULL;
1456 server.masterhost = NULL;
1457 server.masterport = 6379;
1458 server.master = NULL;
1459 server.replstate = REDIS_REPL_NONE;
1460
1461 /* Double constants initialization */
1462 R_Zero = 0.0;
1463 R_PosInf = 1.0/R_Zero;
1464 R_NegInf = -1.0/R_Zero;
1465 R_Nan = R_Zero/R_Zero;
1466 }
1467
1468 static void initServer() {
1469 int j;
1470
1471 signal(SIGHUP, SIG_IGN);
1472 signal(SIGPIPE, SIG_IGN);
1473 setupSigSegvAction();
1474
1475 server.devnull = fopen("/dev/null","w");
1476 if (server.devnull == NULL) {
1477 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1478 exit(1);
1479 }
1480 server.clients = listCreate();
1481 server.slaves = listCreate();
1482 server.monitors = listCreate();
1483 server.objfreelist = listCreate();
1484 createSharedObjects();
1485 server.el = aeCreateEventLoop();
1486 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1487 server.sharingpool = dictCreate(&setDictType,NULL);
1488 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1489 if (server.fd == -1) {
1490 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1491 exit(1);
1492 }
1493 for (j = 0; j < server.dbnum; j++) {
1494 server.db[j].dict = dictCreate(&hashDictType,NULL);
1495 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1496 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1497 if (server.vm_enabled)
1498 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1499 server.db[j].id = j;
1500 }
1501 server.cronloops = 0;
1502 server.bgsavechildpid = -1;
1503 server.bgrewritechildpid = -1;
1504 server.bgrewritebuf = sdsempty();
1505 server.lastsave = time(NULL);
1506 server.dirty = 0;
1507 server.stat_numcommands = 0;
1508 server.stat_numconnections = 0;
1509 server.stat_starttime = time(NULL);
1510 server.unixtime = time(NULL);
1511 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1512 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1513 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1514
1515 if (server.appendonly) {
1516 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1517 if (server.appendfd == -1) {
1518 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1519 strerror(errno));
1520 exit(1);
1521 }
1522 }
1523
1524 if (server.vm_enabled) vmInit();
1525 }
1526
1527 /* Empty the whole database */
1528 static long long emptyDb() {
1529 int j;
1530 long long removed = 0;
1531
1532 for (j = 0; j < server.dbnum; j++) {
1533 removed += dictSize(server.db[j].dict);
1534 dictEmpty(server.db[j].dict);
1535 dictEmpty(server.db[j].expires);
1536 }
1537 return removed;
1538 }
1539
/* Convert a "yes" / "no" string (compared case insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1545
1546 /* I agree, this is a very rudimental way to load a configuration...
1547 will improve later if the config gets more complex */
1548 static void loadServerConfig(char *filename) {
1549 FILE *fp;
1550 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1551 int linenum = 0;
1552 sds line = NULL;
1553
1554 if (filename[0] == '-' && filename[1] == '\0')
1555 fp = stdin;
1556 else {
1557 if ((fp = fopen(filename,"r")) == NULL) {
1558 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1559 exit(1);
1560 }
1561 }
1562
1563 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1564 sds *argv;
1565 int argc, j;
1566
1567 linenum++;
1568 line = sdsnew(buf);
1569 line = sdstrim(line," \t\r\n");
1570
1571 /* Skip comments and blank lines*/
1572 if (line[0] == '#' || line[0] == '\0') {
1573 sdsfree(line);
1574 continue;
1575 }
1576
1577 /* Split into arguments */
1578 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1579 sdstolower(argv[0]);
1580
1581 /* Execute config directives */
1582 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1583 server.maxidletime = atoi(argv[1]);
1584 if (server.maxidletime < 0) {
1585 err = "Invalid timeout value"; goto loaderr;
1586 }
1587 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1588 server.port = atoi(argv[1]);
1589 if (server.port < 1 || server.port > 65535) {
1590 err = "Invalid port"; goto loaderr;
1591 }
1592 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1593 server.bindaddr = zstrdup(argv[1]);
1594 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1595 int seconds = atoi(argv[1]);
1596 int changes = atoi(argv[2]);
1597 if (seconds < 1 || changes < 0) {
1598 err = "Invalid save parameters"; goto loaderr;
1599 }
1600 appendServerSaveParams(seconds,changes);
1601 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1602 if (chdir(argv[1]) == -1) {
1603 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1604 argv[1], strerror(errno));
1605 exit(1);
1606 }
1607 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1608 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1609 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1610 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1611 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1612 else {
1613 err = "Invalid log level. Must be one of debug, notice, warning";
1614 goto loaderr;
1615 }
1616 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1617 FILE *logfp;
1618
1619 server.logfile = zstrdup(argv[1]);
1620 if (!strcasecmp(server.logfile,"stdout")) {
1621 zfree(server.logfile);
1622 server.logfile = NULL;
1623 }
1624 if (server.logfile) {
1625 /* Test if we are able to open the file. The server will not
1626 * be able to abort just for this problem later... */
1627 logfp = fopen(server.logfile,"a");
1628 if (logfp == NULL) {
1629 err = sdscatprintf(sdsempty(),
1630 "Can't open the log file: %s", strerror(errno));
1631 goto loaderr;
1632 }
1633 fclose(logfp);
1634 }
1635 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1636 server.dbnum = atoi(argv[1]);
1637 if (server.dbnum < 1) {
1638 err = "Invalid number of databases"; goto loaderr;
1639 }
1640 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1641 server.maxclients = atoi(argv[1]);
1642 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1643 server.maxmemory = strtoll(argv[1], NULL, 10);
1644 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1645 server.masterhost = sdsnew(argv[1]);
1646 server.masterport = atoi(argv[2]);
1647 server.replstate = REDIS_REPL_CONNECT;
1648 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1649 server.masterauth = zstrdup(argv[1]);
1650 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1651 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1652 err = "argument must be 'yes' or 'no'"; goto loaderr;
1653 }
1654 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1655 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1656 err = "argument must be 'yes' or 'no'"; goto loaderr;
1657 }
1658 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1659 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1660 err = "argument must be 'yes' or 'no'"; goto loaderr;
1661 }
1662 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1663 server.sharingpoolsize = atoi(argv[1]);
1664 if (server.sharingpoolsize < 1) {
1665 err = "invalid object sharing pool size"; goto loaderr;
1666 }
1667 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1668 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1669 err = "argument must be 'yes' or 'no'"; goto loaderr;
1670 }
1671 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1672 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1673 err = "argument must be 'yes' or 'no'"; goto loaderr;
1674 }
1675 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1676 if (!strcasecmp(argv[1],"no")) {
1677 server.appendfsync = APPENDFSYNC_NO;
1678 } else if (!strcasecmp(argv[1],"always")) {
1679 server.appendfsync = APPENDFSYNC_ALWAYS;
1680 } else if (!strcasecmp(argv[1],"everysec")) {
1681 server.appendfsync = APPENDFSYNC_EVERYSEC;
1682 } else {
1683 err = "argument must be 'no', 'always' or 'everysec'";
1684 goto loaderr;
1685 }
1686 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1687 server.requirepass = zstrdup(argv[1]);
1688 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1689 server.pidfile = zstrdup(argv[1]);
1690 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1691 server.dbfilename = zstrdup(argv[1]);
1692 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1693 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1694 err = "argument must be 'yes' or 'no'"; goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1697 zfree(server.vm_swap_file);
1698 server.vm_swap_file = zstrdup(argv[1]);
1699 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1700 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1701 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1702 server.vm_page_size = strtoll(argv[1], NULL, 10);
1703 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1704 server.vm_pages = strtoll(argv[1], NULL, 10);
1705 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1706 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1707 } else {
1708 err = "Bad directive or wrong number of arguments"; goto loaderr;
1709 }
1710 for (j = 0; j < argc; j++)
1711 sdsfree(argv[j]);
1712 zfree(argv);
1713 sdsfree(line);
1714 }
1715 if (fp != stdin) fclose(fp);
1716 return;
1717
1718 loaderr:
1719 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1720 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1721 fprintf(stderr, ">>> '%s'\n", line);
1722 fprintf(stderr, "%s\n", err);
1723 exit(1);
1724 }
1725
1726 static void freeClientArgv(redisClient *c) {
1727 int j;
1728
1729 for (j = 0; j < c->argc; j++)
1730 decrRefCount(c->argv[j]);
1731 for (j = 0; j < c->mbargc; j++)
1732 decrRefCount(c->mbargv[j]);
1733 c->argc = 0;
1734 c->mbargc = 0;
1735 }
1736
1737 static void freeClient(redisClient *c) {
1738 listNode *ln;
1739
1740 /* Note that if the client we are freeing is blocked into a blocking
1741 * call, we have to set querybuf to NULL *before* to call
1742 * unblockClientWaitingData() to avoid processInputBuffer() will get
1743 * called. Also it is important to remove the file events after
1744 * this, because this call adds the READABLE event. */
1745 sdsfree(c->querybuf);
1746 c->querybuf = NULL;
1747 if (c->flags & REDIS_BLOCKED)
1748 unblockClientWaitingData(c);
1749
1750 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1751 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1752 listRelease(c->reply);
1753 freeClientArgv(c);
1754 close(c->fd);
1755 /* Remove from the list of clients */
1756 ln = listSearchKey(server.clients,c);
1757 redisAssert(ln != NULL);
1758 listDelNode(server.clients,ln);
1759 /* Remove from the list of clients waiting for swapped keys */
1760 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1761 ln = listSearchKey(server.io_ready_clients,c);
1762 if (ln) {
1763 listDelNode(server.io_ready_clients,ln);
1764 server.vm_blocked_clients--;
1765 }
1766 }
1767 while (server.vm_enabled && listLength(c->io_keys)) {
1768 ln = listFirst(c->io_keys);
1769 dontWaitForSwappedKey(c,ln->value);
1770 }
1771 listRelease(c->io_keys);
1772 /* Other cleanup */
1773 if (c->flags & REDIS_SLAVE) {
1774 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1775 close(c->repldbfd);
1776 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1777 ln = listSearchKey(l,c);
1778 redisAssert(ln != NULL);
1779 listDelNode(l,ln);
1780 }
1781 if (c->flags & REDIS_MASTER) {
1782 server.master = NULL;
1783 server.replstate = REDIS_REPL_CONNECT;
1784 }
1785 zfree(c->argv);
1786 zfree(c->mbargv);
1787 freeClientMultiState(c);
1788 zfree(c);
1789 }
1790
1791 #define GLUEREPLY_UP_TO (1024)
1792 static void glueReplyBuffersIfNeeded(redisClient *c) {
1793 int copylen = 0;
1794 char buf[GLUEREPLY_UP_TO];
1795 listNode *ln;
1796 listIter li;
1797 robj *o;
1798
1799 listRewind(c->reply,&li);
1800 while((ln = listNext(&li))) {
1801 int objlen;
1802
1803 o = ln->value;
1804 objlen = sdslen(o->ptr);
1805 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1806 memcpy(buf+copylen,o->ptr,objlen);
1807 copylen += objlen;
1808 listDelNode(c->reply,ln);
1809 } else {
1810 if (copylen == 0) return;
1811 break;
1812 }
1813 }
1814 /* Now the output buffer is empty, add the new single element */
1815 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1816 listAddNodeHead(c->reply,o);
1817 }
1818
1819 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1820 redisClient *c = privdata;
1821 int nwritten = 0, totwritten = 0, objlen;
1822 robj *o;
1823 REDIS_NOTUSED(el);
1824 REDIS_NOTUSED(mask);
1825
1826 /* Use writev() if we have enough buffers to send */
1827 if (!server.glueoutputbuf &&
1828 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1829 !(c->flags & REDIS_MASTER))
1830 {
1831 sendReplyToClientWritev(el, fd, privdata, mask);
1832 return;
1833 }
1834
1835 while(listLength(c->reply)) {
1836 if (server.glueoutputbuf && listLength(c->reply) > 1)
1837 glueReplyBuffersIfNeeded(c);
1838
1839 o = listNodeValue(listFirst(c->reply));
1840 objlen = sdslen(o->ptr);
1841
1842 if (objlen == 0) {
1843 listDelNode(c->reply,listFirst(c->reply));
1844 continue;
1845 }
1846
1847 if (c->flags & REDIS_MASTER) {
1848 /* Don't reply to a master */
1849 nwritten = objlen - c->sentlen;
1850 } else {
1851 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1852 if (nwritten <= 0) break;
1853 }
1854 c->sentlen += nwritten;
1855 totwritten += nwritten;
1856 /* If we fully sent the object on head go to the next one */
1857 if (c->sentlen == objlen) {
1858 listDelNode(c->reply,listFirst(c->reply));
1859 c->sentlen = 0;
1860 }
1861 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1862 * bytes, in a single threaded server it's a good idea to serve
1863 * other clients as well, even if a very large request comes from
1864 * super fast link that is always able to accept data (in real world
1865 * scenario think about 'KEYS *' against the loopback interfae) */
1866 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1867 }
1868 if (nwritten == -1) {
1869 if (errno == EAGAIN) {
1870 nwritten = 0;
1871 } else {
1872 redisLog(REDIS_VERBOSE,
1873 "Error writing to client: %s", strerror(errno));
1874 freeClient(c);
1875 return;
1876 }
1877 }
1878 if (totwritten > 0) c->lastinteraction = time(NULL);
1879 if (listLength(c->reply) == 0) {
1880 c->sentlen = 0;
1881 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1882 }
1883 }
1884
1885 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1886 {
1887 redisClient *c = privdata;
1888 int nwritten = 0, totwritten = 0, objlen, willwrite;
1889 robj *o;
1890 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1891 int offset, ion = 0;
1892 REDIS_NOTUSED(el);
1893 REDIS_NOTUSED(mask);
1894
1895 listNode *node;
1896 while (listLength(c->reply)) {
1897 offset = c->sentlen;
1898 ion = 0;
1899 willwrite = 0;
1900
1901 /* fill-in the iov[] array */
1902 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1903 o = listNodeValue(node);
1904 objlen = sdslen(o->ptr);
1905
1906 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1907 break;
1908
1909 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1910 break; /* no more iovecs */
1911
1912 iov[ion].iov_base = ((char*)o->ptr) + offset;
1913 iov[ion].iov_len = objlen - offset;
1914 willwrite += objlen - offset;
1915 offset = 0; /* just for the first item */
1916 ion++;
1917 }
1918
1919 if(willwrite == 0)
1920 break;
1921
1922 /* write all collected blocks at once */
1923 if((nwritten = writev(fd, iov, ion)) < 0) {
1924 if (errno != EAGAIN) {
1925 redisLog(REDIS_VERBOSE,
1926 "Error writing to client: %s", strerror(errno));
1927 freeClient(c);
1928 return;
1929 }
1930 break;
1931 }
1932
1933 totwritten += nwritten;
1934 offset = c->sentlen;
1935
1936 /* remove written robjs from c->reply */
1937 while (nwritten && listLength(c->reply)) {
1938 o = listNodeValue(listFirst(c->reply));
1939 objlen = sdslen(o->ptr);
1940
1941 if(nwritten >= objlen - offset) {
1942 listDelNode(c->reply, listFirst(c->reply));
1943 nwritten -= objlen - offset;
1944 c->sentlen = 0;
1945 } else {
1946 /* partial write */
1947 c->sentlen += nwritten;
1948 break;
1949 }
1950 offset = 0;
1951 }
1952 }
1953
1954 if (totwritten > 0)
1955 c->lastinteraction = time(NULL);
1956
1957 if (listLength(c->reply) == 0) {
1958 c->sentlen = 0;
1959 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1960 }
1961 }
1962
1963 static struct redisCommand *lookupCommand(char *name) {
1964 int j = 0;
1965 while(cmdTable[j].name != NULL) {
1966 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
1967 j++;
1968 }
1969 return NULL;
1970 }
1971
1972 /* resetClient prepare the client to process the next command */
1973 static void resetClient(redisClient *c) {
1974 freeClientArgv(c);
1975 c->bulklen = -1;
1976 c->multibulk = 0;
1977 }
1978
1979 /* Call() is the core of Redis execution of a command */
1980 static void call(redisClient *c, struct redisCommand *cmd) {
1981 long long dirty;
1982
1983 dirty = server.dirty;
1984 cmd->proc(c);
1985 if (server.appendonly && server.dirty-dirty)
1986 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
1987 if (server.dirty-dirty && listLength(server.slaves))
1988 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
1989 if (listLength(server.monitors))
1990 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
1991 server.stat_numcommands++;
1992 }
1993
1994 /* If this function gets called we already read a whole
1995 * command, argments are in the client argv/argc fields.
1996 * processCommand() execute the command or prepare the
1997 * server for a bulk read from the client.
1998 *
1999 * If 1 is returned the client is still alive and valid and
2000 * and other operations can be performed by the caller. Otherwise
2001 * if 0 is returned the client was destroied (i.e. after QUIT). */
2002 static int processCommand(redisClient *c) {
2003 struct redisCommand *cmd;
2004
2005 /* Free some memory if needed (maxmemory setting) */
2006 if (server.maxmemory) freeMemoryIfNeeded();
2007
2008 /* Handle the multi bulk command type. This is an alternative protocol
2009 * supported by Redis in order to receive commands that are composed of
2010 * multiple binary-safe "bulk" arguments. The latency of processing is
2011 * a bit higher but this allows things like multi-sets, so if this
2012 * protocol is used only for MSET and similar commands this is a big win. */
2013 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2014 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2015 if (c->multibulk <= 0) {
2016 resetClient(c);
2017 return 1;
2018 } else {
2019 decrRefCount(c->argv[c->argc-1]);
2020 c->argc--;
2021 return 1;
2022 }
2023 } else if (c->multibulk) {
2024 if (c->bulklen == -1) {
2025 if (((char*)c->argv[0]->ptr)[0] != '$') {
2026 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2027 resetClient(c);
2028 return 1;
2029 } else {
2030 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2031 decrRefCount(c->argv[0]);
2032 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2033 c->argc--;
2034 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2035 resetClient(c);
2036 return 1;
2037 }
2038 c->argc--;
2039 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2040 return 1;
2041 }
2042 } else {
2043 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2044 c->mbargv[c->mbargc] = c->argv[0];
2045 c->mbargc++;
2046 c->argc--;
2047 c->multibulk--;
2048 if (c->multibulk == 0) {
2049 robj **auxargv;
2050 int auxargc;
2051
2052 /* Here we need to swap the multi-bulk argc/argv with the
2053 * normal argc/argv of the client structure. */
2054 auxargv = c->argv;
2055 c->argv = c->mbargv;
2056 c->mbargv = auxargv;
2057
2058 auxargc = c->argc;
2059 c->argc = c->mbargc;
2060 c->mbargc = auxargc;
2061
2062 /* We need to set bulklen to something different than -1
2063 * in order for the code below to process the command without
2064 * to try to read the last argument of a bulk command as
2065 * a special argument. */
2066 c->bulklen = 0;
2067 /* continue below and process the command */
2068 } else {
2069 c->bulklen = -1;
2070 return 1;
2071 }
2072 }
2073 }
2074 /* -- end of multi bulk commands processing -- */
2075
2076 /* The QUIT command is handled as a special case. Normal command
2077 * procs are unable to close the client connection safely */
2078 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2079 freeClient(c);
2080 return 0;
2081 }
2082
2083 /* Now lookup the command and check ASAP about trivial error conditions
2084 * such wrong arity, bad command name and so forth. */
2085 cmd = lookupCommand(c->argv[0]->ptr);
2086 if (!cmd) {
2087 addReplySds(c,
2088 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2089 (char*)c->argv[0]->ptr));
2090 resetClient(c);
2091 return 1;
2092 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2093 (c->argc < -cmd->arity)) {
2094 addReplySds(c,
2095 sdscatprintf(sdsempty(),
2096 "-ERR wrong number of arguments for '%s' command\r\n",
2097 cmd->name));
2098 resetClient(c);
2099 return 1;
2100 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2101 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2102 resetClient(c);
2103 return 1;
2104 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2105 /* This is a bulk command, we have to read the last argument yet. */
2106 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2107
2108 decrRefCount(c->argv[c->argc-1]);
2109 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2110 c->argc--;
2111 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2112 resetClient(c);
2113 return 1;
2114 }
2115 c->argc--;
2116 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2117 /* It is possible that the bulk read is already in the
2118 * buffer. Check this condition and handle it accordingly.
2119 * This is just a fast path, alternative to call processInputBuffer().
2120 * It's a good idea since the code is small and this condition
2121 * happens most of the times. */
2122 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2123 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2124 c->argc++;
2125 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2126 } else {
2127 /* Otherwise return... there is to read the last argument
2128 * from the socket. */
2129 return 1;
2130 }
2131 }
2132 /* Let's try to share objects on the command arguments vector */
2133 if (server.shareobjects) {
2134 int j;
2135 for(j = 1; j < c->argc; j++)
2136 c->argv[j] = tryObjectSharing(c->argv[j]);
2137 }
2138 /* Let's try to encode the bulk object to save space. */
2139 if (cmd->flags & REDIS_CMD_BULK)
2140 tryObjectEncoding(c->argv[c->argc-1]);
2141
2142 /* Check if the user is authenticated */
2143 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2144 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2145 resetClient(c);
2146 return 1;
2147 }
2148
2149 /* Exec the command */
2150 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2151 queueMultiCommand(c,cmd);
2152 addReply(c,shared.queued);
2153 } else {
2154 if (server.vm_enabled && server.vm_max_threads > 0 &&
2155 blockClientOnSwappedKeys(cmd,c)) return 1;
2156 call(c,cmd);
2157 }
2158
2159 /* Prepare the client for the next command */
2160 resetClient(c);
2161 return 1;
2162 }
2163
2164 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2165 listNode *ln;
2166 listIter li;
2167 int outc = 0, j;
2168 robj **outv;
2169 /* (args*2)+1 is enough room for args, spaces, newlines */
2170 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2171
2172 if (argc <= REDIS_STATIC_ARGS) {
2173 outv = static_outv;
2174 } else {
2175 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2176 }
2177
2178 for (j = 0; j < argc; j++) {
2179 if (j != 0) outv[outc++] = shared.space;
2180 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2181 robj *lenobj;
2182
2183 lenobj = createObject(REDIS_STRING,
2184 sdscatprintf(sdsempty(),"%lu\r\n",
2185 (unsigned long) stringObjectLen(argv[j])));
2186 lenobj->refcount = 0;
2187 outv[outc++] = lenobj;
2188 }
2189 outv[outc++] = argv[j];
2190 }
2191 outv[outc++] = shared.crlf;
2192
2193 /* Increment all the refcounts at start and decrement at end in order to
2194 * be sure to free objects if there is no slave in a replication state
2195 * able to be feed with commands */
2196 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2197 listRewind(slaves,&li);
2198 while((ln = listNext(&li))) {
2199 redisClient *slave = ln->value;
2200
2201 /* Don't feed slaves that are still waiting for BGSAVE to start */
2202 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2203
2204 /* Feed all the other slaves, MONITORs and so on */
2205 if (slave->slaveseldb != dictid) {
2206 robj *selectcmd;
2207
2208 switch(dictid) {
2209 case 0: selectcmd = shared.select0; break;
2210 case 1: selectcmd = shared.select1; break;
2211 case 2: selectcmd = shared.select2; break;
2212 case 3: selectcmd = shared.select3; break;
2213 case 4: selectcmd = shared.select4; break;
2214 case 5: selectcmd = shared.select5; break;
2215 case 6: selectcmd = shared.select6; break;
2216 case 7: selectcmd = shared.select7; break;
2217 case 8: selectcmd = shared.select8; break;
2218 case 9: selectcmd = shared.select9; break;
2219 default:
2220 selectcmd = createObject(REDIS_STRING,
2221 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2222 selectcmd->refcount = 0;
2223 break;
2224 }
2225 addReply(slave,selectcmd);
2226 slave->slaveseldb = dictid;
2227 }
2228 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2229 }
2230 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2231 if (outv != static_outv) zfree(outv);
2232 }
2233
2234 static void processInputBuffer(redisClient *c) {
2235 again:
2236 /* Before to process the input buffer, make sure the client is not
2237 * waitig for a blocking operation such as BLPOP. Note that the first
2238 * iteration the client is never blocked, otherwise the processInputBuffer
2239 * would not be called at all, but after the execution of the first commands
2240 * in the input buffer the client may be blocked, and the "goto again"
2241 * will try to reiterate. The following line will make it return asap. */
2242 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2243 if (c->bulklen == -1) {
2244 /* Read the first line of the query */
2245 char *p = strchr(c->querybuf,'\n');
2246 size_t querylen;
2247
2248 if (p) {
2249 sds query, *argv;
2250 int argc, j;
2251
2252 query = c->querybuf;
2253 c->querybuf = sdsempty();
2254 querylen = 1+(p-(query));
2255 if (sdslen(query) > querylen) {
2256 /* leave data after the first line of the query in the buffer */
2257 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2258 }
2259 *p = '\0'; /* remove "\n" */
2260 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2261 sdsupdatelen(query);
2262
2263 /* Now we can split the query in arguments */
2264 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2265 sdsfree(query);
2266
2267 if (c->argv) zfree(c->argv);
2268 c->argv = zmalloc(sizeof(robj*)*argc);
2269
2270 for (j = 0; j < argc; j++) {
2271 if (sdslen(argv[j])) {
2272 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2273 c->argc++;
2274 } else {
2275 sdsfree(argv[j]);
2276 }
2277 }
2278 zfree(argv);
2279 if (c->argc) {
2280 /* Execute the command. If the client is still valid
2281 * after processCommand() return and there is something
2282 * on the query buffer try to process the next command. */
2283 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2284 } else {
2285 /* Nothing to process, argc == 0. Just process the query
2286 * buffer if it's not empty or return to the caller */
2287 if (sdslen(c->querybuf)) goto again;
2288 }
2289 return;
2290 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2291 redisLog(REDIS_VERBOSE, "Client protocol error");
2292 freeClient(c);
2293 return;
2294 }
2295 } else {
2296 /* Bulk read handling. Note that if we are at this point
2297 the client already sent a command terminated with a newline,
2298 we are reading the bulk data that is actually the last
2299 argument of the command. */
2300 int qbl = sdslen(c->querybuf);
2301
2302 if (c->bulklen <= qbl) {
2303 /* Copy everything but the final CRLF as final argument */
2304 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2305 c->argc++;
2306 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2307 /* Process the command. If the client is still valid after
2308 * the processing and there is more data in the buffer
2309 * try to parse it. */
2310 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2311 return;
2312 }
2313 }
2314 }
2315
2316 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2317 redisClient *c = (redisClient*) privdata;
2318 char buf[REDIS_IOBUF_LEN];
2319 int nread;
2320 REDIS_NOTUSED(el);
2321 REDIS_NOTUSED(mask);
2322
2323 nread = read(fd, buf, REDIS_IOBUF_LEN);
2324 if (nread == -1) {
2325 if (errno == EAGAIN) {
2326 nread = 0;
2327 } else {
2328 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2329 freeClient(c);
2330 return;
2331 }
2332 } else if (nread == 0) {
2333 redisLog(REDIS_VERBOSE, "Client closed connection");
2334 freeClient(c);
2335 return;
2336 }
2337 if (nread) {
2338 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2339 c->lastinteraction = time(NULL);
2340 } else {
2341 return;
2342 }
2343 if (!(c->flags & REDIS_BLOCKED))
2344 processInputBuffer(c);
2345 }
2346
2347 static int selectDb(redisClient *c, int id) {
2348 if (id < 0 || id >= server.dbnum)
2349 return REDIS_ERR;
2350 c->db = &server.db[id];
2351 return REDIS_OK;
2352 }
2353
2354 static void *dupClientReplyValue(void *o) {
2355 incrRefCount((robj*)o);
2356 return o;
2357 }
2358
2359 static redisClient *createClient(int fd) {
2360 redisClient *c = zmalloc(sizeof(*c));
2361
2362 anetNonBlock(NULL,fd);
2363 anetTcpNoDelay(NULL,fd);
2364 if (!c) return NULL;
2365 selectDb(c,0);
2366 c->fd = fd;
2367 c->querybuf = sdsempty();
2368 c->argc = 0;
2369 c->argv = NULL;
2370 c->bulklen = -1;
2371 c->multibulk = 0;
2372 c->mbargc = 0;
2373 c->mbargv = NULL;
2374 c->sentlen = 0;
2375 c->flags = 0;
2376 c->lastinteraction = time(NULL);
2377 c->authenticated = 0;
2378 c->replstate = REDIS_REPL_NONE;
2379 c->reply = listCreate();
2380 listSetFreeMethod(c->reply,decrRefCount);
2381 listSetDupMethod(c->reply,dupClientReplyValue);
2382 c->blockingkeys = NULL;
2383 c->blockingkeysnum = 0;
2384 c->io_keys = listCreate();
2385 listSetFreeMethod(c->io_keys,decrRefCount);
2386 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2387 readQueryFromClient, c) == AE_ERR) {
2388 freeClient(c);
2389 return NULL;
2390 }
2391 listAddNodeTail(server.clients,c);
2392 initClientMultiState(c);
2393 return c;
2394 }
2395
2396 static void addReply(redisClient *c, robj *obj) {
2397 if (listLength(c->reply) == 0 &&
2398 (c->replstate == REDIS_REPL_NONE ||
2399 c->replstate == REDIS_REPL_ONLINE) &&
2400 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2401 sendReplyToClient, c) == AE_ERR) return;
2402
2403 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2404 obj = dupStringObject(obj);
2405 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2406 }
2407 listAddNodeTail(c->reply,getDecodedObject(obj));
2408 }
2409
2410 static void addReplySds(redisClient *c, sds s) {
2411 robj *o = createObject(REDIS_STRING,s);
2412 addReply(c,o);
2413 decrRefCount(o);
2414 }
2415
2416 static void addReplyDouble(redisClient *c, double d) {
2417 char buf[128];
2418
2419 snprintf(buf,sizeof(buf),"%.17g",d);
2420 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2421 (unsigned long) strlen(buf),buf));
2422 }
2423
2424 static void addReplyLong(redisClient *c, long l) {
2425 char buf[128];
2426 size_t len;
2427
2428 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2429 addReplySds(c,sdsnewlen(buf,len));
2430 }
2431
2432 static void addReplyBulkLen(redisClient *c, robj *obj) {
2433 size_t len;
2434
2435 if (obj->encoding == REDIS_ENCODING_RAW) {
2436 len = sdslen(obj->ptr);
2437 } else {
2438 long n = (long)obj->ptr;
2439
2440 /* Compute how many bytes will take this integer as a radix 10 string */
2441 len = 1;
2442 if (n < 0) {
2443 len++;
2444 n = -n;
2445 }
2446 while((n = n/10) != 0) {
2447 len++;
2448 }
2449 }
2450 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2451 }
2452
2453 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2454 int cport, cfd;
2455 char cip[128];
2456 redisClient *c;
2457 REDIS_NOTUSED(el);
2458 REDIS_NOTUSED(mask);
2459 REDIS_NOTUSED(privdata);
2460
2461 cfd = anetAccept(server.neterr, fd, cip, &cport);
2462 if (cfd == AE_ERR) {
2463 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2464 return;
2465 }
2466 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2467 if ((c = createClient(cfd)) == NULL) {
2468 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2469 close(cfd); /* May be already closed, just ingore errors */
2470 return;
2471 }
2472 /* If maxclient directive is set and this is one client more... close the
2473 * connection. Note that we create the client instead to check before
2474 * for this condition, since now the socket is already set in nonblocking
2475 * mode and we can send an error for free using the Kernel I/O */
2476 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2477 char *err = "-ERR max number of clients reached\r\n";
2478
2479 /* That's a best effort error message, don't check write errors */
2480 if (write(c->fd,err,strlen(err)) == -1) {
2481 /* Nothing to do, Just to avoid the warning... */
2482 }
2483 freeClient(c);
2484 return;
2485 }
2486 server.stat_numconnections++;
2487 }
2488
2489 /* ======================= Redis objects implementation ===================== */
2490
2491 static robj *createObject(int type, void *ptr) {
2492 robj *o;
2493
2494 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2495 if (listLength(server.objfreelist)) {
2496 listNode *head = listFirst(server.objfreelist);
2497 o = listNodeValue(head);
2498 listDelNode(server.objfreelist,head);
2499 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2500 } else {
2501 if (server.vm_enabled) {
2502 pthread_mutex_unlock(&server.obj_freelist_mutex);
2503 o = zmalloc(sizeof(*o));
2504 } else {
2505 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2506 }
2507 }
2508 o->type = type;
2509 o->encoding = REDIS_ENCODING_RAW;
2510 o->ptr = ptr;
2511 o->refcount = 1;
2512 if (server.vm_enabled) {
2513 /* Note that this code may run in the context of an I/O thread
2514 * and accessing to server.unixtime in theory is an error
2515 * (no locks). But in practice this is safe, and even if we read
2516 * garbage Redis will not fail, as it's just a statistical info */
2517 o->vm.atime = server.unixtime;
2518 o->storage = REDIS_VM_MEMORY;
2519 }
2520 return o;
2521 }
2522
2523 static robj *createStringObject(char *ptr, size_t len) {
2524 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2525 }
2526
2527 static robj *dupStringObject(robj *o) {
2528 assert(o->encoding == REDIS_ENCODING_RAW);
2529 return createStringObject(o->ptr,sdslen(o->ptr));
2530 }
2531
2532 static robj *createListObject(void) {
2533 list *l = listCreate();
2534
2535 listSetFreeMethod(l,decrRefCount);
2536 return createObject(REDIS_LIST,l);
2537 }
2538
2539 static robj *createSetObject(void) {
2540 dict *d = dictCreate(&setDictType,NULL);
2541 return createObject(REDIS_SET,d);
2542 }
2543
2544 static robj *createZsetObject(void) {
2545 zset *zs = zmalloc(sizeof(*zs));
2546
2547 zs->dict = dictCreate(&zsetDictType,NULL);
2548 zs->zsl = zslCreate();
2549 return createObject(REDIS_ZSET,zs);
2550 }
2551
2552 static void freeStringObject(robj *o) {
2553 if (o->encoding == REDIS_ENCODING_RAW) {
2554 sdsfree(o->ptr);
2555 }
2556 }
2557
2558 static void freeListObject(robj *o) {
2559 listRelease((list*) o->ptr);
2560 }
2561
2562 static void freeSetObject(robj *o) {
2563 dictRelease((dict*) o->ptr);
2564 }
2565
2566 static void freeZsetObject(robj *o) {
2567 zset *zs = o->ptr;
2568
2569 dictRelease(zs->dict);
2570 zslFree(zs->zsl);
2571 zfree(zs);
2572 }
2573
2574 static void freeHashObject(robj *o) {
2575 dictRelease((dict*) o->ptr);
2576 }
2577
2578 static void incrRefCount(robj *o) {
2579 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2580 o->refcount++;
2581 }
2582
2583 static void decrRefCount(void *obj) {
2584 robj *o = obj;
2585
2586 /* Object is a key of a swapped out value, or in the process of being
2587 * loaded. */
2588 if (server.vm_enabled &&
2589 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2590 {
2591 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2592 redisAssert(o->refcount == 1);
2593 }
2594 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2595 redisAssert(o->type == REDIS_STRING);
2596 freeStringObject(o);
2597 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2598 pthread_mutex_lock(&server.obj_freelist_mutex);
2599 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2600 !listAddNodeHead(server.objfreelist,o))
2601 zfree(o);
2602 pthread_mutex_unlock(&server.obj_freelist_mutex);
2603 server.vm_stats_swapped_objects--;
2604 return;
2605 }
2606 /* Object is in memory, or in the process of being swapped out. */
2607 if (--(o->refcount) == 0) {
2608 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2609 vmCancelThreadedIOJob(obj);
2610 switch(o->type) {
2611 case REDIS_STRING: freeStringObject(o); break;
2612 case REDIS_LIST: freeListObject(o); break;
2613 case REDIS_SET: freeSetObject(o); break;
2614 case REDIS_ZSET: freeZsetObject(o); break;
2615 case REDIS_HASH: freeHashObject(o); break;
2616 default: redisAssert(0 != 0); break;
2617 }
2618 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2619 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2620 !listAddNodeHead(server.objfreelist,o))
2621 zfree(o);
2622 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2623 }
2624 }
2625
2626 static robj *lookupKey(redisDb *db, robj *key) {
2627 dictEntry *de = dictFind(db->dict,key);
2628 if (de) {
2629 robj *key = dictGetEntryKey(de);
2630 robj *val = dictGetEntryVal(de);
2631
2632 if (server.vm_enabled) {
2633 if (key->storage == REDIS_VM_MEMORY ||
2634 key->storage == REDIS_VM_SWAPPING)
2635 {
2636 /* If we were swapping the object out, stop it, this key
2637 * was requested. */
2638 if (key->storage == REDIS_VM_SWAPPING)
2639 vmCancelThreadedIOJob(key);
2640 /* Update the access time of the key for the aging algorithm. */
2641 key->vm.atime = server.unixtime;
2642 } else {
2643 int notify = (key->storage == REDIS_VM_LOADING);
2644
2645 /* Our value was swapped on disk. Bring it at home. */
2646 redisAssert(val == NULL);
2647 val = vmLoadObject(key);
2648 dictGetEntryVal(de) = val;
2649
2650 /* Clients blocked by the VM subsystem may be waiting for
2651 * this key... */
2652 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2653 }
2654 }
2655 return val;
2656 } else {
2657 return NULL;
2658 }
2659 }
2660
2661 static robj *lookupKeyRead(redisDb *db, robj *key) {
2662 expireIfNeeded(db,key);
2663 return lookupKey(db,key);
2664 }
2665
2666 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2667 deleteIfVolatile(db,key);
2668 return lookupKey(db,key);
2669 }
2670
2671 static int deleteKey(redisDb *db, robj *key) {
2672 int retval;
2673
2674 /* We need to protect key from destruction: after the first dictDelete()
2675 * it may happen that 'key' is no longer valid if we don't increment
2676 * it's count. This may happen when we get the object reference directly
2677 * from the hash table with dictRandomKey() or dict iterators */
2678 incrRefCount(key);
2679 if (dictSize(db->expires)) dictDelete(db->expires,key);
2680 retval = dictDelete(db->dict,key);
2681 decrRefCount(key);
2682
2683 return retval == DICT_OK;
2684 }
2685
2686 /* Try to share an object against the shared objects pool */
2687 static robj *tryObjectSharing(robj *o) {
2688 struct dictEntry *de;
2689 unsigned long c;
2690
2691 if (o == NULL || server.shareobjects == 0) return o;
2692
2693 redisAssert(o->type == REDIS_STRING);
2694 de = dictFind(server.sharingpool,o);
2695 if (de) {
2696 robj *shared = dictGetEntryKey(de);
2697
2698 c = ((unsigned long) dictGetEntryVal(de))+1;
2699 dictGetEntryVal(de) = (void*) c;
2700 incrRefCount(shared);
2701 decrRefCount(o);
2702 return shared;
2703 } else {
2704 /* Here we are using a stream algorihtm: Every time an object is
2705 * shared we increment its count, everytime there is a miss we
2706 * recrement the counter of a random object. If this object reaches
2707 * zero we remove the object and put the current object instead. */
2708 if (dictSize(server.sharingpool) >=
2709 server.sharingpoolsize) {
2710 de = dictGetRandomKey(server.sharingpool);
2711 redisAssert(de != NULL);
2712 c = ((unsigned long) dictGetEntryVal(de))-1;
2713 dictGetEntryVal(de) = (void*) c;
2714 if (c == 0) {
2715 dictDelete(server.sharingpool,de->key);
2716 }
2717 } else {
2718 c = 0; /* If the pool is empty we want to add this object */
2719 }
2720 if (c == 0) {
2721 int retval;
2722
2723 retval = dictAdd(server.sharingpool,o,(void*)1);
2724 redisAssert(retval == DICT_OK);
2725 incrRefCount(o);
2726 }
2727 return o;
2728 }
2729 }
2730
2731 /* Check if the nul-terminated string 's' can be represented by a long
2732 * (that is, is a number that fits into long without any other space or
2733 * character before or after the digits).
2734 *
2735 * If so, the function returns REDIS_OK and *longval is set to the value
2736 * of the number. Otherwise REDIS_ERR is returned */
2737 static int isStringRepresentableAsLong(sds s, long *longval) {
2738 char buf[32], *endptr;
2739 long value;
2740 int slen;
2741
2742 value = strtol(s, &endptr, 10);
2743 if (endptr[0] != '\0') return REDIS_ERR;
2744 slen = snprintf(buf,32,"%ld",value);
2745
2746 /* If the number converted back into a string is not identical
2747 * then it's not possible to encode the string as integer */
2748 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2749 if (longval) *longval = value;
2750 return REDIS_OK;
2751 }
2752
2753 /* Try to encode a string object in order to save space */
2754 static int tryObjectEncoding(robj *o) {
2755 long value;
2756 sds s = o->ptr;
2757
2758 if (o->encoding != REDIS_ENCODING_RAW)
2759 return REDIS_ERR; /* Already encoded */
2760
2761 /* It's not save to encode shared objects: shared objects can be shared
2762 * everywhere in the "object space" of Redis. Encoded objects can only
2763 * appear as "values" (and not, for instance, as keys) */
2764 if (o->refcount > 1) return REDIS_ERR;
2765
2766 /* Currently we try to encode only strings */
2767 redisAssert(o->type == REDIS_STRING);
2768
2769 /* Check if we can represent this string as a long integer */
2770 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2771
2772 /* Ok, this object can be encoded */
2773 o->encoding = REDIS_ENCODING_INT;
2774 sdsfree(o->ptr);
2775 o->ptr = (void*) value;
2776 return REDIS_OK;
2777 }
2778
2779 /* Get a decoded version of an encoded object (returned as a new object).
2780 * If the object is already raw-encoded just increment the ref count. */
2781 static robj *getDecodedObject(robj *o) {
2782 robj *dec;
2783
2784 if (o->encoding == REDIS_ENCODING_RAW) {
2785 incrRefCount(o);
2786 return o;
2787 }
2788 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2789 char buf[32];
2790
2791 snprintf(buf,32,"%ld",(long)o->ptr);
2792 dec = createStringObject(buf,strlen(buf));
2793 return dec;
2794 } else {
2795 redisAssert(1 != 1);
2796 }
2797 }
2798
2799 /* Compare two string objects via strcmp() or alike.
2800 * Note that the objects may be integer-encoded. In such a case we
2801 * use snprintf() to get a string representation of the numbers on the stack
2802 * and compare the strings, it's much faster than calling getDecodedObject().
2803 *
2804 * Important note: if objects are not integer encoded, but binary-safe strings,
2805 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2806 * binary safe. */
2807 static int compareStringObjects(robj *a, robj *b) {
2808 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2809 char bufa[128], bufb[128], *astr, *bstr;
2810 int bothsds = 1;
2811
2812 if (a == b) return 0;
2813 if (a->encoding != REDIS_ENCODING_RAW) {
2814 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2815 astr = bufa;
2816 bothsds = 0;
2817 } else {
2818 astr = a->ptr;
2819 }
2820 if (b->encoding != REDIS_ENCODING_RAW) {
2821 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2822 bstr = bufb;
2823 bothsds = 0;
2824 } else {
2825 bstr = b->ptr;
2826 }
2827 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2828 }
2829
2830 static size_t stringObjectLen(robj *o) {
2831 redisAssert(o->type == REDIS_STRING);
2832 if (o->encoding == REDIS_ENCODING_RAW) {
2833 return sdslen(o->ptr);
2834 } else {
2835 char buf[32];
2836
2837 return snprintf(buf,32,"%ld",(long)o->ptr);
2838 }
2839 }
2840
2841 /*============================ RDB saving/loading =========================== */
2842
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if the byte
 * could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,sizeof(type),1,fp) == 1) ? 0 : -1;
}
2847
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,sizeof(secs),1,fp) == 1) ? 0 : -1;
}
2853
2854 /* check rdbLoadLen() comments for more info */
2855 static int rdbSaveLen(FILE *fp, uint32_t len) {
2856 unsigned char buf[2];
2857
2858 if (len < (1<<6)) {
2859 /* Save a 6 bit len */
2860 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2861 if (fwrite(buf,1,1,fp) == 0) return -1;
2862 } else if (len < (1<<14)) {
2863 /* Save a 14 bit len */
2864 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2865 buf[1] = len&0xFF;
2866 if (fwrite(buf,2,1,fp) == 0) return -1;
2867 } else {
2868 /* Save a 32 bit len */
2869 buf[0] = (REDIS_RDB_32BITLEN<<6);
2870 if (fwrite(buf,1,1,fp) == 0) return -1;
2871 len = htonl(len);
2872 if (fwrite(&len,4,1,fp) == 0) return -1;
2873 }
2874 return 0;
2875 }
2876
2877 /* String objects in the form "2391" "-100" without any space and with a
2878 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2879 * encoded as integers to save space */
2880 static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
2881 long long value;
2882 char *endptr, buf[32];
2883
2884 /* Check if it's possible to encode this value as a number */
2885 value = strtoll(s, &endptr, 10);
2886 if (endptr[0] != '\0') return 0;
2887 snprintf(buf,32,"%lld",value);
2888
2889 /* If the number converted back into a string is not identical
2890 * then it's not possible to encode the string as integer */
2891 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2892
2893 /* Finally check if it fits in our ranges */
2894 if (value >= -(1<<7) && value <= (1<<7)-1) {
2895 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2896 enc[1] = value&0xFF;
2897 return 2;
2898 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2899 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2900 enc[1] = value&0xFF;
2901 enc[2] = (value>>8)&0xFF;
2902 return 3;
2903 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2904 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2905 enc[1] = value&0xFF;
2906 enc[2] = (value>>8)&0xFF;
2907 enc[3] = (value>>16)&0xFF;
2908 enc[4] = (value>>24)&0xFF;
2909 return 5;
2910 } else {
2911 return 0;
2912 }
2913 }
2914
2915 static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2916 unsigned int comprlen, outlen;
2917 unsigned char byte;
2918 void *out;
2919
2920 /* We require at least four bytes compression for this to be worth it */
2921 outlen = sdslen(obj->ptr)-4;
2922 if (outlen <= 0) return 0;
2923 if ((out = zmalloc(outlen+1)) == NULL) return 0;
2924 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2925 if (comprlen == 0) {
2926 zfree(out);
2927 return 0;
2928 }
2929 /* Data compressed! Let's save it on disk */
2930 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2931 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2932 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2933 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2934 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
2935 zfree(out);
2936 return comprlen;
2937
2938 writeerr:
2939 zfree(out);
2940 return -1;
2941 }
2942
2943 /* Save a string objet as [len][data] on disk. If the object is a string
2944 * representation of an integer value we try to safe it in a special form */
2945 static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2946 size_t len;
2947 int enclen;
2948
2949 len = sdslen(obj->ptr);
2950
2951 /* Try integer encoding */
2952 if (len <= 11) {
2953 unsigned char buf[5];
2954 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2955 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2956 return 0;
2957 }
2958 }
2959
2960 /* Try LZF compression - under 20 bytes it's unable to compress even
2961 * aaaaaaaaaaaaaaaaaa so skip it */
2962 if (server.rdbcompression && len > 20) {
2963 int retval;
2964
2965 retval = rdbSaveLzfStringObject(fp,obj);
2966 if (retval == -1) return -1;
2967 if (retval > 0) return 0;
2968 /* retval == 0 means data can't be compressed, save the old way */
2969 }
2970
2971 /* Store verbatim */
2972 if (rdbSaveLen(fp,len) == -1) return -1;
2973 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
2974 return 0;
2975 }
2976
2977 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
2978 static int rdbSaveStringObject(FILE *fp, robj *obj) {
2979 int retval;
2980
2981 /* Avoid incr/decr ref count business when possible.
2982 * This plays well with copy-on-write given that we are probably
2983 * in a child process (BGSAVE). Also this makes sure key objects
2984 * of swapped objects are not incRefCount-ed (an assert does not allow
2985 * this in order to avoid bugs) */
2986 if (obj->encoding != REDIS_ENCODING_RAW) {
2987 obj = getDecodedObject(obj);
2988 retval = rdbSaveStringObjectRaw(fp,obj);
2989 decrRefCount(obj);
2990 } else {
2991 retval = rdbSaveStringObjectRaw(fp,obj);
2992 }
2993 return retval;
2994 }
2995
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. Three values of that prefix byte are reserved, and are
 * written alone, to flag special conditions:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    size_t nbytes = 1; /* The prefix byte is always written. */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        nbytes += buf[0];
    }
    return (fwrite(buf,nbytes,1,fp) == 0) ? -1 : 0;
}
3022
3023 /* Save a Redis object. */
3024 static int rdbSaveObject(FILE *fp, robj *o) {
3025 if (o->type == REDIS_STRING) {
3026 /* Save a string value */
3027 if (rdbSaveStringObject(fp,o) == -1) return -1;
3028 } else if (o->type == REDIS_LIST) {
3029 /* Save a list value */
3030 list *list = o->ptr;
3031 listIter li;
3032 listNode *ln;
3033
3034 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3035 listRewind(list,&li);
3036 while((ln = listNext(&li))) {
3037 robj *eleobj = listNodeValue(ln);
3038
3039 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3040 }
3041 } else if (o->type == REDIS_SET) {
3042 /* Save a set value */
3043 dict *set = o->ptr;
3044 dictIterator *di = dictGetIterator(set);
3045 dictEntry *de;
3046
3047 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3048 while((de = dictNext(di)) != NULL) {
3049 robj *eleobj = dictGetEntryKey(de);
3050
3051 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3052 }
3053 dictReleaseIterator(di);
3054 } else if (o->type == REDIS_ZSET) {
3055 /* Save a set value */
3056 zset *zs = o->ptr;
3057 dictIterator *di = dictGetIterator(zs->dict);
3058 dictEntry *de;
3059
3060 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3061 while((de = dictNext(di)) != NULL) {
3062 robj *eleobj = dictGetEntryKey(de);
3063 double *score = dictGetEntryVal(de);
3064
3065 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3066 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3067 }
3068 dictReleaseIterator(di);
3069 } else {
3070 redisAssert(0 != 0);
3071 }
3072 return 0;
3073 }
3074
3075 /* Return the length the object will have on disk if saved with
3076 * the rdbSaveObject() function. Currently we use a trick to get
3077 * this length with very little changes to the code. In the future
3078 * we could switch to a faster solution. */
3079 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3080 if (fp == NULL) fp = server.devnull;
3081 rewind(fp);
3082 assert(rdbSaveObject(fp,o) != 1);
3083 return ftello(fp);
3084 }
3085
3086 /* Return the number of pages required to save this object in the swap file */
3087 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3088 off_t bytes = rdbSavedObjectLen(o,fp);
3089
3090 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3091 }
3092
3093 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3094 static int rdbSave(char *filename) {
3095 dictIterator *di = NULL;
3096 dictEntry *de;
3097 FILE *fp;
3098 char tmpfile[256];
3099 int j;
3100 time_t now = time(NULL);
3101
3102 /* Wait for I/O therads to terminate, just in case this is a
3103 * foreground-saving, to avoid seeking the swap file descriptor at the
3104 * same time. */
3105 if (server.vm_enabled)
3106 waitEmptyIOJobsQueue();
3107
3108 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3109 fp = fopen(tmpfile,"w");
3110 if (!fp) {
3111 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3112 return REDIS_ERR;
3113 }
3114 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3115 for (j = 0; j < server.dbnum; j++) {
3116 redisDb *db = server.db+j;
3117 dict *d = db->dict;
3118 if (dictSize(d) == 0) continue;
3119 di = dictGetIterator(d);
3120 if (!di) {
3121 fclose(fp);
3122 return REDIS_ERR;
3123 }
3124
3125 /* Write the SELECT DB opcode */
3126 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3127 if (rdbSaveLen(fp,j) == -1) goto werr;
3128
3129 /* Iterate this DB writing every entry */
3130 while((de = dictNext(di)) != NULL) {
3131 robj *key = dictGetEntryKey(de);
3132 robj *o = dictGetEntryVal(de);
3133 time_t expiretime = getExpire(db,key);
3134
3135 /* Save the expire time */
3136 if (expiretime != -1) {
3137 /* If this key is already expired skip it */
3138 if (expiretime < now) continue;
3139 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3140 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3141 }
3142 /* Save the key and associated value. This requires special
3143 * handling if the value is swapped out. */
3144 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3145 key->storage == REDIS_VM_SWAPPING) {
3146 /* Save type, key, value */
3147 if (rdbSaveType(fp,o->type) == -1) goto werr;
3148 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3149 if (rdbSaveObject(fp,o) == -1) goto werr;
3150 } else {
3151 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3152 robj *po;
3153 /* Get a preview of the object in memory */
3154 po = vmPreviewObject(key);
3155 /* Save type, key, value */
3156 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3157 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3158 if (rdbSaveObject(fp,po) == -1) goto werr;
3159 /* Remove the loaded object from memory */
3160 decrRefCount(po);
3161 }
3162 }
3163 dictReleaseIterator(di);
3164 }
3165 /* EOF opcode */
3166 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3167
3168 /* Make sure data will not remain on the OS's output buffers */
3169 fflush(fp);
3170 fsync(fileno(fp));
3171 fclose(fp);
3172
3173 /* Use RENAME to make sure the DB file is changed atomically only
3174 * if the generate DB file is ok. */
3175 if (rename(tmpfile,filename) == -1) {
3176 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3177 unlink(tmpfile);
3178 return REDIS_ERR;
3179 }
3180 redisLog(REDIS_NOTICE,"DB saved on disk");
3181 server.dirty = 0;
3182 server.lastsave = time(NULL);
3183 return REDIS_OK;
3184
3185 werr:
3186 fclose(fp);
3187 unlink(tmpfile);
3188 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3189 if (di) dictReleaseIterator(di);
3190 return REDIS_ERR;
3191 }
3192
3193 static int rdbSaveBackground(char *filename) {
3194 pid_t childpid;
3195
3196 if (server.bgsavechildpid != -1) return REDIS_ERR;
3197 if (server.vm_enabled) waitEmptyIOJobsQueue();
3198 if ((childpid = fork()) == 0) {
3199 /* Child */
3200 if (server.vm_enabled) vmReopenSwapFile();
3201 close(server.fd);
3202 if (rdbSave(filename) == REDIS_OK) {
3203 _exit(0);
3204 } else {
3205 _exit(1);
3206 }
3207 } else {
3208 /* Parent */
3209 if (childpid == -1) {
3210 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3211 strerror(errno));
3212 return REDIS_ERR;
3213 }
3214 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3215 server.bgsavechildpid = childpid;
3216 return REDIS_OK;
3217 }
3218 return REDIS_OK; /* unreached */
3219 }
3220
/* Remove the temporary dump file "temp-<pid>.rdb" named after the given
 * process id. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3227
/* Read a one byte type from 'fp'. Returns the byte value (0-255), or -1
 * when the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,sizeof(type),1,fp) == 1) ? type : -1;
}
3233
/* Read a time stored as a 32 bit signed integer in host byte order (see
 * rdbSaveTime()). Returns -1 when the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    if (fread(&secs,sizeof(secs),1,fp) != 1) return -1;
    return (time_t) secs;
}
3239
3240 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3241 * of this file for a description of how this are stored on disk.
3242 *
3243 * isencoded is set to 1 if the readed length is not actually a length but
3244 * an "encoding type", check the above comments for more info */
3245 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3246 unsigned char buf[2];
3247 uint32_t len;
3248 int type;
3249
3250 if (isencoded) *isencoded = 0;
3251 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3252 type = (buf[0]&0xC0)>>6;
3253 if (type == REDIS_RDB_6BITLEN) {
3254 /* Read a 6 bit len */
3255 return buf[0]&0x3F;
3256 } else if (type == REDIS_RDB_ENCVAL) {
3257 /* Read a 6 bit len encoding type */
3258 if (isencoded) *isencoded = 1;
3259 return buf[0]&0x3F;
3260 } else if (type == REDIS_RDB_14BITLEN) {
3261 /* Read a 14 bit len */
3262 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3263 return ((buf[0]&0x3F)<<8)|buf[1];
3264 } else {
3265 /* Read a 32 bit len */
3266 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3267 return ntohl(len);
3268 }
3269 }
3270
3271 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3272 unsigned char enc[4];
3273 long long val;
3274
3275 if (enctype == REDIS_RDB_ENC_INT8) {
3276 if (fread(enc,1,1,fp) == 0) return NULL;
3277 val = (signed char)enc[0];
3278 } else if (enctype == REDIS_RDB_ENC_INT16) {
3279 uint16_t v;
3280 if (fread(enc,2,1,fp) == 0) return NULL;
3281 v = enc[0]|(enc[1]<<8);
3282 val = (int16_t)v;
3283 } else if (enctype == REDIS_RDB_ENC_INT32) {
3284 uint32_t v;
3285 if (fread(enc,4,1,fp) == 0) return NULL;
3286 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3287 val = (int32_t)v;
3288 } else {
3289 val = 0; /* anti-warning */
3290 redisAssert(0!=0);
3291 }
3292 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3293 }
3294
3295 static robj *rdbLoadLzfStringObject(FILE*fp) {
3296 unsigned int len, clen;
3297 unsigned char *c = NULL;
3298 sds val = NULL;
3299
3300 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3301 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3302 if ((c = zmalloc(clen)) == NULL) goto err;
3303 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3304 if (fread(c,clen,1,fp) == 0) goto err;
3305 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3306 zfree(c);
3307 return createObject(REDIS_STRING,val);
3308 err:
3309 zfree(c);
3310 sdsfree(val);
3311 return NULL;
3312 }
3313
3314 static robj *rdbLoadStringObject(FILE*fp) {
3315 int isencoded;
3316 uint32_t len;
3317 sds val;
3318
3319 len = rdbLoadLen(fp,&isencoded);
3320 if (isencoded) {
3321 switch(len) {
3322 case REDIS_RDB_ENC_INT8:
3323 case REDIS_RDB_ENC_INT16:
3324 case REDIS_RDB_ENC_INT32:
3325 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3326 case REDIS_RDB_ENC_LZF:
3327 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3328 default:
3329 redisAssert(0!=0);
3330 }
3331 }
3332
3333 if (len == REDIS_RDB_LENERR) return NULL;
3334 val = sdsnewlen(NULL,len);
3335 if (len && fread(val,len,1,fp) == 0) {
3336 sdsfree(val);
3337 return NULL;
3338 }
3339 return tryObjectSharing(createObject(REDIS_STRING,val));
3340 }
3341
3342 /* For information about double serialization check rdbSaveDoubleValue() */
3343 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3344 char buf[128];
3345 unsigned char len;
3346
3347 if (fread(&len,1,1,fp) == 0) return -1;
3348 switch(len) {
3349 case 255: *val = R_NegInf; return 0;
3350 case 254: *val = R_PosInf; return 0;
3351 case 253: *val = R_Nan; return 0;
3352 default:
3353 if (fread(buf,len,1,fp) == 0) return -1;
3354 buf[len] = '\0';
3355 sscanf(buf, "%lg", val);
3356 return 0;
3357 }
3358 }
3359
3360 /* Load a Redis object of the specified type from the specified file.
3361 * On success a newly allocated object is returned, otherwise NULL. */
3362 static robj *rdbLoadObject(int type, FILE *fp) {
3363 robj *o;
3364
3365 if (type == REDIS_STRING) {
3366 /* Read string value */
3367 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3368 tryObjectEncoding(o);
3369 } else if (type == REDIS_LIST || type == REDIS_SET) {
3370 /* Read list/set value */
3371 uint32_t listlen;
3372
3373 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3374 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3375 /* It's faster to expand the dict to the right size asap in order
3376 * to avoid rehashing */
3377 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3378 dictExpand(o->ptr,listlen);
3379 /* Load every single element of the list/set */
3380 while(listlen--) {
3381 robj *ele;
3382
3383 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3384 tryObjectEncoding(ele);
3385 if (type == REDIS_LIST) {
3386 listAddNodeTail((list*)o->ptr,ele);
3387 } else {
3388 dictAdd((dict*)o->ptr,ele,NULL);
3389 }
3390 }
3391 } else if (type == REDIS_ZSET) {
3392 /* Read list/set value */
3393 uint32_t zsetlen;
3394 zset *zs;
3395
3396 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3397 o = createZsetObject();
3398 zs = o->ptr;
3399 /* Load every single element of the list/set */
3400 while(zsetlen--) {
3401 robj *ele;
3402 double *score = zmalloc(sizeof(double));
3403
3404 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3405 tryObjectEncoding(ele);
3406 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3407 dictAdd(zs->dict,ele,score);
3408 zslInsert(zs->zsl,*score,ele);
3409 incrRefCount(ele); /* added to skiplist */
3410 }
3411 } else {
3412 redisAssert(0 != 0);
3413 }
3414 return o;
3415 }
3416
3417 static int rdbLoad(char *filename) {
3418 FILE *fp;
3419 robj *keyobj = NULL;
3420 uint32_t dbid;
3421 int type, retval, rdbver;
3422 dict *d = server.db[0].dict;
3423 redisDb *db = server.db+0;
3424 char buf[1024];
3425 time_t expiretime = -1, now = time(NULL);
3426 long long loadedkeys = 0;
3427
3428 fp = fopen(filename,"r");
3429 if (!fp) return REDIS_ERR;
3430 if (fread(buf,9,1,fp) == 0) goto eoferr;
3431 buf[9] = '\0';
3432 if (memcmp(buf,"REDIS",5) != 0) {
3433 fclose(fp);
3434 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3435 return REDIS_ERR;
3436 }
3437 rdbver = atoi(buf+5);
3438 if (rdbver != 1) {
3439 fclose(fp);
3440 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3441 return REDIS_ERR;
3442 }
3443 while(1) {
3444 robj *o;
3445
3446 /* Read type. */
3447 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3448 if (type == REDIS_EXPIRETIME) {
3449 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3450 /* We read the time so we need to read the object type again */
3451 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3452 }
3453 if (type == REDIS_EOF) break;
3454 /* Handle SELECT DB opcode as a special case */
3455 if (type == REDIS_SELECTDB) {
3456 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3457 goto eoferr;
3458 if (dbid >= (unsigned)server.dbnum) {
3459 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3460 exit(1);
3461 }
3462 db = server.db+dbid;
3463 d = db->dict;
3464 continue;
3465 }
3466 /* Read key */
3467 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3468 /* Read value */
3469 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3470 /* Add the new object in the hash table */
3471 retval = dictAdd(d,keyobj,o);
3472 if (retval == DICT_ERR) {
3473 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3474 exit(1);
3475 }
3476 /* Set the expire time if needed */
3477 if (expiretime != -1) {
3478 setExpire(db,keyobj,expiretime);
3479 /* Delete this key if already expired */
3480 if (expiretime < now) deleteKey(db,keyobj);
3481 expiretime = -1;
3482 }
3483 keyobj = o = NULL;
3484 /* Handle swapping while loading big datasets when VM is on */
3485 loadedkeys++;
3486 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3487 while (zmalloc_used_memory() > server.vm_max_memory) {
3488 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3489 }
3490 }
3491 }
3492 fclose(fp);
3493 return REDIS_OK;
3494
3495 eoferr: /* unexpected end of file is handled here with a fatal exit */
3496 if (keyobj) decrRefCount(keyobj);
3497 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3498 exit(1);
3499 return REDIS_ERR; /* Just to avoid warning */
3500 }
3501
3502 /*================================== Commands =============================== */
3503
3504 static void authCommand(redisClient *c) {
3505 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3506 c->authenticated = 1;
3507 addReply(c,shared.ok);
3508 } else {
3509 c->authenticated = 0;
3510 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3511 }
3512 }
3513
3514 static void pingCommand(redisClient *c) {
3515 addReply(c,shared.pong);
3516 }
3517
3518 static void echoCommand(redisClient *c) {
3519 addReplyBulkLen(c,c->argv[1]);
3520 addReply(c,c->argv[1]);
3521 addReply(c,shared.crlf);
3522 }
3523
3524 /*=================================== Strings =============================== */
3525
3526 static void setGenericCommand(redisClient *c, int nx) {
3527 int retval;
3528
3529 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3530 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3531 if (retval == DICT_ERR) {
3532 if (!nx) {
3533 /* If the key is about a swapped value, we want a new key object
3534 * to overwrite the old. So we delete the old key in the database.
3535 * This will also make sure that swap pages about the old object
3536 * will be marked as free. */
3537 if (deleteIfSwapped(c->db,c->argv[1]))
3538 incrRefCount(c->argv[1]);
3539 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3540 incrRefCount(c->argv[2]);
3541 } else {
3542 addReply(c,shared.czero);
3543 return;
3544 }
3545 } else {
3546 incrRefCount(c->argv[1]);
3547 incrRefCount(c->argv[2]);
3548 }
3549 server.dirty++;
3550 removeExpire(c->db,c->argv[1]);
3551 addReply(c, nx ? shared.cone : shared.ok);
3552 }
3553
3554 static void setCommand(redisClient *c) {
3555 setGenericCommand(c,0);
3556 }
3557
3558 static void setnxCommand(redisClient *c) {
3559 setGenericCommand(c,1);
3560 }
3561
3562 static int getGenericCommand(redisClient *c) {
3563 robj *o = lookupKeyRead(c->db,c->argv[1]);
3564
3565 if (o == NULL) {
3566 addReply(c,shared.nullbulk);
3567 return REDIS_OK;
3568 } else {
3569 if (o->type != REDIS_STRING) {
3570 addReply(c,shared.wrongtypeerr);
3571 return REDIS_ERR;
3572 } else {
3573 addReplyBulkLen(c,o);
3574 addReply(c,o);
3575 addReply(c,shared.crlf);
3576 return REDIS_OK;
3577 }
3578 }
3579 }
3580
3581 static void getCommand(redisClient *c) {
3582 getGenericCommand(c);
3583 }
3584
3585 static void getsetCommand(redisClient *c) {
3586 if (getGenericCommand(c) == REDIS_ERR) return;
3587 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3588 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3589 } else {
3590 incrRefCount(c->argv[1]);
3591 }
3592 incrRefCount(c->argv[2]);
3593 server.dirty++;
3594 removeExpire(c->db,c->argv[1]);
3595 }
3596
3597 static void mgetCommand(redisClient *c) {
3598 int j;
3599
3600 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3601 for (j = 1; j < c->argc; j++) {
3602 robj *o = lookupKeyRead(c->db,c->argv[j]);
3603 if (o == NULL) {
3604 addReply(c,shared.nullbulk);
3605 } else {
3606 if (o->type != REDIS_STRING) {
3607 addReply(c,shared.nullbulk);
3608 } else {
3609 addReplyBulkLen(c,o);
3610 addReply(c,o);
3611 addReply(c,shared.crlf);
3612 }
3613 }
3614 }
3615 }
3616
3617 static void msetGenericCommand(redisClient *c, int nx) {
3618 int j, busykeys = 0;
3619
3620 if ((c->argc % 2) == 0) {
3621 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3622 return;
3623 }
3624 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3625 * set nothing at all if at least one already key exists. */
3626 if (nx) {
3627 for (j = 1; j < c->argc; j += 2) {
3628 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3629 busykeys++;
3630 }
3631 }
3632 }
3633 if (busykeys) {
3634 addReply(c, shared.czero);
3635 return;
3636 }
3637
3638 for (j = 1; j < c->argc; j += 2) {
3639 int retval;
3640
3641 tryObjectEncoding(c->argv[j+1]);
3642 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3643 if (retval == DICT_ERR) {
3644 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3645 incrRefCount(c->argv[j+1]);
3646 } else {
3647 incrRefCount(c->argv[j]);
3648 incrRefCount(c->argv[j+1]);
3649 }
3650 removeExpire(c->db,c->argv[j]);
3651 }
3652 server.dirty += (c->argc-1)/2;
3653 addReply(c, nx ? shared.cone : shared.ok);
3654 }
3655
3656 static void msetCommand(redisClient *c) {
3657 msetGenericCommand(c,0);
3658 }
3659
3660 static void msetnxCommand(redisClient *c) {
3661 msetGenericCommand(c,1);
3662 }
3663
3664 static void incrDecrCommand(redisClient *c, long long incr) {
3665 long long value;
3666 int retval;
3667 robj *o;
3668
3669 o = lookupKeyWrite(c->db,c->argv[1]);
3670 if (o == NULL) {
3671 value = 0;
3672 } else {
3673 if (o->type != REDIS_STRING) {
3674 value = 0;
3675 } else {
3676 char *eptr;
3677
3678 if (o->encoding == REDIS_ENCODING_RAW)
3679 value = strtoll(o->ptr, &eptr, 10);
3680 else if (o->encoding == REDIS_ENCODING_INT)
3681 value = (long)o->ptr;
3682 else
3683 redisAssert(1 != 1);
3684 }
3685 }
3686
3687 value += incr;
3688 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3689 tryObjectEncoding(o);
3690 retval = dictAdd(c->db->dict,c->argv[1],o);
3691 if (retval == DICT_ERR) {
3692 dictReplace(c->db->dict,c->argv[1],o);
3693 removeExpire(c->db,c->argv[1]);
3694 } else {
3695 incrRefCount(c->argv[1]);
3696 }
3697 server.dirty++;
3698 addReply(c,shared.colon);
3699 addReply(c,o);
3700 addReply(c,shared.crlf);
3701 }
3702
3703 static void incrCommand(redisClient *c) {
3704 incrDecrCommand(c,1);
3705 }
3706
3707 static void decrCommand(redisClient *c) {
3708 incrDecrCommand(c,-1);
3709 }
3710
3711 static void incrbyCommand(redisClient *c) {
3712 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3713 incrDecrCommand(c,incr);
3714 }
3715
3716 static void decrbyCommand(redisClient *c) {
3717 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3718 incrDecrCommand(c,-incr);
3719 }
3720
3721 static void appendCommand(redisClient *c) {
3722 int retval;
3723 size_t totlen;
3724 robj *o;
3725
3726 o = lookupKeyWrite(c->db,c->argv[1]);
3727 if (o == NULL) {
3728 /* Create the key */
3729 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3730 incrRefCount(c->argv[1]);
3731 incrRefCount(c->argv[2]);
3732 totlen = stringObjectLen(c->argv[2]);
3733 } else {
3734 dictEntry *de;
3735
3736 de = dictFind(c->db->dict,c->argv[1]);
3737 assert(de != NULL);
3738
3739 o = dictGetEntryVal(de);
3740 if (o->type != REDIS_STRING) {
3741 addReply(c,shared.wrongtypeerr);
3742 return;
3743 }
3744 /* If the object is specially encoded or shared we have to make
3745 * a copy */
3746 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3747 robj *decoded = getDecodedObject(o);
3748
3749 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3750 decrRefCount(decoded);
3751 dictReplace(c->db->dict,c->argv[1],o);
3752 }
3753 /* APPEND! */
3754 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3755 o->ptr = sdscatlen(o->ptr,
3756 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3757 } else {
3758 o->ptr = sdscatprintf(o->ptr, "%ld",
3759 (unsigned long) c->argv[2]->ptr);
3760 }
3761 totlen = sdslen(o->ptr);
3762 }
3763 server.dirty++;
3764 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3765 }
3766
3767 /* ========================= Type agnostic commands ========================= */
3768
3769 static void delCommand(redisClient *c) {
3770 int deleted = 0, j;
3771
3772 for (j = 1; j < c->argc; j++) {
3773 if (deleteKey(c->db,c->argv[j])) {
3774 server.dirty++;
3775 deleted++;
3776 }
3777 }
3778 switch(deleted) {
3779 case 0:
3780 addReply(c,shared.czero);
3781 break;
3782 case 1:
3783 addReply(c,shared.cone);
3784 break;
3785 default:
3786 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3787 break;
3788 }
3789 }
3790
3791 static void existsCommand(redisClient *c) {
3792 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
3793 }
3794
3795 static void selectCommand(redisClient *c) {
3796 int id = atoi(c->argv[1]->ptr);
3797
3798 if (selectDb(c,id) == REDIS_ERR) {
3799 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
3800 } else {
3801 addReply(c,shared.ok);
3802 }
3803 }
3804
3805 static void randomkeyCommand(redisClient *c) {
3806 dictEntry *de;
3807
3808 while(1) {
3809 de = dictGetRandomKey(c->db->dict);
3810 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3811 }
3812 if (de == NULL) {
3813 addReply(c,shared.plus);
3814 addReply(c,shared.crlf);
3815 } else {
3816 addReply(c,shared.plus);
3817 addReply(c,dictGetEntryKey(de));
3818 addReply(c,shared.crlf);
3819 }
3820 }
3821
3822 static void keysCommand(redisClient *c) {
3823 dictIterator *di;
3824 dictEntry *de;
3825 sds pattern = c->argv[1]->ptr;
3826 int plen = sdslen(pattern);
3827 unsigned long numkeys = 0;
3828 robj *lenobj = createObject(REDIS_STRING,NULL);
3829
3830 di = dictGetIterator(c->db->dict);
3831 addReply(c,lenobj);
3832 decrRefCount(lenobj);
3833 while((de = dictNext(di)) != NULL) {
3834 robj *keyobj = dictGetEntryKey(de);
3835
3836 sds key = keyobj->ptr;
3837 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3838 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3839 if (expireIfNeeded(c->db,keyobj) == 0) {
3840 addReplyBulkLen(c,keyobj);
3841 addReply(c,keyobj);
3842 addReply(c,shared.crlf);
3843 numkeys++;
3844 }
3845 }
3846 }
3847 dictReleaseIterator(di);
3848 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
3849 }
3850
3851 static void dbsizeCommand(redisClient *c) {
3852 addReplySds(c,
3853 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
3854 }
3855
3856 static void lastsaveCommand(redisClient *c) {
3857 addReplySds(c,
3858 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
3859 }
3860
3861 static void typeCommand(redisClient *c) {
3862 robj *o;
3863 char *type;
3864
3865 o = lookupKeyRead(c->db,c->argv[1]);
3866 if (o == NULL) {
3867 type = "+none";
3868 } else {
3869 switch(o->type) {
3870 case REDIS_STRING: type = "+string"; break;
3871 case REDIS_LIST: type = "+list"; break;
3872 case REDIS_SET: type = "+set"; break;
3873 case REDIS_ZSET: type = "+zset"; break;
3874 default: type = "unknown"; break;
3875 }
3876 }
3877 addReplySds(c,sdsnew(type));
3878 addReply(c,shared.crlf);
3879 }
3880
3881 static void saveCommand(redisClient *c) {
3882 if (server.bgsavechildpid != -1) {
3883 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3884 return;
3885 }
3886 if (rdbSave(server.dbfilename) == REDIS_OK) {
3887 addReply(c,shared.ok);
3888 } else {
3889 addReply(c,shared.err);
3890 }
3891 }
3892
3893 static void bgsaveCommand(redisClient *c) {
3894 if (server.bgsavechildpid != -1) {
3895 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3896 return;
3897 }
3898 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
3899 char *status = "+Background saving started\r\n";
3900 addReplySds(c,sdsnew(status));
3901 } else {
3902 addReply(c,shared.err);
3903 }
3904 }
3905
3906 static void shutdownCommand(redisClient *c) {
3907 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
3908 /* Kill the saving child if there is a background saving in progress.
3909 We want to avoid race conditions, for instance our saving child may
3910 overwrite the synchronous saving did by SHUTDOWN. */
3911 if (server.bgsavechildpid != -1) {
3912 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3913 kill(server.bgsavechildpid,SIGKILL);
3914 rdbRemoveTempFile(server.bgsavechildpid);
3915 }
3916 if (server.appendonly) {
3917 /* Append only file: fsync() the AOF and exit */
3918 fsync(server.appendfd);
3919 if (server.vm_enabled) unlink(server.vm_swap_file);
3920 exit(0);
3921 } else {
3922 /* Snapshotting. Perform a SYNC SAVE and exit */
3923 if (rdbSave(server.dbfilename) == REDIS_OK) {
3924 if (server.daemonize)
3925 unlink(server.pidfile);
3926 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3927 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
3928 if (server.vm_enabled) unlink(server.vm_swap_file);
3929 exit(0);
3930 } else {
3931 /* Ooops.. error saving! The best we can do is to continue operating.
3932 * Note that if there was a background saving process, in the next
3933 * cron() Redis will be notified that the background saving aborted,
3934 * handling special stuff like slaves pending for synchronization... */
3935 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
3936 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3937 }
3938 }
3939 }
3940
3941 static void renameGenericCommand(redisClient *c, int nx) {
3942 robj *o;
3943
3944 /* To use the same key as src and dst is probably an error */
3945 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
3946 addReply(c,shared.sameobjecterr);
3947 return;
3948 }
3949
3950 o = lookupKeyWrite(c->db,c->argv[1]);
3951 if (o == NULL) {
3952 addReply(c,shared.nokeyerr);
3953 return;
3954 }
3955 incrRefCount(o);
3956 deleteIfVolatile(c->db,c->argv[2]);
3957 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
3958 if (nx) {
3959 decrRefCount(o);
3960 addReply(c,shared.czero);
3961 return;
3962 }
3963 dictReplace(c->db->dict,c->argv[2],o);
3964 } else {
3965 incrRefCount(c->argv[2]);
3966 }
3967 deleteKey(c->db,c->argv[1]);
3968 server.dirty++;
3969 addReply(c,nx ? shared.cone : shared.ok);
3970 }
3971
3972 static void renameCommand(redisClient *c) {
3973 renameGenericCommand(c,0);
3974 }
3975
3976 static void renamenxCommand(redisClient *c) {
3977 renameGenericCommand(c,1);
3978 }
3979
3980 static void moveCommand(redisClient *c) {
3981 robj *o;
3982 redisDb *src, *dst;
3983 int srcid;
3984
3985 /* Obtain source and target DB pointers */
3986 src = c->db;
3987 srcid = c->db->id;
3988 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
3989 addReply(c,shared.outofrangeerr);
3990 return;
3991 }
3992 dst = c->db;
3993 selectDb(c,srcid); /* Back to the source DB */
3994
3995 /* If the user is moving using as target the same
3996 * DB as the source DB it is probably an error. */
3997 if (src == dst) {
3998 addReply(c,shared.sameobjecterr);
3999 return;
4000 }
4001
4002 /* Check if the element exists and get a reference */
4003 o = lookupKeyWrite(c->db,c->argv[1]);
4004 if (!o) {
4005 addReply(c,shared.czero);
4006 return;
4007 }
4008
4009 /* Try to add the element to the target DB */
4010 deleteIfVolatile(dst,c->argv[1]);
4011 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4012 addReply(c,shared.czero);
4013 return;
4014 }
4015 incrRefCount(c->argv[1]);
4016 incrRefCount(o);
4017
4018 /* OK! key moved, free the entry in the source DB */
4019 deleteKey(src,c->argv[1]);
4020 server.dirty++;
4021 addReply(c,shared.cone);
4022 }
4023
4024 /* =================================== Lists ================================ */
4025 static void pushGenericCommand(redisClient *c, int where) {
4026 robj *lobj;
4027 list *list;
4028
4029 lobj = lookupKeyWrite(c->db,c->argv[1]);
4030 if (lobj == NULL) {
4031 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4032 addReply(c,shared.ok);
4033 return;
4034 }
4035 lobj = createListObject();
4036 list = lobj->ptr;
4037 if (where == REDIS_HEAD) {
4038 listAddNodeHead(list,c->argv[2]);
4039 } else {
4040 listAddNodeTail(list,c->argv[2]);
4041 }
4042 dictAdd(c->db->dict,c->argv[1],lobj);
4043 incrRefCount(c->argv[1]);
4044 incrRefCount(c->argv[2]);
4045 } else {
4046 if (lobj->type != REDIS_LIST) {
4047 addReply(c,shared.wrongtypeerr);
4048 return;
4049 }
4050 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4051 addReply(c,shared.ok);
4052 return;
4053 }
4054 list = lobj->ptr;
4055 if (where == REDIS_HEAD) {
4056 listAddNodeHead(list,c->argv[2]);
4057 } else {
4058 listAddNodeTail(list,c->argv[2]);
4059 }
4060 incrRefCount(c->argv[2]);
4061 }
4062 server.dirty++;
4063 addReply(c,shared.ok);
4064 }
4065
4066 static void lpushCommand(redisClient *c) {
4067 pushGenericCommand(c,REDIS_HEAD);
4068 }
4069
4070 static void rpushCommand(redisClient *c) {
4071 pushGenericCommand(c,REDIS_TAIL);
4072 }
4073
4074 static void llenCommand(redisClient *c) {
4075 robj *o;
4076 list *l;
4077
4078 o = lookupKeyRead(c->db,c->argv[1]);
4079 if (o == NULL) {
4080 addReply(c,shared.czero);
4081 return;
4082 } else {
4083 if (o->type != REDIS_LIST) {
4084 addReply(c,shared.wrongtypeerr);
4085 } else {
4086 l = o->ptr;
4087 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4088 }
4089 }
4090 }
4091
4092 static void lindexCommand(redisClient *c) {
4093 robj *o;
4094 int index = atoi(c->argv[2]->ptr);
4095
4096 o = lookupKeyRead(c->db,c->argv[1]);
4097 if (o == NULL) {
4098 addReply(c,shared.nullbulk);
4099 } else {
4100 if (o->type != REDIS_LIST) {
4101 addReply(c,shared.wrongtypeerr);
4102 } else {
4103 list *list = o->ptr;
4104 listNode *ln;
4105
4106 ln = listIndex(list, index);
4107 if (ln == NULL) {
4108 addReply(c,shared.nullbulk);
4109 } else {
4110 robj *ele = listNodeValue(ln);
4111 addReplyBulkLen(c,ele);
4112 addReply(c,ele);
4113 addReply(c,shared.crlf);
4114 }
4115 }
4116 }
4117 }
4118
4119 static void lsetCommand(redisClient *c) {
4120 robj *o;
4121 int index = atoi(c->argv[2]->ptr);
4122
4123 o = lookupKeyWrite(c->db,c->argv[1]);
4124 if (o == NULL) {
4125 addReply(c,shared.nokeyerr);
4126 } else {
4127 if (o->type != REDIS_LIST) {
4128 addReply(c,shared.wrongtypeerr);
4129 } else {
4130 list *list = o->ptr;
4131 listNode *ln;
4132
4133 ln = listIndex(list, index);
4134 if (ln == NULL) {
4135 addReply(c,shared.outofrangeerr);
4136 } else {
4137 robj *ele = listNodeValue(ln);
4138
4139 decrRefCount(ele);
4140 listNodeValue(ln) = c->argv[3];
4141 incrRefCount(c->argv[3]);
4142 addReply(c,shared.ok);
4143 server.dirty++;
4144 }
4145 }
4146 }
4147 }
4148
4149 static void popGenericCommand(redisClient *c, int where) {
4150 robj *o;
4151
4152 o = lookupKeyWrite(c->db,c->argv[1]);
4153 if (o == NULL) {
4154 addReply(c,shared.nullbulk);
4155 } else {
4156 if (o->type != REDIS_LIST) {
4157 addReply(c,shared.wrongtypeerr);
4158 } else {
4159 list *list = o->ptr;
4160 listNode *ln;
4161
4162 if (where == REDIS_HEAD)
4163 ln = listFirst(list);
4164 else
4165 ln = listLast(list);
4166
4167 if (ln == NULL) {
4168 addReply(c,shared.nullbulk);
4169 } else {
4170 robj *ele = listNodeValue(ln);
4171 addReplyBulkLen(c,ele);
4172 addReply(c,ele);
4173 addReply(c,shared.crlf);
4174 listDelNode(list,ln);
4175 server.dirty++;
4176 }
4177 }
4178 }
4179 }
4180
4181 static void lpopCommand(redisClient *c) {
4182 popGenericCommand(c,REDIS_HEAD);
4183 }
4184
4185 static void rpopCommand(redisClient *c) {
4186 popGenericCommand(c,REDIS_TAIL);
4187 }
4188
4189 static void lrangeCommand(redisClient *c) {
4190 robj *o;
4191 int start = atoi(c->argv[2]->ptr);
4192 int end = atoi(c->argv[3]->ptr);
4193
4194 o = lookupKeyRead(c->db,c->argv[1]);
4195 if (o == NULL) {
4196 addReply(c,shared.nullmultibulk);
4197 } else {
4198 if (o->type != REDIS_LIST) {
4199 addReply(c,shared.wrongtypeerr);
4200 } else {
4201 list *list = o->ptr;
4202 listNode *ln;
4203 int llen = listLength(list);
4204 int rangelen, j;
4205 robj *ele;
4206
4207 /* convert negative indexes */
4208 if (start < 0) start = llen+start;
4209 if (end < 0) end = llen+end;
4210 if (start < 0) start = 0;
4211 if (end < 0) end = 0;
4212
4213 /* indexes sanity checks */
4214 if (start > end || start >= llen) {
4215 /* Out of range start or start > end result in empty list */
4216 addReply(c,shared.emptymultibulk);
4217 return;
4218 }
4219 if (end >= llen) end = llen-1;
4220 rangelen = (end-start)+1;
4221
4222 /* Return the result in form of a multi-bulk reply */
4223 ln = listIndex(list, start);
4224 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4225 for (j = 0; j < rangelen; j++) {
4226 ele = listNodeValue(ln);
4227 addReplyBulkLen(c,ele);
4228 addReply(c,ele);
4229 addReply(c,shared.crlf);
4230 ln = ln->next;
4231 }
4232 }
4233 }
4234 }
4235
4236 static void ltrimCommand(redisClient *c) {
4237 robj *o;
4238 int start = atoi(c->argv[2]->ptr);
4239 int end = atoi(c->argv[3]->ptr);
4240
4241 o = lookupKeyWrite(c->db,c->argv[1]);
4242 if (o == NULL) {
4243 addReply(c,shared.ok);
4244 } else {
4245 if (o->type != REDIS_LIST) {
4246 addReply(c,shared.wrongtypeerr);
4247 } else {
4248 list *list = o->ptr;
4249 listNode *ln;
4250 int llen = listLength(list);
4251 int j, ltrim, rtrim;
4252
4253 /* convert negative indexes */
4254 if (start < 0) start = llen+start;
4255 if (end < 0) end = llen+end;
4256 if (start < 0) start = 0;
4257 if (end < 0) end = 0;
4258
4259 /* indexes sanity checks */
4260 if (start > end || start >= llen) {
4261 /* Out of range start or start > end result in empty list */
4262 ltrim = llen;
4263 rtrim = 0;
4264 } else {
4265 if (end >= llen) end = llen-1;
4266 ltrim = start;
4267 rtrim = llen-end-1;
4268 }
4269
4270 /* Remove list elements to perform the trim */
4271 for (j = 0; j < ltrim; j++) {
4272 ln = listFirst(list);
4273 listDelNode(list,ln);
4274 }
4275 for (j = 0; j < rtrim; j++) {
4276 ln = listLast(list);
4277 listDelNode(list,ln);
4278 }
4279 server.dirty++;
4280 addReply(c,shared.ok);
4281 }
4282 }
4283 }
4284
4285 static void lremCommand(redisClient *c) {
4286 robj *o;
4287
4288 o = lookupKeyWrite(c->db,c->argv[1]);
4289 if (o == NULL) {
4290 addReply(c,shared.czero);
4291 } else {
4292 if (o->type != REDIS_LIST) {
4293 addReply(c,shared.wrongtypeerr);
4294 } else {
4295 list *list = o->ptr;
4296 listNode *ln, *next;
4297 int toremove = atoi(c->argv[2]->ptr);
4298 int removed = 0;
4299 int fromtail = 0;
4300
4301 if (toremove < 0) {
4302 toremove = -toremove;
4303 fromtail = 1;
4304 }
4305 ln = fromtail ? list->tail : list->head;
4306 while (ln) {
4307 robj *ele = listNodeValue(ln);
4308
4309 next = fromtail ? ln->prev : ln->next;
4310 if (compareStringObjects(ele,c->argv[3]) == 0) {
4311 listDelNode(list,ln);
4312 server.dirty++;
4313 removed++;
4314 if (toremove && removed == toremove) break;
4315 }
4316 ln = next;
4317 }
4318 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4319 }
4320 }
4321 }
4322
4323 /* This is the semantic of this command:
4324 * RPOPLPUSH srclist dstlist:
4325 * IF LLEN(srclist) > 0
4326 * element = RPOP srclist
4327 * LPUSH dstlist element
4328 * RETURN element
4329 * ELSE
4330 * RETURN nil
4331 * END
4332 * END
4333 *
4334 * The idea is to be able to get an element from a list in a reliable way
4335 * since the element is not just returned but pushed against another list
4336 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4337 */
4338 static void rpoplpushcommand(redisClient *c) {
4339 robj *sobj;
4340
4341 sobj = lookupKeyWrite(c->db,c->argv[1]);
4342 if (sobj == NULL) {
4343 addReply(c,shared.nullbulk);
4344 } else {
4345 if (sobj->type != REDIS_LIST) {
4346 addReply(c,shared.wrongtypeerr);
4347 } else {
4348 list *srclist = sobj->ptr;
4349 listNode *ln = listLast(srclist);
4350
4351 if (ln == NULL) {
4352 addReply(c,shared.nullbulk);
4353 } else {
4354 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4355 robj *ele = listNodeValue(ln);
4356 list *dstlist;
4357
4358 if (dobj && dobj->type != REDIS_LIST) {
4359 addReply(c,shared.wrongtypeerr);
4360 return;
4361 }
4362
4363 /* Add the element to the target list (unless it's directly
4364 * passed to some BLPOP-ing client */
4365 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4366 if (dobj == NULL) {
4367 /* Create the list if the key does not exist */
4368 dobj = createListObject();
4369 dictAdd(c->db->dict,c->argv[2],dobj);
4370 incrRefCount(c->argv[2]);
4371 }
4372 dstlist = dobj->ptr;
4373 listAddNodeHead(dstlist,ele);
4374 incrRefCount(ele);
4375 }
4376
4377 /* Send the element to the client as reply as well */
4378 addReplyBulkLen(c,ele);
4379 addReply(c,ele);
4380 addReply(c,shared.crlf);
4381
4382 /* Finally remove the element from the source list */
4383 listDelNode(srclist,ln);
4384 server.dirty++;
4385 }
4386 }
4387 }
4388 }
4389
4390
4391 /* ==================================== Sets ================================ */
4392
4393 static void saddCommand(redisClient *c) {
4394 robj *set;
4395
4396 set = lookupKeyWrite(c->db,c->argv[1]);
4397 if (set == NULL) {
4398 set = createSetObject();
4399 dictAdd(c->db->dict,c->argv[1],set);
4400 incrRefCount(c->argv[1]);
4401 } else {
4402 if (set->type != REDIS_SET) {
4403 addReply(c,shared.wrongtypeerr);
4404 return;
4405 }
4406 }
4407 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4408 incrRefCount(c->argv[2]);
4409 server.dirty++;
4410 addReply(c,shared.cone);
4411 } else {
4412 addReply(c,shared.czero);
4413 }
4414 }
4415
4416 static void sremCommand(redisClient *c) {
4417 robj *set;
4418
4419 set = lookupKeyWrite(c->db,c->argv[1]);
4420 if (set == NULL) {
4421 addReply(c,shared.czero);
4422 } else {
4423 if (set->type != REDIS_SET) {
4424 addReply(c,shared.wrongtypeerr);
4425 return;
4426 }
4427 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4428 server.dirty++;
4429 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4430 addReply(c,shared.cone);
4431 } else {
4432 addReply(c,shared.czero);
4433 }
4434 }
4435 }
4436
4437 static void smoveCommand(redisClient *c) {
4438 robj *srcset, *dstset;
4439
4440 srcset = lookupKeyWrite(c->db,c->argv[1]);
4441 dstset = lookupKeyWrite(c->db,c->argv[2]);
4442
4443 /* If the source key does not exist return 0, if it's of the wrong type
4444 * raise an error */
4445 if (srcset == NULL || srcset->type != REDIS_SET) {
4446 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4447 return;
4448 }
4449 /* Error if the destination key is not a set as well */
4450 if (dstset && dstset->type != REDIS_SET) {
4451 addReply(c,shared.wrongtypeerr);
4452 return;
4453 }
4454 /* Remove the element from the source set */
4455 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4456 /* Key not found in the src set! return zero */
4457 addReply(c,shared.czero);
4458 return;
4459 }
4460 server.dirty++;
4461 /* Add the element to the destination set */
4462 if (!dstset) {
4463 dstset = createSetObject();
4464 dictAdd(c->db->dict,c->argv[2],dstset);
4465 incrRefCount(c->argv[2]);
4466 }
4467 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4468 incrRefCount(c->argv[3]);
4469 addReply(c,shared.cone);
4470 }
4471
4472 static void sismemberCommand(redisClient *c) {
4473 robj *set;
4474
4475 set = lookupKeyRead(c->db,c->argv[1]);
4476 if (set == NULL) {
4477 addReply(c,shared.czero);
4478 } else {
4479 if (set->type != REDIS_SET) {
4480 addReply(c,shared.wrongtypeerr);
4481 return;
4482 }
4483 if (dictFind(set->ptr,c->argv[2]))
4484 addReply(c,shared.cone);
4485 else
4486 addReply(c,shared.czero);
4487 }
4488 }
4489
4490 static void scardCommand(redisClient *c) {
4491 robj *o;
4492 dict *s;
4493
4494 o = lookupKeyRead(c->db,c->argv[1]);
4495 if (o == NULL) {
4496 addReply(c,shared.czero);
4497 return;
4498 } else {
4499 if (o->type != REDIS_SET) {
4500 addReply(c,shared.wrongtypeerr);
4501 } else {
4502 s = o->ptr;
4503 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4504 dictSize(s)));
4505 }
4506 }
4507 }
4508
4509 static void spopCommand(redisClient *c) {
4510 robj *set;
4511 dictEntry *de;
4512
4513 set = lookupKeyWrite(c->db,c->argv[1]);
4514 if (set == NULL) {
4515 addReply(c,shared.nullbulk);
4516 } else {
4517 if (set->type != REDIS_SET) {
4518 addReply(c,shared.wrongtypeerr);
4519 return;
4520 }
4521 de = dictGetRandomKey(set->ptr);
4522 if (de == NULL) {
4523 addReply(c,shared.nullbulk);
4524 } else {
4525 robj *ele = dictGetEntryKey(de);
4526
4527 addReplyBulkLen(c,ele);
4528 addReply(c,ele);
4529 addReply(c,shared.crlf);
4530 dictDelete(set->ptr,ele);
4531 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4532 server.dirty++;
4533 }
4534 }
4535 }
4536
4537 static void srandmemberCommand(redisClient *c) {
4538 robj *set;
4539 dictEntry *de;
4540
4541 set = lookupKeyRead(c->db,c->argv[1]);
4542 if (set == NULL) {
4543 addReply(c,shared.nullbulk);
4544 } else {
4545 if (set->type != REDIS_SET) {
4546 addReply(c,shared.wrongtypeerr);
4547 return;
4548 }
4549 de = dictGetRandomKey(set->ptr);
4550 if (de == NULL) {
4551 addReply(c,shared.nullbulk);
4552 } else {
4553 robj *ele = dictGetEntryKey(de);
4554
4555 addReplyBulkLen(c,ele);
4556 addReply(c,ele);
4557 addReply(c,shared.crlf);
4558 }
4559 }
4560 }
4561
4562 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4563 dict **d1 = (void*) s1, **d2 = (void*) s2;
4564
4565 return dictSize(*d1)-dictSize(*d2);
4566 }
4567
4568 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4569 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4570 dictIterator *di;
4571 dictEntry *de;
4572 robj *lenobj = NULL, *dstset = NULL;
4573 unsigned long j, cardinality = 0;
4574
4575 for (j = 0; j < setsnum; j++) {
4576 robj *setobj;
4577
4578 setobj = dstkey ?
4579 lookupKeyWrite(c->db,setskeys[j]) :
4580 lookupKeyRead(c->db,setskeys[j]);
4581 if (!setobj) {
4582 zfree(dv);
4583 if (dstkey) {
4584 if (deleteKey(c->db,dstkey))
4585 server.dirty++;
4586 addReply(c,shared.czero);
4587 } else {
4588 addReply(c,shared.nullmultibulk);
4589 }
4590 return;
4591 }
4592 if (setobj->type != REDIS_SET) {
4593 zfree(dv);
4594 addReply(c,shared.wrongtypeerr);
4595 return;
4596 }
4597 dv[j] = setobj->ptr;
4598 }
4599 /* Sort sets from the smallest to largest, this will improve our
4600 * algorithm's performace */
4601 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4602
4603 /* The first thing we should output is the total number of elements...
4604 * since this is a multi-bulk write, but at this stage we don't know
4605 * the intersection set size, so we use a trick, append an empty object
4606 * to the output list and save the pointer to later modify it with the
4607 * right length */
4608 if (!dstkey) {
4609 lenobj = createObject(REDIS_STRING,NULL);
4610 addReply(c,lenobj);
4611 decrRefCount(lenobj);
4612 } else {
4613 /* If we have a target key where to store the resulting set
4614 * create this key with an empty set inside */
4615 dstset = createSetObject();
4616 }
4617
4618 /* Iterate all the elements of the first (smallest) set, and test
4619 * the element against all the other sets, if at least one set does
4620 * not include the element it is discarded */
4621 di = dictGetIterator(dv[0]);
4622
4623 while((de = dictNext(di)) != NULL) {
4624 robj *ele;
4625
4626 for (j = 1; j < setsnum; j++)
4627 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4628 if (j != setsnum)
4629 continue; /* at least one set does not contain the member */
4630 ele = dictGetEntryKey(de);
4631 if (!dstkey) {
4632 addReplyBulkLen(c,ele);
4633 addReply(c,ele);
4634 addReply(c,shared.crlf);
4635 cardinality++;
4636 } else {
4637 dictAdd(dstset->ptr,ele,NULL);
4638 incrRefCount(ele);
4639 }
4640 }
4641 dictReleaseIterator(di);
4642
4643 if (dstkey) {
4644 /* Store the resulting set into the target */
4645 deleteKey(c->db,dstkey);
4646 dictAdd(c->db->dict,dstkey,dstset);
4647 incrRefCount(dstkey);
4648 }
4649
4650 if (!dstkey) {
4651 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4652 } else {
4653 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4654 dictSize((dict*)dstset->ptr)));
4655 server.dirty++;
4656 }
4657 zfree(dv);
4658 }
4659
4660 static void sinterCommand(redisClient *c) {
4661 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4662 }
4663
4664 static void sinterstoreCommand(redisClient *c) {
4665 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4666 }
4667
4668 #define REDIS_OP_UNION 0
4669 #define REDIS_OP_DIFF 1
4670
4671 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4672 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4673 dictIterator *di;
4674 dictEntry *de;
4675 robj *dstset = NULL;
4676 int j, cardinality = 0;
4677
4678 for (j = 0; j < setsnum; j++) {
4679 robj *setobj;
4680
4681 setobj = dstkey ?
4682 lookupKeyWrite(c->db,setskeys[j]) :
4683 lookupKeyRead(c->db,setskeys[j]);
4684 if (!setobj) {
4685 dv[j] = NULL;
4686 continue;
4687 }
4688 if (setobj->type != REDIS_SET) {
4689 zfree(dv);
4690 addReply(c,shared.wrongtypeerr);
4691 return;
4692 }
4693 dv[j] = setobj->ptr;
4694 }
4695
4696 /* We need a temp set object to store our union. If the dstkey
4697 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4698 * this set object will be the resulting object to set into the target key*/
4699 dstset = createSetObject();
4700
4701 /* Iterate all the elements of all the sets, add every element a single
4702 * time to the result set */
4703 for (j = 0; j < setsnum; j++) {
4704 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4705 if (!dv[j]) continue; /* non existing keys are like empty sets */
4706
4707 di = dictGetIterator(dv[j]);
4708
4709 while((de = dictNext(di)) != NULL) {
4710 robj *ele;
4711
4712 /* dictAdd will not add the same element multiple times */
4713 ele = dictGetEntryKey(de);
4714 if (op == REDIS_OP_UNION || j == 0) {
4715 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4716 incrRefCount(ele);
4717 cardinality++;
4718 }
4719 } else if (op == REDIS_OP_DIFF) {
4720 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4721 cardinality--;
4722 }
4723 }
4724 }
4725 dictReleaseIterator(di);
4726
4727 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4728 }
4729
4730 /* Output the content of the resulting set, if not in STORE mode */
4731 if (!dstkey) {
4732 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4733 di = dictGetIterator(dstset->ptr);
4734 while((de = dictNext(di)) != NULL) {
4735 robj *ele;
4736
4737 ele = dictGetEntryKey(de);
4738 addReplyBulkLen(c,ele);
4739 addReply(c,ele);
4740 addReply(c,shared.crlf);
4741 }
4742 dictReleaseIterator(di);
4743 } else {
4744 /* If we have a target key where to store the resulting set
4745 * create this key with the result set inside */
4746 deleteKey(c->db,dstkey);
4747 dictAdd(c->db->dict,dstkey,dstset);
4748 incrRefCount(dstkey);
4749 }
4750
4751 /* Cleanup */
4752 if (!dstkey) {
4753 decrRefCount(dstset);
4754 } else {
4755 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4756 dictSize((dict*)dstset->ptr)));
4757 server.dirty++;
4758 }
4759 zfree(dv);
4760 }
4761
4762 static void sunionCommand(redisClient *c) {
4763 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4764 }
4765
4766 static void sunionstoreCommand(redisClient *c) {
4767 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4768 }
4769
4770 static void sdiffCommand(redisClient *c) {
4771 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4772 }
4773
4774 static void sdiffstoreCommand(redisClient *c) {
4775 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4776 }
4777
4778 /* ==================================== ZSets =============================== */
4779
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4787
4788 /* This skiplist implementation is almost a C translation of the original
4789 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4790 * Alternative to Balanced Trees", modified in three ways:
4791 * a) this implementation allows for repeated values.
4792 * b) the comparison is not just by key (our 'score') but by satellite data.
4793 * c) there is a back pointer, so it's a doubly linked list with the back
4794 * pointers being only at "level 1". This allows to traverse the list
4795 * from tail to head, useful for ZREVRANGE. */
4796
4797 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4798 zskiplistNode *zn = zmalloc(sizeof(*zn));
4799
4800 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4801 zn->span = zmalloc(sizeof(unsigned int) * level);
4802 zn->score = score;
4803 zn->obj = obj;
4804 return zn;
4805 }
4806
4807 static zskiplist *zslCreate(void) {
4808 int j;
4809 zskiplist *zsl;
4810
4811 zsl = zmalloc(sizeof(*zsl));
4812 zsl->level = 1;
4813 zsl->length = 0;
4814 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4815 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4816 zsl->header->forward[j] = NULL;
4817 zsl->header->span[j] = 0;
4818 }
4819 zsl->header->backward = NULL;
4820 zsl->tail = NULL;
4821 return zsl;
4822 }
4823
4824 static void zslFreeNode(zskiplistNode *node) {
4825 decrRefCount(node->obj);
4826 zfree(node->forward);
4827 zfree(node->span);
4828 zfree(node);
4829 }
4830
4831 static void zslFree(zskiplist *zsl) {
4832 zskiplistNode *node = zsl->header->forward[0], *next;
4833
4834 zfree(zsl->header->forward);
4835 zfree(zsl->header->span);
4836 zfree(zsl->header);
4837 while(node) {
4838 next = node->forward[0];
4839 zslFreeNode(node);
4840 node = next;
4841 }
4842 zfree(zsl);
4843 }
4844
4845 static int zslRandomLevel(void) {
4846 int level = 1;
4847 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4848 level += 1;
4849 return level;
4850 }
4851
4852 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4853 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4854 unsigned int span[ZSKIPLIST_MAXLEVEL];
4855 int i, level;
4856
4857 x = zsl->header;
4858 for (i = zsl->level-1; i >= 0; i--) {
4859 /* store span that is crossed to reach the insert position */
4860 span[i] = i == (zsl->level-1) ? 0 : span[i+1];
4861
4862 while (x->forward[i] &&
4863 (x->forward[i]->score < score ||
4864 (x->forward[i]->score == score &&
4865 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
4866 span[i] += x->span[i];
4867 x = x->forward[i];
4868 }
4869 update[i] = x;
4870 }
4871 /* we assume the key is not already inside, since we allow duplicated
4872 * scores, and the re-insertion of score and redis object should never
4873 * happpen since the caller of zslInsert() should test in the hash table
4874 * if the element is already inside or not. */
4875 level = zslRandomLevel();
4876 if (level > zsl->level) {
4877 for (i = zsl->level; i < level; i++) {
4878 span[i] = 0;
4879 update[i] = zsl->header;
4880 update[i]->span[i] = zsl->length;
4881 }
4882 zsl->level = level;
4883 }
4884 x = zslCreateNode(level,score,obj);
4885 for (i = 0; i < level; i++) {
4886 x->forward[i] = update[i]->forward[i];
4887 update[i]->forward[i] = x;
4888
4889 /* update span covered by update[i] as x is inserted here */
4890 x->span[i] = update[i]->span[i] - (span[0] - span[i]);
4891 update[i]->span[i] = (span[0] - span[i]) + 1;
4892 }
4893
4894 /* increment span for untouched levels */
4895 for (i = level; i < zsl->level; i++) {
4896 update[i]->span[i]++;
4897 }
4898
4899 x->backward = (update[0] == zsl->header) ? NULL : update[0];
4900 if (x->forward[0])
4901 x->forward[0]->backward = x;
4902 else
4903 zsl->tail = x;
4904 zsl->length++;
4905 }
4906
4907 /* Delete an element with matching score/object from the skiplist. */
4908 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
4909 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4910 int i;
4911
4912 x = zsl->header;
4913 for (i = zsl->level-1; i >= 0; i--) {
4914 while (x->forward[i] &&
4915 (x->forward[i]->score < score ||
4916 (x->forward[i]->score == score &&
4917 compareStringObjects(x->forward[i]->obj,obj) < 0)))
4918 x = x->forward[i];
4919 update[i] = x;
4920 }
4921 /* We may have multiple elements with the same score, what we need
4922 * is to find the element with both the right score and object. */
4923 x = x->forward[0];
4924 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
4925 for (i = 0; i < zsl->level; i++) {
4926 if (update[i]->forward[i] == x) {
4927 update[i]->span[i] += x->span[i] - 1;
4928 update[i]->forward[i] = x->forward[i];
4929 } else {
4930 update[i]->span[i] -= 1;
4931 }
4932 }
4933 if (x->forward[0]) {
4934 x->forward[0]->backward = x->backward;
4935 } else {
4936 zsl->tail = x->backward;
4937 }
4938 zslFreeNode(x);
4939 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4940 zsl->level--;
4941 zsl->length--;
4942 return 1;
4943 } else {
4944 return 0; /* not found */
4945 }
4946 return 0; /* not found */
4947 }
4948
4949 /* Delete all the elements with score between min and max from the skiplist.
4950 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
4951 * Note that this function takes the reference to the hash table view of the
4952 * sorted set, in order to remove the elements from the hash table too. */
4953 static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
4954 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4955 unsigned long removed = 0;
4956 int i;
4957
4958 x = zsl->header;
4959 for (i = zsl->level-1; i >= 0; i--) {
4960 while (x->forward[i] && x->forward[i]->score < min)
4961 x = x->forward[i];
4962 update[i] = x;
4963 }
4964 /* We may have multiple elements with the same score, what we need
4965 * is to find the element with both the right score and object. */
4966 x = x->forward[0];
4967 while (x && x->score <= max) {
4968 zskiplistNode *next;
4969
4970 for (i = 0; i < zsl->level; i++) {
4971 if (update[i]->forward[i] == x) {
4972 update[i]->span[i] += x->span[i] - 1;
4973 update[i]->forward[i] = x->forward[i];
4974 } else {
4975 update[i]->span[i] -= 1;
4976 }
4977 }
4978 if (x->forward[0]) {
4979 x->forward[0]->backward = x->backward;
4980 } else {
4981 zsl->tail = x->backward;
4982 }
4983 next = x->forward[0];
4984 dictDelete(dict,x->obj);
4985 zslFreeNode(x);
4986 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4987 zsl->level--;
4988 zsl->length--;
4989 removed++;
4990 x = next;
4991 }
4992 return removed; /* not found */
4993 }
4994
4995 /* Find the first node having a score equal or greater than the specified one.
4996 * Returns NULL if there is no match. */
4997 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
4998 zskiplistNode *x;
4999 int i;
5000
5001 x = zsl->header;
5002 for (i = zsl->level-1; i >= 0; i--) {
5003 while (x->forward[i] && x->forward[i]->score < score)
5004 x = x->forward[i];
5005 }
5006 /* We may have multiple elements with the same score, what we need
5007 * is to find the element with both the right score and object. */
5008 return x->forward[0];
5009 }
5010
5011 /* The actual Z-commands implementations */
5012
5013 /* This generic command implements both ZADD and ZINCRBY.
5014 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5015 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5016 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5017 robj *zsetobj;
5018 zset *zs;
5019 double *score;
5020
5021 zsetobj = lookupKeyWrite(c->db,key);
5022 if (zsetobj == NULL) {
5023 zsetobj = createZsetObject();
5024 dictAdd(c->db->dict,key,zsetobj);
5025 incrRefCount(key);
5026 } else {
5027 if (zsetobj->type != REDIS_ZSET) {
5028 addReply(c,shared.wrongtypeerr);
5029 return;
5030 }
5031 }
5032 zs = zsetobj->ptr;
5033
5034 /* Ok now since we implement both ZADD and ZINCRBY here the code
5035 * needs to handle the two different conditions. It's all about setting
5036 * '*score', that is, the new score to set, to the right value. */
5037 score = zmalloc(sizeof(double));
5038 if (doincrement) {
5039 dictEntry *de;
5040
5041 /* Read the old score. If the element was not present starts from 0 */
5042 de = dictFind(zs->dict,ele);
5043 if (de) {
5044 double *oldscore = dictGetEntryVal(de);
5045 *score = *oldscore + scoreval;
5046 } else {
5047 *score = scoreval;
5048 }
5049 } else {
5050 *score = scoreval;
5051 }
5052
5053 /* What follows is a simple remove and re-insert operation that is common
5054 * to both ZADD and ZINCRBY... */
5055 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5056 /* case 1: New element */
5057 incrRefCount(ele); /* added to hash */
5058 zslInsert(zs->zsl,*score,ele);
5059 incrRefCount(ele); /* added to skiplist */
5060 server.dirty++;
5061 if (doincrement)
5062 addReplyDouble(c,*score);
5063 else
5064 addReply(c,shared.cone);
5065 } else {
5066 dictEntry *de;
5067 double *oldscore;
5068
5069 /* case 2: Score update operation */
5070 de = dictFind(zs->dict,ele);
5071 redisAssert(de != NULL);
5072 oldscore = dictGetEntryVal(de);
5073 if (*score != *oldscore) {
5074 int deleted;
5075
5076 /* Remove and insert the element in the skip list with new score */
5077 deleted = zslDelete(zs->zsl,*oldscore,ele);
5078 redisAssert(deleted != 0);
5079 zslInsert(zs->zsl,*score,ele);
5080 incrRefCount(ele);
5081 /* Update the score in the hash table */
5082 dictReplace(zs->dict,ele,score);
5083 server.dirty++;
5084 } else {
5085 zfree(score);
5086 }
5087 if (doincrement)
5088 addReplyDouble(c,*score);
5089 else
5090 addReply(c,shared.czero);
5091 }
5092 }
5093
5094 static void zaddCommand(redisClient *c) {
5095 double scoreval;
5096
5097 scoreval = strtod(c->argv[2]->ptr,NULL);
5098 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5099 }
5100
5101 static void zincrbyCommand(redisClient *c) {
5102 double scoreval;
5103
5104 scoreval = strtod(c->argv[2]->ptr,NULL);
5105 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5106 }
5107
5108 static void zremCommand(redisClient *c) {
5109 robj *zsetobj;
5110 zset *zs;
5111
5112 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5113 if (zsetobj == NULL) {
5114 addReply(c,shared.czero);
5115 } else {
5116 dictEntry *de;
5117 double *oldscore;
5118 int deleted;
5119
5120 if (zsetobj->type != REDIS_ZSET) {
5121 addReply(c,shared.wrongtypeerr);
5122 return;
5123 }
5124 zs = zsetobj->ptr;
5125 de = dictFind(zs->dict,c->argv[2]);
5126 if (de == NULL) {
5127 addReply(c,shared.czero);
5128 return;
5129 }
5130 /* Delete from the skiplist */
5131 oldscore = dictGetEntryVal(de);
5132 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5133 redisAssert(deleted != 0);
5134
5135 /* Delete from the hash table */
5136 dictDelete(zs->dict,c->argv[2]);
5137 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5138 server.dirty++;
5139 addReply(c,shared.cone);
5140 }
5141 }
5142
5143 static void zremrangebyscoreCommand(redisClient *c) {
5144 double min = strtod(c->argv[2]->ptr,NULL);
5145 double max = strtod(c->argv[3]->ptr,NULL);
5146 robj *zsetobj;
5147 zset *zs;
5148
5149 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5150 if (zsetobj == NULL) {
5151 addReply(c,shared.czero);
5152 } else {
5153 long deleted;
5154
5155 if (zsetobj->type != REDIS_ZSET) {
5156 addReply(c,shared.wrongtypeerr);
5157 return;
5158 }
5159 zs = zsetobj->ptr;
5160 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5161 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5162 server.dirty += deleted;
5163 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5164 }
5165 }
5166
5167 static void zrangeGenericCommand(redisClient *c, int reverse) {
5168 robj *o;
5169 int start = atoi(c->argv[2]->ptr);
5170 int end = atoi(c->argv[3]->ptr);
5171 int withscores = 0;
5172
5173 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5174 withscores = 1;
5175 } else if (c->argc >= 5) {
5176 addReply(c,shared.syntaxerr);
5177 return;
5178 }
5179
5180 o = lookupKeyRead(c->db,c->argv[1]);
5181 if (o == NULL) {
5182 addReply(c,shared.nullmultibulk);
5183 } else {
5184 if (o->type != REDIS_ZSET) {
5185 addReply(c,shared.wrongtypeerr);
5186 } else {
5187 zset *zsetobj = o->ptr;
5188 zskiplist *zsl = zsetobj->zsl;
5189 zskiplistNode *ln;
5190
5191 int llen = zsl->length;
5192 int rangelen, j;
5193 robj *ele;
5194
5195 /* convert negative indexes */
5196 if (start < 0) start = llen+start;
5197 if (end < 0) end = llen+end;
5198 if (start < 0) start = 0;
5199 if (end < 0) end = 0;
5200
5201 /* indexes sanity checks */
5202 if (start > end || start >= llen) {
5203 /* Out of range start or start > end result in empty list */
5204 addReply(c,shared.emptymultibulk);
5205 return;
5206 }
5207 if (end >= llen) end = llen-1;
5208 rangelen = (end-start)+1;
5209
5210 /* Return the result in form of a multi-bulk reply */
5211 if (reverse) {
5212 ln = zsl->tail;
5213 while (start--)
5214 ln = ln->backward;
5215 } else {
5216 ln = zsl->header->forward[0];
5217 while (start--)
5218 ln = ln->forward[0];
5219 }
5220
5221 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5222 withscores ? (rangelen*2) : rangelen));
5223 for (j = 0; j < rangelen; j++) {
5224 ele = ln->obj;
5225 addReplyBulkLen(c,ele);
5226 addReply(c,ele);
5227 addReply(c,shared.crlf);
5228 if (withscores)
5229 addReplyDouble(c,ln->score);
5230 ln = reverse ? ln->backward : ln->forward[0];
5231 }
5232 }
5233 }
5234 }
5235
5236 static void zrangeCommand(redisClient *c) {
5237 zrangeGenericCommand(c,0);
5238 }
5239
5240 static void zrevrangeCommand(redisClient *c) {
5241 zrangeGenericCommand(c,1);
5242 }
5243
5244 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5245 * If justcount is non-zero, just the count is returned. */
5246 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5247 robj *o;
5248 double min, max;
5249 int minex = 0, maxex = 0; /* are min or max exclusive? */
5250 int offset = 0, limit = -1;
5251 int withscores = 0;
5252 int badsyntax = 0;
5253
5254 /* Parse the min-max interval. If one of the values is prefixed
5255 * by the "(" character, it's considered "open". For instance
5256 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5257 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5258 if (((char*)c->argv[2]->ptr)[0] == '(') {
5259 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5260 minex = 1;
5261 } else {
5262 min = strtod(c->argv[2]->ptr,NULL);
5263 }
5264 if (((char*)c->argv[3]->ptr)[0] == '(') {
5265 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5266 maxex = 1;
5267 } else {
5268 max = strtod(c->argv[3]->ptr,NULL);
5269 }
5270
5271 /* Parse "WITHSCORES": note that if the command was called with
5272 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5273 * enter the following paths to parse WITHSCORES and LIMIT. */
5274 if (c->argc == 5 || c->argc == 8) {
5275 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5276 withscores = 1;
5277 else
5278 badsyntax = 1;
5279 }
5280 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5281 badsyntax = 1;
5282 if (badsyntax) {
5283 addReplySds(c,
5284 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5285 return;
5286 }
5287
5288 /* Parse "LIMIT" */
5289 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5290 addReply(c,shared.syntaxerr);
5291 return;
5292 } else if (c->argc == (7 + withscores)) {
5293 offset = atoi(c->argv[5]->ptr);
5294 limit = atoi(c->argv[6]->ptr);
5295 if (offset < 0) offset = 0;
5296 }
5297
5298 /* Ok, lookup the key and get the range */
5299 o = lookupKeyRead(c->db,c->argv[1]);
5300 if (o == NULL) {
5301 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5302 } else {
5303 if (o->type != REDIS_ZSET) {
5304 addReply(c,shared.wrongtypeerr);
5305 } else {
5306 zset *zsetobj = o->ptr;
5307 zskiplist *zsl = zsetobj->zsl;
5308 zskiplistNode *ln;
5309 robj *ele, *lenobj = NULL;
5310 unsigned long rangelen = 0;
5311
5312 /* Get the first node with the score >= min, or with
5313 * score > min if 'minex' is true. */
5314 ln = zslFirstWithScore(zsl,min);
5315 while (minex && ln && ln->score == min) ln = ln->forward[0];
5316
5317 if (ln == NULL) {
5318 /* No element matching the speciifed interval */
5319 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5320 return;
5321 }
5322
5323 /* We don't know in advance how many matching elements there
5324 * are in the list, so we push this object that will represent
5325 * the multi-bulk length in the output buffer, and will "fix"
5326 * it later */
5327 if (!justcount) {
5328 lenobj = createObject(REDIS_STRING,NULL);
5329 addReply(c,lenobj);
5330 decrRefCount(lenobj);
5331 }
5332
5333 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5334 if (offset) {
5335 offset--;
5336 ln = ln->forward[0];
5337 continue;
5338 }
5339 if (limit == 0) break;
5340 if (!justcount) {
5341 ele = ln->obj;
5342 addReplyBulkLen(c,ele);
5343 addReply(c,ele);
5344 addReply(c,shared.crlf);
5345 if (withscores)
5346 addReplyDouble(c,ln->score);
5347 }
5348 ln = ln->forward[0];
5349 rangelen++;
5350 if (limit > 0) limit--;
5351 }
5352 if (justcount) {
5353 addReplyLong(c,(long)rangelen);
5354 } else {
5355 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5356 withscores ? (rangelen*2) : rangelen);
5357 }
5358 }
5359 }
5360 }
5361
5362 static void zrangebyscoreCommand(redisClient *c) {
5363 genericZrangebyscoreCommand(c,0);
5364 }
5365
5366 static void zcountCommand(redisClient *c) {
5367 genericZrangebyscoreCommand(c,1);
5368 }
5369
5370 static void zcardCommand(redisClient *c) {
5371 robj *o;
5372 zset *zs;
5373
5374 o = lookupKeyRead(c->db,c->argv[1]);
5375 if (o == NULL) {
5376 addReply(c,shared.czero);
5377 return;
5378 } else {
5379 if (o->type != REDIS_ZSET) {
5380 addReply(c,shared.wrongtypeerr);
5381 } else {
5382 zs = o->ptr;
5383 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5384 }
5385 }
5386 }
5387
5388 static void zscoreCommand(redisClient *c) {
5389 robj *o;
5390 zset *zs;
5391
5392 o = lookupKeyRead(c->db,c->argv[1]);
5393 if (o == NULL) {
5394 addReply(c,shared.nullbulk);
5395 return;
5396 } else {
5397 if (o->type != REDIS_ZSET) {
5398 addReply(c,shared.wrongtypeerr);
5399 } else {
5400 dictEntry *de;
5401
5402 zs = o->ptr;
5403 de = dictFind(zs->dict,c->argv[2]);
5404 if (!de) {
5405 addReply(c,shared.nullbulk);
5406 } else {
5407 double *score = dictGetEntryVal(de);
5408
5409 addReplyDouble(c,*score);
5410 }
5411 }
5412 }
5413 }
5414
5415 static void zrankCommand(redisClient *c) {
5416 robj *o;
5417 o = lookupKeyRead(c->db,c->argv[1]);
5418 if (o == NULL) {
5419 addReply(c,shared.nullbulk);
5420 return;
5421 }
5422 if (o->type != REDIS_ZSET) {
5423 addReply(c,shared.wrongtypeerr);
5424 return;
5425 }
5426
5427 zset *zs = o->ptr;
5428 zskiplist *zsl = zs->zsl;
5429 dictEntry *de = dictFind(zs->dict,c->argv[2]);
5430 if (!de) {
5431 addReply(c,shared.nullbulk);
5432 return;
5433 }
5434
5435 double *score = dictGetEntryVal(de);
5436 zskiplistNode *x;
5437 unsigned int rank = 0;
5438 int i;
5439
5440 x = zsl->header;
5441 for (i = zsl->level-1; i >= 0; i--) {
5442 while (x->forward[i] &&
5443 (x->forward[i]->score < *score ||
5444 (x->forward[i]->score == *score &&
5445 compareStringObjects(x->forward[i]->obj,c->argv[2]) < 0))) {
5446 rank += x->span[i];
5447 x = x->forward[i];
5448 }
5449
5450 if (x->forward[i] && compareStringObjects(x->forward[i]->obj,c->argv[2]) == 0) {
5451 addReplyLong(c, rank);
5452 return;
5453 }
5454 }
5455
5456 addReply(c,shared.nullbulk);
5457 }
5458
5459 /* ========================= Non type-specific commands ==================== */
5460
5461 static void flushdbCommand(redisClient *c) {
5462 server.dirty += dictSize(c->db->dict);
5463 dictEmpty(c->db->dict);
5464 dictEmpty(c->db->expires);
5465 addReply(c,shared.ok);
5466 }
5467
5468 static void flushallCommand(redisClient *c) {
5469 server.dirty += emptyDb();
5470 addReply(c,shared.ok);
5471 rdbSave(server.dbfilename);
5472 server.dirty++;
5473 }
5474
5475 static redisSortOperation *createSortOperation(int type, robj *pattern) {
5476 redisSortOperation *so = zmalloc(sizeof(*so));
5477 so->type = type;
5478 so->pattern = pattern;
5479 return so;
5480 }
5481
5482 /* Return the value associated to the key with a name obtained
5483 * substituting the first occurence of '*' in 'pattern' with 'subst' */
5484 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
5485 char *p;
5486 sds spat, ssub;
5487 robj keyobj;
5488 int prefixlen, sublen, postfixlen;
5489 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5490 struct {
5491 long len;
5492 long free;
5493 char buf[REDIS_SORTKEY_MAX+1];
5494 } keyname;
5495
5496 /* If the pattern is "#" return the substitution object itself in order
5497 * to implement the "SORT ... GET #" feature. */
5498 spat = pattern->ptr;
5499 if (spat[0] == '#' && spat[1] == '\0') {
5500 return subst;
5501 }
5502
5503 /* The substitution object may be specially encoded. If so we create
5504 * a decoded object on the fly. Otherwise getDecodedObject will just
5505 * increment the ref count, that we'll decrement later. */
5506 subst = getDecodedObject(subst);
5507
5508 ssub = subst->ptr;
5509 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5510 p = strchr(spat,'*');
5511 if (!p) {
5512 decrRefCount(subst);
5513 return NULL;
5514 }
5515
5516 prefixlen = p-spat;
5517 sublen = sdslen(ssub);
5518 postfixlen = sdslen(spat)-(prefixlen+1);
5519 memcpy(keyname.buf,spat,prefixlen);
5520 memcpy(keyname.buf+prefixlen,ssub,sublen);
5521 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5522 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5523 keyname.len = prefixlen+sublen+postfixlen;
5524
5525 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
5526 decrRefCount(subst);
5527
5528 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5529 return lookupKeyRead(db,&keyobj);
5530 }
5531
5532 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5533 * the additional parameter is not standard but a BSD-specific we have to
5534 * pass sorting parameters via the global 'server' structure */
5535 static int sortCompare(const void *s1, const void *s2) {
5536 const redisSortObject *so1 = s1, *so2 = s2;
5537 int cmp;
5538
5539 if (!server.sort_alpha) {
5540 /* Numeric sorting. Here it's trivial as we precomputed scores */
5541 if (so1->u.score > so2->u.score) {
5542 cmp = 1;
5543 } else if (so1->u.score < so2->u.score) {
5544 cmp = -1;
5545 } else {
5546 cmp = 0;
5547 }
5548 } else {
5549 /* Alphanumeric sorting */
5550 if (server.sort_bypattern) {
5551 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5552 /* At least one compare object is NULL */
5553 if (so1->u.cmpobj == so2->u.cmpobj)
5554 cmp = 0;
5555 else if (so1->u.cmpobj == NULL)
5556 cmp = -1;
5557 else
5558 cmp = 1;
5559 } else {
5560 /* We have both the objects, use strcoll */
5561 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5562 }
5563 } else {
5564 /* Compare elements directly */
5565 robj *dec1, *dec2;
5566
5567 dec1 = getDecodedObject(so1->obj);
5568 dec2 = getDecodedObject(so2->obj);
5569 cmp = strcoll(dec1->ptr,dec2->ptr);
5570 decrRefCount(dec1);
5571 decrRefCount(dec2);
5572 }
5573 }
5574 return server.sort_desc ? -cmp : cmp;
5575 }
5576
5577 /* The SORT command is the most complex command in Redis. Warning: this code
5578 * is optimized for speed and a bit less for readability */
5579 static void sortCommand(redisClient *c) {
5580 list *operations;
5581 int outputlen = 0;
5582 int desc = 0, alpha = 0;
5583 int limit_start = 0, limit_count = -1, start, end;
5584 int j, dontsort = 0, vectorlen;
5585 int getop = 0; /* GET operation counter */
5586 robj *sortval, *sortby = NULL, *storekey = NULL;
5587 redisSortObject *vector; /* Resulting vector to sort */
5588
5589 /* Lookup the key to sort. It must be of the right types */
5590 sortval = lookupKeyRead(c->db,c->argv[1]);
5591 if (sortval == NULL) {
5592 addReply(c,shared.nullmultibulk);
5593 return;
5594 }
5595 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5596 sortval->type != REDIS_ZSET)
5597 {
5598 addReply(c,shared.wrongtypeerr);
5599 return;
5600 }
5601
5602 /* Create a list of operations to perform for every sorted element.
5603 * Operations can be GET/DEL/INCR/DECR */
5604 operations = listCreate();
5605 listSetFreeMethod(operations,zfree);
5606 j = 2;
5607
5608 /* Now we need to protect sortval incrementing its count, in the future
5609 * SORT may have options able to overwrite/delete keys during the sorting
5610 * and the sorted key itself may get destroied */
5611 incrRefCount(sortval);
5612
5613 /* The SORT command has an SQL-alike syntax, parse it */
5614 while(j < c->argc) {
5615 int leftargs = c->argc-j-1;
5616 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5617 desc = 0;
5618 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5619 desc = 1;
5620 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5621 alpha = 1;
5622 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5623 limit_start = atoi(c->argv[j+1]->ptr);
5624 limit_count = atoi(c->argv[j+2]->ptr);
5625 j+=2;
5626 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5627 storekey = c->argv[j+1];
5628 j++;
5629 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5630 sortby = c->argv[j+1];
5631 /* If the BY pattern does not contain '*', i.e. it is constant,
5632 * we don't need to sort nor to lookup the weight keys. */
5633 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5634 j++;
5635 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5636 listAddNodeTail(operations,createSortOperation(
5637 REDIS_SORT_GET,c->argv[j+1]));
5638 getop++;
5639 j++;
5640 } else {
5641 decrRefCount(sortval);
5642 listRelease(operations);
5643 addReply(c,shared.syntaxerr);
5644 return;
5645 }
5646 j++;
5647 }
5648
5649 /* Load the sorting vector with all the objects to sort */
5650 switch(sortval->type) {
5651 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5652 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5653 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
5654 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
5655 }
5656 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
5657 j = 0;
5658
5659 if (sortval->type == REDIS_LIST) {
5660 list *list = sortval->ptr;
5661 listNode *ln;
5662 listIter li;
5663
5664 listRewind(list,&li);
5665 while((ln = listNext(&li))) {
5666 robj *ele = ln->value;
5667 vector[j].obj = ele;
5668 vector[j].u.score = 0;
5669 vector[j].u.cmpobj = NULL;
5670 j++;
5671 }
5672 } else {
5673 dict *set;
5674 dictIterator *di;
5675 dictEntry *setele;
5676
5677 if (sortval->type == REDIS_SET) {
5678 set = sortval->ptr;
5679 } else {
5680 zset *zs = sortval->ptr;
5681 set = zs->dict;
5682 }
5683
5684 di = dictGetIterator(set);
5685 while((setele = dictNext(di)) != NULL) {
5686 vector[j].obj = dictGetEntryKey(setele);
5687 vector[j].u.score = 0;
5688 vector[j].u.cmpobj = NULL;
5689 j++;
5690 }
5691 dictReleaseIterator(di);
5692 }
5693 redisAssert(j == vectorlen);
5694
5695 /* Now it's time to load the right scores in the sorting vector */
5696 if (dontsort == 0) {
5697 for (j = 0; j < vectorlen; j++) {
5698 if (sortby) {
5699 robj *byval;
5700
5701 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
5702 if (!byval || byval->type != REDIS_STRING) continue;
5703 if (alpha) {
5704 vector[j].u.cmpobj = getDecodedObject(byval);
5705 } else {
5706 if (byval->encoding == REDIS_ENCODING_RAW) {
5707 vector[j].u.score = strtod(byval->ptr,NULL);
5708 } else {
5709 /* Don't need to decode the object if it's
5710 * integer-encoded (the only encoding supported) so
5711 * far. We can just cast it */
5712 if (byval->encoding == REDIS_ENCODING_INT) {
5713 vector[j].u.score = (long)byval->ptr;
5714 } else
5715 redisAssert(1 != 1);
5716 }
5717 }
5718 } else {
5719 if (!alpha) {
5720 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5721 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5722 else {
5723 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5724 vector[j].u.score = (long) vector[j].obj->ptr;
5725 else
5726 redisAssert(1 != 1);
5727 }
5728 }
5729 }
5730 }
5731 }
5732
5733 /* We are ready to sort the vector... perform a bit of sanity check
5734 * on the LIMIT option too. We'll use a partial version of quicksort. */
5735 start = (limit_start < 0) ? 0 : limit_start;
5736 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
5737 if (start >= vectorlen) {
5738 start = vectorlen-1;
5739 end = vectorlen-2;
5740 }
5741 if (end >= vectorlen) end = vectorlen-1;
5742
5743 if (dontsort == 0) {
5744 server.sort_desc = desc;
5745 server.sort_alpha = alpha;
5746 server.sort_bypattern = sortby ? 1 : 0;
5747 if (sortby && (start != 0 || end != vectorlen-1))
5748 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5749 else
5750 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
5751 }
5752
5753 /* Send command output to the output buffer, performing the specified
5754 * GET/DEL/INCR/DECR operations if any. */
5755 outputlen = getop ? getop*(end-start+1) : end-start+1;
5756 if (storekey == NULL) {
5757 /* STORE option not specified, sent the sorting result to client */
5758 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5759 for (j = start; j <= end; j++) {
5760 listNode *ln;
5761 listIter li;
5762
5763 if (!getop) {
5764 addReplyBulkLen(c,vector[j].obj);
5765 addReply(c,vector[j].obj);
5766 addReply(c,shared.crlf);
5767 }
5768 listRewind(operations,&li);
5769 while((ln = listNext(&li))) {
5770 redisSortOperation *sop = ln->value;
5771 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5772 vector[j].obj);
5773
5774 if (sop->type == REDIS_SORT_GET) {
5775 if (!val || val->type != REDIS_STRING) {
5776 addReply(c,shared.nullbulk);
5777 } else {
5778 addReplyBulkLen(c,val);
5779 addReply(c,val);
5780 addReply(c,shared.crlf);
5781 }
5782 } else {
5783 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
5784 }
5785 }
5786 }
5787 } else {
5788 robj *listObject = createListObject();
5789 list *listPtr = (list*) listObject->ptr;
5790
5791 /* STORE option specified, set the sorting result as a List object */
5792 for (j = start; j <= end; j++) {
5793 listNode *ln;
5794 listIter li;
5795
5796 if (!getop) {
5797 listAddNodeTail(listPtr,vector[j].obj);
5798 incrRefCount(vector[j].obj);
5799 }
5800 listRewind(operations,&li);
5801 while((ln = listNext(&li))) {
5802 redisSortOperation *sop = ln->value;
5803 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5804 vector[j].obj);
5805
5806 if (sop->type == REDIS_SORT_GET) {
5807 if (!val || val->type != REDIS_STRING) {
5808 listAddNodeTail(listPtr,createStringObject("",0));
5809 } else {
5810 listAddNodeTail(listPtr,val);
5811 incrRefCount(val);
5812 }
5813 } else {
5814 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
5815 }
5816 }
5817 }
5818 if (dictReplace(c->db->dict,storekey,listObject)) {
5819 incrRefCount(storekey);
5820 }
5821 /* Note: we add 1 because the DB is dirty anyway since even if the
5822 * SORT result is empty a new key is set and maybe the old content
5823 * replaced. */
5824 server.dirty += 1+outputlen;
5825 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
5826 }
5827
5828 /* Cleanup */
5829 decrRefCount(sortval);
5830 listRelease(operations);
5831 for (j = 0; j < vectorlen; j++) {
5832 if (sortby && alpha && vector[j].u.cmpobj)
5833 decrRefCount(vector[j].u.cmpobj);
5834 }
5835 zfree(vector);
5836 }
5837
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * Values below 1024 are printed as an integer number of bytes; larger ones
 * with two decimal digits and a K, M, G, T or P suffix (powers of 1024).
 * Anything from 1024P up is printed as a plain number of bytes again, so
 * 's' is always written.
 *
 * 's' must point to a buffer large enough for the result: at most 22 bytes
 * including the terminator (20 digits plus 'B'). */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the suffixes above: fall back to bytes. */
        sprintf(s,"%lluB",n);
    }
}
5858
5859 /* Create the string returned by the INFO command. This is decoupled
5860 * by the INFO command itself as we need to report the same information
5861 * on memory corruption problems. */
5862 static sds genRedisInfoString(void) {
5863 sds info;
5864 time_t uptime = time(NULL)-server.stat_starttime;
5865 int j;
5866 char hmem[64];
5867
5868 bytesToHuman(hmem,zmalloc_used_memory());
5869 info = sdscatprintf(sdsempty(),
5870 "redis_version:%s\r\n"
5871 "arch_bits:%s\r\n"
5872 "multiplexing_api:%s\r\n"
5873 "process_id:%ld\r\n"
5874 "uptime_in_seconds:%ld\r\n"
5875 "uptime_in_days:%ld\r\n"
5876 "connected_clients:%d\r\n"
5877 "connected_slaves:%d\r\n"
5878 "blocked_clients:%d\r\n"
5879 "used_memory:%zu\r\n"
5880 "used_memory_human:%s\r\n"
5881 "changes_since_last_save:%lld\r\n"
5882 "bgsave_in_progress:%d\r\n"
5883 "last_save_time:%ld\r\n"
5884 "bgrewriteaof_in_progress:%d\r\n"
5885 "total_connections_received:%lld\r\n"
5886 "total_commands_processed:%lld\r\n"
5887 "vm_enabled:%d\r\n"
5888 "role:%s\r\n"
5889 ,REDIS_VERSION,
5890 (sizeof(long) == 8) ? "64" : "32",
5891 aeGetApiName(),
5892 (long) getpid(),
5893 uptime,
5894 uptime/(3600*24),
5895 listLength(server.clients)-listLength(server.slaves),
5896 listLength(server.slaves),
5897 server.blpop_blocked_clients,
5898 zmalloc_used_memory(),
5899 hmem,
5900 server.dirty,
5901 server.bgsavechildpid != -1,
5902 server.lastsave,
5903 server.bgrewritechildpid != -1,
5904 server.stat_numconnections,
5905 server.stat_numcommands,
5906 server.vm_enabled != 0,
5907 server.masterhost == NULL ? "master" : "slave"
5908 );
5909 if (server.masterhost) {
5910 info = sdscatprintf(info,
5911 "master_host:%s\r\n"
5912 "master_port:%d\r\n"
5913 "master_link_status:%s\r\n"
5914 "master_last_io_seconds_ago:%d\r\n"
5915 ,server.masterhost,
5916 server.masterport,
5917 (server.replstate == REDIS_REPL_CONNECTED) ?
5918 "up" : "down",
5919 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
5920 );
5921 }
5922 if (server.vm_enabled) {
5923 lockThreadedIO();
5924 info = sdscatprintf(info,
5925 "vm_conf_max_memory:%llu\r\n"
5926 "vm_conf_page_size:%llu\r\n"
5927 "vm_conf_pages:%llu\r\n"
5928 "vm_stats_used_pages:%llu\r\n"
5929 "vm_stats_swapped_objects:%llu\r\n"
5930 "vm_stats_swappin_count:%llu\r\n"
5931 "vm_stats_swappout_count:%llu\r\n"
5932 "vm_stats_io_newjobs_len:%lu\r\n"
5933 "vm_stats_io_processing_len:%lu\r\n"
5934 "vm_stats_io_processed_len:%lu\r\n"
5935 "vm_stats_io_active_threads:%lu\r\n"
5936 "vm_stats_blocked_clients:%lu\r\n"
5937 ,(unsigned long long) server.vm_max_memory,
5938 (unsigned long long) server.vm_page_size,
5939 (unsigned long long) server.vm_pages,
5940 (unsigned long long) server.vm_stats_used_pages,
5941 (unsigned long long) server.vm_stats_swapped_objects,
5942 (unsigned long long) server.vm_stats_swapins,
5943 (unsigned long long) server.vm_stats_swapouts,
5944 (unsigned long) listLength(server.io_newjobs),
5945 (unsigned long) listLength(server.io_processing),
5946 (unsigned long) listLength(server.io_processed),
5947 (unsigned long) server.io_active_threads,
5948 (unsigned long) server.vm_blocked_clients
5949 );
5950 unlockThreadedIO();
5951 }
5952 for (j = 0; j < server.dbnum; j++) {
5953 long long keys, vkeys;
5954
5955 keys = dictSize(server.db[j].dict);
5956 vkeys = dictSize(server.db[j].expires);
5957 if (keys || vkeys) {
5958 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
5959 j, keys, vkeys);
5960 }
5961 }
5962 return info;
5963 }
5964
5965 static void infoCommand(redisClient *c) {
5966 sds info = genRedisInfoString();
5967 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
5968 (unsigned long)sdslen(info)));
5969 addReplySds(c,info);
5970 addReply(c,shared.crlf);
5971 }
5972
5973 static void monitorCommand(redisClient *c) {
5974 /* ignore MONITOR if aleady slave or in monitor mode */
5975 if (c->flags & REDIS_SLAVE) return;
5976
5977 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
5978 c->slaveseldb = 0;
5979 listAddNodeTail(server.monitors,c);
5980 addReply(c,shared.ok);
5981 }
5982
5983 /* ================================= Expire ================================= */
5984 static int removeExpire(redisDb *db, robj *key) {
5985 if (dictDelete(db->expires,key) == DICT_OK) {
5986 return 1;
5987 } else {
5988 return 0;
5989 }
5990 }
5991
5992 static int setExpire(redisDb *db, robj *key, time_t when) {
5993 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
5994 return 0;
5995 } else {
5996 incrRefCount(key);
5997 return 1;
5998 }
5999 }
6000
6001 /* Return the expire time of the specified key, or -1 if no expire
6002 * is associated with this key (i.e. the key is non volatile) */
6003 static time_t getExpire(redisDb *db, robj *key) {
6004 dictEntry *de;
6005
6006 /* No expire? return ASAP */
6007 if (dictSize(db->expires) == 0 ||
6008 (de = dictFind(db->expires,key)) == NULL) return -1;
6009
6010 return (time_t) dictGetEntryVal(de);
6011 }
6012
6013 static int expireIfNeeded(redisDb *db, robj *key) {
6014 time_t when;
6015 dictEntry *de;
6016
6017 /* No expire? return ASAP */
6018 if (dictSize(db->expires) == 0 ||
6019 (de = dictFind(db->expires,key)) == NULL) return 0;
6020
6021 /* Lookup the expire */
6022 when = (time_t) dictGetEntryVal(de);
6023 if (time(NULL) <= when) return 0;
6024
6025 /* Delete the key */
6026 dictDelete(db->expires,key);
6027 return dictDelete(db->dict,key) == DICT_OK;
6028 }
6029
6030 static int deleteIfVolatile(redisDb *db, robj *key) {
6031 dictEntry *de;
6032
6033 /* No expire? return ASAP */
6034 if (dictSize(db->expires) == 0 ||
6035 (de = dictFind(db->expires,key)) == NULL) return 0;
6036
6037 /* Delete the key */
6038 server.dirty++;
6039 dictDelete(db->expires,key);
6040 return dictDelete(db->dict,key) == DICT_OK;
6041 }
6042
6043 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6044 dictEntry *de;
6045
6046 de = dictFind(c->db->dict,key);
6047 if (de == NULL) {
6048 addReply(c,shared.czero);
6049 return;
6050 }
6051 if (seconds < 0) {
6052 if (deleteKey(c->db,key)) server.dirty++;
6053 addReply(c, shared.cone);
6054 return;
6055 } else {
6056 time_t when = time(NULL)+seconds;
6057 if (setExpire(c->db,key,when)) {
6058 addReply(c,shared.cone);
6059 server.dirty++;
6060 } else {
6061 addReply(c,shared.czero);
6062 }
6063 return;
6064 }
6065 }
6066
6067 static void expireCommand(redisClient *c) {
6068 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6069 }
6070
6071 static void expireatCommand(redisClient *c) {
6072 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6073 }
6074
6075 static void ttlCommand(redisClient *c) {
6076 time_t expire;
6077 int ttl = -1;
6078
6079 expire = getExpire(c->db,c->argv[1]);
6080 if (expire != -1) {
6081 ttl = (int) (expire-time(NULL));
6082 if (ttl < 0) ttl = -1;
6083 }
6084 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6085 }
6086
6087 /* ================================ MULTI/EXEC ============================== */
6088
6089 /* Client state initialization for MULTI/EXEC */
6090 static void initClientMultiState(redisClient *c) {
6091 c->mstate.commands = NULL;
6092 c->mstate.count = 0;
6093 }
6094
6095 /* Release all the resources associated with MULTI/EXEC state */
6096 static void freeClientMultiState(redisClient *c) {
6097 int j;
6098
6099 for (j = 0; j < c->mstate.count; j++) {
6100 int i;
6101 multiCmd *mc = c->mstate.commands+j;
6102
6103 for (i = 0; i < mc->argc; i++)
6104 decrRefCount(mc->argv[i]);
6105 zfree(mc->argv);
6106 }
6107 zfree(c->mstate.commands);
6108 }
6109
6110 /* Add a new command into the MULTI commands queue */
6111 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6112 multiCmd *mc;
6113 int j;
6114
6115 c->mstate.commands = zrealloc(c->mstate.commands,
6116 sizeof(multiCmd)*(c->mstate.count+1));
6117 mc = c->mstate.commands+c->mstate.count;
6118 mc->cmd = cmd;
6119 mc->argc = c->argc;
6120 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6121 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6122 for (j = 0; j < c->argc; j++)
6123 incrRefCount(mc->argv[j]);
6124 c->mstate.count++;
6125 }
6126
6127 static void multiCommand(redisClient *c) {
6128 c->flags |= REDIS_MULTI;
6129 addReply(c,shared.ok);
6130 }
6131
6132 static void discardCommand(redisClient *c) {
6133 if (!(c->flags & REDIS_MULTI)) {
6134 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6135 return;
6136 }
6137
6138 freeClientMultiState(c);
6139 initClientMultiState(c);
6140 c->flags &= (~REDIS_MULTI);
6141 addReply(c,shared.ok);
6142 }
6143
6144 static void execCommand(redisClient *c) {
6145 int j;
6146 robj **orig_argv;
6147 int orig_argc;
6148
6149 if (!(c->flags & REDIS_MULTI)) {
6150 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6151 return;
6152 }
6153
6154 orig_argv = c->argv;
6155 orig_argc = c->argc;
6156 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6157 for (j = 0; j < c->mstate.count; j++) {
6158 c->argc = c->mstate.commands[j].argc;
6159 c->argv = c->mstate.commands[j].argv;
6160 call(c,c->mstate.commands[j].cmd);
6161 }
6162 c->argv = orig_argv;
6163 c->argc = orig_argc;
6164 freeClientMultiState(c);
6165 initClientMultiState(c);
6166 c->flags &= (~REDIS_MULTI);
6167 }
6168
6169 /* =========================== Blocking Operations ========================= */
6170
6171 /* Currently Redis blocking operations support is limited to list POP ops,
6172 * so the current implementation is not fully generic, but it is also not
6173 * completely specific so it will not require a rewrite to support new
6174 * kind of blocking operations in the future.
6175 *
6176 * Still it's important to note that list blocking operations can be already
6177 * used as a notification mechanism in order to implement other blocking
6178 * operations at application level, so there must be a very strong evidence
6179 * of usefulness and generality before new blocking operations are implemented.
6180 *
6181 * This is how the current blocking POP works, using BLPOP as an example:
6182 * - If the user calls BLPOP and the key exists and contains a non empty list
6183 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6184 * when there is no need to block.
6185 * - If instead BLPOP is called and the key does not exist or the list is
6186 * empty we need to block. In order to do so we remove the notification for
6187 * new data to read in the client socket (so that we'll not serve new
6188 * requests if the blocking request is not served). Also we put the client
6189 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6190 * blocking for these keys.
6191 * - If a PUSH operation against a key with blocked clients waiting is
6192 * performed, we serve the first in the list: basically instead of pushing
6193 * the new element inside the list we return it to the (first / oldest)
6194 * blocking client, unblock the client, and remove it from the list.
6195 *
6196 * The above comment and the source code should be enough in order to understand
6197 * the implementation and modify / fix it later.
6198 */
6199
6200 /* Set a client in blocking mode for the specified key, with the specified
6201 * timeout */
6202 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6203 dictEntry *de;
6204 list *l;
6205 int j;
6206
6207 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6208 c->blockingkeysnum = numkeys;
6209 c->blockingto = timeout;
6210 for (j = 0; j < numkeys; j++) {
6211 /* Add the key in the client structure, to map clients -> keys */
6212 c->blockingkeys[j] = keys[j];
6213 incrRefCount(keys[j]);
6214
6215 /* And in the other "side", to map keys -> clients */
6216 de = dictFind(c->db->blockingkeys,keys[j]);
6217 if (de == NULL) {
6218 int retval;
6219
6220 /* For every key we take a list of clients blocked for it */
6221 l = listCreate();
6222 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6223 incrRefCount(keys[j]);
6224 assert(retval == DICT_OK);
6225 } else {
6226 l = dictGetEntryVal(de);
6227 }
6228 listAddNodeTail(l,c);
6229 }
6230 /* Mark the client as a blocked client */
6231 c->flags |= REDIS_BLOCKED;
6232 server.blpop_blocked_clients++;
6233 }
6234
6235 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6236 static void unblockClientWaitingData(redisClient *c) {
6237 dictEntry *de;
6238 list *l;
6239 int j;
6240
6241 assert(c->blockingkeys != NULL);
6242 /* The client may wait for multiple keys, so unblock it for every key. */
6243 for (j = 0; j < c->blockingkeysnum; j++) {
6244 /* Remove this client from the list of clients waiting for this key. */
6245 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6246 assert(de != NULL);
6247 l = dictGetEntryVal(de);
6248 listDelNode(l,listSearchKey(l,c));
6249 /* If the list is empty we need to remove it to avoid wasting memory */
6250 if (listLength(l) == 0)
6251 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6252 decrRefCount(c->blockingkeys[j]);
6253 }
6254 /* Cleanup the client structure */
6255 zfree(c->blockingkeys);
6256 c->blockingkeys = NULL;
6257 c->flags &= (~REDIS_BLOCKED);
6258 server.blpop_blocked_clients--;
6259 /* We want to process data if there is some command waiting
6260 * in the input buffer. Note that this is safe even if
6261 * unblockClientWaitingData() gets called from freeClient() because
6262 * freeClient() will be smart enough to call this function
6263 * *after* c->querybuf was set to NULL. */
6264 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6265 }
6266
6267 /* This should be called from any function PUSHing into lists.
6268 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6269 * 'ele' is the element pushed.
6270 *
6271 * If the function returns 0 there was no client waiting for a list push
6272 * against this key.
6273 *
6274 * If the function returns 1 there was a client waiting for a list push
6275 * against this key, the element was passed to this client thus it's not
6276 * needed to actually add it to the list and the caller should return asap. */
6277 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6278 struct dictEntry *de;
6279 redisClient *receiver;
6280 list *l;
6281 listNode *ln;
6282
6283 de = dictFind(c->db->blockingkeys,key);
6284 if (de == NULL) return 0;
6285 l = dictGetEntryVal(de);
6286 ln = listFirst(l);
6287 assert(ln != NULL);
6288 receiver = ln->value;
6289
6290 addReplySds(receiver,sdsnew("*2\r\n"));
6291 addReplyBulkLen(receiver,key);
6292 addReply(receiver,key);
6293 addReply(receiver,shared.crlf);
6294 addReplyBulkLen(receiver,ele);
6295 addReply(receiver,ele);
6296 addReply(receiver,shared.crlf);
6297 unblockClientWaitingData(receiver);
6298 return 1;
6299 }
6300
6301 /* Blocking RPOP/LPOP */
6302 static void blockingPopGenericCommand(redisClient *c, int where) {
6303 robj *o;
6304 time_t timeout;
6305 int j;
6306
6307 for (j = 1; j < c->argc-1; j++) {
6308 o = lookupKeyWrite(c->db,c->argv[j]);
6309 if (o != NULL) {
6310 if (o->type != REDIS_LIST) {
6311 addReply(c,shared.wrongtypeerr);
6312 return;
6313 } else {
6314 list *list = o->ptr;
6315 if (listLength(list) != 0) {
6316 /* If the list contains elements fall back to the usual
6317 * non-blocking POP operation */
6318 robj *argv[2], **orig_argv;
6319 int orig_argc;
6320
6321 /* We need to alter the command arguments before to call
6322 * popGenericCommand() as the command takes a single key. */
6323 orig_argv = c->argv;
6324 orig_argc = c->argc;
6325 argv[1] = c->argv[j];
6326 c->argv = argv;
6327 c->argc = 2;
6328
6329 /* Also the return value is different, we need to output
6330 * the multi bulk reply header and the key name. The
6331 * "real" command will add the last element (the value)
6332 * for us. If this souds like an hack to you it's just
6333 * because it is... */
6334 addReplySds(c,sdsnew("*2\r\n"));
6335 addReplyBulkLen(c,argv[1]);
6336 addReply(c,argv[1]);
6337 addReply(c,shared.crlf);
6338 popGenericCommand(c,where);
6339
6340 /* Fix the client structure with the original stuff */
6341 c->argv = orig_argv;
6342 c->argc = orig_argc;
6343 return;
6344 }
6345 }
6346 }
6347 }
6348 /* If the list is empty or the key does not exists we must block */
6349 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
6350 if (timeout > 0) timeout += time(NULL);
6351 blockForKeys(c,c->argv+1,c->argc-2,timeout);
6352 }
6353
6354 static void blpopCommand(redisClient *c) {
6355 blockingPopGenericCommand(c,REDIS_HEAD);
6356 }
6357
6358 static void brpopCommand(redisClient *c) {
6359 blockingPopGenericCommand(c,REDIS_TAIL);
6360 }
6361
6362 /* =============================== Replication ============================= */
6363
6364 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
6365 ssize_t nwritten, ret = size;
6366 time_t start = time(NULL);
6367
6368 timeout++;
6369 while(size) {
6370 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6371 nwritten = write(fd,ptr,size);
6372 if (nwritten == -1) return -1;
6373 ptr += nwritten;
6374 size -= nwritten;
6375 }
6376 if ((time(NULL)-start) > timeout) {
6377 errno = ETIMEDOUT;
6378 return -1;
6379 }
6380 }
6381 return ret;
6382 }
6383
6384 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
6385 ssize_t nread, totread = 0;
6386 time_t start = time(NULL);
6387
6388 timeout++;
6389 while(size) {
6390 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6391 nread = read(fd,ptr,size);
6392 if (nread == -1) return -1;
6393 ptr += nread;
6394 size -= nread;
6395 totread += nread;
6396 }
6397 if ((time(NULL)-start) > timeout) {
6398 errno = ETIMEDOUT;
6399 return -1;
6400 }
6401 }
6402 return totread;
6403 }
6404
/* Read a line from 'fd' into the 'size' bytes buffer at 'ptr', one byte
 * at a time through syncRead() ('timeout' applies to every single byte).
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is stripped as well. Reading also stops once size-1 bytes
 * have been stored, so the result is always null terminated and never
 * written past the end of the buffer. The loop used to never decrement
 * 'size', so a peer sending a long enough line without a newline could
 * write past the end of the caller's buffer.
 *
 * Returns the number of bytes stored (the stripped '\r', if any, is still
 * counted), 0 if 'size' is not positive, or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* reserve room for the null term */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            size--;
        }
    }
    return nread;
}
6425
6426 static void syncCommand(redisClient *c) {
6427 /* ignore SYNC if aleady slave or in monitor mode */
6428 if (c->flags & REDIS_SLAVE) return;
6429
6430 /* SYNC can't be issued when the server has pending data to send to
6431 * the client about already issued commands. We need a fresh reply
6432 * buffer registering the differences between the BGSAVE and the current
6433 * dataset, so that we can copy to other slaves if needed. */
6434 if (listLength(c->reply) != 0) {
6435 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6436 return;
6437 }
6438
6439 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6440 /* Here we need to check if there is a background saving operation
6441 * in progress, or if it is required to start one */
6442 if (server.bgsavechildpid != -1) {
6443 /* Ok a background save is in progress. Let's check if it is a good
6444 * one for replication, i.e. if there is another slave that is
6445 * registering differences since the server forked to save */
6446 redisClient *slave;
6447 listNode *ln;
6448 listIter li;
6449
6450 listRewind(server.slaves,&li);
6451 while((ln = listNext(&li))) {
6452 slave = ln->value;
6453 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
6454 }
6455 if (ln) {
6456 /* Perfect, the server is already registering differences for
6457 * another slave. Set the right state, and copy the buffer. */
6458 listRelease(c->reply);
6459 c->reply = listDup(slave->reply);
6460 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6461 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6462 } else {
6463 /* No way, we need to wait for the next BGSAVE in order to
6464 * register differences */
6465 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6466 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6467 }
6468 } else {
6469 /* Ok we don't have a BGSAVE in progress, let's start one */
6470 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6471 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6472 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6473 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6474 return;
6475 }
6476 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6477 }
6478 c->repldbfd = -1;
6479 c->flags |= REDIS_SLAVE;
6480 c->slaveseldb = 0;
6481 listAddNodeTail(server.slaves,c);
6482 return;
6483 }
6484
6485 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6486 redisClient *slave = privdata;
6487 REDIS_NOTUSED(el);
6488 REDIS_NOTUSED(mask);
6489 char buf[REDIS_IOBUF_LEN];
6490 ssize_t nwritten, buflen;
6491
6492 if (slave->repldboff == 0) {
6493 /* Write the bulk write count before to transfer the DB. In theory here
6494 * we don't know how much room there is in the output buffer of the
6495 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6496 * operations) will never be smaller than the few bytes we need. */
6497 sds bulkcount;
6498
6499 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6500 slave->repldbsize);
6501 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6502 {
6503 sdsfree(bulkcount);
6504 freeClient(slave);
6505 return;
6506 }
6507 sdsfree(bulkcount);
6508 }
6509 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6510 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6511 if (buflen <= 0) {
6512 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6513 (buflen == 0) ? "premature EOF" : strerror(errno));
6514 freeClient(slave);
6515 return;
6516 }
6517 if ((nwritten = write(fd,buf,buflen)) == -1) {
6518 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6519 strerror(errno));
6520 freeClient(slave);
6521 return;
6522 }
6523 slave->repldboff += nwritten;
6524 if (slave->repldboff == slave->repldbsize) {
6525 close(slave->repldbfd);
6526 slave->repldbfd = -1;
6527 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6528 slave->replstate = REDIS_REPL_ONLINE;
6529 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
6530 sendReplyToClient, slave) == AE_ERR) {
6531 freeClient(slave);
6532 return;
6533 }
6534 addReplySds(slave,sdsempty());
6535 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6536 }
6537 }
6538
6539 /* This function is called at the end of every backgrond saving.
6540 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6541 * otherwise REDIS_ERR is passed to the function.
6542 *
6543 * The goal of this function is to handle slaves waiting for a successful
6544 * background saving in order to perform non-blocking synchronization. */
6545 static void updateSlavesWaitingBgsave(int bgsaveerr) {
6546 listNode *ln;
6547 int startbgsave = 0;
6548 listIter li;
6549
6550 listRewind(server.slaves,&li);
6551 while((ln = listNext(&li))) {
6552 redisClient *slave = ln->value;
6553
6554 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6555 startbgsave = 1;
6556 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6557 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
6558 struct redis_stat buf;
6559
6560 if (bgsaveerr != REDIS_OK) {
6561 freeClient(slave);
6562 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6563 continue;
6564 }
6565 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
6566 redis_fstat(slave->repldbfd,&buf) == -1) {
6567 freeClient(slave);
6568 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6569 continue;
6570 }
6571 slave->repldboff = 0;
6572 slave->repldbsize = buf.st_size;
6573 slave->replstate = REDIS_REPL_SEND_BULK;
6574 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6575 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6576 freeClient(slave);
6577 continue;
6578 }
6579 }
6580 }
6581 if (startbgsave) {
6582 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6583 listIter li;
6584
6585 listRewind(server.slaves,&li);
6586 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6587 while((ln = listNext(&li))) {
6588 redisClient *slave = ln->value;
6589
6590 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6591 freeClient(slave);
6592 }
6593 }
6594 }
6595 }
6596
6597 static int syncWithMaster(void) {
6598 char buf[1024], tmpfile[256], authcmd[1024];
6599 int dumpsize;
6600 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6601 int dfd;
6602
6603 if (fd == -1) {
6604 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6605 strerror(errno));
6606 return REDIS_ERR;
6607 }
6608
6609 /* AUTH with the master if required. */
6610 if(server.masterauth) {
6611 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6612 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6613 close(fd);
6614 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6615 strerror(errno));
6616 return REDIS_ERR;
6617 }
6618 /* Read the AUTH result. */
6619 if (syncReadLine(fd,buf,1024,3600) == -1) {
6620 close(fd);
6621 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6622 strerror(errno));
6623 return REDIS_ERR;
6624 }
6625 if (buf[0] != '+') {
6626 close(fd);
6627 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6628 return REDIS_ERR;
6629 }
6630 }
6631
6632 /* Issue the SYNC command */
6633 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6634 close(fd);
6635 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6636 strerror(errno));
6637 return REDIS_ERR;
6638 }
6639 /* Read the bulk write count */
6640 if (syncReadLine(fd,buf,1024,3600) == -1) {
6641 close(fd);
6642 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6643 strerror(errno));
6644 return REDIS_ERR;
6645 }
6646 if (buf[0] != '$') {
6647 close(fd);
6648 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6649 return REDIS_ERR;
6650 }
6651 dumpsize = atoi(buf+1);
6652 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6653 /* Read the bulk write data on a temp file */
6654 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6655 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6656 if (dfd == -1) {
6657 close(fd);
6658 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6659 return REDIS_ERR;
6660 }
6661 while(dumpsize) {
6662 int nread, nwritten;
6663
6664 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6665 if (nread == -1) {
6666 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6667 strerror(errno));
6668 close(fd);
6669 close(dfd);
6670 return REDIS_ERR;
6671 }
6672 nwritten = write(dfd,buf,nread);
6673 if (nwritten == -1) {
6674 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6675 close(fd);
6676 close(dfd);
6677 return REDIS_ERR;
6678 }
6679 dumpsize -= nread;
6680 }
6681 close(dfd);
6682 if (rename(tmpfile,server.dbfilename) == -1) {
6683 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6684 unlink(tmpfile);
6685 close(fd);
6686 return REDIS_ERR;
6687 }
6688 emptyDb();
6689 if (rdbLoad(server.dbfilename) != REDIS_OK) {
6690 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6691 close(fd);
6692 return REDIS_ERR;
6693 }
6694 server.master = createClient(fd);
6695 server.master->flags |= REDIS_MASTER;
6696 server.master->authenticated = 1;
6697 server.replstate = REDIS_REPL_CONNECTED;
6698 return REDIS_OK;
6699 }
6700
6701 static void slaveofCommand(redisClient *c) {
6702 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6703 !strcasecmp(c->argv[2]->ptr,"one")) {
6704 if (server.masterhost) {
6705 sdsfree(server.masterhost);
6706 server.masterhost = NULL;
6707 if (server.master) freeClient(server.master);
6708 server.replstate = REDIS_REPL_NONE;
6709 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6710 }
6711 } else {
6712 sdsfree(server.masterhost);
6713 server.masterhost = sdsdup(c->argv[1]->ptr);
6714 server.masterport = atoi(c->argv[2]->ptr);
6715 if (server.master) freeClient(server.master);
6716 server.replstate = REDIS_REPL_CONNECT;
6717 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6718 server.masterhost, server.masterport);
6719 }
6720 addReply(c,shared.ok);
6721 }
6722
6723 /* ============================ Maxmemory directive ======================== */
6724
6725 /* Try to free one object form the pre-allocated objects free list.
6726 * This is useful under low mem conditions as by default we take 1 million
6727 * free objects allocated. On success REDIS_OK is returned, otherwise
6728 * REDIS_ERR. */
6729 static int tryFreeOneObjectFromFreelist(void) {
6730 robj *o;
6731
6732 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6733 if (listLength(server.objfreelist)) {
6734 listNode *head = listFirst(server.objfreelist);
6735 o = listNodeValue(head);
6736 listDelNode(server.objfreelist,head);
6737 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6738 zfree(o);
6739 return REDIS_OK;
6740 } else {
6741 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6742 return REDIS_ERR;
6743 }
6744 }
6745
6746 /* This function gets called when 'maxmemory' is set on the config file to limit
6747 * the max memory used by the server, and we are out of memory.
6748 * This function will try to, in order:
6749 *
6750 * - Free objects from the free list
6751 * - Try to remove keys with an EXPIRE set
6752 *
6753 * It is not possible to free enough memory to reach used-memory < maxmemory
6754 * the server will start refusing commands that will enlarge even more the
6755 * memory usage.
6756 */
6757 static void freeMemoryIfNeeded(void) {
6758 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
6759 int j, k, freed = 0;
6760
6761 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6762 for (j = 0; j < server.dbnum; j++) {
6763 int minttl = -1;
6764 robj *minkey = NULL;
6765 struct dictEntry *de;
6766
6767 if (dictSize(server.db[j].expires)) {
6768 freed = 1;
6769 /* From a sample of three keys drop the one nearest to
6770 * the natural expire */
6771 for (k = 0; k < 3; k++) {
6772 time_t t;
6773
6774 de = dictGetRandomKey(server.db[j].expires);
6775 t = (time_t) dictGetEntryVal(de);
6776 if (minttl == -1 || t < minttl) {
6777 minkey = dictGetEntryKey(de);
6778 minttl = t;
6779 }
6780 }
6781 deleteKey(server.db+j,minkey);
6782 }
6783 }
6784 if (!freed) return; /* nothing to free... */
6785 }
6786 }
6787
6788 /* ============================== Append Only file ========================== */
6789
6790 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6791 sds buf = sdsempty();
6792 int j;
6793 ssize_t nwritten;
6794 time_t now;
6795 robj *tmpargv[3];
6796
6797 /* The DB this command was targetting is not the same as the last command
6798 * we appendend. To issue a SELECT command is needed. */
6799 if (dictid != server.appendseldb) {
6800 char seldb[64];
6801
6802 snprintf(seldb,sizeof(seldb),"%d",dictid);
6803 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6804 (unsigned long)strlen(seldb),seldb);
6805 server.appendseldb = dictid;
6806 }
6807
6808 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6809 * EXPIREs into EXPIREATs calls */
6810 if (cmd->proc == expireCommand) {
6811 long when;
6812
6813 tmpargv[0] = createStringObject("EXPIREAT",8);
6814 tmpargv[1] = argv[1];
6815 incrRefCount(argv[1]);
6816 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6817 tmpargv[2] = createObject(REDIS_STRING,
6818 sdscatprintf(sdsempty(),"%ld",when));
6819 argv = tmpargv;
6820 }
6821
6822 /* Append the actual command */
6823 buf = sdscatprintf(buf,"*%d\r\n",argc);
6824 for (j = 0; j < argc; j++) {
6825 robj *o = argv[j];
6826
6827 o = getDecodedObject(o);
6828 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
6829 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6830 buf = sdscatlen(buf,"\r\n",2);
6831 decrRefCount(o);
6832 }
6833
6834 /* Free the objects from the modified argv for EXPIREAT */
6835 if (cmd->proc == expireCommand) {
6836 for (j = 0; j < 3; j++)
6837 decrRefCount(argv[j]);
6838 }
6839
6840 /* We want to perform a single write. This should be guaranteed atomic
6841 * at least if the filesystem we are writing is a real physical one.
6842 * While this will save us against the server being killed I don't think
6843 * there is much to do about the whole server stopping for power problems
6844 * or alike */
6845 nwritten = write(server.appendfd,buf,sdslen(buf));
6846 if (nwritten != (signed)sdslen(buf)) {
6847 /* Ooops, we are in troubles. The best thing to do for now is
6848 * to simply exit instead to give the illusion that everything is
6849 * working as expected. */
6850 if (nwritten == -1) {
6851 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6852 } else {
6853 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6854 }
6855 exit(1);
6856 }
6857 /* If a background append only file rewriting is in progress we want to
6858 * accumulate the differences between the child DB and the current one
6859 * in a buffer, so that when the child process will do its work we
6860 * can append the differences to the new append only file. */
6861 if (server.bgrewritechildpid != -1)
6862 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6863
6864 sdsfree(buf);
6865 now = time(NULL);
6866 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6867 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6868 now-server.lastfsync > 1))
6869 {
6870 fsync(server.appendfd); /* Let's try to get this data on the disk */
6871 server.lastfsync = now;
6872 }
6873 }
6874
6875 /* In Redis commands are always executed in the context of a client, so in
6876 * order to load the append only file we need to create a fake client. */
6877 static struct redisClient *createFakeClient(void) {
6878 struct redisClient *c = zmalloc(sizeof(*c));
6879
6880 selectDb(c,0);
6881 c->fd = -1;
6882 c->querybuf = sdsempty();
6883 c->argc = 0;
6884 c->argv = NULL;
6885 c->flags = 0;
6886 /* We set the fake client as a slave waiting for the synchronization
6887 * so that Redis will not try to send replies to this client. */
6888 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6889 c->reply = listCreate();
6890 listSetFreeMethod(c->reply,decrRefCount);
6891 listSetDupMethod(c->reply,dupClientReplyValue);
6892 return c;
6893 }
6894
6895 static void freeFakeClient(struct redisClient *c) {
6896 sdsfree(c->querybuf);
6897 listRelease(c->reply);
6898 zfree(c);
6899 }
6900
6901 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
6902 * error (the append only file is zero-length) REDIS_ERR is returned. On
6903 * fatal error an error message is logged and the program exists. */
6904 int loadAppendOnlyFile(char *filename) {
6905 struct redisClient *fakeClient;
6906 FILE *fp = fopen(filename,"r");
6907 struct redis_stat sb;
6908 unsigned long long loadedkeys = 0;
6909
6910 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
6911 return REDIS_ERR;
6912
6913 if (fp == NULL) {
6914 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
6915 exit(1);
6916 }
6917
6918 fakeClient = createFakeClient();
6919 while(1) {
6920 int argc, j;
6921 unsigned long len;
6922 robj **argv;
6923 char buf[128];
6924 sds argsds;
6925 struct redisCommand *cmd;
6926
6927 if (fgets(buf,sizeof(buf),fp) == NULL) {
6928 if (feof(fp))
6929 break;
6930 else
6931 goto readerr;
6932 }
6933 if (buf[0] != '*') goto fmterr;
6934 argc = atoi(buf+1);
6935 argv = zmalloc(sizeof(robj*)*argc);
6936 for (j = 0; j < argc; j++) {
6937 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
6938 if (buf[0] != '$') goto fmterr;
6939 len = strtol(buf+1,NULL,10);
6940 argsds = sdsnewlen(NULL,len);
6941 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
6942 argv[j] = createObject(REDIS_STRING,argsds);
6943 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
6944 }
6945
6946 /* Command lookup */
6947 cmd = lookupCommand(argv[0]->ptr);
6948 if (!cmd) {
6949 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
6950 exit(1);
6951 }
6952 /* Try object sharing and encoding */
6953 if (server.shareobjects) {
6954 int j;
6955 for(j = 1; j < argc; j++)
6956 argv[j] = tryObjectSharing(argv[j]);
6957 }
6958 if (cmd->flags & REDIS_CMD_BULK)
6959 tryObjectEncoding(argv[argc-1]);
6960 /* Run the command in the context of a fake client */
6961 fakeClient->argc = argc;
6962 fakeClient->argv = argv;
6963 cmd->proc(fakeClient);
6964 /* Discard the reply objects list from the fake client */
6965 while(listLength(fakeClient->reply))
6966 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
6967 /* Clean up, ready for the next command */
6968 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
6969 zfree(argv);
6970 /* Handle swapping while loading big datasets when VM is on */
6971 loadedkeys++;
6972 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
6973 while (zmalloc_used_memory() > server.vm_max_memory) {
6974 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
6975 }
6976 }
6977 }
6978 fclose(fp);
6979 freeFakeClient(fakeClient);
6980 return REDIS_OK;
6981
6982 readerr:
6983 if (feof(fp)) {
6984 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
6985 } else {
6986 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
6987 }
6988 exit(1);
6989 fmterr:
6990 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
6991 exit(1);
6992 }
6993
6994 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6995 static int fwriteBulk(FILE *fp, robj *obj) {
6996 char buf[128];
6997 int decrrc = 0;
6998
6999 /* Avoid the incr/decr ref count business if possible to help
7000 * copy-on-write (we are often in a child process when this function
7001 * is called).
7002 * Also makes sure that key objects don't get incrRefCount-ed when VM
7003 * is enabled */
7004 if (obj->encoding != REDIS_ENCODING_RAW) {
7005 obj = getDecodedObject(obj);
7006 decrrc = 1;
7007 }
7008 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7009 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7010 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7011 goto err;
7012 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7013 if (decrrc) decrRefCount(obj);
7014 return 1;
7015 err:
7016 if (decrrc) decrRefCount(obj);
7017 return 0;
7018 }
7019
/* Write the double 'd', formatted with "%.17g", to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if a write failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7030
/* Write the long 'l', formatted in decimal, to 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if a write failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the advertised count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7041
7042 /* Write a sequence of commands able to fully rebuild the dataset into
7043 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7044 static int rewriteAppendOnlyFile(char *filename) {
7045 dictIterator *di = NULL;
7046 dictEntry *de;
7047 FILE *fp;
7048 char tmpfile[256];
7049 int j;
7050 time_t now = time(NULL);
7051
7052 /* Note that we have to use a different temp name here compared to the
7053 * one used by rewriteAppendOnlyFileBackground() function. */
7054 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7055 fp = fopen(tmpfile,"w");
7056 if (!fp) {
7057 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7058 return REDIS_ERR;
7059 }
7060 for (j = 0; j < server.dbnum; j++) {
7061 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7062 redisDb *db = server.db+j;
7063 dict *d = db->dict;
7064 if (dictSize(d) == 0) continue;
7065 di = dictGetIterator(d);
7066 if (!di) {
7067 fclose(fp);
7068 return REDIS_ERR;
7069 }
7070
7071 /* SELECT the new DB */
7072 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7073 if (fwriteBulkLong(fp,j) == 0) goto werr;
7074
7075 /* Iterate this DB writing every entry */
7076 while((de = dictNext(di)) != NULL) {
7077 robj *key, *o;
7078 time_t expiretime;
7079 int swapped;
7080
7081 key = dictGetEntryKey(de);
7082 /* If the value for this key is swapped, load a preview in memory.
7083 * We use a "swapped" flag to remember if we need to free the
7084 * value object instead to just increment the ref count anyway
7085 * in order to avoid copy-on-write of pages if we are forked() */
7086 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7087 key->storage == REDIS_VM_SWAPPING) {
7088 o = dictGetEntryVal(de);
7089 swapped = 0;
7090 } else {
7091 o = vmPreviewObject(key);
7092 swapped = 1;
7093 }
7094 expiretime = getExpire(db,key);
7095
7096 /* Save the key and associated value */
7097 if (o->type == REDIS_STRING) {
7098 /* Emit a SET command */
7099 char cmd[]="*3\r\n$3\r\nSET\r\n";
7100 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7101 /* Key and value */
7102 if (fwriteBulk(fp,key) == 0) goto werr;
7103 if (fwriteBulk(fp,o) == 0) goto werr;
7104 } else if (o->type == REDIS_LIST) {
7105 /* Emit the RPUSHes needed to rebuild the list */
7106 list *list = o->ptr;
7107 listNode *ln;
7108 listIter li;
7109
7110 listRewind(list,&li);
7111 while((ln = listNext(&li))) {
7112 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7113 robj *eleobj = listNodeValue(ln);
7114
7115 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7116 if (fwriteBulk(fp,key) == 0) goto werr;
7117 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7118 }
7119 } else if (o->type == REDIS_SET) {
7120 /* Emit the SADDs needed to rebuild the set */
7121 dict *set = o->ptr;
7122 dictIterator *di = dictGetIterator(set);
7123 dictEntry *de;
7124
7125 while((de = dictNext(di)) != NULL) {
7126 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7127 robj *eleobj = dictGetEntryKey(de);
7128
7129 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7130 if (fwriteBulk(fp,key) == 0) goto werr;
7131 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7132 }
7133 dictReleaseIterator(di);
7134 } else if (o->type == REDIS_ZSET) {
7135 /* Emit the ZADDs needed to rebuild the sorted set */
7136 zset *zs = o->ptr;
7137 dictIterator *di = dictGetIterator(zs->dict);
7138 dictEntry *de;
7139
7140 while((de = dictNext(di)) != NULL) {
7141 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7142 robj *eleobj = dictGetEntryKey(de);
7143 double *score = dictGetEntryVal(de);
7144
7145 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7146 if (fwriteBulk(fp,key) == 0) goto werr;
7147 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7148 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7149 }
7150 dictReleaseIterator(di);
7151 } else {
7152 redisAssert(0 != 0);
7153 }
7154 /* Save the expire time */
7155 if (expiretime != -1) {
7156 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7157 /* If this key is already expired skip it */
7158 if (expiretime < now) continue;
7159 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7160 if (fwriteBulk(fp,key) == 0) goto werr;
7161 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7162 }
7163 if (swapped) decrRefCount(o);
7164 }
7165 dictReleaseIterator(di);
7166 }
7167
7168 /* Make sure data will not remain on the OS's output buffers */
7169 fflush(fp);
7170 fsync(fileno(fp));
7171 fclose(fp);
7172
7173 /* Use RENAME to make sure the DB file is changed atomically only
7174 * if the generate DB file is ok. */
7175 if (rename(tmpfile,filename) == -1) {
7176 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7177 unlink(tmpfile);
7178 return REDIS_ERR;
7179 }
7180 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7181 return REDIS_OK;
7182
7183 werr:
7184 fclose(fp);
7185 unlink(tmpfile);
7186 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7187 if (di) dictReleaseIterator(di);
7188 return REDIS_ERR;
7189 }
7190
7191 /* This is how rewriting of the append only file in background works:
7192 *
7193 * 1) The user calls BGREWRITEAOF
7194 * 2) Redis calls this function, that forks():
7195 * 2a) the child rewrite the append only file in a temp file.
7196 * 2b) the parent accumulates differences in server.bgrewritebuf.
7197 * 3) When the child finished '2a' exists.
7198 * 4) The parent will trap the exit code, if it's OK, will append the
7199 * data accumulated into server.bgrewritebuf into the temp file, and
7200 * finally will rename(2) the temp file in the actual file name.
7201 * The the new file is reopened as the new append only file. Profit!
7202 */
7203 static int rewriteAppendOnlyFileBackground(void) {
7204 pid_t childpid;
7205
7206 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7207 if (server.vm_enabled) waitEmptyIOJobsQueue();
7208 if ((childpid = fork()) == 0) {
7209 /* Child */
7210 char tmpfile[256];
7211
7212 if (server.vm_enabled) vmReopenSwapFile();
7213 close(server.fd);
7214 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7215 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7216 _exit(0);
7217 } else {
7218 _exit(1);
7219 }
7220 } else {
7221 /* Parent */
7222 if (childpid == -1) {
7223 redisLog(REDIS_WARNING,
7224 "Can't rewrite append only file in background: fork: %s",
7225 strerror(errno));
7226 return REDIS_ERR;
7227 }
7228 redisLog(REDIS_NOTICE,
7229 "Background append only file rewriting started by pid %d",childpid);
7230 server.bgrewritechildpid = childpid;
7231 /* We set appendseldb to -1 in order to force the next call to the
7232 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7233 * accumulated by the parent into server.bgrewritebuf will start
7234 * with a SELECT statement and it will be safe to merge. */
7235 server.appendseldb = -1;
7236 return REDIS_OK;
7237 }
7238 return REDIS_OK; /* unreached */
7239 }
7240
7241 static void bgrewriteaofCommand(redisClient *c) {
7242 if (server.bgrewritechildpid != -1) {
7243 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7244 return;
7245 }
7246 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7247 char *status = "+Background append only file rewriting started\r\n";
7248 addReplySds(c,sdsnew(status));
7249 } else {
7250 addReply(c,shared.err);
7251 }
7252 }
7253
/* Remove the temp file named "temp-rewriteaof-bg-<childpid>.aof", that is
 * the name used by the background rewrite child with pid 'childpid'. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7260
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
7281
7282 /* =================== Virtual Memory - Blocking Side ====================== */
7283
7284 /* substitute the first occurrence of '%p' with the process pid in the
7285 * swap file name. */
7286 static void expandVmSwapFilename(void) {
7287 char *p = strstr(server.vm_swap_file,"%p");
7288 sds new;
7289
7290 if (!p) return;
7291 new = sdsempty();
7292 *p = '\0';
7293 new = sdscat(new,server.vm_swap_file);
7294 new = sdscatprintf(new,"%ld",(long) getpid());
7295 new = sdscat(new,p+2);
7296 zfree(server.vm_swap_file);
7297 server.vm_swap_file = new;
7298 }
7299
7300 static void vmInit(void) {
7301 off_t totsize;
7302 int pipefds[2];
7303 size_t stacksize;
7304
7305 if (server.vm_max_threads != 0)
7306 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7307
7308 expandVmSwapFilename();
7309 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
7310 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7311 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7312 }
7313 if (server.vm_fp == NULL) {
7314 redisLog(REDIS_WARNING,
7315 "Impossible to open the swap file: %s. Exiting.",
7316 strerror(errno));
7317 exit(1);
7318 }
7319 server.vm_fd = fileno(server.vm_fp);
7320 server.vm_next_page = 0;
7321 server.vm_near_pages = 0;
7322 server.vm_stats_used_pages = 0;
7323 server.vm_stats_swapped_objects = 0;
7324 server.vm_stats_swapouts = 0;
7325 server.vm_stats_swapins = 0;
7326 totsize = server.vm_pages*server.vm_page_size;
7327 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7328 if (ftruncate(server.vm_fd,totsize) == -1) {
7329 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7330 strerror(errno));
7331 exit(1);
7332 } else {
7333 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7334 }
7335 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
7336 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
7337 (long long) (server.vm_pages+7)/8, server.vm_pages);
7338 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
7339
7340 /* Initialize threaded I/O (used by Virtual Memory) */
7341 server.io_newjobs = listCreate();
7342 server.io_processing = listCreate();
7343 server.io_processed = listCreate();
7344 server.io_ready_clients = listCreate();
7345 pthread_mutex_init(&server.io_mutex,NULL);
7346 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7347 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
7348 server.io_active_threads = 0;
7349 if (pipe(pipefds) == -1) {
7350 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7351 ,strerror(errno));
7352 exit(1);
7353 }
7354 server.io_ready_pipe_read = pipefds[0];
7355 server.io_ready_pipe_write = pipefds[1];
7356 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
7357 /* LZF requires a lot of stack */
7358 pthread_attr_init(&server.io_threads_attr);
7359 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7360 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7361 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
7362 /* Listen for events in the threaded I/O pipe */
7363 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7364 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7365 oom("creating file event");
7366 }
7367
7368 /* Mark the page as used */
7369 static void vmMarkPageUsed(off_t page) {
7370 off_t byte = page/8;
7371 int bit = page&7;
7372 redisAssert(vmFreePage(page) == 1);
7373 server.vm_bitmap[byte] |= 1<<bit;
7374 }
7375
7376 /* Mark N contiguous pages as used, with 'page' being the first. */
7377 static void vmMarkPagesUsed(off_t page, off_t count) {
7378 off_t j;
7379
7380 for (j = 0; j < count; j++)
7381 vmMarkPageUsed(page+j);
7382 server.vm_stats_used_pages += count;
7383 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7384 (long long)count, (long long)page);
7385 }
7386
7387 /* Mark the page as free */
7388 static void vmMarkPageFree(off_t page) {
7389 off_t byte = page/8;
7390 int bit = page&7;
7391 redisAssert(vmFreePage(page) == 0);
7392 server.vm_bitmap[byte] &= ~(1<<bit);
7393 }
7394
7395 /* Mark N contiguous pages as free, with 'page' being the first. */
7396 static void vmMarkPagesFree(off_t page, off_t count) {
7397 off_t j;
7398
7399 for (j = 0; j < count; j++)
7400 vmMarkPageFree(page+j);
7401 server.vm_stats_used_pages -= count;
7402 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7403 (long long)count, (long long)page);
7404 }
7405
7406 /* Test if the page is free */
7407 static int vmFreePage(off_t page) {
7408 off_t byte = page/8;
7409 int bit = page&7;
7410 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
7411 }
7412
7413 /* Find N contiguous free pages storing the first page of the cluster in *first.
7414 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7415 * REDIS_ERR is returned.
7416 *
7417 * This function uses a simple algorithm: we try to allocate
7418 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7419 * again from the start of the swap file searching for free spaces.
7420 *
7421 * If it looks pretty clear that there are no free pages near our offset
7422 * we try to find less populated places doing a forward jump of
7423 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7424 * without hurry, and then we jump again and so forth...
7425 *
7426 * This function can be improved using a free list to avoid to guess
7427 * too much, since we could collect data about freed pages.
7428 *
7429 * note: I implemented this function just after watching an episode of
7430 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7431 */
7432 static int vmFindContiguousPages(off_t *first, off_t n) {
7433 off_t base, offset = 0, since_jump = 0, numfree = 0;
7434
7435 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7436 server.vm_near_pages = 0;
7437 server.vm_next_page = 0;
7438 }
7439 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7440 base = server.vm_next_page;
7441
7442 while(offset < server.vm_pages) {
7443 off_t this = base+offset;
7444
7445 /* If we overflow, restart from page zero */
7446 if (this >= server.vm_pages) {
7447 this -= server.vm_pages;
7448 if (this == 0) {
7449 /* Just overflowed, what we found on tail is no longer
7450 * interesting, as it's no longer contiguous. */
7451 numfree = 0;
7452 }
7453 }
7454 if (vmFreePage(this)) {
7455 /* This is a free page */
7456 numfree++;
7457 /* Already got N free pages? Return to the caller, with success */
7458 if (numfree == n) {
7459 *first = this-(n-1);
7460 server.vm_next_page = this+1;
7461 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
7462 return REDIS_OK;
7463 }
7464 } else {
7465 /* The current one is not a free page */
7466 numfree = 0;
7467 }
7468
7469 /* Fast-forward if the current page is not free and we already
7470 * searched enough near this place. */
7471 since_jump++;
7472 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7473 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7474 since_jump = 0;
7475 /* Note that even if we rewind after the jump, we are don't need
7476 * to make sure numfree is set to zero as we only jump *if* it
7477 * is set to zero. */
7478 } else {
7479 /* Otherwise just check the next page */
7480 offset++;
7481 }
7482 }
7483 return REDIS_ERR;
7484 }
7485
7486 /* Write the specified object at the specified page of the swap file */
7487 static int vmWriteObjectOnSwap(robj *o, off_t page) {
7488 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7489 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7490 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7491 redisLog(REDIS_WARNING,
7492 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7493 strerror(errno));
7494 return REDIS_ERR;
7495 }
7496 rdbSaveObject(server.vm_fp,o);
7497 fflush(server.vm_fp);
7498 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7499 return REDIS_OK;
7500 }
7501
7502 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7503 * needed to later retrieve the object into the key object.
7504 * If we can't find enough contiguous empty pages to swap the object on disk
7505 * REDIS_ERR is returned. */
7506 static int vmSwapObjectBlocking(robj *key, robj *val) {
7507 off_t pages = rdbSavedObjectPages(val,NULL);
7508 off_t page;
7509
7510 assert(key->storage == REDIS_VM_MEMORY);
7511 assert(key->refcount == 1);
7512 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
7513 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
7514 key->vm.page = page;
7515 key->vm.usedpages = pages;
7516 key->storage = REDIS_VM_SWAPPED;
7517 key->vtype = val->type;
7518 decrRefCount(val); /* Deallocate the object from memory. */
7519 vmMarkPagesUsed(page,pages);
7520 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7521 (unsigned char*) key->ptr,
7522 (unsigned long long) page, (unsigned long long) pages);
7523 server.vm_stats_swapped_objects++;
7524 server.vm_stats_swapouts++;
7525 return REDIS_OK;
7526 }
7527
7528 static robj *vmReadObjectFromSwap(off_t page, int type) {
7529 robj *o;
7530
7531 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7532 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7533 redisLog(REDIS_WARNING,
7534 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7535 strerror(errno));
7536 _exit(1);
7537 }
7538 o = rdbLoadObject(type,server.vm_fp);
7539 if (o == NULL) {
7540 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
7541 _exit(1);
7542 }
7543 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7544 return o;
7545 }
7546
7547 /* Load the value object relative to the 'key' object from swap to memory.
7548 * The newly allocated object is returned.
7549 *
7550 * If preview is true the unserialized object is returned to the caller but
7551 * no changes are made to the key object, nor the pages are marked as freed */
7552 static robj *vmGenericLoadObject(robj *key, int preview) {
7553 robj *val;
7554
7555 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
7556 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7557 if (!preview) {
7558 key->storage = REDIS_VM_MEMORY;
7559 key->vm.atime = server.unixtime;
7560 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7561 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7562 (unsigned char*) key->ptr);
7563 server.vm_stats_swapped_objects--;
7564 } else {
7565 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7566 (unsigned char*) key->ptr);
7567 }
7568 server.vm_stats_swapins++;
7569 return val;
7570 }
7571
7572 /* Plain object loading, from swap to memory */
7573 static robj *vmLoadObject(robj *key) {
7574 /* If we are loading the object in background, stop it, we
7575 * need to load this object synchronously ASAP. */
7576 if (key->storage == REDIS_VM_LOADING)
7577 vmCancelThreadedIOJob(key);
7578 return vmGenericLoadObject(key,0);
7579 }
7580
7581 /* Just load the value on disk, without to modify the key.
7582 * This is useful when we want to perform some operation on the value
7583 * without to really bring it from swap to memory, like while saving the
7584 * dataset or rewriting the append only log. */
7585 static robj *vmPreviewObject(robj *key) {
7586 return vmGenericLoadObject(key,1);
7587 }
7588
7589 /* How a good candidate is this object for swapping?
7590 * The better candidate it is, the greater the returned value.
7591 *
7592 * Currently we try to perform a fast estimation of the object size in
7593 * memory, and combine it with aging informations.
7594 *
7595 * Basically swappability = idle-time * log(estimated size)
7596 *
7597 * Bigger objects are preferred over smaller objects, but not
7598 * proportionally, this is why we use the logarithm. This algorithm is
7599 * just a first try and will probably be tuned later. */
7600 static double computeObjectSwappability(robj *o) {
7601 time_t age = server.unixtime - o->vm.atime;
7602 long asize = 0;
7603 list *l;
7604 dict *d;
7605 struct dictEntry *de;
7606 int z;
7607
7608 if (age <= 0) return 0;
7609 switch(o->type) {
7610 case REDIS_STRING:
7611 if (o->encoding != REDIS_ENCODING_RAW) {
7612 asize = sizeof(*o);
7613 } else {
7614 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7615 }
7616 break;
7617 case REDIS_LIST:
7618 l = o->ptr;
7619 listNode *ln = listFirst(l);
7620
7621 asize = sizeof(list);
7622 if (ln) {
7623 robj *ele = ln->value;
7624 long elesize;
7625
7626 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7627 (sizeof(*o)+sdslen(ele->ptr)) :
7628 sizeof(*o);
7629 asize += (sizeof(listNode)+elesize)*listLength(l);
7630 }
7631 break;
7632 case REDIS_SET:
7633 case REDIS_ZSET:
7634 z = (o->type == REDIS_ZSET);
7635 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7636
7637 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7638 if (z) asize += sizeof(zset)-sizeof(dict);
7639 if (dictSize(d)) {
7640 long elesize;
7641 robj *ele;
7642
7643 de = dictGetRandomKey(d);
7644 ele = dictGetEntryKey(de);
7645 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7646 (sizeof(*o)+sdslen(ele->ptr)) :
7647 sizeof(*o);
7648 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7649 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7650 }
7651 break;
7652 }
7653 return (double)age*log(1+asize);
7654 }
7655
7656 /* Try to swap an object that's a good candidate for swapping.
7657 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7658 * to swap any object at all.
7659 *
7660 * If 'usethreaded' is true, Redis will try to swap the object in background
7661 * using I/O threads. */
7662 static int vmSwapOneObject(int usethreads) {
7663 int j, i;
7664 struct dictEntry *best = NULL;
7665 double best_swappability = 0;
7666 redisDb *best_db = NULL;
7667 robj *key, *val;
7668
7669 for (j = 0; j < server.dbnum; j++) {
7670 redisDb *db = server.db+j;
7671 /* Why maxtries is set to 100?
7672 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7673 * are swappable objects */
7674 int maxtries = 100;
7675
7676 if (dictSize(db->dict) == 0) continue;
7677 for (i = 0; i < 5; i++) {
7678 dictEntry *de;
7679 double swappability;
7680
7681 if (maxtries) maxtries--;
7682 de = dictGetRandomKey(db->dict);
7683 key = dictGetEntryKey(de);
7684 val = dictGetEntryVal(de);
7685 /* Only swap objects that are currently in memory.
7686 *
7687 * Also don't swap shared objects if threaded VM is on, as we
7688 * try to ensure that the main thread does not touch the
7689 * object while the I/O thread is using it, but we can't
7690 * control other keys without adding additional mutex. */
7691 if (key->storage != REDIS_VM_MEMORY ||
7692 (server.vm_max_threads != 0 && val->refcount != 1)) {
7693 if (maxtries) i--; /* don't count this try */
7694 continue;
7695 }
7696 swappability = computeObjectSwappability(val);
7697 if (!best || swappability > best_swappability) {
7698 best = de;
7699 best_swappability = swappability;
7700 best_db = db;
7701 }
7702 }
7703 }
7704 if (best == NULL) return REDIS_ERR;
7705 key = dictGetEntryKey(best);
7706 val = dictGetEntryVal(best);
7707
7708 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
7709 key->ptr, best_swappability);
7710
7711 /* Unshare the key if needed */
7712 if (key->refcount > 1) {
7713 robj *newkey = dupStringObject(key);
7714 decrRefCount(key);
7715 key = dictGetEntryKey(best) = newkey;
7716 }
7717 /* Swap it */
7718 if (usethreads) {
7719 vmSwapObjectThreaded(key,val,best_db);
7720 return REDIS_OK;
7721 } else {
7722 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7723 dictGetEntryVal(best) = NULL;
7724 return REDIS_OK;
7725 } else {
7726 return REDIS_ERR;
7727 }
7728 }
7729 }
7730
/* Swap one object out without using the I/O threads: calls
 * vmSwapOneObject() with 'usethreads' set to 0 and returns its result. */
static int vmSwapOneObjectBlocking() {
    int retval = vmSwapOneObject(0);

    return retval;
}
7734
/* Swap one object out using the I/O threads: calls vmSwapOneObject() with
 * 'usethreads' set to 1 and returns its result. */
static int vmSwapOneObjectThreaded() {
    int retval = vmSwapOneObject(1);

    return retval;
}
7738
7739 /* Return true if it's safe to swap out objects in a given moment.
7740 * Basically we don't want to swap objects out while there is a BGSAVE
7741 * or a BGAEOREWRITE running in backgroud. */
7742 static int vmCanSwapOut(void) {
7743 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7744 }
7745
7746 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7747 * and was deleted. Otherwise 0 is returned. */
7748 static int deleteIfSwapped(redisDb *db, robj *key) {
7749 dictEntry *de;
7750 robj *foundkey;
7751
7752 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7753 foundkey = dictGetEntryKey(de);
7754 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7755 deleteKey(db,key);
7756 return 1;
7757 }
7758
7759 /* =================== Virtual Memory - Threaded I/O ======================= */
7760
/* Release an I/O job.
 *
 * For PREPARE_SWAP, DO_SWAP and LOAD jobs the reference the job holds on
 * the value is dropped (when a value is attached). The reference on the
 * job's key is always dropped, then the job structure itself is freed. */
static void freeIOJob(iojob *j) {
    int holdsval = j->type == REDIS_IOJOB_PREPARE_SWAP ||
                   j->type == REDIS_IOJOB_DO_SWAP ||
                   j->type == REDIS_IOJOB_LOAD;

    if (holdsval && j->val != NULL) decrRefCount(j->val);
    decrRefCount(j->key);
    zfree(j);
}
7769
7770 /* Every time a thread finished a Job, it writes a byte into the write side
7771 * of an unix pipe in order to "awake" the main thread, and this function
7772 * is called. */
7773 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7774 int mask)
7775 {
7776 char buf[1];
7777 int retval, processed = 0, toprocess = -1, trytoswap = 1;
7778 REDIS_NOTUSED(el);
7779 REDIS_NOTUSED(mask);
7780 REDIS_NOTUSED(privdata);
7781
7782 /* For every byte we read in the read side of the pipe, there is one
7783 * I/O job completed to process. */
7784 while((retval = read(fd,buf,1)) == 1) {
7785 iojob *j;
7786 listNode *ln;
7787 robj *key;
7788 struct dictEntry *de;
7789
7790 redisLog(REDIS_DEBUG,"Processing I/O completed job");
7791
7792 /* Get the processed element (the oldest one) */
7793 lockThreadedIO();
7794 assert(listLength(server.io_processed) != 0);
7795 if (toprocess == -1) {
7796 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
7797 if (toprocess <= 0) toprocess = 1;
7798 }
7799 ln = listFirst(server.io_processed);
7800 j = ln->value;
7801 listDelNode(server.io_processed,ln);
7802 unlockThreadedIO();
7803 /* If this job is marked as canceled, just ignore it */
7804 if (j->canceled) {
7805 freeIOJob(j);
7806 continue;
7807 }
7808 /* Post process it in the main thread, as there are things we
7809 * can do just here to avoid race conditions and/or invasive locks */
7810 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
7811 de = dictFind(j->db->dict,j->key);
7812 assert(de != NULL);
7813 key = dictGetEntryKey(de);
7814 if (j->type == REDIS_IOJOB_LOAD) {
7815 redisDb *db;
7816
7817 /* Key loaded, bring it at home */
7818 key->storage = REDIS_VM_MEMORY;
7819 key->vm.atime = server.unixtime;
7820 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7821 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
7822 (unsigned char*) key->ptr);
7823 server.vm_stats_swapped_objects--;
7824 server.vm_stats_swapins++;
7825 dictGetEntryVal(de) = j->val;
7826 incrRefCount(j->val);
7827 db = j->db;
7828 freeIOJob(j);
7829 /* Handle clients waiting for this key to be loaded. */
7830 handleClientsBlockedOnSwappedKey(db,key);
7831 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7832 /* Now we know the amount of pages required to swap this object.
7833 * Let's find some space for it, and queue this task again
7834 * rebranded as REDIS_IOJOB_DO_SWAP. */
7835 if (!vmCanSwapOut() ||
7836 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
7837 {
7838 /* Ooops... no space or we can't swap as there is
7839 * a fork()ed Redis trying to save stuff on disk. */
7840 freeIOJob(j);
7841 key->storage = REDIS_VM_MEMORY; /* undo operation */
7842 } else {
7843 /* Note that we need to mark this pages as used now,
7844 * if the job will be canceled, we'll mark them as freed
7845 * again. */
7846 vmMarkPagesUsed(j->page,j->pages);
7847 j->type = REDIS_IOJOB_DO_SWAP;
7848 lockThreadedIO();
7849 queueIOJob(j);
7850 unlockThreadedIO();
7851 }
7852 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
7853 robj *val;
7854
7855 /* Key swapped. We can finally free some memory. */
7856 if (key->storage != REDIS_VM_SWAPPING) {
7857 printf("key->storage: %d\n",key->storage);
7858 printf("key->name: %s\n",(char*)key->ptr);
7859 printf("key->refcount: %d\n",key->refcount);
7860 printf("val: %p\n",(void*)j->val);
7861 printf("val->type: %d\n",j->val->type);
7862 printf("val->ptr: %s\n",(char*)j->val->ptr);
7863 }
7864 redisAssert(key->storage == REDIS_VM_SWAPPING);
7865 val = dictGetEntryVal(de);
7866 key->vm.page = j->page;
7867 key->vm.usedpages = j->pages;
7868 key->storage = REDIS_VM_SWAPPED;
7869 key->vtype = j->val->type;
7870 decrRefCount(val); /* Deallocate the object from memory. */
7871 dictGetEntryVal(de) = NULL;
7872 redisLog(REDIS_DEBUG,
7873 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7874 (unsigned char*) key->ptr,
7875 (unsigned long long) j->page, (unsigned long long) j->pages);
7876 server.vm_stats_swapped_objects++;
7877 server.vm_stats_swapouts++;
7878 freeIOJob(j);
7879 /* Put a few more swap requests in queue if we are still
7880 * out of memory */
7881 if (trytoswap && vmCanSwapOut() &&
7882 zmalloc_used_memory() > server.vm_max_memory)
7883 {
7884 int more = 1;
7885 while(more) {
7886 lockThreadedIO();
7887 more = listLength(server.io_newjobs) <
7888 (unsigned) server.vm_max_threads;
7889 unlockThreadedIO();
7890 /* Don't waste CPU time if swappable objects are rare. */
7891 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
7892 trytoswap = 0;
7893 break;
7894 }
7895 }
7896 }
7897 }
7898 processed++;
7899 if (processed == toprocess) return;
7900 }
7901 if (retval < 0 && errno != EAGAIN) {
7902 redisLog(REDIS_WARNING,
7903 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7904 strerror(errno));
7905 }
7906 }
7907
7908 static void lockThreadedIO(void) {
7909 pthread_mutex_lock(&server.io_mutex);
7910 }
7911
7912 static void unlockThreadedIO(void) {
7913 pthread_mutex_unlock(&server.io_mutex);
7914 }
7915
7916 /* Remove the specified object from the threaded I/O queue if still not
7917 * processed, otherwise make sure to flag it as canceled. */
7918 static void vmCancelThreadedIOJob(robj *o) {
7919 list *lists[3] = {
7920 server.io_newjobs, /* 0 */
7921 server.io_processing, /* 1 */
7922 server.io_processed /* 2 */
7923 };
7924 int i;
7925
7926 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
7927 again:
7928 lockThreadedIO();
7929 /* Search for a matching key in one of the queues */
7930 for (i = 0; i < 3; i++) {
7931 listNode *ln;
7932 listIter li;
7933
7934 listRewind(lists[i],&li);
7935 while ((ln = listNext(&li)) != NULL) {
7936 iojob *job = ln->value;
7937
7938 if (job->canceled) continue; /* Skip this, already canceled. */
7939 if (compareStringObjects(job->key,o) == 0) {
7940 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
7941 (void*)job, (char*)o->ptr, job->type, i);
7942 /* Mark the pages as free since the swap didn't happened
7943 * or happened but is now discarded. */
7944 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
7945 vmMarkPagesFree(job->page,job->pages);
7946 /* Cancel the job. It depends on the list the job is
7947 * living in. */
7948 switch(i) {
7949 case 0: /* io_newjobs */
7950 /* If the job was yet not processed the best thing to do
7951 * is to remove it from the queue at all */
7952 freeIOJob(job);
7953 listDelNode(lists[i],ln);
7954 break;
7955 case 1: /* io_processing */
7956 /* Oh Shi- the thread is messing with the Job:
7957 *
7958 * Probably it's accessing the object if this is a
7959 * PREPARE_SWAP or DO_SWAP job.
7960 * If it's a LOAD job it may be reading from disk and
7961 * if we don't wait for the job to terminate before to
7962 * cancel it, maybe in a few microseconds data can be
7963 * corrupted in this pages. So the short story is:
7964 *
7965 * Better to wait for the job to move into the
7966 * next queue (processed)... */
7967
7968 /* We try again and again until the job is completed. */
7969 unlockThreadedIO();
7970 /* But let's wait some time for the I/O thread
7971 * to finish with this job. After all this condition
7972 * should be very rare. */
7973 usleep(1);
7974 goto again;
7975 case 2: /* io_processed */
7976 /* The job was already processed, that's easy...
7977 * just mark it as canceled so that we'll ignore it
7978 * when processing completed jobs. */
7979 job->canceled = 1;
7980 break;
7981 }
7982 /* Finally we have to adjust the storage type of the object
7983 * in order to "UNDO" the operaiton. */
7984 if (o->storage == REDIS_VM_LOADING)
7985 o->storage = REDIS_VM_SWAPPED;
7986 else if (o->storage == REDIS_VM_SWAPPING)
7987 o->storage = REDIS_VM_MEMORY;
7988 unlockThreadedIO();
7989 return;
7990 }
7991 }
7992 }
7993 unlockThreadedIO();
7994 assert(1 != 1); /* We should never reach this */
7995 }
7996
7997 static void *IOThreadEntryPoint(void *arg) {
7998 iojob *j;
7999 listNode *ln;
8000 REDIS_NOTUSED(arg);
8001
8002 pthread_detach(pthread_self());
8003 while(1) {
8004 /* Get a new job to process */
8005 lockThreadedIO();
8006 if (listLength(server.io_newjobs) == 0) {
8007 /* No new jobs in queue, exit. */
8008 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8009 (long) pthread_self());
8010 server.io_active_threads--;
8011 unlockThreadedIO();
8012 return NULL;
8013 }
8014 ln = listFirst(server.io_newjobs);
8015 j = ln->value;
8016 listDelNode(server.io_newjobs,ln);
8017 /* Add the job in the processing queue */
8018 j->thread = pthread_self();
8019 listAddNodeTail(server.io_processing,j);
8020 ln = listLast(server.io_processing); /* We use ln later to remove it */
8021 unlockThreadedIO();
8022 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8023 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8024
8025 /* Process the Job */
8026 if (j->type == REDIS_IOJOB_LOAD) {
8027 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8028 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8029 FILE *fp = fopen("/dev/null","w+");
8030 j->pages = rdbSavedObjectPages(j->val,fp);
8031 fclose(fp);
8032 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8033 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8034 j->canceled = 1;
8035 }
8036
8037 /* Done: insert the job into the processed queue */
8038 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8039 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8040 lockThreadedIO();
8041 listDelNode(server.io_processing,ln);
8042 listAddNodeTail(server.io_processed,j);
8043 unlockThreadedIO();
8044
8045 /* Signal the main thread there is new stuff to process */
8046 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8047 }
8048 return NULL; /* never reached */
8049 }
8050
8051 static void spawnIOThread(void) {
8052 pthread_t thread;
8053 sigset_t mask, omask;
8054
8055 sigemptyset(&mask);
8056 sigaddset(&mask,SIGCHLD);
8057 sigaddset(&mask,SIGHUP);
8058 sigaddset(&mask,SIGPIPE);
8059 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8060 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8061 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8062 server.io_active_threads++;
8063 }
8064
8065 /* We need to wait for the last thread to exit before we are able to
8066 * fork() in order to BGSAVE or BGREWRITEAOF. */
8067 static void waitEmptyIOJobsQueue(void) {
8068 while(1) {
8069 int io_processed_len;
8070
8071 lockThreadedIO();
8072 if (listLength(server.io_newjobs) == 0 &&
8073 listLength(server.io_processing) == 0 &&
8074 server.io_active_threads == 0)
8075 {
8076 unlockThreadedIO();
8077 return;
8078 }
8079 /* While waiting for empty jobs queue condition we post-process some
8080 * finshed job, as I/O threads may be hanging trying to write against
8081 * the io_ready_pipe_write FD but there are so much pending jobs that
8082 * it's blocking. */
8083 io_processed_len = listLength(server.io_processed);
8084 unlockThreadedIO();
8085 if (io_processed_len) {
8086 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8087 usleep(1000); /* 1 millisecond */
8088 } else {
8089 usleep(10000); /* 10 milliseconds */
8090 }
8091 }
8092 }
8093
8094 static void vmReopenSwapFile(void) {
8095 /* Note: we don't close the old one as we are in the child process
8096 * and don't want to mess at all with the original file object. */
8097 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8098 if (server.vm_fp == NULL) {
8099 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8100 server.vm_swap_file);
8101 _exit(1);
8102 }
8103 server.vm_fd = fileno(server.vm_fp);
8104 }
8105
8106 /* This function must be called while with threaded IO locked */
8107 static void queueIOJob(iojob *j) {
8108 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8109 (void*)j, j->type, (char*)j->key->ptr);
8110 listAddNodeTail(server.io_newjobs,j);
8111 if (server.io_active_threads < server.vm_max_threads)
8112 spawnIOThread();
8113 }
8114
8115 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8116 iojob *j;
8117
8118 assert(key->storage == REDIS_VM_MEMORY);
8119 assert(key->refcount == 1);
8120
8121 j = zmalloc(sizeof(*j));
8122 j->type = REDIS_IOJOB_PREPARE_SWAP;
8123 j->db = db;
8124 j->key = dupStringObject(key);
8125 j->val = val;
8126 incrRefCount(val);
8127 j->canceled = 0;
8128 j->thread = (pthread_t) -1;
8129 key->storage = REDIS_VM_SWAPPING;
8130
8131 lockThreadedIO();
8132 queueIOJob(j);
8133 unlockThreadedIO();
8134 return REDIS_OK;
8135 }
8136
8137 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8138
8139 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8140 * If there is not already a job loading the key, it is craeted.
8141 * The key is added to the io_keys list in the client structure, and also
8142 * in the hash table mapping swapped keys to waiting clients, that is,
8143 * server.io_waited_keys. */
8144 static int waitForSwappedKey(redisClient *c, robj *key) {
8145 struct dictEntry *de;
8146 robj *o;
8147 list *l;
8148
8149 /* If the key does not exist or is already in RAM we don't need to
8150 * block the client at all. */
8151 de = dictFind(c->db->dict,key);
8152 if (de == NULL) return 0;
8153 o = dictGetEntryKey(de);
8154 if (o->storage == REDIS_VM_MEMORY) {
8155 return 0;
8156 } else if (o->storage == REDIS_VM_SWAPPING) {
8157 /* We were swapping the key, undo it! */
8158 vmCancelThreadedIOJob(o);
8159 return 0;
8160 }
8161
8162 /* OK: the key is either swapped, or being loaded just now. */
8163
8164 /* Add the key to the list of keys this client is waiting for.
8165 * This maps clients to keys they are waiting for. */
8166 listAddNodeTail(c->io_keys,key);
8167 incrRefCount(key);
8168
8169 /* Add the client to the swapped keys => clients waiting map. */
8170 de = dictFind(c->db->io_keys,key);
8171 if (de == NULL) {
8172 int retval;
8173
8174 /* For every key we take a list of clients blocked for it */
8175 l = listCreate();
8176 retval = dictAdd(c->db->io_keys,key,l);
8177 incrRefCount(key);
8178 assert(retval == DICT_OK);
8179 } else {
8180 l = dictGetEntryVal(de);
8181 }
8182 listAddNodeTail(l,c);
8183
8184 /* Are we already loading the key from disk? If not create a job */
8185 if (o->storage == REDIS_VM_SWAPPED) {
8186 iojob *j;
8187
8188 o->storage = REDIS_VM_LOADING;
8189 j = zmalloc(sizeof(*j));
8190 j->type = REDIS_IOJOB_LOAD;
8191 j->db = c->db;
8192 j->key = dupStringObject(key);
8193 j->key->vtype = o->vtype;
8194 j->page = o->vm.page;
8195 j->val = NULL;
8196 j->canceled = 0;
8197 j->thread = (pthread_t) -1;
8198 lockThreadedIO();
8199 queueIOJob(j);
8200 unlockThreadedIO();
8201 }
8202 return 1;
8203 }
8204
8205 /* Is this client attempting to run a command against swapped keys?
8206 * If so, block it ASAP, load the keys in background, then resume it.
8207 *
8208 * The important idea about this function is that it can fail! If keys will
8209 * still be swapped when the client is resumed, this key lookups will
8210 * just block loading keys from disk. In practical terms this should only
8211 * happen with SORT BY command or if there is a bug in this function.
8212 *
8213 * Return 1 if the client is marked as blocked, 0 if the client can
8214 * continue as the keys it is going to access appear to be in memory. */
8215 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8216 int j, last;
8217
8218 if (cmd->vm_firstkey == 0) return 0;
8219 last = cmd->vm_lastkey;
8220 if (last < 0) last = c->argc+last;
8221 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8222 waitForSwappedKey(c,c->argv[j]);
8223 /* If the client was blocked for at least one key, mark it as blocked. */
8224 if (listLength(c->io_keys)) {
8225 c->flags |= REDIS_IO_WAIT;
8226 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8227 server.vm_blocked_clients++;
8228 return 1;
8229 } else {
8230 return 0;
8231 }
8232 }
8233
8234 /* Remove the 'key' from the list of blocked keys for a given client.
8235 *
8236 * The function returns 1 when there are no longer blocking keys after
8237 * the current one was removed (and the client can be unblocked). */
8238 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8239 list *l;
8240 listNode *ln;
8241 listIter li;
8242 struct dictEntry *de;
8243
8244 /* Remove the key from the list of keys this client is waiting for. */
8245 listRewind(c->io_keys,&li);
8246 while ((ln = listNext(&li)) != NULL) {
8247 if (compareStringObjects(ln->value,key) == 0) {
8248 listDelNode(c->io_keys,ln);
8249 break;
8250 }
8251 }
8252 assert(ln != NULL);
8253
8254 /* Remove the client form the key => waiting clients map. */
8255 de = dictFind(c->db->io_keys,key);
8256 assert(de != NULL);
8257 l = dictGetEntryVal(de);
8258 ln = listSearchKey(l,c);
8259 assert(ln != NULL);
8260 listDelNode(l,ln);
8261 if (listLength(l) == 0)
8262 dictDelete(c->db->io_keys,key);
8263
8264 return listLength(c->io_keys) == 0;
8265 }
8266
8267 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8268 struct dictEntry *de;
8269 list *l;
8270 listNode *ln;
8271 int len;
8272
8273 de = dictFind(db->io_keys,key);
8274 if (!de) return;
8275
8276 l = dictGetEntryVal(de);
8277 len = listLength(l);
8278 /* Note: we can't use something like while(listLength(l)) as the list
8279 * can be freed by the calling function when we remove the last element. */
8280 while (len--) {
8281 ln = listFirst(l);
8282 redisClient *c = ln->value;
8283
8284 if (dontWaitForSwappedKey(c,key)) {
8285 /* Put the client in the list of clients ready to go as we
8286 * loaded all the keys about it. */
8287 listAddNodeTail(server.io_ready_clients,c);
8288 }
8289 }
8290 }
8291
8292 /* ================================= Debugging ============================== */
8293
8294 static void debugCommand(redisClient *c) {
8295 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8296 *((char*)-1) = 'x';
8297 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8298 if (rdbSave(server.dbfilename) != REDIS_OK) {
8299 addReply(c,shared.err);
8300 return;
8301 }
8302 emptyDb();
8303 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8304 addReply(c,shared.err);
8305 return;
8306 }
8307 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8308 addReply(c,shared.ok);
8309 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8310 emptyDb();
8311 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8312 addReply(c,shared.err);
8313 return;
8314 }
8315 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8316 addReply(c,shared.ok);
8317 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8318 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8319 robj *key, *val;
8320
8321 if (!de) {
8322 addReply(c,shared.nokeyerr);
8323 return;
8324 }
8325 key = dictGetEntryKey(de);
8326 val = dictGetEntryVal(de);
8327 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8328 key->storage == REDIS_VM_SWAPPING)) {
8329 addReplySds(c,sdscatprintf(sdsempty(),
8330 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8331 "encoding:%d serializedlength:%lld\r\n",
8332 (void*)key, key->refcount, (void*)val, val->refcount,
8333 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
8334 } else {
8335 addReplySds(c,sdscatprintf(sdsempty(),
8336 "+Key at:%p refcount:%d, value swapped at: page %llu "
8337 "using %llu pages\r\n",
8338 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8339 (unsigned long long) key->vm.usedpages));
8340 }
8341 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8342 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8343 robj *key, *val;
8344
8345 if (!server.vm_enabled) {
8346 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8347 return;
8348 }
8349 if (!de) {
8350 addReply(c,shared.nokeyerr);
8351 return;
8352 }
8353 key = dictGetEntryKey(de);
8354 val = dictGetEntryVal(de);
8355 /* If the key is shared we want to create a copy */
8356 if (key->refcount > 1) {
8357 robj *newkey = dupStringObject(key);
8358 decrRefCount(key);
8359 key = dictGetEntryKey(de) = newkey;
8360 }
8361 /* Swap it */
8362 if (key->storage != REDIS_VM_MEMORY) {
8363 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
8364 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8365 dictGetEntryVal(de) = NULL;
8366 addReply(c,shared.ok);
8367 } else {
8368 addReply(c,shared.err);
8369 }
8370 } else {
8371 addReplySds(c,sdsnew(
8372 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8373 }
8374 }
8375
8376 static void _redisAssert(char *estr, char *file, int line) {
8377 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
8378 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
8379 #ifdef HAVE_BACKTRACE
8380 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8381 *((char*)-1) = 'x';
8382 #endif
8383 }
8384
8385 /* =================================== Main! ================================ */
8386
8387 #ifdef __linux__
/* Read the current overcommit policy from /proc/sys/vm/overcommit_memory.
 *
 * Returns the integer found at the start of the first line of the file, or
 * -1 if the file can't be opened, can't be read, or does not start with a
 * number.
 *
 * strtol() is used instead of atoi() because atoi() can't report a failed
 * conversion: unparsable content would be returned as 0, which is exactly
 * the value the caller treats as the problematic setting. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64], *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int) value;
}
8401
8402 void linuxOvercommitMemoryWarning(void) {
8403 if (linuxOvercommitMemoryValue() == 0) {
8404 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8405 }
8406 }
8407 #endif /* __linux__ */
8408
8409 static void daemonize(void) {
8410 int fd;
8411 FILE *fp;
8412
8413 if (fork() != 0) exit(0); /* parent exits */
8414 setsid(); /* create a new session */
8415
8416 /* Every output goes to /dev/null. If Redis is daemonized but
8417 * the 'logfile' is set to 'stdout' in the configuration file
8418 * it will not log at all. */
8419 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8420 dup2(fd, STDIN_FILENO);
8421 dup2(fd, STDOUT_FILENO);
8422 dup2(fd, STDERR_FILENO);
8423 if (fd > STDERR_FILENO) close(fd);
8424 }
8425 /* Try to write the pid file */
8426 fp = fopen(server.pidfile,"w");
8427 if (fp) {
8428 fprintf(fp,"%d\n",getpid());
8429 fclose(fp);
8430 }
8431 }
8432
8433 int main(int argc, char **argv) {
8434 time_t start;
8435
8436 initServerConfig();
8437 if (argc == 2) {
8438 resetServerSaveParams();
8439 loadServerConfig(argv[1]);
8440 } else if (argc > 2) {
8441 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8442 exit(1);
8443 } else {
8444 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8445 }
8446 if (server.daemonize) daemonize();
8447 initServer();
8448 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8449 #ifdef __linux__
8450 linuxOvercommitMemoryWarning();
8451 #endif
8452 start = time(NULL);
8453 if (server.appendonly) {
8454 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
8455 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
8456 } else {
8457 if (rdbLoad(server.dbfilename) == REDIS_OK)
8458 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
8459 }
8460 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
8461 aeSetBeforeSleepProc(server.el,beforeSleep);
8462 aeMain(server.el);
8463 aeDeleteEventLoop(server.el);
8464 return 0;
8465 }
8466
8467 /* ============================= Backtrace support ========================= */
8468
8469 #ifdef HAVE_BACKTRACE
8470 static char *findFuncName(void *pointer, unsigned long *offset);
8471
8472 static void *getMcontextEip(ucontext_t *uc) {
8473 #if defined(__FreeBSD__)
8474 return (void*) uc->uc_mcontext.mc_eip;
8475 #elif defined(__dietlibc__)
8476 return (void*) uc->uc_mcontext.eip;
8477 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8478 #if __x86_64__
8479 return (void*) uc->uc_mcontext->__ss.__rip;
8480 #else
8481 return (void*) uc->uc_mcontext->__ss.__eip;
8482 #endif
8483 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8484 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8485 return (void*) uc->uc_mcontext->__ss.__rip;
8486 #else
8487 return (void*) uc->uc_mcontext->__ss.__eip;
8488 #endif
8489 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8490 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
8491 #elif defined(__ia64__) /* Linux IA64 */
8492 return (void*) uc->uc_mcontext.sc_ip;
8493 #else
8494 return NULL;
8495 #endif
8496 }
8497
8498 static void segvHandler(int sig, siginfo_t *info, void *secret) {
8499 void *trace[100];
8500 char **messages = NULL;
8501 int i, trace_size = 0;
8502 unsigned long offset=0;
8503 ucontext_t *uc = (ucontext_t*) secret;
8504 sds infostring;
8505 REDIS_NOTUSED(info);
8506
8507 redisLog(REDIS_WARNING,
8508 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
8509 infostring = genRedisInfoString();
8510 redisLog(REDIS_WARNING, "%s",infostring);
8511 /* It's not safe to sdsfree() the returned string under memory
8512 * corruption conditions. Let it leak as we are going to abort */
8513
8514 trace_size = backtrace(trace, 100);
8515 /* overwrite sigaction with caller's address */
8516 if (getMcontextEip(uc) != NULL) {
8517 trace[1] = getMcontextEip(uc);
8518 }
8519 messages = backtrace_symbols(trace, trace_size);
8520
8521 for (i=1; i<trace_size; ++i) {
8522 char *fn = findFuncName(trace[i], &offset), *p;
8523
8524 p = strchr(messages[i],'+');
8525 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8526 redisLog(REDIS_WARNING,"%s", messages[i]);
8527 } else {
8528 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8529 }
8530 }
8531 /* free(messages); Don't call free() with possibly corrupted memory. */
8532 _exit(0);
8533 }
8534
8535 static void setupSigSegvAction(void) {
8536 struct sigaction act;
8537
8538 sigemptyset (&act.sa_mask);
8539 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8540 * is used. Otherwise, sa_handler is used */
8541 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8542 act.sa_sigaction = segvHandler;
8543 sigaction (SIGSEGV, &act, NULL);
8544 sigaction (SIGBUS, &act, NULL);
8545 sigaction (SIGFPE, &act, NULL);
8546 sigaction (SIGILL, &act, NULL);
8547 sigaction (SIGBUS, &act, NULL);
8548 return;
8549 }
8550
8551 #include "staticsymbols.h"
8552 /* This function try to convert a pointer into a function name. It's used in
8553 * oreder to provide a backtrace under segmentation fault that's able to
8554 * display functions declared as static (otherwise the backtrace is useless). */
8555 static char *findFuncName(void *pointer, unsigned long *offset){
8556 int i, ret = -1;
8557 unsigned long off, minoff = 0;
8558
8559 /* Try to match against the Symbol with the smallest offset */
8560 for (i=0; symsTable[i].pointer; i++) {
8561 unsigned long lp = (unsigned long) pointer;
8562
8563 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8564 off=lp-symsTable[i].pointer;
8565 if (ret < 0 || off < minoff) {
8566 minoff=off;
8567 ret=i;
8568 }
8569 }
8570 }
8571 if (ret == -1) return NULL;
8572 *offset = minoff;
8573 return symsTable[ret].name;
8574 }
8575 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is installed
 * and the function does nothing. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
8578 #endif /* HAVE_BACKTRACE */
8579
8580
8581
8582 /* The End */
8583
8584
8585