]> git.saurik.com Git - redis.git/blob - redis.c
1abfd96b544206dfb7b3d731d575d4115e944993
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78
/* Error codes returned by most internal functions. */
#define REDIS_OK                0
#define REDIS_ERR               (-1)

/* Static server configuration */
#define REDIS_SERVERPORT        6379    /* TCP port */
#define REDIS_MAXIDLETIME       (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN         1024
#define REDIS_LOADBUF_LEN       1024
#define REDIS_STATIC_ARGS       4
#define REDIS_DEFAULT_DBNUM     16
#define REDIS_CONFIGLINE_MAX    1024
#define REDIS_OBJFREELIST_MAX   1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME     60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    100 /* try to expire 100 keys/second */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE  (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */
#define REDIS_HT_MINFILL        10      /* Minimal hash table fill 10% */
104
/* Command flags */
#define REDIS_CMD_BULK          1       /* Bulk write command */
#define REDIS_CMD_INLINE        2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM       4

/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4

/* Objects encoding */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */

/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255
129
/* Defines related to the dump file format. Storing a 32 bit length for short
 * keys would waste a lot of space, so we check the two most significant bits
 * of the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: a specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0        /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1       /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2       /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3         /* string compressed with LZF */
156
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0       /* The object is on memory */
#define REDIS_VM_SWAPPED 1      /* The object is on disk */
#define REDIS_VM_SWAPPING 2     /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3      /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
174
/* Client flags (bit values, may be OR-ed together) */
#define REDIS_SLAVE 1       /* This client is a slave server */
#define REDIS_MASTER 2      /* This client is a master server */
#define REDIS_MONITOR 4     /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8       /* This client is in a MULTI context */
#define REDIS_BLOCKED 16    /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32    /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0       /* No active replication */
#define REDIS_REPL_CONNECT 1    /* Must connect to master */
#define REDIS_REPL_CONNECTED 2  /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to end to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
196
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates V and discards the result. The argument is
 * parenthesized so that any expression can be passed safely. */
#define REDIS_NOTUSED(V) ((void) (V))

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2
223
/* We can print the stacktrace, so our assert is defined this way:
 * when the expression is false, _redisAssert() is called with the text of
 * the expression, the file name and the line number, and then the process
 * is terminated with _exit(1). When the expression is true nothing happens. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
static void _redisAssert(char *estr, char *file, int line);
227
228 /*================================= Data types ============================== */
229
230 /* A redis object, that is a type able to hold a string / list / set */
231
/* The VM object structure.
 *
 * Only the type is declared here: the original declaration ended with
 * "} vm;", which also defined a file-scope object named 'vm' with external
 * linkage. The structure is embedded in struct redisObject as the 'vm'
 * member, so the extra global was not needed. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
238
239 /* The actual Redis Object */
240 typedef struct redisObject {
241 void *ptr;
242 unsigned char type;
243 unsigned char encoding;
244 unsigned char storage; /* If this object is a key, where is the value?
245 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
246 unsigned char vtype; /* If this object is a key, and value is swapped out,
247 * this is the type of the swapped out object. */
248 int refcount;
249 /* VM fields, this are only allocated if VM is active, otherwise the
250 * object allocation function will just allocate
251 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
252 * Redis without VM active will not have any overhead. */
253 struct redisObjectVM vm;
254 } robj;
255
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable (not a pointer) to initialize, _ptr the value
 * stored in its 'ptr' field. Both arguments are parenthesized so that
 * arbitrary expressions can be passed. The 'storage' field is set only when
 * server.vm_enabled is true.
 *
 * NOTE(review): the ';' after while(0) defeats the do/while(0) idiom (the
 * macro cannot be used as the body of an unbraced if followed by else). It
 * is kept because callers are not visible here; drop it once every call
 * site is confirmed to end with its own ';'. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0);
267
268 typedef struct redisDb {
269 dict *dict; /* The keyspace for this DB */
270 dict *expires; /* Timeout of keys with a timeout set */
271 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
272 dict *io_keys; /* Keys with clients waiting for VM I/O */
273 int id;
274 } redisDb;
275
276 /* Client MULTI/EXEC state */
277 typedef struct multiCmd {
278 robj **argv;
279 int argc;
280 struct redisCommand *cmd;
281 } multiCmd;
282
283 typedef struct multiState {
284 multiCmd *commands; /* Array of MULTI commands */
285 int count; /* Total number of MULTI commands */
286 } multiState;
287
288 /* With multiplexing we need to take per-clinet state.
289 * Clients are taken in a liked list. */
290 typedef struct redisClient {
291 int fd;
292 redisDb *db;
293 int dictid;
294 sds querybuf;
295 robj **argv, **mbargv;
296 int argc, mbargc;
297 int bulklen; /* bulk read len. -1 if not in bulk read mode */
298 int multibulk; /* multi bulk command format active */
299 list *reply;
300 int sentlen;
301 time_t lastinteraction; /* time of the last interaction, used for timeout */
302 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
303 int slaveseldb; /* slave selected db, if this client is a slave */
304 int authenticated; /* when requirepass is non-NULL */
305 int replstate; /* replication state if this is a slave */
306 int repldbfd; /* replication DB file descriptor */
307 long repldboff; /* replication DB file offset */
308 off_t repldbsize; /* replication DB file size */
309 multiState mstate; /* MULTI/EXEC state */
310 robj **blockingkeys; /* The key we are waiting to terminate a blocking
311 * operation such as BLPOP. Otherwise NULL. */
312 int blockingkeysnum; /* Number of blocking keys */
313 time_t blockingto; /* Blocking operation timeout. If UNIX current time
314 * is >= blockingto then the operation timed out. */
315 list *io_keys; /* Keys this client is waiting to be loaded from the
316 * swap file in order to continue. */
317 } redisClient;
318
/* One save point: a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
323
324 /* Global server state structure */
325 struct redisServer {
326 int port;
327 int fd;
328 redisDb *db;
329 dict *sharingpool; /* Poll used for object sharing */
330 unsigned int sharingpoolsize;
331 long long dirty; /* changes to DB from the last save */
332 list *clients;
333 list *slaves, *monitors;
334 char neterr[ANET_ERR_LEN];
335 aeEventLoop *el;
336 int cronloops; /* number of times the cron function run */
337 list *objfreelist; /* A list of freed objects to avoid malloc() */
338 time_t lastsave; /* Unix time of last save succeeede */
339 /* Fields used only for stats */
340 time_t stat_starttime; /* server start time */
341 long long stat_numcommands; /* number of processed commands */
342 long long stat_numconnections; /* number of connections received */
343 /* Configuration */
344 int verbosity;
345 int glueoutputbuf;
346 int maxidletime;
347 int dbnum;
348 int daemonize;
349 int appendonly;
350 int appendfsync;
351 time_t lastfsync;
352 int appendfd;
353 int appendseldb;
354 char *pidfile;
355 pid_t bgsavechildpid;
356 pid_t bgrewritechildpid;
357 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
358 struct saveparam *saveparams;
359 int saveparamslen;
360 char *logfile;
361 char *bindaddr;
362 char *dbfilename;
363 char *appendfilename;
364 char *requirepass;
365 int shareobjects;
366 int rdbcompression;
367 /* Replication related */
368 int isslave;
369 char *masterauth;
370 char *masterhost;
371 int masterport;
372 redisClient *master; /* client that is master for this slave */
373 int replstate;
374 unsigned int maxclients;
375 unsigned long long maxmemory;
376 unsigned int blpop_blocked_clients;
377 unsigned int vm_blocked_clients;
378 /* Sort parameters - qsort_r() is only available under BSD so we
379 * have to take this state global, in order to pass it to sortCompare() */
380 int sort_desc;
381 int sort_alpha;
382 int sort_bypattern;
383 /* Virtual memory configuration */
384 int vm_enabled;
385 char *vm_swap_file;
386 off_t vm_page_size;
387 off_t vm_pages;
388 unsigned long long vm_max_memory;
389 /* Virtual memory state */
390 FILE *vm_fp;
391 int vm_fd;
392 off_t vm_next_page; /* Next probably empty page */
393 off_t vm_near_pages; /* Number of pages allocated sequentially */
394 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
395 time_t unixtime; /* Unix time sampled every second. */
396 /* Virtual memory I/O threads stuff */
397 /* An I/O thread process an element taken from the io_jobs queue and
398 * put the result of the operation in the io_done list. While the
399 * job is being processed, it's put on io_processing queue. */
400 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
401 list *io_processing; /* List of VM I/O jobs being processed */
402 list *io_processed; /* List of VM I/O jobs already processed */
403 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
404 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
405 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
406 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
407 pthread_attr_t io_threads_attr; /* attributes for threads creation */
408 int io_active_threads; /* Number of running I/O threads */
409 int vm_max_threads; /* Max number of I/O threads running at the same time */
410 /* Our main thread is blocked on the event loop, locking for sockets ready
411 * to be read or written, so when a threaded I/O operation is ready to be
412 * processed by the main thread, the I/O thread will use a unix pipe to
413 * awake the main thread. The followings are the two pipe FDs. */
414 int io_ready_pipe_read;
415 int io_ready_pipe_write;
416 /* Virtual memory stats */
417 unsigned long long vm_stats_used_pages;
418 unsigned long long vm_stats_swapped_objects;
419 unsigned long long vm_stats_swapouts;
420 unsigned long long vm_stats_swapins;
421 FILE *devnull;
422 };
423
424 typedef void redisCommandProc(redisClient *c);
425 struct redisCommand {
426 char *name;
427 redisCommandProc *proc;
428 int arity;
429 int flags;
430 /* What keys should be loaded in background when calling this command? */
431 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
432 int vm_lastkey; /* THe last argument that's a key */
433 int vm_keystep; /* The step between first and last key */
434 };
435
/* Pairs a function name with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
440
441 typedef struct _redisSortObject {
442 robj *obj;
443 union {
444 double score;
445 robj *cmpobj;
446 } u;
447 } redisSortObject;
448
449 typedef struct _redisSortOperation {
450 int type;
451 robj *pattern;
452 } redisSortOperation;
453
454 /* ZSETs use a specialized version of Skiplists */
455
456 typedef struct zskiplistNode {
457 struct zskiplistNode **forward;
458 struct zskiplistNode *backward;
459 unsigned long *span;
460 double score;
461 robj *obj;
462 } zskiplistNode;
463
/* The skiplist itself: header and tail nodes, element count and level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
469
470 typedef struct zset {
471 dict *dict;
472 zskiplist *zsl;
473 } zset;
474
475 /* Our shared "common" objects */
476
477 struct sharedObjectsStruct {
478 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
479 *colon, *nullbulk, *nullmultibulk, *queued,
480 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
481 *outofrangeerr, *plus,
482 *select0, *select1, *select2, *select3, *select4,
483 *select5, *select6, *select7, *select8, *select9;
484 } shared;
485
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
491
492 /* VM threaded I/O request message */
493 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
494 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
495 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
496 typedef struct iojob {
497 int type; /* Request type, REDIS_IOJOB_* */
498 redisDb *db;/* Redis database */
499 robj *key; /* This I/O request is about swapping this key */
500 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
501 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
502 off_t page; /* Swap page where to read/write the object */
503 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
504 int canceled; /* True if this command was canceled by blocking side of VM */
505 pthread_t thread; /* ID of the thread processing this entry */
506 } iojob;
507
508 /*================================ Prototypes =============================== */
509
510 static void freeStringObject(robj *o);
511 static void freeListObject(robj *o);
512 static void freeSetObject(robj *o);
513 static void decrRefCount(void *o);
514 static robj *createObject(int type, void *ptr);
515 static void freeClient(redisClient *c);
516 static int rdbLoad(char *filename);
517 static void addReply(redisClient *c, robj *obj);
518 static void addReplySds(redisClient *c, sds s);
519 static void incrRefCount(robj *o);
520 static int rdbSaveBackground(char *filename);
521 static robj *createStringObject(char *ptr, size_t len);
522 static robj *dupStringObject(robj *o);
523 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
524 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
525 static int syncWithMaster(void);
526 static robj *tryObjectSharing(robj *o);
527 static int tryObjectEncoding(robj *o);
528 static robj *getDecodedObject(robj *o);
529 static int removeExpire(redisDb *db, robj *key);
530 static int expireIfNeeded(redisDb *db, robj *key);
531 static int deleteIfVolatile(redisDb *db, robj *key);
532 static int deleteIfSwapped(redisDb *db, robj *key);
533 static int deleteKey(redisDb *db, robj *key);
534 static time_t getExpire(redisDb *db, robj *key);
535 static int setExpire(redisDb *db, robj *key, time_t when);
536 static void updateSlavesWaitingBgsave(int bgsaveerr);
537 static void freeMemoryIfNeeded(void);
538 static int processCommand(redisClient *c);
539 static void setupSigSegvAction(void);
540 static void rdbRemoveTempFile(pid_t childpid);
541 static void aofRemoveTempFile(pid_t childpid);
542 static size_t stringObjectLen(robj *o);
543 static void processInputBuffer(redisClient *c);
544 static zskiplist *zslCreate(void);
545 static void zslFree(zskiplist *zsl);
546 static void zslInsert(zskiplist *zsl, double score, robj *obj);
547 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
548 static void initClientMultiState(redisClient *c);
549 static void freeClientMultiState(redisClient *c);
550 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
551 static void unblockClientWaitingData(redisClient *c);
552 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
553 static void vmInit(void);
554 static void vmMarkPagesFree(off_t page, off_t count);
555 static robj *vmLoadObject(robj *key);
556 static robj *vmPreviewObject(robj *key);
557 static int vmSwapOneObjectBlocking(void);
558 static int vmSwapOneObjectThreaded(void);
559 static int vmCanSwapOut(void);
560 static int tryFreeOneObjectFromFreelist(void);
561 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
562 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
563 static void vmCancelThreadedIOJob(robj *o);
564 static void lockThreadedIO(void);
565 static void unlockThreadedIO(void);
566 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
567 static void freeIOJob(iojob *j);
568 static void queueIOJob(iojob *j);
569 static int vmWriteObjectOnSwap(robj *o, off_t page);
570 static robj *vmReadObjectFromSwap(off_t page, int type);
571 static void waitEmptyIOJobsQueue(void);
572 static void vmReopenSwapFile(void);
573 static int vmFreePage(off_t page);
574 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
575 static int dontWaitForSwappedKey(redisClient *c, robj *key);
576 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
577 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
578 static struct redisCommand *lookupCommand(char *name);
579 static void call(redisClient *c, struct redisCommand *cmd);
580 static void resetClient(redisClient *c);
581
582 static void authCommand(redisClient *c);
583 static void pingCommand(redisClient *c);
584 static void echoCommand(redisClient *c);
585 static void setCommand(redisClient *c);
586 static void setnxCommand(redisClient *c);
587 static void getCommand(redisClient *c);
588 static void delCommand(redisClient *c);
589 static void existsCommand(redisClient *c);
590 static void incrCommand(redisClient *c);
591 static void decrCommand(redisClient *c);
592 static void incrbyCommand(redisClient *c);
593 static void decrbyCommand(redisClient *c);
594 static void selectCommand(redisClient *c);
595 static void randomkeyCommand(redisClient *c);
596 static void keysCommand(redisClient *c);
597 static void dbsizeCommand(redisClient *c);
598 static void lastsaveCommand(redisClient *c);
599 static void saveCommand(redisClient *c);
600 static void bgsaveCommand(redisClient *c);
601 static void bgrewriteaofCommand(redisClient *c);
602 static void shutdownCommand(redisClient *c);
603 static void moveCommand(redisClient *c);
604 static void renameCommand(redisClient *c);
605 static void renamenxCommand(redisClient *c);
606 static void lpushCommand(redisClient *c);
607 static void rpushCommand(redisClient *c);
608 static void lpopCommand(redisClient *c);
609 static void rpopCommand(redisClient *c);
610 static void llenCommand(redisClient *c);
611 static void lindexCommand(redisClient *c);
612 static void lrangeCommand(redisClient *c);
613 static void ltrimCommand(redisClient *c);
614 static void typeCommand(redisClient *c);
615 static void lsetCommand(redisClient *c);
616 static void saddCommand(redisClient *c);
617 static void sremCommand(redisClient *c);
618 static void smoveCommand(redisClient *c);
619 static void sismemberCommand(redisClient *c);
620 static void scardCommand(redisClient *c);
621 static void spopCommand(redisClient *c);
622 static void srandmemberCommand(redisClient *c);
623 static void sinterCommand(redisClient *c);
624 static void sinterstoreCommand(redisClient *c);
625 static void sunionCommand(redisClient *c);
626 static void sunionstoreCommand(redisClient *c);
627 static void sdiffCommand(redisClient *c);
628 static void sdiffstoreCommand(redisClient *c);
629 static void syncCommand(redisClient *c);
630 static void flushdbCommand(redisClient *c);
631 static void flushallCommand(redisClient *c);
632 static void sortCommand(redisClient *c);
633 static void lremCommand(redisClient *c);
634 static void rpoplpushcommand(redisClient *c);
635 static void infoCommand(redisClient *c);
636 static void mgetCommand(redisClient *c);
637 static void monitorCommand(redisClient *c);
638 static void expireCommand(redisClient *c);
639 static void expireatCommand(redisClient *c);
640 static void getsetCommand(redisClient *c);
641 static void ttlCommand(redisClient *c);
642 static void slaveofCommand(redisClient *c);
643 static void debugCommand(redisClient *c);
644 static void msetCommand(redisClient *c);
645 static void msetnxCommand(redisClient *c);
646 static void zaddCommand(redisClient *c);
647 static void zincrbyCommand(redisClient *c);
648 static void zrangeCommand(redisClient *c);
649 static void zrangebyscoreCommand(redisClient *c);
650 static void zcountCommand(redisClient *c);
651 static void zrevrangeCommand(redisClient *c);
652 static void zcardCommand(redisClient *c);
653 static void zremCommand(redisClient *c);
654 static void zscoreCommand(redisClient *c);
655 static void zremrangebyscoreCommand(redisClient *c);
656 static void multiCommand(redisClient *c);
657 static void execCommand(redisClient *c);
658 static void discardCommand(redisClient *c);
659 static void blpopCommand(redisClient *c);
660 static void brpopCommand(redisClient *c);
661 static void appendCommand(redisClient *c);
662 static void zrankCommand(redisClient *c);
663
664 /*================================= Globals ================================= */
665
666 /* Global vars */
667 static struct redisServer server; /* server global state */
668 static struct redisCommand cmdTable[] = {
669 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
670 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
671 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
672 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
673 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
674 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
675 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
676 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
677 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
678 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
679 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
680 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
681 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
682 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
683 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
684 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
685 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
686 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
687 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
688 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
689 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
690 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
691 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
692 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
693 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
694 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
695 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
696 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
697 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
698 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
699 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
700 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
701 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
702 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
703 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
704 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
705 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
706 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
707 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
708 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
709 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
710 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
711 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
712 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
713 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
714 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
715 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
716 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
717 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
718 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
719 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
720 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
721 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
722 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
723 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
724 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
725 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
726 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
727 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
728 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
729 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
730 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
731 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
732 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
733 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
734 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
735 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
736 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
737 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
738 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
739 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
740 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
741 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
742 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
743 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
744 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
745 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
746 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
747 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
748 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
749 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
750 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
751 {NULL,NULL,0,0,0,0,0}
752 };
753
754 /*============================ Utility functions ============================ */
755
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [...]   one byte out of a class: single bytes, a-z ranges, a leading
 *           '^' to negate the class, '\' to take the next byte literally
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non-zero bytes are compared after tolower() (escaped bytes
 * inside a class are always compared exactly).
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 * Neither buffer is accessed beyond its declared length: a pattern element
 * that has to consume a byte never matches once the string is exhausted. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches whatever is left */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern byte, in
             * which case it is matched as a literal backslash. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
878
879 static void redisLog(int level, const char *fmt, ...) {
880 va_list ap;
881 FILE *fp;
882
883 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
884 if (!fp) return;
885
886 va_start(ap, fmt);
887 if (level >= server.verbosity) {
888 char *c = ".-*#";
889 char buf[64];
890 time_t now;
891
892 now = time(NULL);
893 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
894 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
895 vfprintf(fp, fmt, ap);
896 fprintf(fp,"\n");
897 fflush(fp);
898 }
899 va_end(ap);
900
901 if (server.logfile) fclose(fp);
902 }
903
904 /*====================== Hash table type implementation ==================== */
905
906 /* This is a hash table type that uses the SDS dynamic strings library as
907 * keys and redis objects as values (objects can hold SDS strings,
908 * lists, sets). */
909
/* Dict destructor callback releasing 'val' with a plain zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
915
916 static void dictListDestructor(void *privdata, void *val)
917 {
918 DICT_NOTUSED(privdata);
919 listRelease((list*)val);
920 }
921
922 static int sdsDictKeyCompare(void *privdata, const void *key1,
923 const void *key2)
924 {
925 int l1,l2;
926 DICT_NOTUSED(privdata);
927
928 l1 = sdslen((sds)key1);
929 l2 = sdslen((sds)key2);
930 if (l1 != l2) return 0;
931 return memcmp(key1, key2, l1) == 0;
932 }
933
/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
941
942 static int dictObjKeyCompare(void *privdata, const void *key1,
943 const void *key2)
944 {
945 const robj *o1 = key1, *o2 = key2;
946 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
947 }
948
949 static unsigned int dictObjHash(const void *key) {
950 const robj *o = key;
951 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
952 }
953
954 static int dictEncObjKeyCompare(void *privdata, const void *key1,
955 const void *key2)
956 {
957 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
958 int cmp;
959
960 o1 = getDecodedObject(o1);
961 o2 = getDecodedObject(o2);
962 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
963 decrRefCount(o1);
964 decrRefCount(o2);
965 return cmp;
966 }
967
968 static unsigned int dictEncObjHash(const void *key) {
969 robj *o = (robj*) key;
970
971 if (o->encoding == REDIS_ENCODING_RAW) {
972 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
973 } else {
974 if (o->encoding == REDIS_ENCODING_INT) {
975 char buf[32];
976 int len;
977
978 len = snprintf(buf,32,"%ld",(long)o->ptr);
979 return dictGenHashFunction((unsigned char*)buf, len);
980 } else {
981 unsigned int hash;
982
983 o = getDecodedObject(o);
984 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
985 decrRefCount(o);
986 return hash;
987 }
988 }
989 }
990
991 /* Sets type and expires */
992 static dictType setDictType = {
993 dictEncObjHash, /* hash function */
994 NULL, /* key dup */
995 NULL, /* val dup */
996 dictEncObjKeyCompare, /* key compare */
997 dictRedisObjectDestructor, /* key destructor */
998 NULL /* val destructor */
999 };
1000
1001 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1002 static dictType zsetDictType = {
1003 dictEncObjHash, /* hash function */
1004 NULL, /* key dup */
1005 NULL, /* val dup */
1006 dictEncObjKeyCompare, /* key compare */
1007 dictRedisObjectDestructor, /* key destructor */
1008 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1009 };
1010
1011 /* Db->dict */
1012 static dictType hashDictType = {
1013 dictObjHash, /* hash function */
1014 NULL, /* key dup */
1015 NULL, /* val dup */
1016 dictObjKeyCompare, /* key compare */
1017 dictRedisObjectDestructor, /* key destructor */
1018 dictRedisObjectDestructor /* val destructor */
1019 };
1020
1021 /* Db->expires */
1022 static dictType keyptrDictType = {
1023 dictObjHash, /* hash function */
1024 NULL, /* key dup */
1025 NULL, /* val dup */
1026 dictObjKeyCompare, /* key compare */
1027 dictRedisObjectDestructor, /* key destructor */
1028 NULL /* val destructor */
1029 };
1030
1031 /* Keylist hash table type has unencoded redis objects as keys and
1032 * lists as values. It's used for blocking operations (BLPOP) and to
1033 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1034 static dictType keylistDictType = {
1035 dictObjHash, /* hash function */
1036 NULL, /* key dup */
1037 NULL, /* val dup */
1038 dictObjKeyCompare, /* key compare */
1039 dictRedisObjectDestructor, /* key destructor */
1040 dictListDestructor /* val destructor */
1041 };
1042
1043 /* ========================= Random utility functions ======================= */
1044
1045 /* Redis generally does not try to recover from out of memory conditions
1046 * when allocating objects or strings, it is not clear if it will be possible
1047 * to report this condition to the client since the networking layer itself
1048 * is based on heap allocation for send buffers, so we simply abort.
1049 * At least the code will be simpler to read... */
1050 static void oom(const char *msg) {
1051 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1052 sleep(1);
1053 abort();
1054 }
1055
1056 /* ====================== Redis server networking stuff ===================== */
1057 static void closeTimedoutClients(void) {
1058 redisClient *c;
1059 listNode *ln;
1060 time_t now = time(NULL);
1061 listIter li;
1062
1063 listRewind(server.clients,&li);
1064 while ((ln = listNext(&li)) != NULL) {
1065 c = listNodeValue(ln);
1066 if (server.maxidletime &&
1067 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1068 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1069 (now - c->lastinteraction > server.maxidletime))
1070 {
1071 redisLog(REDIS_VERBOSE,"Closing idle client");
1072 freeClient(c);
1073 } else if (c->flags & REDIS_BLOCKED) {
1074 if (c->blockingto != 0 && c->blockingto < now) {
1075 addReply(c,shared.nullmultibulk);
1076 unblockClientWaitingData(c);
1077 }
1078 }
1079 }
1080 }
1081
1082 static int htNeedsResize(dict *dict) {
1083 long long size, used;
1084
1085 size = dictSlots(dict);
1086 used = dictSize(dict);
1087 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1088 (used*100/size < REDIS_HT_MINFILL));
1089 }
1090
1091 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1092 * we resize the hash table to save memory */
1093 static void tryResizeHashTables(void) {
1094 int j;
1095
1096 for (j = 0; j < server.dbnum; j++) {
1097 if (htNeedsResize(server.db[j].dict)) {
1098 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1099 dictResize(server.db[j].dict);
1100 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1101 }
1102 if (htNeedsResize(server.db[j].expires))
1103 dictResize(server.db[j].expires);
1104 }
1105 }
1106
1107 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1108 void backgroundSaveDoneHandler(int statloc) {
1109 int exitcode = WEXITSTATUS(statloc);
1110 int bysignal = WIFSIGNALED(statloc);
1111
1112 if (!bysignal && exitcode == 0) {
1113 redisLog(REDIS_NOTICE,
1114 "Background saving terminated with success");
1115 server.dirty = 0;
1116 server.lastsave = time(NULL);
1117 } else if (!bysignal && exitcode != 0) {
1118 redisLog(REDIS_WARNING, "Background saving error");
1119 } else {
1120 redisLog(REDIS_WARNING,
1121 "Background saving terminated by signal");
1122 rdbRemoveTempFile(server.bgsavechildpid);
1123 }
1124 server.bgsavechildpid = -1;
1125 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1126 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1127 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1128 }
1129
1130 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1131 * Handle this. */
1132 void backgroundRewriteDoneHandler(int statloc) {
1133 int exitcode = WEXITSTATUS(statloc);
1134 int bysignal = WIFSIGNALED(statloc);
1135
1136 if (!bysignal && exitcode == 0) {
1137 int fd;
1138 char tmpfile[256];
1139
1140 redisLog(REDIS_NOTICE,
1141 "Background append only file rewriting terminated with success");
1142 /* Now it's time to flush the differences accumulated by the parent */
1143 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1144 fd = open(tmpfile,O_WRONLY|O_APPEND);
1145 if (fd == -1) {
1146 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1147 goto cleanup;
1148 }
1149 /* Flush our data... */
1150 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1151 (signed) sdslen(server.bgrewritebuf)) {
1152 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1153 close(fd);
1154 goto cleanup;
1155 }
1156 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1157 /* Now our work is to rename the temp file into the stable file. And
1158 * switch the file descriptor used by the server for append only. */
1159 if (rename(tmpfile,server.appendfilename) == -1) {
1160 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1161 close(fd);
1162 goto cleanup;
1163 }
1164 /* Mission completed... almost */
1165 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1166 if (server.appendfd != -1) {
1167 /* If append only is actually enabled... */
1168 close(server.appendfd);
1169 server.appendfd = fd;
1170 fsync(fd);
1171 server.appendseldb = -1; /* Make sure it will issue SELECT */
1172 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1173 } else {
1174 /* If append only is disabled we just generate a dump in this
1175 * format. Why not? */
1176 close(fd);
1177 }
1178 } else if (!bysignal && exitcode != 0) {
1179 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1180 } else {
1181 redisLog(REDIS_WARNING,
1182 "Background append only file rewriting terminated by signal");
1183 }
1184 cleanup:
1185 sdsfree(server.bgrewritebuf);
1186 server.bgrewritebuf = sdsempty();
1187 aofRemoveTempFile(server.bgrewritechildpid);
1188 server.bgrewritechildpid = -1;
1189 }
1190
1191 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1192 int j, loops = server.cronloops++;
1193 REDIS_NOTUSED(eventLoop);
1194 REDIS_NOTUSED(id);
1195 REDIS_NOTUSED(clientData);
1196
1197 /* We take a cached value of the unix time in the global state because
1198 * with virtual memory and aging there is to store the current time
1199 * in objects at every object access, and accuracy is not needed.
1200 * To access a global var is faster than calling time(NULL) */
1201 server.unixtime = time(NULL);
1202
1203 /* Show some info about non-empty databases */
1204 for (j = 0; j < server.dbnum; j++) {
1205 long long size, used, vkeys;
1206
1207 size = dictSlots(server.db[j].dict);
1208 used = dictSize(server.db[j].dict);
1209 vkeys = dictSize(server.db[j].expires);
1210 if (!(loops % 5) && (used || vkeys)) {
1211 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1212 /* dictPrintStats(server.dict); */
1213 }
1214 }
1215
1216 /* We don't want to resize the hash tables while a bacground saving
1217 * is in progress: the saving child is created using fork() that is
1218 * implemented with a copy-on-write semantic in most modern systems, so
1219 * if we resize the HT while there is the saving child at work actually
1220 * a lot of memory movements in the parent will cause a lot of pages
1221 * copied. */
1222 if (server.bgsavechildpid == -1) tryResizeHashTables();
1223
1224 /* Show information about connected clients */
1225 if (!(loops % 5)) {
1226 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1227 listLength(server.clients)-listLength(server.slaves),
1228 listLength(server.slaves),
1229 zmalloc_used_memory(),
1230 dictSize(server.sharingpool));
1231 }
1232
1233 /* Close connections of timedout clients */
1234 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1235 closeTimedoutClients();
1236
1237 /* Check if a background saving or AOF rewrite in progress terminated */
1238 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1239 int statloc;
1240 pid_t pid;
1241
1242 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1243 if (pid == server.bgsavechildpid) {
1244 backgroundSaveDoneHandler(statloc);
1245 } else {
1246 backgroundRewriteDoneHandler(statloc);
1247 }
1248 }
1249 } else {
1250 /* If there is not a background saving in progress check if
1251 * we have to save now */
1252 time_t now = time(NULL);
1253 for (j = 0; j < server.saveparamslen; j++) {
1254 struct saveparam *sp = server.saveparams+j;
1255
1256 if (server.dirty >= sp->changes &&
1257 now-server.lastsave > sp->seconds) {
1258 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1259 sp->changes, sp->seconds);
1260 rdbSaveBackground(server.dbfilename);
1261 break;
1262 }
1263 }
1264 }
1265
1266 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1267 * will use few CPU cycles if there are few expiring keys, otherwise
1268 * it will get more aggressive to avoid that too much memory is used by
1269 * keys that can be removed from the keyspace. */
1270 for (j = 0; j < server.dbnum; j++) {
1271 int expired;
1272 redisDb *db = server.db+j;
1273
1274 /* Continue to expire if at the end of the cycle more than 25%
1275 * of the keys were expired. */
1276 do {
1277 long num = dictSize(db->expires);
1278 time_t now = time(NULL);
1279
1280 expired = 0;
1281 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1282 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1283 while (num--) {
1284 dictEntry *de;
1285 time_t t;
1286
1287 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1288 t = (time_t) dictGetEntryVal(de);
1289 if (now > t) {
1290 deleteKey(db,dictGetEntryKey(de));
1291 expired++;
1292 }
1293 }
1294 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1295 }
1296
1297 /* Swap a few keys on disk if we are over the memory limit and VM
1298 * is enbled. Try to free objects from the free list first. */
1299 if (vmCanSwapOut()) {
1300 while (server.vm_enabled && zmalloc_used_memory() >
1301 server.vm_max_memory)
1302 {
1303 int retval;
1304
1305 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1306 retval = (server.vm_max_threads == 0) ?
1307 vmSwapOneObjectBlocking() :
1308 vmSwapOneObjectThreaded();
1309 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1310 zmalloc_used_memory() >
1311 (server.vm_max_memory+server.vm_max_memory/10))
1312 {
1313 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1314 }
1315 /* Note that when using threade I/O we free just one object,
1316 * because anyway when the I/O thread in charge to swap this
1317 * object out will finish, the handler of completed jobs
1318 * will try to swap more objects if we are still out of memory. */
1319 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1320 }
1321 }
1322
1323 /* Check if we should connect to a MASTER */
1324 if (server.replstate == REDIS_REPL_CONNECT) {
1325 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1326 if (syncWithMaster() == REDIS_OK) {
1327 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1328 }
1329 }
1330 return 1000;
1331 }
1332
1333 /* This function gets called every time Redis is entering the
1334 * main loop of the event driven library, that is, before to sleep
1335 * for ready file descriptors. */
1336 static void beforeSleep(struct aeEventLoop *eventLoop) {
1337 REDIS_NOTUSED(eventLoop);
1338
1339 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1340 listIter li;
1341 listNode *ln;
1342
1343 listRewind(server.io_ready_clients,&li);
1344 while((ln = listNext(&li))) {
1345 redisClient *c = ln->value;
1346 struct redisCommand *cmd;
1347
1348 /* Resume the client. */
1349 listDelNode(server.io_ready_clients,ln);
1350 c->flags &= (~REDIS_IO_WAIT);
1351 server.vm_blocked_clients--;
1352 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1353 readQueryFromClient, c);
1354 cmd = lookupCommand(c->argv[0]->ptr);
1355 assert(cmd != NULL);
1356 call(c,cmd);
1357 resetClient(c);
1358 /* There may be more data to process in the input buffer. */
1359 if (c->querybuf && sdslen(c->querybuf) > 0)
1360 processInputBuffer(c);
1361 }
1362 }
1363 }
1364
1365 static void createSharedObjects(void) {
1366 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1367 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1368 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1369 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1370 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1371 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1372 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1373 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1374 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1375 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1376 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1377 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1378 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1379 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1380 "-ERR no such key\r\n"));
1381 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1382 "-ERR syntax error\r\n"));
1383 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1384 "-ERR source and destination objects are the same\r\n"));
1385 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1386 "-ERR index out of range\r\n"));
1387 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1388 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1389 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1390 shared.select0 = createStringObject("select 0\r\n",10);
1391 shared.select1 = createStringObject("select 1\r\n",10);
1392 shared.select2 = createStringObject("select 2\r\n",10);
1393 shared.select3 = createStringObject("select 3\r\n",10);
1394 shared.select4 = createStringObject("select 4\r\n",10);
1395 shared.select5 = createStringObject("select 5\r\n",10);
1396 shared.select6 = createStringObject("select 6\r\n",10);
1397 shared.select7 = createStringObject("select 7\r\n",10);
1398 shared.select8 = createStringObject("select 8\r\n",10);
1399 shared.select9 = createStringObject("select 9\r\n",10);
1400 }
1401
1402 static void appendServerSaveParams(time_t seconds, int changes) {
1403 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1404 server.saveparams[server.saveparamslen].seconds = seconds;
1405 server.saveparams[server.saveparamslen].changes = changes;
1406 server.saveparamslen++;
1407 }
1408
1409 static void resetServerSaveParams() {
1410 zfree(server.saveparams);
1411 server.saveparams = NULL;
1412 server.saveparamslen = 0;
1413 }
1414
1415 static void initServerConfig() {
1416 server.dbnum = REDIS_DEFAULT_DBNUM;
1417 server.port = REDIS_SERVERPORT;
1418 server.verbosity = REDIS_VERBOSE;
1419 server.maxidletime = REDIS_MAXIDLETIME;
1420 server.saveparams = NULL;
1421 server.logfile = NULL; /* NULL = log on standard output */
1422 server.bindaddr = NULL;
1423 server.glueoutputbuf = 1;
1424 server.daemonize = 0;
1425 server.appendonly = 0;
1426 server.appendfsync = APPENDFSYNC_ALWAYS;
1427 server.lastfsync = time(NULL);
1428 server.appendfd = -1;
1429 server.appendseldb = -1; /* Make sure the first time will not match */
1430 server.pidfile = "/var/run/redis.pid";
1431 server.dbfilename = "dump.rdb";
1432 server.appendfilename = "appendonly.aof";
1433 server.requirepass = NULL;
1434 server.shareobjects = 0;
1435 server.rdbcompression = 1;
1436 server.sharingpoolsize = 1024;
1437 server.maxclients = 0;
1438 server.blpop_blocked_clients = 0;
1439 server.maxmemory = 0;
1440 server.vm_enabled = 0;
1441 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1442 server.vm_page_size = 256; /* 256 bytes per page */
1443 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1444 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1445 server.vm_max_threads = 4;
1446 server.vm_blocked_clients = 0;
1447
1448 resetServerSaveParams();
1449
1450 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1451 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1452 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1453 /* Replication related */
1454 server.isslave = 0;
1455 server.masterauth = NULL;
1456 server.masterhost = NULL;
1457 server.masterport = 6379;
1458 server.master = NULL;
1459 server.replstate = REDIS_REPL_NONE;
1460
1461 /* Double constants initialization */
1462 R_Zero = 0.0;
1463 R_PosInf = 1.0/R_Zero;
1464 R_NegInf = -1.0/R_Zero;
1465 R_Nan = R_Zero/R_Zero;
1466 }
1467
1468 static void initServer() {
1469 int j;
1470
1471 signal(SIGHUP, SIG_IGN);
1472 signal(SIGPIPE, SIG_IGN);
1473 setupSigSegvAction();
1474
1475 server.devnull = fopen("/dev/null","w");
1476 if (server.devnull == NULL) {
1477 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1478 exit(1);
1479 }
1480 server.clients = listCreate();
1481 server.slaves = listCreate();
1482 server.monitors = listCreate();
1483 server.objfreelist = listCreate();
1484 createSharedObjects();
1485 server.el = aeCreateEventLoop();
1486 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1487 server.sharingpool = dictCreate(&setDictType,NULL);
1488 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1489 if (server.fd == -1) {
1490 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1491 exit(1);
1492 }
1493 for (j = 0; j < server.dbnum; j++) {
1494 server.db[j].dict = dictCreate(&hashDictType,NULL);
1495 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1496 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1497 if (server.vm_enabled)
1498 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1499 server.db[j].id = j;
1500 }
1501 server.cronloops = 0;
1502 server.bgsavechildpid = -1;
1503 server.bgrewritechildpid = -1;
1504 server.bgrewritebuf = sdsempty();
1505 server.lastsave = time(NULL);
1506 server.dirty = 0;
1507 server.stat_numcommands = 0;
1508 server.stat_numconnections = 0;
1509 server.stat_starttime = time(NULL);
1510 server.unixtime = time(NULL);
1511 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1512 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1513 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1514
1515 if (server.appendonly) {
1516 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1517 if (server.appendfd == -1) {
1518 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1519 strerror(errno));
1520 exit(1);
1521 }
1522 }
1523
1524 if (server.vm_enabled) vmInit();
1525 }
1526
1527 /* Empty the whole database */
1528 static long long emptyDb() {
1529 int j;
1530 long long removed = 0;
1531
1532 for (j = 0; j < server.dbnum; j++) {
1533 removed += dictSize(server.db[j].dict);
1534 dictEmpty(server.db[j].dict);
1535 dictEmpty(server.db[j].expires);
1536 }
1537 return removed;
1538 }
1539
/* Convert a "yes" / "no" string (compared case insensitively) into an
 * integer: 1 for "yes", 0 for "no", -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1545
1546 /* I agree, this is a very rudimental way to load a configuration...
1547 will improve later if the config gets more complex */
1548 static void loadServerConfig(char *filename) {
1549 FILE *fp;
1550 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1551 int linenum = 0;
1552 sds line = NULL;
1553
1554 if (filename[0] == '-' && filename[1] == '\0')
1555 fp = stdin;
1556 else {
1557 if ((fp = fopen(filename,"r")) == NULL) {
1558 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1559 exit(1);
1560 }
1561 }
1562
1563 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1564 sds *argv;
1565 int argc, j;
1566
1567 linenum++;
1568 line = sdsnew(buf);
1569 line = sdstrim(line," \t\r\n");
1570
1571 /* Skip comments and blank lines*/
1572 if (line[0] == '#' || line[0] == '\0') {
1573 sdsfree(line);
1574 continue;
1575 }
1576
1577 /* Split into arguments */
1578 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1579 sdstolower(argv[0]);
1580
1581 /* Execute config directives */
1582 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1583 server.maxidletime = atoi(argv[1]);
1584 if (server.maxidletime < 0) {
1585 err = "Invalid timeout value"; goto loaderr;
1586 }
1587 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1588 server.port = atoi(argv[1]);
1589 if (server.port < 1 || server.port > 65535) {
1590 err = "Invalid port"; goto loaderr;
1591 }
1592 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1593 server.bindaddr = zstrdup(argv[1]);
1594 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1595 int seconds = atoi(argv[1]);
1596 int changes = atoi(argv[2]);
1597 if (seconds < 1 || changes < 0) {
1598 err = "Invalid save parameters"; goto loaderr;
1599 }
1600 appendServerSaveParams(seconds,changes);
1601 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1602 if (chdir(argv[1]) == -1) {
1603 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1604 argv[1], strerror(errno));
1605 exit(1);
1606 }
1607 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1608 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1609 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1610 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1611 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1612 else {
1613 err = "Invalid log level. Must be one of debug, notice, warning";
1614 goto loaderr;
1615 }
1616 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1617 FILE *logfp;
1618
1619 server.logfile = zstrdup(argv[1]);
1620 if (!strcasecmp(server.logfile,"stdout")) {
1621 zfree(server.logfile);
1622 server.logfile = NULL;
1623 }
1624 if (server.logfile) {
1625 /* Test if we are able to open the file. The server will not
1626 * be able to abort just for this problem later... */
1627 logfp = fopen(server.logfile,"a");
1628 if (logfp == NULL) {
1629 err = sdscatprintf(sdsempty(),
1630 "Can't open the log file: %s", strerror(errno));
1631 goto loaderr;
1632 }
1633 fclose(logfp);
1634 }
1635 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1636 server.dbnum = atoi(argv[1]);
1637 if (server.dbnum < 1) {
1638 err = "Invalid number of databases"; goto loaderr;
1639 }
1640 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1641 server.maxclients = atoi(argv[1]);
1642 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1643 server.maxmemory = strtoll(argv[1], NULL, 10);
1644 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1645 server.masterhost = sdsnew(argv[1]);
1646 server.masterport = atoi(argv[2]);
1647 server.replstate = REDIS_REPL_CONNECT;
1648 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1649 server.masterauth = zstrdup(argv[1]);
1650 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1651 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1652 err = "argument must be 'yes' or 'no'"; goto loaderr;
1653 }
1654 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1655 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1656 err = "argument must be 'yes' or 'no'"; goto loaderr;
1657 }
1658 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1659 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1660 err = "argument must be 'yes' or 'no'"; goto loaderr;
1661 }
1662 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1663 server.sharingpoolsize = atoi(argv[1]);
1664 if (server.sharingpoolsize < 1) {
1665 err = "invalid object sharing pool size"; goto loaderr;
1666 }
1667 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1668 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1669 err = "argument must be 'yes' or 'no'"; goto loaderr;
1670 }
1671 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1672 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1673 err = "argument must be 'yes' or 'no'"; goto loaderr;
1674 }
1675 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1676 if (!strcasecmp(argv[1],"no")) {
1677 server.appendfsync = APPENDFSYNC_NO;
1678 } else if (!strcasecmp(argv[1],"always")) {
1679 server.appendfsync = APPENDFSYNC_ALWAYS;
1680 } else if (!strcasecmp(argv[1],"everysec")) {
1681 server.appendfsync = APPENDFSYNC_EVERYSEC;
1682 } else {
1683 err = "argument must be 'no', 'always' or 'everysec'";
1684 goto loaderr;
1685 }
1686 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1687 server.requirepass = zstrdup(argv[1]);
1688 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1689 server.pidfile = zstrdup(argv[1]);
1690 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1691 server.dbfilename = zstrdup(argv[1]);
1692 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1693 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1694 err = "argument must be 'yes' or 'no'"; goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1697 zfree(server.vm_swap_file);
1698 server.vm_swap_file = zstrdup(argv[1]);
1699 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1700 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1701 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1702 server.vm_page_size = strtoll(argv[1], NULL, 10);
1703 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1704 server.vm_pages = strtoll(argv[1], NULL, 10);
1705 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1706 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1707 } else {
1708 err = "Bad directive or wrong number of arguments"; goto loaderr;
1709 }
1710 for (j = 0; j < argc; j++)
1711 sdsfree(argv[j]);
1712 zfree(argv);
1713 sdsfree(line);
1714 }
1715 if (fp != stdin) fclose(fp);
1716 return;
1717
1718 loaderr:
1719 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1720 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1721 fprintf(stderr, ">>> '%s'\n", line);
1722 fprintf(stderr, "%s\n", err);
1723 exit(1);
1724 }
1725
1726 static void freeClientArgv(redisClient *c) {
1727 int j;
1728
1729 for (j = 0; j < c->argc; j++)
1730 decrRefCount(c->argv[j]);
1731 for (j = 0; j < c->mbargc; j++)
1732 decrRefCount(c->mbargv[j]);
1733 c->argc = 0;
1734 c->mbargc = 0;
1735 }
1736
1737 static void freeClient(redisClient *c) {
1738 listNode *ln;
1739
1740 /* Note that if the client we are freeing is blocked into a blocking
1741 * call, we have to set querybuf to NULL *before* to call
1742 * unblockClientWaitingData() to avoid processInputBuffer() will get
1743 * called. Also it is important to remove the file events after
1744 * this, because this call adds the READABLE event. */
1745 sdsfree(c->querybuf);
1746 c->querybuf = NULL;
1747 if (c->flags & REDIS_BLOCKED)
1748 unblockClientWaitingData(c);
1749
1750 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1751 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1752 listRelease(c->reply);
1753 freeClientArgv(c);
1754 close(c->fd);
1755 /* Remove from the list of clients */
1756 ln = listSearchKey(server.clients,c);
1757 redisAssert(ln != NULL);
1758 listDelNode(server.clients,ln);
1759 /* Remove from the list of clients waiting for swapped keys */
1760 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1761 ln = listSearchKey(server.io_ready_clients,c);
1762 if (ln) {
1763 listDelNode(server.io_ready_clients,ln);
1764 server.vm_blocked_clients--;
1765 }
1766 }
1767 while (server.vm_enabled && listLength(c->io_keys)) {
1768 ln = listFirst(c->io_keys);
1769 dontWaitForSwappedKey(c,ln->value);
1770 }
1771 listRelease(c->io_keys);
1772 /* Other cleanup */
1773 if (c->flags & REDIS_SLAVE) {
1774 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1775 close(c->repldbfd);
1776 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1777 ln = listSearchKey(l,c);
1778 redisAssert(ln != NULL);
1779 listDelNode(l,ln);
1780 }
1781 if (c->flags & REDIS_MASTER) {
1782 server.master = NULL;
1783 server.replstate = REDIS_REPL_CONNECT;
1784 }
1785 zfree(c->argv);
1786 zfree(c->mbargv);
1787 freeClientMultiState(c);
1788 zfree(c);
1789 }
1790
1791 #define GLUEREPLY_UP_TO (1024)
1792 static void glueReplyBuffersIfNeeded(redisClient *c) {
1793 int copylen = 0;
1794 char buf[GLUEREPLY_UP_TO];
1795 listNode *ln;
1796 listIter li;
1797 robj *o;
1798
1799 listRewind(c->reply,&li);
1800 while((ln = listNext(&li))) {
1801 int objlen;
1802
1803 o = ln->value;
1804 objlen = sdslen(o->ptr);
1805 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1806 memcpy(buf+copylen,o->ptr,objlen);
1807 copylen += objlen;
1808 listDelNode(c->reply,ln);
1809 } else {
1810 if (copylen == 0) return;
1811 break;
1812 }
1813 }
1814 /* Now the output buffer is empty, add the new single element */
1815 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1816 listAddNodeHead(c->reply,o);
1817 }
1818
1819 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1820 redisClient *c = privdata;
1821 int nwritten = 0, totwritten = 0, objlen;
1822 robj *o;
1823 REDIS_NOTUSED(el);
1824 REDIS_NOTUSED(mask);
1825
1826 /* Use writev() if we have enough buffers to send */
1827 if (!server.glueoutputbuf &&
1828 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1829 !(c->flags & REDIS_MASTER))
1830 {
1831 sendReplyToClientWritev(el, fd, privdata, mask);
1832 return;
1833 }
1834
1835 while(listLength(c->reply)) {
1836 if (server.glueoutputbuf && listLength(c->reply) > 1)
1837 glueReplyBuffersIfNeeded(c);
1838
1839 o = listNodeValue(listFirst(c->reply));
1840 objlen = sdslen(o->ptr);
1841
1842 if (objlen == 0) {
1843 listDelNode(c->reply,listFirst(c->reply));
1844 continue;
1845 }
1846
1847 if (c->flags & REDIS_MASTER) {
1848 /* Don't reply to a master */
1849 nwritten = objlen - c->sentlen;
1850 } else {
1851 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1852 if (nwritten <= 0) break;
1853 }
1854 c->sentlen += nwritten;
1855 totwritten += nwritten;
1856 /* If we fully sent the object on head go to the next one */
1857 if (c->sentlen == objlen) {
1858 listDelNode(c->reply,listFirst(c->reply));
1859 c->sentlen = 0;
1860 }
1861 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1862 * bytes, in a single threaded server it's a good idea to serve
1863 * other clients as well, even if a very large request comes from
1864 * super fast link that is always able to accept data (in real world
1865 * scenario think about 'KEYS *' against the loopback interfae) */
1866 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1867 }
1868 if (nwritten == -1) {
1869 if (errno == EAGAIN) {
1870 nwritten = 0;
1871 } else {
1872 redisLog(REDIS_VERBOSE,
1873 "Error writing to client: %s", strerror(errno));
1874 freeClient(c);
1875 return;
1876 }
1877 }
1878 if (totwritten > 0) c->lastinteraction = time(NULL);
1879 if (listLength(c->reply) == 0) {
1880 c->sentlen = 0;
1881 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1882 }
1883 }
1884
1885 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1886 {
1887 redisClient *c = privdata;
1888 int nwritten = 0, totwritten = 0, objlen, willwrite;
1889 robj *o;
1890 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1891 int offset, ion = 0;
1892 REDIS_NOTUSED(el);
1893 REDIS_NOTUSED(mask);
1894
1895 listNode *node;
1896 while (listLength(c->reply)) {
1897 offset = c->sentlen;
1898 ion = 0;
1899 willwrite = 0;
1900
1901 /* fill-in the iov[] array */
1902 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1903 o = listNodeValue(node);
1904 objlen = sdslen(o->ptr);
1905
1906 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1907 break;
1908
1909 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1910 break; /* no more iovecs */
1911
1912 iov[ion].iov_base = ((char*)o->ptr) + offset;
1913 iov[ion].iov_len = objlen - offset;
1914 willwrite += objlen - offset;
1915 offset = 0; /* just for the first item */
1916 ion++;
1917 }
1918
1919 if(willwrite == 0)
1920 break;
1921
1922 /* write all collected blocks at once */
1923 if((nwritten = writev(fd, iov, ion)) < 0) {
1924 if (errno != EAGAIN) {
1925 redisLog(REDIS_VERBOSE,
1926 "Error writing to client: %s", strerror(errno));
1927 freeClient(c);
1928 return;
1929 }
1930 break;
1931 }
1932
1933 totwritten += nwritten;
1934 offset = c->sentlen;
1935
1936 /* remove written robjs from c->reply */
1937 while (nwritten && listLength(c->reply)) {
1938 o = listNodeValue(listFirst(c->reply));
1939 objlen = sdslen(o->ptr);
1940
1941 if(nwritten >= objlen - offset) {
1942 listDelNode(c->reply, listFirst(c->reply));
1943 nwritten -= objlen - offset;
1944 c->sentlen = 0;
1945 } else {
1946 /* partial write */
1947 c->sentlen += nwritten;
1948 break;
1949 }
1950 offset = 0;
1951 }
1952 }
1953
1954 if (totwritten > 0)
1955 c->lastinteraction = time(NULL);
1956
1957 if (listLength(c->reply) == 0) {
1958 c->sentlen = 0;
1959 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1960 }
1961 }
1962
1963 static struct redisCommand *lookupCommand(char *name) {
1964 int j = 0;
1965 while(cmdTable[j].name != NULL) {
1966 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
1967 j++;
1968 }
1969 return NULL;
1970 }
1971
1972 /* resetClient prepare the client to process the next command */
1973 static void resetClient(redisClient *c) {
1974 freeClientArgv(c);
1975 c->bulklen = -1;
1976 c->multibulk = 0;
1977 }
1978
1979 /* Call() is the core of Redis execution of a command */
1980 static void call(redisClient *c, struct redisCommand *cmd) {
1981 long long dirty;
1982
1983 dirty = server.dirty;
1984 cmd->proc(c);
1985 if (server.appendonly && server.dirty-dirty)
1986 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
1987 if (server.dirty-dirty && listLength(server.slaves))
1988 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
1989 if (listLength(server.monitors))
1990 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
1991 server.stat_numcommands++;
1992 }
1993
1994 /* If this function gets called we already read a whole
1995 * command, argments are in the client argv/argc fields.
1996 * processCommand() execute the command or prepare the
1997 * server for a bulk read from the client.
1998 *
1999 * If 1 is returned the client is still alive and valid and
2000 * and other operations can be performed by the caller. Otherwise
2001 * if 0 is returned the client was destroied (i.e. after QUIT). */
2002 static int processCommand(redisClient *c) {
2003 struct redisCommand *cmd;
2004
2005 /* Free some memory if needed (maxmemory setting) */
2006 if (server.maxmemory) freeMemoryIfNeeded();
2007
2008 /* Handle the multi bulk command type. This is an alternative protocol
2009 * supported by Redis in order to receive commands that are composed of
2010 * multiple binary-safe "bulk" arguments. The latency of processing is
2011 * a bit higher but this allows things like multi-sets, so if this
2012 * protocol is used only for MSET and similar commands this is a big win. */
2013 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2014 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2015 if (c->multibulk <= 0) {
2016 resetClient(c);
2017 return 1;
2018 } else {
2019 decrRefCount(c->argv[c->argc-1]);
2020 c->argc--;
2021 return 1;
2022 }
2023 } else if (c->multibulk) {
2024 if (c->bulklen == -1) {
2025 if (((char*)c->argv[0]->ptr)[0] != '$') {
2026 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2027 resetClient(c);
2028 return 1;
2029 } else {
2030 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2031 decrRefCount(c->argv[0]);
2032 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2033 c->argc--;
2034 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2035 resetClient(c);
2036 return 1;
2037 }
2038 c->argc--;
2039 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2040 return 1;
2041 }
2042 } else {
2043 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2044 c->mbargv[c->mbargc] = c->argv[0];
2045 c->mbargc++;
2046 c->argc--;
2047 c->multibulk--;
2048 if (c->multibulk == 0) {
2049 robj **auxargv;
2050 int auxargc;
2051
2052 /* Here we need to swap the multi-bulk argc/argv with the
2053 * normal argc/argv of the client structure. */
2054 auxargv = c->argv;
2055 c->argv = c->mbargv;
2056 c->mbargv = auxargv;
2057
2058 auxargc = c->argc;
2059 c->argc = c->mbargc;
2060 c->mbargc = auxargc;
2061
2062 /* We need to set bulklen to something different than -1
2063 * in order for the code below to process the command without
2064 * to try to read the last argument of a bulk command as
2065 * a special argument. */
2066 c->bulklen = 0;
2067 /* continue below and process the command */
2068 } else {
2069 c->bulklen = -1;
2070 return 1;
2071 }
2072 }
2073 }
2074 /* -- end of multi bulk commands processing -- */
2075
2076 /* The QUIT command is handled as a special case. Normal command
2077 * procs are unable to close the client connection safely */
2078 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2079 freeClient(c);
2080 return 0;
2081 }
2082
2083 /* Now lookup the command and check ASAP about trivial error conditions
2084 * such wrong arity, bad command name and so forth. */
2085 cmd = lookupCommand(c->argv[0]->ptr);
2086 if (!cmd) {
2087 addReplySds(c,
2088 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2089 (char*)c->argv[0]->ptr));
2090 resetClient(c);
2091 return 1;
2092 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2093 (c->argc < -cmd->arity)) {
2094 addReplySds(c,
2095 sdscatprintf(sdsempty(),
2096 "-ERR wrong number of arguments for '%s' command\r\n",
2097 cmd->name));
2098 resetClient(c);
2099 return 1;
2100 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2101 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2102 resetClient(c);
2103 return 1;
2104 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2105 /* This is a bulk command, we have to read the last argument yet. */
2106 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2107
2108 decrRefCount(c->argv[c->argc-1]);
2109 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2110 c->argc--;
2111 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2112 resetClient(c);
2113 return 1;
2114 }
2115 c->argc--;
2116 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2117 /* It is possible that the bulk read is already in the
2118 * buffer. Check this condition and handle it accordingly.
2119 * This is just a fast path, alternative to call processInputBuffer().
2120 * It's a good idea since the code is small and this condition
2121 * happens most of the times. */
2122 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2123 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2124 c->argc++;
2125 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2126 } else {
2127 /* Otherwise return... there is to read the last argument
2128 * from the socket. */
2129 return 1;
2130 }
2131 }
2132 /* Let's try to share objects on the command arguments vector */
2133 if (server.shareobjects) {
2134 int j;
2135 for(j = 1; j < c->argc; j++)
2136 c->argv[j] = tryObjectSharing(c->argv[j]);
2137 }
2138 /* Let's try to encode the bulk object to save space. */
2139 if (cmd->flags & REDIS_CMD_BULK)
2140 tryObjectEncoding(c->argv[c->argc-1]);
2141
2142 /* Check if the user is authenticated */
2143 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2144 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2145 resetClient(c);
2146 return 1;
2147 }
2148
2149 /* Exec the command */
2150 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2151 queueMultiCommand(c,cmd);
2152 addReply(c,shared.queued);
2153 } else {
2154 if (server.vm_enabled && server.vm_max_threads > 0 &&
2155 blockClientOnSwappedKeys(cmd,c)) return 1;
2156 call(c,cmd);
2157 }
2158
2159 /* Prepare the client for the next command */
2160 resetClient(c);
2161 return 1;
2162 }
2163
2164 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2165 listNode *ln;
2166 listIter li;
2167 int outc = 0, j;
2168 robj **outv;
2169 /* (args*2)+1 is enough room for args, spaces, newlines */
2170 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2171
2172 if (argc <= REDIS_STATIC_ARGS) {
2173 outv = static_outv;
2174 } else {
2175 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2176 }
2177
2178 for (j = 0; j < argc; j++) {
2179 if (j != 0) outv[outc++] = shared.space;
2180 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2181 robj *lenobj;
2182
2183 lenobj = createObject(REDIS_STRING,
2184 sdscatprintf(sdsempty(),"%lu\r\n",
2185 (unsigned long) stringObjectLen(argv[j])));
2186 lenobj->refcount = 0;
2187 outv[outc++] = lenobj;
2188 }
2189 outv[outc++] = argv[j];
2190 }
2191 outv[outc++] = shared.crlf;
2192
2193 /* Increment all the refcounts at start and decrement at end in order to
2194 * be sure to free objects if there is no slave in a replication state
2195 * able to be feed with commands */
2196 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2197 listRewind(slaves,&li);
2198 while((ln = listNext(&li))) {
2199 redisClient *slave = ln->value;
2200
2201 /* Don't feed slaves that are still waiting for BGSAVE to start */
2202 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2203
2204 /* Feed all the other slaves, MONITORs and so on */
2205 if (slave->slaveseldb != dictid) {
2206 robj *selectcmd;
2207
2208 switch(dictid) {
2209 case 0: selectcmd = shared.select0; break;
2210 case 1: selectcmd = shared.select1; break;
2211 case 2: selectcmd = shared.select2; break;
2212 case 3: selectcmd = shared.select3; break;
2213 case 4: selectcmd = shared.select4; break;
2214 case 5: selectcmd = shared.select5; break;
2215 case 6: selectcmd = shared.select6; break;
2216 case 7: selectcmd = shared.select7; break;
2217 case 8: selectcmd = shared.select8; break;
2218 case 9: selectcmd = shared.select9; break;
2219 default:
2220 selectcmd = createObject(REDIS_STRING,
2221 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2222 selectcmd->refcount = 0;
2223 break;
2224 }
2225 addReply(slave,selectcmd);
2226 slave->slaveseldb = dictid;
2227 }
2228 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2229 }
2230 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2231 if (outv != static_outv) zfree(outv);
2232 }
2233
2234 static void processInputBuffer(redisClient *c) {
2235 again:
2236 /* Before to process the input buffer, make sure the client is not
2237 * waitig for a blocking operation such as BLPOP. Note that the first
2238 * iteration the client is never blocked, otherwise the processInputBuffer
2239 * would not be called at all, but after the execution of the first commands
2240 * in the input buffer the client may be blocked, and the "goto again"
2241 * will try to reiterate. The following line will make it return asap. */
2242 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2243 if (c->bulklen == -1) {
2244 /* Read the first line of the query */
2245 char *p = strchr(c->querybuf,'\n');
2246 size_t querylen;
2247
2248 if (p) {
2249 sds query, *argv;
2250 int argc, j;
2251
2252 query = c->querybuf;
2253 c->querybuf = sdsempty();
2254 querylen = 1+(p-(query));
2255 if (sdslen(query) > querylen) {
2256 /* leave data after the first line of the query in the buffer */
2257 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2258 }
2259 *p = '\0'; /* remove "\n" */
2260 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2261 sdsupdatelen(query);
2262
2263 /* Now we can split the query in arguments */
2264 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2265 sdsfree(query);
2266
2267 if (c->argv) zfree(c->argv);
2268 c->argv = zmalloc(sizeof(robj*)*argc);
2269
2270 for (j = 0; j < argc; j++) {
2271 if (sdslen(argv[j])) {
2272 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2273 c->argc++;
2274 } else {
2275 sdsfree(argv[j]);
2276 }
2277 }
2278 zfree(argv);
2279 if (c->argc) {
2280 /* Execute the command. If the client is still valid
2281 * after processCommand() return and there is something
2282 * on the query buffer try to process the next command. */
2283 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2284 } else {
2285 /* Nothing to process, argc == 0. Just process the query
2286 * buffer if it's not empty or return to the caller */
2287 if (sdslen(c->querybuf)) goto again;
2288 }
2289 return;
2290 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2291 redisLog(REDIS_VERBOSE, "Client protocol error");
2292 freeClient(c);
2293 return;
2294 }
2295 } else {
2296 /* Bulk read handling. Note that if we are at this point
2297 the client already sent a command terminated with a newline,
2298 we are reading the bulk data that is actually the last
2299 argument of the command. */
2300 int qbl = sdslen(c->querybuf);
2301
2302 if (c->bulklen <= qbl) {
2303 /* Copy everything but the final CRLF as final argument */
2304 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2305 c->argc++;
2306 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2307 /* Process the command. If the client is still valid after
2308 * the processing and there is more data in the buffer
2309 * try to parse it. */
2310 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2311 return;
2312 }
2313 }
2314 }
2315
2316 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2317 redisClient *c = (redisClient*) privdata;
2318 char buf[REDIS_IOBUF_LEN];
2319 int nread;
2320 REDIS_NOTUSED(el);
2321 REDIS_NOTUSED(mask);
2322
2323 nread = read(fd, buf, REDIS_IOBUF_LEN);
2324 if (nread == -1) {
2325 if (errno == EAGAIN) {
2326 nread = 0;
2327 } else {
2328 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2329 freeClient(c);
2330 return;
2331 }
2332 } else if (nread == 0) {
2333 redisLog(REDIS_VERBOSE, "Client closed connection");
2334 freeClient(c);
2335 return;
2336 }
2337 if (nread) {
2338 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2339 c->lastinteraction = time(NULL);
2340 } else {
2341 return;
2342 }
2343 if (!(c->flags & REDIS_BLOCKED))
2344 processInputBuffer(c);
2345 }
2346
2347 static int selectDb(redisClient *c, int id) {
2348 if (id < 0 || id >= server.dbnum)
2349 return REDIS_ERR;
2350 c->db = &server.db[id];
2351 return REDIS_OK;
2352 }
2353
2354 static void *dupClientReplyValue(void *o) {
2355 incrRefCount((robj*)o);
2356 return o;
2357 }
2358
2359 static redisClient *createClient(int fd) {
2360 redisClient *c = zmalloc(sizeof(*c));
2361
2362 anetNonBlock(NULL,fd);
2363 anetTcpNoDelay(NULL,fd);
2364 if (!c) return NULL;
2365 selectDb(c,0);
2366 c->fd = fd;
2367 c->querybuf = sdsempty();
2368 c->argc = 0;
2369 c->argv = NULL;
2370 c->bulklen = -1;
2371 c->multibulk = 0;
2372 c->mbargc = 0;
2373 c->mbargv = NULL;
2374 c->sentlen = 0;
2375 c->flags = 0;
2376 c->lastinteraction = time(NULL);
2377 c->authenticated = 0;
2378 c->replstate = REDIS_REPL_NONE;
2379 c->reply = listCreate();
2380 listSetFreeMethod(c->reply,decrRefCount);
2381 listSetDupMethod(c->reply,dupClientReplyValue);
2382 c->blockingkeys = NULL;
2383 c->blockingkeysnum = 0;
2384 c->io_keys = listCreate();
2385 listSetFreeMethod(c->io_keys,decrRefCount);
2386 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2387 readQueryFromClient, c) == AE_ERR) {
2388 freeClient(c);
2389 return NULL;
2390 }
2391 listAddNodeTail(server.clients,c);
2392 initClientMultiState(c);
2393 return c;
2394 }
2395
2396 static void addReply(redisClient *c, robj *obj) {
2397 if (listLength(c->reply) == 0 &&
2398 (c->replstate == REDIS_REPL_NONE ||
2399 c->replstate == REDIS_REPL_ONLINE) &&
2400 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2401 sendReplyToClient, c) == AE_ERR) return;
2402
2403 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2404 obj = dupStringObject(obj);
2405 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2406 }
2407 listAddNodeTail(c->reply,getDecodedObject(obj));
2408 }
2409
2410 static void addReplySds(redisClient *c, sds s) {
2411 robj *o = createObject(REDIS_STRING,s);
2412 addReply(c,o);
2413 decrRefCount(o);
2414 }
2415
2416 static void addReplyDouble(redisClient *c, double d) {
2417 char buf[128];
2418
2419 snprintf(buf,sizeof(buf),"%.17g",d);
2420 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2421 (unsigned long) strlen(buf),buf));
2422 }
2423
2424 static void addReplyLong(redisClient *c, long l) {
2425 char buf[128];
2426 size_t len;
2427
2428 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2429 addReplySds(c,sdsnewlen(buf,len));
2430 }
2431
2432 static void addReplyBulkLen(redisClient *c, robj *obj) {
2433 size_t len;
2434
2435 if (obj->encoding == REDIS_ENCODING_RAW) {
2436 len = sdslen(obj->ptr);
2437 } else {
2438 long n = (long)obj->ptr;
2439
2440 /* Compute how many bytes will take this integer as a radix 10 string */
2441 len = 1;
2442 if (n < 0) {
2443 len++;
2444 n = -n;
2445 }
2446 while((n = n/10) != 0) {
2447 len++;
2448 }
2449 }
2450 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2451 }
2452
2453 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2454 int cport, cfd;
2455 char cip[128];
2456 redisClient *c;
2457 REDIS_NOTUSED(el);
2458 REDIS_NOTUSED(mask);
2459 REDIS_NOTUSED(privdata);
2460
2461 cfd = anetAccept(server.neterr, fd, cip, &cport);
2462 if (cfd == AE_ERR) {
2463 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2464 return;
2465 }
2466 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2467 if ((c = createClient(cfd)) == NULL) {
2468 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2469 close(cfd); /* May be already closed, just ingore errors */
2470 return;
2471 }
2472 /* If maxclient directive is set and this is one client more... close the
2473 * connection. Note that we create the client instead to check before
2474 * for this condition, since now the socket is already set in nonblocking
2475 * mode and we can send an error for free using the Kernel I/O */
2476 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2477 char *err = "-ERR max number of clients reached\r\n";
2478
2479 /* That's a best effort error message, don't check write errors */
2480 if (write(c->fd,err,strlen(err)) == -1) {
2481 /* Nothing to do, Just to avoid the warning... */
2482 }
2483 freeClient(c);
2484 return;
2485 }
2486 server.stat_numconnections++;
2487 }
2488
2489 /* ======================= Redis objects implementation ===================== */
2490
2491 static robj *createObject(int type, void *ptr) {
2492 robj *o;
2493
2494 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2495 if (listLength(server.objfreelist)) {
2496 listNode *head = listFirst(server.objfreelist);
2497 o = listNodeValue(head);
2498 listDelNode(server.objfreelist,head);
2499 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2500 } else {
2501 if (server.vm_enabled) {
2502 pthread_mutex_unlock(&server.obj_freelist_mutex);
2503 o = zmalloc(sizeof(*o));
2504 } else {
2505 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2506 }
2507 }
2508 o->type = type;
2509 o->encoding = REDIS_ENCODING_RAW;
2510 o->ptr = ptr;
2511 o->refcount = 1;
2512 if (server.vm_enabled) {
2513 /* Note that this code may run in the context of an I/O thread
2514 * and accessing to server.unixtime in theory is an error
2515 * (no locks). But in practice this is safe, and even if we read
2516 * garbage Redis will not fail, as it's just a statistical info */
2517 o->vm.atime = server.unixtime;
2518 o->storage = REDIS_VM_MEMORY;
2519 }
2520 return o;
2521 }
2522
2523 static robj *createStringObject(char *ptr, size_t len) {
2524 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2525 }
2526
2527 static robj *dupStringObject(robj *o) {
2528 assert(o->encoding == REDIS_ENCODING_RAW);
2529 return createStringObject(o->ptr,sdslen(o->ptr));
2530 }
2531
2532 static robj *createListObject(void) {
2533 list *l = listCreate();
2534
2535 listSetFreeMethod(l,decrRefCount);
2536 return createObject(REDIS_LIST,l);
2537 }
2538
2539 static robj *createSetObject(void) {
2540 dict *d = dictCreate(&setDictType,NULL);
2541 return createObject(REDIS_SET,d);
2542 }
2543
2544 static robj *createZsetObject(void) {
2545 zset *zs = zmalloc(sizeof(*zs));
2546
2547 zs->dict = dictCreate(&zsetDictType,NULL);
2548 zs->zsl = zslCreate();
2549 return createObject(REDIS_ZSET,zs);
2550 }
2551
2552 static void freeStringObject(robj *o) {
2553 if (o->encoding == REDIS_ENCODING_RAW) {
2554 sdsfree(o->ptr);
2555 }
2556 }
2557
2558 static void freeListObject(robj *o) {
2559 listRelease((list*) o->ptr);
2560 }
2561
2562 static void freeSetObject(robj *o) {
2563 dictRelease((dict*) o->ptr);
2564 }
2565
2566 static void freeZsetObject(robj *o) {
2567 zset *zs = o->ptr;
2568
2569 dictRelease(zs->dict);
2570 zslFree(zs->zsl);
2571 zfree(zs);
2572 }
2573
2574 static void freeHashObject(robj *o) {
2575 dictRelease((dict*) o->ptr);
2576 }
2577
2578 static void incrRefCount(robj *o) {
2579 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2580 o->refcount++;
2581 }
2582
2583 static void decrRefCount(void *obj) {
2584 robj *o = obj;
2585
2586 /* Object is a key of a swapped out value, or in the process of being
2587 * loaded. */
2588 if (server.vm_enabled &&
2589 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2590 {
2591 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2592 redisAssert(o->refcount == 1);
2593 }
2594 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2595 redisAssert(o->type == REDIS_STRING);
2596 freeStringObject(o);
2597 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2598 pthread_mutex_lock(&server.obj_freelist_mutex);
2599 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2600 !listAddNodeHead(server.objfreelist,o))
2601 zfree(o);
2602 pthread_mutex_unlock(&server.obj_freelist_mutex);
2603 server.vm_stats_swapped_objects--;
2604 return;
2605 }
2606 /* Object is in memory, or in the process of being swapped out. */
2607 if (--(o->refcount) == 0) {
2608 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2609 vmCancelThreadedIOJob(obj);
2610 switch(o->type) {
2611 case REDIS_STRING: freeStringObject(o); break;
2612 case REDIS_LIST: freeListObject(o); break;
2613 case REDIS_SET: freeSetObject(o); break;
2614 case REDIS_ZSET: freeZsetObject(o); break;
2615 case REDIS_HASH: freeHashObject(o); break;
2616 default: redisAssert(0 != 0); break;
2617 }
2618 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2619 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2620 !listAddNodeHead(server.objfreelist,o))
2621 zfree(o);
2622 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2623 }
2624 }
2625
2626 static robj *lookupKey(redisDb *db, robj *key) {
2627 dictEntry *de = dictFind(db->dict,key);
2628 if (de) {
2629 robj *key = dictGetEntryKey(de);
2630 robj *val = dictGetEntryVal(de);
2631
2632 if (server.vm_enabled) {
2633 if (key->storage == REDIS_VM_MEMORY ||
2634 key->storage == REDIS_VM_SWAPPING)
2635 {
2636 /* If we were swapping the object out, stop it, this key
2637 * was requested. */
2638 if (key->storage == REDIS_VM_SWAPPING)
2639 vmCancelThreadedIOJob(key);
2640 /* Update the access time of the key for the aging algorithm. */
2641 key->vm.atime = server.unixtime;
2642 } else {
2643 int notify = (key->storage == REDIS_VM_LOADING);
2644
2645 /* Our value was swapped on disk. Bring it at home. */
2646 redisAssert(val == NULL);
2647 val = vmLoadObject(key);
2648 dictGetEntryVal(de) = val;
2649
2650 /* Clients blocked by the VM subsystem may be waiting for
2651 * this key... */
2652 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2653 }
2654 }
2655 return val;
2656 } else {
2657 return NULL;
2658 }
2659 }
2660
2661 static robj *lookupKeyRead(redisDb *db, robj *key) {
2662 expireIfNeeded(db,key);
2663 return lookupKey(db,key);
2664 }
2665
2666 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2667 deleteIfVolatile(db,key);
2668 return lookupKey(db,key);
2669 }
2670
2671 static int deleteKey(redisDb *db, robj *key) {
2672 int retval;
2673
2674 /* We need to protect key from destruction: after the first dictDelete()
2675 * it may happen that 'key' is no longer valid if we don't increment
2676 * it's count. This may happen when we get the object reference directly
2677 * from the hash table with dictRandomKey() or dict iterators */
2678 incrRefCount(key);
2679 if (dictSize(db->expires)) dictDelete(db->expires,key);
2680 retval = dictDelete(db->dict,key);
2681 decrRefCount(key);
2682
2683 return retval == DICT_OK;
2684 }
2685
2686 /* Try to share an object against the shared objects pool */
2687 static robj *tryObjectSharing(robj *o) {
2688 struct dictEntry *de;
2689 unsigned long c;
2690
2691 if (o == NULL || server.shareobjects == 0) return o;
2692
2693 redisAssert(o->type == REDIS_STRING);
2694 de = dictFind(server.sharingpool,o);
2695 if (de) {
2696 robj *shared = dictGetEntryKey(de);
2697
2698 c = ((unsigned long) dictGetEntryVal(de))+1;
2699 dictGetEntryVal(de) = (void*) c;
2700 incrRefCount(shared);
2701 decrRefCount(o);
2702 return shared;
2703 } else {
2704 /* Here we are using a stream algorihtm: Every time an object is
2705 * shared we increment its count, everytime there is a miss we
2706 * recrement the counter of a random object. If this object reaches
2707 * zero we remove the object and put the current object instead. */
2708 if (dictSize(server.sharingpool) >=
2709 server.sharingpoolsize) {
2710 de = dictGetRandomKey(server.sharingpool);
2711 redisAssert(de != NULL);
2712 c = ((unsigned long) dictGetEntryVal(de))-1;
2713 dictGetEntryVal(de) = (void*) c;
2714 if (c == 0) {
2715 dictDelete(server.sharingpool,de->key);
2716 }
2717 } else {
2718 c = 0; /* If the pool is empty we want to add this object */
2719 }
2720 if (c == 0) {
2721 int retval;
2722
2723 retval = dictAdd(server.sharingpool,o,(void*)1);
2724 redisAssert(retval == DICT_OK);
2725 incrRefCount(o);
2726 }
2727 return o;
2728 }
2729 }
2730
2731 /* Check if the nul-terminated string 's' can be represented by a long
2732 * (that is, is a number that fits into long without any other space or
2733 * character before or after the digits).
2734 *
2735 * If so, the function returns REDIS_OK and *longval is set to the value
2736 * of the number. Otherwise REDIS_ERR is returned */
2737 static int isStringRepresentableAsLong(sds s, long *longval) {
2738 char buf[32], *endptr;
2739 long value;
2740 int slen;
2741
2742 value = strtol(s, &endptr, 10);
2743 if (endptr[0] != '\0') return REDIS_ERR;
2744 slen = snprintf(buf,32,"%ld",value);
2745
2746 /* If the number converted back into a string is not identical
2747 * then it's not possible to encode the string as integer */
2748 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2749 if (longval) *longval = value;
2750 return REDIS_OK;
2751 }
2752
2753 /* Try to encode a string object in order to save space */
2754 static int tryObjectEncoding(robj *o) {
2755 long value;
2756 sds s = o->ptr;
2757
2758 if (o->encoding != REDIS_ENCODING_RAW)
2759 return REDIS_ERR; /* Already encoded */
2760
2761 /* It's not save to encode shared objects: shared objects can be shared
2762 * everywhere in the "object space" of Redis. Encoded objects can only
2763 * appear as "values" (and not, for instance, as keys) */
2764 if (o->refcount > 1) return REDIS_ERR;
2765
2766 /* Currently we try to encode only strings */
2767 redisAssert(o->type == REDIS_STRING);
2768
2769 /* Check if we can represent this string as a long integer */
2770 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2771
2772 /* Ok, this object can be encoded */
2773 o->encoding = REDIS_ENCODING_INT;
2774 sdsfree(o->ptr);
2775 o->ptr = (void*) value;
2776 return REDIS_OK;
2777 }
2778
2779 /* Get a decoded version of an encoded object (returned as a new object).
2780 * If the object is already raw-encoded just increment the ref count. */
2781 static robj *getDecodedObject(robj *o) {
2782 robj *dec;
2783
2784 if (o->encoding == REDIS_ENCODING_RAW) {
2785 incrRefCount(o);
2786 return o;
2787 }
2788 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2789 char buf[32];
2790
2791 snprintf(buf,32,"%ld",(long)o->ptr);
2792 dec = createStringObject(buf,strlen(buf));
2793 return dec;
2794 } else {
2795 redisAssert(1 != 1);
2796 }
2797 }
2798
2799 /* Compare two string objects via strcmp() or alike.
2800 * Note that the objects may be integer-encoded. In such a case we
2801 * use snprintf() to get a string representation of the numbers on the stack
2802 * and compare the strings, it's much faster than calling getDecodedObject().
2803 *
2804 * Important note: if objects are not integer encoded, but binary-safe strings,
2805 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2806 * binary safe. */
2807 static int compareStringObjects(robj *a, robj *b) {
2808 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2809 char bufa[128], bufb[128], *astr, *bstr;
2810 int bothsds = 1;
2811
2812 if (a == b) return 0;
2813 if (a->encoding != REDIS_ENCODING_RAW) {
2814 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2815 astr = bufa;
2816 bothsds = 0;
2817 } else {
2818 astr = a->ptr;
2819 }
2820 if (b->encoding != REDIS_ENCODING_RAW) {
2821 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2822 bstr = bufb;
2823 bothsds = 0;
2824 } else {
2825 bstr = b->ptr;
2826 }
2827 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2828 }
2829
2830 static size_t stringObjectLen(robj *o) {
2831 redisAssert(o->type == REDIS_STRING);
2832 if (o->encoding == REDIS_ENCODING_RAW) {
2833 return sdslen(o->ptr);
2834 } else {
2835 char buf[32];
2836
2837 return snprintf(buf,32,"%ld",(long)o->ptr);
2838 }
2839 }
2840
2841 /*============================ RDB saving/loading =========================== */
2842
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if nothing could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
2847
/* Write the time 't' to 'fp' as a 4 byte value: 't' is converted to int32_t
 * and its in-memory bytes are written as they are.
 * Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,sizeof(stamp),1,fp) == 0) ? -1 : 0;
}
2853
2854 /* check rdbLoadLen() comments for more info */
2855 static int rdbSaveLen(FILE *fp, uint32_t len) {
2856 unsigned char buf[2];
2857
2858 if (len < (1<<6)) {
2859 /* Save a 6 bit len */
2860 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2861 if (fwrite(buf,1,1,fp) == 0) return -1;
2862 } else if (len < (1<<14)) {
2863 /* Save a 14 bit len */
2864 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2865 buf[1] = len&0xFF;
2866 if (fwrite(buf,2,1,fp) == 0) return -1;
2867 } else {
2868 /* Save a 32 bit len */
2869 buf[0] = (REDIS_RDB_32BITLEN<<6);
2870 if (fwrite(buf,1,1,fp) == 0) return -1;
2871 len = htonl(len);
2872 if (fwrite(&len,4,1,fp) == 0) return -1;
2873 }
2874 return 0;
2875 }
2876
2877 /* String objects in the form "2391" "-100" without any space and with a
2878 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2879 * encoded as integers to save space */
2880 static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
2881 long long value;
2882 char *endptr, buf[32];
2883
2884 /* Check if it's possible to encode this value as a number */
2885 value = strtoll(s, &endptr, 10);
2886 if (endptr[0] != '\0') return 0;
2887 snprintf(buf,32,"%lld",value);
2888
2889 /* If the number converted back into a string is not identical
2890 * then it's not possible to encode the string as integer */
2891 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2892
2893 /* Finally check if it fits in our ranges */
2894 if (value >= -(1<<7) && value <= (1<<7)-1) {
2895 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2896 enc[1] = value&0xFF;
2897 return 2;
2898 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2899 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2900 enc[1] = value&0xFF;
2901 enc[2] = (value>>8)&0xFF;
2902 return 3;
2903 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2904 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2905 enc[1] = value&0xFF;
2906 enc[2] = (value>>8)&0xFF;
2907 enc[3] = (value>>16)&0xFF;
2908 enc[4] = (value>>24)&0xFF;
2909 return 5;
2910 } else {
2911 return 0;
2912 }
2913 }
2914
2915 static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2916 unsigned int comprlen, outlen;
2917 unsigned char byte;
2918 void *out;
2919
2920 /* We require at least four bytes compression for this to be worth it */
2921 outlen = sdslen(obj->ptr)-4;
2922 if (outlen <= 0) return 0;
2923 if ((out = zmalloc(outlen+1)) == NULL) return 0;
2924 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2925 if (comprlen == 0) {
2926 zfree(out);
2927 return 0;
2928 }
2929 /* Data compressed! Let's save it on disk */
2930 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2931 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2932 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2933 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2934 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
2935 zfree(out);
2936 return comprlen;
2937
2938 writeerr:
2939 zfree(out);
2940 return -1;
2941 }
2942
2943 /* Save a string objet as [len][data] on disk. If the object is a string
2944 * representation of an integer value we try to safe it in a special form */
2945 static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2946 size_t len;
2947 int enclen;
2948
2949 len = sdslen(obj->ptr);
2950
2951 /* Try integer encoding */
2952 if (len <= 11) {
2953 unsigned char buf[5];
2954 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2955 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2956 return 0;
2957 }
2958 }
2959
2960 /* Try LZF compression - under 20 bytes it's unable to compress even
2961 * aaaaaaaaaaaaaaaaaa so skip it */
2962 if (server.rdbcompression && len > 20) {
2963 int retval;
2964
2965 retval = rdbSaveLzfStringObject(fp,obj);
2966 if (retval == -1) return -1;
2967 if (retval > 0) return 0;
2968 /* retval == 0 means data can't be compressed, save the old way */
2969 }
2970
2971 /* Store verbatim */
2972 if (rdbSaveLen(fp,len) == -1) return -1;
2973 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
2974 return 0;
2975 }
2976
2977 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
2978 static int rdbSaveStringObject(FILE *fp, robj *obj) {
2979 int retval;
2980
2981 /* Avoid incr/decr ref count business when possible.
2982 * This plays well with copy-on-write given that we are probably
2983 * in a child process (BGSAVE). Also this makes sure key objects
2984 * of swapped objects are not incRefCount-ed (an assert does not allow
2985 * this in order to avoid bugs) */
2986 if (obj->encoding != REDIS_ENCODING_RAW) {
2987 obj = getDecodedObject(obj);
2988 retval = rdbSaveStringObjectRaw(fp,obj);
2989 decrRefCount(obj);
2990 } else {
2991 retval = rdbSaveStringObjectRaw(fp,obj);
2992 }
2993 return retval;
2994 }
2995
/* Save a double value. Doubles are saved as a "%.17g" string prefixed by an
 * unsigned 8 bit integer specifying the length of the representation.
 * Three values of this prefix are special and are not followed by any
 * string:
 *
 *   253: not a number
 *   254: + inf
 *   255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int outlen = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (isfinite(val)) {
        snprintf((char*)out+1,sizeof(out)-1,"%.17g",val);
        out[0] = strlen((char*)out+1);
        outlen += out[0];
    } else {
        out[0] = (val < 0) ? 255 : 254;
    }
    return (fwrite(out,outlen,1,fp) == 0) ? -1 : 0;
}
3022
3023 /* Save a Redis object. */
3024 static int rdbSaveObject(FILE *fp, robj *o) {
3025 if (o->type == REDIS_STRING) {
3026 /* Save a string value */
3027 if (rdbSaveStringObject(fp,o) == -1) return -1;
3028 } else if (o->type == REDIS_LIST) {
3029 /* Save a list value */
3030 list *list = o->ptr;
3031 listIter li;
3032 listNode *ln;
3033
3034 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3035 listRewind(list,&li);
3036 while((ln = listNext(&li))) {
3037 robj *eleobj = listNodeValue(ln);
3038
3039 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3040 }
3041 } else if (o->type == REDIS_SET) {
3042 /* Save a set value */
3043 dict *set = o->ptr;
3044 dictIterator *di = dictGetIterator(set);
3045 dictEntry *de;
3046
3047 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3048 while((de = dictNext(di)) != NULL) {
3049 robj *eleobj = dictGetEntryKey(de);
3050
3051 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3052 }
3053 dictReleaseIterator(di);
3054 } else if (o->type == REDIS_ZSET) {
3055 /* Save a set value */
3056 zset *zs = o->ptr;
3057 dictIterator *di = dictGetIterator(zs->dict);
3058 dictEntry *de;
3059
3060 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3061 while((de = dictNext(di)) != NULL) {
3062 robj *eleobj = dictGetEntryKey(de);
3063 double *score = dictGetEntryVal(de);
3064
3065 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3066 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3067 }
3068 dictReleaseIterator(di);
3069 } else {
3070 redisAssert(0 != 0);
3071 }
3072 return 0;
3073 }
3074
3075 /* Return the length the object will have on disk if saved with
3076 * the rdbSaveObject() function. Currently we use a trick to get
3077 * this length with very little changes to the code. In the future
3078 * we could switch to a faster solution. */
3079 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3080 if (fp == NULL) fp = server.devnull;
3081 rewind(fp);
3082 assert(rdbSaveObject(fp,o) != 1);
3083 return ftello(fp);
3084 }
3085
3086 /* Return the number of pages required to save this object in the swap file */
3087 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3088 off_t bytes = rdbSavedObjectLen(o,fp);
3089
3090 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3091 }
3092
3093 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3094 static int rdbSave(char *filename) {
3095 dictIterator *di = NULL;
3096 dictEntry *de;
3097 FILE *fp;
3098 char tmpfile[256];
3099 int j;
3100 time_t now = time(NULL);
3101
3102 /* Wait for I/O therads to terminate, just in case this is a
3103 * foreground-saving, to avoid seeking the swap file descriptor at the
3104 * same time. */
3105 if (server.vm_enabled)
3106 waitEmptyIOJobsQueue();
3107
3108 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3109 fp = fopen(tmpfile,"w");
3110 if (!fp) {
3111 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3112 return REDIS_ERR;
3113 }
3114 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3115 for (j = 0; j < server.dbnum; j++) {
3116 redisDb *db = server.db+j;
3117 dict *d = db->dict;
3118 if (dictSize(d) == 0) continue;
3119 di = dictGetIterator(d);
3120 if (!di) {
3121 fclose(fp);
3122 return REDIS_ERR;
3123 }
3124
3125 /* Write the SELECT DB opcode */
3126 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3127 if (rdbSaveLen(fp,j) == -1) goto werr;
3128
3129 /* Iterate this DB writing every entry */
3130 while((de = dictNext(di)) != NULL) {
3131 robj *key = dictGetEntryKey(de);
3132 robj *o = dictGetEntryVal(de);
3133 time_t expiretime = getExpire(db,key);
3134
3135 /* Save the expire time */
3136 if (expiretime != -1) {
3137 /* If this key is already expired skip it */
3138 if (expiretime < now) continue;
3139 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3140 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3141 }
3142 /* Save the key and associated value. This requires special
3143 * handling if the value is swapped out. */
3144 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3145 key->storage == REDIS_VM_SWAPPING) {
3146 /* Save type, key, value */
3147 if (rdbSaveType(fp,o->type) == -1) goto werr;
3148 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3149 if (rdbSaveObject(fp,o) == -1) goto werr;
3150 } else {
3151 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3152 robj *po;
3153 /* Get a preview of the object in memory */
3154 po = vmPreviewObject(key);
3155 /* Save type, key, value */
3156 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3157 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3158 if (rdbSaveObject(fp,po) == -1) goto werr;
3159 /* Remove the loaded object from memory */
3160 decrRefCount(po);
3161 }
3162 }
3163 dictReleaseIterator(di);
3164 }
3165 /* EOF opcode */
3166 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3167
3168 /* Make sure data will not remain on the OS's output buffers */
3169 fflush(fp);
3170 fsync(fileno(fp));
3171 fclose(fp);
3172
3173 /* Use RENAME to make sure the DB file is changed atomically only
3174 * if the generate DB file is ok. */
3175 if (rename(tmpfile,filename) == -1) {
3176 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3177 unlink(tmpfile);
3178 return REDIS_ERR;
3179 }
3180 redisLog(REDIS_NOTICE,"DB saved on disk");
3181 server.dirty = 0;
3182 server.lastsave = time(NULL);
3183 return REDIS_OK;
3184
3185 werr:
3186 fclose(fp);
3187 unlink(tmpfile);
3188 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3189 if (di) dictReleaseIterator(di);
3190 return REDIS_ERR;
3191 }
3192
3193 static int rdbSaveBackground(char *filename) {
3194 pid_t childpid;
3195
3196 if (server.bgsavechildpid != -1) return REDIS_ERR;
3197 if (server.vm_enabled) waitEmptyIOJobsQueue();
3198 if ((childpid = fork()) == 0) {
3199 /* Child */
3200 if (server.vm_enabled) vmReopenSwapFile();
3201 close(server.fd);
3202 if (rdbSave(filename) == REDIS_OK) {
3203 _exit(0);
3204 } else {
3205 _exit(1);
3206 }
3207 } else {
3208 /* Parent */
3209 if (childpid == -1) {
3210 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3211 strerror(errno));
3212 return REDIS_ERR;
3213 }
3214 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3215 server.bgsavechildpid = childpid;
3216 return REDIS_OK;
3217 }
3218 return REDIS_OK; /* unreached */
3219 }
3220
/* Remove the file "temp-<childpid>.rdb" from the current directory. This is
 * the temp file name rdbSave() uses when running as process 'childpid'.
 * Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3227
/* Read a single type byte from 'fp'.
 * Returns the byte as a non negative value, or -1 if it can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3233
/* Read a time saved by rdbSaveTime(): 4 bytes loaded into an int32_t.
 * Returns the time, or -1 if the 4 bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,sizeof(stamp),1,fp) == 0) return -1;
    return (time_t) stamp;
}
3239
3240 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3241 * of this file for a description of how this are stored on disk.
3242 *
3243 * isencoded is set to 1 if the readed length is not actually a length but
3244 * an "encoding type", check the above comments for more info */
3245 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3246 unsigned char buf[2];
3247 uint32_t len;
3248 int type;
3249
3250 if (isencoded) *isencoded = 0;
3251 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3252 type = (buf[0]&0xC0)>>6;
3253 if (type == REDIS_RDB_6BITLEN) {
3254 /* Read a 6 bit len */
3255 return buf[0]&0x3F;
3256 } else if (type == REDIS_RDB_ENCVAL) {
3257 /* Read a 6 bit len encoding type */
3258 if (isencoded) *isencoded = 1;
3259 return buf[0]&0x3F;
3260 } else if (type == REDIS_RDB_14BITLEN) {
3261 /* Read a 14 bit len */
3262 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3263 return ((buf[0]&0x3F)<<8)|buf[1];
3264 } else {
3265 /* Read a 32 bit len */
3266 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3267 return ntohl(len);
3268 }
3269 }
3270
3271 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3272 unsigned char enc[4];
3273 long long val;
3274
3275 if (enctype == REDIS_RDB_ENC_INT8) {
3276 if (fread(enc,1,1,fp) == 0) return NULL;
3277 val = (signed char)enc[0];
3278 } else if (enctype == REDIS_RDB_ENC_INT16) {
3279 uint16_t v;
3280 if (fread(enc,2,1,fp) == 0) return NULL;
3281 v = enc[0]|(enc[1]<<8);
3282 val = (int16_t)v;
3283 } else if (enctype == REDIS_RDB_ENC_INT32) {
3284 uint32_t v;
3285 if (fread(enc,4,1,fp) == 0) return NULL;
3286 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3287 val = (int32_t)v;
3288 } else {
3289 val = 0; /* anti-warning */
3290 redisAssert(0!=0);
3291 }
3292 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3293 }
3294
3295 static robj *rdbLoadLzfStringObject(FILE*fp) {
3296 unsigned int len, clen;
3297 unsigned char *c = NULL;
3298 sds val = NULL;
3299
3300 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3301 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3302 if ((c = zmalloc(clen)) == NULL) goto err;
3303 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3304 if (fread(c,clen,1,fp) == 0) goto err;
3305 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3306 zfree(c);
3307 return createObject(REDIS_STRING,val);
3308 err:
3309 zfree(c);
3310 sdsfree(val);
3311 return NULL;
3312 }
3313
3314 static robj *rdbLoadStringObject(FILE*fp) {
3315 int isencoded;
3316 uint32_t len;
3317 sds val;
3318
3319 len = rdbLoadLen(fp,&isencoded);
3320 if (isencoded) {
3321 switch(len) {
3322 case REDIS_RDB_ENC_INT8:
3323 case REDIS_RDB_ENC_INT16:
3324 case REDIS_RDB_ENC_INT32:
3325 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3326 case REDIS_RDB_ENC_LZF:
3327 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3328 default:
3329 redisAssert(0!=0);
3330 }
3331 }
3332
3333 if (len == REDIS_RDB_LENERR) return NULL;
3334 val = sdsnewlen(NULL,len);
3335 if (len && fread(val,len,1,fp) == 0) {
3336 sdsfree(val);
3337 return NULL;
3338 }
3339 return tryObjectSharing(createObject(REDIS_STRING,val));
3340 }
3341
3342 /* For information about double serialization check rdbSaveDoubleValue() */
3343 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3344 char buf[128];
3345 unsigned char len;
3346
3347 if (fread(&len,1,1,fp) == 0) return -1;
3348 switch(len) {
3349 case 255: *val = R_NegInf; return 0;
3350 case 254: *val = R_PosInf; return 0;
3351 case 253: *val = R_Nan; return 0;
3352 default:
3353 if (fread(buf,len,1,fp) == 0) return -1;
3354 buf[len] = '\0';
3355 sscanf(buf, "%lg", val);
3356 return 0;
3357 }
3358 }
3359
3360 /* Load a Redis object of the specified type from the specified file.
3361 * On success a newly allocated object is returned, otherwise NULL. */
3362 static robj *rdbLoadObject(int type, FILE *fp) {
3363 robj *o;
3364
3365 if (type == REDIS_STRING) {
3366 /* Read string value */
3367 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3368 tryObjectEncoding(o);
3369 } else if (type == REDIS_LIST || type == REDIS_SET) {
3370 /* Read list/set value */
3371 uint32_t listlen;
3372
3373 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3374 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3375 /* It's faster to expand the dict to the right size asap in order
3376 * to avoid rehashing */
3377 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3378 dictExpand(o->ptr,listlen);
3379 /* Load every single element of the list/set */
3380 while(listlen--) {
3381 robj *ele;
3382
3383 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3384 tryObjectEncoding(ele);
3385 if (type == REDIS_LIST) {
3386 listAddNodeTail((list*)o->ptr,ele);
3387 } else {
3388 dictAdd((dict*)o->ptr,ele,NULL);
3389 }
3390 }
3391 } else if (type == REDIS_ZSET) {
3392 /* Read list/set value */
3393 uint32_t zsetlen;
3394 zset *zs;
3395
3396 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3397 o = createZsetObject();
3398 zs = o->ptr;
3399 /* Load every single element of the list/set */
3400 while(zsetlen--) {
3401 robj *ele;
3402 double *score = zmalloc(sizeof(double));
3403
3404 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3405 tryObjectEncoding(ele);
3406 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3407 dictAdd(zs->dict,ele,score);
3408 zslInsert(zs->zsl,*score,ele);
3409 incrRefCount(ele); /* added to skiplist */
3410 }
3411 } else {
3412 redisAssert(0 != 0);
3413 }
3414 return o;
3415 }
3416
3417 static int rdbLoad(char *filename) {
3418 FILE *fp;
3419 robj *keyobj = NULL;
3420 uint32_t dbid;
3421 int type, retval, rdbver;
3422 dict *d = server.db[0].dict;
3423 redisDb *db = server.db+0;
3424 char buf[1024];
3425 time_t expiretime = -1, now = time(NULL);
3426 long long loadedkeys = 0;
3427
3428 fp = fopen(filename,"r");
3429 if (!fp) return REDIS_ERR;
3430 if (fread(buf,9,1,fp) == 0) goto eoferr;
3431 buf[9] = '\0';
3432 if (memcmp(buf,"REDIS",5) != 0) {
3433 fclose(fp);
3434 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3435 return REDIS_ERR;
3436 }
3437 rdbver = atoi(buf+5);
3438 if (rdbver != 1) {
3439 fclose(fp);
3440 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3441 return REDIS_ERR;
3442 }
3443 while(1) {
3444 robj *o;
3445
3446 /* Read type. */
3447 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3448 if (type == REDIS_EXPIRETIME) {
3449 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3450 /* We read the time so we need to read the object type again */
3451 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3452 }
3453 if (type == REDIS_EOF) break;
3454 /* Handle SELECT DB opcode as a special case */
3455 if (type == REDIS_SELECTDB) {
3456 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3457 goto eoferr;
3458 if (dbid >= (unsigned)server.dbnum) {
3459 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3460 exit(1);
3461 }
3462 db = server.db+dbid;
3463 d = db->dict;
3464 continue;
3465 }
3466 /* Read key */
3467 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3468 /* Read value */
3469 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3470 /* Add the new object in the hash table */
3471 retval = dictAdd(d,keyobj,o);
3472 if (retval == DICT_ERR) {
3473 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3474 exit(1);
3475 }
3476 /* Set the expire time if needed */
3477 if (expiretime != -1) {
3478 setExpire(db,keyobj,expiretime);
3479 /* Delete this key if already expired */
3480 if (expiretime < now) deleteKey(db,keyobj);
3481 expiretime = -1;
3482 }
3483 keyobj = o = NULL;
3484 /* Handle swapping while loading big datasets when VM is on */
3485 loadedkeys++;
3486 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3487 while (zmalloc_used_memory() > server.vm_max_memory) {
3488 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3489 }
3490 }
3491 }
3492 fclose(fp);
3493 return REDIS_OK;
3494
3495 eoferr: /* unexpected end of file is handled here with a fatal exit */
3496 if (keyobj) decrRefCount(keyobj);
3497 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3498 exit(1);
3499 return REDIS_ERR; /* Just to avoid warning */
3500 }
3501
3502 /*================================== Commands =============================== */
3503
3504 static void authCommand(redisClient *c) {
3505 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3506 c->authenticated = 1;
3507 addReply(c,shared.ok);
3508 } else {
3509 c->authenticated = 0;
3510 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3511 }
3512 }
3513
3514 static void pingCommand(redisClient *c) {
3515 addReply(c,shared.pong);
3516 }
3517
3518 static void echoCommand(redisClient *c) {
3519 addReplyBulkLen(c,c->argv[1]);
3520 addReply(c,c->argv[1]);
3521 addReply(c,shared.crlf);
3522 }
3523
3524 /*=================================== Strings =============================== */
3525
3526 static void setGenericCommand(redisClient *c, int nx) {
3527 int retval;
3528
3529 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3530 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3531 if (retval == DICT_ERR) {
3532 if (!nx) {
3533 /* If the key is about a swapped value, we want a new key object
3534 * to overwrite the old. So we delete the old key in the database.
3535 * This will also make sure that swap pages about the old object
3536 * will be marked as free. */
3537 if (deleteIfSwapped(c->db,c->argv[1]))
3538 incrRefCount(c->argv[1]);
3539 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3540 incrRefCount(c->argv[2]);
3541 } else {
3542 addReply(c,shared.czero);
3543 return;
3544 }
3545 } else {
3546 incrRefCount(c->argv[1]);
3547 incrRefCount(c->argv[2]);
3548 }
3549 server.dirty++;
3550 removeExpire(c->db,c->argv[1]);
3551 addReply(c, nx ? shared.cone : shared.ok);
3552 }
3553
3554 static void setCommand(redisClient *c) {
3555 setGenericCommand(c,0);
3556 }
3557
3558 static void setnxCommand(redisClient *c) {
3559 setGenericCommand(c,1);
3560 }
3561
3562 static int getGenericCommand(redisClient *c) {
3563 robj *o = lookupKeyRead(c->db,c->argv[1]);
3564
3565 if (o == NULL) {
3566 addReply(c,shared.nullbulk);
3567 return REDIS_OK;
3568 } else {
3569 if (o->type != REDIS_STRING) {
3570 addReply(c,shared.wrongtypeerr);
3571 return REDIS_ERR;
3572 } else {
3573 addReplyBulkLen(c,o);
3574 addReply(c,o);
3575 addReply(c,shared.crlf);
3576 return REDIS_OK;
3577 }
3578 }
3579 }
3580
3581 static void getCommand(redisClient *c) {
3582 getGenericCommand(c);
3583 }
3584
3585 static void getsetCommand(redisClient *c) {
3586 if (getGenericCommand(c) == REDIS_ERR) return;
3587 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3588 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3589 } else {
3590 incrRefCount(c->argv[1]);
3591 }
3592 incrRefCount(c->argv[2]);
3593 server.dirty++;
3594 removeExpire(c->db,c->argv[1]);
3595 }
3596
3597 static void mgetCommand(redisClient *c) {
3598 int j;
3599
3600 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3601 for (j = 1; j < c->argc; j++) {
3602 robj *o = lookupKeyRead(c->db,c->argv[j]);
3603 if (o == NULL) {
3604 addReply(c,shared.nullbulk);
3605 } else {
3606 if (o->type != REDIS_STRING) {
3607 addReply(c,shared.nullbulk);
3608 } else {
3609 addReplyBulkLen(c,o);
3610 addReply(c,o);
3611 addReply(c,shared.crlf);
3612 }
3613 }
3614 }
3615 }
3616
3617 static void msetGenericCommand(redisClient *c, int nx) {
3618 int j, busykeys = 0;
3619
3620 if ((c->argc % 2) == 0) {
3621 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3622 return;
3623 }
3624 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3625 * set nothing at all if at least one already key exists. */
3626 if (nx) {
3627 for (j = 1; j < c->argc; j += 2) {
3628 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3629 busykeys++;
3630 }
3631 }
3632 }
3633 if (busykeys) {
3634 addReply(c, shared.czero);
3635 return;
3636 }
3637
3638 for (j = 1; j < c->argc; j += 2) {
3639 int retval;
3640
3641 tryObjectEncoding(c->argv[j+1]);
3642 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3643 if (retval == DICT_ERR) {
3644 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3645 incrRefCount(c->argv[j+1]);
3646 } else {
3647 incrRefCount(c->argv[j]);
3648 incrRefCount(c->argv[j+1]);
3649 }
3650 removeExpire(c->db,c->argv[j]);
3651 }
3652 server.dirty += (c->argc-1)/2;
3653 addReply(c, nx ? shared.cone : shared.ok);
3654 }
3655
3656 static void msetCommand(redisClient *c) {
3657 msetGenericCommand(c,0);
3658 }
3659
3660 static void msetnxCommand(redisClient *c) {
3661 msetGenericCommand(c,1);
3662 }
3663
3664 static void incrDecrCommand(redisClient *c, long long incr) {
3665 long long value;
3666 int retval;
3667 robj *o;
3668
3669 o = lookupKeyWrite(c->db,c->argv[1]);
3670 if (o == NULL) {
3671 value = 0;
3672 } else {
3673 if (o->type != REDIS_STRING) {
3674 value = 0;
3675 } else {
3676 char *eptr;
3677
3678 if (o->encoding == REDIS_ENCODING_RAW)
3679 value = strtoll(o->ptr, &eptr, 10);
3680 else if (o->encoding == REDIS_ENCODING_INT)
3681 value = (long)o->ptr;
3682 else
3683 redisAssert(1 != 1);
3684 }
3685 }
3686
3687 value += incr;
3688 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3689 tryObjectEncoding(o);
3690 retval = dictAdd(c->db->dict,c->argv[1],o);
3691 if (retval == DICT_ERR) {
3692 dictReplace(c->db->dict,c->argv[1],o);
3693 removeExpire(c->db,c->argv[1]);
3694 } else {
3695 incrRefCount(c->argv[1]);
3696 }
3697 server.dirty++;
3698 addReply(c,shared.colon);
3699 addReply(c,o);
3700 addReply(c,shared.crlf);
3701 }
3702
3703 static void incrCommand(redisClient *c) {
3704 incrDecrCommand(c,1);
3705 }
3706
3707 static void decrCommand(redisClient *c) {
3708 incrDecrCommand(c,-1);
3709 }
3710
3711 static void incrbyCommand(redisClient *c) {
3712 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3713 incrDecrCommand(c,incr);
3714 }
3715
3716 static void decrbyCommand(redisClient *c) {
3717 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3718 incrDecrCommand(c,-incr);
3719 }
3720
3721 static void appendCommand(redisClient *c) {
3722 int retval;
3723 size_t totlen;
3724 robj *o;
3725
3726 o = lookupKeyWrite(c->db,c->argv[1]);
3727 if (o == NULL) {
3728 /* Create the key */
3729 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3730 incrRefCount(c->argv[1]);
3731 incrRefCount(c->argv[2]);
3732 totlen = stringObjectLen(c->argv[2]);
3733 } else {
3734 dictEntry *de;
3735
3736 de = dictFind(c->db->dict,c->argv[1]);
3737 assert(de != NULL);
3738
3739 o = dictGetEntryVal(de);
3740 if (o->type != REDIS_STRING) {
3741 addReply(c,shared.wrongtypeerr);
3742 return;
3743 }
3744 /* If the object is specially encoded or shared we have to make
3745 * a copy */
3746 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3747 robj *decoded = getDecodedObject(o);
3748
3749 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3750 decrRefCount(decoded);
3751 dictReplace(c->db->dict,c->argv[1],o);
3752 }
3753 /* APPEND! */
3754 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3755 o->ptr = sdscatlen(o->ptr,
3756 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3757 } else {
3758 o->ptr = sdscatprintf(o->ptr, "%ld",
3759 (unsigned long) c->argv[2]->ptr);
3760 }
3761 totlen = sdslen(o->ptr);
3762 }
3763 server.dirty++;
3764 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3765 }
3766
3767 /* ========================= Type agnostic commands ========================= */
3768
3769 static void delCommand(redisClient *c) {
3770 int deleted = 0, j;
3771
3772 for (j = 1; j < c->argc; j++) {
3773 if (deleteKey(c->db,c->argv[j])) {
3774 server.dirty++;
3775 deleted++;
3776 }
3777 }
3778 switch(deleted) {
3779 case 0:
3780 addReply(c,shared.czero);
3781 break;
3782 case 1:
3783 addReply(c,shared.cone);
3784 break;
3785 default:
3786 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3787 break;
3788 }
3789 }
3790
3791 static void existsCommand(redisClient *c) {
3792 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
3793 }
3794
3795 static void selectCommand(redisClient *c) {
3796 int id = atoi(c->argv[1]->ptr);
3797
3798 if (selectDb(c,id) == REDIS_ERR) {
3799 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
3800 } else {
3801 addReply(c,shared.ok);
3802 }
3803 }
3804
3805 static void randomkeyCommand(redisClient *c) {
3806 dictEntry *de;
3807
3808 while(1) {
3809 de = dictGetRandomKey(c->db->dict);
3810 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3811 }
3812 if (de == NULL) {
3813 addReply(c,shared.plus);
3814 addReply(c,shared.crlf);
3815 } else {
3816 addReply(c,shared.plus);
3817 addReply(c,dictGetEntryKey(de));
3818 addReply(c,shared.crlf);
3819 }
3820 }
3821
3822 static void keysCommand(redisClient *c) {
3823 dictIterator *di;
3824 dictEntry *de;
3825 sds pattern = c->argv[1]->ptr;
3826 int plen = sdslen(pattern);
3827 unsigned long numkeys = 0;
3828 robj *lenobj = createObject(REDIS_STRING,NULL);
3829
3830 di = dictGetIterator(c->db->dict);
3831 addReply(c,lenobj);
3832 decrRefCount(lenobj);
3833 while((de = dictNext(di)) != NULL) {
3834 robj *keyobj = dictGetEntryKey(de);
3835
3836 sds key = keyobj->ptr;
3837 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3838 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3839 if (expireIfNeeded(c->db,keyobj) == 0) {
3840 addReplyBulkLen(c,keyobj);
3841 addReply(c,keyobj);
3842 addReply(c,shared.crlf);
3843 numkeys++;
3844 }
3845 }
3846 }
3847 dictReleaseIterator(di);
3848 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
3849 }
3850
3851 static void dbsizeCommand(redisClient *c) {
3852 addReplySds(c,
3853 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
3854 }
3855
3856 static void lastsaveCommand(redisClient *c) {
3857 addReplySds(c,
3858 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
3859 }
3860
3861 static void typeCommand(redisClient *c) {
3862 robj *o;
3863 char *type;
3864
3865 o = lookupKeyRead(c->db,c->argv[1]);
3866 if (o == NULL) {
3867 type = "+none";
3868 } else {
3869 switch(o->type) {
3870 case REDIS_STRING: type = "+string"; break;
3871 case REDIS_LIST: type = "+list"; break;
3872 case REDIS_SET: type = "+set"; break;
3873 case REDIS_ZSET: type = "+zset"; break;
3874 default: type = "unknown"; break;
3875 }
3876 }
3877 addReplySds(c,sdsnew(type));
3878 addReply(c,shared.crlf);
3879 }
3880
3881 static void saveCommand(redisClient *c) {
3882 if (server.bgsavechildpid != -1) {
3883 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3884 return;
3885 }
3886 if (rdbSave(server.dbfilename) == REDIS_OK) {
3887 addReply(c,shared.ok);
3888 } else {
3889 addReply(c,shared.err);
3890 }
3891 }
3892
3893 static void bgsaveCommand(redisClient *c) {
3894 if (server.bgsavechildpid != -1) {
3895 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3896 return;
3897 }
3898 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
3899 char *status = "+Background saving started\r\n";
3900 addReplySds(c,sdsnew(status));
3901 } else {
3902 addReply(c,shared.err);
3903 }
3904 }
3905
3906 static void shutdownCommand(redisClient *c) {
3907 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
3908 /* Kill the saving child if there is a background saving in progress.
3909 We want to avoid race conditions, for instance our saving child may
3910 overwrite the synchronous saving did by SHUTDOWN. */
3911 if (server.bgsavechildpid != -1) {
3912 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3913 kill(server.bgsavechildpid,SIGKILL);
3914 rdbRemoveTempFile(server.bgsavechildpid);
3915 }
3916 if (server.appendonly) {
3917 /* Append only file: fsync() the AOF and exit */
3918 fsync(server.appendfd);
3919 if (server.vm_enabled) unlink(server.vm_swap_file);
3920 exit(0);
3921 } else {
3922 /* Snapshotting. Perform a SYNC SAVE and exit */
3923 if (rdbSave(server.dbfilename) == REDIS_OK) {
3924 if (server.daemonize)
3925 unlink(server.pidfile);
3926 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3927 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
3928 if (server.vm_enabled) unlink(server.vm_swap_file);
3929 exit(0);
3930 } else {
3931 /* Ooops.. error saving! The best we can do is to continue operating.
3932 * Note that if there was a background saving process, in the next
3933 * cron() Redis will be notified that the background saving aborted,
3934 * handling special stuff like slaves pending for synchronization... */
3935 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
3936 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3937 }
3938 }
3939 }
3940
3941 static void renameGenericCommand(redisClient *c, int nx) {
3942 robj *o;
3943
3944 /* To use the same key as src and dst is probably an error */
3945 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
3946 addReply(c,shared.sameobjecterr);
3947 return;
3948 }
3949
3950 o = lookupKeyWrite(c->db,c->argv[1]);
3951 if (o == NULL) {
3952 addReply(c,shared.nokeyerr);
3953 return;
3954 }
3955 incrRefCount(o);
3956 deleteIfVolatile(c->db,c->argv[2]);
3957 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
3958 if (nx) {
3959 decrRefCount(o);
3960 addReply(c,shared.czero);
3961 return;
3962 }
3963 dictReplace(c->db->dict,c->argv[2],o);
3964 } else {
3965 incrRefCount(c->argv[2]);
3966 }
3967 deleteKey(c->db,c->argv[1]);
3968 server.dirty++;
3969 addReply(c,nx ? shared.cone : shared.ok);
3970 }
3971
3972 static void renameCommand(redisClient *c) {
3973 renameGenericCommand(c,0);
3974 }
3975
3976 static void renamenxCommand(redisClient *c) {
3977 renameGenericCommand(c,1);
3978 }
3979
3980 static void moveCommand(redisClient *c) {
3981 robj *o;
3982 redisDb *src, *dst;
3983 int srcid;
3984
3985 /* Obtain source and target DB pointers */
3986 src = c->db;
3987 srcid = c->db->id;
3988 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
3989 addReply(c,shared.outofrangeerr);
3990 return;
3991 }
3992 dst = c->db;
3993 selectDb(c,srcid); /* Back to the source DB */
3994
3995 /* If the user is moving using as target the same
3996 * DB as the source DB it is probably an error. */
3997 if (src == dst) {
3998 addReply(c,shared.sameobjecterr);
3999 return;
4000 }
4001
4002 /* Check if the element exists and get a reference */
4003 o = lookupKeyWrite(c->db,c->argv[1]);
4004 if (!o) {
4005 addReply(c,shared.czero);
4006 return;
4007 }
4008
4009 /* Try to add the element to the target DB */
4010 deleteIfVolatile(dst,c->argv[1]);
4011 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4012 addReply(c,shared.czero);
4013 return;
4014 }
4015 incrRefCount(c->argv[1]);
4016 incrRefCount(o);
4017
4018 /* OK! key moved, free the entry in the source DB */
4019 deleteKey(src,c->argv[1]);
4020 server.dirty++;
4021 addReply(c,shared.cone);
4022 }
4023
4024 /* =================================== Lists ================================ */
4025 static void pushGenericCommand(redisClient *c, int where) {
4026 robj *lobj;
4027 list *list;
4028
4029 lobj = lookupKeyWrite(c->db,c->argv[1]);
4030 if (lobj == NULL) {
4031 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4032 addReply(c,shared.ok);
4033 return;
4034 }
4035 lobj = createListObject();
4036 list = lobj->ptr;
4037 if (where == REDIS_HEAD) {
4038 listAddNodeHead(list,c->argv[2]);
4039 } else {
4040 listAddNodeTail(list,c->argv[2]);
4041 }
4042 dictAdd(c->db->dict,c->argv[1],lobj);
4043 incrRefCount(c->argv[1]);
4044 incrRefCount(c->argv[2]);
4045 } else {
4046 if (lobj->type != REDIS_LIST) {
4047 addReply(c,shared.wrongtypeerr);
4048 return;
4049 }
4050 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4051 addReply(c,shared.ok);
4052 return;
4053 }
4054 list = lobj->ptr;
4055 if (where == REDIS_HEAD) {
4056 listAddNodeHead(list,c->argv[2]);
4057 } else {
4058 listAddNodeTail(list,c->argv[2]);
4059 }
4060 incrRefCount(c->argv[2]);
4061 }
4062 server.dirty++;
4063 addReply(c,shared.ok);
4064 }
4065
4066 static void lpushCommand(redisClient *c) {
4067 pushGenericCommand(c,REDIS_HEAD);
4068 }
4069
4070 static void rpushCommand(redisClient *c) {
4071 pushGenericCommand(c,REDIS_TAIL);
4072 }
4073
4074 static void llenCommand(redisClient *c) {
4075 robj *o;
4076 list *l;
4077
4078 o = lookupKeyRead(c->db,c->argv[1]);
4079 if (o == NULL) {
4080 addReply(c,shared.czero);
4081 return;
4082 } else {
4083 if (o->type != REDIS_LIST) {
4084 addReply(c,shared.wrongtypeerr);
4085 } else {
4086 l = o->ptr;
4087 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4088 }
4089 }
4090 }
4091
4092 static void lindexCommand(redisClient *c) {
4093 robj *o;
4094 int index = atoi(c->argv[2]->ptr);
4095
4096 o = lookupKeyRead(c->db,c->argv[1]);
4097 if (o == NULL) {
4098 addReply(c,shared.nullbulk);
4099 } else {
4100 if (o->type != REDIS_LIST) {
4101 addReply(c,shared.wrongtypeerr);
4102 } else {
4103 list *list = o->ptr;
4104 listNode *ln;
4105
4106 ln = listIndex(list, index);
4107 if (ln == NULL) {
4108 addReply(c,shared.nullbulk);
4109 } else {
4110 robj *ele = listNodeValue(ln);
4111 addReplyBulkLen(c,ele);
4112 addReply(c,ele);
4113 addReply(c,shared.crlf);
4114 }
4115 }
4116 }
4117 }
4118
4119 static void lsetCommand(redisClient *c) {
4120 robj *o;
4121 int index = atoi(c->argv[2]->ptr);
4122
4123 o = lookupKeyWrite(c->db,c->argv[1]);
4124 if (o == NULL) {
4125 addReply(c,shared.nokeyerr);
4126 } else {
4127 if (o->type != REDIS_LIST) {
4128 addReply(c,shared.wrongtypeerr);
4129 } else {
4130 list *list = o->ptr;
4131 listNode *ln;
4132
4133 ln = listIndex(list, index);
4134 if (ln == NULL) {
4135 addReply(c,shared.outofrangeerr);
4136 } else {
4137 robj *ele = listNodeValue(ln);
4138
4139 decrRefCount(ele);
4140 listNodeValue(ln) = c->argv[3];
4141 incrRefCount(c->argv[3]);
4142 addReply(c,shared.ok);
4143 server.dirty++;
4144 }
4145 }
4146 }
4147 }
4148
4149 static void popGenericCommand(redisClient *c, int where) {
4150 robj *o;
4151
4152 o = lookupKeyWrite(c->db,c->argv[1]);
4153 if (o == NULL) {
4154 addReply(c,shared.nullbulk);
4155 } else {
4156 if (o->type != REDIS_LIST) {
4157 addReply(c,shared.wrongtypeerr);
4158 } else {
4159 list *list = o->ptr;
4160 listNode *ln;
4161
4162 if (where == REDIS_HEAD)
4163 ln = listFirst(list);
4164 else
4165 ln = listLast(list);
4166
4167 if (ln == NULL) {
4168 addReply(c,shared.nullbulk);
4169 } else {
4170 robj *ele = listNodeValue(ln);
4171 addReplyBulkLen(c,ele);
4172 addReply(c,ele);
4173 addReply(c,shared.crlf);
4174 listDelNode(list,ln);
4175 server.dirty++;
4176 }
4177 }
4178 }
4179 }
4180
4181 static void lpopCommand(redisClient *c) {
4182 popGenericCommand(c,REDIS_HEAD);
4183 }
4184
4185 static void rpopCommand(redisClient *c) {
4186 popGenericCommand(c,REDIS_TAIL);
4187 }
4188
4189 static void lrangeCommand(redisClient *c) {
4190 robj *o;
4191 int start = atoi(c->argv[2]->ptr);
4192 int end = atoi(c->argv[3]->ptr);
4193
4194 o = lookupKeyRead(c->db,c->argv[1]);
4195 if (o == NULL) {
4196 addReply(c,shared.nullmultibulk);
4197 } else {
4198 if (o->type != REDIS_LIST) {
4199 addReply(c,shared.wrongtypeerr);
4200 } else {
4201 list *list = o->ptr;
4202 listNode *ln;
4203 int llen = listLength(list);
4204 int rangelen, j;
4205 robj *ele;
4206
4207 /* convert negative indexes */
4208 if (start < 0) start = llen+start;
4209 if (end < 0) end = llen+end;
4210 if (start < 0) start = 0;
4211 if (end < 0) end = 0;
4212
4213 /* indexes sanity checks */
4214 if (start > end || start >= llen) {
4215 /* Out of range start or start > end result in empty list */
4216 addReply(c,shared.emptymultibulk);
4217 return;
4218 }
4219 if (end >= llen) end = llen-1;
4220 rangelen = (end-start)+1;
4221
4222 /* Return the result in form of a multi-bulk reply */
4223 ln = listIndex(list, start);
4224 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4225 for (j = 0; j < rangelen; j++) {
4226 ele = listNodeValue(ln);
4227 addReplyBulkLen(c,ele);
4228 addReply(c,ele);
4229 addReply(c,shared.crlf);
4230 ln = ln->next;
4231 }
4232 }
4233 }
4234 }
4235
4236 static void ltrimCommand(redisClient *c) {
4237 robj *o;
4238 int start = atoi(c->argv[2]->ptr);
4239 int end = atoi(c->argv[3]->ptr);
4240
4241 o = lookupKeyWrite(c->db,c->argv[1]);
4242 if (o == NULL) {
4243 addReply(c,shared.ok);
4244 } else {
4245 if (o->type != REDIS_LIST) {
4246 addReply(c,shared.wrongtypeerr);
4247 } else {
4248 list *list = o->ptr;
4249 listNode *ln;
4250 int llen = listLength(list);
4251 int j, ltrim, rtrim;
4252
4253 /* convert negative indexes */
4254 if (start < 0) start = llen+start;
4255 if (end < 0) end = llen+end;
4256 if (start < 0) start = 0;
4257 if (end < 0) end = 0;
4258
4259 /* indexes sanity checks */
4260 if (start > end || start >= llen) {
4261 /* Out of range start or start > end result in empty list */
4262 ltrim = llen;
4263 rtrim = 0;
4264 } else {
4265 if (end >= llen) end = llen-1;
4266 ltrim = start;
4267 rtrim = llen-end-1;
4268 }
4269
4270 /* Remove list elements to perform the trim */
4271 for (j = 0; j < ltrim; j++) {
4272 ln = listFirst(list);
4273 listDelNode(list,ln);
4274 }
4275 for (j = 0; j < rtrim; j++) {
4276 ln = listLast(list);
4277 listDelNode(list,ln);
4278 }
4279 server.dirty++;
4280 addReply(c,shared.ok);
4281 }
4282 }
4283 }
4284
4285 static void lremCommand(redisClient *c) {
4286 robj *o;
4287
4288 o = lookupKeyWrite(c->db,c->argv[1]);
4289 if (o == NULL) {
4290 addReply(c,shared.czero);
4291 } else {
4292 if (o->type != REDIS_LIST) {
4293 addReply(c,shared.wrongtypeerr);
4294 } else {
4295 list *list = o->ptr;
4296 listNode *ln, *next;
4297 int toremove = atoi(c->argv[2]->ptr);
4298 int removed = 0;
4299 int fromtail = 0;
4300
4301 if (toremove < 0) {
4302 toremove = -toremove;
4303 fromtail = 1;
4304 }
4305 ln = fromtail ? list->tail : list->head;
4306 while (ln) {
4307 robj *ele = listNodeValue(ln);
4308
4309 next = fromtail ? ln->prev : ln->next;
4310 if (compareStringObjects(ele,c->argv[3]) == 0) {
4311 listDelNode(list,ln);
4312 server.dirty++;
4313 removed++;
4314 if (toremove && removed == toremove) break;
4315 }
4316 ln = next;
4317 }
4318 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4319 }
4320 }
4321 }
4322
4323 /* This is the semantic of this command:
4324 * RPOPLPUSH srclist dstlist:
4325 * IF LLEN(srclist) > 0
4326 * element = RPOP srclist
4327 * LPUSH dstlist element
4328 * RETURN element
4329 * ELSE
4330 * RETURN nil
4331 * END
4332 * END
4333 *
4334 * The idea is to be able to get an element from a list in a reliable way
4335 * since the element is not just returned but pushed against another list
4336 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4337 */
4338 static void rpoplpushcommand(redisClient *c) {
4339 robj *sobj;
4340
4341 sobj = lookupKeyWrite(c->db,c->argv[1]);
4342 if (sobj == NULL) {
4343 addReply(c,shared.nullbulk);
4344 } else {
4345 if (sobj->type != REDIS_LIST) {
4346 addReply(c,shared.wrongtypeerr);
4347 } else {
4348 list *srclist = sobj->ptr;
4349 listNode *ln = listLast(srclist);
4350
4351 if (ln == NULL) {
4352 addReply(c,shared.nullbulk);
4353 } else {
4354 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4355 robj *ele = listNodeValue(ln);
4356 list *dstlist;
4357
4358 if (dobj && dobj->type != REDIS_LIST) {
4359 addReply(c,shared.wrongtypeerr);
4360 return;
4361 }
4362
4363 /* Add the element to the target list (unless it's directly
4364 * passed to some BLPOP-ing client */
4365 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4366 if (dobj == NULL) {
4367 /* Create the list if the key does not exist */
4368 dobj = createListObject();
4369 dictAdd(c->db->dict,c->argv[2],dobj);
4370 incrRefCount(c->argv[2]);
4371 }
4372 dstlist = dobj->ptr;
4373 listAddNodeHead(dstlist,ele);
4374 incrRefCount(ele);
4375 }
4376
4377 /* Send the element to the client as reply as well */
4378 addReplyBulkLen(c,ele);
4379 addReply(c,ele);
4380 addReply(c,shared.crlf);
4381
4382 /* Finally remove the element from the source list */
4383 listDelNode(srclist,ln);
4384 server.dirty++;
4385 }
4386 }
4387 }
4388 }
4389
4390
4391 /* ==================================== Sets ================================ */
4392
4393 static void saddCommand(redisClient *c) {
4394 robj *set;
4395
4396 set = lookupKeyWrite(c->db,c->argv[1]);
4397 if (set == NULL) {
4398 set = createSetObject();
4399 dictAdd(c->db->dict,c->argv[1],set);
4400 incrRefCount(c->argv[1]);
4401 } else {
4402 if (set->type != REDIS_SET) {
4403 addReply(c,shared.wrongtypeerr);
4404 return;
4405 }
4406 }
4407 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4408 incrRefCount(c->argv[2]);
4409 server.dirty++;
4410 addReply(c,shared.cone);
4411 } else {
4412 addReply(c,shared.czero);
4413 }
4414 }
4415
4416 static void sremCommand(redisClient *c) {
4417 robj *set;
4418
4419 set = lookupKeyWrite(c->db,c->argv[1]);
4420 if (set == NULL) {
4421 addReply(c,shared.czero);
4422 } else {
4423 if (set->type != REDIS_SET) {
4424 addReply(c,shared.wrongtypeerr);
4425 return;
4426 }
4427 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4428 server.dirty++;
4429 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4430 addReply(c,shared.cone);
4431 } else {
4432 addReply(c,shared.czero);
4433 }
4434 }
4435 }
4436
4437 static void smoveCommand(redisClient *c) {
4438 robj *srcset, *dstset;
4439
4440 srcset = lookupKeyWrite(c->db,c->argv[1]);
4441 dstset = lookupKeyWrite(c->db,c->argv[2]);
4442
4443 /* If the source key does not exist return 0, if it's of the wrong type
4444 * raise an error */
4445 if (srcset == NULL || srcset->type != REDIS_SET) {
4446 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4447 return;
4448 }
4449 /* Error if the destination key is not a set as well */
4450 if (dstset && dstset->type != REDIS_SET) {
4451 addReply(c,shared.wrongtypeerr);
4452 return;
4453 }
4454 /* Remove the element from the source set */
4455 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4456 /* Key not found in the src set! return zero */
4457 addReply(c,shared.czero);
4458 return;
4459 }
4460 server.dirty++;
4461 /* Add the element to the destination set */
4462 if (!dstset) {
4463 dstset = createSetObject();
4464 dictAdd(c->db->dict,c->argv[2],dstset);
4465 incrRefCount(c->argv[2]);
4466 }
4467 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4468 incrRefCount(c->argv[3]);
4469 addReply(c,shared.cone);
4470 }
4471
4472 static void sismemberCommand(redisClient *c) {
4473 robj *set;
4474
4475 set = lookupKeyRead(c->db,c->argv[1]);
4476 if (set == NULL) {
4477 addReply(c,shared.czero);
4478 } else {
4479 if (set->type != REDIS_SET) {
4480 addReply(c,shared.wrongtypeerr);
4481 return;
4482 }
4483 if (dictFind(set->ptr,c->argv[2]))
4484 addReply(c,shared.cone);
4485 else
4486 addReply(c,shared.czero);
4487 }
4488 }
4489
4490 static void scardCommand(redisClient *c) {
4491 robj *o;
4492 dict *s;
4493
4494 o = lookupKeyRead(c->db,c->argv[1]);
4495 if (o == NULL) {
4496 addReply(c,shared.czero);
4497 return;
4498 } else {
4499 if (o->type != REDIS_SET) {
4500 addReply(c,shared.wrongtypeerr);
4501 } else {
4502 s = o->ptr;
4503 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4504 dictSize(s)));
4505 }
4506 }
4507 }
4508
4509 static void spopCommand(redisClient *c) {
4510 robj *set;
4511 dictEntry *de;
4512
4513 set = lookupKeyWrite(c->db,c->argv[1]);
4514 if (set == NULL) {
4515 addReply(c,shared.nullbulk);
4516 } else {
4517 if (set->type != REDIS_SET) {
4518 addReply(c,shared.wrongtypeerr);
4519 return;
4520 }
4521 de = dictGetRandomKey(set->ptr);
4522 if (de == NULL) {
4523 addReply(c,shared.nullbulk);
4524 } else {
4525 robj *ele = dictGetEntryKey(de);
4526
4527 addReplyBulkLen(c,ele);
4528 addReply(c,ele);
4529 addReply(c,shared.crlf);
4530 dictDelete(set->ptr,ele);
4531 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4532 server.dirty++;
4533 }
4534 }
4535 }
4536
4537 static void srandmemberCommand(redisClient *c) {
4538 robj *set;
4539 dictEntry *de;
4540
4541 set = lookupKeyRead(c->db,c->argv[1]);
4542 if (set == NULL) {
4543 addReply(c,shared.nullbulk);
4544 } else {
4545 if (set->type != REDIS_SET) {
4546 addReply(c,shared.wrongtypeerr);
4547 return;
4548 }
4549 de = dictGetRandomKey(set->ptr);
4550 if (de == NULL) {
4551 addReply(c,shared.nullbulk);
4552 } else {
4553 robj *ele = dictGetEntryKey(de);
4554
4555 addReplyBulkLen(c,ele);
4556 addReply(c,ele);
4557 addReply(c,shared.crlf);
4558 }
4559 }
4560 }
4561
4562 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4563 dict **d1 = (void*) s1, **d2 = (void*) s2;
4564
4565 return dictSize(*d1)-dictSize(*d2);
4566 }
4567
4568 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4569 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4570 dictIterator *di;
4571 dictEntry *de;
4572 robj *lenobj = NULL, *dstset = NULL;
4573 unsigned long j, cardinality = 0;
4574
4575 for (j = 0; j < setsnum; j++) {
4576 robj *setobj;
4577
4578 setobj = dstkey ?
4579 lookupKeyWrite(c->db,setskeys[j]) :
4580 lookupKeyRead(c->db,setskeys[j]);
4581 if (!setobj) {
4582 zfree(dv);
4583 if (dstkey) {
4584 if (deleteKey(c->db,dstkey))
4585 server.dirty++;
4586 addReply(c,shared.czero);
4587 } else {
4588 addReply(c,shared.nullmultibulk);
4589 }
4590 return;
4591 }
4592 if (setobj->type != REDIS_SET) {
4593 zfree(dv);
4594 addReply(c,shared.wrongtypeerr);
4595 return;
4596 }
4597 dv[j] = setobj->ptr;
4598 }
4599 /* Sort sets from the smallest to largest, this will improve our
4600 * algorithm's performace */
4601 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4602
4603 /* The first thing we should output is the total number of elements...
4604 * since this is a multi-bulk write, but at this stage we don't know
4605 * the intersection set size, so we use a trick, append an empty object
4606 * to the output list and save the pointer to later modify it with the
4607 * right length */
4608 if (!dstkey) {
4609 lenobj = createObject(REDIS_STRING,NULL);
4610 addReply(c,lenobj);
4611 decrRefCount(lenobj);
4612 } else {
4613 /* If we have a target key where to store the resulting set
4614 * create this key with an empty set inside */
4615 dstset = createSetObject();
4616 }
4617
4618 /* Iterate all the elements of the first (smallest) set, and test
4619 * the element against all the other sets, if at least one set does
4620 * not include the element it is discarded */
4621 di = dictGetIterator(dv[0]);
4622
4623 while((de = dictNext(di)) != NULL) {
4624 robj *ele;
4625
4626 for (j = 1; j < setsnum; j++)
4627 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4628 if (j != setsnum)
4629 continue; /* at least one set does not contain the member */
4630 ele = dictGetEntryKey(de);
4631 if (!dstkey) {
4632 addReplyBulkLen(c,ele);
4633 addReply(c,ele);
4634 addReply(c,shared.crlf);
4635 cardinality++;
4636 } else {
4637 dictAdd(dstset->ptr,ele,NULL);
4638 incrRefCount(ele);
4639 }
4640 }
4641 dictReleaseIterator(di);
4642
4643 if (dstkey) {
4644 /* Store the resulting set into the target */
4645 deleteKey(c->db,dstkey);
4646 dictAdd(c->db->dict,dstkey,dstset);
4647 incrRefCount(dstkey);
4648 }
4649
4650 if (!dstkey) {
4651 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4652 } else {
4653 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4654 dictSize((dict*)dstset->ptr)));
4655 server.dirty++;
4656 }
4657 zfree(dv);
4658 }
4659
4660 static void sinterCommand(redisClient *c) {
4661 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4662 }
4663
4664 static void sinterstoreCommand(redisClient *c) {
4665 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4666 }
4667
4668 #define REDIS_OP_UNION 0
4669 #define REDIS_OP_DIFF 1
4670
4671 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4672 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4673 dictIterator *di;
4674 dictEntry *de;
4675 robj *dstset = NULL;
4676 int j, cardinality = 0;
4677
4678 for (j = 0; j < setsnum; j++) {
4679 robj *setobj;
4680
4681 setobj = dstkey ?
4682 lookupKeyWrite(c->db,setskeys[j]) :
4683 lookupKeyRead(c->db,setskeys[j]);
4684 if (!setobj) {
4685 dv[j] = NULL;
4686 continue;
4687 }
4688 if (setobj->type != REDIS_SET) {
4689 zfree(dv);
4690 addReply(c,shared.wrongtypeerr);
4691 return;
4692 }
4693 dv[j] = setobj->ptr;
4694 }
4695
4696 /* We need a temp set object to store our union. If the dstkey
4697 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4698 * this set object will be the resulting object to set into the target key*/
4699 dstset = createSetObject();
4700
4701 /* Iterate all the elements of all the sets, add every element a single
4702 * time to the result set */
4703 for (j = 0; j < setsnum; j++) {
4704 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4705 if (!dv[j]) continue; /* non existing keys are like empty sets */
4706
4707 di = dictGetIterator(dv[j]);
4708
4709 while((de = dictNext(di)) != NULL) {
4710 robj *ele;
4711
4712 /* dictAdd will not add the same element multiple times */
4713 ele = dictGetEntryKey(de);
4714 if (op == REDIS_OP_UNION || j == 0) {
4715 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4716 incrRefCount(ele);
4717 cardinality++;
4718 }
4719 } else if (op == REDIS_OP_DIFF) {
4720 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4721 cardinality--;
4722 }
4723 }
4724 }
4725 dictReleaseIterator(di);
4726
4727 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4728 }
4729
4730 /* Output the content of the resulting set, if not in STORE mode */
4731 if (!dstkey) {
4732 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4733 di = dictGetIterator(dstset->ptr);
4734 while((de = dictNext(di)) != NULL) {
4735 robj *ele;
4736
4737 ele = dictGetEntryKey(de);
4738 addReplyBulkLen(c,ele);
4739 addReply(c,ele);
4740 addReply(c,shared.crlf);
4741 }
4742 dictReleaseIterator(di);
4743 } else {
4744 /* If we have a target key where to store the resulting set
4745 * create this key with the result set inside */
4746 deleteKey(c->db,dstkey);
4747 dictAdd(c->db->dict,dstkey,dstset);
4748 incrRefCount(dstkey);
4749 }
4750
4751 /* Cleanup */
4752 if (!dstkey) {
4753 decrRefCount(dstset);
4754 } else {
4755 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4756 dictSize((dict*)dstset->ptr)));
4757 server.dirty++;
4758 }
4759 zfree(dv);
4760 }
4761
4762 static void sunionCommand(redisClient *c) {
4763 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4764 }
4765
4766 static void sunionstoreCommand(redisClient *c) {
4767 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4768 }
4769
4770 static void sdiffCommand(redisClient *c) {
4771 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4772 }
4773
4774 static void sdiffstoreCommand(redisClient *c) {
4775 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4776 }
4777
4778 /* ==================================== ZSets =============================== */
4779
4780 /* ZSETs are ordered sets using two data structures to hold the same elements
4781 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4782 * data structure.
4783 *
4784 * The elements are added to a hash table mapping Redis objects to scores.
4785 * At the same time the elements are added to a skip list mapping scores
4786 * to Redis objects (so objects are sorted by scores in this "view"). */
4787
4788 /* This skiplist implementation is almost a C translation of the original
4789 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4790 * Alternative to Balanced Trees", modified in three ways:
4791 * a) this implementation allows for repeated values.
4792 * b) the comparison is not just by key (our 'score') but by satellite data.
4793 * c) there is a back pointer, so it's a doubly linked list with the back
4794 * pointers being only at "level 1". This allows to traverse the list
4795 * from tail to head, useful for ZREVRANGE. */
4796
4797 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4798 zskiplistNode *zn = zmalloc(sizeof(*zn));
4799
4800 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4801 zn->span = zmalloc(sizeof(unsigned long) * level);
4802 zn->score = score;
4803 zn->obj = obj;
4804 return zn;
4805 }
4806
4807 static zskiplist *zslCreate(void) {
4808 int j;
4809 zskiplist *zsl;
4810
4811 zsl = zmalloc(sizeof(*zsl));
4812 zsl->level = 1;
4813 zsl->length = 0;
4814 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4815 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4816 zsl->header->forward[j] = NULL;
4817 zsl->header->span[j] = 0;
4818 }
4819 zsl->header->backward = NULL;
4820 zsl->tail = NULL;
4821 return zsl;
4822 }
4823
4824 static void zslFreeNode(zskiplistNode *node) {
4825 decrRefCount(node->obj);
4826 zfree(node->forward);
4827 zfree(node->span);
4828 zfree(node);
4829 }
4830
4831 static void zslFree(zskiplist *zsl) {
4832 zskiplistNode *node = zsl->header->forward[0], *next;
4833
4834 zfree(zsl->header->forward);
4835 zfree(zsl->header->span);
4836 zfree(zsl->header);
4837 while(node) {
4838 next = node->forward[0];
4839 zslFreeNode(node);
4840 node = next;
4841 }
4842 zfree(zsl);
4843 }
4844
4845 static int zslRandomLevel(void) {
4846 int level = 1;
4847 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4848 level += 1;
4849 return level;
4850 }
4851
4852 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4853 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4854 unsigned long span[ZSKIPLIST_MAXLEVEL];
4855 int i, level;
4856
4857 x = zsl->header;
4858 for (i = zsl->level-1; i >= 0; i--) {
4859 /* store span that is crossed to reach the insert position */
4860 span[i] = i == (zsl->level-1) ? 0 : span[i+1];
4861
4862 while (x->forward[i] &&
4863 (x->forward[i]->score < score ||
4864 (x->forward[i]->score == score &&
4865 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
4866 span[i] += x->span[i];
4867 x = x->forward[i];
4868 }
4869 update[i] = x;
4870 }
4871 /* we assume the key is not already inside, since we allow duplicated
4872 * scores, and the re-insertion of score and redis object should never
4873 * happpen since the caller of zslInsert() should test in the hash table
4874 * if the element is already inside or not. */
4875 level = zslRandomLevel();
4876 if (level > zsl->level) {
4877 for (i = zsl->level; i < level; i++) {
4878 span[i] = 0;
4879 update[i] = zsl->header;
4880 update[i]->span[i] = zsl->length;
4881 }
4882 zsl->level = level;
4883 }
4884 x = zslCreateNode(level,score,obj);
4885 for (i = 0; i < level; i++) {
4886 x->forward[i] = update[i]->forward[i];
4887 update[i]->forward[i] = x;
4888
4889 /* update span covered by update[i] as x is inserted here */
4890 x->span[i] = update[i]->span[i] - (span[0] - span[i]);
4891 update[i]->span[i] = (span[0] - span[i]) + 1;
4892 }
4893
4894 /* increment span for untouched levels */
4895 for (i = level; i < zsl->level; i++) {
4896 update[i]->span[i]++;
4897 }
4898
4899 x->backward = (update[0] == zsl->header) ? NULL : update[0];
4900 if (x->forward[0])
4901 x->forward[0]->backward = x;
4902 else
4903 zsl->tail = x;
4904 zsl->length++;
4905 }
4906
4907 /* Delete an element with matching score/object from the skiplist. */
4908 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
4909 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4910 int i;
4911
4912 x = zsl->header;
4913 for (i = zsl->level-1; i >= 0; i--) {
4914 while (x->forward[i] &&
4915 (x->forward[i]->score < score ||
4916 (x->forward[i]->score == score &&
4917 compareStringObjects(x->forward[i]->obj,obj) < 0)))
4918 x = x->forward[i];
4919 update[i] = x;
4920 }
4921 /* We may have multiple elements with the same score, what we need
4922 * is to find the element with both the right score and object. */
4923 x = x->forward[0];
4924 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
4925 for (i = 0; i < zsl->level; i++) {
4926 if (update[i]->forward[i] == x) {
4927 update[i]->span[i] += x->span[i] - 1;
4928 update[i]->forward[i] = x->forward[i];
4929 } else {
4930 update[i]->span[i] -= 1;
4931 }
4932 }
4933 if (x->forward[0]) {
4934 x->forward[0]->backward = (x->backward == zsl->header) ?
4935 NULL : x->backward;
4936 } else {
4937 zsl->tail = x->backward;
4938 }
4939 zslFreeNode(x);
4940 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4941 zsl->level--;
4942 zsl->length--;
4943 return 1;
4944 } else {
4945 return 0; /* not found */
4946 }
4947 return 0; /* not found */
4948 }
4949
4950 /* Delete all the elements with score between min and max from the skiplist.
4951 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
4952 * Note that this function takes the reference to the hash table view of the
4953 * sorted set, in order to remove the elements from the hash table too. */
4954 static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
4955 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4956 unsigned long removed = 0;
4957 int i;
4958
4959 x = zsl->header;
4960 for (i = zsl->level-1; i >= 0; i--) {
4961 while (x->forward[i] && x->forward[i]->score < min)
4962 x = x->forward[i];
4963 update[i] = x;
4964 }
4965 /* We may have multiple elements with the same score, what we need
4966 * is to find the element with both the right score and object. */
4967 x = x->forward[0];
4968 while (x && x->score <= max) {
4969 zskiplistNode *next;
4970
4971 for (i = 0; i < zsl->level; i++) {
4972 if (update[i]->forward[i] == x) {
4973 update[i]->span[i] += x->span[i] - 1;
4974 update[i]->forward[i] = x->forward[i];
4975 } else {
4976 update[i]->span[i] -= 1;
4977 }
4978 }
4979 if (x->forward[0]) {
4980 x->forward[0]->backward = (x->backward == zsl->header) ?
4981 NULL : x->backward;
4982 } else {
4983 zsl->tail = x->backward;
4984 }
4985 next = x->forward[0];
4986 dictDelete(dict,x->obj);
4987 zslFreeNode(x);
4988 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4989 zsl->level--;
4990 zsl->length--;
4991 removed++;
4992 x = next;
4993 }
4994 return removed; /* not found */
4995 }
4996
4997 /* Find the first node having a score equal or greater than the specified one.
4998 * Returns NULL if there is no match. */
4999 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5000 zskiplistNode *x;
5001 int i;
5002
5003 x = zsl->header;
5004 for (i = zsl->level-1; i >= 0; i--) {
5005 while (x->forward[i] && x->forward[i]->score < score)
5006 x = x->forward[i];
5007 }
5008 /* We may have multiple elements with the same score, what we need
5009 * is to find the element with both the right score and object. */
5010 return x->forward[0];
5011 }
5012
5013 /* The actual Z-commands implementations */
5014
5015 /* This generic command implements both ZADD and ZINCRBY.
5016 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5017 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5018 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5019 robj *zsetobj;
5020 zset *zs;
5021 double *score;
5022
5023 zsetobj = lookupKeyWrite(c->db,key);
5024 if (zsetobj == NULL) {
5025 zsetobj = createZsetObject();
5026 dictAdd(c->db->dict,key,zsetobj);
5027 incrRefCount(key);
5028 } else {
5029 if (zsetobj->type != REDIS_ZSET) {
5030 addReply(c,shared.wrongtypeerr);
5031 return;
5032 }
5033 }
5034 zs = zsetobj->ptr;
5035
5036 /* Ok now since we implement both ZADD and ZINCRBY here the code
5037 * needs to handle the two different conditions. It's all about setting
5038 * '*score', that is, the new score to set, to the right value. */
5039 score = zmalloc(sizeof(double));
5040 if (doincrement) {
5041 dictEntry *de;
5042
5043 /* Read the old score. If the element was not present starts from 0 */
5044 de = dictFind(zs->dict,ele);
5045 if (de) {
5046 double *oldscore = dictGetEntryVal(de);
5047 *score = *oldscore + scoreval;
5048 } else {
5049 *score = scoreval;
5050 }
5051 } else {
5052 *score = scoreval;
5053 }
5054
5055 /* What follows is a simple remove and re-insert operation that is common
5056 * to both ZADD and ZINCRBY... */
5057 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5058 /* case 1: New element */
5059 incrRefCount(ele); /* added to hash */
5060 zslInsert(zs->zsl,*score,ele);
5061 incrRefCount(ele); /* added to skiplist */
5062 server.dirty++;
5063 if (doincrement)
5064 addReplyDouble(c,*score);
5065 else
5066 addReply(c,shared.cone);
5067 } else {
5068 dictEntry *de;
5069 double *oldscore;
5070
5071 /* case 2: Score update operation */
5072 de = dictFind(zs->dict,ele);
5073 redisAssert(de != NULL);
5074 oldscore = dictGetEntryVal(de);
5075 if (*score != *oldscore) {
5076 int deleted;
5077
5078 /* Remove and insert the element in the skip list with new score */
5079 deleted = zslDelete(zs->zsl,*oldscore,ele);
5080 redisAssert(deleted != 0);
5081 zslInsert(zs->zsl,*score,ele);
5082 incrRefCount(ele);
5083 /* Update the score in the hash table */
5084 dictReplace(zs->dict,ele,score);
5085 server.dirty++;
5086 } else {
5087 zfree(score);
5088 }
5089 if (doincrement)
5090 addReplyDouble(c,*score);
5091 else
5092 addReply(c,shared.czero);
5093 }
5094 }
5095
5096 static void zaddCommand(redisClient *c) {
5097 double scoreval;
5098
5099 scoreval = strtod(c->argv[2]->ptr,NULL);
5100 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5101 }
5102
5103 static void zincrbyCommand(redisClient *c) {
5104 double scoreval;
5105
5106 scoreval = strtod(c->argv[2]->ptr,NULL);
5107 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5108 }
5109
5110 static void zremCommand(redisClient *c) {
5111 robj *zsetobj;
5112 zset *zs;
5113
5114 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5115 if (zsetobj == NULL) {
5116 addReply(c,shared.czero);
5117 } else {
5118 dictEntry *de;
5119 double *oldscore;
5120 int deleted;
5121
5122 if (zsetobj->type != REDIS_ZSET) {
5123 addReply(c,shared.wrongtypeerr);
5124 return;
5125 }
5126 zs = zsetobj->ptr;
5127 de = dictFind(zs->dict,c->argv[2]);
5128 if (de == NULL) {
5129 addReply(c,shared.czero);
5130 return;
5131 }
5132 /* Delete from the skiplist */
5133 oldscore = dictGetEntryVal(de);
5134 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5135 redisAssert(deleted != 0);
5136
5137 /* Delete from the hash table */
5138 dictDelete(zs->dict,c->argv[2]);
5139 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5140 server.dirty++;
5141 addReply(c,shared.cone);
5142 }
5143 }
5144
5145 static void zremrangebyscoreCommand(redisClient *c) {
5146 double min = strtod(c->argv[2]->ptr,NULL);
5147 double max = strtod(c->argv[3]->ptr,NULL);
5148 robj *zsetobj;
5149 zset *zs;
5150
5151 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5152 if (zsetobj == NULL) {
5153 addReply(c,shared.czero);
5154 } else {
5155 long deleted;
5156
5157 if (zsetobj->type != REDIS_ZSET) {
5158 addReply(c,shared.wrongtypeerr);
5159 return;
5160 }
5161 zs = zsetobj->ptr;
5162 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5163 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5164 server.dirty += deleted;
5165 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5166 }
5167 }
5168
5169 static void zrangeGenericCommand(redisClient *c, int reverse) {
5170 robj *o;
5171 int start = atoi(c->argv[2]->ptr);
5172 int end = atoi(c->argv[3]->ptr);
5173 int withscores = 0;
5174
5175 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5176 withscores = 1;
5177 } else if (c->argc >= 5) {
5178 addReply(c,shared.syntaxerr);
5179 return;
5180 }
5181
5182 o = lookupKeyRead(c->db,c->argv[1]);
5183 if (o == NULL) {
5184 addReply(c,shared.nullmultibulk);
5185 } else {
5186 if (o->type != REDIS_ZSET) {
5187 addReply(c,shared.wrongtypeerr);
5188 } else {
5189 zset *zsetobj = o->ptr;
5190 zskiplist *zsl = zsetobj->zsl;
5191 zskiplistNode *ln;
5192
5193 int llen = zsl->length;
5194 int rangelen, j;
5195 robj *ele;
5196
5197 /* convert negative indexes */
5198 if (start < 0) start = llen+start;
5199 if (end < 0) end = llen+end;
5200 if (start < 0) start = 0;
5201 if (end < 0) end = 0;
5202
5203 /* indexes sanity checks */
5204 if (start > end || start >= llen) {
5205 /* Out of range start or start > end result in empty list */
5206 addReply(c,shared.emptymultibulk);
5207 return;
5208 }
5209 if (end >= llen) end = llen-1;
5210 rangelen = (end-start)+1;
5211
5212 /* Return the result in form of a multi-bulk reply */
5213 if (reverse) {
5214 ln = zsl->tail;
5215 while (start--)
5216 ln = ln->backward;
5217 } else {
5218 ln = zsl->header->forward[0];
5219 while (start--)
5220 ln = ln->forward[0];
5221 }
5222
5223 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5224 withscores ? (rangelen*2) : rangelen));
5225 for (j = 0; j < rangelen; j++) {
5226 ele = ln->obj;
5227 addReplyBulkLen(c,ele);
5228 addReply(c,ele);
5229 addReply(c,shared.crlf);
5230 if (withscores)
5231 addReplyDouble(c,ln->score);
5232 ln = reverse ? ln->backward : ln->forward[0];
5233 }
5234 }
5235 }
5236 }
5237
5238 static void zrangeCommand(redisClient *c) {
5239 zrangeGenericCommand(c,0);
5240 }
5241
5242 static void zrevrangeCommand(redisClient *c) {
5243 zrangeGenericCommand(c,1);
5244 }
5245
/* This command implements both ZRANGEBYSCORE and ZCOUNT.
 * If justcount is non-zero, just the count is returned.
 *
 * Arguments read from the client: argv[1] is the key, argv[2] and argv[3]
 * are the min and max scores (each optionally prefixed by "(" to make the
 * bound exclusive). The optional trailing arguments are
 * "LIMIT offset count" and/or a final "WITHSCORES".
 *
 * Reply: when justcount is zero, a multi-bulk reply with the matching
 * elements (each followed by its score if WITHSCORES was given); otherwise
 * the number of matching elements. */
static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
    robj *o;
    double min, max;
    int minex = 0, maxex = 0; /* are min or max exclusive? */
    int offset = 0, limit = -1; /* limit < 0 means "no limit" */
    int withscores = 0;
    int badsyntax = 0;

    /* Parse the min-max interval. If one of the values is prefixed
     * by the "(" character, it's considered "open". For instance
     * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
     * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
    if (((char*)c->argv[2]->ptr)[0] == '(') {
        min = strtod((char*)c->argv[2]->ptr+1,NULL);
        minex = 1;
    } else {
        min = strtod(c->argv[2]->ptr,NULL);
    }
    if (((char*)c->argv[3]->ptr)[0] == '(') {
        max = strtod((char*)c->argv[3]->ptr+1,NULL);
        maxex = 1;
    } else {
        max = strtod(c->argv[3]->ptr,NULL);
    }

    /* Parse "WITHSCORES": note that if the command was called with
     * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
     * enter the following paths to parse WITHSCORES and LIMIT. */
    if (c->argc == 5 || c->argc == 8) {
        if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
            withscores = 1;
        else
            badsyntax = 1;
    }
    /* The only accepted argument counts are 4 and 7, plus one when
     * WITHSCORES is present. */
    if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
        badsyntax = 1;
    if (badsyntax) {
        addReplySds(c,
            sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
        return;
    }

    /* Parse "LIMIT" */
    if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
        addReply(c,shared.syntaxerr);
        return;
    } else if (c->argc == (7 + withscores)) {
        offset = atoi(c->argv[5]->ptr);
        limit = atoi(c->argv[6]->ptr);
        /* A negative offset is treated as zero. */
        if (offset < 0) offset = 0;
    }

    /* Ok, lookup the key and get the range */
    o = lookupKeyRead(c->db,c->argv[1]);
    if (o == NULL) {
        addReply(c,justcount ? shared.czero : shared.nullmultibulk);
    } else {
        if (o->type != REDIS_ZSET) {
            addReply(c,shared.wrongtypeerr);
        } else {
            zset *zsetobj = o->ptr;
            zskiplist *zsl = zsetobj->zsl;
            zskiplistNode *ln;
            robj *ele, *lenobj = NULL;
            unsigned long rangelen = 0;

            /* Get the first node with the score >= min, or with
             * score > min if 'minex' is true. */
            ln = zslFirstWithScore(zsl,min);
            while (minex && ln && ln->score == min) ln = ln->forward[0];

            if (ln == NULL) {
                /* No element matching the specified interval */
                addReply(c,justcount ? shared.czero : shared.emptymultibulk);
                return;
            }

            /* We don't know in advance how many matching elements there
             * are in the list, so we push this object that will represent
             * the multi-bulk length in the output buffer, and will "fix"
             * it later */
            if (!justcount) {
                lenobj = createObject(REDIS_STRING,NULL);
                addReply(c,lenobj);
                /* NOTE(review): lenobj is written again below, so
                 * addReply() presumably keeps its own reference to the
                 * object -- confirm. */
                decrRefCount(lenobj);
            }

            /* Walk the level-0 chain while the score stays within max. */
            while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
                if (offset) {
                    /* Still skipping the first 'offset' matches. */
                    offset--;
                    ln = ln->forward[0];
                    continue;
                }
                if (limit == 0) break;
                if (!justcount) {
                    ele = ln->obj;
                    addReplyBulkLen(c,ele);
                    addReply(c,ele);
                    addReply(c,shared.crlf);
                    if (withscores)
                        addReplyDouble(c,ln->score);
                }
                ln = ln->forward[0];
                rangelen++;
                /* Only a positive limit is counted down; a negative one
                 * never reaches zero. */
                if (limit > 0) limit--;
            }
            if (justcount) {
                addReplyLong(c,(long)rangelen);
            } else {
                /* Now the count is known: fill in the multi-bulk header
                 * that was queued before the elements. */
                lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
                     withscores ? (rangelen*2) : rangelen);
            }
        }
    }
}
5363
5364 static void zrangebyscoreCommand(redisClient *c) {
5365 genericZrangebyscoreCommand(c,0);
5366 }
5367
5368 static void zcountCommand(redisClient *c) {
5369 genericZrangebyscoreCommand(c,1);
5370 }
5371
5372 static void zcardCommand(redisClient *c) {
5373 robj *o;
5374 zset *zs;
5375
5376 o = lookupKeyRead(c->db,c->argv[1]);
5377 if (o == NULL) {
5378 addReply(c,shared.czero);
5379 return;
5380 } else {
5381 if (o->type != REDIS_ZSET) {
5382 addReply(c,shared.wrongtypeerr);
5383 } else {
5384 zs = o->ptr;
5385 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5386 }
5387 }
5388 }
5389
5390 static void zscoreCommand(redisClient *c) {
5391 robj *o;
5392 zset *zs;
5393
5394 o = lookupKeyRead(c->db,c->argv[1]);
5395 if (o == NULL) {
5396 addReply(c,shared.nullbulk);
5397 return;
5398 } else {
5399 if (o->type != REDIS_ZSET) {
5400 addReply(c,shared.wrongtypeerr);
5401 } else {
5402 dictEntry *de;
5403
5404 zs = o->ptr;
5405 de = dictFind(zs->dict,c->argv[2]);
5406 if (!de) {
5407 addReply(c,shared.nullbulk);
5408 } else {
5409 double *score = dictGetEntryVal(de);
5410
5411 addReplyDouble(c,*score);
5412 }
5413 }
5414 }
5415 }
5416
5417 static void zrankCommand(redisClient *c) {
5418 robj *o;
5419 o = lookupKeyRead(c->db,c->argv[1]);
5420 if (o == NULL) {
5421 addReply(c,shared.nullbulk);
5422 return;
5423 }
5424 if (o->type != REDIS_ZSET) {
5425 addReply(c,shared.wrongtypeerr);
5426 return;
5427 }
5428
5429 zset *zs = o->ptr;
5430 zskiplist *zsl = zs->zsl;
5431 dictEntry *de = dictFind(zs->dict,c->argv[2]);
5432 if (!de) {
5433 addReply(c,shared.nullbulk);
5434 return;
5435 }
5436
5437 double *score = dictGetEntryVal(de);
5438 zskiplistNode *x;
5439 unsigned long rank = 0;
5440 int i;
5441
5442 x = zsl->header;
5443 for (i = zsl->level-1; i >= 0; i--) {
5444 while (x->forward[i] &&
5445 (x->forward[i]->score < *score ||
5446 (x->forward[i]->score == *score &&
5447 compareStringObjects(x->forward[i]->obj,c->argv[2]) < 0))) {
5448 rank += x->span[i];
5449 x = x->forward[i];
5450 }
5451
5452 if (x->forward[i] && compareStringObjects(x->forward[i]->obj,c->argv[2]) == 0) {
5453 addReplyLong(c, rank);
5454 return;
5455 }
5456 }
5457
5458 addReply(c,shared.nullbulk);
5459 }
5460
5461 /* ========================= Non type-specific commands ==================== */
5462
5463 static void flushdbCommand(redisClient *c) {
5464 server.dirty += dictSize(c->db->dict);
5465 dictEmpty(c->db->dict);
5466 dictEmpty(c->db->expires);
5467 addReply(c,shared.ok);
5468 }
5469
/* Empty the dataset via emptyDb(), reply OK, then save it to
 * server.dbfilename with rdbSave(). */
static void flushallCommand(redisClient *c) {
    /* The value returned by emptyDb() is added to the dirty counter. */
    server.dirty += emptyDb();
    addReply(c,shared.ok);
    /* The return value of rdbSave() is ignored.
     * NOTE(review): rdbSave() presumably resets server.dirty on success,
     * which would explain why the counter is bumped again right after the
     * save -- confirm before reordering these statements. */
    rdbSave(server.dbfilename);
    server.dirty++;
}
5476
5477 static redisSortOperation *createSortOperation(int type, robj *pattern) {
5478 redisSortOperation *so = zmalloc(sizeof(*so));
5479 so->type = type;
5480 so->pattern = pattern;
5481 return so;
5482 }
5483
5484 /* Return the value associated to the key with a name obtained
5485 * substituting the first occurence of '*' in 'pattern' with 'subst' */
5486 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
5487 char *p;
5488 sds spat, ssub;
5489 robj keyobj;
5490 int prefixlen, sublen, postfixlen;
5491 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5492 struct {
5493 long len;
5494 long free;
5495 char buf[REDIS_SORTKEY_MAX+1];
5496 } keyname;
5497
5498 /* If the pattern is "#" return the substitution object itself in order
5499 * to implement the "SORT ... GET #" feature. */
5500 spat = pattern->ptr;
5501 if (spat[0] == '#' && spat[1] == '\0') {
5502 return subst;
5503 }
5504
5505 /* The substitution object may be specially encoded. If so we create
5506 * a decoded object on the fly. Otherwise getDecodedObject will just
5507 * increment the ref count, that we'll decrement later. */
5508 subst = getDecodedObject(subst);
5509
5510 ssub = subst->ptr;
5511 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5512 p = strchr(spat,'*');
5513 if (!p) {
5514 decrRefCount(subst);
5515 return NULL;
5516 }
5517
5518 prefixlen = p-spat;
5519 sublen = sdslen(ssub);
5520 postfixlen = sdslen(spat)-(prefixlen+1);
5521 memcpy(keyname.buf,spat,prefixlen);
5522 memcpy(keyname.buf+prefixlen,ssub,sublen);
5523 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5524 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5525 keyname.len = prefixlen+sublen+postfixlen;
5526
5527 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
5528 decrRefCount(subst);
5529
5530 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5531 return lookupKeyRead(db,&keyobj);
5532 }
5533
5534 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5535 * the additional parameter is not standard but a BSD-specific we have to
5536 * pass sorting parameters via the global 'server' structure */
5537 static int sortCompare(const void *s1, const void *s2) {
5538 const redisSortObject *so1 = s1, *so2 = s2;
5539 int cmp;
5540
5541 if (!server.sort_alpha) {
5542 /* Numeric sorting. Here it's trivial as we precomputed scores */
5543 if (so1->u.score > so2->u.score) {
5544 cmp = 1;
5545 } else if (so1->u.score < so2->u.score) {
5546 cmp = -1;
5547 } else {
5548 cmp = 0;
5549 }
5550 } else {
5551 /* Alphanumeric sorting */
5552 if (server.sort_bypattern) {
5553 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5554 /* At least one compare object is NULL */
5555 if (so1->u.cmpobj == so2->u.cmpobj)
5556 cmp = 0;
5557 else if (so1->u.cmpobj == NULL)
5558 cmp = -1;
5559 else
5560 cmp = 1;
5561 } else {
5562 /* We have both the objects, use strcoll */
5563 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5564 }
5565 } else {
5566 /* Compare elements directly */
5567 robj *dec1, *dec2;
5568
5569 dec1 = getDecodedObject(so1->obj);
5570 dec2 = getDecodedObject(so2->obj);
5571 cmp = strcoll(dec1->ptr,dec2->ptr);
5572 decrRefCount(dec1);
5573 decrRefCount(dec2);
5574 }
5575 }
5576 return server.sort_desc ? -cmp : cmp;
5577 }
5578
/* The SORT command is the most complex command in Redis. Warning: this code
 * is optimized for speed and a bit less for readability.
 *
 * argv[1] is the key to sort (a list, set or sorted set). The remaining
 * arguments are options, matched case-insensitively:
 *
 *   ASC | DESC          sort direction (the later one wins)
 *   ALPHA               compare as strings instead of numbers
 *   LIMIT start count   return only a slice of the sorted result
 *   BY pattern          sort by the values of external keys
 *   GET pattern         output the values of external keys (repeatable)
 *   STORE key           store the result as a list instead of replying it
 *
 * Reply: a multi-bulk reply with the result, or the number of stored
 * elements when STORE is used. A null multi-bulk is sent if the key does
 * not exist, and an error for a wrong type or a syntax error. */
static void sortCommand(redisClient *c) {
    list *operations;
    int outputlen = 0;
    int desc = 0, alpha = 0;
    int limit_start = 0, limit_count = -1, start, end;
    int j, dontsort = 0, vectorlen;
    int getop = 0; /* GET operation counter */
    robj *sortval, *sortby = NULL, *storekey = NULL;
    redisSortObject *vector; /* Resulting vector to sort */

    /* Lookup the key to sort. It must be of the right types */
    sortval = lookupKeyRead(c->db,c->argv[1]);
    if (sortval == NULL) {
        addReply(c,shared.nullmultibulk);
        return;
    }
    if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
        sortval->type != REDIS_ZSET)
    {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Create a list of operations to perform for every sorted element.
     * The parser below only ever creates REDIS_SORT_GET operations. */
    operations = listCreate();
    listSetFreeMethod(operations,zfree);
    j = 2;

    /* Now we need to protect sortval incrementing its count, in the future
     * SORT may have options able to overwrite/delete keys during the sorting
     * and the sorted key itself may get destroyed */
    incrRefCount(sortval);

    /* The SORT command has an SQL-alike syntax, parse it */
    while(j < c->argc) {
        /* Number of arguments following the current one */
        int leftargs = c->argc-j-1;
        if (!strcasecmp(c->argv[j]->ptr,"asc")) {
            desc = 0;
        } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
            desc = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
            alpha = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
            limit_start = atoi(c->argv[j+1]->ptr);
            limit_count = atoi(c->argv[j+2]->ptr);
            j+=2;
        } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
            storekey = c->argv[j+1];
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
            sortby = c->argv[j+1];
            /* If the BY pattern does not contain '*', i.e. it is constant,
             * we don't need to sort nor to lookup the weight keys. */
            if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
            listAddNodeTail(operations,createSortOperation(
                REDIS_SORT_GET,c->argv[j+1]));
            getop++;
            j++;
        } else {
            /* Unknown option, or option with missing arguments: undo
             * what was set up so far and report a syntax error. */
            decrRefCount(sortval);
            listRelease(operations);
            addReply(c,shared.syntaxerr);
            return;
        }
        j++;
    }

    /* Load the sorting vector with all the objects to sort */
    switch(sortval->type) {
    case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
    case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
    case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
    default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
    }
    vector = zmalloc(sizeof(redisSortObject)*vectorlen);
    j = 0;

    if (sortval->type == REDIS_LIST) {
        list *list = sortval->ptr;
        listNode *ln;
        listIter li;

        listRewind(list,&li);
        while((ln = listNext(&li))) {
            robj *ele = ln->value;
            vector[j].obj = ele;
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
    } else {
        dict *set;
        dictIterator *di;
        dictEntry *setele;

        /* Sets and sorted sets are both walked through a hash table:
         * for a sorted set it is the dict member of the zset. */
        if (sortval->type == REDIS_SET) {
            set = sortval->ptr;
        } else {
            zset *zs = sortval->ptr;
            set = zs->dict;
        }

        di = dictGetIterator(set);
        while((setele = dictNext(di)) != NULL) {
            vector[j].obj = dictGetEntryKey(setele);
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
        dictReleaseIterator(di);
    }
    redisAssert(j == vectorlen);

    /* Now it's time to load the right scores in the sorting vector */
    if (dontsort == 0) {
        for (j = 0; j < vectorlen; j++) {
            if (sortby) {
                robj *byval;

                /* Elements whose weight key is missing or is not a string
                 * keep the zero score / NULL compare object set above. */
                byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
                if (!byval || byval->type != REDIS_STRING) continue;
                if (alpha) {
                    /* This reference is released in the cleanup section
                     * at the end of the function. */
                    vector[j].u.cmpobj = getDecodedObject(byval);
                } else {
                    if (byval->encoding == REDIS_ENCODING_RAW) {
                        vector[j].u.score = strtod(byval->ptr,NULL);
                    } else {
                        /* Don't need to decode the object if it's
                         * integer-encoded (the only encoding supported) so
                         * far. We can just cast it */
                        if (byval->encoding == REDIS_ENCODING_INT) {
                            vector[j].u.score = (long)byval->ptr;
                        } else
                            redisAssert(1 != 1);
                    }
                }
            } else {
                /* No BY option: with ALPHA the elements are compared
                 * directly by sortCompare(), otherwise the numeric score
                 * is computed from the element itself. */
                if (!alpha) {
                    if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
                        vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
                    else {
                        if (vector[j].obj->encoding == REDIS_ENCODING_INT)
                            vector[j].u.score = (long) vector[j].obj->ptr;
                        else
                            redisAssert(1 != 1);
                    }
                }
            }
        }
    }

    /* We are ready to sort the vector... perform a bit of sanity check
     * on the LIMIT option too. We'll use a partial version of quicksort. */
    start = (limit_start < 0) ? 0 : limit_start;
    end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
    if (start >= vectorlen) {
        /* Start beyond the last element: make the range empty
         * (end == start-1). */
        start = vectorlen-1;
        end = vectorlen-2;
    }
    if (end >= vectorlen) end = vectorlen-1;

    if (dontsort == 0) {
        /* sortCompare() reads its parameters from the 'server' global */
        server.sort_desc = desc;
        server.sort_alpha = alpha;
        server.sort_bypattern = sortby ? 1 : 0;
        /* The partial sort is used only with BY and when the requested
         * range does not cover the whole vector. */
        if (sortby && (start != 0 || end != vectorlen-1))
            pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
        else
            qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
    }

    /* Send command output to the output buffer, performing the specified
     * GET operations if any. With GET options every element in the range
     * produces one output item per GET, otherwise exactly one item. */
    outputlen = getop ? getop*(end-start+1) : end-start+1;
    if (storekey == NULL) {
        /* STORE option not specified, sent the sorting result to client */
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) {
                addReplyBulkLen(c,vector[j].obj);
                addReply(c,vector[j].obj);
                addReply(c,shared.crlf);
            }
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    /* Missing or non-string values are sent as null bulk */
                    if (!val || val->type != REDIS_STRING) {
                        addReply(c,shared.nullbulk);
                    } else {
                        addReplyBulkLen(c,val);
                        addReply(c,val);
                        addReply(c,shared.crlf);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
    } else {
        robj *listObject = createListObject();
        list *listPtr = (list*) listObject->ptr;

        /* STORE option specified, set the sorting result as a List object */
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) {
                listAddNodeTail(listPtr,vector[j].obj);
                incrRefCount(vector[j].obj);
            }
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    /* Missing or non-string values are stored as empty
                     * strings. */
                    if (!val || val->type != REDIS_STRING) {
                        listAddNodeTail(listPtr,createStringObject("",0));
                    } else {
                        listAddNodeTail(listPtr,val);
                        incrRefCount(val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
        /* NOTE(review): dictReplace() presumably returns non-zero only when
         * the key was newly added, in which case the dictionary now holds
         * a reference to storekey -- confirm. */
        if (dictReplace(c->db->dict,storekey,listObject)) {
            incrRefCount(storekey);
        }
        /* Note: we add 1 because the DB is dirty anyway since even if the
         * SORT result is empty a new key is set and maybe the old content
         * replaced. */
        server.dirty += 1+outputlen;
        addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
    }

    /* Cleanup */
    decrRefCount(sortval);
    listRelease(operations);
    for (j = 0; j < vectorlen; j++) {
        /* Compare objects are only created for BY + ALPHA sorts */
        if (sortby && alpha && vector[j].u.cmpobj)
            decrRefCount(vector[j].u.cmpobj);
    }
    zfree(vector);
}
5839
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * possible output is well under 32 bytes including the terminator).
 * Amounts below 1024 are printed as an integer followed by 'B', bigger
 * ones with two decimals and a K/M/G/T suffix. Values of one terabyte and
 * above are all expressed in T, so the buffer is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5860
5861 /* Create the string returned by the INFO command. This is decoupled
5862 * by the INFO command itself as we need to report the same information
5863 * on memory corruption problems. */
5864 static sds genRedisInfoString(void) {
5865 sds info;
5866 time_t uptime = time(NULL)-server.stat_starttime;
5867 int j;
5868 char hmem[64];
5869
5870 bytesToHuman(hmem,zmalloc_used_memory());
5871 info = sdscatprintf(sdsempty(),
5872 "redis_version:%s\r\n"
5873 "arch_bits:%s\r\n"
5874 "multiplexing_api:%s\r\n"
5875 "process_id:%ld\r\n"
5876 "uptime_in_seconds:%ld\r\n"
5877 "uptime_in_days:%ld\r\n"
5878 "connected_clients:%d\r\n"
5879 "connected_slaves:%d\r\n"
5880 "blocked_clients:%d\r\n"
5881 "used_memory:%zu\r\n"
5882 "used_memory_human:%s\r\n"
5883 "changes_since_last_save:%lld\r\n"
5884 "bgsave_in_progress:%d\r\n"
5885 "last_save_time:%ld\r\n"
5886 "bgrewriteaof_in_progress:%d\r\n"
5887 "total_connections_received:%lld\r\n"
5888 "total_commands_processed:%lld\r\n"
5889 "vm_enabled:%d\r\n"
5890 "role:%s\r\n"
5891 ,REDIS_VERSION,
5892 (sizeof(long) == 8) ? "64" : "32",
5893 aeGetApiName(),
5894 (long) getpid(),
5895 uptime,
5896 uptime/(3600*24),
5897 listLength(server.clients)-listLength(server.slaves),
5898 listLength(server.slaves),
5899 server.blpop_blocked_clients,
5900 zmalloc_used_memory(),
5901 hmem,
5902 server.dirty,
5903 server.bgsavechildpid != -1,
5904 server.lastsave,
5905 server.bgrewritechildpid != -1,
5906 server.stat_numconnections,
5907 server.stat_numcommands,
5908 server.vm_enabled != 0,
5909 server.masterhost == NULL ? "master" : "slave"
5910 );
5911 if (server.masterhost) {
5912 info = sdscatprintf(info,
5913 "master_host:%s\r\n"
5914 "master_port:%d\r\n"
5915 "master_link_status:%s\r\n"
5916 "master_last_io_seconds_ago:%d\r\n"
5917 ,server.masterhost,
5918 server.masterport,
5919 (server.replstate == REDIS_REPL_CONNECTED) ?
5920 "up" : "down",
5921 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
5922 );
5923 }
5924 if (server.vm_enabled) {
5925 lockThreadedIO();
5926 info = sdscatprintf(info,
5927 "vm_conf_max_memory:%llu\r\n"
5928 "vm_conf_page_size:%llu\r\n"
5929 "vm_conf_pages:%llu\r\n"
5930 "vm_stats_used_pages:%llu\r\n"
5931 "vm_stats_swapped_objects:%llu\r\n"
5932 "vm_stats_swappin_count:%llu\r\n"
5933 "vm_stats_swappout_count:%llu\r\n"
5934 "vm_stats_io_newjobs_len:%lu\r\n"
5935 "vm_stats_io_processing_len:%lu\r\n"
5936 "vm_stats_io_processed_len:%lu\r\n"
5937 "vm_stats_io_active_threads:%lu\r\n"
5938 "vm_stats_blocked_clients:%lu\r\n"
5939 ,(unsigned long long) server.vm_max_memory,
5940 (unsigned long long) server.vm_page_size,
5941 (unsigned long long) server.vm_pages,
5942 (unsigned long long) server.vm_stats_used_pages,
5943 (unsigned long long) server.vm_stats_swapped_objects,
5944 (unsigned long long) server.vm_stats_swapins,
5945 (unsigned long long) server.vm_stats_swapouts,
5946 (unsigned long) listLength(server.io_newjobs),
5947 (unsigned long) listLength(server.io_processing),
5948 (unsigned long) listLength(server.io_processed),
5949 (unsigned long) server.io_active_threads,
5950 (unsigned long) server.vm_blocked_clients
5951 );
5952 unlockThreadedIO();
5953 }
5954 for (j = 0; j < server.dbnum; j++) {
5955 long long keys, vkeys;
5956
5957 keys = dictSize(server.db[j].dict);
5958 vkeys = dictSize(server.db[j].expires);
5959 if (keys || vkeys) {
5960 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
5961 j, keys, vkeys);
5962 }
5963 }
5964 return info;
5965 }
5966
5967 static void infoCommand(redisClient *c) {
5968 sds info = genRedisInfoString();
5969 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
5970 (unsigned long)sdslen(info)));
5971 addReplySds(c,info);
5972 addReply(c,shared.crlf);
5973 }
5974
5975 static void monitorCommand(redisClient *c) {
5976 /* ignore MONITOR if aleady slave or in monitor mode */
5977 if (c->flags & REDIS_SLAVE) return;
5978
5979 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
5980 c->slaveseldb = 0;
5981 listAddNodeTail(server.monitors,c);
5982 addReply(c,shared.ok);
5983 }
5984
5985 /* ================================= Expire ================================= */
5986 static int removeExpire(redisDb *db, robj *key) {
5987 if (dictDelete(db->expires,key) == DICT_OK) {
5988 return 1;
5989 } else {
5990 return 0;
5991 }
5992 }
5993
5994 static int setExpire(redisDb *db, robj *key, time_t when) {
5995 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
5996 return 0;
5997 } else {
5998 incrRefCount(key);
5999 return 1;
6000 }
6001 }
6002
6003 /* Return the expire time of the specified key, or -1 if no expire
6004 * is associated with this key (i.e. the key is non volatile) */
6005 static time_t getExpire(redisDb *db, robj *key) {
6006 dictEntry *de;
6007
6008 /* No expire? return ASAP */
6009 if (dictSize(db->expires) == 0 ||
6010 (de = dictFind(db->expires,key)) == NULL) return -1;
6011
6012 return (time_t) dictGetEntryVal(de);
6013 }
6014
6015 static int expireIfNeeded(redisDb *db, robj *key) {
6016 time_t when;
6017 dictEntry *de;
6018
6019 /* No expire? return ASAP */
6020 if (dictSize(db->expires) == 0 ||
6021 (de = dictFind(db->expires,key)) == NULL) return 0;
6022
6023 /* Lookup the expire */
6024 when = (time_t) dictGetEntryVal(de);
6025 if (time(NULL) <= when) return 0;
6026
6027 /* Delete the key */
6028 dictDelete(db->expires,key);
6029 return dictDelete(db->dict,key) == DICT_OK;
6030 }
6031
6032 static int deleteIfVolatile(redisDb *db, robj *key) {
6033 dictEntry *de;
6034
6035 /* No expire? return ASAP */
6036 if (dictSize(db->expires) == 0 ||
6037 (de = dictFind(db->expires,key)) == NULL) return 0;
6038
6039 /* Delete the key */
6040 server.dirty++;
6041 dictDelete(db->expires,key);
6042 return dictDelete(db->dict,key) == DICT_OK;
6043 }
6044
6045 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6046 dictEntry *de;
6047
6048 de = dictFind(c->db->dict,key);
6049 if (de == NULL) {
6050 addReply(c,shared.czero);
6051 return;
6052 }
6053 if (seconds < 0) {
6054 if (deleteKey(c->db,key)) server.dirty++;
6055 addReply(c, shared.cone);
6056 return;
6057 } else {
6058 time_t when = time(NULL)+seconds;
6059 if (setExpire(c->db,key,when)) {
6060 addReply(c,shared.cone);
6061 server.dirty++;
6062 } else {
6063 addReply(c,shared.czero);
6064 }
6065 return;
6066 }
6067 }
6068
6069 static void expireCommand(redisClient *c) {
6070 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6071 }
6072
6073 static void expireatCommand(redisClient *c) {
6074 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6075 }
6076
6077 static void ttlCommand(redisClient *c) {
6078 time_t expire;
6079 int ttl = -1;
6080
6081 expire = getExpire(c->db,c->argv[1]);
6082 if (expire != -1) {
6083 ttl = (int) (expire-time(NULL));
6084 if (ttl < 0) ttl = -1;
6085 }
6086 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6087 }
6088
6089 /* ================================ MULTI/EXEC ============================== */
6090
6091 /* Client state initialization for MULTI/EXEC */
6092 static void initClientMultiState(redisClient *c) {
6093 c->mstate.commands = NULL;
6094 c->mstate.count = 0;
6095 }
6096
6097 /* Release all the resources associated with MULTI/EXEC state */
6098 static void freeClientMultiState(redisClient *c) {
6099 int j;
6100
6101 for (j = 0; j < c->mstate.count; j++) {
6102 int i;
6103 multiCmd *mc = c->mstate.commands+j;
6104
6105 for (i = 0; i < mc->argc; i++)
6106 decrRefCount(mc->argv[i]);
6107 zfree(mc->argv);
6108 }
6109 zfree(c->mstate.commands);
6110 }
6111
6112 /* Add a new command into the MULTI commands queue */
6113 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6114 multiCmd *mc;
6115 int j;
6116
6117 c->mstate.commands = zrealloc(c->mstate.commands,
6118 sizeof(multiCmd)*(c->mstate.count+1));
6119 mc = c->mstate.commands+c->mstate.count;
6120 mc->cmd = cmd;
6121 mc->argc = c->argc;
6122 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6123 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6124 for (j = 0; j < c->argc; j++)
6125 incrRefCount(mc->argv[j]);
6126 c->mstate.count++;
6127 }
6128
6129 static void multiCommand(redisClient *c) {
6130 c->flags |= REDIS_MULTI;
6131 addReply(c,shared.ok);
6132 }
6133
6134 static void discardCommand(redisClient *c) {
6135 if (!(c->flags & REDIS_MULTI)) {
6136 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6137 return;
6138 }
6139
6140 freeClientMultiState(c);
6141 initClientMultiState(c);
6142 c->flags &= (~REDIS_MULTI);
6143 addReply(c,shared.ok);
6144 }
6145
6146 static void execCommand(redisClient *c) {
6147 int j;
6148 robj **orig_argv;
6149 int orig_argc;
6150
6151 if (!(c->flags & REDIS_MULTI)) {
6152 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6153 return;
6154 }
6155
6156 orig_argv = c->argv;
6157 orig_argc = c->argc;
6158 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6159 for (j = 0; j < c->mstate.count; j++) {
6160 c->argc = c->mstate.commands[j].argc;
6161 c->argv = c->mstate.commands[j].argv;
6162 call(c,c->mstate.commands[j].cmd);
6163 }
6164 c->argv = orig_argv;
6165 c->argc = orig_argc;
6166 freeClientMultiState(c);
6167 initClientMultiState(c);
6168 c->flags &= (~REDIS_MULTI);
6169 }
6170
6171 /* =========================== Blocking Operations ========================= */
6172
6173 /* Currently Redis blocking operations support is limited to list POP ops,
6174 * so the current implementation is not fully generic, but it is also not
6175 * completely specific so it will not require a rewrite to support new
6176 * kind of blocking operations in the future.
6177 *
6178 * Still it's important to note that list blocking operations can be already
6179 * used as a notification mechanism in order to implement other blocking
6180 * operations at application level, so there must be a very strong evidence
6181 * of usefulness and generality before new blocking operations are implemented.
6182 *
6183 * This is how the current blocking POP works, we use BLPOP as example:
6184 * - If the user calls BLPOP and the key exists and contains a non empty list
6185 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6186 * if there is no need to block.
6187 * - If instead BLPOP is called and the key does not exist or the list is
6188 * empty we need to block. In order to do so we remove the notification for
6189 * new data to read in the client socket (so that we'll not serve new
6190 * requests if the blocking request is not served). Also we put the client
6191 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6192 * blocking for these keys.
6193 * - If a PUSH operation against a key with blocked clients waiting is
6194 * performed, we serve the first in the list: basically instead of pushing
6195 * the new element inside the list we return it to the (first / oldest)
6196 * blocking client, unblock the client, and remove it from the list.
6197 *
6198 * The above comment and the source code should be enough in order to understand
6199 * the implementation and modify / fix it later.
6200 */
6201
6202 /* Set a client in blocking mode for the specified key, with the specified
6203 * timeout */
6204 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6205 dictEntry *de;
6206 list *l;
6207 int j;
6208
6209 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6210 c->blockingkeysnum = numkeys;
6211 c->blockingto = timeout;
6212 for (j = 0; j < numkeys; j++) {
6213 /* Add the key in the client structure, to map clients -> keys */
6214 c->blockingkeys[j] = keys[j];
6215 incrRefCount(keys[j]);
6216
6217 /* And in the other "side", to map keys -> clients */
6218 de = dictFind(c->db->blockingkeys,keys[j]);
6219 if (de == NULL) {
6220 int retval;
6221
6222 /* For every key we take a list of clients blocked for it */
6223 l = listCreate();
6224 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6225 incrRefCount(keys[j]);
6226 assert(retval == DICT_OK);
6227 } else {
6228 l = dictGetEntryVal(de);
6229 }
6230 listAddNodeTail(l,c);
6231 }
6232 /* Mark the client as a blocked client */
6233 c->flags |= REDIS_BLOCKED;
6234 server.blpop_blocked_clients++;
6235 }
6236
6237 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6238 static void unblockClientWaitingData(redisClient *c) {
6239 dictEntry *de;
6240 list *l;
6241 int j;
6242
6243 assert(c->blockingkeys != NULL);
6244 /* The client may wait for multiple keys, so unblock it for every key. */
6245 for (j = 0; j < c->blockingkeysnum; j++) {
6246 /* Remove this client from the list of clients waiting for this key. */
6247 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6248 assert(de != NULL);
6249 l = dictGetEntryVal(de);
6250 listDelNode(l,listSearchKey(l,c));
6251 /* If the list is empty we need to remove it to avoid wasting memory */
6252 if (listLength(l) == 0)
6253 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6254 decrRefCount(c->blockingkeys[j]);
6255 }
6256 /* Cleanup the client structure */
6257 zfree(c->blockingkeys);
6258 c->blockingkeys = NULL;
6259 c->flags &= (~REDIS_BLOCKED);
6260 server.blpop_blocked_clients--;
6261 /* We want to process data if there is some command waiting
6262 * in the input buffer. Note that this is safe even if
6263 * unblockClientWaitingData() gets called from freeClient() because
6264 * freeClient() will be smart enough to call this function
6265 * *after* c->querybuf was set to NULL. */
6266 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6267 }
6268
6269 /* This should be called from any function PUSHing into lists.
6270 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6271 * 'ele' is the element pushed.
6272 *
6273 * If the function returns 0 there was no client waiting for a list push
6274 * against this key.
6275 *
6276 * If the function returns 1 there was a client waiting for a list push
6277 * against this key, the element was passed to this client thus it's not
6278 * needed to actually add it to the list and the caller should return asap. */
6279 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6280 struct dictEntry *de;
6281 redisClient *receiver;
6282 list *l;
6283 listNode *ln;
6284
6285 de = dictFind(c->db->blockingkeys,key);
6286 if (de == NULL) return 0;
6287 l = dictGetEntryVal(de);
6288 ln = listFirst(l);
6289 assert(ln != NULL);
6290 receiver = ln->value;
6291
6292 addReplySds(receiver,sdsnew("*2\r\n"));
6293 addReplyBulkLen(receiver,key);
6294 addReply(receiver,key);
6295 addReply(receiver,shared.crlf);
6296 addReplyBulkLen(receiver,ele);
6297 addReply(receiver,ele);
6298 addReply(receiver,shared.crlf);
6299 unblockClientWaitingData(receiver);
6300 return 1;
6301 }
6302
6303 /* Blocking RPOP/LPOP */
6304 static void blockingPopGenericCommand(redisClient *c, int where) {
6305 robj *o;
6306 time_t timeout;
6307 int j;
6308
6309 for (j = 1; j < c->argc-1; j++) {
6310 o = lookupKeyWrite(c->db,c->argv[j]);
6311 if (o != NULL) {
6312 if (o->type != REDIS_LIST) {
6313 addReply(c,shared.wrongtypeerr);
6314 return;
6315 } else {
6316 list *list = o->ptr;
6317 if (listLength(list) != 0) {
6318 /* If the list contains elements fall back to the usual
6319 * non-blocking POP operation */
6320 robj *argv[2], **orig_argv;
6321 int orig_argc;
6322
6323 /* We need to alter the command arguments before to call
6324 * popGenericCommand() as the command takes a single key. */
6325 orig_argv = c->argv;
6326 orig_argc = c->argc;
6327 argv[1] = c->argv[j];
6328 c->argv = argv;
6329 c->argc = 2;
6330
6331 /* Also the return value is different, we need to output
6332 * the multi bulk reply header and the key name. The
6333 * "real" command will add the last element (the value)
6334 * for us. If this souds like an hack to you it's just
6335 * because it is... */
6336 addReplySds(c,sdsnew("*2\r\n"));
6337 addReplyBulkLen(c,argv[1]);
6338 addReply(c,argv[1]);
6339 addReply(c,shared.crlf);
6340 popGenericCommand(c,where);
6341
6342 /* Fix the client structure with the original stuff */
6343 c->argv = orig_argv;
6344 c->argc = orig_argc;
6345 return;
6346 }
6347 }
6348 }
6349 }
6350 /* If the list is empty or the key does not exists we must block */
6351 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
6352 if (timeout > 0) timeout += time(NULL);
6353 blockForKeys(c,c->argv+1,c->argc-2,timeout);
6354 }
6355
6356 static void blpopCommand(redisClient *c) {
6357 blockingPopGenericCommand(c,REDIS_HEAD);
6358 }
6359
6360 static void brpopCommand(redisClient *c) {
6361 blockingPopGenericCommand(c,REDIS_TAIL);
6362 }
6363
6364 /* =============================== Replication ============================= */
6365
6366 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
6367 ssize_t nwritten, ret = size;
6368 time_t start = time(NULL);
6369
6370 timeout++;
6371 while(size) {
6372 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6373 nwritten = write(fd,ptr,size);
6374 if (nwritten == -1) return -1;
6375 ptr += nwritten;
6376 size -= nwritten;
6377 }
6378 if ((time(NULL)-start) > timeout) {
6379 errno = ETIMEDOUT;
6380 return -1;
6381 }
6382 }
6383 return ret;
6384 }
6385
6386 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
6387 ssize_t nread, totread = 0;
6388 time_t start = time(NULL);
6389
6390 timeout++;
6391 while(size) {
6392 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6393 nread = read(fd,ptr,size);
6394 if (nread == -1) return -1;
6395 ptr += nread;
6396 size -= nread;
6397 totread += nread;
6398 }
6399 if ((time(NULL)-start) > timeout) {
6400 errno = ETIMEDOUT;
6401 return -1;
6402 }
6403 }
6404 return totread;
6405 }
6406
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time, every byte being read with syncRead() and the given 'timeout'.
 *
 * The line ends at the first '\n', which is not stored. A '\r' right
 * before it is replaced with the string terminator. The buffer is always
 * NUL terminated: at most 'size'-1 bytes are stored, so a line longer than
 * the buffer is returned truncated instead of overflowing it.
 *
 * Returns the number of bytes stored before the '\n' (a trailing '\r'
 * included in the count), or -1 if syncRead() fails. Returns 0 without
 * touching the buffer if 'size' is zero or negative. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* Reserve room for the string terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--; /* One byte less available in the buffer */
    }
    return nread;
}
6427
6428 static void syncCommand(redisClient *c) {
6429 /* ignore SYNC if aleady slave or in monitor mode */
6430 if (c->flags & REDIS_SLAVE) return;
6431
6432 /* SYNC can't be issued when the server has pending data to send to
6433 * the client about already issued commands. We need a fresh reply
6434 * buffer registering the differences between the BGSAVE and the current
6435 * dataset, so that we can copy to other slaves if needed. */
6436 if (listLength(c->reply) != 0) {
6437 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6438 return;
6439 }
6440
6441 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6442 /* Here we need to check if there is a background saving operation
6443 * in progress, or if it is required to start one */
6444 if (server.bgsavechildpid != -1) {
6445 /* Ok a background save is in progress. Let's check if it is a good
6446 * one for replication, i.e. if there is another slave that is
6447 * registering differences since the server forked to save */
6448 redisClient *slave;
6449 listNode *ln;
6450 listIter li;
6451
6452 listRewind(server.slaves,&li);
6453 while((ln = listNext(&li))) {
6454 slave = ln->value;
6455 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
6456 }
6457 if (ln) {
6458 /* Perfect, the server is already registering differences for
6459 * another slave. Set the right state, and copy the buffer. */
6460 listRelease(c->reply);
6461 c->reply = listDup(slave->reply);
6462 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6463 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6464 } else {
6465 /* No way, we need to wait for the next BGSAVE in order to
6466 * register differences */
6467 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6468 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6469 }
6470 } else {
6471 /* Ok we don't have a BGSAVE in progress, let's start one */
6472 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6473 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6474 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6475 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6476 return;
6477 }
6478 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6479 }
6480 c->repldbfd = -1;
6481 c->flags |= REDIS_SLAVE;
6482 c->slaveseldb = 0;
6483 listAddNodeTail(server.slaves,c);
6484 return;
6485 }
6486
6487 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6488 redisClient *slave = privdata;
6489 REDIS_NOTUSED(el);
6490 REDIS_NOTUSED(mask);
6491 char buf[REDIS_IOBUF_LEN];
6492 ssize_t nwritten, buflen;
6493
6494 if (slave->repldboff == 0) {
6495 /* Write the bulk write count before to transfer the DB. In theory here
6496 * we don't know how much room there is in the output buffer of the
6497 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6498 * operations) will never be smaller than the few bytes we need. */
6499 sds bulkcount;
6500
6501 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6502 slave->repldbsize);
6503 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6504 {
6505 sdsfree(bulkcount);
6506 freeClient(slave);
6507 return;
6508 }
6509 sdsfree(bulkcount);
6510 }
6511 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6512 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6513 if (buflen <= 0) {
6514 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6515 (buflen == 0) ? "premature EOF" : strerror(errno));
6516 freeClient(slave);
6517 return;
6518 }
6519 if ((nwritten = write(fd,buf,buflen)) == -1) {
6520 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6521 strerror(errno));
6522 freeClient(slave);
6523 return;
6524 }
6525 slave->repldboff += nwritten;
6526 if (slave->repldboff == slave->repldbsize) {
6527 close(slave->repldbfd);
6528 slave->repldbfd = -1;
6529 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6530 slave->replstate = REDIS_REPL_ONLINE;
6531 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
6532 sendReplyToClient, slave) == AE_ERR) {
6533 freeClient(slave);
6534 return;
6535 }
6536 addReplySds(slave,sdsempty());
6537 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6538 }
6539 }
6540
6541 /* This function is called at the end of every backgrond saving.
6542 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6543 * otherwise REDIS_ERR is passed to the function.
6544 *
6545 * The goal of this function is to handle slaves waiting for a successful
6546 * background saving in order to perform non-blocking synchronization. */
6547 static void updateSlavesWaitingBgsave(int bgsaveerr) {
6548 listNode *ln;
6549 int startbgsave = 0;
6550 listIter li;
6551
6552 listRewind(server.slaves,&li);
6553 while((ln = listNext(&li))) {
6554 redisClient *slave = ln->value;
6555
6556 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6557 startbgsave = 1;
6558 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6559 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
6560 struct redis_stat buf;
6561
6562 if (bgsaveerr != REDIS_OK) {
6563 freeClient(slave);
6564 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6565 continue;
6566 }
6567 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
6568 redis_fstat(slave->repldbfd,&buf) == -1) {
6569 freeClient(slave);
6570 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6571 continue;
6572 }
6573 slave->repldboff = 0;
6574 slave->repldbsize = buf.st_size;
6575 slave->replstate = REDIS_REPL_SEND_BULK;
6576 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6577 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6578 freeClient(slave);
6579 continue;
6580 }
6581 }
6582 }
6583 if (startbgsave) {
6584 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6585 listIter li;
6586
6587 listRewind(server.slaves,&li);
6588 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6589 while((ln = listNext(&li))) {
6590 redisClient *slave = ln->value;
6591
6592 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6593 freeClient(slave);
6594 }
6595 }
6596 }
6597 }
6598
6599 static int syncWithMaster(void) {
6600 char buf[1024], tmpfile[256], authcmd[1024];
6601 int dumpsize;
6602 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6603 int dfd;
6604
6605 if (fd == -1) {
6606 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6607 strerror(errno));
6608 return REDIS_ERR;
6609 }
6610
6611 /* AUTH with the master if required. */
6612 if(server.masterauth) {
6613 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6614 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6615 close(fd);
6616 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6617 strerror(errno));
6618 return REDIS_ERR;
6619 }
6620 /* Read the AUTH result. */
6621 if (syncReadLine(fd,buf,1024,3600) == -1) {
6622 close(fd);
6623 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6624 strerror(errno));
6625 return REDIS_ERR;
6626 }
6627 if (buf[0] != '+') {
6628 close(fd);
6629 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6630 return REDIS_ERR;
6631 }
6632 }
6633
6634 /* Issue the SYNC command */
6635 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6636 close(fd);
6637 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6638 strerror(errno));
6639 return REDIS_ERR;
6640 }
6641 /* Read the bulk write count */
6642 if (syncReadLine(fd,buf,1024,3600) == -1) {
6643 close(fd);
6644 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6645 strerror(errno));
6646 return REDIS_ERR;
6647 }
6648 if (buf[0] != '$') {
6649 close(fd);
6650 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6651 return REDIS_ERR;
6652 }
6653 dumpsize = atoi(buf+1);
6654 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6655 /* Read the bulk write data on a temp file */
6656 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6657 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6658 if (dfd == -1) {
6659 close(fd);
6660 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6661 return REDIS_ERR;
6662 }
6663 while(dumpsize) {
6664 int nread, nwritten;
6665
6666 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6667 if (nread == -1) {
6668 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6669 strerror(errno));
6670 close(fd);
6671 close(dfd);
6672 return REDIS_ERR;
6673 }
6674 nwritten = write(dfd,buf,nread);
6675 if (nwritten == -1) {
6676 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6677 close(fd);
6678 close(dfd);
6679 return REDIS_ERR;
6680 }
6681 dumpsize -= nread;
6682 }
6683 close(dfd);
6684 if (rename(tmpfile,server.dbfilename) == -1) {
6685 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6686 unlink(tmpfile);
6687 close(fd);
6688 return REDIS_ERR;
6689 }
6690 emptyDb();
6691 if (rdbLoad(server.dbfilename) != REDIS_OK) {
6692 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6693 close(fd);
6694 return REDIS_ERR;
6695 }
6696 server.master = createClient(fd);
6697 server.master->flags |= REDIS_MASTER;
6698 server.master->authenticated = 1;
6699 server.replstate = REDIS_REPL_CONNECTED;
6700 return REDIS_OK;
6701 }
6702
6703 static void slaveofCommand(redisClient *c) {
6704 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6705 !strcasecmp(c->argv[2]->ptr,"one")) {
6706 if (server.masterhost) {
6707 sdsfree(server.masterhost);
6708 server.masterhost = NULL;
6709 if (server.master) freeClient(server.master);
6710 server.replstate = REDIS_REPL_NONE;
6711 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6712 }
6713 } else {
6714 sdsfree(server.masterhost);
6715 server.masterhost = sdsdup(c->argv[1]->ptr);
6716 server.masterport = atoi(c->argv[2]->ptr);
6717 if (server.master) freeClient(server.master);
6718 server.replstate = REDIS_REPL_CONNECT;
6719 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6720 server.masterhost, server.masterport);
6721 }
6722 addReply(c,shared.ok);
6723 }
6724
6725 /* ============================ Maxmemory directive ======================== */
6726
6727 /* Try to free one object form the pre-allocated objects free list.
6728 * This is useful under low mem conditions as by default we take 1 million
6729 * free objects allocated. On success REDIS_OK is returned, otherwise
6730 * REDIS_ERR. */
6731 static int tryFreeOneObjectFromFreelist(void) {
6732 robj *o;
6733
6734 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6735 if (listLength(server.objfreelist)) {
6736 listNode *head = listFirst(server.objfreelist);
6737 o = listNodeValue(head);
6738 listDelNode(server.objfreelist,head);
6739 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6740 zfree(o);
6741 return REDIS_OK;
6742 } else {
6743 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6744 return REDIS_ERR;
6745 }
6746 }
6747
6748 /* This function gets called when 'maxmemory' is set on the config file to limit
6749 * the max memory used by the server, and we are out of memory.
6750 * This function will try to, in order:
6751 *
6752 * - Free objects from the free list
6753 * - Try to remove keys with an EXPIRE set
6754 *
6755 * It is not possible to free enough memory to reach used-memory < maxmemory
6756 * the server will start refusing commands that will enlarge even more the
6757 * memory usage.
6758 */
6759 static void freeMemoryIfNeeded(void) {
6760 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
6761 int j, k, freed = 0;
6762
6763 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6764 for (j = 0; j < server.dbnum; j++) {
6765 int minttl = -1;
6766 robj *minkey = NULL;
6767 struct dictEntry *de;
6768
6769 if (dictSize(server.db[j].expires)) {
6770 freed = 1;
6771 /* From a sample of three keys drop the one nearest to
6772 * the natural expire */
6773 for (k = 0; k < 3; k++) {
6774 time_t t;
6775
6776 de = dictGetRandomKey(server.db[j].expires);
6777 t = (time_t) dictGetEntryVal(de);
6778 if (minttl == -1 || t < minttl) {
6779 minkey = dictGetEntryKey(de);
6780 minttl = t;
6781 }
6782 }
6783 deleteKey(server.db+j,minkey);
6784 }
6785 }
6786 if (!freed) return; /* nothing to free... */
6787 }
6788 }
6789
6790 /* ============================== Append Only file ========================== */
6791
6792 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6793 sds buf = sdsempty();
6794 int j;
6795 ssize_t nwritten;
6796 time_t now;
6797 robj *tmpargv[3];
6798
6799 /* The DB this command was targetting is not the same as the last command
6800 * we appendend. To issue a SELECT command is needed. */
6801 if (dictid != server.appendseldb) {
6802 char seldb[64];
6803
6804 snprintf(seldb,sizeof(seldb),"%d",dictid);
6805 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
6806 (unsigned long)strlen(seldb),seldb);
6807 server.appendseldb = dictid;
6808 }
6809
6810 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6811 * EXPIREs into EXPIREATs calls */
6812 if (cmd->proc == expireCommand) {
6813 long when;
6814
6815 tmpargv[0] = createStringObject("EXPIREAT",8);
6816 tmpargv[1] = argv[1];
6817 incrRefCount(argv[1]);
6818 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6819 tmpargv[2] = createObject(REDIS_STRING,
6820 sdscatprintf(sdsempty(),"%ld",when));
6821 argv = tmpargv;
6822 }
6823
6824 /* Append the actual command */
6825 buf = sdscatprintf(buf,"*%d\r\n",argc);
6826 for (j = 0; j < argc; j++) {
6827 robj *o = argv[j];
6828
6829 o = getDecodedObject(o);
6830 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
6831 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6832 buf = sdscatlen(buf,"\r\n",2);
6833 decrRefCount(o);
6834 }
6835
6836 /* Free the objects from the modified argv for EXPIREAT */
6837 if (cmd->proc == expireCommand) {
6838 for (j = 0; j < 3; j++)
6839 decrRefCount(argv[j]);
6840 }
6841
6842 /* We want to perform a single write. This should be guaranteed atomic
6843 * at least if the filesystem we are writing is a real physical one.
6844 * While this will save us against the server being killed I don't think
6845 * there is much to do about the whole server stopping for power problems
6846 * or alike */
6847 nwritten = write(server.appendfd,buf,sdslen(buf));
6848 if (nwritten != (signed)sdslen(buf)) {
6849 /* Ooops, we are in troubles. The best thing to do for now is
6850 * to simply exit instead to give the illusion that everything is
6851 * working as expected. */
6852 if (nwritten == -1) {
6853 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6854 } else {
6855 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6856 }
6857 exit(1);
6858 }
6859 /* If a background append only file rewriting is in progress we want to
6860 * accumulate the differences between the child DB and the current one
6861 * in a buffer, so that when the child process will do its work we
6862 * can append the differences to the new append only file. */
6863 if (server.bgrewritechildpid != -1)
6864 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6865
6866 sdsfree(buf);
6867 now = time(NULL);
6868 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6869 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6870 now-server.lastfsync > 1))
6871 {
6872 fsync(server.appendfd); /* Let's try to get this data on the disk */
6873 server.lastfsync = now;
6874 }
6875 }
6876
6877 /* In Redis commands are always executed in the context of a client, so in
6878 * order to load the append only file we need to create a fake client. */
6879 static struct redisClient *createFakeClient(void) {
6880 struct redisClient *c = zmalloc(sizeof(*c));
6881
6882 selectDb(c,0);
6883 c->fd = -1;
6884 c->querybuf = sdsempty();
6885 c->argc = 0;
6886 c->argv = NULL;
6887 c->flags = 0;
6888 /* We set the fake client as a slave waiting for the synchronization
6889 * so that Redis will not try to send replies to this client. */
6890 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6891 c->reply = listCreate();
6892 listSetFreeMethod(c->reply,decrRefCount);
6893 listSetDupMethod(c->reply,dupClientReplyValue);
6894 return c;
6895 }
6896
6897 static void freeFakeClient(struct redisClient *c) {
6898 sdsfree(c->querybuf);
6899 listRelease(c->reply);
6900 zfree(c);
6901 }
6902
6903 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
6904 * error (the append only file is zero-length) REDIS_ERR is returned. On
6905 * fatal error an error message is logged and the program exists. */
6906 int loadAppendOnlyFile(char *filename) {
6907 struct redisClient *fakeClient;
6908 FILE *fp = fopen(filename,"r");
6909 struct redis_stat sb;
6910 unsigned long long loadedkeys = 0;
6911
6912 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
6913 return REDIS_ERR;
6914
6915 if (fp == NULL) {
6916 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
6917 exit(1);
6918 }
6919
6920 fakeClient = createFakeClient();
6921 while(1) {
6922 int argc, j;
6923 unsigned long len;
6924 robj **argv;
6925 char buf[128];
6926 sds argsds;
6927 struct redisCommand *cmd;
6928
6929 if (fgets(buf,sizeof(buf),fp) == NULL) {
6930 if (feof(fp))
6931 break;
6932 else
6933 goto readerr;
6934 }
6935 if (buf[0] != '*') goto fmterr;
6936 argc = atoi(buf+1);
6937 argv = zmalloc(sizeof(robj*)*argc);
6938 for (j = 0; j < argc; j++) {
6939 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
6940 if (buf[0] != '$') goto fmterr;
6941 len = strtol(buf+1,NULL,10);
6942 argsds = sdsnewlen(NULL,len);
6943 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
6944 argv[j] = createObject(REDIS_STRING,argsds);
6945 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
6946 }
6947
6948 /* Command lookup */
6949 cmd = lookupCommand(argv[0]->ptr);
6950 if (!cmd) {
6951 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
6952 exit(1);
6953 }
6954 /* Try object sharing and encoding */
6955 if (server.shareobjects) {
6956 int j;
6957 for(j = 1; j < argc; j++)
6958 argv[j] = tryObjectSharing(argv[j]);
6959 }
6960 if (cmd->flags & REDIS_CMD_BULK)
6961 tryObjectEncoding(argv[argc-1]);
6962 /* Run the command in the context of a fake client */
6963 fakeClient->argc = argc;
6964 fakeClient->argv = argv;
6965 cmd->proc(fakeClient);
6966 /* Discard the reply objects list from the fake client */
6967 while(listLength(fakeClient->reply))
6968 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
6969 /* Clean up, ready for the next command */
6970 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
6971 zfree(argv);
6972 /* Handle swapping while loading big datasets when VM is on */
6973 loadedkeys++;
6974 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
6975 while (zmalloc_used_memory() > server.vm_max_memory) {
6976 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
6977 }
6978 }
6979 }
6980 fclose(fp);
6981 freeFakeClient(fakeClient);
6982 return REDIS_OK;
6983
6984 readerr:
6985 if (feof(fp)) {
6986 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
6987 } else {
6988 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
6989 }
6990 exit(1);
6991 fmterr:
6992 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
6993 exit(1);
6994 }
6995
6996 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6997 static int fwriteBulk(FILE *fp, robj *obj) {
6998 char buf[128];
6999 int decrrc = 0;
7000
7001 /* Avoid the incr/decr ref count business if possible to help
7002 * copy-on-write (we are often in a child process when this function
7003 * is called).
7004 * Also makes sure that key objects don't get incrRefCount-ed when VM
7005 * is enabled */
7006 if (obj->encoding != REDIS_ENCODING_RAW) {
7007 obj = getDecodedObject(obj);
7008 decrrc = 1;
7009 }
7010 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7011 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7012 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7013 goto err;
7014 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7015 if (decrrc) decrRefCount(obj);
7016 return 1;
7017 err:
7018 if (decrrc) decrRefCount(obj);
7019 return 0;
7020 }
7021
/* Write the double 'd' to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, the payload being the "%.17g" representation.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
7032
/* Write the long 'l' to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, the payload being its decimal representation.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
7043
7044 /* Write a sequence of commands able to fully rebuild the dataset into
7045 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7046 static int rewriteAppendOnlyFile(char *filename) {
7047 dictIterator *di = NULL;
7048 dictEntry *de;
7049 FILE *fp;
7050 char tmpfile[256];
7051 int j;
7052 time_t now = time(NULL);
7053
7054 /* Note that we have to use a different temp name here compared to the
7055 * one used by rewriteAppendOnlyFileBackground() function. */
7056 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7057 fp = fopen(tmpfile,"w");
7058 if (!fp) {
7059 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7060 return REDIS_ERR;
7061 }
7062 for (j = 0; j < server.dbnum; j++) {
7063 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7064 redisDb *db = server.db+j;
7065 dict *d = db->dict;
7066 if (dictSize(d) == 0) continue;
7067 di = dictGetIterator(d);
7068 if (!di) {
7069 fclose(fp);
7070 return REDIS_ERR;
7071 }
7072
7073 /* SELECT the new DB */
7074 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7075 if (fwriteBulkLong(fp,j) == 0) goto werr;
7076
7077 /* Iterate this DB writing every entry */
7078 while((de = dictNext(di)) != NULL) {
7079 robj *key, *o;
7080 time_t expiretime;
7081 int swapped;
7082
7083 key = dictGetEntryKey(de);
7084 /* If the value for this key is swapped, load a preview in memory.
7085 * We use a "swapped" flag to remember if we need to free the
7086 * value object instead to just increment the ref count anyway
7087 * in order to avoid copy-on-write of pages if we are forked() */
7088 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7089 key->storage == REDIS_VM_SWAPPING) {
7090 o = dictGetEntryVal(de);
7091 swapped = 0;
7092 } else {
7093 o = vmPreviewObject(key);
7094 swapped = 1;
7095 }
7096 expiretime = getExpire(db,key);
7097
7098 /* Save the key and associated value */
7099 if (o->type == REDIS_STRING) {
7100 /* Emit a SET command */
7101 char cmd[]="*3\r\n$3\r\nSET\r\n";
7102 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7103 /* Key and value */
7104 if (fwriteBulk(fp,key) == 0) goto werr;
7105 if (fwriteBulk(fp,o) == 0) goto werr;
7106 } else if (o->type == REDIS_LIST) {
7107 /* Emit the RPUSHes needed to rebuild the list */
7108 list *list = o->ptr;
7109 listNode *ln;
7110 listIter li;
7111
7112 listRewind(list,&li);
7113 while((ln = listNext(&li))) {
7114 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7115 robj *eleobj = listNodeValue(ln);
7116
7117 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7118 if (fwriteBulk(fp,key) == 0) goto werr;
7119 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7120 }
7121 } else if (o->type == REDIS_SET) {
7122 /* Emit the SADDs needed to rebuild the set */
7123 dict *set = o->ptr;
7124 dictIterator *di = dictGetIterator(set);
7125 dictEntry *de;
7126
7127 while((de = dictNext(di)) != NULL) {
7128 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7129 robj *eleobj = dictGetEntryKey(de);
7130
7131 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7132 if (fwriteBulk(fp,key) == 0) goto werr;
7133 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7134 }
7135 dictReleaseIterator(di);
7136 } else if (o->type == REDIS_ZSET) {
7137 /* Emit the ZADDs needed to rebuild the sorted set */
7138 zset *zs = o->ptr;
7139 dictIterator *di = dictGetIterator(zs->dict);
7140 dictEntry *de;
7141
7142 while((de = dictNext(di)) != NULL) {
7143 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7144 robj *eleobj = dictGetEntryKey(de);
7145 double *score = dictGetEntryVal(de);
7146
7147 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7148 if (fwriteBulk(fp,key) == 0) goto werr;
7149 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7150 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7151 }
7152 dictReleaseIterator(di);
7153 } else {
7154 redisAssert(0 != 0);
7155 }
7156 /* Save the expire time */
7157 if (expiretime != -1) {
7158 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7159 /* If this key is already expired skip it */
7160 if (expiretime < now) continue;
7161 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7162 if (fwriteBulk(fp,key) == 0) goto werr;
7163 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7164 }
7165 if (swapped) decrRefCount(o);
7166 }
7167 dictReleaseIterator(di);
7168 }
7169
7170 /* Make sure data will not remain on the OS's output buffers */
7171 fflush(fp);
7172 fsync(fileno(fp));
7173 fclose(fp);
7174
7175 /* Use RENAME to make sure the DB file is changed atomically only
7176 * if the generate DB file is ok. */
7177 if (rename(tmpfile,filename) == -1) {
7178 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7179 unlink(tmpfile);
7180 return REDIS_ERR;
7181 }
7182 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7183 return REDIS_OK;
7184
7185 werr:
7186 fclose(fp);
7187 unlink(tmpfile);
7188 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7189 if (di) dictReleaseIterator(di);
7190 return REDIS_ERR;
7191 }
7192
7193 /* This is how rewriting of the append only file in background works:
7194 *
7195 * 1) The user calls BGREWRITEAOF
7196 * 2) Redis calls this function, that forks():
7197 * 2a) the child rewrite the append only file in a temp file.
7198 * 2b) the parent accumulates differences in server.bgrewritebuf.
7199 * 3) When the child finished '2a' exists.
7200 * 4) The parent will trap the exit code, if it's OK, will append the
7201 * data accumulated into server.bgrewritebuf into the temp file, and
7202 * finally will rename(2) the temp file in the actual file name.
7203 * The the new file is reopened as the new append only file. Profit!
7204 */
7205 static int rewriteAppendOnlyFileBackground(void) {
7206 pid_t childpid;
7207
7208 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7209 if (server.vm_enabled) waitEmptyIOJobsQueue();
7210 if ((childpid = fork()) == 0) {
7211 /* Child */
7212 char tmpfile[256];
7213
7214 if (server.vm_enabled) vmReopenSwapFile();
7215 close(server.fd);
7216 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7217 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7218 _exit(0);
7219 } else {
7220 _exit(1);
7221 }
7222 } else {
7223 /* Parent */
7224 if (childpid == -1) {
7225 redisLog(REDIS_WARNING,
7226 "Can't rewrite append only file in background: fork: %s",
7227 strerror(errno));
7228 return REDIS_ERR;
7229 }
7230 redisLog(REDIS_NOTICE,
7231 "Background append only file rewriting started by pid %d",childpid);
7232 server.bgrewritechildpid = childpid;
7233 /* We set appendseldb to -1 in order to force the next call to the
7234 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7235 * accumulated by the parent into server.bgrewritebuf will start
7236 * with a SELECT statement and it will be safe to merge. */
7237 server.appendseldb = -1;
7238 return REDIS_OK;
7239 }
7240 return REDIS_OK; /* unreached */
7241 }
7242
7243 static void bgrewriteaofCommand(redisClient *c) {
7244 if (server.bgrewritechildpid != -1) {
7245 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7246 return;
7247 }
7248 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7249 char *status = "+Background append only file rewriting started\r\n";
7250 addReplySds(c,sdsnew(status));
7251 } else {
7252 addReply(c,shared.err);
7253 }
7254 }
7255
/* Remove the temp file used by the background rewrite child with pid
 * 'childpid'. The result of unlink(2) is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7262
7263 /* Virtual Memory is composed mainly of two subsystems:
7264 * - Blocking Virtual Memory
7265 * - Threaded Virtual Memory I/O
7266 * The two parts are not fully decoupled, but functions are split among two
7267 * different sections of the source code (delimited by comments) in order to
7268 * make more clear what functionality is about the blocking VM and what about
7269 * the threaded (not blocking) VM.
7270 *
7271 * Redis VM design:
7272 *
7273 * Redis VM is a blocking VM (one that blocks reading swapped values from
7274 * disk into memory when a value swapped out is needed in memory) that is made
7275 * unblocking by trying to examine the command argument vector in order to
7276 * load in background values that will likely be needed in order to exec
7277 * the command. The command is executed only once all the relevant keys
7278 * are loaded into memory.
7279 *
7280 * This is basically almost as simple as a blocking VM, but almost as parallel
7281 * as a fully non-blocking VM.
7282 */
7283
7284 /* =================== Virtual Memory - Blocking Side ====================== */
7285
7286 /* substitute the first occurrence of '%p' with the process pid in the
7287 * swap file name. */
7288 static void expandVmSwapFilename(void) {
7289 char *p = strstr(server.vm_swap_file,"%p");
7290 sds new;
7291
7292 if (!p) return;
7293 new = sdsempty();
7294 *p = '\0';
7295 new = sdscat(new,server.vm_swap_file);
7296 new = sdscatprintf(new,"%ld",(long) getpid());
7297 new = sdscat(new,p+2);
7298 zfree(server.vm_swap_file);
7299 server.vm_swap_file = new;
7300 }
7301
7302 static void vmInit(void) {
7303 off_t totsize;
7304 int pipefds[2];
7305 size_t stacksize;
7306
7307 if (server.vm_max_threads != 0)
7308 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7309
7310 expandVmSwapFilename();
7311 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
7312 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7313 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7314 }
7315 if (server.vm_fp == NULL) {
7316 redisLog(REDIS_WARNING,
7317 "Impossible to open the swap file: %s. Exiting.",
7318 strerror(errno));
7319 exit(1);
7320 }
7321 server.vm_fd = fileno(server.vm_fp);
7322 server.vm_next_page = 0;
7323 server.vm_near_pages = 0;
7324 server.vm_stats_used_pages = 0;
7325 server.vm_stats_swapped_objects = 0;
7326 server.vm_stats_swapouts = 0;
7327 server.vm_stats_swapins = 0;
7328 totsize = server.vm_pages*server.vm_page_size;
7329 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7330 if (ftruncate(server.vm_fd,totsize) == -1) {
7331 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7332 strerror(errno));
7333 exit(1);
7334 } else {
7335 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7336 }
7337 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
7338 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
7339 (long long) (server.vm_pages+7)/8, server.vm_pages);
7340 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
7341
7342 /* Initialize threaded I/O (used by Virtual Memory) */
7343 server.io_newjobs = listCreate();
7344 server.io_processing = listCreate();
7345 server.io_processed = listCreate();
7346 server.io_ready_clients = listCreate();
7347 pthread_mutex_init(&server.io_mutex,NULL);
7348 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7349 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
7350 server.io_active_threads = 0;
7351 if (pipe(pipefds) == -1) {
7352 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7353 ,strerror(errno));
7354 exit(1);
7355 }
7356 server.io_ready_pipe_read = pipefds[0];
7357 server.io_ready_pipe_write = pipefds[1];
7358 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
7359 /* LZF requires a lot of stack */
7360 pthread_attr_init(&server.io_threads_attr);
7361 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7362 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7363 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
7364 /* Listen for events in the threaded I/O pipe */
7365 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7366 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7367 oom("creating file event");
7368 }
7369
7370 /* Mark the page as used */
7371 static void vmMarkPageUsed(off_t page) {
7372 off_t byte = page/8;
7373 int bit = page&7;
7374 redisAssert(vmFreePage(page) == 1);
7375 server.vm_bitmap[byte] |= 1<<bit;
7376 }
7377
7378 /* Mark N contiguous pages as used, with 'page' being the first. */
7379 static void vmMarkPagesUsed(off_t page, off_t count) {
7380 off_t j;
7381
7382 for (j = 0; j < count; j++)
7383 vmMarkPageUsed(page+j);
7384 server.vm_stats_used_pages += count;
7385 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7386 (long long)count, (long long)page);
7387 }
7388
7389 /* Mark the page as free */
7390 static void vmMarkPageFree(off_t page) {
7391 off_t byte = page/8;
7392 int bit = page&7;
7393 redisAssert(vmFreePage(page) == 0);
7394 server.vm_bitmap[byte] &= ~(1<<bit);
7395 }
7396
7397 /* Mark N contiguous pages as free, with 'page' being the first. */
7398 static void vmMarkPagesFree(off_t page, off_t count) {
7399 off_t j;
7400
7401 for (j = 0; j < count; j++)
7402 vmMarkPageFree(page+j);
7403 server.vm_stats_used_pages -= count;
7404 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7405 (long long)count, (long long)page);
7406 }
7407
7408 /* Test if the page is free */
7409 static int vmFreePage(off_t page) {
7410 off_t byte = page/8;
7411 int bit = page&7;
7412 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
7413 }
7414
7415 /* Find N contiguous free pages storing the first page of the cluster in *first.
7416 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7417 * REDIS_ERR is returned.
7418 *
7419 * This function uses a simple algorithm: we try to allocate
7420 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7421 * again from the start of the swap file searching for free spaces.
7422 *
7423 * If it looks pretty clear that there are no free pages near our offset
7424 * we try to find less populated places doing a forward jump of
7425 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7426 * without hurry, and then we jump again and so forth...
7427 *
7428 * This function can be improved using a free list to avoid to guess
7429 * too much, since we could collect data about freed pages.
7430 *
7431 * note: I implemented this function just after watching an episode of
7432 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7433 */
7434 static int vmFindContiguousPages(off_t *first, off_t n) {
7435 off_t base, offset = 0, since_jump = 0, numfree = 0;
7436
7437 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7438 server.vm_near_pages = 0;
7439 server.vm_next_page = 0;
7440 }
7441 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7442 base = server.vm_next_page;
7443
7444 while(offset < server.vm_pages) {
7445 off_t this = base+offset;
7446
7447 /* If we overflow, restart from page zero */
7448 if (this >= server.vm_pages) {
7449 this -= server.vm_pages;
7450 if (this == 0) {
7451 /* Just overflowed, what we found on tail is no longer
7452 * interesting, as it's no longer contiguous. */
7453 numfree = 0;
7454 }
7455 }
7456 if (vmFreePage(this)) {
7457 /* This is a free page */
7458 numfree++;
7459 /* Already got N free pages? Return to the caller, with success */
7460 if (numfree == n) {
7461 *first = this-(n-1);
7462 server.vm_next_page = this+1;
7463 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
7464 return REDIS_OK;
7465 }
7466 } else {
7467 /* The current one is not a free page */
7468 numfree = 0;
7469 }
7470
7471 /* Fast-forward if the current page is not free and we already
7472 * searched enough near this place. */
7473 since_jump++;
7474 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7475 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7476 since_jump = 0;
7477 /* Note that even if we rewind after the jump, we are don't need
7478 * to make sure numfree is set to zero as we only jump *if* it
7479 * is set to zero. */
7480 } else {
7481 /* Otherwise just check the next page */
7482 offset++;
7483 }
7484 }
7485 return REDIS_ERR;
7486 }
7487
7488 /* Write the specified object at the specified page of the swap file */
7489 static int vmWriteObjectOnSwap(robj *o, off_t page) {
7490 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7491 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7492 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7493 redisLog(REDIS_WARNING,
7494 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7495 strerror(errno));
7496 return REDIS_ERR;
7497 }
7498 rdbSaveObject(server.vm_fp,o);
7499 fflush(server.vm_fp);
7500 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7501 return REDIS_OK;
7502 }
7503
7504 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7505 * needed to later retrieve the object into the key object.
7506 * If we can't find enough contiguous empty pages to swap the object on disk
7507 * REDIS_ERR is returned. */
7508 static int vmSwapObjectBlocking(robj *key, robj *val) {
7509 off_t pages = rdbSavedObjectPages(val,NULL);
7510 off_t page;
7511
7512 assert(key->storage == REDIS_VM_MEMORY);
7513 assert(key->refcount == 1);
7514 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
7515 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
7516 key->vm.page = page;
7517 key->vm.usedpages = pages;
7518 key->storage = REDIS_VM_SWAPPED;
7519 key->vtype = val->type;
7520 decrRefCount(val); /* Deallocate the object from memory. */
7521 vmMarkPagesUsed(page,pages);
7522 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7523 (unsigned char*) key->ptr,
7524 (unsigned long long) page, (unsigned long long) pages);
7525 server.vm_stats_swapped_objects++;
7526 server.vm_stats_swapouts++;
7527 return REDIS_OK;
7528 }
7529
7530 static robj *vmReadObjectFromSwap(off_t page, int type) {
7531 robj *o;
7532
7533 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7534 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7535 redisLog(REDIS_WARNING,
7536 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7537 strerror(errno));
7538 _exit(1);
7539 }
7540 o = rdbLoadObject(type,server.vm_fp);
7541 if (o == NULL) {
7542 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
7543 _exit(1);
7544 }
7545 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7546 return o;
7547 }
7548
7549 /* Load the value object relative to the 'key' object from swap to memory.
7550 * The newly allocated object is returned.
7551 *
7552 * If preview is true the unserialized object is returned to the caller but
7553 * no changes are made to the key object, nor the pages are marked as freed */
7554 static robj *vmGenericLoadObject(robj *key, int preview) {
7555 robj *val;
7556
7557 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
7558 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7559 if (!preview) {
7560 key->storage = REDIS_VM_MEMORY;
7561 key->vm.atime = server.unixtime;
7562 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7563 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7564 (unsigned char*) key->ptr);
7565 server.vm_stats_swapped_objects--;
7566 } else {
7567 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7568 (unsigned char*) key->ptr);
7569 }
7570 server.vm_stats_swapins++;
7571 return val;
7572 }
7573
7574 /* Plain object loading, from swap to memory */
7575 static robj *vmLoadObject(robj *key) {
7576 /* If we are loading the object in background, stop it, we
7577 * need to load this object synchronously ASAP. */
7578 if (key->storage == REDIS_VM_LOADING)
7579 vmCancelThreadedIOJob(key);
7580 return vmGenericLoadObject(key,0);
7581 }
7582
7583 /* Just load the value on disk, without to modify the key.
7584 * This is useful when we want to perform some operation on the value
7585 * without to really bring it from swap to memory, like while saving the
7586 * dataset or rewriting the append only log. */
7587 static robj *vmPreviewObject(robj *key) {
7588 return vmGenericLoadObject(key,1);
7589 }
7590
7591 /* How a good candidate is this object for swapping?
7592 * The better candidate it is, the greater the returned value.
7593 *
7594 * Currently we try to perform a fast estimation of the object size in
7595 * memory, and combine it with aging informations.
7596 *
7597 * Basically swappability = idle-time * log(estimated size)
7598 *
7599 * Bigger objects are preferred over smaller objects, but not
7600 * proportionally, this is why we use the logarithm. This algorithm is
7601 * just a first try and will probably be tuned later. */
7602 static double computeObjectSwappability(robj *o) {
7603 time_t age = server.unixtime - o->vm.atime;
7604 long asize = 0;
7605 list *l;
7606 dict *d;
7607 struct dictEntry *de;
7608 int z;
7609
7610 if (age <= 0) return 0;
7611 switch(o->type) {
7612 case REDIS_STRING:
7613 if (o->encoding != REDIS_ENCODING_RAW) {
7614 asize = sizeof(*o);
7615 } else {
7616 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7617 }
7618 break;
7619 case REDIS_LIST:
7620 l = o->ptr;
7621 listNode *ln = listFirst(l);
7622
7623 asize = sizeof(list);
7624 if (ln) {
7625 robj *ele = ln->value;
7626 long elesize;
7627
7628 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7629 (sizeof(*o)+sdslen(ele->ptr)) :
7630 sizeof(*o);
7631 asize += (sizeof(listNode)+elesize)*listLength(l);
7632 }
7633 break;
7634 case REDIS_SET:
7635 case REDIS_ZSET:
7636 z = (o->type == REDIS_ZSET);
7637 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7638
7639 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7640 if (z) asize += sizeof(zset)-sizeof(dict);
7641 if (dictSize(d)) {
7642 long elesize;
7643 robj *ele;
7644
7645 de = dictGetRandomKey(d);
7646 ele = dictGetEntryKey(de);
7647 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7648 (sizeof(*o)+sdslen(ele->ptr)) :
7649 sizeof(*o);
7650 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7651 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7652 }
7653 break;
7654 }
7655 return (double)age*log(1+asize);
7656 }
7657
7658 /* Try to swap an object that's a good candidate for swapping.
7659 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7660 * to swap any object at all.
7661 *
7662 * If 'usethreaded' is true, Redis will try to swap the object in background
7663 * using I/O threads. */
7664 static int vmSwapOneObject(int usethreads) {
7665 int j, i;
7666 struct dictEntry *best = NULL;
7667 double best_swappability = 0;
7668 redisDb *best_db = NULL;
7669 robj *key, *val;
7670
7671 for (j = 0; j < server.dbnum; j++) {
7672 redisDb *db = server.db+j;
7673 /* Why maxtries is set to 100?
7674 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7675 * are swappable objects */
7676 int maxtries = 100;
7677
7678 if (dictSize(db->dict) == 0) continue;
7679 for (i = 0; i < 5; i++) {
7680 dictEntry *de;
7681 double swappability;
7682
7683 if (maxtries) maxtries--;
7684 de = dictGetRandomKey(db->dict);
7685 key = dictGetEntryKey(de);
7686 val = dictGetEntryVal(de);
7687 /* Only swap objects that are currently in memory.
7688 *
7689 * Also don't swap shared objects if threaded VM is on, as we
7690 * try to ensure that the main thread does not touch the
7691 * object while the I/O thread is using it, but we can't
7692 * control other keys without adding additional mutex. */
7693 if (key->storage != REDIS_VM_MEMORY ||
7694 (server.vm_max_threads != 0 && val->refcount != 1)) {
7695 if (maxtries) i--; /* don't count this try */
7696 continue;
7697 }
7698 swappability = computeObjectSwappability(val);
7699 if (!best || swappability > best_swappability) {
7700 best = de;
7701 best_swappability = swappability;
7702 best_db = db;
7703 }
7704 }
7705 }
7706 if (best == NULL) return REDIS_ERR;
7707 key = dictGetEntryKey(best);
7708 val = dictGetEntryVal(best);
7709
7710 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
7711 key->ptr, best_swappability);
7712
7713 /* Unshare the key if needed */
7714 if (key->refcount > 1) {
7715 robj *newkey = dupStringObject(key);
7716 decrRefCount(key);
7717 key = dictGetEntryKey(best) = newkey;
7718 }
7719 /* Swap it */
7720 if (usethreads) {
7721 vmSwapObjectThreaded(key,val,best_db);
7722 return REDIS_OK;
7723 } else {
7724 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7725 dictGetEntryVal(best) = NULL;
7726 return REDIS_OK;
7727 } else {
7728 return REDIS_ERR;
7729 }
7730 }
7731 }
7732
/* Swap one object out in a blocking way. See vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
7736
/* Swap out one object using the threaded code path: forwards to
 * vmSwapOneObject() with threaded I/O turned on and returns its
 * status code unchanged. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
7740
7741 /* Return true if it's safe to swap out objects in a given moment.
7742 * Basically we don't want to swap objects out while there is a BGSAVE
7743 * or a BGAEOREWRITE running in backgroud. */
7744 static int vmCanSwapOut(void) {
7745 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7746 }
7747
7748 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
7749 * and was deleted. Otherwise 0 is returned. */
7750 static int deleteIfSwapped(redisDb *db, robj *key) {
7751 dictEntry *de;
7752 robj *foundkey;
7753
7754 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7755 foundkey = dictGetEntryKey(de);
7756 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7757 deleteKey(db,key);
7758 return 1;
7759 }
7760
7761 /* =================== Virtual Memory - Threaded I/O ======================= */
7762
7763 static void freeIOJob(iojob *j) {
7764 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
7765 j->type == REDIS_IOJOB_DO_SWAP ||
7766 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
7767 decrRefCount(j->val);
7768 decrRefCount(j->key);
7769 zfree(j);
7770 }
7771
7772 /* Every time a thread finished a Job, it writes a byte into the write side
7773 * of an unix pipe in order to "awake" the main thread, and this function
7774 * is called. */
7775 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7776 int mask)
7777 {
7778 char buf[1];
7779 int retval, processed = 0, toprocess = -1, trytoswap = 1;
7780 REDIS_NOTUSED(el);
7781 REDIS_NOTUSED(mask);
7782 REDIS_NOTUSED(privdata);
7783
7784 /* For every byte we read in the read side of the pipe, there is one
7785 * I/O job completed to process. */
7786 while((retval = read(fd,buf,1)) == 1) {
7787 iojob *j;
7788 listNode *ln;
7789 robj *key;
7790 struct dictEntry *de;
7791
7792 redisLog(REDIS_DEBUG,"Processing I/O completed job");
7793
7794 /* Get the processed element (the oldest one) */
7795 lockThreadedIO();
7796 assert(listLength(server.io_processed) != 0);
7797 if (toprocess == -1) {
7798 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
7799 if (toprocess <= 0) toprocess = 1;
7800 }
7801 ln = listFirst(server.io_processed);
7802 j = ln->value;
7803 listDelNode(server.io_processed,ln);
7804 unlockThreadedIO();
7805 /* If this job is marked as canceled, just ignore it */
7806 if (j->canceled) {
7807 freeIOJob(j);
7808 continue;
7809 }
7810 /* Post process it in the main thread, as there are things we
7811 * can do just here to avoid race conditions and/or invasive locks */
7812 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
7813 de = dictFind(j->db->dict,j->key);
7814 assert(de != NULL);
7815 key = dictGetEntryKey(de);
7816 if (j->type == REDIS_IOJOB_LOAD) {
7817 redisDb *db;
7818
7819 /* Key loaded, bring it at home */
7820 key->storage = REDIS_VM_MEMORY;
7821 key->vm.atime = server.unixtime;
7822 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7823 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
7824 (unsigned char*) key->ptr);
7825 server.vm_stats_swapped_objects--;
7826 server.vm_stats_swapins++;
7827 dictGetEntryVal(de) = j->val;
7828 incrRefCount(j->val);
7829 db = j->db;
7830 freeIOJob(j);
7831 /* Handle clients waiting for this key to be loaded. */
7832 handleClientsBlockedOnSwappedKey(db,key);
7833 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7834 /* Now we know the amount of pages required to swap this object.
7835 * Let's find some space for it, and queue this task again
7836 * rebranded as REDIS_IOJOB_DO_SWAP. */
7837 if (!vmCanSwapOut() ||
7838 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
7839 {
7840 /* Ooops... no space or we can't swap as there is
7841 * a fork()ed Redis trying to save stuff on disk. */
7842 freeIOJob(j);
7843 key->storage = REDIS_VM_MEMORY; /* undo operation */
7844 } else {
7845 /* Note that we need to mark this pages as used now,
7846 * if the job will be canceled, we'll mark them as freed
7847 * again. */
7848 vmMarkPagesUsed(j->page,j->pages);
7849 j->type = REDIS_IOJOB_DO_SWAP;
7850 lockThreadedIO();
7851 queueIOJob(j);
7852 unlockThreadedIO();
7853 }
7854 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
7855 robj *val;
7856
7857 /* Key swapped. We can finally free some memory. */
7858 if (key->storage != REDIS_VM_SWAPPING) {
7859 printf("key->storage: %d\n",key->storage);
7860 printf("key->name: %s\n",(char*)key->ptr);
7861 printf("key->refcount: %d\n",key->refcount);
7862 printf("val: %p\n",(void*)j->val);
7863 printf("val->type: %d\n",j->val->type);
7864 printf("val->ptr: %s\n",(char*)j->val->ptr);
7865 }
7866 redisAssert(key->storage == REDIS_VM_SWAPPING);
7867 val = dictGetEntryVal(de);
7868 key->vm.page = j->page;
7869 key->vm.usedpages = j->pages;
7870 key->storage = REDIS_VM_SWAPPED;
7871 key->vtype = j->val->type;
7872 decrRefCount(val); /* Deallocate the object from memory. */
7873 dictGetEntryVal(de) = NULL;
7874 redisLog(REDIS_DEBUG,
7875 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7876 (unsigned char*) key->ptr,
7877 (unsigned long long) j->page, (unsigned long long) j->pages);
7878 server.vm_stats_swapped_objects++;
7879 server.vm_stats_swapouts++;
7880 freeIOJob(j);
7881 /* Put a few more swap requests in queue if we are still
7882 * out of memory */
7883 if (trytoswap && vmCanSwapOut() &&
7884 zmalloc_used_memory() > server.vm_max_memory)
7885 {
7886 int more = 1;
7887 while(more) {
7888 lockThreadedIO();
7889 more = listLength(server.io_newjobs) <
7890 (unsigned) server.vm_max_threads;
7891 unlockThreadedIO();
7892 /* Don't waste CPU time if swappable objects are rare. */
7893 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
7894 trytoswap = 0;
7895 break;
7896 }
7897 }
7898 }
7899 }
7900 processed++;
7901 if (processed == toprocess) return;
7902 }
7903 if (retval < 0 && errno != EAGAIN) {
7904 redisLog(REDIS_WARNING,
7905 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7906 strerror(errno));
7907 }
7908 }
7909
7910 static void lockThreadedIO(void) {
7911 pthread_mutex_lock(&server.io_mutex);
7912 }
7913
7914 static void unlockThreadedIO(void) {
7915 pthread_mutex_unlock(&server.io_mutex);
7916 }
7917
7918 /* Remove the specified object from the threaded I/O queue if still not
7919 * processed, otherwise make sure to flag it as canceled. */
7920 static void vmCancelThreadedIOJob(robj *o) {
7921 list *lists[3] = {
7922 server.io_newjobs, /* 0 */
7923 server.io_processing, /* 1 */
7924 server.io_processed /* 2 */
7925 };
7926 int i;
7927
7928 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
7929 again:
7930 lockThreadedIO();
7931 /* Search for a matching key in one of the queues */
7932 for (i = 0; i < 3; i++) {
7933 listNode *ln;
7934 listIter li;
7935
7936 listRewind(lists[i],&li);
7937 while ((ln = listNext(&li)) != NULL) {
7938 iojob *job = ln->value;
7939
7940 if (job->canceled) continue; /* Skip this, already canceled. */
7941 if (compareStringObjects(job->key,o) == 0) {
7942 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
7943 (void*)job, (char*)o->ptr, job->type, i);
7944 /* Mark the pages as free since the swap didn't happened
7945 * or happened but is now discarded. */
7946 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
7947 vmMarkPagesFree(job->page,job->pages);
7948 /* Cancel the job. It depends on the list the job is
7949 * living in. */
7950 switch(i) {
7951 case 0: /* io_newjobs */
7952 /* If the job was yet not processed the best thing to do
7953 * is to remove it from the queue at all */
7954 freeIOJob(job);
7955 listDelNode(lists[i],ln);
7956 break;
7957 case 1: /* io_processing */
7958 /* Oh Shi- the thread is messing with the Job:
7959 *
7960 * Probably it's accessing the object if this is a
7961 * PREPARE_SWAP or DO_SWAP job.
7962 * If it's a LOAD job it may be reading from disk and
7963 * if we don't wait for the job to terminate before to
7964 * cancel it, maybe in a few microseconds data can be
7965 * corrupted in this pages. So the short story is:
7966 *
7967 * Better to wait for the job to move into the
7968 * next queue (processed)... */
7969
7970 /* We try again and again until the job is completed. */
7971 unlockThreadedIO();
7972 /* But let's wait some time for the I/O thread
7973 * to finish with this job. After all this condition
7974 * should be very rare. */
7975 usleep(1);
7976 goto again;
7977 case 2: /* io_processed */
7978 /* The job was already processed, that's easy...
7979 * just mark it as canceled so that we'll ignore it
7980 * when processing completed jobs. */
7981 job->canceled = 1;
7982 break;
7983 }
7984 /* Finally we have to adjust the storage type of the object
7985 * in order to "UNDO" the operaiton. */
7986 if (o->storage == REDIS_VM_LOADING)
7987 o->storage = REDIS_VM_SWAPPED;
7988 else if (o->storage == REDIS_VM_SWAPPING)
7989 o->storage = REDIS_VM_MEMORY;
7990 unlockThreadedIO();
7991 return;
7992 }
7993 }
7994 }
7995 unlockThreadedIO();
7996 assert(1 != 1); /* We should never reach this */
7997 }
7998
7999 static void *IOThreadEntryPoint(void *arg) {
8000 iojob *j;
8001 listNode *ln;
8002 REDIS_NOTUSED(arg);
8003
8004 pthread_detach(pthread_self());
8005 while(1) {
8006 /* Get a new job to process */
8007 lockThreadedIO();
8008 if (listLength(server.io_newjobs) == 0) {
8009 /* No new jobs in queue, exit. */
8010 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8011 (long) pthread_self());
8012 server.io_active_threads--;
8013 unlockThreadedIO();
8014 return NULL;
8015 }
8016 ln = listFirst(server.io_newjobs);
8017 j = ln->value;
8018 listDelNode(server.io_newjobs,ln);
8019 /* Add the job in the processing queue */
8020 j->thread = pthread_self();
8021 listAddNodeTail(server.io_processing,j);
8022 ln = listLast(server.io_processing); /* We use ln later to remove it */
8023 unlockThreadedIO();
8024 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8025 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8026
8027 /* Process the Job */
8028 if (j->type == REDIS_IOJOB_LOAD) {
8029 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8030 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8031 FILE *fp = fopen("/dev/null","w+");
8032 j->pages = rdbSavedObjectPages(j->val,fp);
8033 fclose(fp);
8034 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8035 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8036 j->canceled = 1;
8037 }
8038
8039 /* Done: insert the job into the processed queue */
8040 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8041 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8042 lockThreadedIO();
8043 listDelNode(server.io_processing,ln);
8044 listAddNodeTail(server.io_processed,j);
8045 unlockThreadedIO();
8046
8047 /* Signal the main thread there is new stuff to process */
8048 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8049 }
8050 return NULL; /* never reached */
8051 }
8052
8053 static void spawnIOThread(void) {
8054 pthread_t thread;
8055 sigset_t mask, omask;
8056
8057 sigemptyset(&mask);
8058 sigaddset(&mask,SIGCHLD);
8059 sigaddset(&mask,SIGHUP);
8060 sigaddset(&mask,SIGPIPE);
8061 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8062 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8063 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8064 server.io_active_threads++;
8065 }
8066
8067 /* We need to wait for the last thread to exit before we are able to
8068 * fork() in order to BGSAVE or BGREWRITEAOF. */
8069 static void waitEmptyIOJobsQueue(void) {
8070 while(1) {
8071 int io_processed_len;
8072
8073 lockThreadedIO();
8074 if (listLength(server.io_newjobs) == 0 &&
8075 listLength(server.io_processing) == 0 &&
8076 server.io_active_threads == 0)
8077 {
8078 unlockThreadedIO();
8079 return;
8080 }
8081 /* While waiting for empty jobs queue condition we post-process some
8082 * finshed job, as I/O threads may be hanging trying to write against
8083 * the io_ready_pipe_write FD but there are so much pending jobs that
8084 * it's blocking. */
8085 io_processed_len = listLength(server.io_processed);
8086 unlockThreadedIO();
8087 if (io_processed_len) {
8088 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8089 usleep(1000); /* 1 millisecond */
8090 } else {
8091 usleep(10000); /* 10 milliseconds */
8092 }
8093 }
8094 }
8095
8096 static void vmReopenSwapFile(void) {
8097 /* Note: we don't close the old one as we are in the child process
8098 * and don't want to mess at all with the original file object. */
8099 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8100 if (server.vm_fp == NULL) {
8101 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8102 server.vm_swap_file);
8103 _exit(1);
8104 }
8105 server.vm_fd = fileno(server.vm_fp);
8106 }
8107
8108 /* This function must be called while with threaded IO locked */
8109 static void queueIOJob(iojob *j) {
8110 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8111 (void*)j, j->type, (char*)j->key->ptr);
8112 listAddNodeTail(server.io_newjobs,j);
8113 if (server.io_active_threads < server.vm_max_threads)
8114 spawnIOThread();
8115 }
8116
8117 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8118 iojob *j;
8119
8120 assert(key->storage == REDIS_VM_MEMORY);
8121 assert(key->refcount == 1);
8122
8123 j = zmalloc(sizeof(*j));
8124 j->type = REDIS_IOJOB_PREPARE_SWAP;
8125 j->db = db;
8126 j->key = dupStringObject(key);
8127 j->val = val;
8128 incrRefCount(val);
8129 j->canceled = 0;
8130 j->thread = (pthread_t) -1;
8131 key->storage = REDIS_VM_SWAPPING;
8132
8133 lockThreadedIO();
8134 queueIOJob(j);
8135 unlockThreadedIO();
8136 return REDIS_OK;
8137 }
8138
8139 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8140
8141 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8142 * If there is not already a job loading the key, it is craeted.
8143 * The key is added to the io_keys list in the client structure, and also
8144 * in the hash table mapping swapped keys to waiting clients, that is,
8145 * server.io_waited_keys. */
8146 static int waitForSwappedKey(redisClient *c, robj *key) {
8147 struct dictEntry *de;
8148 robj *o;
8149 list *l;
8150
8151 /* If the key does not exist or is already in RAM we don't need to
8152 * block the client at all. */
8153 de = dictFind(c->db->dict,key);
8154 if (de == NULL) return 0;
8155 o = dictGetEntryKey(de);
8156 if (o->storage == REDIS_VM_MEMORY) {
8157 return 0;
8158 } else if (o->storage == REDIS_VM_SWAPPING) {
8159 /* We were swapping the key, undo it! */
8160 vmCancelThreadedIOJob(o);
8161 return 0;
8162 }
8163
8164 /* OK: the key is either swapped, or being loaded just now. */
8165
8166 /* Add the key to the list of keys this client is waiting for.
8167 * This maps clients to keys they are waiting for. */
8168 listAddNodeTail(c->io_keys,key);
8169 incrRefCount(key);
8170
8171 /* Add the client to the swapped keys => clients waiting map. */
8172 de = dictFind(c->db->io_keys,key);
8173 if (de == NULL) {
8174 int retval;
8175
8176 /* For every key we take a list of clients blocked for it */
8177 l = listCreate();
8178 retval = dictAdd(c->db->io_keys,key,l);
8179 incrRefCount(key);
8180 assert(retval == DICT_OK);
8181 } else {
8182 l = dictGetEntryVal(de);
8183 }
8184 listAddNodeTail(l,c);
8185
8186 /* Are we already loading the key from disk? If not create a job */
8187 if (o->storage == REDIS_VM_SWAPPED) {
8188 iojob *j;
8189
8190 o->storage = REDIS_VM_LOADING;
8191 j = zmalloc(sizeof(*j));
8192 j->type = REDIS_IOJOB_LOAD;
8193 j->db = c->db;
8194 j->key = dupStringObject(key);
8195 j->key->vtype = o->vtype;
8196 j->page = o->vm.page;
8197 j->val = NULL;
8198 j->canceled = 0;
8199 j->thread = (pthread_t) -1;
8200 lockThreadedIO();
8201 queueIOJob(j);
8202 unlockThreadedIO();
8203 }
8204 return 1;
8205 }
8206
8207 /* Is this client attempting to run a command against swapped keys?
8208 * If so, block it ASAP, load the keys in background, then resume it.
8209 *
8210 * The important idea about this function is that it can fail! If keys will
8211 * still be swapped when the client is resumed, this key lookups will
8212 * just block loading keys from disk. In practical terms this should only
8213 * happen with SORT BY command or if there is a bug in this function.
8214 *
8215 * Return 1 if the client is marked as blocked, 0 if the client can
8216 * continue as the keys it is going to access appear to be in memory. */
8217 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8218 int j, last;
8219
8220 if (cmd->vm_firstkey == 0) return 0;
8221 last = cmd->vm_lastkey;
8222 if (last < 0) last = c->argc+last;
8223 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8224 waitForSwappedKey(c,c->argv[j]);
8225 /* If the client was blocked for at least one key, mark it as blocked. */
8226 if (listLength(c->io_keys)) {
8227 c->flags |= REDIS_IO_WAIT;
8228 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8229 server.vm_blocked_clients++;
8230 return 1;
8231 } else {
8232 return 0;
8233 }
8234 }
8235
8236 /* Remove the 'key' from the list of blocked keys for a given client.
8237 *
8238 * The function returns 1 when there are no longer blocking keys after
8239 * the current one was removed (and the client can be unblocked). */
8240 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8241 list *l;
8242 listNode *ln;
8243 listIter li;
8244 struct dictEntry *de;
8245
8246 /* Remove the key from the list of keys this client is waiting for. */
8247 listRewind(c->io_keys,&li);
8248 while ((ln = listNext(&li)) != NULL) {
8249 if (compareStringObjects(ln->value,key) == 0) {
8250 listDelNode(c->io_keys,ln);
8251 break;
8252 }
8253 }
8254 assert(ln != NULL);
8255
8256 /* Remove the client form the key => waiting clients map. */
8257 de = dictFind(c->db->io_keys,key);
8258 assert(de != NULL);
8259 l = dictGetEntryVal(de);
8260 ln = listSearchKey(l,c);
8261 assert(ln != NULL);
8262 listDelNode(l,ln);
8263 if (listLength(l) == 0)
8264 dictDelete(c->db->io_keys,key);
8265
8266 return listLength(c->io_keys) == 0;
8267 }
8268
8269 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8270 struct dictEntry *de;
8271 list *l;
8272 listNode *ln;
8273 int len;
8274
8275 de = dictFind(db->io_keys,key);
8276 if (!de) return;
8277
8278 l = dictGetEntryVal(de);
8279 len = listLength(l);
8280 /* Note: we can't use something like while(listLength(l)) as the list
8281 * can be freed by the calling function when we remove the last element. */
8282 while (len--) {
8283 ln = listFirst(l);
8284 redisClient *c = ln->value;
8285
8286 if (dontWaitForSwappedKey(c,key)) {
8287 /* Put the client in the list of clients ready to go as we
8288 * loaded all the keys about it. */
8289 listAddNodeTail(server.io_ready_clients,c);
8290 }
8291 }
8292 }
8293
8294 /* ================================= Debugging ============================== */
8295
8296 static void debugCommand(redisClient *c) {
8297 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8298 *((char*)-1) = 'x';
8299 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8300 if (rdbSave(server.dbfilename) != REDIS_OK) {
8301 addReply(c,shared.err);
8302 return;
8303 }
8304 emptyDb();
8305 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8306 addReply(c,shared.err);
8307 return;
8308 }
8309 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8310 addReply(c,shared.ok);
8311 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8312 emptyDb();
8313 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8314 addReply(c,shared.err);
8315 return;
8316 }
8317 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8318 addReply(c,shared.ok);
8319 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8320 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8321 robj *key, *val;
8322
8323 if (!de) {
8324 addReply(c,shared.nokeyerr);
8325 return;
8326 }
8327 key = dictGetEntryKey(de);
8328 val = dictGetEntryVal(de);
8329 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8330 key->storage == REDIS_VM_SWAPPING)) {
8331 addReplySds(c,sdscatprintf(sdsempty(),
8332 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8333 "encoding:%d serializedlength:%lld\r\n",
8334 (void*)key, key->refcount, (void*)val, val->refcount,
8335 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
8336 } else {
8337 addReplySds(c,sdscatprintf(sdsempty(),
8338 "+Key at:%p refcount:%d, value swapped at: page %llu "
8339 "using %llu pages\r\n",
8340 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8341 (unsigned long long) key->vm.usedpages));
8342 }
8343 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8344 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8345 robj *key, *val;
8346
8347 if (!server.vm_enabled) {
8348 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8349 return;
8350 }
8351 if (!de) {
8352 addReply(c,shared.nokeyerr);
8353 return;
8354 }
8355 key = dictGetEntryKey(de);
8356 val = dictGetEntryVal(de);
8357 /* If the key is shared we want to create a copy */
8358 if (key->refcount > 1) {
8359 robj *newkey = dupStringObject(key);
8360 decrRefCount(key);
8361 key = dictGetEntryKey(de) = newkey;
8362 }
8363 /* Swap it */
8364 if (key->storage != REDIS_VM_MEMORY) {
8365 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
8366 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8367 dictGetEntryVal(de) = NULL;
8368 addReply(c,shared.ok);
8369 } else {
8370 addReply(c,shared.err);
8371 }
8372 } else {
8373 addReplySds(c,sdsnew(
8374 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8375 }
8376 }
8377
8378 static void _redisAssert(char *estr, char *file, int line) {
8379 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
8380 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
8381 #ifdef HAVE_BACKTRACE
8382 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8383 *((char*)-1) = 'x';
8384 #endif
8385 }
8386
8387 /* =================================== Main! ================================ */
8388
8389 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted to an integer with atoi().
 * Returns -1 if the file can't be opened or nothing can be read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return value;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
8403
8404 void linuxOvercommitMemoryWarning(void) {
8405 if (linuxOvercommitMemoryValue() == 0) {
8406 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8407 }
8408 }
8409 #endif /* __linux__ */
8410
8411 static void daemonize(void) {
8412 int fd;
8413 FILE *fp;
8414
8415 if (fork() != 0) exit(0); /* parent exits */
8416 setsid(); /* create a new session */
8417
8418 /* Every output goes to /dev/null. If Redis is daemonized but
8419 * the 'logfile' is set to 'stdout' in the configuration file
8420 * it will not log at all. */
8421 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8422 dup2(fd, STDIN_FILENO);
8423 dup2(fd, STDOUT_FILENO);
8424 dup2(fd, STDERR_FILENO);
8425 if (fd > STDERR_FILENO) close(fd);
8426 }
8427 /* Try to write the pid file */
8428 fp = fopen(server.pidfile,"w");
8429 if (fp) {
8430 fprintf(fp,"%d\n",getpid());
8431 fclose(fp);
8432 }
8433 }
8434
8435 int main(int argc, char **argv) {
8436 time_t start;
8437
8438 initServerConfig();
8439 if (argc == 2) {
8440 resetServerSaveParams();
8441 loadServerConfig(argv[1]);
8442 } else if (argc > 2) {
8443 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8444 exit(1);
8445 } else {
8446 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8447 }
8448 if (server.daemonize) daemonize();
8449 initServer();
8450 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8451 #ifdef __linux__
8452 linuxOvercommitMemoryWarning();
8453 #endif
8454 start = time(NULL);
8455 if (server.appendonly) {
8456 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
8457 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
8458 } else {
8459 if (rdbLoad(server.dbfilename) == REDIS_OK)
8460 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
8461 }
8462 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
8463 aeSetBeforeSleepProc(server.el,beforeSleep);
8464 aeMain(server.el);
8465 aeDeleteEventLoop(server.el);
8466 return 0;
8467 }
8468
8469 /* ============================= Backtrace support ========================= */
8470
8471 #ifdef HAVE_BACKTRACE
8472 static char *findFuncName(void *pointer, unsigned long *offset);
8473
8474 static void *getMcontextEip(ucontext_t *uc) {
8475 #if defined(__FreeBSD__)
8476 return (void*) uc->uc_mcontext.mc_eip;
8477 #elif defined(__dietlibc__)
8478 return (void*) uc->uc_mcontext.eip;
8479 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8480 #if __x86_64__
8481 return (void*) uc->uc_mcontext->__ss.__rip;
8482 #else
8483 return (void*) uc->uc_mcontext->__ss.__eip;
8484 #endif
8485 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8486 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8487 return (void*) uc->uc_mcontext->__ss.__rip;
8488 #else
8489 return (void*) uc->uc_mcontext->__ss.__eip;
8490 #endif
8491 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8492 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
8493 #elif defined(__ia64__) /* Linux IA64 */
8494 return (void*) uc->uc_mcontext.sc_ip;
8495 #else
8496 return NULL;
8497 #endif
8498 }
8499
8500 static void segvHandler(int sig, siginfo_t *info, void *secret) {
8501 void *trace[100];
8502 char **messages = NULL;
8503 int i, trace_size = 0;
8504 unsigned long offset=0;
8505 ucontext_t *uc = (ucontext_t*) secret;
8506 sds infostring;
8507 REDIS_NOTUSED(info);
8508
8509 redisLog(REDIS_WARNING,
8510 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
8511 infostring = genRedisInfoString();
8512 redisLog(REDIS_WARNING, "%s",infostring);
8513 /* It's not safe to sdsfree() the returned string under memory
8514 * corruption conditions. Let it leak as we are going to abort */
8515
8516 trace_size = backtrace(trace, 100);
8517 /* overwrite sigaction with caller's address */
8518 if (getMcontextEip(uc) != NULL) {
8519 trace[1] = getMcontextEip(uc);
8520 }
8521 messages = backtrace_symbols(trace, trace_size);
8522
8523 for (i=1; i<trace_size; ++i) {
8524 char *fn = findFuncName(trace[i], &offset), *p;
8525
8526 p = strchr(messages[i],'+');
8527 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8528 redisLog(REDIS_WARNING,"%s", messages[i]);
8529 } else {
8530 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8531 }
8532 }
8533 /* free(messages); Don't call free() with possibly corrupted memory. */
8534 _exit(0);
8535 }
8536
8537 static void setupSigSegvAction(void) {
8538 struct sigaction act;
8539
8540 sigemptyset (&act.sa_mask);
8541 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8542 * is used. Otherwise, sa_handler is used */
8543 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8544 act.sa_sigaction = segvHandler;
8545 sigaction (SIGSEGV, &act, NULL);
8546 sigaction (SIGBUS, &act, NULL);
8547 sigaction (SIGFPE, &act, NULL);
8548 sigaction (SIGILL, &act, NULL);
8549 sigaction (SIGBUS, &act, NULL);
8550 return;
8551 }
8552
8553 #include "staticsymbols.h"
8554 /* This function try to convert a pointer into a function name. It's used in
8555 * oreder to provide a backtrace under segmentation fault that's able to
8556 * display functions declared as static (otherwise the backtrace is useless). */
8557 static char *findFuncName(void *pointer, unsigned long *offset){
8558 int i, ret = -1;
8559 unsigned long off, minoff = 0;
8560
8561 /* Try to match against the Symbol with the smallest offset */
8562 for (i=0; symsTable[i].pointer; i++) {
8563 unsigned long lp = (unsigned long) pointer;
8564
8565 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8566 off=lp-symsTable[i].pointer;
8567 if (ret < 0 || off < minoff) {
8568 minoff=off;
8569 ret=i;
8570 }
8571 }
8572 }
8573 if (ret == -1) return NULL;
8574 *offset = minoff;
8575 return symsTable[ret].name;
8576 }
8577 #else /* HAVE_BACKTRACE */
/* Variant used when backtrace support is not compiled in: there is no
 * crash handler to install, so this does nothing. */
static void setupSigSegvAction(void) {
    return;
}
8580 #endif /* HAVE_BACKTRACE */
8581
8582
8583
8584 /* The End */
8585
8586
8587