]> git.saurik.com Git - redis.git/blame - redis.c
more steps towards a working non blocking VM
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
2 * Copyright (c) 2006-2009, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
5dc70bff 30#define REDIS_VERSION "1.3.2"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
ed9b544e 41#include <signal.h>
fbf9bcdb 42
43#ifdef HAVE_BACKTRACE
c9468bcf 44#include <execinfo.h>
45#include <ucontext.h>
fbf9bcdb 46#endif /* HAVE_BACKTRACE */
47
ed9b544e 48#include <sys/wait.h>
49#include <errno.h>
50#include <assert.h>
51#include <ctype.h>
52#include <stdarg.h>
53#include <inttypes.h>
54#include <arpa/inet.h>
55#include <sys/stat.h>
56#include <fcntl.h>
57#include <sys/time.h>
58#include <sys/resource.h>
2895e862 59#include <sys/uio.h>
f78fd11b 60#include <limits.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ed9b544e 77
78/* Error codes */
79#define REDIS_OK 0
80#define REDIS_ERR -1
81
82/* Static server configuration */
83#define REDIS_SERVERPORT 6379 /* TCP port */
84#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 85#define REDIS_IOBUF_LEN 1024
ed9b544e 86#define REDIS_LOADBUF_LEN 1024
93ea3759 87#define REDIS_STATIC_ARGS 4
ed9b544e 88#define REDIS_DEFAULT_DBNUM 16
89#define REDIS_CONFIGLINE_MAX 1024
90#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
91#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94754ccc 92#define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
6f376729 93#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 94#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
95
96/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
97#define REDIS_WRITEV_THRESHOLD 3
98/* Max number of iovecs used for each writev call */
99#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 100
101/* Hash table parameters */
102#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 103
104/* Command flags */
3fd78bcd 105#define REDIS_CMD_BULK 1 /* Bulk write command */
106#define REDIS_CMD_INLINE 2 /* Inline command */
107/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
108 this flag will return an error when the 'maxmemory' option is set in the
109 config file and the server is using more than maxmemory bytes of memory.
110 In short these commands are denied on low memory conditions. */
111#define REDIS_CMD_DENYOOM 4
ed9b544e 112
113/* Object types */
114#define REDIS_STRING 0
115#define REDIS_LIST 1
116#define REDIS_SET 2
1812e024 117#define REDIS_ZSET 3
118#define REDIS_HASH 4
f78fd11b 119
942a3961 120/* Objects encoding */
121#define REDIS_ENCODING_RAW 0 /* Raw representation */
122#define REDIS_ENCODING_INT 1 /* Encoded as integer */
123
f78fd11b 124/* Object types only used for dumping to disk */
bb32ede5 125#define REDIS_EXPIRETIME 253
ed9b544e 126#define REDIS_SELECTDB 254
127#define REDIS_EOF 255
128
f78fd11b 129/* Defines related to the dump file format. Storing 32 bit lengths for short
130 * keys requires a lot of space, so we check the most significant 2 bits of
131 * the first byte to interpret the length:
132 *
133 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
134 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
135 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 136 * 11|000000 this means: specially encoded object will follow. The six bits
137 * number specifies the kind of object that follows.
138 * See the REDIS_RDB_ENC_* defines.
f78fd11b 139 *
10c43610 140 * Lengths up to 63 are stored using a single byte, most DB keys, and many
141 * values, will fit inside. */
f78fd11b 142#define REDIS_RDB_6BITLEN 0
143#define REDIS_RDB_14BITLEN 1
144#define REDIS_RDB_32BITLEN 2
17be1a4a 145#define REDIS_RDB_ENCVAL 3
f78fd11b 146#define REDIS_RDB_LENERR UINT_MAX
147
a4d1ba9a 148/* When a length of a string object stored on disk has the first two bits
149 * set, the remaining six bits specify a special encoding for the object
150 * according to the following defines: */
151#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
152#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
153#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 154#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
a4d1ba9a 155
75680a3c 156/* Virtual memory object->where field. */
157#define REDIS_VM_MEMORY 0 /* The object is on memory */
158#define REDIS_VM_SWAPPED 1 /* The object is on disk */
159#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
160#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
161
06224fec 162/* Virtual memory static configuration stuff.
163 * Check vmFindContiguousPages() to know more about these magic numbers. */
164#define REDIS_VM_MAX_NEAR_PAGES 65536
165#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 166#define REDIS_VM_MAX_THREADS 32
06224fec 167
ed9b544e 168/* Client flags */
169#define REDIS_CLOSE 1 /* This client connection should be closed ASAP */
170#define REDIS_SLAVE 2 /* This client is a slave server */
171#define REDIS_MASTER 4 /* This client is a master server */
87eca727 172#define REDIS_MONITOR 8 /* This client is a slave monitor, see MONITOR */
6e469882 173#define REDIS_MULTI 16 /* This client is in a MULTI context */
4409877e 174#define REDIS_BLOCKED 32 /* The client is waiting in a blocking operation */
996cb5f7 175#define REDIS_IO_WAIT 64 /* The client is waiting for Virtual Memory I/O */
ed9b544e 176
40d224a9 177/* Slave replication state - slave side */
ed9b544e 178#define REDIS_REPL_NONE 0 /* No active replication */
179#define REDIS_REPL_CONNECT 1 /* Must connect to master */
180#define REDIS_REPL_CONNECTED 2 /* Connected to master */
181
40d224a9 182/* Slave replication state - from the point of view of master
183 * Note that in SEND_BULK and ONLINE state the slave receives new updates
184 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
185 * to start the next background saving in order to send updates to it. */
186#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
187#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
188#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
189#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
190
ed9b544e 191/* List related stuff */
192#define REDIS_HEAD 0
193#define REDIS_TAIL 1
194
195/* Sort operations */
196#define REDIS_SORT_GET 0
443c6409 197#define REDIS_SORT_ASC 1
198#define REDIS_SORT_DESC 2
ed9b544e 199#define REDIS_SORTKEY_MAX 1024
200
201/* Log levels */
202#define REDIS_DEBUG 0
f870935d 203#define REDIS_VERBOSE 1
204#define REDIS_NOTICE 2
205#define REDIS_WARNING 3
ed9b544e 206
207/* Anti-warning macro... */
208#define REDIS_NOTUSED(V) ((void) V)
209
6b47e12e 210#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
211#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 212
48f0308a 213/* Append only defines */
214#define APPENDFSYNC_NO 0
215#define APPENDFSYNC_ALWAYS 1
216#define APPENDFSYNC_EVERYSEC 2
217
dfc5e96c 218/* We can print the stacktrace, so our assert is defined this way: */
219#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e),exit(1)))
220static void _redisAssert(char *estr);
221
ed9b544e 222/*================================= Data types ============================== */
223
224/* A redis object, that is a type able to hold a string / list / set */
75680a3c 225
226/* The VM object structure */
227struct redisObjectVM {
3a66edc7 228 off_t page; /* the page at which the object is stored on disk */
229 off_t usedpages; /* number of pages used on disk */
230 time_t atime; /* Last access time */
75680a3c 231} vm; /* NOTE(review): this declarator also defines a file-scope variable named vm; confirm it is intended */
232
233/* The actual Redis Object */
ed9b544e 234typedef struct redisObject {
ed9b544e 235 void *ptr;
942a3961 236 unsigned char type; /* Object type, e.g. REDIS_STRING */
237 unsigned char encoding; /* Object encoding, e.g. REDIS_ENCODING_RAW */
d894161b 238 unsigned char storage; /* If this object is a key, where is the value?
239 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
240 unsigned char vtype; /* If this object is a key, and value is swapped out,
241 * this is the type of the swapped out object. */
ed9b544e 242 int refcount;
75680a3c 243 /* VM fields, these are only allocated if VM is active, otherwise the
244 * object allocation function will just allocate
245 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
246 * Redis without VM active will not have any overhead. */
247 struct redisObjectVM vm;
ed9b544e 248} robj;
249
dfc5e96c 250/* Macro used to initialize a Redis object allocated on the stack.
251 * Note that this macro is kept near the structure definition to make sure
252 * we'll update it when the structure is changed, to avoid bugs like
253 * bug #85 introduced exactly in this way. */
254#define initStaticStringObject(_var,_ptr) do { \
255 _var.refcount = 1; \
256 _var.type = REDIS_STRING; \
257 _var.encoding = REDIS_ENCODING_RAW; \
258 _var.ptr = _ptr; \
3a66edc7 259 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
dfc5e96c 260} while(0); /* NOTE(review): the trailing ';' makes the macro unsafe as the body of an unbraced if/else; storage is assigned only when server.vm_enabled is true */
261
3305306f 262typedef struct redisDb {
4409877e 263 dict *dict; /* The keyspace for this DB */
264 dict *expires; /* Timeout of keys with a timeout set */
265 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
3305306f 266 int id;
267} redisDb;
268
6e469882 269/* Client MULTI/EXEC state */
270typedef struct multiCmd {
271 robj **argv;
272 int argc;
273 struct redisCommand *cmd;
274} multiCmd;
275
276typedef struct multiState {
277 multiCmd *commands; /* Array of MULTI commands */
278 int count; /* Total number of MULTI commands */
279} multiState;
280
ed9b544e 281/* With multiplexing we need to take per-client state.
282 * Clients are taken in a linked list. */
283typedef struct redisClient {
284 int fd;
3305306f 285 redisDb *db;
ed9b544e 286 int dictid;
287 sds querybuf;
e8a74421 288 robj **argv, **mbargv;
289 int argc, mbargc;
40d224a9 290 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 291 int multibulk; /* multi bulk command format active */
ed9b544e 292 list *reply;
293 int sentlen;
294 time_t lastinteraction; /* time of the last interaction, used for timeout */
40d224a9 295 int flags; /* REDIS_CLOSE | REDIS_SLAVE | REDIS_MASTER | REDIS_MONITOR */
6e469882 296 /* REDIS_MULTI | REDIS_BLOCKED | REDIS_IO_WAIT */
40d224a9 297 int slaveseldb; /* slave selected db, if this client is a slave */
298 int authenticated; /* when requirepass is non-NULL */
299 int replstate; /* replication state if this is a slave */
300 int repldbfd; /* replication DB file descriptor */
6e469882 301 long repldboff; /* replication DB file offset */
40d224a9 302 off_t repldbsize; /* replication DB file size */
6e469882 303 multiState mstate; /* MULTI/EXEC state */
b177fd30 304 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
4409877e 305 * operation such as BLPOP. Otherwise NULL. */
b177fd30 306 int blockingkeysnum; /* Number of blocking keys */
4409877e 307 time_t blockingto; /* Blocking operation timeout. If UNIX current time
308 * is >= blockingto then the operation timed out. */
92f8e882 309 list *io_keys; /* Keys this client is waiting to be loaded from the
310 * swap file in order to continue. */
ed9b544e 311} redisClient;
312
313struct saveparam {
314 time_t seconds;
315 int changes;
316};
317
318/* Global server state structure */
319struct redisServer {
320 int port;
321 int fd;
3305306f 322 redisDb *db;
4409877e 323 dict *sharingpool; /* Pool used for object sharing */
10c43610 324 unsigned int sharingpoolsize;
ed9b544e 325 long long dirty; /* changes to DB from the last save */
326 list *clients;
87eca727 327 list *slaves, *monitors;
ed9b544e 328 char neterr[ANET_ERR_LEN];
329 aeEventLoop *el;
330 int cronloops; /* number of times the cron function run */
331 list *objfreelist; /* A list of freed objects to avoid malloc() */
332 time_t lastsave; /* Unix time of last successful save */
5fba9f71 333 size_t usedmemory; /* Used memory in megabytes */
ed9b544e 334 /* Fields used only for stats */
335 time_t stat_starttime; /* server start time */
336 long long stat_numcommands; /* number of processed commands */
337 long long stat_numconnections; /* number of connections received */
338 /* Configuration */
339 int verbosity;
340 int glueoutputbuf;
341 int maxidletime;
342 int dbnum;
343 int daemonize;
44b38ef4 344 int appendonly;
48f0308a 345 int appendfsync;
346 time_t lastfsync;
44b38ef4 347 int appendfd;
348 int appendseldb;
ed329fcf 349 char *pidfile;
9f3c422c 350 pid_t bgsavechildpid;
9d65a1bb 351 pid_t bgrewritechildpid;
352 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 353 struct saveparam *saveparams;
354 int saveparamslen;
355 char *logfile;
356 char *bindaddr;
357 char *dbfilename;
44b38ef4 358 char *appendfilename;
abcb223e 359 char *requirepass;
10c43610 360 int shareobjects;
121f70cf 361 int rdbcompression;
ed9b544e 362 /* Replication related */
363 int isslave;
d0ccebcf 364 char *masterauth;
ed9b544e 365 char *masterhost;
366 int masterport;
40d224a9 367 redisClient *master; /* client that is master for this slave */
ed9b544e 368 int replstate;
285add55 369 unsigned int maxclients;
4ef8de8a 370 unsigned long long maxmemory;
f86a74e9 371 unsigned int blockedclients;
ed9b544e 372 /* Sort parameters - qsort_r() is only available under BSD so we
373 * have to take this state global, in order to pass it to sortCompare() */
374 int sort_desc;
375 int sort_alpha;
376 int sort_bypattern;
75680a3c 377 /* Virtual memory configuration */
378 int vm_enabled;
379 off_t vm_page_size;
380 off_t vm_pages;
4ef8de8a 381 unsigned long long vm_max_memory;
75680a3c 382 /* Virtual memory state */
383 FILE *vm_fp;
384 int vm_fd;
385 off_t vm_next_page; /* Next probably empty page */
386 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 387 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 388 time_t unixtime; /* Unix time sampled every second. */
92f8e882 389 /* Virtual memory I/O threads stuff */
92f8e882 390 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 391 * puts the result of the operation in the io_processed list. While the
392 * job is being processed, it's put on the io_processing queue. */
393 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
394 list *io_processing; /* List of VM I/O jobs being processed */
395 list *io_processed; /* List of VM I/O jobs already processed */
92f8e882 396 list *io_clients; /* All the clients waiting for SWAP I/O operations */
996cb5f7 397 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 398 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
399 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
92f8e882 400 int io_active_threads; /* Number of running I/O threads */
401 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 402 /* Our main thread is blocked on the event loop, looking for sockets ready
403 * to be read or written, so when a threaded I/O operation is ready to be
404 * processed by the main thread, the I/O thread will use a unix pipe to
405 * awake the main thread. The following are the two pipe FDs. */
406 int io_ready_pipe_read;
407 int io_ready_pipe_write;
7d98e08c 408 /* Virtual memory stats */
409 unsigned long long vm_stats_used_pages;
410 unsigned long long vm_stats_swapped_objects;
411 unsigned long long vm_stats_swapouts;
412 unsigned long long vm_stats_swapins;
b9bc0eef 413 FILE *devnull;
ed9b544e 414};
415
416typedef void redisCommandProc(redisClient *c);
417struct redisCommand {
418 char *name;
419 redisCommandProc *proc;
420 int arity;
421 int flags;
422};
423
de96dbfe 424struct redisFunctionSym {
425 char *name;
56906eef 426 unsigned long pointer;
de96dbfe 427};
428
ed9b544e 429typedef struct _redisSortObject {
430 robj *obj;
431 union {
432 double score;
433 robj *cmpobj;
434 } u;
435} redisSortObject;
436
437typedef struct _redisSortOperation {
438 int type;
439 robj *pattern;
440} redisSortOperation;
441
6b47e12e 442/* ZSETs use a specialized version of Skiplists */
443
444typedef struct zskiplistNode {
445 struct zskiplistNode **forward;
e3870fab 446 struct zskiplistNode *backward;
6b47e12e 447 double score;
448 robj *obj;
449} zskiplistNode;
450
451typedef struct zskiplist {
e3870fab 452 struct zskiplistNode *header, *tail;
d13f767c 453 unsigned long length;
6b47e12e 454 int level;
455} zskiplist;
456
1812e024 457typedef struct zset {
458 dict *dict;
6b47e12e 459 zskiplist *zsl;
1812e024 460} zset;
461
6b47e12e 462/* Our shared "common" objects */
463
ed9b544e 464struct sharedObjectsStruct {
c937aa89 465 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 466 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 467 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
468 *outofrangeerr, *plus,
ed9b544e 469 *select0, *select1, *select2, *select3, *select4,
470 *select5, *select6, *select7, *select8, *select9;
471} shared;
472
a7866db6 473/* Global vars that are actually used as constants. The following double
474 * values are used for double on-disk serialization, and are initialized
475 * at runtime to avoid strange compiler optimizations. */
476
477static double R_Zero, R_PosInf, R_NegInf, R_Nan;
478
92f8e882 479/* VM threaded I/O request message */
b9bc0eef 480#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
481#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
482#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
996cb5f7 483typedef struct iojon {
484 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 485 redisDb *db;/* Redis database */
92f8e882 486 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 487 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 488 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
489 off_t page; /* Swap page where to read/write the object */
b9bc0eef 490 off_t pages; /* Swap pages needed to save the object. PREPARE_SWAP return val */
996cb5f7 491 int canceled; /* True if this command was canceled by blocking side of VM */
492 pthread_t thread; /* ID of the thread processing this entry */
493} iojob;
92f8e882 494
ed9b544e 495/*================================ Prototypes =============================== */
496
497static void freeStringObject(robj *o);
498static void freeListObject(robj *o);
499static void freeSetObject(robj *o);
500static void decrRefCount(void *o);
501static robj *createObject(int type, void *ptr);
502static void freeClient(redisClient *c);
f78fd11b 503static int rdbLoad(char *filename);
ed9b544e 504static void addReply(redisClient *c, robj *obj);
505static void addReplySds(redisClient *c, sds s);
506static void incrRefCount(robj *o);
f78fd11b 507static int rdbSaveBackground(char *filename);
ed9b544e 508static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 509static robj *dupStringObject(robj *o);
87eca727 510static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
44b38ef4 511static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 512static int syncWithMaster(void);
10c43610 513static robj *tryObjectSharing(robj *o);
942a3961 514static int tryObjectEncoding(robj *o);
9d65a1bb 515static robj *getDecodedObject(robj *o);
3305306f 516static int removeExpire(redisDb *db, robj *key);
517static int expireIfNeeded(redisDb *db, robj *key);
518static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 519static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 520static int deleteKey(redisDb *db, robj *key);
bb32ede5 521static time_t getExpire(redisDb *db, robj *key);
522static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 523static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 524static void freeMemoryIfNeeded(void);
de96dbfe 525static int processCommand(redisClient *c);
56906eef 526static void setupSigSegvAction(void);
a3b21203 527static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 528static void aofRemoveTempFile(pid_t childpid);
0ea663ea 529static size_t stringObjectLen(robj *o);
638e42ac 530static void processInputBuffer(redisClient *c);
6b47e12e 531static zskiplist *zslCreate(void);
fd8ccf44 532static void zslFree(zskiplist *zsl);
2b59cfdf 533static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 534static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 535static void initClientMultiState(redisClient *c);
536static void freeClientMultiState(redisClient *c);
537static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
4409877e 538static void unblockClient(redisClient *c);
539static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 540static void vmInit(void);
a35ddf12 541static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 542static robj *vmLoadObject(robj *key);
7e69548d 543static robj *vmPreviewObject(robj *key);
a69a0c9c 544static int vmSwapOneObjectBlocking(void);
545static int vmSwapOneObjectThreaded(void);
7e69548d 546static int vmCanSwapOut(void);
a5819310 547static int tryFreeOneObjectFromFreelist(void);
996cb5f7 548static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
549static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
550static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 551static void lockThreadedIO(void);
552static void unlockThreadedIO(void);
553static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
554static void freeIOJob(iojob *j);
555static void queueIOJob(iojob *j);
a5819310 556static int vmWriteObjectOnSwap(robj *o, off_t page);
557static robj *vmReadObjectFromSwap(off_t page, int type);
ed9b544e 558
abcb223e 559static void authCommand(redisClient *c);
ed9b544e 560static void pingCommand(redisClient *c);
561static void echoCommand(redisClient *c);
562static void setCommand(redisClient *c);
563static void setnxCommand(redisClient *c);
564static void getCommand(redisClient *c);
565static void delCommand(redisClient *c);
566static void existsCommand(redisClient *c);
567static void incrCommand(redisClient *c);
568static void decrCommand(redisClient *c);
569static void incrbyCommand(redisClient *c);
570static void decrbyCommand(redisClient *c);
571static void selectCommand(redisClient *c);
572static void randomkeyCommand(redisClient *c);
573static void keysCommand(redisClient *c);
574static void dbsizeCommand(redisClient *c);
575static void lastsaveCommand(redisClient *c);
576static void saveCommand(redisClient *c);
577static void bgsaveCommand(redisClient *c);
9d65a1bb 578static void bgrewriteaofCommand(redisClient *c);
ed9b544e 579static void shutdownCommand(redisClient *c);
580static void moveCommand(redisClient *c);
581static void renameCommand(redisClient *c);
582static void renamenxCommand(redisClient *c);
583static void lpushCommand(redisClient *c);
584static void rpushCommand(redisClient *c);
585static void lpopCommand(redisClient *c);
586static void rpopCommand(redisClient *c);
587static void llenCommand(redisClient *c);
588static void lindexCommand(redisClient *c);
589static void lrangeCommand(redisClient *c);
590static void ltrimCommand(redisClient *c);
591static void typeCommand(redisClient *c);
592static void lsetCommand(redisClient *c);
593static void saddCommand(redisClient *c);
594static void sremCommand(redisClient *c);
a4460ef4 595static void smoveCommand(redisClient *c);
ed9b544e 596static void sismemberCommand(redisClient *c);
597static void scardCommand(redisClient *c);
12fea928 598static void spopCommand(redisClient *c);
2abb95a9 599static void srandmemberCommand(redisClient *c);
ed9b544e 600static void sinterCommand(redisClient *c);
601static void sinterstoreCommand(redisClient *c);
40d224a9 602static void sunionCommand(redisClient *c);
603static void sunionstoreCommand(redisClient *c);
f4f56e1d 604static void sdiffCommand(redisClient *c);
605static void sdiffstoreCommand(redisClient *c);
ed9b544e 606static void syncCommand(redisClient *c);
607static void flushdbCommand(redisClient *c);
608static void flushallCommand(redisClient *c);
609static void sortCommand(redisClient *c);
610static void lremCommand(redisClient *c);
0f5f7e9a 611static void rpoplpushcommand(redisClient *c);
ed9b544e 612static void infoCommand(redisClient *c);
70003d28 613static void mgetCommand(redisClient *c);
87eca727 614static void monitorCommand(redisClient *c);
3305306f 615static void expireCommand(redisClient *c);
802e8373 616static void expireatCommand(redisClient *c);
f6b141c5 617static void getsetCommand(redisClient *c);
fd88489a 618static void ttlCommand(redisClient *c);
321b0e13 619static void slaveofCommand(redisClient *c);
7f957c92 620static void debugCommand(redisClient *c);
f6b141c5 621static void msetCommand(redisClient *c);
622static void msetnxCommand(redisClient *c);
fd8ccf44 623static void zaddCommand(redisClient *c);
7db723ad 624static void zincrbyCommand(redisClient *c);
cc812361 625static void zrangeCommand(redisClient *c);
50c55df5 626static void zrangebyscoreCommand(redisClient *c);
e3870fab 627static void zrevrangeCommand(redisClient *c);
3c41331e 628static void zcardCommand(redisClient *c);
1b7106e7 629static void zremCommand(redisClient *c);
6e333bbe 630static void zscoreCommand(redisClient *c);
1807985b 631static void zremrangebyscoreCommand(redisClient *c);
6e469882 632static void multiCommand(redisClient *c);
633static void execCommand(redisClient *c);
4409877e 634static void blpopCommand(redisClient *c);
635static void brpopCommand(redisClient *c);
f6b141c5 636
ed9b544e 637/*================================= Globals ================================= */
638
639/* Global vars */
640static struct redisServer server; /* server global state */
641static struct redisCommand cmdTable[] = {
642 {"get",getCommand,2,REDIS_CMD_INLINE},
3fd78bcd 643 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
644 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
5109cdff 645 {"del",delCommand,-2,REDIS_CMD_INLINE},
ed9b544e 646 {"exists",existsCommand,2,REDIS_CMD_INLINE},
3fd78bcd 647 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
648 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
70003d28 649 {"mget",mgetCommand,-2,REDIS_CMD_INLINE},
3fd78bcd 650 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
651 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 652 {"rpop",rpopCommand,2,REDIS_CMD_INLINE},
653 {"lpop",lpopCommand,2,REDIS_CMD_INLINE},
b177fd30 654 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE},
655 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE},
ed9b544e 656 {"llen",llenCommand,2,REDIS_CMD_INLINE},
657 {"lindex",lindexCommand,3,REDIS_CMD_INLINE},
3fd78bcd 658 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 659 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE},
660 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE},
661 {"lrem",lremCommand,4,REDIS_CMD_BULK},
0b13687c 662 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 663 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 664 {"srem",sremCommand,3,REDIS_CMD_BULK},
a4460ef4 665 {"smove",smoveCommand,4,REDIS_CMD_BULK},
ed9b544e 666 {"sismember",sismemberCommand,3,REDIS_CMD_BULK},
667 {"scard",scardCommand,2,REDIS_CMD_INLINE},
12fea928 668 {"spop",spopCommand,2,REDIS_CMD_INLINE},
2abb95a9 669 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE},
3fd78bcd 670 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
671 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
672 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
673 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
674 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
675 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 676 {"smembers",sinterCommand,2,REDIS_CMD_INLINE},
fd8ccf44 677 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
7db723ad 678 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
1b7106e7 679 {"zrem",zremCommand,3,REDIS_CMD_BULK},
1807985b 680 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE},
752da584 681 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE},
80181f78 682 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE},
752da584 683 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE},
3c41331e 684 {"zcard",zcardCommand,2,REDIS_CMD_INLINE},
6e333bbe 685 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 686 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
687 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
f6b141c5 688 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
689 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
690 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 691 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE},
692 {"select",selectCommand,2,REDIS_CMD_INLINE},
693 {"move",moveCommand,3,REDIS_CMD_INLINE},
694 {"rename",renameCommand,3,REDIS_CMD_INLINE},
695 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE},
321b0e13 696 {"expire",expireCommand,3,REDIS_CMD_INLINE},
802e8373 697 {"expireat",expireatCommand,3,REDIS_CMD_INLINE},
ed9b544e 698 {"keys",keysCommand,2,REDIS_CMD_INLINE},
699 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE},
abcb223e 700 {"auth",authCommand,2,REDIS_CMD_INLINE},
ed9b544e 701 {"ping",pingCommand,1,REDIS_CMD_INLINE},
702 {"echo",echoCommand,2,REDIS_CMD_BULK},
703 {"save",saveCommand,1,REDIS_CMD_INLINE},
704 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE},
9d65a1bb 705 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE},
ed9b544e 706 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE},
707 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE},
708 {"type",typeCommand,2,REDIS_CMD_INLINE},
6e469882 709 {"multi",multiCommand,1,REDIS_CMD_INLINE},
710 {"exec",execCommand,1,REDIS_CMD_INLINE},
ed9b544e 711 {"sync",syncCommand,1,REDIS_CMD_INLINE},
712 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE},
713 {"flushall",flushallCommand,1,REDIS_CMD_INLINE},
3fd78bcd 714 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 715 {"info",infoCommand,1,REDIS_CMD_INLINE},
87eca727 716 {"monitor",monitorCommand,1,REDIS_CMD_INLINE},
fd88489a 717 {"ttl",ttlCommand,2,REDIS_CMD_INLINE},
321b0e13 718 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE},
7f957c92 719 {"debug",debugCommand,-2,REDIS_CMD_INLINE},
ed9b544e 720 {NULL,NULL,0,0}
721};
bcfc686d 722
ed9b544e 723/*============================ Utility functions ============================ */
724
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Every read is bounded by the matching
 * length, so neither buffer needs to be NUL terminated.
 *
 * Supported syntax:
 *   *       any sequence of bytes, the empty one included
 *   ?       exactly one byte
 *   [...]   one byte out of a set: "a-z" is a range, a leading '^' negates
 *           the set and "\x" takes x literally. A set that is never closed
 *           by ']' extends up to the end of the pattern.
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero bytes are compared after tolower(), with the
 * exception of escaped bytes inside a set that are always compared verbatim.
 *
 * NOTE(review): tolower() is fed plain char values, that may be negative
 * for bytes >= 0x80 where char is signed -- confirm this is acceptable for
 * the targets we build on.
 *
 * Returns 1 if the string matches the pattern, 0 otherwise. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* A run of stars is the same as a single one. Look ahead only
             * while there is pattern left to look at. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star: match whatever is left */
            /* Try to match the rest of the pattern at every offset */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte of the string */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back one byte so that the
                     * increment shared by all the cases at the end of the
                     * switch leaves us exactly at the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A backslash that is the last byte of the pattern stands for
             * itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: what is left of the pattern can only match
             * if it is made of stars. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
847
56906eef 848static void redisLog(int level, const char *fmt, ...) {
ed9b544e 849 va_list ap;
850 FILE *fp;
851
852 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
853 if (!fp) return;
854
855 va_start(ap, fmt);
856 if (level >= server.verbosity) {
857 char *c = ".-*";
1904ecc1 858 char buf[64];
859 time_t now;
860
861 now = time(NULL);
6c9385e0 862 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1904ecc1 863 fprintf(fp,"%s %c ",buf,c[level]);
ed9b544e 864 vfprintf(fp, fmt, ap);
865 fprintf(fp,"\n");
866 fflush(fp);
867 }
868 va_end(ap);
869
870 if (server.logfile) fclose(fp);
871}
872
873/*====================== Hash table type implementation ==================== */
874
875/* This is a hash table type that uses the SDS dynamic strings library as
876 * keys and redis objects as values (objects can hold SDS strings,
877 * lists, sets). */
878
/* Dict destructor for values that are plain heap blocks: the block is
 * handed to zfree(). 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
884
4409877e 885static void dictListDestructor(void *privdata, void *val)
886{
887 DICT_NOTUSED(privdata);
888 listRelease((list*)val);
889}
890
ed9b544e 891static int sdsDictKeyCompare(void *privdata, const void *key1,
892 const void *key2)
893{
894 int l1,l2;
895 DICT_NOTUSED(privdata);
896
897 l1 = sdslen((sds)key1);
898 l2 = sdslen((sds)key2);
899 if (l1 != l2) return 0;
900 return memcmp(key1, key2, l1) == 0;
901}
902
903static void dictRedisObjectDestructor(void *privdata, void *val)
904{
905 DICT_NOTUSED(privdata);
906
a35ddf12 907 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
ed9b544e 908 decrRefCount(val);
909}
910
942a3961 911static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 912 const void *key2)
913{
914 const robj *o1 = key1, *o2 = key2;
915 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
916}
917
942a3961 918static unsigned int dictObjHash(const void *key) {
ed9b544e 919 const robj *o = key;
920 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
921}
922
942a3961 923static int dictEncObjKeyCompare(void *privdata, const void *key1,
924 const void *key2)
925{
9d65a1bb 926 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
927 int cmp;
942a3961 928
9d65a1bb 929 o1 = getDecodedObject(o1);
930 o2 = getDecodedObject(o2);
931 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
932 decrRefCount(o1);
933 decrRefCount(o2);
934 return cmp;
942a3961 935}
936
937static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 938 robj *o = (robj*) key;
942a3961 939
9d65a1bb 940 o = getDecodedObject(o);
941 unsigned int hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
942 decrRefCount(o);
943 return hash;
942a3961 944}
945
/* Dict type with redis objects as keys and no managed values: keys are
 * hashed and compared after being decoded (dictEncObjHash and
 * dictEncObjKeyCompare) and are released with decrRefCount() by the key
 * destructor. Values are never freed by the dict. */
ed9b544e 946static dictType setDictType = {
942a3961 947 dictEncObjHash, /* hash function */
ed9b544e 948 NULL, /* key dup */
949 NULL, /* val dup */
942a3961 950 dictEncObjKeyCompare, /* key compare */
ed9b544e 951 dictRedisObjectDestructor, /* key destructor */
952 NULL /* val destructor */
953};
954
/* Dict type with redis objects as keys, handled like in setDictType, and
 * heap allocated values that the dict releases with zfree() through
 * dictVanillaFree. */
1812e024 955static dictType zsetDictType = {
956 dictEncObjHash, /* hash function */
957 NULL, /* key dup */
958 NULL, /* val dup */
959 dictEncObjKeyCompare, /* key compare */
960 dictRedisObjectDestructor, /* key destructor */
da0a1620 961 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 962};
963
/* Dict type with redis objects both as keys and as values. Keys are hashed
 * and compared directly through their ptr field as SDS strings (dictObjHash
 * and dictObjKeyCompare), without the decoding step. Keys and values are
 * both released with decrRefCount(). */
ed9b544e 964static dictType hashDictType = {
942a3961 965 dictObjHash, /* hash function */
ed9b544e 966 NULL, /* key dup */
967 NULL, /* val dup */
942a3961 968 dictObjKeyCompare, /* key compare */
ed9b544e 969 dictRedisObjectDestructor, /* key destructor */
970 dictRedisObjectDestructor /* val destructor */
971};
972
4409877e 973/* Keylist hash table type has unencoded redis objects as keys and
974 * lists as values. It's used for blocking operations (BLPOP).
 * Keys are released with decrRefCount(), values with listRelease(). */
975static dictType keylistDictType = {
976 dictObjHash, /* hash function */
977 NULL, /* key dup */
978 NULL, /* val dup */
979 dictObjKeyCompare, /* key compare */
980 dictRedisObjectDestructor, /* key destructor */
981 dictListDestructor /* val destructor */
982};
983
ed9b544e 984/* ========================= Random utility functions ======================= */
985
986/* Redis generally does not try to recover from out of memory conditions
987 * when allocating objects or strings, it is not clear if it will be possible
988 * to report this condition to the client since the networking layer itself
989 * is based on heap allocation for send buffers, so we simply abort.
990 * At least the code will be simpler to read... */
991static void oom(const char *msg) {
71c54b21 992 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 993 sleep(1);
994 abort();
995}
996
997/* ====================== Redis server networking stuff ===================== */
56906eef 998static void closeTimedoutClients(void) {
ed9b544e 999 redisClient *c;
ed9b544e 1000 listNode *ln;
1001 time_t now = time(NULL);
1002
6208b3a7 1003 listRewind(server.clients);
1004 while ((ln = listYield(server.clients)) != NULL) {
ed9b544e 1005 c = listNodeValue(ln);
f86a74e9 1006 if (server.maxidletime &&
1007 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1008 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 1009 (now - c->lastinteraction > server.maxidletime))
1010 {
f870935d 1011 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1012 freeClient(c);
f86a74e9 1013 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1014 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1015 addReply(c,shared.nullmultibulk);
f86a74e9 1016 unblockClient(c);
1017 }
ed9b544e 1018 }
1019 }
ed9b544e 1020}
1021
12fea928 1022static int htNeedsResize(dict *dict) {
1023 long long size, used;
1024
1025 size = dictSlots(dict);
1026 used = dictSize(dict);
1027 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1028 (used*100/size < REDIS_HT_MINFILL));
1029}
1030
0bc03378 1031/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1032 * we resize the hash table to save memory */
56906eef 1033static void tryResizeHashTables(void) {
0bc03378 1034 int j;
1035
1036 for (j = 0; j < server.dbnum; j++) {
12fea928 1037 if (htNeedsResize(server.db[j].dict)) {
f870935d 1038 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1039 dictResize(server.db[j].dict);
f870935d 1040 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1041 }
12fea928 1042 if (htNeedsResize(server.db[j].expires))
1043 dictResize(server.db[j].expires);
0bc03378 1044 }
1045}
1046
9d65a1bb 1047/* A background saving child (BGSAVE) terminated its work. Handle this. */
1048void backgroundSaveDoneHandler(int statloc) {
1049 int exitcode = WEXITSTATUS(statloc);
1050 int bysignal = WIFSIGNALED(statloc);
1051
1052 if (!bysignal && exitcode == 0) {
1053 redisLog(REDIS_NOTICE,
1054 "Background saving terminated with success");
1055 server.dirty = 0;
1056 server.lastsave = time(NULL);
1057 } else if (!bysignal && exitcode != 0) {
1058 redisLog(REDIS_WARNING, "Background saving error");
1059 } else {
1060 redisLog(REDIS_WARNING,
1061 "Background saving terminated by signal");
1062 rdbRemoveTempFile(server.bgsavechildpid);
1063 }
1064 server.bgsavechildpid = -1;
1065 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1066 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1067 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1068}
1069
1070/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1071 * Handle this. */
1072void backgroundRewriteDoneHandler(int statloc) {
1073 int exitcode = WEXITSTATUS(statloc);
1074 int bysignal = WIFSIGNALED(statloc);
1075
1076 if (!bysignal && exitcode == 0) {
1077 int fd;
1078 char tmpfile[256];
1079
1080 redisLog(REDIS_NOTICE,
1081 "Background append only file rewriting terminated with success");
1082 /* Now it's time to flush the differences accumulated by the parent */
1083 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1084 fd = open(tmpfile,O_WRONLY|O_APPEND);
1085 if (fd == -1) {
1086 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1087 goto cleanup;
1088 }
1089 /* Flush our data... */
1090 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1091 (signed) sdslen(server.bgrewritebuf)) {
1092 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1093 close(fd);
1094 goto cleanup;
1095 }
b32627cd 1096 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1097 /* Now our work is to rename the temp file into the stable file. And
1098 * switch the file descriptor used by the server for append only. */
1099 if (rename(tmpfile,server.appendfilename) == -1) {
1100 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1101 close(fd);
1102 goto cleanup;
1103 }
1104 /* Mission completed... almost */
1105 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1106 if (server.appendfd != -1) {
1107 /* If append only is actually enabled... */
1108 close(server.appendfd);
1109 server.appendfd = fd;
1110 fsync(fd);
85a83172 1111 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1112 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1113 } else {
1114 /* If append only is disabled we just generate a dump in this
1115 * format. Why not? */
1116 close(fd);
1117 }
1118 } else if (!bysignal && exitcode != 0) {
1119 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1120 } else {
1121 redisLog(REDIS_WARNING,
1122 "Background append only file rewriting terminated by signal");
1123 }
1124cleanup:
1125 sdsfree(server.bgrewritebuf);
1126 server.bgrewritebuf = sdsempty();
1127 aofRemoveTempFile(server.bgrewritechildpid);
1128 server.bgrewritechildpid = -1;
1129}
1130
56906eef 1131static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1132 int j, loops = server.cronloops++;
ed9b544e 1133 REDIS_NOTUSED(eventLoop);
1134 REDIS_NOTUSED(id);
1135 REDIS_NOTUSED(clientData);
1136
3a66edc7 1137 /* We take a cached value of the unix time in the global state because
1138 * with virtual memory and aging there is to store the current time
1139 * in objects at every object access, and accuracy is not needed.
1140 * To access a global var is faster than calling time(NULL) */
1141 server.unixtime = time(NULL);
1142
ed9b544e 1143 /* Update the global state with the amount of used memory */
1144 server.usedmemory = zmalloc_used_memory();
1145
0bc03378 1146 /* Show some info about non-empty databases */
ed9b544e 1147 for (j = 0; j < server.dbnum; j++) {
dec423d9 1148 long long size, used, vkeys;
94754ccc 1149
3305306f 1150 size = dictSlots(server.db[j].dict);
1151 used = dictSize(server.db[j].dict);
94754ccc 1152 vkeys = dictSize(server.db[j].expires);
c3cb078d 1153 if (!(loops % 5) && (used || vkeys)) {
f870935d 1154 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1155 /* dictPrintStats(server.dict); */
ed9b544e 1156 }
ed9b544e 1157 }
1158
0bc03378 1159 /* We don't want to resize the hash tables while a bacground saving
1160 * is in progress: the saving child is created using fork() that is
1161 * implemented with a copy-on-write semantic in most modern systems, so
1162 * if we resize the HT while there is the saving child at work actually
1163 * a lot of memory movements in the parent will cause a lot of pages
1164 * copied. */
9d65a1bb 1165 if (server.bgsavechildpid == -1) tryResizeHashTables();
0bc03378 1166
ed9b544e 1167 /* Show information about connected clients */
1168 if (!(loops % 5)) {
f870935d 1169 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1170 listLength(server.clients)-listLength(server.slaves),
1171 listLength(server.slaves),
10c43610 1172 server.usedmemory,
3305306f 1173 dictSize(server.sharingpool));
ed9b544e 1174 }
1175
1176 /* Close connections of timedout clients */
f86a74e9 1177 if ((server.maxidletime && !(loops % 10)) || server.blockedclients)
ed9b544e 1178 closeTimedoutClients();
1179
9d65a1bb 1180 /* Check if a background saving or AOF rewrite in progress terminated */
1181 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1182 int statloc;
9d65a1bb 1183 pid_t pid;
1184
1185 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1186 if (pid == server.bgsavechildpid) {
1187 backgroundSaveDoneHandler(statloc);
ed9b544e 1188 } else {
9d65a1bb 1189 backgroundRewriteDoneHandler(statloc);
ed9b544e 1190 }
ed9b544e 1191 }
1192 } else {
1193 /* If there is not a background saving in progress check if
1194 * we have to save now */
1195 time_t now = time(NULL);
1196 for (j = 0; j < server.saveparamslen; j++) {
1197 struct saveparam *sp = server.saveparams+j;
1198
1199 if (server.dirty >= sp->changes &&
1200 now-server.lastsave > sp->seconds) {
1201 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1202 sp->changes, sp->seconds);
f78fd11b 1203 rdbSaveBackground(server.dbfilename);
ed9b544e 1204 break;
1205 }
1206 }
1207 }
94754ccc 1208
f2324293 1209 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1210 * will use few CPU cycles if there are few expiring keys, otherwise
1211 * it will get more aggressive to avoid that too much memory is used by
1212 * keys that can be removed from the keyspace. */
94754ccc 1213 for (j = 0; j < server.dbnum; j++) {
f2324293 1214 int expired;
94754ccc 1215 redisDb *db = server.db+j;
94754ccc 1216
f2324293 1217 /* Continue to expire if at the end of the cycle more than 25%
1218 * of the keys were expired. */
1219 do {
4ef8de8a 1220 long num = dictSize(db->expires);
94754ccc 1221 time_t now = time(NULL);
1222
f2324293 1223 expired = 0;
94754ccc 1224 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1225 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1226 while (num--) {
1227 dictEntry *de;
1228 time_t t;
1229
1230 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1231 t = (time_t) dictGetEntryVal(de);
1232 if (now > t) {
1233 deleteKey(db,dictGetEntryKey(de));
f2324293 1234 expired++;
94754ccc 1235 }
1236 }
f2324293 1237 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1238 }
1239
4ef8de8a 1240 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1241 * is enbled. Try to free objects from the free list first. */
7e69548d 1242 if (vmCanSwapOut()) {
1243 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1244 server.vm_max_memory)
1245 {
a5819310 1246 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1247 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
1248 if ((loops % 30) == 0 && zmalloc_used_memory() >
1249 (server.vm_max_memory+server.vm_max_memory/10)) {
1250 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1251 }
7e69548d 1252 }
a5819310 1253 /* Note that we freed just one object, because anyway when
1254 * the I/O thread in charge to swap this object out will
1255 * do its work, the handler of completed jobs will try to swap
1256 * more objects if we are out of memory. */
1257 break;
4ef8de8a 1258 }
1259 }
1260
ed9b544e 1261 /* Check if we should connect to a MASTER */
1262 if (server.replstate == REDIS_REPL_CONNECT) {
1263 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1264 if (syncWithMaster() == REDIS_OK) {
1265 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1266 }
1267 }
1268 return 1000;
1269}
1270
1271static void createSharedObjects(void) {
1272 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1273 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1274 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1275 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1276 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1277 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1278 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1279 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1280 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1281 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1282 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1283 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1284 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1285 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1286 "-ERR no such key\r\n"));
ed9b544e 1287 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1288 "-ERR syntax error\r\n"));
c937aa89 1289 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1290 "-ERR source and destination objects are the same\r\n"));
1291 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1292 "-ERR index out of range\r\n"));
ed9b544e 1293 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1294 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1295 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1296 shared.select0 = createStringObject("select 0\r\n",10);
1297 shared.select1 = createStringObject("select 1\r\n",10);
1298 shared.select2 = createStringObject("select 2\r\n",10);
1299 shared.select3 = createStringObject("select 3\r\n",10);
1300 shared.select4 = createStringObject("select 4\r\n",10);
1301 shared.select5 = createStringObject("select 5\r\n",10);
1302 shared.select6 = createStringObject("select 6\r\n",10);
1303 shared.select7 = createStringObject("select 7\r\n",10);
1304 shared.select8 = createStringObject("select 8\r\n",10);
1305 shared.select9 = createStringObject("select 9\r\n",10);
1306}
1307
1308static void appendServerSaveParams(time_t seconds, int changes) {
1309 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1310 server.saveparams[server.saveparamslen].seconds = seconds;
1311 server.saveparams[server.saveparamslen].changes = changes;
1312 server.saveparamslen++;
1313}
1314
bcfc686d 1315static void resetServerSaveParams() {
ed9b544e 1316 zfree(server.saveparams);
1317 server.saveparams = NULL;
1318 server.saveparamslen = 0;
1319}
1320
1321static void initServerConfig() {
1322 server.dbnum = REDIS_DEFAULT_DBNUM;
1323 server.port = REDIS_SERVERPORT;
f870935d 1324 server.verbosity = REDIS_VERBOSE;
ed9b544e 1325 server.maxidletime = REDIS_MAXIDLETIME;
1326 server.saveparams = NULL;
1327 server.logfile = NULL; /* NULL = log on standard output */
1328 server.bindaddr = NULL;
1329 server.glueoutputbuf = 1;
1330 server.daemonize = 0;
44b38ef4 1331 server.appendonly = 0;
4e141d5a 1332 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1333 server.lastfsync = time(NULL);
44b38ef4 1334 server.appendfd = -1;
1335 server.appendseldb = -1; /* Make sure the first time will not match */
ed329fcf 1336 server.pidfile = "/var/run/redis.pid";
ed9b544e 1337 server.dbfilename = "dump.rdb";
9d65a1bb 1338 server.appendfilename = "appendonly.aof";
abcb223e 1339 server.requirepass = NULL;
10c43610 1340 server.shareobjects = 0;
b0553789 1341 server.rdbcompression = 1;
21aecf4b 1342 server.sharingpoolsize = 1024;
285add55 1343 server.maxclients = 0;
f86a74e9 1344 server.blockedclients = 0;
3fd78bcd 1345 server.maxmemory = 0;
75680a3c 1346 server.vm_enabled = 0;
1347 server.vm_page_size = 256; /* 256 bytes per page */
1348 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1349 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1350 server.vm_max_threads = 4;
75680a3c 1351
bcfc686d 1352 resetServerSaveParams();
ed9b544e 1353
1354 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1355 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1356 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1357 /* Replication related */
1358 server.isslave = 0;
d0ccebcf 1359 server.masterauth = NULL;
ed9b544e 1360 server.masterhost = NULL;
1361 server.masterport = 6379;
1362 server.master = NULL;
1363 server.replstate = REDIS_REPL_NONE;
a7866db6 1364
1365 /* Double constants initialization */
1366 R_Zero = 0.0;
1367 R_PosInf = 1.0/R_Zero;
1368 R_NegInf = -1.0/R_Zero;
1369 R_Nan = R_Zero/R_Zero;
ed9b544e 1370}
1371
1372static void initServer() {
1373 int j;
1374
1375 signal(SIGHUP, SIG_IGN);
1376 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1377 setupSigSegvAction();
ed9b544e 1378
b9bc0eef 1379 server.devnull = fopen("/dev/null","w");
1380 if (server.devnull == NULL) {
1381 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1382 exit(1);
1383 }
ed9b544e 1384 server.clients = listCreate();
1385 server.slaves = listCreate();
87eca727 1386 server.monitors = listCreate();
ed9b544e 1387 server.objfreelist = listCreate();
1388 createSharedObjects();
1389 server.el = aeCreateEventLoop();
3305306f 1390 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1391 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1392 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1393 if (server.fd == -1) {
1394 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1395 exit(1);
1396 }
3305306f 1397 for (j = 0; j < server.dbnum; j++) {
1398 server.db[j].dict = dictCreate(&hashDictType,NULL);
1399 server.db[j].expires = dictCreate(&setDictType,NULL);
4409877e 1400 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
3305306f 1401 server.db[j].id = j;
1402 }
ed9b544e 1403 server.cronloops = 0;
9f3c422c 1404 server.bgsavechildpid = -1;
9d65a1bb 1405 server.bgrewritechildpid = -1;
1406 server.bgrewritebuf = sdsempty();
ed9b544e 1407 server.lastsave = time(NULL);
1408 server.dirty = 0;
1409 server.usedmemory = 0;
1410 server.stat_numcommands = 0;
1411 server.stat_numconnections = 0;
1412 server.stat_starttime = time(NULL);
3a66edc7 1413 server.unixtime = time(NULL);
d8f8b666 1414 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1415 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1416 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1417
1418 if (server.appendonly) {
71eba477 1419 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1420 if (server.appendfd == -1) {
1421 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1422 strerror(errno));
1423 exit(1);
1424 }
1425 }
75680a3c 1426
1427 if (server.vm_enabled) vmInit();
ed9b544e 1428}
1429
1430/* Empty the whole database */
ca37e9cd 1431static long long emptyDb() {
ed9b544e 1432 int j;
ca37e9cd 1433 long long removed = 0;
ed9b544e 1434
3305306f 1435 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1436 removed += dictSize(server.db[j].dict);
3305306f 1437 dictEmpty(server.db[j].dict);
1438 dictEmpty(server.db[j].expires);
1439 }
ca37e9cd 1440 return removed;
ed9b544e 1441}
1442
/* Convert a "yes" / "no" string, compared ignoring the case, into a
 * boolean. Returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1448
ed9b544e 1449/* I agree, this is a very rudimental way to load a configuration...
1450 will improve later if the config gets more complex */
1451static void loadServerConfig(char *filename) {
c9a111ac 1452 FILE *fp;
ed9b544e 1453 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1454 int linenum = 0;
1455 sds line = NULL;
c9a111ac 1456
1457 if (filename[0] == '-' && filename[1] == '\0')
1458 fp = stdin;
1459 else {
1460 if ((fp = fopen(filename,"r")) == NULL) {
1461 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1462 exit(1);
1463 }
ed9b544e 1464 }
c9a111ac 1465
ed9b544e 1466 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1467 sds *argv;
1468 int argc, j;
1469
1470 linenum++;
1471 line = sdsnew(buf);
1472 line = sdstrim(line," \t\r\n");
1473
1474 /* Skip comments and blank lines*/
1475 if (line[0] == '#' || line[0] == '\0') {
1476 sdsfree(line);
1477 continue;
1478 }
1479
1480 /* Split into arguments */
1481 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1482 sdstolower(argv[0]);
1483
1484 /* Execute config directives */
bb0b03a3 1485 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1486 server.maxidletime = atoi(argv[1]);
0150db36 1487 if (server.maxidletime < 0) {
ed9b544e 1488 err = "Invalid timeout value"; goto loaderr;
1489 }
bb0b03a3 1490 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1491 server.port = atoi(argv[1]);
1492 if (server.port < 1 || server.port > 65535) {
1493 err = "Invalid port"; goto loaderr;
1494 }
bb0b03a3 1495 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1496 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1497 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1498 int seconds = atoi(argv[1]);
1499 int changes = atoi(argv[2]);
1500 if (seconds < 1 || changes < 0) {
1501 err = "Invalid save parameters"; goto loaderr;
1502 }
1503 appendServerSaveParams(seconds,changes);
bb0b03a3 1504 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1505 if (chdir(argv[1]) == -1) {
1506 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1507 argv[1], strerror(errno));
1508 exit(1);
1509 }
bb0b03a3 1510 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1511 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1512 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1513 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1514 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1515 else {
1516 err = "Invalid log level. Must be one of debug, notice, warning";
1517 goto loaderr;
1518 }
bb0b03a3 1519 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1520 FILE *logfp;
ed9b544e 1521
1522 server.logfile = zstrdup(argv[1]);
bb0b03a3 1523 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1524 zfree(server.logfile);
1525 server.logfile = NULL;
1526 }
1527 if (server.logfile) {
1528 /* Test if we are able to open the file. The server will not
1529 * be able to abort just for this problem later... */
c9a111ac 1530 logfp = fopen(server.logfile,"a");
1531 if (logfp == NULL) {
ed9b544e 1532 err = sdscatprintf(sdsempty(),
1533 "Can't open the log file: %s", strerror(errno));
1534 goto loaderr;
1535 }
c9a111ac 1536 fclose(logfp);
ed9b544e 1537 }
bb0b03a3 1538 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1539 server.dbnum = atoi(argv[1]);
1540 if (server.dbnum < 1) {
1541 err = "Invalid number of databases"; goto loaderr;
1542 }
285add55 1543 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1544 server.maxclients = atoi(argv[1]);
3fd78bcd 1545 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1546 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1547 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1548 server.masterhost = sdsnew(argv[1]);
1549 server.masterport = atoi(argv[2]);
1550 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1551 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1552 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1553 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1554 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1555 err = "argument must be 'yes' or 'no'"; goto loaderr;
1556 }
bb0b03a3 1557 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1558 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1559 err = "argument must be 'yes' or 'no'"; goto loaderr;
1560 }
121f70cf 1561 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1562 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1563 err = "argument must be 'yes' or 'no'"; goto loaderr;
1564 }
e52c65b9 1565 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1566 server.sharingpoolsize = atoi(argv[1]);
1567 if (server.sharingpoolsize < 1) {
1568 err = "invalid object sharing pool size"; goto loaderr;
1569 }
bb0b03a3 1570 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1571 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1572 err = "argument must be 'yes' or 'no'"; goto loaderr;
1573 }
44b38ef4 1574 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1575 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1576 err = "argument must be 'yes' or 'no'"; goto loaderr;
1577 }
48f0308a 1578 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1579 if (!strcasecmp(argv[1],"no")) {
48f0308a 1580 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1581 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1582 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1583 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1584 server.appendfsync = APPENDFSYNC_EVERYSEC;
1585 } else {
1586 err = "argument must be 'no', 'always' or 'everysec'";
1587 goto loaderr;
1588 }
bb0b03a3 1589 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
abcb223e 1590 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1591 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
ed329fcf 1592 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1593 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
b8b553c8 1594 server.dbfilename = zstrdup(argv[1]);
75680a3c 1595 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1596 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1597 err = "argument must be 'yes' or 'no'"; goto loaderr;
1598 }
4ef8de8a 1599 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1600 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1601 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1602 server.vm_page_size = strtoll(argv[1], NULL, 10);
1603 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1604 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1605 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1606 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1607 } else {
1608 err = "Bad directive or wrong number of arguments"; goto loaderr;
1609 }
1610 for (j = 0; j < argc; j++)
1611 sdsfree(argv[j]);
1612 zfree(argv);
1613 sdsfree(line);
1614 }
c9a111ac 1615 if (fp != stdin) fclose(fp);
ed9b544e 1616 return;
1617
1618loaderr:
1619 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1620 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1621 fprintf(stderr, ">>> '%s'\n", line);
1622 fprintf(stderr, "%s\n", err);
1623 exit(1);
1624}
1625
1626static void freeClientArgv(redisClient *c) {
1627 int j;
1628
1629 for (j = 0; j < c->argc; j++)
1630 decrRefCount(c->argv[j]);
e8a74421 1631 for (j = 0; j < c->mbargc; j++)
1632 decrRefCount(c->mbargv[j]);
ed9b544e 1633 c->argc = 0;
e8a74421 1634 c->mbargc = 0;
ed9b544e 1635}
1636
1637static void freeClient(redisClient *c) {
1638 listNode *ln;
1639
4409877e 1640 /* Note that if the client we are freeing is blocked into a blocking
1641 * call, we have to set querybuf to NULL *before* to call unblockClient()
1642 * to avoid processInputBuffer() will get called. Also it is important
1643 * to remove the file events after this, because this call adds
1644 * the READABLE event. */
1645 sdsfree(c->querybuf);
1646 c->querybuf = NULL;
1647 if (c->flags & REDIS_BLOCKED)
1648 unblockClient(c);
1649
ed9b544e 1650 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1651 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1652 listRelease(c->reply);
1653 freeClientArgv(c);
1654 close(c->fd);
92f8e882 1655 /* Remove from the list of clients */
ed9b544e 1656 ln = listSearchKey(server.clients,c);
dfc5e96c 1657 redisAssert(ln != NULL);
ed9b544e 1658 listDelNode(server.clients,ln);
92f8e882 1659 /* Remove from the list of clients waiting for VM operations */
1660 if (server.vm_enabled && listLength(c->io_keys)) {
1661 ln = listSearchKey(server.io_clients,c);
1662 if (ln) listDelNode(server.io_clients,ln);
1663 listRelease(c->io_keys);
1664 }
b3e3d0d7 1665 listRelease(c->io_keys);
92f8e882 1666 /* Other cleanup */
ed9b544e 1667 if (c->flags & REDIS_SLAVE) {
6208b3a7 1668 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1669 close(c->repldbfd);
87eca727 1670 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1671 ln = listSearchKey(l,c);
dfc5e96c 1672 redisAssert(ln != NULL);
87eca727 1673 listDelNode(l,ln);
ed9b544e 1674 }
1675 if (c->flags & REDIS_MASTER) {
1676 server.master = NULL;
1677 server.replstate = REDIS_REPL_CONNECT;
1678 }
93ea3759 1679 zfree(c->argv);
e8a74421 1680 zfree(c->mbargv);
6e469882 1681 freeClientMultiState(c);
ed9b544e 1682 zfree(c);
1683}
1684
cc30e368 1685#define GLUEREPLY_UP_TO (1024)
ed9b544e 1686static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1687 int copylen = 0;
1688 char buf[GLUEREPLY_UP_TO];
6208b3a7 1689 listNode *ln;
ed9b544e 1690 robj *o;
1691
6208b3a7 1692 listRewind(c->reply);
1693 while((ln = listYield(c->reply))) {
c28b42ac 1694 int objlen;
1695
ed9b544e 1696 o = ln->value;
c28b42ac 1697 objlen = sdslen(o->ptr);
1698 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1699 memcpy(buf+copylen,o->ptr,objlen);
1700 copylen += objlen;
ed9b544e 1701 listDelNode(c->reply,ln);
c28b42ac 1702 } else {
1703 if (copylen == 0) return;
1704 break;
ed9b544e 1705 }
ed9b544e 1706 }
c28b42ac 1707 /* Now the output buffer is empty, add the new single element */
1708 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1709 listAddNodeHead(c->reply,o);
ed9b544e 1710}
1711
1712static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1713 redisClient *c = privdata;
1714 int nwritten = 0, totwritten = 0, objlen;
1715 robj *o;
1716 REDIS_NOTUSED(el);
1717 REDIS_NOTUSED(mask);
1718
2895e862 1719 /* Use writev() if we have enough buffers to send */
7ea870c0 1720 if (!server.glueoutputbuf &&
1721 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1722 !(c->flags & REDIS_MASTER))
2895e862 1723 {
1724 sendReplyToClientWritev(el, fd, privdata, mask);
1725 return;
1726 }
2895e862 1727
ed9b544e 1728 while(listLength(c->reply)) {
c28b42ac 1729 if (server.glueoutputbuf && listLength(c->reply) > 1)
1730 glueReplyBuffersIfNeeded(c);
1731
ed9b544e 1732 o = listNodeValue(listFirst(c->reply));
1733 objlen = sdslen(o->ptr);
1734
1735 if (objlen == 0) {
1736 listDelNode(c->reply,listFirst(c->reply));
1737 continue;
1738 }
1739
1740 if (c->flags & REDIS_MASTER) {
6f376729 1741 /* Don't reply to a master */
ed9b544e 1742 nwritten = objlen - c->sentlen;
1743 } else {
a4d1ba9a 1744 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1745 if (nwritten <= 0) break;
1746 }
1747 c->sentlen += nwritten;
1748 totwritten += nwritten;
1749 /* If we fully sent the object on head go to the next one */
1750 if (c->sentlen == objlen) {
1751 listDelNode(c->reply,listFirst(c->reply));
1752 c->sentlen = 0;
1753 }
6f376729 1754 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1755 * bytes, in a single threaded server it's a good idea to serve
6f376729 1756 * other clients as well, even if a very large request comes from
1757 * super fast link that is always able to accept data (in real world
12f9d551 1758 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1759 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1760 }
1761 if (nwritten == -1) {
1762 if (errno == EAGAIN) {
1763 nwritten = 0;
1764 } else {
f870935d 1765 redisLog(REDIS_VERBOSE,
ed9b544e 1766 "Error writing to client: %s", strerror(errno));
1767 freeClient(c);
1768 return;
1769 }
1770 }
1771 if (totwritten > 0) c->lastinteraction = time(NULL);
1772 if (listLength(c->reply) == 0) {
1773 c->sentlen = 0;
1774 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1775 }
1776}
1777
2895e862 1778static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1779{
1780 redisClient *c = privdata;
1781 int nwritten = 0, totwritten = 0, objlen, willwrite;
1782 robj *o;
1783 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1784 int offset, ion = 0;
1785 REDIS_NOTUSED(el);
1786 REDIS_NOTUSED(mask);
1787
1788 listNode *node;
1789 while (listLength(c->reply)) {
1790 offset = c->sentlen;
1791 ion = 0;
1792 willwrite = 0;
1793
1794 /* fill-in the iov[] array */
1795 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1796 o = listNodeValue(node);
1797 objlen = sdslen(o->ptr);
1798
1799 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1800 break;
1801
1802 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1803 break; /* no more iovecs */
1804
1805 iov[ion].iov_base = ((char*)o->ptr) + offset;
1806 iov[ion].iov_len = objlen - offset;
1807 willwrite += objlen - offset;
1808 offset = 0; /* just for the first item */
1809 ion++;
1810 }
1811
1812 if(willwrite == 0)
1813 break;
1814
1815 /* write all collected blocks at once */
1816 if((nwritten = writev(fd, iov, ion)) < 0) {
1817 if (errno != EAGAIN) {
f870935d 1818 redisLog(REDIS_VERBOSE,
2895e862 1819 "Error writing to client: %s", strerror(errno));
1820 freeClient(c);
1821 return;
1822 }
1823 break;
1824 }
1825
1826 totwritten += nwritten;
1827 offset = c->sentlen;
1828
1829 /* remove written robjs from c->reply */
1830 while (nwritten && listLength(c->reply)) {
1831 o = listNodeValue(listFirst(c->reply));
1832 objlen = sdslen(o->ptr);
1833
1834 if(nwritten >= objlen - offset) {
1835 listDelNode(c->reply, listFirst(c->reply));
1836 nwritten -= objlen - offset;
1837 c->sentlen = 0;
1838 } else {
1839 /* partial write */
1840 c->sentlen += nwritten;
1841 break;
1842 }
1843 offset = 0;
1844 }
1845 }
1846
1847 if (totwritten > 0)
1848 c->lastinteraction = time(NULL);
1849
1850 if (listLength(c->reply) == 0) {
1851 c->sentlen = 0;
1852 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1853 }
1854}
1855
ed9b544e 1856static struct redisCommand *lookupCommand(char *name) {
1857 int j = 0;
1858 while(cmdTable[j].name != NULL) {
bb0b03a3 1859 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 1860 j++;
1861 }
1862 return NULL;
1863}
1864
1865/* resetClient prepare the client to process the next command */
1866static void resetClient(redisClient *c) {
1867 freeClientArgv(c);
1868 c->bulklen = -1;
e8a74421 1869 c->multibulk = 0;
ed9b544e 1870}
1871
6e469882 1872/* Call() is the core of Redis execution of a command */
1873static void call(redisClient *c, struct redisCommand *cmd) {
1874 long long dirty;
1875
1876 dirty = server.dirty;
1877 cmd->proc(c);
1878 if (server.appendonly && server.dirty-dirty)
1879 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
1880 if (server.dirty-dirty && listLength(server.slaves))
1881 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
1882 if (listLength(server.monitors))
1883 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
1884 server.stat_numcommands++;
1885}
1886
ed9b544e 1887/* If this function gets called we already read a whole
1888 * command, argments are in the client argv/argc fields.
1889 * processCommand() execute the command or prepare the
1890 * server for a bulk read from the client.
1891 *
1892 * If 1 is returned the client is still alive and valid and
1893 * and other operations can be performed by the caller. Otherwise
1894 * if 0 is returned the client was destroied (i.e. after QUIT). */
1895static int processCommand(redisClient *c) {
1896 struct redisCommand *cmd;
ed9b544e 1897
3fd78bcd 1898 /* Free some memory if needed (maxmemory setting) */
1899 if (server.maxmemory) freeMemoryIfNeeded();
1900
e8a74421 1901 /* Handle the multi bulk command type. This is an alternative protocol
1902 * supported by Redis in order to receive commands that are composed of
1903 * multiple binary-safe "bulk" arguments. The latency of processing is
1904 * a bit higher but this allows things like multi-sets, so if this
1905 * protocol is used only for MSET and similar commands this is a big win. */
1906 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
1907 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
1908 if (c->multibulk <= 0) {
1909 resetClient(c);
1910 return 1;
1911 } else {
1912 decrRefCount(c->argv[c->argc-1]);
1913 c->argc--;
1914 return 1;
1915 }
1916 } else if (c->multibulk) {
1917 if (c->bulklen == -1) {
1918 if (((char*)c->argv[0]->ptr)[0] != '$') {
1919 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
1920 resetClient(c);
1921 return 1;
1922 } else {
1923 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
1924 decrRefCount(c->argv[0]);
1925 if (bulklen < 0 || bulklen > 1024*1024*1024) {
1926 c->argc--;
1927 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
1928 resetClient(c);
1929 return 1;
1930 }
1931 c->argc--;
1932 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
1933 return 1;
1934 }
1935 } else {
1936 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
1937 c->mbargv[c->mbargc] = c->argv[0];
1938 c->mbargc++;
1939 c->argc--;
1940 c->multibulk--;
1941 if (c->multibulk == 0) {
1942 robj **auxargv;
1943 int auxargc;
1944
1945 /* Here we need to swap the multi-bulk argc/argv with the
1946 * normal argc/argv of the client structure. */
1947 auxargv = c->argv;
1948 c->argv = c->mbargv;
1949 c->mbargv = auxargv;
1950
1951 auxargc = c->argc;
1952 c->argc = c->mbargc;
1953 c->mbargc = auxargc;
1954
1955 /* We need to set bulklen to something different than -1
1956 * in order for the code below to process the command without
1957 * to try to read the last argument of a bulk command as
1958 * a special argument. */
1959 c->bulklen = 0;
1960 /* continue below and process the command */
1961 } else {
1962 c->bulklen = -1;
1963 return 1;
1964 }
1965 }
1966 }
1967 /* -- end of multi bulk commands processing -- */
1968
ed9b544e 1969 /* The QUIT command is handled as a special case. Normal command
1970 * procs are unable to close the client connection safely */
bb0b03a3 1971 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 1972 freeClient(c);
1973 return 0;
1974 }
1975 cmd = lookupCommand(c->argv[0]->ptr);
1976 if (!cmd) {
2c14807b 1977 addReplySds(c,
1978 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
1979 (char*)c->argv[0]->ptr));
ed9b544e 1980 resetClient(c);
1981 return 1;
1982 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
1983 (c->argc < -cmd->arity)) {
454d4e43 1984 addReplySds(c,
1985 sdscatprintf(sdsempty(),
1986 "-ERR wrong number of arguments for '%s' command\r\n",
1987 cmd->name));
ed9b544e 1988 resetClient(c);
1989 return 1;
3fd78bcd 1990 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
1991 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
1992 resetClient(c);
1993 return 1;
ed9b544e 1994 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
1995 int bulklen = atoi(c->argv[c->argc-1]->ptr);
1996
1997 decrRefCount(c->argv[c->argc-1]);
1998 if (bulklen < 0 || bulklen > 1024*1024*1024) {
1999 c->argc--;
2000 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2001 resetClient(c);
2002 return 1;
2003 }
2004 c->argc--;
2005 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2006 /* It is possible that the bulk read is already in the
8d0490e7 2007 * buffer. Check this condition and handle it accordingly.
2008 * This is just a fast path, alternative to call processInputBuffer().
2009 * It's a good idea since the code is small and this condition
2010 * happens most of the times. */
ed9b544e 2011 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2012 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2013 c->argc++;
2014 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2015 } else {
2016 return 1;
2017 }
2018 }
10c43610 2019 /* Let's try to share objects on the command arguments vector */
2020 if (server.shareobjects) {
2021 int j;
2022 for(j = 1; j < c->argc; j++)
2023 c->argv[j] = tryObjectSharing(c->argv[j]);
2024 }
942a3961 2025 /* Let's try to encode the bulk object to save space. */
2026 if (cmd->flags & REDIS_CMD_BULK)
2027 tryObjectEncoding(c->argv[c->argc-1]);
2028
e63943a4 2029 /* Check if the user is authenticated */
2030 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2031 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2032 resetClient(c);
2033 return 1;
2034 }
2035
ed9b544e 2036 /* Exec the command */
6e469882 2037 if (c->flags & REDIS_MULTI && cmd->proc != execCommand) {
2038 queueMultiCommand(c,cmd);
2039 addReply(c,shared.queued);
2040 } else {
2041 call(c,cmd);
2042 }
ed9b544e 2043
2044 /* Prepare the client for the next command */
2045 if (c->flags & REDIS_CLOSE) {
2046 freeClient(c);
2047 return 0;
2048 }
2049 resetClient(c);
2050 return 1;
2051}
2052
87eca727 2053static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6208b3a7 2054 listNode *ln;
ed9b544e 2055 int outc = 0, j;
93ea3759 2056 robj **outv;
2057 /* (args*2)+1 is enough room for args, spaces, newlines */
2058 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2059
2060 if (argc <= REDIS_STATIC_ARGS) {
2061 outv = static_outv;
2062 } else {
2063 outv = zmalloc(sizeof(robj*)*(argc*2+1));
93ea3759 2064 }
ed9b544e 2065
2066 for (j = 0; j < argc; j++) {
2067 if (j != 0) outv[outc++] = shared.space;
2068 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2069 robj *lenobj;
2070
2071 lenobj = createObject(REDIS_STRING,
682ac724 2072 sdscatprintf(sdsempty(),"%lu\r\n",
83c6a618 2073 (unsigned long) stringObjectLen(argv[j])));
ed9b544e 2074 lenobj->refcount = 0;
2075 outv[outc++] = lenobj;
2076 }
2077 outv[outc++] = argv[j];
2078 }
2079 outv[outc++] = shared.crlf;
2080
40d224a9 2081 /* Increment all the refcounts at start and decrement at end in order to
2082 * be sure to free objects if there is no slave in a replication state
2083 * able to be feed with commands */
2084 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
6208b3a7 2085 listRewind(slaves);
2086 while((ln = listYield(slaves))) {
ed9b544e 2087 redisClient *slave = ln->value;
40d224a9 2088
2089 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2090 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2091
2092 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2093 if (slave->slaveseldb != dictid) {
2094 robj *selectcmd;
2095
2096 switch(dictid) {
2097 case 0: selectcmd = shared.select0; break;
2098 case 1: selectcmd = shared.select1; break;
2099 case 2: selectcmd = shared.select2; break;
2100 case 3: selectcmd = shared.select3; break;
2101 case 4: selectcmd = shared.select4; break;
2102 case 5: selectcmd = shared.select5; break;
2103 case 6: selectcmd = shared.select6; break;
2104 case 7: selectcmd = shared.select7; break;
2105 case 8: selectcmd = shared.select8; break;
2106 case 9: selectcmd = shared.select9; break;
2107 default:
2108 selectcmd = createObject(REDIS_STRING,
2109 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2110 selectcmd->refcount = 0;
2111 break;
2112 }
2113 addReply(slave,selectcmd);
2114 slave->slaveseldb = dictid;
2115 }
2116 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2117 }
40d224a9 2118 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2119 if (outv != static_outv) zfree(outv);
ed9b544e 2120}
2121
638e42ac 2122static void processInputBuffer(redisClient *c) {
ed9b544e 2123again:
4409877e 2124 /* Before to process the input buffer, make sure the client is not
2125 * waitig for a blocking operation such as BLPOP. Note that the first
2126 * iteration the client is never blocked, otherwise the processInputBuffer
2127 * would not be called at all, but after the execution of the first commands
2128 * in the input buffer the client may be blocked, and the "goto again"
2129 * will try to reiterate. The following line will make it return asap. */
92f8e882 2130 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2131 if (c->bulklen == -1) {
2132 /* Read the first line of the query */
2133 char *p = strchr(c->querybuf,'\n');
2134 size_t querylen;
644fafa3 2135
ed9b544e 2136 if (p) {
2137 sds query, *argv;
2138 int argc, j;
2139
2140 query = c->querybuf;
2141 c->querybuf = sdsempty();
2142 querylen = 1+(p-(query));
2143 if (sdslen(query) > querylen) {
2144 /* leave data after the first line of the query in the buffer */
2145 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2146 }
2147 *p = '\0'; /* remove "\n" */
2148 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2149 sdsupdatelen(query);
2150
2151 /* Now we can split the query in arguments */
ed9b544e 2152 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2153 sdsfree(query);
2154
2155 if (c->argv) zfree(c->argv);
2156 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2157
2158 for (j = 0; j < argc; j++) {
ed9b544e 2159 if (sdslen(argv[j])) {
2160 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2161 c->argc++;
2162 } else {
2163 sdsfree(argv[j]);
2164 }
2165 }
2166 zfree(argv);
7c49733c 2167 if (c->argc) {
2168 /* Execute the command. If the client is still valid
2169 * after processCommand() return and there is something
2170 * on the query buffer try to process the next command. */
2171 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2172 } else {
2173 /* Nothing to process, argc == 0. Just process the query
2174 * buffer if it's not empty or return to the caller */
2175 if (sdslen(c->querybuf)) goto again;
2176 }
ed9b544e 2177 return;
644fafa3 2178 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2179 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2180 freeClient(c);
2181 return;
2182 }
2183 } else {
2184 /* Bulk read handling. Note that if we are at this point
2185 the client already sent a command terminated with a newline,
2186 we are reading the bulk data that is actually the last
2187 argument of the command. */
2188 int qbl = sdslen(c->querybuf);
2189
2190 if (c->bulklen <= qbl) {
2191 /* Copy everything but the final CRLF as final argument */
2192 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2193 c->argc++;
2194 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2195 /* Process the command. If the client is still valid after
2196 * the processing and there is more data in the buffer
2197 * try to parse it. */
2198 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2199 return;
2200 }
2201 }
2202}
2203
638e42ac 2204static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2205 redisClient *c = (redisClient*) privdata;
2206 char buf[REDIS_IOBUF_LEN];
2207 int nread;
2208 REDIS_NOTUSED(el);
2209 REDIS_NOTUSED(mask);
2210
2211 nread = read(fd, buf, REDIS_IOBUF_LEN);
2212 if (nread == -1) {
2213 if (errno == EAGAIN) {
2214 nread = 0;
2215 } else {
f870935d 2216 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2217 freeClient(c);
2218 return;
2219 }
2220 } else if (nread == 0) {
f870935d 2221 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2222 freeClient(c);
2223 return;
2224 }
2225 if (nread) {
2226 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2227 c->lastinteraction = time(NULL);
2228 } else {
2229 return;
2230 }
2231 processInputBuffer(c);
2232}
2233
ed9b544e 2234static int selectDb(redisClient *c, int id) {
2235 if (id < 0 || id >= server.dbnum)
2236 return REDIS_ERR;
3305306f 2237 c->db = &server.db[id];
ed9b544e 2238 return REDIS_OK;
2239}
2240
40d224a9 2241static void *dupClientReplyValue(void *o) {
2242 incrRefCount((robj*)o);
2243 return 0;
2244}
2245
ed9b544e 2246static redisClient *createClient(int fd) {
2247 redisClient *c = zmalloc(sizeof(*c));
2248
2249 anetNonBlock(NULL,fd);
2250 anetTcpNoDelay(NULL,fd);
2251 if (!c) return NULL;
2252 selectDb(c,0);
2253 c->fd = fd;
2254 c->querybuf = sdsempty();
2255 c->argc = 0;
93ea3759 2256 c->argv = NULL;
ed9b544e 2257 c->bulklen = -1;
e8a74421 2258 c->multibulk = 0;
2259 c->mbargc = 0;
2260 c->mbargv = NULL;
ed9b544e 2261 c->sentlen = 0;
2262 c->flags = 0;
2263 c->lastinteraction = time(NULL);
abcb223e 2264 c->authenticated = 0;
40d224a9 2265 c->replstate = REDIS_REPL_NONE;
6b47e12e 2266 c->reply = listCreate();
ed9b544e 2267 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2268 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2269 c->blockingkeys = NULL;
2270 c->blockingkeysnum = 0;
2271 c->io_keys = listCreate();
2272 listSetFreeMethod(c->io_keys,decrRefCount);
ed9b544e 2273 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2274 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2275 freeClient(c);
2276 return NULL;
2277 }
6b47e12e 2278 listAddNodeTail(server.clients,c);
6e469882 2279 initClientMultiState(c);
ed9b544e 2280 return c;
2281}
2282
2283static void addReply(redisClient *c, robj *obj) {
2284 if (listLength(c->reply) == 0 &&
6208b3a7 2285 (c->replstate == REDIS_REPL_NONE ||
2286 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2287 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2288 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2289
2290 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2291 obj = dupStringObject(obj);
2292 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2293 }
9d65a1bb 2294 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2295}
2296
2297static void addReplySds(redisClient *c, sds s) {
2298 robj *o = createObject(REDIS_STRING,s);
2299 addReply(c,o);
2300 decrRefCount(o);
2301}
2302
e2665397 2303static void addReplyDouble(redisClient *c, double d) {
2304 char buf[128];
2305
2306 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2307 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2308 (unsigned long) strlen(buf),buf));
e2665397 2309}
2310
942a3961 2311static void addReplyBulkLen(redisClient *c, robj *obj) {
2312 size_t len;
2313
2314 if (obj->encoding == REDIS_ENCODING_RAW) {
2315 len = sdslen(obj->ptr);
2316 } else {
2317 long n = (long)obj->ptr;
2318
e054afda 2319 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2320 len = 1;
2321 if (n < 0) {
2322 len++;
2323 n = -n;
2324 }
2325 while((n = n/10) != 0) {
2326 len++;
2327 }
2328 }
83c6a618 2329 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2330}
2331
ed9b544e 2332static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2333 int cport, cfd;
2334 char cip[128];
285add55 2335 redisClient *c;
ed9b544e 2336 REDIS_NOTUSED(el);
2337 REDIS_NOTUSED(mask);
2338 REDIS_NOTUSED(privdata);
2339
2340 cfd = anetAccept(server.neterr, fd, cip, &cport);
2341 if (cfd == AE_ERR) {
f870935d 2342 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2343 return;
2344 }
f870935d 2345 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2346 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2347 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2348 close(cfd); /* May be already closed, just ingore errors */
2349 return;
2350 }
285add55 2351 /* If maxclient directive is set and this is one client more... close the
2352 * connection. Note that we create the client instead to check before
2353 * for this condition, since now the socket is already set in nonblocking
2354 * mode and we can send an error for free using the Kernel I/O */
2355 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2356 char *err = "-ERR max number of clients reached\r\n";
2357
2358 /* That's a best effort error message, don't check write errors */
fee803ba 2359 if (write(c->fd,err,strlen(err)) == -1) {
2360 /* Nothing to do, Just to avoid the warning... */
2361 }
285add55 2362 freeClient(c);
2363 return;
2364 }
ed9b544e 2365 server.stat_numconnections++;
2366}
2367
2368/* ======================= Redis objects implementation ===================== */
2369
2370static robj *createObject(int type, void *ptr) {
2371 robj *o;
2372
a5819310 2373 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2374 if (listLength(server.objfreelist)) {
2375 listNode *head = listFirst(server.objfreelist);
2376 o = listNodeValue(head);
2377 listDelNode(server.objfreelist,head);
a5819310 2378 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2379 } else {
75680a3c 2380 if (server.vm_enabled) {
a5819310 2381 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2382 o = zmalloc(sizeof(*o));
2383 } else {
2384 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2385 }
ed9b544e 2386 }
ed9b544e 2387 o->type = type;
942a3961 2388 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2389 o->ptr = ptr;
2390 o->refcount = 1;
3a66edc7 2391 if (server.vm_enabled) {
2392 o->vm.atime = server.unixtime;
2393 o->storage = REDIS_VM_MEMORY;
2394 }
ed9b544e 2395 return o;
2396}
2397
2398static robj *createStringObject(char *ptr, size_t len) {
2399 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2400}
2401
4ef8de8a 2402static robj *dupStringObject(robj *o) {
b9bc0eef 2403 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2404 return createStringObject(o->ptr,sdslen(o->ptr));
2405}
2406
ed9b544e 2407static robj *createListObject(void) {
2408 list *l = listCreate();
2409
ed9b544e 2410 listSetFreeMethod(l,decrRefCount);
2411 return createObject(REDIS_LIST,l);
2412}
2413
2414static robj *createSetObject(void) {
2415 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2416 return createObject(REDIS_SET,d);
2417}
2418
1812e024 2419static robj *createZsetObject(void) {
6b47e12e 2420 zset *zs = zmalloc(sizeof(*zs));
2421
2422 zs->dict = dictCreate(&zsetDictType,NULL);
2423 zs->zsl = zslCreate();
2424 return createObject(REDIS_ZSET,zs);
1812e024 2425}
2426
ed9b544e 2427static void freeStringObject(robj *o) {
942a3961 2428 if (o->encoding == REDIS_ENCODING_RAW) {
2429 sdsfree(o->ptr);
2430 }
ed9b544e 2431}
2432
2433static void freeListObject(robj *o) {
2434 listRelease((list*) o->ptr);
2435}
2436
2437static void freeSetObject(robj *o) {
2438 dictRelease((dict*) o->ptr);
2439}
2440
fd8ccf44 2441static void freeZsetObject(robj *o) {
2442 zset *zs = o->ptr;
2443
2444 dictRelease(zs->dict);
2445 zslFree(zs->zsl);
2446 zfree(zs);
2447}
2448
ed9b544e 2449static void freeHashObject(robj *o) {
2450 dictRelease((dict*) o->ptr);
2451}
2452
2453static void incrRefCount(robj *o) {
f2b8ab34 2454 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2455 o->refcount++;
2456}
2457
2458static void decrRefCount(void *obj) {
2459 robj *o = obj;
94754ccc 2460
996cb5f7 2461 /* Object is swapped out, or in the process of being loaded. */
2462 if (server.vm_enabled &&
2463 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2464 {
2465 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2466 redisAssert(o->refcount == 1);
2467 }
2468 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2469 redisAssert(o->type == REDIS_STRING);
a35ddf12 2470 freeStringObject(o);
2471 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2472 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2473 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2474 !listAddNodeHead(server.objfreelist,o))
2475 zfree(o);
a5819310 2476 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2477 server.vm_stats_swapped_objects--;
a35ddf12 2478 return;
2479 }
996cb5f7 2480 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2481 if (--(o->refcount) == 0) {
996cb5f7 2482 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2483 vmCancelThreadedIOJob(obj);
ed9b544e 2484 switch(o->type) {
2485 case REDIS_STRING: freeStringObject(o); break;
2486 case REDIS_LIST: freeListObject(o); break;
2487 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2488 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2489 case REDIS_HASH: freeHashObject(o); break;
dfc5e96c 2490 default: redisAssert(0 != 0); break;
ed9b544e 2491 }
a5819310 2492 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2493 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2494 !listAddNodeHead(server.objfreelist,o))
2495 zfree(o);
a5819310 2496 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2497 }
2498}
2499
942a3961 2500static robj *lookupKey(redisDb *db, robj *key) {
2501 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2502 if (de) {
55cf8433 2503 robj *key = dictGetEntryKey(de);
2504 robj *val = dictGetEntryVal(de);
3a66edc7 2505
55cf8433 2506 if (server.vm_enabled) {
996cb5f7 2507 if (key->storage == REDIS_VM_MEMORY ||
2508 key->storage == REDIS_VM_SWAPPING)
2509 {
2510 /* If we were swapping the object out, stop it, this key
2511 * was requested. */
2512 if (key->storage == REDIS_VM_SWAPPING)
2513 vmCancelThreadedIOJob(key);
55cf8433 2514 /* Update the access time of the key for the aging algorithm. */
2515 key->vm.atime = server.unixtime;
2516 } else {
2517 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2518 redisAssert(val == NULL);
55cf8433 2519 val = vmLoadObject(key);
2520 dictGetEntryVal(de) = val;
2521 }
2522 }
2523 return val;
3a66edc7 2524 } else {
2525 return NULL;
2526 }
942a3961 2527}
2528
2529static robj *lookupKeyRead(redisDb *db, robj *key) {
2530 expireIfNeeded(db,key);
2531 return lookupKey(db,key);
2532}
2533
2534static robj *lookupKeyWrite(redisDb *db, robj *key) {
2535 deleteIfVolatile(db,key);
2536 return lookupKey(db,key);
2537}
2538
2539static int deleteKey(redisDb *db, robj *key) {
2540 int retval;
2541
2542 /* We need to protect key from destruction: after the first dictDelete()
2543 * it may happen that 'key' is no longer valid if we don't increment
2544 * it's count. This may happen when we get the object reference directly
2545 * from the hash table with dictRandomKey() or dict iterators */
2546 incrRefCount(key);
2547 if (dictSize(db->expires)) dictDelete(db->expires,key);
2548 retval = dictDelete(db->dict,key);
2549 decrRefCount(key);
2550
2551 return retval == DICT_OK;
2552}
2553
10c43610 2554/* Try to share an object against the shared objects pool */
2555static robj *tryObjectSharing(robj *o) {
2556 struct dictEntry *de;
2557 unsigned long c;
2558
3305306f 2559 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2560
dfc5e96c 2561 redisAssert(o->type == REDIS_STRING);
10c43610 2562 de = dictFind(server.sharingpool,o);
2563 if (de) {
2564 robj *shared = dictGetEntryKey(de);
2565
2566 c = ((unsigned long) dictGetEntryVal(de))+1;
2567 dictGetEntryVal(de) = (void*) c;
2568 incrRefCount(shared);
2569 decrRefCount(o);
2570 return shared;
2571 } else {
2572 /* Here we are using a stream algorihtm: Every time an object is
2573 * shared we increment its count, everytime there is a miss we
2574 * recrement the counter of a random object. If this object reaches
2575 * zero we remove the object and put the current object instead. */
3305306f 2576 if (dictSize(server.sharingpool) >=
10c43610 2577 server.sharingpoolsize) {
2578 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2579 redisAssert(de != NULL);
10c43610 2580 c = ((unsigned long) dictGetEntryVal(de))-1;
2581 dictGetEntryVal(de) = (void*) c;
2582 if (c == 0) {
2583 dictDelete(server.sharingpool,de->key);
2584 }
2585 } else {
2586 c = 0; /* If the pool is empty we want to add this object */
2587 }
2588 if (c == 0) {
2589 int retval;
2590
2591 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2592 redisAssert(retval == DICT_OK);
10c43610 2593 incrRefCount(o);
2594 }
2595 return o;
2596 }
2597}
2598
724a51b1 2599/* Check if the nul-terminated string 's' can be represented by a long
2600 * (that is, is a number that fits into long without any other space or
2601 * character before or after the digits).
2602 *
2603 * If so, the function returns REDIS_OK and *longval is set to the value
2604 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2605static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2606 char buf[32], *endptr;
2607 long value;
2608 int slen;
2609
2610 value = strtol(s, &endptr, 10);
2611 if (endptr[0] != '\0') return REDIS_ERR;
2612 slen = snprintf(buf,32,"%ld",value);
2613
2614 /* If the number converted back into a string is not identical
2615 * then it's not possible to encode the string as integer */
f69f2cba 2616 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2617 if (longval) *longval = value;
2618 return REDIS_OK;
2619}
2620
942a3961 2621/* Try to encode a string object in order to save space */
2622static int tryObjectEncoding(robj *o) {
2623 long value;
942a3961 2624 sds s = o->ptr;
3305306f 2625
942a3961 2626 if (o->encoding != REDIS_ENCODING_RAW)
2627 return REDIS_ERR; /* Already encoded */
3305306f 2628
942a3961 2629 /* It's not save to encode shared objects: shared objects can be shared
2630 * everywhere in the "object space" of Redis. Encoded objects can only
2631 * appear as "values" (and not, for instance, as keys) */
2632 if (o->refcount > 1) return REDIS_ERR;
3305306f 2633
942a3961 2634 /* Currently we try to encode only strings */
dfc5e96c 2635 redisAssert(o->type == REDIS_STRING);
94754ccc 2636
724a51b1 2637 /* Check if we can represent this string as a long integer */
2638 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2639
2640 /* Ok, this object can be encoded */
2641 o->encoding = REDIS_ENCODING_INT;
2642 sdsfree(o->ptr);
2643 o->ptr = (void*) value;
2644 return REDIS_OK;
2645}
2646
9d65a1bb 2647/* Get a decoded version of an encoded object (returned as a new object).
2648 * If the object is already raw-encoded just increment the ref count. */
2649static robj *getDecodedObject(robj *o) {
942a3961 2650 robj *dec;
2651
9d65a1bb 2652 if (o->encoding == REDIS_ENCODING_RAW) {
2653 incrRefCount(o);
2654 return o;
2655 }
942a3961 2656 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2657 char buf[32];
2658
2659 snprintf(buf,32,"%ld",(long)o->ptr);
2660 dec = createStringObject(buf,strlen(buf));
2661 return dec;
2662 } else {
dfc5e96c 2663 redisAssert(1 != 1);
942a3961 2664 }
3305306f 2665}
2666
d7f43c08 2667/* Compare two string objects via strcmp() or alike.
2668 * Note that the objects may be integer-encoded. In such a case we
2669 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2670 * and compare the strings, it's much faster than calling getDecodedObject().
2671 *
2672 * Important note: if objects are not integer encoded, but binary-safe strings,
2673 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2674 * binary safe. */
724a51b1 2675static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2676 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2677 char bufa[128], bufb[128], *astr, *bstr;
2678 int bothsds = 1;
724a51b1 2679
e197b441 2680 if (a == b) return 0;
d7f43c08 2681 if (a->encoding != REDIS_ENCODING_RAW) {
2682 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2683 astr = bufa;
2684 bothsds = 0;
724a51b1 2685 } else {
d7f43c08 2686 astr = a->ptr;
724a51b1 2687 }
d7f43c08 2688 if (b->encoding != REDIS_ENCODING_RAW) {
2689 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2690 bstr = bufb;
2691 bothsds = 0;
2692 } else {
2693 bstr = b->ptr;
2694 }
2695 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 2696}
2697
0ea663ea 2698static size_t stringObjectLen(robj *o) {
dfc5e96c 2699 redisAssert(o->type == REDIS_STRING);
0ea663ea 2700 if (o->encoding == REDIS_ENCODING_RAW) {
2701 return sdslen(o->ptr);
2702 } else {
2703 char buf[32];
2704
2705 return snprintf(buf,32,"%ld",(long)o->ptr);
2706 }
2707}
2708
06233c45 2709/*============================ RDB saving/loading =========================== */
ed9b544e 2710
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
2715
/* Write 't' to 'fp' as a 4 byte signed integer in host byte order (the
 * value is truncated to 32 bits). Returns 0 on success, -1 on write
 * error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
2721
e3566d4b 2722/* check rdbLoadLen() comments for more info */
f78fd11b 2723static int rdbSaveLen(FILE *fp, uint32_t len) {
2724 unsigned char buf[2];
2725
2726 if (len < (1<<6)) {
2727 /* Save a 6 bit len */
10c43610 2728 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 2729 if (fwrite(buf,1,1,fp) == 0) return -1;
2730 } else if (len < (1<<14)) {
2731 /* Save a 14 bit len */
10c43610 2732 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 2733 buf[1] = len&0xFF;
17be1a4a 2734 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 2735 } else {
2736 /* Save a 32 bit len */
10c43610 2737 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 2738 if (fwrite(buf,1,1,fp) == 0) return -1;
2739 len = htonl(len);
2740 if (fwrite(&len,4,1,fp) == 0) return -1;
2741 }
2742 return 0;
2743}
2744
e3566d4b 2745/* String objects in the form "2391" "-100" without any space and with a
2746 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2747 * encoded as integers to save space */
56906eef 2748static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
e3566d4b 2749 long long value;
2750 char *endptr, buf[32];
2751
2752 /* Check if it's possible to encode this value as a number */
2753 value = strtoll(s, &endptr, 10);
2754 if (endptr[0] != '\0') return 0;
2755 snprintf(buf,32,"%lld",value);
2756
2757 /* If the number converted back into a string is not identical
2758 * then it's not possible to encode the string as integer */
2759 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2760
2761 /* Finally check if it fits in our ranges */
2762 if (value >= -(1<<7) && value <= (1<<7)-1) {
2763 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2764 enc[1] = value&0xFF;
2765 return 2;
2766 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2767 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2768 enc[1] = value&0xFF;
2769 enc[2] = (value>>8)&0xFF;
2770 return 3;
2771 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2772 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2773 enc[1] = value&0xFF;
2774 enc[2] = (value>>8)&0xFF;
2775 enc[3] = (value>>16)&0xFF;
2776 enc[4] = (value>>24)&0xFF;
2777 return 5;
2778 } else {
2779 return 0;
2780 }
2781}
2782
774e3047 2783static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2784 unsigned int comprlen, outlen;
2785 unsigned char byte;
2786 void *out;
2787
2788 /* We require at least four bytes compression for this to be worth it */
2789 outlen = sdslen(obj->ptr)-4;
2790 if (outlen <= 0) return 0;
3a2694c4 2791 if ((out = zmalloc(outlen+1)) == NULL) return 0;
774e3047 2792 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2793 if (comprlen == 0) {
88e85998 2794 zfree(out);
774e3047 2795 return 0;
2796 }
2797 /* Data compressed! Let's save it on disk */
2798 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2799 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2800 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2801 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2802 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 2803 zfree(out);
774e3047 2804 return comprlen;
2805
2806writeerr:
88e85998 2807 zfree(out);
774e3047 2808 return -1;
2809}
2810
e3566d4b 2811/* Save a string objet as [len][data] on disk. If the object is a string
2812 * representation of an integer value we try to safe it in a special form */
942a3961 2813static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2814 size_t len;
e3566d4b 2815 int enclen;
10c43610 2816
942a3961 2817 len = sdslen(obj->ptr);
2818
774e3047 2819 /* Try integer encoding */
e3566d4b 2820 if (len <= 11) {
2821 unsigned char buf[5];
2822 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2823 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2824 return 0;
2825 }
2826 }
774e3047 2827
2828 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 2829 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 2830 if (server.rdbcompression && len > 20) {
774e3047 2831 int retval;
2832
2833 retval = rdbSaveLzfStringObject(fp,obj);
2834 if (retval == -1) return -1;
2835 if (retval > 0) return 0;
2836 /* retval == 0 means data can't be compressed, save the old way */
2837 }
2838
2839 /* Store verbatim */
10c43610 2840 if (rdbSaveLen(fp,len) == -1) return -1;
2841 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
2842 return 0;
2843}
2844
942a3961 2845/* Like rdbSaveStringObjectRaw() but handle encoded objects */
2846static int rdbSaveStringObject(FILE *fp, robj *obj) {
2847 int retval;
942a3961 2848
996cb5f7 2849 if (obj->storage == REDIS_VM_MEMORY &&
2850 obj->encoding != REDIS_ENCODING_RAW)
2851 {
2852 obj = getDecodedObject(obj);
2853 retval = rdbSaveStringObjectRaw(fp,obj);
2854 decrRefCount(obj);
2855 } else {
2856 /* This is a fast path when we are sure the object is not encoded.
2857 * Note that's any *faster* actually as we needed to add the conditional
2858 * but because this may happen in a background process we don't want
2859 * to touch the object fields with incr/decrRefCount in order to
2860 * preveny copy on write of pages.
2861 *
2862 * Also incrRefCount() will have a failing assert() if we try to call
2863 * it against an object with storage != REDIS_VM_MEMORY. */
2864 retval = rdbSaveStringObjectRaw(fp,obj);
2865 }
9d65a1bb 2866 return retval;
942a3961 2867}
2868
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* special values take just the marker byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
2895
06233c45 2896/* Save a Redis object. */
2897static int rdbSaveObject(FILE *fp, robj *o) {
2898 if (o->type == REDIS_STRING) {
2899 /* Save a string value */
2900 if (rdbSaveStringObject(fp,o) == -1) return -1;
2901 } else if (o->type == REDIS_LIST) {
2902 /* Save a list value */
2903 list *list = o->ptr;
2904 listNode *ln;
2905
2906 listRewind(list);
2907 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
2908 while((ln = listYield(list))) {
2909 robj *eleobj = listNodeValue(ln);
2910
2911 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2912 }
2913 } else if (o->type == REDIS_SET) {
2914 /* Save a set value */
2915 dict *set = o->ptr;
2916 dictIterator *di = dictGetIterator(set);
2917 dictEntry *de;
2918
2919 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
2920 while((de = dictNext(di)) != NULL) {
2921 robj *eleobj = dictGetEntryKey(de);
2922
2923 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2924 }
2925 dictReleaseIterator(di);
2926 } else if (o->type == REDIS_ZSET) {
2927 /* Save a set value */
2928 zset *zs = o->ptr;
2929 dictIterator *di = dictGetIterator(zs->dict);
2930 dictEntry *de;
2931
2932 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
2933 while((de = dictNext(di)) != NULL) {
2934 robj *eleobj = dictGetEntryKey(de);
2935 double *score = dictGetEntryVal(de);
2936
2937 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2938 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
2939 }
2940 dictReleaseIterator(di);
2941 } else {
2942 redisAssert(0 != 0);
2943 }
2944 return 0;
2945}
2946
2947/* Return the length the object will have on disk if saved with
2948 * the rdbSaveObject() function. Currently we use a trick to get
2949 * this length with very little changes to the code. In the future
2950 * we could switch to a faster solution. */
b9bc0eef 2951static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
2952 if (fp == NULL) fp = server.devnull;
06233c45 2953 rewind(fp);
2954 assert(rdbSaveObject(fp,o) != 1);
2955 return ftello(fp);
2956}
2957
06224fec 2958/* Return the number of pages required to save this object in the swap file */
b9bc0eef 2959static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
2960 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 2961
2962 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
2963}
2964
ed9b544e 2965/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 2966static int rdbSave(char *filename) {
ed9b544e 2967 dictIterator *di = NULL;
2968 dictEntry *de;
ed9b544e 2969 FILE *fp;
2970 char tmpfile[256];
2971 int j;
bb32ede5 2972 time_t now = time(NULL);
ed9b544e 2973
a3b21203 2974 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 2975 fp = fopen(tmpfile,"w");
2976 if (!fp) {
2977 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
2978 return REDIS_ERR;
2979 }
f78fd11b 2980 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 2981 for (j = 0; j < server.dbnum; j++) {
bb32ede5 2982 redisDb *db = server.db+j;
2983 dict *d = db->dict;
3305306f 2984 if (dictSize(d) == 0) continue;
ed9b544e 2985 di = dictGetIterator(d);
2986 if (!di) {
2987 fclose(fp);
2988 return REDIS_ERR;
2989 }
2990
2991 /* Write the SELECT DB opcode */
f78fd11b 2992 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
2993 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 2994
2995 /* Iterate this DB writing every entry */
2996 while((de = dictNext(di)) != NULL) {
2997 robj *key = dictGetEntryKey(de);
2998 robj *o = dictGetEntryVal(de);
bb32ede5 2999 time_t expiretime = getExpire(db,key);
3000
3001 /* Save the expire time */
3002 if (expiretime != -1) {
3003 /* If this key is already expired skip it */
3004 if (expiretime < now) continue;
3005 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3006 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3007 }
7e69548d 3008 /* Save the key and associated value. This requires special
3009 * handling if the value is swapped out. */
996cb5f7 3010 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3011 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3012 /* Save type, key, value */
3013 if (rdbSaveType(fp,o->type) == -1) goto werr;
3014 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3015 if (rdbSaveObject(fp,o) == -1) goto werr;
3016 } else {
996cb5f7 3017 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3018 robj *po;
7e69548d 3019 /* Get a preview of the object in memory */
3020 po = vmPreviewObject(key);
7e69548d 3021 /* Save type, key, value */
3022 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3023 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3024 if (rdbSaveObject(fp,po) == -1) goto werr;
3025 /* Remove the loaded object from memory */
3026 decrRefCount(po);
7e69548d 3027 }
ed9b544e 3028 }
3029 dictReleaseIterator(di);
3030 }
3031 /* EOF opcode */
f78fd11b 3032 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3033
3034 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3035 fflush(fp);
3036 fsync(fileno(fp));
3037 fclose(fp);
3038
3039 /* Use RENAME to make sure the DB file is changed atomically only
3040 * if the generate DB file is ok. */
3041 if (rename(tmpfile,filename) == -1) {
325d1eb4 3042 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3043 unlink(tmpfile);
3044 return REDIS_ERR;
3045 }
3046 redisLog(REDIS_NOTICE,"DB saved on disk");
3047 server.dirty = 0;
3048 server.lastsave = time(NULL);
3049 return REDIS_OK;
3050
3051werr:
3052 fclose(fp);
3053 unlink(tmpfile);
3054 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3055 if (di) dictReleaseIterator(di);
3056 return REDIS_ERR;
3057}
3058
f78fd11b 3059static int rdbSaveBackground(char *filename) {
ed9b544e 3060 pid_t childpid;
3061
9d65a1bb 3062 if (server.bgsavechildpid != -1) return REDIS_ERR;
ed9b544e 3063 if ((childpid = fork()) == 0) {
3064 /* Child */
3065 close(server.fd);
f78fd11b 3066 if (rdbSave(filename) == REDIS_OK) {
ed9b544e 3067 exit(0);
3068 } else {
3069 exit(1);
3070 }
3071 } else {
3072 /* Parent */
5a7c647e 3073 if (childpid == -1) {
3074 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3075 strerror(errno));
3076 return REDIS_ERR;
3077 }
ed9b544e 3078 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3079 server.bgsavechildpid = childpid;
ed9b544e 3080 return REDIS_OK;
3081 }
3082 return REDIS_OK; /* unreached */
3083}
3084
/* Remove the temp file "temp-<childpid>.rdb" from the current directory.
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,256,"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3091
/* Read a single type byte from 'fp'. Returns the byte value (0-255), or
 * -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3097
/* Read a 4 byte signed integer in host byte order from 'fp' and return it
 * as a time_t. Returns -1 if the value could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    return (fread(&stamp,4,1,fp) == 0) ? -1 : (time_t) stamp;
}
3103
e3566d4b 3104/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3105 * of this file for a description of how this are stored on disk.
3106 *
3107 * isencoded is set to 1 if the readed length is not actually a length but
3108 * an "encoding type", check the above comments for more info */
c78a8ccc 3109static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3110 unsigned char buf[2];
3111 uint32_t len;
c78a8ccc 3112 int type;
f78fd11b 3113
e3566d4b 3114 if (isencoded) *isencoded = 0;
c78a8ccc 3115 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3116 type = (buf[0]&0xC0)>>6;
3117 if (type == REDIS_RDB_6BITLEN) {
3118 /* Read a 6 bit len */
3119 return buf[0]&0x3F;
3120 } else if (type == REDIS_RDB_ENCVAL) {
3121 /* Read a 6 bit len encoding type */
3122 if (isencoded) *isencoded = 1;
3123 return buf[0]&0x3F;
3124 } else if (type == REDIS_RDB_14BITLEN) {
3125 /* Read a 14 bit len */
3126 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3127 return ((buf[0]&0x3F)<<8)|buf[1];
3128 } else {
3129 /* Read a 32 bit len */
f78fd11b 3130 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3131 return ntohl(len);
f78fd11b 3132 }
f78fd11b 3133}
3134
e3566d4b 3135static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3136 unsigned char enc[4];
3137 long long val;
3138
3139 if (enctype == REDIS_RDB_ENC_INT8) {
3140 if (fread(enc,1,1,fp) == 0) return NULL;
3141 val = (signed char)enc[0];
3142 } else if (enctype == REDIS_RDB_ENC_INT16) {
3143 uint16_t v;
3144 if (fread(enc,2,1,fp) == 0) return NULL;
3145 v = enc[0]|(enc[1]<<8);
3146 val = (int16_t)v;
3147 } else if (enctype == REDIS_RDB_ENC_INT32) {
3148 uint32_t v;
3149 if (fread(enc,4,1,fp) == 0) return NULL;
3150 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3151 val = (int32_t)v;
3152 } else {
3153 val = 0; /* anti-warning */
dfc5e96c 3154 redisAssert(0!=0);
e3566d4b 3155 }
3156 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3157}
3158
c78a8ccc 3159static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3160 unsigned int len, clen;
3161 unsigned char *c = NULL;
3162 sds val = NULL;
3163
c78a8ccc 3164 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3165 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3166 if ((c = zmalloc(clen)) == NULL) goto err;
3167 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3168 if (fread(c,clen,1,fp) == 0) goto err;
3169 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3170 zfree(c);
88e85998 3171 return createObject(REDIS_STRING,val);
3172err:
3173 zfree(c);
3174 sdsfree(val);
3175 return NULL;
3176}
3177
c78a8ccc 3178static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3179 int isencoded;
3180 uint32_t len;
f78fd11b 3181 sds val;
3182
c78a8ccc 3183 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3184 if (isencoded) {
3185 switch(len) {
3186 case REDIS_RDB_ENC_INT8:
3187 case REDIS_RDB_ENC_INT16:
3188 case REDIS_RDB_ENC_INT32:
3305306f 3189 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3190 case REDIS_RDB_ENC_LZF:
c78a8ccc 3191 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3192 default:
dfc5e96c 3193 redisAssert(0!=0);
e3566d4b 3194 }
3195 }
3196
f78fd11b 3197 if (len == REDIS_RDB_LENERR) return NULL;
3198 val = sdsnewlen(NULL,len);
3199 if (len && fread(val,len,1,fp) == 0) {
3200 sdsfree(val);
3201 return NULL;
3202 }
10c43610 3203 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3204}
3205
a7866db6 3206/* For information about double serialization check rdbSaveDoubleValue() */
3207static int rdbLoadDoubleValue(FILE *fp, double *val) {
3208 char buf[128];
3209 unsigned char len;
3210
3211 if (fread(&len,1,1,fp) == 0) return -1;
3212 switch(len) {
3213 case 255: *val = R_NegInf; return 0;
3214 case 254: *val = R_PosInf; return 0;
3215 case 253: *val = R_Nan; return 0;
3216 default:
3217 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3218 buf[len] = '\0';
a7866db6 3219 sscanf(buf, "%lg", val);
3220 return 0;
3221 }
3222}
3223
c78a8ccc 3224/* Load a Redis object of the specified type from the specified file.
3225 * On success a newly allocated object is returned, otherwise NULL. */
3226static robj *rdbLoadObject(int type, FILE *fp) {
3227 robj *o;
3228
3229 if (type == REDIS_STRING) {
3230 /* Read string value */
3231 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3232 tryObjectEncoding(o);
3233 } else if (type == REDIS_LIST || type == REDIS_SET) {
3234 /* Read list/set value */
3235 uint32_t listlen;
3236
3237 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3238 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3239 /* Load every single element of the list/set */
3240 while(listlen--) {
3241 robj *ele;
3242
3243 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3244 tryObjectEncoding(ele);
3245 if (type == REDIS_LIST) {
3246 listAddNodeTail((list*)o->ptr,ele);
3247 } else {
3248 dictAdd((dict*)o->ptr,ele,NULL);
3249 }
3250 }
3251 } else if (type == REDIS_ZSET) {
3252 /* Read list/set value */
3253 uint32_t zsetlen;
3254 zset *zs;
3255
3256 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3257 o = createZsetObject();
3258 zs = o->ptr;
3259 /* Load every single element of the list/set */
3260 while(zsetlen--) {
3261 robj *ele;
3262 double *score = zmalloc(sizeof(double));
3263
3264 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3265 tryObjectEncoding(ele);
3266 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3267 dictAdd(zs->dict,ele,score);
3268 zslInsert(zs->zsl,*score,ele);
3269 incrRefCount(ele); /* added to skiplist */
3270 }
3271 } else {
3272 redisAssert(0 != 0);
3273 }
3274 return o;
3275}
3276
f78fd11b 3277static int rdbLoad(char *filename) {
ed9b544e 3278 FILE *fp;
f78fd11b 3279 robj *keyobj = NULL;
3280 uint32_t dbid;
bb32ede5 3281 int type, retval, rdbver;
3305306f 3282 dict *d = server.db[0].dict;
bb32ede5 3283 redisDb *db = server.db+0;
f78fd11b 3284 char buf[1024];
bb32ede5 3285 time_t expiretime = -1, now = time(NULL);
b492cf00 3286 long long loadedkeys = 0;
bb32ede5 3287
ed9b544e 3288 fp = fopen(filename,"r");
3289 if (!fp) return REDIS_ERR;
3290 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3291 buf[9] = '\0';
3292 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3293 fclose(fp);
3294 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3295 return REDIS_ERR;
3296 }
f78fd11b 3297 rdbver = atoi(buf+5);
c78a8ccc 3298 if (rdbver != 1) {
f78fd11b 3299 fclose(fp);
3300 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3301 return REDIS_ERR;
3302 }
ed9b544e 3303 while(1) {
3304 robj *o;
3305
3306 /* Read type. */
f78fd11b 3307 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3308 if (type == REDIS_EXPIRETIME) {
3309 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3310 /* We read the time so we need to read the object type again */
3311 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3312 }
ed9b544e 3313 if (type == REDIS_EOF) break;
3314 /* Handle SELECT DB opcode as a special case */
3315 if (type == REDIS_SELECTDB) {
c78a8ccc 3316 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3317 goto eoferr;
ed9b544e 3318 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3319 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3320 exit(1);
3321 }
bb32ede5 3322 db = server.db+dbid;
3323 d = db->dict;
ed9b544e 3324 continue;
3325 }
3326 /* Read key */
c78a8ccc 3327 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3328 /* Read value */
3329 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3330 /* Add the new object in the hash table */
f78fd11b 3331 retval = dictAdd(d,keyobj,o);
ed9b544e 3332 if (retval == DICT_ERR) {
f78fd11b 3333 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3334 exit(1);
3335 }
bb32ede5 3336 /* Set the expire time if needed */
3337 if (expiretime != -1) {
3338 setExpire(db,keyobj,expiretime);
3339 /* Delete this key if already expired */
3340 if (expiretime < now) deleteKey(db,keyobj);
3341 expiretime = -1;
3342 }
f78fd11b 3343 keyobj = o = NULL;
b492cf00 3344 /* Handle swapping while loading big datasets when VM is on */
3345 loadedkeys++;
3346 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3347 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3348 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3349 }
3350 }
ed9b544e 3351 }
3352 fclose(fp);
3353 return REDIS_OK;
3354
3355eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3356 if (keyobj) decrRefCount(keyobj);
f80dff62 3357 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3358 exit(1);
3359 return REDIS_ERR; /* Just to avoid warning */
3360}
3361
3362/*================================== Commands =============================== */
3363
abcb223e 3364static void authCommand(redisClient *c) {
2e77c2ee 3365 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3366 c->authenticated = 1;
3367 addReply(c,shared.ok);
3368 } else {
3369 c->authenticated = 0;
fa4c0aba 3370 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3371 }
3372}
3373
ed9b544e 3374static void pingCommand(redisClient *c) {
3375 addReply(c,shared.pong);
3376}
3377
3378static void echoCommand(redisClient *c) {
942a3961 3379 addReplyBulkLen(c,c->argv[1]);
ed9b544e 3380 addReply(c,c->argv[1]);
3381 addReply(c,shared.crlf);
3382}
3383
3384/*=================================== Strings =============================== */
3385
3386static void setGenericCommand(redisClient *c, int nx) {
3387 int retval;
3388
333fd216 3389 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3390 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3391 if (retval == DICT_ERR) {
3392 if (!nx) {
1b03836c 3393 /* If the key is about a swapped value, we want a new key object
3394 * to overwrite the old. So we delete the old key in the database.
3395 * This will also make sure that swap pages about the old object
3396 * will be marked as free. */
3397 if (deleteIfSwapped(c->db,c->argv[1]))
3398 incrRefCount(c->argv[1]);
3305306f 3399 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3400 incrRefCount(c->argv[2]);
3401 } else {
c937aa89 3402 addReply(c,shared.czero);
ed9b544e 3403 return;
3404 }
3405 } else {
3406 incrRefCount(c->argv[1]);
3407 incrRefCount(c->argv[2]);
3408 }
3409 server.dirty++;
3305306f 3410 removeExpire(c->db,c->argv[1]);
c937aa89 3411 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3412}
3413
3414static void setCommand(redisClient *c) {
a4d1ba9a 3415 setGenericCommand(c,0);
ed9b544e 3416}
3417
3418static void setnxCommand(redisClient *c) {
a4d1ba9a 3419 setGenericCommand(c,1);
ed9b544e 3420}
3421
322fc7d8 3422static int getGenericCommand(redisClient *c) {
3305306f 3423 robj *o = lookupKeyRead(c->db,c->argv[1]);
3424
3425 if (o == NULL) {
c937aa89 3426 addReply(c,shared.nullbulk);
322fc7d8 3427 return REDIS_OK;
ed9b544e 3428 } else {
ed9b544e 3429 if (o->type != REDIS_STRING) {
c937aa89 3430 addReply(c,shared.wrongtypeerr);
322fc7d8 3431 return REDIS_ERR;
ed9b544e 3432 } else {
942a3961 3433 addReplyBulkLen(c,o);
ed9b544e 3434 addReply(c,o);
3435 addReply(c,shared.crlf);
322fc7d8 3436 return REDIS_OK;
ed9b544e 3437 }
3438 }
3439}
3440
322fc7d8 3441static void getCommand(redisClient *c) {
3442 getGenericCommand(c);
3443}
3444
f6b141c5 3445static void getsetCommand(redisClient *c) {
322fc7d8 3446 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3447 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3448 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3449 } else {
3450 incrRefCount(c->argv[1]);
3451 }
3452 incrRefCount(c->argv[2]);
3453 server.dirty++;
3454 removeExpire(c->db,c->argv[1]);
3455}
3456
70003d28 3457static void mgetCommand(redisClient *c) {
70003d28 3458 int j;
3459
c937aa89 3460 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3461 for (j = 1; j < c->argc; j++) {
3305306f 3462 robj *o = lookupKeyRead(c->db,c->argv[j]);
3463 if (o == NULL) {
c937aa89 3464 addReply(c,shared.nullbulk);
70003d28 3465 } else {
70003d28 3466 if (o->type != REDIS_STRING) {
c937aa89 3467 addReply(c,shared.nullbulk);
70003d28 3468 } else {
942a3961 3469 addReplyBulkLen(c,o);
70003d28 3470 addReply(c,o);
3471 addReply(c,shared.crlf);
3472 }
3473 }
3474 }
3475}
3476
6c446631 3477static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3478 int j, busykeys = 0;
6c446631 3479
3480 if ((c->argc % 2) == 0) {
454d4e43 3481 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3482 return;
3483 }
3484 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3485 * set nothing at all if at least one already key exists. */
3486 if (nx) {
3487 for (j = 1; j < c->argc; j += 2) {
906573e7 3488 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3489 busykeys++;
6c446631 3490 }
3491 }
3492 }
906573e7 3493 if (busykeys) {
3494 addReply(c, shared.czero);
3495 return;
3496 }
6c446631 3497
3498 for (j = 1; j < c->argc; j += 2) {
3499 int retval;
3500
17511391 3501 tryObjectEncoding(c->argv[j+1]);
6c446631 3502 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3503 if (retval == DICT_ERR) {
3504 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3505 incrRefCount(c->argv[j+1]);
3506 } else {
3507 incrRefCount(c->argv[j]);
3508 incrRefCount(c->argv[j+1]);
3509 }
3510 removeExpire(c->db,c->argv[j]);
3511 }
3512 server.dirty += (c->argc-1)/2;
3513 addReply(c, nx ? shared.cone : shared.ok);
3514}
3515
3516static void msetCommand(redisClient *c) {
3517 msetGenericCommand(c,0);
3518}
3519
3520static void msetnxCommand(redisClient *c) {
3521 msetGenericCommand(c,1);
3522}
3523
d68ed120 3524static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3525 long long value;
3526 int retval;
3527 robj *o;
3528
3305306f 3529 o = lookupKeyWrite(c->db,c->argv[1]);
3530 if (o == NULL) {
ed9b544e 3531 value = 0;
3532 } else {
ed9b544e 3533 if (o->type != REDIS_STRING) {
3534 value = 0;
3535 } else {
3536 char *eptr;
3537
942a3961 3538 if (o->encoding == REDIS_ENCODING_RAW)
3539 value = strtoll(o->ptr, &eptr, 10);
3540 else if (o->encoding == REDIS_ENCODING_INT)
3541 value = (long)o->ptr;
3542 else
dfc5e96c 3543 redisAssert(1 != 1);
ed9b544e 3544 }
3545 }
3546
3547 value += incr;
3548 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3549 tryObjectEncoding(o);
3305306f 3550 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3551 if (retval == DICT_ERR) {
3305306f 3552 dictReplace(c->db->dict,c->argv[1],o);
3553 removeExpire(c->db,c->argv[1]);
ed9b544e 3554 } else {
3555 incrRefCount(c->argv[1]);
3556 }
3557 server.dirty++;
c937aa89 3558 addReply(c,shared.colon);
ed9b544e 3559 addReply(c,o);
3560 addReply(c,shared.crlf);
3561}
3562
3563static void incrCommand(redisClient *c) {
a4d1ba9a 3564 incrDecrCommand(c,1);
ed9b544e 3565}
3566
3567static void decrCommand(redisClient *c) {
a4d1ba9a 3568 incrDecrCommand(c,-1);
ed9b544e 3569}
3570
3571static void incrbyCommand(redisClient *c) {
d68ed120 3572 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3573 incrDecrCommand(c,incr);
ed9b544e 3574}
3575
3576static void decrbyCommand(redisClient *c) {
d68ed120 3577 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3578 incrDecrCommand(c,-incr);
ed9b544e 3579}
3580
3581/* ========================= Type agnostic commands ========================= */
3582
3583static void delCommand(redisClient *c) {
5109cdff 3584 int deleted = 0, j;
3585
3586 for (j = 1; j < c->argc; j++) {
3587 if (deleteKey(c->db,c->argv[j])) {
3588 server.dirty++;
3589 deleted++;
3590 }
3591 }
3592 switch(deleted) {
3593 case 0:
c937aa89 3594 addReply(c,shared.czero);
5109cdff 3595 break;
3596 case 1:
3597 addReply(c,shared.cone);
3598 break;
3599 default:
3600 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3601 break;
ed9b544e 3602 }
3603}
3604
3605static void existsCommand(redisClient *c) {
3305306f 3606 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 3607}
3608
3609static void selectCommand(redisClient *c) {
3610 int id = atoi(c->argv[1]->ptr);
3611
3612 if (selectDb(c,id) == REDIS_ERR) {
774e3047 3613 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 3614 } else {
3615 addReply(c,shared.ok);
3616 }
3617}
3618
3619static void randomkeyCommand(redisClient *c) {
3620 dictEntry *de;
3305306f 3621
3622 while(1) {
3623 de = dictGetRandomKey(c->db->dict);
ce7bef07 3624 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 3625 }
ed9b544e 3626 if (de == NULL) {
ce7bef07 3627 addReply(c,shared.plus);
ed9b544e 3628 addReply(c,shared.crlf);
3629 } else {
c937aa89 3630 addReply(c,shared.plus);
ed9b544e 3631 addReply(c,dictGetEntryKey(de));
3632 addReply(c,shared.crlf);
3633 }
3634}
3635
3636static void keysCommand(redisClient *c) {
3637 dictIterator *di;
3638 dictEntry *de;
3639 sds pattern = c->argv[1]->ptr;
3640 int plen = sdslen(pattern);
682ac724 3641 unsigned long numkeys = 0, keyslen = 0;
ed9b544e 3642 robj *lenobj = createObject(REDIS_STRING,NULL);
3643
3305306f 3644 di = dictGetIterator(c->db->dict);
ed9b544e 3645 addReply(c,lenobj);
3646 decrRefCount(lenobj);
3647 while((de = dictNext(di)) != NULL) {
3648 robj *keyobj = dictGetEntryKey(de);
3305306f 3649
ed9b544e 3650 sds key = keyobj->ptr;
3651 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3652 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 3653 if (expireIfNeeded(c->db,keyobj) == 0) {
3654 if (numkeys != 0)
3655 addReply(c,shared.space);
3656 addReply(c,keyobj);
3657 numkeys++;
3658 keyslen += sdslen(key);
3659 }
ed9b544e 3660 }
3661 }
3662 dictReleaseIterator(di);
c937aa89 3663 lenobj->ptr = sdscatprintf(sdsempty(),"$%lu\r\n",keyslen+(numkeys ? (numkeys-1) : 0));
ed9b544e 3664 addReply(c,shared.crlf);
3665}
3666
3667static void dbsizeCommand(redisClient *c) {
3668 addReplySds(c,
3305306f 3669 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 3670}
3671
3672static void lastsaveCommand(redisClient *c) {
3673 addReplySds(c,
c937aa89 3674 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 3675}
3676
3677static void typeCommand(redisClient *c) {
3305306f 3678 robj *o;
ed9b544e 3679 char *type;
3305306f 3680
3681 o = lookupKeyRead(c->db,c->argv[1]);
3682 if (o == NULL) {
c937aa89 3683 type = "+none";
ed9b544e 3684 } else {
ed9b544e 3685 switch(o->type) {
c937aa89 3686 case REDIS_STRING: type = "+string"; break;
3687 case REDIS_LIST: type = "+list"; break;
3688 case REDIS_SET: type = "+set"; break;
412a8bce 3689 case REDIS_ZSET: type = "+zset"; break;
ed9b544e 3690 default: type = "unknown"; break;
3691 }
3692 }
3693 addReplySds(c,sdsnew(type));
3694 addReply(c,shared.crlf);
3695}
3696
3697static void saveCommand(redisClient *c) {
9d65a1bb 3698 if (server.bgsavechildpid != -1) {
05557f6d 3699 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3700 return;
3701 }
f78fd11b 3702 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 3703 addReply(c,shared.ok);
3704 } else {
3705 addReply(c,shared.err);
3706 }
3707}
3708
3709static void bgsaveCommand(redisClient *c) {
9d65a1bb 3710 if (server.bgsavechildpid != -1) {
ed9b544e 3711 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3712 return;
3713 }
f78fd11b 3714 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 3715 char *status = "+Background saving started\r\n";
3716 addReplySds(c,sdsnew(status));
ed9b544e 3717 } else {
3718 addReply(c,shared.err);
3719 }
3720}
3721
3722static void shutdownCommand(redisClient *c) {
3723 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 3724 /* Kill the saving child if there is a background saving in progress.
3725 We want to avoid race conditions, for instance our saving child may
3726 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 3727 if (server.bgsavechildpid != -1) {
9f3c422c 3728 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3729 kill(server.bgsavechildpid,SIGKILL);
a3b21203 3730 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 3731 }
ac945e2d 3732 if (server.appendonly) {
3733 /* Append only file: fsync() the AOF and exit */
3734 fsync(server.appendfd);
3735 exit(0);
ed9b544e 3736 } else {
ac945e2d 3737 /* Snapshotting. Perform a SYNC SAVE and exit */
3738 if (rdbSave(server.dbfilename) == REDIS_OK) {
3739 if (server.daemonize)
3740 unlink(server.pidfile);
3741 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3742 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
3743 exit(0);
3744 } else {
3745 /* Ooops.. error saving! The best we can do is to continue operating.
3746 * Note that if there was a background saving process, in the next
3747 * cron() Redis will be notified that the background saving aborted,
3748 * handling special stuff like slaves pending for synchronization... */
3749 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
3750 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3751 }
ed9b544e 3752 }
3753}
3754
3755static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 3756 robj *o;
3757
3758 /* To use the same key as src and dst is probably an error */
3759 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 3760 addReply(c,shared.sameobjecterr);
ed9b544e 3761 return;
3762 }
3763
3305306f 3764 o = lookupKeyWrite(c->db,c->argv[1]);
3765 if (o == NULL) {
c937aa89 3766 addReply(c,shared.nokeyerr);
ed9b544e 3767 return;
3768 }
ed9b544e 3769 incrRefCount(o);
3305306f 3770 deleteIfVolatile(c->db,c->argv[2]);
3771 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 3772 if (nx) {
3773 decrRefCount(o);
c937aa89 3774 addReply(c,shared.czero);
ed9b544e 3775 return;
3776 }
3305306f 3777 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 3778 } else {
3779 incrRefCount(c->argv[2]);
3780 }
3305306f 3781 deleteKey(c->db,c->argv[1]);
ed9b544e 3782 server.dirty++;
c937aa89 3783 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 3784}
3785
3786static void renameCommand(redisClient *c) {
3787 renameGenericCommand(c,0);
3788}
3789
3790static void renamenxCommand(redisClient *c) {
3791 renameGenericCommand(c,1);
3792}
3793
3794static void moveCommand(redisClient *c) {
3305306f 3795 robj *o;
3796 redisDb *src, *dst;
ed9b544e 3797 int srcid;
3798
3799 /* Obtain source and target DB pointers */
3305306f 3800 src = c->db;
3801 srcid = c->db->id;
ed9b544e 3802 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 3803 addReply(c,shared.outofrangeerr);
ed9b544e 3804 return;
3805 }
3305306f 3806 dst = c->db;
3807 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 3808
3809 /* If the user is moving using as target the same
3810 * DB as the source DB it is probably an error. */
3811 if (src == dst) {
c937aa89 3812 addReply(c,shared.sameobjecterr);
ed9b544e 3813 return;
3814 }
3815
3816 /* Check if the element exists and get a reference */
3305306f 3817 o = lookupKeyWrite(c->db,c->argv[1]);
3818 if (!o) {
c937aa89 3819 addReply(c,shared.czero);
ed9b544e 3820 return;
3821 }
3822
3823 /* Try to add the element to the target DB */
3305306f 3824 deleteIfVolatile(dst,c->argv[1]);
3825 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 3826 addReply(c,shared.czero);
ed9b544e 3827 return;
3828 }
3305306f 3829 incrRefCount(c->argv[1]);
ed9b544e 3830 incrRefCount(o);
3831
3832 /* OK! key moved, free the entry in the source DB */
3305306f 3833 deleteKey(src,c->argv[1]);
ed9b544e 3834 server.dirty++;
c937aa89 3835 addReply(c,shared.cone);
ed9b544e 3836}
3837
3838/* =================================== Lists ================================ */
3839static void pushGenericCommand(redisClient *c, int where) {
3840 robj *lobj;
ed9b544e 3841 list *list;
3305306f 3842
3843 lobj = lookupKeyWrite(c->db,c->argv[1]);
3844 if (lobj == NULL) {
95242ab5 3845 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3846 addReply(c,shared.ok);
3847 return;
3848 }
ed9b544e 3849 lobj = createListObject();
3850 list = lobj->ptr;
3851 if (where == REDIS_HEAD) {
6b47e12e 3852 listAddNodeHead(list,c->argv[2]);
ed9b544e 3853 } else {
6b47e12e 3854 listAddNodeTail(list,c->argv[2]);
ed9b544e 3855 }
3305306f 3856 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 3857 incrRefCount(c->argv[1]);
3858 incrRefCount(c->argv[2]);
3859 } else {
ed9b544e 3860 if (lobj->type != REDIS_LIST) {
3861 addReply(c,shared.wrongtypeerr);
3862 return;
3863 }
95242ab5 3864 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3865 addReply(c,shared.ok);
3866 return;
3867 }
ed9b544e 3868 list = lobj->ptr;
3869 if (where == REDIS_HEAD) {
6b47e12e 3870 listAddNodeHead(list,c->argv[2]);
ed9b544e 3871 } else {
6b47e12e 3872 listAddNodeTail(list,c->argv[2]);
ed9b544e 3873 }
3874 incrRefCount(c->argv[2]);
3875 }
3876 server.dirty++;
3877 addReply(c,shared.ok);
3878}
3879
3880static void lpushCommand(redisClient *c) {
3881 pushGenericCommand(c,REDIS_HEAD);
3882}
3883
3884static void rpushCommand(redisClient *c) {
3885 pushGenericCommand(c,REDIS_TAIL);
3886}
3887
3888static void llenCommand(redisClient *c) {
3305306f 3889 robj *o;
ed9b544e 3890 list *l;
3891
3305306f 3892 o = lookupKeyRead(c->db,c->argv[1]);
3893 if (o == NULL) {
c937aa89 3894 addReply(c,shared.czero);
ed9b544e 3895 return;
3896 } else {
ed9b544e 3897 if (o->type != REDIS_LIST) {
c937aa89 3898 addReply(c,shared.wrongtypeerr);
ed9b544e 3899 } else {
3900 l = o->ptr;
c937aa89 3901 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
ed9b544e 3902 }
3903 }
3904}
3905
3906static void lindexCommand(redisClient *c) {
3305306f 3907 robj *o;
ed9b544e 3908 int index = atoi(c->argv[2]->ptr);
3909
3305306f 3910 o = lookupKeyRead(c->db,c->argv[1]);
3911 if (o == NULL) {
c937aa89 3912 addReply(c,shared.nullbulk);
ed9b544e 3913 } else {
ed9b544e 3914 if (o->type != REDIS_LIST) {
c937aa89 3915 addReply(c,shared.wrongtypeerr);
ed9b544e 3916 } else {
3917 list *list = o->ptr;
3918 listNode *ln;
3919
3920 ln = listIndex(list, index);
3921 if (ln == NULL) {
c937aa89 3922 addReply(c,shared.nullbulk);
ed9b544e 3923 } else {
3924 robj *ele = listNodeValue(ln);
942a3961 3925 addReplyBulkLen(c,ele);
ed9b544e 3926 addReply(c,ele);
3927 addReply(c,shared.crlf);
3928 }
3929 }
3930 }
3931}
3932
3933static void lsetCommand(redisClient *c) {
3305306f 3934 robj *o;
ed9b544e 3935 int index = atoi(c->argv[2]->ptr);
3936
3305306f 3937 o = lookupKeyWrite(c->db,c->argv[1]);
3938 if (o == NULL) {
ed9b544e 3939 addReply(c,shared.nokeyerr);
3940 } else {
ed9b544e 3941 if (o->type != REDIS_LIST) {
3942 addReply(c,shared.wrongtypeerr);
3943 } else {
3944 list *list = o->ptr;
3945 listNode *ln;
3946
3947 ln = listIndex(list, index);
3948 if (ln == NULL) {
c937aa89 3949 addReply(c,shared.outofrangeerr);
ed9b544e 3950 } else {
3951 robj *ele = listNodeValue(ln);
3952
3953 decrRefCount(ele);
3954 listNodeValue(ln) = c->argv[3];
3955 incrRefCount(c->argv[3]);
3956 addReply(c,shared.ok);
3957 server.dirty++;
3958 }
3959 }
3960 }
3961}
3962
3963static void popGenericCommand(redisClient *c, int where) {
3305306f 3964 robj *o;
3965
3966 o = lookupKeyWrite(c->db,c->argv[1]);
3967 if (o == NULL) {
c937aa89 3968 addReply(c,shared.nullbulk);
ed9b544e 3969 } else {
ed9b544e 3970 if (o->type != REDIS_LIST) {
c937aa89 3971 addReply(c,shared.wrongtypeerr);
ed9b544e 3972 } else {
3973 list *list = o->ptr;
3974 listNode *ln;
3975
3976 if (where == REDIS_HEAD)
3977 ln = listFirst(list);
3978 else
3979 ln = listLast(list);
3980
3981 if (ln == NULL) {
c937aa89 3982 addReply(c,shared.nullbulk);
ed9b544e 3983 } else {
3984 robj *ele = listNodeValue(ln);
942a3961 3985 addReplyBulkLen(c,ele);
ed9b544e 3986 addReply(c,ele);
3987 addReply(c,shared.crlf);
3988 listDelNode(list,ln);
3989 server.dirty++;
3990 }
3991 }
3992 }
3993}
3994
3995static void lpopCommand(redisClient *c) {
3996 popGenericCommand(c,REDIS_HEAD);
3997}
3998
3999static void rpopCommand(redisClient *c) {
4000 popGenericCommand(c,REDIS_TAIL);
4001}
4002
4003static void lrangeCommand(redisClient *c) {
3305306f 4004 robj *o;
ed9b544e 4005 int start = atoi(c->argv[2]->ptr);
4006 int end = atoi(c->argv[3]->ptr);
3305306f 4007
4008 o = lookupKeyRead(c->db,c->argv[1]);
4009 if (o == NULL) {
c937aa89 4010 addReply(c,shared.nullmultibulk);
ed9b544e 4011 } else {
ed9b544e 4012 if (o->type != REDIS_LIST) {
c937aa89 4013 addReply(c,shared.wrongtypeerr);
ed9b544e 4014 } else {
4015 list *list = o->ptr;
4016 listNode *ln;
4017 int llen = listLength(list);
4018 int rangelen, j;
4019 robj *ele;
4020
4021 /* convert negative indexes */
4022 if (start < 0) start = llen+start;
4023 if (end < 0) end = llen+end;
4024 if (start < 0) start = 0;
4025 if (end < 0) end = 0;
4026
4027 /* indexes sanity checks */
4028 if (start > end || start >= llen) {
4029 /* Out of range start or start > end result in empty list */
c937aa89 4030 addReply(c,shared.emptymultibulk);
ed9b544e 4031 return;
4032 }
4033 if (end >= llen) end = llen-1;
4034 rangelen = (end-start)+1;
4035
4036 /* Return the result in form of a multi-bulk reply */
4037 ln = listIndex(list, start);
c937aa89 4038 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
ed9b544e 4039 for (j = 0; j < rangelen; j++) {
4040 ele = listNodeValue(ln);
942a3961 4041 addReplyBulkLen(c,ele);
ed9b544e 4042 addReply(c,ele);
4043 addReply(c,shared.crlf);
4044 ln = ln->next;
4045 }
4046 }
4047 }
4048}
4049
4050static void ltrimCommand(redisClient *c) {
3305306f 4051 robj *o;
ed9b544e 4052 int start = atoi(c->argv[2]->ptr);
4053 int end = atoi(c->argv[3]->ptr);
4054
3305306f 4055 o = lookupKeyWrite(c->db,c->argv[1]);
4056 if (o == NULL) {
ab9d4cb1 4057 addReply(c,shared.ok);
ed9b544e 4058 } else {
ed9b544e 4059 if (o->type != REDIS_LIST) {
4060 addReply(c,shared.wrongtypeerr);
4061 } else {
4062 list *list = o->ptr;
4063 listNode *ln;
4064 int llen = listLength(list);
4065 int j, ltrim, rtrim;
4066
4067 /* convert negative indexes */
4068 if (start < 0) start = llen+start;
4069 if (end < 0) end = llen+end;
4070 if (start < 0) start = 0;
4071 if (end < 0) end = 0;
4072
4073 /* indexes sanity checks */
4074 if (start > end || start >= llen) {
4075 /* Out of range start or start > end result in empty list */
4076 ltrim = llen;
4077 rtrim = 0;
4078 } else {
4079 if (end >= llen) end = llen-1;
4080 ltrim = start;
4081 rtrim = llen-end-1;
4082 }
4083
4084 /* Remove list elements to perform the trim */
4085 for (j = 0; j < ltrim; j++) {
4086 ln = listFirst(list);
4087 listDelNode(list,ln);
4088 }
4089 for (j = 0; j < rtrim; j++) {
4090 ln = listLast(list);
4091 listDelNode(list,ln);
4092 }
ed9b544e 4093 server.dirty++;
e59229a2 4094 addReply(c,shared.ok);
ed9b544e 4095 }
4096 }
4097}
4098
4099static void lremCommand(redisClient *c) {
3305306f 4100 robj *o;
ed9b544e 4101
3305306f 4102 o = lookupKeyWrite(c->db,c->argv[1]);
4103 if (o == NULL) {
33c08b39 4104 addReply(c,shared.czero);
ed9b544e 4105 } else {
ed9b544e 4106 if (o->type != REDIS_LIST) {
c937aa89 4107 addReply(c,shared.wrongtypeerr);
ed9b544e 4108 } else {
4109 list *list = o->ptr;
4110 listNode *ln, *next;
4111 int toremove = atoi(c->argv[2]->ptr);
4112 int removed = 0;
4113 int fromtail = 0;
4114
4115 if (toremove < 0) {
4116 toremove = -toremove;
4117 fromtail = 1;
4118 }
4119 ln = fromtail ? list->tail : list->head;
4120 while (ln) {
ed9b544e 4121 robj *ele = listNodeValue(ln);
a4d1ba9a 4122
4123 next = fromtail ? ln->prev : ln->next;
724a51b1 4124 if (compareStringObjects(ele,c->argv[3]) == 0) {
ed9b544e 4125 listDelNode(list,ln);
4126 server.dirty++;
4127 removed++;
4128 if (toremove && removed == toremove) break;
4129 }
4130 ln = next;
4131 }
c937aa89 4132 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4133 }
4134 }
4135}
4136
12f9d551 4137/* This is the semantic of this command:
0f5f7e9a 4138 * RPOPLPUSH srclist dstlist:
12f9d551 4139 * IF LLEN(srclist) > 0
4140 * element = RPOP srclist
4141 * LPUSH dstlist element
4142 * RETURN element
4143 * ELSE
4144 * RETURN nil
4145 * END
4146 * END
4147 *
4148 * The idea is to be able to get an element from a list in a reliable way
4149 * since the element is not just returned but pushed against another list
4150 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4151 */
0f5f7e9a 4152static void rpoplpushcommand(redisClient *c) {
12f9d551 4153 robj *sobj;
4154
4155 sobj = lookupKeyWrite(c->db,c->argv[1]);
4156 if (sobj == NULL) {
4157 addReply(c,shared.nullbulk);
4158 } else {
4159 if (sobj->type != REDIS_LIST) {
4160 addReply(c,shared.wrongtypeerr);
4161 } else {
4162 list *srclist = sobj->ptr;
4163 listNode *ln = listLast(srclist);
4164
4165 if (ln == NULL) {
4166 addReply(c,shared.nullbulk);
4167 } else {
4168 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4169 robj *ele = listNodeValue(ln);
4170 list *dstlist;
4171
e20fb74f 4172 if (dobj && dobj->type != REDIS_LIST) {
12f9d551 4173 addReply(c,shared.wrongtypeerr);
4174 return;
4175 }
e20fb74f 4176
4177 /* Add the element to the target list (unless it's directly
4178 * passed to some BLPOP-ing client */
4179 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4180 if (dobj == NULL) {
4181 /* Create the list if the key does not exist */
4182 dobj = createListObject();
4183 dictAdd(c->db->dict,c->argv[2],dobj);
4184 incrRefCount(c->argv[2]);
4185 }
4186 dstlist = dobj->ptr;
4187 listAddNodeHead(dstlist,ele);
4188 incrRefCount(ele);
4189 }
12f9d551 4190
4191 /* Send the element to the client as reply as well */
4192 addReplyBulkLen(c,ele);
4193 addReply(c,ele);
4194 addReply(c,shared.crlf);
4195
4196 /* Finally remove the element from the source list */
4197 listDelNode(srclist,ln);
4198 server.dirty++;
4199 }
4200 }
4201 }
4202}
4203
4204
ed9b544e 4205/* ==================================== Sets ================================ */
4206
4207static void saddCommand(redisClient *c) {
ed9b544e 4208 robj *set;
4209
3305306f 4210 set = lookupKeyWrite(c->db,c->argv[1]);
4211 if (set == NULL) {
ed9b544e 4212 set = createSetObject();
3305306f 4213 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4214 incrRefCount(c->argv[1]);
4215 } else {
ed9b544e 4216 if (set->type != REDIS_SET) {
c937aa89 4217 addReply(c,shared.wrongtypeerr);
ed9b544e 4218 return;
4219 }
4220 }
4221 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4222 incrRefCount(c->argv[2]);
4223 server.dirty++;
c937aa89 4224 addReply(c,shared.cone);
ed9b544e 4225 } else {
c937aa89 4226 addReply(c,shared.czero);
ed9b544e 4227 }
4228}
4229
4230static void sremCommand(redisClient *c) {
3305306f 4231 robj *set;
ed9b544e 4232
3305306f 4233 set = lookupKeyWrite(c->db,c->argv[1]);
4234 if (set == NULL) {
c937aa89 4235 addReply(c,shared.czero);
ed9b544e 4236 } else {
ed9b544e 4237 if (set->type != REDIS_SET) {
c937aa89 4238 addReply(c,shared.wrongtypeerr);
ed9b544e 4239 return;
4240 }
4241 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4242 server.dirty++;
12fea928 4243 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
c937aa89 4244 addReply(c,shared.cone);
ed9b544e 4245 } else {
c937aa89 4246 addReply(c,shared.czero);
ed9b544e 4247 }
4248 }
4249}
4250
a4460ef4 4251static void smoveCommand(redisClient *c) {
4252 robj *srcset, *dstset;
4253
4254 srcset = lookupKeyWrite(c->db,c->argv[1]);
4255 dstset = lookupKeyWrite(c->db,c->argv[2]);
4256
4257 /* If the source key does not exist return 0, if it's of the wrong type
4258 * raise an error */
4259 if (srcset == NULL || srcset->type != REDIS_SET) {
4260 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4261 return;
4262 }
4263 /* Error if the destination key is not a set as well */
4264 if (dstset && dstset->type != REDIS_SET) {
4265 addReply(c,shared.wrongtypeerr);
4266 return;
4267 }
4268 /* Remove the element from the source set */
4269 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4270 /* Key not found in the src set! return zero */
4271 addReply(c,shared.czero);
4272 return;
4273 }
4274 server.dirty++;
4275 /* Add the element to the destination set */
4276 if (!dstset) {
4277 dstset = createSetObject();
4278 dictAdd(c->db->dict,c->argv[2],dstset);
4279 incrRefCount(c->argv[2]);
4280 }
4281 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4282 incrRefCount(c->argv[3]);
4283 addReply(c,shared.cone);
4284}
4285
ed9b544e 4286static void sismemberCommand(redisClient *c) {
3305306f 4287 robj *set;
ed9b544e 4288
3305306f 4289 set = lookupKeyRead(c->db,c->argv[1]);
4290 if (set == NULL) {
c937aa89 4291 addReply(c,shared.czero);
ed9b544e 4292 } else {
ed9b544e 4293 if (set->type != REDIS_SET) {
c937aa89 4294 addReply(c,shared.wrongtypeerr);
ed9b544e 4295 return;
4296 }
4297 if (dictFind(set->ptr,c->argv[2]))
c937aa89 4298 addReply(c,shared.cone);
ed9b544e 4299 else
c937aa89 4300 addReply(c,shared.czero);
ed9b544e 4301 }
4302}
4303
4304static void scardCommand(redisClient *c) {
3305306f 4305 robj *o;
ed9b544e 4306 dict *s;
4307
3305306f 4308 o = lookupKeyRead(c->db,c->argv[1]);
4309 if (o == NULL) {
c937aa89 4310 addReply(c,shared.czero);
ed9b544e 4311 return;
4312 } else {
ed9b544e 4313 if (o->type != REDIS_SET) {
c937aa89 4314 addReply(c,shared.wrongtypeerr);
ed9b544e 4315 } else {
4316 s = o->ptr;
682ac724 4317 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
3305306f 4318 dictSize(s)));
ed9b544e 4319 }
4320 }
4321}
4322
12fea928 4323static void spopCommand(redisClient *c) {
4324 robj *set;
4325 dictEntry *de;
4326
4327 set = lookupKeyWrite(c->db,c->argv[1]);
4328 if (set == NULL) {
4329 addReply(c,shared.nullbulk);
4330 } else {
4331 if (set->type != REDIS_SET) {
4332 addReply(c,shared.wrongtypeerr);
4333 return;
4334 }
4335 de = dictGetRandomKey(set->ptr);
4336 if (de == NULL) {
4337 addReply(c,shared.nullbulk);
4338 } else {
4339 robj *ele = dictGetEntryKey(de);
4340
942a3961 4341 addReplyBulkLen(c,ele);
12fea928 4342 addReply(c,ele);
4343 addReply(c,shared.crlf);
4344 dictDelete(set->ptr,ele);
4345 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4346 server.dirty++;
4347 }
4348 }
4349}
4350
2abb95a9 4351static void srandmemberCommand(redisClient *c) {
4352 robj *set;
4353 dictEntry *de;
4354
4355 set = lookupKeyRead(c->db,c->argv[1]);
4356 if (set == NULL) {
4357 addReply(c,shared.nullbulk);
4358 } else {
4359 if (set->type != REDIS_SET) {
4360 addReply(c,shared.wrongtypeerr);
4361 return;
4362 }
4363 de = dictGetRandomKey(set->ptr);
4364 if (de == NULL) {
4365 addReply(c,shared.nullbulk);
4366 } else {
4367 robj *ele = dictGetEntryKey(de);
4368
4369 addReplyBulkLen(c,ele);
4370 addReply(c,ele);
4371 addReply(c,shared.crlf);
4372 }
4373 }
4374}
4375
ed9b544e 4376static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4377 dict **d1 = (void*) s1, **d2 = (void*) s2;
4378
3305306f 4379 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4380}
4381
682ac724 4382static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4383 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4384 dictIterator *di;
4385 dictEntry *de;
4386 robj *lenobj = NULL, *dstset = NULL;
682ac724 4387 unsigned long j, cardinality = 0;
ed9b544e 4388
ed9b544e 4389 for (j = 0; j < setsnum; j++) {
4390 robj *setobj;
3305306f 4391
4392 setobj = dstkey ?
4393 lookupKeyWrite(c->db,setskeys[j]) :
4394 lookupKeyRead(c->db,setskeys[j]);
4395 if (!setobj) {
ed9b544e 4396 zfree(dv);
5faa6025 4397 if (dstkey) {
fdcaae84 4398 if (deleteKey(c->db,dstkey))
4399 server.dirty++;
0d36ded0 4400 addReply(c,shared.czero);
5faa6025 4401 } else {
4402 addReply(c,shared.nullmultibulk);
4403 }
ed9b544e 4404 return;
4405 }
ed9b544e 4406 if (setobj->type != REDIS_SET) {
4407 zfree(dv);
c937aa89 4408 addReply(c,shared.wrongtypeerr);
ed9b544e 4409 return;
4410 }
4411 dv[j] = setobj->ptr;
4412 }
4413 /* Sort sets from the smallest to largest, this will improve our
4414 * algorithm's performace */
4415 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4416
4417 /* The first thing we should output is the total number of elements...
4418 * since this is a multi-bulk write, but at this stage we don't know
4419 * the intersection set size, so we use a trick, append an empty object
4420 * to the output list and save the pointer to later modify it with the
4421 * right length */
4422 if (!dstkey) {
4423 lenobj = createObject(REDIS_STRING,NULL);
4424 addReply(c,lenobj);
4425 decrRefCount(lenobj);
4426 } else {
4427 /* If we have a target key where to store the resulting set
4428 * create this key with an empty set inside */
4429 dstset = createSetObject();
ed9b544e 4430 }
4431
4432 /* Iterate all the elements of the first (smallest) set, and test
4433 * the element against all the other sets, if at least one set does
4434 * not include the element it is discarded */
4435 di = dictGetIterator(dv[0]);
ed9b544e 4436
4437 while((de = dictNext(di)) != NULL) {
4438 robj *ele;
4439
4440 for (j = 1; j < setsnum; j++)
4441 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4442 if (j != setsnum)
4443 continue; /* at least one set does not contain the member */
4444 ele = dictGetEntryKey(de);
4445 if (!dstkey) {
942a3961 4446 addReplyBulkLen(c,ele);
ed9b544e 4447 addReply(c,ele);
4448 addReply(c,shared.crlf);
4449 cardinality++;
4450 } else {
4451 dictAdd(dstset->ptr,ele,NULL);
4452 incrRefCount(ele);
4453 }
4454 }
4455 dictReleaseIterator(di);
4456
83cdfe18
AG
4457 if (dstkey) {
4458 /* Store the resulting set into the target */
4459 deleteKey(c->db,dstkey);
4460 dictAdd(c->db->dict,dstkey,dstset);
4461 incrRefCount(dstkey);
4462 }
4463
40d224a9 4464 if (!dstkey) {
682ac724 4465 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4466 } else {
682ac724 4467 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4468 dictSize((dict*)dstset->ptr)));
40d224a9 4469 server.dirty++;
4470 }
ed9b544e 4471 zfree(dv);
4472}
4473
4474static void sinterCommand(redisClient *c) {
4475 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4476}
4477
4478static void sinterstoreCommand(redisClient *c) {
4479 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4480}
4481
f4f56e1d 4482#define REDIS_OP_UNION 0
4483#define REDIS_OP_DIFF 1
4484
4485static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4486 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4487 dictIterator *di;
4488 dictEntry *de;
f4f56e1d 4489 robj *dstset = NULL;
40d224a9 4490 int j, cardinality = 0;
4491
40d224a9 4492 for (j = 0; j < setsnum; j++) {
4493 robj *setobj;
4494
4495 setobj = dstkey ?
4496 lookupKeyWrite(c->db,setskeys[j]) :
4497 lookupKeyRead(c->db,setskeys[j]);
4498 if (!setobj) {
4499 dv[j] = NULL;
4500 continue;
4501 }
4502 if (setobj->type != REDIS_SET) {
4503 zfree(dv);
4504 addReply(c,shared.wrongtypeerr);
4505 return;
4506 }
4507 dv[j] = setobj->ptr;
4508 }
4509
4510 /* We need a temp set object to store our union. If the dstkey
4511 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4512 * this set object will be the resulting object to set into the target key*/
4513 dstset = createSetObject();
4514
40d224a9 4515 /* Iterate all the elements of all the sets, add every element a single
4516 * time to the result set */
4517 for (j = 0; j < setsnum; j++) {
51829ed3 4518 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4519 if (!dv[j]) continue; /* non existing keys are like empty sets */
4520
4521 di = dictGetIterator(dv[j]);
40d224a9 4522
4523 while((de = dictNext(di)) != NULL) {
4524 robj *ele;
4525
4526 /* dictAdd will not add the same element multiple times */
4527 ele = dictGetEntryKey(de);
f4f56e1d 4528 if (op == REDIS_OP_UNION || j == 0) {
4529 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4530 incrRefCount(ele);
40d224a9 4531 cardinality++;
4532 }
f4f56e1d 4533 } else if (op == REDIS_OP_DIFF) {
4534 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4535 cardinality--;
4536 }
40d224a9 4537 }
4538 }
4539 dictReleaseIterator(di);
51829ed3
AG
4540
4541 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
40d224a9 4542 }
4543
f4f56e1d 4544 /* Output the content of the resulting set, if not in STORE mode */
4545 if (!dstkey) {
4546 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4547 di = dictGetIterator(dstset->ptr);
f4f56e1d 4548 while((de = dictNext(di)) != NULL) {
4549 robj *ele;
4550
4551 ele = dictGetEntryKey(de);
942a3961 4552 addReplyBulkLen(c,ele);
f4f56e1d 4553 addReply(c,ele);
4554 addReply(c,shared.crlf);
4555 }
4556 dictReleaseIterator(di);
83cdfe18
AG
4557 } else {
4558 /* If we have a target key where to store the resulting set
4559 * create this key with the result set inside */
4560 deleteKey(c->db,dstkey);
4561 dictAdd(c->db->dict,dstkey,dstset);
4562 incrRefCount(dstkey);
f4f56e1d 4563 }
4564
4565 /* Cleanup */
40d224a9 4566 if (!dstkey) {
40d224a9 4567 decrRefCount(dstset);
4568 } else {
682ac724 4569 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4570 dictSize((dict*)dstset->ptr)));
40d224a9 4571 server.dirty++;
4572 }
4573 zfree(dv);
4574}
4575
4576static void sunionCommand(redisClient *c) {
f4f56e1d 4577 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4578}
4579
4580static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4581 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4582}
4583
4584static void sdiffCommand(redisClient *c) {
4585 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4586}
4587
4588static void sdiffstoreCommand(redisClient *c) {
4589 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4590}
4591
6b47e12e 4592/* ==================================== ZSets =============================== */
4593
4594/* ZSETs are ordered sets using two data structures to hold the same elements
4595 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4596 * data structure.
4597 *
4598 * The elements are added to a hash table mapping Redis objects to scores.
4599 * At the same time the elements are added to a skip list mapping scores
4600 * to Redis objects (so objects are sorted by scores in this "view"). */
4601
4602/* This skiplist implementation is almost a C translation of the original
4603 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4604 * Alternative to Balanced Trees", modified in three ways:
4605 * a) this implementation allows for repeated values.
4606 * b) the comparison is not just by key (our 'score') but by satellite data.
4607 * c) there is a back pointer, so it's a doubly linked list with the back
4608 * pointers being only at "level 1". This allows to traverse the list
4609 * from tail to head, useful for ZREVRANGE. */
4610
4611static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4612 zskiplistNode *zn = zmalloc(sizeof(*zn));
4613
4614 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4615 zn->score = score;
4616 zn->obj = obj;
4617 return zn;
4618}
4619
4620static zskiplist *zslCreate(void) {
4621 int j;
4622 zskiplist *zsl;
4623
4624 zsl = zmalloc(sizeof(*zsl));
4625 zsl->level = 1;
cc812361 4626 zsl->length = 0;
6b47e12e 4627 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4628 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++)
4629 zsl->header->forward[j] = NULL;
e3870fab 4630 zsl->header->backward = NULL;
4631 zsl->tail = NULL;
6b47e12e 4632 return zsl;
4633}
4634
fd8ccf44 4635static void zslFreeNode(zskiplistNode *node) {
4636 decrRefCount(node->obj);
ad807e6f 4637 zfree(node->forward);
fd8ccf44 4638 zfree(node);
4639}
4640
4641static void zslFree(zskiplist *zsl) {
ad807e6f 4642 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 4643
ad807e6f 4644 zfree(zsl->header->forward);
4645 zfree(zsl->header);
fd8ccf44 4646 while(node) {
599379dd 4647 next = node->forward[0];
fd8ccf44 4648 zslFreeNode(node);
4649 node = next;
4650 }
ad807e6f 4651 zfree(zsl);
fd8ccf44 4652}
4653
6b47e12e 4654static int zslRandomLevel(void) {
4655 int level = 1;
4656 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4657 level += 1;
4658 return level;
4659}
4660
4661static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4662 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4663 int i, level;
4664
4665 x = zsl->header;
4666 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4667 while (x->forward[i] &&
4668 (x->forward[i]->score < score ||
4669 (x->forward[i]->score == score &&
4670 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6b47e12e 4671 x = x->forward[i];
4672 update[i] = x;
4673 }
6b47e12e 4674 /* we assume the key is not already inside, since we allow duplicated
4675 * scores, and the re-insertion of score and redis object should never
4676 * happpen since the caller of zslInsert() should test in the hash table
4677 * if the element is already inside or not. */
4678 level = zslRandomLevel();
4679 if (level > zsl->level) {
4680 for (i = zsl->level; i < level; i++)
4681 update[i] = zsl->header;
4682 zsl->level = level;
4683 }
4684 x = zslCreateNode(level,score,obj);
4685 for (i = 0; i < level; i++) {
4686 x->forward[i] = update[i]->forward[i];
4687 update[i]->forward[i] = x;
4688 }
bb975144 4689 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 4690 if (x->forward[0])
4691 x->forward[0]->backward = x;
4692 else
4693 zsl->tail = x;
cc812361 4694 zsl->length++;
6b47e12e 4695}
4696
50c55df5 4697/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 4698static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 4699 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4700 int i;
4701
4702 x = zsl->header;
4703 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4704 while (x->forward[i] &&
4705 (x->forward[i]->score < score ||
4706 (x->forward[i]->score == score &&
4707 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 4708 x = x->forward[i];
4709 update[i] = x;
4710 }
4711 /* We may have multiple elements with the same score, what we need
4712 * is to find the element with both the right score and object. */
4713 x = x->forward[0];
50c55df5 4714 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
9d60e6e4 4715 for (i = 0; i < zsl->level; i++) {
4716 if (update[i]->forward[i] != x) break;
4717 update[i]->forward[i] = x->forward[i];
4718 }
4719 if (x->forward[0]) {
4720 x->forward[0]->backward = (x->backward == zsl->header) ?
4721 NULL : x->backward;
e197b441 4722 } else {
9d60e6e4 4723 zsl->tail = x->backward;
e197b441 4724 }
9d60e6e4 4725 zslFreeNode(x);
4726 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4727 zsl->level--;
4728 zsl->length--;
4729 return 1;
4730 } else {
4731 return 0; /* not found */
e197b441 4732 }
4733 return 0; /* not found */
fd8ccf44 4734}
4735
1807985b 4736/* Delete all the elements with score between min and max from the skiplist.
4737 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
4738 * Note that this function takes the reference to the hash table view of the
4739 * sorted set, in order to remove the elements from the hash table too. */
4740static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
4741 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4742 unsigned long removed = 0;
4743 int i;
4744
4745 x = zsl->header;
4746 for (i = zsl->level-1; i >= 0; i--) {
4747 while (x->forward[i] && x->forward[i]->score < min)
4748 x = x->forward[i];
4749 update[i] = x;
4750 }
4751 /* We may have multiple elements with the same score, what we need
4752 * is to find the element with both the right score and object. */
4753 x = x->forward[0];
4754 while (x && x->score <= max) {
4755 zskiplistNode *next;
4756
4757 for (i = 0; i < zsl->level; i++) {
4758 if (update[i]->forward[i] != x) break;
4759 update[i]->forward[i] = x->forward[i];
4760 }
4761 if (x->forward[0]) {
4762 x->forward[0]->backward = (x->backward == zsl->header) ?
4763 NULL : x->backward;
4764 } else {
4765 zsl->tail = x->backward;
4766 }
4767 next = x->forward[0];
4768 dictDelete(dict,x->obj);
4769 zslFreeNode(x);
4770 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4771 zsl->level--;
4772 zsl->length--;
4773 removed++;
4774 x = next;
4775 }
4776 return removed; /* not found */
4777}
4778
50c55df5 4779/* Find the first node having a score equal or greater than the specified one.
4780 * Returns NULL if there is no match. */
4781static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
4782 zskiplistNode *x;
4783 int i;
4784
4785 x = zsl->header;
4786 for (i = zsl->level-1; i >= 0; i--) {
4787 while (x->forward[i] && x->forward[i]->score < score)
4788 x = x->forward[i];
4789 }
4790 /* We may have multiple elements with the same score, what we need
4791 * is to find the element with both the right score and object. */
4792 return x->forward[0];
4793}
4794
fd8ccf44 4795/* The actual Z-commands implementations */
4796
7db723ad 4797/* This generic command implements both ZADD and ZINCRBY.
e2665397 4798 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 4799 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 4800static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 4801 robj *zsetobj;
4802 zset *zs;
4803 double *score;
4804
e2665397 4805 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 4806 if (zsetobj == NULL) {
4807 zsetobj = createZsetObject();
e2665397 4808 dictAdd(c->db->dict,key,zsetobj);
4809 incrRefCount(key);
fd8ccf44 4810 } else {
4811 if (zsetobj->type != REDIS_ZSET) {
4812 addReply(c,shared.wrongtypeerr);
4813 return;
4814 }
4815 }
fd8ccf44 4816 zs = zsetobj->ptr;
e2665397 4817
7db723ad 4818 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 4819 * needs to handle the two different conditions. It's all about setting
4820 * '*score', that is, the new score to set, to the right value. */
4821 score = zmalloc(sizeof(double));
4822 if (doincrement) {
4823 dictEntry *de;
4824
4825 /* Read the old score. If the element was not present starts from 0 */
4826 de = dictFind(zs->dict,ele);
4827 if (de) {
4828 double *oldscore = dictGetEntryVal(de);
4829 *score = *oldscore + scoreval;
4830 } else {
4831 *score = scoreval;
4832 }
4833 } else {
4834 *score = scoreval;
4835 }
4836
4837 /* What follows is a simple remove and re-insert operation that is common
7db723ad 4838 * to both ZADD and ZINCRBY... */
e2665397 4839 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 4840 /* case 1: New element */
e2665397 4841 incrRefCount(ele); /* added to hash */
4842 zslInsert(zs->zsl,*score,ele);
4843 incrRefCount(ele); /* added to skiplist */
fd8ccf44 4844 server.dirty++;
e2665397 4845 if (doincrement)
e2665397 4846 addReplyDouble(c,*score);
91d71bfc 4847 else
4848 addReply(c,shared.cone);
fd8ccf44 4849 } else {
4850 dictEntry *de;
4851 double *oldscore;
4852
4853 /* case 2: Score update operation */
e2665397 4854 de = dictFind(zs->dict,ele);
dfc5e96c 4855 redisAssert(de != NULL);
fd8ccf44 4856 oldscore = dictGetEntryVal(de);
4857 if (*score != *oldscore) {
4858 int deleted;
4859
e2665397 4860 /* Remove and insert the element in the skip list with new score */
4861 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 4862 redisAssert(deleted != 0);
e2665397 4863 zslInsert(zs->zsl,*score,ele);
4864 incrRefCount(ele);
4865 /* Update the score in the hash table */
4866 dictReplace(zs->dict,ele,score);
fd8ccf44 4867 server.dirty++;
2161a965 4868 } else {
4869 zfree(score);
fd8ccf44 4870 }
e2665397 4871 if (doincrement)
4872 addReplyDouble(c,*score);
4873 else
4874 addReply(c,shared.czero);
fd8ccf44 4875 }
4876}
4877
e2665397 4878static void zaddCommand(redisClient *c) {
4879 double scoreval;
4880
4881 scoreval = strtod(c->argv[2]->ptr,NULL);
4882 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
4883}
4884
7db723ad 4885static void zincrbyCommand(redisClient *c) {
e2665397 4886 double scoreval;
4887
4888 scoreval = strtod(c->argv[2]->ptr,NULL);
4889 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
4890}
4891
1b7106e7 4892static void zremCommand(redisClient *c) {
4893 robj *zsetobj;
4894 zset *zs;
4895
4896 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
4897 if (zsetobj == NULL) {
4898 addReply(c,shared.czero);
4899 } else {
4900 dictEntry *de;
4901 double *oldscore;
4902 int deleted;
4903
4904 if (zsetobj->type != REDIS_ZSET) {
4905 addReply(c,shared.wrongtypeerr);
4906 return;
4907 }
4908 zs = zsetobj->ptr;
4909 de = dictFind(zs->dict,c->argv[2]);
4910 if (de == NULL) {
4911 addReply(c,shared.czero);
4912 return;
4913 }
4914 /* Delete from the skiplist */
4915 oldscore = dictGetEntryVal(de);
4916 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
dfc5e96c 4917 redisAssert(deleted != 0);
1b7106e7 4918
4919 /* Delete from the hash table */
4920 dictDelete(zs->dict,c->argv[2]);
4921 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
4922 server.dirty++;
4923 addReply(c,shared.cone);
4924 }
4925}
4926
1807985b 4927static void zremrangebyscoreCommand(redisClient *c) {
4928 double min = strtod(c->argv[2]->ptr,NULL);
4929 double max = strtod(c->argv[3]->ptr,NULL);
4930 robj *zsetobj;
4931 zset *zs;
4932
4933 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
4934 if (zsetobj == NULL) {
4935 addReply(c,shared.czero);
4936 } else {
4937 long deleted;
4938
4939 if (zsetobj->type != REDIS_ZSET) {
4940 addReply(c,shared.wrongtypeerr);
4941 return;
4942 }
4943 zs = zsetobj->ptr;
4944 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
4945 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
4946 server.dirty += deleted;
4947 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
4948 }
4949}
4950
e3870fab 4951static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 4952 robj *o;
4953 int start = atoi(c->argv[2]->ptr);
4954 int end = atoi(c->argv[3]->ptr);
752da584 4955 int withscores = 0;
4956
4957 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
4958 withscores = 1;
4959 } else if (c->argc >= 5) {
4960 addReply(c,shared.syntaxerr);
4961 return;
4962 }
cc812361 4963
4964 o = lookupKeyRead(c->db,c->argv[1]);
4965 if (o == NULL) {
4966 addReply(c,shared.nullmultibulk);
4967 } else {
4968 if (o->type != REDIS_ZSET) {
4969 addReply(c,shared.wrongtypeerr);
4970 } else {
4971 zset *zsetobj = o->ptr;
4972 zskiplist *zsl = zsetobj->zsl;
4973 zskiplistNode *ln;
4974
4975 int llen = zsl->length;
4976 int rangelen, j;
4977 robj *ele;
4978
4979 /* convert negative indexes */
4980 if (start < 0) start = llen+start;
4981 if (end < 0) end = llen+end;
4982 if (start < 0) start = 0;
4983 if (end < 0) end = 0;
4984
4985 /* indexes sanity checks */
4986 if (start > end || start >= llen) {
4987 /* Out of range start or start > end result in empty list */
4988 addReply(c,shared.emptymultibulk);
4989 return;
4990 }
4991 if (end >= llen) end = llen-1;
4992 rangelen = (end-start)+1;
4993
4994 /* Return the result in form of a multi-bulk reply */
e3870fab 4995 if (reverse) {
4996 ln = zsl->tail;
4997 while (start--)
4998 ln = ln->backward;
4999 } else {
5000 ln = zsl->header->forward[0];
5001 while (start--)
5002 ln = ln->forward[0];
5003 }
cc812361 5004
752da584 5005 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5006 withscores ? (rangelen*2) : rangelen));
cc812361 5007 for (j = 0; j < rangelen; j++) {
0aad7a19 5008 ele = ln->obj;
cc812361 5009 addReplyBulkLen(c,ele);
5010 addReply(c,ele);
5011 addReply(c,shared.crlf);
752da584 5012 if (withscores)
5013 addReplyDouble(c,ln->score);
e3870fab 5014 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5015 }
5016 }
5017 }
5018}
5019
e3870fab 5020static void zrangeCommand(redisClient *c) {
5021 zrangeGenericCommand(c,0);
5022}
5023
5024static void zrevrangeCommand(redisClient *c) {
5025 zrangeGenericCommand(c,1);
5026}
5027
50c55df5 5028static void zrangebyscoreCommand(redisClient *c) {
5029 robj *o;
5030 double min = strtod(c->argv[2]->ptr,NULL);
5031 double max = strtod(c->argv[3]->ptr,NULL);
80181f78 5032 int offset = 0, limit = -1;
5033
5034 if (c->argc != 4 && c->argc != 7) {
454d4e43 5035 addReplySds(c,
5036 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5037 return;
5038 } else if (c->argc == 7 && strcasecmp(c->argv[4]->ptr,"limit")) {
5039 addReply(c,shared.syntaxerr);
5040 return;
5041 } else if (c->argc == 7) {
5042 offset = atoi(c->argv[5]->ptr);
5043 limit = atoi(c->argv[6]->ptr);
0b13687c 5044 if (offset < 0) offset = 0;
80181f78 5045 }
50c55df5 5046
5047 o = lookupKeyRead(c->db,c->argv[1]);
5048 if (o == NULL) {
5049 addReply(c,shared.nullmultibulk);
5050 } else {
5051 if (o->type != REDIS_ZSET) {
5052 addReply(c,shared.wrongtypeerr);
5053 } else {
5054 zset *zsetobj = o->ptr;
5055 zskiplist *zsl = zsetobj->zsl;
5056 zskiplistNode *ln;
5057 robj *ele, *lenobj;
5058 unsigned int rangelen = 0;
5059
5060 /* Get the first node with the score >= min */
5061 ln = zslFirstWithScore(zsl,min);
5062 if (ln == NULL) {
5063 /* No element matching the speciifed interval */
5064 addReply(c,shared.emptymultibulk);
5065 return;
5066 }
5067
5068 /* We don't know in advance how many matching elements there
5069 * are in the list, so we push this object that will represent
5070 * the multi-bulk length in the output buffer, and will "fix"
5071 * it later */
5072 lenobj = createObject(REDIS_STRING,NULL);
5073 addReply(c,lenobj);
c74e7c77 5074 decrRefCount(lenobj);
50c55df5 5075
dbbc7285 5076 while(ln && ln->score <= max) {
80181f78 5077 if (offset) {
5078 offset--;
5079 ln = ln->forward[0];
5080 continue;
5081 }
5082 if (limit == 0) break;
50c55df5 5083 ele = ln->obj;
5084 addReplyBulkLen(c,ele);
5085 addReply(c,ele);
5086 addReply(c,shared.crlf);
5087 ln = ln->forward[0];
5088 rangelen++;
80181f78 5089 if (limit > 0) limit--;
50c55df5 5090 }
5091 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",rangelen);
5092 }
5093 }
5094}
5095
3c41331e 5096static void zcardCommand(redisClient *c) {
e197b441 5097 robj *o;
5098 zset *zs;
5099
5100 o = lookupKeyRead(c->db,c->argv[1]);
5101 if (o == NULL) {
5102 addReply(c,shared.czero);
5103 return;
5104 } else {
5105 if (o->type != REDIS_ZSET) {
5106 addReply(c,shared.wrongtypeerr);
5107 } else {
5108 zs = o->ptr;
682ac724 5109 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
e197b441 5110 }
5111 }
5112}
5113
6e333bbe 5114static void zscoreCommand(redisClient *c) {
5115 robj *o;
5116 zset *zs;
5117
5118 o = lookupKeyRead(c->db,c->argv[1]);
5119 if (o == NULL) {
96d8b4ee 5120 addReply(c,shared.nullbulk);
6e333bbe 5121 return;
5122 } else {
5123 if (o->type != REDIS_ZSET) {
5124 addReply(c,shared.wrongtypeerr);
5125 } else {
5126 dictEntry *de;
5127
5128 zs = o->ptr;
5129 de = dictFind(zs->dict,c->argv[2]);
5130 if (!de) {
5131 addReply(c,shared.nullbulk);
5132 } else {
6e333bbe 5133 double *score = dictGetEntryVal(de);
5134
e2665397 5135 addReplyDouble(c,*score);
6e333bbe 5136 }
5137 }
5138 }
5139}
5140
6b47e12e 5141/* ========================= Non type-specific commands ==================== */
5142
ed9b544e 5143static void flushdbCommand(redisClient *c) {
ca37e9cd 5144 server.dirty += dictSize(c->db->dict);
3305306f 5145 dictEmpty(c->db->dict);
5146 dictEmpty(c->db->expires);
ed9b544e 5147 addReply(c,shared.ok);
ed9b544e 5148}
5149
5150static void flushallCommand(redisClient *c) {
ca37e9cd 5151 server.dirty += emptyDb();
ed9b544e 5152 addReply(c,shared.ok);
f78fd11b 5153 rdbSave(server.dbfilename);
ca37e9cd 5154 server.dirty++;
ed9b544e 5155}
5156
56906eef 5157static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 5158 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 5159 so->type = type;
5160 so->pattern = pattern;
5161 return so;
5162}
5163
5164/* Return the value associated to the key with a name obtained
5165 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 5166static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 5167 char *p;
5168 sds spat, ssub;
5169 robj keyobj;
5170 int prefixlen, sublen, postfixlen;
ed9b544e 5171 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5172 struct {
f1017b3f 5173 long len;
5174 long free;
ed9b544e 5175 char buf[REDIS_SORTKEY_MAX+1];
5176 } keyname;
5177
28173a49 5178 /* If the pattern is "#" return the substitution object itself in order
5179 * to implement the "SORT ... GET #" feature. */
5180 spat = pattern->ptr;
5181 if (spat[0] == '#' && spat[1] == '\0') {
5182 return subst;
5183 }
5184
5185 /* The substitution object may be specially encoded. If so we create
9d65a1bb 5186 * a decoded object on the fly. Otherwise getDecodedObject will just
5187 * increment the ref count, that we'll decrement later. */
5188 subst = getDecodedObject(subst);
942a3961 5189
ed9b544e 5190 ssub = subst->ptr;
5191 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5192 p = strchr(spat,'*');
ed5a857a 5193 if (!p) {
5194 decrRefCount(subst);
5195 return NULL;
5196 }
ed9b544e 5197
5198 prefixlen = p-spat;
5199 sublen = sdslen(ssub);
5200 postfixlen = sdslen(spat)-(prefixlen+1);
5201 memcpy(keyname.buf,spat,prefixlen);
5202 memcpy(keyname.buf+prefixlen,ssub,sublen);
5203 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5204 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5205 keyname.len = prefixlen+sublen+postfixlen;
5206
dfc5e96c 5207 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 5208 decrRefCount(subst);
5209
a4d1ba9a 5210 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 5211 return lookupKeyRead(db,&keyobj);
ed9b544e 5212}
5213
5214/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5215 * the additional parameter is not standard but a BSD-specific we have to
5216 * pass sorting parameters via the global 'server' structure */
5217static int sortCompare(const void *s1, const void *s2) {
5218 const redisSortObject *so1 = s1, *so2 = s2;
5219 int cmp;
5220
5221 if (!server.sort_alpha) {
5222 /* Numeric sorting. Here it's trivial as we precomputed scores */
5223 if (so1->u.score > so2->u.score) {
5224 cmp = 1;
5225 } else if (so1->u.score < so2->u.score) {
5226 cmp = -1;
5227 } else {
5228 cmp = 0;
5229 }
5230 } else {
5231 /* Alphanumeric sorting */
5232 if (server.sort_bypattern) {
5233 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5234 /* At least one compare object is NULL */
5235 if (so1->u.cmpobj == so2->u.cmpobj)
5236 cmp = 0;
5237 else if (so1->u.cmpobj == NULL)
5238 cmp = -1;
5239 else
5240 cmp = 1;
5241 } else {
5242 /* We have both the objects, use strcoll */
5243 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5244 }
5245 } else {
5246 /* Compare elements directly */
9d65a1bb 5247 robj *dec1, *dec2;
5248
5249 dec1 = getDecodedObject(so1->obj);
5250 dec2 = getDecodedObject(so2->obj);
5251 cmp = strcoll(dec1->ptr,dec2->ptr);
5252 decrRefCount(dec1);
5253 decrRefCount(dec2);
ed9b544e 5254 }
5255 }
5256 return server.sort_desc ? -cmp : cmp;
5257}
5258
5259/* The SORT command is the most complex command in Redis. Warning: this code
5260 * is optimized for speed and a bit less for readability */
/* SORT key [ASC|DESC] [ALPHA] [LIMIT start count] [BY pattern]
 *          [GET pattern ...] [STORE dstkey]
 *
 * Steps performed, in order:
 * 1) Look up argv[1]; it must be a list, a set or a sorted set.
 * 2) Parse the options; any unrecognized token is a syntax error.
 * 3) Copy all the elements into 'vector'.
 * 4) Unless the BY pattern is constant (no '*'), load the sorting weights
 *    and sort the vector with sortCompare().
 * 5) Emit the LIMIT-ed range to the client, or store it as a list at the
 *    STORE key, applying the GET patterns if any.
 * 6) Release the protected value, the operations list and the vector. */
5261static void sortCommand(redisClient *c) {
ed9b544e 5262 list *operations;
5263 int outputlen = 0;
5264 int desc = 0, alpha = 0;
5265 int limit_start = 0, limit_count = -1, start, end;
5266 int j, dontsort = 0, vectorlen;
5267 int getop = 0; /* GET operation counter */
443c6409 5268 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 5269 redisSortObject *vector; /* Resulting vector to sort */
5270
5271 /* Lookup the key to sort. It must be of the right types */
3305306f 5272 sortval = lookupKeyRead(c->db,c->argv[1]);
5273 if (sortval == NULL) {
d922ae65 5274 addReply(c,shared.nullmultibulk);
ed9b544e 5275 return;
5276 }
a5eb649b 5277 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5278 sortval->type != REDIS_ZSET)
5279 {
c937aa89 5280 addReply(c,shared.wrongtypeerr);
ed9b544e 5281 return;
5282 }
5283
5284 /* Create a list of operations to perform for every sorted element.
5285 * Operations can be GET/DEL/INCR/DECR */
/* Only REDIS_SORT_GET operations are ever created by the parser below. */
5286 operations = listCreate();
092dac2a 5287 listSetFreeMethod(operations,zfree);
ed9b544e 5288 j = 2;
5289
5290 /* Now we need to protect sortval incrementing its count, in the future
5291 * SORT may have options able to overwrite/delete keys during the sorting
5292 * and the sorted key itself may get destroyed */
5293 incrRefCount(sortval);
5294
5295 /* The SORT command has an SQL-alike syntax, parse it */
5296 while(j < c->argc) {
/* Number of arguments that follow the current token. */
5297 int leftargs = c->argc-j-1;
5298 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5299 desc = 0;
5300 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5301 desc = 1;
5302 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5303 alpha = 1;
5304 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5305 limit_start = atoi(c->argv[j+1]->ptr);
5306 limit_count = atoi(c->argv[j+2]->ptr);
5307 j+=2;
443c6409 5308 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5309 storekey = c->argv[j+1];
5310 j++;
ed9b544e 5311 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5312 sortby = c->argv[j+1];
5313 /* If the BY pattern does not contain '*', i.e. it is constant,
5314 * we don't need to sort nor to lookup the weight keys. */
5315 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5316 j++;
5317 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5318 listAddNodeTail(operations,createSortOperation(
5319 REDIS_SORT_GET,c->argv[j+1]));
5320 getop++;
5321 j++;
ed9b544e 5322 } else {
/* Unknown option, or option missing its arguments: undo the setup. */
5323 decrRefCount(sortval);
5324 listRelease(operations);
c937aa89 5325 addReply(c,shared.syntaxerr);
ed9b544e 5326 return;
5327 }
5328 j++;
5329 }
5330
5331 /* Load the sorting vector with all the objects to sort */
a5eb649b 5332 switch(sortval->type) {
5333 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5334 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5335 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 5336 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 5337 }
ed9b544e 5338 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 5339 j = 0;
a5eb649b 5340
ed9b544e 5341 if (sortval->type == REDIS_LIST) {
5342 list *list = sortval->ptr;
6208b3a7 5343 listNode *ln;
5344
5345 listRewind(list);
5346 while((ln = listYield(list))) {
ed9b544e 5347 robj *ele = ln->value;
5348 vector[j].obj = ele;
5349 vector[j].u.score = 0;
5350 vector[j].u.cmpobj = NULL;
ed9b544e 5351 j++;
5352 }
5353 } else {
a5eb649b 5354 dict *set;
ed9b544e 5355 dictIterator *di;
5356 dictEntry *setele;
5357
/* Sets and sorted sets are both iterated through a hash table; for a
 * sorted set it is the dict field of the zset. */
a5eb649b 5358 if (sortval->type == REDIS_SET) {
5359 set = sortval->ptr;
5360 } else {
5361 zset *zs = sortval->ptr;
5362 set = zs->dict;
5363 }
5364
ed9b544e 5365 di = dictGetIterator(set);
ed9b544e 5366 while((setele = dictNext(di)) != NULL) {
5367 vector[j].obj = dictGetEntryKey(setele);
5368 vector[j].u.score = 0;
5369 vector[j].u.cmpobj = NULL;
5370 j++;
5371 }
5372 dictReleaseIterator(di);
5373 }
dfc5e96c 5374 redisAssert(j == vectorlen);
ed9b544e 5375
5376 /* Now it's time to load the right scores in the sorting vector */
5377 if (dontsort == 0) {
5378 for (j = 0; j < vectorlen; j++) {
5379 if (sortby) {
5380 robj *byval;
5381
3305306f 5382 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* Missing or non-string weight: the element keeps the zero score /
 * NULL compare object set while loading the vector. */
ed9b544e 5383 if (!byval || byval->type != REDIS_STRING) continue;
5384 if (alpha) {
/* This reference is released in the cleanup loop at the end. */
9d65a1bb 5385 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 5386 } else {
942a3961 5387 if (byval->encoding == REDIS_ENCODING_RAW) {
5388 vector[j].u.score = strtod(byval->ptr,NULL);
5389 } else {
9d65a1bb 5390 /* Don't need to decode the object if it's
5391 * integer-encoded (the only encoding supported) so
5392 * far. We can just cast it */
f1017b3f 5393 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 5394 vector[j].u.score = (long)byval->ptr;
f1017b3f 5395 } else
dfc5e96c 5396 redisAssert(1 != 1);
942a3961 5397 }
ed9b544e 5398 }
5399 } else {
/* No BY pattern: numeric sorting uses the element itself as the score,
 * while ALPHA sorting compares the elements directly in sortCompare(). */
942a3961 5400 if (!alpha) {
5401 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5402 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5403 else {
5404 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5405 vector[j].u.score = (long) vector[j].obj->ptr;
5406 else
dfc5e96c 5407 redisAssert(1 != 1);
942a3961 5408 }
5409 }
ed9b544e 5410 }
5411 }
5412 }
5413
5414 /* We are ready to sort the vector... perform a bit of sanity check
5415 * on the LIMIT option too. We'll use a partial version of quicksort. */
5416 start = (limit_start < 0) ? 0 : limit_start;
5417 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
/* A start past the last element yields end == start-1, that is an empty
 * range: the output loops below do not run and outputlen is 0. */
5418 if (start >= vectorlen) {
5419 start = vectorlen-1;
5420 end = vectorlen-2;
5421 }
5422 if (end >= vectorlen) end = vectorlen-1;
5423
5424 if (dontsort == 0) {
/* sortCompare() reads its parameters from these globals. */
5425 server.sort_desc = desc;
5426 server.sort_alpha = alpha;
5427 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 5428 if (sortby && (start != 0 || end != vectorlen-1))
5429 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5430 else
5431 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 5432 }
5433
5434 /* Send command output to the output buffer, performing the specified
5435 * GET/DEL/INCR/DECR operations if any. */
/* With GET patterns, every element in range produces one item per GET. */
5436 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 5437 if (storekey == NULL) {
5438 /* STORE option not specified, send the sorting result to client */
5439 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5440 for (j = start; j <= end; j++) {
5441 listNode *ln;
5442 if (!getop) {
5443 addReplyBulkLen(c,vector[j].obj);
5444 addReply(c,vector[j].obj);
5445 addReply(c,shared.crlf);
5446 }
5447 listRewind(operations);
5448 while((ln = listYield(operations))) {
5449 redisSortOperation *sop = ln->value;
5450 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5451 vector[j].obj);
5452
5453 if (sop->type == REDIS_SORT_GET) {
5454 if (!val || val->type != REDIS_STRING) {
5455 addReply(c,shared.nullbulk);
5456 } else {
5457 addReplyBulkLen(c,val);
5458 addReply(c,val);
5459 addReply(c,shared.crlf);
5460 }
5461 } else {
dfc5e96c 5462 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 5463 }
5464 }
ed9b544e 5465 }
443c6409 5466 } else {
5467 robj *listObject = createListObject();
5468 list *listPtr = (list*) listObject->ptr;
5469
5470 /* STORE option specified, set the sorting result as a List object */
5471 for (j = start; j <= end; j++) {
5472 listNode *ln;
5473 if (!getop) {
5474 listAddNodeTail(listPtr,vector[j].obj);
5475 incrRefCount(vector[j].obj);
5476 }
5477 listRewind(operations);
5478 while((ln = listYield(operations))) {
5479 redisSortOperation *sop = ln->value;
5480 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5481 vector[j].obj);
5482
5483 if (sop->type == REDIS_SORT_GET) {
5484 if (!val || val->type != REDIS_STRING) {
/* A missing GET value is stored as an empty string. */
5485 listAddNodeTail(listPtr,createStringObject("",0));
5486 } else {
5487 listAddNodeTail(listPtr,val);
5488 incrRefCount(val);
5489 }
ed9b544e 5490 } else {
dfc5e96c 5491 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 5492 }
ed9b544e 5493 }
ed9b544e 5494 }
/* NOTE(review): dictReplace() presumably returns non-zero only when the
 * key was newly added, hence the conditional reference - confirm. */
121796f7 5495 if (dictReplace(c->db->dict,storekey,listObject)) {
5496 incrRefCount(storekey);
5497 }
443c6409 5498 /* Note: we add 1 because the DB is dirty anyway since even if the
5499 * SORT result is empty a new key is set and maybe the old content
5500 * replaced. */
5501 server.dirty += 1+outputlen;
5502 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 5503 }
5504
5505 /* Cleanup */
5506 decrRefCount(sortval);
5507 listRelease(operations);
5508 for (j = 0; j < vectorlen; j++) {
/* Compare objects exist only for ALPHA sorting with a BY pattern. */
5509 if (sortby && alpha && vector[j].u.cmpobj)
5510 decrRefCount(vector[j].u.cmpobj);
5511 }
5512 zfree(vector);
5513}
5514
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer: the result is always written there as a
 * NUL terminated string and never needs more than 13 bytes.
 * 'n' is the number of bytes to render.
 *
 * Values below 1024 are printed as an integer followed by 'B'; bigger
 * values are divided by the largest fitting power of 1024 and printed
 * with two decimal digits followed by K, M, G or T. Everything from one
 * terabyte up is expressed in terabytes, so the buffer is filled for
 * every possible input. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5535
1c85b79f 5536/* Create the string returned by the INFO command. This is decoupled
5537 * by the INFO command itself as we need to report the same information
5538 * on memory corruption problems. */
5539static sds genRedisInfoString(void) {
ed9b544e 5540 sds info;
5541 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 5542 int j;
ec6c7a1d 5543 char hmem[64];
5544
5545 bytesToHuman(hmem,server.usedmemory);
ed9b544e 5546 info = sdscatprintf(sdsempty(),
5547 "redis_version:%s\r\n"
f1017b3f 5548 "arch_bits:%s\r\n"
7a932b74 5549 "multiplexing_api:%s\r\n"
0d7170a4 5550 "process_id:%ld\r\n"
682ac724 5551 "uptime_in_seconds:%ld\r\n"
5552 "uptime_in_days:%ld\r\n"
ed9b544e 5553 "connected_clients:%d\r\n"
5554 "connected_slaves:%d\r\n"
f86a74e9 5555 "blocked_clients:%d\r\n"
5fba9f71 5556 "used_memory:%zu\r\n"
ec6c7a1d 5557 "used_memory_human:%s\r\n"
ed9b544e 5558 "changes_since_last_save:%lld\r\n"
be2bb6b0 5559 "bgsave_in_progress:%d\r\n"
682ac724 5560 "last_save_time:%ld\r\n"
b3fad521 5561 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 5562 "total_connections_received:%lld\r\n"
5563 "total_commands_processed:%lld\r\n"
7d98e08c 5564 "vm_enabled:%d\r\n"
a0f643ea 5565 "role:%s\r\n"
ed9b544e 5566 ,REDIS_VERSION,
f1017b3f 5567 (sizeof(long) == 8) ? "64" : "32",
7a932b74 5568 aeGetApiName(),
0d7170a4 5569 (long) getpid(),
a0f643ea 5570 uptime,
5571 uptime/(3600*24),
ed9b544e 5572 listLength(server.clients)-listLength(server.slaves),
5573 listLength(server.slaves),
f86a74e9 5574 server.blockedclients,
ed9b544e 5575 server.usedmemory,
ec6c7a1d 5576 hmem,
ed9b544e 5577 server.dirty,
9d65a1bb 5578 server.bgsavechildpid != -1,
ed9b544e 5579 server.lastsave,
b3fad521 5580 server.bgrewritechildpid != -1,
ed9b544e 5581 server.stat_numconnections,
5582 server.stat_numcommands,
7d98e08c 5583 server.vm_enabled != 0,
a0f643ea 5584 server.masterhost == NULL ? "master" : "slave"
ed9b544e 5585 );
a0f643ea 5586 if (server.masterhost) {
5587 info = sdscatprintf(info,
5588 "master_host:%s\r\n"
5589 "master_port:%d\r\n"
5590 "master_link_status:%s\r\n"
5591 "master_last_io_seconds_ago:%d\r\n"
5592 ,server.masterhost,
5593 server.masterport,
5594 (server.replstate == REDIS_REPL_CONNECTED) ?
5595 "up" : "down",
f72b934d 5596 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 5597 );
5598 }
7d98e08c 5599 if (server.vm_enabled) {
5600 info = sdscatprintf(info,
5601 "vm_conf_max_memory:%llu\r\n"
5602 "vm_conf_page_size:%llu\r\n"
5603 "vm_conf_pages:%llu\r\n"
5604 "vm_stats_used_pages:%llu\r\n"
5605 "vm_stats_swapped_objects:%llu\r\n"
5606 "vm_stats_swappin_count:%llu\r\n"
5607 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 5608 "vm_stats_io_newjobs_len:%lu\r\n"
5609 "vm_stats_io_processing_len:%lu\r\n"
5610 "vm_stats_io_processed_len:%lu\r\n"
5611 "vm_stats_io_waiting_clients:%lu\r\n"
7d98e08c 5612 ,(unsigned long long) server.vm_max_memory,
5613 (unsigned long long) server.vm_page_size,
5614 (unsigned long long) server.vm_pages,
5615 (unsigned long long) server.vm_stats_used_pages,
5616 (unsigned long long) server.vm_stats_swapped_objects,
5617 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 5618 (unsigned long long) server.vm_stats_swapouts,
5619 (unsigned long) listLength(server.io_newjobs),
5620 (unsigned long) listLength(server.io_processing),
5621 (unsigned long) listLength(server.io_processed),
5622 (unsigned long) listLength(server.io_clients)
7d98e08c 5623 );
5624 }
c3cb078d 5625 for (j = 0; j < server.dbnum; j++) {
5626 long long keys, vkeys;
5627
5628 keys = dictSize(server.db[j].dict);
5629 vkeys = dictSize(server.db[j].expires);
5630 if (keys || vkeys) {
9d65a1bb 5631 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 5632 j, keys, vkeys);
5633 }
5634 }
1c85b79f 5635 return info;
5636}
5637
5638static void infoCommand(redisClient *c) {
5639 sds info = genRedisInfoString();
83c6a618 5640 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
5641 (unsigned long)sdslen(info)));
ed9b544e 5642 addReplySds(c,info);
70003d28 5643 addReply(c,shared.crlf);
ed9b544e 5644}
5645
3305306f 5646static void monitorCommand(redisClient *c) {
5647 /* ignore MONITOR if aleady slave or in monitor mode */
5648 if (c->flags & REDIS_SLAVE) return;
5649
5650 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
5651 c->slaveseldb = 0;
6b47e12e 5652 listAddNodeTail(server.monitors,c);
3305306f 5653 addReply(c,shared.ok);
5654}
5655
5656/* ================================= Expire ================================= */
5657static int removeExpire(redisDb *db, robj *key) {
5658 if (dictDelete(db->expires,key) == DICT_OK) {
5659 return 1;
5660 } else {
5661 return 0;
5662 }
5663}
5664
5665static int setExpire(redisDb *db, robj *key, time_t when) {
5666 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
5667 return 0;
5668 } else {
5669 incrRefCount(key);
5670 return 1;
5671 }
5672}
5673
bb32ede5 5674/* Return the expire time of the specified key, or -1 if no expire
5675 * is associated with this key (i.e. the key is non volatile) */
5676static time_t getExpire(redisDb *db, robj *key) {
5677 dictEntry *de;
5678
5679 /* No expire? return ASAP */
5680 if (dictSize(db->expires) == 0 ||
5681 (de = dictFind(db->expires,key)) == NULL) return -1;
5682
5683 return (time_t) dictGetEntryVal(de);
5684}
5685
3305306f 5686static int expireIfNeeded(redisDb *db, robj *key) {
5687 time_t when;
5688 dictEntry *de;
5689
5690 /* No expire? return ASAP */
5691 if (dictSize(db->expires) == 0 ||
5692 (de = dictFind(db->expires,key)) == NULL) return 0;
5693
5694 /* Lookup the expire */
5695 when = (time_t) dictGetEntryVal(de);
5696 if (time(NULL) <= when) return 0;
5697
5698 /* Delete the key */
5699 dictDelete(db->expires,key);
5700 return dictDelete(db->dict,key) == DICT_OK;
5701}
5702
5703static int deleteIfVolatile(redisDb *db, robj *key) {
5704 dictEntry *de;
5705
5706 /* No expire? return ASAP */
5707 if (dictSize(db->expires) == 0 ||
5708 (de = dictFind(db->expires,key)) == NULL) return 0;
5709
5710 /* Delete the key */
0c66a471 5711 server.dirty++;
3305306f 5712 dictDelete(db->expires,key);
5713 return dictDelete(db->dict,key) == DICT_OK;
5714}
5715
802e8373 5716static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 5717 dictEntry *de;
3305306f 5718
802e8373 5719 de = dictFind(c->db->dict,key);
3305306f 5720 if (de == NULL) {
5721 addReply(c,shared.czero);
5722 return;
5723 }
43e5ccdf 5724 if (seconds < 0) {
5725 if (deleteKey(c->db,key)) server.dirty++;
5726 addReply(c, shared.cone);
3305306f 5727 return;
5728 } else {
5729 time_t when = time(NULL)+seconds;
802e8373 5730 if (setExpire(c->db,key,when)) {
3305306f 5731 addReply(c,shared.cone);
77423026 5732 server.dirty++;
5733 } else {
3305306f 5734 addReply(c,shared.czero);
77423026 5735 }
3305306f 5736 return;
5737 }
5738}
5739
802e8373 5740static void expireCommand(redisClient *c) {
5741 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
5742}
5743
5744static void expireatCommand(redisClient *c) {
5745 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
5746}
5747
fd88489a 5748static void ttlCommand(redisClient *c) {
5749 time_t expire;
5750 int ttl = -1;
5751
5752 expire = getExpire(c->db,c->argv[1]);
5753 if (expire != -1) {
5754 ttl = (int) (expire-time(NULL));
5755 if (ttl < 0) ttl = -1;
5756 }
5757 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
5758}
5759
6e469882 5760/* ================================ MULTI/EXEC ============================== */
5761
5762/* Client state initialization for MULTI/EXEC */
5763static void initClientMultiState(redisClient *c) {
5764 c->mstate.commands = NULL;
5765 c->mstate.count = 0;
5766}
5767
5768/* Release all the resources associated with MULTI/EXEC state */
5769static void freeClientMultiState(redisClient *c) {
5770 int j;
5771
5772 for (j = 0; j < c->mstate.count; j++) {
5773 int i;
5774 multiCmd *mc = c->mstate.commands+j;
5775
5776 for (i = 0; i < mc->argc; i++)
5777 decrRefCount(mc->argv[i]);
5778 zfree(mc->argv);
5779 }
5780 zfree(c->mstate.commands);
5781}
5782
5783/* Add a new command into the MULTI commands queue */
5784static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
5785 multiCmd *mc;
5786 int j;
5787
5788 c->mstate.commands = zrealloc(c->mstate.commands,
5789 sizeof(multiCmd)*(c->mstate.count+1));
5790 mc = c->mstate.commands+c->mstate.count;
5791 mc->cmd = cmd;
5792 mc->argc = c->argc;
5793 mc->argv = zmalloc(sizeof(robj*)*c->argc);
5794 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
5795 for (j = 0; j < c->argc; j++)
5796 incrRefCount(mc->argv[j]);
5797 c->mstate.count++;
5798}
5799
5800static void multiCommand(redisClient *c) {
5801 c->flags |= REDIS_MULTI;
36c548f0 5802 addReply(c,shared.ok);
6e469882 5803}
5804
5805static void execCommand(redisClient *c) {
5806 int j;
5807 robj **orig_argv;
5808 int orig_argc;
5809
5810 if (!(c->flags & REDIS_MULTI)) {
5811 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
5812 return;
5813 }
5814
5815 orig_argv = c->argv;
5816 orig_argc = c->argc;
5817 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
5818 for (j = 0; j < c->mstate.count; j++) {
5819 c->argc = c->mstate.commands[j].argc;
5820 c->argv = c->mstate.commands[j].argv;
5821 call(c,c->mstate.commands[j].cmd);
5822 }
5823 c->argv = orig_argv;
5824 c->argc = orig_argc;
5825 freeClientMultiState(c);
5826 initClientMultiState(c);
5827 c->flags &= (~REDIS_MULTI);
5828}
5829
4409877e 5830/* =========================== Blocking Operations ========================= */
5831
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
5860
5861/* Set a client in blocking mode for the specified key, with the specified
5862 * timeout */
b177fd30 5863static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 5864 dictEntry *de;
5865 list *l;
b177fd30 5866 int j;
4409877e 5867
b177fd30 5868 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
5869 c->blockingkeysnum = numkeys;
4409877e 5870 c->blockingto = timeout;
b177fd30 5871 for (j = 0; j < numkeys; j++) {
5872 /* Add the key in the client structure, to map clients -> keys */
5873 c->blockingkeys[j] = keys[j];
5874 incrRefCount(keys[j]);
4409877e 5875
b177fd30 5876 /* And in the other "side", to map keys -> clients */
5877 de = dictFind(c->db->blockingkeys,keys[j]);
5878 if (de == NULL) {
5879 int retval;
5880
5881 /* For every key we take a list of clients blocked for it */
5882 l = listCreate();
5883 retval = dictAdd(c->db->blockingkeys,keys[j],l);
5884 incrRefCount(keys[j]);
5885 assert(retval == DICT_OK);
5886 } else {
5887 l = dictGetEntryVal(de);
5888 }
5889 listAddNodeTail(l,c);
4409877e 5890 }
b177fd30 5891 /* Mark the client as a blocked client */
4409877e 5892 c->flags |= REDIS_BLOCKED;
5893 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
f86a74e9 5894 server.blockedclients++;
4409877e 5895}
5896
5897/* Unblock a client that's waiting in a blocking operation such as BLPOP */
5898static void unblockClient(redisClient *c) {
5899 dictEntry *de;
5900 list *l;
b177fd30 5901 int j;
4409877e 5902
b177fd30 5903 assert(c->blockingkeys != NULL);
5904 /* The client may wait for multiple keys, so unblock it for every key. */
5905 for (j = 0; j < c->blockingkeysnum; j++) {
5906 /* Remove this client from the list of clients waiting for this key. */
5907 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
5908 assert(de != NULL);
5909 l = dictGetEntryVal(de);
5910 listDelNode(l,listSearchKey(l,c));
5911 /* If the list is empty we need to remove it to avoid wasting memory */
5912 if (listLength(l) == 0)
5913 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
5914 decrRefCount(c->blockingkeys[j]);
5915 }
5916 /* Cleanup the client structure */
5917 zfree(c->blockingkeys);
5918 c->blockingkeys = NULL;
4409877e 5919 c->flags &= (~REDIS_BLOCKED);
f86a74e9 5920 server.blockedclients--;
4409877e 5921 /* Ok now we are ready to get read events from socket, note that we
5922 * can't trap errors here as it's possible that unblockClients() is
5923 * called from freeClient() itself, and the only thing we can do
5924 * if we failed to register the READABLE event is to kill the client.
5925 * Still the following function should never fail in the real world as
5926 * we are sure the file descriptor is sane, and we exit on out of mem. */
5927 aeCreateFileEvent(server.el, c->fd, AE_READABLE, readQueryFromClient, c);
5928 /* As a final step we want to process data if there is some command waiting
5929 * in the input buffer. Note that this is safe even if unblockClient()
5930 * gets called from freeClient() because freeClient() will be smart
5931 * enough to call this function *after* c->querybuf was set to NULL. */
5932 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
5933}
5934
5935/* This should be called from any function PUSHing into lists.
5936 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
5937 * 'ele' is the element pushed.
5938 *
5939 * If the function returns 0 there was no client waiting for a list push
5940 * against this key.
5941 *
5942 * If the function returns 1 there was a client waiting for a list push
5943 * against this key, the element was passed to this client thus it's not
5944 * needed to actually add it to the list and the caller should return asap. */
5945static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
5946 struct dictEntry *de;
5947 redisClient *receiver;
5948 list *l;
5949 listNode *ln;
5950
5951 de = dictFind(c->db->blockingkeys,key);
5952 if (de == NULL) return 0;
5953 l = dictGetEntryVal(de);
5954 ln = listFirst(l);
5955 assert(ln != NULL);
5956 receiver = ln->value;
4409877e 5957
b177fd30 5958 addReplySds(receiver,sdsnew("*2\r\n"));
5959 addReplyBulkLen(receiver,key);
5960 addReply(receiver,key);
5961 addReply(receiver,shared.crlf);
4409877e 5962 addReplyBulkLen(receiver,ele);
5963 addReply(receiver,ele);
5964 addReply(receiver,shared.crlf);
5965 unblockClient(receiver);
5966 return 1;
5967}
5968
5969/* Blocking RPOP/LPOP */
5970static void blockingPopGenericCommand(redisClient *c, int where) {
5971 robj *o;
5972 time_t timeout;
b177fd30 5973 int j;
4409877e 5974
b177fd30 5975 for (j = 1; j < c->argc-1; j++) {
5976 o = lookupKeyWrite(c->db,c->argv[j]);
5977 if (o != NULL) {
5978 if (o->type != REDIS_LIST) {
5979 addReply(c,shared.wrongtypeerr);
4409877e 5980 return;
b177fd30 5981 } else {
5982 list *list = o->ptr;
5983 if (listLength(list) != 0) {
5984 /* If the list contains elements fall back to the usual
5985 * non-blocking POP operation */
5986 robj *argv[2], **orig_argv;
5987 int orig_argc;
5988
5989 /* We need to alter the command arguments before to call
5990 * popGenericCommand() as the command takes a single key. */
5991 orig_argv = c->argv;
5992 orig_argc = c->argc;
5993 argv[1] = c->argv[j];
5994 c->argv = argv;
5995 c->argc = 2;
5996
5997 /* Also the return value is different, we need to output
5998 * the multi bulk reply header and the key name. The
5999 * "real" command will add the last element (the value)
6000 * for us. If this souds like an hack to you it's just
6001 * because it is... */
6002 addReplySds(c,sdsnew("*2\r\n"));
6003 addReplyBulkLen(c,argv[1]);
6004 addReply(c,argv[1]);
6005 addReply(c,shared.crlf);
6006 popGenericCommand(c,where);
6007
6008 /* Fix the client structure with the original stuff */
6009 c->argv = orig_argv;
6010 c->argc = orig_argc;
6011 return;
6012 }
4409877e 6013 }
6014 }
6015 }
6016 /* If the list is empty or the key does not exists we must block */
b177fd30 6017 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 6018 if (timeout > 0) timeout += time(NULL);
b177fd30 6019 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 6020}
6021
6022static void blpopCommand(redisClient *c) {
6023 blockingPopGenericCommand(c,REDIS_HEAD);
6024}
6025
6026static void brpopCommand(redisClient *c) {
6027 blockingPopGenericCommand(c,REDIS_TAIL);
6028}
6029
ed9b544e 6030/* =============================== Replication ============================= */
6031
a4d1ba9a 6032static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6033 ssize_t nwritten, ret = size;
6034 time_t start = time(NULL);
6035
6036 timeout++;
6037 while(size) {
6038 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6039 nwritten = write(fd,ptr,size);
6040 if (nwritten == -1) return -1;
6041 ptr += nwritten;
6042 size -= nwritten;
6043 }
6044 if ((time(NULL)-start) > timeout) {
6045 errno = ETIMEDOUT;
6046 return -1;
6047 }
6048 }
6049 return ret;
6050}
6051
a4d1ba9a 6052static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6053 ssize_t nread, totread = 0;
6054 time_t start = time(NULL);
6055
6056 timeout++;
6057 while(size) {
6058 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6059 nread = read(fd,ptr,size);
6060 if (nread == -1) return -1;
6061 ptr += nread;
6062 size -= nread;
6063 totread += nread;
6064 }
6065 if ((time(NULL)-start) > timeout) {
6066 errno = ETIMEDOUT;
6067 return -1;
6068 }
6069 }
6070 return totread;
6071}
6072
/* Read a line from 'fd' one byte at a time using syncRead(), storing it
 * into 'ptr' as a NUL terminated string. 'size' is the size of the buffer
 * pointed to by 'ptr', 'timeout' is passed to every syncRead() call.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' just before
 * it is replaced by the terminator (but is still counted in the returned
 * length). Reading also stops when 'size'-1 bytes were stored, so the
 * buffer can never be overflowed by a long line.
 *
 * Returns the number of bytes stored, or -1 if syncRead() fails. With a
 * 'size' of zero or less nothing is read nor written and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    /* Make sure the buffer is terminated even if no byte gets stored */
    *ptr = '\0';
    size--; /* Reserve room for the terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte less is available in the buffer */
        size--;
    }
    return nread;
}
6093
6094static void syncCommand(redisClient *c) {
40d224a9 6095 /* ignore SYNC if aleady slave or in monitor mode */
6096 if (c->flags & REDIS_SLAVE) return;
6097
6098 /* SYNC can't be issued when the server has pending data to send to
6099 * the client about already issued commands. We need a fresh reply
6100 * buffer registering the differences between the BGSAVE and the current
6101 * dataset, so that we can copy to other slaves if needed. */
6102 if (listLength(c->reply) != 0) {
6103 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6104 return;
6105 }
6106
6107 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6108 /* Here we need to check if there is a background saving operation
6109 * in progress, or if it is required to start one */
9d65a1bb 6110 if (server.bgsavechildpid != -1) {
40d224a9 6111 /* Ok a background save is in progress. Let's check if it is a good
6112 * one for replication, i.e. if there is another slave that is
6113 * registering differences since the server forked to save */
6114 redisClient *slave;
6115 listNode *ln;
6116
6208b3a7 6117 listRewind(server.slaves);
6118 while((ln = listYield(server.slaves))) {
40d224a9 6119 slave = ln->value;
6120 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 6121 }
6122 if (ln) {
6123 /* Perfect, the server is already registering differences for
6124 * another slave. Set the right state, and copy the buffer. */
6125 listRelease(c->reply);
6126 c->reply = listDup(slave->reply);
40d224a9 6127 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6128 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6129 } else {
6130 /* No way, we need to wait for the next BGSAVE in order to
6131 * register differences */
6132 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6133 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6134 }
6135 } else {
6136 /* Ok we don't have a BGSAVE in progress, let's start one */
6137 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6138 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6139 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6140 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6141 return;
6142 }
6143 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6144 }
6208b3a7 6145 c->repldbfd = -1;
40d224a9 6146 c->flags |= REDIS_SLAVE;
6147 c->slaveseldb = 0;
6b47e12e 6148 listAddNodeTail(server.slaves,c);
40d224a9 6149 return;
6150}
6151
6208b3a7 6152static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6153 redisClient *slave = privdata;
6154 REDIS_NOTUSED(el);
6155 REDIS_NOTUSED(mask);
6156 char buf[REDIS_IOBUF_LEN];
6157 ssize_t nwritten, buflen;
6158
6159 if (slave->repldboff == 0) {
6160 /* Write the bulk write count before to transfer the DB. In theory here
6161 * we don't know how much room there is in the output buffer of the
6162 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6163 * operations) will never be smaller than the few bytes we need. */
6164 sds bulkcount;
6165
6166 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6167 slave->repldbsize);
6168 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6169 {
6170 sdsfree(bulkcount);
6171 freeClient(slave);
6172 return;
6173 }
6174 sdsfree(bulkcount);
6175 }
6176 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6177 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6178 if (buflen <= 0) {
6179 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6180 (buflen == 0) ? "premature EOF" : strerror(errno));
6181 freeClient(slave);
6182 return;
6183 }
6184 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 6185 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 6186 strerror(errno));
6187 freeClient(slave);
6188 return;
6189 }
6190 slave->repldboff += nwritten;
6191 if (slave->repldboff == slave->repldbsize) {
6192 close(slave->repldbfd);
6193 slave->repldbfd = -1;
6194 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6195 slave->replstate = REDIS_REPL_ONLINE;
6196 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 6197 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 6198 freeClient(slave);
6199 return;
6200 }
6201 addReplySds(slave,sdsempty());
6202 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6203 }
6204}
ed9b544e 6205
a3b21203 6206/* This function is called at the end of every backgrond saving.
6207 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6208 * otherwise REDIS_ERR is passed to the function.
6209 *
6210 * The goal of this function is to handle slaves waiting for a successful
6211 * background saving in order to perform non-blocking synchronization. */
6212static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 6213 listNode *ln;
6214 int startbgsave = 0;
ed9b544e 6215
6208b3a7 6216 listRewind(server.slaves);
6217 while((ln = listYield(server.slaves))) {
6218 redisClient *slave = ln->value;
ed9b544e 6219
6208b3a7 6220 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6221 startbgsave = 1;
6222 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6223 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 6224 struct redis_stat buf;
6208b3a7 6225
6226 if (bgsaveerr != REDIS_OK) {
6227 freeClient(slave);
6228 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6229 continue;
6230 }
6231 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 6232 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 6233 freeClient(slave);
6234 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6235 continue;
6236 }
6237 slave->repldboff = 0;
6238 slave->repldbsize = buf.st_size;
6239 slave->replstate = REDIS_REPL_SEND_BULK;
6240 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 6241 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 6242 freeClient(slave);
6243 continue;
6244 }
6245 }
ed9b544e 6246 }
6208b3a7 6247 if (startbgsave) {
6248 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6249 listRewind(server.slaves);
6250 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6251 while((ln = listYield(server.slaves))) {
6252 redisClient *slave = ln->value;
ed9b544e 6253
6208b3a7 6254 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6255 freeClient(slave);
6256 }
6257 }
6258 }
ed9b544e 6259}
6260
6261static int syncWithMaster(void) {
d0ccebcf 6262 char buf[1024], tmpfile[256], authcmd[1024];
ed9b544e 6263 int dumpsize;
6264 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6265 int dfd;
6266
6267 if (fd == -1) {
6268 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6269 strerror(errno));
6270 return REDIS_ERR;
6271 }
d0ccebcf 6272
6273 /* AUTH with the master if required. */
6274 if(server.masterauth) {
6275 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6276 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6277 close(fd);
6278 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6279 strerror(errno));
6280 return REDIS_ERR;
6281 }
6282 /* Read the AUTH result. */
6283 if (syncReadLine(fd,buf,1024,3600) == -1) {
6284 close(fd);
6285 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6286 strerror(errno));
6287 return REDIS_ERR;
6288 }
6289 if (buf[0] != '+') {
6290 close(fd);
6291 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6292 return REDIS_ERR;
6293 }
6294 }
6295
ed9b544e 6296 /* Issue the SYNC command */
6297 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6298 close(fd);
6299 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6300 strerror(errno));
6301 return REDIS_ERR;
6302 }
6303 /* Read the bulk write count */
8c4d91fc 6304 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 6305 close(fd);
6306 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6307 strerror(errno));
6308 return REDIS_ERR;
6309 }
4aa701c1 6310 if (buf[0] != '$') {
6311 close(fd);
6312 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6313 return REDIS_ERR;
6314 }
c937aa89 6315 dumpsize = atoi(buf+1);
ed9b544e 6316 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6317 /* Read the bulk write data on a temp file */
6318 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6319 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6320 if (dfd == -1) {
6321 close(fd);
6322 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6323 return REDIS_ERR;
6324 }
6325 while(dumpsize) {
6326 int nread, nwritten;
6327
6328 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6329 if (nread == -1) {
6330 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6331 strerror(errno));
6332 close(fd);
6333 close(dfd);
6334 return REDIS_ERR;
6335 }
6336 nwritten = write(dfd,buf,nread);
6337 if (nwritten == -1) {
6338 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6339 close(fd);
6340 close(dfd);
6341 return REDIS_ERR;
6342 }
6343 dumpsize -= nread;
6344 }
6345 close(dfd);
6346 if (rename(tmpfile,server.dbfilename) == -1) {
6347 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6348 unlink(tmpfile);
6349 close(fd);
6350 return REDIS_ERR;
6351 }
6352 emptyDb();
f78fd11b 6353 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 6354 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6355 close(fd);
6356 return REDIS_ERR;
6357 }
6358 server.master = createClient(fd);
6359 server.master->flags |= REDIS_MASTER;
179b3952 6360 server.master->authenticated = 1;
ed9b544e 6361 server.replstate = REDIS_REPL_CONNECTED;
6362 return REDIS_OK;
6363}
6364
321b0e13 6365static void slaveofCommand(redisClient *c) {
6366 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6367 !strcasecmp(c->argv[2]->ptr,"one")) {
6368 if (server.masterhost) {
6369 sdsfree(server.masterhost);
6370 server.masterhost = NULL;
6371 if (server.master) freeClient(server.master);
6372 server.replstate = REDIS_REPL_NONE;
6373 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6374 }
6375 } else {
6376 sdsfree(server.masterhost);
6377 server.masterhost = sdsdup(c->argv[1]->ptr);
6378 server.masterport = atoi(c->argv[2]->ptr);
6379 if (server.master) freeClient(server.master);
6380 server.replstate = REDIS_REPL_CONNECT;
6381 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6382 server.masterhost, server.masterport);
6383 }
6384 addReply(c,shared.ok);
6385}
6386
3fd78bcd 6387/* ============================ Maxmemory directive ======================== */
6388
a5819310 6389/* Try to free one object form the pre-allocated objects free list.
6390 * This is useful under low mem conditions as by default we take 1 million
6391 * free objects allocated. On success REDIS_OK is returned, otherwise
6392 * REDIS_ERR. */
6393static int tryFreeOneObjectFromFreelist(void) {
f870935d 6394 robj *o;
6395
a5819310 6396 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6397 if (listLength(server.objfreelist)) {
6398 listNode *head = listFirst(server.objfreelist);
6399 o = listNodeValue(head);
6400 listDelNode(server.objfreelist,head);
6401 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6402 zfree(o);
6403 return REDIS_OK;
6404 } else {
6405 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6406 return REDIS_ERR;
6407 }
f870935d 6408}
6409
3fd78bcd 6410/* This function gets called when 'maxmemory' is set on the config file to limit
6411 * the max memory used by the server, and we are out of memory.
6412 * This function will try to, in order:
6413 *
6414 * - Free objects from the free list
6415 * - Try to remove keys with an EXPIRE set
6416 *
6417 * It is not possible to free enough memory to reach used-memory < maxmemory
6418 * the server will start refusing commands that will enlarge even more the
6419 * memory usage.
6420 */
6421static void freeMemoryIfNeeded(void) {
6422 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 6423 int j, k, freed = 0;
6424
6425 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6426 for (j = 0; j < server.dbnum; j++) {
6427 int minttl = -1;
6428 robj *minkey = NULL;
6429 struct dictEntry *de;
6430
6431 if (dictSize(server.db[j].expires)) {
6432 freed = 1;
6433 /* From a sample of three keys drop the one nearest to
6434 * the natural expire */
6435 for (k = 0; k < 3; k++) {
6436 time_t t;
6437
6438 de = dictGetRandomKey(server.db[j].expires);
6439 t = (time_t) dictGetEntryVal(de);
6440 if (minttl == -1 || t < minttl) {
6441 minkey = dictGetEntryKey(de);
6442 minttl = t;
3fd78bcd 6443 }
3fd78bcd 6444 }
a5819310 6445 deleteKey(server.db+j,minkey);
3fd78bcd 6446 }
3fd78bcd 6447 }
a5819310 6448 if (!freed) return; /* nothing to free... */
3fd78bcd 6449 }
6450}
6451
f80dff62 6452/* ============================== Append Only file ========================== */
6453
6454static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6455 sds buf = sdsempty();
6456 int j;
6457 ssize_t nwritten;
6458 time_t now;
6459 robj *tmpargv[3];
6460
6461 /* The DB this command was targetting is not the same as the last command
6462 * we appendend. To issue a SELECT command is needed. */
6463 if (dictid != server.appendseldb) {
6464 char seldb[64];
6465
6466 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 6467 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 6468 (unsigned long)strlen(seldb),seldb);
f80dff62 6469 server.appendseldb = dictid;
6470 }
6471
6472 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6473 * EXPIREs into EXPIREATs calls */
6474 if (cmd->proc == expireCommand) {
6475 long when;
6476
6477 tmpargv[0] = createStringObject("EXPIREAT",8);
6478 tmpargv[1] = argv[1];
6479 incrRefCount(argv[1]);
6480 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6481 tmpargv[2] = createObject(REDIS_STRING,
6482 sdscatprintf(sdsempty(),"%ld",when));
6483 argv = tmpargv;
6484 }
6485
6486 /* Append the actual command */
6487 buf = sdscatprintf(buf,"*%d\r\n",argc);
6488 for (j = 0; j < argc; j++) {
6489 robj *o = argv[j];
6490
9d65a1bb 6491 o = getDecodedObject(o);
83c6a618 6492 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 6493 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6494 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 6495 decrRefCount(o);
f80dff62 6496 }
6497
6498 /* Free the objects from the modified argv for EXPIREAT */
6499 if (cmd->proc == expireCommand) {
6500 for (j = 0; j < 3; j++)
6501 decrRefCount(argv[j]);
6502 }
6503
6504 /* We want to perform a single write. This should be guaranteed atomic
6505 * at least if the filesystem we are writing is a real physical one.
6506 * While this will save us against the server being killed I don't think
6507 * there is much to do about the whole server stopping for power problems
6508 * or alike */
6509 nwritten = write(server.appendfd,buf,sdslen(buf));
6510 if (nwritten != (signed)sdslen(buf)) {
6511 /* Ooops, we are in troubles. The best thing to do for now is
6512 * to simply exit instead to give the illusion that everything is
6513 * working as expected. */
6514 if (nwritten == -1) {
6515 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6516 } else {
6517 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6518 }
6519 exit(1);
6520 }
85a83172 6521 /* If a background append only file rewriting is in progress we want to
6522 * accumulate the differences between the child DB and the current one
6523 * in a buffer, so that when the child process will do its work we
6524 * can append the differences to the new append only file. */
6525 if (server.bgrewritechildpid != -1)
6526 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6527
6528 sdsfree(buf);
f80dff62 6529 now = time(NULL);
6530 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6531 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6532 now-server.lastfsync > 1))
6533 {
6534 fsync(server.appendfd); /* Let's try to get this data on the disk */
6535 server.lastfsync = now;
6536 }
6537}
6538
6539/* In Redis commands are always executed in the context of a client, so in
6540 * order to load the append only file we need to create a fake client. */
6541static struct redisClient *createFakeClient(void) {
6542 struct redisClient *c = zmalloc(sizeof(*c));
6543
6544 selectDb(c,0);
6545 c->fd = -1;
6546 c->querybuf = sdsempty();
6547 c->argc = 0;
6548 c->argv = NULL;
6549 c->flags = 0;
9387d17d 6550 /* We set the fake client as a slave waiting for the synchronization
6551 * so that Redis will not try to send replies to this client. */
6552 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 6553 c->reply = listCreate();
6554 listSetFreeMethod(c->reply,decrRefCount);
6555 listSetDupMethod(c->reply,dupClientReplyValue);
6556 return c;
6557}
6558
6559static void freeFakeClient(struct redisClient *c) {
6560 sdsfree(c->querybuf);
6561 listRelease(c->reply);
6562 zfree(c);
6563}
6564
6565/* Replay the append log file. On error REDIS_OK is returned. On non fatal
6566 * error (the append only file is zero-length) REDIS_ERR is returned. On
6567 * fatal error an error message is logged and the program exists. */
6568int loadAppendOnlyFile(char *filename) {
6569 struct redisClient *fakeClient;
6570 FILE *fp = fopen(filename,"r");
6571 struct redis_stat sb;
b492cf00 6572 unsigned long long loadedkeys = 0;
f80dff62 6573
6574 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
6575 return REDIS_ERR;
6576
6577 if (fp == NULL) {
6578 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
6579 exit(1);
6580 }
6581
6582 fakeClient = createFakeClient();
6583 while(1) {
6584 int argc, j;
6585 unsigned long len;
6586 robj **argv;
6587 char buf[128];
6588 sds argsds;
6589 struct redisCommand *cmd;
6590
6591 if (fgets(buf,sizeof(buf),fp) == NULL) {
6592 if (feof(fp))
6593 break;
6594 else
6595 goto readerr;
6596 }
6597 if (buf[0] != '*') goto fmterr;
6598 argc = atoi(buf+1);
6599 argv = zmalloc(sizeof(robj*)*argc);
6600 for (j = 0; j < argc; j++) {
6601 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
6602 if (buf[0] != '$') goto fmterr;
6603 len = strtol(buf+1,NULL,10);
6604 argsds = sdsnewlen(NULL,len);
0f151ef1 6605 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 6606 argv[j] = createObject(REDIS_STRING,argsds);
6607 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
6608 }
6609
6610 /* Command lookup */
6611 cmd = lookupCommand(argv[0]->ptr);
6612 if (!cmd) {
6613 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
6614 exit(1);
6615 }
6616 /* Try object sharing and encoding */
6617 if (server.shareobjects) {
6618 int j;
6619 for(j = 1; j < argc; j++)
6620 argv[j] = tryObjectSharing(argv[j]);
6621 }
6622 if (cmd->flags & REDIS_CMD_BULK)
6623 tryObjectEncoding(argv[argc-1]);
6624 /* Run the command in the context of a fake client */
6625 fakeClient->argc = argc;
6626 fakeClient->argv = argv;
6627 cmd->proc(fakeClient);
6628 /* Discard the reply objects list from the fake client */
6629 while(listLength(fakeClient->reply))
6630 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
6631 /* Clean up, ready for the next command */
6632 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
6633 zfree(argv);
b492cf00 6634 /* Handle swapping while loading big datasets when VM is on */
6635 loadedkeys++;
6636 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
6637 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 6638 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 6639 }
6640 }
f80dff62 6641 }
6642 fclose(fp);
6643 freeFakeClient(fakeClient);
6644 return REDIS_OK;
6645
6646readerr:
6647 if (feof(fp)) {
6648 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
6649 } else {
6650 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
6651 }
6652 exit(1);
6653fmterr:
6654 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
6655 exit(1);
6656}
6657
9d65a1bb 6658/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6659static int fwriteBulk(FILE *fp, robj *obj) {
6660 char buf[128];
b9bc0eef 6661 int decrrc = 0;
6662
6663 if (obj->storage == REDIS_VM_MEMORY && obj->encoding != REDIS_ENCODING_RAW){
6664 obj = getDecodedObject(obj);
6665 decrrc = 1;
6666 }
9d65a1bb 6667 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
6668 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 6669 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
6670 goto err;
9d65a1bb 6671 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 6672 if (decrrc) decrRefCount(obj);
9d65a1bb 6673 return 1;
6674err:
b9bc0eef 6675 if (decrrc) decrRefCount(obj);
9d65a1bb 6676 return 0;
6677}
6678
/* Write the double 'd' to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, the payload being 'd' printed with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    /* The announced count does not include the trailing CRLF */
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,plen,1,fp) == 0 ? 0 : 1;
}
6689
6690/* Write a long value in bulk format $<count>\r\n<payload>\r\n */
6691static int fwriteBulkLong(FILE *fp, long l) {
6692 char buf[128], lbuf[128];
6693
6694 snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
6695 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(lbuf)-2);
6696 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
6697 if (fwrite(lbuf,strlen(lbuf),1,fp) == 0) return 0;
6698 return 1;
6699}
6700
6701/* Write a sequence of commands able to fully rebuild the dataset into
6702 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
6703static int rewriteAppendOnlyFile(char *filename) {
6704 dictIterator *di = NULL;
6705 dictEntry *de;
6706 FILE *fp;
6707 char tmpfile[256];
6708 int j;
6709 time_t now = time(NULL);
6710
6711 /* Note that we have to use a different temp name here compared to the
6712 * one used by rewriteAppendOnlyFileBackground() function. */
6713 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
6714 fp = fopen(tmpfile,"w");
6715 if (!fp) {
6716 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
6717 return REDIS_ERR;
6718 }
6719 for (j = 0; j < server.dbnum; j++) {
6720 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
6721 redisDb *db = server.db+j;
6722 dict *d = db->dict;
6723 if (dictSize(d) == 0) continue;
6724 di = dictGetIterator(d);
6725 if (!di) {
6726 fclose(fp);
6727 return REDIS_ERR;
6728 }
6729
6730 /* SELECT the new DB */
6731 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 6732 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 6733
6734 /* Iterate this DB writing every entry */
6735 while((de = dictNext(di)) != NULL) {
e7546c63 6736 robj *key, *o;
6737 time_t expiretime;
6738 int swapped;
6739
6740 key = dictGetEntryKey(de);
b9bc0eef 6741 /* If the value for this key is swapped, load a preview in memory.
6742 * We use a "swapped" flag to remember if we need to free the
6743 * value object instead to just increment the ref count anyway
6744 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 6745 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
6746 key->storage == REDIS_VM_SWAPPING) {
e7546c63 6747 o = dictGetEntryVal(de);
6748 swapped = 0;
6749 } else {
6750 o = vmPreviewObject(key);
e7546c63 6751 swapped = 1;
6752 }
6753 expiretime = getExpire(db,key);
9d65a1bb 6754
6755 /* Save the key and associated value */
9d65a1bb 6756 if (o->type == REDIS_STRING) {
6757 /* Emit a SET command */
6758 char cmd[]="*3\r\n$3\r\nSET\r\n";
6759 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6760 /* Key and value */
6761 if (fwriteBulk(fp,key) == 0) goto werr;
6762 if (fwriteBulk(fp,o) == 0) goto werr;
6763 } else if (o->type == REDIS_LIST) {
6764 /* Emit the RPUSHes needed to rebuild the list */
6765 list *list = o->ptr;
6766 listNode *ln;
6767
6768 listRewind(list);
6769 while((ln = listYield(list))) {
6770 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
6771 robj *eleobj = listNodeValue(ln);
6772
6773 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6774 if (fwriteBulk(fp,key) == 0) goto werr;
6775 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6776 }
6777 } else if (o->type == REDIS_SET) {
6778 /* Emit the SADDs needed to rebuild the set */
6779 dict *set = o->ptr;
6780 dictIterator *di = dictGetIterator(set);
6781 dictEntry *de;
6782
6783 while((de = dictNext(di)) != NULL) {
6784 char cmd[]="*3\r\n$4\r\nSADD\r\n";
6785 robj *eleobj = dictGetEntryKey(de);
6786
6787 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6788 if (fwriteBulk(fp,key) == 0) goto werr;
6789 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6790 }
6791 dictReleaseIterator(di);
6792 } else if (o->type == REDIS_ZSET) {
6793 /* Emit the ZADDs needed to rebuild the sorted set */
6794 zset *zs = o->ptr;
6795 dictIterator *di = dictGetIterator(zs->dict);
6796 dictEntry *de;
6797
6798 while((de = dictNext(di)) != NULL) {
6799 char cmd[]="*4\r\n$4\r\nZADD\r\n";
6800 robj *eleobj = dictGetEntryKey(de);
6801 double *score = dictGetEntryVal(de);
6802
6803 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6804 if (fwriteBulk(fp,key) == 0) goto werr;
6805 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
6806 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6807 }
6808 dictReleaseIterator(di);
6809 } else {
dfc5e96c 6810 redisAssert(0 != 0);
9d65a1bb 6811 }
6812 /* Save the expire time */
6813 if (expiretime != -1) {
e96e4fbf 6814 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 6815 /* If this key is already expired skip it */
6816 if (expiretime < now) continue;
6817 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6818 if (fwriteBulk(fp,key) == 0) goto werr;
6819 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
6820 }
b9bc0eef 6821 if (swapped) decrRefCount(o);
9d65a1bb 6822 }
6823 dictReleaseIterator(di);
6824 }
6825
6826 /* Make sure data will not remain on the OS's output buffers */
6827 fflush(fp);
6828 fsync(fileno(fp));
6829 fclose(fp);
6830
6831 /* Use RENAME to make sure the DB file is changed atomically only
6832 * if the generate DB file is ok. */
6833 if (rename(tmpfile,filename) == -1) {
6834 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
6835 unlink(tmpfile);
6836 return REDIS_ERR;
6837 }
6838 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
6839 return REDIS_OK;
6840
6841werr:
6842 fclose(fp);
6843 unlink(tmpfile);
e96e4fbf 6844 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 6845 if (di) dictReleaseIterator(di);
6846 return REDIS_ERR;
6847}
6848
6849/* This is how rewriting of the append only file in background works:
6850 *
6851 * 1) The user calls BGREWRITEAOF
6852 * 2) Redis calls this function, that forks():
6853 * 2a) the child rewrite the append only file in a temp file.
6854 * 2b) the parent accumulates differences in server.bgrewritebuf.
6855 * 3) When the child finished '2a' exists.
6856 * 4) The parent will trap the exit code, if it's OK, will append the
6857 * data accumulated into server.bgrewritebuf into the temp file, and
6858 * finally will rename(2) the temp file in the actual file name.
6859 * The the new file is reopened as the new append only file. Profit!
6860 */
6861static int rewriteAppendOnlyFileBackground(void) {
6862 pid_t childpid;
6863
6864 if (server.bgrewritechildpid != -1) return REDIS_ERR;
6865 if ((childpid = fork()) == 0) {
6866 /* Child */
6867 char tmpfile[256];
6868 close(server.fd);
6869
6870 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
6871 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
6872 exit(0);
6873 } else {
6874 exit(1);
6875 }
6876 } else {
6877 /* Parent */
6878 if (childpid == -1) {
6879 redisLog(REDIS_WARNING,
6880 "Can't rewrite append only file in background: fork: %s",
6881 strerror(errno));
6882 return REDIS_ERR;
6883 }
6884 redisLog(REDIS_NOTICE,
6885 "Background append only file rewriting started by pid %d",childpid);
6886 server.bgrewritechildpid = childpid;
85a83172 6887 /* We set appendseldb to -1 in order to force the next call to the
6888 * feedAppendOnlyFile() to issue a SELECT command, so the differences
6889 * accumulated by the parent into server.bgrewritebuf will start
6890 * with a SELECT statement and it will be safe to merge. */
6891 server.appendseldb = -1;
9d65a1bb 6892 return REDIS_OK;
6893 }
6894 return REDIS_OK; /* unreached */
6895}
6896
6897static void bgrewriteaofCommand(redisClient *c) {
6898 if (server.bgrewritechildpid != -1) {
6899 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
6900 return;
6901 }
6902 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 6903 char *status = "+Background append only file rewriting started\r\n";
6904 addReplySds(c,sdsnew(status));
9d65a1bb 6905 } else {
6906 addReply(c,shared.err);
6907 }
6908}
6909
/* Remove the temp file named after 'childpid' that a background append
 * only file rewrite writes to. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
6916
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
 */
6937
6938/* =================== Virtual Memory - Blocking Side ====================== */
75680a3c 6939static void vmInit(void) {
6940 off_t totsize;
996cb5f7 6941 int pipefds[2];
75680a3c 6942
6943 server.vm_fp = fopen("/tmp/redisvm","w+b");
6944 if (server.vm_fp == NULL) {
6945 redisLog(REDIS_WARNING,"Impossible to open the swap file. Exiting.");
6946 exit(1);
6947 }
6948 server.vm_fd = fileno(server.vm_fp);
6949 server.vm_next_page = 0;
6950 server.vm_near_pages = 0;
7d98e08c 6951 server.vm_stats_used_pages = 0;
6952 server.vm_stats_swapped_objects = 0;
6953 server.vm_stats_swapouts = 0;
6954 server.vm_stats_swapins = 0;
75680a3c 6955 totsize = server.vm_pages*server.vm_page_size;
6956 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
6957 if (ftruncate(server.vm_fd,totsize) == -1) {
6958 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
6959 strerror(errno));
6960 exit(1);
6961 } else {
6962 redisLog(REDIS_NOTICE,"Swap file allocated with success");
6963 }
7d30035d 6964 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 6965 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 6966 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 6967 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
75680a3c 6968 /* Try to remove the swap file, so the OS will really delete it from the
6969 * file system when Redis exists. */
6970 unlink("/tmp/redisvm");
92f8e882 6971
996cb5f7 6972 /* Initialize threaded I/O (used by Virtual Memory) */
6973 server.io_newjobs = listCreate();
6974 server.io_processing = listCreate();
6975 server.io_processed = listCreate();
92f8e882 6976 server.io_clients = listCreate();
6977 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 6978 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
6979 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 6980 server.io_active_threads = 0;
996cb5f7 6981 if (pipe(pipefds) == -1) {
6982 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
6983 ,strerror(errno));
6984 exit(1);
6985 }
6986 server.io_ready_pipe_read = pipefds[0];
6987 server.io_ready_pipe_write = pipefds[1];
6988 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
b9bc0eef 6989 /* Listen for events in the threaded I/O pipe */
6990 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
6991 vmThreadedIOCompletedJob, NULL) == AE_ERR)
6992 oom("creating file event");
75680a3c 6993}
6994
06224fec 6995/* Mark the page as used */
6996static void vmMarkPageUsed(off_t page) {
6997 off_t byte = page/8;
6998 int bit = page&7;
6999 server.vm_bitmap[byte] |= 1<<bit;
f870935d 7000 redisLog(REDIS_DEBUG,"Mark used: %lld (byte:%lld bit:%d)\n",
7001 (long long)page, (long long)byte, bit);
06224fec 7002}
7003
7004/* Mark N contiguous pages as used, with 'page' being the first. */
7005static void vmMarkPagesUsed(off_t page, off_t count) {
7006 off_t j;
7007
7008 for (j = 0; j < count; j++)
7d30035d 7009 vmMarkPageUsed(page+j);
7d98e08c 7010 server.vm_stats_used_pages += count;
06224fec 7011}
7012
7013/* Mark the page as free */
7014static void vmMarkPageFree(off_t page) {
7015 off_t byte = page/8;
7016 int bit = page&7;
7017 server.vm_bitmap[byte] &= ~(1<<bit);
7018}
7019
7020/* Mark N contiguous pages as free, with 'page' being the first. */
7021static void vmMarkPagesFree(off_t page, off_t count) {
7022 off_t j;
7023
7024 for (j = 0; j < count; j++)
7d30035d 7025 vmMarkPageFree(page+j);
7d98e08c 7026 server.vm_stats_used_pages -= count;
06224fec 7027}
7028
7029/* Test if the page is free */
7030static int vmFreePage(off_t page) {
7031 off_t byte = page/8;
7032 int bit = page&7;
7d30035d 7033 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 7034}
7035
7036/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 7037 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7038 * REDIS_ERR is returned.
06224fec 7039 *
7040 * This function uses a simple algorithm: we try to allocate
7041 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7042 * again from the start of the swap file searching for free spaces.
7043 *
7044 * If it looks pretty clear that there are no free pages near our offset
7045 * we try to find less populated places doing a forward jump of
7046 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7047 * without hurry, and then we jump again and so forth...
7048 *
7049 * This function can be improved using a free list to avoid to guess
7050 * too much, since we could collect data about freed pages.
7051 *
7052 * note: I implemented this function just after watching an episode of
7053 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7054 */
7055static int vmFindContiguousPages(off_t *first, int n) {
7056 off_t base, offset = 0, since_jump = 0, numfree = 0;
7057
7058 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7059 server.vm_near_pages = 0;
7060 server.vm_next_page = 0;
7061 }
7062 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7063 base = server.vm_next_page;
7064
7065 while(offset < server.vm_pages) {
7066 off_t this = base+offset;
7067
f870935d 7068 redisLog(REDIS_DEBUG, "THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
06224fec 7069 /* If we overflow, restart from page zero */
7070 if (this >= server.vm_pages) {
7071 this -= server.vm_pages;
7072 if (this == 0) {
7073 /* Just overflowed, what we found on tail is no longer
7074 * interesting, as it's no longer contiguous. */
7075 numfree = 0;
7076 }
7077 }
7078 if (vmFreePage(this)) {
7079 /* This is a free page */
7080 numfree++;
7081 /* Already got N free pages? Return to the caller, with success */
7082 if (numfree == n) {
7d30035d 7083 *first = this-(n-1);
7084 server.vm_next_page = this+1;
3a66edc7 7085 return REDIS_OK;
06224fec 7086 }
7087 } else {
7088 /* The current one is not a free page */
7089 numfree = 0;
7090 }
7091
7092 /* Fast-forward if the current page is not free and we already
7093 * searched enough near this place. */
7094 since_jump++;
7095 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7096 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7097 since_jump = 0;
7098 /* Note that even if we rewind after the jump, we are don't need
7099 * to make sure numfree is set to zero as we only jump *if* it
7100 * is set to zero. */
7101 } else {
7102 /* Otherwise just check the next page */
7103 offset++;
7104 }
7105 }
3a66edc7 7106 return REDIS_ERR;
7107}
7108
a5819310 7109/* Write the specified object at the specified page of the swap file */
7110static int vmWriteObjectOnSwap(robj *o, off_t page) {
7111 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7112 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7113 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7114 redisLog(REDIS_WARNING,
7115 "Critical VM problem in vmSwapObjectBlocking(): can't seek: %s",
7116 strerror(errno));
7117 return REDIS_ERR;
7118 }
7119 rdbSaveObject(server.vm_fp,o);
7120 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7121 return REDIS_OK;
7122}
7123
3a66edc7 7124/* Swap the 'val' object relative to 'key' into disk. Store all the information
7125 * needed to later retrieve the object into the key object.
7126 * If we can't find enough contiguous empty pages to swap the object on disk
7127 * REDIS_ERR is returned. */
a69a0c9c 7128static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 7129 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 7130 off_t page;
7131
7132 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 7133 assert(key->refcount == 1);
3a66edc7 7134 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 7135 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 7136 key->vm.page = page;
7137 key->vm.usedpages = pages;
7138 key->storage = REDIS_VM_SWAPPED;
d894161b 7139 key->vtype = val->type;
3a66edc7 7140 decrRefCount(val); /* Deallocate the object from memory. */
7141 vmMarkPagesUsed(page,pages);
7d30035d 7142 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7143 (unsigned char*) key->ptr,
7144 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 7145 server.vm_stats_swapped_objects++;
7146 server.vm_stats_swapouts++;
0841cc92 7147 fflush(server.vm_fp);
3a66edc7 7148 return REDIS_OK;
7149}
7150
a5819310 7151static robj *vmReadObjectFromSwap(off_t page, int type) {
7152 robj *o;
3a66edc7 7153
a5819310 7154 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7155 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 7156 redisLog(REDIS_WARNING,
7157 "Unrecoverable VM problem in vmLoadObject(): can't seek: %s",
7158 strerror(errno));
7159 exit(1);
7160 }
a5819310 7161 o = rdbLoadObject(type,server.vm_fp);
7162 if (o == NULL) {
3a66edc7 7163 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmLoadObject(): can't load object from swap file: %s", strerror(errno));
7164 exit(1);
7165 }
a5819310 7166 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7167 return o;
7168}
7169
7170/* Load the value object relative to the 'key' object from swap to memory.
7171 * The newly allocated object is returned.
7172 *
7173 * If preview is true the unserialized object is returned to the caller but
7174 * no changes are made to the key object, nor the pages are marked as freed */
7175static robj *vmGenericLoadObject(robj *key, int preview) {
7176 robj *val;
7177
7178 redisAssert(key->storage == REDIS_VM_SWAPPED);
7179 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 7180 if (!preview) {
7181 key->storage = REDIS_VM_MEMORY;
7182 key->vm.atime = server.unixtime;
7183 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7184 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7185 (unsigned char*) key->ptr);
7d98e08c 7186 server.vm_stats_swapped_objects--;
38aba9a1 7187 } else {
7188 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7189 (unsigned char*) key->ptr);
7e69548d 7190 }
7d98e08c 7191 server.vm_stats_swapins++;
3a66edc7 7192 return val;
06224fec 7193}
7194
7e69548d 7195/* Plain object loading, from swap to memory */
7196static robj *vmLoadObject(robj *key) {
996cb5f7 7197 /* If we are loading the object in background, stop it, we
7198 * need to load this object synchronously ASAP. */
7199 if (key->storage == REDIS_VM_LOADING)
7200 vmCancelThreadedIOJob(key);
7e69548d 7201 return vmGenericLoadObject(key,0);
7202}
7203
7204/* Just load the value on disk, without to modify the key.
7205 * This is useful when we want to perform some operation on the value
7206 * without to really bring it from swap to memory, like while saving the
7207 * dataset or rewriting the append only log. */
7208static robj *vmPreviewObject(robj *key) {
7209 return vmGenericLoadObject(key,1);
7210}
7211
4ef8de8a 7212/* How a good candidate is this object for swapping?
7213 * The better candidate it is, the greater the returned value.
7214 *
7215 * Currently we try to perform a fast estimation of the object size in
7216 * memory, and combine it with aging informations.
7217 *
7218 * Basically swappability = idle-time * log(estimated size)
7219 *
7220 * Bigger objects are preferred over smaller objects, but not
7221 * proportionally, this is why we use the logarithm. This algorithm is
7222 * just a first try and will probably be tuned later. */
7223static double computeObjectSwappability(robj *o) {
7224 time_t age = server.unixtime - o->vm.atime;
7225 long asize = 0;
7226 list *l;
7227 dict *d;
7228 struct dictEntry *de;
7229 int z;
7230
7231 if (age <= 0) return 0;
7232 switch(o->type) {
7233 case REDIS_STRING:
7234 if (o->encoding != REDIS_ENCODING_RAW) {
7235 asize = sizeof(*o);
7236 } else {
7237 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7238 }
7239 break;
7240 case REDIS_LIST:
7241 l = o->ptr;
7242 listNode *ln = listFirst(l);
7243
7244 asize = sizeof(list);
7245 if (ln) {
7246 robj *ele = ln->value;
7247 long elesize;
7248
7249 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7250 (sizeof(*o)+sdslen(ele->ptr)) :
7251 sizeof(*o);
7252 asize += (sizeof(listNode)+elesize)*listLength(l);
7253 }
7254 break;
7255 case REDIS_SET:
7256 case REDIS_ZSET:
7257 z = (o->type == REDIS_ZSET);
7258 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7259
7260 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7261 if (z) asize += sizeof(zset)-sizeof(dict);
7262 if (dictSize(d)) {
7263 long elesize;
7264 robj *ele;
7265
7266 de = dictGetRandomKey(d);
7267 ele = dictGetEntryKey(de);
7268 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7269 (sizeof(*o)+sdslen(ele->ptr)) :
7270 sizeof(*o);
7271 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7272 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7273 }
7274 break;
7275 }
7276 return (double)asize*log(1+asize);
7277}
7278
7279/* Try to swap an object that's a good candidate for swapping.
7280 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 7281 * to swap any object at all.
7282 *
7283 * If 'usethreaded' is true, Redis will try to swap the object in background
7284 * using I/O threads. */
7285static int vmSwapOneObject(int usethreads) {
4ef8de8a 7286 int j, i;
7287 struct dictEntry *best = NULL;
7288 double best_swappability = 0;
b9bc0eef 7289 redisDb *best_db = NULL;
4ef8de8a 7290 robj *key, *val;
7291
7292 for (j = 0; j < server.dbnum; j++) {
7293 redisDb *db = server.db+j;
e3cadb8a 7294 int maxtries = 1000;
4ef8de8a 7295
7296 if (dictSize(db->dict) == 0) continue;
7297 for (i = 0; i < 5; i++) {
7298 dictEntry *de;
7299 double swappability;
7300
e3cadb8a 7301 if (maxtries) maxtries--;
4ef8de8a 7302 de = dictGetRandomKey(db->dict);
7303 key = dictGetEntryKey(de);
7304 val = dictGetEntryVal(de);
e3cadb8a 7305 if (key->storage != REDIS_VM_MEMORY) {
7306 if (maxtries) i--; /* don't count this try */
7307 continue;
7308 }
4ef8de8a 7309 swappability = computeObjectSwappability(val);
7310 if (!best || swappability > best_swappability) {
7311 best = de;
7312 best_swappability = swappability;
b9bc0eef 7313 best_db = db;
4ef8de8a 7314 }
7315 }
7316 }
e3cadb8a 7317 if (best == NULL) {
7318 redisLog(REDIS_DEBUG,"No swappable key found!");
7319 return REDIS_ERR;
7320 }
4ef8de8a 7321 key = dictGetEntryKey(best);
7322 val = dictGetEntryVal(best);
7323
e3cadb8a 7324 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 7325 key->ptr, best_swappability);
7326
7327 /* Unshare the key if needed */
7328 if (key->refcount > 1) {
7329 robj *newkey = dupStringObject(key);
7330 decrRefCount(key);
7331 key = dictGetEntryKey(best) = newkey;
7332 }
7333 /* Swap it */
a69a0c9c 7334 if (usethreads) {
b9bc0eef 7335 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 7336 return REDIS_OK;
7337 } else {
a69a0c9c 7338 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7339 dictGetEntryVal(best) = NULL;
7340 return REDIS_OK;
7341 } else {
7342 return REDIS_ERR;
7343 }
4ef8de8a 7344 }
7345}
7346
/* Swap one object using the non threaded code path of vmSwapOneObject(),
 * returning its return value unchanged. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
7350
/* Swap one object using the threaded code path of vmSwapOneObject(),
 * returning its return value unchanged. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
7354
7e69548d 7355/* Return true if it's safe to swap out objects in a given moment.
7356 * Basically we don't want to swap objects out while there is a BGSAVE
7357 * or a BGAEOREWRITE running in backgroud. */
7358static int vmCanSwapOut(void) {
7359 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7360}
7361
1b03836c 7362/* Delete a key if swapped. Returns 1 if the key was found, was swapped
7363 * and was deleted. Otherwise 0 is returned. */
7364static int deleteIfSwapped(redisDb *db, robj *key) {
7365 dictEntry *de;
7366 robj *foundkey;
7367
7368 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7369 foundkey = dictGetEntryKey(de);
7370 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7371 deleteKey(db,key);
7372 return 1;
7373}
7374
996cb5f7 7375/* =================== Virtual Memory - Threaded I/O ======================= */
7376
b9bc0eef 7377static void freeIOJob(iojob *j) {
7378 if (j->type == REDIS_IOJOB_PREPARE_SWAP ||
7379 j->type == REDIS_IOJOB_DO_SWAP)
7380 decrRefCount(j->val);
7381 decrRefCount(j->key);
7382 zfree(j);
7383}
7384
996cb5f7 7385/* Every time a thread finished a Job, it writes a byte into the write side
7386 * of an unix pipe in order to "awake" the main thread, and this function
7387 * is called. */
7388static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7389 int mask)
7390{
7391 char buf[1];
7392 int retval;
7393 REDIS_NOTUSED(el);
7394 REDIS_NOTUSED(mask);
7395 REDIS_NOTUSED(privdata);
7396
7397 /* For every byte we read in the read side of the pipe, there is one
7398 * I/O job completed to process. */
7399 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 7400 iojob *j;
7401 listNode *ln;
7402 robj *key;
7403 struct dictEntry *de;
7404
996cb5f7 7405 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 7406 assert(listLength(server.io_processed) != 0);
7407
7408 /* Get the processed element (the oldest one) */
7409 lockThreadedIO();
7410 ln = listFirst(server.io_processed);
7411 j = ln->value;
7412 listDelNode(server.io_processed,ln);
7413 unlockThreadedIO();
7414 /* If this job is marked as canceled, just ignore it */
7415 if (j->canceled) {
7416 freeIOJob(j);
7417 continue;
7418 }
7419 /* Post process it in the main thread, as there are things we
7420 * can do just here to avoid race conditions and/or invasive locks */
f11b8647 7421 redisLog(REDIS_DEBUG,"Job type: %d, key at %p (%s) refcount: %d\n", j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 7422 de = dictFind(j->db->dict,j->key);
7423 assert(de != NULL);
7424 key = dictGetEntryKey(de);
7425 if (j->type == REDIS_IOJOB_LOAD) {
7426 /* Key loaded, bring it at home */
7427 key->storage = REDIS_VM_MEMORY;
7428 key->vm.atime = server.unixtime;
7429 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7430 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
7431 (unsigned char*) key->ptr);
7432 server.vm_stats_swapped_objects--;
7433 server.vm_stats_swapins++;
7434 freeIOJob(j);
7435 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7436 /* Now we know the amount of pages required to swap this object.
7437 * Let's find some space for it, and queue this task again
7438 * rebranded as REDIS_IOJOB_DO_SWAP. */
7439 if (vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) {
7440 /* Ooops... no space! */
7441 freeIOJob(j);
7442 } else {
7443 j->type = REDIS_IOJOB_DO_SWAP;
7444 lockThreadedIO();
7445 queueIOJob(j);
7446 unlockThreadedIO();
7447 }
7448 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
7449 robj *val;
7450
7451 /* Key swapped. We can finally free some memory. */
7452 val = dictGetEntryVal(de);
7453 key->vm.page = j->page;
7454 key->vm.usedpages = j->pages;
7455 key->storage = REDIS_VM_SWAPPED;
7456 key->vtype = j->val->type;
7457 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 7458 dictGetEntryVal(de) = NULL;
b9bc0eef 7459 vmMarkPagesUsed(j->page,j->pages);
7460 redisLog(REDIS_DEBUG,
7461 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7462 (unsigned char*) key->ptr,
7463 (unsigned long long) j->page, (unsigned long long) j->pages);
7464 server.vm_stats_swapped_objects++;
7465 server.vm_stats_swapouts++;
7466 freeIOJob(j);
f11b8647 7467 /* Put a few more swap requests in queue if we are still
7468 * out of memory */
7469 if (zmalloc_used_memory() > server.vm_max_memory) {
7470 int more = 1;
7471 while(more) {
7472 lockThreadedIO();
7473 more = listLength(server.io_newjobs) <
7474 (unsigned) server.vm_max_threads;
7475 unlockThreadedIO();
7476 /* Don't waste CPU time if swappable objects are rare. */
7477 if (vmSwapOneObjectThreaded() == REDIS_ERR) break;
7478 }
7479 }
b9bc0eef 7480 }
a5819310 7481 return; /* XXX REMOVE ME */
996cb5f7 7482 }
7483 if (retval < 0 && errno != EAGAIN) {
7484 redisLog(REDIS_WARNING,
7485 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7486 strerror(errno));
7487 }
7488}
7489
7490static void lockThreadedIO(void) {
7491 pthread_mutex_lock(&server.io_mutex);
7492}
7493
7494static void unlockThreadedIO(void) {
7495 pthread_mutex_unlock(&server.io_mutex);
7496}
7497
7498/* Remove the specified object from the threaded I/O queue if still not
7499 * processed, otherwise make sure to flag it as canceled. */
7500static void vmCancelThreadedIOJob(robj *o) {
7501 list *lists[3] = {
7502 server.io_newjobs, server.io_processing, server.io_processed
7503 };
7504 int i;
7505
7506 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
7507 lockThreadedIO();
7508 /* Search for a matching key in one of the queues */
7509 for (i = 0; i < 3; i++) {
7510 listNode *ln;
7511
7512 listRewind(lists[i]);
7513 while ((ln = listYield(lists[i])) != NULL) {
7514 iojob *job = ln->value;
7515
7516 if (compareStringObjects(job->key,o) == 0) {
7517 switch(i) {
7518 case 0: /* io_newjobs */
7519 /* If the job was not yet processed the best thing to do
7520 * is to remove it from the queue at all */
7521 decrRefCount(job->key);
b9bc0eef 7522 if (job->type == REDIS_IOJOB_PREPARE_SWAP ||
7523 job->type == REDIS_IOJOB_DO_SWAP)
996cb5f7 7524 decrRefCount(job->val);
7525 listDelNode(lists[i],ln);
a69a0c9c 7526 zfree(job);
996cb5f7 7527 break;
7528 case 1: /* io_processing */
7529 case 2: /* io_processed */
7530 job->canceled = 1;
7531 break;
7532 }
7533 if (o->storage == REDIS_VM_LOADING)
7534 o->storage = REDIS_VM_SWAPPED;
7535 else if (o->storage == REDIS_VM_SWAPPING)
7536 o->storage = REDIS_VM_MEMORY;
7537 unlockThreadedIO();
7538 return;
7539 }
7540 }
7541 }
7542 unlockThreadedIO();
7543 assert(1 != 1); /* We should never reach this */
7544}
7545
b9bc0eef 7546static void *IOThreadEntryPoint(void *arg) {
7547 iojob *j;
7548 listNode *ln;
7549 REDIS_NOTUSED(arg);
7550
7551 pthread_detach(pthread_self());
7552 while(1) {
7553 /* Get a new job to process */
7554 lockThreadedIO();
7555 if (listLength(server.io_newjobs) == 0) {
7556 /* No new jobs in queue, exit. */
20f5b388 7557 redisLog(REDIS_DEBUG,"Thread %lld exiting, nothing to do\n",
b9bc0eef 7558 (long long) pthread_self());
7559 server.io_active_threads--;
7560 unlockThreadedIO();
7561 return NULL;
7562 }
7563 ln = listFirst(server.io_newjobs);
7564 j = ln->value;
7565 listDelNode(server.io_newjobs,ln);
7566 /* Add the job in the processing queue */
7567 j->thread = pthread_self();
7568 listAddNodeTail(server.io_processing,j);
7569 ln = listLast(server.io_processing); /* We use ln later to remove it */
7570 unlockThreadedIO();
20f5b388 7571 redisLog(REDIS_DEBUG,"Thread %lld got a new job: %p about key '%s'\n",
b9bc0eef 7572 (long long) pthread_self(), (void*)j, (char*)j->key->ptr);
7573
7574 /* Process the Job */
7575 if (j->type == REDIS_IOJOB_LOAD) {
7576 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7577 FILE *fp = fopen("/dev/null","w+");
7578 j->pages = rdbSavedObjectPages(j->val,fp);
7579 fclose(fp);
7580 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 7581 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
7582 j->canceled = 1;
b9bc0eef 7583 }
7584
7585 /* Done: insert the job into the processed queue */
20f5b388 7586 redisLog(REDIS_DEBUG,"Thread %lld completed the job: %p\n",
b9bc0eef 7587 (long long) pthread_self(), (void*)j);
7588 lockThreadedIO();
7589 listDelNode(server.io_processing,ln);
7590 listAddNodeTail(server.io_processed,j);
7591 unlockThreadedIO();
7592
7593 /* Signal the main thread there is new stuff to process */
7594 assert(write(server.io_ready_pipe_write,"x",1) == 1);
7595 }
7596 return NULL; /* never reached */
7597}
7598
7599static void spawnIOThread(void) {
7600 pthread_t thread;
7601
7602 pthread_create(&thread,NULL,IOThreadEntryPoint,NULL);
7603 server.io_active_threads++;
7604}
7605
7606/* This function must be called while with threaded IO locked */
7607static void queueIOJob(iojob *j) {
7608 listAddNodeTail(server.io_newjobs,j);
7609 if (server.io_active_threads < server.vm_max_threads)
7610 spawnIOThread();
7611}
7612
7613static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
7614 iojob *j;
7615
7616 assert(key->storage == REDIS_VM_MEMORY);
7617 assert(key->refcount == 1);
7618
7619 j = zmalloc(sizeof(*j));
7620 j->type = REDIS_IOJOB_PREPARE_SWAP;
7621 j->db = db;
7622 j->key = dupStringObject(key);
7623 j->val = val;
7624 incrRefCount(val);
7625 j->canceled = 0;
7626 j->thread = (pthread_t) -1;
f11b8647 7627 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 7628
7629 lockThreadedIO();
7630 queueIOJob(j);
7631 unlockThreadedIO();
7632 return REDIS_OK;
7633}
7634
7f957c92 7635/* ================================= Debugging ============================== */
7636
7637static void debugCommand(redisClient *c) {
7638 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
7639 *((char*)-1) = 'x';
210e29f7 7640 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
7641 if (rdbSave(server.dbfilename) != REDIS_OK) {
7642 addReply(c,shared.err);
7643 return;
7644 }
7645 emptyDb();
7646 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7647 addReply(c,shared.err);
7648 return;
7649 }
7650 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
7651 addReply(c,shared.ok);
71c2b467 7652 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
7653 emptyDb();
7654 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
7655 addReply(c,shared.err);
7656 return;
7657 }
7658 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
7659 addReply(c,shared.ok);
333298da 7660 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
7661 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
7662 robj *key, *val;
7663
7664 if (!de) {
7665 addReply(c,shared.nokeyerr);
7666 return;
7667 }
7668 key = dictGetEntryKey(de);
7669 val = dictGetEntryVal(de);
b9bc0eef 7670 if (server.vm_enabled && (key->storage == REDIS_VM_MEMORY ||
7671 key->storage == REDIS_VM_SWAPPING)) {
ace06542 7672 addReplySds(c,sdscatprintf(sdsempty(),
7673 "+Key at:%p refcount:%d, value at:%p refcount:%d "
7674 "encoding:%d serializedlength:%lld\r\n",
682ac724 7675 (void*)key, key->refcount, (void*)val, val->refcount,
b9bc0eef 7676 val->encoding, rdbSavedObjectLen(val,NULL)));
ace06542 7677 } else {
7678 addReplySds(c,sdscatprintf(sdsempty(),
7679 "+Key at:%p refcount:%d, value swapped at: page %llu "
7680 "using %llu pages\r\n",
7681 (void*)key, key->refcount, (unsigned long long) key->vm.page,
7682 (unsigned long long) key->vm.usedpages));
7683 }
7d30035d 7684 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
7685 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
7686 robj *key, *val;
7687
7688 if (!server.vm_enabled) {
7689 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
7690 return;
7691 }
7692 if (!de) {
7693 addReply(c,shared.nokeyerr);
7694 return;
7695 }
7696 key = dictGetEntryKey(de);
7697 val = dictGetEntryVal(de);
4ef8de8a 7698 /* If the key is shared we want to create a copy */
7699 if (key->refcount > 1) {
7700 robj *newkey = dupStringObject(key);
7701 decrRefCount(key);
7702 key = dictGetEntryKey(de) = newkey;
7703 }
7704 /* Swap it */
7d30035d 7705 if (key->storage != REDIS_VM_MEMORY) {
7706 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 7707 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 7708 dictGetEntryVal(de) = NULL;
7709 addReply(c,shared.ok);
7710 } else {
7711 addReply(c,shared.err);
7712 }
7f957c92 7713 } else {
333298da 7714 addReplySds(c,sdsnew(
7d30035d 7715 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 7716 }
7717}
56906eef 7718
dfc5e96c 7719static void _redisAssert(char *estr) {
7720 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
7721 redisLog(REDIS_WARNING,"==> %s\n",estr);
7722#ifdef HAVE_BACKTRACE
7723 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
7724 *((char*)-1) = 'x';
7725#endif
7726}
7727
bcfc686d 7728/* =================================== Main! ================================ */
56906eef 7729
bcfc686d 7730#ifdef __linux__
/* Return the integer found in the first line of
 * /proc/sys/vm/overcommit_memory (as converted by atoi()), or -1 if the
 * file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return value;
    if (fgets(line,sizeof(line),procfile) != NULL) {
        value = atoi(line);
    }
    fclose(procfile);
    return value;
}
7744
7745void linuxOvercommitMemoryWarning(void) {
7746 if (linuxOvercommitMemoryValue() == 0) {
7747 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
7748 }
7749}
7750#endif /* __linux__ */
7751
7752static void daemonize(void) {
7753 int fd;
7754 FILE *fp;
7755
7756 if (fork() != 0) exit(0); /* parent exits */
7757 setsid(); /* create a new session */
7758
7759 /* Every output goes to /dev/null. If Redis is daemonized but
7760 * the 'logfile' is set to 'stdout' in the configuration file
7761 * it will not log at all. */
7762 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
7763 dup2(fd, STDIN_FILENO);
7764 dup2(fd, STDOUT_FILENO);
7765 dup2(fd, STDERR_FILENO);
7766 if (fd > STDERR_FILENO) close(fd);
7767 }
7768 /* Try to write the pid file */
7769 fp = fopen(server.pidfile,"w");
7770 if (fp) {
7771 fprintf(fp,"%d\n",getpid());
7772 fclose(fp);
56906eef 7773 }
56906eef 7774}
7775
bcfc686d 7776int main(int argc, char **argv) {
7777 initServerConfig();
7778 if (argc == 2) {
7779 resetServerSaveParams();
7780 loadServerConfig(argv[1]);
7781 } else if (argc > 2) {
7782 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
7783 exit(1);
7784 } else {
7785 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
7786 }
bcfc686d 7787 if (server.daemonize) daemonize();
71c54b21 7788 initServer();
bcfc686d 7789 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
7790#ifdef __linux__
7791 linuxOvercommitMemoryWarning();
7792#endif
7793 if (server.appendonly) {
7794 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
7795 redisLog(REDIS_NOTICE,"DB loaded from append only file");
7796 } else {
7797 if (rdbLoad(server.dbfilename) == REDIS_OK)
7798 redisLog(REDIS_NOTICE,"DB loaded from disk");
7799 }
bcfc686d 7800 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
7801 aeMain(server.el);
7802 aeDeleteEventLoop(server.el);
7803 return 0;
7804}
7805
7806/* ============================= Backtrace support ========================= */
7807
7808#ifdef HAVE_BACKTRACE
7809static char *findFuncName(void *pointer, unsigned long *offset);
7810
56906eef 7811static void *getMcontextEip(ucontext_t *uc) {
7812#if defined(__FreeBSD__)
7813 return (void*) uc->uc_mcontext.mc_eip;
7814#elif defined(__dietlibc__)
7815 return (void*) uc->uc_mcontext.eip;
06db1f50 7816#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 7817 #if __x86_64__
7818 return (void*) uc->uc_mcontext->__ss.__rip;
7819 #else
56906eef 7820 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 7821 #endif
06db1f50 7822#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 7823 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 7824 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 7825 #else
7826 return (void*) uc->uc_mcontext->__ss.__eip;
7827 #endif
c04c9ac9 7828#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
7829 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 7830#elif defined(__ia64__) /* Linux IA64 */
7831 return (void*) uc->uc_mcontext.sc_ip;
7832#else
7833 return NULL;
56906eef 7834#endif
7835}
7836
7837static void segvHandler(int sig, siginfo_t *info, void *secret) {
7838 void *trace[100];
7839 char **messages = NULL;
7840 int i, trace_size = 0;
7841 unsigned long offset=0;
56906eef 7842 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 7843 sds infostring;
56906eef 7844 REDIS_NOTUSED(info);
7845
7846 redisLog(REDIS_WARNING,
7847 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 7848 infostring = genRedisInfoString();
7849 redisLog(REDIS_WARNING, "%s",infostring);
7850 /* It's not safe to sdsfree() the returned string under memory
7851 * corruption conditions. Let it leak as we are going to abort */
56906eef 7852
7853 trace_size = backtrace(trace, 100);
de96dbfe 7854 /* overwrite sigaction with caller's address */
b91cf5ef 7855 if (getMcontextEip(uc) != NULL) {
7856 trace[1] = getMcontextEip(uc);
7857 }
56906eef 7858 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 7859
d76412d1 7860 for (i=1; i<trace_size; ++i) {
56906eef 7861 char *fn = findFuncName(trace[i], &offset), *p;
7862
7863 p = strchr(messages[i],'+');
7864 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
7865 redisLog(REDIS_WARNING,"%s", messages[i]);
7866 } else {
7867 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
7868 }
7869 }
b177fd30 7870 /* free(messages); Don't call free() with possibly corrupted memory. */
56906eef 7871 exit(0);
fe3bbfbe 7872}
56906eef 7873
7874static void setupSigSegvAction(void) {
7875 struct sigaction act;
7876
7877 sigemptyset (&act.sa_mask);
7878 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
7879 * is used. Otherwise, sa_handler is used */
7880 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
7881 act.sa_sigaction = segvHandler;
7882 sigaction (SIGSEGV, &act, NULL);
7883 sigaction (SIGBUS, &act, NULL);
12fea928 7884 sigaction (SIGFPE, &act, NULL);
7885 sigaction (SIGILL, &act, NULL);
7886 sigaction (SIGBUS, &act, NULL);
e65fdc78 7887 return;
56906eef 7888}
e65fdc78 7889
bcfc686d 7890#include "staticsymbols.h"
7891/* This function try to convert a pointer into a function name. It's used in
7892 * oreder to provide a backtrace under segmentation fault that's able to
7893 * display functions declared as static (otherwise the backtrace is useless). */
7894static char *findFuncName(void *pointer, unsigned long *offset){
7895 int i, ret = -1;
7896 unsigned long off, minoff = 0;
ed9b544e 7897
bcfc686d 7898 /* Try to match against the Symbol with the smallest offset */
7899 for (i=0; symsTable[i].pointer; i++) {
7900 unsigned long lp = (unsigned long) pointer;
0bc03378 7901
bcfc686d 7902 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
7903 off=lp-symsTable[i].pointer;
7904 if (ret < 0 || off < minoff) {
7905 minoff=off;
7906 ret=i;
7907 }
7908 }
0bc03378 7909 }
bcfc686d 7910 if (ret == -1) return NULL;
7911 *offset = minoff;
7912 return symsTable[ret].name;
0bc03378 7913}
bcfc686d 7914#else /* HAVE_BACKTRACE */
/* Version used when HAVE_BACKTRACE is not defined: no handler is
 * installed, the function is intentionally a no-op. */
static void setupSigSegvAction(void) {
    return;
}
bcfc686d 7917#endif /* HAVE_BACKTRACE */
0bc03378 7918
ed9b544e 7919
ed9b544e 7920
bcfc686d 7921/* The End */
7922
7923
ed9b544e 7924